// Pull the parameter-block pointer and the stride of U out of the kernel's private arguments.
46 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
47 int32_t strideU = pKerPrivArgs->
strideU;
48 int32_t dataSize =
sizeof(dataType);
// Start every SE (stream) and SA (address generator) template from the value returned by the
// matching __gen_*_TEMPLATE_v1() generator; individual fields are overridden afterwards.
50 __SE_TEMPLATE_v1 se0Params = __gen_SE_TEMPLATE_v1();
51 __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();
52 __SE_TEMPLATE_v1 se1Params = __gen_SE_TEMPLATE_v1();
53 __SE_TEMPLATE_v1 se2Params = __gen_SE_TEMPLATE_v1();
54 __SA_TEMPLATE_v1 sa2Params = __gen_SA_TEMPLATE_v1();
55 __SA_TEMPLATE_v1 sa3Params = __gen_SA_TEMPLATE_v1();
56 __SA_TEMPLATE_v1 sa4Params = __gen_SA_TEMPLATE_v1();
57 __SE_TEMPLATE_v1 se3Params = __gen_SE_TEMPLATE_v1();
58 __SA_TEMPLATE_v1 sa5Params = __gen_SA_TEMPLATE_v1();
// Full-width vector type for dataType, plus the element count and the SE/SA enum values
// (element type, vector length) that correspond to that vector type.
60 typedef typename c7x::make_full_vector<dataType>::type vec;
61 int32_t eleCount = c7x::element_count_of<vec>::value;
62 __SE_ELETYPE SE_ELETYPE = c7x::se_eletype<vec>::value;
63 __SE_VECLEN SE_VECLEN = c7x::se_veclen<vec>::value;
64 __SA_VECLEN SA_VECLEN = c7x::sa_veclen<vec>::value;
// Stride of U counted in elements; presumably strideU is given in bytes -- confirm with the caller.
66 int32_t colUStride = strideU /
sizeof(dataType);
// se0Params: 4D stream whose ICNT1/DIM1/DIM2 are built from eleCount and colUStride.
// ICNT0 and ICNT2 are not assigned in the visible lines; the routines that read this template
// back from pBlock patch them before opening a stream.
69 se0Params.ICNT1 = eleCount;
70 se0Params.DIM1 = colUStride;
71 se0Params.DIM2 = eleCount * colUStride;
74 se0Params.DIMFMT = __SE_DIMFMT_4D;
75 se0Params.ELETYPE = SE_ELETYPE;
76 se0Params.VECLEN = SE_VECLEN;
// TRANSPOSE is assigned twice in the visible lines (32-bit, then 64-bit). The condition that
// selects between them is not visible in this excerpt -- presumably keyed on the element size.
79 se0Params.TRANSPOSE = __SE_TRANSPOSE_32BIT;
82 se0Params.TRANSPOSE = __SE_TRANSPOSE_64BIT;
// sa0Params: 2D address generator stepping in full vectors; its counts are supplied by consumers.
87 sa0Params.DIMFMT = __SA_DIMFMT_2D;
88 sa0Params.VECLEN = SA_VECLEN;
// se1Params: 2D stream with the same element type and vector length as se0Params.
92 se1Params.DIMFMT = __SE_DIMFMT_2D;
93 se1Params.ELETYPE = SE_ELETYPE;
94 se1Params.VECLEN = SE_VECLEN;
// se2Params: 4D stream over tiles of eleCount * lenTile elements. DIM1 advances by two U strides
// and DIM2 by one tile width. DECDIM1 is bound to DIM2; its width (DECDIM1_WIDTH) is not set here.
97 se2Params.ICNT0 = eleCount * lenTile;
98 se2Params.DIM1 = colUStride * 2;
99 se2Params.DIM2 = eleCount * lenTile;
102 se2Params.DIMFMT = __SE_DIMFMT_4D;
103 se2Params.ELETYPE = SE_ELETYPE;
104 se2Params.VECLEN = SE_VECLEN;
105 se2Params.DECDIM1 = __SE_DECDIM_DIM2;
// sa2Params: address-generator counterpart of se2Params (same ICNT0/DIM1/DIM2 and decrement
// dimension), declared as 3D.
107 sa2Params.ICNT0 = eleCount * lenTile;
108 sa2Params.DIM1 = colUStride * 2;
109 sa2Params.DIM2 = eleCount * lenTile;
110 sa2Params.DIMFMT = __SA_DIMFMT_3D;
111 sa2Params.VECLEN = SA_VECLEN;
112 sa2Params.DECDIM1 = __SA_DECDIM_DIM2;
// sa3Params: 2D address generator with a vector length of one element (scalar-at-a-time access).
115 sa3Params.DIMFMT = __SA_DIMFMT_2D;
116 sa3Params.VECLEN = __SA_VECLEN_1ELEM;
// sa4Params: 3D full-vector address generator; ICNT0 and DIM1 both equal eleCount, and the
// decrement dimension is DIM1.
118 sa4Params.ICNT0 = eleCount;
119 sa4Params.DIM1 = eleCount;
122 sa4Params.DIMFMT = __SA_DIMFMT_3D;
123 sa4Params.VECLEN = SA_VECLEN;
124 sa4Params.DECDIM1 = __SA_DECDIM_DIM1;
// se3Params / sa5Params describe the same 4D walk, once as a stream and once as an address
// generator: one vector (eleCount) wide, rowBlock entries along DIM1 (stride colUStride),
// DIM2 stepping by eleCount, and DIM3 stepping by 2 * rowBlock * colUStride.
// The factor of 2 in DIM3 presumably lets two engines take alternate row blocks -- TODO confirm.
126 int32_t rowBlock = 8;
127 se3Params.ICNT0 = eleCount;
128 se3Params.ICNT1 = rowBlock;
129 se3Params.DIM1 = colUStride;
130 se3Params.DIM2 = eleCount;
131 se3Params.DIM3 = 2 * rowBlock * colUStride;
132 se3Params.DIMFMT = __SE_DIMFMT_4D;
133 se3Params.ELETYPE = SE_ELETYPE;
134 se3Params.VECLEN = SE_VECLEN;
// Decrement dimension 2 is bound to DIM2; the width is not set in the visible lines.
135 se3Params.DECDIM2 = __SE_DECDIM_DIM2;
137 sa5Params.ICNT0 = eleCount;
138 sa5Params.ICNT1 = rowBlock;
139 sa5Params.DIM1 = colUStride;
140 sa5Params.DIM2 = eleCount;
141 sa5Params.DIM3 = 2 * rowBlock * colUStride;
142 sa5Params.DIMFMT = __SA_DIMFMT_4D;
143 sa5Params.VECLEN = SA_VECLEN;
144 sa5Params.DECDIM2 = __SA_DECDIM_DIM2;
// Persist the nine templates into pBlock, one per SE_PARAM_SIZE-byte slot (SA templates use the
// same slot spacing). Slot map, as written here:
//   0: se0Params   1: sa0Params   2: se1Params
//   3: se2Params   4: sa2Params   5: sa3Params
//   6: sa4Params   7: se3Params   8: sa5Params
// Readers must use the same slot index to retrieve a given template.
146 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (0 * SE_PARAM_SIZE)) = se0Params;
147 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE)) = sa0Params;
148 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE)) = se1Params;
149 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE)) = se2Params;
150 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE)) = sa2Params;
151 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE)) = sa3Params;
152 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (6 * SE_PARAM_SIZE)) = sa4Params;
153 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (7 * SE_PARAM_SIZE)) = se3Params;
154 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (8 * SE_PARAM_SIZE)) = sa5Params;
// Fragment of a templated routine; only part of its signature is visible in this excerpt
// (it takes, among others, an output pointer half_norm_squared).
171 template <
typename dataType>
176 dataType *half_norm_squared,
// Both stream templates are copies of slot 0 and both address-generator templates are copies of
// slot 1; they are specialised per engine below.
183 __SE_TEMPLATE_v1 se0Params, se1Params;
184 __SA_TEMPLATE_v1 sa0Params, sa1Params;
185 se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (0 * SE_PARAM_SIZE));
186 se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (0 * SE_PARAM_SIZE));
187 sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
188 sa1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
190 typedef typename c7x::make_full_vector<dataType>::type vec;
191 int32_t eleCount = c7x::element_count_of<vec>::value;
// Nrows is cut into nVec full vectors plus remainingEle leftover rows. The full vectors are split
// between the two streams: SE0 takes floor(nVec / 2), SE1 takes the rest (one more when nVec is odd).
193 int32_t nVec = Nrows / eleCount;
194 int32_t se0ICNT2 = nVec / 2;
195 int32_t se1ICNT2 = nVec - se0ICNT2;
196 int32_t remainingEle = Nrows - (nVec * eleCount);
200 se0Params.ICNT2 = se0ICNT2;
203 se1Params.ICNT2 = se1ICNT2;
// SA0 covers the elements handled by SE0; SA1 covers everything else up to Nrows, which
// includes the remainingEle tail.
206 sa0Params.ICNT0 = se0ICNT2 * eleCount;
207 sa1Params.ICNT0 = Nrows - (se0ICNT2 * eleCount);
// Six vector accumulators, cleared before the reduction that produces `scale`.
// (The statements that add into them are not visible in this excerpt.)
209 vec acc1, acc2, acc3, acc4, acc5, acc6;
210 acc1 = acc2 = acc3 = acc4 = acc5 = acc6 = (vec) 0;
// SE1 starts where SE0's share ends: se0ICNT2 vectors of eleCount rows, each row colUStride apart.
213 dataType *pSE1 = U + (se0ICNT2 * colUStride * eleCount);
216 __SE1_OPEN(pSE1, se1Params);
// pSE0 is defined in lines not shown here.
219 __SE0_OPEN(pSE0, se0Params);
// Main loop, unrolled by three: each iteration pulls three vectors from SE0 and three from SE1.
// `vertical` is preset to the number of SE0 vectors this loop consumes so the tail loops can resume.
221 int32_t iterloop1 = se0ICNT2 / 3;
222 int32_t vertical = iterloop1 * 3;
223 for (int32_t iter = 0; iter < iterloop1; iter++) {
224 vec v1 = c7x::strm_eng<0, vec>::get_adv();
225 vec v2 = c7x::strm_eng<1, vec>::get_adv();
226 vec v3 = c7x::strm_eng<0, vec>::get_adv();
227 vec v4 = c7x::strm_eng<1, vec>::get_adv();
228 vec v5 = c7x::strm_eng<0, vec>::get_adv();
229 vec v6 = c7x::strm_eng<1, vec>::get_adv();
// Tail, two vectors per stream (runs when exactly two SE0 vectors are left).
238 for (; vertical < se0ICNT2 - 1; vertical += 2) {
239 vec v1 = c7x::strm_eng<0, vec>::get_adv();
240 vec v2 = c7x::strm_eng<1, vec>::get_adv();
241 vec v3 = c7x::strm_eng<0, vec>::get_adv();
242 vec v4 = c7x::strm_eng<1, vec>::get_adv();
// Tail, one vector per stream.
249 for (; vertical < se0ICNT2; vertical++) {
250 vec v1 = c7x::strm_eng<0, vec>::get_adv();
251 vec v2 = c7x::strm_eng<1, vec>::get_adv();
// When nVec is odd SE1 holds one more vector than SE0; fetch it on its own.
257 if (se0ICNT2 != se1ICNT2) {
258 vec v1 = c7x::strm_eng<1, vec>::get_adv();
// Fold partial accumulators into acc1 and reduce across lanes into the scalar `scale`.
// (Only acc3 and acc5 are folded in the visible line; presumably acc2/acc4/acc6 were merged
// into them in lines not shown -- TODO confirm.)
266 acc1 = acc1 + acc3 + acc5;
269 c7x_horizontal_add(acc1, &scale);
// Rows that do not fill a whole vector are added one scalar at a time, as absolute values,
// stepping colUStride elements between rows.
271 if (remainingEle > 0) {
273 dataType *remU = U + (nVec * eleCount * colUStride);
274 for (int32_t i = 0; i < remainingEle; i++) {
275 scale += __abs(remU[i * colUStride]);
// Scaled values are written into U1 starting colUStride elements from its base. SA0 writes the
// part read through SE0; SA1 starts right after it (se0ICNT2 * eleCount elements further on).
284 dataType *normUStore = U1 + colUStride;
285 dataType *pSA0 = normUStore;
286 dataType *pSA1 = normUStore + (se0ICNT2 * eleCount);
// Multiply by 1/scale instead of dividing; the reciprocal is broadcast to every lane.
287 dataType scalarRecip =
getRecip(scale);
288 vec reciprocalScale = (vec) scalarRecip;
296 __SA1_OPEN(sa1Params);
298 __SA0_OPEN(sa0Params);
// NOTE(review): this pass reads SE0/SE1 again; the streams are presumably re-opened in lines
// not visible in this excerpt -- confirm.
// Unrolled by two: fetch two vectors per stream, scale them, then store each through the
// address generator paired with its stream using the generator's predicate.
301 for (vertical = 0; vertical < se0ICNT2 - 1; vertical += 2) {
302 vec v1 = c7x::strm_eng<0, vec>::get_adv();
303 vec v2 = c7x::strm_eng<1, vec>::get_adv();
304 vec v3 = c7x::strm_eng<0, vec>::get_adv();
305 vec v4 = c7x::strm_eng<1, vec>::get_adv();
307 v1 = v1 * reciprocalScale;
308 v2 = v2 * reciprocalScale;
309 v3 = v3 * reciprocalScale;
310 v4 = v4 * reciprocalScale;
317 __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
318 vec *pNormU1 = c7x::strm_agen<0, vec>::get_adv(pSA0);
319 __vstore_pred(pred1, pNormU1, v1);
321 __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
322 vec *pNormU2 = c7x::strm_agen<1, vec>::get_adv(pSA1);
323 __vstore_pred(pred2, pNormU2, v2);
325 __vpred pred3 = c7x::strm_agen<0, vec>::get_vpred();
326 vec *pNormU3 = c7x::strm_agen<0, vec>::get_adv(pSA0);
327 __vstore_pred(pred3, pNormU3, v3);
329 __vpred pred4 = c7x::strm_agen<1, vec>::get_vpred();
330 vec *pNormU4 = c7x::strm_agen<1, vec>::get_adv(pSA1);
331 __vstore_pred(pred4, pNormU4, v4);
// Tail: one vector per stream.
334 for (; vertical < se0ICNT2; vertical++) {
335 vec v1 = c7x::strm_eng<0, vec>::get_adv();
336 vec v2 = c7x::strm_eng<1, vec>::get_adv();
338 v1 = v1 * reciprocalScale;
339 v2 = v2 * reciprocalScale;
344 __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
345 vec *pNormU1 = c7x::strm_agen<0, vec>::get_adv(pSA0);
346 __vstore_pred(pred1, pNormU1, v1);
348 __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
349 vec *pNormU2 = c7x::strm_agen<1, vec>::get_adv(pSA1);
350 __vstore_pred(pred2, pNormU2, v2);
// Odd nVec: SE1 holds one extra vector, stored through SA1.
353 if (se0ICNT2 != se1ICNT2) {
354 vec v2 = c7x::strm_eng<1, vec>::get_adv();
356 v2 = v2 * reciprocalScale;
360 __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
361 vec *pNormU2 = c7x::strm_agen<1, vec>::get_adv(pSA1);
362 __vstore_pred(pred2, pNormU2, v2);
// Reduce acc1 across lanes into s2 (the accumulation into acc1 is not visible in this excerpt).
368 c7x_horizontal_add(acc1, &s2);
// Leftover rows: re-open SE1 on the tail with ICNT1 limited to remainingEle, then scale and
// store that single partial vector through SA1 under its predicate.
370 if (remainingEle > 0) {
372 dataType *remU = U + (nVec * eleCount * colUStride);
373 se1Params.ICNT1 = remainingEle;
375 __SE1_OPEN(remU, se1Params);
376 vec v2 = c7x::strm_eng<1, vec>::get_adv();
378 v2 = v2 * reciprocalScale;
382 __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
383 vec *pNormU2 = c7x::strm_agen<1, vec>::get_adv(pSA1);
384 __vstore_pred(pred2, pNormU2, v2);
// Scalar loop over the tail elements; its body is not visible in this excerpt.
386 for (int32_t i = 0; i < remainingEle; i++) {
// First element of U, scaled by the same reciprocal as the vectors above.
391 dataType diagEle = U[0] * scalarRecip;
// Reciprocal square root of s2: start from the __recip_sqrt estimate and apply two
// Newton-Raphson refinement steps, x <- x * (1.5 - 0.5 * s2 * x * x).
393 const dataType Half = 0.5;
394 const dataType OneP5 = 1.5;
395 dataType x = __recip_sqrt(s2);
396 x = x * (OneP5 - (s2 * x * x * Half));
397 x = x * (OneP5 - (s2 * x * x * Half));
// *s is produced in lines not shown. The outputs are diagEle * s - s2 and an in-place
// subtraction of s from the first stored element at pSA0.
406 *half_norm_squared = diagEle * (*s) - s2;
407 *pSA0 = (*pSA0) - (*s);
424 float *half_norm_squared,
432 double *half_norm_squared,
// Fragment of a templated routine; only part of its signature is visible in this excerpt
// (here half_norm_squared is passed by value).
441 template <
typename dataType>
446 dataType half_norm_squared,
453 typedef typename c7x::make_full_vector<dataType>::type vec;
454 int32_t eleCount = c7x::element_count_of<vec>::value;
456 __SE_TEMPLATE_v1 se0Params;
457 __SE_TEMPLATE_v1 se1Params;
458 __SA_TEMPLATE_v1 sa2Params;
// Both streams start from the slot-3 template; SA2 starts from the slot-6 template.
460 se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE));
461 se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE));
462 sa2Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (6 * SE_PARAM_SIZE));
// Columns are processed in tiles of eleCount * lenTile elements (rounded up). Rows are split
// between the streams: SE1 takes floor(Nrows / 2), SE0 the rest (one more when Nrows is odd).
465 int32_t nTiles = DSPLIB_ceilingDiv(Ncols, eleCount * lenTile);
466 int32_t se1ICNT1 = Nrows / 2;
467 int32_t se0ICNT1 = Nrows - se1ICNT1;
// The decrement-dimension width is set to Ncols on every template below.
469 se0Params.ICNT1 = se0ICNT1;
470 se0Params.ICNT2 = nTiles;
471 se0Params.DECDIM1_WIDTH = Ncols;
473 se1Params.ICNT1 = se1ICNT1;
474 se1Params.ICNT2 = nTiles;
475 se1Params.DECDIM1_WIDTH = Ncols;
477 sa2Params.ICNT1 = lenTile * nTiles;
478 sa2Params.DECDIM1_WIDTH = Ncols;
// U1 is used as scratch: results are stored from its base (siStore) and per-row scalars are
// read starting colUStride elements in (reciprocalLoad).
480 dataType *siStore = (dataType *) U1;
481 dataType *reciprocalLoad = (dataType *) U1 + colUStride;
// Broadcast 1 / half_norm_squared so the later scaling is a multiply.
483 vec reciprocalHalfNorm = (vec)
getRecip(half_norm_squared);
// SA3 (slot-5 template, one element per step) walks Nrows scalars, repeated 2 * nTiles times --
// presumably once per tile for each of the two tile loops that follow; TODO confirm.
485 __SA_TEMPLATE_v1 sa3Params;
486 sa3Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE));
487 sa3Params.ICNT0 = Nrows;
488 sa3Params.ICNT1 = 2 * nTiles;
// SE0 starts at U + 1 and SE1 one U stride later, i.e. at U + 1 + colUStride.
493 __SE0_OPEN(U + 1, se0Params);
494 __SA2_OPEN(sa2Params);
495 __SA3_OPEN(sa3Params);
499 __SE1_OPEN(U + 1 + colUStride, se1Params);
// Pass 1, per tile: accumulate eight vectors of weighted sums, then scale and store them.
502 for (int32_t tile = 0; tile < nTiles; tile++) {
503 vec acc1, acc2, acc3, acc4, acc5, acc6, acc7, acc8;
504 acc1 = acc2 = acc3 = acc4 = acc5 = acc6 = acc7 = acc8 = (vec) 0;
// Each iteration consumes eight vectors from SE0 and eight from SE1, plus two scalars from SA3
// that are broadcast across lanes (u1 pairs with the SE0 data, u2 with the SE1 data).
505 for (int32_t vertical = 0; vertical < se1ICNT1; vertical++) {
506 vec v01 = c7x::strm_eng<0, vec>::get_adv();
507 vec v02 = c7x::strm_eng<0, vec>::get_adv();
508 vec v03 = c7x::strm_eng<0, vec>::get_adv();
509 vec v04 = c7x::strm_eng<0, vec>::get_adv();
510 vec v05 = c7x::strm_eng<0, vec>::get_adv();
511 vec v06 = c7x::strm_eng<0, vec>::get_adv();
512 vec v07 = c7x::strm_eng<0, vec>::get_adv();
513 vec v08 = c7x::strm_eng<0, vec>::get_adv();
515 vec v11 = c7x::strm_eng<1, vec>::get_adv();
516 vec v12 = c7x::strm_eng<1, vec>::get_adv();
517 vec v13 = c7x::strm_eng<1, vec>::get_adv();
518 vec v14 = c7x::strm_eng<1, vec>::get_adv();
519 vec v15 = c7x::strm_eng<1, vec>::get_adv();
520 vec v16 = c7x::strm_eng<1, vec>::get_adv();
521 vec v17 = c7x::strm_eng<1, vec>::get_adv();
522 vec v18 = c7x::strm_eng<1, vec>::get_adv();
524 dataType *pU1 = c7x::strm_agen<3, dataType>::get_adv(reciprocalLoad);
525 vec u1 = __vload_dup(pU1);
526 dataType *pU2 = c7x::strm_agen<3, dataType>::get_adv(reciprocalLoad);
527 vec u2 = __vload_dup(pU2);
529 acc1 += v01 * u1 + v11 * u2;
530 acc2 += v02 * u1 + v12 * u2;
531 acc3 += v03 * u1 + v13 * u2;
532 acc4 += v04 * u1 + v14 * u2;
533 acc5 += v05 * u1 + v15 * u2;
534 acc6 += v06 * u1 + v16 * u2;
535 acc7 += v07 * u1 + v17 * u2;
536 acc8 += v08 * u1 + v18 * u2;
// Odd Nrows: SE0 has one more row than SE1. Fetch its eight vectors and one more scalar
// (the accumulate statements for this case are not visible in this excerpt).
539 if (se1ICNT1 != se0ICNT1)
541 vec v01 = c7x::strm_eng<0, vec>::get_adv();
542 vec v02 = c7x::strm_eng<0, vec>::get_adv();
543 vec v03 = c7x::strm_eng<0, vec>::get_adv();
544 vec v04 = c7x::strm_eng<0, vec>::get_adv();
545 vec v05 = c7x::strm_eng<0, vec>::get_adv();
546 vec v06 = c7x::strm_eng<0, vec>::get_adv();
547 vec v07 = c7x::strm_eng<0, vec>::get_adv();
548 vec v08 = c7x::strm_eng<0, vec>::get_adv();
550 dataType *pU1 = c7x::strm_agen<3, dataType>::get_adv(reciprocalLoad);
551 vec u1 = __vload_dup(pU1);
// Scale each accumulator by 1 / half_norm_squared and store it to siStore through SA2,
// using SA2's predicate on every store.
563 __vpred pred = c7x::strm_agen<2, vec>::get_vpred();
564 vec *pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
565 __vstore_pred(pred, pStoreVec, acc1 * reciprocalHalfNorm);
567 pred = c7x::strm_agen<2, vec>::get_vpred();
568 pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
569 __vstore_pred(pred, pStoreVec, acc2 * reciprocalHalfNorm);
571 pred = c7x::strm_agen<2, vec>::get_vpred();
572 pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
573 __vstore_pred(pred, pStoreVec, acc3 * reciprocalHalfNorm);
575 pred = c7x::strm_agen<2, vec>::get_vpred();
576 pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
577 __vstore_pred(pred, pStoreVec, acc4 * reciprocalHalfNorm);
579 pred = c7x::strm_agen<2, vec>::get_vpred();
580 pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
581 __vstore_pred(pred, pStoreVec, acc5 * reciprocalHalfNorm);
583 pred = c7x::strm_agen<2, vec>::get_vpred();
584 pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
585 __vstore_pred(pred, pStoreVec, acc6 * reciprocalHalfNorm);
587 pred = c7x::strm_agen<2, vec>::get_vpred();
588 pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
589 __vstore_pred(pred, pStoreVec, acc7 * reciprocalHalfNorm);
591 pred = c7x::strm_agen<2, vec>::get_vpred();
592 pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
593 __vstore_pred(pred, pStoreVec, acc8 * reciprocalHalfNorm);
// Pass 2 setup: SA0 and SA1 (slot-4 template) are the store-side counterparts of SE0 and SE1,
// configured with the same row counts, tile count and decrement width as the streams.
598 __SA_TEMPLATE_v1 sa0Params;
599 __SA_TEMPLATE_v1 sa1Params;
601 sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE));
602 sa1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE));
604 sa0Params.ICNT1 = se0ICNT1;
605 sa0Params.ICNT2 = nTiles;
606 sa0Params.DECDIM1_WIDTH = Ncols;
608 sa1Params.ICNT1 = se1ICNT1;
609 sa1Params.ICNT2 = nTiles;
610 sa1Params.DECDIM1_WIDTH = Ncols;
612 __SA0_OPEN(sa0Params);
614 __SA1_OPEN(sa1Params);
// Pass 2, per tile: reload the eight vectors that pass 1 stored to siStore (through SA2, with
// predicated loads) ...
616 for (int32_t tile = 0; tile < nTiles; tile++) {
617 __vpred pred = c7x::strm_agen<2, vec>::get_vpred();
618 vec *pSi = c7x::strm_agen<2, vec>::get_adv(siStore);
619 vec si1 = __vload_pred(pred, pSi);
621 pred = c7x::strm_agen<2, vec>::get_vpred();
622 pSi = c7x::strm_agen<2, vec>::get_adv(siStore);
623 vec si2 = __vload_pred(pred, pSi);
625 pred = c7x::strm_agen<2, vec>::get_vpred();
626 pSi = c7x::strm_agen<2, vec>::get_adv(siStore);
627 vec si3 = __vload_pred(pred, pSi);
629 pred = c7x::strm_agen<2, vec>::get_vpred();
630 pSi = c7x::strm_agen<2, vec>::get_adv(siStore);
631 vec si4 = __vload_pred(pred, pSi);
633 pred = c7x::strm_agen<2, vec>::get_vpred();
634 pSi = c7x::strm_agen<2, vec>::get_adv(siStore);
635 vec si5 = __vload_pred(pred, pSi);
637 pred = c7x::strm_agen<2, vec>::get_vpred();
638 pSi = c7x::strm_agen<2, vec>::get_adv(siStore);
639 vec si6 = __vload_pred(pred, pSi);
641 pred = c7x::strm_agen<2, vec>::get_vpred();
642 pSi = c7x::strm_agen<2, vec>::get_adv(siStore);
643 vec si7 = __vload_pred(pred, pSi);
645 pred = c7x::strm_agen<2, vec>::get_vpred();
646 pSi = c7x::strm_agen<2, vec>::get_adv(siStore);
647 vec si8 = __vload_pred(pred, pSi);
// ... and prime u1/u2 with the first two per-row scalars before entering the row loop.
650 dataType *pU1 = c7x::strm_agen<3, dataType>::get_adv(reciprocalLoad);
651 vec u1 = __vload_dup(pU1);
652 dataType *pU2 = c7x::strm_agen<3, dataType>::get_adv(reciprocalLoad);
653 vec u2 = __vload_dup(pU2);
// Row-pair loop of pass 2. It runs se1ICNT1 - 1 times; the last pair is handled separately after
// the loop. Each iteration fetches eight vectors per stream, updates them (update statements are
// not visible in this excerpt), loads the scalars for the NEXT iteration, and writes the vectors
// back to the locations the streams read from (U + 1 via SA0, U + 1 + colUStride via SA1).
654 for (int32_t vertical = 0; vertical < se1ICNT1 - 1; vertical++) {
655 vec v01 = c7x::strm_eng<0, vec>::get_adv();
656 vec v02 = c7x::strm_eng<0, vec>::get_adv();
657 vec v03 = c7x::strm_eng<0, vec>::get_adv();
658 vec v04 = c7x::strm_eng<0, vec>::get_adv();
659 vec v05 = c7x::strm_eng<0, vec>::get_adv();
660 vec v06 = c7x::strm_eng<0, vec>::get_adv();
661 vec v07 = c7x::strm_eng<0, vec>::get_adv();
662 vec v08 = c7x::strm_eng<0, vec>::get_adv();
664 vec v11 = c7x::strm_eng<1, vec>::get_adv();
665 vec v12 = c7x::strm_eng<1, vec>::get_adv();
666 vec v13 = c7x::strm_eng<1, vec>::get_adv();
667 vec v14 = c7x::strm_eng<1, vec>::get_adv();
668 vec v15 = c7x::strm_eng<1, vec>::get_adv();
669 vec v16 = c7x::strm_eng<1, vec>::get_adv();
670 vec v17 = c7x::strm_eng<1, vec>::get_adv();
671 vec v18 = c7x::strm_eng<1, vec>::get_adv();
// Scalars for the next row pair.
694 pU1 = c7x::strm_agen<3, dataType>::get_adv(reciprocalLoad);
695 u1 = __vload_dup(pU1);
696 pU2 = c7x::strm_agen<3, dataType>::get_adv(reciprocalLoad);
697 u2 = __vload_dup(pU2);
// Write back the SE0-side vectors through SA0.
699 __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
700 vec *p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
701 __vstore_pred(pred1, p1, v01);
703 pred1 = c7x::strm_agen<0, vec>::get_vpred();
704 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
705 __vstore_pred(pred1, p1, v02);
707 pred1 = c7x::strm_agen<0, vec>::get_vpred();
708 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
709 __vstore_pred(pred1, p1, v03);
711 pred1 = c7x::strm_agen<0, vec>::get_vpred();
712 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
713 __vstore_pred(pred1, p1, v04);
715 pred1 = c7x::strm_agen<0, vec>::get_vpred();
716 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
717 __vstore_pred(pred1, p1, v05);
719 pred1 = c7x::strm_agen<0, vec>::get_vpred();
720 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
721 __vstore_pred(pred1, p1, v06);
723 pred1 = c7x::strm_agen<0, vec>::get_vpred();
724 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
725 __vstore_pred(pred1, p1, v07);
727 pred1 = c7x::strm_agen<0, vec>::get_vpred();
728 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
729 __vstore_pred(pred1, p1, v08);
// Write back the SE1-side vectors through SA1.
731 __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
732 vec *p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
733 __vstore_pred(pred2, p2, v11);
735 pred2 = c7x::strm_agen<1, vec>::get_vpred();
736 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
737 __vstore_pred(pred2, p2, v12);
739 pred2 = c7x::strm_agen<1, vec>::get_vpred();
740 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
741 __vstore_pred(pred2, p2, v13);
743 pred2 = c7x::strm_agen<1, vec>::get_vpred();
744 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
745 __vstore_pred(pred2, p2, v14);
747 pred2 = c7x::strm_agen<1, vec>::get_vpred();
748 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
749 __vstore_pred(pred2, p2, v15);
751 pred2 = c7x::strm_agen<1, vec>::get_vpred();
752 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
753 __vstore_pred(pred2, p2, v16);
755 pred2 = c7x::strm_agen<1, vec>::get_vpred();
756 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
757 __vstore_pred(pred2, p2, v17);
759 pred2 = c7x::strm_agen<1, vec>::get_vpred();
760 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
761 __vstore_pred(pred2, p2, v18);
// Peeled final row pair of pass 2: same fetch / update / write-back sequence as the loop above,
// but no further scalars are pulled from SA3 in the visible lines -- presumably so the scalar
// generator is not advanced past the values this pair needs; TODO confirm.
// (The update statements between the fetches and the stores are not visible in this excerpt.)
765 vec v01 = c7x::strm_eng<0, vec>::get_adv();
766 vec v02 = c7x::strm_eng<0, vec>::get_adv();
767 vec v03 = c7x::strm_eng<0, vec>::get_adv();
768 vec v04 = c7x::strm_eng<0, vec>::get_adv();
769 vec v05 = c7x::strm_eng<0, vec>::get_adv();
770 vec v06 = c7x::strm_eng<0, vec>::get_adv();
771 vec v07 = c7x::strm_eng<0, vec>::get_adv();
772 vec v08 = c7x::strm_eng<0, vec>::get_adv();
774 vec v11 = c7x::strm_eng<1, vec>::get_adv();
775 vec v12 = c7x::strm_eng<1, vec>::get_adv();
776 vec v13 = c7x::strm_eng<1, vec>::get_adv();
777 vec v14 = c7x::strm_eng<1, vec>::get_adv();
778 vec v15 = c7x::strm_eng<1, vec>::get_adv();
779 vec v16 = c7x::strm_eng<1, vec>::get_adv();
780 vec v17 = c7x::strm_eng<1, vec>::get_adv();
781 vec v18 = c7x::strm_eng<1, vec>::get_adv();
// Write back the SE0-side vectors through SA0.
804 __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
805 vec *p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
806 __vstore_pred(pred1, p1, v01);
808 pred1 = c7x::strm_agen<0, vec>::get_vpred();
809 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
810 __vstore_pred(pred1, p1, v02);
812 pred1 = c7x::strm_agen<0, vec>::get_vpred();
813 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
814 __vstore_pred(pred1, p1, v03);
816 pred1 = c7x::strm_agen<0, vec>::get_vpred();
817 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
818 __vstore_pred(pred1, p1, v04);
820 pred1 = c7x::strm_agen<0, vec>::get_vpred();
821 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
822 __vstore_pred(pred1, p1, v05);
824 pred1 = c7x::strm_agen<0, vec>::get_vpred();
825 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
826 __vstore_pred(pred1, p1, v06);
828 pred1 = c7x::strm_agen<0, vec>::get_vpred();
829 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
830 __vstore_pred(pred1, p1, v07);
832 pred1 = c7x::strm_agen<0, vec>::get_vpred();
833 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
834 __vstore_pred(pred1, p1, v08);
// Write back the SE1-side vectors through SA1.
836 __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
837 vec *p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
838 __vstore_pred(pred2, p2, v11);
840 pred2 = c7x::strm_agen<1, vec>::get_vpred();
841 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
842 __vstore_pred(pred2, p2, v12);
844 pred2 = c7x::strm_agen<1, vec>::get_vpred();
845 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
846 __vstore_pred(pred2, p2, v13);
848 pred2 = c7x::strm_agen<1, vec>::get_vpred();
849 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
850 __vstore_pred(pred2, p2, v14);
852 pred2 = c7x::strm_agen<1, vec>::get_vpred();
853 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
854 __vstore_pred(pred2, p2, v15);
856 pred2 = c7x::strm_agen<1, vec>::get_vpred();
857 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
858 __vstore_pred(pred2, p2, v16);
860 pred2 = c7x::strm_agen<1, vec>::get_vpred();
861 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
862 __vstore_pred(pred2, p2, v17);
864 pred2 = c7x::strm_agen<1, vec>::get_vpred();
865 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
866 __vstore_pred(pred2, p2, v18);
// Odd Nrows: SE0 carries one unpaired row. Fetch its eight vectors and the matching scalar,
// update them (not visible in this excerpt) and write them back through SA0 only.
870 if (se0ICNT1 != se1ICNT1) {
871 vec v01 = c7x::strm_eng<0, vec>::get_adv();
872 vec v02 = c7x::strm_eng<0, vec>::get_adv();
873 vec v03 = c7x::strm_eng<0, vec>::get_adv();
874 vec v04 = c7x::strm_eng<0, vec>::get_adv();
875 vec v05 = c7x::strm_eng<0, vec>::get_adv();
876 vec v06 = c7x::strm_eng<0, vec>::get_adv();
877 vec v07 = c7x::strm_eng<0, vec>::get_adv();
878 vec v08 = c7x::strm_eng<0, vec>::get_adv();
880 dataType *pU1 = c7x::strm_agen<3, dataType>::get_adv(reciprocalLoad);
881 vec u1 = __vload_dup(pU1);
892 __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
893 vec *p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
894 __vstore_pred(pred1, p1, v01);
896 pred1 = c7x::strm_agen<0, vec>::get_vpred();
897 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
898 __vstore_pred(pred1, p1, v02);
900 pred1 = c7x::strm_agen<0, vec>::get_vpred();
901 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
902 __vstore_pred(pred1, p1, v03);
904 pred1 = c7x::strm_agen<0, vec>::get_vpred();
905 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
906 __vstore_pred(pred1, p1, v04);
908 pred1 = c7x::strm_agen<0, vec>::get_vpred();
909 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
910 __vstore_pred(pred1, p1, v05);
912 pred1 = c7x::strm_agen<0, vec>::get_vpred();
913 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
914 __vstore_pred(pred1, p1, v06);
916 pred1 = c7x::strm_agen<0, vec>::get_vpred();
917 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
918 __vstore_pred(pred1, p1, v07);
920 pred1 = c7x::strm_agen<0, vec>::get_vpred();
921 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
922 __vstore_pred(pred1, p1, v08);
941 float half_norm_squared,
949 double half_norm_squared,
// Fragment of a templated routine; only part of its signature is visible in this excerpt
// (it takes, among others, an output pointer half_norm_squared).
958 template <
typename dataType>
963 dataType *half_norm_squared,
971 typedef typename c7x::make_full_vector<dataType>::type vec;
972 int32_t eleCount = c7x::element_count_of<vec>::value;
// Both streams start from the slot-2 (2D) template.
974 __SE_TEMPLATE_v1 se0Params, se1Params;
976 se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
977 se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
// Ncols elements occupy nVec vectors (rounded up, so the last one may be partial).
979 int32_t nVec = DSPLIB_ceilingDiv(Ncols, eleCount);
// Trip counts for a three-level unrolled reduction: blocks of 12 vectors, then of 4, then of 2.
982 int32_t iterloop1 = nVec / 12;
983 int32_t remainingVec = nVec - (iterloop1 * 12);
986 int32_t iterloop2 = remainingVec / 4;
987 remainingVec = remainingVec - (iterloop2 * 4);
990 int32_t iterloop3 = remainingVec / 2;
// Vector split between the streams: SE0 takes floor(nVec / 2) full vectors, SE1 the rest.
// SE1's element count is whatever remains of Ncols, so any partial vector lands on SE1.
992 int32_t se0Iter = nVec / 2;
993 int32_t se1Iter = nVec - se0Iter;
995 int32_t se0ICNT0 = se0Iter * eleCount;
996 int32_t se1ICNT0 = Ncols - se0ICNT0;
998 se0Params.ICNT0 = se0ICNT0;
999 se1Params.ICNT0 = se1ICNT0;
// SE1 begins immediately after SE0's share of U.
1011 __SE1_OPEN(&U[se0ICNT0], se1Params);
1013 __SE0_OPEN(U, se0Params);
// Reduction that produces `scale`. The accumulate statements are not visible in this excerpt;
// only the stream fetches are shown.
// Level 1: twelve vectors per iteration, six from each stream.
1016 for (iter = 0; iter < iterloop1; iter++) {
1017 vec v1 = c7x::strm_eng<0, vec>::get_adv();
1018 vec v2 = c7x::strm_eng<0, vec>::get_adv();
1019 vec v3 = c7x::strm_eng<0, vec>::get_adv();
1020 vec v4 = c7x::strm_eng<0, vec>::get_adv();
1021 vec v5 = c7x::strm_eng<0, vec>::get_adv();
1022 vec v6 = c7x::strm_eng<0, vec>::get_adv();
1024 vec v7 = c7x::strm_eng<1, vec>::get_adv();
1025 vec v8 = c7x::strm_eng<1, vec>::get_adv();
1026 vec v9 = c7x::strm_eng<1, vec>::get_adv();
1027 vec v10 = c7x::strm_eng<1, vec>::get_adv();
1028 vec v11 = c7x::strm_eng<1, vec>::get_adv();
1029 vec v12 = c7x::strm_eng<1, vec>::get_adv();
// Level 2: four vectors per iteration, two from each stream.
1046 for (iter = 0; iter < iterloop2; iter++) {
1047 vec v1 = c7x::strm_eng<0, vec>::get_adv();
1048 vec v2 = c7x::strm_eng<0, vec>::get_adv();
1049 vec v3 = c7x::strm_eng<1, vec>::get_adv();
1050 vec v4 = c7x::strm_eng<1, vec>::get_adv();
// Level 3: two vectors per iteration, one from each stream.
1058 for (iter = 0; iter < iterloop3; iter++) {
1059 vec v1 = c7x::strm_eng<0, vec>::get_adv();
1060 vec v2 = c7x::strm_eng<1, vec>::get_adv();
// Fold partial accumulators into acc1 (only acc3 and acc5 appear in the visible line).
1070 acc1 = acc1 + acc3 + acc5;
// Odd nVec: SE1 holds one extra vector.
1072 if (se0Iter != se1Iter) {
1073 vec v = c7x::strm_eng<1, vec>::get_adv();
// Cross-lane sum of acc1 gives the scalar `scale`.
1078 c7x_horizontal_add(acc1, &scale);
// Scaled values are stored into U1 starting colUStride elements from its base; SA1's region
// begins se0ICNT0 elements after SA0's.
1082 dataType *normUStore = U1 + colUStride;
1083 dataType *pSA0 = normUStore;
1084 dataType *pSA1 = normUStore + se0ICNT0;
// Lane index, within the last SE1 vector, of the final valid element; consumed after this pass
// together with lastV (lastV itself is assigned in lines not visible in this excerpt).
1087 int32_t lastIndex = 0;
// Re-derive trip counts for this pass: blocks of 4 vectors, then blocks of 2.
1090 iterloop1 = nVec / 4;
1091 remainingVec = nVec - (iterloop1 * 4);
1094 iterloop2 = remainingVec / 2;
// Broadcast 1 / scale so the normalisation is a multiply.
1096 vec reciprocalScale = (vec)
getRecip(scale);
// SA0/SA1 start from the slot-1 template. ICNT1 = 2 gives each generator two sweeps over its
// range -- presumably one for the stores in this pass and one for a later pass that also
// stores through SA0/SA1; TODO confirm.
1103 __SA_TEMPLATE_v1 sa0Params, sa1Params;
1104 sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
1105 sa1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
1107 sa0Params.ICNT0 = se0ICNT0;
1108 sa1Params.ICNT0 = se1ICNT0;
1109 sa0Params.ICNT1 = sa1Params.ICNT1 = 2;
1111 __SA1_OPEN(sa1Params);
1113 __SA0_OPEN(sa0Params);
// NOTE(review): the streams are read again below; presumably they are re-opened in lines not
// visible in this excerpt -- confirm.
// Blocks of four: two vectors from each stream, scaled and stored through the matching generator.
1119 for (iter = 0; iter < iterloop1; iter++) {
1120 v1 = c7x::strm_eng<0, vec>::get_adv();
1121 v2 = c7x::strm_eng<0, vec>::get_adv();
1122 v3 = c7x::strm_eng<1, vec>::get_adv();
1123 v4 = c7x::strm_eng<1, vec>::get_adv();
1125 v1 = v1 * reciprocalScale;
1126 v2 = v2 * reciprocalScale;
1127 v3 = v3 * reciprocalScale;
1128 v4 = v4 * reciprocalScale;
1135 __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
1136 vec *pNormU1 = c7x::strm_agen<0, vec>::get_adv(pSA0);
1137 __vstore_pred(pred1, pNormU1, v1);
1139 __vpred pred2 = c7x::strm_agen<0, vec>::get_vpred();
1140 vec *pNormU2 = c7x::strm_agen<0, vec>::get_adv(pSA0);
1141 __vstore_pred(pred2, pNormU2, v2);
1143 __vpred pred3 = c7x::strm_agen<1, vec>::get_vpred();
1144 vec *pNormU3 = c7x::strm_agen<1, vec>::get_adv(pSA1);
1145 __vstore_pred(pred3, pNormU3, v3);
1147 __vpred pred4 = c7x::strm_agen<1, vec>::get_vpred();
1148 vec *pNormU4 = c7x::strm_agen<1, vec>::get_adv(pSA1);
1149 __vstore_pred(pred4, pNormU4, v4);
// If the blocks of four cover every vector, the final SE1 vector was handled above.
// lastIndex = (elements on SE1) - (elements in all but SE1's last vector) - 1.
1152 if (iterloop1 * 4 == nVec) {
1153 lastIndex = se1ICNT0 - ((se1Iter - 1) * eleCount) - 1;
// Blocks of two: one vector from each stream.
1157 for (iter = 0; iter < iterloop2; iter++) {
1158 v1 = c7x::strm_eng<0, vec>::get_adv();
1159 v2 = c7x::strm_eng<1, vec>::get_adv();
1161 v1 = v1 * reciprocalScale;
1162 v2 = v2 * reciprocalScale;
1167 __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
1168 vec *pNormU1 = c7x::strm_agen<0, vec>::get_adv(pSA0);
1169 __vstore_pred(pred1, pNormU1, v1);
1171 __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
1172 vec *pNormU2 = c7x::strm_agen<1, vec>::get_adv(pSA1);
1173 __vstore_pred(pred2, pNormU2, v2);
// NOTE(review): iterloop2 is derived from remainingVec (nVec minus the blocks of four), yet this
// test compares iterloop2 * 2 with nVec. Since iterloop2 is 0 or 1, the test can only hold for
// nVec of 0 or 2; for e.g. nVec == 6 neither this test nor the one above fires and lastIndex
// appears to stay 0. Confirm whether the comparison should be against remainingVec.
1176 if (iterloop2 * 2 == nVec) {
1177 lastIndex = se1ICNT0 - ((se1Iter - 1) * eleCount) - 1;
// Odd nVec: one extra vector on SE1, stored through SA1; it is the last vector overall.
1184 if (se0Iter != se1Iter) {
1185 v1 = c7x::strm_eng<1, vec>::get_adv();
1186 v1 = v1 * reciprocalScale;
1189 __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
1190 vec *pNormU2 = c7x::strm_agen<1, vec>::get_adv(pSA1);
1191 __vstore_pred(pred2, pNormU2, v1);
1193 lastIndex = se1ICNT0 - ((se1Iter - 1) * eleCount) - 1;
// Cross-lane sum of acc1 gives s2 (the accumulation into acc1 is not visible in this excerpt).
1203 c7x_horizontal_add(acc1, &s2);
// y = s2 * rsqrt(s2), i.e. an estimate of sqrt(s2). rsqrt starts from the __recip_sqrt estimate
// and is refined by two Newton-Raphson steps, x <- x * (1.5 - 0.5 * s2 * x * x).
1205 const dataType Half = 0.5;
1206 const dataType OneP5 = 1.5;
1207 dataType x = __recip_sqrt(s2);
1208 x = x * (OneP5 - (s2 * x * x * Half));
1209 x = x * (OneP5 - (s2 * x * x * Half));
1210 dataType y = s2 * x;
// Branch on the sign of lane lastIndex of lastV; the branch bodies (which presumably choose the
// sign of *s) are not visible in this excerpt.
1212 if (lastV.s[lastIndex] < 0) {
// Outputs: half_norm_squared = (U[0] / scale) * s - s2; U[0] is updated in unscaled units
// (s * scale), and the first normalised element at pSA0 is updated in scaled units (s).
1219 dataType recipScale =
getRecip(scale);
1220 *half_norm_squared = (U[0] * recipScale) * (*s) - s2;
1222 U[0] = U[0] - ((*s) * scale);
1223 *pSA0 = (*pSA0) - (*s);
// Final pass: read back the normalised values just written at pSA0 / pSA1, scale them by
// 1 / half_norm_squared and store the result into superdiag (SA0 for the first part, SA1 for
// the part starting at superdiag[se0ICNT0]).
1227 vec reciprocalHalfNorm = (vec)
getRecip(*half_norm_squared);
1229 __SE1_OPEN(pSA1, se1Params);
1231 __SE0_OPEN(pSA0, se0Params);
// One vector from each stream per iteration.
1233 for (int32_t horizontal = 0; horizontal < nVec - 1; horizontal += 2) {
1234 v1 = c7x::strm_eng<0, vec>::get_adv();
1235 v2 = c7x::strm_eng<1, vec>::get_adv();
1237 v1 = v1 * reciprocalHalfNorm;
1238 v2 = v2 * reciprocalHalfNorm;
1240 __vpred pred = c7x::strm_agen<0, vec>::get_vpred();
1241 vec *pStoreVec = c7x::strm_agen<0, vec>::get_adv(superdiag);
1242 __vstore_pred(pred, pStoreVec, v1);
1244 pred = c7x::strm_agen<1, vec>::get_vpred();
1245 pStoreVec = c7x::strm_agen<1, vec>::get_adv(&superdiag[se0ICNT0]);
1246 __vstore_pred(pred, pStoreVec, v2);
// Odd nVec: the extra vector lives on SE1 / SA1.
1249 if (se0Iter != se1Iter) {
1250 v1 = c7x::strm_eng<1, vec>::get_adv();
1252 v1 = v1 * reciprocalHalfNorm;
1254 __vpred pred = c7x::strm_agen<1, vec>::get_vpred();
1255 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(&superdiag[se0ICNT0]);
1256 __vstore_pred(pred, pStoreVec, v1);
1273 float *half_norm_squared,
1282 double *half_norm_squared,
// Fragment of a templated routine; only part of its signature is visible in this excerpt
// (it takes, among others, a pointer named superdiag).
1292 template <
typename dataType>
1297 dataType *superdiag,
1304 typedef typename c7x::make_full_vector<dataType>::type vec;
1305 int32_t eleCount = c7x::element_count_of<vec>::value;
1307 __SE_TEMPLATE_v1 se0Params, se1Params;
1308 __SA_TEMPLATE_v1 sa0Params, sa1Params, sa2Params, sa3Params;
1309 dataType *siStore = (dataType *) U1;
// Initial template selection: streams from slot 7, SA0/SA1 from slot 8, SA2 from slot 1,
// SA3 from slot 5.
1313 se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (7 * SE_PARAM_SIZE));
1314 se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (7 * SE_PARAM_SIZE));
1315 sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (8 * SE_PARAM_SIZE));
1316 sa1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (8 * SE_PARAM_SIZE));
1317 sa2Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
1318 sa3Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE));
// Rows are grouped into blocks of rowBlock (8). Blocks are split between the streams (SE1 takes
// floor(numBlocks / 2), SE0 the rest), and the rows left over after the last full block are
// split the same way.
1320 int32_t nVec = DSPLIB_ceilingDiv(Ncols, eleCount);
1321 int32_t rowBlock = 8;
1323 int32_t numBlocks = Nrows / (rowBlock);
1324 int32_t se1ICNT3 = numBlocks / 2;
1325 int32_t se0ICNT3 = numBlocks - se1ICNT3;
1327 int32_t remainingRows = Nrows - (numBlocks * rowBlock);
1329 int32_t remSE1ICNT1 = remainingRows / 2;
1330 int32_t remSE0ICNT1 = remainingRows - remSE1ICNT1;
1332 se0Params.ICNT2 = se1Params.ICNT2 = nVec;
1333 se0Params.ICNT3 = se0ICNT3;
1334 se1Params.ICNT3 = se1ICNT3;
1335 se0Params.DECDIM2_WIDTH = se1Params.DECDIM2_WIDTH = Ncols;
// SA2 spans Ncols elements for every row that SE0 handles (full blocks plus its leftover rows).
1337 sa2Params.ICNT0 = Ncols;
1338 sa2Params.ICNT1 = (se0ICNT3 * rowBlock) + remSE0ICNT1;
// SA3 makes a single sweep over Nrows scalars.
1340 sa3Params.ICNT0 = Nrows;
1341 sa3Params.ICNT1 = 1;
// Per-element scalars are read from U1 starting colUStride elements past its base.
1343 dataType *reciprocalLoad = (dataType *) U1 + colUStride;
// NOTE(review): se0Params, se1Params and sa2Params were already loaded (slots 7, 7 and 1) and
// customised in the visible lines that precede this point; the reloads below replace them with
// the slot-0 / slot-1 templates, discarding that earlier setup unless a preprocessor branch or
// other statement hidden from this excerpt separates the two. Confirm whether the earlier
// configuration is dead code.
1345 se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (0 * SE_PARAM_SIZE));
1346 se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (0 * SE_PARAM_SIZE));
1347 sa2Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
// Columns are split between the streams (SE0: floor(Ncols / 2), SE1: the rest); both streams
// iterate over ceil(Nrows / eleCount) vectors of rows.
1349 int32_t se0ICNT0 = Ncols / 2;
1350 int32_t se1ICNT0 = Ncols - se0ICNT0;
1351 int32_t se1ICNT2 = DSPLIB_ceilingDiv(Nrows, eleCount);
1353 se0Params.ICNT0 = se0ICNT0;
1354 se0Params.ICNT2 = se1ICNT2;
1356 se1Params.ICNT0 = se1ICNT0;
1357 se1Params.ICNT2 = se1ICNT2;
// SA2 covers Nrows elements.
1359 sa2Params.ICNT0 = Nrows;
// SA0/SA1 are replaced by the one-element (slot-5) template: each walks its stream's column
// share once per row-vector.
1361 sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE));
1362 sa1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE));
1364 sa0Params.ICNT0 = se0ICNT0;
1365 sa1Params.ICNT0 = se1ICNT0;
1366 sa0Params.ICNT1 = sa1Params.ICNT1 = se1ICNT2;
// Both streams start one U stride into U; SE1 additionally skips SE0's columns.
1368 __SA1_OPEN(sa1Params);
1369 __SE1_OPEN(&U[se0ICNT0 + colUStride], se1Params);
1370 __SA2_OPEN(sa2Params);
1372 __SA0_OPEN(sa0Params);
1373 __SE0_OPEN(&U[colUStride], se0Params);
1374 for (int32_t vertical = 0; vertical < se1ICNT2; vertical++) {
1379 int32_t horizontal = 0;
1380 for (horizontal = 0; horizontal < se0ICNT0 - 1; horizontal += 2) {
1381 vec v1 = c7x::strm_eng<0, vec>::get_adv();
1382 vec v2 = c7x::strm_eng<0, vec>::get_adv();
1384 vec v3 = c7x::strm_eng<1, vec>::get_adv();
1385 vec v4 = c7x::strm_eng<1, vec>::get_adv();
1387 dataType *pU1 = c7x::strm_agen<0, dataType>::get_adv(reciprocalLoad);
1388 vec refCol1 = __vload_dup(pU1);
1389 dataType *pU2 = c7x::strm_agen<0, dataType>::get_adv(reciprocalLoad);
1390 vec refCol2 = __vload_dup(pU2);
1391 dataType *pU3 = c7x::strm_agen<1, dataType>::get_adv(reciprocalLoad + se0ICNT0);
1392 vec refCol3 = __vload_dup(pU3);
1393 dataType *pU4 = c7x::strm_agen<1, dataType>::get_adv(reciprocalLoad + se0ICNT0);
1394 vec refCol4 = __vload_dup(pU4);
1396 acc1 += v1 * refCol1;
1397 acc2 += v2 * refCol2;
1398 acc3 += v3 * refCol3;
1399 acc4 += v4 * refCol4;
1402 for (; horizontal < se0ICNT0; horizontal++) {
1403 vec v1 = c7x::strm_eng<0, vec>::get_adv();
1404 vec v2 = c7x::strm_eng<1, vec>::get_adv();
1406 dataType *pU1 = c7x::strm_agen<0, dataType>::get_adv(reciprocalLoad);
1407 vec refCol1 = __vload_dup(pU1);
1408 dataType *pU2 = c7x::strm_agen<1, dataType>::get_adv(reciprocalLoad + se0ICNT0);
1409 vec refCol2 = __vload_dup(pU2);
1411 acc1 += v1 * refCol1;
1412 acc2 += v2 * refCol2;
1415 if (se0ICNT0 != se1ICNT0) {
1416 vec v1 = c7x::strm_eng<1, vec>::get_adv();
1418 dataType *pU1 = c7x::strm_agen<1, dataType>::get_adv(reciprocalLoad + se0ICNT0);
1419 vec refCol1 = __vload_dup(pU1);
1421 acc1 += v1 * refCol1;
1429 __vpred pred = c7x::strm_agen<2, vec>::get_vpred();
1430 vec *pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
1431 __vstore_pred(pred, pStoreVec, acc1);
1437 dataType *pU1 = c7x::strm_agen<1, dataType>::get_adv(reciprocalLoad + se0ICNT0);
1438 vec refCol1 = __vload_dup(pU1);
1440 for (int32_t vertical = 0; vertical < se1ICNT2; vertical++) {
1441 vec v1 = c7x::strm_eng<1, vec>::get_adv();
1443 vec acc1 = v1 * refCol1;
1445 __vpred pred = c7x::strm_agen<2, vec>::get_vpred();
1446 vec *pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
1447 __vstore_pred(pred, pStoreVec, acc1);
1459 se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE));
1460 se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE));
1461 sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE));
1462 sa1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE));
1463 sa2Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
1464 sa3Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE));
1466 int32_t lenTile = 8;
1467 int32_t nTiles = DSPLIB_ceilingDiv(Ncols, eleCount * lenTile);
1468 int32_t se1ICNT1 = Nrows / 2;
1469 int32_t se0ICNT1 = Nrows - se1ICNT1;
1471 se0Params.ICNT1 = se0ICNT1;
1472 se0Params.ICNT2 = nTiles;
1473 se0Params.DECDIM1_WIDTH = Ncols;
1475 se1Params.ICNT1 = se1ICNT1;
1476 se1Params.ICNT2 = nTiles;
1477 se1Params.DECDIM1_WIDTH = Ncols;
1479 sa2Params.ICNT0 = Ncols;
1481 sa3Params.ICNT0 = Nrows;
1482 sa3Params.ICNT1 = nTiles;
1484 sa0Params.ICNT1 = se0ICNT1;
1485 sa0Params.ICNT2 = nTiles;
1486 sa0Params.DECDIM1_WIDTH = Ncols;
1488 sa1Params.ICNT1 = se1ICNT1;
1489 sa1Params.ICNT2 = nTiles;
1490 sa1Params.DECDIM1_WIDTH = Ncols;
1492 dataType *pSE0 = U + colUStride;
1493 dataType *pSE1 = U + (2 * colUStride);
1495 __SE0_OPEN(pSE0, se0Params);
1496 __SA0_OPEN(sa0Params);
1497 __SA2_OPEN(sa2Params);
1498 __SA3_OPEN(sa3Params);
1500 __SE1_OPEN(pSE1, se1Params);
1501 __SA1_OPEN(sa1Params);
1503 for (int32_t tile = 0; tile < nTiles; tile++) {
1504 __vpred pred = c7x::strm_agen<2, vec>::get_vpred();
1505 vec *pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1506 vec sd1 = __vload_pred(pred, pSd);
1508 pred = c7x::strm_agen<2, vec>::get_vpred();
1509 pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1510 vec sd2 = __vload_pred(pred, pSd);
1512 pred = c7x::strm_agen<2, vec>::get_vpred();
1513 pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1514 vec sd3 = __vload_pred(pred, pSd);
1516 pred = c7x::strm_agen<2, vec>::get_vpred();
1517 pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1518 vec sd4 = __vload_pred(pred, pSd);
1520 pred = c7x::strm_agen<2, vec>::get_vpred();
1521 pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1522 vec sd5 = __vload_pred(pred, pSd);
1524 pred = c7x::strm_agen<2, vec>::get_vpred();
1525 pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1526 vec sd6 = __vload_pred(pred, pSd);
1528 pred = c7x::strm_agen<2, vec>::get_vpred();
1529 pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1530 vec sd7 = __vload_pred(pred, pSd);
1532 pred = c7x::strm_agen<2, vec>::get_vpred();
1533 pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1534 vec sd8 = __vload_pred(pred, pSd);
1537 dataType *pSi1 = c7x::strm_agen<3, dataType>::get_adv(siStore);
1538 vec si1 = __vload_dup(pSi1);
1539 dataType *pSi2 = c7x::strm_agen<3, dataType>::get_adv(siStore);
1540 vec si2 = __vload_dup(pSi2);
1541 for (int32_t vertical = 0; vertical < se1ICNT1 - 1; vertical++) {
1542 vec v01 = c7x::strm_eng<0, vec>::get_adv();
1543 vec v02 = c7x::strm_eng<0, vec>::get_adv();
1544 vec v03 = c7x::strm_eng<0, vec>::get_adv();
1545 vec v04 = c7x::strm_eng<0, vec>::get_adv();
1546 vec v05 = c7x::strm_eng<0, vec>::get_adv();
1547 vec v06 = c7x::strm_eng<0, vec>::get_adv();
1548 vec v07 = c7x::strm_eng<0, vec>::get_adv();
1549 vec v08 = c7x::strm_eng<0, vec>::get_adv();
1551 vec v11 = c7x::strm_eng<1, vec>::get_adv();
1552 vec v12 = c7x::strm_eng<1, vec>::get_adv();
1553 vec v13 = c7x::strm_eng<1, vec>::get_adv();
1554 vec v14 = c7x::strm_eng<1, vec>::get_adv();
1555 vec v15 = c7x::strm_eng<1, vec>::get_adv();
1556 vec v16 = c7x::strm_eng<1, vec>::get_adv();
1557 vec v17 = c7x::strm_eng<1, vec>::get_adv();
1558 vec v18 = c7x::strm_eng<1, vec>::get_adv();
1577 pSi1 = c7x::strm_agen<3, dataType>::get_adv(siStore);
1578 si1 = __vload_dup(pSi1);
1579 pSi2 = c7x::strm_agen<3, dataType>::get_adv(siStore);
1580 si2 = __vload_dup(pSi2);
1582 __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
1583 vec *p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1584 __vstore_pred(pred1, p1, v01);
1586 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1587 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1588 __vstore_pred(pred1, p1, v02);
1590 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1591 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1592 __vstore_pred(pred1, p1, v03);
1594 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1595 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1596 __vstore_pred(pred1, p1, v04);
1598 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1599 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1600 __vstore_pred(pred1, p1, v05);
1602 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1603 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1604 __vstore_pred(pred1, p1, v06);
1606 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1607 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1608 __vstore_pred(pred1, p1, v07);
1610 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1611 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1612 __vstore_pred(pred1, p1, v08);
1614 __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
1615 vec *p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1616 __vstore_pred(pred2, p2, v11);
1618 pred2 = c7x::strm_agen<1, vec>::get_vpred();
1619 p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1620 __vstore_pred(pred2, p2, v12);
1622 pred2 = c7x::strm_agen<1, vec>::get_vpred();
1623 p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1624 __vstore_pred(pred2, p2, v13);
1626 pred2 = c7x::strm_agen<1, vec>::get_vpred();
1627 p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1628 __vstore_pred(pred2, p2, v14);
1630 pred2 = c7x::strm_agen<1, vec>::get_vpred();
1631 p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1632 __vstore_pred(pred2, p2, v15);
1634 pred2 = c7x::strm_agen<1, vec>::get_vpred();
1635 p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1636 __vstore_pred(pred2, p2, v16);
1638 pred2 = c7x::strm_agen<1, vec>::get_vpred();
1639 p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1640 __vstore_pred(pred2, p2, v17);
1642 pred2 = c7x::strm_agen<1, vec>::get_vpred();
1643 p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1644 __vstore_pred(pred2, p2, v18);
1648 vec v01 = c7x::strm_eng<0, vec>::get_adv();
1649 vec v02 = c7x::strm_eng<0, vec>::get_adv();
1650 vec v03 = c7x::strm_eng<0, vec>::get_adv();
1651 vec v04 = c7x::strm_eng<0, vec>::get_adv();
1652 vec v05 = c7x::strm_eng<0, vec>::get_adv();
1653 vec v06 = c7x::strm_eng<0, vec>::get_adv();
1654 vec v07 = c7x::strm_eng<0, vec>::get_adv();
1655 vec v08 = c7x::strm_eng<0, vec>::get_adv();
1657 vec v11 = c7x::strm_eng<1, vec>::get_adv();
1658 vec v12 = c7x::strm_eng<1, vec>::get_adv();
1659 vec v13 = c7x::strm_eng<1, vec>::get_adv();
1660 vec v14 = c7x::strm_eng<1, vec>::get_adv();
1661 vec v15 = c7x::strm_eng<1, vec>::get_adv();
1662 vec v16 = c7x::strm_eng<1, vec>::get_adv();
1663 vec v17 = c7x::strm_eng<1, vec>::get_adv();
1664 vec v18 = c7x::strm_eng<1, vec>::get_adv();
1683 __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
1684 vec *p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1685 __vstore_pred(pred1, p1, v01);
1687 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1688 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1689 __vstore_pred(pred1, p1, v02);
1691 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1692 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1693 __vstore_pred(pred1, p1, v03);
1695 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1696 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1697 __vstore_pred(pred1, p1, v04);
1699 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1700 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1701 __vstore_pred(pred1, p1, v05);
1703 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1704 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1705 __vstore_pred(pred1, p1, v06);
1707 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1708 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1709 __vstore_pred(pred1, p1, v07);
1711 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1712 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1713 __vstore_pred(pred1, p1, v08);
1715 __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
1716 vec *p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1717 __vstore_pred(pred2, p2, v11);
1719 pred2 = c7x::strm_agen<1, vec>::get_vpred();
1720 p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1721 __vstore_pred(pred2, p2, v12);
1723 pred2 = c7x::strm_agen<1, vec>::get_vpred();
1724 p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1725 __vstore_pred(pred2, p2, v13);
1727 pred2 = c7x::strm_agen<1, vec>::get_vpred();
1728 p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1729 __vstore_pred(pred2, p2, v14);
1731 pred2 = c7x::strm_agen<1, vec>::get_vpred();
1732 p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1733 __vstore_pred(pred2, p2, v15);
1735 pred2 = c7x::strm_agen<1, vec>::get_vpred();
1736 p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1737 __vstore_pred(pred2, p2, v16);
1739 pred2 = c7x::strm_agen<1, vec>::get_vpred();
1740 p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1741 __vstore_pred(pred2, p2, v17);
1743 pred2 = c7x::strm_agen<1, vec>::get_vpred();
1744 p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1745 __vstore_pred(pred2, p2, v18);
1749 if (se0ICNT1 != se1ICNT1) {
1750 vec v01 = c7x::strm_eng<0, vec>::get_adv();
1751 vec v02 = c7x::strm_eng<0, vec>::get_adv();
1752 vec v03 = c7x::strm_eng<0, vec>::get_adv();
1753 vec v04 = c7x::strm_eng<0, vec>::get_adv();
1754 vec v05 = c7x::strm_eng<0, vec>::get_adv();
1755 vec v06 = c7x::strm_eng<0, vec>::get_adv();
1756 vec v07 = c7x::strm_eng<0, vec>::get_adv();
1757 vec v08 = c7x::strm_eng<0, vec>::get_adv();
1759 dataType *pSi1 = c7x::strm_agen<3, dataType>::get_adv(siStore);
1760 vec si1 = __vload_dup(pSi1);
1771 __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
1772 vec *p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1773 __vstore_pred(pred1, p1, v01);
1775 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1776 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1777 __vstore_pred(pred1, p1, v02);
1779 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1780 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1781 __vstore_pred(pred1, p1, v03);
1783 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1784 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1785 __vstore_pred(pred1, p1, v04);
1787 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1788 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1789 __vstore_pred(pred1, p1, v05);
1791 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1792 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1793 __vstore_pred(pred1, p1, v06);
1795 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1796 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1797 __vstore_pred(pred1, p1, v07);
1799 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1800 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1801 __vstore_pred(pred1, p1, v08);
1809 for (int32_t tile = 0; tile < nTiles; tile++) {
1810 __vpred pred = c7x::strm_agen<2, vec>::get_vpred();
1811 vec *pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1812 vec sd1 = __vload_pred(pred, pSd);
1814 pred = c7x::strm_agen<2, vec>::get_vpred();
1815 pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1816 vec sd2 = __vload_pred(pred, pSd);
1818 pred = c7x::strm_agen<2, vec>::get_vpred();
1819 pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1820 vec sd3 = __vload_pred(pred, pSd);
1822 pred = c7x::strm_agen<2, vec>::get_vpred();
1823 pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1824 vec sd4 = __vload_pred(pred, pSd);
1826 pred = c7x::strm_agen<2, vec>::get_vpred();
1827 pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1828 vec sd5 = __vload_pred(pred, pSd);
1830 pred = c7x::strm_agen<2, vec>::get_vpred();
1831 pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1832 vec sd6 = __vload_pred(pred, pSd);
1834 pred = c7x::strm_agen<2, vec>::get_vpred();
1835 pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1836 vec sd7 = __vload_pred(pred, pSd);
1838 pred = c7x::strm_agen<2, vec>::get_vpred();
1839 pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1840 vec sd8 = __vload_pred(pred, pSd);
1842 vec v01 = c7x::strm_eng<0, vec>::get_adv();
1843 vec v02 = c7x::strm_eng<0, vec>::get_adv();
1844 vec v03 = c7x::strm_eng<0, vec>::get_adv();
1845 vec v04 = c7x::strm_eng<0, vec>::get_adv();
1846 vec v05 = c7x::strm_eng<0, vec>::get_adv();
1847 vec v06 = c7x::strm_eng<0, vec>::get_adv();
1848 vec v07 = c7x::strm_eng<0, vec>::get_adv();
1849 vec v08 = c7x::strm_eng<0, vec>::get_adv();
1851 dataType *pSi1 = c7x::strm_agen<3, dataType>::get_adv(siStore);
1852 vec si1 = __vload_dup(pSi1);
1863 __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
1864 vec *p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1865 __vstore_pred(pred1, p1, v01);
1867 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1868 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1869 __vstore_pred(pred1, p1, v02);
1871 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1872 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1873 __vstore_pred(pred1, p1, v03);
1875 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1876 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1877 __vstore_pred(pred1, p1, v04);
1879 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1880 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1881 __vstore_pred(pred1, p1, v05);
1883 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1884 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1885 __vstore_pred(pred1, p1, v06);
1887 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1888 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1889 __vstore_pred(pred1, p1, v07);
1891 pred1 = c7x::strm_agen<0, vec>::get_vpred();
1892 p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1893 __vstore_pred(pred1, p1, v08);
template double DSPLIB_bidiag_uCol_halfnorm_ci< double >(double *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, double *half_norm_squared, double *U1, double *s, uint8_t *pBlock)
template void DSPLIB_bidiag_u_init_ci< float >(DSPLIB_kernelHandle handle)
template void DSPLIB_bidiag_u_init_ci< double >(DSPLIB_kernelHandle handle)
void DSPLIB_bidiag_uCol_ci(dataType *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, dataType half_norm_squared, dataType *U1, dataType scale, uint8_t *pBlock)
This function implements the Householder processing on columns of input U matrix corresponding to the n...
template void DSPLIB_bidiag_uCol_ci< double >(double *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, double half_norm_squared, double *U1, double scale, uint8_t *pBlock)
template void DSPLIB_bidiag_uRow_ci< double >(double *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, double *superdiag, double *U1, double scale, uint8_t *pBlock)
template float DSPLIB_bidiag_uRow_halfnorm_ci< float >(float *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, float *half_norm_squared, float *U1, float *s, float *superdiag, uint8_t *pBlock)
void DSPLIB_bidiag_u_init_ci(DSPLIB_kernelHandle handle)
template float DSPLIB_bidiag_uCol_halfnorm_ci< float >(float *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, float *half_norm_squared, float *U1, float *s, uint8_t *pBlock)
template void DSPLIB_bidiag_uRow_ci< float >(float *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, float *superdiag, float *U1, float scale, uint8_t *pBlock)
dataType DSPLIB_bidiag_uRow_halfnorm_ci(dataType *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, dataType *half_norm_squared, dataType *U1, dataType *s, dataType *superdiag, uint8_t *pBlock)
This function calculates the half-norm corresponding to the row of input matrix U and returns scale.
dataType DSPLIB_bidiag_uCol_halfnorm_ci(dataType *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, dataType *half_norm_squared, dataType *U1, dataType *s, uint8_t *pBlock)
This function calculates the half-norms corresponding to the column of input matrix U and returns sca...
void DSPLIB_bidiag_uRow_ci(dataType *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, dataType *superdiag, dataType *U1, dataType scale, uint8_t *pBlock)
This function implements the Householder processing on rows of input U matrix corresponding to the natu...
template double DSPLIB_bidiag_uRow_halfnorm_ci< double >(double *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, double *half_norm_squared, double *U1, double *s, double *superdiag, uint8_t *pBlock)
template void DSPLIB_bidiag_uCol_ci< float >(float *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, float half_norm_squared, float *U1, float scale, uint8_t *pBlock)
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_svd.
dataType getRecip(dataType value)
#define DSPLIB_DEBUGPRINTFN(N, fmt,...)
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
Structure that is reserved for internal use by the kernel.
uint32_t strideU
Stride between rows of U matrix
uint8_t bufPblock[DSPLIB_SVD_IXX_IXX_OXX_PBLOCK_SIZE]
Buffer to save SE & SA configuration parameters