// Body fragment of what appears to be DSPLIB_lud_sol_substitution_init_ci<dataType>()
// (the signature and braces are not part of this listing). It builds seven
// streaming-engine (SE) / streaming-address-generator (SA) templates and stores them
// in pKerPrivArgs->bufPblock, one per SE_PARAM_SIZE-byte slot (slots 0..6), so the
// substitution routines can load them at execution time.
59 uint8_t * pBlock = pKerPrivArgs->
bufPblock;
60 int32_t order = pKerPrivArgs->
order;
// strideMat is declared outside the visible lines; dividing by sizeof(dataType)
// turns it into a stride in elements (assumes strideMat is in bytes -- TODO confirm).
62 int32_t colMatstride = strideMat /
sizeof(dataType);
64 typedef typename c7x::make_full_vector<dataType>::type vec;
65 uint32_t eleCount = c7x::element_count_of<vec>::value;
67 __SE_ELETYPE SE_ELETYPE = c7x::se_eletype<vec>::value;
68 __SE_VECLEN SE_VECLEN = c7x::se_veclen<vec>::value;
69 __SA_VECLEN SA_VECLEN = c7x::sa_veclen<vec>::value;
70 __SE_ELEDUP SE_ELEDUP = c7x::se_eledup<dataType>::value;
// Slot 0: SE template that fetches one element per step and advances by
// (colMatstride + 1) elements, i.e. it walks down the main diagonal for `order` steps.
72 __SE_TEMPLATE_v1 seDiagReadParams = __gen_SE_TEMPLATE_v1();
73 seDiagReadParams.ICNT0 = 1;
74 seDiagReadParams.ICNT1 = order;
75 seDiagReadParams.DIM1 = colMatstride + 1;
76 seDiagReadParams.DIMFMT = __SE_DIMFMT_2D;
77 seDiagReadParams.ELETYPE = SE_ELETYPE;
78 seDiagReadParams.VECLEN = __SE_VECLEN_1ELEM;
79 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (0 * SE_PARAM_SIZE)) = seDiagReadParams;
// Slot 1: SA template addressing one element per step for `order` steps. The stored
// step is -1 (backwards); the users visible in this file that need a forward walk
// overwrite DIM1 with 1 after loading the template.
81 __SA_TEMPLATE_v1 saWriteXParams = __gen_SA_TEMPLATE_v1();
82 saWriteXParams.ICNT0 = 1;
83 saWriteXParams.ICNT1 = order;
84 saWriteXParams.DIM1 = -1;
85 saWriteXParams.DIMFMT = __SA_DIMFMT_2D;
86 saWriteXParams.VECLEN = SA_VECLEN;
87 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE)) = saWriteXParams;
// Slot 2: 1-D SE template over `order` elements, fetched as full vectors.
89 __SE_TEMPLATE_v1 seDivReadParams = __gen_SE_TEMPLATE_v1();
90 seDivReadParams.ICNT0 = order;
91 seDivReadParams.DIMFMT = __SE_DIMFMT_1D;
92 seDivReadParams.ELETYPE = SE_ELETYPE;
93 seDivReadParams.VECLEN = SE_VECLEN;
94 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE)) = seDivReadParams;
// Slot 3: 1-D SA template over `order` elements (the SA counterpart of slot 2).
96 __SA_TEMPLATE_v1 saDivStoreParams = __gen_SA_TEMPLATE_v1();
97 saDivStoreParams.ICNT0 = order;
98 saDivStoreParams.DIMFMT = __SA_DIMFMT_1D;
99 saDivStoreParams.VECLEN = SA_VECLEN;
100 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE)) = saDivStoreParams;
// Slot 4: SE template reading one element per step with element duplication enabled
// (presumably so each scalar fills a whole vector -- confirm against se_eledup).
// ICNT1 is not set here; the substitution code sets it before opening the stream.
102 __SE_TEMPLATE_v1 seReadXParams = __gen_SE_TEMPLATE_v1();
103 seReadXParams.ICNT0 = 1;
104 seReadXParams.DIM1 = -1;
105 seReadXParams.DIMFMT = __SE_DIMFMT_2D;
106 seReadXParams.VECLEN = SE_VECLEN;
107 seReadXParams.ELETYPE = SE_ELETYPE;
108 seReadXParams.ELEDUP = SE_ELEDUP;
109 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE)) = seReadXParams;
// Slot 5: SE template reading one vector (eleCount elements) per row and stepping
// back by one matrix row (-colMatstride) each time. ICNT1 is filled in by the user.
111 __SE_TEMPLATE_v1 seBlockParams = __gen_SE_TEMPLATE_v1();
112 seBlockParams.ICNT0 = eleCount;
113 seBlockParams.DIM1 = -colMatstride;
114 seBlockParams.DIMFMT = __SE_DIMFMT_2D;
115 seBlockParams.ELETYPE = SE_ELETYPE;
116 seBlockParams.VECLEN = SE_VECLEN;
117 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE)) = seBlockParams;
// Slot 6: SA template addressing one vector at a time and stepping backwards by
// eleCount elements. ICNT1 is filled in by the user.
119 __SA_TEMPLATE_v1 saReverseParams = __gen_SA_TEMPLATE_v1();
120 saReverseParams.ICNT0 = eleCount;
121 saReverseParams.DIM1 = -((int32_t)eleCount);
122 saReverseParams.DIMFMT = __SA_DIMFMT_2D;
123 saReverseParams.VECLEN = SA_VECLEN;
124 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (6 * SE_PARAM_SIZE)) = saReverseParams;
// Body fragment of what appears to be DSPLIB_lud_sol_permuteB_init_ci<dataType>()
// (signature and braces are not part of this listing). It prepares the SE template
// used to scan the uint16_t matrix P and stores it in slot 7 of bufPblock.
136 uint8_t * pBlock = pKerPrivArgs->
bufPblock;
137 int32_t order = pKerPrivArgs->
order;
139 typedef typename c7x::make_full_vector<uint16_t>::type vecUINT16;
140 __SE_ELETYPE SE_ELETYPE = c7x::se_eletype<vecUINT16>::value;
141 __SE_VECLEN SE_VECLEN = c7x::se_veclen<vecUINT16>::value;
// Convert the stride of P into uint16_t elements (assumes strideP is in bytes -- TODO confirm).
143 int32_t pStride = pKerPrivArgs->
strideP;
144 int32_t colPStride = pStride /
sizeof(uint16_t);
// Each row fetch covers `order` elements; consecutive fetches are two rows apart
// (DIM1 = 2 * colPStride), so one stream visits every second row. ICNT1 is left for
// the user of the template to fill in.
146 __SE_TEMPLATE_v1 seMatReadParams = __gen_SE_TEMPLATE_v1();
147 seMatReadParams.ICNT0 = order;
148 seMatReadParams.DIM1 = colPStride * 2;
149 seMatReadParams.DIMFMT = __SE_DIMFMT_2D;
150 seMatReadParams.ELETYPE = SE_ELETYPE;
151 seMatReadParams.VECLEN = SE_VECLEN;
153 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (7 * SE_PARAM_SIZE)) = seMatReadParams;
// Fragment of what appears to be DSPLIB_lud_sol_init_ci<dataType>(); most of the
// function (signature, local declarations, return) is missing from this listing.
160 template <
typename dataType>
// The transpose kernel is configured with the dimensions of the U buffer.
178 kerInitArgsMatTrans.
dimX = bufParamsU->
dim_x;
179 kerInitArgsMatTrans.
dimY = bufParamsU->
dim_y;
// Initialise the matrix-transpose kernel first, then the two helpers called with
// `handle` (substitution and permute-B initialisation).
186 DSPLIB_matTrans_init_ci<dataType>(pMatTransKerPrivArgs, bufParamsU, bufParamsScratchTrans, &kerInitArgsMatTrans);
187 DSPLIB_lud_sol_substitution_init_ci<dataType>(handle);
188 DSPLIB_lud_sol_permuteB_init_ci<dataType>(handle);
// Fragment of what appears to be DSPLIB_lud_sol_permuteB_ci<dataType>(): it builds
// B_Mod by picking, for every row of the uint16_t matrix pIn, one element of B.
// Several original lines (signature, braces, the closing #endif, the declarations of
// vMaxIdx1/vMaxIdx2 in the first loop) are missing from this listing.
// Two rows are processed per iteration: SE0 streams rows 0, 2, 4, ... and SE1 streams
// rows 1, 3, 5, ...; a final single-row pass handles an odd `order`.
218 template <
typename dataType>
224 uint32_t * permuteOrder,
229 typedef typename c7x::make_full_vector<uint16_t>::type vec;
230 int32_t eleCount = c7x::element_count_of<vec>::value;
// Both streams start from the template in slot 7 and differ only in row count and
// start address.
232 __SE_TEMPLATE_v1 se0Params, se1Params;
233 se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (7 * SE_PARAM_SIZE));
234 se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (7 * SE_PARAM_SIZE));
// nVec = vectors needed to cover one row; SE1 gets floor(order / 2) rows and SE0 gets
// the rest (one more when order is odd).
236 int32_t nVec = DSPLIB_ceilingDiv(order, eleCount);
237 int32_t se1ICNT1 = order / 2;
238 int32_t se0ICNT1 = order - se1ICNT1;
240 se0Params.ICNT1 = se0ICNT1;
241 se1Params.ICNT1 = se1ICNT1;
243 __SE0_OPEN(pIn, se0Params);
245 vec vecZero = (vec) 0;
246 vec vecOne = (vec) 1;
// Lane-index ramp 0..eleCount-1, spelled out per supported vector width.
248 vec idx_0_to_eleCount;
250 #if (__C7X_VEC_SIZE_BITS__ == 256)
251 idx_0_to_eleCount = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
252 #elif (__C7X_VEC_SIZE_BITS__ == 512)
253 idx_0_to_eleCount = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
254 23, 24, 25, 26, 27, 28, 29, 30, 31);
257 int32_t vertical = 0;
// SE1 starts one row below SE0.
259 __SE1_OPEN(pIn + colPStride, se1Params);
261 for (vertical = 0; vertical < order - 1; vertical += 2) {
263 vec maxValVec1 = (vec) 0;
264 vec maxValVec2 = (vec) 0;
267 vec vCurrIdx1 = idx_0_to_eleCount;
268 vec vCurrIdx2 = idx_0_to_eleCount;
// Sweep the row one vector at a time. Lanes where the fetched element equals zero
// keep their running value/index; other lanes take the new element and its column
// index (assumes __select(p, a, b) yields a for set lanes -- TODO confirm).
270 for (int32_t horizontal = 0; horizontal < nVec; horizontal++) {
271 vec v1 = c7x::strm_eng<0, vec>::get_adv();
272 vec v2 = c7x::strm_eng<1, vec>::get_adv();
274 __vpred cmpPred1 = __cmp_eq_pred(vecZero, v1);
275 __vpred cmpPred2 = __cmp_eq_pred(vecZero, v2);
277 maxValVec1 = __select(cmpPred1, maxValVec1, v1);
278 maxValVec2 = __select(cmpPred2, maxValVec2, v2);
280 vMaxIdx1 = __select(cmpPred1, vMaxIdx1, vCurrIdx1);
281 vMaxIdx2 = __select(cmpPred2, vMaxIdx2, vCurrIdx2);
283 vCurrIdx1 = vCurrIdx1 + (uint16_t) eleCount;
284 vCurrIdx2 = vCurrIdx2 + (uint16_t) eleCount;
// Locate the lane whose accumulated value equals 1 and read the column index stored
// in that lane. The >> 1 presumably converts a byte-granular predicate position into
// a 16-bit lane number -- TODO confirm against the intrinsic's documentation.
287 __vpred cmpPredFinal1 = __cmp_eq_pred(vecOne, maxValVec1);
288 uint32_t tempIdx1 = __rightmost_bit_detect_short(cmpPredFinal1) >> 1;
289 uint32_t finalIdx1 = __vgetuh_vrd(vMaxIdx1, tempIdx1);
291 __vpred cmpPredFinal2 = __cmp_eq_pred(vecOne, maxValVec2);
292 uint32_t tempIdx2 = __rightmost_bit_detect_short(cmpPredFinal2) >> 1;
293 uint32_t finalIdx2 = __vgetuh_vrd(vMaxIdx2, tempIdx2);
// Output row r receives the B element at the column found for row r.
295 B_Mod[vertical + 0] = B[finalIdx1];
296 B_Mod[vertical + 1] = B[finalIdx2];
// Odd order: SE0 still holds one unprocessed row; handle it with the same scheme.
300 if (se0ICNT1 != se1ICNT1) {
302 vec maxValVec1 = (vec) 0;
303 vec vMaxIdx1 = idx_0_to_eleCount;
304 vec vCurrIdx1 = idx_0_to_eleCount;
306 for (int32_t horizontal = 0; horizontal < nVec; horizontal++) {
307 vec v1 = c7x::strm_eng<0, vec>::get_adv();
309 __vpred cmpPred1 = __cmp_eq_pred(vecZero, v1);
311 maxValVec1 = __select(cmpPred1, maxValVec1, v1);
313 vMaxIdx1 = __select(cmpPred1, vMaxIdx1, vCurrIdx1);
315 vCurrIdx1 = vCurrIdx1 + (uint16_t) eleCount;
318 __vpred cmpPredFinal1 = __cmp_eq_pred(vecOne, maxValVec1);
319 uint32_t tempIdx1 = __rightmost_bit_detect_short(cmpPredFinal1) >> 1;
320 uint32_t finalIdx1 = __vgetuh_vrd(vMaxIdx1, tempIdx1);
322 B_Mod[vertical + 0] = B[finalIdx1];
333 uint32_t * permuteOrder,
340 uint32_t * permuteOrder,
// getElement: copies lane `index` of vector `inVec` into *element.
// The primary template is only declared; the float and double overloads below
// provide the implementations visible here (their braces are missing from this listing).
343 template <typename dataType, typename V = typename c7x::make_full_vector<dataType>::type>
344 inline void getElement(V inVec, uint32_t index, dataType *element);
// float overload: reinterpret the vector as int lanes, extract the lane with
// __vgetw_vrd and reinterpret its bits as a float.
345 template <
typename V>
inline void getElement(V inVec, uint32_t index,
float *element)
347 *element = __as_float(__vgetw_vrd(c7x::as_int_vec(inVec), index));
// double overload: same approach with long lanes and __vgetd_vrd.
350 template <
typename V>
inline void getElement(V inVec, uint32_t index,
double *element)
352 *element = __as_double(__vgetd_vrd(c7x::as_long_vec(inVec), index));
// Fragment of what appears to be DSPLIB_lud_sol_forwardSubstitution_ci<dataType>().
// The signature, braces and several statements (e.g. the definitions of pSE0, pSA1,
// pSA2, resultEle1 and the multiply-accumulate lines of the unrolled loop) are
// missing from this listing.
// Visible stages:
//   1. copy the diagonal of pL into pDiv,
//   2. replace pDiv by element-wise reciprocals (hardware estimate + 2 refinements),
//   3. solve block by block (eleCount rows per block), writing results through SA1.
355 template <
typename dataType>
366 __SE_TEMPLATE_v1 seDivReadParams;
367 __SA_TEMPLATE_v1 saDivStoreParams;
368 __SE_TEMPLATE_v1 seDiagReadParams;
369 __SA_TEMPLATE_v1 saDiagWriteParams;
370 seDiagReadParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (0 * SE_PARAM_SIZE));
371 saDiagWriteParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
372 seDivReadParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
373 saDivStoreParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE));
// The stored slot-1 template steps backwards (-1); the diagonal copy writes forwards.
375 saDiagWriteParams.DIM1 = 1;
377 typedef typename c7x::make_full_vector<dataType>::type vec;
378 int32_t eleCount = c7x::element_count_of<vec>::value;
379 int32_t nVec = DSPLIB_ceilingDiv(order, eleCount);
// Stage 1: SE1 walks the diagonal of pL one element at a time; SA1 stores each
// element into consecutive positions of pDiv.
383 __SE1_OPEN(pL, seDiagReadParams);
384 __SA1_OPEN(saDiagWriteParams);
387 for (row = 0; row < order; row++) {
388 vec vecDiag = c7x::strm_eng<1, vec>::get_adv();
390 __vpred predDiag = c7x::strm_agen<1, vec>::get_vpred();
391 vec * pStoreDiag = c7x::strm_agen<1, vec>::get_adv(pDiv);
392 __vstore_pred(predDiag, pStoreDiag, vecDiag);
// Stage 2: read pDiv back as full vectors and overwrite it in place with
// reciprocals. Each vector starts from __recip() and applies y = y * (2 - v * y)
// twice to refine the estimate. The main loop is unrolled four vectors at a time.
397 __SE0_OPEN(pDiv, seDivReadParams);
398 __SA0_OPEN(saDivStoreParams);
399 dataType TwoP0 = 2.0;
402 for (ii = 0; ii < nVec - 3; ii += 4) {
403 vec v1 = c7x::strm_eng<0, vec>::get_adv();
404 vec v2 = c7x::strm_eng<0, vec>::get_adv();
405 vec v3 = c7x::strm_eng<0, vec>::get_adv();
406 vec v4 = c7x::strm_eng<0, vec>::get_adv();
408 vec yy1 = __recip(v1);
409 yy1 = yy1 * (TwoP0 - v1 * yy1);
410 yy1 = yy1 * (TwoP0 - v1 * yy1);
412 vec yy2 = __recip(v2);
413 yy2 = yy2 * (TwoP0 - v2 * yy2);
414 yy2 = yy2 * (TwoP0 - v2 * yy2);
416 vec yy3 = __recip(v3);
417 yy3 = yy3 * (TwoP0 - v3 * yy3);
418 yy3 = yy3 * (TwoP0 - v3 * yy3);
420 vec yy4 = __recip(v4);
421 yy4 = yy4 * (TwoP0 - v4 * yy4);
422 yy4 = yy4 * (TwoP0 - v4 * yy4);
424 __vpred predDiv1 = c7x::strm_agen<0, vec>::get_vpred();
425 vec * pStoreDiv1 = c7x::strm_agen<0, vec>::get_adv(pDiv);
426 __vstore_pred(predDiv1, pStoreDiv1, yy1);
428 __vpred predDiv2 = c7x::strm_agen<0, vec>::get_vpred();
429 vec * pStoreDiv2 = c7x::strm_agen<0, vec>::get_adv(pDiv);
430 __vstore_pred(predDiv2, pStoreDiv2, yy2);
432 __vpred predDiv3 = c7x::strm_agen<0, vec>::get_vpred();
433 vec * pStoreDiv3 = c7x::strm_agen<0, vec>::get_adv(pDiv);
434 __vstore_pred(predDiv3, pStoreDiv3, yy3);
436 __vpred predDiv4 = c7x::strm_agen<0, vec>::get_vpred();
437 vec * pStoreDiv4 = c7x::strm_agen<0, vec>::get_adv(pDiv);
438 __vstore_pred(predDiv4, pStoreDiv4, yy4);
// Remainder of stage 2: leftover vectors, one per iteration.
441 for (; ii < nVec; ii++) {
442 vec v1 = c7x::strm_eng<0, vec>::get_adv();
444 vec yy1 = __recip(v1);
445 yy1 = yy1 * (TwoP0 - v1 * yy1);
446 yy1 = yy1 * (TwoP0 - v1 * yy1);
448 __vpred predDiv1 = c7x::strm_agen<0, vec>::get_vpred();
449 vec * pStoreDiv1 = c7x::strm_agen<0, vec>::get_adv(pDiv);
450 __vstore_pred(predDiv1, pStoreDiv1, yy1);
// Stage 3 set-up: reload templates for the block-wise solve.
460 __SE_TEMPLATE_v1 seBlockParams;
461 __SE_TEMPLATE_v1 seReadYParams;
462 __SA_TEMPLATE_v1 saWriteYParams;
463 __SA_TEMPLATE_v1 sa1DReadParams;
465 saWriteYParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
466 seReadYParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE));
467 seBlockParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE));
// NOTE(review): slot 2 was written as an __SE_TEMPLATE_v1 (seDivReadParams) by the
// init code in this file, yet it is loaded here as an __SA_TEMPLATE_v1. The 1-D SA
// template lives in slot 3 -- confirm whether slot 3 was intended.
468 sa1DReadParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
// Forward substitution walks memory in increasing order, so the negative steps
// stored in the templates are replaced by positive ones.
470 saWriteYParams.DIM1 = 1;
471 seReadYParams.DIM1 = 1;
472 seBlockParams.DIM1 = colLstride;
477 dataType *pSA3 = pDiv;
479 __SA1_OPEN(saWriteYParams);
480 __SA2_OPEN(sa1DReadParams);
481 __SA3_OPEN(sa1DReadParams);
// One iteration per block of eleCount unknowns: load the matching vector through
// SA2 (base pSA2, defined outside the visible lines) and the reciprocals from pDiv.
483 for (int32_t block = 0; block < nVec; block++) {
484 __vpred predB = c7x::strm_agen<2, vec>::get_vpred();
485 vec * pLoadB = c7x::strm_agen<2, vec>::get_adv(pSA2);
486 vec vecB = __vload_pred(predB, pLoadB);
488 __vpred predDiv = c7x::strm_agen<3, vec>::get_vpred();
489 vec * pLoadDiv = c7x::strm_agen<3, vec>::get_adv(pSA3);
490 vec vecDiv = __vload_pred(predDiv, pLoadDiv);
// sumRows = number of unknowns already solved by earlier blocks;
// totalRows additionally includes the eleCount rows of the current block.
492 int32_t sumRows = block * eleCount;
493 int32_t totalRows = sumRows + eleCount;
495 seBlockParams.ICNT1 = totalRows;
496 seReadYParams.ICNT1 = sumRows;
498 __SE0_OPEN(pSE0, seBlockParams);
500 __SE1_OPEN(pY, seReadYParams);
504 vec vecSum = (vec) 0;
505 vec vecSum1 = (vec) 0;
506 vec vecSum2 = (vec) 0;
507 vec vecSum3 = (vec) 0;
508 vec vecSum4 = (vec) 0;
509 int32_t vertical = 0;
// Contribution of the already-solved unknowns, unrolled by four with separate
// accumulators (the accumulate statements themselves are not in this listing).
511 for (vertical = 0; vertical < sumRows - 3; vertical += 4) {
512 vec v1 = c7x::strm_eng<0, vec>::get_adv();
513 vec y1 = c7x::strm_eng<1, vec>::get_adv();
516 vec v2 = c7x::strm_eng<0, vec>::get_adv();
517 vec y2 = c7x::strm_eng<1, vec>::get_adv();
520 vec v3 = c7x::strm_eng<0, vec>::get_adv();
521 vec y3 = c7x::strm_eng<1, vec>::get_adv();
524 vec v4 = c7x::strm_eng<0, vec>::get_adv();
525 vec y4 = c7x::strm_eng<1, vec>::get_adv();
529 vecSum = vecSum1 + vecSum2 + vecSum3 + vecSum4;
// Solve the eleCount unknowns of this block one at a time: compute
// (b - sum) * (1 / diag), fold the new value into the running sum via the next
// streamed vector, and store it through SA1.
533 for (vertical = 0; vertical < eleCount; vertical++) {
534 vec v1 = c7x::strm_eng<0, vec>::get_adv();
535 vec result1 = (vecB - vecSum) * vecDiv;
539 vecSum += v1 * (resultEle1);
541 __vpred predYCalc = c7x::strm_agen<1, vec>::get_vpred();
542 vec * pStoreY = c7x::strm_agen<1, vec>::get_adv(pSA1);
543 __vstore_pred(predYCalc, pStoreY, (vec) resultEle1);
// Fragment of what appears to be DSPLIB_lud_sol_backSubstitution_ci<dataType>().
// The signature, braces and several statements (e.g. the multiply-accumulate lines
// of the unrolled loops and the extraction of resultEle1 / resultEle) are missing
// from this listing.
// Visible stages:
//   1. copy the diagonal of pL into pDiv,
//   2. replace pDiv by element-wise reciprocals (hardware estimate + 2 refinements),
//   3. solve from the last unknown towards the first: full blocks of eleCount
//      unknowns first, then a final partial block of `remainingEle` unknowns.
570 template <
typename dataType>
581 typedef typename c7x::make_full_vector<dataType>::type vec;
582 int32_t eleCount = c7x::element_count_of<vec>::value;
586 __SE_TEMPLATE_v1 seDivReadParams;
587 __SA_TEMPLATE_v1 saDivStoreParams;
588 __SE_TEMPLATE_v1 seDiagReadParams;
589 __SA_TEMPLATE_v1 saDiagWriteParams;
590 seDiagReadParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (0 * SE_PARAM_SIZE));
591 saDiagWriteParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
592 seDivReadParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
593 saDivStoreParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE));
// The stored slot-1 template steps backwards (-1); the diagonal copy writes forwards.
595 saDiagWriteParams.DIM1 = 1;
597 int32_t nVec = DSPLIB_ceilingDiv(order, eleCount);
// Stage 1: SE1 walks the diagonal of pL one element at a time; SA1 stores each
// element into consecutive positions of pDiv.
600 __SE1_OPEN(pL, seDiagReadParams);
601 __SA1_OPEN(saDiagWriteParams);
604 for (row = 0; row < order; row++) {
605 vec vecDiag = c7x::strm_eng<1, vec>::get_adv();
607 __vpred predDiag = c7x::strm_agen<1, vec>::get_vpred();
608 vec * pStoreDiag = c7x::strm_agen<1, vec>::get_adv(pDiv);
609 __vstore_pred(predDiag, pStoreDiag, vecDiag);
// Stage 2: overwrite pDiv in place with reciprocals. Each vector starts from
// __recip() and applies y = y * (2 - v * y) twice; main loop unrolled by four.
614 __SE0_OPEN(pDiv, seDivReadParams);
615 __SA0_OPEN(saDivStoreParams);
616 dataType TwoP0 = 2.0;
619 for (ii = 0; ii < nVec - 3; ii += 4) {
620 vec v1 = c7x::strm_eng<0, vec>::get_adv();
621 vec v2 = c7x::strm_eng<0, vec>::get_adv();
622 vec v3 = c7x::strm_eng<0, vec>::get_adv();
623 vec v4 = c7x::strm_eng<0, vec>::get_adv();
625 vec yy1 = __recip(v1);
626 yy1 = yy1 * (TwoP0 - v1 * yy1);
627 yy1 = yy1 * (TwoP0 - v1 * yy1);
629 vec yy2 = __recip(v2);
630 yy2 = yy2 * (TwoP0 - v2 * yy2);
631 yy2 = yy2 * (TwoP0 - v2 * yy2);
633 vec yy3 = __recip(v3);
634 yy3 = yy3 * (TwoP0 - v3 * yy3);
635 yy3 = yy3 * (TwoP0 - v3 * yy3);
637 vec yy4 = __recip(v4);
638 yy4 = yy4 * (TwoP0 - v4 * yy4);
639 yy4 = yy4 * (TwoP0 - v4 * yy4);
641 __vpred predDiv1 = c7x::strm_agen<0, vec>::get_vpred();
642 vec * pStoreDiv1 = c7x::strm_agen<0, vec>::get_adv(pDiv);
643 __vstore_pred(predDiv1, pStoreDiv1, yy1);
645 __vpred predDiv2 = c7x::strm_agen<0, vec>::get_vpred();
646 vec * pStoreDiv2 = c7x::strm_agen<0, vec>::get_adv(pDiv);
647 __vstore_pred(predDiv2, pStoreDiv2, yy2);
649 __vpred predDiv3 = c7x::strm_agen<0, vec>::get_vpred();
650 vec * pStoreDiv3 = c7x::strm_agen<0, vec>::get_adv(pDiv);
651 __vstore_pred(predDiv3, pStoreDiv3, yy3);
653 __vpred predDiv4 = c7x::strm_agen<0, vec>::get_vpred();
654 vec * pStoreDiv4 = c7x::strm_agen<0, vec>::get_adv(pDiv);
655 __vstore_pred(predDiv4, pStoreDiv4, yy4);
// Remainder of stage 2: leftover vectors, one per iteration.
658 for (; ii < nVec; ii++) {
659 vec v1 = c7x::strm_eng<0, vec>::get_adv();
661 vec yy1 = __recip(v1);
662 yy1 = yy1 * (TwoP0 - v1 * yy1);
663 yy1 = yy1 * (TwoP0 - v1 * yy1);
665 __vpred predDiv1 = c7x::strm_agen<0, vec>::get_vpred();
666 vec * pStoreDiv1 = c7x::strm_agen<0, vec>::get_adv(pDiv);
667 __vstore_pred(predDiv1, pStoreDiv1, yy1);
// Stage 3 set-up: split the unknowns into full vector-sized blocks plus a remainder.
673 int32_t totalBlocks = order / eleCount;
674 int32_t remainingEle = order - (totalBlocks * eleCount);
676 __SE_TEMPLATE_v1 seBlockParams;
677 __SE_TEMPLATE_v1 seReadXParams;
678 __SA_TEMPLATE_v1 saWriteXParams;
679 __SA_TEMPLATE_v1 saReverseParams;
681 saWriteXParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
682 seReadXParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE));
683 seBlockParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE));
684 saReverseParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (6 * SE_PARAM_SIZE));
// Same values the init code stores in slot 4; set again explicitly here.
686 seReadXParams.ICNT0 = 1;
687 seReadXParams.DIM1 = -1;
688 saReverseParams.ICNT1 = totalBlocks;
// Start addresses: everything begins at the end of the data and moves backwards.
// pSE0 points eleCount-1 elements before the last diagonal element, so the first
// vector fetch ends exactly on it; pSA2/pSA3 point at the last full vector of pY/pDiv.
690 dataType *pLLastElem = &pL[(order - 1) + ((order - 1) * colLstride)];
691 dataType *pXLastElem = &pX[order - 1];
692 dataType *pSE0 = pLLastElem - (eleCount - 1);
693 dataType *pSA1 = pX + order - 1;
694 dataType *pSA2 = pY + order - eleCount;
695 dataType *pSA3 = pDiv + order - eleCount;
697 __SA1_OPEN(saWriteXParams);
// Full blocks: only entered when at least one complete vector of unknowns exists.
699 if (totalBlocks > 0) {
700 __SA2_OPEN(saReverseParams);
701 __SA3_OPEN(saReverseParams);
703 for (int32_t block = 0; block < totalBlocks; block++) {
704 __vpred predY = c7x::strm_agen<2, vec>::get_vpred();
705 vec * pLoadY = c7x::strm_agen<2, vec>::get_adv(pSA2);
706 vec vecY = __vload_pred(predY, pLoadY);
708 __vpred predDiv = c7x::strm_agen<3, vec>::get_vpred();
709 vec * pLoadDiv = c7x::strm_agen<3, vec>::get_adv(pSA3);
710 vec vecDiv = __vload_pred(predDiv, pLoadDiv);
// sumRows = unknowns already solved by earlier blocks; totalRows adds this block.
712 int32_t sumRows = block * eleCount;
713 int32_t totalRows = sumRows + eleCount;
715 seBlockParams.ICNT1 = totalRows;
716 seReadXParams.ICNT1 = sumRows;
718 __SE0_OPEN(pSE0, seBlockParams);
720 __SE1_OPEN(pXLastElem, seReadXParams);
724 vec vecSum = (vec) 0;
725 vec vecSum1 = (vec) 0;
726 vec vecSum2 = (vec) 0;
727 vec vecSum3 = (vec) 0;
728 vec vecSum4 = (vec) 0;
729 int32_t vertical = 0;
// Contribution of the already-solved unknowns, unrolled by four with separate
// accumulators (the accumulate statements themselves are not in this listing).
731 for (vertical = 0; vertical < sumRows - 3; vertical += 4) {
732 vec v1 = c7x::strm_eng<0, vec>::get_adv();
733 vec x1 = c7x::strm_eng<1, vec>::get_adv();
736 vec v2 = c7x::strm_eng<0, vec>::get_adv();
737 vec x2 = c7x::strm_eng<1, vec>::get_adv();
740 vec v3 = c7x::strm_eng<0, vec>::get_adv();
741 vec x3 = c7x::strm_eng<1, vec>::get_adv();
744 vec v4 = c7x::strm_eng<0, vec>::get_adv();
745 vec x4 = c7x::strm_eng<1, vec>::get_adv();
749 vecSum = vecSum1 + vecSum2 + vecSum3 + vecSum4;
// Lane counter starting at the last lane of the vector (its use is in lines not
// shown here).
752 uint32_t vecIndex = eleCount - 1;
// Solve the eleCount unknowns of this block one at a time and store each scalar
// through SA1, which starts at pX[order - 1] and steps backwards.
755 for (vertical = 0; vertical < eleCount; vertical++) {
756 vec v1 = c7x::strm_eng<0, vec>::get_adv();
757 vec result1 = (vecY - vecSum) * vecDiv;
759 vecSum += v1 * (resultEle1);
760 dataType *pStoreX1 = c7x::strm_agen<1, dataType>::get_adv(pSA1);
761 *pStoreX1 = resultEle1;
// Partial block: the first `remainingEle` unknowns, which do not fill a whole vector.
// The streams are narrowed to remainingEle elements and the loads come from the
// start of pY / pDiv.
772 if (remainingEle > 0) {
773 seBlockParams.ICNT0 = remainingEle;
774 seBlockParams.ICNT1 = order;
776 seReadXParams.ICNT1 = order;
778 saReverseParams.ICNT0 = remainingEle;
779 saReverseParams.ICNT1 = 1;
780 saReverseParams.DIM1 = 0;
// SE0 now starts at the first column of the last row of pL.
782 pSE0 = &pL[(order - 1) * colLstride];
783 __SE0_OPEN(pSE0, seBlockParams);
784 __SE1_OPEN(pXLastElem, seReadXParams);
785 __SA2_OPEN(saReverseParams);
786 __SA3_OPEN(saReverseParams);
// All unknowns covered by the full blocks are already solved at this point.
788 int32_t sumRows = totalBlocks * eleCount;
790 __vpred predY = c7x::strm_agen<2, vec>::get_vpred();
791 vec * pLoadY = c7x::strm_agen<2, vec>::get_adv(pY);
792 vec vecY = __vload_pred(predY, pLoadY);
794 __vpred predDiv = c7x::strm_agen<3, vec>::get_vpred();
795 vec * pLoadDiv = c7x::strm_agen<3, vec>::get_adv(pDiv);
796 vec vecDiv = __vload_pred(predDiv, pLoadDiv);
798 vec vecSum = (vec) 0;
799 vec vecSum1 = (vec) 0;
800 vec vecSum2 = (vec) 0;
801 vec vecSum3 = (vec) 0;
802 vec vecSum4 = (vec) 0;
803 int32_t vertical = 0;
805 for (vertical = 0; vertical < sumRows - 3; vertical += 4) {
806 vec v1 = c7x::strm_eng<0, vec>::get_adv();
807 vec x1 = c7x::strm_eng<1, vec>::get_adv();
810 vec v2 = c7x::strm_eng<0, vec>::get_adv();
811 vec x2 = c7x::strm_eng<1, vec>::get_adv();
814 vec v3 = c7x::strm_eng<0, vec>::get_adv();
815 vec x3 = c7x::strm_eng<1, vec>::get_adv();
818 vec v4 = c7x::strm_eng<0, vec>::get_adv();
819 vec x4 = c7x::strm_eng<1, vec>::get_adv();
823 vecSum = vecSum1 + vecSum2 + vecSum3 + vecSum4;
// Lane counter starting at the last valid lane of the partial vector.
825 int32_t vecIndex = remainingEle - 1;
828 for (vertical = 0; vertical < remainingEle; vertical++) {
829 vec v1 = c7x::strm_eng<0, vec>::get_adv();
830 vec result = (vecY - vecSum) * vecDiv;
833 vecSum += v1 * (resultEle);
835 dataType *pStoreX = c7x::strm_agen<1, dataType>::get_adv(pSA1);
836 *pStoreX = resultEle;
// Fragment of what appears to be DSPLIB_lud_sol_ci<dataType>(), the driver that
// chains the solver stages. Most of the signature, the braces, the return statement
// and the declarations of strideOrder and pMatTransKerPrivArgs are missing from
// this listing.
// Visible sequence: permute B into B_Mod, transpose L into the scratch matrix and
// run forward substitution (result in Y), then transpose U into the same scratch
// matrix and run back substitution (result in X).
862 template <
typename dataType>
872 dataType *pScratchTrans)
878 int32_t order = pKerPrivArgs->
order;
880 int32_t strideP = pKerPrivArgs->
strideP;
881 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
883 int32_t dataSize =
sizeof(dataType);
884 int32_t dataSizeP =
sizeof(
unsigned short);
// Strides are divided by the element size to obtain strides in elements
// (assumes the stored strides are in bytes -- TODO confirm).
886 int32_t orderStride = strideOrder / dataSize;
887 int32_t orderPStride = strideP / dataSizeP;
// permuteOrder is passed as NULL here.
892 DSPLIB_lud_sol_permuteB_ci<dataType>(P, B, B_Mod, order, orderPStride, NULL, pBlock);
898 DSPLIB_matTrans_exec_ci<dataType>(pMatTransKerPrivArgs, L, pScratchTrans);
900 DSPLIB_lud_sol_forwardSubstitution_ci<dataType>(pScratchTrans, Y, B_Mod, pDiv, order, orderStride, pBlock);
// pScratchTrans is reused for the transposed U; pDiv is shared by both substitutions.
906 DSPLIB_matTrans_exec_ci<dataType>(pMatTransKerPrivArgs, U, pScratchTrans);
907 DSPLIB_lud_sol_backSubstitution_ci<dataType>(pScratchTrans, X, Y, pDiv, order, orderStride, pBlock);
922 float *pScratchTrans);
932 double *pScratchTrans);
// Fragment of what appears to be DSPLIB_lud_sol_exec_ci<dataType>(), the public
// execution entry point. Most of the signature, the braces, the start of the debug
// print call and the return statement are missing from this listing.
// It casts the untyped buffers to their element types, carves three work vectors
// out of pVecScratch and forwards everything to DSPLIB_lud_sol_ci<dataType>().
934 template <
typename dataType>
941 void *restrict pVecScratch,
942 void *restrict pScratchTrans)
948 int32_t strideVec = pKerPrivArgs->
strideVec;
951 unsigned short *pPLocal = (
unsigned short *) pP;
952 dataType * pLLocal = (dataType *) pL;
953 dataType * pULocal = (dataType *) pU;
954 dataType * pBLocal = (dataType *) pB;
// pVecScratch is split into rows of strideVec / sizeof(dataType) elements:
// row 0 = B_Mod, row 1 = Y, row 2 = Div (declared a few lines below).
955 dataType * pB_ModLocal = (dataType *) (pVecScratch) + (0 * strideVec /
sizeof(dataType));
956 dataType * pYLocal = (dataType *) (pVecScratch) + (1 * strideVec /
sizeof(dataType));
957 dataType * pXLocal = (dataType *) pX;
958 dataType * pDivLocal = (dataType *) (pVecScratch) + (2 * strideVec /
sizeof(dataType));
959 dataType * pTransLocal = (dataType *) pScratchTrans;
962 0,
"pPLocal: %p pLLocal: %p pULocal: %p pBLocal: %p pB_ModLocal: %p pYLocal: %p pXLocal: %p order: %d\n",
963 pPLocal, pLLocal, pULocal, pBLocal, pB_ModLocal, pYLocal, pXLocal, pKerPrivArgs->
order);
965 DSPLIB_lud_sol_ci<dataType>(pKerPrivArgs, pPLocal, pLLocal, pULocal, pBLocal, pB_ModLocal, pYLocal, pXLocal,
966 pDivLocal, pTransLocal);
979 void *restrict pVecScratch,
980 void *restrict pScratchTrans);
988 void *restrict pVecScratch,
989 void *restrict pScratchTrans);
static void DSPLIB_lud_sol_backSubstitution_ci(dataType *pL, dataType *pX, dataType *pY, dataType *pDiv, int32_t order, int32_t colLstride, uint8_t *pBlock)
template void DSPLIB_lud_sol_permuteB_init_ci< float >(DSPLIB_kernelHandle handle)
int DSPLIB_lud_sol_ci(DSPLIB_lud_sol_PrivArgs *pKerPrivArgs, unsigned short *P, dataType *L, dataType *U, dataType *B, dataType *B_Mod, dataType *Y, dataType *X, dataType *pDiv, dataType *pScratchTrans)
static void DSPLIB_lud_sol_forwardSubstitution_ci(dataType *pL, dataType *pY, dataType *pB, dataType *pDiv, int32_t order, int32_t colLstride, uint8_t *pBlock)
void DSPLIB_lud_sol_permuteB_ci(unsigned short *pIn, dataType *B, dataType *B_Mod, int32_t order, int32_t colPStride, uint32_t *permuteOrder, uint8_t *pBlock)
template DSPLIB_STATUS DSPLIB_lud_sol_exec_ci< float >(DSPLIB_kernelHandle handle, void *restrict pP, void *restrict pL, void *restrict pU, void *restrict pB, void *restrict pX, void *restrict pVecScratch, void *restrict pScratchTrans)
void getElement(V inVec, uint32_t index, dataType *element)
template int DSPLIB_lud_sol_ci< float >(DSPLIB_lud_sol_PrivArgs *pKerPrivArgs, unsigned short *P, float *L, float *U, float *B, float *B_Mod, float *Y, float *X, float *pDiv, float *pScratchTrans)
template void DSPLIB_lud_sol_forwardSubstitution_ci< float >(float *pL, float *pX, float *pY, float *pDiv, int32_t order, int32_t colLstride, uint8_t *pBlock)
template DSPLIB_STATUS DSPLIB_lud_sol_init_ci< float >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsP, const DSPLIB_bufParams2D_t *bufParamsL, const DSPLIB_bufParams2D_t *bufParamsU, const DSPLIB_bufParams1D_t *bufParamsB, const DSPLIB_bufParams1D_t *bufParamsX, const DSPLIB_bufParams2D_t *bufParamsVecScratch, const DSPLIB_bufParams2D_t *bufParamsScratchTrans, const DSPLIB_lud_solInitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_lud_sol_exec_ci< double >(DSPLIB_kernelHandle handle, void *restrict pP, void *restrict pL, void *restrict pU, void *restrict pB, void *restrict pX, void *restrict pVecScratch, void *restrict pScratchTrans)
DSPLIB_STATUS DSPLIB_lud_sol_exec_ci(DSPLIB_kernelHandle handle, void *restrict pP, void *restrict pL, void *restrict pU, void *restrict pB, void *restrict pX, void *restrict pVecScratch, void *restrict pScratchTrans)
This function is the main execution function for the C7x implementation of the kernel....
template void DSPLIB_lud_sol_substitution_init_ci< double >(DSPLIB_kernelHandle handle)
template void DSPLIB_lud_sol_substitution_init_ci< float >(DSPLIB_kernelHandle handle)
template void DSPLIB_lud_sol_forwardSubstitution_ci< double >(double *pL, double *pX, double *pY, double *pDiv, int32_t order, int32_t colLstride, uint8_t *pBlock)
void DSPLIB_lud_sol_permuteB_init_ci(DSPLIB_kernelHandle handle)
template void DSPLIB_lud_sol_backSubstitution_ci< double >(double *pL, double *pX, double *pY, double *pDiv, int32_t order, int32_t colLstride, uint8_t *pBlock)
template void DSPLIB_lud_sol_permuteB_ci< float >(unsigned short *pIn, float *B, float *B_Mod, int32_t order, int32_t colPStride, uint32_t *permuteOrder, uint8_t *pBlock)
void DSPLIB_lud_sol_substitution_init_ci(DSPLIB_kernelHandle handle)
template DSPLIB_STATUS DSPLIB_lud_sol_init_ci< double >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsP, const DSPLIB_bufParams2D_t *bufParamsL, const DSPLIB_bufParams2D_t *bufParamsU, const DSPLIB_bufParams1D_t *bufParamsB, const DSPLIB_bufParams1D_t *bufParamsX, const DSPLIB_bufParams2D_t *bufParamsVecScratch, const DSPLIB_bufParams2D_t *bufParamsScratchTrans, const DSPLIB_lud_solInitArgs *pKerInitArgs)
template void DSPLIB_lud_sol_permuteB_ci< double >(unsigned short *pIn, double *B, double *B_Mod, int32_t order, int32_t colPStride, uint32_t *permuteOrder, uint8_t *pBlock)
DSPLIB_STATUS DSPLIB_lud_sol_init_ci(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsP, const DSPLIB_bufParams2D_t *bufParamsL, const DSPLIB_bufParams2D_t *bufParamsU, const DSPLIB_bufParams1D_t *bufParamsB, const DSPLIB_bufParams1D_t *bufParamsX, const DSPLIB_bufParams2D_t *bufParamsVecScratch, const DSPLIB_bufParams2D_t *bufParamsScratchTrans, const DSPLIB_lud_solInitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
template void DSPLIB_lud_sol_permuteB_init_ci< double >(DSPLIB_kernelHandle handle)
template void DSPLIB_lud_sol_backSubstitution_ci< float >(float *pL, float *pX, float *pY, float *pDiv, int32_t order, int32_t colLstride, uint8_t *pBlock)
template int DSPLIB_lud_sol_ci< double >(DSPLIB_lud_sol_PrivArgs *pKerPrivArgs, unsigned short *P, double *L, double *U, double *B, double *B_Mod, double *Y, double *X, double *pDiv, double *pScratchTrans)
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_lud_sol.
#define DSPLIB_DEBUGPRINTFN(N, fmt,...)
DSPLIB_STATUS_NAME
The enumeration of all status codes.
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
A structure for a 1 dimensional buffer descriptor.
A structure for a 2 dimensional buffer descriptor.
int32_t stride_y
Stride in Y dimension in bytes.
uint32_t dim_x
Width of buffer in X dimension in elements.
uint32_t dim_y
Height of buffer in Y dimension in elements.
Structure containing the parameters to initialize the kernel.
int8_t funcStyle
Variant of the function refer to DSPLIB_FUNCTION_STYLE
Structure that is reserved for internal use by the kernel.
int32_t strideOrder
Stride between rows of input and output data matrix
DSPLIB_matTrans_PrivArgs pMatTransKerPrivArgs
Privargs for the matTrans kernel.
uint8_t bufPblock[DSPLIB_LUD_SOL_IXX_IXX_OXX_PBLOCK_SIZE]
Buffer to save SE & SA configuration parameters
int32_t order
Size of input buffer for different batches DSPLIB_lud_sol_init that will be retrieved and used by DSP...
int32_t strideP
Stride between rows of input data matrix P
int32_t strideVec
Stride between rows of scratch data matrix
Structure containing the parameters to initialize the kernel.
uint32_t dimX
Size of input data.
int8_t funcStyle
Variant of the function refer to DSPLIB_FUNCTION_STYLE
Structure that is reserved for internal use by the kernel.
int32_t strideOut
Stride between rows of output data matrix
uint32_t heightIn
Height of input data matrix
int32_t strideIn
Stride between rows of input data matrix
uint32_t widthIn
Size of input buffer for different batches DSPLIB_matTrans_init that will be retrieved and used by DS...