// Compile-time switch tested by the `#ifdef LUD_INV_HIGH_PRECISION` guard further down in this
// file, where the guarded line forms the reciprocal of the diagonal element as `1 / diag`.
49 #define LUD_INV_HIGH_PRECISION
// Body fragment of a parameter-setup routine. Its signature is not visible in this excerpt;
// presumably it is DSPLIB_lud_inv_permuteRows_init_ci, which is called later in this file —
// TODO confirm. It builds streaming-engine (SE) and streaming-address-generator (SA) templates
// and stores them in pKerPrivArgs->bufPblock at byte offsets 6, 7, 8 and 9 * SE_PARAM_SIZE.
61 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
62 int32_t order = pKerPrivArgs->
order;
// `stride` is declared on a line not shown here. Dividing by sizeof(dataType) suggests a byte
// count being converted to an element count — TODO confirm.
64 int32_t colStride = stride /
sizeof(dataType);
// Element type and vector lengths are derived from the full-width vector of dataType.
66 typedef typename c7x::make_full_vector<dataType>::type vec;
67 __SE_ELETYPE SE_ELETYPE = c7x::se_eletype<vec>::value;
68 __SE_VECLEN SE_VECLEN = c7x::se_veclen<vec>::value;
69 __SA_VECLEN SA_VECLEN = c7x::sa_veclen<vec>::value;
// 2D SE read template: `order` elements per inner dimension, and the second dimension steps by
// two element-strides. ICNT1 is not set here; code later in this file that loads slot 6 fills
// in ICNT1 and opens SE0 at pIn and SE1 at pIn + colStride.
71 __SE_TEMPLATE_v1 seMatReadParams = __gen_SE_TEMPLATE_v1();
72 seMatReadParams.ICNT0 = order;
73 seMatReadParams.DIM1 = colStride * 2;
74 seMatReadParams.DIMFMT = __SE_DIMFMT_2D;
75 seMatReadParams.ELETYPE = SE_ELETYPE;
76 seMatReadParams.VECLEN = SE_VECLEN;
// 2D SA template for writes: `order` elements per inner dimension with DIM1 = 0, so the second
// dimension adds no offset of its own. ICNT1 is left for the consumer to set.
78 __SA_TEMPLATE_v1 saRowWriteParams = __gen_SA_TEMPLATE_v1();
79 saRowWriteParams.ICNT0 = order;
80 saRowWriteParams.DIM1 = 0;
81 saRowWriteParams.DIMFMT = __SA_DIMFMT_2D;
82 saRowWriteParams.VECLEN = SA_VECLEN;
// 1D SA template that advances one element at a time over `order` entries.
84 __SA_TEMPLATE_v1 saPermParams = __gen_SA_TEMPLATE_v1();
85 saPermParams.ICNT0 = order;
86 saPermParams.DIMFMT = __SA_DIMFMT_1D;
87 saPermParams.VECLEN = __SA_VECLEN_1ELEM;
// Store the three templates in slots 6 (SE read), 7 (SA row write) and 8 (SA permutation).
89 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (6 * SE_PARAM_SIZE)) = seMatReadParams;
90 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (7 * SE_PARAM_SIZE)) = saRowWriteParams;
91 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (8 * SE_PARAM_SIZE)) = saPermParams;
// Re-derive the element type and vector lengths for uint16_t data.
// NOTE(review): SA_VECLEN is reassigned here but not read again in the visible lines.
93 typedef typename c7x::make_full_vector<uint16_t>::type vecUINT16;
94 SE_ELETYPE = c7x::se_eletype<vecUINT16>::value;
95 SE_VECLEN = c7x::se_veclen<vecUINT16>::value;
96 SA_VECLEN = c7x::sa_veclen<vecUINT16>::value;
// strideP divided by sizeof(uint16_t): presumably a byte stride converted to uint16_t elements —
// TODO confirm.
97 int32_t pStride = pKerPrivArgs->
strideP;
98 int32_t colPStride = pStride /
sizeof(uint16_t);
// Same 2D read layout as above, rebuilt from a fresh template for the uint16_t matrix.
100 seMatReadParams = __gen_SE_TEMPLATE_v1();
101 seMatReadParams.ICNT0 = order;
102 seMatReadParams.DIM1 = colPStride * 2;
103 seMatReadParams.DIMFMT = __SE_DIMFMT_2D;
104 seMatReadParams.ELETYPE = SE_ELETYPE;
105 seMatReadParams.VECLEN = SE_VECLEN;
// Slot 9 holds the uint16_t read template.
107 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (9 * SE_PARAM_SIZE)) = seMatReadParams;
// Body fragment of a second parameter-setup routine. Its signature is not visible; presumably
// it is DSPLIB_lud_inv_opt_init_ci, which is called later in this file — TODO confirm. It fills
// slots 1..5 of pKerPrivArgs->bufPblock (byte offsets k * SE_PARAM_SIZE) with SE/SA templates.
118 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
120 int32_t order = pKerPrivArgs->
order;
// `strideOrder` is declared on a line not shown here. The division suggests a byte stride
// converted to an element count — TODO confirm.
121 int32_t colStrideOrder = strideOrder /
sizeof(dataType);
// Delegate set-up of the identity-matrix generator, which receives the same parameter block.
123 DSPLIB_lud_identity_matrix_generate_init_ci<dataType>(pBlock, order, strideOrder);
125 typedef typename c7x::make_full_vector<dataType>::type vec;
127 __SE_ELETYPE SE_ELETYPE = c7x::se_eletype<vec>::value;
128 __SE_VECLEN SE_VECLEN = c7x::se_veclen<vec>::value;
129 __SA_VECLEN SA_VECLEN = c7x::sa_veclen<vec>::value;
132 __SE_TEMPLATE_v1 se0Params = __gen_SE_TEMPLATE_v1();
133 __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();
// Repeats the `vec` typedef above with the same type.
135 typedef typename c7x::make_full_vector<dataType>::type vec;
137 uint32_t eleCount = c7x::element_count_of<vec>::value;
// 3D transposing SE template: eleCount entries in dimension 1 spaced by one element-stride, and
// dimension 2 advancing by eleCount element-strides. ICNT0 and ICNT2 are not set in the visible
// lines; the factor routine later in this file sets ICNT2 on its copies before opening.
139 se0Params.ICNT1 = eleCount;
140 se0Params.DIM1 = colStrideOrder;
141 se0Params.DIM2 = colStrideOrder * eleCount;
142 se0Params.DIMFMT = __SE_DIMFMT_3D;
143 se0Params.ELETYPE = SE_ELETYPE;
144 se0Params.VECLEN = SE_VECLEN;
// Transpose granularity follows the element size: 32-bit when sizeof(dataType) == 4. The 64-bit
// assignment is presumably the else branch; the lines between the two are not visible here.
145 if (
sizeof(dataType) == 4) {
146 se0Params.TRANSPOSE = __SE_TRANSPOSE_32BIT;
149 se0Params.TRANSPOSE = __SE_TRANSPOSE_64BIT;
// 1D SA template covering `order` elements.
152 sa0Params.ICNT0 = order;
153 sa0Params.VECLEN = SA_VECLEN;
154 sa0Params.DIMFMT = __SA_DIMFMT_1D;
// Slot 1: transposing SE template. Slot 2: 1D SA template.
156 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE)) = se0Params;
157 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE)) = sa0Params;
// Number of tiles when each tile is 8 full vectors wide (eleCount * 8 elements).
160 int32_t lenTile8 = 8;
161 int32_t nTiles_8 = DSPLIB_ceilingDiv(order, (eleCount * lenTile8));
163 __SE_ELEDUP SE_ELEDUP = c7x::se_eledup<dataType>::value;
165 __SE_TEMPLATE_v1 seScalarParams = __gen_SE_TEMPLATE_v1();
166 __SE_TEMPLATE_v1 seMatrixParams = __gen_SE_TEMPLATE_v1();
167 __SA_TEMPLATE_v1 saMatrixParams = __gen_SA_TEMPLATE_v1();
// Scalar stream: 2D with DIM1 = 0 and element duplication enabled. ICNT0/ICNT1 are not set here;
// the routine later in this file that loads slot 3 assigns them before opening the stream.
169 seScalarParams.DIM1 = 0;
170 seScalarParams.ELEDUP = SE_ELEDUP;
171 seScalarParams.DIMFMT = __SE_DIMFMT_2D;
172 seScalarParams.VECLEN = SE_VECLEN;
173 seScalarParams.ELETYPE = SE_ELETYPE;
// Matrix read stream: 3D, an 8-vector-wide tile per inner dimension, one element-stride per
// dimension-1 step, and tiles laid side by side in dimension 2. DECDIM1 is tied to DIM2 with a
// width of `order`. ICNT1 is left unset here.
175 seMatrixParams.ICNT0 = (eleCount * lenTile8);
176 seMatrixParams.DIM1 = colStrideOrder;
177 seMatrixParams.ICNT2 = nTiles_8;
178 seMatrixParams.DIM2 = (eleCount * lenTile8);
179 seMatrixParams.DIMFMT = __SE_DIMFMT_3D;
180 seMatrixParams.ELETYPE = SE_ELETYPE;
181 seMatrixParams.VECLEN = SE_VECLEN;
182 seMatrixParams.DECDIM1 = __SE_DECDIM_DIM2;
183 seMatrixParams.DECDIM1_WIDTH = order;
// Matrix SA template mirrors the SE matrix layout field for field.
185 saMatrixParams.ICNT0 = (eleCount * lenTile8);
186 saMatrixParams.DIM1 = colStrideOrder;
187 saMatrixParams.ICNT2 = nTiles_8;
188 saMatrixParams.DIM2 = (eleCount * lenTile8);
189 saMatrixParams.DIMFMT = __SA_DIMFMT_3D;
190 saMatrixParams.VECLEN = SA_VECLEN;
191 saMatrixParams.DECDIM1 = __SA_DECDIM_DIM2;
192 saMatrixParams.DECDIM1_WIDTH = order;
// Slot 3: scalar SE. Slot 4: matrix SE. Slot 5: matrix SA.
194 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE)) = seScalarParams;
195 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE)) = seMatrixParams;
196 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE)) = saMatrixParams;
// Fragment of the top-level initialisation template. Its name and parameter list are not visible
// in this excerpt. The visible statements configure the matrix-transpose and matrix-multiply
// sub-kernels and then invoke the two LUD-inverse parameter-setup routines.
203 template <
typename dataType>
// NOTE(review): colStrideOrder is not read again in the visible lines of this fragment.
213 int32_t colStrideOrder = pKerPrivArgs->
strideOrder /
sizeof(dataType);
// The transpose kernel takes its dimensions from the U buffer descriptor.
223 kerInitArgsMatTrans.
dimX = bufParamsU->
dim_x;
224 kerInitArgsMatTrans.
dimY = bufParamsU->
dim_y;
// The matrix multiply is square: M, N and K all equal `order`.
235 pMatMulKerPrivArgs->
M = pKerPrivArgs->
order;
236 pMatMulKerPrivArgs->
N = pKerPrivArgs->
order;
237 pMatMulKerPrivArgs->
K = pKerPrivArgs->
order;
// Initialise the sub-kernels, then the two LUD-inverse setup routines that take the handle.
239 DSPLIB_matTrans_init_ci<dataType>(pMatTransKerPrivArgs, bufParamsU, bufParamsinvA, &kerInitArgsMatTrans);
240 DSPLIB_matMul_init_ci<dataType>(pMatMulKerPrivArgs, bufParamsL, bufParamsinvA, bufParamsU, &kerInitArgsMatMul);
242 DSPLIB_lud_inv_opt_init_ci<dataType>(handle);
243 DSPLIB_lud_inv_permuteRows_init_ci<dataType>(handle);
// Fragment of a row-permutation routine. Its name and most of its parameter list are not
// visible; the block name used for this edit is a guess — TODO confirm. Two streaming engines
// read the input: SE0 is opened at pIn and SE1 at pIn + colStride. Each row read is stored to
// pOut at an element offset of (permuteOrder entry * colStride).
270 template <
typename dataType>
274 uint32_t *permuteOrder,
280 typedef typename c7x::make_full_vector<dataType>::type vec;
281 int32_t eleCount = c7x::element_count_of<vec>::value;
// Both SEs start from the slot-6 template and both row-write SAs from slot 7.
// SA2, from slot 8, walks permuteOrder.
283 __SE_TEMPLATE_v1 se0Params, se1Params;
284 __SA_TEMPLATE_v1 sa0Params, sa1Params, sa2Params;
285 se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (6 * SE_PARAM_SIZE));
286 se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (6 * SE_PARAM_SIZE));
287 sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (7 * SE_PARAM_SIZE));
288 sa1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (7 * SE_PARAM_SIZE));
289 sa2Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (8 * SE_PARAM_SIZE));
// nVec vectors span one row of `order` elements. The rows are split between the two engines:
// SE1 gets order / 2 of them and SE0 the rest, so SE0 has one extra row when `order` is odd.
291 int32_t nVec = DSPLIB_ceilingDiv(order, eleCount);
292 int32_t se1ICNT1 = order / 2;
293 int32_t se0ICNT1 = order - se1ICNT1;
294 se0Params.ICNT1 = sa0Params.ICNT1 = se0ICNT1;
295 se1Params.ICNT1 = sa1Params.ICNT1 = se1ICNT1;
297 __SE0_OPEN(pIn, se0Params);
298 __SA0_OPEN(sa0Params);
299 __SA2_OPEN(sa2Params);
// NOTE(review): the source line immediately before this open is not visible, so a guard may
// exist around it.
302 __SE1_OPEN(pIn + colStride, se1Params);
303 __SA1_OPEN(sa1Params);
// Paired pass: one row from each engine per iteration, each with its own destination offset.
305 for (int32_t vertical = 0; vertical < se1ICNT1; vertical++) {
306 uint32_t *loadPerm1 = c7x::strm_agen<2, uint32_t>::get_adv(permuteOrder);
307 uint32_t offset1 = *loadPerm1 * colStride;
309 uint32_t *loadPerm2 = c7x::strm_agen<2, uint32_t>::get_adv(permuteOrder);
310 uint32_t offset2 = *loadPerm2 * colStride;
312 for (int32_t horizontal = 0; horizontal < nVec; horizontal++) {
313 vec v1 = c7x::strm_eng<0, vec>::get_adv();
314 vec v2 = c7x::strm_eng<1, vec>::get_adv();
// Each store is predicated by the predicate its address generator reports.
316 __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
317 vec *pStore1 = c7x::strm_agen<0, vec>::get_adv(pOut + offset1);
318 __vstore_pred(pred1, pStore1, v1);
320 __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
321 vec *pStore2 = c7x::strm_agen<1, vec>::get_adv(pOut + offset2);
322 __vstore_pred(pred2, pStore2, v2);
// Odd row count: SE0 still holds one row that the paired pass did not consume.
327 if (se0ICNT1 != se1ICNT1) {
328 uint32_t *loadPerm1 = c7x::strm_agen<2, uint32_t>::get_adv(permuteOrder);
329 uint32_t offset1 = *loadPerm1 * colStride;
331 for (int32_t horizontal = 0; horizontal < nVec; horizontal++) {
332 vec v1 = c7x::strm_eng<0, vec>::get_adv();
334 __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
335 vec *pStore1 = c7x::strm_agen<0, vec>::get_adv(pOut + offset1);
336 __vstore_pred(pred1, pStore1, v1);
345 uint32_t *permuteOrder,
351 uint32_t *permuteOrder,
// Fragment of a routine that fills permuteOrder[] from a uint16_t matrix read through the
// streaming engines. Its name and the rest of its signature are not visible; the block name used
// for this edit is a guess — TODO confirm. For each row it records one index into
// permuteOrder[row]; judging by the comparisons against 0 and 1 below, presumably the column
// holding the value 1 — TODO confirm.
358 uint32_t *permuteOrder,
363 typedef typename c7x::make_full_vector<uint16_t>::type vec;
364 int32_t eleCount = c7x::element_count_of<vec>::value;
// Both SEs use the uint16_t read template from slot 9; SA2 uses the slot-8 template.
366 __SE_TEMPLATE_v1 se0Params, se1Params;
367 __SA_TEMPLATE_v1 sa0Params, sa1Params, sa2Params;
368 se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (9 * SE_PARAM_SIZE));
369 se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (9 * SE_PARAM_SIZE));
370 sa2Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (8 * SE_PARAM_SIZE));
// Rows are split between the engines: SE1 reads order / 2 rows and SE0 reads the remainder.
372 int32_t nVec = DSPLIB_ceilingDiv(order, eleCount);
373 int32_t se1ICNT1 = order / 2;
374 int32_t se0ICNT1 = order - se1ICNT1;
// NOTE(review): in the visible lines sa0Params and sa1Params are never loaded from pBlock or
// opened; only their ICNT1 fields are written here.
376 se0Params.ICNT1 = sa0Params.ICNT1 = se0ICNT1;
377 se1Params.ICNT1 = sa1Params.ICNT1 = se1ICNT1;
379 __SE0_OPEN(pIn, se0Params);
380 __SA2_OPEN(sa2Params);
382 vec vecZero = (vec) 0;
383 vec vecOne = (vec) 1;
385 vec idx_0_to_eleCount;
// Lane-index ramp. This initializer lists 16 lanes; lines not visible around it may select it
// conditionally on the vector width — TODO confirm.
387 idx_0_to_eleCount = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
389 int32_t vertical = 0;
391 __SE1_OPEN(pIn + colPStride, se1Params);
// Paired pass: rows `vertical` (SE0) and `vertical + 1` (SE1) are scanned together.
393 for (vertical = 0; vertical < order - 1; vertical += 2) {
395 vec maxValVec1 = (vec) 0;
396 vec maxValVec2 = (vec) 0;
// vMaxIdx1 and vMaxIdx2, used below, are declared on lines not visible in this excerpt.
399 vec vCurrIdx1 = idx_0_to_eleCount;
400 vec vCurrIdx2 = idx_0_to_eleCount;
402 for (int32_t horizontal = 0; horizontal < nVec; horizontal++) {
403 vec v1 = c7x::strm_eng<0, vec>::get_adv();
404 vec v2 = c7x::strm_eng<1, vec>::get_adv();
// The predicate marks lanes whose input equals zero. Assuming __select returns its first data
// operand for predicate-true lanes (confirm against the C7x intrinsics reference), those lanes
// keep the running value and index while the other lanes take the new value and column index.
406 __vpred cmpPred1 = __cmp_eq_pred(vecZero, v1);
407 __vpred cmpPred2 = __cmp_eq_pred(vecZero, v2);
409 maxValVec1 = __select(cmpPred1, maxValVec1, v1);
410 maxValVec2 = __select(cmpPred2, maxValVec2, v2);
412 vMaxIdx1 = __select(cmpPred1, vMaxIdx1, vCurrIdx1);
413 vMaxIdx2 = __select(cmpPred2, vMaxIdx2, vCurrIdx2);
// Advance the per-lane column indices by one vector width.
415 vCurrIdx1 = vCurrIdx1 + (uint16_t) eleCount;
416 vCurrIdx2 = vCurrIdx2 + (uint16_t) eleCount;
// Find a lane whose accumulated value equals 1 and read the column index recorded for it. The
// `>> 1` presumably converts a predicate bit position into a 16-bit lane number — TODO confirm.
419 __vpred cmpPredFinal1 = __cmp_eq_pred(vecOne, maxValVec1);
420 uint32_t tempIdx1 = __rightmost_bit_detect_short(cmpPredFinal1) >> 1;
421 uint32_t finalIdx1 = __vgetuh_vrd(vMaxIdx1, tempIdx1);
423 __vpred cmpPredFinal2 = __cmp_eq_pred(vecOne, maxValVec2);
424 uint32_t tempIdx2 = __rightmost_bit_detect_short(cmpPredFinal2) >> 1;
425 uint32_t finalIdx2 = __vgetuh_vrd(vMaxIdx2, tempIdx2);
427 permuteOrder[vertical + 0] = finalIdx1;
428 permuteOrder[vertical + 1] = finalIdx2;
// Odd `order`: handle the final row from SE0 using the same scan. `vertical` still holds the
// value it reached in the paired loop.
432 if (se0ICNT1 != se1ICNT1) {
434 vec maxValVec1 = (vec) 0;
435 vec vMaxIdx1 = idx_0_to_eleCount;
436 vec vCurrIdx1 = idx_0_to_eleCount;
438 for (int32_t horizontal = 0; horizontal < nVec; horizontal++) {
439 vec v1 = c7x::strm_eng<0, vec>::get_adv();
441 __vpred cmpPred1 = __cmp_eq_pred(vecZero, v1);
443 maxValVec1 = __select(cmpPred1, maxValVec1, v1);
445 vMaxIdx1 = __select(cmpPred1, vMaxIdx1, vCurrIdx1);
447 vCurrIdx1 = vCurrIdx1 + (uint16_t) eleCount;
450 __vpred cmpPredFinal1 = __cmp_eq_pred(vecOne, maxValVec1);
451 uint32_t tempIdx1 = __rightmost_bit_detect_short(cmpPredFinal1) >> 1;
452 uint32_t finalIdx1 = __vgetuh_vrd(vMaxIdx1, tempIdx1);
454 permuteOrder[vertical + 0] = finalIdx1;
// Fragment of DSPLIB_lud_inv_factor_exec_ci (name taken from the explicit instantiations that
// follow it in this file). Parts of the signature and body are not visible. The visible code
// streams vectors starting at pCol through SE0 and SE1 and stores them to pFactor (via SA0) and
// to pFactorHalf (via SA1). Whatever happens to the vectors between load and store sits on lines
// not shown here.
460 template <typename dataType, typename vec = typename c7x::make_full_vector<dataType>::type>
// The four templates are taken by value, so the field edits below stay local to this call.
467 __SE_TEMPLATE_v1 se0Params,
468 __SE_TEMPLATE_v1 se1Params,
469 __SA_TEMPLATE_v1 sa0Params,
470 __SA_TEMPLATE_v1 sa1Params)
// nVec vectors cover nRows elements. SE0 handles the first nVec / 2 vectors and SE1 the rest, so
// SE1 has one more vector when nVec is odd.
476 uint32_t eleCount = c7x::element_count_of<vec>::value;
477 int32_t nVec = DSPLIB_ceilingDiv(nRows, eleCount);
478 int32_t se0ICNT2 = nVec / 2;
479 int32_t se1ICNT2 = nVec - se0ICNT2;
480 se0Params.ICNT2 = se0ICNT2;
481 se1Params.ICNT2 = se1ICNT2;
// SE1 starts past SE0's share: se0ICNT2 vectors, each spanning eleCount element-strides.
482 dataType *pSE0 = pCol;
483 dataType *pSE1 = pCol + (se0ICNT2 * colStride * eleCount);
485 __SE1_OPEN(pSE1, se1Params);
487 __SE0_OPEN(pSE0, se0Params);
// Output split matches the input split: SA0 covers the first se0ICNT2 * eleCount elements of
// pFactor and SA1 covers the remaining nRows minus that count, starting at pFactorHalf.
491 sa0Params.ICNT0 = (se0ICNT2 * eleCount);
492 sa1Params.ICNT0 = nRows - ((se0ICNT2 * eleCount));
493 dataType *pFactorHalf = pFactor + (se0ICNT2 * eleCount);
// SA0 is only opened when it has a non-zero element count.
495 if (sa0Params.ICNT0) {
496 __SA0_OPEN(sa0Params);
498 __SA1_OPEN(sa1Params);
// Main loop, two vectors per engine per iteration.
500 for (vertical = 0; vertical < se0ICNT2 - 1; vertical += 2) {
501 vec v1 = c7x::strm_eng<0, vec>::get_adv();
502 vec v2 = c7x::strm_eng<1, vec>::get_adv();
503 vec v3 = c7x::strm_eng<0, vec>::get_adv();
504 vec v4 = c7x::strm_eng<1, vec>::get_adv();
// NOTE(review): source lines between the loads above and the stores below are not visible.
510 __vpred pred = c7x::strm_agen<0, vec>::get_vpred();
511 vec *pStoreVec = c7x::strm_agen<0, vec>::get_adv((dataType *) pFactor);
512 __vstore_pred(pred, pStoreVec, v1);
514 pred = c7x::strm_agen<1, vec>::get_vpred();
515 pStoreVec = c7x::strm_agen<1, vec>::get_adv((dataType *) pFactorHalf);
516 __vstore_pred(pred, pStoreVec, v2);
518 pred = c7x::strm_agen<0, vec>::get_vpred();
519 pStoreVec = c7x::strm_agen<0, vec>::get_adv((dataType *) pFactor);
520 __vstore_pred(pred, pStoreVec, v3);
522 pred = c7x::strm_agen<1, vec>::get_vpred();
523 pStoreVec = c7x::strm_agen<1, vec>::get_adv((dataType *) pFactorHalf);
524 __vstore_pred(pred, pStoreVec, v4);
// Remainder loop: one vector per engine for whatever the unrolled loop left over.
527 for (; vertical < se0ICNT2; vertical++) {
528 vec v1 = c7x::strm_eng<0, vec>::get_adv();
529 vec v2 = c7x::strm_eng<1, vec>::get_adv();
534 __vpred pred = c7x::strm_agen<0, vec>::get_vpred();
535 vec *pStoreVec = c7x::strm_agen<0, vec>::get_adv((dataType *) pFactor);
536 __vstore_pred(pred, pStoreVec, v1);
538 pred = c7x::strm_agen<1, vec>::get_vpred();
539 pStoreVec = c7x::strm_agen<1, vec>::get_adv((dataType *) pFactorHalf);
540 __vstore_pred(pred, pStoreVec, v2);
// Odd nVec: SE1 has one extra vector to drain.
542 if (se0ICNT2 != se1ICNT2) {
543 vec v1 = c7x::strm_eng<1, vec>::get_adv();
547 __vpred pred = c7x::strm_agen<1, vec>::get_vpred();
548 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv((dataType *) pFactorHalf);
549 __vstore_pred(pred, pStoreVec, v1);
// Same guard as the SA0 open above; its body is not visible here (presumably the SA0 close).
552 if (sa0Params.ICNT0) {
// Explicit instantiations of DSPLIB_lud_inv_factor_exec_ci for float and double. The return type
// equals the element type in both. Only some of the parameter lines are visible in this excerpt.
566 template float DSPLIB_lud_inv_factor_exec_ci<float, typename c7x::make_full_vector<float>::type>(
571 typename c7x::make_full_vector<float>::type scale,
573 __SE_TEMPLATE_v1 se0Params,
574 __SE_TEMPLATE_v1 se1Params,
575 __SA_TEMPLATE_v1 sa0Params,
576 __SA_TEMPLATE_v1 sa1Params);
// Double-precision counterpart with the same parameter shape.
577 template double DSPLIB_lud_inv_factor_exec_ci<double, typename c7x::make_full_vector<double>::type>(
582 typename c7x::make_full_vector<double>::type scale,
584 __SE_TEMPLATE_v1 se0Params,
585 __SE_TEMPLATE_v1 se1Params,
586 __SA_TEMPLATE_v1 sa0Params,
587 __SA_TEMPLATE_v1 sa1Params);
589 template <
typename dataType>
591 dataType *pLocalInvU,
593 int32_t colStrideOrder,
594 int32_t colInvAStride,
599 typedef typename c7x::make_full_vector<dataType>::type vec;
600 uint32_t eleCount = c7x::element_count_of<vec>::value;
603 __SE_TEMPLATE_v1 se0ParamsFact = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
604 __SE_TEMPLATE_v1 se1ParamsFact = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
605 __SA_TEMPLATE_v1 sa0ParamsFact = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
606 __SA_TEMPLATE_v1 sa1ParamsFact = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
609 __SE_TEMPLATE_v1 seScalarParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE));
610 __SE_TEMPLATE_v1 seMatrixParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE));
611 __SA_TEMPLATE_v1 saMatrixParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE));
612 __SA_TEMPLATE_v1 saRefParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
613 __SA_TEMPLATE_v1 saRefStoreParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
615 int32_t lenTile8 = 8;
616 int32_t lenTile4 = 4;
617 int32_t lenTile2 = 2;
618 int32_t lenTile1 = 1;
620 int32_t nTiles1 = DSPLIB_ceilingDiv(nCols, (eleCount));
621 int32_t nTiles8 = nTiles1 / lenTile8;
622 nTiles1 -= nTiles8 * lenTile8;
623 int32_t nTiles4 = nTiles1 / lenTile4;
624 nTiles1 -= nTiles4 * lenTile4;
625 int32_t nTiles2 = nTiles1 / lenTile2;
626 nTiles1 -= nTiles2 * lenTile2;
628 int32_t remainingCols = nCols;
629 int32_t colLimit8 = nTiles8 * lenTile8 * eleCount;
630 colLimit8 = (remainingCols < colLimit8) ? remainingCols : colLimit8;
632 remainingCols = remainingCols - colLimit8;
633 int32_t colLimit4 = nTiles4 * lenTile4 * eleCount;
634 colLimit4 = (remainingCols < colLimit4) ? remainingCols : colLimit4;
636 remainingCols = remainingCols - colLimit4;
637 int32_t colLimit2 = nTiles2 * lenTile2 * eleCount;
638 colLimit2 = (remainingCols < colLimit2) ? remainingCols : colLimit2;
640 int32_t colLimit1 = remainingCols - colLimit2;
641 seScalarParams.ICNT1 = 2 * (nTiles8 + nTiles4 + nTiles2 + nTiles1);
643 for (int32_t col = nCols - 1; col >= 0; col--) {
644 dataType *pLastU = pLocalU + (colStrideOrder * col);
645 dataType *pLastInvU = pLocalInvU + (colStrideOrder * col);
647 dataType diag = pLocalU[col + col * colStrideOrder];
648 #ifdef LUD_INV_HIGH_PRECISION
649 vec divVec = (vec) (1 / diag);
651 dataType recipScalar = __recip(diag);
652 dataType twoP0 = 2.0;
654 recipScalar = recipScalar * (twoP0 - (diag * recipScalar));
655 recipScalar = recipScalar * (twoP0 - (diag * recipScalar));
656 recipScalar = recipScalar * (twoP0 - (diag * recipScalar));
657 recipScalar = recipScalar * (twoP0 - (diag * recipScalar));
659 vec divVec = (vec) (recipScalar);
662 DSPLIB_lud_inv_factor_exec_ci<dataType, vec>(&pLocalU[col], colStrideOrder, col, factArray, divVec, pBlock,
663 se0ParamsFact, se1ParamsFact, sa0ParamsFact, sa1ParamsFact);
664 seScalarParams.ICNT0 = col;
665 __SE0_OPEN(factArray, seScalarParams);
668 __SA0_OPEN(saRefParams);
669 __SA2_OPEN(saRefStoreParams);
674 seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile8;
675 seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = col;
676 seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles8;
677 seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile8;
678 seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit8;
681 __SE1_OPEN(pLocalU, seMatrixParams);
682 __SA1_OPEN(saMatrixParams);
685 for (int32_t tile = 0; tile < nTiles8; tile++) {
687 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
688 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastU);
689 vec sV1 = __vload_pred(lPred, pLoadVec);
691 lPred = c7x::strm_agen<0, vec>::get_vpred();
692 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastU);
693 vec sV2 = __vload_pred(lPred, pLoadVec);
695 lPred = c7x::strm_agen<0, vec>::get_vpred();
696 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastU);
697 vec sV3 = __vload_pred(lPred, pLoadVec);
699 lPred = c7x::strm_agen<0, vec>::get_vpred();
700 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastU);
701 vec sV4 = __vload_pred(lPred, pLoadVec);
703 lPred = c7x::strm_agen<0, vec>::get_vpred();
704 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastU);
705 vec sV5 = __vload_pred(lPred, pLoadVec);
707 lPred = c7x::strm_agen<0, vec>::get_vpred();
708 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastU);
709 vec sV6 = __vload_pred(lPred, pLoadVec);
711 lPred = c7x::strm_agen<0, vec>::get_vpred();
712 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastU);
713 vec sV7 = __vload_pred(lPred, pLoadVec);
715 lPred = c7x::strm_agen<0, vec>::get_vpred();
716 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastU);
717 vec sV8 = __vload_pred(lPred, pLoadVec);
719 for (int32_t vertical = 0; vertical < col; vertical++) {
721 vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
723 vec v1 = c7x::strm_eng<1, vec>::get_adv();
724 vec v2 = c7x::strm_eng<1, vec>::get_adv();
725 vec v3 = c7x::strm_eng<1, vec>::get_adv();
726 vec v4 = c7x::strm_eng<1, vec>::get_adv();
727 vec v5 = c7x::strm_eng<1, vec>::get_adv();
728 vec v6 = c7x::strm_eng<1, vec>::get_adv();
729 vec v7 = c7x::strm_eng<1, vec>::get_adv();
730 vec v8 = c7x::strm_eng<1, vec>::get_adv();
732 v1 -= sV1 * scalarDup;
733 v2 -= sV2 * scalarDup;
734 v3 -= sV3 * scalarDup;
735 v4 -= sV4 * scalarDup;
736 v5 -= sV5 * scalarDup;
737 v6 -= sV6 * scalarDup;
738 v7 -= sV7 * scalarDup;
739 v8 -= sV8 * scalarDup;
741 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
742 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalU);
743 __vstore_pred(sPred, pStoreVec, v1);
745 sPred = c7x::strm_agen<1, vec>::get_vpred();
746 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalU);
747 __vstore_pred(sPred, pStoreVec, v2);
749 sPred = c7x::strm_agen<1, vec>::get_vpred();
750 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalU);
751 __vstore_pred(sPred, pStoreVec, v3);
753 sPred = c7x::strm_agen<1, vec>::get_vpred();
754 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalU);
755 __vstore_pred(sPred, pStoreVec, v4);
757 sPred = c7x::strm_agen<1, vec>::get_vpred();
758 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalU);
759 __vstore_pred(sPred, pStoreVec, v5);
761 sPred = c7x::strm_agen<1, vec>::get_vpred();
762 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalU);
763 __vstore_pred(sPred, pStoreVec, v6);
765 sPred = c7x::strm_agen<1, vec>::get_vpred();
766 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalU);
767 __vstore_pred(sPred, pStoreVec, v7);
769 sPred = c7x::strm_agen<1, vec>::get_vpred();
770 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalU);
771 __vstore_pred(sPred, pStoreVec, v8);
783 lPred = c7x::strm_agen<2, vec>::get_vpred();
784 vec *psV = c7x::strm_agen<2, vec>::get_adv(pLastU);
785 __vstore_pred(lPred, psV, sV1);
787 lPred = c7x::strm_agen<2, vec>::get_vpred();
788 psV = c7x::strm_agen<2, vec>::get_adv(pLastU);
789 __vstore_pred(lPred, psV, sV2);
791 lPred = c7x::strm_agen<2, vec>::get_vpred();
792 psV = c7x::strm_agen<2, vec>::get_adv(pLastU);
793 __vstore_pred(lPred, psV, sV3);
795 lPred = c7x::strm_agen<2, vec>::get_vpred();
796 psV = c7x::strm_agen<2, vec>::get_adv(pLastU);
797 __vstore_pred(lPred, psV, sV4);
799 lPred = c7x::strm_agen<2, vec>::get_vpred();
800 psV = c7x::strm_agen<2, vec>::get_adv(pLastU);
801 __vstore_pred(lPred, psV, sV5);
803 lPred = c7x::strm_agen<2, vec>::get_vpred();
804 psV = c7x::strm_agen<2, vec>::get_adv(pLastU);
805 __vstore_pred(lPred, psV, sV6);
807 lPred = c7x::strm_agen<2, vec>::get_vpred();
808 psV = c7x::strm_agen<2, vec>::get_adv(pLastU);
809 __vstore_pred(lPred, psV, sV7);
811 lPred = c7x::strm_agen<2, vec>::get_vpred();
812 psV = c7x::strm_agen<2, vec>::get_adv(pLastU);
813 __vstore_pred(lPred, psV, sV8);
822 seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile4;
823 seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = col;
824 seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles4;
825 seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile4;
826 seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit4;
828 dataType *pSE1 = pLocalU + colLimit8;
829 dataType *pSA1 = pLocalU + colLimit8;
830 dataType *pSA0 = pLastU;
831 dataType *pSA2 = pLastU;
834 __SE1_OPEN(pSE1, seMatrixParams);
835 __SA1_OPEN(saMatrixParams);
838 for (int32_t tile = 0; tile < nTiles4; tile++) {
839 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
840 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
841 vec sV1 = __vload_pred(lPred, pLoadVec);
843 lPred = c7x::strm_agen<0, vec>::get_vpred();
844 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
845 vec sV2 = __vload_pred(lPred, pLoadVec);
847 lPred = c7x::strm_agen<0, vec>::get_vpred();
848 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
849 vec sV3 = __vload_pred(lPred, pLoadVec);
851 lPred = c7x::strm_agen<0, vec>::get_vpred();
852 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
853 vec sV4 = __vload_pred(lPred, pLoadVec);
855 for (int32_t vertical = 0; vertical < col; vertical++) {
856 vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
858 vec v1 = c7x::strm_eng<1, vec>::get_adv();
859 vec v2 = c7x::strm_eng<1, vec>::get_adv();
860 vec v3 = c7x::strm_eng<1, vec>::get_adv();
861 vec v4 = c7x::strm_eng<1, vec>::get_adv();
863 v1 -= sV1 * scalarDup;
864 v2 -= sV2 * scalarDup;
865 v3 -= sV3 * scalarDup;
866 v4 -= sV4 * scalarDup;
868 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
869 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
870 __vstore_pred(sPred, pStoreVec, v1);
872 sPred = c7x::strm_agen<1, vec>::get_vpred();
873 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
874 __vstore_pred(sPred, pStoreVec, v2);
876 sPred = c7x::strm_agen<1, vec>::get_vpred();
877 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
878 __vstore_pred(sPred, pStoreVec, v3);
880 sPred = c7x::strm_agen<1, vec>::get_vpred();
881 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
882 __vstore_pred(sPred, pStoreVec, v4);
890 lPred = c7x::strm_agen<2, vec>::get_vpred();
891 vec *psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
892 __vstore_pred(lPred, psV, sV1);
894 lPred = c7x::strm_agen<2, vec>::get_vpred();
895 psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
896 __vstore_pred(lPred, psV, sV2);
898 lPred = c7x::strm_agen<2, vec>::get_vpred();
899 psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
900 __vstore_pred(lPred, psV, sV3);
902 lPred = c7x::strm_agen<2, vec>::get_vpred();
903 psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
904 __vstore_pred(lPred, psV, sV4);
913 seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile2;
914 seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = col;
915 seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles2;
916 seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile2;
917 seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit2;
919 dataType *pSE1 = pLocalU + colLimit8 + colLimit4;
920 dataType *pSA1 = pLocalU + colLimit8 + colLimit4;
921 dataType *pSA0 = pLastU;
922 dataType *pSA2 = pLastU;
925 __SE1_OPEN(pSE1, seMatrixParams);
926 __SA1_OPEN(saMatrixParams);
929 for (int32_t tile = 0; tile < nTiles2; tile++) {
930 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
931 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
932 vec sV1 = __vload_pred(lPred, pLoadVec);
934 lPred = c7x::strm_agen<0, vec>::get_vpred();
935 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
936 vec sV2 = __vload_pred(lPred, pLoadVec);
938 for (int32_t vertical = 0; vertical < col; vertical++) {
939 vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
941 vec v1 = c7x::strm_eng<1, vec>::get_adv();
942 vec v2 = c7x::strm_eng<1, vec>::get_adv();
944 v1 -= sV1 * scalarDup;
945 v2 -= sV2 * scalarDup;
947 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
948 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
949 __vstore_pred(sPred, pStoreVec, v1);
951 sPred = c7x::strm_agen<1, vec>::get_vpred();
952 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
953 __vstore_pred(sPred, pStoreVec, v2);
959 lPred = c7x::strm_agen<2, vec>::get_vpred();
960 vec *psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
961 __vstore_pred(lPred, psV, sV1);
963 lPred = c7x::strm_agen<2, vec>::get_vpred();
964 psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
965 __vstore_pred(lPred, psV, sV2);
975 seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile1;
976 seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = col;
977 seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles1;
978 seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile1;
979 seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit1;
981 dataType *pSE1 = pLocalU + colLimit8 + colLimit4 + colLimit2;
982 dataType *pSA1 = pLocalU + colLimit8 + colLimit4 + colLimit2;
983 dataType *pSA0 = pLastU;
984 dataType *pSA2 = pLastU;
987 __SE1_OPEN(pSE1, seMatrixParams);
988 __SA1_OPEN(saMatrixParams);
991 for (int32_t tile = 0; tile < nTiles1; tile++) {
992 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
993 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
994 vec sV1 = __vload_pred(lPred, pLoadVec);
996 for (int32_t vertical = 0; vertical < col; vertical++) {
997 vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
999 vec v1 = c7x::strm_eng<1, vec>::get_adv();
1001 v1 -= sV1 * scalarDup;
1003 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1004 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1005 __vstore_pred(sPred, pStoreVec, v1);
1010 lPred = c7x::strm_agen<2, vec>::get_vpred();
1011 vec *psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
1012 __vstore_pred(lPred, psV, sV1);
1022 __SA0_OPEN(saRefParams);
1023 __SA2_OPEN(saRefStoreParams);
1028 seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile8;
1029 seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = col;
1030 seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles8;
1031 seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile8;
1032 seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit8;
1035 __SE1_OPEN(pLocalInvU, seMatrixParams);
1036 __SA1_OPEN(saMatrixParams);
1039 for (int32_t tile = 0; tile < nTiles8; tile++) {
1040 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1041 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastInvU);
1042 vec sV1 = __vload_pred(lPred, pLoadVec);
1044 lPred = c7x::strm_agen<0, vec>::get_vpred();
1045 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastInvU);
1046 vec sV2 = __vload_pred(lPred, pLoadVec);
1048 lPred = c7x::strm_agen<0, vec>::get_vpred();
1049 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastInvU);
1050 vec sV3 = __vload_pred(lPred, pLoadVec);
1052 lPred = c7x::strm_agen<0, vec>::get_vpred();
1053 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastInvU);
1054 vec sV4 = __vload_pred(lPred, pLoadVec);
1056 lPred = c7x::strm_agen<0, vec>::get_vpred();
1057 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastInvU);
1058 vec sV5 = __vload_pred(lPred, pLoadVec);
1060 lPred = c7x::strm_agen<0, vec>::get_vpred();
1061 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastInvU);
1062 vec sV6 = __vload_pred(lPred, pLoadVec);
1064 lPred = c7x::strm_agen<0, vec>::get_vpred();
1065 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastInvU);
1066 vec sV7 = __vload_pred(lPred, pLoadVec);
1068 lPred = c7x::strm_agen<0, vec>::get_vpred();
1069 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastInvU);
1070 vec sV8 = __vload_pred(lPred, pLoadVec);
1071 for (int32_t vertical = 0; vertical < col; vertical++) {
1072 vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
1074 vec v1 = c7x::strm_eng<1, vec>::get_adv();
1075 vec v2 = c7x::strm_eng<1, vec>::get_adv();
1076 vec v3 = c7x::strm_eng<1, vec>::get_adv();
1077 vec v4 = c7x::strm_eng<1, vec>::get_adv();
1078 vec v5 = c7x::strm_eng<1, vec>::get_adv();
1079 vec v6 = c7x::strm_eng<1, vec>::get_adv();
1080 vec v7 = c7x::strm_eng<1, vec>::get_adv();
1081 vec v8 = c7x::strm_eng<1, vec>::get_adv();
1083 v1 -= sV1 * scalarDup;
1084 v2 -= sV2 * scalarDup;
1085 v3 -= sV3 * scalarDup;
1086 v4 -= sV4 * scalarDup;
1087 v5 -= sV5 * scalarDup;
1088 v6 -= sV6 * scalarDup;
1089 v7 -= sV7 * scalarDup;
1090 v8 -= sV8 * scalarDup;
1092 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1093 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalInvU);
1094 __vstore_pred(sPred, pStoreVec, v1);
1096 sPred = c7x::strm_agen<1, vec>::get_vpred();
1097 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalInvU);
1098 __vstore_pred(sPred, pStoreVec, v2);
1100 sPred = c7x::strm_agen<1, vec>::get_vpred();
1101 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalInvU);
1102 __vstore_pred(sPred, pStoreVec, v3);
1104 sPred = c7x::strm_agen<1, vec>::get_vpred();
1105 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalInvU);
1106 __vstore_pred(sPred, pStoreVec, v4);
1108 sPred = c7x::strm_agen<1, vec>::get_vpred();
1109 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalInvU);
1110 __vstore_pred(sPred, pStoreVec, v5);
1112 sPred = c7x::strm_agen<1, vec>::get_vpred();
1113 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalInvU);
1114 __vstore_pred(sPred, pStoreVec, v6);
1116 sPred = c7x::strm_agen<1, vec>::get_vpred();
1117 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalInvU);
1118 __vstore_pred(sPred, pStoreVec, v7);
1120 sPred = c7x::strm_agen<1, vec>::get_vpred();
1121 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalInvU);
1122 __vstore_pred(sPred, pStoreVec, v8);
1134 lPred = c7x::strm_agen<2, vec>::get_vpred();
1135 vec *psV = c7x::strm_agen<2, vec>::get_adv(pLastInvU);
1136 __vstore_pred(lPred, psV, sV1);
1138 lPred = c7x::strm_agen<2, vec>::get_vpred();
1139 psV = c7x::strm_agen<2, vec>::get_adv(pLastInvU);
1140 __vstore_pred(lPred, psV, sV2);
1142 lPred = c7x::strm_agen<2, vec>::get_vpred();
1143 psV = c7x::strm_agen<2, vec>::get_adv(pLastInvU);
1144 __vstore_pred(lPred, psV, sV3);
1146 lPred = c7x::strm_agen<2, vec>::get_vpred();
1147 psV = c7x::strm_agen<2, vec>::get_adv(pLastInvU);
1148 __vstore_pred(lPred, psV, sV4);
1150 lPred = c7x::strm_agen<2, vec>::get_vpred();
1151 psV = c7x::strm_agen<2, vec>::get_adv(pLastInvU);
1152 __vstore_pred(lPred, psV, sV5);
1154 lPred = c7x::strm_agen<2, vec>::get_vpred();
1155 psV = c7x::strm_agen<2, vec>::get_adv(pLastInvU);
1156 __vstore_pred(lPred, psV, sV6);
1158 lPred = c7x::strm_agen<2, vec>::get_vpred();
1159 psV = c7x::strm_agen<2, vec>::get_adv(pLastInvU);
1160 __vstore_pred(lPred, psV, sV7);
1162 lPred = c7x::strm_agen<2, vec>::get_vpred();
1163 psV = c7x::strm_agen<2, vec>::get_adv(pLastInvU);
1164 __vstore_pred(lPred, psV, sV8);
1173 seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile4;
1174 seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = col;
1175 seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles4;
1176 seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile4;
1177 seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit4;
1179 dataType *pSE1 = pLocalInvU + colLimit8;
1180 dataType *pSA1 = pLocalInvU + colLimit8;
1181 dataType *pSA0 = pLastInvU;
1182 dataType *pSA2 = pLastInvU;
1185 __SE1_OPEN(pSE1, seMatrixParams);
1186 __SA1_OPEN(saMatrixParams);
1189 for (int32_t tile = 0; tile < nTiles4; tile++) {
1190 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1191 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1192 vec sV1 = __vload_pred(lPred, pLoadVec);
1194 lPred = c7x::strm_agen<0, vec>::get_vpred();
1195 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1196 vec sV2 = __vload_pred(lPred, pLoadVec);
1198 lPred = c7x::strm_agen<0, vec>::get_vpred();
1199 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1200 vec sV3 = __vload_pred(lPred, pLoadVec);
1202 lPred = c7x::strm_agen<0, vec>::get_vpred();
1203 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1204 vec sV4 = __vload_pred(lPred, pLoadVec);
1205 for (int32_t vertical = 0; vertical < col; vertical++) {
1206 vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
1208 vec v1 = c7x::strm_eng<1, vec>::get_adv();
1209 vec v2 = c7x::strm_eng<1, vec>::get_adv();
1210 vec v3 = c7x::strm_eng<1, vec>::get_adv();
1211 vec v4 = c7x::strm_eng<1, vec>::get_adv();
1213 v1 -= sV1 * scalarDup;
1214 v2 -= sV2 * scalarDup;
1215 v3 -= sV3 * scalarDup;
1216 v4 -= sV4 * scalarDup;
1218 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1219 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1220 __vstore_pred(sPred, pStoreVec, v1);
1222 sPred = c7x::strm_agen<1, vec>::get_vpred();
1223 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1224 __vstore_pred(sPred, pStoreVec, v2);
1226 sPred = c7x::strm_agen<1, vec>::get_vpred();
1227 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1228 __vstore_pred(sPred, pStoreVec, v3);
1230 sPred = c7x::strm_agen<1, vec>::get_vpred();
1231 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1232 __vstore_pred(sPred, pStoreVec, v4);
1240 lPred = c7x::strm_agen<2, vec>::get_vpred();
1241 vec *psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
1242 __vstore_pred(lPred, psV, sV1);
1244 lPred = c7x::strm_agen<2, vec>::get_vpred();
1245 psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
1246 __vstore_pred(lPred, psV, sV2);
1248 lPred = c7x::strm_agen<2, vec>::get_vpred();
1249 psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
1250 __vstore_pred(lPred, psV, sV3);
1252 lPred = c7x::strm_agen<2, vec>::get_vpred();
1253 psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
1254 __vstore_pred(lPred, psV, sV4);
1263 seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile2;
1264 seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = col;
1265 seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles2;
1266 seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile2;
1267 seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit2;
1269 dataType *pSE1 = pLocalInvU + colLimit8 + colLimit4;
1270 dataType *pSA1 = pLocalInvU + colLimit8 + colLimit4;
1271 dataType *pSA0 = pLastInvU;
1272 dataType *pSA2 = pLastInvU;
1275 __SE1_OPEN(pSE1, seMatrixParams);
1276 __SA1_OPEN(saMatrixParams);
1279 for (int32_t tile = 0; tile < nTiles2; tile++) {
1280 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1281 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1282 vec sV1 = __vload_pred(lPred, pLoadVec);
1284 lPred = c7x::strm_agen<0, vec>::get_vpred();
1285 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1286 vec sV2 = __vload_pred(lPred, pLoadVec);
1288 for (int32_t vertical = 0; vertical < col; vertical++) {
1289 vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
1291 vec v1 = c7x::strm_eng<1, vec>::get_adv();
1292 vec v2 = c7x::strm_eng<1, vec>::get_adv();
1294 v1 -= sV1 * scalarDup;
1295 v2 -= sV2 * scalarDup;
1297 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1298 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1299 __vstore_pred(sPred, pStoreVec, v1);
1301 sPred = c7x::strm_agen<1, vec>::get_vpred();
1302 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1303 __vstore_pred(sPred, pStoreVec, v2);
1309 lPred = c7x::strm_agen<2, vec>::get_vpred();
1310 vec *psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
1311 __vstore_pred(lPred, psV, sV1);
1313 lPred = c7x::strm_agen<2, vec>::get_vpred();
1314 psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
1315 __vstore_pred(lPred, psV, sV2);
1325 seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile1;
1326 seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = col;
1327 seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles1;
1328 seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile1;
1329 seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit1;
1331 dataType *pSE1 = pLocalInvU + colLimit8 + colLimit4 + colLimit2;
1332 dataType *pSA1 = pLocalInvU + colLimit8 + colLimit4 + colLimit2;
1333 dataType *pSA0 = pLastInvU;
1334 dataType *pSA2 = pLastInvU;
1337 __SE1_OPEN(pSE1, seMatrixParams);
1338 __SA1_OPEN(saMatrixParams);
1341 for (int32_t tile = 0; tile < nTiles1; tile++) {
1342 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1343 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1344 vec sV1 = __vload_pred(lPred, pLoadVec);
1346 for (int32_t vertical = 0; vertical < col; vertical++) {
1347 vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
1349 vec v1 = c7x::strm_eng<1, vec>::get_adv();
1351 v1 -= sV1 * scalarDup;
1353 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1354 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1355 __vstore_pred(sPred, pStoreVec, v1);
1360 lPred = c7x::strm_agen<2, vec>::get_vpred();
1361 vec *psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
1362 __vstore_pred(lPred, psV, sV1);
1378 int32_t colStrideOrder,
1379 int32_t colInvAStride,
1385 int32_t colStrideOrder,
1386 int32_t colInvAStride,
// DSPLIB_lud_inv_invL_exec_ci
//
// For every column `col` of the nCols x nCols matrix at pLocalL this routine
//   1. computes a reciprocal of the diagonal entry,
//   2. calls DSPLIB_lud_inv_factor_exec_ci to fill factArray for the rows
//      below the diagonal,
//   3. subtracts (factor * row `col`) from each of the nRows rows below row
//      `col`, first in pLocalL and then, with the same factors, in pLocalInvL.
// Both matrices are updated in place. The caller visible in this file fills
// pLocalInvL with an identity matrix before calling.
//
// Parameters visible in this listing (some signature lines are not shown):
//   pLocalInvL     - matrix receiving the same row operations as pLocalL
//   colStrideOrder - distance between consecutive rows of pLocalL, in elements
//   colInvLStride  - distance between consecutive rows of pLocalInvL, in elements
//   factArray      - scratch array holding the per-row factors of one column
// pLocalL, nCols and pBlock are used below; their declarations fall on lines
// that are not part of this listing.
1390 template <
typename dataType>
1392 dataType *pLocalInvL,
1394 int32_t colStrideOrder,
1395 int32_t colInvLStride,
1396 dataType *factArray,
1400 typedef typename c7x::make_full_vector<dataType>::type vec;
// Streaming-engine (SE) and address-generator (SA) templates are read back
// from the parameter block; each slot is SE_PARAM_SIZE bytes wide.
// Slot 1 feeds both SE templates and slot 2 both SA templates handed to the
// factor routine.
1403 __SE_TEMPLATE_v1 se0ParamsFact = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
1404 __SE_TEMPLATE_v1 se1ParamsFact = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
1405 __SA_TEMPLATE_v1 sa0ParamsFact = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
1406 __SA_TEMPLATE_v1 sa1ParamsFact = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
// Number of dataType lanes in one full vector.
1408 uint32_t eleCount = c7x::element_count_of<vec>::value;
// Slot 3: factor (scalar) stream, slot 4/5: matrix read/write streams.
// The reference-row load/store generators reuse slot 2.
1411 __SE_TEMPLATE_v1 seScalarParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE));
1412 __SE_TEMPLATE_v1 seMatrixParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE));
1413 __SA_TEMPLATE_v1 saMatrixParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE));
1414 __SA_TEMPLATE_v1 saRefParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
1415 __SA_TEMPLATE_v1 saRefStoreParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
// A row is processed in tiles that are 8, 4, 2 or 1 vectors wide.
1417 int32_t lenTile8 = 8;
1418 int32_t lenTile4 = 4;
1419 int32_t lenTile2 = 2;
1420 int32_t lenTile1 = 1;
// nTiles1 starts as the number of vectors needed to span nCols and ends up
// as the remainder once the 8-, 4- and 2-wide tiles have been carved out.
1422 int32_t nTiles1 = DSPLIB_ceilingDiv(nCols, (eleCount));
1423 int32_t nTiles8 = nTiles1 / lenTile8;
1424 nTiles1 -= nTiles8 * lenTile8;
1425 int32_t nTiles4 = nTiles1 / lenTile4;
1426 nTiles1 -= nTiles4 * lenTile4;
1427 int32_t nTiles2 = nTiles1 / lenTile2;
1428 nTiles1 -= nTiles2 * lenTile2;
// colLimitN: number of columns covered by the N-wide tiles, clamped to the
// columns that are still left. colLimit1 takes whatever remains.
1430 int32_t remainingCols = nCols;
1431 int32_t colLimit8 = nTiles8 * lenTile8 * eleCount;
1432 colLimit8 = (remainingCols < (colLimit8)) ? remainingCols : colLimit8;
1434 remainingCols = remainingCols - colLimit8;
1435 int32_t colLimit4 = nTiles4 * lenTile4 * eleCount;
1436 colLimit4 = (remainingCols < (colLimit4)) ? remainingCols : colLimit4;
1438 remainingCols = remainingCols - colLimit4;
1439 int32_t colLimit2 = nTiles2 * lenTile2 * eleCount;
1440 colLimit2 = (remainingCols < (colLimit2)) ? remainingCols : colLimit2;
1442 int32_t colLimit1 = remainingCols - colLimit2;
// Twice the total tile count: presumably the factor stream is replayed once
// per tile for the pLocalL pass and once per tile for the pLocalInvL pass
// - TODO confirm against the seScalarParams setup.
1443 seScalarParams.ICNT1 = 2 * (nTiles8 + nTiles4 + nTiles2 + nTiles1);
// One elimination step per column.
1445 for (int32_t col = 0; col < nCols; col++) {
// pRef* point at row `col` (the reference row), pStart* at row `col + 1`
// (the first row to be updated).
1446 dataType *pRefL = pLocalL + (colStrideOrder * col);
1447 dataType *pRefInvL = pLocalInvL + (colInvLStride * col);
1448 dataType *pStartL = pLocalL + (colStrideOrder * (col + 1));
// NOTE(review): pRefInvL is offset with colInvLStride but pStartInvL with
// colStrideOrder; the two agree only when both strides are equal (the caller
// visible in this file passes the same value for both) - confirm whether
// colInvLStride was intended here.
1449 dataType *pStartInvL = pLocalInvL + (colStrideOrder * (col + 1));
// Rows below row `col`.
1450 int32_t nRows = (nCols - 1) - col;
// Diagonal entry of the current column.
1452 dataType diag = pLocalL[col + col * colStrideOrder];
// Two ways of building divVec = 1 / diag follow. The directive separating
// them (presumably #else / #endif) is not part of this listing.
1453 #ifdef LUD_INV_HIGH_PRECISION
// Exact division.
1454 vec divVec = (vec) (1 / diag);
// Reciprocal estimate refined by two Newton-Raphson steps r = r * (2 - d * r).
1456 dataType recipScalar = __recip(diag);
1457 dataType twoP0 = 2.0;
1459 recipScalar = recipScalar * (twoP0 - (diag * recipScalar));
1460 recipScalar = recipScalar * (twoP0 - (diag * recipScalar));
1462 vec divVec = (vec) recipScalar;
// Starts at the element directly below the diagonal (row col + 1, column col).
// Presumably writes nRows scaled column entries into factArray - the helper's
// body is outside this view.
1465 DSPLIB_lud_inv_factor_exec_ci<dataType, vec>(&pStartL[col], colStrideOrder, nRows, factArray, divVec, pBlock,
1466 se0ParamsFact, se1ParamsFact, sa0ParamsFact, sa1ParamsFact);
// SE0 streams the factors, one fetch per updated row.
1467 seScalarParams.ICNT0 = nRows;
1468 __SE0_OPEN(factArray, seScalarParams);
// SA0 walks the reference row.
// NOTE(review): SA2 is opened but no strm_agen<2> access appears in the
// visible lines of this function.
1471 __SA0_OPEN(saRefParams);
1472 __SA2_OPEN(saRefStoreParams);
// ---- pLocalL update, tiles of 8 vectors ----
// SE1 (read) and SA1 (write) get identical dimensions and the same base
// pointer, so the rows are updated in place.
1477 seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile8;
1478 seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = nRows;
1479 seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles8;
1480 seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile8;
1481 seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit8;
1484 __SE1_OPEN(pStartL, seMatrixParams);
1485 __SA1_OPEN(saMatrixParams);
1488 for (int32_t tile = 0; tile < nTiles8; tile++) {
// Eight predicated vector loads from the reference row.
1489 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1490 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
1491 vec sV1 = __vload_pred(lPred, pLoadVec);
1493 lPred = c7x::strm_agen<0, vec>::get_vpred();
1494 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
1495 vec sV2 = __vload_pred(lPred, pLoadVec);
1497 lPred = c7x::strm_agen<0, vec>::get_vpred();
1498 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
1499 vec sV3 = __vload_pred(lPred, pLoadVec);
1501 lPred = c7x::strm_agen<0, vec>::get_vpred();
1502 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
1503 vec sV4 = __vload_pred(lPred, pLoadVec);
1505 lPred = c7x::strm_agen<0, vec>::get_vpred();
1506 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
1507 vec sV5 = __vload_pred(lPred, pLoadVec);
1509 lPred = c7x::strm_agen<0, vec>::get_vpred();
1510 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
1511 vec sV6 = __vload_pred(lPred, pLoadVec);
1513 lPred = c7x::strm_agen<0, vec>::get_vpred();
1514 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
1515 vec sV7 = __vload_pred(lPred, pLoadVec);
1517 lPred = c7x::strm_agen<0, vec>::get_vpred();
1518 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
1519 vec sV8 = __vload_pred(lPred, pLoadVec);
// For each row below the reference row: row -= reference * factor.
1521 for (int32_t vertical = 0; vertical < nRows; vertical++) {
// Factor for this row (presumably replicated across all lanes by SE0).
1523 vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
1525 vec v1 = c7x::strm_eng<1, vec>::get_adv();
1526 vec v2 = c7x::strm_eng<1, vec>::get_adv();
1527 vec v3 = c7x::strm_eng<1, vec>::get_adv();
1528 vec v4 = c7x::strm_eng<1, vec>::get_adv();
1529 vec v5 = c7x::strm_eng<1, vec>::get_adv();
1530 vec v6 = c7x::strm_eng<1, vec>::get_adv();
1531 vec v7 = c7x::strm_eng<1, vec>::get_adv();
1532 vec v8 = c7x::strm_eng<1, vec>::get_adv();
1534 v1 -= sV1 * scalarDup;
1535 v2 -= sV2 * scalarDup;
1536 v3 -= sV3 * scalarDup;
1537 v4 -= sV4 * scalarDup;
1538 v5 -= sV5 * scalarDup;
1539 v6 -= sV6 * scalarDup;
1540 v7 -= sV7 * scalarDup;
1541 v8 -= sV8 * scalarDup;
// Predicated write-back through SA1.
1543 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1544 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
1545 __vstore_pred(sPred, pStoreVec, v1);
1547 sPred = c7x::strm_agen<1, vec>::get_vpred();
1548 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
1549 __vstore_pred(sPred, pStoreVec, v2);
1551 sPred = c7x::strm_agen<1, vec>::get_vpred();
1552 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
1553 __vstore_pred(sPred, pStoreVec, v3);
1555 sPred = c7x::strm_agen<1, vec>::get_vpred();
1556 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
1557 __vstore_pred(sPred, pStoreVec, v4);
1559 sPred = c7x::strm_agen<1, vec>::get_vpred();
1560 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
1561 __vstore_pred(sPred, pStoreVec, v5);
1563 sPred = c7x::strm_agen<1, vec>::get_vpred();
1564 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
1565 __vstore_pred(sPred, pStoreVec, v6);
1567 sPred = c7x::strm_agen<1, vec>::get_vpred();
1568 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
1569 __vstore_pred(sPred, pStoreVec, v7);
1571 sPred = c7x::strm_agen<1, vec>::get_vpred();
1572 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
1573 __vstore_pred(sPred, pStoreVec, v8);
// ---- pLocalL update, tiles of 4 vectors ----
// The matrix streams restart colLimit8 elements into the row. SA0 is not
// reopened in the visible lines, so it presumably continues along the
// reference row from where the 8-wide tiles stopped.
1583 seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile4;
1584 seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = nRows;
1585 seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles4;
1586 seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile4;
1587 seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit4;
1589 dataType *pSE1 = pStartL + colLimit8;
1590 dataType *pSA1 = pStartL + colLimit8;
1591 dataType *pSA0 = pRefL;
1594 __SE1_OPEN(pSE1, seMatrixParams);
1595 __SA1_OPEN(saMatrixParams);
1598 for (int32_t tile = 0; tile < nTiles4; tile++) {
1599 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1600 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1601 vec sV1 = __vload_pred(lPred, pLoadVec);
1603 lPred = c7x::strm_agen<0, vec>::get_vpred();
1604 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1605 vec sV2 = __vload_pred(lPred, pLoadVec);
1607 lPred = c7x::strm_agen<0, vec>::get_vpred();
1608 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1609 vec sV3 = __vload_pred(lPred, pLoadVec);
1611 lPred = c7x::strm_agen<0, vec>::get_vpred();
1612 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1613 vec sV4 = __vload_pred(lPred, pLoadVec);
1615 for (int32_t vertical = 0; vertical < nRows; vertical++) {
1616 vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
1618 vec v1 = c7x::strm_eng<1, vec>::get_adv();
1619 vec v2 = c7x::strm_eng<1, vec>::get_adv();
1620 vec v3 = c7x::strm_eng<1, vec>::get_adv();
1621 vec v4 = c7x::strm_eng<1, vec>::get_adv();
1623 v1 -= sV1 * scalarDup;
1624 v2 -= sV2 * scalarDup;
1625 v3 -= sV3 * scalarDup;
1626 v4 -= sV4 * scalarDup;
1628 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1629 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1630 __vstore_pred(sPred, pStoreVec, v1);
1632 sPred = c7x::strm_agen<1, vec>::get_vpred();
1633 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1634 __vstore_pred(sPred, pStoreVec, v2);
1636 sPred = c7x::strm_agen<1, vec>::get_vpred();
1637 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1638 __vstore_pred(sPred, pStoreVec, v3);
1640 sPred = c7x::strm_agen<1, vec>::get_vpred();
1641 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1642 __vstore_pred(sPred, pStoreVec, v4);
// ---- pLocalL update, tiles of 2 vectors ----
1652 seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile2;
1653 seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = nRows;
1654 seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles2;
1655 seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile2;
1656 seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit2;
1658 dataType *pSE1 = pStartL + colLimit8 + colLimit4;
1659 dataType *pSA1 = pStartL + colLimit8 + colLimit4;
1660 dataType *pSA0 = pRefL;
1663 __SE1_OPEN(pSE1, seMatrixParams);
1664 __SA1_OPEN(saMatrixParams);
1667 for (int32_t tile = 0; tile < nTiles2; tile++) {
1668 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1669 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1670 vec sV1 = __vload_pred(lPred, pLoadVec);
1672 lPred = c7x::strm_agen<0, vec>::get_vpred();
1673 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1674 vec sV2 = __vload_pred(lPred, pLoadVec);
1676 for (int32_t vertical = 0; vertical < nRows; vertical++) {
1677 vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
1679 vec v1 = c7x::strm_eng<1, vec>::get_adv();
1680 vec v2 = c7x::strm_eng<1, vec>::get_adv();
1682 v1 -= sV1 * scalarDup;
1683 v2 -= sV2 * scalarDup;
1685 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1686 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1687 __vstore_pred(sPred, pStoreVec, v1);
1689 sPred = c7x::strm_agen<1, vec>::get_vpred();
1690 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1691 __vstore_pred(sPred, pStoreVec, v2);
// ---- pLocalL update, single-vector tiles ----
1701 seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile1;
1702 seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = nRows;
1703 seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles1;
1704 seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile1;
1705 seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit1;
1707 dataType *pSE1 = pStartL + colLimit8 + colLimit4 + colLimit2;
1708 dataType *pSA1 = pStartL + colLimit8 + colLimit4 + colLimit2;
1709 dataType *pSA0 = pRefL;
1712 __SE1_OPEN(pSE1, seMatrixParams);
1713 __SA1_OPEN(saMatrixParams);
1716 for (int32_t tile = 0; tile < nTiles1; tile++) {
1717 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1718 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1719 vec sV1 = __vload_pred(lPred, pLoadVec);
1721 for (int32_t vertical = 0; vertical < nRows; vertical++) {
1722 vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
1724 vec v1 = c7x::strm_eng<1, vec>::get_adv();
1726 v1 -= sV1 * scalarDup;
1728 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1729 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1730 __vstore_pred(sPred, pStoreVec, v1);
// ==== Second pass: the same row operations applied to pLocalInvL ====
// The reference-row generators are reopened so SA0 starts again at the
// beginning of the reference row. SE0 is not reopened in the visible lines.
1741 __SA0_OPEN(saRefParams);
1742 __SA2_OPEN(saRefStoreParams);
// ---- pLocalInvL update, tiles of 8 vectors ----
1747 seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile8;
1748 seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = nRows;
1749 seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles8;
1750 seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile8;
1751 seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit8;
1754 __SE1_OPEN(pStartInvL, seMatrixParams);
1755 __SA1_OPEN(saMatrixParams);
1758 for (int32_t tile = 0; tile < nTiles8; tile++) {
// Eight predicated vector loads from row `col` of pLocalInvL.
1759 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1760 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefInvL);
1761 vec sV1 = __vload_pred(lPred, pLoadVec);
1763 lPred = c7x::strm_agen<0, vec>::get_vpred();
1764 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefInvL);
1765 vec sV2 = __vload_pred(lPred, pLoadVec);
1767 lPred = c7x::strm_agen<0, vec>::get_vpred();
1768 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefInvL);
1769 vec sV3 = __vload_pred(lPred, pLoadVec);
1771 lPred = c7x::strm_agen<0, vec>::get_vpred();
1772 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefInvL);
1773 vec sV4 = __vload_pred(lPred, pLoadVec);
1775 lPred = c7x::strm_agen<0, vec>::get_vpred();
1776 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefInvL);
1777 vec sV5 = __vload_pred(lPred, pLoadVec);
1779 lPred = c7x::strm_agen<0, vec>::get_vpred();
1780 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefInvL);
1781 vec sV6 = __vload_pred(lPred, pLoadVec);
1783 lPred = c7x::strm_agen<0, vec>::get_vpred();
1784 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefInvL);
1785 vec sV7 = __vload_pred(lPred, pLoadVec);
1787 lPred = c7x::strm_agen<0, vec>::get_vpred();
1788 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefInvL);
1789 vec sV8 = __vload_pred(lPred, pLoadVec);
1790 for (int32_t vertical = 0; vertical < nRows; vertical++) {
1791 vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
1793 vec v1 = c7x::strm_eng<1, vec>::get_adv();
1794 vec v2 = c7x::strm_eng<1, vec>::get_adv();
1795 vec v3 = c7x::strm_eng<1, vec>::get_adv();
1796 vec v4 = c7x::strm_eng<1, vec>::get_adv();
1797 vec v5 = c7x::strm_eng<1, vec>::get_adv();
1798 vec v6 = c7x::strm_eng<1, vec>::get_adv();
1799 vec v7 = c7x::strm_eng<1, vec>::get_adv();
1800 vec v8 = c7x::strm_eng<1, vec>::get_adv();
1802 v1 -= sV1 * scalarDup;
1803 v2 -= sV2 * scalarDup;
1804 v3 -= sV3 * scalarDup;
1805 v4 -= sV4 * scalarDup;
1806 v5 -= sV5 * scalarDup;
1807 v6 -= sV6 * scalarDup;
1808 v7 -= sV7 * scalarDup;
1809 v8 -= sV8 * scalarDup;
1811 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1812 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartInvL);
1813 __vstore_pred(sPred, pStoreVec, v1);
1815 sPred = c7x::strm_agen<1, vec>::get_vpred();
1816 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartInvL);
1817 __vstore_pred(sPred, pStoreVec, v2);
1819 sPred = c7x::strm_agen<1, vec>::get_vpred();
1820 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartInvL);
1821 __vstore_pred(sPred, pStoreVec, v3);
1823 sPred = c7x::strm_agen<1, vec>::get_vpred();
1824 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartInvL);
1825 __vstore_pred(sPred, pStoreVec, v4);
1827 sPred = c7x::strm_agen<1, vec>::get_vpred();
1828 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartInvL);
1829 __vstore_pred(sPred, pStoreVec, v5);
1831 sPred = c7x::strm_agen<1, vec>::get_vpred();
1832 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartInvL);
1833 __vstore_pred(sPred, pStoreVec, v6);
1835 sPred = c7x::strm_agen<1, vec>::get_vpred();
1836 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartInvL);
1837 __vstore_pred(sPred, pStoreVec, v7);
1839 sPred = c7x::strm_agen<1, vec>::get_vpred();
1840 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartInvL);
1841 __vstore_pred(sPred, pStoreVec, v8);
// ---- pLocalInvL update, tiles of 4 vectors ----
1852 seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile4;
1853 seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = nRows;
1854 seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles4;
1855 seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile4;
1856 seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit4;
1858 dataType *pSE1 = pStartInvL + colLimit8;
1859 dataType *pSA1 = pStartInvL + colLimit8;
1860 dataType *pSA0 = pRefInvL;
1863 __SE1_OPEN(pSE1, seMatrixParams);
1864 __SA1_OPEN(saMatrixParams);
1867 for (int32_t tile = 0; tile < nTiles4; tile++) {
1868 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1869 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1870 vec sV1 = __vload_pred(lPred, pLoadVec);
1872 lPred = c7x::strm_agen<0, vec>::get_vpred();
1873 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1874 vec sV2 = __vload_pred(lPred, pLoadVec);
1876 lPred = c7x::strm_agen<0, vec>::get_vpred();
1877 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1878 vec sV3 = __vload_pred(lPred, pLoadVec);
1880 lPred = c7x::strm_agen<0, vec>::get_vpred();
1881 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1882 vec sV4 = __vload_pred(lPred, pLoadVec);
1884 for (int32_t vertical = 0; vertical < nRows; vertical++) {
1885 vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
1887 vec v1 = c7x::strm_eng<1, vec>::get_adv();
1888 vec v2 = c7x::strm_eng<1, vec>::get_adv();
1889 vec v3 = c7x::strm_eng<1, vec>::get_adv();
1890 vec v4 = c7x::strm_eng<1, vec>::get_adv();
1892 v1 -= sV1 * scalarDup;
1893 v2 -= sV2 * scalarDup;
1894 v3 -= sV3 * scalarDup;
1895 v4 -= sV4 * scalarDup;
1897 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1898 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1899 __vstore_pred(sPred, pStoreVec, v1);
1901 sPred = c7x::strm_agen<1, vec>::get_vpred();
1902 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1903 __vstore_pred(sPred, pStoreVec, v2);
1905 sPred = c7x::strm_agen<1, vec>::get_vpred();
1906 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1907 __vstore_pred(sPred, pStoreVec, v3);
1909 sPred = c7x::strm_agen<1, vec>::get_vpred();
1910 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1911 __vstore_pred(sPred, pStoreVec, v4);
// ---- pLocalInvL update, tiles of 2 vectors ----
1921 seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile2;
1922 seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = nRows;
1923 seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles2;
1924 seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile2;
1925 seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit2;
1927 dataType *pSE1 = pStartInvL + colLimit8 + colLimit4;
1928 dataType *pSA1 = pStartInvL + colLimit8 + colLimit4;
1929 dataType *pSA0 = pRefInvL;
1932 __SE1_OPEN(pSE1, seMatrixParams);
1933 __SA1_OPEN(saMatrixParams);
1936 for (int32_t tile = 0; tile < nTiles2; tile++) {
1937 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1938 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1939 vec sV1 = __vload_pred(lPred, pLoadVec);
1941 lPred = c7x::strm_agen<0, vec>::get_vpred();
1942 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1943 vec sV2 = __vload_pred(lPred, pLoadVec);
1945 for (int32_t vertical = 0; vertical < nRows; vertical++) {
1946 vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
1948 vec v1 = c7x::strm_eng<1, vec>::get_adv();
1949 vec v2 = c7x::strm_eng<1, vec>::get_adv();
1951 v1 -= sV1 * scalarDup;
1952 v2 -= sV2 * scalarDup;
1954 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1955 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1956 __vstore_pred(sPred, pStoreVec, v1);
1958 sPred = c7x::strm_agen<1, vec>::get_vpred();
1959 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1960 __vstore_pred(sPred, pStoreVec, v2);
// ---- pLocalInvL update, single-vector tiles ----
1971 seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile1;
1972 seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = nRows;
1973 seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles1;
1974 seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile1;
1975 seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit1;
1977 dataType *pSE1 = pStartInvL + colLimit8 + colLimit4 + colLimit2;
1978 dataType *pSA1 = pStartInvL + colLimit8 + colLimit4 + colLimit2;
1979 dataType *pSA0 = pRefInvL;
1982 __SE1_OPEN(pSE1, seMatrixParams);
1983 __SA1_OPEN(saMatrixParams);
1986 for (int32_t tile = 0; tile < nTiles1; tile++) {
1987 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1988 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1989 vec sV1 = __vload_pred(lPred, pLoadVec);
1991 for (int32_t vertical = 0; vertical < nRows; vertical++) {
1992 vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
1994 vec v1 = c7x::strm_eng<1, vec>::get_adv();
1996 v1 -= sV1 * scalarDup;
1998 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1999 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
2000 __vstore_pred(sPred, pStoreVec, v1);
2017 int32_t colStrideOrder,
2018 int32_t colInvLStride,
2024 int32_t colStrideOrder,
2025 int32_t colInvLStride,
// DSPLIB_lud_inv_exec_ci
//
// Top-level execution routine: builds invL and invU from the L and U factors,
// multiplies them, and applies the row permutation to produce the result in
// pinvA. Several lines of the original (the start of the signature, the invU
// assignment, the matMul argument setup) are not part of this listing.
//
// Parameters visible here:
//   pinvA    - output buffer; also used as storage for invL during the run
//   pStratch - scratch buffer; used first as the factor array and later
//              reinterpreted as the uint32_t permutation index array
2029 template <
typename dataType>
2034 void *restrict pinvA,
2035 void *restrict pStratch)
2041 int32_t order = pKerPrivArgs->
order;
2043 int32_t strideP = pKerPrivArgs->
strideP;
2044 int32_t dataSize =
sizeof(dataType);
2045 int32_t dataSizeP =
sizeof(uint16_t);
// Convert the byte strides into element strides.
2047 int32_t orderStride = strideOrder / dataSize;
2048 int32_t orderPStride = strideP / dataSizeP;
// Typed views of the caller-supplied buffers.
2051 unsigned short *pPLocal = (
unsigned short *) pP;
2052 dataType *pLLocal = (dataType *) pL;
2053 dataType *pULocal = (dataType *) pU;
2054 dataType *pinvALocal = (dataType *) pinvA;
2055 dataType *pFactArray = (dataType *) pStratch;
2056 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
2059 DSPLIB_DEBUGPRINTFN(0,
"pPLocal: %p pLLocal: %p pULocal: %p pinvALocal: %p order: %d\n", pPLocal, pLLocal, pULocal,
2062 dataType *invL, *invU, *invU_x_invL;
// invL lives at the start of the output buffer: seed it with the identity,
// then run the elimination on L, which applies the same row operations to invL.
2064 invL = &pinvALocal[0];
2065 DSPLIB_lud_identity_matrix_generate_exec_ci<dataType>(invL, order, orderStride, pBlock);
2066 DSPLIB_lud_inv_invL_exec_ci<dataType>(pLLocal, invL, order, orderStride, orderStride, pFactArray, pBlock);
// invU is seeded with the identity as well. The line that assigns invU is
// not part of this listing.
2070 DSPLIB_lud_identity_matrix_generate_exec_ci<dataType>(invU, order, orderStride, pBlock);
// The product invU * invL is written over the U input buffer.
2081 invU_x_invL = &pULocal[0];
2083 DSPLIB_matMul_exec_ci<dataType>(pMatMulKerPrivArgs, invU, invL, invU_x_invL);
// The scratch buffer is reused as the permutation index array.
2085 uint32_t *permuteOrder = (uint32_t *) pFactArray;
// Transpose into pinvA, permute rows back into the product buffer, then
// transpose into pinvA again (presumably the second and third arguments of
// DSPLIB_matTrans_exec_ci are input and output - confirm against its declaration).
2090 DSPLIB_matTrans_exec_ci<dataType>(&pKerPrivArgs->
pMatTransKerPrivArgs, invU_x_invL, pinvALocal);
2093 DSPLIB_lud_inv_permuteRows_ci<dataType>(pinvALocal, order, orderStride, permuteOrder, invU_x_invL, pBlock);
2096 DSPLIB_matTrans_exec_ci<dataType>(&pKerPrivArgs->
pMatTransKerPrivArgs, invU_x_invL, pinvALocal);
2107 void *restrict pinvA,
2108 void *restrict pStratch);
2114 void *restrict pinvA,
2115 void *restrict pStratch);
template void DSPLIB_lud_inv_invU_exec_ci< double >(double *pLocalU, double *pLocalInvU, int32_t nCols, int32_t colStrideOrder, int32_t colInvAStride, double *factArray, uint8_t *pBlock)
template void DSPLIB_lud_inv_opt_init_ci< float >(DSPLIB_kernelHandle handle)
template void DSPLIB_lud_inv_invU_exec_ci< float >(float *pLocalU, float *pLocalInvU, int32_t nCols, int32_t colStrideOrder, int32_t colInvAStride, float *factArray, uint8_t *pBlock)
template DSPLIB_STATUS DSPLIB_lud_inv_init_ci< float >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsP, const DSPLIB_bufParams2D_t *bufParamsL, const DSPLIB_bufParams2D_t *bufParamsU, const DSPLIB_bufParams2D_t *bufParamsinvA, const DSPLIB_lud_invInitArgs *pKerInitArgs)
template void DSPLIB_lud_inv_invL_exec_ci< float >(float *pLocalL, float *pLocalInvL, int32_t nCols, int32_t colStrideOrder, int32_t colInvLStride, float *factArray, uint8_t *pBlock)
template void DSPLIB_lud_inv_permuteRows_ci< float >(float *pIn, int32_t order, int32_t colStride, uint32_t *permuteOrder, float *pOut, uint8_t *pBlock)
void DSPLIB_lud_inv_opt_init_ci(DSPLIB_kernelHandle handle)
void DSPLIB_lud_inv_permuteIndex_ci(unsigned short *pIn, int32_t order, int32_t colPStride, uint32_t *permuteOrder, uint8_t *pBlock)
template void DSPLIB_lud_inv_permuteRows_init_ci< double >(DSPLIB_kernelHandle handle)
void DSPLIB_lud_inv_permuteRows_init_ci(DSPLIB_kernelHandle handle)
template void DSPLIB_lud_inv_permuteRows_ci< double >(double *pIn, int32_t order, int32_t colStride, uint32_t *permuteOrder, double *pOut, uint8_t *pBlock)
template void DSPLIB_lud_inv_permuteRows_init_ci< float >(DSPLIB_kernelHandle handle)
DSPLIB_STATUS DSPLIB_lud_inv_init_ci(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsP, const DSPLIB_bufParams2D_t *bufParamsL, const DSPLIB_bufParams2D_t *bufParamsU, const DSPLIB_bufParams2D_t *bufParamsinvA, const DSPLIB_lud_invInitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
template void DSPLIB_lud_inv_invL_exec_ci< double >(double *pLocalL, double *pLocalInvL, int32_t nCols, int32_t colStrideOrder, int32_t colInvLStride, double *factArray, uint8_t *pBlock)
template DSPLIB_STATUS DSPLIB_lud_inv_exec_ci< double >(DSPLIB_kernelHandle handle, void *restrict pP, void *restrict pL, void *restrict pU, void *restrict pinvA, void *restrict pStratch)
template DSPLIB_STATUS DSPLIB_lud_inv_exec_ci< float >(DSPLIB_kernelHandle handle, void *restrict pP, void *restrict pL, void *restrict pU, void *restrict pinvA, void *restrict pStratch)
template DSPLIB_STATUS DSPLIB_lud_inv_init_ci< double >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsP, const DSPLIB_bufParams2D_t *bufParamsL, const DSPLIB_bufParams2D_t *bufParamsU, const DSPLIB_bufParams2D_t *bufParamsinvA, const DSPLIB_lud_invInitArgs *pKerInitArgs)
static dataType DSPLIB_lud_inv_factor_exec_ci(dataType *pCol, int32_t colStride, int32_t nRows, dataType *pFactor, vec scaleVec, uint8_t *pBlock, __SE_TEMPLATE_v1 se0Params, __SE_TEMPLATE_v1 se1Params, __SA_TEMPLATE_v1 sa0Params, __SA_TEMPLATE_v1 sa1Params)
template void DSPLIB_lud_inv_opt_init_ci< double >(DSPLIB_kernelHandle handle)
void DSPLIB_lud_inv_invU_exec_ci(dataType *pLocalU, dataType *pLocalInvU, int32_t nCols, int32_t colStrideOrder, int32_t colInvAStride, dataType *factArray, uint8_t *pBlock)
void DSPLIB_lud_inv_permuteRows_ci(dataType *pIn, int32_t order, int32_t colStride, uint32_t *permuteOrder, dataType *pOut, uint8_t *pBlock)
DSPLIB_STATUS DSPLIB_lud_inv_exec_ci(DSPLIB_kernelHandle handle, void *restrict pP, void *restrict pL, void *restrict pU, void *restrict pinvA, void *restrict pStratch)
This function is the main execution function for the C7x implementation of the kernel....
void DSPLIB_lud_inv_invL_exec_ci(dataType *pLocalL, dataType *pLocalInvL, int32_t nCols, int32_t colStrideOrder, int32_t colInvLStride, dataType *factArray, uint8_t *pBlock)
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_lud_inv.
#define DSPLIB_DEBUGPRINTFN(N, fmt,...)
DSPLIB_STATUS_NAME
The enumeration of all status codes.
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
A structure for a 2 dimensional buffer descriptor.
int32_t stride_y
Stride in Y dimension in bytes.
uint32_t dim_x
Width of buffer in X dimension in elements.
uint32_t dim_y
Height of buffer in Y dimension in elements.
Structure containing the parameters to initialize the kernel.
int8_t funcStyle
Variant of the function; refer to DSPLIB_FUNCTION_STYLE.
Structure that is reserved for internal use by the kernel.
DSPLIB_matMul_PrivArgs pMatMulKerPrivArgs
Privargs for the matMul kernel.
uint8_t bufPblock[DSPLIB_LUD_INV_IXX_IXX_OXX_PBLOCK_SIZE]
Buffer to save SE & SA configuration parameters
DSPLIB_matTrans_PrivArgs pMatTransKerPrivArgs
Privargs for the matTrans kernel.
int32_t strideOrder
Stride between rows of input and output data matrix
int32_t order
Size of input buffer for different batches DSPLIB_lud_inv_init that will be retrieved and used by DSP...
int32_t strideP
Stride between rows of output data matrix P
Structure containing the parameters to initialize the kernel.
int8_t funcStyle
Variant of the function; refer to DSPLIB_FUNCTION_STYLE.
Structure that is reserved for internal use by the kernel.
int32_t strideIn1Elements
int32_t strideIn0Elements
int32_t strideOutElements
Structure containing the parameters to initialize the kernel.
uint32_t dimX
Size of input data.
int8_t funcStyle
Variant of the function; refer to DSPLIB_FUNCTION_STYLE.
Structure that is reserved for internal use by the kernel.
int32_t strideOut
Stride between rows of output data matrix
uint32_t heightIn
Height of input data matrix
int32_t strideIn
Stride between rows of input data matrix
uint32_t widthIn
Size of input buffer for different batches DSPLIB_matTrans_init that will be retrieved and used by DS...