47 #include "../common/DSPLIB_inlines.h"
51 #define LUD_HIGH_PRECISION
// Body of the templated init routine (the cross-reference at the end of this
// listing names it DSPLIB_lud_init_ci). It pre-builds the SE/SA templates that
// the exec-time helpers later read back, storing each one in the kernel's
// parameter block at a fixed slot, i.e. at pBlock + slot * SE_PARAM_SIZE.
// NOTE(review): this listing omits the function signature, the declaration of
// strideOrder and the closing braces, so those cannot be documented here.
//
// Slots written below:
//    1 / 2   : SA / SE, 1-D over `order` elements, dataType vectors
//    3       : SE, 3-D with TRANSPOSE set
//    4 / 5   : SA, 2-D, one element per access, DIM1 = colStrideOrder
//    6       : SE "reference" template (ICNT0 = 4 vectors)
//    7 / 8   : SE / SA, 4-D, tiles of 4 vectors
//    9 / 10  : SE / SA, 4-D variant with ICNT3 = 1, DIM3 = 0
//   11 / 12  : SA / SE, 1-D over `order` elements, unsigned short vectors
//   13 / 14  : SE / SA, 2-D, order x order
//   15 / 16  : SE / SA column templates
//   17       : handed to DSPLIB_lud_blk_move_init_ci
//   19/20/21 : SE scalar, SE matrix, SA matrix (tiles of 8 vectors)
58 template <
typename dataType>
// Values cached in the kernel's private arguments.
69 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
71 int32_t strideP = pKerPrivArgs->
strideP;
72 int32_t order = pKerPrivArgs->
order;
// Row stride divided by the element size, i.e. the stride in elements
// (assuming strideOrder is a byte stride -- TODO confirm).
73 int32_t colStrideOrder = strideOrder /
sizeof(dataType);
// The block-move helper sets up its own parameters starting at slot 17.
76 DSPLIB_lud_blk_move_init_ci<dataType>(&pBlock[17 * SE_PARAM_SIZE], order, colStrideOrder);
// Full-width vector type for dataType plus the SE/SA enum values and the
// lane count derived from it.
78 typedef typename c7x::make_full_vector<dataType>::type vec;
79 __SE_ELETYPE SE_ELETYPE = c7x::se_eletype<vec>::value;
80 __SE_VECLEN SE_VECLEN = c7x::se_veclen<vec>::value;
81 __SA_VECLEN SA_VECLEN = c7x::sa_veclen<vec>::value;
82 int32_t eleCount = c7x::element_count_of<vec>::value;
// --- Slot 3: 3-D SE template with transpose enabled. The exec routine loads
// this slot as se0MinMax and passes it to the max/min index search, which
// fills in ICNT2 at run time.
85 __SE_TEMPLATE_v1 se0Params = __gen_SE_TEMPLATE_v1();
88 se0Params.ICNT1 = eleCount;
89 se0Params.DIM1 = colStrideOrder;
90 se0Params.DIM2 = colStrideOrder * eleCount;
91 se0Params.DIMFMT = __SE_DIMFMT_3D;
92 se0Params.ELETYPE = SE_ELETYPE;
93 se0Params.VECLEN = SE_VECLEN;
// Transpose granularity follows the element size: 32-bit for 4-byte types.
// NOTE(review): the 64-bit assignment is presumably the else branch; the
// else line and the braces are missing from this listing.
94 if (
sizeof(dataType) == 4) {
95 se0Params.TRANSPOSE = __SE_TRANSPOSE_32BIT;
98 se0Params.TRANSPOSE = __SE_TRANSPOSE_64BIT;
101 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE)) = se0Params;
// --- Slots 1 (SA) and 2 (SE): plain 1-D access over `order` elements of
// dataType. The exec routine uses them for the row swap in U; slot 1 is also
// read by the U-generate helper as its reference SA template.
103 __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();
104 se0Params = __gen_SE_TEMPLATE_v1();
106 sa0Params.ICNT0 = order;
107 sa0Params.DIMFMT = __SA_DIMFMT_1D;
108 sa0Params.VECLEN = SA_VECLEN;
110 se0Params.ICNT0 = order;
111 se0Params.DIMFMT = __SE_DIMFMT_1D;
112 se0Params.ELETYPE = SE_ELETYPE;
113 se0Params.VECLEN = SE_VECLEN;
115 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE)) = sa0Params;
116 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE)) = se0Params;
// --- Slots 11 (SA) and 12 (SE): the same 1-D layout, but sized for
// unsigned short vectors. The exec routine uses them for the row swap in P.
119 typedef typename c7x::make_full_vector<unsigned short>::type vecShort;
121 __SE_ELETYPE SE_ELETYPE_SHORT = c7x::se_eletype<vecShort>::value;
122 __SE_VECLEN SE_VECLEN_SHORT = c7x::se_veclen<vecShort>::value;
123 __SA_VECLEN SA_VECLEN_SHORT = c7x::sa_veclen<vecShort>::value;
125 sa0Params = __gen_SA_TEMPLATE_v1();
126 se0Params = __gen_SE_TEMPLATE_v1();
128 sa0Params.ICNT0 = order;
129 sa0Params.DIMFMT = __SA_DIMFMT_1D;
130 sa0Params.VECLEN = SA_VECLEN_SHORT;
132 se0Params.ICNT0 = order;
133 se0Params.DIMFMT = __SE_DIMFMT_1D;
134 se0Params.ELETYPE = SE_ELETYPE_SHORT;
135 se0Params.VECLEN = SE_VECLEN_SHORT;
137 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (11 * SE_PARAM_SIZE)) = sa0Params;
138 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (12 * SE_PARAM_SIZE)) = se0Params;
// --- Slots 4-8: templates built around tiles of lenTile4 (= 4) vectors.
// NOTE(review): no reader of slots 4-10 appears in this listing.
141 __SE_TEMPLATE_v1 seRefParams = __gen_SE_TEMPLATE_v1();
142 __SE_TEMPLATE_v1 seMatrixParams = __gen_SE_TEMPLATE_v1();
143 __SA_TEMPLATE_v1 saMatrixParams = __gen_SA_TEMPLATE_v1();
144 __SA_TEMPLATE_v1 sa2Params = __gen_SA_TEMPLATE_v1();
146 sa0Params = __gen_SA_TEMPLATE_v1();
148 int32_t lenTile4 = 4;
// Slots 4 and 5: two identically configured SA templates that step one
// element at a time with a second-dimension stride of one row.
151 sa0Params.DIMFMT = __SA_DIMFMT_2D;
152 sa0Params.VECLEN = __SA_VECLEN_1ELEM;
153 sa0Params.DIM1 = colStrideOrder;
156 sa2Params.DIMFMT = __SA_DIMFMT_2D;
157 sa2Params.VECLEN = __SA_VECLEN_1ELEM;
158 sa2Params.DIM1 = colStrideOrder;
// Slot 6: SE template covering 4 vectors per ICNT0, with DIM2 = 0.
160 seRefParams.DIM1 = (eleCount * lenTile4);
161 seRefParams.DIM2 = 0;
162 seRefParams.DIMFMT = __SE_DIMFMT_3D;
163 seRefParams.VECLEN = SE_VECLEN;
164 seRefParams.ELETYPE = SE_ELETYPE;
165 seRefParams.ICNT0 = (eleCount * lenTile4);
166 seRefParams.DECDIM2 = __SE_DECDIM_DIM1;
// Slots 7 (SE) and 8 (SA): matching 4-D templates; ICNT0 spans 4 vectors,
// DIM1 steps by one row, DIM2 by one tile width, DIM3 by 4 rows.
168 seMatrixParams.ICNT0 = (eleCount * lenTile4);
169 seMatrixParams.DIM1 = colStrideOrder;
170 seMatrixParams.ICNT1 = lenTile4;
171 seMatrixParams.DIM2 = (eleCount * lenTile4);
172 seMatrixParams.DIM3 = (colStrideOrder * lenTile4);
173 seMatrixParams.DIMFMT = __SE_DIMFMT_4D;
174 seMatrixParams.ELETYPE = SE_ELETYPE;
175 seMatrixParams.VECLEN = SE_VECLEN;
176 seMatrixParams.DECDIM2 = __SE_DECDIM_DIM2;
178 saMatrixParams.ICNT0 = (eleCount * lenTile4);
179 saMatrixParams.DIM1 = colStrideOrder;
180 saMatrixParams.ICNT1 = lenTile4;
181 saMatrixParams.DIM2 = (eleCount * lenTile4);
182 saMatrixParams.DIM3 = (colStrideOrder * lenTile4);
183 saMatrixParams.DIMFMT = __SA_DIMFMT_4D;
184 saMatrixParams.VECLEN = SA_VECLEN;
185 saMatrixParams.DECDIM2 = __SA_DECDIM_DIM2;
187 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE)) = sa0Params;
188 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE)) = sa2Params;
189 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (6 * SE_PARAM_SIZE)) = seRefParams;
190 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (7 * SE_PARAM_SIZE)) = seMatrixParams;
191 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (8 * SE_PARAM_SIZE)) = saMatrixParams;
// --- Slots 9 (SE) and 10 (SA): second 4-D variant. Here DIM1 is the tile
// width and DIM2 the row stride (swapped relative to slots 7/8), with a
// single iteration in the outermost dimension.
194 seMatrixParams = __gen_SE_TEMPLATE_v1();
195 saMatrixParams = __gen_SA_TEMPLATE_v1();
197 seMatrixParams.ICNT0 = (eleCount * lenTile4);
198 seMatrixParams.DIM2 = colStrideOrder;
199 seMatrixParams.DIM1 = (eleCount * lenTile4);
200 seMatrixParams.ICNT3 = 1;
201 seMatrixParams.DIM3 = 0;
202 seMatrixParams.DIMFMT = __SE_DIMFMT_4D;
203 seMatrixParams.ELETYPE = SE_ELETYPE;
204 seMatrixParams.VECLEN = SE_VECLEN;
205 seMatrixParams.DECDIM2 = __SE_DECDIM_DIM1;
207 saMatrixParams.ICNT0 = (eleCount * lenTile4);
208 saMatrixParams.DIM2 = colStrideOrder;
209 saMatrixParams.DIM1 = (eleCount * lenTile4);
210 saMatrixParams.ICNT3 = 1;
211 saMatrixParams.DIM3 = 0;
212 saMatrixParams.DIMFMT = __SA_DIMFMT_4D;
213 saMatrixParams.VECLEN = SA_VECLEN;
214 saMatrixParams.DECDIM2 = __SA_DECDIM_DIM1;
216 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (9 * SE_PARAM_SIZE)) = seMatrixParams;
217 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (10 * SE_PARAM_SIZE)) = saMatrixParams;
// --- Slots 13 (SE) and 14 (SA): 2-D walk over an order x order matrix with
// a row stride of colStrideOrder. The extract helper reads these two slots.
221 seMatrixParams = __gen_SE_TEMPLATE_v1();
222 saMatrixParams = __gen_SA_TEMPLATE_v1();
224 seMatrixParams.ICNT0 = order;
225 seMatrixParams.ICNT1 = order;
226 seMatrixParams.DIM1 = colStrideOrder;
227 seMatrixParams.DIMFMT = __SE_DIMFMT_2D;
228 seMatrixParams.ELETYPE = SE_ELETYPE;
229 seMatrixParams.VECLEN = SE_VECLEN;
231 saMatrixParams.ICNT0 = order;
232 saMatrixParams.ICNT1 = order;
233 saMatrixParams.DIM1 = colStrideOrder;
234 saMatrixParams.DIMFMT = __SA_DIMFMT_2D;
235 saMatrixParams.VECLEN = SA_VECLEN;
237 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (13 * SE_PARAM_SIZE)) = seMatrixParams;
238 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (14 * SE_PARAM_SIZE)) = saMatrixParams;
// --- Slots 15 (SE) and 16 (SA): column templates used by the column-divide
// helper. The SE steps two rows per DIM1 increment; the SA steps one row and
// accesses a single element. ICNT1 is filled in by the helper at run time.
242 __SE_TEMPLATE_v1 seColParams = __gen_SE_TEMPLATE_v1();
243 __SA_TEMPLATE_v1 saColParams = __gen_SA_TEMPLATE_v1();
245 seColParams.ICNT0 = eleCount;
246 seColParams.DIM1 = 2 * colStrideOrder;
247 seColParams.DIMFMT = __SE_DIMFMT_2D;
248 seColParams.ELETYPE = SE_ELETYPE;
249 seColParams.VECLEN = SE_VECLEN;
251 saColParams.ICNT0 = 1;
252 saColParams.DIM1 = colStrideOrder;
253 saColParams.DIMFMT = __SA_DIMFMT_2D;
254 saColParams.VECLEN = __SA_VECLEN_1ELEM;
256 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (15 * SE_PARAM_SIZE)) = seColParams;
257 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (16 * SE_PARAM_SIZE)) = saColParams;
// --- Slots 19-21: defaults for the U-generate helper, built around tiles of
// lenTile8 (= 8) vectors. That helper overwrites ICNT0/ICNT1/ICNT2/DIM2 and
// DECDIM1_WIDTH per tile size before opening the streams.
261 int32_t lenTile8 = 8;
262 int32_t nTiles_8 = DSPLIB_ceilingDiv(order, (eleCount * lenTile8));
264 __SE_ELEDUP SE_ELEDUP = c7x::se_eledup<dataType>::value;
265 __SE_TEMPLATE_v1 seScalarParams = __gen_SE_TEMPLATE_v1();
266 seMatrixParams = __gen_SE_TEMPLATE_v1();
267 saMatrixParams = __gen_SA_TEMPLATE_v1();
// Slot 19: reads one element at a time (ICNT0 = 1) with element duplication
// enabled, stepping one row per DIM1 increment; DIM2 = 0 restarts at the
// same base for every outer iteration.
269 seScalarParams.ICNT0 = 1;
270 seScalarParams.DIM1 = colStrideOrder;
271 seScalarParams.DIM2 = 0;
272 seScalarParams.ELEDUP = SE_ELEDUP;
273 seScalarParams.DIMFMT = __SE_DIMFMT_3D;
274 seScalarParams.VECLEN = SE_VECLEN;
275 seScalarParams.ELETYPE = SE_ELETYPE;
// Slots 20 (SE) and 21 (SA): matching 3-D tile templates with the DECDIM1
// width initialised to `order`.
277 seMatrixParams.ICNT0 = (eleCount * lenTile8);
278 seMatrixParams.DIM1 = colStrideOrder;
279 seMatrixParams.ICNT2 = nTiles_8;
280 seMatrixParams.DIM2 = (eleCount * lenTile8);
281 seMatrixParams.DIMFMT = __SE_DIMFMT_3D;
282 seMatrixParams.ELETYPE = SE_ELETYPE;
283 seMatrixParams.VECLEN = SE_VECLEN;
284 seMatrixParams.DECDIM1 = __SE_DECDIM_DIM2;
285 seMatrixParams.DECDIM1_WIDTH = order;
287 saMatrixParams.ICNT0 = (eleCount * lenTile8);
288 saMatrixParams.DIM1 = colStrideOrder;
289 saMatrixParams.ICNT2 = nTiles_8;
290 saMatrixParams.DIM2 = (eleCount * lenTile8);
291 saMatrixParams.DIMFMT = __SA_DIMFMT_3D;
292 saMatrixParams.VECLEN = SA_VECLEN;
293 saMatrixParams.DECDIM1 = __SA_DECDIM_DIM2;
294 saMatrixParams.DECDIM1_WIDTH = order;
296 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (19 * SE_PARAM_SIZE)) = seScalarParams;
297 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (20 * SE_PARAM_SIZE)) = seMatrixParams;
298 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (21 * SE_PARAM_SIZE)) = saMatrixParams;
// Forward declaration: returns a vector predicate derived from `idx`.
// The max/min index search calls it with the number of leftover rows to
// build the mask for its final, partially filled vector. The float and
// double specialisations named in this file's cross-reference are not part
// of this listing.
322 template <
typename dataType>
inline __vpred
getPMask(uint32_t idx);
// Returns the starting index vector for element type `dataType`, typed as
// vector `V`. Only the primary template is declared; each supported type has
// an explicit specialisation below. The exec routine uses the result as the
// initial `idx_0_to_eleCount` argument of the max/min index search.
331 template <
typename V,
typename dataType>
inline V
getIdxVec();
// float: returns idx_float (defined outside this listing).
333 template <>
inline c7x::float_vec getIdxVec<c7x::float_vec, float>() {
return idx_float; }
// double: returns idx_double (defined outside this listing).
335 template <>
inline c7x::double_vec getIdxVec<c7x::double_vec, double>() {
return idx_double; }
// Searches the values streamed from pCol through SE0 and reports the
// positions of the largest and smallest of them through *max and *min.
// According to this file's cross-reference the full parameter list is
// (pCol, nRows, idx_0_to_eleCount, k, colStride, max, min, se0Params).
// NOTE(review): most of the signature, the braces, the declaration of vpMask
// and the guard around the tail section are missing from this listing.
337 template <typename dataType, typename vec = typename c7x::make_full_vector<dataType>::type>
340 vec idx_0_to_eleCount,
345 __SE_TEMPLATE_v1 se0Params)
// Split the rows into full vectors plus a remainder.
349 int32_t eleCount = c7x::element_count_of<vec>::value;
350 int32_t nVec = nRows / eleCount;
351 int32_t remRows = nRows - (nVec * eleCount);
// Running per-lane extremes, seeded with the most negative / most positive
// finite value so the first comparison replaces them.
353 vec maxValVec = (vec) (-std::numeric_limits<dataType>::max());
354 vec minValVec = (vec) (std::numeric_limits<dataType>::max());
356 vec zeroVec = (vec) 0;
// Per-lane mask for the final partial vector, derived from the leftover row
// count; lanes take either zero or the largest finite value.
358 __vpred predMask = getPMask<dataType>(remRows);
359 vec minMask = __select(predMask, zeroVec, minValVec);
// One outer SE iteration per vector, plus one more when there is a remainder.
361 se0Params.ICNT2 = nVec + ((remRows > 0) ? 1 : 0);
363 dataType maxVal = -std::numeric_limits<dataType>::max();
364 dataType minVal = std::numeric_limits<dataType>::max();
// Offset the lane indices by k so that the reported positions are k-based.
365 vec offSetVec = (vec) k;
367 idx_0_to_eleCount += offSetVec;
369 vec vCurrIdx = idx_0_to_eleCount;
370 vec vMaxIdx = idx_0_to_eleCount;
371 vec vMinIdx = idx_0_to_eleCount;
373 __SE0_OPEN(pCol, se0Params);
// Full vectors: update the per-lane max/min and the index vectors that go
// with them, then advance the current indices by one vector's worth of lanes.
// Note that the loop counter shadows the parameter k, which has already been
// consumed above.
375 for (int32_t k = 0; k < nVec; k++) {
376 vec v1 = c7x::strm_eng<0, vec>::get_adv();
379 vpMask = __cmp_le_pred(v1, maxValVec);
380 maxValVec = __select(vpMask, maxValVec, v1);
381 vMaxIdx = __select(vpMask, vMaxIdx, vCurrIdx);
383 vpMask = __cmp_le_pred(minValVec, v1);
384 minValVec = __select(vpMask, minValVec, v1);
385 vMinIdx = __select(vpMask, vMinIdx, vCurrIdx);
387 vCurrIdx = vCurrIdx + (eleCount);
// Tail vector. v3 combines the loaded data with minMask through __andn and
// feeds the max comparison; v4 = v3 + minMask feeds the min comparison.
// NOTE(review): presumably this keeps the lanes beyond remRows from winning
// either comparison -- confirm against the getPMask definition.
390 vec v1 = c7x::strm_eng<0, vec>::get_adv();
392 vec v3 = c7x::reinterpret<vec>(__andn(c7x::as_uchar_vec(minMask), c7x::as_uchar_vec(v1)));
393 vec v4 = v3 + minMask;
395 vpMask = __cmp_le_pred(v3, maxValVec);
396 maxValVec = __select(vpMask, maxValVec, v3);
397 vMaxIdx = __select(vpMask, vMaxIdx, vCurrIdx);
398 vpMask = __cmp_lt_pred(minValVec, v4);
399 minValVec = __select(vpMask, minValVec, v4);
400 vMinIdx = __select(vpMask, vMinIdx, vCurrIdx);
402 vCurrIdx = vCurrIdx + (eleCount);
// Reduce across lanes. The helper also writes *max, but the value is then
// overwritten: lanes are compared against the reduced maximum, the other
// lanes' indices are replaced by a large sentinel, and the horizontal
// minimum of the result is stored in *max.
405 c7x_horizontal_max_with_index(maxValVec, vMaxIdx, &maxVal, max);
406 vpMask = __cmp_eq_pred(maxValVec, (vec) maxVal);
407 vec tmpIdx = __select(vpMask, vMaxIdx, (vec) (std::numeric_limits<dataType>::max()));
408 *max = (int32_t) c7x_horizontal_min_fp<dataType, vec>(tmpIdx);
// Same reduction for the minimum.
410 c7x_horizontal_min_with_index(minValVec, vMinIdx, &minVal, min);
411 vpMask = __cmp_eq_pred(minValVec, (vec) minVal);
412 tmpIdx = __select(vpMask, vMinIdx, (vec) (std::numeric_limits<dataType>::max()));
413 *min = (int32_t) c7x_horizontal_min_fp<dataType, vec>(tmpIdx);
// NOTE(review): these look like the parameter tails of the float and double
// explicit instantiations of DSPLIB_lud_maxMinIndex_exec_ci listed in the
// cross-reference; their leading lines are missing from this listing.
420 typename c7x::make_full_vector<float>::type idx_0_to_eleCount,
425 __SE_TEMPLATE_v1 se0Params);
428 typename c7x::make_full_vector<double>::type idx_0_to_eleCount,
433 __SE_TEMPLATE_v1 se0Params);
// Exchanges the contents of two arrays one vector at a time.
// According to this file's cross-reference the full parameter list is
// (pArray1, pArray2, nCols, sa1Params, se1Params).
// Both arrays are read through stream engines (SE0 for pArray1, SE1 for
// pArray2) and written back crosswise through SA0/SA1 with predicated stores.
// NOTE(review): the start of the signature and the closing braces are
// missing from this listing; nCols is not referenced in the visible lines.
435 template <
typename dataType>
439 __SA_TEMPLATE_v1 sa1Params,
440 __SE_TEMPLATE_v1 se1Params)
444 typedef typename c7x::make_full_vector<dataType>::type vec;
445 int32_t eleCount = c7x::element_count_of<vec>::value;
// Number of vectors needed to cover the element count held in the SE template.
446 int32_t nTiles = DSPLIB_ceilingDiv(se1Params.ICNT0, eleCount);
// The same SE and SA templates are used for both arrays.
448 __SE0_OPEN(pArray1, se1Params);
449 __SA0_OPEN(sa1Params);
450 __SE1_OPEN(pArray2, se1Params);
451 __SA1_OPEN(sa1Params);
453 for (int32_t horizontal = 0; horizontal < nTiles; horizontal++) {
// Fetch the current vector of each array before either one is overwritten.
454 vec v11 = c7x::strm_eng<0, vec>::get_adv();
455 vec v21 = c7x::strm_eng<1, vec>::get_adv();
// Data from pArray1 goes to pArray2 ...
457 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
458 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pArray2);
459 __vstore_pred(sPred, pStoreVec, v11);
// ... and data from pArray2 goes to pArray1.
461 sPred = c7x::strm_agen<0, vec>::get_vpred();
462 pStoreVec = c7x::strm_agen<0, vec>::get_adv(pArray1);
463 __vstore_pred(sPred, pStoreVec, v21);
// NOTE(review): these look like the parameter tails of the explicit
// instantiations of DSPLIB_lud_array_swap_exec_ci (the cross-reference lists
// float, double and unsigned short); their leading lines are missing from
// this listing.
476 __SA_TEMPLATE_v1 sa1Params,
477 __SE_TEMPLATE_v1 se1Params);
481 __SA_TEMPLATE_v1 sa1Params,
482 __SE_TEMPLATE_v1 se1Params);
484 unsigned short *pArray2,
486 __SA_TEMPLATE_v1 sa1Params,
487 __SE_TEMPLATE_v1 se1Params);
// Column-divide step: computes the reciprocal of the leading element
// pCol[0] and streams the entries below it (starting at pCol + colStride)
// through SE0/SE1, writing them back through SA0.
// According to this file's cross-reference the full parameter list is
// (pCol, nRows, colStride, saColParams, se0ColParams, se1ColParams).
// NOTE(review): the start of the signature, the #else/#endif lines, the
// braces and the statements between each load and its store are missing from
// this listing; the multiplication by `recip` presumably sits in those gaps.
488 template <
typename dataType>
492 __SA_TEMPLATE_v1 saColParams,
493 __SE_TEMPLATE_v1 se0ColParams,
494 __SE_TEMPLATE_v1 se1ColParams)
498 typedef typename c7x::make_full_vector<dataType>::type vec;
// The rows are shared between two stream engines: SE1 gets floor(nRows / 2)
// rows and SE0 gets the rest (one more when nRows is odd).
500 int32_t nBlocks = (int32_t) ((uint32_t) nRows >> (uint32_t) 1);
501 int32_t se0ICNT1 = nRows - nBlocks;
502 int32_t se1ICNT1 = nBlocks;
// Fast path (only when LUD_HIGH_PRECISION is not defined): hardware
// reciprocal estimate refined by two Newton-Raphson steps r = r * (2 - d * r).
504 #ifndef LUD_HIGH_PRECISION
505 dataType diag = pCol[0];
506 dataType recipScalar = __recip(diag);
507 dataType twoP0 = 2.0;
508 recipScalar = recipScalar * (twoP0 - (diag * recipScalar));
509 recipScalar = recipScalar * (twoP0 - (diag * recipScalar));
511 vec recip = (vec) recipScalar;
// High-precision path: a true division. LUD_HIGH_PRECISION is defined near
// the top of this file, so this is the variant that gets compiled.
513 vec recip = (vec) (1 / pCol[0]);
515 se0ColParams.ICNT1 = se0ICNT1;
516 se1ColParams.ICNT1 = se1ICNT1;
517 saColParams.ICNT1 = nRows;
// SE0 starts one row below pCol and SE1 two rows below. The init routine
// gives the column SE template a two-row DIM1 step, so with those templates
// the two engines read alternating rows.
519 __SE0_OPEN(pCol + colStride, se0ColParams);
520 __SA0_OPEN(saColParams);
523 __SE1_OPEN(pCol + (2 * colStride), se1ColParams);
// Two rows per iteration: one from each engine, both written through SA0.
526 for (int32_t horizontal = 0; horizontal < nRows - 1; horizontal += 2) {
527 vec v1 = c7x::strm_eng<0, vec>::get_adv();
528 vec v2 = c7x::strm_eng<1, vec>::get_adv();
533 __vpred sPred = c7x::strm_agen<0, vec>::get_vpred();
534 vec *pStoreVec = c7x::strm_agen<0, vec>::get_adv(pCol + colStride);
535 __vstore_pred(sPred, pStoreVec, v1);
537 sPred = c7x::strm_agen<0, vec>::get_vpred();
538 pStoreVec = c7x::strm_agen<0, vec>::get_adv(pCol + colStride);
539 __vstore_pred(sPred, pStoreVec, v2);
// Odd row count: SE0 holds one more row than SE1, so handle it separately.
542 if (se0ICNT1 != se1ICNT1) {
543 vec v1 = c7x::strm_eng<0, vec>::get_adv();
547 __vpred sPred = c7x::strm_agen<0, vec>::get_vpred();
548 vec *pStoreVec = c7x::strm_agen<0, vec>::get_adv(pCol + colStride);
549 __vstore_pred(sPred, pStoreVec, v1);
// NOTE(review): these look like the parameter tails of the float and double
// explicit instantiations of DSPLIB_lud_U_colDiv_exec_ci listed in the
// cross-reference; their leading lines are missing from this listing.
562 __SA_TEMPLATE_v1 saColParams,
563 __SE_TEMPLATE_v1 se0ColParams,
564 __SE_TEMPLATE_v1 se1ColParams);
568 __SA_TEMPLATE_v1 saColParams,
569 __SE_TEMPLATE_v1 se0ColParams,
570 __SE_TEMPLATE_v1 se1ColParams);
// Trailing sub-matrix update. For every row below the first, each vector of
// the row has (reference-row vector * that row's duplicated scalar)
// subtracted from it, in place.
// According to this file's cross-reference the signature is
// (dataType *pLocalU, int32_t order, int32_t colStrideU, uint8_t *pBlock).
//
// Stream roles:
//   SE0 : one duplicated scalar per row, starting at pLocalU + colStrideU
//   SA0 : predicated loads of the reference row, starting at pLocalU + 1
//   SE1 : loads of the rows being updated
//   SA1 : predicated stores back to the same rows
// The columns are covered in four passes using tiles of 8, 4, 2 and 1 vectors.
// NOTE(review): the signature line, the braces and any guards around the four
// passes are missing from this listing.
572 template <
typename dataType>
576 typedef typename c7x::make_full_vector<dataType>::type vec;
578 uint32_t eleCount = c7x::element_count_of<vec>::value;
// Templates prepared by the init routine (slots 19, 20, 21 and 1).
581 __SE_TEMPLATE_v1 seScalarParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (19 * SE_PARAM_SIZE));
582 __SE_TEMPLATE_v1 seMatrixParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (20 * SE_PARAM_SIZE));
583 __SA_TEMPLATE_v1 saMatrixParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (21 * SE_PARAM_SIZE));
584 __SA_TEMPLATE_v1 saRefParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
586 int32_t lenTile8 = 8;
587 int32_t lenTile4 = 4;
588 int32_t lenTile2 = 2;
589 int32_t lenTile1 = 1;
// The update covers an (order - 1) x (order - 1) region; the same count is
// used as the number of rows and as the reference-row length.
591 int32_t nRows = (order - 1);
592 saRefParams.ICNT0 = nRows;
// Break the total number of vectors per row into groups of 8, then 4, then
// 2, leaving nTiles1 single vectors.
594 int32_t nTiles1 = DSPLIB_ceilingDiv(nRows, (eleCount));
595 int32_t nTiles8 = nTiles1 / lenTile8;
596 nTiles1 -= nTiles8 * lenTile8;
597 int32_t nTiles4 = nTiles1 / lenTile4;
598 nTiles1 -= nTiles4 * lenTile4;
599 int32_t nTiles2 = nTiles1 / lenTile2;
600 nTiles1 -= nTiles2 * lenTile2;
// Number of columns handled by each pass, clamped to what is still left.
// These values become the DECDIM1 widths and the column offsets of the
// following passes.
602 int32_t remainingCols = nRows;
603 int32_t colLimit8 = nTiles8 * lenTile8 * eleCount;
604 colLimit8 = (remainingCols < (colLimit8)) ? remainingCols : colLimit8;
606 remainingCols = remainingCols - colLimit8;
607 int32_t colLimit4 = nTiles4 * lenTile4 * eleCount;
608 colLimit4 = (remainingCols < (colLimit4)) ? remainingCols : colLimit4;
610 remainingCols = remainingCols - colLimit4;
611 int32_t colLimit2 = nTiles2 * lenTile2 * eleCount;
612 colLimit2 = (remainingCols < (colLimit2)) ? remainingCols : colLimit2;
614 int32_t colLimit1 = remainingCols - colLimit2;
// pRefL: reference row, skipping its first element.
// pStartL: the same column position one row further down.
616 dataType *pRefL = pLocalU + 1;
617 dataType *pStartL = pRefL + colStrideU;
// The scalar stream replays the nRows row scalars once per tile, across all
// four passes.
619 seScalarParams.ICNT1 = nRows;
620 seScalarParams.ICNT2 = nTiles8 + nTiles4 + nTiles2 + nTiles1;
622 __SE0_OPEN(pLocalU + colStrideU, seScalarParams);
// SA0 is opened once and keeps advancing along the reference row through all
// four passes.
624 __SA0_OPEN(saRefParams);
// ---- Pass 1: tiles of 8 vectors ----
629 seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile8;
630 seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = nRows;
631 seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles8;
632 seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile8;
633 seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit8;
635 __SE1_OPEN(pStartL, seMatrixParams);
636 __SA1_OPEN(saMatrixParams);
638 for (int32_t tile = 0; tile < nTiles8; tile++) {
// Load the 8 reference-row vectors for this tile once; they are reused for
// every row.
639 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
640 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
641 vec sV1 = __vload_pred(lPred, pLoadVec);
643 lPred = c7x::strm_agen<0, vec>::get_vpred();
644 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
645 vec sV2 = __vload_pred(lPred, pLoadVec);
647 lPred = c7x::strm_agen<0, vec>::get_vpred();
648 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
649 vec sV3 = __vload_pred(lPred, pLoadVec);
651 lPred = c7x::strm_agen<0, vec>::get_vpred();
652 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
653 vec sV4 = __vload_pred(lPred, pLoadVec);
655 lPred = c7x::strm_agen<0, vec>::get_vpred();
656 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
657 vec sV5 = __vload_pred(lPred, pLoadVec);
659 lPred = c7x::strm_agen<0, vec>::get_vpred();
660 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
661 vec sV6 = __vload_pred(lPred, pLoadVec);
663 lPred = c7x::strm_agen<0, vec>::get_vpred();
664 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
665 vec sV7 = __vload_pred(lPred, pLoadVec);
667 lPred = c7x::strm_agen<0, vec>::get_vpred();
668 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
669 vec sV8 = __vload_pred(lPred, pLoadVec);
// For every row: fetch its scalar, fetch 8 vectors of the row, subtract the
// scaled reference vectors and store the results back.
672 for (int32_t vertical = 0; vertical < nRows; vertical++) {
673 vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
675 vec v1 = c7x::strm_eng<1, vec>::get_adv();
676 vec v2 = c7x::strm_eng<1, vec>::get_adv();
677 vec v3 = c7x::strm_eng<1, vec>::get_adv();
678 vec v4 = c7x::strm_eng<1, vec>::get_adv();
679 vec v5 = c7x::strm_eng<1, vec>::get_adv();
680 vec v6 = c7x::strm_eng<1, vec>::get_adv();
681 vec v7 = c7x::strm_eng<1, vec>::get_adv();
682 vec v8 = c7x::strm_eng<1, vec>::get_adv();
684 v1 -= sV1 * scalarDup;
685 v2 -= sV2 * scalarDup;
686 v3 -= sV3 * scalarDup;
687 v4 -= sV4 * scalarDup;
688 v5 -= sV5 * scalarDup;
689 v6 -= sV6 * scalarDup;
690 v7 -= sV7 * scalarDup;
691 v8 -= sV8 * scalarDup;
693 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
694 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
695 __vstore_pred(sPred, pStoreVec, v1);
697 sPred = c7x::strm_agen<1, vec>::get_vpred();
698 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
699 __vstore_pred(sPred, pStoreVec, v2);
701 sPred = c7x::strm_agen<1, vec>::get_vpred();
702 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
703 __vstore_pred(sPred, pStoreVec, v3);
705 sPred = c7x::strm_agen<1, vec>::get_vpred();
706 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
707 __vstore_pred(sPred, pStoreVec, v4);
709 sPred = c7x::strm_agen<1, vec>::get_vpred();
710 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
711 __vstore_pred(sPred, pStoreVec, v5);
713 sPred = c7x::strm_agen<1, vec>::get_vpred();
714 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
715 __vstore_pred(sPred, pStoreVec, v6);
717 sPred = c7x::strm_agen<1, vec>::get_vpred();
718 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
719 __vstore_pred(sPred, pStoreVec, v7);
721 sPred = c7x::strm_agen<1, vec>::get_vpred();
722 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
723 __vstore_pred(sPred, pStoreVec, v8);
// ---- Pass 2: tiles of 4 vectors, starting after the columns of pass 1 ----
732 seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile4;
733 seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = nRows;
734 seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles4;
735 seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile4;
736 seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit4;
// SE1/SA1 are re-based past the columns already processed; the SA0 base
// stays at the reference row because SA0 was not reopened.
738 dataType *pSE1 = pStartL + colLimit8;
739 dataType *pSA1 = pStartL + colLimit8;
740 dataType *pSA0 = pRefL;
742 __SE1_OPEN(pSE1, seMatrixParams);
743 __SA1_OPEN(saMatrixParams);
745 for (int32_t tile = 0; tile < nTiles4; tile++) {
746 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
747 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
748 vec sV1 = __vload_pred(lPred, pLoadVec);
750 lPred = c7x::strm_agen<0, vec>::get_vpred();
751 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
752 vec sV2 = __vload_pred(lPred, pLoadVec);
754 lPred = c7x::strm_agen<0, vec>::get_vpred();
755 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
756 vec sV3 = __vload_pred(lPred, pLoadVec);
758 lPred = c7x::strm_agen<0, vec>::get_vpred();
759 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
760 vec sV4 = __vload_pred(lPred, pLoadVec);
763 for (int32_t vertical = 0; vertical < nRows; vertical++) {
765 vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
767 vec v1 = c7x::strm_eng<1, vec>::get_adv();
768 vec v2 = c7x::strm_eng<1, vec>::get_adv();
769 vec v3 = c7x::strm_eng<1, vec>::get_adv();
770 vec v4 = c7x::strm_eng<1, vec>::get_adv();
772 v1 -= sV1 * scalarDup;
773 v2 -= sV2 * scalarDup;
774 v3 -= sV3 * scalarDup;
775 v4 -= sV4 * scalarDup;
777 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
778 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
779 __vstore_pred(sPred, pStoreVec, v1);
781 sPred = c7x::strm_agen<1, vec>::get_vpred();
782 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
783 __vstore_pred(sPred, pStoreVec, v2);
785 sPred = c7x::strm_agen<1, vec>::get_vpred();
786 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
787 __vstore_pred(sPred, pStoreVec, v3);
789 sPred = c7x::strm_agen<1, vec>::get_vpred();
790 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
791 __vstore_pred(sPred, pStoreVec, v4);
// ---- Pass 3: tiles of 2 vectors, starting after passes 1 and 2 ----
801 seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile2;
802 seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = nRows;
803 seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles2;
804 seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile2;
805 seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit2;
807 dataType *pSE1 = pStartL + colLimit8 + colLimit4;
808 dataType *pSA1 = pStartL + colLimit8 + colLimit4;
809 dataType *pSA0 = pRefL;
810 __SE1_OPEN(pSE1, seMatrixParams);
811 __SA1_OPEN(saMatrixParams);
813 for (int32_t tile = 0; tile < nTiles2; tile++) {
814 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
815 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
816 vec sV1 = __vload_pred(lPred, pLoadVec);
818 lPred = c7x::strm_agen<0, vec>::get_vpred();
819 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
820 vec sV2 = __vload_pred(lPred, pLoadVec);
822 for (int32_t vertical = 0; vertical < nRows; vertical++) {
823 vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
825 vec v1 = c7x::strm_eng<1, vec>::get_adv();
826 vec v2 = c7x::strm_eng<1, vec>::get_adv();
828 v1 -= sV1 * scalarDup;
829 v2 -= sV2 * scalarDup;
831 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
832 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
833 __vstore_pred(sPred, pStoreVec, v1);
835 sPred = c7x::strm_agen<1, vec>::get_vpred();
836 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
837 __vstore_pred(sPred, pStoreVec, v2);
// ---- Pass 4: single-vector tiles for whatever columns remain ----
847 seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile1;
848 seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = nRows;
849 seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles1;
850 seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile1;
851 seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit1;
853 dataType *pSE1 = pStartL + colLimit8 + colLimit4 + colLimit2;
854 dataType *pSA1 = pStartL + colLimit8 + colLimit4 + colLimit2;
855 dataType *pSA0 = pRefL;
857 __SE1_OPEN(pSE1, seMatrixParams);
858 __SA1_OPEN(saMatrixParams);
860 for (int32_t tile = 0; tile < nTiles1; tile++) {
861 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
862 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
863 vec sV1 = __vload_pred(lPred, pLoadVec);
866 for (int32_t vertical = 0; vertical < nRows; vertical++) {
867 vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
869 vec v1 = c7x::strm_eng<1, vec>::get_adv();
870 v1 -= sV1 * scalarDup;
872 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
873 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
874 __vstore_pred(sPred, pStoreVec, v1);
// Splits the combined matrix held in pU into its two triangular factors:
// every vector is read from pU through SE0, one part is written back to pU
// through SA0 and the other part is written to pL through SA1.
// According to this file's cross-reference the signature is
// (dataType *pU, dataType *pL, int32_t order, int32_t colStride,
//  uint8_t *pBlock).
// NOTE(review): the signature line, the braces, the declarations of uV, pred,
// pStoreVec and oneVec, and the update of frontZeroCount are missing from
// this listing.
891 template <
typename dataType>
896 typedef typename c7x::make_full_vector<dataType>::type vec;
897 typedef typename c7x::make_full_vector<unsigned char>::type ucharvec;
898 uint32_t eleCount = c7x::element_count_of<vec>::value;
// Width of one element in bits; used to slide the lane masks by one lane.
899 const uchar shiftConst = 8 *
sizeof(dataType);
900 vec zeroVec = (vec) 0;
// Number of leading vectors in the current row that go entirely to pL.
901 int32_t frontZeroCount = 0;
// Number of vectors per row, which is also the number of row blocks.
902 int32_t iter1 = DSPLIB_ceilingDiv(order, eleCount);
// order x order 2-D templates prepared by the init routine (slots 13 and 14).
904 __SE_TEMPLATE_v1 seMatrixParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (13 * SE_PARAM_SIZE));
905 __SA_TEMPLATE_v1 saMatrixParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (14 * SE_PARAM_SIZE));
907 __SA0_OPEN(saMatrixParams);
908 __SA1_OPEN(saMatrixParams);
909 __SE0_OPEN(pU, seMatrixParams);
// Each block covers eleCount rows, i.e. one vector width along the diagonal.
911 for (int32_t block = 0; block < iter1; block++) {
// The byte mask starts with all lanes selected and loses one lane per row
// (see the shift further down).
913 ucharvec byteMask = (ucharvec) 0xFF;
917 for (uint32_t vertical = 0; vertical < eleCount; vertical++) {
918 int32_t horizontal = 0;
// Leading vectors of the row: the data moves to pL and pU is zeroed.
922 for (; horizontal < frontZeroCount; horizontal++) {
923 uV = c7x::strm_eng<0, vec>::get_adv();
925 pred = c7x::strm_agen<0, vec>::get_vpred();
926 pStoreVec = c7x::strm_agen<0, vec>::get_adv(pU);
927 __vstore_pred(pred, pStoreVec, zeroVec);
929 pred = c7x::strm_agen<1, vec>::get_vpred();
930 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pL);
931 __vstore_pred(pred, pStoreVec, uV);
// The vector that straddles the split: lanes selected by byteMask stay in
// uV, the complementary lanes go to lV.
934 uV = c7x::strm_eng<0, vec>::get_adv();
936 vec lV = c7x::reinterpret<vec>(__andn(byteMask, c7x::as_uchar_vec(uV)));
937 uV = c7x::reinterpret<vec>((byteMask & (c7x::as_uchar_vec(uV))));
// Drop one more lane from the mask for the next row.
939 byteMask = c7x::as_uchar_vec(__shift_left_full(c7x::as_ulong_vec(byteMask), shiftConst));
// OR oneVec into the pL part, then move oneVec one lane along for the next
// row. NOTE(review): presumably this places the unit diagonal of L; oneVec's
// initial value is not visible here.
941 lV = c7x::reinterpret<vec>(c7x::as_uchar_vec(oneVec) | c7x::as_uchar_vec(lV));
942 oneVec = c7x::reinterpret<vec>(__shift_left_full(c7x::as_ulong_vec(oneVec), shiftConst));
944 pred = c7x::strm_agen<0, vec>::get_vpred();
945 pStoreVec = c7x::strm_agen<0, vec>::get_adv(pU);
946 __vstore_pred(pred, pStoreVec, uV);
948 pred = c7x::strm_agen<1, vec>::get_vpred();
949 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pL);
950 __vstore_pred(pred, pStoreVec, lV);
// Remaining vectors of the row: pL receives zeros. SA0 is advanced as well,
// but no store to pU is visible for it in this listing.
952 for (; horizontal < iter1 - 1; horizontal++) {
953 uV = c7x::strm_eng<0, vec>::get_adv();
955 pStoreVec = c7x::strm_agen<0, vec>::get_adv(pU);
957 pred = c7x::strm_agen<1, vec>::get_vpred();
958 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pL);
959 __vstore_pred(pred, pStoreVec, zeroVec);
// Main execution routine (the cross-reference at the end of this listing
// names it DSPLIB_lud_exec_ci, taking handle, pA, pL, pU and pP).
// Visible sequence of operations:
//   1. block-move call involving pULocal and pALocal,
//   2. for k = 0 .. order - 2: max/min row search, row swaps in U and P,
//      column divide, trailing sub-matrix update,
//   3. extract call that fills pLLocal from pULocal.
// NOTE(review): the signature, the declaration of strideOrder, the braces,
// any condition guarding the swaps and the first lines of several calls are
// missing from this listing.
975 template <
typename dataType>
// Values cached in the kernel's private arguments.
985 int32_t order = pKerPrivArgs->
order;
987 int32_t strideP = pKerPrivArgs->
strideP;
988 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
// Typed views of the caller's untyped buffers; P holds unsigned short entries.
991 dataType *pALocal = (dataType *) pA;
992 dataType *pLLocal = (dataType *) pL;
993 dataType *pULocal = (dataType *) pU;
994 unsigned short *pPLocal = (
unsigned short *) pP;
996 DSPLIB_DEBUGPRINTFN(0,
"pALocal: %p pLLocal: %p pULocal: %p pPLocal: %p order: %d\n", pALocal, pLLocal, pULocal,
999 int min_row, max_row, k;
1001 int32_t dataSize =
sizeof(dataType);
1002 int32_t dataSizeP =
sizeof(
unsigned short);
// Row strides divided by the respective element sizes, i.e. strides in
// elements (assuming the stored strides are in bytes -- TODO confirm).
1004 int32_t orderStride = strideOrder / dataSize;
1005 int32_t orderPStride = strideP / dataSizeP;
1006 int32_t nRows = order;
// Templates prepared by the init routine: slots 1/2 for the swap in U,
// slots 11/12 for the swap in P.
1008 __SA_TEMPLATE_v1 saSwap1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
1009 __SE_TEMPLATE_v1 seSwap1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
1011 __SA_TEMPLATE_v1 saSwap2Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (11 * SE_PARAM_SIZE));
1012 __SE_TEMPLATE_v1 seSwap2Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (12 * SE_PARAM_SIZE));
// Both column SE templates start from the same slot (15); the column-divide
// helper gives each its own ICNT1 before opening the streams.
1014 __SA_TEMPLATE_v1 saColParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (16 * SE_PARAM_SIZE));
1015 __SE_TEMPLATE_v1 se0ColParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (15 * SE_PARAM_SIZE));
1016 __SE_TEMPLATE_v1 se1ColParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (15 * SE_PARAM_SIZE));
1017 __SE_TEMPLATE_v1 se0MinMax = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE));
// Block move using the parameters stored from slot 17 onward.
// NOTE(review): presumably copies A into U so the factorisation can run in
// place -- confirm the argument order of the helper.
1024 DSPLIB_lud_blk_move_exec_ci<dataType>(pULocal, pALocal, order, order, orderStride, orderStride,
1025 &pBlock[17 * SE_PARAM_SIZE]);
1030 typedef typename c7x::make_full_vector<dataType>::type vec;
// Starting lane-index vector handed to the max/min search on every step.
1032 vec idx_0_to_eleCount = getIdxVec<vec, dataType>();
// One elimination step per column; the last column needs no step.
1034 for (k = 0; k < order - 1; k++) {
// Tail of a call that writes max_row and min_row (its first line is missing
// from this listing).
1036 &max_row, &min_row, se0MinMax);
// Swap rows min_row and max_row of U ...
1039 DSPLIB_lud_array_swap_exec_ci<dataType>(&pULocal[min_row * orderStride], &pULocal[max_row * orderStride],
1040 order, saSwap1Params, seSwap1Params);
// ... and the same two rows of P (tail of a call whose first line is missing).
1042 &pPLocal[min_row * orderPStride], &pPLocal[max_row * orderPStride], order, saSwap2Params, seSwap2Params);
// Column divide and trailing update, both anchored at diagonal element (k, k)
// and working on the remaining (order - k) sized sub-problem.
1045 DSPLIB_lud_U_colDiv_exec_ci<dataType>(&pULocal[k + k * orderStride], (order - k), orderStride, saColParams,
1046 se0ColParams, se1ColParams);
1047 DSPLIB_lud_U_generate_exec_ci<dataType>(&pULocal[k + k * orderStride], (order - k), orderStride, pBlock);
// Finally separate the combined result held in U into the L and U outputs.
1051 DSPLIB_lud_extract_exec_ci<dataType>(pULocal, pLLocal, order, orderStride, pBlock);
template void DSPLIB_lud_U_generate_exec_ci< float >(float *pLocalU, int32_t order, int32_t colStrideU, uint8_t *pBlock)
__vpred getPMask< float >(uint32_t idx)
template DSPLIB_STATUS DSPLIB_lud_init_ci< float >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsA, const DSPLIB_bufParams2D_t *bufParamsL, const DSPLIB_bufParams2D_t *bufParamsU, const DSPLIB_bufParams2D_t *bufParamsP, const DSPLIB_ludInitArgs *pKerInitArgs)
DSPLIB_STATUS DSPLIB_lud_init_ci(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsA, const DSPLIB_bufParams2D_t *bufParamsL, const DSPLIB_bufParams2D_t *bufParamsU, const DSPLIB_bufParams2D_t *bufParamsP, const DSPLIB_ludInitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
template void DSPLIB_lud_maxMinIndex_exec_ci< float >(float *pCol, int32_t nRows, typename c7x::make_full_vector< float >::type idx_0_to_eleCount, int32_t k, int32_t colStride, int32_t *max, int32_t *min, __SE_TEMPLATE_v1 se0Params)
template void DSPLIB_lud_U_colDiv_exec_ci< float >(float *pCol, int32_t nRows, int32_t colStride, __SA_TEMPLATE_v1 saColParams, __SE_TEMPLATE_v1 se0ColParams, __SE_TEMPLATE_v1 se1ColParams)
template void DSPLIB_lud_array_swap_exec_ci< unsigned short >(unsigned short *pArray1, unsigned short *pArray2, int32_t nCols, __SA_TEMPLATE_v1 sa1Params, __SE_TEMPLATE_v1 se1Params)
template DSPLIB_STATUS DSPLIB_lud_exec_ci< double >(DSPLIB_kernelHandle handle, void *restrict pA, void *restrict pL, void *restrict pU, void *restrict pP)
static void DSPLIB_lud_maxMinIndex_exec_ci(dataType *pCol, int32_t nRows, vec idx_0_to_eleCount, int32_t k, int32_t colStride, int32_t *max, int32_t *min, __SE_TEMPLATE_v1 se0Params)
__vpred getPMask(uint32_t idx)
template void DSPLIB_lud_array_swap_exec_ci< double >(double *pArray1, double *pArray2, int32_t nCols, __SA_TEMPLATE_v1 sa1Params, __SE_TEMPLATE_v1 se1Params)
template void DSPLIB_lud_U_generate_exec_ci< double >(double *pLocalU, int32_t order, int32_t colStrideU, uint8_t *pBlock)
template void DSPLIB_lud_extract_exec_ci< double >(double *pU, double *pL, int32_t order, int32_t colStride, uint8_t *pBlock)
template void DSPLIB_lud_maxMinIndex_exec_ci< double >(double *pCol, int32_t nRows, typename c7x::make_full_vector< double >::type idx_0_to_eleCount, int32_t k, int32_t colStride, int32_t *max, int32_t *min, __SE_TEMPLATE_v1 se0Params)
__vpred getPMask< double >(uint32_t idx)
static void DSPLIB_lud_U_colDiv_exec_ci(dataType *pCol, int32_t nRows, int32_t colStride, __SA_TEMPLATE_v1 saColParams, __SE_TEMPLATE_v1 se0ColParams, __SE_TEMPLATE_v1 se1ColParams)
static void DSPLIB_lud_array_swap_exec_ci(dataType *pArray1, dataType *pArray2, int32_t nCols, __SA_TEMPLATE_v1 sa1Params, __SE_TEMPLATE_v1 se1Params)
template DSPLIB_STATUS DSPLIB_lud_exec_ci< float >(DSPLIB_kernelHandle handle, void *restrict pA, void *restrict pL, void *restrict pU, void *restrict pP)
template DSPLIB_STATUS DSPLIB_lud_init_ci< double >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsA, const DSPLIB_bufParams2D_t *bufParamsL, const DSPLIB_bufParams2D_t *bufParamsU, const DSPLIB_bufParams2D_t *bufParamsP, const DSPLIB_ludInitArgs *pKerInitArgs)
void DSPLIB_lud_extract_exec_ci(dataType *pU, dataType *pL, int32_t order, int32_t colStride, uint8_t *pBlock)
template void DSPLIB_lud_array_swap_exec_ci< float >(float *pArray1, float *pArray2, int32_t nCols, __SA_TEMPLATE_v1 sa1Params, __SE_TEMPLATE_v1 se1Params)
DSPLIB_STATUS DSPLIB_lud_exec_ci(DSPLIB_kernelHandle handle, void *restrict pA, void *restrict pL, void *restrict pU, void *restrict pP)
This function is the main execution function for the C7x implementation of the kernel....
static void DSPLIB_lud_U_generate_exec_ci(dataType *pLocalU, int32_t order, int32_t colStrideU, uint8_t *pBlock)
template void DSPLIB_lud_U_colDiv_exec_ci< double >(double *pCol, int32_t nRows, int32_t colStride, __SA_TEMPLATE_v1 saColParams, __SE_TEMPLATE_v1 se0ColParams, __SE_TEMPLATE_v1 se1ColParams)
template void DSPLIB_lud_extract_exec_ci< float >(float *pU, float *pL, int32_t order, int32_t colStride, uint8_t *pBlock)
template void DSPLIB_lud_identity_matrix_generate_init_ci< unsigned short >(uint8_t *pBlock, int32_t order, int32_t stride)
template void DSPLIB_lud_identity_matrix_generate_exec_ci< unsigned short >(unsigned short *pMat, int32_t order, int32_t colStride, uint8_t *pBlock)
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_lud.
#define DSPLIB_DEBUGPRINTFN(N, fmt,...)
DSPLIB_STATUS_NAME
The enumeration of all status codes.
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
A structure for a 2 dimensional buffer descriptor.
Structure containing the parameters to initialize the kernel.
Structure that is reserved for internal use by the kernel.
uint8_t bufPblock[DSPLIB_LUD_IXX_IXX_OXX_PBLOCK_SIZE]
Buffer to save SE & SA configuration parameters
int32_t order
Size of input buffer for different batches DSPLIB_lud_init that will be retrieved and used by DSPLIB_...
int32_t strideOrder
Stride between rows of input and output data matrix
int32_t strideP
Stride between rows of output data matrix P