// Compile-time tuning constants for this kernel.
// NOTE(review): UNROLL_COUNT and MIN_HORIZONTAL_COLUMNS_FOR_UNROLL are not referenced in the
// visible part of this listing - confirm where they are used.
55 #define UNROLL_COUNT 4
56 #define MIN_HORIZONTAL_COLUMNS_FOR_UNROLL 2
// The tiled section of the compute routine processes six vectors per stream per step
// (vec00..vec05 / vec10..vec15); presumably that is what this count describes - TODO confirm.
57 #define NUM_VECS_IN_TILE 6
// Parameter templates for the two streaming engines (SE) and three streaming address
// generators (SA) used by the kernel.
// NOTE(review): the enclosing function header and braces are missing from this listing. The
// compute routine refers to the same five names, so they are presumably shared - confirm scope.
71 __SE_TEMPLATE_v1 seParamFetchU;
72 __SE_TEMPLATE_v1 seParamFetchD;
73 __SA_TEMPLATE_v1 saParamMulFetch;
74 __SA_TEMPLATE_v1 saParamALoad;
75 __SA_TEMPLATE_v1 saParamLStore;
// Element type and vector length settings, derived below from the full-width vector of dataType.
77 __SE_ELETYPE SE_ELETYPE;
78 __SE_VECLEN SE_VECLEN;
79 __SA_VECLEN SA_VECLEN;
83 typedef typename c7x::make_full_vector<dataType>::type vec;
85 SE_VECLEN = c7x::se_veclen<vec>::value;
86 SE_ELETYPE = c7x::se_eletype<vec>::value;
87 SA_VECLEN = c7x::sa_veclen<vec>::value;
89 uint32_t eleCount = c7x::element_count_of<vec>::value;
// Row stride expressed in elements of dataType (stride is presumably in bytes - TODO confirm).
91 uint32_t yStride = pKerPrivArgs->
stride /
sizeof(dataType);
// First SE template: 3D format, second dimension steps by two rows (yStride << 1).
// ICNT1 and ICNT2 are zero here; the compute routine assigns them before opening the stream.
96 seParamFetchU = __gen_SE_TEMPLATE_v1();
99 seParamFetchU.ICNT1 = 0;
102 seParamFetchU.DIM1 = (yStride) << 1;
106 seParamFetchU.ICNT2 = 0;
110 seParamFetchU.DECDIM1 = __SE_DECDIM_DIM2;
111 seParamFetchU.DECDIM1_WIDTH = pKerPrivArgs->
order;
113 seParamFetchU.ELETYPE = SE_ELETYPE;
114 seParamFetchU.VECLEN = SE_VECLEN;
115 seParamFetchU.DIMFMT = __SE_DIMFMT_3D;
// Second SE template: configured with the same visible settings as the first.
120 seParamFetchD = __gen_SE_TEMPLATE_v1();
123 seParamFetchD.ICNT1 = 0;
124 seParamFetchD.DIM1 = (yStride) << 1;
125 seParamFetchD.ICNT2 = 0;
127 seParamFetchD.DECDIM1 = __SE_DECDIM_DIM2;
128 seParamFetchD.DECDIM1_WIDTH = pKerPrivArgs->
order;
130 seParamFetchD.ELETYPE = SE_ELETYPE;
131 seParamFetchD.VECLEN = SE_VECLEN;
132 seParamFetchD.DIMFMT = __SE_DIMFMT_3D;
// Multiplier address generator: one element per step, stepping by one row (DIM1 = yStride);
// DIM2 = 0 means the outer dimension adds no offset. ICNT1/ICNT2 are filled in later.
137 saParamMulFetch = __gen_SA_TEMPLATE_v1();
139 saParamMulFetch.ICNT0 = 1;
140 saParamMulFetch.ICNT1 = 0;
141 saParamMulFetch.DIM1 = yStride;
142 saParamMulFetch.ICNT2 = 0;
143 saParamMulFetch.DIM2 = 0;
145 saParamMulFetch.VECLEN = __SA_VECLEN_1ELEM;
146 saParamMulFetch.DIMFMT = __SA_DIMFMT_3D;
// 1D full-vector address generator used for stores.
// NOTE(review): the right-hand side of the ICNT0 assignment below is missing from this listing.
151 saParamLStore = __gen_SA_TEMPLATE_v1();
153 saParamLStore.ICNT0 =
155 saParamLStore.VECLEN = SA_VECLEN;
156 saParamLStore.DIMFMT = __SA_DIMFMT_1D;
// 1D full-vector address generator used for loads; its ICNT0 is set by the compute routine.
161 saParamALoad = __gen_SA_TEMPLATE_v1();
165 saParamALoad.VECLEN = SA_VECLEN;
166 saParamALoad.DIMFMT = __SA_DIMFMT_1D;
// Fragment of an initialization routine; its signature and braces are not visible in this
// listing (presumably DSPLIB_cholesky_inplace_init_ci - TODO confirm).
182 template <
typename dataType>
// Read the parameter-block buffer, matrix order and row stride from the private kernel args.
192 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
193 int32_t order = pKerPrivArgs->
order;
194 int32_t strideA = pKerPrivArgs->
stride;
// Row stride in elements of dataType (strideA is presumably in bytes - TODO confirm).
195 int32_t colAStride = strideA /
sizeof(dataType);
// Run the two helper initializers: one for the compute routine, one for the isPosDefinite check.
197 DSPLIB_cholesky_inplace_c7x_PingPong_init<dataType>(handle);
198 DSPLIB_cholesky_inplace_isPosDefinite_init<dataType>(order, colAStride, pBlock);
// Fragment of the reciprocal-square-root helper (header and the initial estimate of x are not
// visible in this listing).
224 const dataType Half = 0.5f;
225 const dataType OneP5 = 1.5f;
// Two refinement steps of x <- x * (1.5 - 0.5 * a * x * x), i.e. the Newton-Raphson update
// for 1/sqrt(a); each step improves the current estimate x.
230 x = x * (OneP5 - (a * x * x * Half));
231 x = x * (OneP5 - (a * x * x * Half));
// Fragment of the main compute routine. Many lines (return type, name, braces, several
// declarations and assignments) are missing from this listing, so the comments below describe
// only what the visible statements do.
243 template <
typename dataType>
246 dataType *restrict pInALocal,
247 dataType *restrict pOutULocal,
248 dataType *restrict pMulBuffer)
252 typedef typename c7x::make_full_vector<dataType>::type vec;
253 int32_t eleCount = c7x::element_count_of<vec>::value;
264 int32_t order = pKerPrivArgs->
order;
265 int32_t vecLen = eleCount;
267 int32_t row, lRow, tile_i;
268 int32_t stride = pKerPrivArgs->
stride;
// Row stride in elements of dataType (stride is presumably in bytes - TODO confirm).
269 int32_t yStride = stride /
sizeof(dataType);
// Both output pointers start at pOutULocal; the input pointer starts at pInALocal.
271 dataType *pLFirstRow = pOutULocal;
272 dataType *ptrL = pOutULocal;
273 dataType *ptrA = pInALocal;
275 dataType *pMultiplier = NULL;
// NOTE(review): the assignment of recipDiagValue is not visible in this listing.
276 dataType recipDiagValue;
279 int32_t validElemsPerRow = order;
// Numerator for a rounded-up division by tile_width (see nTiles below).
// NOTE(review): tile_width is not declared in the visible part of this listing.
281 int32_t tileWidthCeilValue = validElemsPerRow + tile_width - 1;
283 __SE_ELEDUP SE_ELEDUP;
284 SE_ELEDUP = c7x::se_eledup<dataType, vec>::value;
// Working vectors and accumulators: six per stream for the tiled section.
286 vec vec00, vec01, vec02, vec03, vec04, vec05;
287 vec vec10, vec11, vec12, vec13, vec14, vec15;
288 vec acc00, acc01, acc02, acc03, acc04, acc05;
289 vec acc10, acc11, acc12, acc13, acc14, acc15;
// One pass per matrix row.
294 for (row = 0; row < order; row++) {
295 pMultiplier = pLFirstRow;
// Store (SA1) and load (SA2) address generators each cover validElemsPerRow elements.
300 saParamLStore.ICNT0 = validElemsPerRow;
301 saParamALoad.ICNT0 = validElemsPerRow;
303 __SA1_OPEN(saParamLStore);
304 __SA2_OPEN(saParamALoad);
// Split the `row` previous rows between the two streams: SE0 gets ceil(row / 2) rows,
// SE1 gets floor(row / 2), so SE0 has one extra row exactly when row is odd.
308 lRowSE0 = (int32_t) (((uint32_t) row + 1U) >> 1U);
309 lRowSE1 = (int32_t) (((uint32_t) row) >> 1U);
311 seParamFetchU.DECDIM1_WIDTH = validElemsPerRow;
312 seParamFetchD.DECDIM1_WIDTH = validElemsPerRow;
// Number of tiles, rounded up.
315 int32_t nTiles = (tileWidthCeilValue) / tile_width;
// SA0 supplies `row` multiplier addresses per tile, repeated for nTiles tiles.
318 saParamMulFetch.ICNT1 = row;
319 saParamMulFetch.ICNT2 = nTiles;
321 __SA0_OPEN(saParamMulFetch);
324 seParamFetchU.ICNT1 = lRowSE0;
325 seParamFetchU.ICNT2 = nTiles;
327 seParamFetchD.ICNT1 = lRowSE1;
328 seParamFetchD.ICNT2 = nTiles;
// SE0 starts at the first row; SE1 starts one row (yStride elements) further on.
// NOTE(review): DIM1 of both templates is overwritten with yStride further down; whether it
// is restored to the two-row step before the next pass is not visible here - confirm.
331 __SE0_OPEN(pLFirstRow, seParamFetchU);
334 __SE1_OPEN(pLFirstRow + yStride, seParamFetchD);
339 for (tile_i = 0; tile_i < nTiles; tile_i++) {
// Paired rows: each iteration takes two multipliers from SA0 (one per stream), broadcasts
// each across a vector, and accumulates six products from SE0 into acc0x and six from SE1
// into acc1x.
354 for (lRow = 0; lRow < lRowSE1; lRow++) {
355 dataType *pMultiplierUp = c7x::strm_agen<0, dataType>::get_adv(pMultiplier);
356 vec multiplier0 = __vload_dup(pMultiplierUp);
358 dataType *pMultiplierDown = c7x::strm_agen<0, dataType>::get_adv(pMultiplier);
359 vec multiplier1 = __vload_dup(pMultiplierDown);
360 vec00 = c7x::strm_eng<0, vec>::get_adv();
361 vec10 = c7x::strm_eng<1, vec>::get_adv();
362 acc00 += (vec00 * multiplier0);
363 acc10 += (vec10 * multiplier1);
365 vec01 = c7x::strm_eng<0, vec>::get_adv();
366 vec11 = c7x::strm_eng<1, vec>::get_adv();
367 acc01 += (vec01 * multiplier0);
368 acc11 += (vec11 * multiplier1);
370 vec02 = c7x::strm_eng<0, vec>::get_adv();
371 vec12 = c7x::strm_eng<1, vec>::get_adv();
372 acc02 += (vec02 * multiplier0);
373 acc12 += (vec12 * multiplier1);
375 vec03 = c7x::strm_eng<0, vec>::get_adv();
376 vec13 = c7x::strm_eng<1, vec>::get_adv();
377 acc03 += (vec03 * multiplier0);
378 acc13 += (vec13 * multiplier1);
380 vec04 = c7x::strm_eng<0, vec>::get_adv();
381 vec14 = c7x::strm_eng<1, vec>::get_adv();
382 acc04 += (vec04 * multiplier0);
383 acc14 += (vec14 * multiplier1);
385 vec05 = c7x::strm_eng<0, vec>::get_adv();
386 vec15 = c7x::strm_eng<1, vec>::get_adv();
387 acc05 += (vec05 * multiplier0);
388 acc15 += (vec15 * multiplier1);
// Odd `row`: SE0 holds one more row than SE1, so consume it here with a single multiplier.
391 if (lRowSE0 != lRowSE1) {
392 dataType *pMultiplierUp = c7x::strm_agen<0, dataType>::get_adv(pMultiplier);
393 vec multiplier0 = __vload_dup(pMultiplierUp);
395 vec00 = c7x::strm_eng<0, vec>::get_adv();
396 acc00 += (vec00 * multiplier0);
398 vec01 = c7x::strm_eng<0, vec>::get_adv();
399 acc01 += (vec01 * multiplier0);
401 vec02 = c7x::strm_eng<0, vec>::get_adv();
402 acc02 += (vec02 * multiplier0);
404 vec03 = c7x::strm_eng<0, vec>::get_adv();
405 acc03 += (vec03 * multiplier0);
407 vec04 = c7x::strm_eng<0, vec>::get_adv();
408 acc04 += (vec04 * multiplier0);
410 vec05 = c7x::strm_eng<0, vec>::get_adv();
411 acc05 += (vec05 * multiplier0);
// Load six vectors of the input row through SA2.
422 vec00 = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
423 vec01 = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
424 vec02 = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
425 vec03 = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
426 vec04 = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
427 vec05 = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
// Subtract the accumulated products from the input values.
// NOTE(review): only acc0x appears here; the step that folds acc1x into acc0x (and the
// accumulator reset) is presumably among the lines missing from this listing - confirm.
430 vec10 = vec00 - acc00;
431 vec11 = vec01 - acc01;
432 vec12 = vec02 - acc02;
433 vec13 = vec03 - acc03;
434 vec14 = vec04 - acc04;
435 vec15 = vec05 - acc05;
// Scale each difference by recipDiagValue and store it through SA1, using the predicate
// SA1 reports for the current position (presumably masking lanes past the valid count).
441 __vpred vpred0 = c7x::strm_agen<1, vec>::get_vpred();
442 vec *storePtr0 = c7x::strm_agen<1, vec>::get_adv(ptrL);
443 __vstore_pred(vpred0, storePtr0, vec10 * recipDiagValue);
445 __vpred vpred1 = c7x::strm_agen<1, vec>::get_vpred();
446 vec *storePtr1 = c7x::strm_agen<1, vec>::get_adv(ptrL);
447 __vstore_pred(vpred1, storePtr1, vec11 * recipDiagValue);
449 __vpred vpred2 = c7x::strm_agen<1, vec>::get_vpred();
450 vec *storePtr2 = c7x::strm_agen<1, vec>::get_adv(ptrL);
451 __vstore_pred(vpred2, storePtr2, vec12 * recipDiagValue);
453 __vpred vpred3 = c7x::strm_agen<1, vec>::get_vpred();
454 vec *storePtr3 = c7x::strm_agen<1, vec>::get_adv(ptrL);
455 __vstore_pred(vpred3, storePtr3, vec13 * recipDiagValue);
457 __vpred vpred4 = c7x::strm_agen<1, vec>::get_vpred();
458 vec *storePtr4 = c7x::strm_agen<1, vec>::get_adv(ptrL);
459 __vstore_pred(vpred4, storePtr4, vec14 * recipDiagValue);
461 __vpred vpred5 = c7x::strm_agen<1, vec>::get_vpred();
462 vec *storePtr5 = c7x::strm_agen<1, vec>::get_adv(ptrL);
463 __vstore_pred(vpred5, storePtr5, vec15 * recipDiagValue);
// Second section: reconfigure both streams to step one row at a time over `row` rows, both
// starting at the first row. SE0 fetches a single element per row with element duplication
// enabled (presumably broadcasting the multiplier across the vector - TODO confirm).
// NOTE(review): the condition under which this section runs is not visible in this listing.
468 lRowSE0 = (int32_t) (((uint32_t) row + 1U) >> 1U);
469 lRowSE1 = (int32_t) (((uint32_t) row) >> 1U);
471 seParamFetchU.ICNT0 = 1;
472 seParamFetchU.ICNT1 = row;
473 seParamFetchU.ICNT2 = 1;
474 seParamFetchU.DIM1 = (yStride);
475 seParamFetchU.DIM2 = 0;
476 seParamFetchU.ELEDUP = SE_ELEDUP;
479 seParamFetchD.ICNT1 = row;
480 seParamFetchD.ICNT2 = 1;
481 seParamFetchD.DIM1 = (yStride);
482 seParamFetchD.DIM2 = 0;
485 __SE0_OPEN(pLFirstRow, seParamFetchU);
486 __SE1_OPEN(pLFirstRow, seParamFetchD);
498 vec vecMul0, vecMul1, vecMul2, vecMul3;
499 vec vecMul4, vecMul5, vecMul6, vecMul7;
// Four rows per iteration. SE0 is read with get() and then get_adv(), so each SE0 value is
// used for two consecutive SE1 vectors before SE0 advances.
// NOTE(review): vec0..vec7 and acc2x/acc3x are declared outside the visible lines.
505 for (lRow = 0; lRow < row; lRow += 4) {
508 vecMul0 = c7x::strm_eng<0, vec>::get();
509 vec0 = c7x::strm_eng<1, vec>::get_adv();
511 acc00 += (vec0 * vecMul0);
513 vecMul1 = c7x::strm_eng<0, vec>::get_adv();
514 vec1 = c7x::strm_eng<1, vec>::get_adv();
515 acc01 += (vec1 * vecMul1);
519 vecMul2 = c7x::strm_eng<0, vec>::get();
520 vec2 = c7x::strm_eng<1, vec>::get_adv();
521 acc10 += (vec2 * vecMul2);
523 vecMul3 = c7x::strm_eng<0, vec>::get_adv();
524 vec3 = c7x::strm_eng<1, vec>::get_adv();
525 acc11 += (vec3 * vecMul3);
529 vecMul4 = c7x::strm_eng<0, vec>::get();
530 vec4 = c7x::strm_eng<1, vec>::get_adv();
531 acc20 += (vec4 * vecMul4);
533 vecMul5 = c7x::strm_eng<0, vec>::get_adv();
534 vec5 = c7x::strm_eng<1, vec>::get_adv();
535 acc21 += (vec5 * vecMul5);
539 vecMul6 = c7x::strm_eng<0, vec>::get();
540 vec6 = c7x::strm_eng<1, vec>::get_adv();
541 acc30 += (vec6 * vecMul6);
543 vecMul7 = c7x::strm_eng<0, vec>::get_adv();
544 vec7 = c7x::strm_eng<1, vec>::get_adv();
545 acc31 += (vec7 * vecMul7);
// Partial reduction of the four accumulator pairs.
// NOTE(review): the lines that merge these sums into acc00/acc01 are not visible here.
551 vec accLTmp = acc20 + acc30;
552 vec accRTmp = acc21 + acc31;
// Load two input vectors, subtract the accumulated products, scale and store via SA1.
558 vec00 = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
559 vec01 = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
562 vec10 = vec00 - acc00;
563 vec11 = vec01 - acc01;
567 __vpred vpred0 = c7x::strm_agen<1, vec>::get_vpred();
568 vec *storePtr0 = c7x::strm_agen<1, vec>::get_adv(ptrL);
569 __vstore_pred(vpred0, storePtr0, vec10 * recipDiagValue);
571 __vpred vpred1 = c7x::strm_agen<1, vec>::get_vpred();
572 vec *storePtr1 = c7x::strm_agen<1, vec>::get_adv(ptrL);
573 __vstore_pred(vpred1, storePtr1, vec11 * recipDiagValue);
// Shrink the tile-count numerator by one.
// NOTE(review): the matching update of validElemsPerRow, if any, is not visible here.
584 tileWidthCeilValue--;
// Fragment of the execution entry point (signature and the rest of the body are not visible
// in this listing; presumably DSPLIB_cholesky_inplace_exec_ci - TODO confirm).
598 template <
typename dataType>
// View the untyped buffers as arrays of dataType.
606 dataType *pLocalA = (dataType *) pA;
607 dataType *pLocalMul = (dataType *) pMul;
// Read the parameter-block buffer, matrix order and test flag from the private kernel args.
608 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
609 int32_t order = pKerPrivArgs->
order;
610 int32_t enable_test = pKerPrivArgs->
enableTest;
// Number of dataType elements in one full-width vector.
611 typedef typename c7x::make_full_vector<dataType>::type vec;
612 int32_t eleCount = c7x::element_count_of<vec>::value;
dataType DSPLIB_cholesky_inplace_isPosDefinite(dataType *A, const int32_t order, const int32_t eleCount, uint8_t *pBlock)
#define SA_SA0_PARAM_OFFSET
#define SA_SA1_PARAM_OFFSET
#define SA_SA2_PARAM_OFFSET
#define SE_SE2_PARAM_OFFSET
#define SE_SE3_PARAM_OFFSET
#define MIN_HORIZONTAL_COLUMNS_FOR_UNROLL
DSPLIB_STATUS DSPLIB_cholesky_inplace_init_ci(DSPLIB_kernelHandle handle, DSPLIB_bufParams2D_t *bufParamsA, DSPLIB_bufParams1D_t *bufParamsMul, const DSPLIB_cholesky_inplace_InitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
template DSPLIB_STATUS DSPLIB_cholesky_inplace_exec_ci< float >(DSPLIB_kernelHandle handle, void *restrict pA, void *restrict pMul)
DSPLIB_STATUS DSPLIB_cholesky_inplace_c7x_PingPong_init(DSPLIB_kernelHandle handle)
template DSPLIB_STATUS DSPLIB_cholesky_inplace_c7x_PingPong_init< float >(DSPLIB_kernelHandle handle)
template DSPLIB_STATUS DSPLIB_cholesky_inplace_init_ci< double >(DSPLIB_kernelHandle handle, DSPLIB_bufParams2D_t *bufParamsA, DSPLIB_bufParams1D_t *bufParamsMul, const DSPLIB_cholesky_inplace_InitArgs *pKerInitArgs)
DSPLIB_STATUS DSPLIB_cholesky_inplace_exec_ci(DSPLIB_kernelHandle handle, void *restrict pA, void *restrict pMul)
This function is the main execution function for the C7x implementation of the kernel....
DSPLIB_STATUS DSPLIB_cholesky_inplace_c7x_PingPong(int enable_test, DSPLIB_cholesky_inplace_PrivArgs *pKerPrivArgs, dataType *restrict pInALocal, dataType *restrict pOutULocal, dataType *restrict pMulBuffer)
dataType getRecipSqrt(dataType a)
template DSPLIB_STATUS DSPLIB_cholesky_inplace_init_ci< float >(DSPLIB_kernelHandle handle, DSPLIB_bufParams2D_t *bufParamsA, DSPLIB_bufParams1D_t *bufParamsMul, const DSPLIB_cholesky_inplace_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_cholesky_inplace_exec_ci< double >(DSPLIB_kernelHandle handle, void *restrict pA, void *restrict pMul)
template DSPLIB_STATUS DSPLIB_cholesky_inplace_c7x_PingPong_init< double >(DSPLIB_kernelHandle handle)
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_cholesky_inplace...
#define DSPLIB_DEBUGPRINTFN(N, fmt,...)
DSPLIB_STATUS_NAME
The enumeration of all status codes.
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
A structure for a 1 dimensional buffer descriptor.
A structure for a 2 dimensional buffer descriptor.
Structure containing the parameters to initialize the kernel.
Structure that is reserved for internal use by the kernel.
uint8_t bufPblock[DSPLIB_CHOLESKY_INPLACE_IXX_IXX_OXX_PBLOCK_SIZE]
int32_t order
Order of input buffer for different batches DSPLIB_cholesky_inplace_init that will be retrieved and u...