48 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
49 __SE_TEMPLATE_v1 se0Params = __gen_SE_TEMPLATE_v1();
50 __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();
51 int32_t strideR = pKerPrivArgs->
strideR;
52 int32_t colStrideR = strideR /
sizeof(dataType);
54 typedef typename c7x::make_full_vector<dataType>::type vec;
55 uint32_t eleCount = c7x::element_count_of<vec>::value;
56 __SE_ELETYPE SE_ELETYPE = c7x::se_eletype<vec>::value;
57 __SE_VECLEN SE_VECLEN = c7x::se_veclen<vec>::value;
58 __SA_VECLEN SA_VECLEN = c7x::sa_veclen<vec>::value;
61 se0Params.ICNT1 = eleCount;
62 se0Params.DIM1 = colStrideR;
63 se0Params.DIM2 = colStrideR * eleCount;
64 se0Params.DIMFMT = __SE_DIMFMT_3D;
65 se0Params.ELETYPE = SE_ELETYPE;
66 se0Params.VECLEN = SE_VECLEN;
67 if (
sizeof(dataType) == 4) {
68 se0Params.TRANSPOSE = __SE_TRANSPOSE_32BIT;
71 se0Params.TRANSPOSE = __SE_TRANSPOSE_64BIT;
74 sa0Params.DIMFMT = __SA_DIMFMT_1D;
75 sa0Params.VECLEN = SA_VECLEN;
77 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE)) = se0Params;
78 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE)) = sa0Params;
85 template <
typename dataType>
97 DSPLIB_qrd_blk_move_init_ci<dataType>(pKerPrivArgs->
heightA, pKerPrivArgs->
widthA, pKerPrivArgs->
strideR,
98 &pKerPrivArgs->
bufPblock[18 * SE_PARAM_SIZE]);
99 DSPLIB_qrd_identity_matrix_generate_init_ci<dataType>(pKerPrivArgs->
heightA, pKerPrivArgs->
strideQ,
101 DSPLIB_qrd_alpha_init_ci<dataType>(handle);
102 DSPLIB_R_column_init_ci<dataType>(handle);
103 DSPLIB_Q_matrix_init_ci<dataType>(handle);
127 template <
typename vec>
static inline vec
getSqrt(vec a)
129 vec zeroVec = (vec) 0;
130 vec Half = (vec) 0.5f;
131 vec OneP5 = (vec) 1.5f;
132 __vpred zeroPred = __cmp_eq_pred(zeroVec, a);
133 vec nonZero = (vec) 1e-12f;
135 a = __select(zeroPred, nonZero, a);
137 vec x = __recip_sqrt(a);
140 vec tmp1 = OneP5 - tmp * x * Half;
144 tmp1 = OneP5 - tmp * y * Half;
150 template <
typename dataType>
155 __SE_TEMPLATE_v1 se0Params,
156 __SA_TEMPLATE_v1 sa0Params)
161 __SE_TEMPLATE_v1 se1Params = se0Params;
162 __SA_TEMPLATE_v1 sa1Params = sa0Params;
164 typedef typename c7x::make_full_vector<dataType>::type vec;
165 uint32_t eleCount = c7x::element_count_of<vec>::value;
167 int32_t nVec = nRows / eleCount;
168 int32_t se0ICNT2 = nVec / 2;
169 int32_t se1ICNT2_int = nVec - se0ICNT2;
170 int32_t remainingEle = nRows - (nVec * eleCount);
171 int32_t se1ICNT2 = (remainingEle > 0) ? se1ICNT2_int + 1 : se1ICNT2_int;
172 se0Params.ICNT2 = se0ICNT2;
173 se1Params.ICNT2 = se1ICNT2;
176 dataType *pSE1 = pR + (se0ICNT2 * colStrideR * eleCount);
178 __SE1_OPEN(pSE1, se1Params);
180 __SE0_OPEN(pSE0, se0Params);
183 int32_t iterloop1 = se0ICNT2 / 3;
184 int32_t vertical = iterloop1 * 3;
185 sa0Params.ICNT0 = (se0ICNT2 * eleCount);
186 sa1Params.ICNT0 = nRows - ((se0ICNT2 * eleCount));
187 dataType *pUHalf = pU + (se0ICNT2 * eleCount);
188 if (sa0Params.ICNT0){
189 __SA0_OPEN(sa0Params);
191 __SA1_OPEN(sa1Params);
193 vec sumVec1 = (vec) 0;
194 vec sumVec2 = (vec) 0;
195 vec sumVec3 = (vec) 0;
196 vec sumVec4 = (vec) 0;
197 vec sumVec5 = (vec) 0;
198 vec sumVec6 = (vec) 0;
200 for (int32_t iter = 0; iter < iterloop1; iter++) {
201 vec v1 = c7x::strm_eng<0, vec>::get_adv();
202 vec v2 = c7x::strm_eng<1, vec>::get_adv();
203 vec v3 = c7x::strm_eng<0, vec>::get_adv();
204 vec v4 = c7x::strm_eng<1, vec>::get_adv();
205 vec v5 = c7x::strm_eng<0, vec>::get_adv();
206 vec v6 = c7x::strm_eng<1, vec>::get_adv();
208 __vpred pred = c7x::strm_agen<0, vec>::get_vpred();
209 vec *pStoreVec = c7x::strm_agen<0, vec>::get_adv((dataType *) pU);
210 __vstore_pred(pred, pStoreVec, v1);
212 pred = c7x::strm_agen<1, vec>::get_vpred();
213 pStoreVec = c7x::strm_agen<1, vec>::get_adv((dataType *) pUHalf);
214 __vstore_pred(pred, pStoreVec, v2);
216 pred = c7x::strm_agen<0, vec>::get_vpred();
217 pStoreVec = c7x::strm_agen<0, vec>::get_adv((dataType *) pU);
218 __vstore_pred(pred, pStoreVec, v3);
220 pred = c7x::strm_agen<1, vec>::get_vpred();
221 pStoreVec = c7x::strm_agen<1, vec>::get_adv((dataType *) pUHalf);
222 __vstore_pred(pred, pStoreVec, v4);
224 pred = c7x::strm_agen<0, vec>::get_vpred();
225 pStoreVec = c7x::strm_agen<0, vec>::get_adv((dataType *) pU);
226 __vstore_pred(pred, pStoreVec, v5);
228 pred = c7x::strm_agen<1, vec>::get_vpred();
229 pStoreVec = c7x::strm_agen<1, vec>::get_adv((dataType *) pUHalf);
230 __vstore_pred(pred, pStoreVec, v6);
241 for (; vertical < se0ICNT2 - 1; vertical += 2) {
242 vec v1 = c7x::strm_eng<0, vec>::get_adv();
243 vec v2 = c7x::strm_eng<1, vec>::get_adv();
244 vec v3 = c7x::strm_eng<0, vec>::get_adv();
245 vec v4 = c7x::strm_eng<1, vec>::get_adv();
247 __vpred pred = c7x::strm_agen<0, vec>::get_vpred();
248 vec *pStoreVec = c7x::strm_agen<0, vec>::get_adv((dataType *) pU);
249 __vstore_pred(pred, pStoreVec, v1);
251 pred = c7x::strm_agen<1, vec>::get_vpred();
252 pStoreVec = c7x::strm_agen<1, vec>::get_adv((dataType *) pUHalf);
253 __vstore_pred(pred, pStoreVec, v2);
255 pred = c7x::strm_agen<0, vec>::get_vpred();
256 pStoreVec = c7x::strm_agen<0, vec>::get_adv((dataType *) pU);
257 __vstore_pred(pred, pStoreVec, v3);
259 pred = c7x::strm_agen<1, vec>::get_vpred();
260 pStoreVec = c7x::strm_agen<1, vec>::get_adv((dataType *) pUHalf);
261 __vstore_pred(pred, pStoreVec, v4);
269 for (; vertical < se0ICNT2; vertical++) {
270 vec v1 = c7x::strm_eng<0, vec>::get_adv();
271 vec v2 = c7x::strm_eng<1, vec>::get_adv();
272 __vpred pred = c7x::strm_agen<0, vec>::get_vpred();
273 vec *pStoreVec = c7x::strm_agen<0, vec>::get_adv((dataType *) pU);
274 __vstore_pred(pred, pStoreVec, v1);
276 pred = c7x::strm_agen<1, vec>::get_vpred();
277 pStoreVec = c7x::strm_agen<1, vec>::get_adv((dataType *) pUHalf);
278 __vstore_pred(pred, pStoreVec, v2);
283 if (se0ICNT2 != se1ICNT2_int) {
284 vec v1 = c7x::strm_eng<1, vec>::get_adv();
285 __vpred pred = c7x::strm_agen<1, vec>::get_vpred();
286 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv((dataType *) pUHalf);
287 __vstore_pred(pred, pStoreVec, v1);
290 sumVec1 = sumVec1 + sumVec3 + sumVec5 + sumVec2;
291 c7x_horizontal_add(sumVec1, &sum);
293 if (remainingEle > 0) {
294 vec v1 = c7x::strm_eng<1, vec>::get_adv();
296 __vpred pred = c7x::strm_agen<1, vec>::get_vpred();
297 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv((dataType *) pUHalf);
298 __vstore_pred(pred, pStoreVec, v1);
300 for (int32_t i = 0; i < remainingEle; i++) {
301 sum += v1.s[i] * v1.s[i];
304 if (sa0Params.ICNT0){
324 __SE_TEMPLATE_v1 se0Params,
325 __SA_TEMPLATE_v1 sa0Params);
330 __SE_TEMPLATE_v1 se0Params,
331 __SA_TEMPLATE_v1 sa0Params);
333 template <
typename dataType>
338 __SA_TEMPLATE_v1 sa1Params = sa0Params;
340 sa1Params.ICNT1 = nRows / 2;
341 sa0Params.ICNT1 = nRows - sa1Params.ICNT1;
343 __SA0_OPEN(sa0Params);
344 if (sa1Params.ICNT1) {
345 __SA1_OPEN(sa1Params);
347 for (
int vertical = 0; vertical < nRows / 2; vertical++) {
348 dataType *pR1 = c7x::strm_agen<0, dataType>::get_adv(pR);
351 dataType *pR2 = c7x::strm_agen<1, dataType>::get_adv(pR + colStrideR);
354 if (sa0Params.ICNT1 != sa1Params.ICNT1) {
355 dataType *pR1 = c7x::strm_agen<0, dataType>::get_adv(pR);
359 if (sa1Params.ICNT1) {
369 template <
typename dataType>
371 const void *restrict pA,
372 const void *restrict pQ,
373 const void *restrict pR,
374 const void *restrict pU,
375 const void *restrict pScratch)
382 int32_t nRows = pKerPrivArgs->
heightA;
383 int32_t nCols = pKerPrivArgs->
widthA;
384 int32_t strideA = pKerPrivArgs->
strideA;
385 int32_t strideQ = pKerPrivArgs->
strideQ;
386 int32_t strideR = pKerPrivArgs->
strideR;
387 int32_t dataSize =
sizeof(dataType);
388 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
389 int32_t colStrideQ = strideQ / dataSize;
390 int32_t colStrideR = strideR / dataSize;
391 int32_t colStrideA = strideA / dataSize;
398 dataType *pLocalA = (dataType *) pA;
399 dataType *pLocalQ = (dataType *) pQ;
400 dataType *pLocalR = (dataType *) pR;
401 dataType *pLocalU = (dataType *) pU;
402 dataType *pSum = (dataType *) pScratch;
404 DSPLIB_DEBUGPRINTFN(0,
"pALocal: %p pLocalQ: %p pLocalR: %p pLocalU: %p nCols: %d nRows: %d\n", pLocalA, pLocalQ,
405 pLocalR, pLocalU, nCols, nRows);
407 __SE_TEMPLATE_v1 seAlphaParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (SE_PARAM_SIZE));
408 __SA_TEMPLATE_v1 saAlphaParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
409 __SA_TEMPLATE_v1 saRColParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE));
414 DSPLIB_qrd_blk_move_exec_ci<dataType>(pLocalR, pLocalA, nRows, nCols, colStrideR, colStrideA,
415 &pBlock[18 * SE_PARAM_SIZE]);
421 DSPLIB_qrd_identity_matrix_generate_exec_ci<dataType>(pLocalQ, nRows, colStrideQ, pBlock);
423 if (nRows <= nCols) {
424 loopCount = nRows - 2;
427 loopCount = nCols - 1;
430 for (col = 0; col <= loopCount; col++) {
431 sum = DSPLIB_qrd_alpha_exec_ci<dataType>(&pLocalR[col + col * colStrideR], colStrideR, (nRows - col),
432 &pLocalU[col], seAlphaParams, saAlphaParams);
435 if (pLocalR[col + (col * colStrideR)] >= 0) {
438 pLocalU[col] = pLocalR[col + (col * colStrideR)] + alpha;
439 pLocalR[col + (col * colStrideR)] = -alpha;
440 DSPLIB_qrd_R_column_exec_ci<dataType>(&pLocalR[col + ((col + 1) * colStrideR)], colStrideR,
441 (nRows - (col + 1)), saRColParams);
442 if (alpha * pLocalU[col] != 0.0) {
443 dataType product = (alpha * pLocalU[col]);
444 dataType scale = __recip(product);
445 dataType twoP0 = 2.0;
447 scale = scale * (twoP0 - (product * scale));
448 scale = scale * (twoP0 - (product * scale));
450 if ((col + 1) < nCols) {
451 DSPLIB_qrd_R_matrix_exec_ci<dataType>(&pLocalR[(col) + (col * colStrideR)], &pLocalU[col],
452 &pSum[col + 1], scale, colStrideR, (nRows - col),
453 (nCols - (col + 1)), pBlock);
456 DSPLIB_qrd_Q_matrix_exec_ci<dataType>(&pLocalQ[(col)], &pLocalU[col], &pSum[0], scale, colStrideQ, nRows,
457 (nRows - col), pBlock);
468 const void *restrict pA,
469 const void *restrict pQ,
470 const void *restrict pR,
471 const void *restrict pU,
472 const void *restrict pScratch);
475 const void *restrict pA,
476 const void *restrict pQ,
477 const void *restrict pR,
478 const void *restrict pU,
479 const void *restrict pScratch);
template DSPLIB_STATUS DSPLIB_qrd_init_ci< double >(DSPLIB_kernelHandle handle, DSPLIB_bufParams2D_t *bufParamsA, DSPLIB_bufParams2D_t *bufParamsQ, DSPLIB_bufParams2D_t *bufParamsR, DSPLIB_bufParams1D_t *bufParamsU, const DSPLIB_qrdInitArgs *pKerInitArgs)
DSPLIB_STATUS DSPLIB_qrd_init_ci(DSPLIB_kernelHandle handle, DSPLIB_bufParams2D_t *bufParamsA, DSPLIB_bufParams2D_t *bufParamsQ, DSPLIB_bufParams2D_t *bufParamsR, DSPLIB_bufParams1D_t *bufParamsU, const DSPLIB_qrdInitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
template DSPLIB_STATUS DSPLIB_qrd_init_ci< float >(DSPLIB_kernelHandle handle, DSPLIB_bufParams2D_t *bufParamsA, DSPLIB_bufParams2D_t *bufParamsQ, DSPLIB_bufParams2D_t *bufParamsR, DSPLIB_bufParams1D_t *bufParamsU, const DSPLIB_qrdInitArgs *pKerInitArgs)
template float DSPLIB_qrd_alpha_exec_ci< float >(float *pR, int32_t colStrideR, int32_t nRows, float *pU, __SE_TEMPLATE_v1 se0Params, __SA_TEMPLATE_v1 sa0Params)
void DSPLIB_qrd_alpha_init_ci(DSPLIB_kernelHandle handle)
static vec getSqrt(vec a)
template void DSPLIB_qrd_R_column_exec_ci< double >(double *pR, int32_t colStrideR, int32_t nRows, __SA_TEMPLATE_v1 sa0Params)
template void DSPLIB_qrd_alpha_init_ci< float >(DSPLIB_kernelHandle handle)
static void DSPLIB_qrd_R_column_exec_ci(dataType *pR, int32_t colStrideR, int32_t nRows, __SA_TEMPLATE_v1 sa0Params)
template DSPLIB_STATUS DSPLIB_qrd_exec_ci< float >(DSPLIB_kernelHandle handle, const void *restrict pA, const void *restrict pQ, const void *restrict pR, const void *restrict pU, const void *restrict pScratch)
template DSPLIB_STATUS DSPLIB_qrd_exec_ci< double >(DSPLIB_kernelHandle handle, const void *restrict pA, const void *restrict pQ, const void *restrict pR, const void *restrict pU, const void *restrict pScratch)
DSPLIB_STATUS DSPLIB_qrd_exec_ci(DSPLIB_kernelHandle handle, const void *restrict pA, const void *restrict pQ, const void *restrict pR, const void *restrict pU, const void *restrict pScratch)
This function is the main execution function for the C7x implementation of the kernel....
template void DSPLIB_qrd_alpha_init_ci< double >(DSPLIB_kernelHandle handle)
static dataType DSPLIB_qrd_alpha_exec_ci(dataType *pR, int32_t colStrideR, int32_t nRows, dataType *pU, __SE_TEMPLATE_v1 se0Params, __SA_TEMPLATE_v1 sa0Params)
template void DSPLIB_qrd_R_column_exec_ci< float >(float *pR, int32_t colStrideR, int32_t nRows, __SA_TEMPLATE_v1 sa0Params)
template double DSPLIB_qrd_alpha_exec_ci< double >(double *pR, int32_t colStrideR, int32_t nRows, double *pU, __SE_TEMPLATE_v1 se0Params, __SA_TEMPLATE_v1 sa0Params)
Header file for the kernel's internal use. For the kernel's interface, please see DSPLIB_qrd.
#define DSPLIB_DEBUGPRINTFN(N, fmt,...)
DSPLIB_STATUS_NAME
The enumeration of all status codes.
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
A structure for a 1 dimensional buffer descriptor.
A structure for a 2 dimensional buffer descriptor.
Structure containing the parameters to initialize the kernel.
Structure that is reserved for internal use by the kernel.
int32_t strideR
Stride between rows of R output data matrix.
uint32_t heightA
Height of input data matrix.
uint32_t widthA
Size of input buffer for different batches DSPLIB_qrd_init that will be retrieved and used by DSPLIB_...
uint8_t bufPblock[DSPLIB_QRD_IXX_IXX_OXX_PBLOCK_SIZE]
Buffer to save SE & SA configuration parameters.
int32_t strideQ
Stride between rows of Q output data matrix.
int32_t strideA
Stride between rows of input data matrix.