DSPLIB User Guide
DSPLIB_cholesky_inplace_ci.cpp
Go to the documentation of this file.
1 /******************************************************************************/
5 /* Copyright (C) 2017 Texas Instruments Incorporated - https://www.ti.com/
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  *
14  * Redistributions in binary form must reproduce the above copyright
15  * notice, this list of conditions and the following disclaimer in the
16  * documentation and/or other materials provided with the
17  * distribution.
18  *
19  * Neither the name of Texas Instruments Incorporated nor the names of
20  * its contributors may be used to endorse or promote products derived
21  * from this software without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34  *
35  ******************************************************************************/
36 
37 /******************************************************************************
38  * Version 1.0 Date 10/2/22 Author: Asheesh Bhardwaj
39  *****************************************************************************/
40 
41 /*******************************************************************************
42  *
43  * INCLUDES
44  *
45  ******************************************************************************/
46 
48 
49 /*******************************************************************************
50  *
51  * DEFINES
52  *
53  ******************************************************************************/
54 
// Unroll factor of the accumulation loops in the factorization kernel. The loop bodies are
// written out four times by hand, the LEZR lookup tables are sized with it, and the row
// index is reduced with `rowNumber & (UNROLL_COUNT - 1)`, so the value must stay a power
// of two and match the hand-unrolled loop bodies.
55 #define UNROLL_COUNT 4
// NOTE(review): not referenced anywhere in the visible part of this listing.
56 #define MIN_HORIZONTAL_COLUMNS_FOR_UNROLL 2
// NOTE(review): not referenced anywhere in the visible part of this listing.
57 #define NUM_VECS_IN_TILE 6
58 
59 /*******************************************************************************
60  *
61  * INITIALIZATION
62  *
63  ******************************************************************************/
64 
66 {
 // Builds the streaming-engine (SE) and streaming-address-generator (SA) templates that the
 // factorization kernel later reads back from pKerPrivArgs->bufPblock, and derives
 // pKerPrivArgs->shiftForVecLenDiv from the vector length. Returns `status`.
 // NOTE(review): the function header and the declarations of `status` and `pKerPrivArgs`
 // are not visible in this listing (the corresponding lines are missing).
67  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
68 
70 
71  __SE_TEMPLATE_v1 seParamFetchL; // Left fetch
72  __SE_TEMPLATE_v1 seParamFetchR; // Right fetch
73  __SA_TEMPLATE_v1 saParamMulStore; // Stores the multiplier values
74  __SA_TEMPLATE_v1 saParamLStore; // Stores the L values
75 
76  __SE_ELETYPE SE_ELETYPE;
77  __SE_VECLEN SE_VECLEN;
78  __SA_VECLEN SA_VECLEN;
79 
81 
82  typedef typename c7x::make_full_vector<dataType>::type vec;
83 
 // Element type and vector length settings are taken from the full-width vector of dataType.
84  SE_VECLEN = c7x::se_veclen<vec>::value;
85  SE_ELETYPE = c7x::se_eletype<vec>::value;
86  SA_VECLEN = c7x::sa_veclen<vec>::value;
87 
88  uint32_t eleCount = c7x::element_count_of<vec>::value;
89 
 // Count how many right shifts bring eleCount to zero, starting from -1. For a power-of-two
 // eleCount this yields log2(eleCount); the kernel uses it as a shift in place of a division
 // by the vector length.
90  pKerPrivArgs->shiftForVecLenDiv = -1;
91  uint32_t vecLenValue = eleCount;
92  while (vecLenValue != 0) {
93  vecLenValue >>= 1;
94  pKerPrivArgs->shiftForVecLenDiv++;
95  }
96 
 // Row stride converted from bytes to elements.
97  int32_t yStride = pKerPrivArgs->stride / sizeof(dataType);
98  /**********************************************************************/
99  /* Prepare streaming engine for fetching L values(Left) Merge approach*/
100  /**********************************************************************/
101  seParamFetchL = __gen_SE_TEMPLATE_v1();
102 
 // ICNT1 and ICNT2 are placeholders here; the kernel overwrites them for every row before
 // opening the stream. DIM2 advances by two vectors per step.
103  seParamFetchL.ICNT0 = eleCount;
104  seParamFetchL.ICNT1 = 0; // No of rows to process
105  seParamFetchL.DIM1 = yStride; // order
106  seParamFetchL.ICNT2 = 0; // No of left fetches
107  seParamFetchL.DIM2 = eleCount << 1;
108 
109  seParamFetchL.ELETYPE = SE_ELETYPE;
110  seParamFetchL.VECLEN = SE_VECLEN;
111  seParamFetchL.DIMFMT = __SE_DIMFMT_3D;
112 
113  /**********************************************************************/
114  /* Prepare streaming engine for fetching L values(Right) Merge approach*/
115  /**********************************************************************/
116  seParamFetchR = __gen_SE_TEMPLATE_v1();
117 
 // Same layout as the left template; the kernel opens it one vector further along.
118  seParamFetchR.ICNT0 = eleCount;
119  seParamFetchR.ICNT1 = 0; // No of rows to process
120  seParamFetchR.DIM1 = yStride;
121  seParamFetchR.ICNT2 = 0; // No of right fetches
122  seParamFetchR.DIM2 = eleCount << 1;
123 
124  seParamFetchR.ELETYPE = SE_ELETYPE;
125  seParamFetchR.VECLEN = SE_VECLEN;
126  seParamFetchR.DIMFMT = __SE_DIMFMT_3D;
127 
128  /**********************************************************************/
129  /* Prepare Address generator to store the multipliers */
130  /**********************************************************************/
131  saParamMulStore = __gen_SA_TEMPLATE_v1();
132 
 // NOTE(review): this template is written to the parameter block below, but the kernel in
 // this listing addresses the multiplier buffer with a plain pointer and never reads it.
133  saParamMulStore.ICNT0 = pKerPrivArgs->order;
134  saParamMulStore.VECLEN = __SA_VECLEN_1ELEM;
135  saParamMulStore.DIMFMT = __SA_DIMFMT_1D;
136 
137  /**********************************************************************/
138  /* Prepare Address generator to store the L values */
139  /**********************************************************************/
140  saParamLStore = __gen_SA_TEMPLATE_v1();
141 
 // ICNT0 is replaced by the kernel with the remaining elements per row of each block.
142  saParamLStore.ICNT0 = pKerPrivArgs->order;
143  saParamLStore.VECLEN = SA_VECLEN;
144  saParamLStore.DIMFMT = __SA_DIMFMT_1D;
145 
 // Persist the four templates at their fixed offsets inside the parameter block.
146  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pKerPrivArgs->bufPblock + SE_SE2_PARAM_OFFSET) = seParamFetchL;
147  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pKerPrivArgs->bufPblock + SE_SE3_PARAM_OFFSET) = seParamFetchR;
148  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pKerPrivArgs->bufPblock + SA_SA0_PARAM_OFFSET) = saParamMulStore;
149  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pKerPrivArgs->bufPblock + SA_SA1_PARAM_OFFSET) = saParamLStore;
150 
151  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Exiting function");
152  return status;
153 }
154 
157 
158 // Initialize SE params
// Kernel initialization entry point: prepares the parameter block for both the
// factorization kernel and the positive-definiteness check.
// bufParamsA, bufParamsMul and pKerInitArgs are not referenced in the body.
// Always returns DSPLIB_SUCCESS; the results of the two init helpers are not inspected.
// NOTE(review): the line carrying the return type, function name and `handle` parameter, and
// the declaration of `pKerPrivArgs`, are missing from this listing.
159 template <typename dataType>
161  DSPLIB_bufParams2D_t *bufParamsA,
162  DSPLIB_bufParams1D_t *bufParamsMul,
163  const DSPLIB_cholesky_inplace_InitArgs *pKerInitArgs)
164 {
165  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
166 
167  DSPLIB_STATUS status = DSPLIB_SUCCESS;
169  uint8_t *pBlock = pKerPrivArgs->bufPblock;
170  int32_t order = pKerPrivArgs->order;
171  int32_t strideA = pKerPrivArgs->stride;
 // Row stride of A converted from bytes to elements.
172  int32_t colAStride = strideA / sizeof(dataType);
173 
 // Set up the SE/SA templates for the factorization, then for the definiteness check.
174  DSPLIB_cholesky_inplace_c7x_PingPong_init<dataType>(handle);
175  DSPLIB_cholesky_inplace_isPosDefinite_init<dataType>(order, colAStride, pBlock);
176 
177  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Exiting function");
178 
179  return status;
180 }
181 
183  DSPLIB_bufParams2D_t *bufParamsA,
184  DSPLIB_bufParams1D_t *bufParamsMul,
185  const DSPLIB_cholesky_inplace_InitArgs *pKerInitArgs);
186 
188  DSPLIB_bufParams2D_t *bufParamsA,
189  DSPLIB_bufParams1D_t *bufParamsMul,
190  const DSPLIB_cholesky_inplace_InitArgs *pKerInitArgs);
191 
192 /*******************************************************************************
193  *
194  * IMPLEMENTATION
195  *
196  ******************************************************************************/
197 
/**
 * Returns an approximation of 1/sqrt(a).
 *
 * The __recip_sqrt intrinsic supplies the starting estimate, which is then
 * sharpened by two Newton-Raphson steps of the form
 *    x <- x * (1.5 - 0.5 * a * x * x).
 * The steps are written out explicitly instead of being placed in a loop.
 *
 * No check is made on the argument here; callers are expected to pass a
 * value for which the reciprocal square root is meaningful.
 */
template <typename dataType> inline dataType getRecipSqrt(dataType a)
{
   const dataType kHalf        = 0.5f;
   const dataType kThreeHalves = 1.5f;

   // Hardware estimate of the reciprocal square root.
   const dataType seed = __recip_sqrt(a);

   // First refinement step.
   const dataType refinedOnce = seed * (kThreeHalves - (a * seed * seed * kHalf));

   // Second refinement step gives the returned value.
   return refinedOnce * (kThreeHalves - (a * refinedOnce * refinedOnce * kHalf));
}
219 
220 template <typename dataType> c7x::uchar_vec DSPLIB_cholesky_inplace_getMaskIncrement();
221 template <> c7x::uchar_vec DSPLIB_cholesky_inplace_getMaskIncrement<float>() { return (c7x::uchar_vec) 4; };
222 template <> c7x::uchar_vec DSPLIB_cholesky_inplace_getMaskIncrement<double>() { return (c7x::uchar_vec) 8; };
223 
// Factorization kernel (C7x streaming-engine implementation).
//
// Rows are produced one at a time, in blocks of vecLen rows. For every row the code
//   1. streams all previously produced rows through SE0/SE1 and accumulates
//      (previous row vector) * (that row's element in the current diagonal column),
//   2. subtracts the accumulated sums from the matching vectors of A (loaded through SA2),
//   3. multiplies the result by getRecipSqrt() of the diagonal element and stores it
//      through SA1 (predicated, so the row tail is not overwritten).
//
// Parameters as used in the body:
//   pKerPrivArgs - provides order, stride (in bytes), shiftForVecLenDiv and the SE/SA
//                  templates stored in bufPblock.
//   pInALocal    - input matrix A.
//   pOutULocal   - output factor.
//   pMulBuffer   - scratch for the per-row multipliers; written and re-read within a row.
// Returns DSPLIB_SUCCESS unconditionally.
//
// NOTE(review): the line carrying the return type, the function name and the first
// parameter is missing from this listing; the first argument passed by the caller
// (enable_test) is not referenced in the body.
// NOTE(review): pInALocal and pOutULocal are both restrict-qualified, yet the caller in this
// file passes the same buffer for both - confirm this is intended.
224 template <typename dataType>
226  DSPLIB_cholesky_inplace_PrivArgs *pKerPrivArgs,
227  dataType *restrict pInALocal,
228  dataType *restrict pOutULocal,
229  dataType *restrict pMulBuffer)
230 {
231 
232  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
233 
234  typedef typename c7x::make_full_vector<dataType>::type vec;
235  int32_t eleCount = c7x::element_count_of<vec>::value;
236 
 // Templates prepared by the init function; the A-load generator starts as a copy of the
 // L-store generator.
237  __SE_TEMPLATE_v1 seParamFetchL = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pKerPrivArgs->bufPblock + SE_SE2_PARAM_OFFSET);
238  __SE_TEMPLATE_v1 seParamFetchR = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pKerPrivArgs->bufPblock + SE_SE3_PARAM_OFFSET);
239 
240  __SA_TEMPLATE_v1 saParamLStore = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pKerPrivArgs->bufPblock + SA_SA1_PARAM_OFFSET);
241  __SA_TEMPLATE_v1 saParamALoad = saParamLStore;
242 
243  int32_t order = pKerPrivArgs->order;
244  int32_t vecLen = eleCount;
245 
246  int32_t row, fetch, lRow;
247  int32_t shiftForVecLenDiv = pKerPrivArgs->shiftForVecLenDiv;
248  int32_t stride = pKerPrivArgs->stride;
249  int32_t yStride = stride / sizeof(dataType);
250 
251  dataType *pLFirstRow = pOutULocal;
252  dataType recipDiagValue;
253 
 // Permute mask: bytes 0..7 are set to byte indices 0..7, the remaining bytes are left
 // unset. Only lane 0 of each permuted vector is read below.
254  c7x::uchar_vec vMask, vMaskInit;
255  vMaskInit.s[0] = 0;
256  vMaskInit.s[1] = 1;
257  vMaskInit.s[2] = 2;
258  vMaskInit.s[3] = 3;
259  vMaskInit.s[4] = 4;
260  vMaskInit.s[5] = 5;
261  vMaskInit.s[6] = 6;
262  vMaskInit.s[7] = 7;
263 
 // Added to the mask after every row so that the next row's diagonal element is selected.
264  c7x::uchar_vec vMaskIncrement = DSPLIB_cholesky_inplace_getMaskIncrement<dataType>();
265 
 // Number of row blocks: order divided by vecLen, rounded up (division done as a shift).
266  int32_t blockMax = int32_t((uint32_t) (order + vecLen - 1) >> (uint32_t) shiftForVecLenDiv);
267  int32_t extraRows = vecLen - (int32_t) ((uint32_t) order & (uint32_t) (vecLen - 1)); // gives extra rows needed
268  // to make the height of matrix
269  // integral multiple of vecLen
270  if (extraRows == vecLen) {
271  extraRows = 0;
272  }
273 
274  int32_t elemsPerRow = order;
275  int32_t offset = 0;
276  int32_t rowNumber = 0;
277  int32_t block;
278  int32_t elemsPerRowCeil = elemsPerRow + vecLen - 1;
279 
 // LEZR lookup indexed by rowNumber % UNROLL_COUNT: entry 0 switches LEZR off, entry r
 // (1..UNROLL_COUNT-1) selects __SE_LEZR_ICNT1 with count UNROLL_COUNT - r.
 // Presumably this zero-pads the streamed rows up to a multiple of UNROLL_COUNT so the
 // hand-unrolled loops can consume four rows per iteration - confirm against the SE
 // documentation.
280  int32_t lezrCount[UNROLL_COUNT];
281  __SE_LEZR lezrDim[UNROLL_COUNT];
282 
283  int32_t *lezrCountPtr = lezrCount;
284  __SE_LEZR *lezrDimPtr = lezrDim;
285 
286  *lezrCountPtr = 0;
287  *lezrDimPtr = __SE_LEZR_OFF;
288  lezrCountPtr++;
289  lezrDimPtr++;
290 
291  for (int32_t i = UNROLL_COUNT - 1; i > 0; i--) {
292  *lezrCountPtr = i;
293  *lezrDimPtr = __SE_LEZR_ICNT1;
294  lezrCountPtr++;
295  lezrDimPtr++;
296  }
297 
 // Phase 1: every block except the last two. Rows here span three or more vectors. The
 // first left/right vector pair extracts the multipliers from the streamed rows and saves
 // them in pMulBuffer; the remaining pairs of the row reuse the saved multipliers.
298  for (block = 0; block < blockMax - 2; block++) {
299  // configuration for sa
300  saParamLStore.ICNT0 = elemsPerRow;
301  saParamALoad.ICNT0 = elemsPerRow;
302 
303  vMask = vMaskInit;
304  for (row = 0; row < vecLen; row++) {
305  int32_t fetchesPerRow =
306  (int32_t) ((uint32_t) elemsPerRowCeil >> (uint32_t) shiftForVecLenDiv); // number of vector fetches per row
307  int32_t leftFetchesPerRow = (int32_t) ((uint32_t) (fetchesPerRow + 1) >> 1u); // number of fetches by SE0
308  int32_t rightFetchesPerRow = fetchesPerRow - leftFetchesPerRow; // number of fetches by SE1
309  int32_t lezrIndex =
310  (int32_t) ((uint32_t) rowNumber & (uint32_t) (UNROLL_COUNT - 1)); // rowNumber%4=>4 is unroll count
311 
312  // configuration for SE
313  seParamFetchL.ICNT1 = rowNumber;
314  seParamFetchL.ICNT2 = leftFetchesPerRow;
315  seParamFetchL.LEZR = lezrDim[lezrIndex];
316  seParamFetchL.LEZR_CNT = lezrCount[lezrIndex];
317 
318  seParamFetchR.ICNT1 = rowNumber;
319  seParamFetchR.ICNT2 = rightFetchesPerRow;
320  seParamFetchR.LEZR = lezrDim[lezrIndex];
321  seParamFetchR.LEZR_CNT = lezrCount[lezrIndex];
322 
 // Row 0 has no previous rows, so the streams are not opened for it.
323  if (rowNumber > 0) {
324  __SE0_OPEN(pLFirstRow, seParamFetchL);
325  __SE1_OPEN(pLFirstRow + vecLen, seParamFetchR);
326  }
327 
328  __SA1_OPEN(saParamLStore);
329  __SA2_OPEN(saParamALoad);
330 
331  dataType *ptrL = pOutULocal + offset;
332  dataType *ptrA = pInALocal + offset;
333 
334  vec vLSum0 = vec(0); // Holds sum of all LxLy
335  vec vLSum1 = vec(0);
336  vec vLSum2 = vec(0);
337  vec vLSum3 = vec(0);
338 
339  vec vLA = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
340 
341  vec vRSum0 = vec(0); // Holds sum of all LxLy
342  vec vRSum1 = vec(0);
343  vec vRSum2 = vec(0);
344  vec vRSum3 = vec(0);
345 
346  vec vRA = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
347 
348  dataType *pMulStore = pMulBuffer;
349 
 // Four previous rows per iteration. Lane 0 of the permuted left vector is the
 // multiplier for that row; it scales both the left and the right vector and is
 // saved for the later vector pairs of this row.
350  for (lRow = 0; lRow < rowNumber; lRow += UNROLL_COUNT) {
351  vec vLL0 = c7x::strm_eng<0, vec>::get_adv(); // L value vector from each row
352  vec vLL0Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLL0)));
353  vLSum0 += vLL0 * vLL0Temp.s[0];
354  vec vLR0 = c7x::strm_eng<1, vec>::get_adv(); // L value vector from each row
355  vRSum0 += vLR0 * vLL0Temp.s[0];
356  *pMulStore = vLL0Temp.s[0];
357  pMulStore++;
358 
359  vec vLL1 = c7x::strm_eng<0, vec>::get_adv();
360  vec vLL1Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLL1)));
361  vLSum1 += vLL1 * vLL1Temp.s[0];
362  vec vLR1 = c7x::strm_eng<1, vec>::get_adv(); // L value vector from each row
363  vRSum1 += vLR1 * vLL1Temp.s[0];
364  *pMulStore = vLL1Temp.s[0];
365  pMulStore++;
366 
367  vec vLL2 = c7x::strm_eng<0, vec>::get_adv();
368  vec vLL2Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLL2)));
369  vLSum2 += vLL2 * vLL2Temp.s[0];
370  vec vLR2 = c7x::strm_eng<1, vec>::get_adv(); // L value vector from each row
371  vRSum2 += vLR2 * vLL2Temp.s[0];
372  *pMulStore = vLL2Temp.s[0];
373  pMulStore++;
374 
375  vec vLL3 = c7x::strm_eng<0, vec>::get_adv();
376  vec vLL3Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLL3)));
377  vLSum3 += vLL3 * vLL3Temp.s[0];
378  vec vLR3 = c7x::strm_eng<1, vec>::get_adv(); // L value vector from each row
379  vRSum3 += vLR3 * vLL3Temp.s[0];
380  *pMulStore = vLL3Temp.s[0];
381  pMulStore++;
382  }
383 
 // Combine the four partial sums and subtract them from A.
384  vLSum0 += vLSum1;
385  vLSum2 += vLSum3;
386 
387  vRSum0 += vRSum1;
388  vRSum2 += vRSum3;
389 
390  vLA -= vLSum2;
391  vec vLDiff = vLA - vLSum0;
392 
393  vRA -= vRSum2;
394  vec vRDiff = vRA - vRSum0;
395 
 // Lane 0 of the permuted difference is the diagonal element of this row.
396  vec vLDiffTemp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLDiff)));
397 
398  recipDiagValue = getRecipSqrt(vLDiffTemp.s[0]);
399 
400  __vpred vpStoreL = c7x::strm_agen<1, vec>::get_vpred();
401  vec *outVecPtrL = c7x::strm_agen<1, vec>::get_adv(ptrL);
402  __vstore_pred(vpStoreL, outVecPtrL, vLDiff * recipDiagValue);
403 
404  __vpred vpStoreR = c7x::strm_agen<1, vec>::get_vpred();
405  vec *outVecPtrR = c7x::strm_agen<1, vec>::get_adv(ptrL);
406  __vstore_pred(vpStoreR, outVecPtrR, vRDiff * recipDiagValue);
407 
408  /* Handling all the pong fetches */
409 
 // Remaining vector pairs of the row: same accumulation, but the multipliers are
 // reloaded from pMulBuffer and the reciprocal computed above is reused.
410  for (fetch = 0; fetch < leftFetchesPerRow - 1; fetch++) {
411  vLSum0 = vec(0);
412  vLSum1 = vec(0);
413  vLSum2 = vec(0);
414  vLSum3 = vec(0);
415 
416  vLA = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
417 
418  vRSum0 = vec(0);
419  vRSum1 = vec(0);
420  vRSum2 = vec(0);
421  vRSum3 = vec(0);
422 
423  vRA = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
424 
425  pMulStore = pMulBuffer;
426 
427  for (lRow = 0; lRow < rowNumber; lRow += UNROLL_COUNT) {
428  vec multiplier0 = __vload_dup(pMulStore);
429  pMulStore++;
430  vec vLL0 = c7x::strm_eng<0, vec>::get_adv();
431  vec vLR0 = c7x::strm_eng<1, vec>::get_adv();
432  vLSum0 += vLL0 * multiplier0;
433  vRSum0 += vLR0 * multiplier0;
434 
435  vec multiplier1 = __vload_dup(pMulStore);
436  pMulStore++;
437  vec vLL1 = c7x::strm_eng<0, vec>::get_adv();
438  vec vLR1 = c7x::strm_eng<1, vec>::get_adv();
439  vLSum1 += vLL1 * multiplier1;
440  vRSum1 += vLR1 * multiplier1;
441 
442  vec multiplier2 = __vload_dup(pMulStore);
443  pMulStore++;
444  vec vLL2 = c7x::strm_eng<0, vec>::get_adv();
445  vec vLR2 = c7x::strm_eng<1, vec>::get_adv();
446  vLSum2 += vLL2 * multiplier2;
447  vRSum2 += vLR2 * multiplier2;
448 
449  vec multiplier3 = __vload_dup(pMulStore);
450  pMulStore++;
451  vec vLL3 = c7x::strm_eng<0, vec>::get_adv();
452  vec vLR3 = c7x::strm_eng<1, vec>::get_adv();
453  vLSum3 += vLL3 * multiplier3;
454  vRSum3 += vLR3 * multiplier3;
455  }
456 
457  vLSum0 += vLSum1;
458  vLSum2 += vLSum3;
459 
460  vRSum0 += vRSum1;
461  vRSum2 += vRSum3;
462 
463  vLA -= vLSum2;
464  vec vLDiff1 = vLA - vLSum0;
465 
466  vRA -= vRSum2;
467  vec vRDiff1 = vRA - vRSum0;
468 
469  __vpred vpStoreL1 = c7x::strm_agen<1, vec>::get_vpred();
470  vec *outVecPtrL1 = c7x::strm_agen<1, vec>::get_adv(ptrL);
471  __vstore_pred(vpStoreL1, outVecPtrL1, vLDiff1 * recipDiagValue);
472 
473  __vpred vpStoreR1 = c7x::strm_agen<1, vec>::get_vpred();
474  vec *outVecPtrR1 = c7x::strm_agen<1, vec>::get_adv(ptrL);
475  __vstore_pred(vpStoreR1, outVecPtrR1, vRDiff1 * recipDiagValue);
476  }
477 
 // Next row: one row down, and the mask moves to the next diagonal element.
478  offset += yStride;
479  rowNumber++;
480  vMask += vMaskIncrement; // uchar(4);
481  }
482 
 // Next block: the processed region starts vecLen columns further right.
483  pLFirstRow += vecLen;
484  offset += vecLen;
485  elemsPerRow -= vecLen;
486  elemsPerRowCeil -= vecLen;
487  }
488 
 // Phase 2: the second-to-last block. Each stream delivers exactly one vector per
 // previous row (ICNT2 = 1), so the multipliers are used directly and not saved.
489  for (; block < blockMax - 1; block++) {
490  // configuration for sa
491  saParamLStore.ICNT0 = elemsPerRow;
492  saParamALoad.ICNT0 = elemsPerRow;
493 
494  vMask = vMaskInit;
495  for (row = 0; row < vecLen; row++) {
496  int32_t lezrIndex =
497  (int32_t) ((uint32_t) rowNumber & (uint32_t) (UNROLL_COUNT - 1)); // rowNumber%4=>4 is unroll count
498 
499  // configuration for SE
500  seParamFetchL.ICNT1 = rowNumber;
501  seParamFetchL.ICNT2 = 1;
502  seParamFetchL.LEZR = lezrDim[lezrIndex];
503  seParamFetchL.LEZR_CNT = lezrCount[lezrIndex];
504 
505  seParamFetchR.ICNT1 = rowNumber;
506  seParamFetchR.ICNT2 = 1;
507  seParamFetchR.LEZR = lezrDim[lezrIndex];
508  seParamFetchR.LEZR_CNT = lezrCount[lezrIndex];
509 
510  if (rowNumber > 0) {
511  __SE0_OPEN(pLFirstRow, seParamFetchL);
512  __SE1_OPEN(pLFirstRow + vecLen, seParamFetchR);
513  }
514 
515  __SA1_OPEN(saParamLStore);
516  __SA2_OPEN(saParamALoad);
517 
518  dataType *ptrL = pOutULocal + offset;
519  dataType *ptrA = pInALocal + offset;
520 
521  vec vLSum0 = vec(0); // Holds sum of all LxLy
522  vec vLSum1 = vec(0);
523  vec vLSum2 = vec(0);
524  vec vLSum3 = vec(0);
525 
526  vec vLA = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
527 
528  vec vRSum0 = vec(0); // Holds sum of all LxLy
529  vec vRSum1 = vec(0);
530  vec vRSum2 = vec(0);
531  vec vRSum3 = vec(0);
532 
533  vec vRA = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
534 
535  for (lRow = 0; lRow < rowNumber; lRow += UNROLL_COUNT) {
536  vec vLL0 = c7x::strm_eng<0, vec>::get_adv(); // L value vector from each row
537  vec vLL0Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLL0)));
538  vLSum0 += vLL0 * vLL0Temp.s[0];
539  vec vLR0 = c7x::strm_eng<1, vec>::get_adv(); // L value vector from each row
540  vRSum0 += vLR0 * vLL0Temp.s[0];
541 
542  vec vLL1 = c7x::strm_eng<0, vec>::get_adv();
543  vec vLL1Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLL1)));
544  vLSum1 += vLL1 * vLL1Temp.s[0];
545  vec vLR1 = c7x::strm_eng<1, vec>::get_adv(); // L value vector from each row
546  vRSum1 += vLR1 * vLL1Temp.s[0];
547 
548  vec vLL2 = c7x::strm_eng<0, vec>::get_adv();
549  vec vLL2Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLL2)));
550  vLSum2 += vLL2 * vLL2Temp.s[0];
551  vec vLR2 = c7x::strm_eng<1, vec>::get_adv(); // L value vector from each row
552  vRSum2 += vLR2 * vLL2Temp.s[0];
553 
554  vec vLL3 = c7x::strm_eng<0, vec>::get_adv();
555  vec vLL3Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLL3)));
556  vLSum3 += vLL3 * vLL3Temp.s[0];
557  vec vLR3 = c7x::strm_eng<1, vec>::get_adv(); // L value vector from each row
558  vRSum3 += vLR3 * vLL3Temp.s[0];
559  }
560 
561  vLSum0 += vLSum1;
562  vLSum2 += vLSum3;
563 
564  vRSum0 += vRSum1;
565  vRSum2 += vRSum3;
566 
567  vLA -= vLSum2;
568  vec vLDiff = vLA - vLSum0;
569 
570  vRA -= vRSum2;
571  vec vRDiff = vRA - vRSum0;
572 
573  vec vLDiffTemp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLDiff)));
574 
575  recipDiagValue = getRecipSqrt(vLDiffTemp.s[0]);
576 
577  __vpred vpStoreL = c7x::strm_agen<1, vec>::get_vpred();
578  vec *outVecPtrL = c7x::strm_agen<1, vec>::get_adv(ptrL);
579  __vstore_pred(vpStoreL, outVecPtrL, vLDiff * recipDiagValue);
580 
581  __vpred vpStoreR = c7x::strm_agen<1, vec>::get_vpred();
582  vec *outVecPtrR = c7x::strm_agen<1, vec>::get_adv(ptrL);
583  __vstore_pred(vpStoreR, outVecPtrR, vRDiff * recipDiagValue);
584 
585  offset += yStride;
586  rowNumber++;
587  vMask += vMaskIncrement; // uchar(4);
588  }
589 
590  pLFirstRow += vecLen;
591  offset += vecLen;
592  elemsPerRow -= vecLen;
593  }
594 
 // Phase 3: the last block, one vector per row. The two streams are repurposed: DIM1 is
 // doubled and SE1 is opened one row below SE0, so SE0 walks every second previous row
 // starting at the first and SE1 every second one starting at the second. Only
 // vecLen - extraRows rows are processed.
595  for (; block < blockMax; block++) {
596  // configuration for sa
597  saParamLStore.ICNT0 = elemsPerRow;
598  saParamALoad.ICNT0 = elemsPerRow;
599 
600  /*seParamFetchL => Up fetch seParamFetchR => Down fetch*/
601  seParamFetchL.ICNT0 = vecLen;
602  seParamFetchL.DIM1 = (int32_t) ((uint32_t) yStride << 1u); // order
603  seParamFetchR.ICNT0 = vecLen;
604  seParamFetchR.DIM1 = (int32_t) ((uint32_t) yStride << 1u); // order
605 
606  seParamFetchL.DIMFMT = __SE_DIMFMT_2D;
607  seParamFetchR.DIMFMT = __SE_DIMFMT_2D;
608 
609  vMask = vMaskInit;
610 
611  for (row = 0; row < vecLen - extraRows; row++) {
612 
 // Rows 0 and 1 of the matrix; rowNumber counts rows across all blocks, so this is
 // reached only when the earlier phases processed no rows.
613  if (rowNumber < 2) {
614 
615  // configuration for SE
616  seParamFetchL.ICNT1 = rowNumber;
617 
618  if (rowNumber > 0) {
619 
 // Row 1: a single previous row, read once from SE0 without advancing.
620  __SE0_OPEN(pLFirstRow, seParamFetchL);
621 
622  __SA1_OPEN(saParamLStore);
623  __SA2_OPEN(saParamALoad);
624 
625  dataType *ptrA = pInALocal + offset;
626 
627  vec vLA = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
628 
629  vec vLSum0 = vec(0); // Holds sum of all LxLy
630  dataType *ptrL = pOutULocal + offset;
631 
632  vec vLL0 = c7x::strm_eng<0, vec>::get(); // L value vector from each row
633  vec vLL0Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLL0)));
634  vLSum0 += vLL0 * vLL0Temp.s[0];
635 
636  vec vLDiff = vLA - vLSum0;
637 
638  vec vLDiffTemp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLDiff)));
639 
640  recipDiagValue = getRecipSqrt(vLDiffTemp.s[0]);
641 
642  __vpred vpStoreL = c7x::strm_agen<1, vec>::get_vpred();
643  vec *outVecPtrL = c7x::strm_agen<1, vec>::get_adv(ptrL);
644  __vstore_pred(vpStoreL, outVecPtrL, vLDiff * recipDiagValue);
645 
646  offset += yStride;
647  rowNumber++;
648  }
649  else {
 // Row 0: nothing to subtract, the row of A is only scaled.
650  __SA1_OPEN(saParamLStore);
651  __SA2_OPEN(saParamALoad);
652 
653  dataType *ptrA = pInALocal + offset;
654  vec vLA = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
655  dataType *ptrL = pOutULocal + offset;
656 
657  vec vLDiff = vLA; // - vLSum0;
658  vec vLDiffTemp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLDiff)));
659  recipDiagValue = getRecipSqrt(vLDiffTemp.s[0]);
660 
661  __vpred vpStoreL = c7x::strm_agen<1, vec>::get_vpred();
662  vec *outVecPtrL = c7x::strm_agen<1, vec>::get_adv(ptrL);
663  __vstore_pred(vpStoreL, outVecPtrL, vLDiff * recipDiagValue);
664 
665  offset += yStride;
666  rowNumber++;
667  }
668  }
669  else {
 // Split the previous rows between the two streams: the up stream gets the
 // larger half when rowNumber is odd.
670  int32_t upFetchesPerRow = (int32_t) ((uint32_t) (rowNumber + 1) >> 1u);
671  int32_t downFetchesPerRow = (int32_t) ((uint32_t) rowNumber >> 1u);
672 
673  // configuration for SE
674  seParamFetchL.ICNT1 = upFetchesPerRow;
675  seParamFetchR.ICNT1 = downFetchesPerRow;
676 
677  __SE0_OPEN(pLFirstRow, seParamFetchL);
678  __SE1_OPEN(pLFirstRow + yStride, seParamFetchR);
679 
680  __SA1_OPEN(saParamLStore);
681  __SA2_OPEN(saParamALoad);
682 
683  dataType *ptrL = pOutULocal + offset;
684  dataType *ptrA = pInALocal + offset;
685 
686  vec vLSum0 = vec(0); // Holds sum of all LxLy
687  vec vLSum1 = vec(0);
688  vec vLSum2 = vec(0);
689  vec vLSum3 = vec(0);
690 
691  vec vRSum0 = vec(0); // Holds sum of all LxLy
692  vec vRSum1 = vec(0);
693  vec vRSum2 = vec(0);
694  vec vRSum3 = vec(0);
695 
696  vec vLA = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
697 
 // Each stream carries different rows here, so each vector supplies its own
 // multiplier (lane 0 of its permuted copy).
698  for (lRow = 0; lRow < upFetchesPerRow; lRow += UNROLL_COUNT) {
699  vec vLL0 = c7x::strm_eng<0, vec>::get_adv(); // L value vector from each row
700  vec vLL0Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLL0)));
701  vLSum0 += vLL0 * vLL0Temp.s[0];
702 
703  vec vLR0 = c7x::strm_eng<1, vec>::get_adv(); // L value vector from each row
704  vec vLR0Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLR0)));
705  vRSum0 += vLR0 * vLR0Temp.s[0];
706 
707  vec vLL1 = c7x::strm_eng<0, vec>::get_adv();
708  vec vLL1Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLL1)));
709  vLSum1 += vLL1 * vLL1Temp.s[0];
710  vec vLR1 = c7x::strm_eng<1, vec>::get_adv(); // L value vector from each row
711  vec vLR1Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLR1)));
712  vRSum1 += vLR1 * vLR1Temp.s[0];
713 
714  vec vLL2 = c7x::strm_eng<0, vec>::get_adv();
715  vec vLL2Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLL2)));
716  vLSum2 += vLL2 * vLL2Temp.s[0];
717  vec vLR2 = c7x::strm_eng<1, vec>::get_adv(); // L value vector from each row
718  vec vLR2Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLR2)));
719  vRSum2 += vLR2 * vLR2Temp.s[0];
720 
721  vec vLL3 = c7x::strm_eng<0, vec>::get_adv();
722  vec vLL3Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLL3)));
723  vLSum3 += vLL3 * vLL3Temp.s[0];
724  vec vLR3 = c7x::strm_eng<1, vec>::get_adv(); // L value vector from each row
725  vec vLR3Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLR3)));
726  vRSum3 += vLR3 * vLR3Temp.s[0];
727  }
728 
729  vLSum0 += vLSum1;
730  vLSum2 += vLSum3;
731 
732  vRSum0 += vRSum1;
733  vRSum2 += vRSum3;
734 
735  vLSum2 += vLSum0;
736  vRSum2 += vRSum0;
737 
 // Both streams contribute to the same output vector in this phase.
738  vec vLDiff = vLA - vLSum2 - vRSum2;
739 
740  vec vLDiffTemp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLDiff)));
741 
742  recipDiagValue = getRecipSqrt(vLDiffTemp.s[0]);
743 
744  __vpred vpStoreL = c7x::strm_agen<1, vec>::get_vpred();
745  vec *outVecPtrL = c7x::strm_agen<1, vec>::get_adv(ptrL);
746  __vstore_pred(vpStoreL, outVecPtrL, vLDiff * recipDiagValue);
747 
748  offset += yStride;
749  rowNumber++;
750  }
751  vMask += vMaskIncrement; // uchar(4);
752  }
753 
754  pLFirstRow += vecLen;
755  offset += vecLen;
756  elemsPerRow -= vecLen;
757  }
758 
 // Release both streaming engines and both address generators.
759  __SE0_CLOSE();
760  __SE1_CLOSE();
761  __SA1_CLOSE();
762  __SA2_CLOSE();
763 
764  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Exiting function");
765  return DSPLIB_SUCCESS;
766 }
767 
// Execution entry point of the kernel.
//
// Factorizes the matrix at pA in place (the same pointer is handed to the kernel as input
// and output) and uses pMul as the kernel's multiplier scratch buffer.
// When pKerPrivArgs->enableTest is non-zero, DSPLIB_cholesky_inplace_isPosDefinite() is
// evaluated first; a result <= 0 returns DSPLIB_ERR_FAILURE without running the kernel.
// Otherwise the status of the kernel call is returned.
//
// NOTE(review): the declaration of `pKerPrivArgs` is not visible in this listing (one line
// is missing after the `status` declaration).
// NOTE(review): the kernel declares its input and output pointers `restrict`, while the
// same pointer is passed for both here - confirm this is intended.
768 template <typename dataType>
769 DSPLIB_STATUS DSPLIB_cholesky_inplace_exec_ci(DSPLIB_kernelHandle handle, void *restrict pA, void *restrict pMul)
770 {
771  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
772 
773  DSPLIB_STATUS status = DSPLIB_SUCCESS;
774 
776  dataType *pLocalA = (dataType *) pA;
777  dataType *pLocalMul = (dataType *) pMul;
778  uint8_t *pBlock = pKerPrivArgs->bufPblock;
779  int32_t order = pKerPrivArgs->order;
780  int32_t enable_test = pKerPrivArgs->enableTest;
781  typedef typename c7x::make_full_vector<dataType>::type vec;
782  int32_t eleCount = c7x::element_count_of<vec>::value;
783 
784  DSPLIB_DEBUGPRINTFN(0, "pLocalA: %p\n", pLocalA);
785  if (enable_test) {
 // Optional pre-check: refuse to factorize when the check result is not positive.
786  dataType sum = DSPLIB_cholesky_inplace_isPosDefinite(pLocalA, order, eleCount, pBlock);
787  if (sum <= 0) {
788  status = DSPLIB_ERR_FAILURE;
789  }
790  else {
791  status = DSPLIB_cholesky_inplace_c7x_PingPong(enable_test, pKerPrivArgs, pLocalA, pLocalA, pLocalMul);
792  }
793  }
794  else {
 // No pre-check requested: run the factorization directly.
795  status = DSPLIB_cholesky_inplace_c7x_PingPong(enable_test, pKerPrivArgs, pLocalA, pLocalA, pLocalMul);
796  }
797 
798  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return status: %d\n", status);
799  return status;
800 }
801 
803 DSPLIB_cholesky_inplace_exec_ci<float>(DSPLIB_kernelHandle handle, void *restrict pA, void *restrict pMul);
804 
806 DSPLIB_cholesky_inplace_exec_ci<double>(DSPLIB_kernelHandle handle, void *restrict pA, void *restrict pMul);
807 
808 /* ======================================================================== */
809 /* End of file: DSPLIB_cholesky_inplace_ci.cpp */
810 /* ======================================================================== */
dataType DSPLIB_cholesky_inplace_isPosDefinite(dataType *A, const int32_t order, const int32_t eleCount, uint8_t *pBlock)
#define SA_SA0_PARAM_OFFSET
#define SA_SA1_PARAM_OFFSET
#define SE_SE2_PARAM_OFFSET
#define SE_SE3_PARAM_OFFSET
c7x::uchar_vec DSPLIB_cholesky_inplace_getMaskIncrement< float >()
c7x::uchar_vec DSPLIB_cholesky_inplace_getMaskIncrement< double >()
DSPLIB_STATUS DSPLIB_cholesky_inplace_init_ci(DSPLIB_kernelHandle handle, DSPLIB_bufParams2D_t *bufParamsA, DSPLIB_bufParams1D_t *bufParamsMul, const DSPLIB_cholesky_inplace_InitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
template DSPLIB_STATUS DSPLIB_cholesky_inplace_exec_ci< float >(DSPLIB_kernelHandle handle, void *restrict pA, void *restrict pMul)
#define UNROLL_COUNT
DSPLIB_STATUS DSPLIB_cholesky_inplace_c7x_PingPong_init(DSPLIB_kernelHandle handle)
template DSPLIB_STATUS DSPLIB_cholesky_inplace_c7x_PingPong_init< float >(DSPLIB_kernelHandle handle)
c7x::uchar_vec DSPLIB_cholesky_inplace_getMaskIncrement()
template DSPLIB_STATUS DSPLIB_cholesky_inplace_init_ci< double >(DSPLIB_kernelHandle handle, DSPLIB_bufParams2D_t *bufParamsA, DSPLIB_bufParams1D_t *bufParamsMul, const DSPLIB_cholesky_inplace_InitArgs *pKerInitArgs)
DSPLIB_STATUS DSPLIB_cholesky_inplace_exec_ci(DSPLIB_kernelHandle handle, void *restrict pA, void *restrict pMul)
This function is the main execution function for the C7x implementation of the kernel....
DSPLIB_STATUS DSPLIB_cholesky_inplace_c7x_PingPong(int enable_test, DSPLIB_cholesky_inplace_PrivArgs *pKerPrivArgs, dataType *restrict pInALocal, dataType *restrict pOutULocal, dataType *restrict pMulBuffer)
dataType getRecipSqrt(dataType a)
template DSPLIB_STATUS DSPLIB_cholesky_inplace_init_ci< float >(DSPLIB_kernelHandle handle, DSPLIB_bufParams2D_t *bufParamsA, DSPLIB_bufParams1D_t *bufParamsMul, const DSPLIB_cholesky_inplace_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_cholesky_inplace_exec_ci< double >(DSPLIB_kernelHandle handle, void *restrict pA, void *restrict pMul)
template DSPLIB_STATUS DSPLIB_cholesky_inplace_c7x_PingPong_init< double >(DSPLIB_kernelHandle handle)
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_cholesky_inplace...
#define DSPLIB_DEBUGPRINTFN(N, fmt,...)
Definition: DSPLIB_types.h:83
DSPLIB_STATUS_NAME
The enumeration of all status codes.
Definition: DSPLIB_types.h:151
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
Definition: DSPLIB_types.h:172
@ DSPLIB_SUCCESS
Definition: DSPLIB_types.h:152
@ DSPLIB_ERR_FAILURE
Definition: DSPLIB_types.h:153
A structure for a 1 dimensional buffer descriptor.
A structure for a 2 dimensional buffer descriptor.
Structure containing the parameters to initialize the kernel.
Structure that is reserved for internal use by the kernel.
uint8_t bufPblock[DSPLIB_CHOLESKY_INPLACE_IXX_IXX_OXX_PBLOCK_SIZE]
int32_t order
Order of input buffer for different batches DSPLIB_cholesky_inplace_init that will be retrieved and u...