DSPLIB User Guide
DSPLIB_cholesky_ci.cpp
Go to the documentation of this file.
1 /******************************************************************************/
5 /* Copyright (C) 2017 Texas Instruments Incorporated - https://www.ti.com/
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  *
14  * Redistributions in binary form must reproduce the above copyright
15  * notice, this list of conditions and the following disclaimer in the
16  * documentation and/or other materials provided with the
17  * distribution.
18  *
19  * Neither the name of Texas Instruments Incorporated nor the names of
20  * its contributors may be used to endorse or promote products derived
21  * from this software without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34  *
35  ******************************************************************************/
36 
37 /******************************************************************************
38  * Version 1.0 Date 10/2/22 Author: Asheesh Bhardwaj
39  *****************************************************************************/
40 
41 /*******************************************************************************
42  *
43  * INCLUDES
44  *
45  ******************************************************************************/
46 
47 #include "DSPLIB_cholesky_priv.h"
48 
49 /*******************************************************************************
50  *
51  * DEFINES
52  *
53  ******************************************************************************/
54 
55 #define UNROLL_COUNT 4
56 #define MIN_HORIZONTAL_COLUMNS_FOR_UNROLL 2
57 #define NUM_VECS_IN_TILE 6
58 
59 /*******************************************************************************
60  *
61  * INITIALIZATION
62  *
63  ******************************************************************************/
64 
// Builds the streaming-engine (SE) and streaming-address-generator (SA) parameter
// templates used by the ping-pong Cholesky kernel and caches them in
// pKerPrivArgs->bufPblock so the execute path can reload them.
// NOTE(review): the signature line is absent from this listing; the symbol index at the
// end of the page gives it as
//   DSPLIB_STATUS DSPLIB_cholesky_c7x_PingPong_init(DSPLIB_kernelHandle handle)
// and lists <float> and <double> instantiations, so it is a template over dataType.
66 {
67  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
68 
    // NOTE(review): `status` is returned at the end of this function, but its declaration
    // is not visible in this listing (the listing's own numbering skips a line here).
70 
71  __SE_TEMPLATE_v1 seParamFetchL; // Left fetch
72  __SE_TEMPLATE_v1 seParamFetchR; // Right fetch
73  __SA_TEMPLATE_v1 saParamMulStore; // Stores the multiplier values
74  __SA_TEMPLATE_v1 saParamLStore; // Stores the L values
75 
76  __SE_ELETYPE SE_ELETYPE;
77  __SE_VECLEN SE_VECLEN;
78  __SA_VECLEN SA_VECLEN;
79 
80  DSPLIB_cholesky_PrivArgs *pKerPrivArgs = (DSPLIB_cholesky_PrivArgs *) handle;
81 
    // Full-width vector of dataType; element type and vector lengths for SE/SA are derived from it.
82  typedef typename c7x::make_full_vector<dataType>::type vec;
83 
84  SE_VECLEN = c7x::se_veclen<vec>::value;
85  SE_ELETYPE = c7x::se_eletype<vec>::value;
86  SA_VECLEN = c7x::sa_veclen<vec>::value;
87 
88  uint32_t eleCount = c7x::element_count_of<vec>::value;
89 
    // shiftForVecLenDiv = floor(log2(eleCount)): start at -1 and count how many right
    // shifts it takes to reach zero. The execute path uses it to divide by the vector
    // length with a shift.
90  pKerPrivArgs->shiftForVecLenDiv = -1;
91  uint32_t vecLenValue = eleCount;
92  while (vecLenValue != 0) {
93  vecLenValue >>= 1;
94  pKerPrivArgs->shiftForVecLenDiv++;
95  }
96 
    // Row pitch expressed in elements (stride is presumably in bytes -- TODO confirm).
97  int32_t yStride = pKerPrivArgs->stride / sizeof(dataType);
98  /**********************************************************************/
99  /* Prepare streaming engine for fetching L values(Left) Merge approach*/
100  /**********************************************************************/
101  seParamFetchL = __gen_SE_TEMPLATE_v1();
102 
     // ICNT1 and ICNT2 are placeholders here; the execute function overwrites them
     // for every row before opening the stream.
103  seParamFetchL.ICNT0 = eleCount;
104  seParamFetchL.ICNT1 = 0; // No of rows to process
105  seParamFetchL.DIM1 = yStride; // order
106  seParamFetchL.ICNT2 = 0; // No of left fetches
     // Third dimension steps by two vectors (2 * eleCount elements).
107  seParamFetchL.DIM2 = eleCount << 1;
108 
109  seParamFetchL.ELETYPE = SE_ELETYPE;
110  seParamFetchL.VECLEN = SE_VECLEN;
111  seParamFetchL.DIMFMT = __SE_DIMFMT_3D;
112 
113  /**********************************************************************/
114  /* Prepare streaming engine for fetching L values(Right) Merge approach*/
115  /**********************************************************************/
116  seParamFetchR = __gen_SE_TEMPLATE_v1();
117 
     // Same shape as the left template; the execute function opens it one vector
     // further along the row than the left stream.
118  seParamFetchR.ICNT0 = eleCount;
119  seParamFetchR.ICNT1 = 0; // No of rows to process
120  seParamFetchR.DIM1 = yStride;
121  seParamFetchR.ICNT2 = 0; // No of right fetches
122  seParamFetchR.DIM2 = eleCount << 1;
123 
124  seParamFetchR.ELETYPE = SE_ELETYPE;
125  seParamFetchR.VECLEN = SE_VECLEN;
126  seParamFetchR.DIMFMT = __SE_DIMFMT_3D;
127 
128  /**********************************************************************/
129  /* Prepare Address generator to store the multipliers */
130  /**********************************************************************/
131  saParamMulStore = __gen_SA_TEMPLATE_v1();
132 
     // One element per access, `order` accesses in total.
133  saParamMulStore.ICNT0 = pKerPrivArgs->order;
134  saParamMulStore.VECLEN = __SA_VECLEN_1ELEM;
135  saParamMulStore.DIMFMT = __SA_DIMFMT_1D;
136 
137  /**********************************************************************/
138  /* Prepare Address generator to store the L values */
139  /**********************************************************************/
140  saParamLStore = __gen_SA_TEMPLATE_v1();
141 
     // Full-vector accesses over `order` elements; the execute function shrinks ICNT0
     // as it moves from one column strip to the next.
142  saParamLStore.ICNT0 = pKerPrivArgs->order;
143  saParamLStore.VECLEN = SA_VECLEN;
144  saParamLStore.DIMFMT = __SA_DIMFMT_1D;
145 
     // Persist the four templates at their fixed offsets inside the parameter block.
146  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pKerPrivArgs->bufPblock + SE_SE2_PARAM_OFFSET) = seParamFetchL;
147  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pKerPrivArgs->bufPblock + SE_SE3_PARAM_OFFSET) = seParamFetchR;
148  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pKerPrivArgs->bufPblock + SA_SA0_PARAM_OFFSET) = saParamMulStore;
149  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pKerPrivArgs->bufPblock + SA_SA1_PARAM_OFFSET) = saParamLStore;
150 
151  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Exiting function");
152  return status;
153 }
154 
157 
158 // Initialize SE params
// Kernel initialisation entry point for the C7x implementation: prepares the
// SE/SA templates for the ping-pong kernel and for the positive-definiteness check.
// bufParamsIn, bufParamsOut and pKerInitArgs are not referenced in the body.
// Always returns DSPLIB_SUCCESS.
159 template <typename dataType>
// NOTE(review): the line carrying the return type, function name and first parameter is
// absent from this listing; the symbol index at the end of the page gives the signature as
//   DSPLIB_STATUS DSPLIB_cholesky_init_ci(DSPLIB_kernelHandle handle, ...)
161  const DSPLIB_bufParams2D_t *bufParamsIn,
162  const DSPLIB_bufParams2D_t *bufParamsOut,
163  const DSPLIB_cholesky_InitArgs *pKerInitArgs)
164 {
165  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
166 
167  DSPLIB_STATUS status = DSPLIB_SUCCESS;
168  DSPLIB_cholesky_PrivArgs *pKerPrivArgs = (DSPLIB_cholesky_PrivArgs *) handle;
169  uint8_t *pBlock = pKerPrivArgs->bufPblock;
170  int32_t order = pKerPrivArgs->order;
171  int32_t strideA = pKerPrivArgs->stride;
     // Row pitch in elements (stride presumably in bytes -- TODO confirm).
172  int32_t colAStride = strideA / sizeof(dataType);
173 
     // NOTE(review): the DSPLIB_STATUS returned by DSPLIB_cholesky_c7x_PingPong_init is
     // discarded, so `status` stays DSPLIB_SUCCESS regardless of that call's result.
174  DSPLIB_cholesky_c7x_PingPong_init<dataType>(handle);
175  DSPLIB_cholesky_inplace_isPosDefinite_init<dataType>(order, colAStride, pBlock);
176 
177  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Exiting function");
178 
179  return status;
180 }
181 
// Explicit instantiations of DSPLIB_cholesky_init_ci. The opening line of each
// declaration is absent from this listing; the symbol index at the end of the page
// lists a <float> and a <double> instantiation (which one comes first here is not
// visible -- TODO confirm against the original file).
183  const DSPLIB_bufParams2D_t *bufParamsIn,
184  const DSPLIB_bufParams2D_t *bufParamsOut,
185  const DSPLIB_cholesky_InitArgs *pKerInitArgs);
186 
// Second explicit instantiation of DSPLIB_cholesky_init_ci (opening line likewise absent).
188  const DSPLIB_bufParams2D_t *bufParamsIn,
189  const DSPLIB_bufParams2D_t *bufParamsOut,
190  const DSPLIB_cholesky_InitArgs *pKerInitArgs);
191 
192 /*******************************************************************************
193  *
194  * IMPLEMENTATION
195  *
196  ******************************************************************************/
197 
/**
 * @brief Reciprocal square root of @p a.
 *
 * Takes the value produced by the __recip_sqrt intrinsic and applies two
 * refinement steps of the form  y <- y * (1.5 - 0.5 * a * y * y)  (the
 * Newton-Raphson update for 1/sqrt(a)).
 *
 * @param a  Value whose reciprocal square root is wanted.
 * @return   Refined approximation of 1/sqrt(a).
 */
template <typename dataType> inline dataType getRecipSqrt(dataType a)
{
   const dataType kHalf        = 0.5f;
   const dataType kThreeHalves = 1.5f;

   // Starting value from the intrinsic.
   dataType y = __recip_sqrt(a);

   // First refinement step. The product is formed left to right (a*y, then *y,
   // then *0.5) so rounding matches the single-expression form.
   dataType ayy = a * y * y;
   y *= (kThreeHalves - (ayy * kHalf));

   // Second refinement step, explicitly repeated rather than looped.
   ayy = a * y * y;
   y *= (kThreeHalves - (ayy * kHalf));

   return y;
}
219 
220 template <typename dataType> inline c7x::uchar_vec DSPLIB_cholesky_getMaskIncrement();
221 template <> inline c7x::uchar_vec DSPLIB_cholesky_getMaskIncrement<float>() { return (c7x::uchar_vec) 4; };
222 template <> inline c7x::uchar_vec DSPLIB_cholesky_getMaskIncrement<double>() { return (c7x::uchar_vec) 8; };
223 
// Cholesky factorisation kernel. Reads the matrix at pInALocal and writes the factor to
// pOutULocal one output row at a time, working through strips ("blocks") of vecLen
// columns. For each row it subtracts the accumulated products of already-computed output
// rows (streamed through SE0/SE1) from the input row (loaded through SA2), scales by the
// reciprocal square root of the diagonal entry, and stores the result through SA1.
// pMulBuffer is scratch: the per-row multipliers extracted while processing the first
// vector pair of a row are saved there and re-read for the remaining vector pairs.
// Always returns DSPLIB_SUCCESS.
// NOTE(review): the line carrying the return type, name and first parameter is absent from
// this listing; the symbol index at the end of the page gives the full signature as
//   DSPLIB_STATUS DSPLIB_cholesky_c7x_PingPong(int enable_test, DSPLIB_cholesky_PrivArgs *pKerPrivArgs,
//       dataType *restrict pInALocal, dataType *restrict pOutULocal, dataType *restrict pMulBuffer)
// enable_test is not referenced anywhere in the visible body.
224 template <typename dataType>
226  DSPLIB_cholesky_PrivArgs *pKerPrivArgs,
227  dataType *restrict pInALocal,
228  dataType *restrict pOutULocal,
229  dataType *restrict pMulBuffer)
230 {
231  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
232 
233  typedef typename c7x::make_full_vector<dataType>::type vec;
234  int32_t eleCount = c7x::element_count_of<vec>::value;
235 
     // Reload the templates that the init function cached in the parameter block.
236  __SE_TEMPLATE_v1 seParamFetchL = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pKerPrivArgs->bufPblock + SE_SE2_PARAM_OFFSET);
237  __SE_TEMPLATE_v1 seParamFetchR = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pKerPrivArgs->bufPblock + SE_SE3_PARAM_OFFSET);
238 
239  __SA_TEMPLATE_v1 saParamLStore = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pKerPrivArgs->bufPblock + SA_SA1_PARAM_OFFSET);
     // The input-row load uses the same address pattern as the output-row store.
240  __SA_TEMPLATE_v1 saParamALoad = saParamLStore;
241 
242  int32_t order = pKerPrivArgs->order;
243  int32_t vecLen = eleCount;
244 
245  int32_t row, fetch, lRow;
246  int32_t shiftForVecLenDiv = pKerPrivArgs->shiftForVecLenDiv;
247  int32_t stride = pKerPrivArgs->stride;
     // Row pitch in elements (stride presumably in bytes -- TODO confirm).
248  int32_t yStride = stride / sizeof(dataType);
249 
     // Start of the first output row within the current column strip; advanced by
     // vecLen after every strip.
250  dataType *pLFirstRow = pOutULocal;
251  dataType recipDiagValue;
252 
     // Byte-permute mask. It starts as bytes 0..7 and is bumped by sizeof(dataType) per
     // row, so lane 0 of a permuted vector holds the element at index `row` of the source.
     // Only lane 0 of each permuted vector is read below, which is presumably why only
     // the first eight mask bytes are initialised -- TODO confirm.
253  c7x::uchar_vec vMask, vMaskInit;
254  vMaskInit.s[0] = 0;
255  vMaskInit.s[1] = 1;
256  vMaskInit.s[2] = 2;
257  vMaskInit.s[3] = 3;
258  vMaskInit.s[4] = 4;
259  vMaskInit.s[5] = 5;
260  vMaskInit.s[6] = 6;
261  vMaskInit.s[7] = 7;
262 
263  c7x::uchar_vec vMaskIncrement = DSPLIB_cholesky_getMaskIncrement<dataType>();
264 
     // Number of vecLen-wide column strips: (order + vecLen - 1) >> log2(vecLen).
265  int32_t blockMax = int32_t((uint32_t) (order + vecLen - 1) >> (uint32_t) shiftForVecLenDiv);
266  int32_t extraRows = vecLen - (int32_t) ((uint32_t) order & (uint32_t) (vecLen - 1)); // gives extra rows needed
267  // to make the height of matrix
268  // integral multiple of vecLen
269  if (extraRows == vecLen) {
270  extraRows = 0;
271  }
272 
     // elemsPerRow: columns remaining from the start of the current strip to the end of the row.
     // offset: element offset of (current row, first column of current strip) from the matrix base.
     // rowNumber: index of the output row being produced (== number of rows already computed).
273  int32_t elemsPerRow = order;
274  int32_t offset = 0;
275  int32_t rowNumber = 0;
276  int32_t block;
277  int32_t elemsPerRowCeil = elemsPerRow + vecLen - 1;
278 
     // Per-(rowNumber % UNROLL_COUNT) settings for the SE LEZR/LEZR_CNT fields. The table
     // built below is {OFF,0}, {ICNT1,3}, {ICNT1,2}, {ICNT1,1}, i.e. the count is the
     // number of vectors missing to reach a multiple of UNROLL_COUNT.
     // Presumably this makes the stream supply zero vectors so the 4-way unrolled
     // accumulation loops can always consume four vectors per iteration -- TODO confirm
     // against the SE documentation.
279  int32_t lezrCount[UNROLL_COUNT];
280  __SE_LEZR lezrDim[UNROLL_COUNT];
281 
282  int32_t *lezrCountPtr = lezrCount;
283  __SE_LEZR *lezrDimPtr = lezrDim;
284 
285  *lezrCountPtr = 0;
286  *lezrDimPtr = __SE_LEZR_OFF;
287  lezrCountPtr++;
288  lezrDimPtr++;
289 
290  for (int32_t i = UNROLL_COUNT - 1; i > 0; i--) {
291  *lezrCountPtr = i;
292  *lezrDimPtr = __SE_LEZR_ICNT1;
293  lezrCountPtr++;
294  lezrDimPtr++;
295  }
296 
     // ---- Phase 1: every strip except the last two ----------------------------------
     // Each row still spans three or more vectors, so the row is handled as a sequence
     // of (left, right) vector pairs: the first pair extracts the multipliers, and the
     // "pong" loop reuses them from pMulBuffer for the remaining pairs.
297  for (block = 0; block < blockMax - 2; block++) {
298  // configuration for sa
299  saParamLStore.ICNT0 = elemsPerRow;
300  saParamALoad.ICNT0 = elemsPerRow;
301 
302  vMask = vMaskInit;
303  for (row = 0; row < vecLen; row++) {
304  int32_t fetchesPerRow =
305  (int32_t) ((uint32_t) elemsPerRowCeil >> (uint32_t) shiftForVecLenDiv); // number of vector fetches per row
306  int32_t leftFetchesPerRow = (int32_t) ((uint32_t) (fetchesPerRow + 1) >> 1u); // number of fetches by SE0
307  int32_t rightFetchesPerRow = fetchesPerRow - leftFetchesPerRow; // number of fetches by SE1
308  int32_t lezrIndex =
309  (int32_t) ((uint32_t) rowNumber & (uint32_t) (UNROLL_COUNT - 1)); // rowNumber%4=>4 is unroll count
310 
311  // configuration for SE
     // Both streams walk the rowNumber already-computed rows (ICNT1) for each of their
     // vector columns (ICNT2).
312  seParamFetchL.ICNT1 = rowNumber;
313  seParamFetchL.ICNT2 = leftFetchesPerRow;
314  seParamFetchL.LEZR = lezrDim[lezrIndex];
315  seParamFetchL.LEZR_CNT = lezrCount[lezrIndex];
316 
317  seParamFetchR.ICNT1 = rowNumber;
318  seParamFetchR.ICNT2 = rightFetchesPerRow;
319  seParamFetchR.LEZR = lezrDim[lezrIndex];
320  seParamFetchR.LEZR_CNT = lezrCount[lezrIndex];
321 
     // Row 0 has no previous rows, so the streams are not opened (the accumulation
     // loops below run zero iterations in that case).
322  if (rowNumber > 0) {
323  __SE0_OPEN(pLFirstRow, seParamFetchL);
324  __SE1_OPEN(pLFirstRow + vecLen, seParamFetchR);
325  }
326 
327  __SA1_OPEN(saParamLStore);
328  __SA2_OPEN(saParamALoad);
329 
330  dataType *ptrL = pOutULocal + offset;
331  dataType *ptrA = pInALocal + offset;
332 
     // Four independent accumulators per side, one per unrolled step; they are summed
     // after the loop.
333  vec vLSum0 = vec(0); // Holds sum of all LxLy
334  vec vLSum1 = vec(0);
335  vec vLSum2 = vec(0);
336  vec vLSum3 = vec(0);
337 
     // First (left) vector of the input row.
338  vec vLA = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
339 
340  vec vRSum0 = vec(0); // Holds sum of all LxLy
341  vec vRSum1 = vec(0);
342  vec vRSum2 = vec(0);
343  vec vRSum3 = vec(0);
344 
     // Second (right) vector of the input row.
345  vec vRA = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
346 
     // First vector pair: for each previous row, pull the multiplier (the element at
     // index `row` of the left vector, moved to lane 0 by the permute), accumulate
     // vector * multiplier on both sides, and save the multiplier for the pong loop.
347  dataType *pMulStore = pMulBuffer;
348  for (lRow = 0; lRow < rowNumber; lRow += UNROLL_COUNT) {
349  vec vLL0 = c7x::strm_eng<0, vec>::get_adv(); // L value vector from each row
350  vec vLL0Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLL0)));
351  vLSum0 += vLL0 * vLL0Temp.s[0];
352  vec vLR0 = c7x::strm_eng<1, vec>::get_adv(); // L value vector from each row
353  vRSum0 += vLR0 * vLL0Temp.s[0];
354  *pMulStore = vLL0Temp.s[0];
355  pMulStore++;
356 
357  vec vLL1 = c7x::strm_eng<0, vec>::get_adv();
358  vec vLL1Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLL1)));
359  vLSum1 += vLL1 * vLL1Temp.s[0];
360  vec vLR1 = c7x::strm_eng<1, vec>::get_adv(); // L value vector from each row
361  vRSum1 += vLR1 * vLL1Temp.s[0];
362  *pMulStore = vLL1Temp.s[0];
363  pMulStore++;
364 
365  vec vLL2 = c7x::strm_eng<0, vec>::get_adv();
366  vec vLL2Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLL2)));
367  vLSum2 += vLL2 * vLL2Temp.s[0];
368  vec vLR2 = c7x::strm_eng<1, vec>::get_adv(); // L value vector from each row
369  vRSum2 += vLR2 * vLL2Temp.s[0];
370  *pMulStore = vLL2Temp.s[0];
371  pMulStore++;
372 
373  vec vLL3 = c7x::strm_eng<0, vec>::get_adv();
374  vec vLL3Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLL3)));
375  vLSum3 += vLL3 * vLL3Temp.s[0];
376  vec vLR3 = c7x::strm_eng<1, vec>::get_adv(); // L value vector from each row
377  vRSum3 += vLR3 * vLL3Temp.s[0];
378  *pMulStore = vLL3Temp.s[0];
379  pMulStore++;
380  }
     // Combine the partial sums and subtract them from the input row.
381  vLSum0 += vLSum1;
382  vLSum2 += vLSum3;
383 
384  vRSum0 += vRSum1;
385  vRSum2 += vRSum3;
386 
387  vLA -= vLSum2;
388  vec vLDiff = vLA - vLSum0;
389 
390  vRA -= vRSum2;
391  vec vRDiff = vRA - vRSum0;
392 
     // The element at index `row` of the left difference is the diagonal entry; its
     // reciprocal square root scales the whole row.
393  vec vLDiffTemp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLDiff)));
394 
395  recipDiagValue = getRecipSqrt(vLDiffTemp.s[0]);
396 
     // Predicated stores through SA1: the predicate is read before the address generator
     // is advanced by get_adv.
397  __vpred vpStoreL = c7x::strm_agen<1, vec>::get_vpred();
398  vec *outVecPtrL = c7x::strm_agen<1, vec>::get_adv(ptrL);
399  __vstore_pred(vpStoreL, outVecPtrL, vLDiff * recipDiagValue);
400 
401  __vpred vpStoreR = c7x::strm_agen<1, vec>::get_vpred();
402  vec *outVecPtrR = c7x::strm_agen<1, vec>::get_adv(ptrL);
403  __vstore_pred(vpStoreR, outVecPtrR, vRDiff * recipDiagValue);
404 
405  /* Handling all the pong fetches */
     // Remaining vector pairs of this row: same accumulation, but the multipliers are
     // broadcast from pMulBuffer instead of being extracted again.
406  for (fetch = 0; fetch < leftFetchesPerRow - 1; fetch++) {
407  vLSum0 = vec(0);
408  vLSum1 = vec(0);
409  vLSum2 = vec(0);
410  vLSum3 = vec(0);
411 
412  vLA = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
413 
414  vRSum0 = vec(0);
415  vRSum1 = vec(0);
416  vRSum2 = vec(0);
417  vRSum3 = vec(0);
418 
419  vRA = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
420 
421  pMulStore = pMulBuffer;
422  for (lRow = 0; lRow < rowNumber; lRow += UNROLL_COUNT) {
423  vec multiplier0 = __vload_dup(pMulStore);
424  pMulStore++;
425  vec vLL0 = c7x::strm_eng<0, vec>::get_adv();
426  vec vLR0 = c7x::strm_eng<1, vec>::get_adv();
427  vLSum0 += vLL0 * multiplier0;
428  vRSum0 += vLR0 * multiplier0;
429 
430  vec multiplier1 = __vload_dup(pMulStore);
431  pMulStore++;
432  vec vLL1 = c7x::strm_eng<0, vec>::get_adv();
433  vec vLR1 = c7x::strm_eng<1, vec>::get_adv();
434  vLSum1 += vLL1 * multiplier1;
435  vRSum1 += vLR1 * multiplier1;
436 
437  vec multiplier2 = __vload_dup(pMulStore);
438  pMulStore++;
439  vec vLL2 = c7x::strm_eng<0, vec>::get_adv();
440  vec vLR2 = c7x::strm_eng<1, vec>::get_adv();
441  vLSum2 += vLL2 * multiplier2;
442  vRSum2 += vLR2 * multiplier2;
443 
444  vec multiplier3 = __vload_dup(pMulStore);
445  pMulStore++;
446  vec vLL3 = c7x::strm_eng<0, vec>::get_adv();
447  vec vLR3 = c7x::strm_eng<1, vec>::get_adv();
448  vLSum3 += vLL3 * multiplier3;
449  vRSum3 += vLR3 * multiplier3;
450  }
451  vLSum0 += vLSum1;
452  vLSum2 += vLSum3;
453 
454  vRSum0 += vRSum1;
455  vRSum2 += vRSum3;
456 
457  vLA -= vLSum2;
458  vec vLDiff1 = vLA - vLSum0;
459 
460  vRA -= vRSum2;
461  vec vRDiff1 = vRA - vRSum0;
462 
463  __vpred vpStoreL1 = c7x::strm_agen<1, vec>::get_vpred();
464  vec *outVecPtrL1 = c7x::strm_agen<1, vec>::get_adv(ptrL);
465  __vstore_pred(vpStoreL1, outVecPtrL1, vLDiff1 * recipDiagValue);
466 
467  __vpred vpStoreR1 = c7x::strm_agen<1, vec>::get_vpred();
468  vec *outVecPtrR1 = c7x::strm_agen<1, vec>::get_adv(ptrL);
469  __vstore_pred(vpStoreR1, outVecPtrR1, vRDiff1 * recipDiagValue);
470  }
471 
     // Next row: move down one row and select the next element for the multiplier/diagonal.
472  offset += yStride;
473  rowNumber++;
474  vMask += vMaskIncrement; // uchar(4);
475  }
     // Next strip: shift right by one vector and shrink the remaining row length.
476  pLFirstRow += vecLen;
477  offset += vecLen;
478  elemsPerRow -= vecLen;
479  elemsPerRowCeil -= vecLen;
480  }
481 
     // ---- Phase 2: the second-to-last strip ------------------------------------------
     // Exactly one left and one right vector per row (ICNT2 = 1 on both streams), so no
     // multiplier buffer and no pong loop are needed.
482  for (; block < blockMax - 1; block++) {
483  // configuration for sa
484  saParamLStore.ICNT0 = elemsPerRow;
485  saParamALoad.ICNT0 = elemsPerRow;
486 
487  vMask = vMaskInit;
488  for (row = 0; row < vecLen; row++) {
489  int32_t lezrIndex =
490  (int32_t) ((uint32_t) rowNumber & (uint32_t) (UNROLL_COUNT - 1)); // rowNumber%4=>4 is unroll count
491 
492  // configuration for SE
493  seParamFetchL.ICNT1 = rowNumber;
494  seParamFetchL.ICNT2 = 1;
495  seParamFetchL.LEZR = lezrDim[lezrIndex];
496  seParamFetchL.LEZR_CNT = lezrCount[lezrIndex];
497 
498  seParamFetchR.ICNT1 = rowNumber;
499  seParamFetchR.ICNT2 = 1;
500  seParamFetchR.LEZR = lezrDim[lezrIndex];
501  seParamFetchR.LEZR_CNT = lezrCount[lezrIndex];
502 
503  if (rowNumber > 0) {
504  __SE0_OPEN(pLFirstRow, seParamFetchL);
505  __SE1_OPEN(pLFirstRow + vecLen, seParamFetchR);
506  }
507 
508  __SA1_OPEN(saParamLStore);
509  __SA2_OPEN(saParamALoad);
510  dataType *ptrL = pOutULocal + offset;
511  dataType *ptrA = pInALocal + offset;
512 
513  vec vLSum0 = vec(0); // Holds sum of all LxLy
514  vec vLSum1 = vec(0);
515  vec vLSum2 = vec(0);
516  vec vLSum3 = vec(0);
517 
518  vec vLA = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
519 
520  vec vRSum0 = vec(0); // Holds sum of all LxLy
521  vec vRSum1 = vec(0);
522  vec vRSum2 = vec(0);
523  vec vRSum3 = vec(0);
524 
525  vec vRA = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
     // Same accumulation as the first pair in phase 1, minus the multiplier save.
526  for (lRow = 0; lRow < rowNumber; lRow += UNROLL_COUNT) {
527  vec vLL0 = c7x::strm_eng<0, vec>::get_adv(); // L value vector from each row
528  vec vLL0Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLL0)));
529  vLSum0 += vLL0 * vLL0Temp.s[0];
530  vec vLR0 = c7x::strm_eng<1, vec>::get_adv(); // L value vector from each row
531  vRSum0 += vLR0 * vLL0Temp.s[0];
532 
533  vec vLL1 = c7x::strm_eng<0, vec>::get_adv();
534  vec vLL1Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLL1)));
535  vLSum1 += vLL1 * vLL1Temp.s[0];
536  vec vLR1 = c7x::strm_eng<1, vec>::get_adv(); // L value vector from each row
537  vRSum1 += vLR1 * vLL1Temp.s[0];
538 
539  vec vLL2 = c7x::strm_eng<0, vec>::get_adv();
540  vec vLL2Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLL2)));
541  vLSum2 += vLL2 * vLL2Temp.s[0];
542  vec vLR2 = c7x::strm_eng<1, vec>::get_adv(); // L value vector from each row
543  vRSum2 += vLR2 * vLL2Temp.s[0];
544 
545  vec vLL3 = c7x::strm_eng<0, vec>::get_adv();
546  vec vLL3Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLL3)));
547  vLSum3 += vLL3 * vLL3Temp.s[0];
548  vec vLR3 = c7x::strm_eng<1, vec>::get_adv(); // L value vector from each row
549  vRSum3 += vLR3 * vLL3Temp.s[0];
550  }
551  vLSum0 += vLSum1;
552  vLSum2 += vLSum3;
553 
554  vRSum0 += vRSum1;
555  vRSum2 += vRSum3;
556 
557  vLA -= vLSum2;
558  vec vLDiff = vLA - vLSum0;
559 
560  vRA -= vRSum2;
561  vec vRDiff = vRA - vRSum0;
562 
563  vec vLDiffTemp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLDiff)));
564 
565  recipDiagValue = getRecipSqrt(vLDiffTemp.s[0]);
566 
567  __vpred vpStoreL = c7x::strm_agen<1, vec>::get_vpred();
568  vec *outVecPtrL = c7x::strm_agen<1, vec>::get_adv(ptrL);
569  __vstore_pred(vpStoreL, outVecPtrL, vLDiff * recipDiagValue);
570 
571  __vpred vpStoreR = c7x::strm_agen<1, vec>::get_vpred();
572  vec *outVecPtrR = c7x::strm_agen<1, vec>::get_adv(ptrL);
573  __vstore_pred(vpStoreR, outVecPtrR, vRDiff * recipDiagValue);
574 
575  offset += yStride;
576  rowNumber++;
577  vMask += vMaskIncrement; // uchar(4);
578  }
579 
580  pLFirstRow += vecLen;
581  offset += vecLen;
582  elemsPerRow -= vecLen;
583  }
584 
     // ---- Phase 3: the last strip -----------------------------------------------------
     // Only one vector per row remains, so the two streams split the previous rows
     // instead of the columns: SE0 reads rows 0,2,4,... and SE1 reads rows 1,3,5,...
     // (row step of 2*yStride, SE1 starting one row down). Each fetched vector is
     // multiplied by its own element at index `row`.
585  for (; block < blockMax; block++) {
586  // configuration for sa
587  saParamLStore.ICNT0 = elemsPerRow;
588  saParamALoad.ICNT0 = elemsPerRow;
589 
590  /*seParamFetchL => Up fetch seParamFetchR => Down fetch*/
591  seParamFetchL.ICNT0 = vecLen;
592  seParamFetchL.DIM1 = (int32_t) ((uint32_t) yStride << 1u); // order
593  seParamFetchR.ICNT0 = vecLen;
594  seParamFetchR.DIM1 = (int32_t) ((uint32_t) yStride << 1u); // order
595 
596  seParamFetchL.DIMFMT = __SE_DIMFMT_2D;
597  seParamFetchR.DIMFMT = __SE_DIMFMT_2D;
     // NOTE(review): LEZR / LEZR_CNT are not reprogrammed in this phase; they keep
     // whatever phases 1-2 last wrote (or the cached template values if those phases
     // did not run), while the unrolled loop below still consumes four vectors per
     // stream per iteration. Confirm the intended behaviour when ICNT1 is not a
     // multiple of UNROLL_COUNT.
598 
599  vMask = vMaskInit;
600 
     // Only the rows that actually exist in the matrix are processed in the last strip.
601  for (row = 0; row < vecLen - extraRows; row++) {
602 
     // rowNumber < 2 can only occur when this last strip is also the first one
     // (rowNumber counts rows across all strips).
603  if (rowNumber < 2) {
604 
605  // configuration for SE
606  seParamFetchL.ICNT1 = rowNumber;
607 
608  if (rowNumber > 0) {
     // Row 1: a single previous row, read once from SE0 without advancing the stream.
609  __SE0_OPEN(pLFirstRow, seParamFetchL);
610 
611  __SA1_OPEN(saParamLStore);
612  __SA2_OPEN(saParamALoad);
613 
614  dataType *ptrA = pInALocal + offset;
615 
616  vec vLA = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
617 
618  vec vLSum0 = vec(0); // Holds sum of all LxLy
619  dataType *ptrL = pOutULocal + offset;
620 
621  vec vLL0 = c7x::strm_eng<0, vec>::get(); // L value vector from each row
622  vec vLL0Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLL0)));
623  vLSum0 += vLL0 * vLL0Temp.s[0];
624 
625  vec vLDiff = vLA - vLSum0;
626 
627  vec vLDiffTemp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLDiff)));
628 
629  recipDiagValue = getRecipSqrt(vLDiffTemp.s[0]);
630 
631  __vpred vpStoreL = c7x::strm_agen<1, vec>::get_vpred();
632  vec *outVecPtrL = c7x::strm_agen<1, vec>::get_adv(ptrL);
633  __vstore_pred(vpStoreL, outVecPtrL, vLDiff * recipDiagValue);
634 
635  offset += yStride;
636  rowNumber++;
637  }
638  else {
639 
     // Row 0: nothing to subtract; the output row is the input row scaled by the
     // reciprocal square root of its first element.
640  __SA1_OPEN(saParamLStore);
641  __SA2_OPEN(saParamALoad);
642 
643  dataType *ptrA = pInALocal + offset;
644  vec vLA = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
645  dataType *ptrL = pOutULocal + offset;
646 
647  vec vLDiff = vLA; // - vLSum0;
648  vec vLDiffTemp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLDiff)));
649  recipDiagValue = getRecipSqrt(vLDiffTemp.s[0]);
650 
651  __vpred vpStoreL = c7x::strm_agen<1, vec>::get_vpred();
652  vec *outVecPtrL = c7x::strm_agen<1, vec>::get_adv(ptrL);
653  __vstore_pred(vpStoreL, outVecPtrL, vLDiff * recipDiagValue);
654 
655  offset += yStride;
656  rowNumber++;
657  }
658  }
659  else {
     // General case: split the rowNumber previous rows between the two streams,
     // ceil(rowNumber/2) rows on SE0 and floor(rowNumber/2) rows on SE1.
660  int32_t upFetchesPerRow = (int32_t) ((uint32_t) (rowNumber + 1) >> 1u);
661  int32_t downFetchesPerRow = (int32_t) ((uint32_t) rowNumber >> 1u);
662 
663  // configuration for SE
664  seParamFetchL.ICNT1 = upFetchesPerRow;
665  seParamFetchR.ICNT1 = downFetchesPerRow;
666 
667  __SE0_OPEN(pLFirstRow, seParamFetchL);
668  __SE1_OPEN(pLFirstRow + yStride, seParamFetchR);
669 
670  __SA1_OPEN(saParamLStore);
671  __SA2_OPEN(saParamALoad);
672  dataType *ptrL = pOutULocal + offset;
673  dataType *ptrA = pInALocal + offset;
674 
675  vec vLSum0 = vec(0); // Holds sum of all LxLy
676  vec vLSum1 = vec(0);
677  vec vLSum2 = vec(0);
678  vec vLSum3 = vec(0);
679 
680  vec vRSum0 = vec(0); // Holds sum of all LxLy
681  vec vRSum1 = vec(0);
682  vec vRSum2 = vec(0);
683  vec vRSum3 = vec(0);
684  vec vLA = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
685 
     // Each stream's vector is scaled by its own element at index `row`; the
     // loop bound is the SE0 (larger) row count.
686  for (lRow = 0; lRow < upFetchesPerRow; lRow += UNROLL_COUNT) {
687  vec vLL0 = c7x::strm_eng<0, vec>::get_adv(); // L value vector from each row
688  vec vLL0Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLL0)));
689  vLSum0 += vLL0 * vLL0Temp.s[0];
690 
691  vec vLR0 = c7x::strm_eng<1, vec>::get_adv(); // L value vector from each row
692  vec vLR0Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLR0)));
693  vRSum0 += vLR0 * vLR0Temp.s[0];
694 
695  vec vLL1 = c7x::strm_eng<0, vec>::get_adv();
696  vec vLL1Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLL1)));
697  vLSum1 += vLL1 * vLL1Temp.s[0];
698  vec vLR1 = c7x::strm_eng<1, vec>::get_adv(); // L value vector from each row
699  vec vLR1Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLR1)));
700  vRSum1 += vLR1 * vLR1Temp.s[0];
701 
702  vec vLL2 = c7x::strm_eng<0, vec>::get_adv();
703  vec vLL2Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLL2)));
704  vLSum2 += vLL2 * vLL2Temp.s[0];
705  vec vLR2 = c7x::strm_eng<1, vec>::get_adv(); // L value vector from each row
706  vec vLR2Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLR2)));
707  vRSum2 += vLR2 * vLR2Temp.s[0];
708 
709  vec vLL3 = c7x::strm_eng<0, vec>::get_adv();
710  vec vLL3Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLL3)));
711  vLSum3 += vLL3 * vLL3Temp.s[0];
712  vec vLR3 = c7x::strm_eng<1, vec>::get_adv(); // L value vector from each row
713  vec vLR3Temp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLR3)));
714  vRSum3 += vLR3 * vLR3Temp.s[0];
715  }
     // Fold all eight accumulators into the two totals subtracted from the input row.
716  vLSum0 += vLSum1;
717  vLSum2 += vLSum3;
718 
719  vRSum0 += vRSum1;
720  vRSum2 += vRSum3;
721 
722  vLSum2 += vLSum0;
723  vRSum2 += vRSum0;
724 
725  vec vLDiff = vLA - vLSum2 - vRSum2;
726 
727  vec vLDiffTemp = c7x::reinterpret<vec>(__permute(vMask, __as_uchar64(vLDiff)));
728 
729  recipDiagValue = getRecipSqrt(vLDiffTemp.s[0]);
730 
731  __vpred vpStoreL = c7x::strm_agen<1, vec>::get_vpred();
732  vec *outVecPtrL = c7x::strm_agen<1, vec>::get_adv(ptrL);
733  __vstore_pred(vpStoreL, outVecPtrL, vLDiff * recipDiagValue);
734 
735  offset += yStride;
736  rowNumber++;
737  }
738  vMask += vMaskIncrement; // uchar(4);
739  }
740 
741  pLFirstRow += vecLen;
742  offset += vecLen;
743  elemsPerRow -= vecLen;
744  }
745 
     // Release both streaming engines and both address generators used above.
746  __SE0_CLOSE();
747  __SE1_CLOSE();
748  __SA1_CLOSE();
749  __SA2_CLOSE();
750 
751  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Exiting function");
752  return DSPLIB_SUCCESS;
753 }
754 
// Execution entry point for the C7x implementation. Casts the untyped buffers to
// dataType and runs the ping-pong Cholesky kernel. When pKerPrivArgs->enableTest is
// non-zero, the positive-definiteness check runs first and the kernel is skipped with
// DSPLIB_ERR_FAILURE if that check returns a value <= 0.
// Returns the status of the kernel call, or DSPLIB_ERR_FAILURE as described above.
755 template <typename dataType>
// NOTE(review): the line carrying the return type, function name and first parameter is
// absent from this listing; the symbol index at the end of the page gives the signature as
//   DSPLIB_STATUS DSPLIB_cholesky_exec_ci(DSPLIB_kernelHandle handle, void *restrict pInA,
//       void *restrict pOutU, void *restrict pMulBuffer)
757  void *restrict pInA,
758  void *restrict pOutU,
759  void *restrict pMulBuffer)
760 {
761  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
762 
763  DSPLIB_STATUS status = DSPLIB_SUCCESS;
764 
765  DSPLIB_cholesky_PrivArgs *pKerPrivArgs = (DSPLIB_cholesky_PrivArgs *) handle;
766  dataType *pLocalA = (dataType *) pInA;
767  dataType *pOutULocal = (dataType *) pOutU;
768 
     // Scratch buffer the kernel uses to hold per-row multipliers.
769  dataType *pLocalMul = (dataType *) pMulBuffer;
770  uint8_t *pBlock = pKerPrivArgs->bufPblock;
771  int32_t order = pKerPrivArgs->order;
772  int32_t enable_test = pKerPrivArgs->enableTest;
773  typedef typename c7x::make_full_vector<dataType>::type vec;
774  int32_t eleCount = c7x::element_count_of<vec>::value;
775 
776  DSPLIB_DEBUGPRINTFN(0, "pLocalA: %p pOutUocal: %p\n", pLocalA, pOutULocal);
777  if (enable_test) {
     // Optional pre-check on the input matrix; a non-positive result rejects it.
778  dataType sum = DSPLIB_cholesky_inplace_isPosDefinite(pLocalA, order, eleCount, pBlock);
779 
780  if (sum <= 0) {
781  status = DSPLIB_ERR_FAILURE;
782  }
783  else {
784  status = DSPLIB_cholesky_c7x_PingPong(enable_test, pKerPrivArgs, pLocalA, pOutULocal, pLocalMul);
785  }
786  }
787  else {
     // Same kernel call as above, without the pre-check.
788  status = DSPLIB_cholesky_c7x_PingPong(enable_test, pKerPrivArgs, pLocalA, pOutULocal, pLocalMul);
789  }
790 
791  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return status: %d\n", status);
792  return status;
793 }
794 
// Explicit instantiations of DSPLIB_cholesky_exec_ci. The opening line of each
// declaration is absent from this listing; the symbol index at the end of the page
// lists a <float> and a <double> instantiation (which one comes first here is not
// visible -- TODO confirm against the original file).
796  void *restrict pInA,
797  void *restrict pOutU,
798  void *restrict pMulBuffer);
799 
// Second explicit instantiation of DSPLIB_cholesky_exec_ci (opening line likewise absent).
801  void *restrict pInA,
802  void *restrict pOutU,
803  void *restrict pMulBuffer);
804 
805 /* ======================================================================== */
806 /* End of file: DSPLIB_cholesky_ci.cpp */
807 /* ======================================================================== */
template DSPLIB_STATUS DSPLIB_cholesky_exec_ci< float >(DSPLIB_kernelHandle handle, void *restrict pInA, void *restrict pOutU, void *restrict pMulBuffer)
c7x::uchar_vec DSPLIB_cholesky_getMaskIncrement()
c7x::uchar_vec DSPLIB_cholesky_getMaskIncrement< float >()
template DSPLIB_STATUS DSPLIB_cholesky_init_ci< float >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams2D_t *bufParamsOut, const DSPLIB_cholesky_InitArgs *pKerInitArgs)
c7x::uchar_vec DSPLIB_cholesky_getMaskIncrement< double >()
#define UNROLL_COUNT
DSPLIB_STATUS DSPLIB_cholesky_init_ci(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams2D_t *bufParamsOut, const DSPLIB_cholesky_InitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
template DSPLIB_STATUS DSPLIB_cholesky_init_ci< double >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams2D_t *bufParamsOut, const DSPLIB_cholesky_InitArgs *pKerInitArgs)
DSPLIB_STATUS DSPLIB_cholesky_c7x_PingPong_init(DSPLIB_kernelHandle handle)
DSPLIB_STATUS DSPLIB_cholesky_exec_ci(DSPLIB_kernelHandle handle, void *restrict pInA, void *restrict pOutU, void *restrict pMulBuffer)
This function is the main execution function for the C7x implementation of the kernel....
template DSPLIB_STATUS DSPLIB_cholesky_c7x_PingPong_init< double >(DSPLIB_kernelHandle handle)
template DSPLIB_STATUS DSPLIB_cholesky_exec_ci< double >(DSPLIB_kernelHandle handle, void *restrict pInA, void *restrict pOutU, void *restrict pMulBuffer)
dataType getRecipSqrt(dataType a)
DSPLIB_STATUS DSPLIB_cholesky_c7x_PingPong(int enable_test, DSPLIB_cholesky_PrivArgs *pKerPrivArgs, dataType *restrict pInALocal, dataType *restrict pOutULocal, dataType *restrict pMulBuffer)
template DSPLIB_STATUS DSPLIB_cholesky_c7x_PingPong_init< float >(DSPLIB_kernelHandle handle)
dataType DSPLIB_cholesky_inplace_isPosDefinite(dataType *A, const int32_t order, const int32_t eleCount, uint8_t *pBlock)
#define SA_SA0_PARAM_OFFSET
#define SA_SA1_PARAM_OFFSET
#define SE_SE2_PARAM_OFFSET
#define SE_SE3_PARAM_OFFSET
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_cholesky.
#define DSPLIB_DEBUGPRINTFN(N, fmt,...)
Definition: DSPLIB_types.h:83
DSPLIB_STATUS_NAME
The enumeration of all status codes.
Definition: DSPLIB_types.h:151
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
Definition: DSPLIB_types.h:172
@ DSPLIB_SUCCESS
Definition: DSPLIB_types.h:152
@ DSPLIB_ERR_FAILURE
Definition: DSPLIB_types.h:153
A structure for a 2 dimensional buffer descriptor.
Structure containing the parameters to initialize the kernel.
Structure that is reserved for internal use by the kernel.
int32_t order
Order of input buffer for different batches DSPLIB_cholesky_init that will be retrieved and used by D...
uint8_t bufPblock[DSPLIB_cholesky_IXX_IXX_OXX_PBLOCK_SIZE]