DSPLIB User Guide
DSPLIB_cholesky_inplace_ci.cpp
Go to the documentation of this file.
1 /******************************************************************************/
5 /* Copyright (C) 2017 Texas Instruments Incorporated - https://www.ti.com/
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  *
14  * Redistributions in binary form must reproduce the above copyright
15  * notice, this list of conditions and the following disclaimer in the
16  * documentation and/or other materials provided with the
17  * distribution.
18  *
19  * Neither the name of Texas Instruments Incorporated nor the names of
20  * its contributors may be used to endorse or promote products derived
21  * from this software without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34  *
35  ******************************************************************************/
36 
37 /******************************************************************************
38  * Version 1.0 Date 10/2/22 Author: Asheesh Bhardwaj
39  *****************************************************************************/
40 
41 /*******************************************************************************
42  *
43  * INCLUDES
44  *
45  ******************************************************************************/
46 
48 
49 /*******************************************************************************
50  *
51  * DEFINES
52  *
53  ******************************************************************************/
54 
55 #define UNROLL_COUNT 4 // NOTE(review): not referenced anywhere in the visible code of this file
56 #define MIN_HORIZONTAL_COLUMNS_FOR_UNROLL 2 // Rows with more than this many full vectors of remaining elements take the tiled kernel path; shorter rows take the two-vector path
57 #define NUM_VECS_IN_TILE 6 // Vectors handled per tile in the tiled kernel path (tile width = NUM_VECS_IN_TILE * elements per vector)
58 
59 /*******************************************************************************
60  *
61  * INITIALIZATION
62  *
63  ******************************************************************************/
64 
66 {
     // Builds the two streaming-engine (SE) templates and the three
     // streaming-address (SA) templates used by the kernel and stores them in
     // pKerPrivArgs->bufPblock. Counts that depend on the row being processed
     // are left at 0 here and are filled in by the kernel loop.
     // NOTE(review): the signature and the declarations of `status` and
     // `pKerPrivArgs` are not present in this listing (its embedded line
     // numbers skip 65, 69 and 81) - confirm against the original file.
67  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
68 
70 
71  __SE_TEMPLATE_v1 seParamFetchU; // L-value fetch, "Up" stream (the kernel opens it on SE0)
72  __SE_TEMPLATE_v1 seParamFetchD; // L-value fetch, "Down" stream (the kernel opens it on SE1)
73  __SA_TEMPLATE_v1 saParamMulFetch; // Address pattern for reading the multiplier values (SA0)
74  __SA_TEMPLATE_v1 saParamALoad; // Address pattern for loading the A values (SA2)
75  __SA_TEMPLATE_v1 saParamLStore; // Address pattern for storing the computed L values (SA1)
76 
77  __SE_ELETYPE SE_ELETYPE;
78  __SE_VECLEN SE_VECLEN;
79  __SA_VECLEN SA_VECLEN;
80 
82 
83  typedef typename c7x::make_full_vector<dataType>::type vec;
84 
     // Element type and vector lengths are derived from the full-width vector of dataType
85  SE_VECLEN = c7x::se_veclen<vec>::value;
86  SE_ELETYPE = c7x::se_eletype<vec>::value;
87  SA_VECLEN = c7x::sa_veclen<vec>::value;
88 
89  uint32_t eleCount = c7x::element_count_of<vec>::value;
90 
     // Row pitch expressed in elements (stride divided by the element size)
91  uint32_t yStride = pKerPrivArgs->stride / sizeof(dataType);
92 
93  /**********************************************************************/
94  /* Prepare streaming engine for fetching L values(Up) */
95  /**********************************************************************/
96  seParamFetchU = __gen_SE_TEMPLATE_v1();
97 
98  seParamFetchU.ICNT0 = NUM_VECS_IN_TILE * eleCount;
99  seParamFetchU.ICNT1 = 0; // (row+1)/2 or (row+1)>>1; To be
100  // reconfigured in the loop
101 
102  seParamFetchU.DIM1 = (yStride) << 1; // yStride * 2; how far the
103  // vector start shifts so that the next
104  // fetch lands on the correct row
105 
106  seParamFetchU.ICNT2 = 0; // Number of tiles; To be reconfigured in
107  // the loop. This will be ceil(order/vecLen)
108 
109  seParamFetchU.DIM2 = NUM_VECS_IN_TILE * eleCount; // Distance from one tile to the next tile
110  seParamFetchU.DECDIM1 = __SE_DECDIM_DIM2;
111  seParamFetchU.DECDIM1_WIDTH = pKerPrivArgs->order; // No of (validElemsPerRow)
112 
113  seParamFetchU.ELETYPE = SE_ELETYPE;
114  seParamFetchU.VECLEN = SE_VECLEN;
115  seParamFetchU.DIMFMT = __SE_DIMFMT_3D;
116 
117  /**********************************************************************/
118  /* Prepare streaming engine for fetching L values(Down) */
119  /**********************************************************************/
120  seParamFetchD = __gen_SE_TEMPLATE_v1();
121 
122  seParamFetchD.ICNT0 = NUM_VECS_IN_TILE * eleCount;
123  seParamFetchD.ICNT1 = 0; // (row)/2 or (row)>>1; set by the kernel loop
124  seParamFetchD.DIM1 = (yStride) << 1; // yStride * 2, same row step as the "Up" stream
125  seParamFetchD.ICNT2 = 0; // Number of tiles; set by the kernel loop
126  seParamFetchD.DIM2 = NUM_VECS_IN_TILE * eleCount; // Distance from one tile to the next tile
127  seParamFetchD.DECDIM1 = __SE_DECDIM_DIM2;
128  seParamFetchD.DECDIM1_WIDTH = pKerPrivArgs->order; // No of (validElemsPerRow)
129 
130  seParamFetchD.ELETYPE = SE_ELETYPE;
131  seParamFetchD.VECLEN = SE_VECLEN;
132  seParamFetchD.DIMFMT = __SE_DIMFMT_3D;
133 
134  /**********************************************************************/
135  /* Prepare Address generator to fetch the multipliers */
136  /**********************************************************************/
137  saParamMulFetch = __gen_SA_TEMPLATE_v1();
138 
139  saParamMulFetch.ICNT0 = 1; // Number of multiplier for a row is 1
140  saParamMulFetch.ICNT1 = 0; // To be reconfigured in the loop
141  saParamMulFetch.DIM1 = yStride; // Distance to the next multiplier value
142  saParamMulFetch.ICNT2 = 0; // To be reconfigured to number of tiles
143  saParamMulFetch.DIM2 = 0; // No offset between tiles; presumably every tile re-reads the same multipliers - confirm
144 
145  saParamMulFetch.VECLEN = __SA_VECLEN_1ELEM; // We need 1 element
146  saParamMulFetch.DIMFMT = __SA_DIMFMT_3D;
147 
148  /**********************************************************************/
149  /* Prepare Address generator to store the L values */
150  /**********************************************************************/
151  saParamLStore = __gen_SA_TEMPLATE_v1();
152 
153  saParamLStore.ICNT0 =
154  0; // Valid elements in a row without the junk values. To be reconfigured every row => order - row
155  saParamLStore.VECLEN = SA_VECLEN;
156  saParamLStore.DIMFMT = __SA_DIMFMT_1D;
157 
158  /**********************************************************************/
159  /* Prepare Address generator to load the A values */
160  /**********************************************************************/
161  saParamALoad = __gen_SA_TEMPLATE_v1();
162 
163  saParamALoad.ICNT0 =
164  0; // Valid elements in a row without the junk values. To be reconfigured every row => order - row
165  saParamALoad.VECLEN = SA_VECLEN;
166  saParamALoad.DIMFMT = __SA_DIMFMT_1D;
167 
     // Persist every template in the parameter block at its fixed offset; the
     // kernel reads them back from the same offsets.
168  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pKerPrivArgs->bufPblock + SE_SE2_PARAM_OFFSET) = seParamFetchU;
169  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pKerPrivArgs->bufPblock + SE_SE3_PARAM_OFFSET) = seParamFetchD;
170  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pKerPrivArgs->bufPblock + SA_SA0_PARAM_OFFSET) = saParamMulFetch;
171  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pKerPrivArgs->bufPblock + SA_SA1_PARAM_OFFSET) = saParamLStore;
172  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pKerPrivArgs->bufPblock + SA_SA2_PARAM_OFFSET) = saParamALoad;
173 
174  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Exiting function");
175  return status;
176 }
177 
180 
181 // Initialization entry point: prepares the SE/SA parameter templates for the kernel
     // by calling DSPLIB_cholesky_inplace_c7x_PingPong_init<dataType>(handle), then calls
     // DSPLIB_cholesky_inplace_isPosDefinite_init<dataType> with the matrix order, the row
     // stride in elements and the parameter block.
     // The visible body does not use bufParamsA, bufParamsMul or pKerInitArgs, ignores the
     // value returned by DSPLIB_cholesky_inplace_c7x_PingPong_init, and always returns
     // DSPLIB_SUCCESS.
     // NOTE(review): the first signature line and the declaration of `pKerPrivArgs` are
     // not present in this listing (its embedded line numbers skip 183 and 191) - confirm
     // against the original file.
182 template <typename dataType>
184  DSPLIB_bufParams2D_t *bufParamsA,
185  DSPLIB_bufParams1D_t *bufParamsMul,
186  const DSPLIB_cholesky_inplace_InitArgs *pKerInitArgs)
187 {
188  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
189 
190  DSPLIB_STATUS status = DSPLIB_SUCCESS;
192  uint8_t *pBlock = pKerPrivArgs->bufPblock;
193  int32_t order = pKerPrivArgs->order;
194  int32_t strideA = pKerPrivArgs->stride;
195  int32_t colAStride = strideA / sizeof(dataType); // row stride converted to a count of elements
196 
197  DSPLIB_cholesky_inplace_c7x_PingPong_init<dataType>(handle); // return value is not checked
198  DSPLIB_cholesky_inplace_isPosDefinite_init<dataType>(order, colAStride, pBlock);
199 
200  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Exiting function");
201 
202  return status;
203 }
204 
206  DSPLIB_bufParams2D_t *bufParamsA, // Tail of an explicit instantiation of DSPLIB_cholesky_inplace_init_ci; NOTE(review): its leading line is missing from this listing
207  DSPLIB_bufParams1D_t *bufParamsMul,
208  const DSPLIB_cholesky_inplace_InitArgs *pKerInitArgs);
209 
211  DSPLIB_bufParams2D_t *bufParamsA, // Tail of the second explicit instantiation (the file instantiates <float> and <double>); leading line also missing
212  DSPLIB_bufParams1D_t *bufParamsMul,
213  const DSPLIB_cholesky_inplace_InitArgs *pKerInitArgs);
214 
215 /*******************************************************************************
216  *
217  * IMPLEMENTATION
218  *
219  ******************************************************************************/
220 
/**
 * Reciprocal square root of `a`.
 *
 * Starts from the estimate returned by the __recip_sqrt intrinsic and applies
 * two Newton-Raphson refinement steps, y <- y * (1.5 - 0.5 * a * y * y).
 * The two steps are written out explicitly rather than as a loop.
 *
 * @param a  Value whose reciprocal square root is wanted.
 * @return   Refined estimate of 1 / sqrt(a).
 */
template <typename dataType> inline dataType getRecipSqrt(dataType a)
{
   const dataType kHalf        = 0.5f;
   const dataType kThreeHalves = 1.5f;

   // Seed value from the intrinsic
   dataType estimate = __recip_sqrt(a);

   // Refinement step 1
   estimate *= (kThreeHalves - (a * estimate * estimate * kHalf));
   // Refinement step 2
   estimate *= (kThreeHalves - (a * estimate * estimate * kHalf));

   return estimate;
}
242 
// Row-by-row factorization kernel. For every row it accumulates the products of
// previously written output rows (read through SE0/SE1 starting at pLFirstRow) with
// one multiplier per row, subtracts that sum from the A values of the current row,
// scales the difference by the reciprocal square root of its first element and
// stores the result through SA1 with a predicate.
// Rows with more than MIN_HORIZONTAL_COLUMNS_FOR_UNROLL * vecLen remaining elements
// use a tiled path (NUM_VECS_IN_TILE vectors per tile); shorter rows use a
// two-vector path. Because the remaining width only decreases, once the short path
// is taken the tiled path is never used again, which is why the short path may
// overwrite fields of the local SE templates.
// pMulBuffer is not referenced in the visible body. Always returns DSPLIB_SUCCESS.
// NOTE(review): the first line of the signature is missing from this listing (its
// embedded line numbers skip 244) - confirm against the original file.
243 template <typename dataType>
245  DSPLIB_cholesky_inplace_PrivArgs *pKerPrivArgs,
246  dataType *restrict pInALocal,
247  dataType *restrict pOutULocal,
248  dataType *restrict pMulBuffer)
249 {
250  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
251 
252  typedef typename c7x::make_full_vector<dataType>::type vec;
253  int32_t eleCount = c7x::element_count_of<vec>::value;
254 
255  // SE Template loading
256  __SE_TEMPLATE_v1 seParamFetchU = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pKerPrivArgs->bufPblock + SE_SE2_PARAM_OFFSET);
257  __SE_TEMPLATE_v1 seParamFetchD = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pKerPrivArgs->bufPblock + SE_SE3_PARAM_OFFSET);
258 
259  // SA Template loading
260  __SA_TEMPLATE_v1 saParamMulFetch = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pKerPrivArgs->bufPblock + SA_SA0_PARAM_OFFSET);
261  __SA_TEMPLATE_v1 saParamLStore = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pKerPrivArgs->bufPblock + SA_SA1_PARAM_OFFSET);
262  __SA_TEMPLATE_v1 saParamALoad = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pKerPrivArgs->bufPblock + SA_SA2_PARAM_OFFSET);
263 
264  int32_t order = pKerPrivArgs->order;
265  int32_t vecLen = eleCount;
266 
267  int32_t row, lRow, tile_i;
268  int32_t stride = pKerPrivArgs->stride;
269  int32_t yStride = stride / sizeof(dataType); // row pitch in elements
270 
271  dataType *pLFirstRow = pOutULocal; // first output row, at the column of the current row; advanced one column per row
272  dataType *ptrL = pOutULocal; // store position: first element of the current row (moves by yStride + 1 per row)
273  dataType *ptrA = pInALocal; // load position: first element of the current row of A (moves by yStride + 1 per row)
274 
275  dataType *pMultiplier = NULL;
276  dataType recipDiagValue;
277 
278  bool toggle = 0; // flipped once per row; not read anywhere in the visible code
279  int32_t validElemsPerRow = order; // remaining elements in the current row; decremented once per row
280  int32_t tile_width = NUM_VECS_IN_TILE * vecLen;
281  int32_t tileWidthCeilValue = validElemsPerRow + tile_width - 1; // numerator for the rounded-up tile count; decremented with validElemsPerRow
282 
283  __SE_ELEDUP SE_ELEDUP;
284  SE_ELEDUP = c7x::se_eledup<dataType, vec>::value;
285 
286  vec vec00, vec01, vec02, vec03, vec04, vec05; // vec for holding se0 fetches
287  vec vec10, vec11, vec12, vec13, vec14, vec15; // vec for holding se1 fetches
288  vec acc00, acc01, acc02, acc03, acc04, acc05; // vec for holding the accumulated values
289  vec acc10, acc11, acc12, acc13, acc14, acc15; // vec for holding the accumulated values
290 
291  int32_t lRowSE0; // rows read through SE0 for the current row: (row + 1) >> 1
292  int32_t lRowSE1; // rows read through SE1 for the current row: row >> 1
293 
294  for (row = 0; row < order; row++) {
295  pMultiplier = pLFirstRow;
296 
     // Both the A load (SA2) and the L store (SA1) cover exactly the remaining elements of this row
300  saParamLStore.ICNT0 = validElemsPerRow;
301  saParamALoad.ICNT0 = validElemsPerRow;
302 
303  __SA1_OPEN(saParamLStore);
304  __SA2_OPEN(saParamALoad);
305 
     // Tiled path: the row is wider than MIN_HORIZONTAL_COLUMNS_FOR_UNROLL full vectors
306  if (validElemsPerRow > MIN_HORIZONTAL_COLUMNS_FOR_UNROLL * vecLen) {
307 
308  lRowSE0 = (int32_t) (((uint32_t) row + 1U) >> 1U);
309  lRowSE1 = (int32_t) (((uint32_t) row) >> 1U);
310 
311  seParamFetchU.DECDIM1_WIDTH = validElemsPerRow;
312  seParamFetchD.DECDIM1_WIDTH = validElemsPerRow;
313 
314  // SE and SA Configurations
315  int32_t nTiles = (tileWidthCeilValue) / tile_width; // ceil(validElemsPerRow / tile_width)
316 
317  // SA configurations depends on each tile
318  saParamMulFetch.ICNT1 = row; // one multiplier for each of the `row` previously written rows
319  saParamMulFetch.ICNT2 = nTiles;
     // Nothing to fetch for the first row, so the streams stay closed when their counts are zero
320  if (row > 0) {
321  __SA0_OPEN(saParamMulFetch);
322  }
323 
324  seParamFetchU.ICNT1 = lRowSE0;
325  seParamFetchU.ICNT2 = nTiles;
326 
327  seParamFetchD.ICNT1 = lRowSE1;
328  seParamFetchD.ICNT2 = nTiles;
329 
     // SE0 starts on the first output row and SE1 one row below it; both step two rows at a time
330  if (lRowSE0 > 0) {
331  __SE0_OPEN(pLFirstRow, seParamFetchU);
332  }
333  if (lRowSE1 > 0) {
334  __SE1_OPEN(pLFirstRow + yStride, seParamFetchD);
335  }
336 
337  // Loops to iterate through the tiles
338 
339  for (tile_i = 0; tile_i < nTiles; tile_i++) {
340  acc00 = vec(0);
341  acc01 = vec(0);
342  acc02 = vec(0);
343  acc03 = vec(0);
344  acc04 = vec(0);
345  acc05 = vec(0);
346 
347  acc10 = vec(0);
348  acc11 = vec(0);
349  acc12 = vec(0);
350  acc13 = vec(0);
351  acc14 = vec(0);
352  acc15 = vec(0);
353 
     // Each iteration consumes one row from SE0 and one from SE1. Consecutive SA0
     // addresses are one row apart, so the first multiplier belongs to the SE0 row
     // and the second to the SE1 row.
354  for (lRow = 0; lRow < lRowSE1; lRow++) {
355  dataType *pMultiplierUp = c7x::strm_agen<0, dataType>::get_adv(pMultiplier);
356  vec multiplier0 = __vload_dup(pMultiplierUp);
357 
358  dataType *pMultiplierDown = c7x::strm_agen<0, dataType>::get_adv(pMultiplier);
359  vec multiplier1 = __vload_dup(pMultiplierDown);
360  vec00 = c7x::strm_eng<0, vec>::get_adv();
361  vec10 = c7x::strm_eng<1, vec>::get_adv();
362  acc00 += (vec00 * multiplier0);
363  acc10 += (vec10 * multiplier1);
364 
365  vec01 = c7x::strm_eng<0, vec>::get_adv();
366  vec11 = c7x::strm_eng<1, vec>::get_adv();
367  acc01 += (vec01 * multiplier0);
368  acc11 += (vec11 * multiplier1);
369 
370  vec02 = c7x::strm_eng<0, vec>::get_adv();
371  vec12 = c7x::strm_eng<1, vec>::get_adv();
372  acc02 += (vec02 * multiplier0);
373  acc12 += (vec12 * multiplier1);
374 
375  vec03 = c7x::strm_eng<0, vec>::get_adv();
376  vec13 = c7x::strm_eng<1, vec>::get_adv();
377  acc03 += (vec03 * multiplier0);
378  acc13 += (vec13 * multiplier1);
379 
380  vec04 = c7x::strm_eng<0, vec>::get_adv();
381  vec14 = c7x::strm_eng<1, vec>::get_adv();
382  acc04 += (vec04 * multiplier0);
383  acc14 += (vec14 * multiplier1);
384 
385  vec05 = c7x::strm_eng<0, vec>::get_adv();
386  vec15 = c7x::strm_eng<1, vec>::get_adv();
387  acc05 += (vec05 * multiplier0);
388  acc15 += (vec15 * multiplier1);
389  }
390 
     // The counts differ only when `row` is odd: SE0 then holds one extra row to consume
391  if (lRowSE0 != lRowSE1) {
392  dataType *pMultiplierUp = c7x::strm_agen<0, dataType>::get_adv(pMultiplier);
393  vec multiplier0 = __vload_dup(pMultiplierUp);
394 
395  vec00 = c7x::strm_eng<0, vec>::get_adv();
396  acc00 += (vec00 * multiplier0);
397 
398  vec01 = c7x::strm_eng<0, vec>::get_adv();
399  acc01 += (vec01 * multiplier0);
400 
401  vec02 = c7x::strm_eng<0, vec>::get_adv();
402  acc02 += (vec02 * multiplier0);
403 
404  vec03 = c7x::strm_eng<0, vec>::get_adv();
405  acc03 += (vec03 * multiplier0);
406 
407  vec04 = c7x::strm_eng<0, vec>::get_adv();
408  acc04 += (vec04 * multiplier0);
409 
410  vec05 = c7x::strm_eng<0, vec>::get_adv();
411  acc05 += (vec05 * multiplier0);
412  }
413 
     // Merge the SE1 partial sums into the SE0 partial sums
414  acc00 += acc10;
415  acc01 += acc11;
416  acc02 += acc12;
417  acc03 += acc13;
418  acc04 += acc14;
419  acc05 += acc15;
420 
421  // Using vec0x to hold A values
422  vec00 = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
423  vec01 = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
424  vec02 = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
425  vec03 = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
426  vec04 = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
427  vec05 = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
428 
429  // Using vec1x to hold the difference
430  vec10 = vec00 - acc00;
431  vec11 = vec01 - acc01;
432  vec12 = vec02 - acc02;
433  vec13 = vec03 - acc03;
434  vec14 = vec04 - acc04;
435  vec15 = vec05 - acc05;
436 
     // The first element of the first tile is the row's leading (diagonal) value;
     // its reciprocal square root is computed once and reused for the remaining tiles
437  if (tile_i == 0) {
438  recipDiagValue = getRecipSqrt(vec10.s[0]);
439  }
440 
     // Predicated stores: the predicate is read from SA1 before each address advance
441  __vpred vpred0 = c7x::strm_agen<1, vec>::get_vpred();
442  vec *storePtr0 = c7x::strm_agen<1, vec>::get_adv(ptrL);
443  __vstore_pred(vpred0, storePtr0, vec10 * recipDiagValue);
444 
445  __vpred vpred1 = c7x::strm_agen<1, vec>::get_vpred();
446  vec *storePtr1 = c7x::strm_agen<1, vec>::get_adv(ptrL);
447  __vstore_pred(vpred1, storePtr1, vec11 * recipDiagValue);
448 
449  __vpred vpred2 = c7x::strm_agen<1, vec>::get_vpred();
450  vec *storePtr2 = c7x::strm_agen<1, vec>::get_adv(ptrL);
451  __vstore_pred(vpred2, storePtr2, vec12 * recipDiagValue);
452 
453  __vpred vpred3 = c7x::strm_agen<1, vec>::get_vpred();
454  vec *storePtr3 = c7x::strm_agen<1, vec>::get_adv(ptrL);
455  __vstore_pred(vpred3, storePtr3, vec13 * recipDiagValue);
456 
457  __vpred vpred4 = c7x::strm_agen<1, vec>::get_vpred();
458  vec *storePtr4 = c7x::strm_agen<1, vec>::get_adv(ptrL);
459  __vstore_pred(vpred4, storePtr4, vec14 * recipDiagValue);
460 
461  __vpred vpred5 = c7x::strm_agen<1, vec>::get_vpred();
462  vec *storePtr5 = c7x::strm_agen<1, vec>::get_adv(ptrL);
463  __vstore_pred(vpred5, storePtr5, vec15 * recipDiagValue);
464  }
465  }
     // Short path: at most MIN_HORIZONTAL_COLUMNS_FOR_UNROLL vectors remain in the row
466  else {
467 
468  lRowSE0 = (int32_t) (((uint32_t) row + 1U) >> 1U);
469  lRowSE1 = (int32_t) (((uint32_t) row) >> 1U);
470 
     // SE0 is re-purposed to deliver one multiplier per row, duplicated across the vector
471  seParamFetchU.ICNT0 = 1;
472  seParamFetchU.ICNT1 = row;
473  seParamFetchU.ICNT2 = 1;
474  seParamFetchU.DIM1 = (yStride);
475  seParamFetchU.DIM2 = 0;
476  seParamFetchU.ELEDUP = SE_ELEDUP;
477 
     // SE1 delivers two vectors from each of the `row` previously written rows
478  seParamFetchD.ICNT0 = MIN_HORIZONTAL_COLUMNS_FOR_UNROLL * eleCount;
479  seParamFetchD.ICNT1 = row;
480  seParamFetchD.ICNT2 = 1;
481  seParamFetchD.DIM1 = (yStride);
482  seParamFetchD.DIM2 = 0;
483 
484  if (row > 0) {
485  __SE0_OPEN(pLFirstRow, seParamFetchU);
486  __SE1_OPEN(pLFirstRow, seParamFetchD);
487  }
488 
489  acc00 = vec(0);
490  acc01 = vec(0);
491  acc10 = vec(0);
492  acc11 = vec(0);
493  vec acc20 = vec(0);
494  vec acc21 = vec(0);
495  vec acc30 = vec(0);
496  vec acc31 = vec(0);
497 
498  vec vecMul0, vecMul1, vecMul2, vecMul3;
499  vec vecMul4, vecMul5, vecMul6, vecMul7;
500  vec vec0, vec1;
501  vec vec2, vec3;
502  vec vec4, vec5;
503  vec vec6, vec7;
504 
     // Four rows per iteration, each with its own accumulator pair. Within a row the
     // multiplier is read with get() and then get_adv(), so both vectors of the row
     // are scaled by the same SE0 value.
     // NOTE(review): the loop steps by 4 while the streams are configured for `row`
     // rows; when `row` is not a multiple of 4 the extra reads presumably rely on the
     // exhausted streams returning zeros - confirm.
505  for (lRow = 0; lRow < row; lRow += 4) {
506  // UNROLL 1
507  // SE0 fetch and duplicate
508  vecMul0 = c7x::strm_eng<0, vec>::get();
509  vec0 = c7x::strm_eng<1, vec>::get_adv();
510 
511  acc00 += (vec0 * vecMul0);
512 
513  vecMul1 = c7x::strm_eng<0, vec>::get_adv();
514  vec1 = c7x::strm_eng<1, vec>::get_adv();
515  acc01 += (vec1 * vecMul1);
516 
517  // UNROLL 2
518  // SE0 fetch and duplicate
519  vecMul2 = c7x::strm_eng<0, vec>::get();
520  vec2 = c7x::strm_eng<1, vec>::get_adv();
521  acc10 += (vec2 * vecMul2);
522 
523  vecMul3 = c7x::strm_eng<0, vec>::get_adv();
524  vec3 = c7x::strm_eng<1, vec>::get_adv();
525  acc11 += (vec3 * vecMul3);
526 
527  // UNROLL 3
528  // SE0 fetch and duplicate
529  vecMul4 = c7x::strm_eng<0, vec>::get();
530  vec4 = c7x::strm_eng<1, vec>::get_adv();
531  acc20 += (vec4 * vecMul4);
532 
533  vecMul5 = c7x::strm_eng<0, vec>::get_adv();
534  vec5 = c7x::strm_eng<1, vec>::get_adv();
535  acc21 += (vec5 * vecMul5);
536 
537  // UNROLL 4
538  // SE0 fetch and duplicate
539  vecMul6 = c7x::strm_eng<0, vec>::get();
540  vec6 = c7x::strm_eng<1, vec>::get_adv();
541  acc30 += (vec6 * vecMul6);
542 
543  vecMul7 = c7x::strm_eng<0, vec>::get_adv();
544  vec7 = c7x::strm_eng<1, vec>::get_adv();
545  acc31 += (vec7 * vecMul7);
546  }
547 
     // Reduce the four accumulator pairs to one pair (first vector / second vector of the row)
548  acc00 += acc10;
549  acc01 += acc11;
550 
551  vec accLTmp = acc20 + acc30;
552  vec accRTmp = acc21 + acc31;
553 
554  acc00 += accLTmp;
555  acc01 += accRTmp;
556 
557  // Using vec0x to hold A values
558  vec00 = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
559  vec01 = *(c7x::strm_agen<2, vec>::get_adv(ptrA));
560 
561  // Using vec1x to hold the difference
562  vec10 = vec00 - acc00;
563  vec11 = vec01 - acc01;
564 
     // First element of the difference is the row's leading (diagonal) value
565  recipDiagValue = getRecipSqrt(vec10.s[0]);
566 
567  __vpred vpred0 = c7x::strm_agen<1, vec>::get_vpred();
568  vec *storePtr0 = c7x::strm_agen<1, vec>::get_adv(ptrL);
569  __vstore_pred(vpred0, storePtr0, vec10 * recipDiagValue);
570 
571  __vpred vpred1 = c7x::strm_agen<1, vec>::get_vpred();
572  vec *storePtr1 = c7x::strm_agen<1, vec>::get_adv(ptrL);
573  __vstore_pred(vpred1, storePtr1, vec11 * recipDiagValue);
574  }
575 
576  // Updating pointers for L, A and multiplier pointer
     // ptrL / ptrA move one row down and one column right; pLFirstRow moves one column right
577  ptrL += yStride + 1;
578  ptrA += yStride + 1;
579  pLFirstRow++;
580 
581  // Toggle to see if odd or even number of L rows to be fetched
     // (the value of `toggle` is not read anywhere in the visible code)
582  toggle = !toggle;
583  validElemsPerRow--;
584  tileWidthCeilValue--;
585  }
586 
     // NOTE(review): SA3 is closed below although the visible code never opens it - confirm this is intended
587  __SE0_CLOSE();
588  __SE1_CLOSE();
589  __SA0_CLOSE();
590  __SA1_CLOSE();
591  __SA2_CLOSE();
592  __SA3_CLOSE();
593 
594  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Exiting function");
595  return DSPLIB_SUCCESS;
596 }
597 
// Execution entry point of the kernel.
// handle : kernel handle.
// pA     : matrix buffer; it is passed to the kernel as both input and output, so the
//          result overwrites the input.
// pMul   : multiplier buffer, forwarded to the kernel.
// Returns DSPLIB_ERR_FAILURE when the test is enabled and
// DSPLIB_cholesky_inplace_isPosDefinite returns a value <= 0 (the factorization is then
// skipped); otherwise returns the status of DSPLIB_cholesky_inplace_c7x_PingPong.
// NOTE(review): the declaration of `pKerPrivArgs` is not present in this listing (its
// embedded line numbers skip 605) - confirm against the original file.
598 template <typename dataType>
599 DSPLIB_STATUS DSPLIB_cholesky_inplace_exec_ci(DSPLIB_kernelHandle handle, void *restrict pA, void *restrict pMul)
600 {
601  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
602 
603  DSPLIB_STATUS status = DSPLIB_SUCCESS;
604 
606  dataType *pLocalA = (dataType *) pA;
607  dataType *pLocalMul = (dataType *) pMul;
608  uint8_t *pBlock = pKerPrivArgs->bufPblock;
609  int32_t order = pKerPrivArgs->order;
610  int32_t enable_test = pKerPrivArgs->enableTest;
611  typedef typename c7x::make_full_vector<dataType>::type vec;
612  int32_t eleCount = c7x::element_count_of<vec>::value;
613 
614  DSPLIB_DEBUGPRINTFN(0, "pLocalA: %p\n", pLocalA);
     // Optional pre-check: only factorize when the check returns a positive value
615  if (enable_test) {
616  dataType sum = DSPLIB_cholesky_inplace_isPosDefinite(pLocalA, order, eleCount, pBlock);
617  if (sum <= 0) {
618  status = DSPLIB_ERR_FAILURE;
619  }
620  else {
     // pLocalA is given as both the input and the output pointer (in-place operation)
621  status = DSPLIB_cholesky_inplace_c7x_PingPong(enable_test, pKerPrivArgs, pLocalA, pLocalA, pLocalMul);
622  }
623  }
624  else {
     // Check disabled: factorize unconditionally, again in place
625  status = DSPLIB_cholesky_inplace_c7x_PingPong(enable_test, pKerPrivArgs, pLocalA, pLocalA, pLocalMul);
626  }
627 
628  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return status: %d\n", status);
629  return status;
630 }
631 
633 DSPLIB_cholesky_inplace_exec_ci<float>(DSPLIB_kernelHandle handle, void *restrict pA, void *restrict pMul); // Explicit instantiation for float; NOTE(review): its leading line is missing from this listing
634 
636 DSPLIB_cholesky_inplace_exec_ci<double>(DSPLIB_kernelHandle handle, void *restrict pA, void *restrict pMul); // Explicit instantiation for double; leading line also missing
637 
638 /* ======================================================================== */
639 /* End of file: DSPLIB_cholesky_inplace_ci.cpp */
640 /* ======================================================================== */
dataType DSPLIB_cholesky_inplace_isPosDefinite(dataType *A, const int32_t order, const int32_t eleCount, uint8_t *pBlock)
#define SA_SA0_PARAM_OFFSET
#define SA_SA1_PARAM_OFFSET
#define SA_SA2_PARAM_OFFSET
#define SE_SE2_PARAM_OFFSET
#define SE_SE3_PARAM_OFFSET
#define NUM_VECS_IN_TILE
#define MIN_HORIZONTAL_COLUMNS_FOR_UNROLL
DSPLIB_STATUS DSPLIB_cholesky_inplace_init_ci(DSPLIB_kernelHandle handle, DSPLIB_bufParams2D_t *bufParamsA, DSPLIB_bufParams1D_t *bufParamsMul, const DSPLIB_cholesky_inplace_InitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
template DSPLIB_STATUS DSPLIB_cholesky_inplace_exec_ci< float >(DSPLIB_kernelHandle handle, void *restrict pA, void *restrict pMul)
DSPLIB_STATUS DSPLIB_cholesky_inplace_c7x_PingPong_init(DSPLIB_kernelHandle handle)
template DSPLIB_STATUS DSPLIB_cholesky_inplace_c7x_PingPong_init< float >(DSPLIB_kernelHandle handle)
template DSPLIB_STATUS DSPLIB_cholesky_inplace_init_ci< double >(DSPLIB_kernelHandle handle, DSPLIB_bufParams2D_t *bufParamsA, DSPLIB_bufParams1D_t *bufParamsMul, const DSPLIB_cholesky_inplace_InitArgs *pKerInitArgs)
DSPLIB_STATUS DSPLIB_cholesky_inplace_exec_ci(DSPLIB_kernelHandle handle, void *restrict pA, void *restrict pMul)
This function is the main execution function for the C7x implementation of the kernel....
DSPLIB_STATUS DSPLIB_cholesky_inplace_c7x_PingPong(int enable_test, DSPLIB_cholesky_inplace_PrivArgs *pKerPrivArgs, dataType *restrict pInALocal, dataType *restrict pOutULocal, dataType *restrict pMulBuffer)
dataType getRecipSqrt(dataType a)
template DSPLIB_STATUS DSPLIB_cholesky_inplace_init_ci< float >(DSPLIB_kernelHandle handle, DSPLIB_bufParams2D_t *bufParamsA, DSPLIB_bufParams1D_t *bufParamsMul, const DSPLIB_cholesky_inplace_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_cholesky_inplace_exec_ci< double >(DSPLIB_kernelHandle handle, void *restrict pA, void *restrict pMul)
template DSPLIB_STATUS DSPLIB_cholesky_inplace_c7x_PingPong_init< double >(DSPLIB_kernelHandle handle)
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_cholesky_inplace...
#define DSPLIB_DEBUGPRINTFN(N, fmt,...)
Definition: DSPLIB_types.h:83
DSPLIB_STATUS_NAME
The enumeration of all status codes.
Definition: DSPLIB_types.h:151
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
Definition: DSPLIB_types.h:172
@ DSPLIB_SUCCESS
Definition: DSPLIB_types.h:152
@ DSPLIB_ERR_FAILURE
Definition: DSPLIB_types.h:153
A structure for a 1 dimensional buffer descriptor.
A structure for a 2 dimensional buffer descriptor.
Structure containing the parameters to initialize the kernel.
Structure that is reserved for internal use by the kernel.
uint8_t bufPblock[DSPLIB_CHOLESKY_INPLACE_IXX_IXX_OXX_PBLOCK_SIZE]
int32_t order
Order of input buffer for different batches DSPLIB_cholesky_inplace_init that will be retrieved and u...