// Fragment of a parameter-setup routine (its signature is not part of this
// excerpt): builds one stream-address-generator (SA) template and stores it
// in slot 3 of the kernel parameter block.
// NOTE(review): the embedded listing numbers skip (49 -> 52, 54 -> 56), so
// some statements of this routine are not visible here.
46 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
47 int32_t strideR = pKerPrivArgs->
strideR;
// Stride of R expressed in elements; presumably strideR is a byte stride - confirm.
48 int32_t colStrideR = strideR /
sizeof(dataType);
49 __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();
// 2D pattern, one element per access, advancing 2 * colStrideR elements per DIM1 step.
52 sa0Params.DIMFMT = __SA_DIMFMT_2D;
53 sa0Params.VECLEN = __SA_VECLEN_1ELEM;
54 sa0Params.DIM1 = 2 * colStrideR;
// Store the template at byte offset 3 * SE_PARAM_SIZE from pBlock.
56 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE)) = sa0Params;
// Fragment of a parameter-setup routine (its signature is not part of this
// excerpt): derives stream geometry from pKerPrivArgs and stores seven
// SE/SA templates in slots 4-9 and 11 of the parameter block.
66 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
67 int32_t strideQ = pKerPrivArgs->
strideQ;
68 int32_t strideR = pKerPrivArgs->
strideR;
// The row count is taken from the heightA field.
69 int32_t nRows = pKerPrivArgs->
heightA;
// Every template starts from the generated defaults.
71 __SE_TEMPLATE_v1 se0Params = __gen_SE_TEMPLATE_v1();
72 __SE_TEMPLATE_v1 se1Params = __gen_SE_TEMPLATE_v1();
73 __SE_TEMPLATE_v1 se2Params = __gen_SE_TEMPLATE_v1();
75 __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();
76 __SA_TEMPLATE_v1 sa1Params = __gen_SA_TEMPLATE_v1();
77 __SA_TEMPLATE_v1 sa2Params = __gen_SA_TEMPLATE_v1();
78 __SA_TEMPLATE_v1 sa5Params = __gen_SA_TEMPLATE_v1();
// Full-width vector type for dataType and the SE/SA enum values derived from it.
80 typedef typename c7x::make_full_vector<dataType>::type vec;
81 int32_t eleCount = c7x::element_count_of<vec>::value;
82 __SE_ELETYPE SE_ELETYPE = c7x::se_eletype<vec>::value;
83 __SE_VECLEN SE_VECLEN = c7x::se_veclen<vec>::value;
84 __SA_VECLEN SA_VECLEN = c7x::sa_veclen<vec>::value;
// Strides expressed in elements; presumably strideQ/strideR are byte strides - confirm.
85 int32_t colStrideQ = strideQ /
sizeof(dataType);
86 int32_t colStrideR = strideR /
sizeof(dataType);
// nVec vectors cover nRows elements. They are split into two parts: the
// first gets nVec / 2, the second gets the rest (one more when nVec is odd).
88 int32_t nVec = DSPLIB_ceilingDiv(nRows, eleCount);
89 int32_t se0TICNT2 = nVec / 2;
90 int32_t se1TICNT2 = nVec - se0TICNT2;
// SE0 template: 3D, strided by colStrideQ, ICNT2 = se0TICNT2.
// ICNT0 is not set in the visible lines.
92 se0Params.ICNT1 = eleCount;
93 se0Params.DIM1 = colStrideQ;
94 se0Params.DIM2 = eleCount * colStrideQ;
95 se0Params.DIMFMT = __SE_DIMFMT_3D;
96 se0Params.ELETYPE = SE_ELETYPE;
97 se0Params.VECLEN = SE_VECLEN;
98 se0Params.ICNT2 = se0TICNT2;
// Transpose granularity: 32-bit for 4-byte element types. The 64-bit
// assignment is presumably the else branch; the `else` line itself is not
// in this excerpt.
99 if (
sizeof(dataType) == 4) {
100 se0Params.TRANSPOSE = __SE_TRANSPOSE_32BIT;
103 se0Params.TRANSPOSE = __SE_TRANSPOSE_64BIT;
// SE1 template: same layout as SE0 but with ICNT2 = se1TICNT2.
106 se1Params.ICNT2 = se1TICNT2;
107 se1Params.ICNT1 = eleCount;
108 se1Params.DIM1 = colStrideQ;
109 se1Params.DIM2 = eleCount * colStrideQ;
110 se1Params.DIMFMT = __SE_DIMFMT_3D;
111 se1Params.ELETYPE = SE_ELETYPE;
112 se1Params.VECLEN = SE_VECLEN;
// Same element-size dependent transpose selection as for SE0.
113 if (
sizeof(dataType) == 4) {
114 se1Params.TRANSPOSE = __SE_TRANSPOSE_32BIT;
117 se1Params.TRANSPOSE = __SE_TRANSPOSE_64BIT;
// SA2 template: 2D, one element per access, ICNT0 = nRows, ICNT1 = se1TICNT2.
// No DIM1 is assigned in the visible lines.
121 sa2Params.DIMFMT = __SA_DIMFMT_2D;
122 sa2Params.VECLEN = __SA_VECLEN_1ELEM;
123 sa2Params.ICNT0 = nRows;
124 sa2Params.ICNT1 = se1TICNT2;
// SA0/SA1 templates: 1D vector accesses sized to the two parts of the split,
// se0TICNT2 * eleCount elements and the remaining nRows - se0TICNT2 * eleCount.
126 sa0Params.ICNT0 = se0TICNT2 * eleCount;
127 sa0Params.DIMFMT = __SA_DIMFMT_1D;
128 sa0Params.VECLEN = SA_VECLEN;
130 sa1Params.DIMFMT = __SA_DIMFMT_1D;
131 sa1Params.VECLEN = SA_VECLEN;
132 sa1Params.ICNT0 = nRows - (se0TICNT2 * eleCount);
// Row split for se2Params: se0ICNT1 receives the extra row when nRows is odd.
134 int32_t se1ICNT1 = nRows / 2;
135 int32_t se0ICNT1 = nRows - se1ICNT1;
// SE2 template: 3D, row step of 2 * colStrideR, DECDIM1 tied to DIM2.
// NOTE(review): lenTile is not declared in the visible lines of this excerpt.
138 se2Params.ICNT0 = (eleCount * lenTile);
139 se2Params.DIM1 = colStrideR * 2;
140 se2Params.DIM2 = (eleCount * lenTile);
141 se2Params.ICNT1 = se0ICNT1;
142 se2Params.DIMFMT = __SE_DIMFMT_3D;
143 se2Params.ELETYPE = SE_ELETYPE;
144 se2Params.VECLEN = SE_VECLEN;
145 se2Params.DECDIM1 = __SE_DECDIM_DIM2;
// SA5 template: 1D, a single vector's worth of elements.
147 sa5Params.ICNT0 = eleCount;
148 sa5Params.DIMFMT = __SA_DIMFMT_1D;
149 sa5Params.VECLEN = SA_VECLEN;
// Store the templates at multiples of SE_PARAM_SIZE from pBlock.
// Slot 10 is not written in the visible lines.
151 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE)) = se0Params;
152 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE)) = se1Params;
153 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (6 * SE_PARAM_SIZE)) = sa0Params;
154 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (7 * SE_PARAM_SIZE)) = sa1Params;
155 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (8 * SE_PARAM_SIZE)) = sa2Params;
156 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (9 * SE_PARAM_SIZE)) = se2Params;
157 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (11 * SE_PARAM_SIZE)) = sa5Params;
// Templates for 8-vector-wide tiles, stored in slots 13 (scalar SE),
// 14 (matrix SE) and 15 (matrix SA) of the parameter block.
159 int32_t lenTile8 = 8;
160 __SE_TEMPLATE_v1 seScalarParams = __gen_SE_TEMPLATE_v1();
161 __SE_TEMPLATE_v1 seMatrixParams = __gen_SE_TEMPLATE_v1();
163 __SA_TEMPLATE_v1 saMatrixParams = __gen_SA_TEMPLATE_v1();
165 __SE_ELEDUP SE_ELEDUP = c7x::se_eledup<dataType>::value;
// Scalar stream: 2D with DIM1 = 0 and the element-duplication mode taken
// from c7x::se_eledup<dataType>. ICNT0/ICNT1 are not set in the visible lines.
167 seScalarParams.DIM1 = 0;
168 seScalarParams.ELEDUP = SE_ELEDUP;
169 seScalarParams.DIMFMT = __SE_DIMFMT_2D;
170 seScalarParams.VECLEN = SE_VECLEN;
171 seScalarParams.ELETYPE = SE_ELETYPE;
// Matrix read stream: eleCount * 8 elements per row segment, rows colStrideR
// apart, tiles eleCount * 8 elements apart, DECDIM1 tied to DIM2.
173 seMatrixParams.ICNT0 = (eleCount * lenTile8);
174 seMatrixParams.DIM1 = colStrideR;
175 seMatrixParams.DIM2 = (eleCount * lenTile8);
176 seMatrixParams.DIMFMT = __SE_DIMFMT_3D;
177 seMatrixParams.ELETYPE = SE_ELETYPE;
178 seMatrixParams.VECLEN = SE_VECLEN;
179 seMatrixParams.DECDIM1 = __SE_DECDIM_DIM2;
// Matrix address generator: same geometry as the matrix read stream.
181 saMatrixParams.ICNT0 = (eleCount * lenTile8);
182 saMatrixParams.DIM1 = colStrideR;
183 saMatrixParams.DIM2 = (eleCount * lenTile8);
184 saMatrixParams.DIMFMT = __SA_DIMFMT_3D;
185 saMatrixParams.VECLEN = SA_VECLEN;
186 saMatrixParams.DECDIM1 = __SA_DECDIM_DIM2;
// Store the three templates in slots 13, 14 and 15.
188 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (13 * SE_PARAM_SIZE)) = seScalarParams;
189 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (14 * SE_PARAM_SIZE)) = seMatrixParams;
191 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (15 * SE_PARAM_SIZE)) = saMatrixParams;
// Templates for 4-vector-wide tiles. The matrix SE/SA pair is strided by
// colStrideQ and stored in slots 16/17; a 4-wide variant of se2Params
// (row step 2 * colStrideR) is stored in slot 12.
193 int32_t lenTile4 = 4;
// NOTE(review): seScalarParams and SE_ELEDUP are reset here but, in the
// visible lines, neither is modified further nor stored again.
194 seScalarParams = __gen_SE_TEMPLATE_v1();
195 seMatrixParams = __gen_SE_TEMPLATE_v1();
197 saMatrixParams = __gen_SA_TEMPLATE_v1();
199 SE_ELEDUP = c7x::se_eledup<dataType>::value;
// Matrix read stream: eleCount * 4 elements per row segment, rows colStrideQ apart.
201 seMatrixParams.ICNT0 = (eleCount * lenTile4);
202 seMatrixParams.DIM1 = colStrideQ;
203 seMatrixParams.DIM2 = (eleCount * lenTile4);
204 seMatrixParams.DIMFMT = __SE_DIMFMT_3D;
205 seMatrixParams.ELETYPE = SE_ELETYPE;
206 seMatrixParams.VECLEN = SE_VECLEN;
207 seMatrixParams.DECDIM1 = __SE_DECDIM_DIM2;
// Matrix address generator with the same geometry.
209 saMatrixParams.ICNT0 = (eleCount * lenTile4);
210 saMatrixParams.DIM1 = colStrideQ;
211 saMatrixParams.DIM2 = (eleCount * lenTile4);
212 saMatrixParams.DIMFMT = __SA_DIMFMT_3D;
213 saMatrixParams.VECLEN = SA_VECLEN;
214 saMatrixParams.DECDIM1 = __SA_DECDIM_DIM2;
216 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (16 * SE_PARAM_SIZE)) = seMatrixParams;
217 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (17 * SE_PARAM_SIZE)) = saMatrixParams;
// Rebuild se2Params from defaults for 4-wide tiles; the row step stays at
// 2 * colStrideR and ICNT1 at se0ICNT1.
219 se2Params = __gen_SE_TEMPLATE_v1();
220 se2Params.ICNT0 = (eleCount * lenTile4);
221 se2Params.DIM1 = colStrideR * 2;
222 se2Params.DIM2 = (eleCount * lenTile4);
223 se2Params.ICNT1 = se0ICNT1;
224 se2Params.DIMFMT = __SE_DIMFMT_3D;
225 se2Params.ELETYPE = SE_ELETYPE;
226 se2Params.VECLEN = SE_VECLEN;
227 se2Params.DECDIM1 = __SE_DECDIM_DIM2;
228 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (12 * SE_PARAM_SIZE)) = se2Params;
// Start of a templated routine whose signature lines are not part of this
// excerpt. pBlock, pLocalR, pLocalU, pSum, nRows, nCols and colStrideR are
// used below but defined in lines not shown.
241 template <
typename dataType>
253 typedef typename c7x::make_full_vector<dataType>::type vec;
// All R accesses in the visible code start one element past pLocalR.
254 dataType *rStore = pLocalR + 1;
256 int32_t eleCount = c7x::element_count_of<vec>::value;
257 int32_t lenTile8 = 8;
258 int32_t lenTile4 = 4;
259 int32_t lenTile2 = 2;
260 int32_t lenTile1 = 1;
// Split the ceil(nCols / eleCount) vector columns into tiles of 8, 4, 2 and
// 1 vectors, largest first; after the subtractions nTiles1 holds the
// leftover single-vector tiles.
262 int32_t nTiles1 = DSPLIB_ceilingDiv(nCols, (eleCount));
263 int32_t nTiles8 = nTiles1 / lenTile8;
264 nTiles1 -= nTiles8 * lenTile8;
265 int32_t nTiles4 = nTiles1 / lenTile4;
266 nTiles1 -= nTiles4 * lenTile4;
267 int32_t nTiles2 = nTiles1 / lenTile2;
268 nTiles1 -= nTiles2 * lenTile2;
// Columns covered by each tile width, each clamped to the columns still
// remaining so that the four limits add up to nCols. They are used as
// DECDIM1_WIDTH values and as column offsets for later stream opens.
270 int32_t remainingCols = nCols;
271 int32_t colLimit8 = nTiles8 * lenTile8 * eleCount;
272 colLimit8 = (remainingCols < (colLimit8)) ? remainingCols : colLimit8;
274 remainingCols = remainingCols - colLimit8;
275 int32_t colLimit4 = nTiles4 * lenTile4 * eleCount;
276 colLimit4 = (remainingCols < (colLimit4)) ? remainingCols : colLimit4;
278 remainingCols = remainingCols - colLimit4;
279 int32_t colLimit2 = nTiles2 * lenTile2 * eleCount;
280 colLimit2 = (remainingCols < (colLimit2)) ? remainingCols : colLimit2;
282 int32_t colLimit1 = remainingCols - colLimit2;
284 __SE_TEMPLATE_v1 se0Params;
285 __SE_TEMPLATE_v1 se1Params;
// Both streaming engines start from the same stored template (slot 9).
287 se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (9 * SE_PARAM_SIZE));
288 se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (9 * SE_PARAM_SIZE));
290 __SA_TEMPLATE_v1 sa0Params;
291 __SA_TEMPLATE_v1 sa1Params;
// Address generator templates come from slots 8 and 11.
293 sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (8 * SE_PARAM_SIZE));
294 sa1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (11 * SE_PARAM_SIZE));
// SA1 spans nCols elements; the tile loops use it for the stores to pSum.
296 sa1Params.ICNT0 = nCols;
298 __SA1_OPEN(sa1Params);
// SA0 covers nRows elements per tile, for every tile of every width; the
// tile loops use it to address pLocalU.
301 sa0Params.ICNT0 = nRows;
302 sa0Params.ICNT1 = nTiles8 + nTiles4 + nTiles2 + nTiles1;
303 __SA0_OPEN(sa0Params);
// Rows are divided between the two engines; SE0 takes the extra row when
// nRows is odd.
305 int32_t se1ICNT1 = nRows / 2;
306 int32_t se0ICNT1 = nRows - se1ICNT1;
307 se0Params.ICNT1 = se0ICNT1;
308 se1Params.ICNT1 = se1ICNT1;
// Configure both engines for the 8-vector-wide tiles.
311 se0Params.ICNT2 = nTiles8;
312 se0Params.DECDIM1_WIDTH = colLimit8;
313 se0Params.ICNT0 = (eleCount * lenTile8);
315 se1Params.ICNT2 = nTiles8;
316 se1Params.DECDIM1_WIDTH = colLimit8;
317 se1Params.ICNT0 = (eleCount * lenTile8);
// SE1 starts one row stride (colStrideR elements) after SE0.
337 __SE0_OPEN(rStore, se0Params);
338 __SE1_OPEN(rStore + colStrideR, se1Params);
// 8-vector-wide tiles. For each tile the rows are consumed two at a time:
// SE0 supplies eight vectors, SA0 addresses one element of pLocalU that is
// broadcast with __vload_dup, then SE1 and a second pLocalU element follow.
// NOTE(review): the statements that combine these values into sV01..sV08
// (and declare those accumulators) are in lines not shown in this excerpt.
341 for (
int tile = 0; tile < nTiles8; tile++) {
360 for (int32_t vertical = 0; vertical < se1ICNT1; vertical++) {
361 vec v01 = c7x::strm_eng<0, vec>::get_adv();
362 vec v02 = c7x::strm_eng<0, vec>::get_adv();
363 vec v03 = c7x::strm_eng<0, vec>::get_adv();
364 vec v04 = c7x::strm_eng<0, vec>::get_adv();
365 vec v05 = c7x::strm_eng<0, vec>::get_adv();
366 vec v06 = c7x::strm_eng<0, vec>::get_adv();
367 vec v07 = c7x::strm_eng<0, vec>::get_adv();
368 vec v08 = c7x::strm_eng<0, vec>::get_adv();
// Scalar paired with the SE0 row.
370 dataType *pU = c7x::strm_agen<0, dataType>::get_adv(pLocalU);
371 vec uV1 = __vload_dup(pU);
373 vec v11 = c7x::strm_eng<1, vec>::get_adv();
374 vec v12 = c7x::strm_eng<1, vec>::get_adv();
375 vec v13 = c7x::strm_eng<1, vec>::get_adv();
376 vec v14 = c7x::strm_eng<1, vec>::get_adv();
377 vec v15 = c7x::strm_eng<1, vec>::get_adv();
378 vec v16 = c7x::strm_eng<1, vec>::get_adv();
379 vec v17 = c7x::strm_eng<1, vec>::get_adv();
380 vec v18 = c7x::strm_eng<1, vec>::get_adv();
// Scalar paired with the SE1 row.
382 pU = c7x::strm_agen<0, dataType>::get_adv(pLocalU);
383 vec uV2 = __vload_dup(pU);
// The row counts differ only when nRows is odd; SE0 then holds one more row
// that is consumed here together with its pLocalU scalar.
404 if (se1ICNT1 != se0ICNT1)
406 dataType *pU = c7x::strm_agen<0, dataType>::get_adv(pLocalU);
407 vec uV1 = __vload_dup(pU);
409 vec v01 = c7x::strm_eng<0, vec>::get_adv();
410 vec v02 = c7x::strm_eng<0, vec>::get_adv();
411 vec v03 = c7x::strm_eng<0, vec>::get_adv();
412 vec v04 = c7x::strm_eng<0, vec>::get_adv();
413 vec v05 = c7x::strm_eng<0, vec>::get_adv();
414 vec v06 = c7x::strm_eng<0, vec>::get_adv();
415 vec v07 = c7x::strm_eng<0, vec>::get_adv();
416 vec v08 = c7x::strm_eng<0, vec>::get_adv();
// Write the eight accumulators to pSum through SA1. Each store uses the
// predicate supplied by SA1 (presumably masking lanes past nCols - confirm).
446 __vpred pred = c7x::strm_agen<1, vec>::get_vpred();
447 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
448 __vstore_pred(pred, pStoreVec, sV01);
450 pred = c7x::strm_agen<1, vec>::get_vpred();
451 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
452 __vstore_pred(pred, pStoreVec, sV02);
454 pred = c7x::strm_agen<1, vec>::get_vpred();
455 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
456 __vstore_pred(pred, pStoreVec, sV03);
458 pred = c7x::strm_agen<1, vec>::get_vpred();
459 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
460 __vstore_pred(pred, pStoreVec, sV04);
462 pred = c7x::strm_agen<1, vec>::get_vpred();
463 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
464 __vstore_pred(pred, pStoreVec, sV05);
466 pred = c7x::strm_agen<1, vec>::get_vpred();
467 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
468 __vstore_pred(pred, pStoreVec, sV06);
470 pred = c7x::strm_agen<1, vec>::get_vpred();
471 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
472 __vstore_pred(pred, pStoreVec, sV07);
474 pred = c7x::strm_agen<1, vec>::get_vpred();
475 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
476 __vstore_pred(pred, pStoreVec, sV08);
// 4-vector-wide tiles: reuse the same two templates with the tile count,
// valid-column width and row length changed.
483 se0Params.ICNT2 = nTiles4;
484 se0Params.DECDIM1_WIDTH = colLimit4;
485 se0Params.ICNT0 = (eleCount * lenTile4);
487 se1Params.ICNT2 = nTiles4;
488 se1Params.DECDIM1_WIDTH = colLimit4;
489 se1Params.ICNT0 = (eleCount * lenTile4);
// Start after the columns handled by the 8-wide tiles; SE1 again begins one
// row stride after SE0.
501 dataType *pSE0 = rStore + colLimit8;
502 dataType *pSE1 = pSE0 + colStrideR;
504 __SE0_OPEN(pSE0, se0Params);
506 __SE1_OPEN(pSE1, se1Params);
// Same row-pair pattern as the 8-wide loop, with four vectors per row.
// NOTE(review): the accumulation into sV01..sV04 is in lines not shown here.
508 for (
int tile = 0; tile < nTiles4; tile++) {
520 for (int32_t vertical = 0; vertical < se1ICNT1; vertical++) {
522 vec v01 = c7x::strm_eng<0, vec>::get_adv();
523 vec v02 = c7x::strm_eng<0, vec>::get_adv();
524 vec v03 = c7x::strm_eng<0, vec>::get_adv();
525 vec v04 = c7x::strm_eng<0, vec>::get_adv();
526 dataType *pU = c7x::strm_agen<0, dataType>::get_adv(pLocalU);
527 vec uV1 = __vload_dup(pU);
529 vec v11 = c7x::strm_eng<1, vec>::get_adv();
530 vec v12 = c7x::strm_eng<1, vec>::get_adv();
531 vec v13 = c7x::strm_eng<1, vec>::get_adv();
532 vec v14 = c7x::strm_eng<1, vec>::get_adv();
534 pU = c7x::strm_agen<0, dataType>::get_adv(pLocalU);
535 vec uV2 = __vload_dup(pU);
// Extra SE0 row when nRows is odd.
548 if (se1ICNT1 != se0ICNT1)
550 dataType *pU = c7x::strm_agen<0, dataType>::get_adv(pLocalU);
551 vec uV1 = __vload_dup(pU);
553 vec v01 = c7x::strm_eng<0, vec>::get_adv();
554 vec v02 = c7x::strm_eng<0, vec>::get_adv();
555 vec v03 = c7x::strm_eng<0, vec>::get_adv();
556 vec v04 = c7x::strm_eng<0, vec>::get_adv();
// Predicated stores of the four accumulators to pSum through SA1.
574 __vpred pred = c7x::strm_agen<1, vec>::get_vpred();
575 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
576 __vstore_pred(pred, pStoreVec, sV01);
578 pred = c7x::strm_agen<1, vec>::get_vpred();
579 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
580 __vstore_pred(pred, pStoreVec, sV02);
582 pred = c7x::strm_agen<1, vec>::get_vpred();
583 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
584 __vstore_pred(pred, pStoreVec, sV03);
586 pred = c7x::strm_agen<1, vec>::get_vpred();
587 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
588 __vstore_pred(pred, pStoreVec, sV04);
// 2-vector-wide tiles: same templates, reconfigured for nTiles2 tiles of
// eleCount * 2 elements per row.
595 se0Params.ICNT2 = nTiles2;
596 se0Params.DECDIM1_WIDTH = colLimit2;
597 se0Params.ICNT0 = (eleCount * lenTile2);
599 se1Params.ICNT2 = nTiles2;
600 se1Params.DECDIM1_WIDTH = colLimit2;
601 se1Params.ICNT0 = (eleCount * lenTile2);
// Start after the columns handled by the 8- and 4-wide tiles.
609 dataType *pSE0 = rStore + colLimit8 + colLimit4;
610 dataType *pSE1 = pSE0 + colStrideR;
612 __SE0_OPEN(pSE0, se0Params);
613 __SE1_OPEN(pSE1, se1Params);
// Row-pair pattern with two vectors per row.
// NOTE(review): the accumulation into sV01/sV02 is in lines not shown here.
616 for (
int tile = 0; tile < nTiles2; tile++) {
623 for (int32_t vertical = 0; vertical < se1ICNT1; vertical++) {
624 vec v01 = c7x::strm_eng<0, vec>::get_adv();
625 vec v02 = c7x::strm_eng<0, vec>::get_adv();
627 dataType *pU = c7x::strm_agen<0, dataType>::get_adv(pLocalU);
628 vec uV1 = __vload_dup(pU);
630 vec v11 = c7x::strm_eng<1, vec>::get_adv();
631 vec v12 = c7x::strm_eng<1, vec>::get_adv();
633 pU = c7x::strm_agen<0, dataType>::get_adv(pLocalU);
634 vec uV2 = __vload_dup(pU);
// Extra SE0 row when nRows is odd.
643 if (se1ICNT1 != se0ICNT1)
645 dataType *pU = c7x::strm_agen<0, dataType>::get_adv(pLocalU);
646 vec uV1 = __vload_dup(pU);
648 vec v01 = c7x::strm_eng<0, vec>::get_adv();
649 vec v02 = c7x::strm_eng<0, vec>::get_adv();
// Predicated stores of the two accumulators to pSum through SA1.
661 __vpred pred = c7x::strm_agen<1, vec>::get_vpred();
662 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
663 __vstore_pred(pred, pStoreVec, sV01);
665 pred = c7x::strm_agen<1, vec>::get_vpred();
666 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
667 __vstore_pred(pred, pStoreVec, sV02);
// Single-vector tiles: the leftover columns. ICNT1 is assigned again here
// with the same row split used for the wider tiles.
673 se0Params.ICNT1 = se0ICNT1;
674 se0Params.ICNT2 = nTiles1;
675 se0Params.DECDIM1_WIDTH = colLimit1;
676 se0Params.ICNT0 = (eleCount * lenTile1);
678 se1Params.ICNT1 = se1ICNT1;
679 se1Params.ICNT2 = nTiles1;
680 se1Params.DECDIM1_WIDTH = colLimit1;
681 se1Params.ICNT0 = (eleCount * lenTile1);
// Start after the columns handled by the 8-, 4- and 2-wide tiles.
687 dataType *pSE0 = rStore + colLimit8 + colLimit4 + colLimit2;
688 dataType *pSE1 = pSE0 + colStrideR;
690 __SE0_OPEN(pSE0, se0Params);
691 __SE1_OPEN(pSE1, se1Params);
// Row-pair pattern with one vector per row.
// NOTE(review): the accumulation into sV01 is in lines not shown here.
694 for (
int tile = 0; tile < nTiles1; tile++) {
700 for (int32_t vertical = 0; vertical < se1ICNT1; vertical++) {
701 vec v01 = c7x::strm_eng<0, vec>::get_adv();
703 dataType *pU = c7x::strm_agen<0, dataType>::get_adv(pLocalU);
704 vec uV1 = __vload_dup(pU);
706 vec v11 = c7x::strm_eng<1, vec>::get_adv();
708 pU = c7x::strm_agen<0, dataType>::get_adv(pLocalU);
709 vec uV2 = __vload_dup(pU);
// Extra SE0 row when nRows is odd.
716 if (se1ICNT1 != se0ICNT1)
718 dataType *pU = c7x::strm_agen<0, dataType>::get_adv(pLocalU);
719 vec uV1 = __vload_dup(pU);
721 vec v01 = c7x::strm_eng<0, vec>::get_adv();
// Predicated store of the accumulator to pSum through SA1.
730 __vpred pred = c7x::strm_agen<1, vec>::get_vpred();
731 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
732 __vstore_pred(pred, pStoreVec, sV01);
// Update pass over R: every vector read from R has (pSum vector * broadcast
// pLocalU scalar) subtracted and is written back to the location SA1
// generates from the same base pointer.
// Templates: slot 13 (scalar SE), slot 14 (matrix SE), slot 15 (matrix SA).
740 __SE_TEMPLATE_v1 seScalarParams;
741 __SE_TEMPLATE_v1 seMatrixParams;
743 __SA_TEMPLATE_v1 saMatrixParams;
745 seScalarParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (13 * SE_PARAM_SIZE));
746 seMatrixParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (14 * SE_PARAM_SIZE));
747 saMatrixParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (15 * SE_PARAM_SIZE));
// NOTE(review): sa1Params.ICNT0 = nCols is assigned here and again two
// statements below; as far as this excerpt shows the repeat is redundant.
749 sa1Params.ICNT0 = nCols;
// The scalar stream delivers nRows values, once per tile of any width.
750 seScalarParams.ICNT0 = nRows;
751 seScalarParams.ICNT1 = nTiles8 + nTiles4 + nTiles2 + nTiles1;
753 sa1Params.ICNT0 = nCols;
// SA0 is opened with the sa1Params template (nCols elements) and is used
// below to address pSum; SE0 streams pLocalU with the scalar template.
755 __SA0_OPEN(sa1Params);
756 __SE0_OPEN(pLocalU, seScalarParams);
// Matrix read stream and matrix address generator both cover all nRows rows.
758 seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = nRows;
761 seMatrixParams.ICNT2 = nTiles8;
762 seMatrixParams.DECDIM1_WIDTH = colLimit8;
764 saMatrixParams.ICNT2 = nTiles8;
765 saMatrixParams.DECDIM1_WIDTH = colLimit8;
767 __SE1_OPEN(rStore, seMatrixParams);
768 __SA1_OPEN(saMatrixParams);
// 8-vector-wide tiles: load the eight pSum vectors for this tile once, then
// apply them to every row.
770 for (int32_t tile = 0; tile < nTiles8; tile++) {
771 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
772 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSum);
773 vec sV1 = __vload_pred(lPred, pLoadVec);
775 lPred = c7x::strm_agen<0, vec>::get_vpred();
776 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSum);
777 vec sV2 = __vload_pred(lPred, pLoadVec);
779 lPred = c7x::strm_agen<0, vec>::get_vpred();
780 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSum);
781 vec sV3 = __vload_pred(lPred, pLoadVec);
783 lPred = c7x::strm_agen<0, vec>::get_vpred();
784 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSum);
785 vec sV4 = __vload_pred(lPred, pLoadVec);
787 lPred = c7x::strm_agen<0, vec>::get_vpred();
788 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSum);
789 vec sV5 = __vload_pred(lPred, pLoadVec);
791 lPred = c7x::strm_agen<0, vec>::get_vpred();
792 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSum);
793 vec sV6 = __vload_pred(lPred, pLoadVec);
795 lPred = c7x::strm_agen<0, vec>::get_vpred();
796 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSum);
797 vec sV7 = __vload_pred(lPred, pLoadVec);
799 lPred = c7x::strm_agen<0, vec>::get_vpred();
800 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSum);
801 vec sV8 = __vload_pred(lPred, pLoadVec);
// Per row: one vector from the scalar stream, eight vectors of R.
803 for (int32_t vertical = 0; vertical < nRows; vertical++) {
805 vec scalarDup1 = c7x::strm_eng<0, vec>::get_adv();
807 vec v1 = c7x::strm_eng<1, vec>::get_adv();
808 vec v2 = c7x::strm_eng<1, vec>::get_adv();
809 vec v3 = c7x::strm_eng<1, vec>::get_adv();
810 vec v4 = c7x::strm_eng<1, vec>::get_adv();
811 vec v5 = c7x::strm_eng<1, vec>::get_adv();
812 vec v6 = c7x::strm_eng<1, vec>::get_adv();
813 vec v7 = c7x::strm_eng<1, vec>::get_adv();
814 vec v8 = c7x::strm_eng<1, vec>::get_adv();
// Subtract the scaled pSum vectors from the row.
816 v1 -= sV1 * scalarDup1;
817 v2 -= sV2 * scalarDup1;
818 v3 -= sV3 * scalarDup1;
819 v4 -= sV4 * scalarDup1;
820 v5 -= sV5 * scalarDup1;
821 v6 -= sV6 * scalarDup1;
822 v7 -= sV7 * scalarDup1;
823 v8 -= sV8 * scalarDup1;
// Predicated write-back through SA1, based at rStore like the SE1 reads.
825 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
826 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(rStore);
827 __vstore_pred(sPred, pStoreVec, v1);
829 sPred = c7x::strm_agen<1, vec>::get_vpred();
830 pStoreVec = c7x::strm_agen<1, vec>::get_adv(rStore);
831 __vstore_pred(sPred, pStoreVec, v2);
833 sPred = c7x::strm_agen<1, vec>::get_vpred();
834 pStoreVec = c7x::strm_agen<1, vec>::get_adv(rStore);
835 __vstore_pred(sPred, pStoreVec, v3);
837 sPred = c7x::strm_agen<1, vec>::get_vpred();
838 pStoreVec = c7x::strm_agen<1, vec>::get_adv(rStore);
839 __vstore_pred(sPred, pStoreVec, v4);
841 sPred = c7x::strm_agen<1, vec>::get_vpred();
842 pStoreVec = c7x::strm_agen<1, vec>::get_adv(rStore);
843 __vstore_pred(sPred, pStoreVec, v5);
845 sPred = c7x::strm_agen<1, vec>::get_vpred();
846 pStoreVec = c7x::strm_agen<1, vec>::get_adv(rStore);
847 __vstore_pred(sPred, pStoreVec, v6);
849 sPred = c7x::strm_agen<1, vec>::get_vpred();
850 pStoreVec = c7x::strm_agen<1, vec>::get_adv(rStore);
851 __vstore_pred(sPred, pStoreVec, v7);
853 sPred = c7x::strm_agen<1, vec>::get_vpred();
854 pStoreVec = c7x::strm_agen<1, vec>::get_adv(rStore);
855 __vstore_pred(sPred, pStoreVec, v8);
// 4-vector-wide tiles of the update pass: the matrix SE and SA templates are
// reconfigured in lockstep (row length, tile count, valid-column width).
865 seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile4;
866 seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles4;
867 seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit4;
// R access starts after the columns handled by the 8-wide tiles. SA0 is not
// reopened in the visible lines, so the pSum loads use the same base pointer.
869 dataType *pSE1 = rStore + colLimit8;
870 dataType *pSA0 = pSum;
872 __SE1_OPEN(pSE1, seMatrixParams);
873 __SA1_OPEN(saMatrixParams);
// Load the four pSum vectors for this tile, then apply them to every row.
875 for (int32_t tile = 0; tile < nTiles4; tile++) {
876 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
877 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
878 vec sV1 = __vload_pred(lPred, pLoadVec);
880 lPred = c7x::strm_agen<0, vec>::get_vpred();
881 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
882 vec sV2 = __vload_pred(lPred, pLoadVec);
884 lPred = c7x::strm_agen<0, vec>::get_vpred();
885 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
886 vec sV3 = __vload_pred(lPred, pLoadVec);
888 lPred = c7x::strm_agen<0, vec>::get_vpred();
889 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
890 vec sV4 = __vload_pred(lPred, pLoadVec);
// Per row: one vector from the scalar stream, four vectors of R.
892 for (int32_t vertical = 0; vertical < nRows; vertical++) {
893 vec scalarDup1 = c7x::strm_eng<0, vec>::get_adv();
895 vec v1 = c7x::strm_eng<1, vec>::get_adv();
896 vec v2 = c7x::strm_eng<1, vec>::get_adv();
897 vec v3 = c7x::strm_eng<1, vec>::get_adv();
898 vec v4 = c7x::strm_eng<1, vec>::get_adv();
900 v1 -= sV1 * scalarDup1;
901 v2 -= sV2 * scalarDup1;
902 v3 -= sV3 * scalarDup1;
903 v4 -= sV4 * scalarDup1;
// Predicated write-back through SA1, based at the same pointer SE1 reads from.
905 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
906 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
907 __vstore_pred(sPred, pStoreVec, v1);
909 sPred = c7x::strm_agen<1, vec>::get_vpred();
910 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
911 __vstore_pred(sPred, pStoreVec, v2);
913 sPred = c7x::strm_agen<1, vec>::get_vpred();
914 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
915 __vstore_pred(sPred, pStoreVec, v3);
917 sPred = c7x::strm_agen<1, vec>::get_vpred();
918 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
919 __vstore_pred(sPred, pStoreVec, v4);
// 2-vector-wide tiles of the update pass: matrix SE and SA templates are
// reconfigured together.
928 seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile2;
929 seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles2;
930 seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit2;
// R access starts after the columns handled by the 8- and 4-wide tiles.
932 dataType *pSE1 = rStore + colLimit8 + colLimit4;
933 dataType *pSA0 = pSum;
935 __SE1_OPEN(pSE1, seMatrixParams);
936 __SA1_OPEN(saMatrixParams);
// Load the two pSum vectors for this tile, then apply them to every row.
938 for (int32_t tile = 0; tile < nTiles2; tile++) {
939 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
940 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
941 vec sV1 = __vload_pred(lPred, pLoadVec);
943 lPred = c7x::strm_agen<0, vec>::get_vpred();
944 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
945 vec sV2 = __vload_pred(lPred, pLoadVec);
947 for (int32_t vertical = 0; vertical < nRows; vertical++) {
948 vec scalarDup1 = c7x::strm_eng<0, vec>::get_adv();
950 vec v1 = c7x::strm_eng<1, vec>::get_adv();
951 vec v2 = c7x::strm_eng<1, vec>::get_adv();
953 v1 -= sV1 * scalarDup1;
954 v2 -= sV2 * scalarDup1;
// Predicated write-back through SA1.
956 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
957 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
958 __vstore_pred(sPred, pStoreVec, v1);
960 sPred = c7x::strm_agen<1, vec>::get_vpred();
961 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
962 __vstore_pred(sPred, pStoreVec, v2);
// Single-vector tiles of the update pass: the leftover columns.
971 seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile1;
972 seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles1;
973 seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit1;
// R access starts after the columns handled by the 8-, 4- and 2-wide tiles.
975 dataType *pSE1 = rStore + colLimit8 + colLimit4 + colLimit2;
976 dataType *pSA0 = pSum;
978 __SE1_OPEN(pSE1, seMatrixParams);
979 __SA1_OPEN(saMatrixParams);
// One pSum vector per tile, applied to every row.
981 for (int32_t tile = 0; tile < nTiles1; tile++) {
982 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
983 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
984 vec sV1 = __vload_pred(lPred, pLoadVec);
986 for (int32_t vertical = 0; vertical < nRows; vertical++) {
987 vec scalarDup1 = c7x::strm_eng<0, vec>::get_adv();
989 vec v1 = c7x::strm_eng<1, vec>::get_adv();
990 v1 -= sV1 * scalarDup1;
// Predicated write-back through SA1.
992 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
993 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
994 __vstore_pred(sPred, pStoreVec, v1);
// Start of a second templated routine whose signature lines are not part of
// this excerpt. pBlock, pLocalQ, pLocalU, pSum, nCols, colStrideQ and scale
// are used below but defined in lines not shown.
1023 template <
typename dataType>
1035 typedef typename c7x::make_full_vector<dataType>::type vec;
1037 int32_t eleCount = c7x::element_count_of<vec>::value;
// Templates stored in slots 4-8: two SE templates and three SA templates.
1041 __SE_TEMPLATE_v1 se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE));
1042 __SE_TEMPLATE_v1 se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE));
1043 __SA_TEMPLATE_v1 sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (6 * SE_PARAM_SIZE));
1044 __SA_TEMPLATE_v1 sa1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (7 * SE_PARAM_SIZE));
1045 __SA_TEMPLATE_v1 sa2Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (8 * SE_PARAM_SIZE));
// Outer iteration counts are recovered from the stored ICNT2 fields.
1047 int32_t se0TICNT2 = se0Params.ICNT2;
1048 int32_t se1TICNT2 = se1Params.ICNT2;
// The innermost extent of both engines and of SA2 is nCols.
1050 se0Params.ICNT0 = se1Params.ICNT0 = nCols;
1052 sa2Params.ICNT0 = nCols;
// SE1 and SA1 start where SE0 and SA0 end: se0TICNT2 vector groups into
// pLocalQ and pSum respectively.
1054 dataType *pSE0 = pLocalQ;
1055 dataType *pSE1Local = pLocalQ + (se0TICNT2 * eleCount * colStrideQ);
1056 dataType *pSA0 = pSum;
1057 dataType *pSA1 = pSum + (se0TICNT2 * eleCount);
// scale converted to a full vector for the multiplies applied at store time.
1059 vec scaleV = (vec) (scale);
// SE1, SA1 and SA2 are always opened; SE0 and SA0 only when SE0 has work.
1061 __SA1_OPEN(sa1Params);
1062 __SA2_OPEN(sa2Params);
1063 __SE1_OPEN(pSE1Local, se1Params);
1064 if (se0TICNT2 > 0) {
1065 __SA0_OPEN(sa0Params);
1066 __SE0_OPEN(pSE0, se0Params);
// Paired pass: each outer iteration consumes nCols vectors from SE0 and from
// SE1, together with nCols scalars addressed through SA2 in pLocalU.
// NOTE(review): the accumulation into sV1/sV2 is in lines not shown here.
1069 for (int32_t verticalCnt = 0; verticalCnt < se0TICNT2; verticalCnt++) {
1079 int32_t horizontal = 0;
// Four columns per iteration.
1081 for (; horizontal < (nCols) -3; horizontal += 4) {
1083 vec v1 = c7x::strm_eng<0, vec>::get_adv();
1084 vec v2 = c7x::strm_eng<1, vec>::get_adv();
1086 vec v3 = c7x::strm_eng<0, vec>::get_adv();
1087 vec v4 = c7x::strm_eng<1, vec>::get_adv();
1089 vec v5 = c7x::strm_eng<0, vec>::get_adv();
1090 vec v6 = c7x::strm_eng<1, vec>::get_adv();
1092 vec v7 = c7x::strm_eng<0, vec>::get_adv();
1093 vec v8 = c7x::strm_eng<1, vec>::get_adv();
1095 dataType *pU1 = c7x::strm_agen<2, dataType>::get_adv(pLocalU);
1096 vec u1 = __vload_dup(pU1);
1098 dataType *pU2 = c7x::strm_agen<2, dataType>::get_adv(pLocalU);
1099 vec u2 = __vload_dup(pU2);
1101 dataType *pU3 = c7x::strm_agen<2, dataType>::get_adv(pLocalU);
1102 vec u3 = __vload_dup(pU3);
1104 dataType *pU4 = c7x::strm_agen<2, dataType>::get_adv(pLocalU);
1105 vec u4 = __vload_dup(pU4);
// Two columns per iteration for what the 4-wide loop left over.
1119 for (; horizontal < (nCols) -1; horizontal += 2) {
1120 vec v1 = c7x::strm_eng<0, vec>::get_adv();
1121 vec v2 = c7x::strm_eng<1, vec>::get_adv();
1123 vec v3 = c7x::strm_eng<0, vec>::get_adv();
1124 vec v4 = c7x::strm_eng<1, vec>::get_adv();
1126 dataType *pU1 = c7x::strm_agen<2, dataType>::get_adv(pLocalU);
1127 vec u1 = __vload_dup(pU1);
1129 dataType *pU2 = c7x::strm_agen<2, dataType>::get_adv(pLocalU);
1130 vec u2 = __vload_dup(pU2);
// Final single column, if any.
1140 for (; horizontal < (nCols); horizontal++) {
1141 vec v1 = c7x::strm_eng<0, vec>::get_adv();
1142 vec v2 = c7x::strm_eng<1, vec>::get_adv();
1144 dataType *pU1 = c7x::strm_agen<2, dataType>::get_adv(pLocalU);
1145 vec u1 = __vload_dup(pU1);
// Scale both results and store them to pSum through SA0 and SA1 with the
// predicates those address generators supply.
1154 __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
1155 vec *pStoreVec1 = c7x::strm_agen<0, vec>::get_adv(pSA0);
1156 __vstore_pred(pred1, pStoreVec1, sV1 * scaleV);
1158 __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
1159 vec *pStoreVec2 = c7x::strm_agen<1, vec>::get_adv(pSA1);
1160 __vstore_pred(pred2, pStoreVec2, sV2 * scaleV);
// SE1 has one more outer iteration than SE0 when the counts differ; that
// last group is processed from SE1 alone.
1163 if (se1TICNT2 != se0TICNT2) {
1166 for (int32_t horizontal = 0; horizontal < nCols; horizontal++) {
1167 vec v2 = c7x::strm_eng<1, vec>::get_adv();
1169 dataType *pU1 = c7x::strm_agen<2, dataType>::get_adv(pLocalU);
1170 vec u1 = __vload_dup(pU1);
1174 __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
1175 vec *pStoreVec2 = c7x::strm_agen<1, vec>::get_adv(pSA1);
1176 __vstore_pred(pred2, pStoreVec2, sV2 * scaleV);
// NOTE(review): the body of this guard is not part of this excerpt.
1182 if (se0TICNT2 > 0) {
// Setup for the update pass over Q: pSum supplies the per-row scalar stream,
// pLocalU is addressed as vectors through SA0, and Q is read through SE1
// and addressed for write-back through SA1.
1188 dataType *qStore = pLocalQ;
1190 __SE_TEMPLATE_v1 seScalarParams;
1191 __SE_TEMPLATE_v1 seMatrixParams;
1193 __SA_TEMPLATE_v1 saMatrixParams;
1194 __SA_TEMPLATE_v1 saRefParams;
// Templates from slots 13, 16, 17 and 2.
// NOTE(review): no visible line of this excerpt writes slot 2.
1196 seScalarParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (13 * SE_PARAM_SIZE));
1197 seMatrixParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (16 * SE_PARAM_SIZE));
1198 saMatrixParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (17 * SE_PARAM_SIZE));
1199 saRefParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
1201 int32_t lenTile4 = 4;
1202 int32_t lenTile2 = 2;
1203 int32_t lenTile1 = 1;
// Split the ceil(nCols / eleCount) vector columns into tiles of 4, 2 and 1
// vectors, largest first.
1205 int32_t nTiles1 = DSPLIB_ceilingDiv(nCols, (eleCount));
1206 int32_t nTiles4 = nTiles1 / lenTile4;
1207 nTiles1 -= nTiles4 * lenTile4;
1208 int32_t nTiles2 = nTiles1 / lenTile2;
1209 nTiles1 -= nTiles2 * lenTile2;
// Columns covered by each tile width, clamped to the columns still remaining.
1211 int32_t remainingCols = nCols;
1212 int32_t colLimit4 = nTiles4 * lenTile4 * eleCount;
1213 colLimit4 = (remainingCols < (colLimit4)) ? remainingCols : colLimit4;
1215 remainingCols = remainingCols - colLimit4;
1216 int32_t colLimit2 = nTiles2 * lenTile2 * eleCount;
1217 colLimit2 = (remainingCols < (colLimit2)) ? remainingCols : colLimit2;
1219 int32_t colLimit1 = remainingCols - colLimit2;
// The scalar stream delivers nRows values, once per tile of any width.
1221 seScalarParams.ICNT0 = nRows;
1222 seScalarParams.ICNT1 = nTiles4 + nTiles2 + nTiles1;
1224 seMatrixParams.ICNT1 = nRows;
1225 saMatrixParams.ICNT1 = nRows;
1227 saRefParams.ICNT0 = nCols;
// SE0 streams pSum; SA0 uses the slot-2 template with ICNT0 = nCols.
1229 __SE0_OPEN(pSum, seScalarParams);
1230 __SA0_OPEN(saRefParams);
// Configure the matrix stream and address generator for the 4-wide tiles.
1232 seMatrixParams.ICNT2 = nTiles4;
1233 seMatrixParams.DECDIM1_WIDTH = colLimit4;
1235 saMatrixParams.ICNT2 = nTiles4;
1236 saMatrixParams.DECDIM1_WIDTH = colLimit4;
1238 __SE1_OPEN(qStore, seMatrixParams);
1239 __SA1_OPEN(saMatrixParams);
1241 for (int32_t tile = 0; tile < nTiles4; tile++) {
1242 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1243 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLocalU);
1244 vec sV1 = __vload_pred(lPred, pLoadVec);
1246 lPred = c7x::strm_agen<0, vec>::get_vpred();
1247 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLocalU);
1248 vec sV2 = __vload_pred(lPred, pLoadVec);
1250 lPred = c7x::strm_agen<0, vec>::get_vpred();
1251 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLocalU);
1252 vec sV3 = __vload_pred(lPred, pLoadVec);
1254 lPred = c7x::strm_agen<0, vec>::get_vpred();
1255 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLocalU);
1256 vec sV4 = __vload_pred(lPred, pLoadVec);
1258 int32_t vertical = 0;
1260 for (; vertical < nRows - 3; vertical += 4) {
1261 vec scalarDup1 = c7x::strm_eng<0, vec>::get_adv();
1262 vec scalarDup2 = c7x::strm_eng<0, vec>::get_adv();
1263 vec scalarDup3 = c7x::strm_eng<0, vec>::get_adv();
1264 vec scalarDup4 = c7x::strm_eng<0, vec>::get_adv();
1266 vec v1 = c7x::strm_eng<1, vec>::get_adv();
1267 vec v2 = c7x::strm_eng<1, vec>::get_adv();
1268 vec v3 = c7x::strm_eng<1, vec>::get_adv();
1269 vec v4 = c7x::strm_eng<1, vec>::get_adv();
1270 vec v5 = c7x::strm_eng<1, vec>::get_adv();
1271 vec v6 = c7x::strm_eng<1, vec>::get_adv();
1272 vec v7 = c7x::strm_eng<1, vec>::get_adv();
1273 vec v8 = c7x::strm_eng<1, vec>::get_adv();
1275 vec v9 = c7x::strm_eng<1, vec>::get_adv();
1276 vec v10 = c7x::strm_eng<1, vec>::get_adv();
1277 vec v11 = c7x::strm_eng<1, vec>::get_adv();
1278 vec v12 = c7x::strm_eng<1, vec>::get_adv();
1279 vec v13 = c7x::strm_eng<1, vec>::get_adv();
1280 vec v14 = c7x::strm_eng<1, vec>::get_adv();
1281 vec v15 = c7x::strm_eng<1, vec>::get_adv();
1282 vec v16 = c7x::strm_eng<1, vec>::get_adv();
1284 v1 -= sV1 * scalarDup1;
1285 v2 -= sV2 * scalarDup1;
1286 v3 -= sV3 * scalarDup1;
1287 v4 -= sV4 * scalarDup1;
1288 v5 -= sV1 * scalarDup2;
1289 v6 -= sV2 * scalarDup2;
1290 v7 -= sV3 * scalarDup2;
1291 v8 -= sV4 * scalarDup2;
1293 v9 -= sV1 * scalarDup3;
1294 v10 -= sV2 * scalarDup3;
1295 v11 -= sV3 * scalarDup3;
1296 v12 -= sV4 * scalarDup3;
1297 v13 -= sV1 * scalarDup4;
1298 v14 -= sV2 * scalarDup4;
1299 v15 -= sV3 * scalarDup4;
1300 v16 -= sV4 * scalarDup4;
1302 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1303 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1304 __vstore_pred(sPred, pStoreVec, v1);
1306 sPred = c7x::strm_agen<1, vec>::get_vpred();
1307 pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1308 __vstore_pred(sPred, pStoreVec, v2);
1310 sPred = c7x::strm_agen<1, vec>::get_vpred();
1311 pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1312 __vstore_pred(sPred, pStoreVec, v3);
1314 sPred = c7x::strm_agen<1, vec>::get_vpred();
1315 pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1316 __vstore_pred(sPred, pStoreVec, v4);
1318 sPred = c7x::strm_agen<1, vec>::get_vpred();
1319 pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1320 __vstore_pred(sPred, pStoreVec, v5);
1322 sPred = c7x::strm_agen<1, vec>::get_vpred();
1323 pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1324 __vstore_pred(sPred, pStoreVec, v6);
1326 sPred = c7x::strm_agen<1, vec>::get_vpred();
1327 pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1328 __vstore_pred(sPred, pStoreVec, v7);
1330 sPred = c7x::strm_agen<1, vec>::get_vpred();
1331 pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1332 __vstore_pred(sPred, pStoreVec, v8);
1334 sPred = c7x::strm_agen<1, vec>::get_vpred();
1335 pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1336 __vstore_pred(sPred, pStoreVec, v9);
1338 sPred = c7x::strm_agen<1, vec>::get_vpred();
1339 pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1340 __vstore_pred(sPred, pStoreVec, v10);
1342 sPred = c7x::strm_agen<1, vec>::get_vpred();
1343 pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1344 __vstore_pred(sPred, pStoreVec, v11);
1346 sPred = c7x::strm_agen<1, vec>::get_vpred();
1347 pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1348 __vstore_pred(sPred, pStoreVec, v12);
1350 sPred = c7x::strm_agen<1, vec>::get_vpred();
1351 pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1352 __vstore_pred(sPred, pStoreVec, v13);
1354 sPred = c7x::strm_agen<1, vec>::get_vpred();
1355 pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1356 __vstore_pred(sPred, pStoreVec, v14);
1358 sPred = c7x::strm_agen<1, vec>::get_vpred();
1359 pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1360 __vstore_pred(sPred, pStoreVec, v15);
1362 sPred = c7x::strm_agen<1, vec>::get_vpred();
1363 pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1364 __vstore_pred(sPred, pStoreVec, v16);
1367 for (; vertical < nRows; vertical++) {
1368 vec scalarDup1 = c7x::strm_eng<0, vec>::get_adv();
1370 vec v1 = c7x::strm_eng<1, vec>::get_adv();
1371 vec v2 = c7x::strm_eng<1, vec>::get_adv();
1372 vec v3 = c7x::strm_eng<1, vec>::get_adv();
1373 vec v4 = c7x::strm_eng<1, vec>::get_adv();
1375 v1 -= sV1 * scalarDup1;
1376 v2 -= sV2 * scalarDup1;
1377 v3 -= sV3 * scalarDup1;
1378 v4 -= sV4 * scalarDup1;
1380 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1381 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1382 __vstore_pred(sPred, pStoreVec, v1);
1384 sPred = c7x::strm_agen<1, vec>::get_vpred();
1385 pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1386 __vstore_pred(sPred, pStoreVec, v2);
1388 sPred = c7x::strm_agen<1, vec>::get_vpred();
1389 pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1390 __vstore_pred(sPred, pStoreVec, v3);
1392 sPred = c7x::strm_agen<1, vec>::get_vpred();
1393 pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1394 __vstore_pred(sPred, pStoreVec, v4);
1401 seMatrixParams.ICNT2 = nTiles2;
1402 seMatrixParams.DECDIM1_WIDTH = colLimit2;
1403 seMatrixParams.ICNT0 = (eleCount * lenTile2);
1405 saMatrixParams.ICNT2 = nTiles2;
1406 saMatrixParams.DECDIM1_WIDTH = colLimit2;
1407 saMatrixParams.ICNT0 = (eleCount * lenTile2);
1409 dataType *pSE1 = qStore + colLimit4;
1411 __SE1_OPEN(pSE1, seMatrixParams);
1412 __SA1_OPEN(saMatrixParams);
1414 for (int32_t tile = 0; tile < nTiles2; tile++) {
1415 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1416 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLocalU);
1417 vec sV1 = __vload_pred(lPred, pLoadVec);
1419 lPred = c7x::strm_agen<0, vec>::get_vpred();
1420 pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLocalU);
1421 vec sV2 = __vload_pred(lPred, pLoadVec);
1423 int32_t vertical = 0;
1425 for (; vertical < nRows - 3; vertical += 4) {
1426 vec scalarDup1 = c7x::strm_eng<0, vec>::get_adv();
1427 vec scalarDup2 = c7x::strm_eng<0, vec>::get_adv();
1428 vec scalarDup3 = c7x::strm_eng<0, vec>::get_adv();
1429 vec scalarDup4 = c7x::strm_eng<0, vec>::get_adv();
1431 vec v1 = c7x::strm_eng<1, vec>::get_adv();
1432 vec v2 = c7x::strm_eng<1, vec>::get_adv();
1433 vec v3 = c7x::strm_eng<1, vec>::get_adv();
1434 vec v4 = c7x::strm_eng<1, vec>::get_adv();
1435 vec v5 = c7x::strm_eng<1, vec>::get_adv();
1436 vec v6 = c7x::strm_eng<1, vec>::get_adv();
1437 vec v7 = c7x::strm_eng<1, vec>::get_adv();
1438 vec v8 = c7x::strm_eng<1, vec>::get_adv();
1440 v1 -= sV1 * scalarDup1;
1441 v2 -= sV2 * scalarDup1;
1442 v3 -= sV1 * scalarDup2;
1443 v4 -= sV2 * scalarDup2;
1444 v5 -= sV1 * scalarDup3;
1445 v6 -= sV2 * scalarDup3;
1446 v7 -= sV1 * scalarDup4;
1447 v8 -= sV2 * scalarDup4;
1449 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1450 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1451 __vstore_pred(sPred, pStoreVec, v1);
1453 sPred = c7x::strm_agen<1, vec>::get_vpred();
1454 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1455 __vstore_pred(sPred, pStoreVec, v2);
1457 sPred = c7x::strm_agen<1, vec>::get_vpred();
1458 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1459 __vstore_pred(sPred, pStoreVec, v3);
1461 sPred = c7x::strm_agen<1, vec>::get_vpred();
1462 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1463 __vstore_pred(sPred, pStoreVec, v4);
1465 sPred = c7x::strm_agen<1, vec>::get_vpred();
1466 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1467 __vstore_pred(sPred, pStoreVec, v5);
1469 sPred = c7x::strm_agen<1, vec>::get_vpred();
1470 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1471 __vstore_pred(sPred, pStoreVec, v6);
1473 sPred = c7x::strm_agen<1, vec>::get_vpred();
1474 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1475 __vstore_pred(sPred, pStoreVec, v7);
1477 sPred = c7x::strm_agen<1, vec>::get_vpred();
1478 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1479 __vstore_pred(sPred, pStoreVec, v8);
1482 for (; vertical < nRows; vertical++) {
1483 vec scalarDup1 = c7x::strm_eng<0, vec>::get_adv();
1485 vec v1 = c7x::strm_eng<1, vec>::get_adv();
1486 vec v2 = c7x::strm_eng<1, vec>::get_adv();
1488 v1 -= sV1 * scalarDup1;
1489 v2 -= sV2 * scalarDup1;
1491 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1492 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1493 __vstore_pred(sPred, pStoreVec, v1);
1495 sPred = c7x::strm_agen<1, vec>::get_vpred();
1496 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1497 __vstore_pred(sPred, pStoreVec, v2);
1504 seMatrixParams.ICNT2 = nTiles1;
1505 seMatrixParams.DECDIM1_WIDTH = colLimit1;
1506 seMatrixParams.ICNT0 = (eleCount * lenTile1);
1508 saMatrixParams.ICNT2 = nTiles1;
1509 saMatrixParams.DECDIM1_WIDTH = colLimit1;
1510 saMatrixParams.ICNT0 = (eleCount * lenTile1);
1512 dataType *pSE1 = qStore + colLimit4 + colLimit2;
1514 __SE1_OPEN(pSE1, seMatrixParams);
1515 __SA1_OPEN(saMatrixParams);
1517 for (int32_t tile = 0; tile < nTiles1; tile++) {
1518 __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1519 vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLocalU);
1520 vec sV1 = __vload_pred(lPred, pLoadVec);
1522 int32_t vertical = 0;
1524 for (; vertical < nRows - 3; vertical += 4) {
1525 vec scalarDup1 = c7x::strm_eng<0, vec>::get_adv();
1526 vec scalarDup2 = c7x::strm_eng<0, vec>::get_adv();
1527 vec scalarDup3 = c7x::strm_eng<0, vec>::get_adv();
1528 vec scalarDup4 = c7x::strm_eng<0, vec>::get_adv();
1530 vec v1 = c7x::strm_eng<1, vec>::get_adv();
1531 vec v2 = c7x::strm_eng<1, vec>::get_adv();
1532 vec v3 = c7x::strm_eng<1, vec>::get_adv();
1533 vec v4 = c7x::strm_eng<1, vec>::get_adv();
1535 v1 -= sV1 * scalarDup1;
1536 v2 -= sV1 * scalarDup2;
1537 v3 -= sV1 * scalarDup3;
1538 v4 -= sV1 * scalarDup4;
1540 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1541 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1542 __vstore_pred(sPred, pStoreVec, v1);
1544 sPred = c7x::strm_agen<1, vec>::get_vpred();
1545 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1546 __vstore_pred(sPred, pStoreVec, v2);
1548 sPred = c7x::strm_agen<1, vec>::get_vpred();
1549 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1550 __vstore_pred(sPred, pStoreVec, v3);
1552 sPred = c7x::strm_agen<1, vec>::get_vpred();
1553 pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1554 __vstore_pred(sPred, pStoreVec, v4);
1556 for (; vertical < nRows; vertical++) {
1557 vec scalarDup1 = c7x::strm_eng<0, vec>::get_adv();
1559 vec v1 = c7x::strm_eng<1, vec>::get_adv();
1561 v1 -= sV1 * scalarDup1;
1563 __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1564 vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1565 __vstore_pred(sPred, pStoreVec, v1);
template void DSPLIB_qrd_R_matrix_exec_ci< float >(float *pLocalR, float *pLocalU, float *sum, float scale, int32_t colStrideR, int32_t nRows, int32_t nCols, uint8_t *pBlock)
template void DSPLIB_Q_matrix_init_ci< float >(DSPLIB_kernelHandle handle)
void DSPLIB_R_column_init_ci(DSPLIB_kernelHandle handle)
void DSPLIB_qrd_Q_matrix_exec_ci(dataType *pLocalQ, dataType *pLocalU, dataType *pSum, dataType scale, int32_t colStrideQ, int32_t nRows, int32_t nCols, uint8_t *pBlock)
template void DSPLIB_qrd_Q_matrix_exec_ci< double >(double *pLocalQ, double *pLocalU, double *sum, double scale, int32_t colStrideQ, int32_t nRows, int32_t nCols, uint8_t *pBlock)
void DSPLIB_qrd_R_matrix_exec_ci(dataType *pLocalR, dataType *pLocalU, dataType *pSum, dataType scale, int32_t colStrideR, int32_t nRows, int32_t nCols, uint8_t *pBlock)
void DSPLIB_Q_matrix_init_ci(DSPLIB_kernelHandle handle)
template void DSPLIB_R_column_init_ci< double >(DSPLIB_kernelHandle handle)
template void DSPLIB_R_column_init_ci< float >(DSPLIB_kernelHandle handle)
template void DSPLIB_qrd_R_matrix_exec_ci< double >(double *pLocalR, double *pLocalU, double *sum, double scale, int32_t colStrideR, int32_t nRows, int32_t nCols, uint8_t *pBlock)
template void DSPLIB_Q_matrix_init_ci< double >(DSPLIB_kernelHandle handle)
template void DSPLIB_qrd_Q_matrix_exec_ci< float >(float *pLocalQ, float *pLocalU, float *sum, float scale, int32_t colStrideQ, int32_t nRows, int32_t nCols, uint8_t *pBlock)
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_qrd.
#define DSPLIB_DEBUGPRINTFN(N, fmt,...)
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
Structure that is reserved for internal use by the kernel.
int32_t strideR
Stride, in bytes, between rows of the R output data matrix.
uint32_t heightA
Height (number of rows) of the input data matrix.
uint8_t bufPblock[DSPLIB_QRD_IXX_IXX_OXX_PBLOCK_SIZE]
Buffer to save SE & SA configuration parameters.
int32_t strideQ
Stride, in bytes, between rows of the Q output data matrix.