// Fragment of the templated init routine. The function signature and several
// statements are missing from this listing. The visible part fills in the
// parameter templates for two streaming engines (SE0, SE1) and one streaming
// address generator (SA0) for element type dType.
// NOTE(review): strideInElements / strideOutElements are declared in lines not
// shown here.
48 template <
typename dType>
// Parameter-block buffer taken from the kernel's private arguments.
// NOTE(review): presumably the templates built below are written into this
// buffer in lines not shown; the exec path reads them back from
// pBlock + SE0/SE1/SA0_PARAM_OFFSET. Confirm.
60 uint8_t * pBlock = pKerPrivArgs->
bufPblock;
// Full-width vector type for dType.
63 typedef typename c7x::make_full_vector<dType>::type vec;
66 size_t width = pKerPrivArgs->
width;
67 size_t widthOut = pKerPrivArgs->
widthOut;
68 size_t heightOut = pKerPrivArgs->
heightOut;
// eleCount: elements per vector. numBlocks: number of vector-wide column
// blocks needed to cover widthOut (ceiling division).
71 uint32_t eleCount = c7x::element_count_of<vec>::value;
72 uint32_t numBlocks = (widthOut + eleCount - 1) / eleCount;
// Element type and vector length settings derived from the vector type.
73 __SE_ELETYPE SE_ELETYPE = c7x::se_eletype<vec>::value;
74 __SE_VECLEN SE_VECLEN = c7x::se_veclen<vec>::value;
75 __SA_VECLEN SA_VECLEN = c7x::sa_veclen<vec>::value;
// Start each template from the generated defaults.
78 __SE_TEMPLATE_v1 se0Params = __gen_SE_TEMPLATE_v1();
79 __SE_TEMPLATE_v1 se1Params = __gen_SE_TEMPLATE_v1();
80 __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();
// ---- SE0 ----
// One vector (eleCount elements) per fetch; heightOut steps of one input row
// stride; numBlocks steps of one vector width across the row.
// NOTE(review): ICNT1 and the 5th-dimension fields are not visible in this
// listing although DIMFMT is 5D; check the full source.
82 se0Params.ICNT0 = eleCount;
84 se0Params.DIM1 = strideInElements;
85 se0Params.ICNT2 = heightOut;
86 se0Params.DIM2 = strideInElements;
87 se0Params.ICNT3 = numBlocks;
88 se0Params.DIM3 = eleCount;
91 se0Params.DIMFMT = __SE_DIMFMT_5D;
92 se0Params.ELETYPE = SE_ELETYPE;
93 se0Params.VECLEN = SE_VECLEN;
// Decrement-dimension tracking on DIM3 (the column-block dimension) with the
// full input width. NOTE(review): presumably this limits the last, partial
// column block to the row width; confirm against the C7x SE documentation.
94 se0Params.DECDIM1 = __SE_DECDIM_DIM3;
95 se0Params.DECDIM1_WIDTH = width;
// ---- SE1 ----
// Same traversal as SE0 but with one extra inner dimension, so the row and
// column-block dimensions sit one level higher (DIM2/3/4 instead of DIM1/2/3).
// NOTE(review): ICNT1/DIM1/ICNT2 are not visible in this listing.
99 se1Params.ICNT0 = eleCount;
103 se1Params.DIM2 = strideInElements;
104 se1Params.ICNT3 = heightOut;
105 se1Params.DIM3 = strideInElements;
106 se1Params.ICNT4 = numBlocks;
107 se1Params.DIM4 = eleCount;
108 se1Params.DIMFMT = __SE_DIMFMT_5D;
109 se1Params.ELETYPE = SE_ELETYPE;
110 se1Params.VECLEN = SE_VECLEN;
// Width is width - 1 here; the exec path opens SE1 at pInLocal + 1, i.e. one
// element into the row.
111 se1Params.DECDIM1 = __SE_DECDIM_DIM4;
112 se1Params.DECDIM1SD = __SE_DECDIMSD_DIM1;
113 se1Params.DECDIM1_WIDTH = width - 1;
// ---- SA0 (output addressing) ----
// One vector per store; heightOut rows at the output stride; numBlocks column
// blocks of one vector width. Decrement-dimension tracking on DIM2 uses
// widthOut.
117 sa0Params.ICNT0 = eleCount;
118 sa0Params.ICNT1 = heightOut;
119 sa0Params.DIM1 = strideOutElements;
120 sa0Params.ICNT2 = numBlocks;
121 sa0Params.DIM2 = eleCount;
122 sa0Params.DIMFMT = __SA_DIMFMT_3D;
123 sa0Params.VECLEN = SA_VECLEN;
124 sa0Params.DECDIM1 = __SA_DECDIM_DIM2;
125 sa0Params.DECDIM1_WIDTH = widthOut;
// Fragment of the templated exec kernel. Most of the signature, the loop index
// declarations, the statement that produces `median`, and the closing braces
// are missing from this listing. The visible part opens the streams and, for
// every output vector, reads a 3x3 neighbourhood and reduces each row to its
// min / median / max.
171 template <
typename dType>
174 void *restrict pMask,
176 void *restrict pScratch)
// Typed views of the input and output buffers.
183 dType *pInLocal = (dType *) pIn;
184 dType *pOutLocal = (dType *) pOut;
186 typedef typename c7x::make_full_vector<dType>::type vec;
// Parameter-block buffer holding the SE/SA templates at fixed offsets.
188 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
// SE0 starts at the first input element.
190 __SE_TEMPLATE_v1 se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock +
SE0_PARAM_OFFSET);
191 __SE0_OPEN(pInLocal, se0Params);
// SE1 starts one element later; it is read twice per row below, supplying the
// values named col1 and col2.
193 __SE_TEMPLATE_v1 se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock +
SE1_PARAM_OFFSET);
194 __SE1_OPEN(pInLocal + 1, se1Params);
// SA0 generates the output store addresses and predicates.
196 __SA_TEMPLATE_v1 sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock +
SA0_PARAM_OFFSET);
197 __SA0_OPEN(sa0Params);
199 int32_t outCols = pKerPrivArgs->
widthOut;
200 int32_t outRows = pKerPrivArgs->
heightOut;
201 uint32_t eleCount = c7x::element_count_of<vec>::value;
// Number of vector-wide column blocks covering the output width (ceiling).
202 int32_t numBlocks = (outCols + eleCount - 1) / eleCount;
// Outer loop: column blocks. Inner loop: output rows within the block.
205 #pragma MUST_ITERATE(1, , )
206 for (i = 0; i < numBlocks; i++) {
207 #pragma MUST_ITERATE(1, , )
208 for (j = 0; j < outRows; j++) {
// Row 0: one read from SE0 and two consecutive reads from SE1.
220 vec row0_col0 = c7x::strm_eng<0, vec>::get_adv();
221 vec row0_col1 = c7x::strm_eng<1, vec>::get_adv();
222 vec row0_col2 = c7x::strm_eng<1, vec>::get_adv();
// Order the first two values, then fold in the third to get the row's
// min and max.
224 vec row0_01_min =
MIN_VAL(row0_col0, row0_col1);
225 vec row0_01_max =
MAX_VAL(row0_col0, row0_col1);
228 vec row0_012_min =
MIN_VAL(row0_01_min, row0_col2);
229 vec row0_012_max =
MAX_VAL(row0_01_max, row0_col2);
// Median of three: max(min(max01, c2), min01).
232 vec row0_012_med =
MAX_VAL(
MIN_VAL(row0_01_max, row0_col2), row0_01_min);
// Row 1: same read pattern and min / max / median reduction.
235 vec row1_col0 = c7x::strm_eng<0, vec>::get_adv();
236 vec row1_col1 = c7x::strm_eng<1, vec>::get_adv();
237 vec row1_col2 = c7x::strm_eng<1, vec>::get_adv();
239 vec row1_01_min =
MIN_VAL(row1_col0, row1_col1);
240 vec row1_01_max =
MAX_VAL(row1_col0, row1_col1);
243 vec row1_012_min =
MIN_VAL(row1_01_min, row1_col2);
244 vec row1_012_max =
MAX_VAL(row1_01_max, row1_col2);
247 vec row1_012_med =
MAX_VAL(
MIN_VAL(row1_01_max, row1_col2), row1_01_min);
// Row 2: same read pattern and min / max / median reduction.
250 vec row2_col0 = c7x::strm_eng<0, vec>::get_adv();
251 vec row2_col1 = c7x::strm_eng<1, vec>::get_adv();
252 vec row2_col2 = c7x::strm_eng<1, vec>::get_adv();
254 vec row2_01_min =
MIN_VAL(row2_col0, row2_col1);
255 vec row2_01_max =
MAX_VAL(row2_col0, row2_col1);
258 vec row2_012_min =
MIN_VAL(row2_01_min, row2_col2);
259 vec row2_012_max =
MAX_VAL(row2_01_max, row2_col2);
262 vec row2_012_med =
MAX_VAL(
MIN_VAL(row2_01_max, row2_col2), row2_01_min);
// Largest of the three row minima.
276 vec col0_012_max =
MAX_VAL(
MAX_VAL(row0_012_min, row1_012_min), row2_012_min);
// Smallest of the three row maxima.
279 vec col2_012_min =
MIN_VAL(
MIN_VAL(row0_012_max, row1_012_max), row2_012_max);
// NOTE(review): `median` is computed in lines missing from this listing,
// presumably from the row medians and the two values above; confirm in the
// full source.
// Predicated store: SA0 supplies both the predicate and the output address.
290 __vpred vpStore = c7x::strm_agen<0, vec>::get_vpred();
291 vec * vStore = c7x::strm_agen<0, vec>::get_adv(pOutLocal);
292 __vstore_pred(vpStore, vStore, median);
309 void *restrict pMask,
311 void *restrict pScratch);
314 void *restrict pMask,
316 void *restrict pScratch);
319 void *restrict pMask,
321 void *restrict pScratch);
324 void *restrict pMask,
326 void *restrict pScratch);
// Fragment of the cycle-estimation routine. The signature, the data-type
// branches, the branch bodies and the closing braces are missing from this
// listing. It writes an architectural cycle estimate to *archCycles and an
// overall estimate (with fixed overhead) to *estCycles.
352 uint32_t eleCount = 0;
// Elements per vector for each supported element type.
// NOTE(review): the four typedefs share the name `vec`, so presumably each one
// sits in its own data-type branch whose lines are not shown here.
354 typedef typename c7x::make_full_vector<uint8_t>::type vec;
355 eleCount = c7x::element_count_of<vec>::value;
358 typedef typename c7x::make_full_vector<int8_t>::type vec;
359 eleCount = c7x::element_count_of<vec>::value;
362 typedef typename c7x::make_full_vector<uint16_t>::type vec;
363 eleCount = c7x::element_count_of<vec>::value;
366 typedef typename c7x::make_full_vector<int16_t>::type vec;
367 eleCount = c7x::element_count_of<vec>::value;
// Fixed overhead added on top of the architectural estimate.
370 size_t overheadCnt = 55;
// Estimate that scales with the number of output vectors
// (rows x column blocks): 7 cycles of setup plus 8 per output vector.
// NOTE(review): presumably the 3x3 fast-path branch; the condition is not
// shown in this listing.
374 uint32_t numBlocks = (pKerPrivArgs->
heightOut) * VXLIB_ceilingDiv(pKerPrivArgs->
widthOut, eleCount);
375 *archCycles = 7 + numBlocks * 8;
// Estimate for the general M x N path, built from per-loop cycle counts.
379 uint32_t M = pKerPrivArgs->
M;
380 uint32_t N = pKerPrivArgs->
N;
381 uint32_t width = pKerPrivArgs->
width;
382 uint32_t widthOut = pKerPrivArgs->
widthOut;
383 uint32_t heightOut = pKerPrivArgs->
heightOut;
// Unroll factor chosen from the input width in vectors.
// NOTE(review): unrollFactor starts at 0 and is used as a divisor below. The
// assignments inside these branches are not shown; confirm every path sets a
// non-zero value.
384 uint32_t unrollFactor = 0;
385 if (width <= eleCount) {
388 else if (width <= 3 * eleCount) {
391 else if (width <= 6 * eleCount) {
// Loop over the M x N mask entries: 8 cycles of setup + 2 per entry.
399 size_t trueCntLoopCnt = M * N;
400 size_t trueCntCycles = 8 + trueCntLoopCnt * 2;
// Scratch fill: one iteration per mask entry per input vector.
403 size_t scratchLoop = M * N * VXLIB_ceilingDiv(width, eleCount);
404 size_t scratchFillCycles = 4 + scratchLoop * 2;
// Sort stage, outermost loop: ceil(M*N / 2) iterations.
407 size_t loop1Count = VXLIB_ceilingDiv((M * N), 2);
408 size_t loop1Exclusive = 0;
// Sort stage, middle loop: input vectors divided by the unroll factor.
410 size_t widthBlocks = VXLIB_ceilingDiv(width, eleCount);
411 size_t loop2Count = VXLIB_ceilingDiv(widthBlocks, unrollFactor);
412 size_t loop2Exclusive = 0;
// Sort stage, innermost loop count.
// trueCntLoopCnt is unsigned, so it must be compared before subtracting:
// `trueCntLoopCnt - 2 > 0` wraps around for values below 2 and would yield a
// loop count near SIZE_MAX / 2. Results for values >= 2 are unchanged.
414 size_t loop3Count = ((trueCntLoopCnt > 2) ? ((trueCntLoopCnt - 2) / 2) : 1);
// Per-iteration cost of the innermost loop grows with the unroll factor.
416 size_t loop3Cycles = 0;
417 if (unrollFactor == 1) {
418 loop3Cycles = 2 + loop3Count * 2;
422 else if (unrollFactor == 2) {
423 loop3Cycles = 2 + loop3Count * 4;
427 else if (unrollFactor == 4) {
428 loop3Cycles = 3 + loop3Count * 8;
433 loop3Cycles = 4 + loop3Count * 16;
// Roll the nested loop costs up from the inside out.
438 size_t loop2Cycles = loop2Count * (loop2Exclusive + loop3Cycles);
439 size_t sortCycles = loop1Count * (loop1Exclusive + loop2Cycles);
// Store loop: one cycle per output vector plus 6 cycles of setup.
442 size_t storeLoopCnt = VXLIB_ceilingDiv(widthOut, eleCount);
443 size_t storeCycles = 6 + storeLoopCnt * 1;
// Per-output-row cost outside the inner loops.
445 size_t outerLoopExclusive = 25;
// Mask-entry loop once, then per output row: fill + sort + store.
447 *archCycles = trueCntCycles + heightOut * (outerLoopExclusive + scratchFillCycles + sortCycles + storeCycles);
// Overall estimate = fixed overhead + architectural cycles.
450 *estCycles = overheadCnt + *archCycles;
void * VXLIB_kernelHandle
Handle type for VXLIB operations.
VXLIB_STATUS_NAME
The enumeration of all status codes.
#define VXLIB_DEBUGPRINTFN(N, fmt,...)
A structure for a 2 dimensional buffer descriptor.
uint32_t data_type
Values are of type VXLIB_data_type_e.