41 #include "VXLIB_types.h"
42 #include "VXLIB_utility.h"
43 #include "c7x_scalable.h"
44 #include <type_traits>
// Offsets of the SE0, SE1 and SA0 parameter templates, starting at SE_PARAM_BASE.
// Each entry follows the previous one by VXLIB_SE_PARAM_SIZE.
// Presumably these index into pKerPrivArgs->bufPblock - TODO confirm against the lines not shown.
51 #define SE_PARAM_BASE (0x0000)
52 #define SE0_PARAM_OFFSET (SE_PARAM_BASE)
53 #define SE1_PARAM_OFFSET (SE0_PARAM_OFFSET + VXLIB_SE_PARAM_SIZE)
54 #define SA0_PARAM_OFFSET (SE1_PARAM_OFFSET + VXLIB_SE_PARAM_SIZE)
// Evaluates to eight times sizeof(uint64_t); going by its name, the space set aside for
// the 3x3 filter coefficients.
56 #define VXLIB_GAUSSIANFILTER3x3_COEFF_SIZE (sizeof(uint64_t) * 8)
// Fixed-point scale 2^Q_BITS. Q_BITS is defined on a line that is not part of this listing.
59 #define Q_FACTOR (1 << Q_BITS)
// Kernel initialization template (prologue). The signature and opening brace are not part of
// this listing; the visible statements read bufParamsIn, pKerInitArgs and pKerPrivArgs.
69 template <u
int32_t dTypeIn, u
int32_t dTypeOut>
// Streaming-engine (SE0, SE1) and address-generator (SA0) templates, starting from the
// generated defaults.
78 __SE_TEMPLATE_v1 se0Params = __gen_SE_TEMPLATE_v1();
79 __SE_TEMPLATE_v1 se1Params = __gen_SE_TEMPLATE_v1();
80 __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();
// Element type is taken from the full unsigned-char vector; both vector lengths are taken
// from the unsigned-char half vector.
83 __SE_ELETYPE SE_ELETYPE = c7x::se_eletype<c7x::uchar_vec>::value;
84 __SE_VECLEN SE_VECLEN = c7x::se_veclen<c7x::uchar_hvec>::value;
85 __SA_VECLEN SA_VECLEN = c7x::sa_veclen<c7x::uchar_hvec>::value;
// Number of elements in a char half vector; used below as the column-block width.
91 size_t elemCount = c7x::element_count_of<c7x::char_hvec>::value;
// Input image geometry.
94 size_t width = bufParamsIn->
dim_x;
95 size_t height = bufParamsIn->
dim_y;
96 size_t stride = bufParamsIn->
stride_y;
// Padding amounts requested by the caller.
99 size_t padLeft = pKerInitArgs->
padLeft;
100 size_t padRight = pKerInitArgs->
padRight;
101 size_t padTop = pKerInitArgs->
padTop;
102 size_t padBottom = pKerInitArgs->
padBottom;
// True only when all four padding amounts are zero.
105 bool isNotPadded = (padLeft == 0) && (padRight == 0) && (padTop == 0) && (padBottom == 0);
// Byte buffer inside the private args that receives data written through pBlock below.
107 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
// NOTE(review): this multiplies width by the number of elemCount-wide column blocks and
// does not use height. height * VXLIB_ceilingDiv(width, elemCount) may have been intended -
// confirm, since VXLIB_gaussian_perfEst derives its cycle estimate from numBlocks.
108 pKerPrivArgs->
numBlocks = width * VXLIB_ceilingDiv(width, elemCount);
// 3x3 path of the init routine. filterSize and filter_offset are set on lines not shown here.
// Coefficients as 16-bit values: {1, 2, 1} and {2, 4, 2}, each scaled by 2048.
116 int16_t gaussianFilter3x3[6] = {2048, 4096, 2048, 4096, 8192, 4096};
// SE0: 3D stream using SE_ELETYPE / SE_VECLEN, with 2x zero-extending promotion.
122 se0Params.DIMFMT = __SE_DIMFMT_3D;
123 se0Params.ELETYPE = SE_ELETYPE;
124 se0Params.VECLEN = SE_VECLEN;
125 se0Params.PROMOTE = __SE_PROMOTE_2X_ZEROEXT;
// Dimension 0: elemCount elements per fetch.
127 se0Params.ICNT0 = elemCount;
// Dimension 1: step by one row stride. The count is VXLIB_ceilingDiv(output height, filterSize)
// * filterSize plus (filterSize - 1) additional rows.
129 se0Params.DIM1 = stride;
130 se0Params.ICNT1 = (filterSize - 1) + VXLIB_ceilingDiv(bufParamsOut->
dim_y, filterSize) * filterSize;
// Dimension 2: step by elemCount, once per column block of the input width.
132 se0Params.DIM2 = elemCount;
133 se0Params.ICNT2 = VXLIB_ceilingDiv(width, elemCount);
// DECDIM1 is attached to DIM2 with width `width` and secondary dimension DIM0.
// Presumably this clips the last column block at the row end - TODO confirm.
136 se0Params.DECDIM1 = __SE_DECDIM_DIM2;
137 se0Params.DECDIM1_WIDTH = width;
138 se0Params.DECDIM1SD = __SE_DECDIMSD_DIM0;
// DECDIM2 is attached to DIM1 with width height * stride.
// Presumably this clips reads past the last input row - TODO confirm.
141 se0Params.DECDIM2 = __SE_DECDIM_DIM1;
142 se0Params.DECDIM2_WIDTH = height * stride;
// SE1: same layout as SE0 except for ICNT0 and DECDIM1_WIDTH.
145 se1Params.DIMFMT = __SE_DIMFMT_3D;
146 se1Params.ELETYPE = SE_ELETYPE;
147 se1Params.VECLEN = SE_VECLEN;
148 se1Params.PROMOTE = __SE_PROMOTE_2X_ZEROEXT;
// SE1 fetches nothing when the image is no wider than one column block.
150 se1Params.ICNT0 = (width <= elemCount) ? 0 : elemCount;
152 se1Params.DIM1 = stride;
153 se1Params.ICNT1 = (filterSize - 1) + VXLIB_ceilingDiv(bufParamsOut->
dim_y, filterSize) * filterSize;
155 se1Params.DIM2 = elemCount;
156 se1Params.ICNT2 = VXLIB_ceilingDiv(width, elemCount);
// NOTE(review): width and elemCount are both size_t, so width - elemCount wraps to a very
// large value when width < elemCount. ICNT0 is 0 in that case; confirm the wrapped width
// is harmless.
159 se1Params.DECDIM1 = __SE_DECDIM_DIM2;
160 se1Params.DECDIM1_WIDTH = width - elemCount;
161 se1Params.DECDIM1SD = __SE_DECDIMSD_DIM0;
164 se1Params.DECDIM2 = __SE_DECDIM_DIM1;
165 se1Params.DECDIM2_WIDTH = height * stride;
// SA0: 3D address generator for the output.
168 sa0Params.DIMFMT = __SA_DIMFMT_3D;
169 sa0Params.VECLEN = SA_VECLEN;
// Dimension 0: the smaller of the output width and elemCount.
171 sa0Params.ICNT0 = (bufParamsOut->
dim_x < elemCount) ? bufParamsOut->
dim_x : elemCount;
// Dimension 1: VXLIB_ceilingDiv(output height, filterSize) * filterSize rows.
// sa0Params.DIM1 is not set on any line visible in this listing.
173 sa0Params.ICNT1 = VXLIB_ceilingDiv(bufParamsOut->
dim_y, filterSize) * filterSize;
// Dimension 2: step by elemCount, once per column block of the output width.
175 sa0Params.DIM2 = elemCount;
176 sa0Params.ICNT2 = VXLIB_ceilingDiv(bufParamsOut->
dim_x, elemCount);
// DECDIM1 is attached to DIM2 with the output width; DECDIM2 is attached to DIM1 with
// output height * output stride.
179 sa0Params.DECDIM1 = __SA_DECDIM_DIM2;
180 sa0Params.DECDIM1_WIDTH = bufParamsOut->
dim_x;
181 sa0Params.DECDIM1SD = __SA_DECDIMSD_DIM0;
184 sa0Params.DECDIM2 = __SA_DECDIM_DIM1;
185 sa0Params.DECDIM2_WIDTH = bufParamsOut->
dim_y * bufParamsOut->
stride_y;
// Pack coefficients [0..2] into one 64-bit word, 16 bits each, with coefficient 0 in the
// lowest 16 bits, then store the word at pBlock + filter_offset.
201 uint64_t filter = gaussianFilter3x3[2];
202 filter = (filter << 16LLU) | (uint64_t)gaussianFilter3x3[1];
203 filter = (filter << 16LLU) | (uint64_t)gaussianFilter3x3[0];
207 *(uint64_t *) ((uint8_t *) pBlock + filter_offset) = filter;
// Same packing for coefficients [3..5]. filter_offset is presumably advanced on a line
// not shown - TODO confirm.
210 filter = gaussianFilter3x3[5];
211 filter = (filter << 16LLU) | (uint64_t)gaussianFilter3x3[4];
212 filter = (filter << 16LLU) | (uint64_t)gaussianFilter3x3[3];
216 *(uint64_t *) ((uint8_t *) pBlock + filter_offset) = filter;
// 5x5 path of the init routine. filterSize and filter_offset are set on lines not shown here.
// Fifteen 16-bit coefficients forming three rows of five:
// {1, 4, 6, 4, 1}, {4, 16, 24, 16, 4} and {6, 24, 36, 24, 6}.
227 int16_t gaussianFilter5x5[15] = {1, 4, 6, 4, 1, 4, 16, 24, 16, 4, 6, 24, 36, 24, 6};
// Number of elements in an unsigned-char quarter vector; used as the output block width.
231 size_t elemCountOut = c7x::element_count_of<c7x::uchar_qvec>::value;
// SE0: 4D stream using SE_ELETYPE / SE_VECLEN, with 2x zero-extending promotion.
234 se0Params.DIMFMT = __SE_DIMFMT_4D;
235 se0Params.ELETYPE = SE_ELETYPE;
236 se0Params.VECLEN = SE_VECLEN;
237 se0Params.PROMOTE = __SE_PROMOTE_2X_ZEROEXT;
// Dimension 0: elemCount elements per fetch.
240 se0Params.ICNT0 = elemCount;
// Dimension 1: filterSize rows, one row stride apart.
243 se0Params.DIM1 = stride;
244 se0Params.ICNT1 = filterSize;
// Dimension 2: move down one row stride, height - filterSize + 1 times.
247 se0Params.DIM2 = stride;
248 se0Params.ICNT2 = height - filterSize + 1;
// Dimension 3: step by half of elemCount, once per half-block of the input width.
251 se0Params.DIM3 = elemCount / 2;
252 se0Params.ICNT3 = VXLIB_ceilingDiv(width, (elemCount / 2));
// DECDIM1 is attached to DIM3 with width `width` and secondary dimension DIM0.
// Presumably this clips the last column block at the row end - TODO confirm.
255 se0Params.DECDIM1 = __SE_DECDIM_DIM3;
256 se0Params.DECDIM1_WIDTH = width;
257 se0Params.DECDIM1SD = __SE_DECDIMSD_DIM0;
// SA0: 3D address generator for the output.
260 sa0Params.DIMFMT = __SA_DIMFMT_3D;
261 sa0Params.VECLEN = SA_VECLEN;
// Dimension 0: elemCountOut elements per store.
264 sa0Params.ICNT0 = elemCountOut;
// Dimension 1: one entry per output row. sa0Params.DIM1 is not set on any visible line.
268 sa0Params.ICNT1 = bufParamsOut->
dim_y;
// Dimension 2: step by elemCountOut, once per column block of the output width.
271 sa0Params.DIM2 = elemCountOut;
272 sa0Params.ICNT2 = VXLIB_ceilingDiv(bufParamsOut->
dim_x, elemCountOut);
// DECDIM1 is attached to DIM2 with the output width and secondary dimension DIM0.
275 sa0Params.DECDIM1 = __SA_DECDIM_DIM2;
276 sa0Params.DECDIM1_WIDTH = bufParamsOut->
dim_x;
277 sa0Params.DECDIM1SD = __SA_DECDIMSD_DIM0;
// Each coefficient row is stored as two 64-bit words at pBlock + filter_offset: the first
// four coefficients packed 16 bits each (lowest index in the lowest bits), then the fifth
// coefficient alone. filter_offset is presumably advanced between stores on lines not shown.
// Row 0: coefficients [0..3], then [4].
293 uint64_t filter = gaussianFilter5x5[3];
294 filter = (filter << 16LLU) | (uint64_t)gaussianFilter5x5[2];
295 filter = (filter << 16LLU) | (uint64_t)gaussianFilter5x5[1];
296 filter = (filter << 16LLU) | (uint64_t)gaussianFilter5x5[0];
300 *(uint64_t *) ((uint8_t *) pBlock + filter_offset) = filter;
303 filter = gaussianFilter5x5[4];
306 *(uint64_t *) ((uint8_t *) pBlock + filter_offset) = filter;
// Row 1: coefficients [5..8], then [9].
321 filter = gaussianFilter5x5[8];
322 filter = (filter << 16LLU) | (uint64_t)gaussianFilter5x5[7];
323 filter = (filter << 16LLU) | (uint64_t)gaussianFilter5x5[6];
324 filter = (filter << 16LLU) | (uint64_t)gaussianFilter5x5[5];
328 *(uint64_t *) ((uint8_t *) pBlock + filter_offset) = filter;
331 filter = gaussianFilter5x5[9];
334 *(uint64_t *) ((uint8_t *) pBlock + filter_offset) = filter;
// Row 2: coefficients [10..13], then [14].
349 filter = gaussianFilter5x5[13];
350 filter = (filter << 16LLU) | (uint64_t)gaussianFilter5x5[12];
351 filter = (filter << 16LLU) | (uint64_t)gaussianFilter5x5[11];
352 filter = (filter << 16LLU) | (uint64_t)gaussianFilter5x5[10];
356 *(uint64_t *) ((uint8_t *) pBlock + filter_offset) = filter;
359 filter = gaussianFilter5x5[14];
362 *(uint64_t *) ((uint8_t *) pBlock + filter_offset) = filter;
// 5x5 execution template. The signature, braces and several statements are not part of this
// listing; filterSize, filterOffset and shift are defined on lines not shown.
404 template <
typename dTypeIn,
typename dTypeOut>
// Declared without initializers here; presumably filled from pBlock on lines not shown.
411 __SE_TEMPLATE_v1 se0Params;
412 __SA_TEMPLATE_v1 sa0Params;
// Typed views of the untyped input and output pointers.
415 dTypeIn *restrict pInLocal = (dTypeIn *) pIn;
416 dTypeOut *restrict pOutLocal = (dTypeOut *) pOut;
421 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
// Read one 64-bit coefficient word from pBlock, duplicate it across a vector, and view the
// result as 16-bit lanes. The pattern repeats for six coefficient vectors (lo/hi for three
// rows); scalarCoeff is presumably reloaded between uses on lines not shown - TODO confirm.
430 uint64_t scalarCoeff = *(uint64_t *) ((uint8_t *) pBlock + filterOffset);
433 c7x::ulong_vec ulongvCoeff = __vload_dup(&scalarCoeff);
436 c7x::short_vec vCoeff1_lo = c7x::reinterpret<c7x::short_vec>(ulongvCoeff);
440 ulongvCoeff = __vload_dup(&scalarCoeff);
441 c7x::short_vec vCoeff1_hi = c7x::reinterpret<c7x::short_vec>(ulongvCoeff);
444 ulongvCoeff = __vload_dup(&scalarCoeff);
445 c7x::short_vec vCoeff2_lo = c7x::reinterpret<c7x::short_vec>(ulongvCoeff);
448 ulongvCoeff = __vload_dup(&scalarCoeff);
449 c7x::short_vec vCoeff2_hi = c7x::reinterpret<c7x::short_vec>(ulongvCoeff);
452 ulongvCoeff = __vload_dup(&scalarCoeff);
453 c7x::short_vec vCoeff3_lo = c7x::reinterpret<c7x::short_vec>(ulongvCoeff);
456 ulongvCoeff = __vload_dup(&scalarCoeff);
457 c7x::short_vec vCoeff3_hi = c7x::reinterpret<c7x::short_vec>(ulongvCoeff);
// Open the input stream on SE0 and the output address generator on SA0.
460 __SE0_OPEN(pInLocal, se0Params);
461 __SA0_OPEN(sa0Params);
// Loop bounds: one column block per half of an unsigned-char half vector, and
// height - filterSize + 1 row positions.
465 size_t elemCountLoop = c7x::element_count_of<c7x::uchar_hvec>::value;
466 size_t wBlocks = VXLIB_ceilingDiv(pKerPrivArgs->
width, (elemCountLoop / 2));
467 size_t hBlocks = pKerPrivArgs->
height - filterSize + 1;
// Shift amount broadcast to every lane.
469 c7x::int_vec shiftVec = (c7x::int_vec) shift;
471 c7x::int_vec vResA, vResB, vResC, vResD, vResE, vResABCDE;
// NOTE(review): the int32_t loop counters are compared against size_t bounds.
474 for (int32_t wCounter = 0; wCounter < wBlocks; wCounter++) {
476 for (int32_t hCounter = 0; hCounter < hBlocks; hCounter++) {
// Five FIR operations, each reading SE0 with advance. The coefficient sets are applied in
// the order 1, 2, 3, 2, 1.
477 vResA = __vfir8hw_wvv(vCoeff1_lo, vCoeff1_hi, __SE_REG_0_ADV);
478 vResB = __vfir8hw_wvv(vCoeff2_lo, vCoeff2_hi, __SE_REG_0_ADV);
479 vResC = __vfir8hw_wvv(vCoeff3_lo, vCoeff3_hi, __SE_REG_0_ADV);
480 vResD = __vfir8hw_wvv(vCoeff2_lo, vCoeff2_hi, __SE_REG_0_ADV);
481 vResE = __vfir8hw_wvv(vCoeff1_lo, vCoeff1_hi, __SE_REG_0_ADV);
// Sum the five partial results and scale down by the shift amount.
483 vResABCDE = vResA + vResB + vResC + vResD + vResE;
484 vResABCDE = __shift_right(vResABCDE, shiftVec);
487 c7x::uint_vec vUResABCDE = c7x::convert<c7x::uint_vec>(vResABCDE);
// Predicated byte-packing store through SA0: fetch the predicate, fetch the address and
// advance the generator, then store.
489 __vpred tmp = c7x::strm_agen<0, c7x::uint_vec>::get_vpred();
490 c7x::uchar_qvec *addr = c7x::strm_agen<0, c7x::uchar_qvec>::get_adv(pOutLocal);
491 __vstore_pred_pack_byte(tmp, addr, vUResABCDE);
// 3x3 execution template. The signature, braces and several statements are not part of this
// listing; filterSize, filterOffset and Q_BITS are defined on lines not shown.
500 template <
typename dTypeIn,
typename dTypeOut>
// Declared without initializers here; presumably filled from pBlock on lines not shown.
507 __SE_TEMPLATE_v1 se0Params;
508 __SE_TEMPLATE_v1 se1Params;
509 __SA_TEMPLATE_v1 sa0Params;
// Typed views of the untyped input and output pointers.
512 dTypeIn *restrict pInLocal = (dTypeIn *) pIn;
513 dTypeOut *restrict pOutLocal = (dTypeOut *) pOut;
// Output is addressed as char half vectors.
516 typedef typename c7x::char_hvec out_hvec;
519 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
// Column-block width: the number of elements in a short vector.
520 size_t elemCount = c7x::element_count_of<c7x::short_vec>::value;
// Read a 64-bit coefficient word from pBlock, duplicate it across a vector, and view it as
// 16-bit lanes. scalarCoeff is presumably reloaded before vCoeff2 on lines not shown.
529 uint64_t scalarCoeff = *(uint64_t *) ((uint8_t *) pBlock + filterOffset);
532 c7x::ulong_vec ulongvCoeff = __vload_dup(&scalarCoeff);
535 c7x::short_vec vCoeff1 = c7x::reinterpret<c7x::short_vec>(ulongvCoeff);
539 ulongvCoeff = __vload_dup(&scalarCoeff);
540 c7x::short_vec vCoeff2 = c7x::reinterpret<c7x::short_vec>(ulongvCoeff);
// SE0 starts at the input base; SE1 starts elemCount elements later. SA0 drives the output.
543 __SE0_OPEN(pInLocal, se0Params);
544 __SE1_OPEN(pInLocal + elemCount, se1Params);
545 __SA0_OPEN(sa0Params);
// Loop bounds. Each inner iteration below performs three stores, which matches hBlocks
// counting groups of filterSize output rows.
548 size_t wBlocks = VXLIB_ceilingDiv(pKerPrivArgs->
width, elemCount);
549 size_t hBlocks = VXLIB_ceilingDiv((pKerPrivArgs->
height - filterSize + 1), filterSize);
// Q_BITS broadcast to every lane; used as the right-shift amount before each store.
551 c7x::int_vec vQ_BITS = (c7x::int_vec)
Q_BITS;
553 c7x::int_vec vResA_lo, vResA_hi, vResB_lo, vResB_hi, vResC_lo, vResC_hi;
554 c7x::int_vec vResAB_lo, vResAB_hi, vResBC_lo, vResBC_hi, vResCA_lo, vResCA_hi;
555 c7x::int_vec vResABC_lo, vResABC_hi, vResBCA_lo, vResBCA_hi, vResCAB_lo, vResCAB_hi;
// NOTE(review): the int32_t loop counters are compared against size_t bounds.
558 for (int32_t wCounter = 0; wCounter < wBlocks; wCounter++) {
// Prologue for a column block: seed the running sums vResAB and vResBC before the row loop.
// The _ADV form advances the stream pair after the read; the plain form reads without
// advancing.
560 __vfir4hw_vww(vCoeff1, __SE_REG_PAIR_0_ADV, vResA_lo, vResA_hi);
561 __vfir4hw_vww(vCoeff2, __SE_REG_PAIR_0, vResB_lo, vResB_hi);
563 vResAB_lo = vResA_lo + vResB_lo;
564 vResAB_hi = vResA_hi + vResB_hi;
565 __vfir4hw_vww(vCoeff1, __SE_REG_PAIR_0_ADV, vResB_lo, vResB_hi);
568 vResBC_lo = vResB_lo;
569 vResBC_hi = vResB_hi;
// Row loop: three running sums (AB, BC, CA) are completed and stored in rotation, so each
// FIR result is reused across neighbouring outputs. Statement order matters here.
571 for (int32_t hCounter = 0; hCounter < hBlocks; hCounter++) {
// First output of the group: complete AB with a vCoeff1 result, shift, store.
574 __vfir4hw_vww(vCoeff1, __SE_REG_PAIR_0, vResC_lo, vResC_hi);
577 vResABC_lo = vResAB_lo + vResC_lo;
578 vResABC_hi = vResAB_hi + vResC_hi;
581 vResABC_lo = __shift_right(vResABC_lo, vQ_BITS);
582 vResABC_hi = __shift_right(vResABC_hi, vQ_BITS);
585 __vpred tmp = c7x::strm_agen<0, c7x::char_vec>::get_vpred();
586 out_hvec *addr = c7x::strm_agen<0, out_hvec>::get_adv(pOutLocal);
587 __vstore_pred_pack_byte_2src(tmp, addr, vResABC_lo, vResABC_hi);
// The same vCoeff1 result starts the CA sum.
590 vResCA_lo = vResC_lo;
591 vResCA_hi = vResC_hi;
// Second output of the group: add a vCoeff2 result to BC, complete it with a vCoeff1
// result, shift, store.
594 __vfir4hw_vww(vCoeff2, __SE_REG_PAIR_0_ADV, vResC_lo, vResC_hi);
597 vResBC_lo += vResC_lo;
598 vResBC_hi += vResC_hi;
600 __vfir4hw_vww(vCoeff1, __SE_REG_PAIR_0, vResA_lo, vResA_hi);
602 vResBCA_lo = vResBC_lo + vResA_lo;
603 vResBCA_hi = vResBC_hi + vResA_hi;
605 vResBCA_lo = __shift_right(vResBCA_lo, vQ_BITS);
606 vResBCA_hi = __shift_right(vResBCA_hi, vQ_BITS);
609 tmp = c7x::strm_agen<0, c7x::char_vec>::get_vpred();
610 addr = c7x::strm_agen<0, out_hvec>::get_adv(pOutLocal);
611 __vstore_pred_pack_byte_2src(tmp, addr, vResBCA_lo, vResBCA_hi);
// Third output of the group: add a vCoeff2 result to CA.
613 __vfir4hw_vww(vCoeff2, __SE_REG_PAIR_0, vResA_lo, vResA_hi);
616 vResCA_lo += vResA_lo;
617 vResCA_hi += vResA_hi;
// Recompute vResA with vCoeff1 and advance; it feeds the next iteration's AB sum.
620 __vfir4hw_vww(vCoeff1, __SE_REG_PAIR_0_ADV, vResA_lo, vResA_hi);
// A vCoeff1 result on the following data both restarts BC and completes CA.
623 __vfir4hw_vww(vCoeff1, __SE_REG_PAIR_0, vResB_lo, vResB_hi);
624 vResBC_lo = vResB_lo;
625 vResBC_hi = vResB_hi;
627 vResCAB_lo = vResCA_lo + vResB_lo;
628 vResCAB_hi = vResCA_hi + vResB_hi;
630 vResCAB_lo = __shift_right(vResCAB_lo, vQ_BITS);
631 vResCAB_hi = __shift_right(vResCAB_hi, vQ_BITS);
// Rebuild AB for the next iteration before issuing the third store.
634 __vfir4hw_vww(vCoeff2, __SE_REG_PAIR_0_ADV, vResB_lo, vResB_hi);
637 vResAB_lo = vResA_lo + vResB_lo;
638 vResAB_hi = vResA_hi + vResB_hi;
641 tmp = c7x::strm_agen<0, c7x::char_vec>::get_vpred();
642 addr = c7x::strm_agen<0, out_hvec>::get_adv(pOutLocal);
643 __vstore_pred_pack_byte_2src(tmp, addr, vResCAB_lo, vResCAB_hi);
// Top-level execution template. Its signature and the condition that selects between the
// two calls below are not part of this listing.
653 template <
typename dTypeIn,
typename dTypeOut>
// Forward to the 3x3 or the 5x5 implementation. Both calls pass
// VXLIB_GAUSSIAN_TYPENAME_I8U_O8U as the template arguments instead of dTypeIn / dTypeOut.
665 VXLIB_gaussian_3x3_exec_ci<VXLIB_GAUSSIAN_TYPENAME_I8U_O8U>(handle, pIn, pOut);
668 VXLIB_gaussian_5x5_exec_ci<VXLIB_GAUSSIAN_TYPENAME_I8U_O8U>(handle, pIn, pOut);
// Tail of a declaration whose beginning is not visible here (presumably an explicit
// template instantiation - TODO confirm).
687 void *restrict pOut);
// Body fragment of the performance estimator. The signature and the definition of `ii` are
// on lines not part of this listing.
// Block count stored in the private args.
695 size_t numBlocks = pKerPrivArgs->
numBlocks;
// Fixed cycle constants used by the estimate.
696 size_t overheadCycles = 17;
697 size_t iterConst = 5;
// Architectural estimate: a constant plus `ii` cycles per block.
700 *archCycles = iterConst + numBlocks * ii;
// Overall estimate: the architectural estimate plus the fixed overhead.
701 *estCycles = overheadCycles + *archCycles;
void VXLIB_gaussian_3x3_exec_ci(VXLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
template VXLIB_STATUS VXLIB_gaussian_exec_ci< VXLIB_GAUSSIAN_TYPENAME_I8U_O8U >(VXLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
#define VXLIB_GAUSSIANFILTER3x3_COEFF_SIZE
template VXLIB_STATUS VXLIB_gaussian_init_ci< VXLIB_GAUSSIAN_DTYPE_I8U_O8U >(VXLIB_kernelHandle handle, const VXLIB_bufParams2D_t *bufParamsIn, const VXLIB_bufParams2D_t *bufParamsOut, const VXLIB_gaussian_InitArgs *pKerInitArgs)
VXLIB_STATUS VXLIB_gaussian_exec_ci(VXLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
This function is the main execution function for the C7x implementation of the kernel....
void VXLIB_gaussian_5x5_exec_ci(VXLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
VXLIB_STATUS VXLIB_gaussian_init_ci(VXLIB_kernelHandle handle, const VXLIB_bufParams2D_t *bufParamsIn, const VXLIB_bufParams2D_t *bufParamsOut, const VXLIB_gaussian_InitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
Header file for kernel's internal use. For the kernel's interface, please see VXLIB_gaussian.
#define VXLIB_GAUSSIAN_FILTER_5x5
Macros for 5x5 filter dimension.
#define VXLIB_GAUSSIAN_FILTER_3x3
Macros for 3x3 filter dimension.
void * VXLIB_kernelHandle
Handle type for VXLIB operations.
VXLIB_STATUS_NAME
The enumeration of all status codes.
@ VXLIB_ERR_NOT_IMPLEMENTED
void VXLIB_gaussian_perfEst(VXLIB_kernelHandle handle, size_t *archCycles, size_t *estCycles)
A structure for a 2 dimensional buffer descriptor.
uint32_t dim_y
Height of buffer in Y dimension in elements.
uint32_t dim_x
Width of buffer in X dimension in elements.
int32_t stride_y
Stride in Y dimension in bytes.
Structure containing the parameters to initialize the kernel.
uint8_t shift
Shift parameter for 5x5 filter
int8_t filterSize
Width and height of filter
int32_t padLeft
Padding options
Structure that is reserved for internal use by the kernel.
uint8_t bufPblock[VXLIB_GAUSSIAN_IXX_IXX_OXX_PBLOCK_SIZE]
Array to hold SE/SA params.
size_t width
Width of image
size_t height
Height of image
size_t strideOutElements
Stride of output in elements.
VXLIB_gaussian_InitArgs pKerInitArgs
Initargs of the kernel.
size_t numBlocks
Number of blocks to be processed after SIMDification.