VXLIB User Guide
VXLIB_gaussian_ci.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (C) 2023 Texas Instruments Incorporated - https://www.ti.com/
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  *
11  * Redistributions in binary form must reproduce the above copyright
12  * notice, this list of conditions and the following disclaimer in the
13  * documentation and/or other materials provided with the
14  * distribution.
15  *
16  * Neither the name of Texas Instruments Incorporated nor the names of
17  * its contributors may be used to endorse or promote products derived
18  * from this software without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  *
32  ******************************************************************************/
33 
34 /**********************************************************************************************************************/
35 /* */
36 /* INCLUDES */
37 /* */
38 /**********************************************************************************************************************/
39 
40 #include "VXLIB_gaussian_priv.h"
41 #include "VXLIB_types.h"
42 #include "VXLIB_utility.h"
43 #include "c7x_scalable.h"
44 #include <type_traits>
45 /**********************************************************************************************************************/
46 /* */
47 /* DEFINES */
48 /* */
49 /**********************************************************************************************************************/
50 
// byte offsets of the SE0, SE1 and SA0 parameter templates inside the kernel's parameter block (bufPblock);
// they are laid out back to back, each offset advancing by VXLIB_SE_PARAM_SIZE
51 #define SE_PARAM_BASE (0x0000)
52 #define SE0_PARAM_OFFSET (SE_PARAM_BASE)
53 #define SE1_PARAM_OFFSET (SE0_PARAM_OFFSET + VXLIB_SE_PARAM_SIZE)
54 #define SA0_PARAM_OFFSET (SE1_PARAM_OFFSET + VXLIB_SE_PARAM_SIZE)
55 
// byte step between consecutive packed 64-bit coefficient words in the parameter block;
// used by both the 3x3 and the 5x5 paths despite the 3x3 in its name
56 #define VXLIB_GAUSSIANFILTER3x3_COEFF_SIZE (sizeof(uint64_t) * 8)
57 
// number of fractional bits of the 3x3 coefficients; the 3x3 exec path shifts its sums right by Q_BITS
58 #define Q_BITS 15
59 #define Q_FACTOR (1 << Q_BITS)
60 
61 /**********************************************************************************************************************/
62 /* */
63 /* VXLIB_gaussian_init_ci */
64 /* */
65 /**********************************************************************************************************************/
66 
67 // this method initializes the kernel-specific parameters
68 // mainly, the streaming engine and streaming address generators
69 template <uint32_t dTypeIn, uint32_t dTypeOut>
71  const VXLIB_bufParams2D_t *bufParamsIn,
72  const VXLIB_bufParams2D_t *bufParamsOut,
73  const VXLIB_gaussian_InitArgs *pKerInitArgs)
74 {
75  VXLIB_STATUS status = VXLIB_SUCCESS; // assign status to success by default
76 
77  // structs to hold SE and SA parameters
78  __SE_TEMPLATE_v1 se0Params = __gen_SE_TEMPLATE_v1();
79  __SE_TEMPLATE_v1 se1Params = __gen_SE_TEMPLATE_v1();
80  __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();
81 
82  // fetch + SE promote, store all half SIMD width
83  __SE_ELETYPE SE_ELETYPE = c7x::se_eletype<c7x::uchar_vec>::value;
84  __SE_VECLEN SE_VECLEN = c7x::se_veclen<c7x::uchar_hvec>::value;
85  __SA_VECLEN SA_VECLEN = c7x::sa_veclen<c7x::uchar_hvec>::value;
86 
87  // typecast handle (void) to struct pointer type associated to kernel
88  VXLIB_gaussian_PrivArgs *pKerPrivArgs = (VXLIB_gaussian_PrivArgs *) handle;
89 
90  // number of elements fetched from input each SE get
91  size_t elemCount = c7x::element_count_of<c7x::char_hvec>::value;
92 
93  // get init args
94  size_t width = bufParamsIn->dim_x;
95  size_t height = bufParamsIn->dim_y;
96  size_t stride = bufParamsIn->stride_y;
97 
98  size_t filterSize = pKerInitArgs->filterSize;
99  size_t padLeft = pKerInitArgs->padLeft;
100  size_t padRight = pKerInitArgs->padRight;
101  size_t padTop = pKerInitArgs->padTop;
102  size_t padBottom = pKerInitArgs->padBottom;
103 
104  // determine non-padded vs padded implementation
105  bool isNotPadded = (padLeft == 0) && (padRight == 0) && (padTop == 0) && (padBottom == 0);
106 
107  uint8_t *pBlock = pKerPrivArgs->bufPblock;
108  pKerPrivArgs->numBlocks = width * VXLIB_ceilingDiv(width, elemCount);
109 
110  // setup se/sa for k x k implementation
111  if (filterSize == VXLIB_GAUSSIAN_FILTER_3x3) {
112 
113  // Creating gaussian filter 3x3 coefficient matrix
114  // gaussianFilter3x3 = [[1, 2, 1], [2, 4, 2], [1, 2, 1]] * 1/16
115  // gaussianFilter3x3 = gaussianFilter3x3 * Q_FACTOR
116  int16_t gaussianFilter3x3[6] = {2048, 4096, 2048, 4096, 8192, 4096};
117 
118  // call 3x3 gaussian init non-padded implementation
119  if (isNotPadded) {
120 
121  // set SE0 Params
122  se0Params.DIMFMT = __SE_DIMFMT_3D;
123  se0Params.ELETYPE = SE_ELETYPE;
124  se0Params.VECLEN = SE_VECLEN;
125  se0Params.PROMOTE = __SE_PROMOTE_2X_ZEROEXT;
126 
127  se0Params.ICNT0 = elemCount;
128 
129  se0Params.DIM1 = stride;
130  se0Params.ICNT1 = (filterSize - 1) + VXLIB_ceilingDiv(bufParamsOut->dim_y, filterSize) * filterSize;
131 
132  se0Params.DIM2 = elemCount;
133  se0Params.ICNT2 = VXLIB_ceilingDiv(width, elemCount);
134 
135  // input column dimension DECDIM
136  se0Params.DECDIM1 = __SE_DECDIM_DIM2;
137  se0Params.DECDIM1_WIDTH = width;
138  se0Params.DECDIM1SD = __SE_DECDIMSD_DIM0;
139 
140  // input row dimension DECDIM
141  se0Params.DECDIM2 = __SE_DECDIM_DIM1;
142  se0Params.DECDIM2_WIDTH = height * stride;
143 
144  // set SE1 Params
145  se1Params.DIMFMT = __SE_DIMFMT_3D;
146  se1Params.ELETYPE = SE_ELETYPE;
147  se1Params.VECLEN = SE_VECLEN;
148  se1Params.PROMOTE = __SE_PROMOTE_2X_ZEROEXT;
149 
150  se1Params.ICNT0 = (width <= elemCount) ? 0 : elemCount;
151 
152  se1Params.DIM1 = stride;
153  se1Params.ICNT1 = (filterSize - 1) + VXLIB_ceilingDiv(bufParamsOut->dim_y, filterSize) * filterSize;
154 
155  se1Params.DIM2 = elemCount;
156  se1Params.ICNT2 = VXLIB_ceilingDiv(width, elemCount);
157 
158  // input column dimension DECDIM
159  se1Params.DECDIM1 = __SE_DECDIM_DIM2;
160  se1Params.DECDIM1_WIDTH = width - elemCount;
161  se1Params.DECDIM1SD = __SE_DECDIMSD_DIM0;
162 
163  // input row dimension DECDIM
164  se1Params.DECDIM2 = __SE_DECDIM_DIM1;
165  se1Params.DECDIM2_WIDTH = height * stride;
166 
167  // set SA0 Params
168  sa0Params.DIMFMT = __SA_DIMFMT_3D;
169  sa0Params.VECLEN = SA_VECLEN;
170 
171  sa0Params.ICNT0 = (bufParamsOut->dim_x < elemCount) ? bufParamsOut->dim_x : elemCount;
172  sa0Params.DIM1 = pKerPrivArgs->strideOutElements;
173  sa0Params.ICNT1 = VXLIB_ceilingDiv(bufParamsOut->dim_y, filterSize) * filterSize;
174 
175  sa0Params.DIM2 = elemCount;
176  sa0Params.ICNT2 = VXLIB_ceilingDiv(bufParamsOut->dim_x, elemCount);
177 
178  // output column dimension DECDIM
179  sa0Params.DECDIM1 = __SA_DECDIM_DIM2;
180  sa0Params.DECDIM1_WIDTH = bufParamsOut->dim_x;
181  sa0Params.DECDIM1SD = __SA_DECDIMSD_DIM0;
182 
183  // output row dimension DECDIM
184  sa0Params.DECDIM2 = __SA_DECDIM_DIM1;
185  sa0Params.DECDIM2_WIDTH = bufParamsOut->dim_y * bufParamsOut->stride_y;
186 
187  // store gaussian filter 3x3 coefficients
188  // 1st and 3rd rows of matrix are same, only store once
189 
190  /* 0 64b */
191  /* +-----------------------+ */
192  /* | 2048 | 4096 | 2048 | 0 | */
193  /* +-----------------------+ */
194 
195  /* 0 64b */
196  /* +-----------------------+ */
197  /* | 4096 | 8192 | 4096 | 0 | */
198  /* +-----------------------+ */
199 
200  // Store first row of coefficient matrix
201  uint64_t filter = gaussianFilter3x3[2];
202  filter = (filter << 16LLU) | (uint64_t)gaussianFilter3x3[1];
203  filter = (filter << 16LLU) | (uint64_t)gaussianFilter3x3[0];
204 
205  size_t filter_offset = SA0_PARAM_OFFSET + VXLIB_GAUSSIANFILTER3x3_COEFF_SIZE;
206 
207  *(uint64_t *) ((uint8_t *) pBlock + filter_offset) = filter;
208 
209  // Store second row of the coefficient matrix
210  filter = gaussianFilter3x3[5];
211  filter = (filter << 16LLU) | (uint64_t)gaussianFilter3x3[4];
212  filter = (filter << 16LLU) | (uint64_t)gaussianFilter3x3[3];
213 
214  filter_offset = filter_offset + VXLIB_GAUSSIANFILTER3x3_COEFF_SIZE;
215 
216  *(uint64_t *) ((uint8_t *) pBlock + filter_offset) = filter;
217  }
218 
219  // return error for padded version; not implemented
220  else {
221 
222  status = VXLIB_ERR_NOT_IMPLEMENTED;
223  }
224  }
225 
226  else if (filterSize == VXLIB_GAUSSIAN_FILTER_5x5) {
227  int16_t gaussianFilter5x5[15] = {1, 4, 6, 4, 1, 4, 16, 24, 16, 4, 6, 24, 36, 24, 6};
228 
229  // call 5x5 gaussian init non-padded implementation
230  if (isNotPadded) {
231  size_t elemCountOut = c7x::element_count_of<c7x::uchar_qvec>::value;
232 
233  // set SE0 Params
234  se0Params.DIMFMT = __SE_DIMFMT_4D;
235  se0Params.ELETYPE = SE_ELETYPE;
236  se0Params.VECLEN = SE_VECLEN;
237  se0Params.PROMOTE = __SE_PROMOTE_2X_ZEROEXT;
238 
239  // 0: number of elements to be computed per SE fetch
240  se0Params.ICNT0 = elemCount;
241 
242  // 1: number of accumulation rows in single convolution window
243  se0Params.DIM1 = stride;
244  se0Params.ICNT1 = filterSize;
245 
246  // 2: vertical iterations
247  se0Params.DIM2 = stride;
248  se0Params.ICNT2 = height - filterSize + 1;
249 
250  // 3: horizontal iterations
251  se0Params.DIM3 = elemCount / 2; // vfir8 is half-vector fetch, quarter-vec adv
252  se0Params.ICNT3 = VXLIB_ceilingDiv(width, (elemCount / 2));
253 
254  // input column dimension DECDIM
255  se0Params.DECDIM1 = __SE_DECDIM_DIM3;
256  se0Params.DECDIM1_WIDTH = width;
257  se0Params.DECDIM1SD = __SE_DECDIMSD_DIM0;
258 
259  // set SA0 Params
260  sa0Params.DIMFMT = __SA_DIMFMT_3D;
261  sa0Params.VECLEN = SA_VECLEN;
262 
263  // 0: number of elements to store per SE fetch
264  sa0Params.ICNT0 = elemCountOut;
265 
266  // 1: vertical iterations (1 store per convolution window)
267  sa0Params.DIM1 = pKerPrivArgs->strideOutElements; // SA DIM jumps are given in num elements
268  sa0Params.ICNT1 = bufParamsOut->dim_y;
269 
270  // 2: horizontal iterations
271  sa0Params.DIM2 = elemCountOut;
272  sa0Params.ICNT2 = VXLIB_ceilingDiv(bufParamsOut->dim_x, elemCountOut);
273 
274  // output column dimension DECDIM
275  sa0Params.DECDIM1 = __SA_DECDIM_DIM2;
276  sa0Params.DECDIM1_WIDTH = bufParamsOut->dim_x;
277  sa0Params.DECDIM1SD = __SA_DECDIMSD_DIM0;
278 
279  // store gaussian filter 5x5 coefficients
280  // 1st row of coefficients split into low and high for VFIR8
281 
282  /* 0 64b */
283  /* +-----------------------+ */
284  /* | 1 | 4 | 6 | 4 | */
285  /* +-----------------------+ */
286 
287  /* 0 64b */
288  /* +-----------------------+ */
289  /* | 1 | 0 | 0 | 0 | */
290  /* +-----------------------+ */
291 
292  // Store first row (low) of coefficient matrix
293  uint64_t filter = gaussianFilter5x5[3];
294  filter = (filter << 16LLU) | (uint64_t)gaussianFilter5x5[2];
295  filter = (filter << 16LLU) | (uint64_t)gaussianFilter5x5[1];
296  filter = (filter << 16LLU) | (uint64_t)gaussianFilter5x5[0];
297 
298  size_t filter_offset = SA0_PARAM_OFFSET + VXLIB_GAUSSIANFILTER3x3_COEFF_SIZE;
299 
300  *(uint64_t *) ((uint8_t *) pBlock + filter_offset) = filter;
301 
302  // high
303  filter = gaussianFilter5x5[4];
304  filter_offset = filter_offset + VXLIB_GAUSSIANFILTER3x3_COEFF_SIZE;
305 
306  *(uint64_t *) ((uint8_t *) pBlock + filter_offset) = filter;
307 
308  // 2nd row of coefficients split into low and high for VFIR8
309 
310  /* 0 64b */
311  /* +--------------------------+ */
312  /* | 4 | 16 | 24 | 16 | */
313  /* +--------------------------+ */
314 
315  /* 0 64b */
316  /* +-----------------------+ */
317  /* | 4 | 0 | 0 | 0 | */
318  /* +-----------------------+ */
319 
320  // Store second row (low) of the coefficient matrix
321  filter = gaussianFilter5x5[8];
322  filter = (filter << 16LLU) | (uint64_t)gaussianFilter5x5[7];
323  filter = (filter << 16LLU) | (uint64_t)gaussianFilter5x5[6];
324  filter = (filter << 16LLU) | (uint64_t)gaussianFilter5x5[5];
325 
326  filter_offset = filter_offset + VXLIB_GAUSSIANFILTER3x3_COEFF_SIZE;
327 
328  *(uint64_t *) ((uint8_t *) pBlock + filter_offset) = filter;
329 
330  // high
331  filter = gaussianFilter5x5[9];
332  filter_offset = filter_offset + VXLIB_GAUSSIANFILTER3x3_COEFF_SIZE;
333 
334  *(uint64_t *) ((uint8_t *) pBlock + filter_offset) = filter;
335 
336  // 3rd row of coefficients split into low and high for VFIR8
337 
338  /* 0 64b */
339  /* +--------------------------+ */
340  /* | 6 | 24 | 36 | 24 | */
341  /* +--------------------------+ */
342 
343  /* 0 64b */
344  /* +-----------------------+ */
345  /* | 6 | 0 | 0 | 0 | */
346  /* +-----------------------+ */
347 
348  // Store third row (low) of the coefficient matrix
349  filter = gaussianFilter5x5[13];
350  filter = (filter << 16LLU) | (uint64_t)gaussianFilter5x5[12];
351  filter = (filter << 16LLU) | (uint64_t)gaussianFilter5x5[11];
352  filter = (filter << 16LLU) | (uint64_t)gaussianFilter5x5[10];
353 
354  filter_offset = filter_offset + VXLIB_GAUSSIANFILTER3x3_COEFF_SIZE;
355 
356  *(uint64_t *) ((uint8_t *) pBlock + filter_offset) = filter;
357 
358  // high
359  filter = gaussianFilter5x5[14];
360  filter_offset = filter_offset + VXLIB_GAUSSIANFILTER3x3_COEFF_SIZE;
361 
362  *(uint64_t *) ((uint8_t *) pBlock + filter_offset) = filter;
363 
364  // 4th and 5th rows are not stored: they are identical to the 2nd and 1st rows of the coefficient matrix
365  }
366 
367  // return error for padded version; not implemented
368  else {
369 
370  status = VXLIB_ERR_NOT_IMPLEMENTED;
371  }
372  }
373 
374  else {
375  // return error for any other filter size (only 3x3 and 5x5 supported for now)
376  status = VXLIB_ERR_NOT_IMPLEMENTED;
377  }
378 
379  // store SE and SA params
380  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE0_PARAM_OFFSET) = se0Params;
381  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE1_PARAM_OFFSET) = se1Params;
382  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA0_PARAM_OFFSET) = sa0Params;
383 
384  return status;
385 }
386 
387 /**********************************************************************************************************************/
388 /* */
389 /* Explicit instantiations for VXLIB_gaussian_init_ci */
390 /* */
391 /**********************************************************************************************************************/
392 
394  const VXLIB_bufParams2D_t *bufParamsIn,
395  const VXLIB_bufParams2D_t *bufParamsOut,
396  const VXLIB_gaussian_InitArgs *pKerInitArgs);
397 
398 /**********************************************************************************************************************/
399 /* */
400 /* VXLIB_gaussian_exec_ci */
401 /* */
402 /**********************************************************************************************************************/
403 
// this method performs 5x5 gaussian filter computation
// it loads the SE0/SA0 templates and six packed coefficient words from the kernel's parameter block, then for
// every output vector sums five VFIR8 results (coefficient rows 1, 2, 3, 2, 1), right-shifts the sum by the
// init-arg shift and writes it through SA0 with a predicated pack-to-byte store
// handle: kernel handle, cast to VXLIB_gaussian_PrivArgs
// pIn:    input image, opened as the SE0 stream
// pOut:   output image, addressed through SA0
404 template <typename dTypeIn, typename dTypeOut>
405 void VXLIB_gaussian_5x5_exec_ci(VXLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
406 {
407  // get kernel handle
408  VXLIB_gaussian_PrivArgs *pKerPrivArgs = (VXLIB_gaussian_PrivArgs *) handle;
409 
410  // SE/SA params template
411  __SE_TEMPLATE_v1 se0Params;
412  __SA_TEMPLATE_v1 sa0Params;
413 
414  // create local pointers
415  dTypeIn *restrict pInLocal = (dTypeIn *) pIn;
416  dTypeOut *restrict pOutLocal = (dTypeOut *) pOut;
417 
418  // define vector types
419 
420 
  // NOTE(review): filterOffset is used below but its declaration is not visible in this listing
  // (listing line 422 is absent) - confirm against the original file
421  uint8_t *pBlock = pKerPrivArgs->bufPblock;
423  uint8_t shift = pKerPrivArgs->pKerInitArgs.shift;
424 
425  // get SE/SA params
426  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE0_PARAM_OFFSET);
427  sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA0_PARAM_OFFSET);
428 
429  // get coeff from memory (row 1, low word)
430  uint64_t scalarCoeff = *(uint64_t *) ((uint8_t *) pBlock + filterOffset);
431 
432  // duplicate scalar coeff to full vector length
433  c7x::ulong_vec ulongvCoeff = __vload_dup(&scalarCoeff);
434 
435  // reinterpret to 16b element coeffs
436  c7x::short_vec vCoeff1_lo = c7x::reinterpret<c7x::short_vec>(ulongvCoeff);
437 
438  // Repeat for remaining coefficient vectors; consecutive words are VXLIB_GAUSSIANFILTER3x3_COEFF_SIZE bytes apart
  // row 1, high word
439  scalarCoeff = *(uint64_t *) ((uint8_t *) pBlock + (filterOffset + VXLIB_GAUSSIANFILTER3x3_COEFF_SIZE));
440  ulongvCoeff = __vload_dup(&scalarCoeff);
441  c7x::short_vec vCoeff1_hi = c7x::reinterpret<c7x::short_vec>(ulongvCoeff);
442 
  // row 2, low word
443  scalarCoeff = *(uint64_t *) ((uint8_t *) pBlock + (filterOffset + 2 * VXLIB_GAUSSIANFILTER3x3_COEFF_SIZE));
444  ulongvCoeff = __vload_dup(&scalarCoeff);
445  c7x::short_vec vCoeff2_lo = c7x::reinterpret<c7x::short_vec>(ulongvCoeff);
446 
  // row 2, high word
447  scalarCoeff = *(uint64_t *) ((uint8_t *) pBlock + (filterOffset + 3 * VXLIB_GAUSSIANFILTER3x3_COEFF_SIZE));
448  ulongvCoeff = __vload_dup(&scalarCoeff);
449  c7x::short_vec vCoeff2_hi = c7x::reinterpret<c7x::short_vec>(ulongvCoeff);
450 
  // row 3, low word
451  scalarCoeff = *(uint64_t *) ((uint8_t *) pBlock + (filterOffset + 4 * VXLIB_GAUSSIANFILTER3x3_COEFF_SIZE));
452  ulongvCoeff = __vload_dup(&scalarCoeff);
453  c7x::short_vec vCoeff3_lo = c7x::reinterpret<c7x::short_vec>(ulongvCoeff);
454 
  // row 3, high word
455  scalarCoeff = *(uint64_t *) ((uint8_t *) pBlock + (filterOffset + 5 * VXLIB_GAUSSIANFILTER3x3_COEFF_SIZE));
456  ulongvCoeff = __vload_dup(&scalarCoeff);
457  c7x::short_vec vCoeff3_hi = c7x::reinterpret<c7x::short_vec>(ulongvCoeff);
458 
459  // open streams
460  __SE0_OPEN(pInLocal, se0Params);
461  __SA0_OPEN(sa0Params);
462 
463  size_t filterSize = pKerPrivArgs->pKerInitArgs.filterSize;
464 
  // loop bounds: one horizontal block per (elemCountLoop / 2) input columns, one vertical step per output row
465  size_t elemCountLoop = c7x::element_count_of<c7x::uchar_hvec>::value;
466  size_t wBlocks = VXLIB_ceilingDiv(pKerPrivArgs->width, (elemCountLoop / 2));
467  size_t hBlocks = pKerPrivArgs->height - filterSize + 1;
468 
  // broadcast the scalar shift amount to every lane
469  c7x::int_vec shiftVec = (c7x::int_vec) shift;
470 
471  c7x::int_vec vResA, vResB, vResC, vResD, vResE, vResABCDE;
472 
473  // TODO: nested loop coalescing (NLC compiler pragma)
  // NOTE(review): the int32_t counters are compared against size_t bounds (signed/unsigned comparison)
474  for (int32_t wCounter = 0; wCounter < wBlocks; wCounter++) {
475 
476  for (int32_t hCounter = 0; hCounter < hBlocks; hCounter++) {
  // five FIR calls on SE0, one per window row; rows 4 and 5 reuse the coefficients of rows 2 and 1
477  vResA = __vfir8hw_wvv(vCoeff1_lo, vCoeff1_hi, __SE_REG_0_ADV);
478  vResB = __vfir8hw_wvv(vCoeff2_lo, vCoeff2_hi, __SE_REG_0_ADV);
479  vResC = __vfir8hw_wvv(vCoeff3_lo, vCoeff3_hi, __SE_REG_0_ADV);
480  vResD = __vfir8hw_wvv(vCoeff2_lo, vCoeff2_hi, __SE_REG_0_ADV);
481  vResE = __vfir8hw_wvv(vCoeff1_lo, vCoeff1_hi, __SE_REG_0_ADV);
482 
  // vertical sum of the five row results, then scale down by the init-arg shift
483  vResABCDE = vResA + vResB + vResC + vResD + vResE;
484  vResABCDE = __shift_right(vResABCDE, shiftVec);
485 
486  // saturate from 32b signed to 8b unsigned
  // NOTE(review): confirm whether saturation happens in the conversion or in the pack store below
487  c7x::uint_vec vUResABCDE = c7x::convert<c7x::uint_vec>(vResABCDE);
488 
  // predicated pack store through SA0; the predicate is taken for uint_vec, the address for uchar_qvec
489  __vpred tmp = c7x::strm_agen<0, c7x::uint_vec>::get_vpred();
490  c7x::uchar_qvec *addr = c7x::strm_agen<0, c7x::uchar_qvec>::get_adv(pOutLocal);
491  __vstore_pred_pack_byte(tmp, addr, vUResABCDE);
492  }
493  }
494 
495  __SE0_CLOSE();
496  __SA0_CLOSE();
497 }
498 
499 // this method performs 3x3 gaussian filter computation
// it loads the SE0/SE1/SA0 templates and two packed coefficient rows from the kernel's parameter block and runs a
// rolling three-row pipeline: each inner iteration produces three output rows (ABC, BCA, CAB), reusing the row FIR
// results carried over from the previous step; every sum is shifted right by Q_BITS before the pack-to-byte store
// handle: kernel handle, cast to VXLIB_gaussian_PrivArgs
// pIn:    input image; SE0 is opened at pIn and SE1 at pIn + elemCount
// pOut:   output image, addressed through SA0
500 template <typename dTypeIn, typename dTypeOut>
501 void VXLIB_gaussian_3x3_exec_ci(VXLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
502 {
503  // get kernel handle
504  VXLIB_gaussian_PrivArgs *pKerPrivArgs = (VXLIB_gaussian_PrivArgs *) handle;
505 
506  // SE/SA params template
507  __SE_TEMPLATE_v1 se0Params;
508  __SE_TEMPLATE_v1 se1Params;
509  __SA_TEMPLATE_v1 sa0Params;
510 
511  // create local pointers
512  dTypeIn *restrict pInLocal = (dTypeIn *) pIn;
513  dTypeOut *restrict pOutLocal = (dTypeOut *) pOut;
514 
515  // define vector types
516  typedef typename c7x::char_hvec out_hvec;
517 
518 
  // NOTE(review): filterOffset is used below but its declaration is not visible in this listing
  // (listing line 521 is absent) - confirm against the original file
519  uint8_t *pBlock = pKerPrivArgs->bufPblock;
520  size_t elemCount = c7x::element_count_of<c7x::short_vec>::value;
522 
523  // get SE/SA params
524  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE0_PARAM_OFFSET);
525  se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE1_PARAM_OFFSET);
526  sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA0_PARAM_OFFSET);
527 
528  // get coeff from memory (1st row of coefficients)
529  uint64_t scalarCoeff = *(uint64_t *) ((uint8_t *) pBlock + filterOffset);
530 
531  // duplicate scalar coeff to full vector length
532  c7x::ulong_vec ulongvCoeff = __vload_dup(&scalarCoeff);
533 
534  // reinterpret to 16b element coeffs
535  c7x::short_vec vCoeff1 = c7x::reinterpret<c7x::short_vec>(ulongvCoeff);
536 
537  // Do a similar process to retrieve the 2nd row of coefficients
538  scalarCoeff = *(uint64_t *) ((uint8_t *) pBlock + (filterOffset + VXLIB_GAUSSIANFILTER3x3_COEFF_SIZE));
539  ulongvCoeff = __vload_dup(&scalarCoeff);
540  c7x::short_vec vCoeff2 = c7x::reinterpret<c7x::short_vec>(ulongvCoeff);
541 
542  // open streams
  // NOTE(review): SE1 is not named again below; presumably the __SE_REG_PAIR_0 operands read SE0 and SE1 together
543  __SE0_OPEN(pInLocal, se0Params);
544  __SE1_OPEN(pInLocal + elemCount, se1Params);
545  __SA0_OPEN(sa0Params);
546 
  // loop bounds: one horizontal block per elemCount columns; each vertical block yields filterSize (3) output rows
547  size_t filterSize = pKerPrivArgs->pKerInitArgs.filterSize;
548  size_t wBlocks = VXLIB_ceilingDiv(pKerPrivArgs->width, elemCount);
549  size_t hBlocks = VXLIB_ceilingDiv((pKerPrivArgs->height - filterSize + 1), filterSize);
550 
  // broadcast the Q-format shift amount to every lane
551  c7x::int_vec vQ_BITS = (c7x::int_vec) Q_BITS;
552 
553  c7x::int_vec vResA_lo, vResA_hi, vResB_lo, vResB_hi, vResC_lo, vResC_hi;
554  c7x::int_vec vResAB_lo, vResAB_hi, vResBC_lo, vResBC_hi, vResCA_lo, vResCA_hi;
555  c7x::int_vec vResABC_lo, vResABC_hi, vResBCA_lo, vResBCA_hi, vResCAB_lo, vResCAB_hi;
556 
557  // TODO: nested loop coalescing (NLC compiler pragma)
  // NOTE(review): the int32_t counters are compared against size_t bounds (signed/unsigned comparison)
558  for (int32_t wCounter = 0; wCounter < wBlocks; wCounter++) {
559 
  // prologue for this column block: FIR of row A with coefficient 1 and of row B with coefficient 2
560  __vfir4hw_vww(vCoeff1, __SE_REG_PAIR_0_ADV, vResA_lo, vResA_hi);
561  __vfir4hw_vww(vCoeff2, __SE_REG_PAIR_0, vResB_lo, vResB_hi);
562 
  // partial vertical sum AB, completed with row C inside the loop
563  vResAB_lo = vResA_lo + vResB_lo;
564  vResAB_hi = vResA_hi + vResB_hi;
565  __vfir4hw_vww(vCoeff1, __SE_REG_PAIR_0_ADV, vResB_lo, vResB_hi);
566 
567  // store FIR of row B with coefficient 1 for BCA calculation later
568  vResBC_lo = vResB_lo;
569  vResBC_hi = vResB_hi;
570 
571  for (int32_t hCounter = 0; hCounter < hBlocks; hCounter++) {
572 
573  // vfir4 row C with coefficient 1
574  __vfir4hw_vww(vCoeff1, __SE_REG_PAIR_0, vResC_lo, vResC_hi);
575 
576  // vertical sum ABC
577  vResABC_lo = vResAB_lo + vResC_lo;
578  vResABC_hi = vResAB_hi + vResC_hi;
579 
580  // adjust ABC by Q factor
581  vResABC_lo = __shift_right(vResABC_lo, vQ_BITS);
582  vResABC_hi = __shift_right(vResABC_hi, vQ_BITS);
583 
584  // predicated pack store ABC, words to bytes (VSTPWPACKB)
585  __vpred tmp = c7x::strm_agen<0, c7x::char_vec>::get_vpred();
586  out_hvec *addr = c7x::strm_agen<0, out_hvec>::get_adv(pOutLocal);
587  __vstore_pred_pack_byte_2src(tmp, addr, vResABC_lo, vResABC_hi);
588 
589  // store FIR of row C with coefficient 1 for CAB calculation later
590  vResCA_lo = vResC_lo;
591  vResCA_hi = vResC_hi;
592 
593  // vfir4 row C with coefficient 2
594  __vfir4hw_vww(vCoeff2, __SE_REG_PAIR_0_ADV, vResC_lo, vResC_hi);
595 
596  // vertical sum BC
597  vResBC_lo += vResC_lo;
598  vResBC_hi += vResC_hi;
599 
  // next row A with coefficient 1 completes BCA
600  __vfir4hw_vww(vCoeff1, __SE_REG_PAIR_0, vResA_lo, vResA_hi);
601 
602  vResBCA_lo = vResBC_lo + vResA_lo;
603  vResBCA_hi = vResBC_hi + vResA_hi;
604 
  // adjust BCA by Q factor
605  vResBCA_lo = __shift_right(vResBCA_lo, vQ_BITS);
606  vResBCA_hi = __shift_right(vResBCA_hi, vQ_BITS);
607 
608  // store BCA
609  tmp = c7x::strm_agen<0, c7x::char_vec>::get_vpred();
610  addr = c7x::strm_agen<0, out_hvec>::get_adv(pOutLocal);
611  __vstore_pred_pack_byte_2src(tmp, addr, vResBCA_lo, vResBCA_hi);
612 
613  __vfir4hw_vww(vCoeff2, __SE_REG_PAIR_0, vResA_lo, vResA_hi);
614 
615  // add row A with coefficient 2 for CAB calculation
616  vResCA_lo += vResA_lo;
617  vResCA_hi += vResA_hi;
618 
619  // row A with coefficient 1 for next loop iteration of ABC
620  __vfir4hw_vww(vCoeff1, __SE_REG_PAIR_0_ADV, vResA_lo, vResA_hi);
621 
622  // row B with coefficient 1: completes CAB below and seeds BC for the next iteration's BCA calculation
623  __vfir4hw_vww(vCoeff1, __SE_REG_PAIR_0, vResB_lo, vResB_hi);
624  vResBC_lo = vResB_lo;
625  vResBC_hi = vResB_hi;
626 
627  vResCAB_lo = vResCA_lo + vResB_lo;
628  vResCAB_hi = vResCA_hi + vResB_hi;
629 
  // adjust CAB by Q factor
630  vResCAB_lo = __shift_right(vResCAB_lo, vQ_BITS);
631  vResCAB_hi = __shift_right(vResCAB_hi, vQ_BITS);
632 
633  // row B with coefficient 2 for next loop iteration of ABC
634  __vfir4hw_vww(vCoeff2, __SE_REG_PAIR_0_ADV, vResB_lo, vResB_hi);
635 
636  // row AB for next loop iteration of ABC
637  vResAB_lo = vResA_lo + vResB_lo;
638  vResAB_hi = vResA_hi + vResB_hi;
639 
640  // store CAB
641  tmp = c7x::strm_agen<0, c7x::char_vec>::get_vpred();
642  addr = c7x::strm_agen<0, out_hvec>::get_adv(pOutLocal);
643  __vstore_pred_pack_byte_2src(tmp, addr, vResCAB_lo, vResCAB_hi);
644  }
645  }
646 
647  __SE0_CLOSE();
648  __SE1_CLOSE();
649  __SA0_CLOSE();
650 }
651 
652 // this method is the top level module for k x k gaussian filter operation on an input image
// it reads filterSize from the init args stored in the handle and dispatches to the 3x3 or 5x5 exec routine
// handle: kernel handle, cast to VXLIB_gaussian_PrivArgs
// pIn / pOut: input / output image pointers, forwarded unchanged to the selected routine
// returns VXLIB_SUCCESS when a routine was dispatched, VXLIB_ERR_NOT_IMPLEMENTED for any other filter size
653 template <typename dTypeIn, typename dTypeOut>
654 VXLIB_STATUS VXLIB_gaussian_exec_ci(VXLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
655 {
656  VXLIB_STATUS status = VXLIB_SUCCESS;
657 
658  // typecast handle (void) to struct pointer type associated to kernel
659  VXLIB_gaussian_PrivArgs *pKerPrivArgs = (VXLIB_gaussian_PrivArgs *) handle;
660 
661  size_t filterSize = pKerPrivArgs->pKerInitArgs.filterSize;
662 
663  // call 3x3 gaussian filter exec
  // NOTE(review): both calls instantiate with VXLIB_GAUSSIAN_TYPENAME_I8U_O8U rather than forwarding
  // dTypeIn / dTypeOut - confirm this is intended if further type combinations are added
664  if (filterSize == VXLIB_GAUSSIAN_FILTER_3x3) {
665  VXLIB_gaussian_3x3_exec_ci<VXLIB_GAUSSIAN_TYPENAME_I8U_O8U>(handle, pIn, pOut);
666  }
  // call 5x5 gaussian filter exec
667  else if (filterSize == VXLIB_GAUSSIAN_FILTER_5x5) {
668  VXLIB_gaussian_5x5_exec_ci<VXLIB_GAUSSIAN_TYPENAME_I8U_O8U>(handle, pIn, pOut);
669  }
670 
671  // return errors for other filter sizes; not implemented
672  else {
673  status = VXLIB_ERR_NOT_IMPLEMENTED;
674  }
675 
676  return status;
677 }
678 
679 /**********************************************************************************************************************/
680 /* */
681 /* Explicit instantiations for VXLIB_gaussian_exec_ci */
682 /* */
683 /**********************************************************************************************************************/
684 
686  void *restrict pIn,
687  void *restrict pOut);
688 
// this method reports cycle estimates for the compute loop, based on numBlocks stored in the handle
// handle:     kernel handle, cast to VXLIB_gaussian_PrivArgs
// archCycles: output, set to iterConst + numBlocks * ii
// estCycles:  output, set to overheadCycles + *archCycles
// NOTE(review): the same constants are used whatever the filter size - confirm they apply to both 3x3 and 5x5
689 void VXLIB_gaussian_perfEst(VXLIB_kernelHandle handle, size_t *archCycles, size_t *estCycles)
690 {
691  // typecast handle (void) to struct pointer type associated to kernel
692  VXLIB_gaussian_PrivArgs *pKerPrivArgs = (VXLIB_gaussian_PrivArgs *) handle;
693 
694  // obtain loop count for compute loop
695  size_t numBlocks = pKerPrivArgs->numBlocks;
696  size_t overheadCycles = 17; // profiled code before entering compute loop
697  size_t iterConst = 5;
698  size_t ii = 7;
699 
700  *archCycles = iterConst + numBlocks * ii; // obtained from asm
701  *estCycles = overheadCycles + *archCycles;
702 }
void VXLIB_gaussian_3x3_exec_ci(VXLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
#define SE0_PARAM_OFFSET
template VXLIB_STATUS VXLIB_gaussian_exec_ci< VXLIB_GAUSSIAN_TYPENAME_I8U_O8U >(VXLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
#define VXLIB_GAUSSIANFILTER3x3_COEFF_SIZE
template VXLIB_STATUS VXLIB_gaussian_init_ci< VXLIB_GAUSSIAN_DTYPE_I8U_O8U >(VXLIB_kernelHandle handle, const VXLIB_bufParams2D_t *bufParamsIn, const VXLIB_bufParams2D_t *bufParamsOut, const VXLIB_gaussian_InitArgs *pKerInitArgs)
VXLIB_STATUS VXLIB_gaussian_exec_ci(VXLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
This function is the main execution function for the C7x implementation of the kernel....
void VXLIB_gaussian_5x5_exec_ci(VXLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
#define Q_BITS
#define SA0_PARAM_OFFSET
#define SE1_PARAM_OFFSET
VXLIB_STATUS VXLIB_gaussian_init_ci(VXLIB_kernelHandle handle, const VXLIB_bufParams2D_t *bufParamsIn, const VXLIB_bufParams2D_t *bufParamsOut, const VXLIB_gaussian_InitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
Header file for kernel's internal use. For the kernel's interface, please see VXLIB_gaussian.
#define VXLIB_GAUSSIAN_FILTER_5x5
Macros for 5x5 filter dimension.
#define VXLIB_GAUSSIAN_FILTER_3x3
Macros for 3x3 filter dimension.
void * VXLIB_kernelHandle
Handle type for VXLIB operations.
Definition: VXLIB_types.h:247
VXLIB_STATUS_NAME
The enumeration of all status codes.
Definition: VXLIB_types.h:220
@ VXLIB_ERR_NOT_IMPLEMENTED
Definition: VXLIB_types.h:227
@ VXLIB_SUCCESS
Definition: VXLIB_types.h:221
void VXLIB_gaussian_perfEst(VXLIB_kernelHandle handle, size_t *archCycles, size_t *estCycles)
A structure for a 2 dimensional buffer descriptor.
uint32_t dim_y
Height of buffer in Y dimension in elements.
uint32_t dim_x
Width of buffer in X dimension in elements.
int32_t stride_y
Stride in Y dimension in bytes.
Structure containing the parameters to initialize the kernel.
uint8_t shift
Shift parameter for 5x5 filter
int8_t filterSize
Width and height of filter
int32_t padLeft
Padding options
Structure that is reserved for internal use by the kernel.
uint8_t bufPblock[VXLIB_GAUSSIAN_IXX_IXX_OXX_PBLOCK_SIZE]
Array to hold SE/SA params.
size_t width
Width of image
size_t height
Height of image
size_t strideOutElements
Stride of output in elements.
VXLIB_gaussian_InitArgs pKerInitArgs
Initargs of the kernel.
size_t numBlocks
Number of blocks to be processed after SIMDification.