VXLIB User Guide
VXLIB_histogram_ci.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (C) 2023 Texas Instruments Incorporated - https://www.ti.com/
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  *
11  * Redistributions in binary form must reproduce the above copyright
12  * notice, this list of conditions and the following disclaimer in the
13  * documentation and/or other materials provided with the
14  * distribution.
15  *
16  * Neither the name of Texas Instruments Incorporated nor the names of
17  * its contributors may be used to endorse or promote products derived
18  * from this software without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  *
32  ******************************************************************************/
33 
34 /**********************************************************************************************************************/
35 /* */
36 /* INCLUDES */
37 /* */
38 /**********************************************************************************************************************/
39 
40 #include "VXLIB_histogram_priv.h"
41 
42 /**********************************************************************************************************************/
43 /* */
44 /* DEFINES */
45 /* */
46 /**********************************************************************************************************************/
47 #define SE_PARAM_BASE (0x0000)
48 #define SE0_PARAM_OFFSET (SE_PARAM_BASE)
49 #define SE1_PARAM_OFFSET (SE0_PARAM_OFFSET + VXLIB_SE_PARAM_SIZE)
50 #define SA0_PARAM_OFFSET (SE1_PARAM_OFFSET + VXLIB_SE_PARAM_SIZE)
51 
52 /**********************************************************************************************************************/
53 /* */
54 /* VXLIB_histogram_init_ci */
55 /* */
56 /**********************************************************************************************************************/
57 
58 void VXLIB_histogram_val_init(uint16_t numBins)
59 {
60  // Helper that clears the LUT histogram: the seed is 0, and here one quanta will be 2, 32-bit elements
61  ulong8 zeroSeed = (ulong8) (0x0000000000000000);
62 
63  // Row offset of the quanta being written; there are 2, 32-bit bins every row in a 64-bit bank
64  uint32_t rowOffset = 0;
65 
66  // Walk the bins two at a time
67  for (int32_t binIdx = 0; binIdx < numBins; binIdx += 2) {
68  // Update the quanta in all N ways
69  __lookup_init(__LUT_SET0, __as_ushort32(zeroSeed), rowOffset);
70 
71  // Step to the next pair of bins
72  rowOffset += 2;
73  }
74 }
75 
76 // this method initializes the kernel-specific parameters
77 // mainly, the streaming engine and streaming address generators
78 template <uint32_t dTypeIn, uint32_t dTypeOut>
80  const VXLIB_bufParams2D_t *bufParamsIn,
81  const VXLIB_bufParams1D_t *bufParamsOut,
82  const VXLIB_histogram_InitArgs *pKerInitArgs)
83 {
84  VXLIB_STATUS status = VXLIB_SUCCESS; // assign status to success by default
85 
86  // typecast handle (void) to struct pointer type associated to kernel
87  VXLIB_histogram_PrivArgs *pKerPrivArgs = (VXLIB_histogram_PrivArgs *) handle;
88 
89  // obtain image size and compute number of blocks to process
90  size_t width = pKerPrivArgs->width;
91  size_t height = pKerPrivArgs->height;
92  size_t elemCount = VXLIB_max_simd<dTypeOut>::value;
93  size_t wBlocks = (width + (elemCount - 1)) / (elemCount);
94  size_t numBlocks = height * wBlocks;
95  uint16_t numBins = pKerInitArgs->numBins;
96  size_t strideInElements = pKerPrivArgs->strideInElements;
97  pKerPrivArgs->numBlocks = numBlocks;
98 
99  // structs to hold SE and SA parameters
100  __SE_TEMPLATE_v1 se0Params = __gen_SE_TEMPLATE_v1();
101  __SE_TEMPLATE_v1 se1Params = __gen_SE_TEMPLATE_v1();
102  __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();
103 
104  __SE_ELETYPE SE_ELETYPE;
105  __SE_VECLEN SE_VECLEN;
106  __SA_VECLEN SA_VECLEN;
107 
108  uint8_t *pBlock = pKerPrivArgs->bufPblock; // address to retrieve to store SE/SA params
109 
110  SE_ELETYPE = c7x::se_eletype<c7x::uchar_vec>::value;
111  SE_VECLEN = c7x::se_veclen<c7x::int_vec>::value;
112  SA_VECLEN = c7x::sa_veclen<c7x::int_vec>::value;
113 
114  if (width == strideInElements) {
115  se0Params.ELETYPE = SE_ELETYPE;
116  se0Params.VECLEN = SE_VECLEN;
117  se0Params.DIMFMT = __SE_DIMFMT_1D;
118  se0Params.PROMOTE = __SE_PROMOTE_4X_ZEROEXT;
119  se0Params.ICNT0 = width * height;
120  }
121 
122  else {
123  se0Params.ELETYPE = SE_ELETYPE;
124  se0Params.VECLEN = SE_VECLEN;
125  se0Params.DIMFMT = __SE_DIMFMT_2D;
126  se0Params.PROMOTE = __SE_PROMOTE_4X_ZEROEXT;
127 
128  se0Params.ICNT0 = width;
129  se0Params.ICNT1 = height;
130  se0Params.DIM1 = strideInElements;
131  }
132 
133  sa0Params.VECLEN = SA_VECLEN;
134  sa0Params.DIMFMT = __SA_DIMFMT_1D;
135  sa0Params.ICNT0 = numBins;
136 
137  /**************************/
138  /* Store SE and SA params */
139  /**************************/
140 
141  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE0_PARAM_OFFSET) = se0Params;
142  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA0_PARAM_OFFSET) = sa0Params;
143 
144  /******************************************/
145  /* Initialize LUT for !C7504 C7x variants */
146  /******************************************/
147 
148  __sLTCRFlags_t set0flags;
149 
150  // 16 way lut for 16-bit data with 64 entries (6-bit index)
151  set0flags.INTERPOLATION = __LUT_INTERP_OFF; // NA, applicable only in LUT mode
152  set0flags.SATURATION = __LUT_SAT_ON; // Saturate after reaching limit
153  set0flags.SIGNUNSIGN = __LUT_UNSIGNED; // data elements are unsigned
154  set0flags.ELETYPE = __LUT_ELETYPE_32BIT; // This is the bit-depth of each bin
155  set0flags.NUMTBLS = __LUT_NUM_TBL_16; // 16 parallel lookup every cycle
156  set0flags.TBLSIZE = __LUT_TBL_SIZE_16KBYTES; // Total table size
157  set0flags.WEIGHTSIZE = __LUT_WSIZE_8BIT; // Set weight size as 8-bit
158  set0flags.PROMOTION = __LUT_PROMOTE_OFF; // NA, applicable only in LUT mode
159 
160  // Set configuration register for SET0
161  __LUT_SET_LTCR(__LUT_SET0, __lut_set_param(&set0flags));
162 
163  // Start the table at offset 0 from the beginning of L1D-SRAM
164  __LUT_SET_LTBR(__LUT_SET0, 0x0000);
165 
166  // Enable set 1
167  __LUT_SET_LTER(__LUT_ENABLE_0);
168 
169  VXLIB_histogram_val_init(numBins);
170 
171  /* Generate predicate buffer */
172 
173  if (width != strideInElements) {
174 
175  int32_t blockCounter = 0;
176 
177  uint64_t *predRegister = (uint64_t *) pKerPrivArgs->bufPredicateStore;
178 
179  uint32_t i = 0;
180  int32_t j = 0;
181  uint64_t predictedValue = 0x0;
182 
183  while (blockCounter < wBlocks) {
184  for (i = 0; i < elemCount; i++) {
185  if (j < width) {
186  predictedValue |= (uint64_t) ((uint64_t) 0xF << (i * 4));
187  j++;
188  }
189  else {
190  j++;
191  }
192  }
193  *predRegister = predictedValue;
194 
195  predRegister++;
196  blockCounter++;
197  predictedValue = 0x0;
198  }
199 
200  SE_ELETYPE = c7x::se_eletype<c7x::ulong_vec>::value;
201  SE_VECLEN = c7x::se_veclen<c7x::ulong_vec>::value;
202 
203  se1Params.ELETYPE = __SE_ELETYPE_64BIT;
204  se1Params.VECLEN = __SE_VECLEN_1ELEM;
205  se1Params.DIMFMT = __SE_DIMFMT_3D;
206  se1Params.ICNT0 = 1;
207  se1Params.ICNT1 = wBlocks;
208  se1Params.ICNT2 = height;
209  se1Params.DIM1 = 1;
210  se1Params.DIM2 = 0;
211 
212  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE1_PARAM_OFFSET) = se1Params;
213  }
214 
215  else {
216 
217  uint64_t *predRegister = (uint64_t *) pKerPrivArgs->bufPredicateStore;
218 
219  uint32_t i = 0;
220  int32_t j = 0;
221  uint64_t predictedValue = 0x0;
222  size_t widthLastBlock = (width * height) % elemCount;
223 
224  for (i = 0; i < elemCount; i++) {
225  if (j < widthLastBlock) {
226  predictedValue |= (uint64_t) ((uint64_t) 0xF << (i * 4));
227  j++;
228  }
229  else {
230  j++;
231  }
232  }
233  *predRegister = predictedValue;
234 
235  se1Params.ELETYPE = __SE_ELETYPE_64BIT;
236  se1Params.VECLEN = __SE_VECLEN_1ELEM;
237  se1Params.DIMFMT = __SE_DIMFMT_1D;
238  se1Params.ICNT0 = 1;
239 
240  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE1_PARAM_OFFSET) = se1Params;
241  }
242 
243  return status;
244 }
245 
246 /**********************************************************************************************************************/
247 /* */
248 /* Explicit instantiations for VXLIB_histogram_init_ci */
249 /* */
250 /**********************************************************************************************************************/
251 
252 template VXLIB_STATUS
253 VXLIB_histogram_init_ci<VXLIB_HISTOGRAM_DTYPE_I8U_O32U>(VXLIB_kernelHandle handle,
254  const VXLIB_bufParams2D_t *bufParamsIn0,
255  const VXLIB_bufParams1D_t *bufParamsOut,
256  const VXLIB_histogram_InitArgs *pKerInitArgs);
257 
258 /**********************************************************************************************************************/
259 /* */
260 /* VXLIB_histogram_exec_ci */
261 /* */
262 /**********************************************************************************************************************/
263 
264 // this method generates a histogram distribution based on pixel values
265 template <typename dTypeIn, typename dTypeOut>
266 VXLIB_STATUS VXLIB_histogram_exec_ci(VXLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
267 {
268  // typecast handle (void) to struct pointer type associated to kernel
269  VXLIB_histogram_PrivArgs *pKerPrivArgs = (VXLIB_histogram_PrivArgs *) handle;
270 
271  // structs to retrieve SE and SA paramters
272  __SE_TEMPLATE_v1 se0Params;
273  __SE_TEMPLATE_v1 se1Params;
274  __SA_TEMPLATE_v1 sa0Params;
275 
276  // create local pointers
277  dTypeIn *restrict pInLocal = (dTypeIn *) pIn;
278  dTypeOut *restrict pOutLocal = (dTypeOut *) pOut;
279  uint8_t *restrict pPredicate = (uint8_t *) pKerPrivArgs->bufPredicateStore;
280 
282  printf("Enter VXLIB_histogram_exec_ci\n");
283 #endif
284 
285  // address of SE and SA parameters
286  uint8_t *pBlock = pKerPrivArgs->bufPblock;
287 
288  // retrieve SE and SA parameters
289  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE0_PARAM_OFFSET);
290  se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE1_PARAM_OFFSET);
291  sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA0_PARAM_OFFSET);
292 
293  // open SEs to fetch Input samples
294  __SE0_OPEN(pInLocal, se0Params);
295  __SE1_OPEN(pPredicate, se1Params);
296 
297  // open SA0 to write output samples
298  __SA0_OPEN(sa0Params);
299 
300  size_t numBlocks = pKerPrivArgs->numBlocks;
301  uint16_t numBins = pKerPrivArgs->pKerInitArgs.numBins;
302 
303  c7x::uint_vec lowerBound = pKerPrivArgs->pKerInitArgs.offset;
304  c7x::uint_vec upperBound = pKerPrivArgs->pKerInitArgs.range + pKerPrivArgs->pKerInitArgs.offset;
305  c7x::uint_vec numBinsVec = pKerPrivArgs->pKerInitArgs.numBins;
306  c7x::uint_vec rangeVec = pKerPrivArgs->pKerInitArgs.range;
307 
308  // derive c7x vector type from template typename
309  typedef typename c7x::make_full_vector<dTypeOut>::type vec;
310 
311  uint32_t maxSIMD = c7x::max_simd<dTypeOut>::value;
312 
313  /**************************/
314  /* Populate Histogram */
315  /**************************/
316 
317  // Linear images
318  if (pKerPrivArgs->width == pKerPrivArgs->strideInElements) {
319  // When width * height is not a multiple of SIMD, need to use predicate buffer on last block to mask DECDIM 0s
320  if ((pKerPrivArgs->width * pKerPrivArgs->height) % maxSIMD) {
321  vec inVec, inHist, histIndex;
322  __vpred cmp_lower, cmp_upper, condHist;
323  size_t linearBlocks = (pKerPrivArgs->width * pKerPrivArgs->height) / maxSIMD;
324  for (uint32_t counter = 0; counter < linearBlocks; counter++) {
325  inVec = c7x::strm_eng<0, vec>::get_adv(); // fetch input 0 via SE0
326 
327  cmp_lower = __cmp_le_pred(lowerBound, inVec);
328  cmp_upper = __cmp_lt_pred(inVec, upperBound);
329  condHist = __and(cmp_lower, cmp_upper);
330 
331  histIndex = ((inVec - lowerBound) * numBinsVec) / rangeVec;
332 
333  inHist = __select(condHist, histIndex, numBinsVec);
334 
335  __hist(__LUT_SET0, inHist);
336  }
337  // Predicate buffer to mask DECDIM 0s on last block
338  inVec = c7x::strm_eng<0, vec>::get_adv(); // fetch input 0 via SE0
339  __vpred inPred = _mvrp(c7x::strm_eng<1, ulong>::get_adv());
340 
341  cmp_lower = __cmp_le_pred(lowerBound, inVec);
342  cmp_upper = __cmp_lt_pred(inVec, upperBound);
343  condHist = __and(cmp_lower, cmp_upper);
344  condHist = __and(condHist, inPred);
345 
346  histIndex = ((inVec - lowerBound) * numBinsVec) / rangeVec;
347 
348  inHist = __select(condHist, histIndex, numBinsVec);
349 
350  __hist(__LUT_SET0, inHist);
351  }
352  // Width * height is a SIMD multiple, can process entire image in one loop
353  else {
354  size_t linearBlocks = (pKerPrivArgs->width * pKerPrivArgs->height) / maxSIMD;
355  for (uint32_t counter = 0; counter < linearBlocks; counter++) {
356  vec inVec = c7x::strm_eng<0, vec>::get_adv(); // fetch input 0 via SE0
357  __vpred cmp_lower = __cmp_le_pred(lowerBound, inVec);
358  __vpred cmp_upper = __cmp_lt_pred(inVec, upperBound);
359  __vpred condHist = __and(cmp_lower, cmp_upper);
360 
361  vec histIndex = ((inVec - lowerBound) * numBinsVec) / rangeVec;
362 
363  vec inHist = __select(condHist, histIndex, numBinsVec);
364 
365  __hist(__LUT_SET0, inHist);
366  }
367  }
368  }
369  // Not linear image (width != stride)
370  else {
371  for (uint32_t counter = 0; counter < numBlocks; counter++) {
372  vec inVec = c7x::strm_eng<0, vec>::get_adv(); // fetch input 0 via SE0
373  __vpred inPred = _mvrp(c7x::strm_eng<1, ulong>::get_adv()); // Predicate buffer to mask DECDIM 0s on last block
374 
375  __vpred cmp_lower = __cmp_le_pred(lowerBound, inVec);
376  __vpred cmp_upper = __cmp_lt_pred(inVec, upperBound);
377  __vpred condHist = __and(cmp_lower, cmp_upper);
378  condHist = __and(condHist, inPred);
379 
380  vec histIndex = ((inVec - lowerBound) * numBinsVec) / rangeVec;
381 
382  // numBinsVec is chosen if the pixel does not meet the criteria to be in the distribution
383  // bins are from bin[0] - bin[numBins -1] and thus numBins will not be a valid index and will not populate
384  // histogram
385  vec inHist = __select(condHist, histIndex, numBinsVec);
386 
387  __hist(__LUT_SET0, inHist);
388  }
389  }
390 
391  /**************************/
392  /* Store Histogram */
393  /**************************/
394 
395  // Store histogram in outBuffer if last call to exec function for image
396  if (pKerPrivArgs->pKerInitArgs.lastBlock) {
397 
398  uint32_t saStoreIter = (numBins / maxSIMD);
399  if (numBins % maxSIMD) {
400  saStoreIter++;
401  }
402  c7x::uint_vec bin;
403  uint16_t binCount = 0;
404 
405  for (int i = 0; i < saStoreIter; i++) {
406  bin = 0;
407  for (int j = 0; j < maxSIMD; j++) {
408  c7x::uint_vec vHist = __lookup_read_uint(__LUT_SET0, (c7x::uint_vec)(binCount));
409  // Horizontal add all bins to one index
410  bin.s[j] = __horizontal_add(vHist);
411  binCount++;
412  }
413 
414  // After reaching a SIMD number of bins, store vector
415  __vpred tmp = c7x::strm_agen<0, vec>::get_vpred();
416  vec *addr = c7x::strm_agen<0, vec>::get_adv(pOutLocal);
417  __vstore_pred(tmp, addr, bin);
418  }
419  }
420 
421  // close SE/SA
422  __SE0_CLOSE();
423  __SE1_CLOSE();
424  __SA0_CLOSE();
425 
426  return VXLIB_SUCCESS;
427 }
428 
429 /**********************************************************************************************************************/
430 /* */
431 /* Explicit instantiations for VXLIB_histogram_exec_ci */
432 /* */
433 /**********************************************************************************************************************/
434 
435 template VXLIB_STATUS VXLIB_histogram_exec_ci<VXLIB_HISTOGRAM_TYPENAME_I8U_O32U>(VXLIB_kernelHandle handle,
436  void *restrict pIn,
437  void *restrict pOut);
438 
439 void VXLIB_histogram_perfEst(VXLIB_kernelHandle handle, size_t *archCycles, size_t *estCycles)
440 {
441  // the opaque handle is the kernel's private-argument struct
442  const VXLIB_histogram_PrivArgs *privArgs = (const VXLIB_histogram_PrivArgs *) handle;
443 
444  // profiled code before entering compute loop
445  const size_t setupCycles = 17;
446  // fixed 7 cycles plus 2 per block of the compute loop (obtained from asm)
447  const size_t loopCycles = 7 + privArgs->numBlocks * 2;
448  *archCycles = loopCycles;
449  *estCycles = setupCycles + loopCycles;
450 }
template VXLIB_STATUS VXLIB_histogram_init_ci< VXLIB_HISTOGRAM_DTYPE_I8U_O32U >(VXLIB_kernelHandle handle, const VXLIB_bufParams2D_t *bufParamsIn0, const VXLIB_bufParams1D_t *bufParamsOut, const VXLIB_histogram_InitArgs *pKerInitArgs)
#define SE0_PARAM_OFFSET
template VXLIB_STATUS VXLIB_histogram_exec_ci< VXLIB_HISTOGRAM_TYPENAME_I8U_O32U >(VXLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
VXLIB_STATUS VXLIB_histogram_init_ci(VXLIB_kernelHandle handle, const VXLIB_bufParams2D_t *bufParamsIn, const VXLIB_bufParams1D_t *bufParamsOut, const VXLIB_histogram_InitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
#define SA0_PARAM_OFFSET
VXLIB_STATUS VXLIB_histogram_exec_ci(VXLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
This function is the main execution function for the C7x implementation of the kernel....
#define SE1_PARAM_OFFSET
Header file for kernel's internal use. For the kernel's interface, please see VXLIB_histogram.
void * VXLIB_kernelHandle
Handle type for VXLIB operations.
Definition: VXLIB_types.h:247
#define VXLIB_DEBUGPRINT
Enable debug printf statements.
Definition: VXLIB_types.h:44
VXLIB_STATUS_NAME
The enumeration of all status codes.
Definition: VXLIB_types.h:220
@ VXLIB_SUCCESS
Definition: VXLIB_types.h:221
void VXLIB_histogram_val_init(uint16_t numBins)
This is a helper function to help clear the values stored in the LUT histogram.
void VXLIB_histogram_perfEst(VXLIB_kernelHandle handle, size_t *archCycles, size_t *estCycles)
A structure for a 1 dimensional buffer descriptor.
A structure for a 2 dimensional buffer descriptor.
Structure containing the parameters to initialize the kernel.
uint16_t numBins
Parameter indicating distribution number of bins (<= 256)
uint8_t offset
Parameter indicating distribution offset.
uint16_t range
Parameter indicating distribution range (<= 256)
uint8_t lastBlock
Flag that indicates if the function call is the final call for the image (0: intermediate call,...
Structure that is reserved for internal use by the kernel.
VXLIB_histogram_InitArgs pKerInitArgs
Initargs of the kernel.
uint8_t bufPredicateStore[PRIVATE_BUFSIZE_PREDICATESTORE]
size_t numBlocks
Number of blocks to be processed after SIMDification.
size_t height
Height of image
uint8_t bufPblock[VXLIB_HISTOGRAM_IXX_IXX_OXX_PBLOCK_SIZE]
Array to hold SE/SA params.
size_t strideInElements
Stride of input0 in elements.
size_t width
Width of image