FFTLIB User Guide
c7504/FFTLIB_ifft1dBatched_i32fc_c32fc_o32fc_ci.cpp
Go to the documentation of this file.
1 /*******************************************************************************
2 **+--------------------------------------------------------------------------+**
3 **| **** |**
4 **| **** |**
5 **| ******o*** |**
6 **| ********_///_**** |**
7 **| ***** /_//_/ **** |**
8 **| ** ** (__/ **** |**
9 **| ********* |**
10 **| **** |**
11 **| *** |**
12 **| |**
13 **| Copyright (c) 2017 Texas Instruments Incorporated |**
14 **| ALL RIGHTS RESERVED |**
15 **| |**
16 **| Permission to use, copy, modify, or distribute this software, |**
17 **| whether in part or in whole, for any purpose is forbidden without |**
18 **| a signed licensing agreement and NDA from Texas Instruments |**
19 **| Incorporated (TI). |**
20 **| |**
21 **| TI makes no representation or warranties with respect to the |**
22 **| performance of this computer program, and specifically disclaims |**
23 **| any responsibility for any damages, special or consequential, |**
24 **| connected with the use of this program. |**
25 **| |**
26 **+--------------------------------------------------------------------------+**
27 *******************************************************************************/
28 
29 #include "../FFTLIB_ifft1dBatched_i32fc_c32fc_o32fc.h"
30 
31 #define TRACE_ON (0)
32 
33 #if TRACE_ON
34 #include "../../../common/printv.h"
35 #include <stdio.h>
36 #endif
37 
38 // CODE_SECTION(FFTLIB_ifft1dBatched_i32fc_c32fc_o32fc, ".text:optimized")
39 // CODE_SECTION(FFTLIB_ifft1dBatched_i32fc_c32fc_o32fc_core, ".text:optimized")
40 // CODE_SECTION(FFTLIB_ifft1dBatched_i32fc_c32fc_o32fc_checkParams,
41 // ".text:optimized")
42 
/* Layout of the caller-supplied scratch block (pBlock): the init function
 * pre-computes one stream-engine (SE) or stream-address-generator (SA)
 * template per processing loop and stores it at the byte offset named
 * below; the kernel function reads each template back from the same
 * offset before opening the corresponding SE/SA.  SE templates are
 * spaced by SE_PARAM_SIZE, SA templates by SA_PARAM_SIZE (both defined
 * in the included kernel header).  There is intentionally no SA_LOOP5
 * entry: loop 5 uses a store path that needs no SA template here. */
43 #define SE_PARAM_BASE (0x0000)
44 #define SE_LOOP1_PARAM_OFFSET (SE_PARAM_BASE)
45 #define SE_LOOP2_PARAM_OFFSET (SE_LOOP1_PARAM_OFFSET + SE_PARAM_SIZE)
46 #define SE_LOOP3_PARAM_OFFSET (SE_LOOP2_PARAM_OFFSET + SE_PARAM_SIZE)
47 #define SE_LOOP4_PARAM_OFFSET (SE_LOOP3_PARAM_OFFSET + SE_PARAM_SIZE)
48 #define SE_LOOP5_PARAM_OFFSET (SE_LOOP4_PARAM_OFFSET + SE_PARAM_SIZE)
49 #define SE_LOOP6_PARAM_OFFSET (SE_LOOP5_PARAM_OFFSET + SE_PARAM_SIZE)
50 #define SE_LOOP7_PARAM_OFFSET (SE_LOOP6_PARAM_OFFSET + SE_PARAM_SIZE)
51 #define SE_TWID_PARAM_OFFSET (SE_LOOP7_PARAM_OFFSET + SE_PARAM_SIZE)
52 #define SA_LOOP1_PARAM_OFFSET (SE_TWID_PARAM_OFFSET + SE_PARAM_SIZE)
53 #define SA_LOOP2_PARAM_OFFSET (SA_LOOP1_PARAM_OFFSET + SA_PARAM_SIZE)
54 #define SA_LOOP3_PARAM_OFFSET (SA_LOOP2_PARAM_OFFSET + SA_PARAM_SIZE)
55 #define SA_LOOP4_PARAM_OFFSET (SA_LOOP3_PARAM_OFFSET + SA_PARAM_SIZE)
56 #define SA_LOOP6_PARAM_OFFSET (SA_LOOP4_PARAM_OFFSET + SA_PARAM_SIZE)
/* NOTE(review): the next offset steps past SA_LOOP6 (an SA template)
 * using SE_PARAM_SIZE rather than SA_PARAM_SIZE.  Harmless if the two
 * sizes are equal, but verify against the SE_PARAM_SIZE/SA_PARAM_SIZE
 * definitions in the kernel header. */
57 #define SE_CONJ_LOOP_PARAM_OFFSET (SA_LOOP6_PARAM_OFFSET + SE_PARAM_SIZE)
58 #define SA_CONJ_LOOP_PARAM_OFFSET (SE_CONJ_LOOP_PARAM_OFFSET + SE_PARAM_SIZE)
59 
60 static inline void ifft_i32fc_o32fc_conjugate_init_ci(void *pX, uint32_t size, void *pBlock)
61 {
62  __SE_TEMPLATE_v1 se0_param = __gen_SE_TEMPLATE_v1();
63  __SA_TEMPLATE_v1 sa0_param = __gen_SA_TEMPLATE_v1();
64  /* cfloat *restrict pXLocal = (cfloat *) pX; */
65 
66  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
67  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
68  se0_param.DIMFMT = __SE_DIMFMT_1D;
69  se0_param.ICNT0 = size;
70 
71  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_CONJ_LOOP_PARAM_OFFSET)) = se0_param;
72 
73  sa0_param.ICNT0 = size;
74  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
75  sa0_param.DIMFMT = __SA_DIMFMT_1D;
76 
77  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_CONJ_LOOP_PARAM_OFFSET)) = sa0_param;
78 }
79 
/* NOTE(review): the opening of this function (listing lines 79-81) was
 * lost when this listing was extracted; from the parameter list and the
 * file name it is presumably
 *   FFTLIB_STATUS FFTLIB_ifft1dBatched_i32fc_c32fc_o32fc_init(FFTLIB_F32 *pX, ...)
 * -- confirm against the original source.  Listing line 91, presumably
 * declaring 'status' (used below), is also missing.
 *
 * Purpose (from the visible body): pre-compute all SE/SA templates for
 * the batched radix-4 IFFT stages and store them into pBlock at the
 * SE_*/SA_* offsets, so the kernel function only has to load and open
 * them. */
82  FFTLIB_bufParams1D_t *bufParamsX,
83  FFTLIB_F32 *pW,
84  FFTLIB_bufParams1D_t *bufParamsW,
85  FFTLIB_F32 *pY,
86  FFTLIB_bufParams1D_t *bufParamsY,
87  uint32_t numPoints,
88  uint32_t numChannels,
89  void *pBlock)
90 {
92 
93 #if defined(FFTLIB_CHECK_PARAMS) || defined(FFTLIB_IFFT1DBATCHED_I32FC_C32FC_O32FC_CHECK_PARAMS)
94  /* status = FFTLIB_ifft1dBatched_i32fc_c32fc_o32fc_checkParams ( */
95  /* pX, bufParamsX, pW, bufParamsW, pY, bufParamsY, numPoints,
96  * numChannels, */
97  /* pBlock); */
98  if (status == FFTLIB_SUCCESS)
99 #endif
100  {
  /* Loop-trip counts for the SE/SA templates; all derived from the
   * total workload numPoints*numChannels. */
101  uint32_t numPointsPerDft;
102  uint32_t seCnt1, seCnt2, seCnt3, seCnt4;
103  uint32_t seCnt6, seCnt7, seCnt8, seCnt9, seCnt10;
104  uint32_t seCnt11;
105  __SE_TEMPLATE_v1 se0_param = __gen_SE_TEMPLATE_v1();
106  __SE_TEMPLATE_v1 se1_param = __gen_SE_TEMPLATE_v1();
107  __SA_TEMPLATE_v1 sa0_param = __gen_SA_TEMPLATE_v1();
108 
109  numPointsPerDft = numPoints;
110  seCnt1 = numPoints >> 2;
111  seCnt2 = numPoints >> 4;
112  seCnt3 = 1;
113  seCnt4 = numPoints >> 2;
114  seCnt6 = seCnt3 * numChannels;
  /* seCnt7/seCnt10 are clamped to a minimum of 1 iteration; seCnt9/
   * seCnt11 clamp the 1D store counts to the minimum buffer lengths
   * (16 resp. 32 elements) noted in the comments further below. */
115  seCnt7 = (numPoints * numChannels >> 4) > 1 ? numPoints * numChannels >> 4 : 1;
116  seCnt8 = numPoints * numChannels;
117  seCnt9 = (numPoints * numChannels > 16) ? numPoints * numChannels : 16;
118  seCnt10 = (numPoints * numChannels >> 5) > 1 ? numPoints * numChannels >> 5 : 1;
119  seCnt11 = (numPoints * numChannels > 32) ? numPoints * numChannels : 32;
120 
121  /* Init conjugate for IFFT */
122  ifft_i32fc_o32fc_conjugate_init_ci(pX, (numPointsPerDft * numChannels), pBlock);
123 
  /* Complex-float lanes per vector for this device width. */
124  uint32_t elementSize = c7x::element_count_of<c7x::cfloat_vec>::value;
125 
  /* LOOP1: 4D fetch pattern for the main radix-4 stages -- one vector
   * from each quarter of a DFT, iterated across DFTs of all channels. */
126  se0_param = __gen_SE_TEMPLATE_v1();
127  se0_param.ICNT0 = elementSize;
128  se0_param.ICNT1 = 4;
129  se0_param.DIM1 = seCnt1; /* 4 quarters(Offsets: 0, N/4, N/2, 3N/4) */
130  se0_param.ICNT2 = seCnt2; /* Number of 8-point fetches within each */
131  se0_param.DIM2 = elementSize; /* quarter */
132  se0_param.ICNT3 = seCnt6; /* Number of DFT's for all channels */
133  se0_param.DIM3 = numPointsPerDft;
134 
135  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
136  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
137  se0_param.DIMFMT = __SE_DIMFMT_4D;
138  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP1_PARAM_OFFSET)) = se0_param;
139 
  /* TWID: matching 4D fetch of the twiddle factors; DIM3 = 0 replays
   * the same twiddles for every DFT in the batch. */
140  se1_param = __gen_SE_TEMPLATE_v1();
141  se1_param.ICNT0 = elementSize;
142  se1_param.ICNT1 = 3;
143  se1_param.DIM1 = seCnt1; /* Twiddle factors for x1, x2 and x3 */
144  se1_param.ICNT2 = seCnt2; /* Number of 8-point fetches within each */
145  se1_param.DIM2 = elementSize;
146  se1_param.ICNT3 = seCnt6; /* Number of DFT's for all channels */
147  se1_param.DIM3 = 0;
148 
149  se1_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
150  se1_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
151  se1_param.DIMFMT = __SE_DIMFMT_4D;
152  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_TWID_PARAM_OFFSET)) = se1_param;
153 
  /* LOOP1 store: mirrors the LOOP1 fetch geometry. */
154  sa0_param = __gen_SA_TEMPLATE_v1();
155  sa0_param.ICNT0 = elementSize;
156  sa0_param.ICNT1 = 4;
157  sa0_param.DIM1 = seCnt1; /* Save to each of the 4 quarters */
158  sa0_param.ICNT2 = seCnt2; /* Number of 8-point stores within each */
159  sa0_param.DIM2 = elementSize;
160  sa0_param.ICNT3 = seCnt6;
161  sa0_param.DIM3 = numPointsPerDft; /* Number of DFT's for all channels */
162 
163  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
164  sa0_param.DIMFMT = __SA_DIMFMT_4D;
165  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP1_PARAM_OFFSET)) = sa0_param;
166 
  /* LOOP2: 3D fetch for the 16-point stage. */
167  se0_param = __gen_SE_TEMPLATE_v1();
168  se0_param.ICNT0 = elementSize;
169  se0_param.ICNT1 = 1;
170  se0_param.DIM1 = 16; /* Process two 16-point DFTs in one shot */
171  se0_param.ICNT2 = seCnt7;
172  se0_param.DIM2 = 16;
173 
174  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
175  /* se0_param.TRANSPOSE = */
176  /* __SE_TRANSPOSE_256BIT; /\* Using 256BIT transpose required *\/ */
177  /* /\* 16-byte alignment on pX *\/ */
178  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
179  se0_param.DIMFMT = __SE_DIMFMT_3D;
180  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP2_PARAM_OFFSET)) = se0_param;
181 
  /* LOOP2 store: flat 1D over the whole batch. */
182  sa0_param = __gen_SA_TEMPLATE_v1();
183  sa0_param.ICNT0 = seCnt8; /* Input buffer must be at least 32
184  * elements long even though
185  * numPoints*numChannels = 16 */
186 
187  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
188  sa0_param.DIMFMT = __SA_DIMFMT_1D;
189  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP2_PARAM_OFFSET)) = sa0_param;
190 
  /* LOOP3: flat 1D fetch/store pair (32-point stage path). */
191  se0_param = __gen_SE_TEMPLATE_v1();
192  se0_param.ICNT0 = seCnt8;
193 
194  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
195  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
196  se0_param.DIMFMT = __SE_DIMFMT_1D;
197  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP3_PARAM_OFFSET)) = se0_param;
198 
199  sa0_param = __gen_SA_TEMPLATE_v1();
200  sa0_param.ICNT0 = seCnt8;
201 
202  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
203  sa0_param.DIMFMT = __SA_DIMFMT_1D;
204  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP3_PARAM_OFFSET)) = sa0_param;
205 
206  /* The following SE configuration may cause sub-optimal
207  * tile in SE because second row of tile starts in the
208  * middle of first row */
209  se0_param = __gen_SE_TEMPLATE_v1();
210  se0_param.ICNT0 = 4;
211  se0_param.ICNT1 = 8;
212  se0_param.DIM1 = 4;
213  se0_param.ICNT2 = seCnt7;
214  se0_param.DIM2 = 32;
215 
216  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
217  se0_param.TRANSPOSE = __SE_TRANSPOSE_64BIT;
218  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
219  se0_param.DIMFMT = __SE_DIMFMT_3D;
220  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP4_PARAM_OFFSET)) = se0_param;
221 
222  sa0_param = __gen_SA_TEMPLATE_v1();
223  sa0_param.ICNT0 = seCnt9; /* Input buffer must be at least 32
224  * elements long even though
225  * numPoints*numChannels = 16 */
226 
227  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
228  sa0_param.DIMFMT = __SA_DIMFMT_1D;
229  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP4_PARAM_OFFSET)) = sa0_param;
230 
  /* LOOP5: transposed 3D fetch for the final 4-point stage with
   * bit-reversed stores (per channel). */
231  se0_param = __gen_SE_TEMPLATE_v1();
232  se0_param.ICNT0 = seCnt4; /* Fetch consecutive four points for DFT */
233  se0_param.ICNT1 = elementSize;
234  se0_param.DIM1 = seCnt4;
235  se0_param.ICNT2 = numChannels;
236  se0_param.DIM2 = numPoints;
237 
238  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
239  se0_param.TRANSPOSE = __SE_TRANSPOSE_64BIT;
240  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
241  se0_param.DIMFMT = __SE_DIMFMT_3D;
242  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP5_PARAM_OFFSET)) = se0_param;
243 
  /* LOOP6: transposed 3D fetch (32-point path). */
244  se0_param = __gen_SE_TEMPLATE_v1();
245  se0_param.ICNT0 = 8;
246  se0_param.ICNT1 = 8;
247  se0_param.DIM1 = 8;
248  se0_param.ICNT2 = seCnt10;
249  se0_param.DIM2 = 64;
250 
251  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
252  se0_param.TRANSPOSE = __SE_TRANSPOSE_64BIT;
253  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
254  se0_param.DIMFMT = __SE_DIMFMT_3D;
255  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP6_PARAM_OFFSET)) = se0_param;
256 
257  sa0_param = __gen_SA_TEMPLATE_v1();
258  sa0_param.ICNT0 = seCnt11; /* Input buffer must be at least 64
259  * elements long even though
260  * numPoints*numChannels = 32 */
261 
262  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
263  sa0_param.DIMFMT = __SA_DIMFMT_1D;
264  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP6_PARAM_OFFSET)) = sa0_param;
265 
  /* LOOP7: same geometry as LOOP5 (separate slot so both can be
   * loaded independently by the kernel). */
266  se0_param = __gen_SE_TEMPLATE_v1();
267  se0_param.ICNT0 = seCnt4;
268  se0_param.ICNT1 = elementSize;
269  se0_param.DIM1 = seCnt4;
270  se0_param.ICNT2 = numChannels;
271  se0_param.DIM2 = numPoints;
272 
273  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
274  se0_param.TRANSPOSE = __SE_TRANSPOSE_64BIT;
275  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
276  se0_param.DIMFMT = __SE_DIMFMT_3D;
277  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP7_PARAM_OFFSET)) = se0_param;
278  }
279  return (status);
280 }
281 
282 /* Scale and Conjugate input buffer in-place */
283 static inline c7x::cfloat_vec
284 ifft_i32fc_o32fc_scaleAndConjugate(c7x::cfloat_vec in, c7x::float_vec scaleVec, c7x::ulong_vec xorVec)
285 {
286  return (c7x::as_cfloat_vec(scaleVec * c7x::as_float_vec(c7x::as_ulong_vec(in) ^ xorVec)));
287 }
288 
/* In-place pass over 'size' complex floats at pX: each element is
 * multiplied by 1/numPoints and XORed with 'xorVec' (caller supplies the
 * mask that flips the imaginary sign bit, i.e. conjugation).  Uses the
 * SE/SA templates that ifft_i32fc_o32fc_conjugate_init_ci stored at the
 * *_CONJ_LOOP_PARAM_OFFSET slots of pBlock.  Must not be reordered:
 * SE0/SA0 open-stream-close sequencing is load-bearing. */
289 static inline void
290 ifft_i32fc_o32fc_conjugate_exec_ci(void *pX, c7x::ulong_vec xorVec, uint32_t size, uint32_t numPoints, void *pBlock)
291 {
292 
293  typedef typename c7x::cfloat_vec CV;
294  typedef CV *CVP;
295 
296  __SE_TEMPLATE_v1 se0_param = __gen_SE_TEMPLATE_v1();
297  __SA_TEMPLATE_v1 sa0_param = __gen_SA_TEMPLATE_v1();
298  cfloat *restrict pXLocal = (cfloat *) pX;
299 
  /* Reload the templates pre-computed by the init function. */
300  se0_param = *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_CONJ_LOOP_PARAM_OFFSET));
301 
302  sa0_param = *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_CONJ_LOOP_PARAM_OFFSET));
303 
304  __SE0_OPEN(pX, se0_param);
305  __SA0_OPEN(sa0_param);
306 
  /* One iteration per full vector of complex floats.  NOTE(review):
   * 'size' not a multiple of the vector lane count would leave a tail
   * unprocessed here -- presumably callers guarantee divisibility;
   * confirm against the checkParams routine. */
307  uint32_t i = 0;
308  uint32_t loopCount = (size) / c7x::element_count_of<c7x::cfloat_vec>::value;
309  c7x::cfloat_vec regIn, regStore;
310  float scale = 1.0f / numPoints;
311  c7x::float_vec scaleVec = __vload_dup(&scale);
312 
313  __vpred tmp;
314  CV *addr;
315 
316  for (i = 0; i < loopCount; i++) {
  /* Fetch next vector via SE0, transform, store back through SA0's
   * predicated address stream (in-place update of pX). */
317  regIn = c7x::strm_eng<0, c7x::cfloat_vec>::get_adv();
318  /* FFTLIB_debugPrintFloatVector (c7x::as_float_vec (regIn)); */
319 
320  regStore = ifft_i32fc_o32fc_scaleAndConjugate(regIn, scaleVec, xorVec);
321  /* FFTLIB_debugPrintFloatVector (c7x::as_float_vec (regStore)); */
322 
323  tmp = c7x::strm_agen<0, CV>::get_vpred();
324  addr = c7x::strm_agen<0, CV>::get_adv(&pXLocal[0]);
325  __vstore_pred(tmp, addr, regStore);
326  }
327 
328  __SE0_CLOSE();
329  __SA0_CLOSE();
330 }
331 
334  FFTLIB_bufParams1D_t *bufParamsX,
335  FFTLIB_F32 *pW,
336  FFTLIB_bufParams1D_t *bufParamsW,
337  FFTLIB_F32 *pY,
338  FFTLIB_bufParams1D_t *bufParamsY,
339  uint32_t numPoints,
340  uint32_t numChannels,
341  void *pBlock)
342 {
343  uint32_t k, l;
344  FFTLIB_STATUS status = FFTLIB_SUCCESS;
345  uint32_t numPointsPerDft;
346  uint32_t numLeadingZeros;
347  uint32_t offsetBitReverse;
348  uint32_t seCnt1, seCnt2, seCnt3, seCnt6;
349 
350  __SE_TEMPLATE_v1 se0_param;
351  __SE_TEMPLATE_v1 se1_param;
352  __SA_TEMPLATE_v1 sa0_param;
353 
354  cfloat *restrict pXLocal;
355  cfloat *restrict pYLocal;
356  cfloat *restrict pWLocal;
357  cfloat *restrict pY0;
358  cfloat *restrict pY1;
359  cfloat *restrict pY2;
360  cfloat *restrict pY3;
361  cfloat *restrict pY4;
362  cfloat *restrict pY5;
363  cfloat *restrict pY6;
364  cfloat *restrict pY7;
365 
366  typedef typename c7x::cfloat_vec CV;
367  typedef CV *CVP;
368 
369  /* typedef typename c7x::float_vec V; */
370  /* typedef V* VP; */
371 
372  CV vX_0, vX_N_4, vX_N_2, vX_3N_4;
373  CV vSum1, vSum2, vDiff1, vDiff2;
374  CV vTwX1, vTwX2, vTwX3;
375  CV vX0Temp, vX1Temp, vX2Temp, vX3Temp;
376  CV vX0, vX1, vX2, vX3;
377  CV vX_0_1, vX_N_4_1, vX_N_2_1, vX_3N_4_1;
378  CV vSum1_1, vSum2_1, vDiff1_1, vDiff2_1;
379  CV vX0_1, vX1_1, vX2_1, vX3_1;
380  CV vX0_2PtDft_1, vX0_2PtDft_2;
381  CV vX1_2PtDft_1, vX1_2PtDft_2;
382  CV vX2_2PtDft_1, vX2_2PtDft_2;
383  CV vX3_2PtDft_1, vX3_2PtDft_2;
384  CV vX01_lo, vX23_lo, vX01_hi, vX23_hi;
385  cfloat twTemp;
386 
387 #ifdef FFTLIB_CHECK_PARAMS
388  /* status = FFTLIB_ifft1dBatched_i32fc_c32fc_o32fc_checkParams ( */
389  /* pX, bufParamsX, pW, bufParamsW, pY, bufParamsY, numPoints,
390  * numChannels, */
391  /* pBlock); */
392  if (status == FFTLIB_SUCCESS)
393 #endif
394  {
395  numPointsPerDft = numPoints;
396 
397  float scale = 1.0f;
398  c7x::float_vec scaleVec = __vload_dup(&scale);
399 
400  /* Set xor vector to flip sign bit of imaginary component of cfloat pair */
401 #if defined(_HOST_BUILD)
402  c7x::ulong_vec xorVec = (c7x::ulong_vec)(0x0000000080000000);
403 
404 #else
405  c7x::ulong_vec xorVec = (0x0000000080000000);
406 #endif
407 
408  /* Scale by 1/N and conjugate for batched IFFT, then follow batched FFT implementation */
409  ifft_i32fc_o32fc_conjugate_exec_ci((void *) pX, xorVec, (numPointsPerDft * numChannels), numPointsPerDft, pBlock);
410 
411  se0_param = *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP1_PARAM_OFFSET));
412  se1_param = *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_TWID_PARAM_OFFSET));
413  sa0_param = *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP1_PARAM_OFFSET));
414  seCnt1 = numPointsPerDft >> 2;
415  seCnt2 = numPointsPerDft >> 4;
416  seCnt3 = 1;
417 
418  pXLocal = (cfloat *) pX;
419  pWLocal = (cfloat *) pW;
420  pYLocal = (cfloat *) pY;
421 
422  while (numPointsPerDft >= 16) {
423 
424  seCnt6 = seCnt3 * numChannels;
425  se0_param.ICNT1 = 4;
426  se0_param.DIM1 = seCnt1; /* 4 quarters(Offsets: 0, N/4, N/2, 3N/4) */
427  se0_param.ICNT2 = seCnt2;
428  se0_param.DIM2 = 4; /* Number of 8-point fetches within each quarter */
429  se0_param.ICNT3 = seCnt6;
430  se0_param.DIM3 = numPointsPerDft; /* Number of DFT's for all channels */
431  __SE0_OPEN((void *) pXLocal, se0_param);
432 
433  se1_param.ICNT1 = 3;
434  se1_param.DIM1 = seCnt1; /* Twiddle factors for x1, x2 and x3 */
435  se1_param.ICNT2 = seCnt2; /* Number of 8-point fetches within each quarter*/
436  se1_param.DIM2 = 4;
437  se1_param.ICNT3 = seCnt6; /* Number of DFT's for all channels */
438  se1_param.DIM3 = 0;
439  __SE1_OPEN((void *) pWLocal, se1_param);
440 
441  sa0_param.ICNT1 = 4;
442  sa0_param.DIM1 = seCnt1; /* Save to each of the 4 quarters */
443  sa0_param.ICNT2 = seCnt2;
444  sa0_param.DIM2 = 4;
445  /* Number of 8-point stores within each quarter */
446  sa0_param.ICNT3 = seCnt6;
447  sa0_param.DIM3 = numPointsPerDft; /* Number of DFT's */
448  __SA0_OPEN(sa0_param);
449 
450  /* Loop is unrolled twice for better optimization */
451  for (k = 0; k < numPoints * numChannels; k += 32) {
452 
453  /* First iteration of loop unroll */
454  vX_0 = c7x::strm_eng<0, CV>::get_adv();
455  vX_N_4 = c7x::strm_eng<0, CV>::get_adv();
456  vX_N_2 = c7x::strm_eng<0, CV>::get_adv();
457  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv();
458 
459  vSum1 = vX_0 + vX_N_2;
460  vSum2 = vX_N_4 + vX_3N_4;
461  vDiff1 = vX_0 - vX_N_2;
462  vDiff2 = vX_N_4 - vX_3N_4;
463 
464  vTwX1 = c7x::strm_eng<1, CV>::get_adv();
465  vTwX2 = c7x::strm_eng<1, CV>::get_adv();
466  vTwX3 = c7x::strm_eng<1, CV>::get_adv();
467 
468  vX0Temp = vSum1 + vSum2;
469  vX1Temp = vDiff1 - __vcrot90sp_vv(vDiff2);
470  vX2Temp = vSum1 - vSum2;
471  vX3Temp = vDiff1 + __vcrot90sp_vv(vDiff2);
472 
473  vX0 = vX0Temp;
474  vX1 = __complex_multiply(vX1Temp, vTwX1);
475  vX2 = __complex_multiply(vX2Temp, vTwX2);
476  vX3 = __complex_multiply(vX3Temp, vTwX3);
477 
478  __vpred tmp;
479  CVP addr;
480 
481  tmp = c7x::strm_agen<0, CV>::get_vpred();
482  addr = c7x::strm_agen<0, CV>::get_adv(pXLocal);
483  __vstore_pred(tmp, addr, vX0);
484 
485  tmp = c7x::strm_agen<0, CV>::get_vpred();
486  addr = c7x::strm_agen<0, CV>::get_adv(pXLocal);
487  __vstore_pred(tmp, addr, vX2);
488 
489  tmp = c7x::strm_agen<0, CV>::get_vpred();
490  addr = c7x::strm_agen<0, CV>::get_adv(pXLocal);
491  __vstore_pred(tmp, addr, vX1);
492 
493  tmp = c7x::strm_agen<0, CV>::get_vpred();
494  addr = c7x::strm_agen<0, CV>::get_adv(pXLocal);
495  __vstore_pred(tmp, addr, vX3);
496 
497  /* Second iteration of loop unroll */
498  vX_0 = c7x::strm_eng<0, CV>::get_adv();
499  vX_N_4 = c7x::strm_eng<0, CV>::get_adv();
500  vX_N_2 = c7x::strm_eng<0, CV>::get_adv();
501  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv();
502 
503  vSum1 = vX_0 + vX_N_2;
504  vSum2 = vX_N_4 + vX_3N_4;
505  vDiff1 = vX_0 - vX_N_2;
506  vDiff2 = vX_N_4 - vX_3N_4;
507 
508  vTwX1 = c7x::strm_eng<1, CV>::get_adv();
509  vTwX2 = c7x::strm_eng<1, CV>::get_adv();
510  vTwX3 = c7x::strm_eng<1, CV>::get_adv();
511 
512  vX0Temp = vSum1 + vSum2;
513  vX1Temp = vDiff1 - __vcrot90sp_vv(vDiff2);
514  vX2Temp = vSum1 - vSum2;
515  vX3Temp = vDiff1 + __vcrot90sp_vv(vDiff2);
516 
517  vX0 = vX0Temp;
518  vX1 = __complex_multiply(vX1Temp, vTwX1);
519  vX2 = __complex_multiply(vX2Temp, vTwX2);
520  vX3 = __complex_multiply(vX3Temp, vTwX3);
521 
522  tmp = c7x::strm_agen<0, CV>::get_vpred();
523  addr = c7x::strm_agen<0, CV>::get_adv(pXLocal);
524  __vstore_pred(tmp, addr, vX0);
525 
526  tmp = c7x::strm_agen<0, CV>::get_vpred();
527  addr = c7x::strm_agen<0, CV>::get_adv(pXLocal);
528  __vstore_pred(tmp, addr, vX2);
529 
530  tmp = c7x::strm_agen<0, CV>::get_vpred();
531  addr = c7x::strm_agen<0, CV>::get_adv(pXLocal);
532  __vstore_pred(tmp, addr, vX1);
533 
534  tmp = c7x::strm_agen<0, CV>::get_vpred();
535  addr = c7x::strm_agen<0, CV>::get_adv(pXLocal);
536  __vstore_pred(tmp, addr, vX3);
537  }
538  __SA0_CLOSE();
539  __SE0_CLOSE();
540  __SE1_CLOSE();
541 
542  numPointsPerDft >>= 2;
543  pWLocal += numPointsPerDft * 3;
544  seCnt1 >>= 2;
545  seCnt2 >>= 2;
546  seCnt3 <<= 2;
547  }
548 
549  if (numPointsPerDft == 16) {
550  /* 16-point stage */
551  se0_param = *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP2_PARAM_OFFSET));
552  __SE0_OPEN((void *) pXLocal, se0_param);
553  __SE1_OPEN((void *) (pXLocal + 8), se0_param);
554 
555  sa0_param = *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP2_PARAM_OFFSET));
556  __SA0_OPEN(sa0_param);
557 
558  vTwX1 = *((CVP) pWLocal);
559  vTwX2 = *((CVP) (pWLocal + 4));
560  vTwX3 = *((CVP) (pWLocal + 8));
561 
562 #if __C7X_HOSTEM__
563  vTwX1 = CV(vTwX1.lo(), vTwX1.lo());
564  vTwX2 = CV(vTwX2.lo(), vTwX2.lo());
565  vTwX3 = CV(vTwX3.lo(), vTwX3.lo());
566 #else
567  vTwX1 = (CV) (vTwX1.lo(), vTwX1.lo());
568  vTwX2 = (CV) (vTwX2.lo(), vTwX2.lo());
569  vTwX3 = (CV) (vTwX3.lo(), vTwX3.lo());
570 #endif
571 
572  for (k = 0; k < numPoints * numChannels; k += 32) {
573  vX_0 = c7x::strm_eng<0, CV>::get_adv();
574  vX_N_4 = c7x::strm_eng<0, CV>::get_adv();
575  vX_N_2 = c7x::strm_eng<1, CV>::get_adv();
576  vX_3N_4 = c7x::strm_eng<1, CV>::get_adv();
577 
578  vSum1 = vX_0 + vX_N_2;
579  vSum2 = vX_N_4 + vX_3N_4;
580  vDiff1 = vX_0 - vX_N_2;
581  vDiff2 = vX_N_4 - vX_3N_4;
582 
583  vX0Temp = vSum1 + vSum2;
584  vX1Temp = vDiff1 - __vcrot90sp_vv(vDiff2);
585  vX2Temp = vSum1 - vSum2;
586  vX3Temp = vDiff1 + __vcrot90sp_vv(vDiff2);
587 
588  vX0 = vX0Temp;
589  vX1 = __complex_multiply(vX1Temp, vTwX1);
590  vX2 = __complex_multiply(vX2Temp, vTwX2);
591  vX3 = __complex_multiply(vX3Temp, vTwX3);
592 
593 #if __C7X_HOSTEM__
594  __vpred tmp = c7x::strm_agen<0, CV>::get_vpred();
595  CVP addr;
596  addr = c7x::strm_agen<0, CV>::get_adv(pXLocal);
597  __vstore_pred(tmp, addr, CV(vX0.lo(), vX2.lo()));
598 
599  tmp = c7x::strm_agen<0, CV>::get_vpred();
600  addr = c7x::strm_agen<0, CV>::get_adv(pXLocal);
601  __vstore_pred(tmp, addr, CV(vX1.lo(), vX3.lo()));
602 
603  tmp = c7x::strm_agen<0, CV>::get_vpred();
604  addr = c7x::strm_agen<0, CV>::get_adv(pXLocal);
605  __vstore_pred(tmp, addr, CV(vX0.hi(), vX2.hi()));
606 
607  tmp = c7x::strm_agen<0, CV>::get_vpred();
608  addr = c7x::strm_agen<0, CV>::get_adv(pXLocal);
609  __vstore_pred(tmp, addr, CV(vX1.hi(), vX3.hi()));
610 #else
611  __vpred tmp = c7x::strm_agen<0, CV>::get_vpred();
612  CVP addr;
613  addr = c7x::strm_agen<0, CV>::get_adv(pXLocal);
614  __vstore_pred(tmp, addr, (CV) (vX0.lo(), vX2.lo()));
615 
616  tmp = c7x::strm_agen<0, CV>::get_vpred();
617  addr = c7x::strm_agen<0, CV>::get_adv(pXLocal);
618  __vstore_pred(tmp, addr, (CV) (vX1.lo(), vX3.lo()));
619 
620  tmp = c7x::strm_agen<0, CV>::get_vpred();
621  addr = c7x::strm_agen<0, CV>::get_adv(pXLocal);
622  __vstore_pred(tmp, addr, (CV) (vX0.hi(), vX2.hi()));
623 
624  tmp = c7x::strm_agen<0, CV>::get_vpred();
625  addr = c7x::strm_agen<0, CV>::get_adv(pXLocal);
626  __vstore_pred(tmp, addr, (CV) (vX1.hi(), vX3.hi()));
627 #endif
628  }
629  __SA0_CLOSE();
630  __SE0_CLOSE();
631  __SE1_CLOSE();
632  }
633  else {
634 #if 0
635  /* 32-point stage */
636  se0_param = *(
637  (__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP3_PARAM_OFFSET));
638  __SE0_OPEN ((void *) pXLocal, se0_param);
639 
640  sa0_param = *(
641  (__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP3_PARAM_OFFSET));
642  __SA0_OPEN (sa0_param);
643 
644  vTwX1 = *((CVP) pWLocal);
645  vTwX2 = *((CVP) (pWLocal + 8));
646  vTwX3 = *((CVP) (pWLocal + 16));
647 
648  for (k = 0; k < numPoints * numChannels; k += 32) {
649  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
650  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
651  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
652  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
653 
654  vSum1 = vX_0 + vX_N_2;
655  vSum2 = vX_N_4 + vX_3N_4;
656  vDiff1 = vX_0 - vX_N_2;
657  vDiff2 = vX_N_4 - vX_3N_4;
658 
659  vX0Temp = vSum1 + vSum2;
660  vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
661  vX2Temp = vSum1 - vSum2;
662  vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);
663 
664  vX0 = vX0Temp;
665  vX1 = __complex_multiply (vX1Temp, vTwX1);
666  vX2 = __complex_multiply (vX2Temp, vTwX2);
667  vX3 = __complex_multiply (vX3Temp, vTwX3);
668 
669  __vpred tmp = c7x::strm_agen<0, CV>::get_vpred ();
670  CVP addr;
671  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
672  __vstore_pred (tmp, addr, vX0);
673 
674  tmp = c7x::strm_agen<0, CV>::get_vpred ();
675  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
676  __vstore_pred (tmp, addr, vX2);
677 
678  tmp = c7x::strm_agen<0, CV>::get_vpred ();
679  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
680  __vstore_pred (tmp, addr, vX1);
681 
682  tmp = c7x::strm_agen<0, CV>::get_vpred ();
683  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
684  __vstore_pred (tmp, addr, vX3);
685  }
686  __SE0_CLOSE ();
687  __SA0_CLOSE ();
688 #endif // if 0
689  }
690 
691  /* numPointsPerDft >>= 2; */
692  /* pWLocal += numPointsPerDft * 3; */
693 
694  if (numPointsPerDft == 4) {
695  /* 4-point stage with bit-reversal */
696 
697  if (numPoints == 16) {
698 #if 0
699 // clang-format off
700 #if __C7X_HOSTEM__
701  c7x::uchar_vec vXPermCtrl = c7x::uchar_vec(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
702  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
703  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
704  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
705  0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
706  0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
707  0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
708  0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F);
709 #else
710  c7x::uchar_vec vXPermCtrl = (c7x::uchar_vec)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
711  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
712  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
713  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
714  0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
715  0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
716  0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
717  0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F);
718 #endif
719  // clang-format on
720 
721  se0_param = *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock +
723  __SE0_OPEN ((void *) pXLocal, se0_param);
724 
725  sa0_param = *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock +
727  __SA0_OPEN (sa0_param);
728 
729  for (k = 0; k < numChannels << 4; k += 32) {
730  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
731  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
732  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
733  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
734 
735  vSum1 = vX_0 + vX_N_2;
736  vSum2 = vX_N_4 + vX_3N_4;
737  vDiff1 = vX_0 - vX_N_2;
738  vDiff2 = vX_N_4 - vX_3N_4;
739 
740  vX0 = vSum1 + vSum2;
741  vX1 = vDiff1 - __vcrot90sp_vv (vDiff2);
742  vX2 = vSum1 - vSum2;
743  vX3 = vDiff1 + __vcrot90sp_vv (vDiff2);
744 
745  vX01_lo = c7x::as_cfloat_vec (
746  __vpermll_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX1),
747  c7x::as_uchar_vec (vX0)));
748  vX23_lo = c7x::as_cfloat_vec (
749  __vpermll_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX3),
750  c7x::as_uchar_vec (vX2)));
751  vX01_hi = c7x::as_cfloat_vec (
752  __vpermhh_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX1),
753  c7x::as_uchar_vec (vX0)));
754  vX23_hi = c7x::as_cfloat_vec (
755  __vpermhh_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX3),
756  c7x::as_uchar_vec (vX2)));
757 
758  __vpred tmp = c7x::strm_agen<0, CV>::get_vpred ();
759  CVP addr;
760  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
761  __vstore_pred (tmp, addr, vX01_lo);
762 
763  tmp = c7x::strm_agen<0, CV>::get_vpred ();
764  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
765  __vstore_pred (tmp, addr, vX23_lo);
766 
767  tmp = c7x::strm_agen<0, CV>::get_vpred ();
768  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
769  __vstore_pred (tmp, addr, vX01_hi);
770 
771  tmp = c7x::strm_agen<0, CV>::get_vpred ();
772  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
773  __vstore_pred (tmp, addr, vX23_hi);
774  }
775  __SE0_CLOSE ();
776  __SA0_CLOSE ();
777 #endif // #if 0
778  }
779  else {
780  se0_param = *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP5_PARAM_OFFSET));
781  __SE0_OPEN((void *) pXLocal, se0_param);
782 
783  numLeadingZeros = __norm((int32_t) (numPoints - 1)) + 1;
784 
785  /* pY0 = &pYLocal[0x00000000u]; */
786  /* pY1 = &pYLocal[0x40000000u >> numLeadingZeros]; */
787  /* pY2 = &pYLocal[0x80000000u >> numLeadingZeros]; */
788  /* pY3 = &pYLocal[0xC0000000u >> numLeadingZeros]; */
789 
790  pY0 = (cfloat *) (pY + 0);
791  pY1 = (cfloat *) (pY + ((0x40000000u >> numLeadingZeros) << 1));
792  pY2 = (cfloat *) (pY + ((0x80000000u >> numLeadingZeros) << 1));
793  pY3 = (cfloat *) (pY + ((0xC0000000u >> numLeadingZeros) << 1));
794 
795 #ifdef CL7X_HE_CFLOAT_PTR_BUG
796  float *myPY0 = (float *) pY0;
797  float *myPY1 = (float *) pY1;
798  float *myPY2 = (float *) pY2;
799  float *myPY3 = (float *) pY3;
800 #endif
801 
802  for (l = 0; l < numChannels; l++) {
803  for (k = 0; k < numPoints >> 2; k += 4) {
804  offsetBitReverse = __bit_reverse(k) >> numLeadingZeros;
805 
806  vX_0 = c7x::strm_eng<0, CV>::get_adv();
807  vX_N_4 = c7x::strm_eng<0, CV>::get_adv();
808  vX_N_2 = c7x::strm_eng<0, CV>::get_adv();
809  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv();
810 
811  vSum1 = vX_0 + vX_N_2;
812  vSum2 = vX_N_4 + vX_3N_4;
813  vDiff1 = vX_0 - vX_N_2;
814  vDiff2 = vX_N_4 - vX_3N_4;
815 
816  vX0 = vSum1 + vSum2;
817  vX1 = vDiff1 - __vcrot90sp_vv(vDiff2);
818  vX2 = vSum1 - vSum2;
819  vX3 = vDiff1 + __vcrot90sp_vv(vDiff2);
820 
821  vX0 = ifft_i32fc_o32fc_scaleAndConjugate(vX0, scaleVec, xorVec);
822  vX1 = ifft_i32fc_o32fc_scaleAndConjugate(vX1, scaleVec, xorVec);
823  vX2 = ifft_i32fc_o32fc_scaleAndConjugate(vX2, scaleVec, xorVec);
824  vX3 = ifft_i32fc_o32fc_scaleAndConjugate(vX3, scaleVec, xorVec);
825 
826  __vstore_reverse_bit((CVP) (pY0 + offsetBitReverse), vX0);
827  __vstore_reverse_bit((CVP) (pY1 + offsetBitReverse), vX1);
828  __vstore_reverse_bit((CVP) (pY2 + offsetBitReverse), vX2);
829  __vstore_reverse_bit((CVP) (pY3 + offsetBitReverse), vX3);
830  }
831 
832 #ifdef CL7X_HE_CFLOAT_PTR_BUG
833  myPY0 += (numPoints << 1);
834  myPY1 += (numPoints << 1);
835  myPY2 += (numPoints << 1);
836  myPY3 += (numPoints << 1);
837 
838  pY0 = (cfloat *) myPY0;
839  pY1 = (cfloat *) myPY1;
840  pY2 = (cfloat *) myPY2;
841  pY3 = (cfloat *) myPY3;
842 
843 #else
844  pY0 += numPoints;
845  pY1 += numPoints;
846  pY2 += numPoints;
847  pY3 += numPoints;
848 #endif
849  }
850  __SE0_CLOSE();
851  }
852  }
853  else {
854  /* 4-point stage followed by 2-point stage with bit-reversal */
855 
856 #if __C7X_HOSTEM__
857  pWLocal += 1;
858  twTemp = *pWLocal;
859  vTwX1 = CV(twTemp, twTemp, twTemp, twTemp);
860  pWLocal += 2;
861  twTemp = *pWLocal;
862  vTwX2 = CV(twTemp, twTemp, twTemp, twTemp);
863  pWLocal += 2;
864  twTemp = *pWLocal;
865  vTwX3 = CV(twTemp, twTemp, twTemp, twTemp);
866 #else
867  pWLocal += 1;
868  twTemp = *pWLocal;
869  vTwX1 = (CV) (twTemp, twTemp, twTemp, twTemp);
870  pWLocal += 2;
871  twTemp = *pWLocal;
872  vTwX2 = (CV) (twTemp, twTemp, twTemp, twTemp);
873  pWLocal += 2;
874  twTemp = *pWLocal;
875  vTwX3 = (CV) (twTemp, twTemp, twTemp, twTemp);
876 
877 #endif
878 
879 #if 0
880  if (numPoints == 32) {
881 
882 // clang-format off
883  #if __C7X_HOSTEM__
884  c7x::uchar_vec vXPermCtrl = c7x::uchar_vec(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
885  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
886  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
887  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
888  0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
889  0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
890  0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
891  0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F);
892  #else
893  c7x::uchar_vec vXPermCtrl = (c7x::uchar_vec)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
894  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
895  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
896  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
897  0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
898  0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
899  0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
900  0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F);
901  #endif
902  // clang-format on
903  CV vX01_2PtDft_1_lo, vX23_2PtDft_1_lo, vX01_2PtDft_2_lo,
904  vX23_2PtDft_2_lo;
905  CV vX01_2PtDft_1_hi, vX23_2PtDft_1_hi, vX01_2PtDft_2_hi,
906  vX23_2PtDft_2_hi;
907 
908  se0_param = *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock +
910  __SE0_OPEN ((void *) pXLocal, se0_param);
911 
912  sa0_param = *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock +
914  __SA0_OPEN (sa0_param);
915 
916  for (k = 0; k < numChannels << 5; k += 64) {
917  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
918  vX_0_1 = c7x::strm_eng<0, CV>::get_adv ();
919  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
920  vX_N_4_1 = c7x::strm_eng<0, CV>::get_adv ();
921  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
922  vX_N_2_1 = c7x::strm_eng<0, CV>::get_adv ();
923  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
924  vX_3N_4_1 = c7x::strm_eng<0, CV>::get_adv ();
925 
926  vSum1 = vX_0 + vX_N_2;
927  vSum2 = vX_N_4 + vX_3N_4;
928  vDiff1 = vX_0 - vX_N_2;
929  vDiff2 = vX_N_4 - vX_3N_4;
930 
931  vX0 = vSum1 + vSum2;
932  vX1 = vDiff1 - __vcrot90sp_vv (vDiff2);
933  vX2 = vSum1 - vSum2;
934  vX3 = vDiff1 + __vcrot90sp_vv (vDiff2);
935 
936  vSum1_1 = vX_0_1 + vX_N_2_1;
937  vSum2_1 = vX_N_4_1 + vX_3N_4_1;
938  vDiff1_1 = vX_0_1 - vX_N_2_1;
939  vDiff2_1 = vX_N_4_1 - vX_3N_4_1;
940 
941  vX0Temp = vSum1_1 + vSum2_1;
942  vX1Temp = vDiff1_1 - __vcrot90sp_vv (vDiff2_1);
943  vX2Temp = vSum1_1 - vSum2_1;
944  vX3Temp = vDiff1_1 + __vcrot90sp_vv (vDiff2_1);
945 
946  vX0_1 = vX0Temp;
947  vX1_1 = __complex_multiply (vX1Temp, vTwX1);
948  vX2_1 = __complex_multiply (vX2Temp, vTwX2);
949  vX3_1 = __complex_multiply (vX3Temp, vTwX3);
950 
951  vX0_2PtDft_1 = vX0 + vX0_1;
952  vX0_2PtDft_2 = vX0 - vX0_1;
953  vX1_2PtDft_1 = vX1 + vX1_1;
954  vX1_2PtDft_2 = vX1 - vX1_1;
955  vX2_2PtDft_1 = vX2 + vX2_1;
956  vX2_2PtDft_2 = vX2 - vX2_1;
957  vX3_2PtDft_1 = vX3 + vX3_1;
958  vX3_2PtDft_2 = vX3 - vX3_1;
959 
960  vX0_2PtDft_1 = ifft_i32fc_o32fc_scaleAndConjugate(vX0_2PtDft_1, scaleVec, xorVec);
961  vX0_2PtDft_2 = ifft_i32fc_o32fc_scaleAndConjugate(vX0_2PtDft_2, scaleVec, xorVec);
962  vX1_2PtDft_1 = ifft_i32fc_o32fc_scaleAndConjugate(vX1_2PtDft_1, scaleVec, xorVec);
963  vX1_2PtDft_2 = ifft_i32fc_o32fc_scaleAndConjugate(vX1_2PtDft_2, scaleVec, xorVec);
964  vX2_2PtDft_1 = ifft_i32fc_o32fc_scaleAndConjugate(vX2_2PtDft_1, scaleVec, xorVec);
965  vX2_2PtDft_2 = ifft_i32fc_o32fc_scaleAndConjugate(vX2_2PtDft_2, scaleVec, xorVec);
966  vX3_2PtDft_1 = ifft_i32fc_o32fc_scaleAndConjugate(vX3_2PtDft_1, scaleVec, xorVec);
967  vX3_2PtDft_2 = ifft_i32fc_o32fc_scaleAndConjugate(vX3_2PtDft_2, scaleVec, xorVec);
968 
969  /* In place of __vstore_reverse_bit, we used permute + vstore_pred */
970  /* Permute to obtain bit-reversal order */
971  vX01_2PtDft_1_lo = c7x::as_cfloat_vec (
972  __vpermll_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX1_2PtDft_1),
973  c7x::as_uchar_vec (vX0_2PtDft_1)));
974  vX23_2PtDft_1_lo = c7x::as_cfloat_vec (
975  __vpermll_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX3_2PtDft_1),
976  c7x::as_uchar_vec (vX2_2PtDft_1)));
977  vX01_2PtDft_2_lo = c7x::as_cfloat_vec (
978  __vpermll_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX1_2PtDft_2),
979  c7x::as_uchar_vec (vX0_2PtDft_2)));
980  vX23_2PtDft_2_lo = c7x::as_cfloat_vec (
981  __vpermll_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX3_2PtDft_2),
982  c7x::as_uchar_vec (vX2_2PtDft_2)));
983  vX01_2PtDft_1_hi = c7x::as_cfloat_vec (
984  __vpermhh_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX1_2PtDft_1),
985  c7x::as_uchar_vec (vX0_2PtDft_1)));
986  vX23_2PtDft_1_hi = c7x::as_cfloat_vec (
987  __vpermhh_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX3_2PtDft_1),
988  c7x::as_uchar_vec (vX2_2PtDft_1)));
989  vX01_2PtDft_2_hi = c7x::as_cfloat_vec (
990  __vpermhh_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX1_2PtDft_2),
991  c7x::as_uchar_vec (vX0_2PtDft_2)));
992  vX23_2PtDft_2_hi = c7x::as_cfloat_vec (
993  __vpermhh_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX3_2PtDft_2),
994  c7x::as_uchar_vec (vX2_2PtDft_2)));
995 
996 
997  __vpred tmp = c7x::strm_agen<0, CV>::get_vpred ();
998  CVP addr;
999  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
1000  __vstore_pred (tmp, addr, vX01_2PtDft_1_lo);
1001 
1002  tmp = c7x::strm_agen<0, CV>::get_vpred ();
1003  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
1004  __vstore_pred (tmp, addr, vX23_2PtDft_1_lo);
1005 
1006  tmp = c7x::strm_agen<0, CV>::get_vpred ();
1007  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
1008  __vstore_pred (tmp, addr, vX01_2PtDft_2_lo);
1009 
1010  tmp = c7x::strm_agen<0, CV>::get_vpred ();
1011  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
1012  __vstore_pred (tmp, addr, vX23_2PtDft_2_lo);
1013 
1014  tmp = c7x::strm_agen<0, CV>::get_vpred ();
1015  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
1016  __vstore_pred (tmp, addr, vX01_2PtDft_1_hi);
1017 
1018  tmp = c7x::strm_agen<0, CV>::get_vpred ();
1019  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
1020  __vstore_pred (tmp, addr, vX23_2PtDft_1_hi);
1021 
1022  tmp = c7x::strm_agen<0, CV>::get_vpred ();
1023  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
1024  __vstore_pred (tmp, addr, vX01_2PtDft_2_hi);
1025 
1026  tmp = c7x::strm_agen<0, CV>::get_vpred ();
1027  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
1028  __vstore_pred (tmp, addr, vX23_2PtDft_2_hi);
1029  }
1030  __SE0_CLOSE ();
1031  __SA0_CLOSE ();
1032 
1033  }
1034 #endif // if (numPoints == 32)
1035  /* else */
1036  {
1037  se0_param = *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP7_PARAM_OFFSET));
1038  __SE0_OPEN((void *) pXLocal, se0_param);
1039 
1040  numLeadingZeros = __norm((int32_t) (numPoints - 1)) + 1;
1041 
1042  pY0 = (cfloat *) (pY + (0x00000000u));
1043  pY1 = (cfloat *) (pY + ((0x80000000u >> numLeadingZeros) << 1));
1044  pY2 = (cfloat *) (pY + ((0x20000000u >> numLeadingZeros) << 1));
1045  pY3 = (cfloat *) (pY + ((0xA0000000u >> numLeadingZeros) << 1));
1046  pY4 = (cfloat *) (pY + ((0x40000000u >> numLeadingZeros) << 1));
1047  pY5 = (cfloat *) (pY + ((0xC0000000u >> numLeadingZeros) << 1));
1048  pY6 = (cfloat *) (pY + ((0x60000000u >> numLeadingZeros) << 1));
1049  pY7 = (cfloat *) (pY + ((0xE0000000u >> numLeadingZeros) << 1));
1050 
1051 #ifdef CL7X_HE_CFLOAT_PTR_BUG
1052  float *myPY0 = (float *) pY0;
1053  float *myPY1 = (float *) pY1;
1054  float *myPY2 = (float *) pY2;
1055  float *myPY3 = (float *) pY3;
1056  float *myPY4 = (float *) pY4;
1057  float *myPY5 = (float *) pY5;
1058  float *myPY6 = (float *) pY6;
1059  float *myPY7 = (float *) pY7;
1060 #endif
1061 
1062  for (l = 0; l < numChannels; l++) {
1063  for (k = 0; k < numPoints >> 2; k += 8) {
1064  offsetBitReverse = __bit_reverse(k) >> numLeadingZeros;
1065 
1066  vX_0 = c7x::strm_eng<0, CV>::get_adv();
1067  vX_0_1 = c7x::strm_eng<0, CV>::get_adv();
1068  vX_N_4 = c7x::strm_eng<0, CV>::get_adv();
1069  vX_N_4_1 = c7x::strm_eng<0, CV>::get_adv();
1070  vX_N_2 = c7x::strm_eng<0, CV>::get_adv();
1071  vX_N_2_1 = c7x::strm_eng<0, CV>::get_adv();
1072  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv();
1073  vX_3N_4_1 = c7x::strm_eng<0, CV>::get_adv();
1074 
1075  vSum1 = vX_0 + vX_N_2;
1076  vSum2 = vX_N_4 + vX_3N_4;
1077  vDiff1 = vX_0 - vX_N_2;
1078  vDiff2 = vX_N_4 - vX_3N_4;
1079 
1080  vX0 = vSum1 + vSum2;
1081  vX1 = vDiff1 - __vcrot90sp_vv(vDiff2);
1082  vX2 = vSum1 - vSum2;
1083  vX3 = vDiff1 + __vcrot90sp_vv(vDiff2);
1084 
1085  vSum1_1 = vX_0_1 + vX_N_2_1;
1086  vSum2_1 = vX_N_4_1 + vX_3N_4_1;
1087  vDiff1_1 = vX_0_1 - vX_N_2_1;
1088  vDiff2_1 = vX_N_4_1 - vX_3N_4_1;
1089 
1090  vX0Temp = vSum1_1 + vSum2_1;
1091  vX1Temp = vDiff1_1 - __vcrot90sp_vv(vDiff2_1);
1092  vX2Temp = vSum1_1 - vSum2_1;
1093  vX3Temp = vDiff1_1 + __vcrot90sp_vv(vDiff2_1);
1094 
1095  vX0_1 = vX0Temp;
1096  vX1_1 = __complex_multiply(vX1Temp, vTwX1);
1097  vX2_1 = __complex_multiply(vX2Temp, vTwX2);
1098  vX3_1 = __complex_multiply(vX3Temp, vTwX3);
1099 
1100  vX0_2PtDft_1 = vX0 + vX0_1;
1101  vX0_2PtDft_2 = vX0 - vX0_1;
1102  vX1_2PtDft_1 = vX1 + vX1_1;
1103  vX1_2PtDft_2 = vX1 - vX1_1;
1104  vX2_2PtDft_1 = vX2 + vX2_1;
1105  vX2_2PtDft_2 = vX2 - vX2_1;
1106  vX3_2PtDft_1 = vX3 + vX3_1;
1107  vX3_2PtDft_2 = vX3 - vX3_1;
1108 
1109  vX0_2PtDft_1 = ifft_i32fc_o32fc_scaleAndConjugate(vX0_2PtDft_1, scaleVec, xorVec);
1110  vX0_2PtDft_2 = ifft_i32fc_o32fc_scaleAndConjugate(vX0_2PtDft_2, scaleVec, xorVec);
1111  vX1_2PtDft_1 = ifft_i32fc_o32fc_scaleAndConjugate(vX1_2PtDft_1, scaleVec, xorVec);
1112  vX1_2PtDft_2 = ifft_i32fc_o32fc_scaleAndConjugate(vX1_2PtDft_2, scaleVec, xorVec);
1113  vX2_2PtDft_1 = ifft_i32fc_o32fc_scaleAndConjugate(vX2_2PtDft_1, scaleVec, xorVec);
1114  vX2_2PtDft_2 = ifft_i32fc_o32fc_scaleAndConjugate(vX2_2PtDft_2, scaleVec, xorVec);
1115  vX3_2PtDft_1 = ifft_i32fc_o32fc_scaleAndConjugate(vX3_2PtDft_1, scaleVec, xorVec);
1116  vX3_2PtDft_2 = ifft_i32fc_o32fc_scaleAndConjugate(vX3_2PtDft_2, scaleVec, xorVec);
1117 
1118  __vstore_reverse_bit((CVP) (pY0 + offsetBitReverse), vX0_2PtDft_1);
1119  __vstore_reverse_bit((CVP) (pY1 + offsetBitReverse), vX0_2PtDft_2);
1120  __vstore_reverse_bit((CVP) (pY2 + offsetBitReverse), vX1_2PtDft_1);
1121  __vstore_reverse_bit((CVP) (pY3 + offsetBitReverse), vX1_2PtDft_2);
1122  __vstore_reverse_bit((CVP) (pY4 + offsetBitReverse), vX2_2PtDft_1);
1123  __vstore_reverse_bit((CVP) (pY5 + offsetBitReverse), vX2_2PtDft_2);
1124  __vstore_reverse_bit((CVP) (pY6 + offsetBitReverse), vX3_2PtDft_1);
1125  __vstore_reverse_bit((CVP) (pY7 + offsetBitReverse), vX3_2PtDft_2);
1126  }
1127 
1128 #ifdef CL7X_HE_CFLOAT_PTR_BUG
1129  myPY0 += (numPoints << 1);
1130  myPY1 += (numPoints << 1);
1131  myPY2 += (numPoints << 1);
1132  myPY3 += (numPoints << 1);
1133  myPY4 += (numPoints << 1);
1134  myPY5 += (numPoints << 1);
1135  myPY6 += (numPoints << 1);
1136  myPY7 += (numPoints << 1);
1137 
1138  pY0 = (cfloat *) myPY0;
1139  pY1 = (cfloat *) myPY1;
1140  pY2 = (cfloat *) myPY2;
1141  pY3 = (cfloat *) myPY3;
1142  pY4 = (cfloat *) myPY4;
1143  pY5 = (cfloat *) myPY5;
1144  pY6 = (cfloat *) myPY6;
1145  pY7 = (cfloat *) myPY7;
1146 
1147 #else
1148  pY0 += numPoints;
1149  pY1 += numPoints;
1150  pY2 += numPoints;
1151  pY3 += numPoints;
1152  pY4 += numPoints;
1153  pY5 += numPoints;
1154  pY6 += numPoints;
1155  pY7 += numPoints;
1156 #endif
1157  }
1158  __SE0_CLOSE();
1159  }
1160  }
1161  }
1162 
1163  return (status);
1164 }
1165 
1166 #if (!defined(FFTLIB_REMOVE_CHECK_PARAMS) && !defined(FFTLIB_IFFT1DBATCHED_I32FC_C32FC_O32FC_REMOVE_CHECK_PARAMS)) || \
1167  (defined(FFTLIB_CHECK_PARAMS)) || (defined(FFTLIB_IFFT1DBATCHED_I32FC_C32FC_O32FC_CHECK_PARAMS))
1168 
1170  FFTLIB_bufParams1D_t *bufParamsX,
1171  FFTLIB_F32 *pW,
1172  FFTLIB_bufParams1D_t *bufParamsW,
1173  FFTLIB_F32 *pY,
1174  FFTLIB_bufParams1D_t *bufParamsY,
1175  uint32_t numPoints,
1176  uint32_t numChannels,
1177  void *pBlock)
1178 {
1179  FFTLIB_STATUS status = FFTLIB_SUCCESS;
1180 
1181  if ((pX == NULL) || (pW == NULL) || (pY == NULL) || (pBlock == NULL)) {
1182  status = FFTLIB_ERR_NULL_POINTER;
1183  }
1184  else if (bufParamsX->dim_x != bufParamsY->dim_x) {
1186  }
1187  else if (bufParamsX->dim_x < numPoints * numChannels * 2) {
1188  /* In general, dim_x == numPoints*numChannels*2. However,
1189  * optimized kernel requires dim_x to be atleast 64*2. Hence, for
1190  * small values of numPoints*numChannels, dim_x could be greater
1191  * than numPoints*numChannels*2 */
1193  }
1194  else if (bufParamsX->dim_x < 64 * 2) {
1196  }
1197  else if (bufParamsW->dim_x != numPoints * 2) {
1199  }
1200  else if ((bufParamsX->data_type != FFTLIB_FLOAT32) || (bufParamsW->data_type != FFTLIB_FLOAT32) ||
1201  (bufParamsY->data_type != FFTLIB_FLOAT32)) {
1202  status = FFTLIB_ERR_INVALID_TYPE;
1203  }
1204  else if (((uint64_t) pX) & 0xFu) { /* pX must be 16-byte aligned for a */
1205  status = FFTLIB_ERR_NOT_ALIGNED_PTRS_STRIDES; /* streaming engine
1206  configuration */
1207  }
1208  else {
1209  /* Check if number of pts is a power of 2 */
1210  uint32_t k = 0;
1211  while (k < 32) {
1212  if (numPoints & (1u << k)) {
1213  break;
1214  }
1215  k++;
1216  }
1217  if ((1u << k) != numPoints) {
1219  }
1220 
1221  if ((numChannels != 1) && (numChannels != 2) && (numChannels != 4) && (numChannels != 8) && (numChannels != 16)) {
1223  }
1224  }
1225  return (status);
1226 }
1227 
1228 #endif
@ FFTLIB_FLOAT32
c7x::cfloat_vec CV
FFTLIB_STATUS_NAME
The enumeration of all status codes.
Definition: FFTLIB_types.h:172
@ FFTLIB_ERR_INVALID_TYPE
Definition: FFTLIB_types.h:176
@ FFTLIB_ERR_NULL_POINTER
Definition: FFTLIB_types.h:178
@ FFTLIB_ERR_INVALID_DIMENSION
Definition: FFTLIB_types.h:177
@ FFTLIB_SUCCESS
Definition: FFTLIB_types.h:173
@ FFTLIB_ERR_NOT_ALIGNED_PTRS_STRIDES
Definition: FFTLIB_types.h:181
float FFTLIB_F32
Single precision floating point.
Definition: FFTLIB_types.h:169
static void ifft_i32fc_o32fc_conjugate_exec_ci(void *pX, c7x::ulong_vec xorVec, uint32_t size, uint32_t numPoints, void *pBlock)
static c7x::cfloat_vec ifft_i32fc_o32fc_scaleAndConjugate(c7x::cfloat_vec in, c7x::float_vec scaleVec, c7x::ulong_vec xorVec)
static void ifft_i32fc_o32fc_conjugate_init_ci(void *pX, uint32_t size, void *pBlock)
FFTLIB_STATUS FFTLIB_ifft1dBatched_i32fc_c32fc_o32fc_kernel(FFTLIB_F32 *pX, FFTLIB_bufParams1D_t *bufParamsX, FFTLIB_F32 *pW, FFTLIB_bufParams1D_t *bufParamsW, FFTLIB_F32 *pY, FFTLIB_bufParams1D_t *bufParamsY, uint32_t numPoints, uint32_t numChannels, void *pBlock)
This function is the main kernel compute function.
FFTLIB_STATUS FFTLIB_ifft1dBatched_i32fc_c32fc_o32fc_init(FFTLIB_F32 *pX, FFTLIB_bufParams1D_t *bufParamsX, FFTLIB_F32 *pW, FFTLIB_bufParams1D_t *bufParamsW, FFTLIB_F32 *pY, FFTLIB_bufParams1D_t *bufParamsY, uint32_t numPoints, uint32_t numChannels, void *pBlock)
This function should be called before the FFTLIB_ifft1dBatched_i32fc_c32fc_o32fc_kernel function is c...
FFTLIB_STATUS FFTLIB_ifft1dBatched_i32fc_c32fc_o32fc_checkParams(FFTLIB_F32 *pX, FFTLIB_bufParams1D_t *bufParamsX, FFTLIB_F32 *pW, FFTLIB_bufParams1D_t *bufParamsW, FFTLIB_F32 *pY, FFTLIB_bufParams1D_t *bufParamsY, uint32_t numPoints, uint32_t numChannels, void *pBlock)
This function checks the validity of the parameters passed to FFTLIB_ifft1dBatched_i32fc_c32fc_o32fc_...
A structure for a 1 dimensional buffer descriptor.
uint32_t data_type
Values are of type FFTLIB_data_type_e.
uint32_t dim_x
Width of buffer in X dimension in elements.