FFTLIB User Guide
c7504/FFTLIB_fft1dBatched_i32fc_c32fc_o32fc_ci.cpp
Go to the documentation of this file.
1 /*******************************************************************************
2 **+--------------------------------------------------------------------------+**
3 **| **** |**
4 **| **** |**
5 **| ******o*** |**
6 **| ********_///_**** |**
7 **| ***** /_//_/ **** |**
8 **| ** ** (__/ **** |**
9 **| ********* |**
10 **| **** |**
11 **| *** |**
12 **| |**
13 **| Copyright (c) 2017 Texas Instruments Incorporated |**
14 **| ALL RIGHTS RESERVED |**
15 **| |**
16 **| Permission to use, copy, modify, or distribute this software, |**
17 **| whether in part or in whole, for any purpose is forbidden without |**
18 **| a signed licensing agreement and NDA from Texas Instruments |**
19 **| Incorporated (TI). |**
20 **| |**
21 **| TI makes no representation or warranties with respect to the |**
22 **| performance of this computer program, and specifically disclaims |**
23 **| any responsibility for any damages, special or consequential, |**
24 **| connected with the use of this program. |**
25 **| |**
26 **+--------------------------------------------------------------------------+**
27 *******************************************************************************/
28 
29 #include "../FFTLIB_fft1dBatched_i32fc_c32fc_o32fc.h"
30 
/* Compile-time debug tracing switch: set TRACE_ON to 1 to pull in the
 * vector-print helper and stdio for printf-style tracing; 0 (default)
 * keeps the optimized build free of any I/O dependencies. */
31 #define TRACE_ON (0)
32 
33 #if TRACE_ON
34 #include "../../../common/printv.h"
35 #include <stdio.h>
36 #endif
37 
38 // CODE_SECTION(FFTLIB_fft1dBatched_i32fc_c32fc_o32fc, ".text:optimized")
39 // CODE_SECTION(FFTLIB_fft1dBatched_i32fc_c32fc_o32fc_core, ".text:optimized")
40 // CODE_SECTION(FFTLIB_fft1dBatched_i32fc_c32fc_o32fc_checkParams,
41 // ".text:optimized")
42 
/* Byte layout of the pre-computed template area inside pBlock (filled by the
 * init routine below, re-loaded by the kernel): seven streaming-engine (SE)
 * loop templates, one SE twiddle-factor template, then the stream-address-
 * generator (SA) loop templates, packed back-to-back.
 * NOTE(review): SE_PARAM_SIZE / SA_PARAM_SIZE come from the included FFTLIB
 * header -- not visible in this listing; confirm they match
 * sizeof(__SE_TEMPLATE_v1) / sizeof(__SA_TEMPLATE_v1). */
43 #define SE_PARAM_BASE (0x0000)
44 #define SE_LOOP1_PARAM_OFFSET (SE_PARAM_BASE)
45 #define SE_LOOP2_PARAM_OFFSET (SE_LOOP1_PARAM_OFFSET + SE_PARAM_SIZE)
46 #define SE_LOOP3_PARAM_OFFSET (SE_LOOP2_PARAM_OFFSET + SE_PARAM_SIZE)
47 #define SE_LOOP4_PARAM_OFFSET (SE_LOOP3_PARAM_OFFSET + SE_PARAM_SIZE)
48 #define SE_LOOP5_PARAM_OFFSET (SE_LOOP4_PARAM_OFFSET + SE_PARAM_SIZE)
49 #define SE_LOOP6_PARAM_OFFSET (SE_LOOP5_PARAM_OFFSET + SE_PARAM_SIZE)
50 #define SE_LOOP7_PARAM_OFFSET (SE_LOOP6_PARAM_OFFSET + SE_PARAM_SIZE)
51 #define SE_TWID_PARAM_OFFSET (SE_LOOP7_PARAM_OFFSET + SE_PARAM_SIZE)
52 #define SA_LOOP1_PARAM_OFFSET (SE_TWID_PARAM_OFFSET + SE_PARAM_SIZE)
53 #define SA_LOOP2_PARAM_OFFSET (SA_LOOP1_PARAM_OFFSET + SA_PARAM_SIZE)
54 #define SA_LOOP3_PARAM_OFFSET (SA_LOOP2_PARAM_OFFSET + SA_PARAM_SIZE)
55 #define SA_LOOP4_PARAM_OFFSET (SA_LOOP3_PARAM_OFFSET + SA_PARAM_SIZE)
/* No SA_LOOP5 offset: loop 5 appears to store through bit-reversed vector
 * stores rather than an SA stream -- TODO confirm against the kernel body. */
56 #define SA_LOOP6_PARAM_OFFSET (SA_LOOP4_PARAM_OFFSET + SA_PARAM_SIZE)
57 
60  FFTLIB_bufParams1D_t *bufParamsX,
61  FFTLIB_F32 *pW,
62  FFTLIB_bufParams1D_t *bufParamsW,
63  FFTLIB_F32 *pY,
64  FFTLIB_bufParams1D_t *bufParamsY,
65  uint32_t numPoints,
66  uint32_t numChannels,
67  void *pBlock)
68 {
70 
71 #if defined(FFTLIB_CHECK_PARAMS) || \
72  defined(FFTLIB_FFT1DBATCHED_I32FC_C32FC_O32FC_CHECK_PARAMS)
73  /* status = FFTLIB_fft1dBatched_i32fc_c32fc_o32fc_checkParams ( */
74  /* pX, bufParamsX, pW, bufParamsW, pY, bufParamsY, numPoints,
75  * numChannels, */
76  /* pBlock); */
77  if (status == FFTLIB_SUCCESS)
78 #endif
79  {
80  uint32_t numPointsPerDft;
81  uint32_t seCnt1, seCnt2, seCnt3, seCnt4;
82  uint32_t seCnt6, seCnt7, seCnt8, seCnt9, seCnt10;
83  uint32_t seCnt11;
84  __SE_TEMPLATE_v1 se0_param = __gen_SE_TEMPLATE_v1 ();
85  __SE_TEMPLATE_v1 se1_param = __gen_SE_TEMPLATE_v1 ();
86  __SA_TEMPLATE_v1 sa0_param = __gen_SA_TEMPLATE_v1 ();
87 
88  numPointsPerDft = numPoints;
89  seCnt1 = numPoints >> 2;
90  seCnt2 = numPoints >> 4;
91  seCnt3 = 1;
92  seCnt4 = numPoints >> 2;
93  seCnt6 = seCnt3 * numChannels;
94  seCnt7 =
95  (numPoints * numChannels >> 4) > 1 ? numPoints * numChannels >> 4 : 1;
96  seCnt8 = numPoints * numChannels;
97  seCnt9 = (numPoints * numChannels > 16) ? numPoints * numChannels : 16;
98  seCnt10 =
99  (numPoints * numChannels >> 5) > 1 ? numPoints * numChannels >> 5 : 1;
100  seCnt11 = (numPoints * numChannels > 32) ? numPoints * numChannels : 32;
101 
102  uint32_t elementSize = c7x::element_count_of<c7x::cfloat_vec>::value;
103 
104  se0_param = __gen_SE_TEMPLATE_v1 ();
105  se0_param.ICNT0 = elementSize;
106  se0_param.ICNT1 = 4;
107  se0_param.DIM1 = seCnt1; /* 4 quarters(Offsets: 0, N/4, N/2, 3N/4) */
108  se0_param.ICNT2 = seCnt2; /* Number of 8-point fetches within each */
109  se0_param.DIM2 = elementSize; /* quarter */
110  se0_param.ICNT3 = seCnt6; /* Number of DFT's for all channels */
111  se0_param.DIM3 = numPointsPerDft;
112 
113  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
114  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
115  se0_param.DIMFMT = __SE_DIMFMT_4D;
116  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP1_PARAM_OFFSET)) =
117  se0_param;
118 
119  se1_param = __gen_SE_TEMPLATE_v1 ();
120  se1_param.ICNT0 = elementSize;
121  se1_param.ICNT1 = 3;
122  se1_param.DIM1 = seCnt1; /* Twiddle factors for x1, x2 and x3 */
123  se1_param.ICNT2 = seCnt2; /* Number of 8-point fetches within each */
124  se1_param.DIM2 = elementSize;
125  se1_param.ICNT3 = seCnt6; /* Number of DFT's for all channels */
126  se1_param.DIM3 = 0;
127 
128  se1_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
129  se1_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
130  se1_param.DIMFMT = __SE_DIMFMT_4D;
131  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_TWID_PARAM_OFFSET)) =
132  se1_param;
133 
134  sa0_param = __gen_SA_TEMPLATE_v1 ();
135  sa0_param.ICNT0 = elementSize;
136  sa0_param.ICNT1 = 4;
137  sa0_param.DIM1 = seCnt1; /* Save to each of the 4 quarters */
138  sa0_param.ICNT2 = seCnt2; /* Number of 8-point stores within each */
139  sa0_param.DIM2 = elementSize;
140  sa0_param.ICNT3 = seCnt6;
141  sa0_param.DIM3 = numPointsPerDft; /* Number of DFT's for all channels */
142 
143  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
144  sa0_param.DIMFMT = __SA_DIMFMT_4D;
145  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP1_PARAM_OFFSET)) =
146  sa0_param;
147 
148  se0_param = __gen_SE_TEMPLATE_v1 ();
149  se0_param.ICNT0 = elementSize;
150  se0_param.ICNT1 = 1;
151  se0_param.DIM1 = 16; /* Process two 16-point DFTs in one shot */
152  se0_param.ICNT2 = seCnt7;
153  se0_param.DIM2 = 16;
154 
155  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
156  /* se0_param.TRANSPOSE = */
157  /* __SE_TRANSPOSE_256BIT; /\* Using 256BIT transpose required *\/ */
158  /* /\* 16-byte alignment on pX *\/ */
159  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
160  se0_param.DIMFMT = __SE_DIMFMT_3D;
161  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP2_PARAM_OFFSET)) =
162  se0_param;
163 
164  sa0_param = __gen_SA_TEMPLATE_v1 ();
165  sa0_param.ICNT0 = seCnt8; /* Input buffer must be at least 32
166  * elements long even though
167  * numPoints*numChannels = 16 */
168 
169  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
170  sa0_param.DIMFMT = __SA_DIMFMT_1D;
171  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP2_PARAM_OFFSET)) =
172  sa0_param;
173 
174  se0_param = __gen_SE_TEMPLATE_v1 ();
175  se0_param.ICNT0 = seCnt8;
176 
177  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
178  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
179  se0_param.DIMFMT = __SE_DIMFMT_1D;
180  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP3_PARAM_OFFSET)) =
181  se0_param;
182 
183  sa0_param = __gen_SA_TEMPLATE_v1 ();
184  sa0_param.ICNT0 = seCnt8;
185 
186  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
187  sa0_param.DIMFMT = __SA_DIMFMT_1D;
188  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP3_PARAM_OFFSET)) =
189  sa0_param;
190 
191  /* The following SE configuration may cause sub-optimal
192  * tile in SE because second row of tile starts in the
193  * middle of first row */
194  se0_param = __gen_SE_TEMPLATE_v1 ();
195  se0_param.ICNT0 = 4;
196  se0_param.ICNT1 = 8;
197  se0_param.DIM1 = 4;
198  se0_param.ICNT2 = seCnt7;
199  se0_param.DIM2 = 32;
200 
201  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
202  se0_param.TRANSPOSE = __SE_TRANSPOSE_64BIT;
203  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
204  se0_param.DIMFMT = __SE_DIMFMT_3D;
205  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP4_PARAM_OFFSET)) =
206  se0_param;
207 
208  sa0_param = __gen_SA_TEMPLATE_v1 ();
209  sa0_param.ICNT0 = seCnt9; /* Input buffer must be at least 32
210  * elements long even though
211  * numPoints*numChannels = 16 */
212 
213  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
214  sa0_param.DIMFMT = __SA_DIMFMT_1D;
215  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP4_PARAM_OFFSET)) =
216  sa0_param;
217 
218  se0_param = __gen_SE_TEMPLATE_v1 ();
219  se0_param.ICNT0 = seCnt4; /* Fetch consecutive four points for DFT */
220  se0_param.ICNT1 = elementSize;
221  se0_param.DIM1 = seCnt4;
222  se0_param.ICNT2 = numChannels;
223  se0_param.DIM2 = numPoints;
224 
225  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
226  se0_param.TRANSPOSE = __SE_TRANSPOSE_64BIT;
227  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
228  se0_param.DIMFMT = __SE_DIMFMT_3D;
229  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP5_PARAM_OFFSET)) =
230  se0_param;
231 
232  se0_param = __gen_SE_TEMPLATE_v1 ();
233  se0_param.ICNT0 = 8;
234  se0_param.ICNT1 = 8;
235  se0_param.DIM1 = 8;
236  se0_param.ICNT2 = seCnt10;
237  se0_param.DIM2 = 64;
238 
239  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
240  se0_param.TRANSPOSE = __SE_TRANSPOSE_64BIT;
241  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
242  se0_param.DIMFMT = __SE_DIMFMT_3D;
243  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP6_PARAM_OFFSET)) =
244  se0_param;
245 
246  sa0_param = __gen_SA_TEMPLATE_v1 ();
247  sa0_param.ICNT0 = seCnt11; /* Input buffer must be at least 64
248  * elements long even though
249  * numPoints*numChannels = 32 */
250 
251  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
252  sa0_param.DIMFMT = __SA_DIMFMT_1D;
253  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP6_PARAM_OFFSET)) =
254  sa0_param;
255 
256  se0_param = __gen_SE_TEMPLATE_v1 ();
257  se0_param.ICNT0 = seCnt4;
258  se0_param.ICNT1 = elementSize;
259  se0_param.DIM1 = seCnt4;
260  se0_param.ICNT2 = numChannels;
261  se0_param.DIM2 = numPoints;
262 
263  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
264  se0_param.TRANSPOSE = __SE_TRANSPOSE_64BIT;
265  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
266  se0_param.DIMFMT = __SE_DIMFMT_3D;
267  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP7_PARAM_OFFSET)) =
268  se0_param;
269  }
270  return (status);
271 }
272 
275  FFTLIB_bufParams1D_t *bufParamsX,
276  FFTLIB_F32 *pW,
277  FFTLIB_bufParams1D_t *bufParamsW,
278  FFTLIB_F32 *pY,
279  FFTLIB_bufParams1D_t *bufParamsY,
280  uint32_t numPoints,
281  uint32_t numChannels,
282  void *pBlock)
283 {
284  uint32_t k, l;
285  FFTLIB_STATUS status = FFTLIB_SUCCESS;
286  uint32_t numPointsPerDft;
287  uint32_t numLeadingZeros;
288  uint32_t offsetBitReverse;
289  uint32_t seCnt1, seCnt2, seCnt3, seCnt6;
290 
291  __SE_TEMPLATE_v1 se0_param;
292  __SE_TEMPLATE_v1 se1_param;
293  __SA_TEMPLATE_v1 sa0_param;
294 
295  cfloat* restrict pXLocal;
296  cfloat* restrict pYLocal;
297  cfloat* restrict pWLocal;
298  cfloat* restrict pY0;
299  cfloat* restrict pY1;
300  cfloat* restrict pY2;
301  cfloat* restrict pY3;
302  cfloat* restrict pY4;
303  cfloat* restrict pY5;
304  cfloat* restrict pY6;
305  cfloat* restrict pY7;
306 
307  typedef typename c7x::cfloat_vec CV;
308  typedef CV* CVP;
309 
310  /* typedef typename c7x::float_vec V; */
311  /* typedef V* VP; */
312 
313  CV vX_0, vX_N_4, vX_N_2, vX_3N_4;
314  CV vSum1, vSum2, vDiff1, vDiff2;
315  CV vTwX1, vTwX2, vTwX3;
316  CV vX0Temp, vX1Temp, vX2Temp, vX3Temp;
317  CV vX0, vX1, vX2, vX3;
318  CV vX_0_1, vX_N_4_1, vX_N_2_1, vX_3N_4_1;
319  CV vSum1_1, vSum2_1, vDiff1_1, vDiff2_1;
320  CV vX0_1, vX1_1, vX2_1, vX3_1;
321  CV vX0_2PtDft_1, vX0_2PtDft_2;
322  CV vX1_2PtDft_1, vX1_2PtDft_2;
323  CV vX2_2PtDft_1, vX2_2PtDft_2;
324  CV vX3_2PtDft_1, vX3_2PtDft_2;
325  CV vX01_lo, vX23_lo, vX01_hi, vX23_hi;
326  cfloat twTemp;
327 
328 #ifdef FFTLIB_CHECK_PARAMS
329  /* status = FFTLIB_fft1dBatched_i32fc_c32fc_o32fc_checkParams ( */
330  /* pX, bufParamsX, pW, bufParamsW, pY, bufParamsY, numPoints,
331  * numChannels, */
332  /* pBlock); */
333  if (status == FFTLIB_SUCCESS)
334 #endif
335  {
336  numPointsPerDft = numPoints;
337 
338  se0_param =
339  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP1_PARAM_OFFSET));
340  se1_param =
341  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_TWID_PARAM_OFFSET));
342  sa0_param =
343  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP1_PARAM_OFFSET));
344  seCnt1 = numPointsPerDft >> 2;
345  seCnt2 = numPointsPerDft >> 4;
346  seCnt3 = 1;
347 
348  pXLocal = (cfloat*) pX;
349  pWLocal = (cfloat*) pW;
350  pYLocal = (cfloat*) pY;
351 
352  while (numPointsPerDft >= 16) {
353 
354  seCnt6 = seCnt3 * numChannels;
355  se0_param.ICNT1 = 4;
356  se0_param.DIM1 = seCnt1; /* 4 quarters(Offsets: 0, N/4, N/2, 3N/4) */
357  se0_param.ICNT2 = seCnt2;
358  se0_param.DIM2 = 4; /* Number of 8-point fetches within each quarter */
359  se0_param.ICNT3 = seCnt6;
360  se0_param.DIM3 =
361  numPointsPerDft; /* Number of DFT's for all channels */
362  __SE0_OPEN ((void *) pXLocal, se0_param);
363 
364  se1_param.ICNT1 = 3;
365  se1_param.DIM1 = seCnt1; /* Twiddle factors for x1, x2 and x3 */
366  se1_param.ICNT2 =
367  seCnt2; /* Number of 8-point fetches within each quarter*/
368  se1_param.DIM2 = 4;
369  se1_param.ICNT3 = seCnt6; /* Number of DFT's for all channels */
370  se1_param.DIM3 = 0;
371  __SE1_OPEN ((void *) pWLocal, se1_param);
372 
373  sa0_param.ICNT1 = 4;
374  sa0_param.DIM1 = seCnt1; /* Save to each of the 4 quarters */
375  sa0_param.ICNT2 = seCnt2;
376  sa0_param.DIM2 = 4;
377  /* Number of 8-point stores within each quarter */
378  sa0_param.ICNT3 = seCnt6;
379  sa0_param.DIM3 = numPointsPerDft; /* Number of DFT's */
380  __SA0_OPEN (sa0_param);
381 
382  /* Loop is unrolled twice for better optimization */
383  for (k = 0; k < numPoints * numChannels; k += 32) {
384 
385  /* First iteration of loop unroll */
386  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
387  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
388  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
389  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
390 
391  vSum1 = vX_0 + vX_N_2;
392  vSum2 = vX_N_4 + vX_3N_4;
393  vDiff1 = vX_0 - vX_N_2;
394  vDiff2 = vX_N_4 - vX_3N_4;
395 
396  vTwX1 = c7x::strm_eng<1, CV>::get_adv ();
397  vTwX2 = c7x::strm_eng<1, CV>::get_adv ();
398  vTwX3 = c7x::strm_eng<1, CV>::get_adv ();
399 
400  vX0Temp = vSum1 + vSum2;
401  vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
402  vX2Temp = vSum1 - vSum2;
403  vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);
404 
405  vX0 = vX0Temp;
406  vX1 = __complex_multiply (vX1Temp, vTwX1);
407  vX2 = __complex_multiply (vX2Temp, vTwX2);
408  vX3 = __complex_multiply (vX3Temp, vTwX3);
409 
410  __vpred tmp;
411  CVP addr;
412 
413  tmp = c7x::strm_agen<0, CV>::get_vpred ();
414  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
415  __vstore_pred (tmp, addr, vX0);
416 
417  tmp = c7x::strm_agen<0, CV>::get_vpred ();
418  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
419  __vstore_pred (tmp, addr, vX2);
420 
421  tmp = c7x::strm_agen<0, CV>::get_vpred ();
422  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
423  __vstore_pred (tmp, addr, vX1);
424 
425  tmp = c7x::strm_agen<0, CV>::get_vpred ();
426  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
427  __vstore_pred (tmp, addr, vX3);
428 
429  /* Second iteration of loop unroll */
430  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
431  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
432  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
433  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
434 
435  vSum1 = vX_0 + vX_N_2;
436  vSum2 = vX_N_4 + vX_3N_4;
437  vDiff1 = vX_0 - vX_N_2;
438  vDiff2 = vX_N_4 - vX_3N_4;
439 
440  vTwX1 = c7x::strm_eng<1, CV>::get_adv ();
441  vTwX2 = c7x::strm_eng<1, CV>::get_adv ();
442  vTwX3 = c7x::strm_eng<1, CV>::get_adv ();
443 
444  vX0Temp = vSum1 + vSum2;
445  vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
446  vX2Temp = vSum1 - vSum2;
447  vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);
448 
449  vX0 = vX0Temp;
450  vX1 = __complex_multiply (vX1Temp, vTwX1);
451  vX2 = __complex_multiply (vX2Temp, vTwX2);
452  vX3 = __complex_multiply (vX3Temp, vTwX3);
453 
454  tmp = c7x::strm_agen<0, CV>::get_vpred ();
455  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
456  __vstore_pred (tmp, addr, vX0);
457 
458  tmp = c7x::strm_agen<0, CV>::get_vpred ();
459  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
460  __vstore_pred (tmp, addr, vX2);
461 
462  tmp = c7x::strm_agen<0, CV>::get_vpred ();
463  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
464  __vstore_pred (tmp, addr, vX1);
465 
466  tmp = c7x::strm_agen<0, CV>::get_vpred ();
467  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
468  __vstore_pred (tmp, addr, vX3);
469  }
470  __SA0_CLOSE ();
471  __SE0_CLOSE ();
472  __SE1_CLOSE ();
473 
474  numPointsPerDft >>= 2;
475  pWLocal += numPointsPerDft * 3;
476  seCnt1 >>= 2;
477  seCnt2 >>= 2;
478  seCnt3 <<= 2;
479  }
480 
481  if (numPointsPerDft == 16) {
482  /* 16-point stage */
483  se0_param = *(
484  (__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP2_PARAM_OFFSET));
485  __SE0_OPEN ((void *) pXLocal, se0_param);
486  __SE1_OPEN ((void *) (pXLocal + 8), se0_param);
487 
488  sa0_param = *(
489  (__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP2_PARAM_OFFSET));
490  __SA0_OPEN (sa0_param);
491 
492  vTwX1 = *((CVP) pWLocal);
493  vTwX2 = *((CVP) (pWLocal + 4));
494  vTwX3 = *((CVP) (pWLocal + 8));
495 
496 #if __C7X_HOSTEM__
497  vTwX1 = CV (vTwX1.lo(), vTwX1.lo());
498  vTwX2 = CV (vTwX2.lo(), vTwX2.lo());
499  vTwX3 = CV (vTwX3.lo(), vTwX3.lo());
500 #else
501  vTwX1 = (CV) (vTwX1.lo(), vTwX1.lo());
502  vTwX2 = (CV) (vTwX2.lo(), vTwX2.lo());
503  vTwX3 = (CV) (vTwX3.lo(), vTwX3.lo());
504 #endif
505 
506  for (k = 0; k < numPoints * numChannels; k += 32) {
507  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
508  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
509  vX_N_2 = c7x::strm_eng<1, CV>::get_adv ();
510  vX_3N_4 = c7x::strm_eng<1, CV>::get_adv ();
511 
512  vSum1 = vX_0 + vX_N_2;
513  vSum2 = vX_N_4 + vX_3N_4;
514  vDiff1 = vX_0 - vX_N_2;
515  vDiff2 = vX_N_4 - vX_3N_4;
516 
517  vX0Temp = vSum1 + vSum2;
518  vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
519  vX2Temp = vSum1 - vSum2;
520  vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);
521 
522  vX0 = vX0Temp;
523  vX1 = __complex_multiply (vX1Temp, vTwX1);
524  vX2 = __complex_multiply (vX2Temp, vTwX2);
525  vX3 = __complex_multiply (vX3Temp, vTwX3);
526 
527 #if __C7X_HOSTEM__
528  __vpred tmp = c7x::strm_agen<0, CV>::get_vpred ();
529  CVP addr;
530  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
531  __vstore_pred (tmp, addr, CV (vX0.lo(), vX2.lo()));
532 
533  tmp = c7x::strm_agen<0, CV>::get_vpred ();
534  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
535  __vstore_pred (tmp, addr, CV (vX1.lo(), vX3.lo()));
536 
537  tmp = c7x::strm_agen<0, CV>::get_vpred ();
538  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
539  __vstore_pred (tmp, addr, CV (vX0.hi(), vX2.hi()));
540 
541  tmp = c7x::strm_agen<0, CV>::get_vpred ();
542  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
543  __vstore_pred (tmp, addr, CV (vX1.hi(), vX3.hi()));
544 #else
545  __vpred tmp = c7x::strm_agen<0, CV>::get_vpred ();
546  CVP addr;
547  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
548  __vstore_pred (tmp, addr, (CV) (vX0.lo(), vX2.lo()));
549 
550  tmp = c7x::strm_agen<0, CV>::get_vpred ();
551  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
552  __vstore_pred (tmp, addr, (CV) (vX1.lo(), vX3.lo()));
553 
554  tmp = c7x::strm_agen<0, CV>::get_vpred ();
555  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
556  __vstore_pred (tmp, addr, (CV) (vX0.hi(), vX2.hi()));
557 
558  tmp = c7x::strm_agen<0, CV>::get_vpred ();
559  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
560  __vstore_pred (tmp, addr, (CV) (vX1.hi(), vX3.hi()));
561 #endif
562  }
563  __SA0_CLOSE ();
564  __SE0_CLOSE ();
565  __SE1_CLOSE ();
566  }
567  else {
568 #if 0
569  /* 32-point stage */
570  se0_param = *(
571  (__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP3_PARAM_OFFSET));
572  __SE0_OPEN ((void *) pXLocal, se0_param);
573 
574  sa0_param = *(
575  (__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP3_PARAM_OFFSET));
576  __SA0_OPEN (sa0_param);
577 
578  vTwX1 = *((CVP) pWLocal);
579  vTwX2 = *((CVP) (pWLocal + 8));
580  vTwX3 = *((CVP) (pWLocal + 16));
581 
582  for (k = 0; k < numPoints * numChannels; k += 32) {
583  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
584  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
585  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
586  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
587 
588  vSum1 = vX_0 + vX_N_2;
589  vSum2 = vX_N_4 + vX_3N_4;
590  vDiff1 = vX_0 - vX_N_2;
591  vDiff2 = vX_N_4 - vX_3N_4;
592 
593  vX0Temp = vSum1 + vSum2;
594  vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
595  vX2Temp = vSum1 - vSum2;
596  vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);
597 
598  vX0 = vX0Temp;
599  vX1 = __complex_multiply (vX1Temp, vTwX1);
600  vX2 = __complex_multiply (vX2Temp, vTwX2);
601  vX3 = __complex_multiply (vX3Temp, vTwX3);
602 
603  __vpred tmp = c7x::strm_agen<0, CV>::get_vpred ();
604  CVP addr;
605  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
606  __vstore_pred (tmp, addr, vX0);
607 
608  tmp = c7x::strm_agen<0, CV>::get_vpred ();
609  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
610  __vstore_pred (tmp, addr, vX2);
611 
612  tmp = c7x::strm_agen<0, CV>::get_vpred ();
613  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
614  __vstore_pred (tmp, addr, vX1);
615 
616  tmp = c7x::strm_agen<0, CV>::get_vpred ();
617  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
618  __vstore_pred (tmp, addr, vX3);
619  }
620  __SE0_CLOSE ();
621  __SA0_CLOSE ();
622 #endif // if 0
623  }
624 
625  /* numPointsPerDft >>= 2; */
626  /* pWLocal += numPointsPerDft * 3; */
627 
628  if (numPointsPerDft == 4) {
629  /* 4-point stage with bit-reversal */
630 
631  if (numPoints == 16) {
632 #if 0
633 // clang-format off
634 #if __C7X_HOSTEM__
635  c7x::uchar_vec vXPermCtrl = c7x::uchar_vec(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
636  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
637  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
638  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
639  0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
640  0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
641  0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
642  0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F);
643 #else
644  c7x::uchar_vec vXPermCtrl = (c7x::uchar_vec)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
645  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
646  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
647  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
648  0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
649  0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
650  0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
651  0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F);
652 #endif
653  // clang-format on
654 
655  se0_param = *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock +
657  __SE0_OPEN ((void *) pXLocal, se0_param);
658 
659  sa0_param = *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock +
661  __SA0_OPEN (sa0_param);
662 
663  for (k = 0; k < numChannels << 4; k += 32) {
664  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
665  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
666  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
667  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
668 
669  vSum1 = vX_0 + vX_N_2;
670  vSum2 = vX_N_4 + vX_3N_4;
671  vDiff1 = vX_0 - vX_N_2;
672  vDiff2 = vX_N_4 - vX_3N_4;
673 
674  vX0 = vSum1 + vSum2;
675  vX1 = vDiff1 - __vcrot90sp_vv (vDiff2);
676  vX2 = vSum1 - vSum2;
677  vX3 = vDiff1 + __vcrot90sp_vv (vDiff2);
678 
679  vX01_lo = c7x::as_cfloat_vec (
680  __vpermll_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX1),
681  c7x::as_uchar_vec (vX0)));
682  vX23_lo = c7x::as_cfloat_vec (
683  __vpermll_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX3),
684  c7x::as_uchar_vec (vX2)));
685  vX01_hi = c7x::as_cfloat_vec (
686  __vpermhh_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX1),
687  c7x::as_uchar_vec (vX0)));
688  vX23_hi = c7x::as_cfloat_vec (
689  __vpermhh_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX3),
690  c7x::as_uchar_vec (vX2)));
691 
692  __vpred tmp = c7x::strm_agen<0, CV>::get_vpred ();
693  CVP addr;
694  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
695  __vstore_pred (tmp, addr, vX01_lo);
696 
697  tmp = c7x::strm_agen<0, CV>::get_vpred ();
698  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
699  __vstore_pred (tmp, addr, vX23_lo);
700 
701  tmp = c7x::strm_agen<0, CV>::get_vpred ();
702  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
703  __vstore_pred (tmp, addr, vX01_hi);
704 
705  tmp = c7x::strm_agen<0, CV>::get_vpred ();
706  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
707  __vstore_pred (tmp, addr, vX23_hi);
708  }
709  __SE0_CLOSE ();
710  __SA0_CLOSE ();
711 #endif // #if 0
712  }
713  else {
714  se0_param = *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock +
716  __SE0_OPEN ((void *) pXLocal, se0_param);
717 
718  numLeadingZeros = __norm ((int32_t) (numPoints - 1)) + 1;
719 
720  /* pY0 = &pYLocal[0x00000000u]; */
721  /* pY1 = &pYLocal[0x40000000u >> numLeadingZeros]; */
722  /* pY2 = &pYLocal[0x80000000u >> numLeadingZeros]; */
723  /* pY3 = &pYLocal[0xC0000000u >> numLeadingZeros]; */
724 
725  pY0 = (cfloat*) (pY + 0);
726  pY1 = (cfloat*) (pY + ((0x40000000u >> numLeadingZeros) << 1));
727  pY2 = (cfloat*) (pY + ((0x80000000u >> numLeadingZeros) << 1));
728  pY3 = (cfloat*) (pY + ((0xC0000000u >> numLeadingZeros) << 1));
729 
730 #ifdef CL7X_HE_CFLOAT_PTR_BUG
731  float *myPY0 = (float *) pY0;
732  float *myPY1 = (float *) pY1;
733  float *myPY2 = (float *) pY2;
734  float *myPY3 = (float *) pY3;
735 #endif
736 
737  for (l = 0; l < numChannels; l++) {
738  for (k = 0; k < numPoints >> 2; k += 4) {
739  offsetBitReverse = __bit_reverse (k) >> numLeadingZeros;
740 
741  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
742  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
743  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
744  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
745 
746  vSum1 = vX_0 + vX_N_2;
747  vSum2 = vX_N_4 + vX_3N_4;
748  vDiff1 = vX_0 - vX_N_2;
749  vDiff2 = vX_N_4 - vX_3N_4;
750 
751  vX0 = vSum1 + vSum2;
752  vX1 = vDiff1 - __vcrot90sp_vv (vDiff2);
753  vX2 = vSum1 - vSum2;
754  vX3 = vDiff1 + __vcrot90sp_vv (vDiff2);
755 
756  /* __vstore_reverse_bit ((CVP) &pY0[offsetBitReverse],
757  */
758  /* vX0); */
759  /* __vstore_reverse_bit ((CVP) &pY1[offsetBitReverse],
760  */
761  /* vX1); */
762  /* __vstore_reverse_bit ((CVP) &pY2[offsetBitReverse],
763  */
764  /* vX2); */
765  /* __vstore_reverse_bit ((CVP) &pY3[offsetBitReverse],
766  */
767  /* vX3); */
768 
769  __vstore_reverse_bit ((CVP) (pY0 + offsetBitReverse), vX0);
770  __vstore_reverse_bit ((CVP) (pY1 + offsetBitReverse), vX1);
771  __vstore_reverse_bit ((CVP) (pY2 + offsetBitReverse), vX2);
772  __vstore_reverse_bit ((CVP) (pY3 + offsetBitReverse), vX3);
773  }
774 
775 #ifdef CL7X_HE_CFLOAT_PTR_BUG
776  myPY0 += (numPoints << 1);
777  myPY1 += (numPoints << 1);
778  myPY2 += (numPoints << 1);
779  myPY3 += (numPoints << 1);
780 
781  pY0 = (cfloat*) myPY0;
782  pY1 = (cfloat*) myPY1;
783  pY2 = (cfloat*) myPY2;
784  pY3 = (cfloat*) myPY3;
785 
786 #else
787  pY0 += numPoints;
788  pY1 += numPoints;
789  pY2 += numPoints;
790  pY3 += numPoints;
791 #endif
792  }
793  __SE0_CLOSE ();
794  }
795  }
796  else {
797  /* 4-point stage followed by 2-point stage with bit-reversal */
798 
799 #if __C7X_HOSTEM__
800  pWLocal += 1;
801  twTemp = *pWLocal;
802  vTwX1 = CV (twTemp, twTemp, twTemp, twTemp);
803  pWLocal += 2;
804  twTemp = *pWLocal;
805  vTwX2 = CV (twTemp, twTemp, twTemp, twTemp);
806  pWLocal += 2;
807  twTemp = *pWLocal;
808  vTwX3 = CV (twTemp, twTemp, twTemp, twTemp);
809 #else
810  pWLocal += 1;
811  twTemp = *pWLocal;
812  vTwX1 = (CV) (twTemp, twTemp, twTemp, twTemp);
813  pWLocal += 2;
814  twTemp = *pWLocal;
815  vTwX2 = (CV) (twTemp, twTemp, twTemp, twTemp);
816  pWLocal += 2;
817  twTemp = *pWLocal;
818  vTwX3 = (CV) (twTemp, twTemp, twTemp, twTemp);
819 
820 #endif
821 
822 #if 0
823  if (numPoints == 32) {
824 
825 // clang-format off
826  #if __C7X_HOSTEM__
827  c7x::uchar_vec vXPermCtrl = c7x::uchar_vec(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
828  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
829  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
830  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
831  0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
832  0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
833  0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
834  0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F);
835  #else
836  c7x::uchar_vec vXPermCtrl = (c7x::uchar_vec)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
837  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
838  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
839  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
840  0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
841  0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
842  0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
843  0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F);
844  #endif
845  // clang-format on
846  CV vX01_2PtDft_1_lo, vX23_2PtDft_1_lo, vX01_2PtDft_2_lo,
847  vX23_2PtDft_2_lo;
848  CV vX01_2PtDft_1_hi, vX23_2PtDft_1_hi, vX01_2PtDft_2_hi,
849  vX23_2PtDft_2_hi;
850 
851  se0_param = *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock +
853  __SE0_OPEN ((void *) pXLocal, se0_param);
854 
855  sa0_param = *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock +
857  __SA0_OPEN (sa0_param);
858 
859  for (k = 0; k < numChannels << 5; k += 64) {
860  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
861  vX_0_1 = c7x::strm_eng<0, CV>::get_adv ();
862  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
863  vX_N_4_1 = c7x::strm_eng<0, CV>::get_adv ();
864  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
865  vX_N_2_1 = c7x::strm_eng<0, CV>::get_adv ();
866  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
867  vX_3N_4_1 = c7x::strm_eng<0, CV>::get_adv ();
868 
869  vSum1 = vX_0 + vX_N_2;
870  vSum2 = vX_N_4 + vX_3N_4;
871  vDiff1 = vX_0 - vX_N_2;
872  vDiff2 = vX_N_4 - vX_3N_4;
873 
874  vX0 = vSum1 + vSum2;
875  vX1 = vDiff1 - __vcrot90sp_vv (vDiff2);
876  vX2 = vSum1 - vSum2;
877  vX3 = vDiff1 + __vcrot90sp_vv (vDiff2);
878 
879  vSum1_1 = vX_0_1 + vX_N_2_1;
880  vSum2_1 = vX_N_4_1 + vX_3N_4_1;
881  vDiff1_1 = vX_0_1 - vX_N_2_1;
882  vDiff2_1 = vX_N_4_1 - vX_3N_4_1;
883 
884  vX0Temp = vSum1_1 + vSum2_1;
885  vX1Temp = vDiff1_1 - __vcrot90sp_vv (vDiff2_1);
886  vX2Temp = vSum1_1 - vSum2_1;
887  vX3Temp = vDiff1_1 + __vcrot90sp_vv (vDiff2_1);
888 
889  vX0_1 = vX0Temp;
890  vX1_1 = __complex_multiply (vX1Temp, vTwX1);
891  vX2_1 = __complex_multiply (vX2Temp, vTwX2);
892  vX3_1 = __complex_multiply (vX3Temp, vTwX3);
893 
894  vX0_2PtDft_1 = vX0 + vX0_1;
895  vX0_2PtDft_2 = vX0 - vX0_1;
896  vX1_2PtDft_1 = vX1 + vX1_1;
897  vX1_2PtDft_2 = vX1 - vX1_1;
898  vX2_2PtDft_1 = vX2 + vX2_1;
899  vX2_2PtDft_2 = vX2 - vX2_1;
900  vX3_2PtDft_1 = vX3 + vX3_1;
901  vX3_2PtDft_2 = vX3 - vX3_1;
902 
903  /* Permute to obtain bit-reversal order */
904  vX01_2PtDft_1_lo = c7x::as_cfloat_vec (
905  __vpermll_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX1_2PtDft_1),
906  c7x::as_uchar_vec (vX0_2PtDft_1)));
907  vX23_2PtDft_1_lo = c7x::as_cfloat_vec (
908  __vpermll_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX3_2PtDft_1),
909  c7x::as_uchar_vec (vX2_2PtDft_1)));
910  vX01_2PtDft_2_lo = c7x::as_cfloat_vec (
911  __vpermll_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX1_2PtDft_2),
912  c7x::as_uchar_vec (vX0_2PtDft_2)));
913  vX23_2PtDft_2_lo = c7x::as_cfloat_vec (
914  __vpermll_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX3_2PtDft_2),
915  c7x::as_uchar_vec (vX2_2PtDft_2)));
916  vX01_2PtDft_1_hi = c7x::as_cfloat_vec (
917  __vpermhh_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX1_2PtDft_1),
918  c7x::as_uchar_vec (vX0_2PtDft_1)));
919  vX23_2PtDft_1_hi = c7x::as_cfloat_vec (
920  __vpermhh_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX3_2PtDft_1),
921  c7x::as_uchar_vec (vX2_2PtDft_1)));
922  vX01_2PtDft_2_hi = c7x::as_cfloat_vec (
923  __vpermhh_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX1_2PtDft_2),
924  c7x::as_uchar_vec (vX0_2PtDft_2)));
925  vX23_2PtDft_2_hi = c7x::as_cfloat_vec (
926  __vpermhh_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX3_2PtDft_2),
927  c7x::as_uchar_vec (vX2_2PtDft_2)));
928 
929  __vpred tmp = c7x::strm_agen<0, CV>::get_vpred ();
930  CVP addr;
931  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
932  __vstore_pred (tmp, addr, vX01_2PtDft_1_lo);
933 
934  tmp = c7x::strm_agen<0, CV>::get_vpred ();
935  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
936  __vstore_pred (tmp, addr, vX23_2PtDft_1_lo);
937 
938  tmp = c7x::strm_agen<0, CV>::get_vpred ();
939  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
940  __vstore_pred (tmp, addr, vX01_2PtDft_2_lo);
941 
942  tmp = c7x::strm_agen<0, CV>::get_vpred ();
943  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
944  __vstore_pred (tmp, addr, vX23_2PtDft_2_lo);
945 
946  tmp = c7x::strm_agen<0, CV>::get_vpred ();
947  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
948  __vstore_pred (tmp, addr, vX01_2PtDft_1_hi);
949 
950  tmp = c7x::strm_agen<0, CV>::get_vpred ();
951  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
952  __vstore_pred (tmp, addr, vX23_2PtDft_1_hi);
953 
954  tmp = c7x::strm_agen<0, CV>::get_vpred ();
955  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
956  __vstore_pred (tmp, addr, vX01_2PtDft_2_hi);
957 
958  tmp = c7x::strm_agen<0, CV>::get_vpred ();
959  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
960  __vstore_pred (tmp, addr, vX23_2PtDft_2_hi);
961  }
962  __SE0_CLOSE ();
963  __SA0_CLOSE ();
964 
965  }
966 #endif // if (numPoints == 32)
967  /* else */
968  {
969  se0_param = *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock +
971  __SE0_OPEN ((void *) pXLocal, se0_param);
972 
973  numLeadingZeros = __norm ((int32_t) (numPoints - 1)) + 1;
974 
975  /* pY0 = &pYLocal[0x00000000u]; */
976  /* pY1 = &pYLocal[0x80000000u >> numLeadingZeros]; */
977  /* pY2 = &pYLocal[0x20000000u >> numLeadingZeros]; */
978  /* pY3 = &pYLocal[0xA0000000u >> numLeadingZeros]; */
979  /* pY4 = &pYLocal[0x40000000u >> numLeadingZeros]; */
980  /* pY5 = &pYLocal[0xC0000000u >> numLeadingZeros]; */
981  /* pY6 = &pYLocal[0x60000000u >> numLeadingZeros]; */
982  /* pY7 = &pYLocal[0xE0000000u >> numLeadingZeros]; */
983 
984  pY0 = (cfloat*) (pY + (0x00000000u));
985  pY1 = (cfloat*) (pY + ((0x80000000u >> numLeadingZeros) << 1));
986  pY2 = (cfloat*) (pY + ((0x20000000u >> numLeadingZeros) << 1));
987  pY3 = (cfloat*) (pY + ((0xA0000000u >> numLeadingZeros) << 1));
988  pY4 = (cfloat*) (pY + ((0x40000000u >> numLeadingZeros) << 1));
989  pY5 = (cfloat*) (pY + ((0xC0000000u >> numLeadingZeros) << 1));
990  pY6 = (cfloat*) (pY + ((0x60000000u >> numLeadingZeros) << 1));
991  pY7 = (cfloat*) (pY + ((0xE0000000u >> numLeadingZeros) << 1));
992 
993 #ifdef CL7X_HE_CFLOAT_PTR_BUG
994  float *myPY0 = (float *) pY0;
995  float *myPY1 = (float *) pY1;
996  float *myPY2 = (float *) pY2;
997  float *myPY3 = (float *) pY3;
998  float *myPY4 = (float *) pY4;
999  float *myPY5 = (float *) pY5;
1000  float *myPY6 = (float *) pY6;
1001  float *myPY7 = (float *) pY7;
1002 #endif
1003 
1004  for (l = 0; l < numChannels; l++) {
1005  for (k = 0; k < numPoints >> 2; k += 8) {
1006  offsetBitReverse = __bit_reverse (k) >> numLeadingZeros;
1007 
1008  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
1009  vX_0_1 = c7x::strm_eng<0, CV>::get_adv ();
1010  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
1011  vX_N_4_1 = c7x::strm_eng<0, CV>::get_adv ();
1012  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
1013  vX_N_2_1 = c7x::strm_eng<0, CV>::get_adv ();
1014  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
1015  vX_3N_4_1 = c7x::strm_eng<0, CV>::get_adv ();
1016 
1017  vSum1 = vX_0 + vX_N_2;
1018  vSum2 = vX_N_4 + vX_3N_4;
1019  vDiff1 = vX_0 - vX_N_2;
1020  vDiff2 = vX_N_4 - vX_3N_4;
1021 
1022  vX0 = vSum1 + vSum2;
1023  vX1 = vDiff1 - __vcrot90sp_vv (vDiff2);
1024  vX2 = vSum1 - vSum2;
1025  vX3 = vDiff1 + __vcrot90sp_vv (vDiff2);
1026 
1027  vSum1_1 = vX_0_1 + vX_N_2_1;
1028  vSum2_1 = vX_N_4_1 + vX_3N_4_1;
1029  vDiff1_1 = vX_0_1 - vX_N_2_1;
1030  vDiff2_1 = vX_N_4_1 - vX_3N_4_1;
1031 
1032  vX0Temp = vSum1_1 + vSum2_1;
1033  vX1Temp = vDiff1_1 - __vcrot90sp_vv (vDiff2_1);
1034  vX2Temp = vSum1_1 - vSum2_1;
1035  vX3Temp = vDiff1_1 + __vcrot90sp_vv (vDiff2_1);
1036 
1037  vX0_1 = vX0Temp;
1038  vX1_1 = __complex_multiply (vX1Temp, vTwX1);
1039  vX2_1 = __complex_multiply (vX2Temp, vTwX2);
1040  vX3_1 = __complex_multiply (vX3Temp, vTwX3);
1041 
1042  vX0_2PtDft_1 = vX0 + vX0_1;
1043  vX0_2PtDft_2 = vX0 - vX0_1;
1044  vX1_2PtDft_1 = vX1 + vX1_1;
1045  vX1_2PtDft_2 = vX1 - vX1_1;
1046  vX2_2PtDft_1 = vX2 + vX2_1;
1047  vX2_2PtDft_2 = vX2 - vX2_1;
1048  vX3_2PtDft_1 = vX3 + vX3_1;
1049  vX3_2PtDft_2 = vX3 - vX3_1;
1050 
1051  __vstore_reverse_bit ((CVP) (pY0 + offsetBitReverse),
1052  vX0_2PtDft_1);
1053  __vstore_reverse_bit ((CVP) (pY1 + offsetBitReverse),
1054  vX0_2PtDft_2);
1055  __vstore_reverse_bit ((CVP) (pY2 + offsetBitReverse),
1056  vX1_2PtDft_1);
1057  __vstore_reverse_bit ((CVP) (pY3 + offsetBitReverse),
1058  vX1_2PtDft_2);
1059  __vstore_reverse_bit ((CVP) (pY4 + offsetBitReverse),
1060  vX2_2PtDft_1);
1061  __vstore_reverse_bit ((CVP) (pY5 + offsetBitReverse),
1062  vX2_2PtDft_2);
1063  __vstore_reverse_bit ((CVP) (pY6 + offsetBitReverse),
1064  vX3_2PtDft_1);
1065  __vstore_reverse_bit ((CVP) (pY7 + offsetBitReverse),
1066  vX3_2PtDft_2);
1067  }
1068 
1069 #ifdef CL7X_HE_CFLOAT_PTR_BUG
1070  myPY0 += (numPoints << 1);
1071  myPY1 += (numPoints << 1);
1072  myPY2 += (numPoints << 1);
1073  myPY3 += (numPoints << 1);
1074  myPY4 += (numPoints << 1);
1075  myPY5 += (numPoints << 1);
1076  myPY6 += (numPoints << 1);
1077  myPY7 += (numPoints << 1);
1078 
1079  pY0 = (cfloat*) myPY0;
1080  pY1 = (cfloat*) myPY1;
1081  pY2 = (cfloat*) myPY2;
1082  pY3 = (cfloat*) myPY3;
1083  pY4 = (cfloat*) myPY4;
1084  pY5 = (cfloat*) myPY5;
1085  pY6 = (cfloat*) myPY6;
1086  pY7 = (cfloat*) myPY7;
1087 
1088 #else
1089  pY0 += numPoints;
1090  pY1 += numPoints;
1091  pY2 += numPoints;
1092  pY3 += numPoints;
1093  pY4 += numPoints;
1094  pY5 += numPoints;
1095  pY6 += numPoints;
1096  pY7 += numPoints;
1097 #endif
1098  }
1099  __SE0_CLOSE ();
1100  }
1101  }
1102  }
1103 
1104  return (status);
1105 }
1106 
1107 #if (!defined(FFTLIB_REMOVE_CHECK_PARAMS) && \
1108  !defined(FFTLIB_FFT1DBATCHED_I32FC_C32FC_O32FC_REMOVE_CHECK_PARAMS)) || \
1109  (defined(FFTLIB_CHECK_PARAMS)) || \
1110  (defined(FFTLIB_FFT1DBATCHED_I32FC_C32FC_O32FC_CHECK_PARAMS))
1111 
1113  FFTLIB_F32 *pX,
1114  FFTLIB_bufParams1D_t *bufParamsX,
1115  FFTLIB_F32 *pW,
1116  FFTLIB_bufParams1D_t *bufParamsW,
1117  FFTLIB_F32 *pY,
1118  FFTLIB_bufParams1D_t *bufParamsY,
1119  uint32_t numPoints,
1120  uint32_t numChannels,
1121  void *pBlock)
1122 {
1123  FFTLIB_STATUS status = FFTLIB_SUCCESS;
1124 
1125  if ((pX == NULL) || (pW == NULL) || (pY == NULL) || (pBlock == NULL)) {
1126  status = FFTLIB_ERR_NULL_POINTER;
1127  }
1128  else if (bufParamsX->dim_x != bufParamsY->dim_x) {
1130  }
1131  else if (bufParamsX->dim_x < numPoints * numChannels * 2) {
1132  /* In general, dim_x == numPoints*numChannels*2. However,
1133  * optimized kernel requires dim_x to be atleast 64*2. Hence, for
1134  * small values of numPoints*numChannels, dim_x could be greater
1135  * than numPoints*numChannels*2 */
1137  }
1138  else if (bufParamsX->dim_x < 64 * 2) {
1140  }
1141  else if (bufParamsW->dim_x != numPoints * 2) {
1143  }
1144  else if ((bufParamsX->data_type != FFTLIB_FLOAT32) ||
1145  (bufParamsW->data_type != FFTLIB_FLOAT32) ||
1146  (bufParamsY->data_type != FFTLIB_FLOAT32)) {
1147  status = FFTLIB_ERR_INVALID_TYPE;
1148  }
1149  else if (((uint64_t) pX) & 0xFu) { /* pX must be 16-byte aligned for a */
1150  status = FFTLIB_ERR_NOT_ALIGNED_PTRS_STRIDES; /* streaming engine
1151  configuration */
1152  }
1153  else {
1154  /* Check if number of pts is a power of 2 */
1155  uint32_t k = 0;
1156  while (k < 32) {
1157  if (numPoints & (1u << k)) {
1158  break;
1159  }
1160  k++;
1161  }
1162  if ((1u << k) != numPoints) {
1164  }
1165 
1166  if ((numChannels != 1) && (numChannels != 2) && (numChannels != 4) &&
1167  (numChannels != 8) && (numChannels != 16)) {
1169  }
1170  }
1171  return (status);
1172 }
1173 
1174 #endif
@ FFTLIB_FLOAT32
c7x::cfloat_vec CV
FFTLIB_STATUS_NAME
The enumeration of all status codes.
Definition: FFTLIB_types.h:172
@ FFTLIB_ERR_INVALID_TYPE
Definition: FFTLIB_types.h:176
@ FFTLIB_ERR_NULL_POINTER
Definition: FFTLIB_types.h:178
@ FFTLIB_ERR_INVALID_DIMENSION
Definition: FFTLIB_types.h:177
@ FFTLIB_SUCCESS
Definition: FFTLIB_types.h:173
@ FFTLIB_ERR_NOT_ALIGNED_PTRS_STRIDES
Definition: FFTLIB_types.h:181
float FFTLIB_F32
Single precision floating point.
Definition: FFTLIB_types.h:169
FFTLIB_STATUS FFTLIB_fft1dBatched_i32fc_c32fc_o32fc_init(FFTLIB_F32 *pX, FFTLIB_bufParams1D_t *bufParamsX, FFTLIB_F32 *pW, FFTLIB_bufParams1D_t *bufParamsW, FFTLIB_F32 *pY, FFTLIB_bufParams1D_t *bufParamsY, uint32_t numPoints, uint32_t numChannels, void *pBlock)
This function should be called before the FFTLIB_fft1dBatched_i32fc_c32fc_o32fc_kernel function is ca...
FFTLIB_STATUS FFTLIB_fft1dBatched_i32fc_c32fc_o32fc_kernel(FFTLIB_F32 *pX, FFTLIB_bufParams1D_t *bufParamsX, FFTLIB_F32 *pW, FFTLIB_bufParams1D_t *bufParamsW, FFTLIB_F32 *pY, FFTLIB_bufParams1D_t *bufParamsY, uint32_t numPoints, uint32_t numChannels, void *pBlock)
This function is the main kernel compute function.
FFTLIB_STATUS FFTLIB_fft1dBatched_i32fc_c32fc_o32fc_checkParams(FFTLIB_F32 *pX, FFTLIB_bufParams1D_t *bufParamsX, FFTLIB_F32 *pW, FFTLIB_bufParams1D_t *bufParamsW, FFTLIB_F32 *pY, FFTLIB_bufParams1D_t *bufParamsY, uint32_t numPoints, uint32_t numChannels, void *pBlock)
This function checks the validity of the parameters passed to FFTLIB_fft1dBatched_i32fc_c32fc_o32fc_i...
A structure for a 1 dimensional buffer descriptor.
uint32_t data_type
Values are of type FFTLIB_data_type_e.
uint32_t dim_x
Width of buffer in X dimension in elements.