FFTLIB User Guide
c71/FFTLIB_fft1dBatched_i32fc_c32fc_o32fc_ci.cpp
Go to the documentation of this file.
1 /*******************************************************************************
2 **+--------------------------------------------------------------------------+**
3 **| **** |**
4 **| **** |**
5 **| ******o*** |**
6 **| ********_///_**** |**
7 **| ***** /_//_/ **** |**
8 **| ** ** (__/ **** |**
9 **| ********* |**
10 **| **** |**
11 **| *** |**
12 **| |**
13 **| Copyright (c) 2017 Texas Instruments Incorporated |**
14 **| ALL RIGHTS RESERVED |**
15 **| |**
16 **| Permission to use, copy, modify, or distribute this software, |**
17 **| whether in part or in whole, for any purpose is forbidden without |**
18 **| a signed licensing agreement and NDA from Texas Instruments |**
19 **| Incorporated (TI). |**
20 **| |**
21 **| TI makes no representation or warranties with respect to the |**
22 **| performance of this computer program, and specifically disclaims |**
23 **| any responsibility for any damages, special or consequential, |**
24 **| connected with the use of this program. |**
25 **| |**
26 **+--------------------------------------------------------------------------+**
27 *******************************************************************************/
28 
29 #include "../FFTLIB_fft1dBatched_i32fc_c32fc_o32fc.h"
30 
31 #define TRACE_ON (0)
32 
33 #if TRACE_ON
34 #include "../../../common/printv.h"
35 #include <stdio.h>
36 #endif
37 
38 // CODE_SECTION(FFTLIB_fft1dBatched_i32fc_c32fc_o32fc, ".text:optimized")
39 // CODE_SECTION(FFTLIB_fft1dBatched_i32fc_c32fc_o32fc_core, ".text:optimized")
40 // CODE_SECTION(FFTLIB_fft1dBatched_i32fc_c32fc_o32fc_checkParams,
41 // ".text:optimized")
42 
/* Byte-offset layout of the stream-engine (SE) and streaming-address-
 * generator (SA) parameter templates cached in the caller-supplied
 * pBlock scratch area.  The init routine below writes one pre-computed
 * __SE_TEMPLATE_v1 / __SA_TEMPLATE_v1 at each offset, and the kernel
 * re-loads them, so the offsets are simply stacked back-to-back:
 * eight SE templates first, then the SA templates.
 *
 * SE_PARAM_SIZE / SA_PARAM_SIZE are assumed to be provided by the
 * included FFTLIB header -- TODO confirm.
 *
 * NOTE(review): there is intentionally no SA_LOOP5_PARAM_OFFSET --
 * the stage that consumes SE_LOOP5 appears to store its results via
 * bit-reversed vector stores (__vstore_reverse_bit) rather than an SA
 * stream; verify against the kernel body before relying on this.
 * SA_LOOP6 therefore follows SA_LOOP4 directly. */
43 #define SE_PARAM_BASE (0x0000)
44 #define SE_LOOP1_PARAM_OFFSET (SE_PARAM_BASE)
45 #define SE_LOOP2_PARAM_OFFSET (SE_LOOP1_PARAM_OFFSET + SE_PARAM_SIZE)
46 #define SE_LOOP3_PARAM_OFFSET (SE_LOOP2_PARAM_OFFSET + SE_PARAM_SIZE)
47 #define SE_LOOP4_PARAM_OFFSET (SE_LOOP3_PARAM_OFFSET + SE_PARAM_SIZE)
48 #define SE_LOOP5_PARAM_OFFSET (SE_LOOP4_PARAM_OFFSET + SE_PARAM_SIZE)
49 #define SE_LOOP6_PARAM_OFFSET (SE_LOOP5_PARAM_OFFSET + SE_PARAM_SIZE)
50 #define SE_LOOP7_PARAM_OFFSET (SE_LOOP6_PARAM_OFFSET + SE_PARAM_SIZE)
51 #define SE_TWID_PARAM_OFFSET (SE_LOOP7_PARAM_OFFSET + SE_PARAM_SIZE)
52 #define SA_LOOP1_PARAM_OFFSET (SE_TWID_PARAM_OFFSET + SE_PARAM_SIZE)
53 #define SA_LOOP2_PARAM_OFFSET (SA_LOOP1_PARAM_OFFSET + SA_PARAM_SIZE)
54 #define SA_LOOP3_PARAM_OFFSET (SA_LOOP2_PARAM_OFFSET + SA_PARAM_SIZE)
55 #define SA_LOOP4_PARAM_OFFSET (SA_LOOP3_PARAM_OFFSET + SA_PARAM_SIZE)
56 #define SA_LOOP6_PARAM_OFFSET (SA_LOOP4_PARAM_OFFSET + SA_PARAM_SIZE)
57 
60  FFTLIB_bufParams1D_t *bufParamsX,
61  FFTLIB_F32 *pW,
62  FFTLIB_bufParams1D_t *bufParamsW,
63  FFTLIB_F32 *pY,
64  FFTLIB_bufParams1D_t *bufParamsY,
65  uint32_t numPoints,
66  uint32_t numChannels,
67  void *pBlock)
68 {
70 
71 #if defined(FFTLIB_CHECK_PARAMS) || \
72  defined(FFTLIB_FFT1DBATCHED_I32FC_C32FC_O32FC_CHECK_PARAMS)
73  /* status = FFTLIB_fft1dBatched_i32fc_c32fc_o32fc_checkParams ( */
74  /* pX, bufParamsX, pW, bufParamsW, pY, bufParamsY, numPoints,
75  * numChannels, */
76  /* pBlock); */
77  if (status == FFTLIB_SUCCESS)
78 #endif
79  {
80  uint32_t numPointsPerDft;
81  uint32_t seCnt1, seCnt2, seCnt3, seCnt4;
82  uint32_t seCnt6, seCnt7, seCnt8, seCnt9, seCnt10;
83  uint32_t seCnt11;
84  __SE_TEMPLATE_v1 se0_param = __gen_SE_TEMPLATE_v1 ();
85  __SE_TEMPLATE_v1 se1_param = __gen_SE_TEMPLATE_v1 ();
86  __SA_TEMPLATE_v1 sa0_param = __gen_SA_TEMPLATE_v1 ();
87 
88  numPointsPerDft = numPoints;
89  seCnt1 = numPoints >> 2;
90  seCnt2 = numPoints >> 5;
91  seCnt3 = 1;
92  seCnt4 = numPoints >> 3;
93  seCnt6 = seCnt3 * numChannels;
94  seCnt7 =
95  (numPoints * numChannels >> 5) > 1 ? numPoints * numChannels >> 5 : 1;
96  seCnt8 = numPoints * numChannels;
97  seCnt9 = (numPoints * numChannels > 32) ? numPoints * numChannels : 32;
98  seCnt10 =
99  (numPoints * numChannels >> 6) > 1 ? numPoints * numChannels >> 6 : 1;
100  seCnt11 = (numPoints * numChannels > 64) ? numPoints * numChannels : 64;
101 
102  se0_param = __gen_SE_TEMPLATE_v1 ();
103  se0_param.ICNT0 = 8; /* 8-point vectors processed in one shot */
104  se0_param.ICNT1 = 4;
105  se0_param.DIM1 = seCnt1; /* 4 quarters(Offsets: 0, N/4, N/2, 3N/4) */
106  se0_param.ICNT2 = seCnt2; /* Number of 8-point fetches within each */
107  se0_param.DIM2 = 8; /* quarter */
108  se0_param.ICNT3 = seCnt6; /* Number of DFT's for all channels */
109  se0_param.DIM3 = numPointsPerDft;
110 
111  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
112  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
113  se0_param.DIMFMT = __SE_DIMFMT_4D;
114  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP1_PARAM_OFFSET)) =
115  se0_param;
116 
117  se1_param = __gen_SE_TEMPLATE_v1 ();
118  se1_param.ICNT0 = 8; /* 8-point vectors processed in one shot */
119  se1_param.ICNT1 = 3;
120  se1_param.DIM1 = seCnt1; /* Twiddle factors for x1, x2 and x3 */
121  se1_param.ICNT2 = seCnt2; /* Number of 8-point fetches within each */
122  se1_param.DIM2 = 8; /* quarter */
123  se1_param.ICNT3 = seCnt6; /* Number of DFT's for all channels */
124  se1_param.DIM3 = 0;
125 
126  se1_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
127  se1_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
128  se1_param.DIMFMT = __SE_DIMFMT_4D;
129  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_TWID_PARAM_OFFSET)) =
130  se1_param;
131 
132  sa0_param = __gen_SA_TEMPLATE_v1 ();
133  sa0_param.ICNT0 = 8;
134  sa0_param.ICNT1 = 4;
135  sa0_param.DIM1 = seCnt1; /* Save to each of the 4 quarters */
136  sa0_param.ICNT2 = seCnt2; /* Number of 8-point stores within each */
137  sa0_param.DIM2 = 8; /* quarter */
138  sa0_param.ICNT3 = seCnt6;
139  sa0_param.DIM3 = numPointsPerDft; /* Number of DFT's for all channels */
140 
141  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
142  sa0_param.DIMFMT = __SA_DIMFMT_4D;
143  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP1_PARAM_OFFSET)) =
144  sa0_param;
145 
146  se0_param = __gen_SE_TEMPLATE_v1 ();
147  se0_param.ICNT0 = 8; /* Fetch first two quarters */
148  se0_param.ICNT1 = 2;
149  se0_param.DIM1 = 16; /* Process two 16-point DFTs in one shot */
150  se0_param.ICNT2 = seCnt7;
151  se0_param.DIM2 = 32;
152 
153  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
154  se0_param.TRANSPOSE =
155  __SE_TRANSPOSE_256BIT; /* Using 256BIT transpose required */
156  /* 16-byte alignment on pX */
157  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
158  se0_param.DIMFMT = __SE_DIMFMT_3D;
159  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP2_PARAM_OFFSET)) =
160  se0_param;
161 
162  sa0_param = __gen_SA_TEMPLATE_v1 ();
163  sa0_param.ICNT0 = seCnt8; /* Input buffer must be at least 32
164  * elements long even though
165  * numPoints*numChannels = 16 */
166 
167  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
168  sa0_param.DIMFMT = __SA_DIMFMT_1D;
169  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP2_PARAM_OFFSET)) =
170  sa0_param;
171 
172  se0_param = __gen_SE_TEMPLATE_v1 ();
173  se0_param.ICNT0 = seCnt8;
174 
175  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
176  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
177  se0_param.DIMFMT = __SE_DIMFMT_1D;
178  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP3_PARAM_OFFSET)) =
179  se0_param;
180 
181  sa0_param = __gen_SA_TEMPLATE_v1 ();
182  sa0_param.ICNT0 = seCnt8;
183 
184  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
185  sa0_param.DIMFMT = __SA_DIMFMT_1D;
186  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP3_PARAM_OFFSET)) =
187  sa0_param;
188 
189  /* The following SE configuration may cause sub-optimal
190  * tile in SE because second row of tile starts in the
191  * middle of first row */
192  se0_param = __gen_SE_TEMPLATE_v1 ();
193  se0_param.ICNT0 = 4;
194  se0_param.ICNT1 = 8;
195  se0_param.DIM1 = 4;
196  se0_param.ICNT2 = seCnt7;
197  se0_param.DIM2 = 32;
198 
199  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
200  se0_param.TRANSPOSE = __SE_TRANSPOSE_64BIT;
201  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
202  se0_param.DIMFMT = __SE_DIMFMT_3D;
203  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP4_PARAM_OFFSET)) =
204  se0_param;
205 
206  sa0_param = __gen_SA_TEMPLATE_v1 ();
207  sa0_param.ICNT0 = seCnt9; /* Input buffer must be at least 32
208  * elements long even though
209  * numPoints*numChannels = 16 */
210 
211  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
212  sa0_param.DIMFMT = __SA_DIMFMT_1D;
213  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP4_PARAM_OFFSET)) =
214  sa0_param;
215 
216  se0_param = __gen_SE_TEMPLATE_v1 ();
217  se0_param.ICNT0 = seCnt4; /* Fetch consecutive four points for DFT */
218  se0_param.ICNT1 = 8;
219  se0_param.DIM1 =
220  /* Fetch 8 points separated by */ seCnt4; /* (numPoints >>
221  3). This fetch
222  pattern */
223  /* can be used for bit reversal */
224  se0_param.ICNT2 = numChannels;
225  se0_param.DIM2 = numPoints;
226 
227  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
228  se0_param.TRANSPOSE = __SE_TRANSPOSE_64BIT;
229  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
230  se0_param.DIMFMT = __SE_DIMFMT_3D;
231  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP5_PARAM_OFFSET)) =
232  se0_param;
233 
234  se0_param = __gen_SE_TEMPLATE_v1 ();
235  se0_param.ICNT0 = 8;
236  se0_param.ICNT1 = 8;
237  se0_param.DIM1 = 8;
238  se0_param.ICNT2 = seCnt10;
239  se0_param.DIM2 = 64;
240 
241  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
242  se0_param.TRANSPOSE = __SE_TRANSPOSE_64BIT;
243  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
244  se0_param.DIMFMT = __SE_DIMFMT_3D;
245  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP6_PARAM_OFFSET)) =
246  se0_param;
247 
248  sa0_param = __gen_SA_TEMPLATE_v1 ();
249  sa0_param.ICNT0 = seCnt11; /* Input buffer must be at least 64
250  * elements long even though
251  * numPoints*numChannels = 32 */
252 
253  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
254  sa0_param.DIMFMT = __SA_DIMFMT_1D;
255  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP6_PARAM_OFFSET)) =
256  sa0_param;
257 
258  se0_param = __gen_SE_TEMPLATE_v1 ();
259  se0_param.ICNT0 = seCnt4;
260  se0_param.ICNT1 = 8;
261  se0_param.DIM1 =
262  /* Fetch 8 points separated by */ seCnt4; /* (numPoints >>
263  3). This fetch
264  pattern */
265  /* can be used for bit reversal */
266  se0_param.ICNT2 = numChannels;
267  se0_param.DIM2 = numPoints;
268 
269  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
270  se0_param.TRANSPOSE = __SE_TRANSPOSE_64BIT;
271  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
272  se0_param.DIMFMT = __SE_DIMFMT_3D;
273  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP7_PARAM_OFFSET)) =
274  se0_param;
275  }
276  return (status);
277 }
278 
281  FFTLIB_bufParams1D_t *bufParamsX,
282  FFTLIB_F32 *pW,
283  FFTLIB_bufParams1D_t *bufParamsW,
284  FFTLIB_F32 *pY,
285  FFTLIB_bufParams1D_t *bufParamsY,
286  uint32_t numPoints,
287  uint32_t numChannels,
288  void *pBlock)
289 {
290  uint32_t k, l;
291  FFTLIB_STATUS status = FFTLIB_SUCCESS;
292  uint32_t numPointsPerDft;
293  uint32_t numLeadingZeros;
294  uint32_t offsetBitReverse;
295  uint32_t seCnt1, seCnt2, seCnt3, seCnt6;
296 
297  __SE_TEMPLATE_v1 se0_param;
298  __SE_TEMPLATE_v1 se1_param;
299  __SA_TEMPLATE_v1 sa0_param;
300 
301  cfloat* restrict pXLocal;
302  cfloat* restrict pYLocal;
303  cfloat* restrict pWLocal;
304  cfloat* restrict pY0;
305  cfloat* restrict pY1;
306  cfloat* restrict pY2;
307  cfloat* restrict pY3;
308  cfloat* restrict pY4;
309  cfloat* restrict pY5;
310  cfloat* restrict pY6;
311  cfloat* restrict pY7;
312 
313  typedef typename c7x::cfloat_vec CV;
314  typedef CV* CVP;
315 
316  /* typedef typename c7x::float_vec V; */
317  /* typedef V* VP; */
318 
319  CV vX_0, vX_N_4, vX_N_2, vX_3N_4;
320  CV vSum1, vSum2, vDiff1, vDiff2;
321  CV vTwX1, vTwX2, vTwX3;
322  CV vX0Temp, vX1Temp, vX2Temp, vX3Temp;
323  CV vX0, vX1, vX2, vX3;
324  CV vX_0_1, vX_N_4_1, vX_N_2_1, vX_3N_4_1;
325  CV vSum1_1, vSum2_1, vDiff1_1, vDiff2_1;
326  CV vX0_1, vX1_1, vX2_1, vX3_1;
327  CV vX0_2PtDft_1, vX0_2PtDft_2;
328  CV vX1_2PtDft_1, vX1_2PtDft_2;
329  CV vX2_2PtDft_1, vX2_2PtDft_2;
330  CV vX3_2PtDft_1, vX3_2PtDft_2;
331  CV vX01_lo, vX23_lo, vX01_hi, vX23_hi;
332  cfloat twTemp;
333 
334 #ifdef FFTLIB_CHECK_PARAMS
335  /* status = FFTLIB_fft1dBatched_i32fc_c32fc_o32fc_checkParams ( */
336  /* pX, bufParamsX, pW, bufParamsW, pY, bufParamsY, numPoints,
337  * numChannels, */
338  /* pBlock); */
339  if (status == FFTLIB_SUCCESS)
340 #endif
341  {
342  numPointsPerDft = numPoints;
343 
344  se0_param =
345  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP1_PARAM_OFFSET));
346  se1_param =
347  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_TWID_PARAM_OFFSET));
348  sa0_param =
349  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP1_PARAM_OFFSET));
350  seCnt1 = numPointsPerDft >> 2;
351  seCnt2 = numPointsPerDft >> 5;
352  seCnt3 = 1;
353 
354  pXLocal = (cfloat*) pX;
355  pWLocal = (cfloat*) pW;
356  pYLocal = (cfloat*) pY;
357 
358  while (numPointsPerDft >= 64) {
359 
360  seCnt6 = seCnt3 * numChannels;
361  se0_param.ICNT1 = 4;
362  se0_param.DIM1 = seCnt1; /* 4 quarters(Offsets: 0, N/4, N/2, 3N/4) */
363  se0_param.ICNT2 = seCnt2;
364  se0_param.DIM2 = 8; /* Number of 8-point fetches within each quarter */
365  se0_param.ICNT3 = seCnt6;
366  se0_param.DIM3 =
367  numPointsPerDft; /* Number of DFT's for all channels */
368  __SE0_OPEN ((void *) pXLocal, se0_param);
369 
370  se1_param.ICNT1 = 3;
371  se1_param.DIM1 = seCnt1; /* Twiddle factors for x1, x2 and x3 */
372  se1_param.ICNT2 =
373  seCnt2; /* Number of 8-point fetches within each quarter*/
374  se1_param.DIM2 = 8;
375  se1_param.ICNT3 = seCnt6; /* Number of DFT's for all channels */
376  se1_param.DIM3 = 0;
377  __SE1_OPEN ((void *) pWLocal, se1_param);
378 
379  sa0_param.ICNT1 = 4;
380  sa0_param.DIM1 = seCnt1; /* Save to each of the 4 quarters */
381  sa0_param.ICNT2 = seCnt2;
382  sa0_param.DIM2 = 8;
383  /* Number of 8-point stores within each quarter */
384  sa0_param.ICNT3 = seCnt6;
385  sa0_param.DIM3 = numPointsPerDft; /* Number of DFT's */
386  __SA0_OPEN (sa0_param);
387 
388  /* Loop is unrolled twice for better optimization */
389  for (k = 0; k < numPoints * numChannels; k += 64) {
390 
391  /* First iteration of loop unroll */
392  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
393  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
394  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
395  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
396 
397  vSum1 = vX_0 + vX_N_2;
398  vSum2 = vX_N_4 + vX_3N_4;
399  vDiff1 = vX_0 - vX_N_2;
400  vDiff2 = vX_N_4 - vX_3N_4;
401 
402  vTwX1 = c7x::strm_eng<1, CV>::get_adv ();
403  vTwX2 = c7x::strm_eng<1, CV>::get_adv ();
404  vTwX3 = c7x::strm_eng<1, CV>::get_adv ();
405 
406  vX0Temp = vSum1 + vSum2;
407  vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
408  vX2Temp = vSum1 - vSum2;
409  vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);
410 
411  vX0 = vX0Temp;
412  vX1 = __complex_multiply (vX1Temp, vTwX1);
413  vX2 = __complex_multiply (vX2Temp, vTwX2);
414  vX3 = __complex_multiply (vX3Temp, vTwX3);
415 
416  __vpred tmp;
417  CVP addr;
418 
419  tmp = c7x::strm_agen<0, CV>::get_vpred ();
420  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
421  __vstore_pred (tmp, addr, vX0);
422 
423  tmp = c7x::strm_agen<0, CV>::get_vpred ();
424  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
425  __vstore_pred (tmp, addr, vX2);
426 
427  tmp = c7x::strm_agen<0, CV>::get_vpred ();
428  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
429  __vstore_pred (tmp, addr, vX1);
430 
431  tmp = c7x::strm_agen<0, CV>::get_vpred ();
432  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
433  __vstore_pred (tmp, addr, vX3);
434 
435  /* Second iteration of loop unroll */
436  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
437  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
438  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
439  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
440 
441  vSum1 = vX_0 + vX_N_2;
442  vSum2 = vX_N_4 + vX_3N_4;
443  vDiff1 = vX_0 - vX_N_2;
444  vDiff2 = vX_N_4 - vX_3N_4;
445 
446  vTwX1 = c7x::strm_eng<1, CV>::get_adv ();
447  vTwX2 = c7x::strm_eng<1, CV>::get_adv ();
448  vTwX3 = c7x::strm_eng<1, CV>::get_adv ();
449 
450  vX0Temp = vSum1 + vSum2;
451  vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
452  vX2Temp = vSum1 - vSum2;
453  vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);
454 
455  vX0 = vX0Temp;
456  vX1 = __complex_multiply (vX1Temp, vTwX1);
457  vX2 = __complex_multiply (vX2Temp, vTwX2);
458  vX3 = __complex_multiply (vX3Temp, vTwX3);
459 
460  tmp = c7x::strm_agen<0, CV>::get_vpred ();
461  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
462  __vstore_pred (tmp, addr, vX0);
463 
464  tmp = c7x::strm_agen<0, CV>::get_vpred ();
465  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
466  __vstore_pred (tmp, addr, vX2);
467 
468  tmp = c7x::strm_agen<0, CV>::get_vpred ();
469  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
470  __vstore_pred (tmp, addr, vX1);
471 
472  tmp = c7x::strm_agen<0, CV>::get_vpred ();
473  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
474  __vstore_pred (tmp, addr, vX3);
475  }
476  __SA0_CLOSE ();
477  __SE0_CLOSE ();
478  __SE1_CLOSE ();
479 
480  numPointsPerDft >>= 2;
481  pWLocal += numPointsPerDft * 3;
482  seCnt1 >>= 2;
483  seCnt2 >>= 2;
484  seCnt3 <<= 2;
485  }
486 
487  if (numPointsPerDft == 16) {
488  /* 16-point stage */
489  se0_param = *(
490  (__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP2_PARAM_OFFSET));
491  __SE0_OPEN ((void *) pXLocal, se0_param);
492  __SE1_OPEN ((void *) (pXLocal + 8), se0_param);
493 
494  sa0_param = *(
495  (__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP2_PARAM_OFFSET));
496  __SA0_OPEN (sa0_param);
497 
498  vTwX1 = *((CVP) pWLocal);
499  vTwX2 = *((CVP) (pWLocal + 4));
500  vTwX3 = *((CVP) (pWLocal + 8));
501 
502 #if __C7X_HOSTEM__
503  vTwX1 = CV (vTwX1.lo(), vTwX1.lo());
504  vTwX2 = CV (vTwX2.lo(), vTwX2.lo());
505  vTwX3 = CV (vTwX3.lo(), vTwX3.lo());
506 #else
507  vTwX1 = (CV) (vTwX1.lo(), vTwX1.lo());
508  vTwX2 = (CV) (vTwX2.lo(), vTwX2.lo());
509  vTwX3 = (CV) (vTwX3.lo(), vTwX3.lo());
510 #endif
511 
512  for (k = 0; k < numPoints * numChannels; k += 32) {
513  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
514  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
515  vX_N_2 = c7x::strm_eng<1, CV>::get_adv ();
516  vX_3N_4 = c7x::strm_eng<1, CV>::get_adv ();
517 
518  vSum1 = vX_0 + vX_N_2;
519  vSum2 = vX_N_4 + vX_3N_4;
520  vDiff1 = vX_0 - vX_N_2;
521  vDiff2 = vX_N_4 - vX_3N_4;
522 
523  vX0Temp = vSum1 + vSum2;
524  vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
525  vX2Temp = vSum1 - vSum2;
526  vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);
527 
528  vX0 = vX0Temp;
529  vX1 = __complex_multiply (vX1Temp, vTwX1);
530  vX2 = __complex_multiply (vX2Temp, vTwX2);
531  vX3 = __complex_multiply (vX3Temp, vTwX3);
532 
533 #if __C7X_HOSTEM__
534  __vpred tmp = c7x::strm_agen<0, CV>::get_vpred ();
535  CVP addr;
536  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
537  __vstore_pred (tmp, addr, CV (vX0.lo(), vX2.lo()));
538 
539  tmp = c7x::strm_agen<0, CV>::get_vpred ();
540  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
541  __vstore_pred (tmp, addr, CV (vX1.lo(), vX3.lo()));
542 
543  tmp = c7x::strm_agen<0, CV>::get_vpred ();
544  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
545  __vstore_pred (tmp, addr, CV (vX0.hi(), vX2.hi()));
546 
547  tmp = c7x::strm_agen<0, CV>::get_vpred ();
548  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
549  __vstore_pred (tmp, addr, CV (vX1.hi(), vX3.hi()));
550 #else
551  __vpred tmp = c7x::strm_agen<0, CV>::get_vpred ();
552  CVP addr;
553  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
554  __vstore_pred (tmp, addr, (CV) (vX0.lo(), vX2.lo()));
555 
556  tmp = c7x::strm_agen<0, CV>::get_vpred ();
557  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
558  __vstore_pred (tmp, addr, (CV) (vX1.lo(), vX3.lo()));
559 
560  tmp = c7x::strm_agen<0, CV>::get_vpred ();
561  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
562  __vstore_pred (tmp, addr, (CV) (vX0.hi(), vX2.hi()));
563 
564  tmp = c7x::strm_agen<0, CV>::get_vpred ();
565  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
566  __vstore_pred (tmp, addr, (CV) (vX1.hi(), vX3.hi()));
567 #endif
568  }
569  __SA0_CLOSE ();
570  __SE0_CLOSE ();
571  __SE1_CLOSE ();
572  }
573  else {
574  /* 32-point stage */
575  se0_param = *(
576  (__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP3_PARAM_OFFSET));
577  __SE0_OPEN ((void *) pXLocal, se0_param);
578 
579  sa0_param = *(
580  (__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP3_PARAM_OFFSET));
581  __SA0_OPEN (sa0_param);
582 
583  vTwX1 = *((CVP) pWLocal);
584  vTwX2 = *((CVP) (pWLocal + 8));
585  vTwX3 = *((CVP) (pWLocal + 16));
586 
587  for (k = 0; k < numPoints * numChannels; k += 32) {
588  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
589  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
590  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
591  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
592 
593  vSum1 = vX_0 + vX_N_2;
594  vSum2 = vX_N_4 + vX_3N_4;
595  vDiff1 = vX_0 - vX_N_2;
596  vDiff2 = vX_N_4 - vX_3N_4;
597 
598  vX0Temp = vSum1 + vSum2;
599  vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
600  vX2Temp = vSum1 - vSum2;
601  vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);
602 
603  vX0 = vX0Temp;
604  vX1 = __complex_multiply (vX1Temp, vTwX1);
605  vX2 = __complex_multiply (vX2Temp, vTwX2);
606  vX3 = __complex_multiply (vX3Temp, vTwX3);
607 
608  __vpred tmp = c7x::strm_agen<0, CV>::get_vpred ();
609  CVP addr;
610  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
611  __vstore_pred (tmp, addr, vX0);
612 
613  tmp = c7x::strm_agen<0, CV>::get_vpred ();
614  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
615  __vstore_pred (tmp, addr, vX2);
616 
617  tmp = c7x::strm_agen<0, CV>::get_vpred ();
618  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
619  __vstore_pred (tmp, addr, vX1);
620 
621  tmp = c7x::strm_agen<0, CV>::get_vpred ();
622  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
623  __vstore_pred (tmp, addr, vX3);
624  }
625  __SE0_CLOSE ();
626  __SA0_CLOSE ();
627  }
628 
629  numPointsPerDft >>= 2;
630  pWLocal += numPointsPerDft * 3;
631 
632  if (numPointsPerDft == 4) {
633  /* 4-point stage with bit-reversal */
634 
635  if (numPoints == 16) {
636 // clang-format off
637 #if __C7X_HOSTEM__
638  c7x::uchar_vec vXPermCtrl = c7x::uchar_vec(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
639  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
640  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
641  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
642  0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
643  0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
644  0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
645  0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F);
646 #else
647  c7x::uchar_vec vXPermCtrl = (c7x::uchar_vec)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
648  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
649  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
650  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
651  0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
652  0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
653  0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
654  0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F);
655 #endif
656  // clang-format on
657 
658  se0_param = *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock +
660  __SE0_OPEN ((void *) pXLocal, se0_param);
661 
662  sa0_param = *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock +
664  __SA0_OPEN (sa0_param);
665 
666  for (k = 0; k < numChannels << 4; k += 32) {
667  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
668  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
669  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
670  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
671 
672  vSum1 = vX_0 + vX_N_2;
673  vSum2 = vX_N_4 + vX_3N_4;
674  vDiff1 = vX_0 - vX_N_2;
675  vDiff2 = vX_N_4 - vX_3N_4;
676 
677  vX0 = vSum1 + vSum2;
678  vX1 = vDiff1 - __vcrot90sp_vv (vDiff2);
679  vX2 = vSum1 - vSum2;
680  vX3 = vDiff1 + __vcrot90sp_vv (vDiff2);
681 
682  vX01_lo = c7x::as_cfloat_vec (
683  __vpermll_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX1),
684  c7x::as_uchar_vec (vX0)));
685  vX23_lo = c7x::as_cfloat_vec (
686  __vpermll_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX3),
687  c7x::as_uchar_vec (vX2)));
688  vX01_hi = c7x::as_cfloat_vec (
689  __vpermhh_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX1),
690  c7x::as_uchar_vec (vX0)));
691  vX23_hi = c7x::as_cfloat_vec (
692  __vpermhh_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX3),
693  c7x::as_uchar_vec (vX2)));
694 
695  __vpred tmp = c7x::strm_agen<0, CV>::get_vpred ();
696  CVP addr;
697  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
698  __vstore_pred (tmp, addr, vX01_lo);
699 
700  tmp = c7x::strm_agen<0, CV>::get_vpred ();
701  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
702  __vstore_pred (tmp, addr, vX23_lo);
703 
704  tmp = c7x::strm_agen<0, CV>::get_vpred ();
705  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
706  __vstore_pred (tmp, addr, vX01_hi);
707 
708  tmp = c7x::strm_agen<0, CV>::get_vpred ();
709  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
710  __vstore_pred (tmp, addr, vX23_hi);
711  }
712  __SE0_CLOSE ();
713  __SA0_CLOSE ();
714  }
715  else {
716  se0_param = *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock +
718  __SE0_OPEN ((void *) pXLocal, se0_param);
719 
720  numLeadingZeros = __norm ((int32_t) (numPoints - 1)) + 1;
721 
722  /* pY0 = &pYLocal[0x00000000u]; */
723  /* pY1 = &pYLocal[0x40000000u >> numLeadingZeros]; */
724  /* pY2 = &pYLocal[0x80000000u >> numLeadingZeros]; */
725  /* pY3 = &pYLocal[0xC0000000u >> numLeadingZeros]; */
726 
727  pY0 = (cfloat*) (pY + 0);
728  pY1 = (cfloat*) (pY + ((0x40000000u >> numLeadingZeros) << 1));
729  pY2 = (cfloat*) (pY + ((0x80000000u >> numLeadingZeros) << 1));
730  pY3 = (cfloat*) (pY + ((0xC0000000u >> numLeadingZeros) << 1));
731 
732 #ifdef CL7X_HE_CFLOAT_PTR_BUG
733  float *myPY0 = (float *) pY0;
734  float *myPY1 = (float *) pY1;
735  float *myPY2 = (float *) pY2;
736  float *myPY3 = (float *) pY3;
737 #endif
738 
739  for (l = 0; l < numChannels; l++) {
740  for (k = 0; k < numPoints >> 3; k += 4) {
741  offsetBitReverse = __bit_reverse (k) >> numLeadingZeros;
742 
743  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
744  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
745  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
746  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
747 
748  vSum1 = vX_0 + vX_N_2;
749  vSum2 = vX_N_4 + vX_3N_4;
750  vDiff1 = vX_0 - vX_N_2;
751  vDiff2 = vX_N_4 - vX_3N_4;
752 
753  vX0 = vSum1 + vSum2;
754  vX1 = vDiff1 - __vcrot90sp_vv (vDiff2);
755  vX2 = vSum1 - vSum2;
756  vX3 = vDiff1 + __vcrot90sp_vv (vDiff2);
757 
758  /* __vstore_reverse_bit ((CVP) &pY0[offsetBitReverse],
759  */
760  /* vX0); */
761  /* __vstore_reverse_bit ((CVP) &pY1[offsetBitReverse],
762  */
763  /* vX1); */
764  /* __vstore_reverse_bit ((CVP) &pY2[offsetBitReverse],
765  */
766  /* vX2); */
767  /* __vstore_reverse_bit ((CVP) &pY3[offsetBitReverse],
768  */
769  /* vX3); */
770 
771  __vstore_reverse_bit ((CVP) (pY0 + offsetBitReverse), vX0);
772  __vstore_reverse_bit ((CVP) (pY1 + offsetBitReverse), vX1);
773  __vstore_reverse_bit ((CVP) (pY2 + offsetBitReverse), vX2);
774  __vstore_reverse_bit ((CVP) (pY3 + offsetBitReverse), vX3);
775  }
776 
777 #ifdef CL7X_HE_CFLOAT_PTR_BUG
778  myPY0 += (numPoints << 1);
779  myPY1 += (numPoints << 1);
780  myPY2 += (numPoints << 1);
781  myPY3 += (numPoints << 1);
782 
783  pY0 = (cfloat*) myPY0;
784  pY1 = (cfloat*) myPY1;
785  pY2 = (cfloat*) myPY2;
786  pY3 = (cfloat*) myPY3;
787 
788 #else
789  pY0 += numPoints;
790  pY1 += numPoints;
791  pY2 += numPoints;
792  pY3 += numPoints;
793 #endif
794  }
795  __SE0_CLOSE ();
796  }
797  }
798  else {
799  /* 4-point stage followed by 2-point stage with bit-reversal */
800 
801 #if __C7X_HOSTEM__
802  pWLocal += 1;
803  twTemp = *pWLocal;
804  vTwX1 = CV (twTemp, twTemp, twTemp, twTemp, twTemp, twTemp, twTemp,
805  twTemp);
806  pWLocal += 2;
807  twTemp = *pWLocal;
808  vTwX2 = CV (twTemp, twTemp, twTemp, twTemp, twTemp, twTemp, twTemp,
809  twTemp);
810  pWLocal += 2;
811  twTemp = *pWLocal;
812  vTwX3 = CV (twTemp, twTemp, twTemp, twTemp, twTemp, twTemp, twTemp,
813  twTemp);
814 #else
815  pWLocal += 1;
816  twTemp = *pWLocal;
817  vTwX1 = (CV) (twTemp, twTemp, twTemp, twTemp, twTemp, twTemp, twTemp,
818  twTemp);
819  pWLocal += 2;
820  twTemp = *pWLocal;
821  vTwX2 = (CV) (twTemp, twTemp, twTemp, twTemp, twTemp, twTemp, twTemp,
822  twTemp);
823  pWLocal += 2;
824  twTemp = *pWLocal;
825  vTwX3 = (CV) (twTemp, twTemp, twTemp, twTemp, twTemp, twTemp, twTemp,
826  twTemp);
827 #endif
828 
829  if (numPoints == 32) {
830 // clang-format off
831  #if __C7X_HOSTEM__
832  c7x::uchar_vec vXPermCtrl = c7x::uchar_vec(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
833  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
834  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
835  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
836  0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
837  0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
838  0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
839  0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F);
840  #else
841  c7x::uchar_vec vXPermCtrl = (c7x::uchar_vec)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
842  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
843  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
844  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
845  0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
846  0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
847  0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
848  0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F);
849  #endif
850  // clang-format on
851  CV vX01_2PtDft_1_lo, vX23_2PtDft_1_lo, vX01_2PtDft_2_lo,
852  vX23_2PtDft_2_lo;
853  CV vX01_2PtDft_1_hi, vX23_2PtDft_1_hi, vX01_2PtDft_2_hi,
854  vX23_2PtDft_2_hi;
855 
856  se0_param = *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock +
858  __SE0_OPEN ((void *) pXLocal, se0_param);
859 
860  sa0_param = *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock +
862  __SA0_OPEN (sa0_param);
863 
864  for (k = 0; k < numChannels << 5; k += 64) {
865  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
866  vX_0_1 = c7x::strm_eng<0, CV>::get_adv ();
867  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
868  vX_N_4_1 = c7x::strm_eng<0, CV>::get_adv ();
869  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
870  vX_N_2_1 = c7x::strm_eng<0, CV>::get_adv ();
871  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
872  vX_3N_4_1 = c7x::strm_eng<0, CV>::get_adv ();
873 
874  vSum1 = vX_0 + vX_N_2;
875  vSum2 = vX_N_4 + vX_3N_4;
876  vDiff1 = vX_0 - vX_N_2;
877  vDiff2 = vX_N_4 - vX_3N_4;
878 
879  vX0 = vSum1 + vSum2;
880  vX1 = vDiff1 - __vcrot90sp_vv (vDiff2);
881  vX2 = vSum1 - vSum2;
882  vX3 = vDiff1 + __vcrot90sp_vv (vDiff2);
883 
884  vSum1_1 = vX_0_1 + vX_N_2_1;
885  vSum2_1 = vX_N_4_1 + vX_3N_4_1;
886  vDiff1_1 = vX_0_1 - vX_N_2_1;
887  vDiff2_1 = vX_N_4_1 - vX_3N_4_1;
888 
889  vX0Temp = vSum1_1 + vSum2_1;
890  vX1Temp = vDiff1_1 - __vcrot90sp_vv (vDiff2_1);
891  vX2Temp = vSum1_1 - vSum2_1;
892  vX3Temp = vDiff1_1 + __vcrot90sp_vv (vDiff2_1);
893 
894  vX0_1 = vX0Temp;
895  vX1_1 = __complex_multiply (vX1Temp, vTwX1);
896  vX2_1 = __complex_multiply (vX2Temp, vTwX2);
897  vX3_1 = __complex_multiply (vX3Temp, vTwX3);
898 
899  vX0_2PtDft_1 = vX0 + vX0_1;
900  vX0_2PtDft_2 = vX0 - vX0_1;
901  vX1_2PtDft_1 = vX1 + vX1_1;
902  vX1_2PtDft_2 = vX1 - vX1_1;
903  vX2_2PtDft_1 = vX2 + vX2_1;
904  vX2_2PtDft_2 = vX2 - vX2_1;
905  vX3_2PtDft_1 = vX3 + vX3_1;
906  vX3_2PtDft_2 = vX3 - vX3_1;
907 
908  /* Permute to obtain bit-reversal order */
909  vX01_2PtDft_1_lo = c7x::as_cfloat_vec (
910  __vpermll_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX1_2PtDft_1),
911  c7x::as_uchar_vec (vX0_2PtDft_1)));
912  vX23_2PtDft_1_lo = c7x::as_cfloat_vec (
913  __vpermll_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX3_2PtDft_1),
914  c7x::as_uchar_vec (vX2_2PtDft_1)));
915  vX01_2PtDft_2_lo = c7x::as_cfloat_vec (
916  __vpermll_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX1_2PtDft_2),
917  c7x::as_uchar_vec (vX0_2PtDft_2)));
918  vX23_2PtDft_2_lo = c7x::as_cfloat_vec (
919  __vpermll_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX3_2PtDft_2),
920  c7x::as_uchar_vec (vX2_2PtDft_2)));
921  vX01_2PtDft_1_hi = c7x::as_cfloat_vec (
922  __vpermhh_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX1_2PtDft_1),
923  c7x::as_uchar_vec (vX0_2PtDft_1)));
924  vX23_2PtDft_1_hi = c7x::as_cfloat_vec (
925  __vpermhh_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX3_2PtDft_1),
926  c7x::as_uchar_vec (vX2_2PtDft_1)));
927  vX01_2PtDft_2_hi = c7x::as_cfloat_vec (
928  __vpermhh_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX1_2PtDft_2),
929  c7x::as_uchar_vec (vX0_2PtDft_2)));
930  vX23_2PtDft_2_hi = c7x::as_cfloat_vec (
931  __vpermhh_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX3_2PtDft_2),
932  c7x::as_uchar_vec (vX2_2PtDft_2)));
933 
934  __vpred tmp = c7x::strm_agen<0, CV>::get_vpred ();
935  CVP addr;
936  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
937  __vstore_pred (tmp, addr, vX01_2PtDft_1_lo);
938 
939  tmp = c7x::strm_agen<0, CV>::get_vpred ();
940  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
941  __vstore_pred (tmp, addr, vX23_2PtDft_1_lo);
942 
943  tmp = c7x::strm_agen<0, CV>::get_vpred ();
944  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
945  __vstore_pred (tmp, addr, vX01_2PtDft_2_lo);
946 
947  tmp = c7x::strm_agen<0, CV>::get_vpred ();
948  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
949  __vstore_pred (tmp, addr, vX23_2PtDft_2_lo);
950 
951  tmp = c7x::strm_agen<0, CV>::get_vpred ();
952  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
953  __vstore_pred (tmp, addr, vX01_2PtDft_1_hi);
954 
955  tmp = c7x::strm_agen<0, CV>::get_vpred ();
956  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
957  __vstore_pred (tmp, addr, vX23_2PtDft_1_hi);
958 
959  tmp = c7x::strm_agen<0, CV>::get_vpred ();
960  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
961  __vstore_pred (tmp, addr, vX01_2PtDft_2_hi);
962 
963  tmp = c7x::strm_agen<0, CV>::get_vpred ();
964  addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
965  __vstore_pred (tmp, addr, vX23_2PtDft_2_hi);
966  }
967  __SE0_CLOSE ();
968  __SA0_CLOSE ();
969  }
970  else {
971  se0_param = *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock +
973  __SE0_OPEN ((void *) pXLocal, se0_param);
974 
975  numLeadingZeros = __norm ((int32_t) (numPoints - 1)) + 1;
976 
977  /* pY0 = &pYLocal[0x00000000u]; */
978  /* pY1 = &pYLocal[0x80000000u >> numLeadingZeros]; */
979  /* pY2 = &pYLocal[0x20000000u >> numLeadingZeros]; */
980  /* pY3 = &pYLocal[0xA0000000u >> numLeadingZeros]; */
981  /* pY4 = &pYLocal[0x40000000u >> numLeadingZeros]; */
982  /* pY5 = &pYLocal[0xC0000000u >> numLeadingZeros]; */
983  /* pY6 = &pYLocal[0x60000000u >> numLeadingZeros]; */
984  /* pY7 = &pYLocal[0xE0000000u >> numLeadingZeros]; */
985 
986  pY0 = (cfloat*) (pY + (0x00000000u));
987  pY1 = (cfloat*) (pY + ((0x80000000u >> numLeadingZeros) << 1));
988  pY2 = (cfloat*) (pY + ((0x20000000u >> numLeadingZeros) << 1));
989  pY3 = (cfloat*) (pY + ((0xA0000000u >> numLeadingZeros) << 1));
990  pY4 = (cfloat*) (pY + ((0x40000000u >> numLeadingZeros) << 1));
991  pY5 = (cfloat*) (pY + ((0xC0000000u >> numLeadingZeros) << 1));
992  pY6 = (cfloat*) (pY + ((0x60000000u >> numLeadingZeros) << 1));
993  pY7 = (cfloat*) (pY + ((0xE0000000u >> numLeadingZeros) << 1));
994 
995 #ifdef CL7X_HE_CFLOAT_PTR_BUG
996  float *myPY0 = (float *) pY0;
997  float *myPY1 = (float *) pY1;
998  float *myPY2 = (float *) pY2;
999  float *myPY3 = (float *) pY3;
1000  float *myPY4 = (float *) pY4;
1001  float *myPY5 = (float *) pY5;
1002  float *myPY6 = (float *) pY6;
1003  float *myPY7 = (float *) pY7;
1004 #endif
1005 
1006  for (l = 0; l < numChannels; l++) {
1007  for (k = 0; k < numPoints >> 3; k += 8) {
1008  offsetBitReverse = __bit_reverse (k) >> numLeadingZeros;
1009 
1010  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
1011  vX_0_1 = c7x::strm_eng<0, CV>::get_adv ();
1012  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
1013  vX_N_4_1 = c7x::strm_eng<0, CV>::get_adv ();
1014  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
1015  vX_N_2_1 = c7x::strm_eng<0, CV>::get_adv ();
1016  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
1017  vX_3N_4_1 = c7x::strm_eng<0, CV>::get_adv ();
1018 
1019  vSum1 = vX_0 + vX_N_2;
1020  vSum2 = vX_N_4 + vX_3N_4;
1021  vDiff1 = vX_0 - vX_N_2;
1022  vDiff2 = vX_N_4 - vX_3N_4;
1023 
1024  vX0 = vSum1 + vSum2;
1025  vX1 = vDiff1 - __vcrot90sp_vv (vDiff2);
1026  vX2 = vSum1 - vSum2;
1027  vX3 = vDiff1 + __vcrot90sp_vv (vDiff2);
1028 
1029  vSum1_1 = vX_0_1 + vX_N_2_1;
1030  vSum2_1 = vX_N_4_1 + vX_3N_4_1;
1031  vDiff1_1 = vX_0_1 - vX_N_2_1;
1032  vDiff2_1 = vX_N_4_1 - vX_3N_4_1;
1033 
1034  vX0Temp = vSum1_1 + vSum2_1;
1035  vX1Temp = vDiff1_1 - __vcrot90sp_vv (vDiff2_1);
1036  vX2Temp = vSum1_1 - vSum2_1;
1037  vX3Temp = vDiff1_1 + __vcrot90sp_vv (vDiff2_1);
1038 
1039  vX0_1 = vX0Temp;
1040  vX1_1 = __complex_multiply (vX1Temp, vTwX1);
1041  vX2_1 = __complex_multiply (vX2Temp, vTwX2);
1042  vX3_1 = __complex_multiply (vX3Temp, vTwX3);
1043 
1044  vX0_2PtDft_1 = vX0 + vX0_1;
1045  vX0_2PtDft_2 = vX0 - vX0_1;
1046  vX1_2PtDft_1 = vX1 + vX1_1;
1047  vX1_2PtDft_2 = vX1 - vX1_1;
1048  vX2_2PtDft_1 = vX2 + vX2_1;
1049  vX2_2PtDft_2 = vX2 - vX2_1;
1050  vX3_2PtDft_1 = vX3 + vX3_1;
1051  vX3_2PtDft_2 = vX3 - vX3_1;
1052 
1053  __vstore_reverse_bit ((CVP) (pY0 + offsetBitReverse),
1054  vX0_2PtDft_1);
1055  __vstore_reverse_bit ((CVP) (pY1 + offsetBitReverse),
1056  vX0_2PtDft_2);
1057  __vstore_reverse_bit ((CVP) (pY2 + offsetBitReverse),
1058  vX1_2PtDft_1);
1059  __vstore_reverse_bit ((CVP) (pY3 + offsetBitReverse),
1060  vX1_2PtDft_2);
1061  __vstore_reverse_bit ((CVP) (pY4 + offsetBitReverse),
1062  vX2_2PtDft_1);
1063  __vstore_reverse_bit ((CVP) (pY5 + offsetBitReverse),
1064  vX2_2PtDft_2);
1065  __vstore_reverse_bit ((CVP) (pY6 + offsetBitReverse),
1066  vX3_2PtDft_1);
1067  __vstore_reverse_bit ((CVP) (pY7 + offsetBitReverse),
1068  vX3_2PtDft_2);
1069  }
1070 
1071 #ifdef CL7X_HE_CFLOAT_PTR_BUG
1072  myPY0 += (numPoints << 1);
1073  myPY1 += (numPoints << 1);
1074  myPY2 += (numPoints << 1);
1075  myPY3 += (numPoints << 1);
1076  myPY4 += (numPoints << 1);
1077  myPY5 += (numPoints << 1);
1078  myPY6 += (numPoints << 1);
1079  myPY7 += (numPoints << 1);
1080 
1081  pY0 = (cfloat*) myPY0;
1082  pY1 = (cfloat*) myPY1;
1083  pY2 = (cfloat*) myPY2;
1084  pY3 = (cfloat*) myPY3;
1085  pY4 = (cfloat*) myPY4;
1086  pY5 = (cfloat*) myPY5;
1087  pY6 = (cfloat*) myPY6;
1088  pY7 = (cfloat*) myPY7;
1089 
1090 #else
1091  pY0 += numPoints;
1092  pY1 += numPoints;
1093  pY2 += numPoints;
1094  pY3 += numPoints;
1095  pY4 += numPoints;
1096  pY5 += numPoints;
1097  pY6 += numPoints;
1098  pY7 += numPoints;
1099 #endif
1100  }
1101  __SE0_CLOSE ();
1102  }
1103  }
1104  }
1105 
1106  return (status);
1107 }
1108 
1109 #if (!defined(FFTLIB_REMOVE_CHECK_PARAMS) && \
1110  !defined(FFTLIB_FFT1DBATCHED_I32FC_C32FC_O32FC_REMOVE_CHECK_PARAMS)) || \
1111  (defined(FFTLIB_CHECK_PARAMS)) || \
1112  (defined(FFTLIB_FFT1DBATCHED_I32FC_C32FC_O32FC_CHECK_PARAMS))
1113 
1115  FFTLIB_F32 *pX,
1116  FFTLIB_bufParams1D_t *bufParamsX,
1117  FFTLIB_F32 *pW,
1118  FFTLIB_bufParams1D_t *bufParamsW,
1119  FFTLIB_F32 *pY,
1120  FFTLIB_bufParams1D_t *bufParamsY,
1121  uint32_t numPoints,
1122  uint32_t numChannels,
1123  void *pBlock)
1124 {
1125  FFTLIB_STATUS status = FFTLIB_SUCCESS;
1126 
1127  if ((pX == NULL) || (pW == NULL) || (pY == NULL) || (pBlock == NULL)) {
1128  status = FFTLIB_ERR_NULL_POINTER;
1129  }
1130  else if (bufParamsX->dim_x != bufParamsY->dim_x) {
1132  }
1133  else if (bufParamsX->dim_x < numPoints * numChannels * 2) {
1134  /* In general, dim_x == numPoints*numChannels*2. However,
1135  * optimized kernel requires dim_x to be atleast 64*2. Hence, for
1136  * small values of numPoints*numChannels, dim_x could be greater
1137  * than numPoints*numChannels*2 */
1139  }
1140  else if (bufParamsX->dim_x < 64 * 2) {
1142  }
1143  else if (bufParamsW->dim_x != numPoints * 2) {
1145  }
1146  else if ((bufParamsX->data_type != FFTLIB_FLOAT32) ||
1147  (bufParamsW->data_type != FFTLIB_FLOAT32) ||
1148  (bufParamsY->data_type != FFTLIB_FLOAT32)) {
1149  status = FFTLIB_ERR_INVALID_TYPE;
1150  }
1151  else if (((uint64_t) pX) & 0xFu) { /* pX must be 16-byte aligned for a */
1152  status = FFTLIB_ERR_NOT_ALIGNED_PTRS_STRIDES; /* streaming engine
1153  configuration */
1154  }
1155  else {
1156  /* Check if number of pts is a power of 2 */
1157  uint32_t k = 0;
1158  while (k < 32) {
1159  if (numPoints & (1u << k)) {
1160  break;
1161  }
1162  k++;
1163  }
1164  if ((1u << k) != numPoints) {
1166  }
1167 
1168  if ((numChannels != 1) && (numChannels != 2) && (numChannels != 4) &&
1169  (numChannels != 8) && (numChannels != 16)) {
1171  }
1172  }
1173  return (status);
1174 }
1175 
1176 #endif
@ FFTLIB_FLOAT32
c7x::cfloat_vec CV
FFTLIB_STATUS_NAME
The enumeration of all status codes.
Definition: FFTLIB_types.h:172
@ FFTLIB_ERR_INVALID_TYPE
Definition: FFTLIB_types.h:176
@ FFTLIB_ERR_NULL_POINTER
Definition: FFTLIB_types.h:178
@ FFTLIB_ERR_INVALID_DIMENSION
Definition: FFTLIB_types.h:177
@ FFTLIB_SUCCESS
Definition: FFTLIB_types.h:173
@ FFTLIB_ERR_NOT_ALIGNED_PTRS_STRIDES
Definition: FFTLIB_types.h:181
float FFTLIB_F32
Single precision floating point.
Definition: FFTLIB_types.h:169
FFTLIB_STATUS FFTLIB_fft1dBatched_i32fc_c32fc_o32fc_init(FFTLIB_F32 *pX, FFTLIB_bufParams1D_t *bufParamsX, FFTLIB_F32 *pW, FFTLIB_bufParams1D_t *bufParamsW, FFTLIB_F32 *pY, FFTLIB_bufParams1D_t *bufParamsY, uint32_t numPoints, uint32_t numChannels, void *pBlock)
This function should be called before the FFTLIB_fft1dBatched_i32fc_c32fc_o32fc_kernel function is called.
FFTLIB_STATUS FFTLIB_fft1dBatched_i32fc_c32fc_o32fc_kernel(FFTLIB_F32 *pX, FFTLIB_bufParams1D_t *bufParamsX, FFTLIB_F32 *pW, FFTLIB_bufParams1D_t *bufParamsW, FFTLIB_F32 *pY, FFTLIB_bufParams1D_t *bufParamsY, uint32_t numPoints, uint32_t numChannels, void *pBlock)
This function is the main kernel compute function.
FFTLIB_STATUS FFTLIB_fft1dBatched_i32fc_c32fc_o32fc_checkParams(FFTLIB_F32 *pX, FFTLIB_bufParams1D_t *bufParamsX, FFTLIB_F32 *pW, FFTLIB_bufParams1D_t *bufParamsW, FFTLIB_F32 *pY, FFTLIB_bufParams1D_t *bufParamsY, uint32_t numPoints, uint32_t numChannels, void *pBlock)
This function checks the validity of the parameters passed to FFTLIB_fft1dBatched_i32fc_c32fc_o32fc_init.
A structure for a 1 dimensional buffer descriptor.
uint32_t data_type
Values are of type FFTLIB_data_type_e.
uint32_t dim_x
Width of buffer in X dimension in elements.