FFTLIB User Guide
c7504/FFTLIB_fft1d_i32fc_c32fc_o32fc_ci.cpp
Go to the documentation of this file.
1 /*******************************************************************************
2 **+--------------------------------------------------------------------------+**
3 **| **** |**
4 **| **** |**
5 **| ******o*** |**
6 **| ********_///_**** |**
7 **| ***** /_//_/ **** |**
8 **| ** ** (__/ **** |**
9 **| ********* |**
10 **| **** |**
11 **| *** |**
12 **| |**
13 **| Copyright (c) 2017 Texas Instruments Incorporated |**
14 **| ALL RIGHTS RESERVED |**
15 **| |**
16 **| Permission to use, copy, modify, or distribute this software, |**
17 **| whether in part or in whole, for any purpose is forbidden without |**
18 **| a signed licensing agreement and NDA from Texas Instruments |**
19 **| Incorporated (TI). |**
20 **| |**
21 **| TI makes no representation or warranties with respect to the |**
22 **| performance of this computer program, and specifically disclaims |**
23 **| any responsibility for any damages, special or consequential, |**
24 **| connected with the use of this program. |**
25 **| |**
26 **+--------------------------------------------------------------------------+**
27 *******************************************************************************/
28 
29 #include "../FFTLIB_fft1d_i32fc_c32fc_o32fc.h"
30 
/* TRACE_ON gates the printv.h / stdio.h includes below through `#if TRACE_ON`;
 * with the value 0 they are left out. */
31 #define TRACE_ON (0)
/* NOTE(review): the kernel tests this macro with `#ifdef LAST_LOOP_UNROLL`,
 * which only checks that the macro is defined. The unrolled last-stage path is
 * therefore compiled even though the value is 0 -- confirm whether `#if` was
 * intended. */
32 #define LAST_LOOP_UNROLL 0
33 
34 #if TRACE_ON
35 #include "../../../common/printv.h"
36 #include <stdio.h>
37 #endif
38 
39 // CODE_SECTION(FFTLIB_fft1d, ".text:optimized")
40 // CODE_SECTION(FFTLIB_fft1d_core, ".text:optimized")
41 // CODE_SECTION(FFTLIB_fft1d_checkParams, ".text:optimized")
42 
/* Byte offsets, inside the caller-provided pBlock buffer, of the templates
 * that the init function stores and the kernel function reloads.
 * Layout: six __SE_TEMPLATE_v1 slots (loops 1-5, then the twiddle stream), each
 * SE_PARAM_SIZE bytes after the previous one, followed by three
 * __SA_TEMPLATE_v1 slots, each SA_PARAM_SIZE bytes after the previous one.
 * SE_PARAM_SIZE and SA_PARAM_SIZE are not defined in this listing. */
43 #define SE_PARAM_BASE (0x0000)
44 #define SE_LOOP1_PARAM_OFFSET (SE_PARAM_BASE)
45 #define SE_LOOP2_PARAM_OFFSET (SE_LOOP1_PARAM_OFFSET + SE_PARAM_SIZE)
46 #define SE_LOOP3_PARAM_OFFSET (SE_LOOP2_PARAM_OFFSET + SE_PARAM_SIZE)
47 #define SE_LOOP4_PARAM_OFFSET (SE_LOOP3_PARAM_OFFSET + SE_PARAM_SIZE)
48 #define SE_LOOP5_PARAM_OFFSET (SE_LOOP4_PARAM_OFFSET + SE_PARAM_SIZE)
/* The twiddle-factor SE template follows the five data-loop SE templates. */
49 #define SE_TWID_PARAM_OFFSET (SE_LOOP5_PARAM_OFFSET + SE_PARAM_SIZE)
/* The SA templates start right after the last SE template. */
50 #define SA_LOOP1_PARAM_OFFSET (SE_TWID_PARAM_OFFSET + SE_PARAM_SIZE)
51 #define SA_LOOP2_PARAM_OFFSET (SA_LOOP1_PARAM_OFFSET + SA_PARAM_SIZE)
52 #define SA_LOOP3_PARAM_OFFSET (SA_LOOP2_PARAM_OFFSET + SA_PARAM_SIZE)
53 
56  FFTLIB_bufParams1D_t *bufParamsX,
57  FFTLIB_F32 *pW,
58  FFTLIB_bufParams1D_t *bufParamsW,
59  FFTLIB_F32 *pY,
60  FFTLIB_bufParams1D_t *bufParamsY,
61  void *pBlock)
62 {
64 
65 #if defined(FFTLIB_CHECK_PARAMS) || \
66  defined(FFTLIB_FFT1D_I32FC_C32FC_O32FC_CHECK_PARAMS)
68  pX, bufParamsX, pW, bufParamsW, pY, bufParamsY, pBlock);
69  if (status == FFTLIB_SUCCESS)
70 #endif
71  {
72  uint32_t numPoints;
73  uint32_t numPointsPerDft;
74  uint32_t seCnt1, seCnt2, seCnt3, seCnt4;
75  __SE_TEMPLATE_v1 se0_param = __gen_SE_TEMPLATE_v1 ();
76  __SE_TEMPLATE_v1 se1_param = __gen_SE_TEMPLATE_v1 ();
77  __SA_TEMPLATE_v1 sa0_param = __gen_SA_TEMPLATE_v1 ();
78 
79  numPoints = bufParamsX->dim_x >> 1;
80  numPointsPerDft = numPoints;
81  seCnt1 = numPoints >> 2;
82  seCnt2 = numPoints >> 4;
83  seCnt3 = 1;
84  seCnt4 = numPoints >> 2;
85 
86  uint32_t elementSize = c7x::element_count_of<c7x::cfloat_vec>::value;
87  /* printf ("elementSize: %d\n", elementSize); */
88  /* se0_param = (0); */
89  se0_param.ICNT0 = elementSize;
90  se0_param.ICNT1 = 4; /* 4 quarters(Offsets: 0, N/4, N/2, 3N/4) */
91  se0_param.DIM1 = seCnt1;
92  se0_param.ICNT2 = seCnt2; /* Number of 8-point fetches within each */
93  se0_param.DIM2 = elementSize;
94  se0_param.ICNT3 = seCnt3; /* Number of DFT's */
95  se0_param.DIM3 = numPointsPerDft;
96 
97  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
98  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
99  se0_param.DIMFMT = __SE_DIMFMT_4D;
100  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP1_PARAM_OFFSET)) =
101  se0_param;
102 
103  /* se1_param = (0); */
104  se1_param.ICNT0 = elementSize;
105  se1_param.ICNT1 = 3;
106  se1_param.DIM1 = seCnt1; /* Twiddle factors for x1, x2 and x3 */
107  se1_param.ICNT2 = seCnt2; /* Number of 8-point fetches within each */
108  se1_param.DIM2 = elementSize;
109  se1_param.ICNT3 = seCnt3; /* Number of DFT's */
110  se1_param.DIM3 = 0;
111 
112  se1_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
113  se1_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
114  se1_param.DIMFMT = __SE_DIMFMT_4D;
115  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_TWID_PARAM_OFFSET)) =
116  se1_param;
117 
118  /* sa0_param = (0); */
119  sa0_param.ICNT0 = elementSize;
120  sa0_param.ICNT1 = 4;
121  sa0_param.DIM1 = seCnt1; /* Save to each of the 4 quarters */
122  sa0_param.ICNT2 = seCnt2; /* Number of 8-point stores within each */
123  sa0_param.DIM2 = elementSize;
124  sa0_param.ICNT3 = seCnt3;
125  sa0_param.DIM3 = numPointsPerDft; /* Number of DFT's */
126 
127  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
128  sa0_param.DIMFMT = __SA_DIMFMT_4D;
129  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP1_PARAM_OFFSET)) =
130  sa0_param;
131 
132  /* se0_param = (0); */
133  se0_param.ICNT0 = elementSize; /* Fetch first two quarters */
134  se0_param.ICNT1 = 1;
135  se0_param.DIM1 = 16; /* Process two 16-point DFTs in one shot */
136  se0_param.ICNT2 = seCnt2;
137  se0_param.DIM2 = 16; /* Half the number of DFT's */
138 
139  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
140  /* se0_param.TRANSPOSE = */
141  /* __SE_TRANSPOSE_128BIT; /\* Using 256BIT transpose required *\/ */
142  /* /\* 16-byte alignment on pX *\/ */
143  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
144  se0_param.DIMFMT = __SE_DIMFMT_3D;
145  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP2_PARAM_OFFSET)) =
146  se0_param;
147 
148  /* sa0_param = (0); */
149  sa0_param.ICNT0 = numPoints;
150 
151  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
152  sa0_param.DIMFMT = __SA_DIMFMT_1D;
153  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP2_PARAM_OFFSET)) =
154  sa0_param;
155 
156  /* se0_param = (0); */
157  se0_param = __gen_SE_TEMPLATE_v1 ();
158  se0_param.ICNT0 = numPoints;
159 
160  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
161  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
162  se0_param.DIMFMT = __SE_DIMFMT_1D;
163  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP3_PARAM_OFFSET)) =
164  se0_param;
165 
166  /* sa0_param = (0); */
167  sa0_param.ICNT0 = numPoints;
168 
169  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
170  sa0_param.DIMFMT = __SA_DIMFMT_1D;
171  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP3_PARAM_OFFSET)) =
172  sa0_param;
173 
174  /* se0_param = (0); */
175  se0_param.ICNT0 = seCnt4; /* Fetch consecutive four points for DFT */
176  se0_param.ICNT1 = elementSize;
177  se0_param.DIM1 = seCnt4;
178  /* Fetch 8 points separated by */ /* (numPoints >>
179  3). This fetch
180  pattern */
181  /* can be used for bit reversal */
182 
183  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
184  se0_param.TRANSPOSE = __SE_TRANSPOSE_64BIT;
185  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
186  se0_param.DIMFMT = __SE_DIMFMT_2D;
187  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP4_PARAM_OFFSET)) =
188  se0_param;
189 
190  /* se0_param = (0); */
191  se0_param.ICNT0 = seCnt4;
192  se0_param.ICNT1 = elementSize;
193  se0_param.DIM1 = seCnt4;
194  /* Fetch 8 points separated by */ /* (numPoints >>
195  3). This fetch
196  pattern */
197  /* can be used for bit reversal */
198 
199  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
200  se0_param.TRANSPOSE = __SE_TRANSPOSE_64BIT;
201  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
202  se0_param.DIMFMT = __SE_DIMFMT_2D;
203  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP5_PARAM_OFFSET)) =
204  se0_param;
205  }
206  return (status);
207 }
208 
/* Kernel compute function (FFTLIB_fft1d_i32fc_c32fc_o32fc_kernel according to
 * the symbol index at the end of this listing; the lines carrying the return
 * type, the name and the first parameter are missing from the listing).
 *
 * Visible structure:
 *   1. Reload the SE0 / SE1 / SA0 templates that init stored in pBlock.
 *   2. While numPointsPerDft >= 16: run one radix-4 stage with twiddle
 *      multiplication. Data is read from pXLocal through SE0 and written back
 *      to pXLocal through SA0, so pX is overwritten.
 *   3. Final stage, written to pY with bit-reversed stores: either a plain
 *      4-point stage (numPointsPerDft == 4) or a 4-point stage followed by a
 *      2-point stage (any other remaining size).
 * Returns `status`, which stays FFTLIB_SUCCESS unless the optional parameter
 * check below reports an error. */
211  FFTLIB_bufParams1D_t *bufParamsX,
212  FFTLIB_F32 *pW,
213  FFTLIB_bufParams1D_t *bufParamsW,
214  FFTLIB_F32 *pY,
215  FFTLIB_bufParams1D_t *bufParamsY,
216  void *pBlock)
217 {
218  uint32_t k;
219  FFTLIB_STATUS status = FFTLIB_SUCCESS;
220  uint32_t numPoints;
221  uint32_t numPointsPerDft;
222  uint32_t numLeadingZeros;
223  uint32_t offsetBitReverse;
224  uint32_t seCnt1, seCnt2, seCnt3;
225  __SE_TEMPLATE_v1 se0_param = __gen_SE_TEMPLATE_v1 ();
226  __SE_TEMPLATE_v1 se1_param = __gen_SE_TEMPLATE_v1 ();
227  __SA_TEMPLATE_v1 sa0_param = __gen_SA_TEMPLATE_v1 ();
228 
/* Complex views of the caller buffers. pYLocal is assigned below but is only
 * referenced from commented-out code. pY0..pY7 are the output base pointers
 * used by the final stage. */
229  cfloat* restrict pXLocal;
230  cfloat* restrict pYLocal;
231  cfloat* restrict pWLocal;
232  cfloat* restrict pY0;
233  cfloat* restrict pY1;
234  cfloat* restrict pY2;
235  cfloat* restrict pY3;
236  cfloat* restrict pY4;
237  cfloat* restrict pY5;
238  cfloat* restrict pY6;
239  cfloat* restrict pY7;
240 
241  typedef typename c7x::cfloat_vec CV;
242  typedef CV* CVP;
243 
/* V and VP are not used anywhere else in this function. */
244  typedef typename c7x::float_vec V;
245  typedef V* VP;
246 
247  CV vX_0, vX_N_4, vX_N_2, vX_3N_4;
248  CV vSum1, vSum2, vDiff1, vDiff2;
249  CV vTwX1, vTwX2, vTwX3;
250  CV vX0Temp, vX1Temp, vX2Temp, vX3Temp;
251  CV vX0, vX1, vX2, vX3;
252  CV vX_0_1, vX_N_4_1, vX_N_2_1, vX_3N_4_1;
253  CV vSum1_1, vSum2_1, vDiff1_1, vDiff2_1;
254  CV vX0_1, vX1_1, vX2_1, vX3_1;
255  CV vX0_2PtDft_1, vX0_2PtDft_2;
256  CV vX1_2PtDft_1, vX1_2PtDft_2;
257  CV vX2_2PtDft_1, vX2_2PtDft_2;
258  CV vX3_2PtDft_1, vX3_2PtDft_2;
259  cfloat twTemp;
260 
/* NOTE(review): this guard tests only FFTLIB_CHECK_PARAMS, whereas the init
 * function also honours FFTLIB_FFT1D_I32FC_C32FC_O32FC_CHECK_PARAMS -- confirm
 * whether the difference is intentional.
 * The line that precedes the argument list below is missing from this listing;
 * presumably it assigns `status` from the checkParams call -- verify against
 * the original file. */
261 #ifdef FFTLIB_CHECK_PARAMS
263  pX, bufParamsX, pW, bufParamsW, pY, bufParamsY, pBlock);
264  if (status == FFTLIB_SUCCESS)
265 #endif
266  {
/* dim_x is halved to obtain the number of complex points. */
267  numPoints = bufParamsX->dim_x >> 1;
268  numPointsPerDft = numPoints;
269 
/* Reload the templates prepared by init. */
270  se0_param =
271  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP1_PARAM_OFFSET));
272  se1_param =
273  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_TWID_PARAM_OFFSET));
274  sa0_param =
275  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP1_PARAM_OFFSET));
276  seCnt1 = numPointsPerDft >> 2;
277  seCnt2 = numPointsPerDft >> 4;
278  seCnt3 = 1;
279 
280  pXLocal = (cfloat*) pX;
281  pWLocal = (cfloat*) pW;
282  pYLocal = (cfloat*) pY;
283 
/* One pass of this loop is one radix-4 stage over all numPoints points. After
 * each pass the DFT size shrinks by 4 and the DFT count (seCnt3) grows by 4. */
284  while (numPointsPerDft >= 16) {
285  /* TODO OPT: Calculate params upfront in init function,
286  * rather than generating SE params on the fly here */
/* DIM2 is hard-coded to 4 here, while init stored the vector element count in
 * the same field. */
287  se0_param.ICNT1 = 4;
288  se0_param.DIM1 = seCnt1; /* 4 quarters(Offsets: 0, N/4, N/2, 3N/4) */
289  se0_param.ICNT2 = seCnt2;
290  se0_param.DIM2 = 4;
291  /* Number of 8-point fetches within */ /* each quarter */
292  se0_param.ICNT3 = seCnt3;
293  se0_param.DIM3 = numPointsPerDft; /* Number of DFT's */
294  __SE0_OPEN ((void *) pXLocal, se0_param);
295 
/* DIM3 = 0: the twiddle stream returns to the same start for every DFT. */
296  se1_param.ICNT1 = 3;
297  se1_param.DIM1 = seCnt1; /* Twiddle factors for x1, x2 and x3 */
298  se1_param.ICNT2 = seCnt2; /* Number of 8-point fetches within each */
299  se1_param.DIM2 = 4; /* quarter */
300  se1_param.ICNT3 = seCnt3; /* Number of DFT's */
301  se1_param.DIM3 = 0;
302  __SE1_OPEN ((void *) pWLocal, se1_param);
303 
304  sa0_param.ICNT1 = 4;
305  sa0_param.DIM1 = /* Save to each of the 4 quarters */ seCnt1;
306  sa0_param.ICNT2 = seCnt2;
307  sa0_param.DIM2 = 4;
308  /* Number of 8-point stores within */ /* each quarter */
309  sa0_param.ICNT3 = seCnt3;
310  sa0_param.DIM3 = numPointsPerDft;
311  /* Number of DFT's */
312  __SA0_OPEN (sa0_param);
313 
314  /* Loop is unrolled twice for better optimization */
/* Each pass reads 8 vectors from SE0 (two butterflies of 4 vectors each). The
 * step of 32 matches that only if a vector holds 4 complex points -- assumed,
 * consistent with the hard-coded DIM2 = 4 above; TODO confirm. */
315  for (k = 0; k < numPoints; k += 32) {
316 
317  /* First iteration of loop unroll */
318  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
319  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
320  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
321  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
322 
323  vSum1 = vX_0 + vX_N_2;
324  vSum2 = vX_N_4 + vX_3N_4;
325  vDiff1 = vX_0 - vX_N_2;
326  vDiff2 = vX_N_4 - vX_3N_4;
327 
328  vTwX1 = c7x::strm_eng<1, CV>::get_adv ();
329  vTwX2 = c7x::strm_eng<1, CV>::get_adv ();
330  vTwX3 = c7x::strm_eng<1, CV>::get_adv ();
331 
332  vX0Temp = vSum1 + vSum2;
333  vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
334  vX2Temp = vSum1 - vSum2;
335  vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);
336 
/* Output 0 is stored without a twiddle multiplication. */
337  vX0 = vX0Temp;
338  vX1 = __complex_multiply (vX1Temp, vTwX1);
339  vX2 = __complex_multiply (vX2Temp, vTwX2);
340  vX3 = __complex_multiply (vX3Temp, vTwX3);
341 
342  /* __SA0ADV(CV, pXLocal) = vX0; */
343  /* __SA0ADV(CV, pXLocal) = vX2; */
344  /* __SA0ADV(CV, pXLocal) = vX1; */
345  /* __SA0ADV(CV, pXLocal) = vX3; */
346 
/* Predicated stores back into pXLocal through SA0. Note the store order:
 * vX0, vX2, vX1, vX3. */
347  __vpred tmp;
348  CVP addr;
349  tmp = c7x::strm_agen<0, CV>::get_vpred ();
350  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
351  __vstore_pred (tmp, addr, vX0);
352 
353  tmp = c7x::strm_agen<0, CV>::get_vpred ();
354  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
355  __vstore_pred (tmp, addr, vX2);
356 
357  tmp = c7x::strm_agen<0, CV>::get_vpred ();
358  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
359  __vstore_pred (tmp, addr, vX1);
360 
361  tmp = c7x::strm_agen<0, CV>::get_vpred ();
362  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
363  __vstore_pred (tmp, addr, vX3);
364 
365  /* Second iteration of loop unroll */
366 
367  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
368  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
369  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
370  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
371 
372  vSum1 = vX_0 + vX_N_2;
373  vSum2 = vX_N_4 + vX_3N_4;
374  vDiff1 = vX_0 - vX_N_2;
375  vDiff2 = vX_N_4 - vX_3N_4;
376 
377  vTwX1 = c7x::strm_eng<1, CV>::get_adv ();
378  vTwX2 = c7x::strm_eng<1, CV>::get_adv ();
379  vTwX3 = c7x::strm_eng<1, CV>::get_adv ();
380 
381  vX0Temp = vSum1 + vSum2;
382  vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
383  vX2Temp = vSum1 - vSum2;
384  vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);
385 
386  vX0 = vX0Temp;
387  vX1 = __complex_multiply (vX1Temp, vTwX1);
388  vX2 = __complex_multiply (vX2Temp, vTwX2);
389  vX3 = __complex_multiply (vX3Temp, vTwX3);
390 
391  /* __SA0ADV(CV, pXLocal) = vX0; */
392  /* __SA0ADV(CV, pXLocal) = vX2; */
393  /* __SA0ADV(CV, pXLocal) = vX1; */
394  /* __SA0ADV(CV, pXLocal) = vX3; */
395 
396  tmp = c7x::strm_agen<0, CV>::get_vpred ();
397  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
398  __vstore_pred (tmp, addr, vX0);
399 
400  tmp = c7x::strm_agen<0, CV>::get_vpred ();
401  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
402  __vstore_pred (tmp, addr, vX2);
403 
404  tmp = c7x::strm_agen<0, CV>::get_vpred ();
405  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
406  __vstore_pred (tmp, addr, vX1);
407 
408  tmp = c7x::strm_agen<0, CV>::get_vpred ();
409  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
410  __vstore_pred (tmp, addr, vX3);
411  }
412  __SA0_CLOSE ();
413  __SE0_CLOSE ();
414  __SE1_CLOSE ();
415 
/* Prepare the next stage. numPointsPerDft is divided by 4 first, so the
 * twiddle pointer advances by 3 * (new DFT size) complex values. */
416  numPointsPerDft >>= 2;
417  pWLocal += numPointsPerDft * 3;
418  seCnt1 >>= 2;
419  seCnt2 >>= 2;
420  seCnt3 <<= 2;
421  /* printf ("\n\n"); */
422  /* for (int32_t u = 0; u < 64 * 2; u++) { */
423  /* printf ("%f, ", ((float *) pX)[u]); */
424  /* } */
425  }
426  if (numPointsPerDft == 4) {
427  /* 4-point stage with bit-reversal */
428  se0_param = *(
429  (__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP4_PARAM_OFFSET));
430  __SE0_OPEN ((void *) pXLocal, se0_param);
431 
/* The result is used below as the right-shift applied to the output base
 * offsets and to __bit_reverse (k). */
432  numLeadingZeros = __norm ((int32_t) (numPoints - 1)) + 1;
433 
434  /* pY0 = &pYLocal[0x00000000u]; */
435  /* pY1 = &pYLocal[0x40000000u >> numLeadingZeros]; */
436  /* pY2 = &pYLocal[0x80000000u >> numLeadingZeros]; */
437  /* pY3 = &pYLocal[0xC0000000u >> numLeadingZeros]; */
438 
/* pY is a float pointer, so each complex offset is doubled (<< 1) before the
 * cast to cfloat*. */
439  pY0 = (cfloat*) (pY + 0);
440  pY1 = (cfloat*) (pY + ((0x40000000u >> numLeadingZeros) << 1));
441  pY2 = (cfloat*) (pY + ((0x80000000u >> numLeadingZeros) << 1));
442  pY3 = (cfloat*) (pY + ((0xC0000000u >> numLeadingZeros) << 1));
443 
/* `#ifdef` is true whenever LAST_LOOP_UNROLL is defined, including when it is
 * defined as 0. */
444 #ifdef LAST_LOOP_UNROLL
445  /* pY4 = &pYLocal[0x20000000u >> numLeadingZeros]; */
446  /* pY5 = &pYLocal[0x60000000u >> numLeadingZeros]; */
447  /* pY6 = &pYLocal[0xA0000000u >> numLeadingZeros]; */
448  /* pY7 = &pYLocal[0xE0000000u >> numLeadingZeros]; */
449 
450  pY4 = (cfloat*) (pY + ((0x20000000u >> numLeadingZeros) << 1));
451  pY5 = (cfloat*) (pY + ((0x60000000u >> numLeadingZeros) << 1));
452  pY6 = (cfloat*) (pY + ((0xA0000000u >> numLeadingZeros) << 1));
453  pY7 = (cfloat*) (pY + ((0xE0000000u >> numLeadingZeros) << 1));
454 #endif
455 
456 #ifdef LAST_LOOP_UNROLL
457  for (k = 0; k < numPoints >> 2; k += 8)
458 #else
459  for (k = 0; k < numPoints >> 2; k += 4)
460 #endif
461  {
462  offsetBitReverse = __bit_reverse (k) >> numLeadingZeros;
463 
464  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
465  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
466  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
467  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
468 
469  vSum1 = vX_0 + vX_N_2;
470  vSum2 = vX_N_4 + vX_3N_4;
471  vDiff1 = vX_0 - vX_N_2;
472  vDiff2 = vX_N_4 - vX_3N_4;
473 
/* No twiddle multiplication in this final 4-point stage. */
474  vX0 = vSum1 + vSum2;
475  vX1 = vDiff1 - __vcrot90sp_vv (vDiff2);
476  vX2 = vSum1 - vSum2;
477  vX3 = vDiff1 + __vcrot90sp_vv (vDiff2);
478 
479  __vstore_reverse_bit ((CVP) (pY0 + offsetBitReverse), vX0);
480  __vstore_reverse_bit ((CVP) (pY1 + offsetBitReverse), vX1);
481  __vstore_reverse_bit ((CVP) (pY2 + offsetBitReverse), vX2);
482  __vstore_reverse_bit ((CVP) (pY3 + offsetBitReverse), vX3);
483 
/* Second butterfile of the unrolled pass: same arithmetic, stored through
 * pY4..pY7 with the same offsetBitReverse. */
484 #ifdef LAST_LOOP_UNROLL
485  vX_0_1 = c7x::strm_eng<0, CV>::get_adv ();
486  vX_N_4_1 = c7x::strm_eng<0, CV>::get_adv ();
487  vX_N_2_1 = c7x::strm_eng<0, CV>::get_adv ();
488  vX_3N_4_1 = c7x::strm_eng<0, CV>::get_adv ();
489 
490  vSum1_1 = vX_0_1 + vX_N_2_1;
491  vSum2_1 = vX_N_4_1 + vX_3N_4_1;
492  vDiff1_1 = vX_0_1 - vX_N_2_1;
493  vDiff2_1 = vX_N_4_1 - vX_3N_4_1;
494 
495  vX0_1 = vSum1_1 + vSum2_1;
496  vX1_1 = vDiff1_1 - __vcrot90sp_vv (vDiff2_1);
497  vX2_1 = vSum1_1 - vSum2_1;
498  vX3_1 = vDiff1_1 + __vcrot90sp_vv (vDiff2_1);
499 
500  __vstore_reverse_bit ((CVP) (pY4 + offsetBitReverse), vX0_1);
501  __vstore_reverse_bit ((CVP) (pY5 + offsetBitReverse), vX1_1);
502  __vstore_reverse_bit ((CVP) (pY6 + offsetBitReverse), vX2_1);
503  __vstore_reverse_bit ((CVP) (pY7 + offsetBitReverse), vX3_1);
504 #endif
505  }
506  __SE0_CLOSE ();
507 
508  /* printf ("\nStage == 4\n"); */
509  /* for (int32_t u = 0; u < 64 * 2; u++) { */
510  /* printf ("%f, ", ((float *) pY)[u]); */
511  /* } */
512  }
513  else {
514 #if 1
515  /* 4-point stage followed by 2-point stage with bit-reversal */
516  se0_param = *(
517  (__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP5_PARAM_OFFSET));
518  __SE0_OPEN ((void *) pXLocal, se0_param);
519 
520  numLeadingZeros = __norm ((int32_t) (numPoints - 1)) + 1;
/* Three twiddle values are read at pWLocal[1], pWLocal[3] and pWLocal[5]
 * (relative to pWLocal on entry to this branch) and each is replicated into
 * all four elements of vTwX1, vTwX2 and vTwX3. The two preprocessor branches
 * differ only in how the vector is constructed. */
521 #if __C7X_HOSTEM__
522  pWLocal += 1;
523  twTemp = *pWLocal;
524  vTwX1 = CV (twTemp, twTemp, twTemp, twTemp);
525 #else
526  pWLocal += 1;
527  twTemp = *pWLocal;
528  vTwX1 = (CV) (twTemp, twTemp, twTemp, twTemp);
529 #endif
530 
531 #if __C7X_HOSTEM__
532  pWLocal += 2;
533  twTemp = *pWLocal;
534  vTwX2 = CV (twTemp, twTemp, twTemp, twTemp);
535 #else
536  pWLocal += 2;
537  twTemp = *pWLocal;
538  vTwX2 = (CV) (twTemp, twTemp, twTemp, twTemp);
539 #endif
540 #if __C7X_HOSTEM__
541  pWLocal += 2;
542  twTemp = *pWLocal;
543  vTwX3 = CV (twTemp, twTemp, twTemp, twTemp);
544 #else
545  pWLocal += 2;
546  twTemp = *pWLocal;
547  vTwX3 = (CV) (twTemp, twTemp, twTemp, twTemp);
548 #endif
549 
550  /* pY0 = &pYLocal[0x00000000u]; */
551  /* pY1 = &pYLocal[0x80000000u >> numLeadingZeros]; */
552  /* pY2 = &pYLocal[0x20000000u >> numLeadingZeros]; */
553  /* pY3 = &pYLocal[0xA0000000u >> numLeadingZeros]; */
554  /* pY4 = &pYLocal[0x40000000u >> numLeadingZeros]; */
555  /* pY5 = &pYLocal[0xC0000000u >> numLeadingZeros]; */
556  /* pY6 = &pYLocal[0x60000000u >> numLeadingZeros]; */
557  /* pY7 = &pYLocal[0xE0000000u >> numLeadingZeros]; */
558 
/* Eight output base pointers; as above, complex offsets are doubled because
 * pY is a float pointer. */
559  pY0 = (cfloat*) (pY + (0x00000000u));
560  pY1 = (cfloat*) (pY + ((0x80000000u >> numLeadingZeros) << 1));
561  pY2 = (cfloat*) (pY + ((0x20000000u >> numLeadingZeros) << 1));
562  pY3 = (cfloat*) (pY + ((0xA0000000u >> numLeadingZeros) << 1));
563  pY4 = (cfloat*) (pY + ((0x40000000u >> numLeadingZeros) << 1));
564  pY5 = (cfloat*) (pY + ((0xC0000000u >> numLeadingZeros) << 1));
565  pY6 = (cfloat*) (pY + ((0x60000000u >> numLeadingZeros) << 1));
566  pY7 = (cfloat*) (pY + ((0xE0000000u >> numLeadingZeros) << 1));
567 
568  for (k = 0; k < numPoints >> 2; k += 8) {
569  offsetBitReverse = __bit_reverse (k) >> numLeadingZeros;
570 
/* SE0 vectors alternate between the first butterfly (no suffix) and the
 * second butterfly (_1 suffix). */
571  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
572  vX_0_1 = c7x::strm_eng<0, CV>::get_adv ();
573  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
574  vX_N_4_1 = c7x::strm_eng<0, CV>::get_adv ();
575  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
576  vX_N_2_1 = c7x::strm_eng<0, CV>::get_adv ();
577  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
578  vX_3N_4_1 = c7x::strm_eng<0, CV>::get_adv ();
579 
580  vSum1 = vX_0 + vX_N_2;
581  vSum2 = vX_N_4 + vX_3N_4;
582  vDiff1 = vX_0 - vX_N_2;
583  vDiff2 = vX_N_4 - vX_3N_4;
584 
585  vX0 = vSum1 + vSum2;
586  vX1 = vDiff1 - __vcrot90sp_vv (vDiff2);
587  vX2 = vSum1 - vSum2;
588  vX3 = vDiff1 + __vcrot90sp_vv (vDiff2);
589 
590  vSum1_1 = vX_0_1 + vX_N_2_1;
591  vSum2_1 = vX_N_4_1 + vX_3N_4_1;
592  vDiff1_1 = vX_0_1 - vX_N_2_1;
593  vDiff2_1 = vX_N_4_1 - vX_3N_4_1;
594 
595  vX0Temp = vSum1_1 + vSum2_1;
596  vX1Temp = vDiff1_1 - __vcrot90sp_vv (vDiff2_1);
597  vX2Temp = vSum1_1 - vSum2_1;
598  vX3Temp = vDiff1_1 + __vcrot90sp_vv (vDiff2_1);
599 
/* Only the second butterfly's outputs 1..3 are multiplied by the twiddles. */
600  vX0_1 = vX0Temp;
601  vX1_1 = __complex_multiply (vX1Temp, vTwX1);
602  vX2_1 = __complex_multiply (vX2Temp, vTwX2);
603  vX3_1 = __complex_multiply (vX3Temp, vTwX3);
604 
/* 2-point stage: sum and difference of the matching outputs of the two
 * butterflies. */
605  vX0_2PtDft_1 = vX0 + vX0_1;
606  vX0_2PtDft_2 = vX0 - vX0_1;
607  vX1_2PtDft_1 = vX1 + vX1_1;
608  vX1_2PtDft_2 = vX1 - vX1_1;
609  vX2_2PtDft_1 = vX2 + vX2_1;
610  vX2_2PtDft_2 = vX2 - vX2_1;
611  vX3_2PtDft_1 = vX3 + vX3_1;
612  vX3_2PtDft_2 = vX3 - vX3_1;
613 
614  __vstore_reverse_bit ((CVP) (pY0 + offsetBitReverse), vX0_2PtDft_1);
615  __vstore_reverse_bit ((CVP) (pY1 + offsetBitReverse), vX0_2PtDft_2);
616  __vstore_reverse_bit ((CVP) (pY2 + offsetBitReverse), vX1_2PtDft_1);
617  __vstore_reverse_bit ((CVP) (pY3 + offsetBitReverse), vX1_2PtDft_2);
618  __vstore_reverse_bit ((CVP) (pY4 + offsetBitReverse), vX2_2PtDft_1);
619  __vstore_reverse_bit ((CVP) (pY5 + offsetBitReverse), vX2_2PtDft_2);
620  __vstore_reverse_bit ((CVP) (pY6 + offsetBitReverse), vX3_2PtDft_1);
621  __vstore_reverse_bit ((CVP) (pY7 + offsetBitReverse), vX3_2PtDft_2);
622  }
623  __SE0_CLOSE ();
624 #endif
625  }
626  }
627  return (status);
628 }
629 
630 #if (!defined(FFTLIB_REMOVE_CHECK_PARAMS) && \
631  !defined(FFTLIB_FFT1D_I32FC_C32FC_O32FC_REMOVE_CHECK_PARAMS)) || \
632  (defined(FFTLIB_CHECK_PARAMS)) || \
633  (defined(FFTLIB_FFT1D_I32FC_C32FC_O32FC_CHECK_PARAMS))
634 
637  FFTLIB_bufParams1D_t *bufParamsX,
638  FFTLIB_F32 *pW,
639  FFTLIB_bufParams1D_t *bufParamsW,
640  FFTLIB_F32 *pY,
641  FFTLIB_bufParams1D_t *bufParamsY,
642  void *pBlock)
643 {
644  FFTLIB_STATUS status = FFTLIB_SUCCESS;
645 
646  if ((pX == NULL) || (pW == NULL) || (pY == NULL)) {
647  status = FFTLIB_ERR_NULL_POINTER;
648  }
649  else if (bufParamsX->dim_x != bufParamsW->dim_x ||
650  bufParamsX->dim_x != bufParamsY->dim_x) {
652  }
653  else if (bufParamsX->dim_x < 64 * 2) { /* Minimum number of points is 64 */
655  }
656  else if ((bufParamsX->data_type != FFTLIB_FLOAT32) ||
657  (bufParamsW->data_type != FFTLIB_FLOAT32) ||
658  (bufParamsY->data_type != FFTLIB_FLOAT32)) {
659  status = FFTLIB_ERR_INVALID_TYPE;
660  }
661  else if (((uint64_t) pX) & 0xFu) { /* pX must be 16-byte aligned for a */
662  status = FFTLIB_ERR_NOT_ALIGNED_PTRS_STRIDES; /* streaming engine
663  configuration */
664  }
665  else {
666  /* Check if number of pts is a power of 2 */
667  uint32_t k = 0;
668  while (k < 32) {
669  if (bufParamsX->dim_x & (1u << k)) {
670  break;
671  }
672  k++;
673  }
674  if ((1u << k) != bufParamsX->dim_x) {
676  }
677  }
678  return (status);
679 }
680 
681 #endif
@ FFTLIB_FLOAT32
c7x::cfloat_vec CV
c7x::float_vec V
FFTLIB_STATUS_NAME
The enumeration of all status codes.
Definition: FFTLIB_types.h:172
@ FFTLIB_ERR_INVALID_TYPE
Definition: FFTLIB_types.h:176
@ FFTLIB_ERR_NULL_POINTER
Definition: FFTLIB_types.h:178
@ FFTLIB_ERR_INVALID_DIMENSION
Definition: FFTLIB_types.h:177
@ FFTLIB_SUCCESS
Definition: FFTLIB_types.h:173
@ FFTLIB_ERR_NOT_ALIGNED_PTRS_STRIDES
Definition: FFTLIB_types.h:181
float FFTLIB_F32
Single precision floating point.
Definition: FFTLIB_types.h:169
FFTLIB_STATUS FFTLIB_fft1d_i32fc_c32fc_o32fc_init(FFTLIB_F32 *pX, FFTLIB_bufParams1D_t *bufParamsX, FFTLIB_F32 *pW, FFTLIB_bufParams1D_t *bufParamsW, FFTLIB_F32 *pY, FFTLIB_bufParams1D_t *bufParamsY, void *pBlock)
This function should be called before the FFTLIB_fft1d_i32fc_c32fc_o32fc_kernel function is called....
FFTLIB_STATUS FFTLIB_fft1d_i32fc_c32fc_o32fc_checkParams(FFTLIB_F32 *pX, FFTLIB_bufParams1D_t *bufParamsX, FFTLIB_F32 *pW, FFTLIB_bufParams1D_t *bufParamsW, FFTLIB_F32 *pY, FFTLIB_bufParams1D_t *bufParamsY, void *pBlock)
This function checks the validity of the parameters passed to FFTLIB_fft1d_i32fc_c32fc_o32fc_init and...
FFTLIB_STATUS FFTLIB_fft1d_i32fc_c32fc_o32fc_kernel(FFTLIB_F32 *pX, FFTLIB_bufParams1D_t *bufParamsX, FFTLIB_F32 *pW, FFTLIB_bufParams1D_t *bufParamsW, FFTLIB_F32 *pY, FFTLIB_bufParams1D_t *bufParamsY, void *pBlock)
This function is the main kernel compute function.
FFTLIB_bufParams1D_t: a structure for a 1-dimensional buffer descriptor.
uint32_t data_type
Values are of type FFTLIB_data_type_e.
uint32_t dim_x
Width of buffer in X dimension in elements.