FFTLIB User Guide
c71/FFTLIB_ifft1d_i32fc_c32fc_o32fc_ci.cpp
Go to the documentation of this file.
1 /*******************************************************************************
2 **+--------------------------------------------------------------------------+**
3 **| **** |**
4 **| **** |**
5 **| ******o*** |**
6 **| ********_///_**** |**
7 **| ***** /_//_/ **** |**
8 **| ** ** (__/ **** |**
9 **| ********* |**
10 **| **** |**
11 **| *** |**
12 **| |**
13 **| Copyright (c) 2017 Texas Instruments Incorporated |**
14 **| ALL RIGHTS RESERVED |**
15 **| |**
16 **| Permission to use, copy, modify, or distribute this software, |**
17 **| whether in part or in whole, for any purpose is forbidden without |**
18 **| a signed licensing agreement and NDA from Texas Instruments |**
19 **| Incorporated (TI). |**
20 **| |**
21 **| TI makes no representation or warranties with respect to the |**
22 **| performance of this computer program, and specifically disclaims |**
23 **| any responsibility for any damages, special or consequential, |**
24 **| connected with the use of this program. |**
25 **| |**
26 **+--------------------------------------------------------------------------+**
27 *******************************************************************************/
28 
29 #include "../FFTLIB_ifft1d_i32fc_c32fc_o32fc.h"
30 
31 #define TRACE_ON (0)
32 
33 #if TRACE_ON
34 #include "../../../common/printv.h"
35 #include <stdio.h>
36 #endif
37 
38 // CODE_SECTION(FFTLIB_ifft1d, ".text:optimized")
39 // CODE_SECTION(FFTLIB_ifft1d_core, ".text:optimized")
40 // CODE_SECTION(FFTLIB_ifft1d_checkParams, ".text:optimized")
41 
/*
 * Byte offsets into the caller-supplied pBlock scratch area. The block is
 * laid out as a packed sequence of streaming-engine (SE) and streaming-
 * address-generator (SA) templates, pre-computed at init time and re-loaded
 * at kernel time. Each offset is the previous offset plus the size of the
 * template stored in the previous slot.
 *
 * NOTE(review): SE_CONJ_LOOP_PARAM_OFFSET is computed from
 * SA_LOOP3_PARAM_OFFSET + SE_PARAM_SIZE, but the slot at
 * SA_LOOP3_PARAM_OFFSET holds an SA template (see the SA_LOOP3 store in the
 * init path), so the increment should presumably be SA_PARAM_SIZE. This is
 * harmless only if SE_PARAM_SIZE == SA_PARAM_SIZE — confirm against the
 * definitions of SE_PARAM_SIZE/SA_PARAM_SIZE and the pBlock size budget.
 */
 42 #define SE_PARAM_BASE (0x0000)
 43 #define SE_LOOP1_PARAM_OFFSET (SE_PARAM_BASE)
 44 #define SE_LOOP2_PARAM_OFFSET (SE_LOOP1_PARAM_OFFSET + SE_PARAM_SIZE)
 45 #define SE_LOOP3_PARAM_OFFSET (SE_LOOP2_PARAM_OFFSET + SE_PARAM_SIZE)
 46 #define SE_LOOP4_PARAM_OFFSET (SE_LOOP3_PARAM_OFFSET + SE_PARAM_SIZE)
 47 #define SE_LOOP5_PARAM_OFFSET (SE_LOOP4_PARAM_OFFSET + SE_PARAM_SIZE)
 48 #define SE_TWID_PARAM_OFFSET (SE_LOOP5_PARAM_OFFSET + SE_PARAM_SIZE)
 49 #define SA_LOOP1_PARAM_OFFSET (SE_TWID_PARAM_OFFSET + SE_PARAM_SIZE)
 50 #define SA_LOOP2_PARAM_OFFSET (SA_LOOP1_PARAM_OFFSET + SA_PARAM_SIZE)
 51 #define SA_LOOP3_PARAM_OFFSET (SA_LOOP2_PARAM_OFFSET + SA_PARAM_SIZE)
 52 #define SE_CONJ_LOOP_PARAM_OFFSET (SA_LOOP3_PARAM_OFFSET + SE_PARAM_SIZE)
 53 #define SA_CONJ_LOOP_PARAM_OFFSET (SE_CONJ_LOOP_PARAM_OFFSET + SE_PARAM_SIZE)
54 
/* Short aliases for the c7x native vector types used throughout this file:
 * CV/CVP — full-width complex-float vector and pointer-to-vector,
 * V/VP   — full-width real-float vector and pointer-to-vector. */
 55 typedef typename c7x::cfloat_vec CV;
 56 typedef CV * CVP;
 57 
 58 typedef typename c7x::float_vec V;
 59 typedef V * VP;
60 
/*
 * ifft_i32fc_o32fc_conjugate_init_ci
 *
 * Builds the 1-D streaming-engine (SE) and streaming-address-generator (SA)
 * templates used by the conjugate pass of the 32-bit floating-point complex
 * IFFT and stores them into the parameter block, where the exec-time code
 * later reloads them.
 *
 * pX     - input buffer pointer; not referenced by the current body (only a
 *          commented-out local cast remains) — presumably kept for interface
 *          symmetry with the other *_init_ci helpers; confirm before removal.
 * size   - element count programmed into ICNT0 of both templates (number of
 *          complex points streamed/stored in the conjugate loop).
 * pBlock - scratch/parameter block; templates are written at
 *          SE_CONJ_LOOP_PARAM_OFFSET and SA_CONJ_LOOP_PARAM_OFFSET.
 */
61 void ifft_i32fc_o32fc_conjugate_init_ci (void *pX, uint32_t size, void *pBlock)
 62 {
 63  __SE_TEMPLATE_v1 se0_param = __gen_SE_TEMPLATE_v1 ();
 64  __SA_TEMPLATE_v1 sa0_param = __gen_SA_TEMPLATE_v1 ();
 65  /* cfloat *restrict pXLocal = (cfloat *) pX; */
 66 
 67  /* SE fetch template: 32-bit complex elements (swap variant), full
 68  vector width, single linear dimension of `size` elements. */
 67  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
 68  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
 69  se0_param.DIMFMT = __SE_DIMFMT_1D;
 70  se0_param.ICNT0 = size;
 71 
 72  /* Persist the SE template into the conjugate-loop slot of pBlock. */
 72  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_CONJ_LOOP_PARAM_OFFSET)) =
 73  se0_param;
 74 
 75  /* SA store template mirrors the fetch: 1-D, `size` elements, full
 76  vector width. */
 75  sa0_param.ICNT0 = size;
 76  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
 77  sa0_param.DIMFMT = __SA_DIMFMT_1D;
 78 
 79  /* Persist the SA template into the conjugate-loop slot of pBlock. */
 79  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_CONJ_LOOP_PARAM_OFFSET)) =
 80  sa0_param;
 81 }
82 
85  FFTLIB_bufParams1D_t *bufParamsX,
86  FFTLIB_F32 * pW,
87  FFTLIB_bufParams1D_t *bufParamsW,
88  FFTLIB_F32 * pY,
89  FFTLIB_bufParams1D_t *bufParamsY,
90  void * pBlock)
91 {
93 
94 #if defined(FFTLIB_CHECK_PARAMS) || \
95  defined(FFTLIB_IFFT1D_I32FC_C32FC_O32FC_CHECK_PARAMS)
97  pX, bufParamsX, pW, bufParamsW, pY, bufParamsY, pBlock);
98  if (status == FFTLIB_SUCCESS)
99 #endif
100  {
101  uint32_t numPoints;
102  uint32_t numPointsPerDft;
103  uint32_t seCnt1, seCnt2, seCnt3, seCnt4;
104  __SE_TEMPLATE_v1 se0_param = __gen_SE_TEMPLATE_v1 ();
105  __SE_TEMPLATE_v1 se1_param = __gen_SE_TEMPLATE_v1 ();
106  __SA_TEMPLATE_v1 sa0_param = __gen_SA_TEMPLATE_v1 ();
107 
108  numPoints = bufParamsX->dim_x >> 1;
109  numPointsPerDft = numPoints;
110  seCnt1 = numPoints >> 2;
111  seCnt2 = numPoints >> 5;
112  seCnt3 = 1;
113  seCnt4 = numPoints >> 3;
114 
115  ifft_i32fc_o32fc_conjugate_init_ci (pX, numPoints, pBlock);
116 
117  /* se0_param = (0); */
118  se0_param.ICNT0 = 8; /* 8-point vectors processed in one shot */
119  se0_param.ICNT1 = 4;
120  se0_param.DIM1 = seCnt1; /* 4 quarters(Offsets: 0, N/4, N/2, 3N/4) */
121  se0_param.ICNT2 = seCnt2; /* Number of 8-point fetches within each */
122  se0_param.DIM2 = 8; /* quarter */
123  se0_param.ICNT3 = seCnt3; /* Number of DFT's */
124  se0_param.DIM3 = numPointsPerDft;
125 
126  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
127  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
128  se0_param.DIMFMT = __SE_DIMFMT_4D;
129  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP1_PARAM_OFFSET)) =
130  se0_param;
131 
132  /* se1_param = (0); */
133  se1_param.ICNT0 = 8; /* 8-point vectors processed in one shot */
134  se1_param.ICNT1 = 3;
135  se1_param.DIM1 = seCnt1; /* Twiddle factors for x1, x2 and x3 */
136  se1_param.ICNT2 = seCnt2; /* Number of 8-point fetches within each */
137  se1_param.DIM2 = 8; /* quarter */
138  se1_param.ICNT3 = seCnt3; /* Number of DFT's */
139  se1_param.DIM3 = 0;
140 
141  se1_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
142  se1_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
143  se1_param.DIMFMT = __SE_DIMFMT_4D;
144  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_TWID_PARAM_OFFSET)) =
145  se1_param;
146 
147  /* sa0_param = (0); */
148  sa0_param.ICNT0 = 8;
149  sa0_param.ICNT1 = 4;
150  sa0_param.DIM1 = seCnt1; /* Save to each of the 4 quarters */
151  sa0_param.ICNT2 = seCnt2; /* Number of 8-point stores within each */
152  sa0_param.DIM2 = 8; /* quarter */
153  sa0_param.ICNT3 = seCnt3;
154  sa0_param.DIM3 = numPointsPerDft; /* Number of DFT's */
155 
156  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
157  sa0_param.DIMFMT = __SA_DIMFMT_4D;
158  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP1_PARAM_OFFSET)) =
159  sa0_param;
160 
161  /* se0_param = (0); */
162  se0_param.ICNT0 = 8; /* Fetch first two quarters */
163  se0_param.ICNT1 = 2;
164  se0_param.DIM1 = 16; /* Process two 16-point DFTs in one shot */
165  se0_param.ICNT2 = seCnt2;
166  se0_param.DIM2 = 32; /* Half the number of DFT's */
167 
168  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
169  se0_param.TRANSPOSE =
170  __SE_TRANSPOSE_256BIT; /* Using 256BIT transpose required */
171  /* 16-byte alignment on pX */
172  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
173  se0_param.DIMFMT = __SE_DIMFMT_3D;
174  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP2_PARAM_OFFSET)) =
175  se0_param;
176 
177  /* sa0_param = (0); */
178  sa0_param.ICNT0 = numPoints;
179 
180  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
181  sa0_param.DIMFMT = __SA_DIMFMT_1D;
182  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP2_PARAM_OFFSET)) =
183  sa0_param;
184 
185  /* se0_param = (0); */
186  se0_param = __gen_SE_TEMPLATE_v1 ();
187  se0_param.ICNT0 = numPoints;
188 
189  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
190  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
191  se0_param.DIMFMT = __SE_DIMFMT_1D;
192  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP3_PARAM_OFFSET)) =
193  se0_param;
194 
195  /* sa0_param = (0); */
196  sa0_param.ICNT0 = numPoints;
197 
198  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
199  sa0_param.DIMFMT = __SA_DIMFMT_1D;
200  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP3_PARAM_OFFSET)) =
201  sa0_param;
202 
203  /* se0_param = (0); */
204  se0_param.ICNT0 = seCnt4; /* Fetch consecutive four points for DFT */
205  se0_param.ICNT1 = 8;
206  se0_param.DIM1 = seCnt4;
207  /* Fetch 8 points separated by */ /* (numPoints >>
208  3). This fetch
209  pattern */
210  /* can be used for bit reversal */
211 
212  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
213  se0_param.TRANSPOSE = __SE_TRANSPOSE_64BIT;
214  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
215  se0_param.DIMFMT = __SE_DIMFMT_2D;
216  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP4_PARAM_OFFSET)) =
217  se0_param;
218 
219  /* se0_param = (0); */
220  se0_param.ICNT0 = seCnt4;
221  se0_param.ICNT1 = 8;
222  se0_param.DIM1 = seCnt4;
223  /* Fetch 8 points separated by */ /* (numPoints >>
224  3). This fetch
225  pattern */
226  /* can be used for bit reversal */
227 
228  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
229  se0_param.TRANSPOSE = __SE_TRANSPOSE_64BIT;
230  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
231  se0_param.DIMFMT = __SE_DIMFMT_2D;
232  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP5_PARAM_OFFSET)) =
233  se0_param;
234  }
235  return (status);
236 }
237 
238 static inline c7x::cfloat_vec
240  c7x::float_vec scaleVec,
241  c7x::ulong_vec xorVec)
242 {
243  return (c7x::as_cfloat_vec (
244  scaleVec * c7x::as_float_vec (c7x::as_ulong_vec (in) ^ xorVec)));
245 }
246 
248  c7x::ulong_vec xorVec,
249  uint32_t size,
250  void * pBlock)
251 {
252 
253  __SE_TEMPLATE_v1 se0_param = __gen_SE_TEMPLATE_v1 ();
254  __SA_TEMPLATE_v1 sa0_param = __gen_SA_TEMPLATE_v1 ();
255  cfloat *restrict pXLocal = (cfloat *) pX;
256 
257  se0_param =
258  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_CONJ_LOOP_PARAM_OFFSET));
259 
260  sa0_param =
261  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_CONJ_LOOP_PARAM_OFFSET));
262 
263  __SE0_OPEN (pX, se0_param);
264  __SA0_OPEN (sa0_param);
265 
266  uint32_t i = 0;
267  uint32_t loopCount = (size) / c7x::element_count_of<c7x::cfloat_vec>::value;
268  c7x::cfloat_vec regIn, regStore;
269  float scale = 1.0f;
270  c7x::float_vec scaleVec = __vload_dup (&scale);
271 
272  __vpred tmp;
273  CV * addr;
274 
275  for (i = 0; i < loopCount; i++) {
276  regIn = c7x::strm_eng<0, c7x::cfloat_vec>::get_adv ();
277  /* FFTLIB_debugPrintFloatVector (c7x::as_float_vec (regIn)); */
278 
279  regStore = ifft_i32fc_o32fc_scaleAndConjugate (regIn, scaleVec, xorVec);
280  /* FFTLIB_debugPrintFloatVector (c7x::as_float_vec (regStore)); */
281 
282  tmp = c7x::strm_agen<0, CV>::get_vpred ();
283  addr = c7x::strm_agen<0, CV>::get_adv (&pXLocal[0]);
284  __vstore_pred (tmp, addr, regStore);
285  }
286 
287  __SE0_CLOSE ();
288  __SA0_CLOSE ();
289 }
290 
293  FFTLIB_bufParams1D_t *bufParamsX,
294  FFTLIB_F32 * pW,
295  FFTLIB_bufParams1D_t *bufParamsW,
296  FFTLIB_F32 * pY,
297  FFTLIB_bufParams1D_t *bufParamsY,
298  void * pBlock)
299 {
300  uint32_t k;
301  FFTLIB_STATUS status = FFTLIB_SUCCESS;
302  uint32_t numPoints;
303  uint32_t numPointsPerDft;
304  uint32_t numLeadingZeros;
305  uint32_t offsetBitReverse;
306  uint32_t seCnt1, seCnt2, seCnt3;
307  __SE_TEMPLATE_v1 se0_param = __gen_SE_TEMPLATE_v1 ();
308  __SE_TEMPLATE_v1 se1_param = __gen_SE_TEMPLATE_v1 ();
309  __SA_TEMPLATE_v1 sa0_param = __gen_SA_TEMPLATE_v1 ();
310 
311  cfloat *restrict pXLocal;
312  cfloat *restrict pYLocal;
313  cfloat *restrict pWLocal;
314  cfloat *restrict pY0;
315  cfloat *restrict pY1;
316  cfloat *restrict pY2;
317  cfloat *restrict pY3;
318  cfloat *restrict pY4;
319  cfloat *restrict pY5;
320  cfloat *restrict pY6;
321  cfloat *restrict pY7;
322 
323  CV vX_0, vX_N_4, vX_N_2, vX_3N_4;
324  CV vSum1, vSum2, vDiff1, vDiff2;
325  CV vTwX1, vTwX2, vTwX3;
326  CV vX0Temp, vX1Temp, vX2Temp, vX3Temp;
327  CV vX0, vX1, vX2, vX3;
328  CV vX_0_1, vX_N_4_1, vX_N_2_1, vX_3N_4_1;
329  CV vSum1_1, vSum2_1, vDiff1_1, vDiff2_1;
330  CV vX0_1, vX1_1, vX2_1, vX3_1;
331  CV vX0_2PtDft_1, vX0_2PtDft_2;
332  CV vX1_2PtDft_1, vX1_2PtDft_2;
333  CV vX2_2PtDft_1, vX2_2PtDft_2;
334  CV vX3_2PtDft_1, vX3_2PtDft_2;
335  cfloat twTemp;
336 
337 #ifdef FFTLIB_CHECK_PARAMS
339  pX, bufParamsX, pW, bufParamsW, pY, bufParamsY, pBlock);
340  if (status == FFTLIB_SUCCESS)
341 #endif
342  {
343  numPoints = bufParamsX->dim_x >> 1;
344  numPointsPerDft = numPoints;
345 
346  float scale = 1.0 / (numPoints);
347  c7x::float_vec scaleVec = __vload_dup (&scale);
348 
349 #if defined(_HOST_BUILD)
350  c7x::ulong_vec xorVec = (c7x::ulong_vec) (0x0000000080000000);
351 
352 #else
353  c7x::ulong_vec xorVec = (0x0000000080000000);
354 #endif
355 
356  ifft_i32fc_o32fc_conjugate_exec_ci ((void *) pX, xorVec, numPoints,
357  pBlock);
358 
359  se0_param =
360  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP1_PARAM_OFFSET));
361  se1_param =
362  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_TWID_PARAM_OFFSET));
363  sa0_param =
364  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP1_PARAM_OFFSET));
365  seCnt1 = numPointsPerDft >> 2;
366  seCnt2 = numPointsPerDft >> 5;
367  seCnt3 = 1;
368 
369  pXLocal = (cfloat *) pX;
370  pWLocal = (cfloat *) pW;
371  pYLocal = (cfloat *) pY;
372 
373  while (numPointsPerDft >= 64) {
374  /* TODO OPT: Calculate params upfront in init function,
375  * rather than generating SE params on the fly here */
376  se0_param.ICNT1 = 4;
377  se0_param.DIM1 = seCnt1; /* 4 quarters(Offsets: 0, N/4, N/2, 3N/4) */
378  se0_param.ICNT2 = seCnt2;
379  se0_param.DIM2 = 8;
380  /* Number of 8-point fetches within */ /* each quarter */
381  se0_param.ICNT3 = seCnt3;
382  se0_param.DIM3 = numPointsPerDft; /* Number of DFT's */
383  __SE0_OPEN ((void *) pXLocal, se0_param);
384 
385  se1_param.ICNT1 = 3;
386  se1_param.DIM1 = seCnt1; /* Twiddle factors for x1, x2 and x3 */
387  se1_param.ICNT2 = seCnt2; /* Number of 8-point fetches within each */
388  se1_param.DIM2 = 8; /* quarter */
389  se1_param.ICNT3 = seCnt3; /* Number of DFT's */
390  se1_param.DIM3 = 0;
391  __SE1_OPEN ((void *) pWLocal, se1_param);
392 
393  sa0_param.ICNT1 = 4;
394  sa0_param.DIM1 = /* Save to each of the 4 quarters */ seCnt1;
395  sa0_param.ICNT2 = seCnt2;
396  sa0_param.DIM2 = 8;
397  /* Number of 8-point stores within */ /* each quarter */
398  sa0_param.ICNT3 = seCnt3;
399  sa0_param.DIM3 = numPointsPerDft;
400  /* Number of DFT's */
401  __SA0_OPEN (sa0_param);
402 
403  /* Loop is unrolled twice for better optimization */
404  for (k = 0; k < numPoints; k += 64) {
405 
406  /* First iteration of loop unroll */
407  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
408  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
409  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
410  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
411 
412  vSum1 = vX_0 + vX_N_2;
413  vSum2 = vX_N_4 + vX_3N_4;
414  vDiff1 = vX_0 - vX_N_2;
415  vDiff2 = vX_N_4 - vX_3N_4;
416 
417  vTwX1 = c7x::strm_eng<1, CV>::get_adv ();
418  vTwX2 = c7x::strm_eng<1, CV>::get_adv ();
419  vTwX3 = c7x::strm_eng<1, CV>::get_adv ();
420 
421  vX0Temp = vSum1 + vSum2;
422  vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
423  vX2Temp = vSum1 - vSum2;
424  vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);
425 
426  vX0 = vX0Temp;
427  vX1 = __complex_multiply (vX1Temp, vTwX1);
428  vX2 = __complex_multiply (vX2Temp, vTwX2);
429  vX3 = __complex_multiply (vX3Temp, vTwX3);
430 
431  /* __SA0ADV(CV, pXLocal) = vX0; */
432  /* __SA0ADV(CV, pXLocal) = vX2; */
433  /* __SA0ADV(CV, pXLocal) = vX1; */
434  /* __SA0ADV(CV, pXLocal) = vX3; */
435 
436  __vpred tmp;
437  CVP addr;
438  tmp = c7x::strm_agen<0, CV>::get_vpred ();
439  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
440  __vstore_pred (tmp, addr, vX0);
441 
442  tmp = c7x::strm_agen<0, CV>::get_vpred ();
443  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
444  __vstore_pred (tmp, addr, vX2);
445 
446  tmp = c7x::strm_agen<0, CV>::get_vpred ();
447  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
448  __vstore_pred (tmp, addr, vX1);
449 
450  tmp = c7x::strm_agen<0, CV>::get_vpred ();
451  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
452  __vstore_pred (tmp, addr, vX3);
453 
454  /* Second iteration of loop unroll */
455  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
456  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
457  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
458  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
459 
460  vSum1 = vX_0 + vX_N_2;
461  vSum2 = vX_N_4 + vX_3N_4;
462  vDiff1 = vX_0 - vX_N_2;
463  vDiff2 = vX_N_4 - vX_3N_4;
464 
465  vTwX1 = c7x::strm_eng<1, CV>::get_adv ();
466  vTwX2 = c7x::strm_eng<1, CV>::get_adv ();
467  vTwX3 = c7x::strm_eng<1, CV>::get_adv ();
468 
469  vX0Temp = vSum1 + vSum2;
470  vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
471  vX2Temp = vSum1 - vSum2;
472  vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);
473 
474  vX0 = vX0Temp;
475  vX1 = __complex_multiply (vX1Temp, vTwX1);
476  vX2 = __complex_multiply (vX2Temp, vTwX2);
477  vX3 = __complex_multiply (vX3Temp, vTwX3);
478 
479  /* __SA0ADV(CV, pXLocal) = vX0; */
480  /* __SA0ADV(CV, pXLocal) = vX2; */
481  /* __SA0ADV(CV, pXLocal) = vX1; */
482  /* __SA0ADV(CV, pXLocal) = vX3; */
483 
484  tmp = c7x::strm_agen<0, CV>::get_vpred ();
485  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
486  __vstore_pred (tmp, addr, vX0);
487 
488  tmp = c7x::strm_agen<0, CV>::get_vpred ();
489  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
490  __vstore_pred (tmp, addr, vX2);
491 
492  tmp = c7x::strm_agen<0, CV>::get_vpred ();
493  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
494  __vstore_pred (tmp, addr, vX1);
495 
496  tmp = c7x::strm_agen<0, CV>::get_vpred ();
497  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
498  __vstore_pred (tmp, addr, vX3);
499  }
500  __SA0_CLOSE ();
501  __SE0_CLOSE ();
502  __SE1_CLOSE ();
503 
504  numPointsPerDft >>= 2;
505  pWLocal += numPointsPerDft * 3;
506  seCnt1 >>= 2;
507  seCnt2 >>= 2;
508  seCnt3 <<= 2;
509  }
510 
511  if (numPointsPerDft == 16) {
512  /* TODO OPT: Use one SE and see compiler behavior.
513  * There may be L2 bank conflicts using
514  * two SE's separated by 64 bytes */
515  /* 16-point stage */
516  se0_param = *(
517  (__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP2_PARAM_OFFSET));
518  __SE0_OPEN ((void *) pXLocal, se0_param);
519  __SE1_OPEN ((void *) (pXLocal + 8), se0_param);
520 
521  sa0_param = *(
522  (__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP2_PARAM_OFFSET));
523  __SA0_OPEN (sa0_param);
524 
525 #if __C7X_HOSTEM__
526  vTwX1 = *((CVP) pWLocal);
527  vTwX1 = CV (vTwX1.lo (), vTwX1.lo ());
528  vTwX2 = *((CVP) (pWLocal + 4));
529  vTwX2 = CV (vTwX2.lo (), vTwX2.lo ());
530  vTwX3 = *((CVP) (pWLocal + 8));
531  vTwX3 = CV (vTwX3.lo (), vTwX3.lo ());
532 #else
533  vTwX1 = *((CVP) pWLocal);
534  vTwX1 = (CV) (vTwX1.lo (), vTwX1.lo ());
535  vTwX2 = *((CVP) (pWLocal + 4));
536  vTwX2 = (CV) (vTwX2.lo (), vTwX2.lo ());
537  vTwX3 = *((CVP) (pWLocal + 8));
538  vTwX3 = (CV) (vTwX3.lo (), vTwX3.lo ());
539 
540 #endif
541  for (k = 0; k < numPoints; k += 32) {
542  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
543  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
544  vX_N_2 = c7x::strm_eng<1, CV>::get_adv ();
545  vX_3N_4 = c7x::strm_eng<1, CV>::get_adv ();
546 
547  vSum1 = vX_0 + vX_N_2;
548  vSum2 = vX_N_4 + vX_3N_4;
549  vDiff1 = vX_0 - vX_N_2;
550  vDiff2 = vX_N_4 - vX_3N_4;
551 
552  vX0Temp = vSum1 + vSum2;
553  vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
554  vX2Temp = vSum1 - vSum2;
555  vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);
556 
557  vX0 = vX0Temp;
558  vX1 = __complex_multiply (vX1Temp, vTwX1);
559  vX2 = __complex_multiply (vX2Temp, vTwX2);
560  vX3 = __complex_multiply (vX3Temp, vTwX3);
561 
562  /* __SA0ADV(CV, pXLocal) = (CV)(vX0.lo(),
563  * vX2.lo()); */
564  /* __SA0ADV(CV, pXLocal) = (CV)(vX1.lo(),
565  * vX3.lo()); */
566  /* __SA0ADV(CV, pXLocal) = (CV)(vX0.hi(),
567  * vX2.hi()); */
568  /* __SA0ADV(CV, pXLocal) = (CV)(vX1.hi(),
569  * vX3.hi()); */
570 
571 #if __C7X_HOSTEM__
572  __vpred tmp;
573  CVP addr;
574  tmp = c7x::strm_agen<0, CV>::get_vpred ();
575  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
576  __vstore_pred (tmp, addr, CV (vX0.lo (), vX2.lo ()));
577  tmp = c7x::strm_agen<0, CV>::get_vpred ();
578  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
579  __vstore_pred (tmp, addr, CV (vX1.lo (), vX3.lo ()));
580  tmp = c7x::strm_agen<0, CV>::get_vpred ();
581  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
582  __vstore_pred (tmp, addr, CV (vX0.hi (), vX2.hi ()));
583  tmp = c7x::strm_agen<0, CV>::get_vpred ();
584  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
585  __vstore_pred (tmp, addr, CV (vX1.hi (), vX3.hi ()));
586 #else
587  __vpred tmp;
588  CVP addr;
589  tmp = c7x::strm_agen<0, CV>::get_vpred ();
590  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
591  __vstore_pred (tmp, addr, (CV) (vX0.lo (), vX2.lo ()));
592  tmp = c7x::strm_agen<0, CV>::get_vpred ();
593  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
594  __vstore_pred (tmp, addr, (CV) (vX1.lo (), vX3.lo ()));
595  tmp = c7x::strm_agen<0, CV>::get_vpred ();
596  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
597  __vstore_pred (tmp, addr, (CV) (vX0.hi (), vX2.hi ()));
598  tmp = c7x::strm_agen<0, CV>::get_vpred ();
599  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
600  __vstore_pred (tmp, addr, (CV) (vX1.hi (), vX3.hi ()));
601 #endif
602  }
603  __SA0_CLOSE ();
604  __SE0_CLOSE ();
605  __SE1_CLOSE ();
606  }
607  else {
608  /* 32-point stage */
609  se0_param = *(
610  (__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP3_PARAM_OFFSET));
611  __SE0_OPEN ((void *) pXLocal, se0_param);
612 
613  sa0_param = *(
614  (__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP3_PARAM_OFFSET));
615  __SA0_OPEN (sa0_param);
616 
617  vTwX1 = *((CVP) pWLocal);
618  vTwX2 = *((CVP) (pWLocal + 8));
619  vTwX3 = *((CVP) (pWLocal + 16));
620 
621  for (k = 0; k < numPoints; k += 64) {
622  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
623  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
624  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
625  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
626 
627  vSum1 = vX_0 + vX_N_2;
628  vSum2 = vX_N_4 + vX_3N_4;
629  vDiff1 = vX_0 - vX_N_2;
630  vDiff2 = vX_N_4 - vX_3N_4;
631 
632  vX0Temp = vSum1 + vSum2;
633  vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
634  vX2Temp = vSum1 - vSum2;
635  vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);
636 
637  vX0 = vX0Temp;
638  vX1 = __complex_multiply (vX1Temp, vTwX1);
639  vX2 = __complex_multiply (vX2Temp, vTwX2);
640  vX3 = __complex_multiply (vX3Temp, vTwX3);
641 
642  /* __SA0ADV(CV, pXLocal) = vX0; */
643  /* __SA0ADV(CV, pXLocal) = vX2; */
644  /* __SA0ADV(CV, pXLocal) = vX1; */
645  /* __SA0ADV(CV, pXLocal) = vX3; */
646 
647  __vpred tmp;
648  CVP addr;
649  tmp = c7x::strm_agen<0, CV>::get_vpred ();
650  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
651  __vstore_pred (tmp, addr, vX0);
652 
653  tmp = c7x::strm_agen<0, CV>::get_vpred ();
654  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
655  __vstore_pred (tmp, addr, vX2);
656 
657  tmp = c7x::strm_agen<0, CV>::get_vpred ();
658  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
659  __vstore_pred (tmp, addr, vX1);
660 
661  tmp = c7x::strm_agen<0, CV>::get_vpred ();
662  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
663  __vstore_pred (tmp, addr, vX3);
664 
665  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
666  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
667  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
668  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
669 
670  vSum1 = vX_0 + vX_N_2;
671  vSum2 = vX_N_4 + vX_3N_4;
672  vDiff1 = vX_0 - vX_N_2;
673  vDiff2 = vX_N_4 - vX_3N_4;
674 
675  vX0Temp = vSum1 + vSum2;
676  vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
677  vX2Temp = vSum1 - vSum2;
678  vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);
679 
680  vX0 = vX0Temp;
681  vX1 = __complex_multiply (vX1Temp, vTwX1);
682  vX2 = __complex_multiply (vX2Temp, vTwX2);
683  vX3 = __complex_multiply (vX3Temp, vTwX3);
684 
685  /* __SA0ADV(CV, pXLocal) = vX0; */
686  /* __SA0ADV(CV, pXLocal) = vX2; */
687  /* __SA0ADV(CV, pXLocal) = vX1; */
688  /* __SA0ADV(CV, pXLocal) = vX3; */
689 
690  tmp = c7x::strm_agen<0, CV>::get_vpred ();
691  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
692  __vstore_pred (tmp, addr, vX0);
693 
694  tmp = c7x::strm_agen<0, CV>::get_vpred ();
695  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
696  __vstore_pred (tmp, addr, vX2);
697 
698  tmp = c7x::strm_agen<0, CV>::get_vpred ();
699  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
700  __vstore_pred (tmp, addr, vX1);
701 
702  tmp = c7x::strm_agen<0, CV>::get_vpred ();
703  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
704  __vstore_pred (tmp, addr, vX3);
705  }
706  __SE0_CLOSE ();
707  __SA0_CLOSE ();
708  }
709 
710  numPointsPerDft >>= 2;
711  pWLocal += numPointsPerDft * 3;
712 
713  if (numPointsPerDft == 4) {
714  /* 4-point stage with bit-reversal */
715  se0_param = *(
716  (__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP4_PARAM_OFFSET));
717  __SE0_OPEN ((void *) pXLocal, se0_param);
718 
719  numLeadingZeros = __norm ((int32_t) (numPoints - 1)) + 1;
720 
721  /* pY0 = &pYLocal[0x00000000u]; */
722  /* pY1 = &pYLocal[0x40000000u >> numLeadingZeros]; */
723  /* pY2 = &pYLocal[0x80000000u >> numLeadingZeros]; */
724  /* pY3 = &pYLocal[0xC0000000u >> numLeadingZeros]; */
725 
726  pY0 = (cfloat *) (pY + 0);
727  pY1 = (cfloat *) (pY + ((0x40000000u >> numLeadingZeros) << 1));
728  pY2 = (cfloat *) (pY + ((0x80000000u >> numLeadingZeros) << 1));
729  pY3 = (cfloat *) (pY + ((0xC0000000u >> numLeadingZeros) << 1));
730 
731 #ifdef LAST_LOOP_UNROLL
732  /* pY4 = &pYLocal[0x20000000u >> numLeadingZeros]; */
733  /* pY5 = &pYLocal[0x60000000u >> numLeadingZeros]; */
734  /* pY6 = &pYLocal[0xA0000000u >> numLeadingZeros]; */
735  /* pY7 = &pYLocal[0xE0000000u >> numLeadingZeros]; */
736 
737  pY4 = (cfloat *) (pY + ((0x20000000u >> numLeadingZeros) << 1));
738  pY5 = (cfloat *) (pY + ((0x60000000u >> numLeadingZeros) << 1));
739  pY6 = (cfloat *) (pY + ((0xA0000000u >> numLeadingZeros) << 1));
740  pY7 = (cfloat *) (pY + ((0xE0000000u >> numLeadingZeros) << 1));
741 #endif
742 
743 #ifdef LAST_LOOP_UNROLL
744  for (k = 0; k<numPoints>> 3; k += 8)
745 #else
746  for (k = 0; k<numPoints>> 3; k += 4)
747 #endif
748  {
749  offsetBitReverse = __bit_reverse (k) >> numLeadingZeros;
750 
751  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
752  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
753  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
754  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
755 
756  vSum1 = vX_0 + vX_N_2;
757  vSum2 = vX_N_4 + vX_3N_4;
758  vDiff1 = vX_0 - vX_N_2;
759  vDiff2 = vX_N_4 - vX_3N_4;
760 
761  vX0 = vSum1 + vSum2;
762  vX1 = vDiff1 - __vcrot90sp_vv (vDiff2);
763  vX2 = vSum1 - vSum2;
764  vX3 = vDiff1 + __vcrot90sp_vv (vDiff2);
765 
766  vX0 = ifft_i32fc_o32fc_scaleAndConjugate (vX0, scaleVec, xorVec);
767  vX1 = ifft_i32fc_o32fc_scaleAndConjugate (vX1, scaleVec, xorVec);
768  vX2 = ifft_i32fc_o32fc_scaleAndConjugate (vX2, scaleVec, xorVec);
769  vX3 = ifft_i32fc_o32fc_scaleAndConjugate (vX3, scaleVec, xorVec);
770 
771  __vstore_reverse_bit ((CVP) (pY0 + offsetBitReverse), vX0);
772  __vstore_reverse_bit ((CVP) (pY1 + offsetBitReverse), vX1);
773  __vstore_reverse_bit ((CVP) (pY2 + offsetBitReverse), vX2);
774  __vstore_reverse_bit ((CVP) (pY3 + offsetBitReverse), vX3);
775 
776 #ifdef LAST_LOOP_UNROLL
777  vX_0_1 = c7x::strm_eng<0, CV>::get_adv ();
778  vX_N_4_1 = c7x::strm_eng<0, CV>::get_adv ();
779  vX_N_2_1 = c7x::strm_eng<0, CV>::get_adv ();
780  vX_3N_4_1 = c7x::strm_eng<0, CV>::get_adv ();
781 
782  vSum1_1 = vX_0_1 + vX_N_2_1;
783  vSum2_1 = vX_N_4_1 + vX_3N_4_1;
784  vDiff1_1 = vX_0_1 - vX_N_2_1;
785  vDiff2_1 = vX_N_4_1 - vX_3N_4_1;
786 
787  vX0_1 = vSum1_1 + vSum2_1;
788  vX1_1 = vDiff1_1 - __vcrot90sp_vv (vDiff2_1);
789  vX2_1 = vSum1_1 - vSum2_1;
790  vX3_1 = vDiff1_1 + __vcrot90sp_vv (vDiff2_1);
791 
792  vX0_1 =
793  ifft_i32fc_o32fc_scaleAndConjugate (vX0_1, scaleVec, xorVec);
794  vX1_1 =
795  ifft_i32fc_o32fc_scaleAndConjugate (vX1_1, scaleVec, xorVec);
796  vX2_1 =
797  ifft_i32fc_o32fc_scaleAndConjugate (vX2_1, scaleVec, xorVec);
798  vX3_1 =
799  ifft_i32fc_o32fc_scaleAndConjugate (vX3_1, scaleVec, xorVec);
800 
801  __vstore_reverse_bit ((CVP) (pY4 + offsetBitReverse), vX0_1);
802  __vstore_reverse_bit ((CVP) (pY5 + offsetBitReverse), vX1_1);
803  __vstore_reverse_bit ((CVP) (pY6 + offsetBitReverse), vX2_1);
804  __vstore_reverse_bit ((CVP) (pY7 + offsetBitReverse), vX3_1);
805 #endif
806  }
807  __SE0_CLOSE ();
808  }
809  else {
810  /* 4-point stage followed by 2-point stage with bit-reversal */
811  se0_param = *(
812  (__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP5_PARAM_OFFSET));
813  __SE0_OPEN ((void *) pXLocal, se0_param);
814 
815  numLeadingZeros = __norm ((int32_t) (numPoints - 1)) + 1;
816 #if __C7X_HOSTEM__
817  pWLocal += 1;
818  twTemp = *pWLocal;
819  vTwX1 = CV (twTemp, twTemp, twTemp, twTemp, twTemp, twTemp, twTemp,
820  twTemp);
821 #else
822  pWLocal += 1;
823  twTemp = *pWLocal;
824  vTwX1 = (CV) (twTemp, twTemp, twTemp, twTemp, twTemp, twTemp, twTemp,
825  twTemp);
826 #endif
827 
828 #if __C7X_HOSTEM__
829  pWLocal += 2;
830  twTemp = *pWLocal;
831  vTwX2 = CV (twTemp, twTemp, twTemp, twTemp, twTemp, twTemp, twTemp,
832  twTemp);
833 #else
834  pWLocal += 2;
835  twTemp = *pWLocal;
836  vTwX2 = (CV) (twTemp, twTemp, twTemp, twTemp, twTemp, twTemp, twTemp,
837  twTemp);
838 #endif
839 #if __C7X_HOSTEM__
840  pWLocal += 2;
841  twTemp = *pWLocal;
842  vTwX3 = CV (twTemp, twTemp, twTemp, twTemp, twTemp, twTemp, twTemp,
843  twTemp);
844 #else
845  pWLocal += 2;
846  twTemp = *pWLocal;
847  vTwX3 = (CV) (twTemp, twTemp, twTemp, twTemp, twTemp, twTemp, twTemp,
848  twTemp);
849 #endif
850 
851  /* pY0 = &pYLocal[0x00000000u]; */
852  /* pY1 = &pYLocal[0x80000000u >> numLeadingZeros]; */
853  /* pY2 = &pYLocal[0x20000000u >> numLeadingZeros]; */
854  /* pY3 = &pYLocal[0xA0000000u >> numLeadingZeros]; */
855  /* pY4 = &pYLocal[0x40000000u >> numLeadingZeros]; */
856  /* pY5 = &pYLocal[0xC0000000u >> numLeadingZeros]; */
857  /* pY6 = &pYLocal[0x60000000u >> numLeadingZeros]; */
858  /* pY7 = &pYLocal[0xE0000000u >> numLeadingZeros]; */
859 
860  pY0 = (cfloat *) (pY + (0x00000000u));
861  pY1 = (cfloat *) (pY + ((0x80000000u >> numLeadingZeros) << 1));
862  pY2 = (cfloat *) (pY + ((0x20000000u >> numLeadingZeros) << 1));
863  pY3 = (cfloat *) (pY + ((0xA0000000u >> numLeadingZeros) << 1));
864  pY4 = (cfloat *) (pY + ((0x40000000u >> numLeadingZeros) << 1));
865  pY5 = (cfloat *) (pY + ((0xC0000000u >> numLeadingZeros) << 1));
866  pY6 = (cfloat *) (pY + ((0x60000000u >> numLeadingZeros) << 1));
867  pY7 = (cfloat *) (pY + ((0xE0000000u >> numLeadingZeros) << 1));
868 
869  for (k = 0; k<numPoints>> 3; k += 8) {
870  offsetBitReverse = __bit_reverse (k) >> numLeadingZeros;
871 
872  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
873  vX_0_1 = c7x::strm_eng<0, CV>::get_adv ();
874  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
875  vX_N_4_1 = c7x::strm_eng<0, CV>::get_adv ();
876  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
877  vX_N_2_1 = c7x::strm_eng<0, CV>::get_adv ();
878  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
879  vX_3N_4_1 = c7x::strm_eng<0, CV>::get_adv ();
880 
881  vSum1 = vX_0 + vX_N_2;
882  vSum2 = vX_N_4 + vX_3N_4;
883  vDiff1 = vX_0 - vX_N_2;
884  vDiff2 = vX_N_4 - vX_3N_4;
885 
886  vX0 = vSum1 + vSum2;
887  vX1 = vDiff1 - __vcrot90sp_vv (vDiff2);
888  vX2 = vSum1 - vSum2;
889  vX3 = vDiff1 + __vcrot90sp_vv (vDiff2);
890 
891  vSum1_1 = vX_0_1 + vX_N_2_1;
892  vSum2_1 = vX_N_4_1 + vX_3N_4_1;
893  vDiff1_1 = vX_0_1 - vX_N_2_1;
894  vDiff2_1 = vX_N_4_1 - vX_3N_4_1;
895 
896  vX0Temp = vSum1_1 + vSum2_1;
897  vX1Temp = vDiff1_1 - __vcrot90sp_vv (vDiff2_1);
898  vX2Temp = vSum1_1 - vSum2_1;
899  vX3Temp = vDiff1_1 + __vcrot90sp_vv (vDiff2_1);
900 
901  vX0_1 = vX0Temp;
902  vX1_1 = __complex_multiply (vX1Temp, vTwX1);
903  vX2_1 = __complex_multiply (vX2Temp, vTwX2);
904  vX3_1 = __complex_multiply (vX3Temp, vTwX3);
905 
906  vX0_2PtDft_1 = vX0 + vX0_1;
907  vX0_2PtDft_2 = vX0 - vX0_1;
908  vX1_2PtDft_1 = vX1 + vX1_1;
909  vX1_2PtDft_2 = vX1 - vX1_1;
910  vX2_2PtDft_1 = vX2 + vX2_1;
911  vX2_2PtDft_2 = vX2 - vX2_1;
912  vX3_2PtDft_1 = vX3 + vX3_1;
913  vX3_2PtDft_2 = vX3 - vX3_1;
914 
915  vX0_2PtDft_1 = ifft_i32fc_o32fc_scaleAndConjugate (
916  vX0_2PtDft_1, scaleVec, xorVec);
917  vX0_2PtDft_2 = ifft_i32fc_o32fc_scaleAndConjugate (
918  vX0_2PtDft_2, scaleVec, xorVec);
919  vX1_2PtDft_1 = ifft_i32fc_o32fc_scaleAndConjugate (
920  vX1_2PtDft_1, scaleVec, xorVec);
921  vX1_2PtDft_2 = ifft_i32fc_o32fc_scaleAndConjugate (
922  vX1_2PtDft_2, scaleVec, xorVec);
923  vX2_2PtDft_1 = ifft_i32fc_o32fc_scaleAndConjugate (
924  vX2_2PtDft_1, scaleVec, xorVec);
925  vX2_2PtDft_2 = ifft_i32fc_o32fc_scaleAndConjugate (
926  vX2_2PtDft_2, scaleVec, xorVec);
927  vX3_2PtDft_1 = ifft_i32fc_o32fc_scaleAndConjugate (
928  vX3_2PtDft_1, scaleVec, xorVec);
929  vX3_2PtDft_2 = ifft_i32fc_o32fc_scaleAndConjugate (
930  vX3_2PtDft_2, scaleVec, xorVec);
931 
932  __vstore_reverse_bit ((CVP) (pY0 + offsetBitReverse), vX0_2PtDft_1);
933  __vstore_reverse_bit ((CVP) (pY1 + offsetBitReverse), vX0_2PtDft_2);
934  __vstore_reverse_bit ((CVP) (pY2 + offsetBitReverse), vX1_2PtDft_1);
935  __vstore_reverse_bit ((CVP) (pY3 + offsetBitReverse), vX1_2PtDft_2);
936  __vstore_reverse_bit ((CVP) (pY4 + offsetBitReverse), vX2_2PtDft_1);
937  __vstore_reverse_bit ((CVP) (pY5 + offsetBitReverse), vX2_2PtDft_2);
938  __vstore_reverse_bit ((CVP) (pY6 + offsetBitReverse), vX3_2PtDft_1);
939  __vstore_reverse_bit ((CVP) (pY7 + offsetBitReverse), vX3_2PtDft_2);
940  }
941  __SE0_CLOSE ();
942  }
943  }
944  return (status);
945 }
946 
947 #if (!defined(FFTLIB_REMOVE_CHECK_PARAMS) && \
948  !defined(FFTLIB_IFFT1D_I32FC_C32FC_O32FC_REMOVE_CHECK_PARAMS)) || \
949  (defined(FFTLIB_CHECK_PARAMS)) || \
950  (defined(FFTLIB_IFFT1D_I32FC_C32FC_O32FC_CHECK_PARAMS))
951 
954  FFTLIB_bufParams1D_t *bufParamsX,
955  FFTLIB_F32 * pW,
956  FFTLIB_bufParams1D_t *bufParamsW,
957  FFTLIB_F32 * pY,
958  FFTLIB_bufParams1D_t *bufParamsY,
959  void * pBlock)
960 {
961  FFTLIB_STATUS status = FFTLIB_SUCCESS;
962 
963  /* if ((pX == NULL) || (pW == NULL) || (pY == NULL)) { */
964  /* status = FFTLIB_ERR_NULL_POINTER; */
965  /* } */
966  /* else if (bufParamsX->dim_x != bufParamsW->dim_x || */
967  /* bufParamsX->dim_x != bufParamsY->dim_x) { */
968  /* status = FFTLIB_ERR_INVALID_DIMENSION; */
969  /* } */
970  /* else if (bufParamsX->dim_x < 64 * 2) { /\* Minimum number of points is 64
971  * *\/ */
972  /* status = FFTLIB_ERR_INVALID_DIMENSION; */
973  /* } */
974  /* else if ((bufParamsX->data_type != FFTLIB_FLOAT32) || */
975  /* (bufParamsW->data_type != FFTLIB_FLOAT32) || */
976  /* (bufParamsY->data_type != FFTLIB_FLOAT32)) { */
977  /* status = FFTLIB_ERR_INVALID_TYPE; */
978  /* } */
979  /* else if (((uint64_t) pX) & 0xFu) { /\* pX must be 16-byte aligned for a
980  * *\/ */
981  /* status = FFTLIB_ERR_NOT_ALIGNED_PTRS_STRIDES; /\* streaming engine */
982  /* configuration *\/ */
983  /* } */
984  /* else { */
985  /* /\* Check if number of pts is a power of 2 *\/ */
986  /* uint32_t k = 0; */
987  /* while (k < 32) { */
988  /* if (bufParamsX->dim_x & (1u << k)) { */
989  /* break; */
990  /* } */
991  /* k++; */
992  /* } */
993  /* if ((1u << k) != bufParamsX->dim_x) { */
994  /* status = FFTLIB_ERR_INVALID_DIMENSION; */
995  /* } */
996  /* } */
997  return (status);
998 }
999 
1000 #endif
c7x::cfloat_vec CV
c7x::float_vec V
FFTLIB_STATUS_NAME
The enumeration of all status codes.
Definition: FFTLIB_types.h:172
@ FFTLIB_SUCCESS
Definition: FFTLIB_types.h:173
float FFTLIB_F32
Single precision floating point.
Definition: FFTLIB_types.h:169
void ifft_i32fc_o32fc_conjugate_init_ci(void *pX, uint32_t size, void *pBlock)
static c7x::cfloat_vec ifft_i32fc_o32fc_scaleAndConjugate(c7x::cfloat_vec in, c7x::float_vec scaleVec, c7x::ulong_vec xorVec)
void ifft_i32fc_o32fc_conjugate_exec_ci(void *pX, c7x::ulong_vec xorVec, uint32_t size, void *pBlock)
FFTLIB_STATUS FFTLIB_ifft1d_i32fc_c32fc_o32fc_checkParams(FFTLIB_F32 *pX, FFTLIB_bufParams1D_t *bufParamsX, FFTLIB_F32 *pW, FFTLIB_bufParams1D_t *bufParamsW, FFTLIB_F32 *pY, FFTLIB_bufParams1D_t *bufParamsY, void *pBlock)
This function checks the validity of the parameters passed to FFTLIB_ifft1d_i32fc_c32fc_o32fc_init an...
FFTLIB_STATUS FFTLIB_ifft1d_i32fc_c32fc_o32fc_init(FFTLIB_F32 *pX, FFTLIB_bufParams1D_t *bufParamsX, FFTLIB_F32 *pW, FFTLIB_bufParams1D_t *bufParamsW, FFTLIB_F32 *pY, FFTLIB_bufParams1D_t *bufParamsY, void *pBlock)
This function should be called before the FFTLIB_ifft1d_i32fc_c32fc_o32fc_kernel function is called....
FFTLIB_STATUS FFTLIB_ifft1d_i32fc_c32fc_o32fc_kernel(FFTLIB_F32 *pX, FFTLIB_bufParams1D_t *bufParamsX, FFTLIB_F32 *pW, FFTLIB_bufParams1D_t *bufParamsW, FFTLIB_F32 *pY, FFTLIB_bufParams1D_t *bufParamsY, void *pBlock)
This function is the main kernel compute function.
A structure for a 1 dimensional buffer descriptor.
uint32_t dim_x
Width of buffer in X dimension in elements.