FFTLIB User Guide
c7504/FFTLIB_ifft1d_i32fc_c32fc_o32fc_ci.cpp
Go to the documentation of this file.
1 /*******************************************************************************
2 **+--------------------------------------------------------------------------+**
3 **| **** |**
4 **| **** |**
5 **| ******o*** |**
6 **| ********_///_**** |**
7 **| ***** /_//_/ **** |**
8 **| ** ** (__/ **** |**
9 **| ********* |**
10 **| **** |**
11 **| *** |**
12 **| |**
13 **| Copyright (c) 2017 Texas Instruments Incorporated |**
14 **| ALL RIGHTS RESERVED |**
15 **| |**
16 **| Permission to use, copy, modify, or distribute this software, |**
17 **| whether in part or in whole, for any purpose is forbidden without |**
18 **| a signed licensing agreement and NDA from Texas Instruments |**
19 **| Incorporated (TI). |**
20 **| |**
21 **| TI makes no representation or warranties with respect to the |**
22 **| performance of this computer program, and specifically disclaims |**
23 **| any responsibility for any damages, special or consequential, |**
24 **| connected with the use of this program. |**
25 **| |**
26 **+--------------------------------------------------------------------------+**
27 *******************************************************************************/
28 
29 #include "../../../common/c71/FFTLIB_debug.h"
30 #include "../FFTLIB_ifft1d_i32fc_c32fc_o32fc.h"
31 
32 #define TRACE_ON (0)
33 #define LAST_LOOP_UNROLL 0
34 
35 #if TRACE_ON
36 #include "../../../common/printv.h"
37 #include <stdio.h>
38 #endif
39 
40 // CODE_SECTION(FFTLIB_ifft1d, ".text:optimized")
41 // CODE_SECTION(FFTLIB_ifft1d_core, ".text:optimized")
42 // CODE_SECTION(FFTLIB_ifft1d_checkParams, ".text:optimized")
43 
/* Byte offsets of the pre-computed streaming-engine (SE) and streaming
 * address-generator (SA) templates inside the caller-supplied pBlock
 * scratch area.  init() writes a template at each slot; kernel() and the
 * conjugate helpers read them back.  Each slot is SE_PARAM_SIZE /
 * SA_PARAM_SIZE bytes wide.
 */
#define SE_PARAM_BASE (0x0000)
#define SE_LOOP1_PARAM_OFFSET (SE_PARAM_BASE)
#define SE_LOOP2_PARAM_OFFSET (SE_LOOP1_PARAM_OFFSET + SE_PARAM_SIZE)
#define SE_LOOP3_PARAM_OFFSET (SE_LOOP2_PARAM_OFFSET + SE_PARAM_SIZE)
#define SE_LOOP4_PARAM_OFFSET (SE_LOOP3_PARAM_OFFSET + SE_PARAM_SIZE)
#define SE_LOOP5_PARAM_OFFSET (SE_LOOP4_PARAM_OFFSET + SE_PARAM_SIZE)
#define SE_TWID_PARAM_OFFSET (SE_LOOP5_PARAM_OFFSET + SE_PARAM_SIZE)
#define SA_LOOP1_PARAM_OFFSET (SE_TWID_PARAM_OFFSET + SE_PARAM_SIZE)
#define SA_LOOP2_PARAM_OFFSET (SA_LOOP1_PARAM_OFFSET + SA_PARAM_SIZE)
#define SA_LOOP3_PARAM_OFFSET (SA_LOOP2_PARAM_OFFSET + SA_PARAM_SIZE)
/* NOTE(review): the next offset skips past the SA_LOOP3 slot using
 * SE_PARAM_SIZE rather than SA_PARAM_SIZE.  This is only correct if the two
 * sizes are equal — TODO confirm against FFTLIB_types.h. */
#define SE_CONJ_LOOP_PARAM_OFFSET (SA_LOOP3_PARAM_OFFSET + SE_PARAM_SIZE)
#define SA_CONJ_LOOP_PARAM_OFFSET (SE_CONJ_LOOP_PARAM_OFFSET + SE_PARAM_SIZE)

/* Shorthands for the c7x complex-float / float vector types and pointers
 * to them, used throughout this file. */
typedef typename c7x::cfloat_vec CV;
typedef CV * CVP;

typedef typename c7x::float_vec V;
typedef V * VP;
62 
63 void ifft_i32fc_o32fc_conjugate_init_ci (void *pX, uint32_t size, void *pBlock)
64 {
65  __SE_TEMPLATE_v1 se0_param = __gen_SE_TEMPLATE_v1 ();
66  __SA_TEMPLATE_v1 sa0_param = __gen_SA_TEMPLATE_v1 ();
67  /* cfloat *restrict pXLocal = (cfloat *) pX; */
68 
69  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
70  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
71  se0_param.DIMFMT = __SE_DIMFMT_1D;
72  se0_param.ICNT0 = size;
73 
74  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_CONJ_LOOP_PARAM_OFFSET)) =
75  se0_param;
76 
77  sa0_param.ICNT0 = size;
78  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
79  sa0_param.DIMFMT = __SA_DIMFMT_1D;
80 
81  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_CONJ_LOOP_PARAM_OFFSET)) =
82  sa0_param;
83 }
84 
87  FFTLIB_bufParams1D_t *bufParamsX,
88  FFTLIB_F32 * pW,
89  FFTLIB_bufParams1D_t *bufParamsW,
90  FFTLIB_F32 * pY,
91  FFTLIB_bufParams1D_t *bufParamsY,
92  void * pBlock)
93 {
95 
96 #if defined(FFTLIB_CHECK_PARAMS) || \
97  defined(FFTLIB_IFFT1D_I32FC_C32FC_O32FC_CHECK_PARAMS)
99  pX, bufParamsX, pW, bufParamsW, pY, bufParamsY, pBlock);
100  if (status == FFTLIB_SUCCESS)
101 #endif
102  {
103  uint32_t numPoints;
104  uint32_t numPointsPerDft;
105  uint32_t seCnt1, seCnt2, seCnt3, seCnt4;
106  __SE_TEMPLATE_v1 se0_param = __gen_SE_TEMPLATE_v1 ();
107  __SE_TEMPLATE_v1 se1_param = __gen_SE_TEMPLATE_v1 ();
108  __SA_TEMPLATE_v1 sa0_param = __gen_SA_TEMPLATE_v1 ();
109 
110  numPoints = bufParamsX->dim_x >> 1;
111  numPointsPerDft = numPoints;
112  seCnt1 = numPoints >> 2;
113  seCnt2 = numPoints >> 4;
114  seCnt3 = 1;
115  seCnt4 = numPoints >> 2;
116 
117  ifft_i32fc_o32fc_conjugate_init_ci (pX, numPoints, pBlock);
118 
119  uint32_t elementSize = c7x::element_count_of<c7x::cfloat_vec>::value;
120  /* printf ("elementSize: %d\n", elementSize); */
121  /* se0_param = (0); */
122  se0_param.ICNT0 = elementSize;
123  se0_param.ICNT1 = 4; /* 4 quarters(Offsets: 0, N/4, N/2, 3N/4) */
124  se0_param.DIM1 = seCnt1;
125  se0_param.ICNT2 = seCnt2; /* Number of 8-point fetches within each */
126  se0_param.DIM2 = elementSize;
127  se0_param.ICNT3 = seCnt3; /* Number of DFT's */
128  se0_param.DIM3 = numPointsPerDft;
129 
130  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
131  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
132  se0_param.DIMFMT = __SE_DIMFMT_4D;
133  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP1_PARAM_OFFSET)) =
134  se0_param;
135 
136  /* se1_param = (0); */
137  se1_param.ICNT0 = elementSize;
138  se1_param.ICNT1 = 3;
139  se1_param.DIM1 = seCnt1; /* Twiddle factors for x1, x2 and x3 */
140  se1_param.ICNT2 = seCnt2; /* Number of 8-point fetches within each */
141  se1_param.DIM2 = elementSize;
142  se1_param.ICNT3 = seCnt3; /* Number of DFT's */
143  se1_param.DIM3 = 0;
144 
145  se1_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
146  se1_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
147  se1_param.DIMFMT = __SE_DIMFMT_4D;
148  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_TWID_PARAM_OFFSET)) =
149  se1_param;
150 
151  /* sa0_param = (0); */
152  sa0_param.ICNT0 = elementSize;
153  sa0_param.ICNT1 = 4;
154  sa0_param.DIM1 = seCnt1; /* Save to each of the 4 quarters */
155  sa0_param.ICNT2 = seCnt2; /* Number of 8-point stores within each */
156  sa0_param.DIM2 = elementSize;
157  sa0_param.ICNT3 = seCnt3;
158  sa0_param.DIM3 = numPointsPerDft; /* Number of DFT's */
159 
160  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
161  sa0_param.DIMFMT = __SA_DIMFMT_4D;
162  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP1_PARAM_OFFSET)) =
163  sa0_param;
164 
165  /* se0_param = (0); */
166  se0_param.ICNT0 = elementSize; /* Fetch first two quarters */
167  se0_param.ICNT1 = 1;
168  se0_param.DIM1 = 16; /* Process two 16-point DFTs in one shot */
169  se0_param.ICNT2 = seCnt2;
170  se0_param.DIM2 = 16; /* Half the number of DFT's */
171 
172  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
173  /* se0_param.TRANSPOSE = */
174  /* __SE_TRANSPOSE_128BIT; /\* Using 256BIT transpose required *\/ */
175  /* /\* 16-byte alignment on pX *\/ */
176  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
177  se0_param.DIMFMT = __SE_DIMFMT_3D;
178  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP2_PARAM_OFFSET)) =
179  se0_param;
180 
181  /* sa0_param = (0); */
182  sa0_param.ICNT0 = numPoints;
183 
184  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
185  sa0_param.DIMFMT = __SA_DIMFMT_1D;
186  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP2_PARAM_OFFSET)) =
187  sa0_param;
188 
189  /* se0_param = (0); */
190  se0_param = __gen_SE_TEMPLATE_v1 ();
191  se0_param.ICNT0 = numPoints;
192 
193  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
194  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
195  se0_param.DIMFMT = __SE_DIMFMT_1D;
196  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP3_PARAM_OFFSET)) =
197  se0_param;
198 
199  /* sa0_param = (0); */
200  sa0_param.ICNT0 = numPoints;
201 
202  sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
203  sa0_param.DIMFMT = __SA_DIMFMT_1D;
204  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP3_PARAM_OFFSET)) =
205  sa0_param;
206 
207  /* se0_param = (0); */
208  se0_param.ICNT0 = seCnt4; /* Fetch consecutive four points for DFT */
209  se0_param.ICNT1 = elementSize;
210  se0_param.DIM1 = seCnt4;
211  /* Fetch 8 points separated by */ /* (numPoints >>
212  3). This fetch
213  pattern */
214  /* can be used for bit reversal */
215 
216  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
217  se0_param.TRANSPOSE = __SE_TRANSPOSE_64BIT;
218  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
219  se0_param.DIMFMT = __SE_DIMFMT_2D;
220  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP4_PARAM_OFFSET)) =
221  se0_param;
222 
223  /* se0_param = (0); */
224  se0_param.ICNT0 = seCnt4;
225  se0_param.ICNT1 = elementSize;
226  se0_param.DIM1 = seCnt4;
227  /* Fetch 8 points separated by */ /* (numPoints >>
228  3). This fetch
229  pattern */
230  /* can be used for bit reversal */
231 
232  se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
233  se0_param.TRANSPOSE = __SE_TRANSPOSE_64BIT;
234  se0_param.VECLEN = c7x::se_veclen<c7x::cfloat_vec>::value;
235  se0_param.DIMFMT = __SE_DIMFMT_2D;
236  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP5_PARAM_OFFSET)) =
237  se0_param;
238  }
239  return (status);
240 }
241 
242 static inline c7x::cfloat_vec
244  c7x::float_vec scaleVec,
245  c7x::ulong_vec xorVec)
246 {
247  return (c7x::as_cfloat_vec (
248  scaleVec * c7x::as_float_vec (c7x::as_ulong_vec (in) ^ xorVec)));
249 }
250 
252  c7x::ulong_vec xorVec,
253  uint32_t size,
254  void * pBlock)
255 {
256 
257  __SE_TEMPLATE_v1 se0_param = __gen_SE_TEMPLATE_v1 ();
258  __SA_TEMPLATE_v1 sa0_param = __gen_SA_TEMPLATE_v1 ();
259  cfloat *restrict pXLocal = (cfloat *) pX;
260 
261  se0_param =
262  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_CONJ_LOOP_PARAM_OFFSET));
263 
264  sa0_param =
265  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_CONJ_LOOP_PARAM_OFFSET));
266 
267  __SE0_OPEN (pX, se0_param);
268  __SA0_OPEN (sa0_param);
269 
270  uint32_t i = 0;
271  uint32_t loopCount = (size) / c7x::element_count_of<c7x::cfloat_vec>::value;
272  c7x::cfloat_vec regIn, regStore;
273  float scale = 1.0f;
274  c7x::float_vec scaleVec = __vload_dup (&scale);
275 
276  __vpred tmp;
277  CV * addr;
278 
279  for (i = 0; i < loopCount; i++) {
280  regIn = c7x::strm_eng<0, c7x::cfloat_vec>::get_adv ();
281  /* FFTLIB_debugPrintFloatVector (c7x::as_float_vec (regIn)); */
282 
283  regStore = ifft_i32fc_o32fc_scaleAndConjugate (regIn, scaleVec, xorVec);
284  /* FFTLIB_debugPrintFloatVector (c7x::as_float_vec (regStore)); */
285 
286  tmp = c7x::strm_agen<0, CV>::get_vpred ();
287  addr = c7x::strm_agen<0, CV>::get_adv (&pXLocal[0]);
288  __vstore_pred (tmp, addr, regStore);
289  }
290 
291  __SE0_CLOSE ();
292  __SA0_CLOSE ();
293 }
294 
297  FFTLIB_bufParams1D_t *bufParamsX,
298  FFTLIB_F32 * pW,
299  FFTLIB_bufParams1D_t *bufParamsW,
300  FFTLIB_F32 * pY,
301  FFTLIB_bufParams1D_t *bufParamsY,
302  void * pBlock)
303 {
304  uint32_t k;
305  FFTLIB_STATUS status = FFTLIB_SUCCESS;
306  uint32_t numPoints;
307  uint32_t numPointsPerDft;
308  uint32_t numLeadingZeros;
309  uint32_t offsetBitReverse;
310  uint32_t seCnt1, seCnt2, seCnt3;
311  __SE_TEMPLATE_v1 se0_param = __gen_SE_TEMPLATE_v1 ();
312  __SE_TEMPLATE_v1 se1_param = __gen_SE_TEMPLATE_v1 ();
313  __SA_TEMPLATE_v1 sa0_param = __gen_SA_TEMPLATE_v1 ();
314 
315  cfloat *restrict pXLocal;
316  /* cfloat *restrict pYLocal; */
317  cfloat *restrict pWLocal;
318  cfloat *restrict pY0;
319  cfloat *restrict pY1;
320  cfloat *restrict pY2;
321  cfloat *restrict pY3;
322  cfloat *restrict pY4;
323  cfloat *restrict pY5;
324  cfloat *restrict pY6;
325  cfloat *restrict pY7;
326 
327  CV vX_0, vX_N_4, vX_N_2, vX_3N_4;
328  CV vSum1, vSum2, vDiff1, vDiff2;
329  CV vTwX1, vTwX2, vTwX3;
330  CV vX0Temp, vX1Temp, vX2Temp, vX3Temp;
331  CV vX0, vX1, vX2, vX3;
332  CV vX_0_1, vX_N_4_1, vX_N_2_1, vX_3N_4_1;
333  CV vSum1_1, vSum2_1, vDiff1_1, vDiff2_1;
334  CV vX0_1, vX1_1, vX2_1, vX3_1;
335  CV vX0_2PtDft_1, vX0_2PtDft_2;
336  CV vX1_2PtDft_1, vX1_2PtDft_2;
337  CV vX2_2PtDft_1, vX2_2PtDft_2;
338  CV vX3_2PtDft_1, vX3_2PtDft_2;
339  cfloat twTemp;
340 
341 #ifdef FFTLIB_CHECK_PARAMS
343  pX, bufParamsX, pW, bufParamsW, pY, bufParamsY, pBlock);
344  if (status == FFTLIB_SUCCESS)
345 #endif
346  {
347  numPoints = bufParamsX->dim_x >> 1;
348  numPointsPerDft = numPoints;
349 
350  float scale = 1.0 / (numPoints);
351  c7x::float_vec scaleVec = __vload_dup (&scale);
352 
353 #if defined(_HOST_BUILD)
354  c7x::ulong_vec xorVec = (c7x::ulong_vec) (0x0000000080000000);
355 
356 #else
357  c7x::ulong_vec xorVec = (0x0000000080000000);
358 #endif
359 
360  ifft_i32fc_o32fc_conjugate_exec_ci ((void *) pX, xorVec, numPoints,
361  pBlock);
362 
363  se0_param =
364  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP1_PARAM_OFFSET));
365  se1_param =
366  *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_TWID_PARAM_OFFSET));
367  sa0_param =
368  *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP1_PARAM_OFFSET));
369  seCnt1 = numPointsPerDft >> 2;
370  seCnt2 = numPointsPerDft >> 4;
371  seCnt3 = 1;
372 
373  pXLocal = (cfloat *) pX;
374  pWLocal = (cfloat *) pW;
375  /* pYLocal = (cfloat *) pY; */
376 
377  while (numPointsPerDft >= 16) {
378  /* TODO OPT: Calculate params upfront in init function,
379  * rather than generating SE params on the fly here */
380  se0_param.ICNT1 = 4;
381  se0_param.DIM1 = seCnt1; /* 4 quarters(Offsets: 0, N/4, N/2, 3N/4) */
382  se0_param.ICNT2 = seCnt2;
383  se0_param.DIM2 = 4;
384  /* Number of 8-point fetches within */ /* each quarter */
385  se0_param.ICNT3 = seCnt3;
386  se0_param.DIM3 = numPointsPerDft; /* Number of DFT's */
387  __SE0_OPEN ((void *) pXLocal, se0_param);
388 
389  se1_param.ICNT1 = 3;
390  se1_param.DIM1 = seCnt1; /* Twiddle factors for x1, x2 and x3 */
391  se1_param.ICNT2 = seCnt2; /* Number of 8-point fetches within each */
392  se1_param.DIM2 = 4; /* quarter */
393  se1_param.ICNT3 = seCnt3; /* Number of DFT's */
394  se1_param.DIM3 = 0;
395  __SE1_OPEN ((void *) pWLocal, se1_param);
396 
397  sa0_param.ICNT1 = 4;
398  sa0_param.DIM1 = /* Save to each of the 4 quarters */ seCnt1;
399  sa0_param.ICNT2 = seCnt2;
400  sa0_param.DIM2 = 4;
401  /* Number of 8-point stores within */ /* each quarter */
402  sa0_param.ICNT3 = seCnt3;
403  sa0_param.DIM3 = numPointsPerDft;
404  /* Number of DFT's */
405  __SA0_OPEN (sa0_param);
406 
407  /* Loop is unrolled twice for better optimization */
408  for (k = 0; k < numPoints; k += 32) {
409 
410  /* First iteration of loop unroll */
411  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
412  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
413  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
414  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
415 
416  vSum1 = vX_0 + vX_N_2;
417  vSum2 = vX_N_4 + vX_3N_4;
418  vDiff1 = vX_0 - vX_N_2;
419  vDiff2 = vX_N_4 - vX_3N_4;
420 
421  vTwX1 = c7x::strm_eng<1, CV>::get_adv ();
422  vTwX2 = c7x::strm_eng<1, CV>::get_adv ();
423  vTwX3 = c7x::strm_eng<1, CV>::get_adv ();
424 
425  vX0Temp = vSum1 + vSum2;
426  vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
427  vX2Temp = vSum1 - vSum2;
428  vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);
429 
430  vX0 = vX0Temp;
431  vX1 = __complex_multiply (vX1Temp, vTwX1);
432  vX2 = __complex_multiply (vX2Temp, vTwX2);
433  vX3 = __complex_multiply (vX3Temp, vTwX3);
434 
435  /* __SA0ADV(CV, pXLocal) = vX0; */
436  /* __SA0ADV(CV, pXLocal) = vX2; */
437  /* __SA0ADV(CV, pXLocal) = vX1; */
438  /* __SA0ADV(CV, pXLocal) = vX3; */
439 
440  __vpred tmp;
441  CVP addr;
442  tmp = c7x::strm_agen<0, CV>::get_vpred ();
443  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
444  __vstore_pred (tmp, addr, vX0);
445 
446  tmp = c7x::strm_agen<0, CV>::get_vpred ();
447  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
448  __vstore_pred (tmp, addr, vX2);
449 
450  tmp = c7x::strm_agen<0, CV>::get_vpred ();
451  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
452  __vstore_pred (tmp, addr, vX1);
453 
454  tmp = c7x::strm_agen<0, CV>::get_vpred ();
455  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
456  __vstore_pred (tmp, addr, vX3);
457 
458  /* Second iteration of loop unroll */
459 
460  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
461  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
462  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
463  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
464 
465  vSum1 = vX_0 + vX_N_2;
466  vSum2 = vX_N_4 + vX_3N_4;
467  vDiff1 = vX_0 - vX_N_2;
468  vDiff2 = vX_N_4 - vX_3N_4;
469 
470  vTwX1 = c7x::strm_eng<1, CV>::get_adv ();
471  vTwX2 = c7x::strm_eng<1, CV>::get_adv ();
472  vTwX3 = c7x::strm_eng<1, CV>::get_adv ();
473 
474  vX0Temp = vSum1 + vSum2;
475  vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
476  vX2Temp = vSum1 - vSum2;
477  vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);
478 
479  vX0 = vX0Temp;
480  vX1 = __complex_multiply (vX1Temp, vTwX1);
481  vX2 = __complex_multiply (vX2Temp, vTwX2);
482  vX3 = __complex_multiply (vX3Temp, vTwX3);
483 
484  /* __SA0ADV(CV, pXLocal) = vX0; */
485  /* __SA0ADV(CV, pXLocal) = vX2; */
486  /* __SA0ADV(CV, pXLocal) = vX1; */
487  /* __SA0ADV(CV, pXLocal) = vX3; */
488 
489  tmp = c7x::strm_agen<0, CV>::get_vpred ();
490  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
491  __vstore_pred (tmp, addr, vX0);
492 
493  tmp = c7x::strm_agen<0, CV>::get_vpred ();
494  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
495  __vstore_pred (tmp, addr, vX2);
496 
497  tmp = c7x::strm_agen<0, CV>::get_vpred ();
498  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
499  __vstore_pred (tmp, addr, vX1);
500 
501  tmp = c7x::strm_agen<0, CV>::get_vpred ();
502  addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
503  __vstore_pred (tmp, addr, vX3);
504  }
505  __SA0_CLOSE ();
506  __SE0_CLOSE ();
507  __SE1_CLOSE ();
508 
509  numPointsPerDft >>= 2;
510  pWLocal += numPointsPerDft * 3;
511  seCnt1 >>= 2;
512  seCnt2 >>= 2;
513  seCnt3 <<= 2;
514  /* printf ("\n\n"); */
515  /* for (int32_t u = 0; u < 64 * 2; u++) { */
516  /* printf ("%f, ", ((float *) pX)[u]); */
517  /* } */
518  }
519  if (numPointsPerDft == 4) {
520  /* 4-point stage with bit-reversal */
521  se0_param = *(
522  (__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP4_PARAM_OFFSET));
523  __SE0_OPEN ((void *) pXLocal, se0_param);
524 
525  numLeadingZeros = __norm ((int32_t) (numPoints - 1)) + 1;
526 
527  /* pY0 = &pYLocal[0x00000000u]; */
528  /* pY1 = &pYLocal[0x40000000u >> numLeadingZeros]; */
529  /* pY2 = &pYLocal[0x80000000u >> numLeadingZeros]; */
530  /* pY3 = &pYLocal[0xC0000000u >> numLeadingZeros]; */
531 
532  pY0 = (cfloat *) (pY + 0);
533  pY1 = (cfloat *) (pY + ((0x40000000u >> numLeadingZeros) << 1));
534  pY2 = (cfloat *) (pY + ((0x80000000u >> numLeadingZeros) << 1));
535  pY3 = (cfloat *) (pY + ((0xC0000000u >> numLeadingZeros) << 1));
536 
537 #ifdef LAST_LOOP_UNROLL
538  /* pY4 = &pYLocal[0x20000000u >> numLeadingZeros]; */
539  /* pY5 = &pYLocal[0x60000000u >> numLeadingZeros]; */
540  /* pY6 = &pYLocal[0xA0000000u >> numLeadingZeros]; */
541  /* pY7 = &pYLocal[0xE0000000u >> numLeadingZeros]; */
542 
543  pY4 = (cfloat *) (pY + ((0x20000000u >> numLeadingZeros) << 1));
544  pY5 = (cfloat *) (pY + ((0x60000000u >> numLeadingZeros) << 1));
545  pY6 = (cfloat *) (pY + ((0xA0000000u >> numLeadingZeros) << 1));
546  pY7 = (cfloat *) (pY + ((0xE0000000u >> numLeadingZeros) << 1));
547 #endif
548 
549 #if 1
550  for (k = 0; k<numPoints>> 2; k += 8)
551 #else
552  for (k = 0; k<numPoints>> 2; k += 4)
553 #endif
554  {
555  offsetBitReverse = __bit_reverse (k) >> numLeadingZeros;
556 
557  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
558  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
559  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
560  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
561 
562  vSum1 = vX_0 + vX_N_2;
563  vSum2 = vX_N_4 + vX_3N_4;
564  vDiff1 = vX_0 - vX_N_2;
565  vDiff2 = vX_N_4 - vX_3N_4;
566 
567  vX0 = vSum1 + vSum2;
568  vX1 = vDiff1 - __vcrot90sp_vv (vDiff2);
569  vX2 = vSum1 - vSum2;
570  vX3 = vDiff1 + __vcrot90sp_vv (vDiff2);
571 
572  vX0 = ifft_i32fc_o32fc_scaleAndConjugate (vX0, scaleVec, xorVec);
573  vX1 = ifft_i32fc_o32fc_scaleAndConjugate (vX1, scaleVec, xorVec);
574  vX2 = ifft_i32fc_o32fc_scaleAndConjugate (vX2, scaleVec, xorVec);
575  vX3 = ifft_i32fc_o32fc_scaleAndConjugate (vX3, scaleVec, xorVec);
576 
577  __vstore_reverse_bit ((CVP) (pY0 + offsetBitReverse), vX0);
578  __vstore_reverse_bit ((CVP) (pY1 + offsetBitReverse), vX1);
579  __vstore_reverse_bit ((CVP) (pY2 + offsetBitReverse), vX2);
580  __vstore_reverse_bit ((CVP) (pY3 + offsetBitReverse), vX3);
581 
582 #if 1
583  vX_0_1 = c7x::strm_eng<0, CV>::get_adv ();
584  vX_N_4_1 = c7x::strm_eng<0, CV>::get_adv ();
585  vX_N_2_1 = c7x::strm_eng<0, CV>::get_adv ();
586  vX_3N_4_1 = c7x::strm_eng<0, CV>::get_adv ();
587 
588  vSum1_1 = vX_0_1 + vX_N_2_1;
589  vSum2_1 = vX_N_4_1 + vX_3N_4_1;
590  vDiff1_1 = vX_0_1 - vX_N_2_1;
591  vDiff2_1 = vX_N_4_1 - vX_3N_4_1;
592 
593  vX0_1 = vSum1_1 + vSum2_1;
594  vX1_1 = vDiff1_1 - __vcrot90sp_vv (vDiff2_1);
595  vX2_1 = vSum1_1 - vSum2_1;
596  vX3_1 = vDiff1_1 + __vcrot90sp_vv (vDiff2_1);
597 
598  vX0_1 =
599  ifft_i32fc_o32fc_scaleAndConjugate (vX0_1, scaleVec, xorVec);
600  vX1_1 =
601  ifft_i32fc_o32fc_scaleAndConjugate (vX1_1, scaleVec, xorVec);
602  vX2_1 =
603  ifft_i32fc_o32fc_scaleAndConjugate (vX2_1, scaleVec, xorVec);
604  vX3_1 =
605  ifft_i32fc_o32fc_scaleAndConjugate (vX3_1, scaleVec, xorVec);
606 
607  __vstore_reverse_bit ((CVP) (pY4 + offsetBitReverse), vX0_1);
608  __vstore_reverse_bit ((CVP) (pY5 + offsetBitReverse), vX1_1);
609  __vstore_reverse_bit ((CVP) (pY6 + offsetBitReverse), vX2_1);
610  __vstore_reverse_bit ((CVP) (pY7 + offsetBitReverse), vX3_1);
611 #endif
612  }
613  __SE0_CLOSE ();
614 
615  /* printf ("\nStage == 4\n"); */
616  /* for (int32_t u = 0; u < 64 * 2; u++) { */
617  /* printf ("%f, ", ((float *) pY)[u]); */
618  /* } */
619  }
620  else {
621 #if 1
622  /* 4-point stage followed by 2-point stage with bit-reversal */
623  se0_param = *(
624  (__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP5_PARAM_OFFSET));
625  __SE0_OPEN ((void *) pXLocal, se0_param);
626 
627  numLeadingZeros = __norm ((int32_t) (numPoints - 1)) + 1;
628 #if __C7X_HOSTEM__
629  pWLocal += 1;
630  twTemp = *pWLocal;
631  vTwX1 = CV (twTemp, twTemp, twTemp, twTemp);
632 #else
633  pWLocal += 1;
634  twTemp = *pWLocal;
635  vTwX1 = (CV) (twTemp, twTemp, twTemp, twTemp);
636 #endif
637 
638 #if __C7X_HOSTEM__
639  pWLocal += 2;
640  twTemp = *pWLocal;
641  vTwX2 = CV (twTemp, twTemp, twTemp, twTemp);
642 #else
643  pWLocal += 2;
644  twTemp = *pWLocal;
645  vTwX2 = (CV) (twTemp, twTemp, twTemp, twTemp);
646 #endif
647 #if __C7X_HOSTEM__
648  pWLocal += 2;
649  twTemp = *pWLocal;
650  vTwX3 = CV (twTemp, twTemp, twTemp, twTemp);
651 #else
652  pWLocal += 2;
653  twTemp = *pWLocal;
654  vTwX3 = (CV) (twTemp, twTemp, twTemp, twTemp);
655 #endif
656 
657  /* pY0 = &pYLocal[0x00000000u]; */
658  /* pY1 = &pYLocal[0x80000000u >> numLeadingZeros]; */
659  /* pY2 = &pYLocal[0x20000000u >> numLeadingZeros]; */
660  /* pY3 = &pYLocal[0xA0000000u >> numLeadingZeros]; */
661  /* pY4 = &pYLocal[0x40000000u >> numLeadingZeros]; */
662  /* pY5 = &pYLocal[0xC0000000u >> numLeadingZeros]; */
663  /* pY6 = &pYLocal[0x60000000u >> numLeadingZeros]; */
664  /* pY7 = &pYLocal[0xE0000000u >> numLeadingZeros]; */
665 
666  pY0 = (cfloat *) (pY + (0x00000000u));
667  pY1 = (cfloat *) (pY + ((0x80000000u >> numLeadingZeros) << 1));
668  pY2 = (cfloat *) (pY + ((0x20000000u >> numLeadingZeros) << 1));
669  pY3 = (cfloat *) (pY + ((0xA0000000u >> numLeadingZeros) << 1));
670  pY4 = (cfloat *) (pY + ((0x40000000u >> numLeadingZeros) << 1));
671  pY5 = (cfloat *) (pY + ((0xC0000000u >> numLeadingZeros) << 1));
672  pY6 = (cfloat *) (pY + ((0x60000000u >> numLeadingZeros) << 1));
673  pY7 = (cfloat *) (pY + ((0xE0000000u >> numLeadingZeros) << 1));
674 
675  for (k = 0; k<numPoints>> 2; k += 8) {
676  offsetBitReverse = __bit_reverse (k) >> numLeadingZeros;
677 
678  vX_0 = c7x::strm_eng<0, CV>::get_adv ();
679  vX_0_1 = c7x::strm_eng<0, CV>::get_adv ();
680  vX_N_4 = c7x::strm_eng<0, CV>::get_adv ();
681  vX_N_4_1 = c7x::strm_eng<0, CV>::get_adv ();
682  vX_N_2 = c7x::strm_eng<0, CV>::get_adv ();
683  vX_N_2_1 = c7x::strm_eng<0, CV>::get_adv ();
684  vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();
685  vX_3N_4_1 = c7x::strm_eng<0, CV>::get_adv ();
686 
687  vSum1 = vX_0 + vX_N_2;
688  vSum2 = vX_N_4 + vX_3N_4;
689  vDiff1 = vX_0 - vX_N_2;
690  vDiff2 = vX_N_4 - vX_3N_4;
691 
692  vX0 = vSum1 + vSum2;
693  vX1 = vDiff1 - __vcrot90sp_vv (vDiff2);
694  vX2 = vSum1 - vSum2;
695  vX3 = vDiff1 + __vcrot90sp_vv (vDiff2);
696 
697  vSum1_1 = vX_0_1 + vX_N_2_1;
698  vSum2_1 = vX_N_4_1 + vX_3N_4_1;
699  vDiff1_1 = vX_0_1 - vX_N_2_1;
700  vDiff2_1 = vX_N_4_1 - vX_3N_4_1;
701 
702  vX0Temp = vSum1_1 + vSum2_1;
703  vX1Temp = vDiff1_1 - __vcrot90sp_vv (vDiff2_1);
704  vX2Temp = vSum1_1 - vSum2_1;
705  vX3Temp = vDiff1_1 + __vcrot90sp_vv (vDiff2_1);
706 
707  vX0_1 = vX0Temp;
708  vX1_1 = __complex_multiply (vX1Temp, vTwX1);
709  vX2_1 = __complex_multiply (vX2Temp, vTwX2);
710  vX3_1 = __complex_multiply (vX3Temp, vTwX3);
711 
712  vX0_2PtDft_1 = vX0 + vX0_1;
713  vX0_2PtDft_2 = vX0 - vX0_1;
714  vX1_2PtDft_1 = vX1 + vX1_1;
715  vX1_2PtDft_2 = vX1 - vX1_1;
716  vX2_2PtDft_1 = vX2 + vX2_1;
717  vX2_2PtDft_2 = vX2 - vX2_1;
718  vX3_2PtDft_1 = vX3 + vX3_1;
719  vX3_2PtDft_2 = vX3 - vX3_1;
720 
721  vX0_2PtDft_1 = ifft_i32fc_o32fc_scaleAndConjugate (
722  vX0_2PtDft_1, scaleVec, xorVec);
723  vX0_2PtDft_2 = ifft_i32fc_o32fc_scaleAndConjugate (
724  vX0_2PtDft_2, scaleVec, xorVec);
725  vX1_2PtDft_1 = ifft_i32fc_o32fc_scaleAndConjugate (
726  vX1_2PtDft_1, scaleVec, xorVec);
727  vX1_2PtDft_2 = ifft_i32fc_o32fc_scaleAndConjugate (
728  vX1_2PtDft_2, scaleVec, xorVec);
729  vX2_2PtDft_1 = ifft_i32fc_o32fc_scaleAndConjugate (
730  vX2_2PtDft_1, scaleVec, xorVec);
731  vX2_2PtDft_2 = ifft_i32fc_o32fc_scaleAndConjugate (
732  vX2_2PtDft_2, scaleVec, xorVec);
733  vX3_2PtDft_1 = ifft_i32fc_o32fc_scaleAndConjugate (
734  vX3_2PtDft_1, scaleVec, xorVec);
735  vX3_2PtDft_2 = ifft_i32fc_o32fc_scaleAndConjugate (
736  vX3_2PtDft_2, scaleVec, xorVec);
737 
738  __vstore_reverse_bit ((CVP) (pY0 + offsetBitReverse), vX0_2PtDft_1);
739  __vstore_reverse_bit ((CVP) (pY1 + offsetBitReverse), vX0_2PtDft_2);
740  __vstore_reverse_bit ((CVP) (pY2 + offsetBitReverse), vX1_2PtDft_1);
741  __vstore_reverse_bit ((CVP) (pY3 + offsetBitReverse), vX1_2PtDft_2);
742  __vstore_reverse_bit ((CVP) (pY4 + offsetBitReverse), vX2_2PtDft_1);
743  __vstore_reverse_bit ((CVP) (pY5 + offsetBitReverse), vX2_2PtDft_2);
744  __vstore_reverse_bit ((CVP) (pY6 + offsetBitReverse), vX3_2PtDft_1);
745  __vstore_reverse_bit ((CVP) (pY7 + offsetBitReverse), vX3_2PtDft_2);
746  }
747  __SE0_CLOSE ();
748 #endif
749  }
750  }
751  return (status);
752 }
753 
754 #if (!defined(FFTLIB_REMOVE_CHECK_PARAMS) && \
755  !defined(FFTLIB_IFFT1D_I32FC_C32FC_O32FC_REMOVE_CHECK_PARAMS)) || \
756  (defined(FFTLIB_CHECK_PARAMS)) || \
757  (defined(FFTLIB_IFFT1D_I32FC_C32FC_O32FC_CHECK_PARAMS))
758 
761  FFTLIB_bufParams1D_t *bufParamsX,
762  FFTLIB_F32 * pW,
763  FFTLIB_bufParams1D_t *bufParamsW,
764  FFTLIB_F32 * pY,
765  FFTLIB_bufParams1D_t *bufParamsY,
766  void * pBlock)
767 {
768  FFTLIB_STATUS status = FFTLIB_SUCCESS;
769 
770  if ((pX == NULL) || (pW == NULL) || (pY == NULL)) {
771  status = FFTLIB_ERR_NULL_POINTER;
772  }
773  else if (bufParamsX->dim_x != bufParamsW->dim_x ||
774  bufParamsX->dim_x != bufParamsY->dim_x) {
776  }
777  else if (bufParamsX->dim_x < 64 * 2) { /* Minimum number of points is 64 */
779  }
780  else if ((bufParamsX->data_type != FFTLIB_FLOAT32) ||
781  (bufParamsW->data_type != FFTLIB_FLOAT32) ||
782  (bufParamsY->data_type != FFTLIB_FLOAT32)) {
783  status = FFTLIB_ERR_INVALID_TYPE;
784  }
785  else if (((uint64_t) pX) & 0xFu) { /* pX must be 16-byte aligned for a */
786  status = FFTLIB_ERR_NOT_ALIGNED_PTRS_STRIDES; /* streaming engine
787  configuration */
788  }
789  else {
790  /* Check if number of pts is a power of 2 */
791  uint32_t k = 0;
792  while (k < 32) {
793  if (bufParamsX->dim_x & (1u << k)) {
794  break;
795  }
796  k++;
797  }
798  if ((1u << k) != bufParamsX->dim_x) {
800  }
801  }
802  return (status);
803 }
804 
805 #endif
@ FFTLIB_FLOAT32
c7x::cfloat_vec CV
c7x::float_vec V
FFTLIB_STATUS_NAME
The enumeration of all status codes.
Definition: FFTLIB_types.h:172
@ FFTLIB_ERR_INVALID_TYPE
Definition: FFTLIB_types.h:176
@ FFTLIB_ERR_NULL_POINTER
Definition: FFTLIB_types.h:178
@ FFTLIB_ERR_INVALID_DIMENSION
Definition: FFTLIB_types.h:177
@ FFTLIB_SUCCESS
Definition: FFTLIB_types.h:173
@ FFTLIB_ERR_NOT_ALIGNED_PTRS_STRIDES
Definition: FFTLIB_types.h:181
float FFTLIB_F32
Single precision floating point.
Definition: FFTLIB_types.h:169
void ifft_i32fc_o32fc_conjugate_init_ci(void *pX, uint32_t size, void *pBlock)
static c7x::cfloat_vec ifft_i32fc_o32fc_scaleAndConjugate(c7x::cfloat_vec in, c7x::float_vec scaleVec, c7x::ulong_vec xorVec)
void ifft_i32fc_o32fc_conjugate_exec_ci(void *pX, c7x::ulong_vec xorVec, uint32_t size, void *pBlock)
FFTLIB_STATUS FFTLIB_ifft1d_i32fc_c32fc_o32fc_checkParams(FFTLIB_F32 *pX, FFTLIB_bufParams1D_t *bufParamsX, FFTLIB_F32 *pW, FFTLIB_bufParams1D_t *bufParamsW, FFTLIB_F32 *pY, FFTLIB_bufParams1D_t *bufParamsY, void *pBlock)
This function checks the validity of the parameters passed to FFTLIB_ifft1d_i32fc_c32fc_o32fc_init an...
FFTLIB_STATUS FFTLIB_ifft1d_i32fc_c32fc_o32fc_init(FFTLIB_F32 *pX, FFTLIB_bufParams1D_t *bufParamsX, FFTLIB_F32 *pW, FFTLIB_bufParams1D_t *bufParamsW, FFTLIB_F32 *pY, FFTLIB_bufParams1D_t *bufParamsY, void *pBlock)
This function should be called before the FFTLIB_ifft1d_i32fc_c32fc_o32fc_kernel function is called....
FFTLIB_STATUS FFTLIB_ifft1d_i32fc_c32fc_o32fc_kernel(FFTLIB_F32 *pX, FFTLIB_bufParams1D_t *bufParamsX, FFTLIB_F32 *pW, FFTLIB_bufParams1D_t *bufParamsW, FFTLIB_F32 *pY, FFTLIB_bufParams1D_t *bufParamsY, void *pBlock)
This function is the main kernel compute function.
A structure for a 1 dimensional buffer descriptor.
uint32_t data_type
Values are of type FFTLIB_data_type_e.
uint32_t dim_x
Width of buffer in X dimension in elements.