FFTLIB User Guide
c71/FFTLIB_fft1d_i32fc_c32fc_o32fc_ci.cpp
/*******************************************************************************
**+--------------------------------------------------------------------------+**
**|                            ****                                          |**
**|                            ****                                          |**
**|                            ******o***                                    |**
**|                   ********_///_****                                      |**
**|                   ***** /_//_/ ****                                      |**
**|                    ** ** (__/ ****                                       |**
**|                        *********                                         |**
**|                         ****                                             |**
**|                          ***                                             |**
**|                                                                          |**
**|         Copyright (c) 2017 Texas Instruments Incorporated                |**
**|                        ALL RIGHTS RESERVED                               |**
**|                                                                          |**
**| Permission to use, copy, modify, or distribute this software,            |**
**| whether in part or in whole, for any purpose is forbidden without        |**
**| a signed licensing agreement and NDA from Texas Instruments              |**
**| Incorporated (TI).                                                       |**
**|                                                                          |**
**| TI makes no representation or warranties with respect to the             |**
**| performance of this computer program, and specifically disclaims         |**
**| any responsibility for any damages, special or consequential,            |**
**| connected with the use of this program.                                  |**
**|                                                                          |**
**+--------------------------------------------------------------------------+**
*******************************************************************************/

#include "../FFTLIB_fft1d_i32fc_c32fc_o32fc.h"

#define TRACE_ON (0)

#if TRACE_ON
#include "../../../common/printv.h"
#include <stdio.h>
#endif

// CODE_SECTION(FFTLIB_fft1d, ".text:optimized")
// CODE_SECTION(FFTLIB_fft1d_core, ".text:optimized")
// CODE_SECTION(FFTLIB_fft1d_checkParams, ".text:optimized")

#define SE_PARAM_BASE         (0x0000)
#define SE_LOOP1_PARAM_OFFSET (SE_PARAM_BASE)
#define SE_LOOP2_PARAM_OFFSET (SE_LOOP1_PARAM_OFFSET + SE_PARAM_SIZE)
#define SE_LOOP3_PARAM_OFFSET (SE_LOOP2_PARAM_OFFSET + SE_PARAM_SIZE)
#define SE_LOOP4_PARAM_OFFSET (SE_LOOP3_PARAM_OFFSET + SE_PARAM_SIZE)
#define SE_LOOP5_PARAM_OFFSET (SE_LOOP4_PARAM_OFFSET + SE_PARAM_SIZE)
#define SE_TWID_PARAM_OFFSET  (SE_LOOP5_PARAM_OFFSET + SE_PARAM_SIZE)
#define SA_LOOP1_PARAM_OFFSET (SE_TWID_PARAM_OFFSET + SE_PARAM_SIZE)
#define SA_LOOP2_PARAM_OFFSET (SA_LOOP1_PARAM_OFFSET + SA_PARAM_SIZE)
#define SA_LOOP3_PARAM_OFFSET (SA_LOOP2_PARAM_OFFSET + SA_PARAM_SIZE)
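
/* pBlock serves as a parameter block: the init function below precomputes the
 * streaming-engine (SE) and streaming-address-generator (SA) templates and
 * stores them at the offsets defined above; the kernel then simply reloads
 * them, keeping template construction out of the compute path. */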
52 
55  FFTLIB_bufParams1D_t *bufParamsX,
56  FFTLIB_F32 *pW,
57  FFTLIB_bufParams1D_t *bufParamsW,
58  FFTLIB_F32 *pY,
59  FFTLIB_bufParams1D_t *bufParamsY,
60  void *pBlock)
61 {
63 
64 #if defined(FFTLIB_CHECK_PARAMS) || \
65  defined(FFTLIB_FFT1D_I32FC_C32FC_O32FC_CHECK_PARAMS)
67  pX, bufParamsX, pW, bufParamsW, pY, bufParamsY, pBlock);
68  if (status == FFTLIB_SUCCESS)
69 #endif
70  {
        uint32_t         numPoints;
        uint32_t         numPointsPerDft;
        uint32_t         seCnt1, seCnt2, seCnt3, seCnt4;
        __SE_TEMPLATE_v1 se0_param = __gen_SE_TEMPLATE_v1 ();
        __SE_TEMPLATE_v1 se1_param = __gen_SE_TEMPLATE_v1 ();
        __SA_TEMPLATE_v1 sa0_param = __gen_SA_TEMPLATE_v1 ();

        numPoints       = bufParamsX->dim_x >> 1;
        numPointsPerDft = numPoints;
        seCnt1          = numPoints >> 2;
        seCnt2          = numPoints >> 5;
        seCnt3          = 1;
        seCnt4          = numPoints >> 3;
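        /* bufParamsX->dim_x counts FFTLIB_F32 elements (two per complex
         * point), so numPoints is the complex FFT length. seCnt1 is the
         * quarter stride (N/4 points), seCnt2 the number of 8-point vector
         * fetches per quarter, seCnt3 the number of DFTs in the first stage,
         * and seCnt4 the N/8 stride used by the bit-reversal stages. */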

        /* se0_param = (0); */
        se0_param.ICNT0 = 8;      /* 8-point vectors processed in one shot  */
        se0_param.ICNT1 = 4;
        se0_param.DIM1  = seCnt1; /* 4 quarters (offsets 0, N/4, N/2, 3N/4) */
        se0_param.ICNT2 = seCnt2; /* Number of 8-point fetches within each  */
        se0_param.DIM2  = 8;      /* quarter                                */
        se0_param.ICNT3 = seCnt3; /* Number of DFTs                         */
        se0_param.DIM3  = numPointsPerDft;

        se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
        se0_param.VECLEN  = c7x::se_veclen<c7x::cfloat_vec>::value;
        se0_param.DIMFMT  = __SE_DIMFMT_4D;
        *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP1_PARAM_OFFSET)) =
            se0_param;

        /* se1_param = (0); */
        se1_param.ICNT0 = 8;      /* 8-point vectors processed in one shot  */
        se1_param.ICNT1 = 3;
        se1_param.DIM1  = seCnt1; /* Twiddle factors for x1, x2 and x3      */
        se1_param.ICNT2 = seCnt2; /* Number of 8-point fetches within each  */
        se1_param.DIM2  = 8;      /* quarter                                */
        se1_param.ICNT3 = seCnt3; /* Number of DFTs                         */
        se1_param.DIM3  = 0;

        se1_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
        se1_param.VECLEN  = c7x::se_veclen<c7x::cfloat_vec>::value;
        se1_param.DIMFMT  = __SE_DIMFMT_4D;
        *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_TWID_PARAM_OFFSET)) =
            se1_param;

        /* sa0_param = (0); */
        sa0_param.ICNT0 = 8;
        sa0_param.ICNT1 = 4;
        sa0_param.DIM1  = seCnt1; /* Save to each of the 4 quarters         */
        sa0_param.ICNT2 = seCnt2; /* Number of 8-point stores within each   */
        sa0_param.DIM2  = 8;      /* quarter                                */
        sa0_param.ICNT3 = seCnt3; /* Number of DFTs                         */
        sa0_param.DIM3  = numPointsPerDft;

        sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
        sa0_param.DIMFMT = __SA_DIMFMT_4D;
        *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP1_PARAM_OFFSET)) =
            sa0_param;

        /* se0_param = (0); */
        se0_param.ICNT0 = 8;  /* Fetch first two quarters              */
        se0_param.ICNT1 = 2;
        se0_param.DIM1  = 16; /* Process two 16-point DFTs in one shot */
        se0_param.ICNT2 = seCnt2;
        se0_param.DIM2  = 32; /* Half the number of DFTs               */

        se0_param.ELETYPE   = __SE_ELETYPE_32BIT_CMPLX_SWAP;
        se0_param.TRANSPOSE = __SE_TRANSPOSE_256BIT; /* Using the 256-bit
                                                      * transpose requires
                                                      * 16-byte alignment
                                                      * on pX */
        se0_param.VECLEN    = c7x::se_veclen<c7x::cfloat_vec>::value;
        se0_param.DIMFMT    = __SE_DIMFMT_3D;
        *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP2_PARAM_OFFSET)) =
            se0_param;

        /* sa0_param = (0); */
        sa0_param.ICNT0 = numPoints;

        sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
        sa0_param.DIMFMT = __SA_DIMFMT_1D;
        *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP2_PARAM_OFFSET)) =
            sa0_param;

        /* se0_param = (0); */
        se0_param       = __gen_SE_TEMPLATE_v1 ();
        se0_param.ICNT0 = numPoints;

        se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
        se0_param.VECLEN  = c7x::se_veclen<c7x::cfloat_vec>::value;
        se0_param.DIMFMT  = __SE_DIMFMT_1D;
        *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP3_PARAM_OFFSET)) =
            se0_param;

        /* sa0_param = (0); */
        sa0_param.ICNT0 = numPoints;

        sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
        sa0_param.DIMFMT = __SA_DIMFMT_1D;
        *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP3_PARAM_OFFSET)) =
            sa0_param;

        /* se0_param = (0); */
        se0_param.ICNT0 = seCnt4; /* Fetch consecutive four points for DFT */
        se0_param.ICNT1 = 8;
        se0_param.DIM1  = seCnt4; /* Fetch 8 points separated by
                                   * (numPoints >> 3); this fetch pattern
                                   * can be used for bit reversal */

        se0_param.ELETYPE   = __SE_ELETYPE_32BIT_CMPLX_SWAP;
        se0_param.TRANSPOSE = __SE_TRANSPOSE_64BIT;
        se0_param.VECLEN    = c7x::se_veclen<c7x::cfloat_vec>::value;
        se0_param.DIMFMT    = __SE_DIMFMT_2D;
        *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP4_PARAM_OFFSET)) =
            se0_param;

        /* se0_param = (0); */
        se0_param.ICNT0 = seCnt4;
        se0_param.ICNT1 = 8;
        se0_param.DIM1  = seCnt4; /* Fetch 8 points separated by
                                   * (numPoints >> 3); this fetch pattern
                                   * can be used for bit reversal */

        se0_param.ELETYPE   = __SE_ELETYPE_32BIT_CMPLX_SWAP;
        se0_param.TRANSPOSE = __SE_TRANSPOSE_64BIT;
        se0_param.VECLEN    = c7x::se_veclen<c7x::cfloat_vec>::value;
        se0_param.DIMFMT    = __SE_DIMFMT_2D;
        *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP5_PARAM_OFFSET)) =
            se0_param;
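
        /* Note: the SE_LOOP4 and SE_LOOP5 templates are written with
         * identical fields; the kernel loads SE_LOOP4 for the final radix-4
         * stage and SE_LOOP5 when a trailing radix-2 stage is also needed. */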
    }
    return (status);
}

FFTLIB_STATUS FFTLIB_fft1d_i32fc_c32fc_o32fc_kernel (FFTLIB_F32 *pX,
                                                     FFTLIB_bufParams1D_t *bufParamsX,
                                                     FFTLIB_F32 *pW,
                                                     FFTLIB_bufParams1D_t *bufParamsW,
                                                     FFTLIB_F32 *pY,
                                                     FFTLIB_bufParams1D_t *bufParamsY,
                                                     void *pBlock)
{
    uint32_t         k;
    FFTLIB_STATUS    status = FFTLIB_SUCCESS;
    uint32_t         numPoints;
    uint32_t         numPointsPerDft;
    uint32_t         numLeadingZeros;
    uint32_t         offsetBitReverse;
    uint32_t         seCnt1, seCnt2, seCnt3;
    __SE_TEMPLATE_v1 se0_param = __gen_SE_TEMPLATE_v1 ();
    __SE_TEMPLATE_v1 se1_param = __gen_SE_TEMPLATE_v1 ();
    __SA_TEMPLATE_v1 sa0_param = __gen_SA_TEMPLATE_v1 ();

    cfloat *restrict pXLocal;
    cfloat *restrict pYLocal;
    cfloat *restrict pWLocal;
    cfloat *restrict pY0;
    cfloat *restrict pY1;
    cfloat *restrict pY2;
    cfloat *restrict pY3;
    cfloat *restrict pY4;
    cfloat *restrict pY5;
    cfloat *restrict pY6;
    cfloat *restrict pY7;

    typedef typename c7x::cfloat_vec CV;
    typedef CV *CVP;

    typedef typename c7x::float_vec V;
    typedef V *VP;

    CV     vX_0, vX_N_4, vX_N_2, vX_3N_4;
    CV     vSum1, vSum2, vDiff1, vDiff2;
    CV     vTwX1, vTwX2, vTwX3;
    CV     vX0Temp, vX1Temp, vX2Temp, vX3Temp;
    CV     vX0, vX1, vX2, vX3;
    CV     vX_0_1, vX_N_4_1, vX_N_2_1, vX_3N_4_1;
    CV     vSum1_1, vSum2_1, vDiff1_1, vDiff2_1;
    CV     vX0_1, vX1_1, vX2_1, vX3_1;
    CV     vX0_2PtDft_1, vX0_2PtDft_2;
    CV     vX1_2PtDft_1, vX1_2PtDft_2;
    CV     vX2_2PtDft_1, vX2_2PtDft_2;
    CV     vX3_2PtDft_1, vX3_2PtDft_2;
    cfloat twTemp;

#ifdef FFTLIB_CHECK_PARAMS
    status = FFTLIB_fft1d_i32fc_c32fc_o32fc_checkParams (
        pX, bufParamsX, pW, bufParamsW, pY, bufParamsY, pBlock);
    if (status == FFTLIB_SUCCESS)
#endif
    {
        numPoints       = bufParamsX->dim_x >> 1;
        numPointsPerDft = numPoints;

        se0_param =
            *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP1_PARAM_OFFSET));
        se1_param =
            *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_TWID_PARAM_OFFSET));
        sa0_param =
            *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP1_PARAM_OFFSET));
        seCnt1 = numPointsPerDft >> 2;
        seCnt2 = numPointsPerDft >> 5;
        seCnt3 = 1;

        pXLocal = (cfloat *) pX;
        pWLocal = (cfloat *) pW;
        pYLocal = (cfloat *) pY;

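        /* Each pass of this loop performs one in-place radix-4 DIF stage on
         * pX while the DFT size is at least 64; the remaining 16- or
         * 32-point DFTs are handled by the dedicated stages below. */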
        while (numPointsPerDft >= 64) {
            /* TODO OPT: Calculate params upfront in init function,
             * rather than generating SE params on the fly here */
            se0_param.ICNT1 = 4;
            se0_param.DIM1  = seCnt1; /* 4 quarters (offsets 0, N/4, N/2, 3N/4) */
            se0_param.ICNT2 = seCnt2; /* Number of 8-point fetches within each  */
            se0_param.DIM2  = 8;      /* quarter                                */
            se0_param.ICNT3 = seCnt3; /* Number of DFTs                         */
            se0_param.DIM3  = numPointsPerDft;
            __SE0_OPEN ((void *) pXLocal, se0_param);

            se1_param.ICNT1 = 3;
            se1_param.DIM1  = seCnt1; /* Twiddle factors for x1, x2 and x3      */
            se1_param.ICNT2 = seCnt2; /* Number of 8-point fetches within each  */
            se1_param.DIM2  = 8;      /* quarter                                */
            se1_param.ICNT3 = seCnt3; /* Number of DFTs                         */
            se1_param.DIM3  = 0;
            __SE1_OPEN ((void *) pWLocal, se1_param);

            sa0_param.ICNT1 = 4;
            sa0_param.DIM1  = seCnt1; /* Save to each of the 4 quarters         */
            sa0_param.ICNT2 = seCnt2; /* Number of 8-point stores within each   */
            sa0_param.DIM2  = 8;      /* quarter                                */
            sa0_param.ICNT3 = seCnt3; /* Number of DFTs                         */
            sa0_param.DIM3  = numPointsPerDft;
            __SA0_OPEN (sa0_param);

            /* Loop is unrolled twice for better optimization */
            for (k = 0; k < numPoints; k += 64) {

                /* First iteration of loop unroll */
                vX_0    = c7x::strm_eng<0, CV>::get_adv ();
                vX_N_4  = c7x::strm_eng<0, CV>::get_adv ();
                vX_N_2  = c7x::strm_eng<0, CV>::get_adv ();
                vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();

                vSum1  = vX_0 + vX_N_2;
                vSum2  = vX_N_4 + vX_3N_4;
                vDiff1 = vX_0 - vX_N_2;
                vDiff2 = vX_N_4 - vX_3N_4;

                vTwX1 = c7x::strm_eng<1, CV>::get_adv ();
                vTwX2 = c7x::strm_eng<1, CV>::get_adv ();
                vTwX3 = c7x::strm_eng<1, CV>::get_adv ();

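                /* Radix-4 DIF butterfly: with a = x[i], b = x[i+N/4],
                 * c = x[i+N/2], d = x[i+3N/4],
                 *   X0 = (a+c) + (b+d)
                 *   X1 = ((a-c) - j(b-d)) * W1
                 *   X2 = ((a+c) - (b+d)) * W2
                 *   X3 = ((a-c) + j(b-d)) * W3
                 * where __vcrot90sp_vv rotates each complex lane by 90
                 * degrees (multiplication by j). */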
                vX0Temp = vSum1 + vSum2;
                vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
                vX2Temp = vSum1 - vSum2;
                vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);

                vX0 = vX0Temp;
                vX1 = __complex_multiply (vX1Temp, vTwX1);
                vX2 = __complex_multiply (vX2Temp, vTwX2);
                vX3 = __complex_multiply (vX3Temp, vTwX3);

                /* __SA0ADV(CV, pXLocal) = vX0; */
                /* __SA0ADV(CV, pXLocal) = vX2; */
                /* __SA0ADV(CV, pXLocal) = vX1; */
                /* __SA0ADV(CV, pXLocal) = vX3; */

                __vpred tmp;
                CVP     addr;
                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
                __vstore_pred (tmp, addr, vX0);

                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
                __vstore_pred (tmp, addr, vX2);

                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
                __vstore_pred (tmp, addr, vX1);

                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
                __vstore_pred (tmp, addr, vX3);

                /* Second iteration of loop unroll */
                vX_0    = c7x::strm_eng<0, CV>::get_adv ();
                vX_N_4  = c7x::strm_eng<0, CV>::get_adv ();
                vX_N_2  = c7x::strm_eng<0, CV>::get_adv ();
                vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();

                vSum1  = vX_0 + vX_N_2;
                vSum2  = vX_N_4 + vX_3N_4;
                vDiff1 = vX_0 - vX_N_2;
                vDiff2 = vX_N_4 - vX_3N_4;

                vTwX1 = c7x::strm_eng<1, CV>::get_adv ();
                vTwX2 = c7x::strm_eng<1, CV>::get_adv ();
                vTwX3 = c7x::strm_eng<1, CV>::get_adv ();

                vX0Temp = vSum1 + vSum2;
                vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
                vX2Temp = vSum1 - vSum2;
                vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);

                vX0 = vX0Temp;
                vX1 = __complex_multiply (vX1Temp, vTwX1);
                vX2 = __complex_multiply (vX2Temp, vTwX2);
                vX3 = __complex_multiply (vX3Temp, vTwX3);

                /* __SA0ADV(CV, pXLocal) = vX0; */
                /* __SA0ADV(CV, pXLocal) = vX2; */
                /* __SA0ADV(CV, pXLocal) = vX1; */
                /* __SA0ADV(CV, pXLocal) = vX3; */

                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
                __vstore_pred (tmp, addr, vX0);

                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
                __vstore_pred (tmp, addr, vX2);

                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
                __vstore_pred (tmp, addr, vX1);

                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
                __vstore_pred (tmp, addr, vX3);
            }
            __SA0_CLOSE ();
            __SE0_CLOSE ();
            __SE1_CLOSE ();

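            /* Advance to the next radix-4 stage: the DFT size drops by a
             * factor of 4, the twiddle pointer moves past the completed
             * stage's table of 3 * (numPointsPerDft / 4) factors, and the
             * number of DFTs (seCnt3) grows by a factor of 4. */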
            numPointsPerDft >>= 2;
            pWLocal += numPointsPerDft * 3;
            seCnt1 >>= 2;
            seCnt2 >>= 2;
            seCnt3 <<= 2;
        }

        if (numPointsPerDft == 16) {
            /* TODO OPT: Use one SE and see compiler behavior.
             * There may be L2 bank conflicts using
             * two SEs separated by 64 bytes */
            /* 16-point stage */
            se0_param = *(
                (__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP2_PARAM_OFFSET));
            __SE0_OPEN ((void *) pXLocal, se0_param);
            __SE1_OPEN ((void *) (pXLocal + 8), se0_param);

            sa0_param = *(
                (__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP2_PARAM_OFFSET));
            __SA0_OPEN (sa0_param);
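
            /* SE0 streams the x[0] and x[N/4] quarters of each 16-point DFT;
             * SE1, opened 8 complex points ahead, streams the x[N/2] and
             * x[3N/4] quarters, so each fetch pair covers two DFTs. */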

#if __C7X_HOSTEM__
            vTwX1 = *((CVP) pWLocal);
            vTwX1 = CV (vTwX1.lo(), vTwX1.lo());
            vTwX2 = *((CVP) (pWLocal + 4));
            vTwX2 = CV (vTwX2.lo(), vTwX2.lo());
            vTwX3 = *((CVP) (pWLocal + 8));
            vTwX3 = CV (vTwX3.lo(), vTwX3.lo());
#else
            vTwX1 = *((CVP) pWLocal);
            vTwX1 = (CV) (vTwX1.lo(), vTwX1.lo());
            vTwX2 = *((CVP) (pWLocal + 4));
            vTwX2 = (CV) (vTwX2.lo(), vTwX2.lo());
            vTwX3 = *((CVP) (pWLocal + 8));
            vTwX3 = (CV) (vTwX3.lo(), vTwX3.lo());
#endif
            for (k = 0; k < numPoints; k += 32) {
                vX_0    = c7x::strm_eng<0, CV>::get_adv ();
                vX_N_4  = c7x::strm_eng<0, CV>::get_adv ();
                vX_N_2  = c7x::strm_eng<1, CV>::get_adv ();
                vX_3N_4 = c7x::strm_eng<1, CV>::get_adv ();

                vSum1  = vX_0 + vX_N_2;
                vSum2  = vX_N_4 + vX_3N_4;
                vDiff1 = vX_0 - vX_N_2;
                vDiff2 = vX_N_4 - vX_3N_4;

                vX0Temp = vSum1 + vSum2;
                vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
                vX2Temp = vSum1 - vSum2;
                vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);

                vX0 = vX0Temp;
                vX1 = __complex_multiply (vX1Temp, vTwX1);
                vX2 = __complex_multiply (vX2Temp, vTwX2);
                vX3 = __complex_multiply (vX3Temp, vTwX3);

                /* __SA0ADV(CV, pXLocal) = (CV)(vX0.lo(), vX2.lo()); */
                /* __SA0ADV(CV, pXLocal) = (CV)(vX1.lo(), vX3.lo()); */
                /* __SA0ADV(CV, pXLocal) = (CV)(vX0.hi(), vX2.hi()); */
                /* __SA0ADV(CV, pXLocal) = (CV)(vX1.hi(), vX3.hi()); */

#if __C7X_HOSTEM__
                __vpred tmp;
                CVP     addr;
                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
                __vstore_pred (tmp, addr, CV (vX0.lo(), vX2.lo()));
                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
                __vstore_pred (tmp, addr, CV (vX1.lo(), vX3.lo()));
                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
                __vstore_pred (tmp, addr, CV (vX0.hi(), vX2.hi()));
                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
                __vstore_pred (tmp, addr, CV (vX1.hi(), vX3.hi()));
#else
                __vpred tmp;
                CVP     addr;
                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
                __vstore_pred (tmp, addr, (CV) (vX0.lo(), vX2.lo()));
                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
                __vstore_pred (tmp, addr, (CV) (vX1.lo(), vX3.lo()));
                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
                __vstore_pred (tmp, addr, (CV) (vX0.hi(), vX2.hi()));
                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
                __vstore_pred (tmp, addr, (CV) (vX1.hi(), vX3.hi()));
#endif
            }
            __SA0_CLOSE ();
            __SE0_CLOSE ();
            __SE1_CLOSE ();
        }
        else {
            /* 32-point stage */
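            /* Reached when log2(numPoints) is odd: the radix-4 loop leaves
             * 32-point DFTs. The three twiddle vectors are loaded once,
             * 8 complex factors each at offsets 0, 8 and 16 in pWLocal. */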
            se0_param = *(
                (__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP3_PARAM_OFFSET));
            __SE0_OPEN ((void *) pXLocal, se0_param);

            sa0_param = *(
                (__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA_LOOP3_PARAM_OFFSET));
            __SA0_OPEN (sa0_param);

            vTwX1 = *((CVP) pWLocal);
            vTwX2 = *((CVP) (pWLocal + 8));
            vTwX3 = *((CVP) (pWLocal + 16));

            for (k = 0; k < numPoints; k += 64) {
                vX_0    = c7x::strm_eng<0, CV>::get_adv ();
                vX_N_4  = c7x::strm_eng<0, CV>::get_adv ();
                vX_N_2  = c7x::strm_eng<0, CV>::get_adv ();
                vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();

                vSum1  = vX_0 + vX_N_2;
                vSum2  = vX_N_4 + vX_3N_4;
                vDiff1 = vX_0 - vX_N_2;
                vDiff2 = vX_N_4 - vX_3N_4;

                vX0Temp = vSum1 + vSum2;
                vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
                vX2Temp = vSum1 - vSum2;
                vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);

                vX0 = vX0Temp;
                vX1 = __complex_multiply (vX1Temp, vTwX1);
                vX2 = __complex_multiply (vX2Temp, vTwX2);
                vX3 = __complex_multiply (vX3Temp, vTwX3);

                /* __SA0ADV(CV, pXLocal) = vX0; */
                /* __SA0ADV(CV, pXLocal) = vX2; */
                /* __SA0ADV(CV, pXLocal) = vX1; */
                /* __SA0ADV(CV, pXLocal) = vX3; */

                __vpred tmp;
                CVP     addr;
                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
                __vstore_pred (tmp, addr, vX0);

                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
                __vstore_pred (tmp, addr, vX2);

                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
                __vstore_pred (tmp, addr, vX1);

                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
                __vstore_pred (tmp, addr, vX3);

                vX_0    = c7x::strm_eng<0, CV>::get_adv ();
                vX_N_4  = c7x::strm_eng<0, CV>::get_adv ();
                vX_N_2  = c7x::strm_eng<0, CV>::get_adv ();
                vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();

                vSum1  = vX_0 + vX_N_2;
                vSum2  = vX_N_4 + vX_3N_4;
                vDiff1 = vX_0 - vX_N_2;
                vDiff2 = vX_N_4 - vX_3N_4;

                vX0Temp = vSum1 + vSum2;
                vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
                vX2Temp = vSum1 - vSum2;
                vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);

                vX0 = vX0Temp;
                vX1 = __complex_multiply (vX1Temp, vTwX1);
                vX2 = __complex_multiply (vX2Temp, vTwX2);
                vX3 = __complex_multiply (vX3Temp, vTwX3);

                /* __SA0ADV(CV, pXLocal) = vX0; */
                /* __SA0ADV(CV, pXLocal) = vX2; */
                /* __SA0ADV(CV, pXLocal) = vX1; */
                /* __SA0ADV(CV, pXLocal) = vX3; */

                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
                __vstore_pred (tmp, addr, vX0);

                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
                __vstore_pred (tmp, addr, vX2);

                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
                __vstore_pred (tmp, addr, vX1);

                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
                __vstore_pred (tmp, addr, vX3);
            }
            __SE0_CLOSE ();
            __SA0_CLOSE ();
        }

        numPointsPerDft >>= 2;
        pWLocal += numPointsPerDft * 3;

        if (numPointsPerDft == 4) {
            /* 4-point stage with bit-reversal */
            se0_param = *(
                (__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP4_PARAM_OFFSET));
            __SE0_OPEN ((void *) pXLocal, se0_param);

            numLeadingZeros = __norm ((int32_t) (numPoints - 1)) + 1;
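            /* For numPoints = 2^n, numLeadingZeros = 32 - n, so
             * __bit_reverse (k) >> numLeadingZeros yields the n-bit
             * bit-reversal of k, addressing the output in natural order. */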

            /* pY0 = &pYLocal[0x00000000u]; */
            /* pY1 = &pYLocal[0x40000000u >> numLeadingZeros]; */
            /* pY2 = &pYLocal[0x80000000u >> numLeadingZeros]; */
            /* pY3 = &pYLocal[0xC0000000u >> numLeadingZeros]; */

            pY0 = (cfloat *) (pY + 0);
            pY1 = (cfloat *) (pY + ((0x40000000u >> numLeadingZeros) << 1));
            pY2 = (cfloat *) (pY + ((0x80000000u >> numLeadingZeros) << 1));
            pY3 = (cfloat *) (pY + ((0xC0000000u >> numLeadingZeros) << 1));

#ifdef LAST_LOOP_UNROLL
            /* pY4 = &pYLocal[0x20000000u >> numLeadingZeros]; */
            /* pY5 = &pYLocal[0x60000000u >> numLeadingZeros]; */
            /* pY6 = &pYLocal[0xA0000000u >> numLeadingZeros]; */
            /* pY7 = &pYLocal[0xE0000000u >> numLeadingZeros]; */

            pY4 = (cfloat *) (pY + ((0x20000000u >> numLeadingZeros) << 1));
            pY5 = (cfloat *) (pY + ((0x60000000u >> numLeadingZeros) << 1));
            pY6 = (cfloat *) (pY + ((0xA0000000u >> numLeadingZeros) << 1));
            pY7 = (cfloat *) (pY + ((0xE0000000u >> numLeadingZeros) << 1));
#endif

#ifdef LAST_LOOP_UNROLL
            for (k = 0; k < numPoints >> 3; k += 8)
#else
            for (k = 0; k < numPoints >> 3; k += 4)
#endif
            {
                offsetBitReverse = __bit_reverse (k) >> numLeadingZeros;

                vX_0    = c7x::strm_eng<0, CV>::get_adv ();
                vX_N_4  = c7x::strm_eng<0, CV>::get_adv ();
                vX_N_2  = c7x::strm_eng<0, CV>::get_adv ();
                vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();

                vSum1  = vX_0 + vX_N_2;
                vSum2  = vX_N_4 + vX_3N_4;
                vDiff1 = vX_0 - vX_N_2;
                vDiff2 = vX_N_4 - vX_3N_4;

                vX0 = vSum1 + vSum2;
                vX1 = vDiff1 - __vcrot90sp_vv (vDiff2);
                vX2 = vSum1 - vSum2;
                vX3 = vDiff1 + __vcrot90sp_vv (vDiff2);

                __vstore_reverse_bit ((CVP) (pY0 + offsetBitReverse), vX0);
                __vstore_reverse_bit ((CVP) (pY1 + offsetBitReverse), vX1);
                __vstore_reverse_bit ((CVP) (pY2 + offsetBitReverse), vX2);
                __vstore_reverse_bit ((CVP) (pY3 + offsetBitReverse), vX3);

#ifdef LAST_LOOP_UNROLL
                vX_0_1    = c7x::strm_eng<0, CV>::get_adv ();
                vX_N_4_1  = c7x::strm_eng<0, CV>::get_adv ();
                vX_N_2_1  = c7x::strm_eng<0, CV>::get_adv ();
                vX_3N_4_1 = c7x::strm_eng<0, CV>::get_adv ();

                vSum1_1  = vX_0_1 + vX_N_2_1;
                vSum2_1  = vX_N_4_1 + vX_3N_4_1;
                vDiff1_1 = vX_0_1 - vX_N_2_1;
                vDiff2_1 = vX_N_4_1 - vX_3N_4_1;

                vX0_1 = vSum1_1 + vSum2_1;
                vX1_1 = vDiff1_1 - __vcrot90sp_vv (vDiff2_1);
                vX2_1 = vSum1_1 - vSum2_1;
                vX3_1 = vDiff1_1 + __vcrot90sp_vv (vDiff2_1);

                __vstore_reverse_bit ((CVP) (pY4 + offsetBitReverse), vX0_1);
                __vstore_reverse_bit ((CVP) (pY5 + offsetBitReverse), vX1_1);
                __vstore_reverse_bit ((CVP) (pY6 + offsetBitReverse), vX2_1);
                __vstore_reverse_bit ((CVP) (pY7 + offsetBitReverse), vX3_1);
#endif
            }
            __SE0_CLOSE ();
        }
        else {
            /* 4-point stage followed by 2-point stage with bit-reversal */
            se0_param = *(
                (__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_LOOP5_PARAM_OFFSET));
            __SE0_OPEN ((void *) pXLocal, se0_param);

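            /* Final combined stage: radix-4 butterflies on both halves of
             * each 8-point group (the second half twiddled by three scalar
             * factors broadcast across all lanes below), followed by a
             * 2-point DFT across the halves, stored bit-reversed through
             * eight output pointers. */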
            numLeadingZeros = __norm ((int32_t) (numPoints - 1)) + 1;
#if __C7X_HOSTEM__
            pWLocal += 1;
            twTemp = *pWLocal;
            vTwX1  = CV (twTemp, twTemp, twTemp, twTemp, twTemp, twTemp,
                         twTemp, twTemp);
#else
            pWLocal += 1;
            twTemp = *pWLocal;
            vTwX1  = (CV) (twTemp, twTemp, twTemp, twTemp, twTemp, twTemp,
                           twTemp, twTemp);
#endif

#if __C7X_HOSTEM__
            pWLocal += 2;
            twTemp = *pWLocal;
            vTwX2  = CV (twTemp, twTemp, twTemp, twTemp, twTemp, twTemp,
                         twTemp, twTemp);
#else
            pWLocal += 2;
            twTemp = *pWLocal;
            vTwX2  = (CV) (twTemp, twTemp, twTemp, twTemp, twTemp, twTemp,
                           twTemp, twTemp);
#endif
#if __C7X_HOSTEM__
            pWLocal += 2;
            twTemp = *pWLocal;
            vTwX3  = CV (twTemp, twTemp, twTemp, twTemp, twTemp, twTemp,
                         twTemp, twTemp);
#else
            pWLocal += 2;
            twTemp = *pWLocal;
            vTwX3  = (CV) (twTemp, twTemp, twTemp, twTemp, twTemp, twTemp,
                           twTemp, twTemp);
#endif

            /* pY0 = &pYLocal[0x00000000u]; */
            /* pY1 = &pYLocal[0x80000000u >> numLeadingZeros]; */
            /* pY2 = &pYLocal[0x20000000u >> numLeadingZeros]; */
            /* pY3 = &pYLocal[0xA0000000u >> numLeadingZeros]; */
            /* pY4 = &pYLocal[0x40000000u >> numLeadingZeros]; */
            /* pY5 = &pYLocal[0xC0000000u >> numLeadingZeros]; */
            /* pY6 = &pYLocal[0x60000000u >> numLeadingZeros]; */
            /* pY7 = &pYLocal[0xE0000000u >> numLeadingZeros]; */

            pY0 = (cfloat *) (pY + (0x00000000u));
            pY1 = (cfloat *) (pY + ((0x80000000u >> numLeadingZeros) << 1));
            pY2 = (cfloat *) (pY + ((0x20000000u >> numLeadingZeros) << 1));
            pY3 = (cfloat *) (pY + ((0xA0000000u >> numLeadingZeros) << 1));
            pY4 = (cfloat *) (pY + ((0x40000000u >> numLeadingZeros) << 1));
            pY5 = (cfloat *) (pY + ((0xC0000000u >> numLeadingZeros) << 1));
            pY6 = (cfloat *) (pY + ((0x60000000u >> numLeadingZeros) << 1));
            pY7 = (cfloat *) (pY + ((0xE0000000u >> numLeadingZeros) << 1));

            for (k = 0; k < numPoints >> 3; k += 8) {
                offsetBitReverse = __bit_reverse (k) >> numLeadingZeros;

                vX_0      = c7x::strm_eng<0, CV>::get_adv ();
                vX_0_1    = c7x::strm_eng<0, CV>::get_adv ();
                vX_N_4    = c7x::strm_eng<0, CV>::get_adv ();
                vX_N_4_1  = c7x::strm_eng<0, CV>::get_adv ();
                vX_N_2    = c7x::strm_eng<0, CV>::get_adv ();
                vX_N_2_1  = c7x::strm_eng<0, CV>::get_adv ();
                vX_3N_4   = c7x::strm_eng<0, CV>::get_adv ();
                vX_3N_4_1 = c7x::strm_eng<0, CV>::get_adv ();

                vSum1  = vX_0 + vX_N_2;
                vSum2  = vX_N_4 + vX_3N_4;
                vDiff1 = vX_0 - vX_N_2;
                vDiff2 = vX_N_4 - vX_3N_4;

                vX0 = vSum1 + vSum2;
                vX1 = vDiff1 - __vcrot90sp_vv (vDiff2);
                vX2 = vSum1 - vSum2;
                vX3 = vDiff1 + __vcrot90sp_vv (vDiff2);

                vSum1_1  = vX_0_1 + vX_N_2_1;
                vSum2_1  = vX_N_4_1 + vX_3N_4_1;
                vDiff1_1 = vX_0_1 - vX_N_2_1;
                vDiff2_1 = vX_N_4_1 - vX_3N_4_1;

                vX0Temp = vSum1_1 + vSum2_1;
                vX1Temp = vDiff1_1 - __vcrot90sp_vv (vDiff2_1);
                vX2Temp = vSum1_1 - vSum2_1;
                vX3Temp = vDiff1_1 + __vcrot90sp_vv (vDiff2_1);

                vX0_1 = vX0Temp;
                vX1_1 = __complex_multiply (vX1Temp, vTwX1);
                vX2_1 = __complex_multiply (vX2Temp, vTwX2);
                vX3_1 = __complex_multiply (vX3Temp, vTwX3);

                vX0_2PtDft_1 = vX0 + vX0_1;
                vX0_2PtDft_2 = vX0 - vX0_1;
                vX1_2PtDft_1 = vX1 + vX1_1;
                vX1_2PtDft_2 = vX1 - vX1_1;
                vX2_2PtDft_1 = vX2 + vX2_1;
                vX2_2PtDft_2 = vX2 - vX2_1;
                vX3_2PtDft_1 = vX3 + vX3_1;
                vX3_2PtDft_2 = vX3 - vX3_1;

                __vstore_reverse_bit ((CVP) (pY0 + offsetBitReverse), vX0_2PtDft_1);
                __vstore_reverse_bit ((CVP) (pY1 + offsetBitReverse), vX0_2PtDft_2);
                __vstore_reverse_bit ((CVP) (pY2 + offsetBitReverse), vX1_2PtDft_1);
                __vstore_reverse_bit ((CVP) (pY3 + offsetBitReverse), vX1_2PtDft_2);
                __vstore_reverse_bit ((CVP) (pY4 + offsetBitReverse), vX2_2PtDft_1);
                __vstore_reverse_bit ((CVP) (pY5 + offsetBitReverse), vX2_2PtDft_2);
                __vstore_reverse_bit ((CVP) (pY6 + offsetBitReverse), vX3_2PtDft_1);
                __vstore_reverse_bit ((CVP) (pY7 + offsetBitReverse), vX3_2PtDft_2);
            }
            __SE0_CLOSE ();
        }
    }
    return (status);
}

#if (!defined(FFTLIB_REMOVE_CHECK_PARAMS) &&                                   \
     !defined(FFTLIB_FFT1D_I32FC_C32FC_O32FC_REMOVE_CHECK_PARAMS)) ||          \
    (defined(FFTLIB_CHECK_PARAMS)) ||                                         \
    (defined(FFTLIB_FFT1D_I32FC_C32FC_O32FC_CHECK_PARAMS))

FFTLIB_STATUS FFTLIB_fft1d_i32fc_c32fc_o32fc_checkParams (FFTLIB_F32 *pX,
                                                          FFTLIB_bufParams1D_t *bufParamsX,
                                                          FFTLIB_F32 *pW,
                                                          FFTLIB_bufParams1D_t *bufParamsW,
                                                          FFTLIB_F32 *pY,
                                                          FFTLIB_bufParams1D_t *bufParamsY,
                                                          void *pBlock)
{
    FFTLIB_STATUS status = FFTLIB_SUCCESS;

    if ((pX == NULL) || (pW == NULL) || (pY == NULL)) {
        status = FFTLIB_ERR_NULL_POINTER;
    }
    else if (bufParamsX->dim_x != bufParamsW->dim_x ||
             bufParamsX->dim_x != bufParamsY->dim_x) {
        status = FFTLIB_ERR_INVALID_DIMENSION;
    }
    else if (bufParamsX->dim_x < 64 * 2) { /* Minimum number of points is 64 */
        status = FFTLIB_ERR_INVALID_DIMENSION;
    }
    else if ((bufParamsX->data_type != FFTLIB_FLOAT32) ||
             (bufParamsW->data_type != FFTLIB_FLOAT32) ||
             (bufParamsY->data_type != FFTLIB_FLOAT32)) {
        status = FFTLIB_ERR_INVALID_TYPE;
    }
    else if (((uint64_t) pX) & 0xFu) { /* pX must be 16-byte aligned for the
                                        * streaming-engine configuration */
        status = FFTLIB_ERR_NOT_ALIGNED_PTRS_STRIDES;
    }
    else {
        /* Check if number of pts is a power of 2 */
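        /* The loop below finds the index of the lowest set bit in dim_x;
         * dim_x is a power of two exactly when that single bit reproduces
         * dim_x. */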
        uint32_t k = 0;
        while (k < 32) {
            if (bufParamsX->dim_x & (1u << k)) {
                break;
            }
            k++;
        }
        if ((1u << k) != bufParamsX->dim_x) {
            status = FFTLIB_ERR_INVALID_DIMENSION;
        }
    }
    return (status);
}

#endif
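
For orientation, the sketch below shows a plausible calling sequence for these entry points. It is an illustration, not library documentation: the pBlock sizing expression simply mirrors the offset macros defined in this file (a real application should use the block-size definition published by FFTLIB), and generation of the twiddle-factor array pW, in the layout the kernel consumes above, is assumed to happen elsewhere.

/* Hedged usage sketch: 256-point forward FFT of interleaved complex floats.
 * Assumes pW holds twiddle factors in the layout consumed above (3*N/4
 * factors for the first stage, then each following stage's table); the
 * generator is not part of this file. */
#include "FFTLIB_fft1d_i32fc_c32fc_o32fc.h"

#define N_PTS (256) /* complex points; dim_x counts floats, i.e. 2 * N_PTS */

int fft256_example (void)
{
    /* 16-byte alignment on the input is required by checkParams above */
    static FFTLIB_F32 x[2 * N_PTS] __attribute__ ((aligned (16)));
    static FFTLIB_F32 w[2 * N_PTS]; /* twiddle factors (precomputed) */
    static FFTLIB_F32 y[2 * N_PTS]; /* output, natural order */
    /* Room for the 6 SE + 3 SA templates written by the init function; this
     * sizing expression mirrors the offset defines above (an assumption) */
    static uint8_t block[SA_LOOP3_PARAM_OFFSET + SA_PARAM_SIZE];

    FFTLIB_bufParams1D_t prmX, prmW, prmY;
    prmX.data_type = FFTLIB_FLOAT32;
    prmX.dim_x     = 2 * N_PTS;
    prmW           = prmX;
    prmY           = prmX;

    FFTLIB_STATUS status = FFTLIB_fft1d_i32fc_c32fc_o32fc_init (
        x, &prmX, w, &prmW, y, &prmY, (void *) block);
    if (status == FFTLIB_SUCCESS) {
        status = FFTLIB_fft1d_i32fc_c32fc_o32fc_kernel (
            x, &prmX, w, &prmW, y, &prmY, (void *) block);
    }
    return (status == FFTLIB_SUCCESS) ? 0 : -1;
}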