#include "../FFTLIB_fft1dBatched_i32fc_c32fc_o32fc.h"
#include "../../../common/printv.h"

#define SE_PARAM_BASE         (0x0000)
#define SE_LOOP1_PARAM_OFFSET (SE_PARAM_BASE)
#define SE_LOOP2_PARAM_OFFSET (SE_LOOP1_PARAM_OFFSET + SE_PARAM_SIZE)
#define SE_LOOP3_PARAM_OFFSET (SE_LOOP2_PARAM_OFFSET + SE_PARAM_SIZE)
#define SE_LOOP4_PARAM_OFFSET (SE_LOOP3_PARAM_OFFSET + SE_PARAM_SIZE)
#define SE_LOOP5_PARAM_OFFSET (SE_LOOP4_PARAM_OFFSET + SE_PARAM_SIZE)
#define SE_LOOP6_PARAM_OFFSET (SE_LOOP5_PARAM_OFFSET + SE_PARAM_SIZE)
#define SE_LOOP7_PARAM_OFFSET (SE_LOOP6_PARAM_OFFSET + SE_PARAM_SIZE)
#define SE_TWID_PARAM_OFFSET  (SE_LOOP7_PARAM_OFFSET + SE_PARAM_SIZE)
#define SA_LOOP1_PARAM_OFFSET (SE_TWID_PARAM_OFFSET + SE_PARAM_SIZE)
#define SA_LOOP2_PARAM_OFFSET (SA_LOOP1_PARAM_OFFSET + SA_PARAM_SIZE)
#define SA_LOOP3_PARAM_OFFSET (SA_LOOP2_PARAM_OFFSET + SA_PARAM_SIZE)
#define SA_LOOP4_PARAM_OFFSET (SA_LOOP3_PARAM_OFFSET + SA_PARAM_SIZE)
#define SA_LOOP6_PARAM_OFFSET (SA_LOOP4_PARAM_OFFSET + SA_PARAM_SIZE)
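
/*
 * The caller-provided pBlock scratch area holds one precomputed streaming
 * engine (SE) or streaming address generator (SA) template per processing
 * loop, laid out back to back at the offsets defined above. _init() builds
 * the templates so _kernel() can reload them from pBlock before each loop.
 */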
#if defined(FFTLIB_CHECK_PARAMS) ||                                           \
    defined(FFTLIB_FFT1DBATCHED_I32FC_C32FC_O32FC_CHECK_PARAMS)
    uint32_t numPointsPerDft;
    uint32_t seCnt1, seCnt2, seCnt3, seCnt4;
    uint32_t seCnt6, seCnt7, seCnt8, seCnt9, seCnt10;
    uint32_t seCnt11;

    __SE_TEMPLATE_v1 se0_param = __gen_SE_TEMPLATE_v1 ();
    __SE_TEMPLATE_v1 se1_param = __gen_SE_TEMPLATE_v1 ();
    __SA_TEMPLATE_v1 sa0_param = __gen_SA_TEMPLATE_v1 ();

    numPointsPerDft = numPoints;
    seCnt1 = numPoints >> 2;
    seCnt2 = numPoints >> 5;
    seCnt4 = numPoints >> 3;
    seCnt6 = seCnt3 * numChannels;
    seCnt7 =
        (numPoints * numChannels >> 5) > 1 ? numPoints * numChannels >> 5 : 1;
    seCnt8 = numPoints * numChannels;
    seCnt9 = (numPoints * numChannels > 32) ? numPoints * numChannels : 32;
    seCnt10 =
        (numPoints * numChannels >> 6) > 1 ? numPoints * numChannels >> 6 : 1;
    seCnt11 = (numPoints * numChannels > 64) ? numPoints * numChannels : 64;
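
    /*
     * Build the SE/SA templates for each processing loop so the kernel can
     * reload them from pBlock at the *_PARAM_OFFSET locations. The first
     * group below is the 4-D access pattern presumably consumed by the
     * radix-4 stage loop (numPointsPerDft >= 64) in the kernel.
     */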
    se0_param       = __gen_SE_TEMPLATE_v1 ();
    se0_param.DIM1  = seCnt1;
    se0_param.ICNT2 = seCnt2;
    se0_param.ICNT3 = seCnt6;
    se0_param.DIM3  = numPointsPerDft;
    se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
    se0_param.VECLEN  = c7x::se_veclen<c7x::cfloat_vec>::value;
    se0_param.DIMFMT  = __SE_DIMFMT_4D;
    se1_param       = __gen_SE_TEMPLATE_v1 ();
    se1_param.DIM1  = seCnt1;
    se1_param.ICNT2 = seCnt2;
    se1_param.ICNT3 = seCnt6;
    se1_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
    se1_param.VECLEN  = c7x::se_veclen<c7x::cfloat_vec>::value;
    se1_param.DIMFMT  = __SE_DIMFMT_4D;
    sa0_param       = __gen_SA_TEMPLATE_v1 ();
    sa0_param.DIM1  = seCnt1;
    sa0_param.ICNT2 = seCnt2;
    sa0_param.ICNT3 = seCnt6;
    sa0_param.DIM3  = numPointsPerDft;
    sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
    sa0_param.DIMFMT = __SA_DIMFMT_4D;
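
    /*
     * Templates for the later stages in which a single DFT no longer fills a
     * complete vector: the 256-bit transposed stream presumably gathers
     * matching quarters of several short DFTs into one vector, while the 1-D
     * templates that follow cover the whole batch linearly.
     */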
    se0_param       = __gen_SE_TEMPLATE_v1 ();
    se0_param.ICNT2 = seCnt7;
    se0_param.ELETYPE   = __SE_ELETYPE_32BIT_CMPLX_SWAP;
    se0_param.TRANSPOSE = __SE_TRANSPOSE_256BIT;
    se0_param.VECLEN    = c7x::se_veclen<c7x::cfloat_vec>::value;
    se0_param.DIMFMT    = __SE_DIMFMT_3D;
    sa0_param       = __gen_SA_TEMPLATE_v1 ();
    sa0_param.ICNT0 = seCnt8;
    sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
    sa0_param.DIMFMT = __SA_DIMFMT_1D;

    se0_param       = __gen_SE_TEMPLATE_v1 ();
    se0_param.ICNT0 = seCnt8;
    se0_param.ELETYPE = __SE_ELETYPE_32BIT_CMPLX_SWAP;
    se0_param.VECLEN  = c7x::se_veclen<c7x::cfloat_vec>::value;
    se0_param.DIMFMT  = __SE_DIMFMT_1D;

    sa0_param       = __gen_SA_TEMPLATE_v1 ();
    sa0_param.ICNT0 = seCnt8;
    sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
    sa0_param.DIMFMT = __SA_DIMFMT_1D;
    se0_param       = __gen_SE_TEMPLATE_v1 ();
    se0_param.ICNT2 = seCnt7;
    se0_param.ELETYPE   = __SE_ELETYPE_32BIT_CMPLX_SWAP;
    se0_param.TRANSPOSE = __SE_TRANSPOSE_64BIT;
    se0_param.VECLEN    = c7x::se_veclen<c7x::cfloat_vec>::value;
    se0_param.DIMFMT    = __SE_DIMFMT_3D;

    sa0_param       = __gen_SA_TEMPLATE_v1 ();
    sa0_param.ICNT0 = seCnt9;
    sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
    sa0_param.DIMFMT = __SA_DIMFMT_1D;
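
    /*
     * Remaining templates appear to serve the output stages: 3-D streams
     * with a 64-bit transpose feed the final butterflies, and the variants
     * with ICNT2 = numChannels / DIM2 = numPoints walk each channel's block
     * of numPoints separately.
     */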
    se0_param       = __gen_SE_TEMPLATE_v1 ();
    se0_param.ICNT0 = seCnt4;
    se0_param.ICNT2 = numChannels;
    se0_param.DIM2  = numPoints;
    se0_param.ELETYPE   = __SE_ELETYPE_32BIT_CMPLX_SWAP;
    se0_param.TRANSPOSE = __SE_TRANSPOSE_64BIT;
    se0_param.VECLEN    = c7x::se_veclen<c7x::cfloat_vec>::value;
    se0_param.DIMFMT    = __SE_DIMFMT_3D;
    se0_param       = __gen_SE_TEMPLATE_v1 ();
    se0_param.ICNT2 = seCnt10;
    se0_param.ELETYPE   = __SE_ELETYPE_32BIT_CMPLX_SWAP;
    se0_param.TRANSPOSE = __SE_TRANSPOSE_64BIT;
    se0_param.VECLEN    = c7x::se_veclen<c7x::cfloat_vec>::value;
    se0_param.DIMFMT    = __SE_DIMFMT_3D;

    sa0_param       = __gen_SA_TEMPLATE_v1 ();
    sa0_param.ICNT0 = seCnt11;
    sa0_param.VECLEN = c7x::sa_veclen<c7x::cfloat_vec>::value;
    sa0_param.DIMFMT = __SA_DIMFMT_1D;

    se0_param       = __gen_SE_TEMPLATE_v1 ();
    se0_param.ICNT0 = seCnt4;
    se0_param.ICNT2 = numChannels;
    se0_param.DIM2  = numPoints;
    se0_param.ELETYPE   = __SE_ELETYPE_32BIT_CMPLX_SWAP;
    se0_param.TRANSPOSE = __SE_TRANSPOSE_64BIT;
    se0_param.VECLEN    = c7x::se_veclen<c7x::cfloat_vec>::value;
    se0_param.DIMFMT    = __SE_DIMFMT_3D;
                                                  uint32_t numChannels,

    uint32_t numPointsPerDft;
    uint32_t numLeadingZeros;
    uint32_t offsetBitReverse;
    uint32_t seCnt1, seCnt2, seCnt3, seCnt6;
    uint32_t k, l; /* loop indices */

    __SE_TEMPLATE_v1 se0_param;
    __SE_TEMPLATE_v1 se1_param;
    __SA_TEMPLATE_v1 sa0_param;

    cfloat* restrict pXLocal;
    cfloat* restrict pYLocal;
    cfloat* restrict pWLocal;
    cfloat* restrict pY0;
    cfloat* restrict pY1;
    cfloat* restrict pY2;
    cfloat* restrict pY3;
    cfloat* restrict pY4;
    cfloat* restrict pY5;
    cfloat* restrict pY6;
    cfloat* restrict pY7;
    typedef typename c7x::cfloat_vec CV;
    typedef CV *CVP; /* vector pointer type used by the casts below */
    CV vX_0, vX_N_4, vX_N_2, vX_3N_4;
    CV vSum1, vSum2, vDiff1, vDiff2;
    CV vTwX1, vTwX2, vTwX3;
    CV vX0Temp, vX1Temp, vX2Temp, vX3Temp;
    CV vX0, vX1, vX2, vX3;
    CV vX_0_1, vX_N_4_1, vX_N_2_1, vX_3N_4_1;
    CV vSum1_1, vSum2_1, vDiff1_1, vDiff2_1;
    CV vX0_1, vX1_1, vX2_1, vX3_1;
    CV vX0_2PtDft_1, vX0_2PtDft_2;
    CV vX1_2PtDft_1, vX1_2PtDft_2;
    CV vX2_2PtDft_1, vX2_2PtDft_2;
    CV vX3_2PtDft_1, vX3_2PtDft_2;
    CV vX01_lo, vX23_lo, vX01_hi, vX23_hi;
#ifdef FFTLIB_CHECK_PARAMS

    numPointsPerDft = numPoints;
    seCnt1 = numPointsPerDft >> 2;
    seCnt2 = numPointsPerDft >> 5;

    pXLocal = (cfloat*) pX;
    pWLocal = (cfloat*) pW;
    pYLocal = (cfloat*) pY;
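
    /*
     * Decimation-in-frequency stage loop: each pass splits every DFT of
     * length numPointsPerDft into four DFTs of a quarter of that length,
     * applying the twiddle factors streamed in through SE1. It runs until
     * the remaining DFTs are short enough for the specialised paths below.
     */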
    while (numPointsPerDft >= 64) {
        seCnt6 = seCnt3 * numChannels;

        se0_param.DIM1  = seCnt1;
        se0_param.ICNT2 = seCnt2;
        se0_param.ICNT3 = seCnt6;
        __SE0_OPEN ((void *) pXLocal, se0_param);

        se1_param.DIM1  = seCnt1;
        se1_param.ICNT3 = seCnt6;
        __SE1_OPEN ((void *) pWLocal, se1_param);

        sa0_param.DIM1  = seCnt1;
        sa0_param.ICNT2 = seCnt2;
        sa0_param.ICNT3 = seCnt6;
        sa0_param.DIM3  = numPointsPerDft;
        __SA0_OPEN (sa0_param);
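
        /*
         * Each iteration consumes 64 complex samples: two radix-4
         * butterflies over 8-lane cfloat vectors. X0 needs no twiddle;
         * X1, X2 and X3 are multiplied by the factors delivered by SE1.
         * Results go back to pXLocal through SA0 in the order X0, X2, X1, X3.
         */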
        for (k = 0; k < numPoints * numChannels; k += 64) {
            vX_0    = c7x::strm_eng<0, CV>::get_adv ();
            vX_N_4  = c7x::strm_eng<0, CV>::get_adv ();
            vX_N_2  = c7x::strm_eng<0, CV>::get_adv ();
            vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();

            vSum1  = vX_0 + vX_N_2;
            vSum2  = vX_N_4 + vX_3N_4;
            vDiff1 = vX_0 - vX_N_2;
            vDiff2 = vX_N_4 - vX_3N_4;

            vTwX1 = c7x::strm_eng<1, CV>::get_adv ();
            vTwX2 = c7x::strm_eng<1, CV>::get_adv ();
            vTwX3 = c7x::strm_eng<1, CV>::get_adv ();

            vX0Temp = vSum1 + vSum2;
            vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
            vX2Temp = vSum1 - vSum2;
            vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);

            vX1 = __complex_multiply (vX1Temp, vTwX1);
            vX2 = __complex_multiply (vX2Temp, vTwX2);
            vX3 = __complex_multiply (vX3Temp, vTwX3);

            tmp  = c7x::strm_agen<0, CV>::get_vpred ();
            addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
            __vstore_pred (tmp, addr, vX0);

            tmp  = c7x::strm_agen<0, CV>::get_vpred ();
            addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
            __vstore_pred (tmp, addr, vX2);

            tmp  = c7x::strm_agen<0, CV>::get_vpred ();
            addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
            __vstore_pred (tmp, addr, vX1);

            tmp  = c7x::strm_agen<0, CV>::get_vpred ();
            addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
            __vstore_pred (tmp, addr, vX3);
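
            /* Second half of the 2x-unrolled body: the same butterfly on the
             * next 32 complex samples. */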
            vX_0    = c7x::strm_eng<0, CV>::get_adv ();
            vX_N_4  = c7x::strm_eng<0, CV>::get_adv ();
            vX_N_2  = c7x::strm_eng<0, CV>::get_adv ();
            vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();

            vSum1  = vX_0 + vX_N_2;
            vSum2  = vX_N_4 + vX_3N_4;
            vDiff1 = vX_0 - vX_N_2;
            vDiff2 = vX_N_4 - vX_3N_4;

            vTwX1 = c7x::strm_eng<1, CV>::get_adv ();
            vTwX2 = c7x::strm_eng<1, CV>::get_adv ();
            vTwX3 = c7x::strm_eng<1, CV>::get_adv ();

            vX0Temp = vSum1 + vSum2;
            vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
            vX2Temp = vSum1 - vSum2;
            vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);

            vX1 = __complex_multiply (vX1Temp, vTwX1);
            vX2 = __complex_multiply (vX2Temp, vTwX2);
            vX3 = __complex_multiply (vX3Temp, vTwX3);

            tmp  = c7x::strm_agen<0, CV>::get_vpred ();
            addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
            __vstore_pred (tmp, addr, vX0);

            tmp  = c7x::strm_agen<0, CV>::get_vpred ();
            addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
            __vstore_pred (tmp, addr, vX2);

            tmp  = c7x::strm_agen<0, CV>::get_vpred ();
            addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
            __vstore_pred (tmp, addr, vX1);

            tmp  = c7x::strm_agen<0, CV>::get_vpred ();
            addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
            __vstore_pred (tmp, addr, vX3);

        numPointsPerDft >>= 2;
        pWLocal += numPointsPerDft * 3;
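
    /*
     * After the stage loop numPointsPerDft has dropped below 64 (16 or 32
     * for the supported sizes). These short DFTs no longer fill an 8-lane
     * vector per quarter, so the last stages use the specialised paths
     * below with the transposed stream templates prepared by _init().
     */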
    if (numPointsPerDft == 16) {
        __SE0_OPEN ((void *) pXLocal, se0_param);
        __SE1_OPEN ((void *) (pXLocal + 8), se0_param);
        __SA0_OPEN (sa0_param);
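
        /*
         * 16-point DFT stage: SE1 starts one vector (8 cfloats) after SE0,
         * so the two streams together supply all four quarters of two DFTs
         * per iteration. Only the low half of each twiddle vector is unique,
         * hence the .lo() broadcast below.
         */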
        vTwX1 = *((CVP) pWLocal);
        vTwX2 = *((CVP) (pWLocal + 4));
        vTwX3 = *((CVP) (pWLocal + 8));

        vTwX1 = CV (vTwX1.lo(), vTwX1.lo());
        vTwX2 = CV (vTwX2.lo(), vTwX2.lo());
        vTwX3 = CV (vTwX3.lo(), vTwX3.lo());

        vTwX1 = (CV) (vTwX1.lo(), vTwX1.lo());
        vTwX2 = (CV) (vTwX2.lo(), vTwX2.lo());
        vTwX3 = (CV) (vTwX3.lo(), vTwX3.lo());
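
        /* Both assignment styles above replicate the 4-element twiddle set
         * across the two vector halves; they are presumably selected by a
         * surrounding compiler-version #if, as are the duplicated store
         * sequences further down. */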
        for (k = 0; k < numPoints * numChannels; k += 32) {
            vX_0    = c7x::strm_eng<0, CV>::get_adv ();
            vX_N_4  = c7x::strm_eng<0, CV>::get_adv ();
            vX_N_2  = c7x::strm_eng<1, CV>::get_adv ();
            vX_3N_4 = c7x::strm_eng<1, CV>::get_adv ();

            vSum1  = vX_0 + vX_N_2;
            vSum2  = vX_N_4 + vX_3N_4;
            vDiff1 = vX_0 - vX_N_2;
            vDiff2 = vX_N_4 - vX_3N_4;

            vX0Temp = vSum1 + vSum2;
            vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
            vX2Temp = vSum1 - vSum2;
            vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);

            vX1 = __complex_multiply (vX1Temp, vTwX1);
            vX2 = __complex_multiply (vX2Temp, vTwX2);
            vX3 = __complex_multiply (vX3Temp, vTwX3);
            __vpred tmp = c7x::strm_agen<0, CV>::get_vpred ();
            addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
            __vstore_pred (tmp, addr, CV (vX0.lo(), vX2.lo()));

            tmp  = c7x::strm_agen<0, CV>::get_vpred ();
            addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
            __vstore_pred (tmp, addr, CV (vX1.lo(), vX3.lo()));

            tmp  = c7x::strm_agen<0, CV>::get_vpred ();
            addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
            __vstore_pred (tmp, addr, CV (vX0.hi(), vX2.hi()));

            tmp  = c7x::strm_agen<0, CV>::get_vpred ();
            addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
            __vstore_pred (tmp, addr, CV (vX1.hi(), vX3.hi()));

            __vpred tmp = c7x::strm_agen<0, CV>::get_vpred ();
            addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
            __vstore_pred (tmp, addr, (CV) (vX0.lo(), vX2.lo()));

            tmp  = c7x::strm_agen<0, CV>::get_vpred ();
            addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
            __vstore_pred (tmp, addr, (CV) (vX1.lo(), vX3.lo()));

            tmp  = c7x::strm_agen<0, CV>::get_vpred ();
            addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
            __vstore_pred (tmp, addr, (CV) (vX0.hi(), vX2.hi()));

            tmp  = c7x::strm_agen<0, CV>::get_vpred ();
            addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
            __vstore_pred (tmp, addr, (CV) (vX1.hi(), vX3.hi()));
        __SE0_OPEN ((void *) pXLocal, se0_param);
        __SA0_OPEN (sa0_param);

        vTwX1 = *((CVP) pWLocal);
        vTwX2 = *((CVP) (pWLocal + 8));
        vTwX3 = *((CVP) (pWLocal + 16));
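
        /*
         * Remaining case (numPointsPerDft is presumably 32 here): a single
         * stream engine supplies all four quarters and each twiddle factor
         * fills a full 8-lane vector, so no broadcast is needed.
         */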
        for (k = 0; k < numPoints * numChannels; k += 32) {
            vX_0    = c7x::strm_eng<0, CV>::get_adv ();
            vX_N_4  = c7x::strm_eng<0, CV>::get_adv ();
            vX_N_2  = c7x::strm_eng<0, CV>::get_adv ();
            vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();

            vSum1  = vX_0 + vX_N_2;
            vSum2  = vX_N_4 + vX_3N_4;
            vDiff1 = vX_0 - vX_N_2;
            vDiff2 = vX_N_4 - vX_3N_4;

            vX0Temp = vSum1 + vSum2;
            vX1Temp = vDiff1 - __vcrot90sp_vv (vDiff2);
            vX2Temp = vSum1 - vSum2;
            vX3Temp = vDiff1 + __vcrot90sp_vv (vDiff2);

            vX1 = __complex_multiply (vX1Temp, vTwX1);
            vX2 = __complex_multiply (vX2Temp, vTwX2);
            vX3 = __complex_multiply (vX3Temp, vTwX3);

            __vpred tmp = c7x::strm_agen<0, CV>::get_vpred ();
            addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
            __vstore_pred (tmp, addr, vX0);

            tmp  = c7x::strm_agen<0, CV>::get_vpred ();
            addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
            __vstore_pred (tmp, addr, vX2);

            tmp  = c7x::strm_agen<0, CV>::get_vpred ();
            addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
            __vstore_pred (tmp, addr, vX1);

            tmp  = c7x::strm_agen<0, CV>::get_vpred ();
            addr = c7x::strm_agen<0, CV>::get_adv (pXLocal);
            __vstore_pred (tmp, addr, vX3);
    numPointsPerDft >>= 2;
    pWLocal += numPointsPerDft * 3;
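
    /*
     * Final stage: numPointsPerDft is now 4 or 8. A value of 4 means
     * numPoints is a power of 4 and a plain radix-4 pass finishes the
     * transform; a value of 8 (handled further below) needs the radix-4
     * pass plus one extra 2-point DFT stage.
     */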
    if (numPointsPerDft == 4) {

        if (numPoints == 16) {
            c7x::uchar_vec vXPermCtrl = c7x::uchar_vec (
                0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
                0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
                0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
                0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
                0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
                0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
                0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F);

            c7x::uchar_vec vXPermCtrl = (c7x::uchar_vec) (
                0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
                0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
                0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
                0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
                0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
                0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
                0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F);
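
            /*
             * Byte-permute control used with __vpermll/__vpermhh below: it
             * appears to reorder 8-byte (one cfloat) chunks of the butterfly
             * output vectors so the results can be stored contiguously in
             * the required output order. The two initializer styles above
             * presumably sit under a compiler-version #if.
             */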
            se0_param = *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock +
            __SE0_OPEN ((void *) pXLocal, se0_param);

            sa0_param = *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock +
            __SA0_OPEN (sa0_param);
            for (k = 0; k < numChannels << 4; k += 32) {
                vX_0    = c7x::strm_eng<0, CV>::get_adv ();
                vX_N_4  = c7x::strm_eng<0, CV>::get_adv ();
                vX_N_2  = c7x::strm_eng<0, CV>::get_adv ();
                vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();

                vSum1  = vX_0 + vX_N_2;
                vSum2  = vX_N_4 + vX_3N_4;
                vDiff1 = vX_0 - vX_N_2;
                vDiff2 = vX_N_4 - vX_3N_4;

                vX0 = vSum1 + vSum2;
                vX1 = vDiff1 - __vcrot90sp_vv (vDiff2);
                vX2 = vSum1 - vSum2;
                vX3 = vDiff1 + __vcrot90sp_vv (vDiff2);

                vX01_lo = c7x::as_cfloat_vec (
                    __vpermll_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX1),
                                    c7x::as_uchar_vec (vX0)));
                vX23_lo = c7x::as_cfloat_vec (
                    __vpermll_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX3),
                                    c7x::as_uchar_vec (vX2)));
                vX01_hi = c7x::as_cfloat_vec (
                    __vpermhh_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX1),
                                    c7x::as_uchar_vec (vX0)));
                vX23_hi = c7x::as_cfloat_vec (
                    __vpermhh_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX3),
                                    c7x::as_uchar_vec (vX2)));

                __vpred tmp = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
                __vstore_pred (tmp, addr, vX01_lo);

                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
                __vstore_pred (tmp, addr, vX23_lo);

                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
                __vstore_pred (tmp, addr, vX01_hi);

                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
                __vstore_pred (tmp, addr, vX23_hi);
            se0_param = *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock +
            __SE0_OPEN ((void *) pXLocal, se0_param);
            numLeadingZeros = __norm ((int32_t) (numPoints - 1)) + 1;

            pY0 = (cfloat*) (pY + 0);
            pY1 = (cfloat*) (pY + ((0x40000000u >> numLeadingZeros) << 1));
            pY2 = (cfloat*) (pY + ((0x80000000u >> numLeadingZeros) << 1));
            pY3 = (cfloat*) (pY + ((0xC0000000u >> numLeadingZeros) << 1));
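
            /*
             * pY0..pY3 are the base addresses of the four output regions for
             * the bit-reversed stores; once numLeadingZeros has been derived
             * from numPoints, the shifted constants evaluate to fixed
             * fractions of numPoints.
             */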
#ifdef CL7X_HE_CFLOAT_PTR_BUG
            float *myPY0 = (float *) pY0;
            float *myPY1 = (float *) pY1;
            float *myPY2 = (float *) pY2;
            float *myPY3 = (float *) pY3;
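
            /* Shadow float pointers: apparently a workaround for a host
             * emulation issue with cfloat pointer arithmetic (per the macro
             * name); they are advanced in float units and cast back below. */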
            for (l = 0; l < numChannels; l++) {
                for (k = 0; k < numPoints >> 3; k += 4) {
                    offsetBitReverse = __bit_reverse (k) >> numLeadingZeros;

                    vX_0    = c7x::strm_eng<0, CV>::get_adv ();
                    vX_N_4  = c7x::strm_eng<0, CV>::get_adv ();
                    vX_N_2  = c7x::strm_eng<0, CV>::get_adv ();
                    vX_3N_4 = c7x::strm_eng<0, CV>::get_adv ();

                    vSum1  = vX_0 + vX_N_2;
                    vSum2  = vX_N_4 + vX_3N_4;
                    vDiff1 = vX_0 - vX_N_2;
                    vDiff2 = vX_N_4 - vX_3N_4;

                    vX0 = vSum1 + vSum2;
                    vX1 = vDiff1 - __vcrot90sp_vv (vDiff2);
                    vX2 = vSum1 - vSum2;
                    vX3 = vDiff1 + __vcrot90sp_vv (vDiff2);
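
                    /* The bit-reversed vector stores below, combined with
                     * the bit-reversed base offset computed from k, write the
                     * last-stage results directly in final output order. */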
                    __vstore_reverse_bit ((CVP) (pY0 + offsetBitReverse), vX0);
                    __vstore_reverse_bit ((CVP) (pY1 + offsetBitReverse), vX1);
                    __vstore_reverse_bit ((CVP) (pY2 + offsetBitReverse), vX2);
                    __vstore_reverse_bit ((CVP) (pY3 + offsetBitReverse), vX3);
#ifdef CL7X_HE_CFLOAT_PTR_BUG
                myPY0 += (numPoints << 1);
                myPY1 += (numPoints << 1);
                myPY2 += (numPoints << 1);
                myPY3 += (numPoints << 1);

                pY0 = (cfloat*) myPY0;
                pY1 = (cfloat*) myPY1;
                pY2 = (cfloat*) myPY2;
                pY3 = (cfloat*) myPY3;
        vTwX1 = CV (twTemp, twTemp, twTemp, twTemp, twTemp, twTemp, twTemp,
                    twTemp);
        vTwX2 = CV (twTemp, twTemp, twTemp, twTemp, twTemp, twTemp, twTemp,
                    twTemp);
        vTwX3 = CV (twTemp, twTemp, twTemp, twTemp, twTemp, twTemp, twTemp,
                    twTemp);

        vTwX1 = (CV) (twTemp, twTemp, twTemp, twTemp, twTemp, twTemp, twTemp,
                      twTemp);
        vTwX2 = (CV) (twTemp, twTemp, twTemp, twTemp, twTemp, twTemp, twTemp,
                      twTemp);
        vTwX3 = (CV) (twTemp, twTemp, twTemp, twTemp, twTemp, twTemp, twTemp,
                      twTemp);
        if (numPoints == 32) {

            c7x::uchar_vec vXPermCtrl = c7x::uchar_vec (
                0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
                0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
                0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
                0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
                0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
                0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
                0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F);

            c7x::uchar_vec vXPermCtrl = (c7x::uchar_vec) (
                0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
                0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
                0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
                0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
                0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
                0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
                0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F);
            CV vX01_2PtDft_1_lo, vX23_2PtDft_1_lo, vX01_2PtDft_2_lo,
                vX23_2PtDft_2_lo;
            CV vX01_2PtDft_1_hi, vX23_2PtDft_1_hi, vX01_2PtDft_2_hi,
                vX23_2PtDft_2_hi;

            se0_param = *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock +
            __SE0_OPEN ((void *) pXLocal, se0_param);

            sa0_param = *((__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock +
            __SA0_OPEN (sa0_param);
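
            /*
             * 32-point transforms: the final radix-4 pass is fused with the
             * trailing 2-point DFT, so each iteration reads eight vectors
             * (two per quarter), combines them, and permutes the results
             * into output order before storing through SA0.
             */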
            for (k = 0; k < numChannels << 5; k += 64) {
                vX_0      = c7x::strm_eng<0, CV>::get_adv ();
                vX_0_1    = c7x::strm_eng<0, CV>::get_adv ();
                vX_N_4    = c7x::strm_eng<0, CV>::get_adv ();
                vX_N_4_1  = c7x::strm_eng<0, CV>::get_adv ();
                vX_N_2    = c7x::strm_eng<0, CV>::get_adv ();
                vX_N_2_1  = c7x::strm_eng<0, CV>::get_adv ();
                vX_3N_4   = c7x::strm_eng<0, CV>::get_adv ();
                vX_3N_4_1 = c7x::strm_eng<0, CV>::get_adv ();

                vSum1  = vX_0 + vX_N_2;
                vSum2  = vX_N_4 + vX_3N_4;
                vDiff1 = vX_0 - vX_N_2;
                vDiff2 = vX_N_4 - vX_3N_4;

                vX0 = vSum1 + vSum2;
                vX1 = vDiff1 - __vcrot90sp_vv (vDiff2);
                vX2 = vSum1 - vSum2;
                vX3 = vDiff1 + __vcrot90sp_vv (vDiff2);

                vSum1_1  = vX_0_1 + vX_N_2_1;
                vSum2_1  = vX_N_4_1 + vX_3N_4_1;
                vDiff1_1 = vX_0_1 - vX_N_2_1;
                vDiff2_1 = vX_N_4_1 - vX_3N_4_1;

                vX0Temp = vSum1_1 + vSum2_1;
                vX1Temp = vDiff1_1 - __vcrot90sp_vv (vDiff2_1);
                vX2Temp = vSum1_1 - vSum2_1;
                vX3Temp = vDiff1_1 + __vcrot90sp_vv (vDiff2_1);

                vX0_1 = vX0Temp;
                vX1_1 = __complex_multiply (vX1Temp, vTwX1);
                vX2_1 = __complex_multiply (vX2Temp, vTwX2);
                vX3_1 = __complex_multiply (vX3Temp, vTwX3);

                vX0_2PtDft_1 = vX0 + vX0_1;
                vX0_2PtDft_2 = vX0 - vX0_1;
                vX1_2PtDft_1 = vX1 + vX1_1;
                vX1_2PtDft_2 = vX1 - vX1_1;
                vX2_2PtDft_1 = vX2 + vX2_1;
                vX2_2PtDft_2 = vX2 - vX2_1;
                vX3_2PtDft_1 = vX3 + vX3_1;
                vX3_2PtDft_2 = vX3 - vX3_1;
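
                /* The add/subtract pairs above are the trailing 2-point DFTs
                 * that complete the factorisation 32 = 4 * 4 * 2. */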
                vX01_2PtDft_1_lo = c7x::as_cfloat_vec (
                    __vpermll_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX1_2PtDft_1),
                                    c7x::as_uchar_vec (vX0_2PtDft_1)));
                vX23_2PtDft_1_lo = c7x::as_cfloat_vec (
                    __vpermll_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX3_2PtDft_1),
                                    c7x::as_uchar_vec (vX2_2PtDft_1)));
                vX01_2PtDft_2_lo = c7x::as_cfloat_vec (
                    __vpermll_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX1_2PtDft_2),
                                    c7x::as_uchar_vec (vX0_2PtDft_2)));
                vX23_2PtDft_2_lo = c7x::as_cfloat_vec (
                    __vpermll_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX3_2PtDft_2),
                                    c7x::as_uchar_vec (vX2_2PtDft_2)));
                vX01_2PtDft_1_hi = c7x::as_cfloat_vec (
                    __vpermhh_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX1_2PtDft_1),
                                    c7x::as_uchar_vec (vX0_2PtDft_1)));
                vX23_2PtDft_1_hi = c7x::as_cfloat_vec (
                    __vpermhh_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX3_2PtDft_1),
                                    c7x::as_uchar_vec (vX2_2PtDft_1)));
                vX01_2PtDft_2_hi = c7x::as_cfloat_vec (
                    __vpermhh_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX1_2PtDft_2),
                                    c7x::as_uchar_vec (vX0_2PtDft_2)));
                vX23_2PtDft_2_hi = c7x::as_cfloat_vec (
                    __vpermhh_yvvv (vXPermCtrl, c7x::as_uchar_vec (vX3_2PtDft_2),
                                    c7x::as_uchar_vec (vX2_2PtDft_2)));
                __vpred tmp = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
                __vstore_pred (tmp, addr, vX01_2PtDft_1_lo);

                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
                __vstore_pred (tmp, addr, vX23_2PtDft_1_lo);

                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
                __vstore_pred (tmp, addr, vX01_2PtDft_2_lo);

                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
                __vstore_pred (tmp, addr, vX23_2PtDft_2_lo);

                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
                __vstore_pred (tmp, addr, vX01_2PtDft_1_hi);

                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
                __vstore_pred (tmp, addr, vX23_2PtDft_1_hi);

                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
                __vstore_pred (tmp, addr, vX01_2PtDft_2_hi);

                tmp  = c7x::strm_agen<0, CV>::get_vpred ();
                addr = c7x::strm_agen<0, CV>::get_adv (pYLocal);
                __vstore_pred (tmp, addr, vX23_2PtDft_2_hi);
            se0_param = *((__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock +
            __SE0_OPEN ((void *) pXLocal, se0_param);
            numLeadingZeros = __norm ((int32_t) (numPoints - 1)) + 1;

            pY0 = (cfloat*) (pY + (0x00000000u));
            pY1 = (cfloat*) (pY + ((0x80000000u >> numLeadingZeros) << 1));
            pY2 = (cfloat*) (pY + ((0x20000000u >> numLeadingZeros) << 1));
            pY3 = (cfloat*) (pY + ((0xA0000000u >> numLeadingZeros) << 1));
            pY4 = (cfloat*) (pY + ((0x40000000u >> numLeadingZeros) << 1));
            pY5 = (cfloat*) (pY + ((0xC0000000u >> numLeadingZeros) << 1));
            pY6 = (cfloat*) (pY + ((0x60000000u >> numLeadingZeros) << 1));
            pY7 = (cfloat*) (pY + ((0xE0000000u >> numLeadingZeros) << 1));
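
            /*
             * Eight output regions for the fused radix-4 + radix-2 final
             * stage; the constants again reduce to fixed fractions of
             * numPoints, arranged to pair with the bit-reversed stores below.
             */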
#ifdef CL7X_HE_CFLOAT_PTR_BUG
            float *myPY0 = (float *) pY0;
            float *myPY1 = (float *) pY1;
            float *myPY2 = (float *) pY2;
            float *myPY3 = (float *) pY3;
            float *myPY4 = (float *) pY4;
            float *myPY5 = (float *) pY5;
            float *myPY6 = (float *) pY6;
            float *myPY7 = (float *) pY7;
            for (l = 0; l < numChannels; l++) {
                for (k = 0; k < numPoints >> 3; k += 8) {
                    offsetBitReverse = __bit_reverse (k) >> numLeadingZeros;

                    vX_0      = c7x::strm_eng<0, CV>::get_adv ();
                    vX_0_1    = c7x::strm_eng<0, CV>::get_adv ();
                    vX_N_4    = c7x::strm_eng<0, CV>::get_adv ();
                    vX_N_4_1  = c7x::strm_eng<0, CV>::get_adv ();
                    vX_N_2    = c7x::strm_eng<0, CV>::get_adv ();
                    vX_N_2_1  = c7x::strm_eng<0, CV>::get_adv ();
                    vX_3N_4   = c7x::strm_eng<0, CV>::get_adv ();
                    vX_3N_4_1 = c7x::strm_eng<0, CV>::get_adv ();

                    vSum1  = vX_0 + vX_N_2;
                    vSum2  = vX_N_4 + vX_3N_4;
                    vDiff1 = vX_0 - vX_N_2;
                    vDiff2 = vX_N_4 - vX_3N_4;

                    vX0 = vSum1 + vSum2;
                    vX1 = vDiff1 - __vcrot90sp_vv (vDiff2);
                    vX2 = vSum1 - vSum2;
                    vX3 = vDiff1 + __vcrot90sp_vv (vDiff2);

                    vSum1_1  = vX_0_1 + vX_N_2_1;
                    vSum2_1  = vX_N_4_1 + vX_3N_4_1;
                    vDiff1_1 = vX_0_1 - vX_N_2_1;
                    vDiff2_1 = vX_N_4_1 - vX_3N_4_1;

                    vX0Temp = vSum1_1 + vSum2_1;
                    vX1Temp = vDiff1_1 - __vcrot90sp_vv (vDiff2_1);
                    vX2Temp = vSum1_1 - vSum2_1;
                    vX3Temp = vDiff1_1 + __vcrot90sp_vv (vDiff2_1);

                    vX0_1 = vX0Temp;
                    vX1_1 = __complex_multiply (vX1Temp, vTwX1);
                    vX2_1 = __complex_multiply (vX2Temp, vTwX2);
                    vX3_1 = __complex_multiply (vX3Temp, vTwX3);

                    vX0_2PtDft_1 = vX0 + vX0_1;
                    vX0_2PtDft_2 = vX0 - vX0_1;
                    vX1_2PtDft_1 = vX1 + vX1_1;
                    vX1_2PtDft_2 = vX1 - vX1_1;
                    vX2_2PtDft_1 = vX2 + vX2_1;
                    vX2_2PtDft_2 = vX2 - vX2_1;
                    vX3_2PtDft_1 = vX3 + vX3_1;
                    vX3_2PtDft_2 = vX3 - vX3_1;
                    __vstore_reverse_bit ((CVP) (pY0 + offsetBitReverse),
                    __vstore_reverse_bit ((CVP) (pY1 + offsetBitReverse),
                    __vstore_reverse_bit ((CVP) (pY2 + offsetBitReverse),
                    __vstore_reverse_bit ((CVP) (pY3 + offsetBitReverse),
                    __vstore_reverse_bit ((CVP) (pY4 + offsetBitReverse),
                    __vstore_reverse_bit ((CVP) (pY5 + offsetBitReverse),
                    __vstore_reverse_bit ((CVP) (pY6 + offsetBitReverse),
                    __vstore_reverse_bit ((CVP) (pY7 + offsetBitReverse),
#ifdef CL7X_HE_CFLOAT_PTR_BUG
                myPY0 += (numPoints << 1);
                myPY1 += (numPoints << 1);
                myPY2 += (numPoints << 1);
                myPY3 += (numPoints << 1);
                myPY4 += (numPoints << 1);
                myPY5 += (numPoints << 1);
                myPY6 += (numPoints << 1);
                myPY7 += (numPoints << 1);

                pY0 = (cfloat*) myPY0;
                pY1 = (cfloat*) myPY1;
                pY2 = (cfloat*) myPY2;
                pY3 = (cfloat*) myPY3;
                pY4 = (cfloat*) myPY4;
                pY5 = (cfloat*) myPY5;
                pY6 = (cfloat*) myPY6;
                pY7 = (cfloat*) myPY7;
#if (!defined(FFTLIB_REMOVE_CHECK_PARAMS) &&                                   \
     !defined(FFTLIB_FFT1DBATCHED_I32FC_C32FC_O32FC_REMOVE_CHECK_PARAMS)) ||   \
    (defined(FFTLIB_CHECK_PARAMS)) ||                                          \
    (defined(FFTLIB_FFT1DBATCHED_I32FC_C32FC_O32FC_CHECK_PARAMS))
                                                  uint32_t numChannels,

    if ((pX == NULL) || (pW == NULL) || (pY == NULL) || (pBlock == NULL)) {
    else if (bufParamsX->dim_x != bufParamsY->dim_x) {
    else if (bufParamsX->dim_x < numPoints * numChannels * 2) {
    else if (bufParamsX->dim_x < 64 * 2) {
    else if (bufParamsW->dim_x != numPoints * 2) {
    else if (((uint64_t) pX) & 0xFu) {

    if (numPoints & (1u << k)) {
    if ((1u << k) != numPoints) {
    if ((numChannels != 1) && (numChannels != 2) && (numChannels != 4) &&
        (numChannels != 8) && (numChannels != 16)) {