44 #include "../FFTLIB_FFT_dftSmall_ixX_cxX_oxX_priv.h"
45 #include "../../../common/c71/FFTLIB_inlines.h"
/*
 * Offsets describing the layout of a packed parameter block.
 * NOTE(review): presumably these index into the kernel's bufPblock array
 * (pKerPrivArgs->bufPblock) -- confirm against the init/exec code.
 *
 * Layout implied by the arithmetic below, in increasing offset order:
 *   SE0 params | SE1 params | SA0 params | numBlocks | MMA config | MMA offset
 *
 * SE_PARAM_SIZE is defined outside this listing.
 */
/* Start of the streaming-engine parameter area (offset 0). */
53 #define SE_PARAM_BASE (0x0000)
/* SE0 slot sits at the base; SE1 and SA0 follow, each SE_PARAM_SIZE further on. */
54 #define SE_SE0_PARAM_OFFSET (SE_PARAM_BASE)
55 #define SE_SE1_PARAM_OFFSET (SE_SE0_PARAM_OFFSET + SE_PARAM_SIZE)
56 #define SE_SA0_PARAM_OFFSET (SE_SE1_PARAM_OFFSET + SE_PARAM_SIZE)
/* The SA0 slot is also sized with SE_PARAM_SIZE; the numBlocks field comes right after it. */
57 #define NUMBLOCKS_OFFSET (SE_SA0_PARAM_OFFSET + SE_PARAM_SIZE)
/* The MMA area begins sizeof(int32_t) past NUMBLOCKS_OFFSET, i.e. the numBlocks slot is one int32_t wide. */
59 #define MMA_PARAM_BASE (NUMBLOCKS_OFFSET + sizeof(int32_t))
60 #define MMA_PARAM_OFFSET (MMA_PARAM_BASE)
/* The MMA configuration register comes first in the MMA area ... */
61 #define MMA_CONFIGREG_OFFSET (MMA_PARAM_OFFSET)
/* ... followed by the MMA offset register, sizeof(FFTLIB_MMA_CONFIG_REG) later. */
62 #define MMA_OFFSETREG_OFFSET (MMA_CONFIGREG_OFFSET + sizeof(FFTLIB_MMA_CONFIG_REG))
75 uint32_t batchSizePerRow, numRows, numBlocks;
76 uint64_t startupCycles, teardownCycles, computeCycles, operationCycles, overheadCycles;
77 uint64_t storeLatency = 24;
87 batchSizePerRow = MMA_SIZE/(fftSize*2);
88 numRows = batchSize/batchSizePerRow;
89 numRows = numRows*batchSizePerRow == batchSize ?
90 numRows : numRows + 1;
94 numBlocks = numRows/MMA_SIZE;
95 numBlocks = numBlocks*MMA_SIZE == numRows ?
96 numBlocks : numBlocks + 1;
107 computeCycles = numBlocks*MMA_SIZE;
118 *archCycles = startupCycles + operationCycles + teardownCycles;
119 *estCycles = startupCycles + operationCycles + overheadCycles + teardownCycles;
124 template <u
int32_t dataType>
131 uint32_t fftSize = pKerInitArgs->
fftSize;
132 uint32_t batchSize = pKerInitArgs->
batchSize;
144 if (fftSize > (MMA_SIZE >> 1)) {
149 bufParamsX->
dim_x = batchSize*fftSize*2;
150 bufParamsY->
dim_x = batchSize*fftSize*2;
151 bufParamsW->
dim_x = MMA_SIZE*MMA_SIZE;
168 template <
typename dataType>
176 uint32_t batchSizePerRow;
181 dataType *pWLocal = (dataType *)pW;
182 uint32_t fftSize = pKerInitArgs->
fftSize;
183 uint32_t batchSize = pKerInitArgs->
batchSize;
184 uint32_t interleave = pKerInitArgs->
interleave;
187 if (
sizeof(dataType) == 4) {
193 }
else if (
sizeof(dataType) == 2) {
203 if (bufParamsW->
dim_x != bufParamsWLocal.
dim_x) {
208 batchSizePerRow = (MMA_SIZE >> 1)/fftSize;
209 for (j = 0; j < MMA_SIZE; j++) {
210 for (k = 0; k < MMA_SIZE; k++) {
211 pWLocal[j*MMA_SIZE+k] = 0;
214 twF2sScale = ((uint32_t)1 << ((uint32_t)(
sizeof(dataType)*8-2))) - 1;
215 twF2sScale += ((uint32_t)1 << ((uint32_t)(
sizeof(dataType)*8-2)));
218 for (j = 0; j < batchSizePerRow; j++) {
219 for ( k = 0; k < fftSize; k++) {
220 for (l = 0; l < batchSizePerRow; l++) {
221 index = (j*fftSize+k)*2*MMA_SIZE+l*fftSize*2;
222 for (n = 0; n < fftSize; n++) {
227 FFTLIB_UTIL_cos_i64f_oxX<dataType>(2*PI*k*n/fftSize, twF2sScale);
228 pWLocal[index+n*2+1] =
229 -FFTLIB_UTIL_sin_i64f_oxX<dataType>(2*PI*k*n/fftSize, twF2sScale);
230 pWLocal[index+MMA_SIZE+n*2] =
231 FFTLIB_UTIL_sin_i64f_oxX<dataType>(2*PI*k*n/fftSize, twF2sScale);
232 pWLocal[index+MMA_SIZE+n*2+1] =
233 FFTLIB_UTIL_cos_i64f_oxX<dataType>(2*PI*k*n/fftSize, twF2sScale);
240 for (j = 0; j < batchSizePerRow; j++) {
241 for ( k = 0; k < fftSize; k++) {
242 for (l = 0; l < batchSize; l++) {
243 index = (j*fftSize*2+k)*MMA_SIZE+l*fftSize*2;
244 for (n = 0; n < fftSize; n++) {
249 FFTLIB_UTIL_cos_i64f_oxX<dataType>(2*PI*k*n/fftSize, twF2sScale);
250 pWLocal[index+fftSize+n] =
251 -FFTLIB_UTIL_sin_i64f_oxX<dataType>(2*PI*k*n/fftSize, twF2sScale);
252 pWLocal[index+fftSize*MMA_SIZE+n] =
253 FFTLIB_UTIL_sin_i64f_oxX<dataType>(2*PI*k*n/fftSize, twF2sScale);
254 pWLocal[index+fftSize*MMA_SIZE+fftSize+n] =
255 FFTLIB_UTIL_cos_i64f_oxX<dataType>(2*PI*k*n/fftSize, twF2sScale);
276 template <u
int32_t dataType>
285 __SE_TEMPLATE_v1 se0Params;
286 __SE_TEMPLATE_v1 se1Params;
287 __SA_TEMPLATE_v1 sa0Params;
288 uint32_t batchSizePerRow, numRows, numBlocks;
290 __HWA_OFFSET_REG mmaOffset;
293 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
294 uint32_t fftSize = pKerInitArgs->
fftSize;
295 uint32_t batchSize = pKerInitArgs->
batchSize;
298 __SE_ELETYPE SE_ELETYPE;
299 __SE_VECLEN SE_VECLEN;
304 SE_ELETYPE = __SE_ELETYPE_32BIT;
305 SE_VECLEN = __SE_VECLEN_16ELEMS;
309 SE_ELETYPE = __SE_ELETYPE_16BIT;
310 SE_VECLEN = __SE_VECLEN_32ELEMS;
317 batchSizePerRow = (MMA_SIZE >> 1)/fftSize;
318 numRows = batchSize/batchSizePerRow;
319 numRows = numRows*batchSizePerRow == batchSize ?
320 numRows : numRows + 1;
324 numBlocks = numRows/MMA_SIZE;
325 numBlocks = numBlocks*MMA_SIZE == numRows ?
326 numBlocks : numBlocks + 1;
331 se0Params = __gen_SE_TEMPLATE_v1();
332 se0Params.ICNT0 = batchSizePerRow*fftSize*2;
333 se0Params.ICNT1 = MMA_SIZE*numBlocks;
334 se0Params.DIM1 = batchSizePerRow*fftSize*2;
336 se0Params.DECDIM1_WIDTH = fftSize*2*batchSize;
337 se0Params.DECDIM1 = __SE_DECDIM_DIM1;
338 se0Params.ELETYPE = SE_ELETYPE;
339 se0Params.VECLEN = SE_VECLEN;
340 se0Params.DIMFMT = __SE_DIMFMT_2D;
345 se1Params = __gen_SE_TEMPLATE_v1();
346 se1Params.ICNT0 = MMA_SIZE;
347 se1Params.ICNT1 = MMA_SIZE;
348 se1Params.DIM1 = MMA_SIZE;
353 se1Params.ELETYPE = SE_ELETYPE;
354 se1Params.VECLEN = SE_VECLEN;
355 se1Params.DIMFMT = __SE_DIMFMT_3D;
360 sa0Params = __gen_SA_TEMPLATE_v1();
362 sa0Params.ICNT1 = numBlocks*MMA_SIZE;
366 sa0Params.DECDIM1 = __SA_DECDIM_DIM1;
367 sa0Params.VECLEN = __SA_VECLEN_64ELEMS;
368 sa0Params.DIMFMT = __SA_DIMFMT_2D;
373 mmaConfig = mmaConfigDefault;
376 mmaConfig.B_BSWPER = 0xFFFFFFFFu;
378 mmaConfig.C_BSWPER = 0xFFFFFFFFu;
379 mmaConfig.C_CWSWPER = MMA_SIZE;
380 mmaConfig.C_CRSWPER = MMA_SIZE;
381 mmaConfig.C_CRRSTPER = MMA_SIZE;
382 mmaConfig.C_CWRSTPER = MMA_SIZE;
384 mmaConfig.X_SHIFT = shift;
385 mmaConfig.X_CSWPER = MMA_SIZE;
386 mmaConfig.X_CRRSTPER = MMA_SIZE;
420 template <
typename dataType, u
int32_t MMA_SIZE>
423 const void *restrict pX,
424 const void *restrict pW,
427 __SE_TEMPLATE_v1 se0Params;
428 __SE_TEMPLATE_v1 se1Params;
429 __SA_TEMPLATE_v1 sa0Params;
431 __HWA_OFFSET_REG mmaOffset;
436 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
439 __SE1_OPEN((
const void *)pW, se1Params);
443 __HWAOPEN(mmaConfig, mmaOffset, __MMA_OPEN_FSM_RESET);
447 __SE0_OPEN((
const void *)pX, se0Params);
450 __SA0_OPEN(sa0Params);
458 loopCount = (numBlocks-1)*MMA_SIZE;
461 _nassert(loopCount > 0);
464 FFTLIB_UTIL_SA0Cstore_unroll<MMA_SIZE>(MMA_SIZE, (uint8_t *)pY);
466 FFTLIB_UTIL_SA0Cstore_unroll<MMA_SIZE>(MMA_SIZE, (uint8_t *)pY);
479 const void *restrict pX,
480 const void *restrict pW,
485 const void *restrict pX,
486 const void *restrict pW,
template FFTLIB_STATUS FFTLIB_FFT_dftSmall_ixX_cxX_oxX_exec_ci< int32_t, FFTLIB_MMA_SIZE_32_BIT >(FFTLIB_kernelHandle handle, const void *restrict pX, const void *restrict pW, void *restrict pY)
#define MMA_CONFIGREG_OFFSET
FFTLIB_STATUS FFTLIB_FFT_dftSmall_ixX_cxX_oxX_exec_ci(FFTLIB_kernelHandle handle, const void *restrict pX, const void *restrict pW, void *restrict pY)
This function is the main execution function for the C7x implementation of the kernel....
#define SE_SE0_PARAM_OFFSET
template FFTLIB_STATUS FFTLIB_FFT_dftSmall_ixX_cxX_oxX_getSizes< FFTLIB_INT32 >(const FFTLIB_FFT_dftSmall_ixX_cxX_oxX_InitArgs *pKerInitArgs, FFTLIB_bufParams1D_t *bufParamsX, FFTLIB_bufParams1D_t *bufParamsW, FFTLIB_bufParams1D_t *bufParamsY)
template FFTLIB_STATUS FFTLIB_FFT_dftSmall_ixX_cxX_oxX_getSizes< FFTLIB_INT16 >(const FFTLIB_FFT_dftSmall_ixX_cxX_oxX_InitArgs *pKerInitArgs, FFTLIB_bufParams1D_t *bufParamsX, FFTLIB_bufParams1D_t *bufParamsW, FFTLIB_bufParams1D_t *bufParamsY)
#define SE_SE1_PARAM_OFFSET
template FFTLIB_STATUS FFTLIB_FFT_dftSmall_ixX_cxX_oxX_twGen< int16_t >(const FFTLIB_FFT_dftSmall_ixX_cxX_oxX_InitArgs *pKerInitArgs, void *restrict pW, const FFTLIB_bufParams1D_t *bufParamsW)
FFTLIB_STATUS FFTLIB_FFT_dftSmall_ixX_cxX_oxX_init_ci(FFTLIB_kernelHandle handle, const FFTLIB_bufParams1D_t *bufParamsX, const FFTLIB_bufParams1D_t *bufParamsW, const FFTLIB_bufParams1D_t *bufParamsY, const FFTLIB_FFT_dftSmall_ixX_cxX_oxX_InitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
template FFTLIB_STATUS FFTLIB_FFT_dftSmall_ixX_cxX_oxX_init_ci< FFTLIB_INT32 >(FFTLIB_kernelHandle handle, const FFTLIB_bufParams1D_t *bufParamsX, const FFTLIB_bufParams1D_t *bufParamsW, const FFTLIB_bufParams1D_t *bufParamsY, const FFTLIB_FFT_dftSmall_ixX_cxX_oxX_InitArgs *pKerInitArgs)
#define MMA_OFFSETREG_OFFSET
template FFTLIB_STATUS FFTLIB_FFT_dftSmall_ixX_cxX_oxX_exec_ci< int16_t, FFTLIB_MMA_SIZE_16_BIT >(FFTLIB_kernelHandle handle, const void *restrict pX, const void *restrict pW, void *restrict pY)
template FFTLIB_STATUS FFTLIB_FFT_dftSmall_ixX_cxX_oxX_init_ci< FFTLIB_INT16 >(FFTLIB_kernelHandle handle, const FFTLIB_bufParams1D_t *bufParamsX, const FFTLIB_bufParams1D_t *bufParamsW, const FFTLIB_bufParams1D_t *bufParamsY, const FFTLIB_FFT_dftSmall_ixX_cxX_oxX_InitArgs *pKerInitArgs)
FFTLIB_STATUS FFTLIB_FFT_dftSmall_ixX_cxX_oxX_twGen(const FFTLIB_FFT_dftSmall_ixX_cxX_oxX_InitArgs *pKerInitArgs, void *restrict pW, const FFTLIB_bufParams1D_t *bufParamsW)
This is a utility function that generates the DFT computational matrix into the provided buffer.
template FFTLIB_STATUS FFTLIB_FFT_dftSmall_ixX_cxX_oxX_twGen< int32_t >(const FFTLIB_FFT_dftSmall_ixX_cxX_oxX_InitArgs *pKerInitArgs, void *restrict pW, const FFTLIB_bufParams1D_t *bufParamsW)
#define SE_SA0_PARAM_OFFSET
static void FFTLIB_UTIL_SE0AloadComputeCSA0Cstore(int32_t numRows, uint8_t *CbackDestination)
static void FFTLIB_UTIL_SE0AloadComputeC(int32_t numRows)
Use streaming engine 0 to load numRows rows of A and compute C(row) = A(row) x Bfore(:,:).
static void FFTLIB_UTIL_SE1Bload(int32_t numRows)
Load numRows rows of B into the Bload buffer.
static int32_t FFTLIB_sizeof(uint32_t type)
Inline function that returns the number of bytes per element for a given type of _FFTLIB_data_type_e.
double FFTLIB_D64
Double precision floating point.
void * FFTLIB_kernelHandle
Handle type for FFTLIB operations.
FFTLIB_STATUS_NAME
The enumeration of all status codes.
@ FFTLIB_ERR_NOT_IMPLEMENTED
@ FFTLIB_ERR_INVALID_DIMENSION
const __HWA_OFFSET_REG offsetRegStruct_zeros
const FFTLIB_MMA_CONFIG_REG configRegisterStruct_i16s_i16s_o16s
const FFTLIB_MMA_CONFIG_REG configRegisterStruct_i32s_i32s_o32s
#define FFTLIB_MMA_CONFIG_REG
#define FFTLIB_MMA_SIZE_16_BIT
type is 16-bit integers
#define FFTLIB_MMA_SIZE_8_BIT
MMA size as a function of precision.
#define FFTLIB_MMA_SIZE_32_BIT
type is 32-bit integers
FFTLIB_STATUS FFTLIB_FFT_dftSmall_ixX_cxX_oxX_getSizes(const FFTLIB_FFT_dftSmall_ixX_cxX_oxX_InitArgs *pKerInitArgs, FFTLIB_bufParams1D_t *bufParamsX, FFTLIB_bufParams1D_t *bufParamsW, FFTLIB_bufParams1D_t *bufParamsY)
This is a query function to calculate the sizes of input, output and the DFT computational matrix buf...
void FFTLIB_FFT_dftSmall_ixX_cxX_oxX_perfEst(FFTLIB_kernelHandle handle, const FFTLIB_bufParams1D_t *bufParamsX, const FFTLIB_bufParams1D_t *bufParamsY, const FFTLIB_bufParams1D_t *bufParamsW, uint32_t fftSize, uint32_t batchSize, uint64_t *archCycles, uint64_t *estCycles)
This is a utility function that gives an estimate of the cycles consumed for the kernel execution.
Structure containing the parameters for DFT computation.
uint32_t interleave
Flag to indicate if the real and imaginary parts of data are interleaved or not. A value of 1 indicat...
uint32_t shiftVector[FFTLIB_FFT_DFTSMALL_IXX_CXX_OXX_NUMSHIFTS]
Array containing the bit-shift values to be used for internal computation.
uint32_t fftSize
Size of each channel's data in terms of number of complex points.
uint32_t batchSize
Size of the batch in terms of the number of DFT channels.
Structure that is reserved for internal use by the kernel.
uint8_t bufPblock[FFTLIB_FFT_DFTSMALL_IXX_CXX_OXX_PBLOCK_SIZE]
Array to store the configuration prepared by FFTLIB_FFT_dftSmall_ixX_cxX_oxX_init_ci that will be ret...
A structure for a 1 dimensional buffer descriptor.
uint32_t data_type
Values are of type FFTLIB_data_type_e.
uint32_t dim_x
Width of buffer in X dimension in elements.