37 #ifndef COMMON_FFTLIB_UTIL_SE0ALOADSE1BLOADCOMPUTEC_H_
38 #define COMMON_FFTLIB_UTIL_SE0ALOADSE1BLOADCOMPUTEC_H_ 1
64 _nassert(numRows > 0);
65 for(r = 0; r < numRows; r++){
66 __mma_vec valA = c7x::strm_eng<0, __mma_vec>::get_adv();
67 __mma_vec valB = c7x::strm_eng<1, __mma_vec>::get_adv();
68 __HWALDAB(valA, valB);
81 for(r = 0; r < numRows; r++){
82 __mma_vec valA = c7x::strm_eng<0, __mma_vec>::get_adv();
83 __mma_vec valB = c7x::strm_eng<1, __mma_vec>::get_adv();
84 __HWALDAB(valA, valB);
97 for(r = 0; r < numRows; r++){
98 __mma_vec valA = c7x::strm_eng<0, __mma_vec>::get_adv();
99 __mma_vec valB = c7x::strm_eng<1, __mma_vec>::get_adv();
100 __HWALDAB(valA, valB);
101 __HWAOP(__MMA_A_LDA);
113 for(r = 0; r < numRows; r++){
114 __mma_vec valA = c7x::strm_eng<0, __mma_vec>::get_adv();
115 __mma_vec valB = c7x::strm_eng<1, __mma_vec>::get_adv();
116 __HWALDAB(valA, valB);
117 __HWAOP(__MMA_A_LDA);
/*
 * FFTLIB_UTIL_SE0AloadSE1BloadComputeC_unroll
 *
 * Runs a single loop of numRows iterations. Each iteration reads one
 * __mma_vec from c7x::strm_eng<0> and one from c7x::strm_eng<1> via
 * get_adv(), hands the pair to __HWALDAB, and then issues
 * __HWAOP(__MMA_A_LDA).
 *
 * Template parameter:
 *   UNROLL_TIMES - forwarded unchanged to FFTLIB_UNROLL(), which annotates
 *                  the loop. NOTE(review): presumably the compiler unroll
 *                  factor; confirm against the FFTLIB_UNROLL definition.
 *
 * Parameter:
 *   numRows - loop bound; the loop body executes for r = 0 .. numRows-1
 *             and is skipped entirely when numRows <= 0.
 *
 * NOTE(review): this listing is an excerpt; the function braces and the
 * declaration of the loop counter `r` are not shown here.
 */
125 template <
int32_t UNROLL_TIMES>
126 static inline void FFTLIB_UTIL_SE0AloadSE1BloadComputeC_unroll(int32_t numRows)
130 FFTLIB_UNROLL(UNROLL_TIMES)
131 for(r = 0; r < numRows; r++){
/* One vector from each of the two stream engines (index 0 and index 1).
 * NOTE(review): get_adv() presumably also advances the stream - confirm. */
132 __mma_vec valA = c7x::strm_eng<0, __mma_vec>::get_adv();
133 __mma_vec valB = c7x::strm_eng<1, __mma_vec>::get_adv();
/* Both vectors go to __HWALDAB in (A, B) argument order. */
134 __HWALDAB(valA, valB);
/* Issued once per iteration, immediately after the A/B hand-off. */
135 __HWAOP(__MMA_A_LDA);
/*
 * FFTLIB_UTIL_SE0AloadSE1BloadComputeC_peel
 *
 * Splits the numRows iterations into two back-to-back loops that share the
 * same body (read one __mma_vec from c7x::strm_eng<0> and one from
 * c7x::strm_eng<1>, pass them to __HWALDAB, issue __HWAOP(__MMA_A_LDA)):
 *
 *   1. r = 0 .. peelSize-1        - annotated with FFTLIB_UNROLL(UNROLL_TIMES)
 *   2. r = peelSize .. numRows-1  - preceded by _nassert(numRows > 0)
 *
 * Template parameter:
 *   UNROLL_TIMES - forwarded unchanged to FFTLIB_UNROLL() on the first loop
 *                  only. NOTE(review): presumably the compiler unroll
 *                  factor; confirm against the FFTLIB_UNROLL definition.
 *
 * Parameters:
 *   numRows  - upper bound of the second loop (total iteration count when
 *              0 <= peelSize <= numRows).
 *   peelSize - upper bound of the first loop and start index of the second.
 *
 * NOTE(review): this listing is an excerpt; the function braces, the
 * closing brace of the first loop and the declaration of `r` are not
 * shown here.
 */
144 template <
int32_t UNROLL_TIMES>
145 static inline void FFTLIB_UTIL_SE0AloadSE1BloadComputeC_peel(int32_t numRows, int32_t peelSize)
/* First loop: the leading peelSize iterations, with the unroll annotation. */
149 FFTLIB_UNROLL(UNROLL_TIMES)
150 for(r = 0; r < peelSize; r++){
151 __mma_vec valA = c7x::strm_eng<0, __mma_vec>::get_adv();
152 __mma_vec valB = c7x::strm_eng<1, __mma_vec>::get_adv();
153 __HWALDAB(valA, valB);
154 __HWAOP(__MMA_A_LDA);
/* Second loop: the remaining iterations, starting where the first stopped.
 * NOTE(review): the assertion below constrains numRows, but this loop's
 * trip count is numRows - peelSize, which the assertion does not bound -
 * confirm this is the intended compiler hint. */
158 _nassert(numRows > 0);
159 for(r = peelSize; r < numRows; r++){
160 __mma_vec valA = c7x::strm_eng<0, __mma_vec>::get_adv();
161 __mma_vec valB = c7x::strm_eng<1, __mma_vec>::get_adv();
162 __HWALDAB(valA, valB);
163 __HWAOP(__MMA_A_LDA);
static void FFTLIB_UTIL_SE0AloadSE1BloadComputeC_unroll16(int32_t numRows)
static void FFTLIB_UTIL_SE0AloadSE1BloadComputeC_unroll32(int32_t numRows)
static void FFTLIB_UTIL_SE0AloadSE1BloadComputeC_unroll64(int32_t numRows)
static void FFTLIB_UTIL_SE0AloadSE1BloadComputeC(int32_t numRows)