37 #ifndef COMMON_FFTLIB_UTIL_SE0ALOADCOMPUTECSA0CSTORE_H_
38 #define COMMON_FFTLIB_UTIL_SE0ALOADCOMPUTECSA0CSTORE_H_ 1
/* Excerpt of a per-row loop. Only the tail of the parameter list and a few
 * body statements are present; the declarations of r, numRows and VB0 are on
 * lines that are not part of this excerpt.
 * CbackDestination: byte pointer handed to address generator 0 below to form
 * the store address (not restrict-qualified in this variant). */
69 uint8_t * CbackDestination)
/* One iteration per row: r runs from 0 up to numRows - 1. */
75 for(r = 0; r < numRows; r++){
/* Read one __mma_vec from streaming engine 0 via get_adv(). */
76 __mma_vec valA = c7x::strm_eng<0, __mma_vec>::get_adv();
/* Hardware-accelerator transfer op issued with the __MMA_A_LDA code.
 * NOTE(review): presumably pairs with a statement (not shown) that supplies
 * valA to the accelerator -- confirm against the full source. */
78 __HWAOPXFER(__MMA_A_LDA);
/* Predicate obtained from address generator 0; used to mask the store below. */
84 __vpred tmp = c7x::strm_agen<0, __mma_vec>::get_vpred();
/* Store address produced by address generator 0 from CbackDestination. */
85 __mma_vec * VB1 = c7x::strm_agen<0, __mma_vec>::get_adv(CbackDestination);
/* Predicated vector store: writes VB0 to *VB1 under predicate tmp.
 * VB0 is assigned on a line not present in this excerpt. */
87 __vstore_pred(tmp, VB1, VB0);
/* Excerpt of a per-row loop (function name and most of the body are not
 * visible here). NOTE(review): presumably one of the fixed-unroll variants;
 * confirm which one against the full source.
 * CbackDestination: restrict-qualified byte pointer used as the base for the
 * store address generated below. */
101 uint8_t *restrict CbackDestination)
/* Iterate over rows 0 .. numRows - 1. */
107 for(r = 0; r < numRows; r++){
/* Pull one __mma_vec from streaming engine 0 (get_adv). */
108 __mma_vec valA = c7x::strm_eng<0, __mma_vec>::get_adv();
/* Issue the accelerator transfer op with the A-load code (__MMA_A_LDA).
 * NOTE(review): the statement that consumes valA is not shown -- confirm. */
110 __HWAOPXFER(__MMA_A_LDA);
/* Store predicate taken from address generator 0. */
116 __vpred tmp = c7x::strm_agen<0, __mma_vec>::get_vpred();
/* Destination address from address generator 0, based on CbackDestination. */
117 __mma_vec * VB1 = c7x::strm_agen<0, __mma_vec>::get_adv(CbackDestination);
/* Store VB0 to *VB1, masked by tmp. VB0 is defined on a line not shown. */
119 __vstore_pred(tmp, VB1, VB0);
/* Excerpt of a per-row loop; the enclosing function's name, its other
 * parameters and the definitions of r and VB0 are outside this excerpt.
 * NOTE(review): presumably another fixed-unroll variant -- confirm.
 * CbackDestination: restrict-qualified output base pointer (bytes). */
132 uint8_t *restrict CbackDestination)
/* Row loop, numRows iterations. */
138 for(r = 0; r < numRows; r++){
/* Obtain a vector from streaming engine 0 using get_adv(). */
139 __mma_vec valA = c7x::strm_eng<0, __mma_vec>::get_adv();
/* Accelerator transfer op, opcode __MMA_A_LDA.
 * NOTE(review): how valA reaches the accelerator is not visible here. */
141 __HWAOPXFER(__MMA_A_LDA);
/* Predicate for the upcoming store, read from address generator 0. */
147 __vpred tmp = c7x::strm_agen<0, __mma_vec>::get_vpred();
/* Address generator 0 yields the store pointer derived from CbackDestination. */
148 __mma_vec * VB1 = c7x::strm_agen<0, __mma_vec>::get_adv(CbackDestination);
/* Predicated store of VB0 (set on a line not shown) through VB1. */
150 __vstore_pred(tmp, VB1, VB0);
/* Excerpt of a per-row loop with the same visible statements as the
 * neighbouring variants; name, remaining parameters, and the definitions of
 * r and VB0 are not present in this excerpt.
 * NOTE(review): presumably a fixed-unroll variant -- confirm which.
 * CbackDestination: restrict-qualified byte pointer; base of the output. */
163 uint8_t *restrict CbackDestination)
/* Loop once per row (r in [0, numRows)). */
169 for(r = 0; r < numRows; r++){
/* get_adv() on streaming engine 0 returns the __mma_vec used as valA. */
170 __mma_vec valA = c7x::strm_eng<0, __mma_vec>::get_adv();
/* Transfer op carrying the __MMA_A_LDA code.
 * NOTE(review): the use of valA is on a line not shown -- confirm. */
172 __HWAOPXFER(__MMA_A_LDA);
/* Mask for the store, supplied by address generator 0. */
178 __vpred tmp = c7x::strm_agen<0, __mma_vec>::get_vpred();
/* Pointer for the store, supplied by address generator 0 from CbackDestination. */
179 __mma_vec * VB1 = c7x::strm_agen<0, __mma_vec>::get_adv(CbackDestination);
/* Write VB0 to *VB1 where tmp enables it; VB0 comes from a line not shown. */
181 __vstore_pred(tmp, VB1, VB0);
/* Templated variant: UNROLL_TIMES is a compile-time int32_t that is passed to
 * FFTLIB_UNROLL ahead of the row loop. Only part of the function is present
 * in this excerpt (name, other parameters, r and VB0 are not shown). */
192 template <
int32_t UNROLL_TIMES>
/* CbackDestination: restrict-qualified byte pointer used as the store base. */
194 uint8_t *restrict CbackDestination)
/* NOTE(review): FFTLIB_UNROLL is defined elsewhere; presumably it expands to
 * an unroll directive for the loop that follows -- confirm its definition. */
199 FFTLIB_UNROLL(UNROLL_TIMES)
/* Row loop: r = 0 .. numRows - 1. */
200 for(r = 0; r < numRows; r++){
/* Read one __mma_vec from streaming engine 0 via get_adv(). */
201 __mma_vec valA = c7x::strm_eng<0, __mma_vec>::get_adv();
/* Accelerator transfer op with the A-load code.
 * NOTE(review): the statement tying valA to this op is not shown. */
203 __HWAOPXFER(__MMA_A_LDA);
/* Predicate from address generator 0, applied to the store below. */
209 __vpred tmp = c7x::strm_agen<0, __mma_vec>::get_vpred();
/* Store address from address generator 0, derived from CbackDestination. */
210 __mma_vec * VB1 = c7x::strm_agen<0, __mma_vec>::get_adv(CbackDestination);
/* Predicated store of VB0 through VB1; VB0 is set on a line not shown. */
212 __vstore_pred(tmp, VB1, VB0);
/* Excerpt of a block-row loop that pads the A input with zero vectors.
 * The function name, the numRows/numBlockRows parameters, and the
 * declarations of r, AVal and VB0 are on lines outside this excerpt.
 * CbackDestination: restrict-qualified byte pointer used as the store base. */
225 uint8_t *restrict CbackDestination)
/* Remaining A rows to fetch; widened to 64 bits and decremented in the loop. */
228 int64_t ARows = (int64_t) numRows;
/* Also initialised from numRows; its use is not among the lines shown. */
229 int64_t CRows = (int64_t) numRows;
/* Loop trip count, taken from numBlockRows. */
230 int64_t blockRows = (int64_t) numBlockRows;
/* All-zero vector substituted once ARows has been used up. */
231 __mma_vec zeroVect = (__mma_vec)(0);
/* Compiler hints. NOTE(review): MUST_ITERATE(1) presumably declares a minimum
 * trip count of one and LOOP_NO_ALIAS presumably asserts that accesses in the
 * loop do not alias -- confirm in the compiler guide. */
235 #pragma MUST_ITERATE(1)
236 #pragma LOOP_NO_ALIAS
/* Always runs blockRows iterations, independent of numRows. */
237 for (r = 0; r < blockRows; r++)
/* While ARows is still positive, read from streaming engine 0; afterwards use
 * zeroVect and leave the stream untouched. ARows is decremented every
 * iteration either way. */
239 AVal = (ARows-- > 0) ? c7x::strm_eng<0, __mma_vec>::get_adv() : zeroVect;
/* Accelerator transfer op with the A-load code.
 * NOTE(review): the statement that hands AVal to the accelerator is not shown. */
242 __HWAOPXFER(__MMA_A_LDA);
/* Store predicate from address generator 0. */
248 __vpred tmp = c7x::strm_agen<0, __mma_vec>::get_vpred();
/* Store address from address generator 0, derived from CbackDestination. */
249 __mma_vec * VB1 = c7x::strm_agen<0, __mma_vec>::get_adv(CbackDestination);
/* Predicated store of VB0 through VB1; VB0 is set on a line not shown. */
251 __vstore_pred(tmp, VB1, VB0);
/* Signature only; no body follows on this line.
 * numRows: signed 32-bit row count.
 * CbackDestination: restrict-qualified byte pointer for the output. */
static void FFTLIB_UTIL_SE0AloadComputeCSA0Cstore_unroll(int32_t numRows, uint8_t *restrict CbackDestination)
/* Signature only; no body follows on this line.
 * numRows: signed 32-bit row count.
 * CbackDestination: restrict-qualified byte pointer for the output.
 * NOTE(review): the "16" suffix presumably names an unroll factor -- confirm. */
static void FFTLIB_UTIL_SE0AloadComputeCSA0Cstore_unroll16(int32_t numRows, uint8_t *restrict CbackDestination)
/* Signature only; no body follows on this line.
 * numRows: signed 32-bit row count.
 * CbackDestination: byte pointer for the output; unlike the sibling
 * signatures, it is not restrict-qualified here. */
static void FFTLIB_UTIL_SE0AloadComputeCSA0Cstore(int32_t numRows, uint8_t *CbackDestination)
/* Signature only; no body follows on this line.
 * numRows: signed 32-bit row count.
 * numBlockRows: signed 32-bit block row count.
 * CbackDestination: restrict-qualified byte pointer for the output. */
static void FFTLIB_UTIL_SE0AloadComputeCSA0Cstore_OE(int32_t numRows, int32_t numBlockRows, uint8_t *restrict CbackDestination)
/* Signature only; no body follows on this line.
 * numRows: signed 32-bit row count.
 * CbackDestination: restrict-qualified byte pointer for the output.
 * NOTE(review): the "64" suffix presumably names an unroll factor -- confirm. */
static void FFTLIB_UTIL_SE0AloadComputeCSA0Cstore_unroll64(int32_t numRows, uint8_t *restrict CbackDestination)
/* Signature only; no body follows on this line.
 * numRows: signed 32-bit row count.
 * CbackDestination: restrict-qualified byte pointer for the output.
 * NOTE(review): the "32" suffix presumably names an unroll factor -- confirm. */
static void FFTLIB_UTIL_SE0AloadComputeCSA0Cstore_unroll32(int32_t numRows, uint8_t *restrict CbackDestination)