37 #ifndef COMMON_FFTLIB_UTIL_SE0ALOADCOMPUTEC_H_
38 #define COMMON_FFTLIB_UTIL_SE0ALOADCOMPUTEC_H_ 1
61 #pragma FUNC_ALWAYS_INLINE
63 #pragma FUNC_ALWAYS_INLINE(FFTLIB_UTIL_SE0AloadComputeC)
69 for(r = 0; r < numRows; r++){
70 __mma_vec valA = c7x::strm_eng<0, __mma_vec>::get_adv();
80 #pragma FUNC_ALWAYS_INLINE
82 #pragma FUNC_ALWAYS_INLINE(FFTLIB_UTIL_SE0AloadComputeC_unroll16)
88 for(r = 0; r < numRows; r++){
89 __mma_vec valA = c7x::strm_eng<0, __mma_vec>::get_adv();
99 #pragma FUNC_ALWAYS_INLINE
101 #pragma FUNC_ALWAYS_INLINE(FFTLIB_UTIL_SE0AloadComputeC_unroll32)
108 for(r = 0; r < numRows; r++){
109 __mma_vec valA = c7x::strm_eng<0, __mma_vec>::get_adv();
111 __HWAOP(__MMA_A_LDA);
119 #pragma FUNC_ALWAYS_INLINE
121 #pragma FUNC_ALWAYS_INLINE(FFTLIB_UTIL_SE0AloadComputeC_unroll64)
128 for(r = 0; r < numRows; r++){
129 __mma_vec valA = c7x::strm_eng<0, __mma_vec>::get_adv();
131 __HWAOP(__MMA_A_LDA);
139 #pragma FUNC_ALWAYS_INLINE
141 #pragma FUNC_ALWAYS_INLINE(FFTLIB_UTIL_SE0AloadComputeC_unroll)
144 template <
int32_t UNROLL_TIMES>
145 static inline void FFTLIB_UTIL_SE0AloadComputeC_unroll(int32_t numRows)
149 FFTLIB_UNROLL(UNROLL_TIMES)
150 for(r = 0; r < numRows; r++){
151 __mma_vec valA = c7x::strm_eng<0, __mma_vec>::get_adv();
153 __HWAOP(__MMA_A_LDA);
static void FFTLIB_UTIL_SE0AloadComputeC(int32_t numRows)
Use streaming engine 0 to load numRows rows of A and compute C(row) = A(row) x Bfore(:,:).
static void FFTLIB_UTIL_SE0AloadComputeC_unroll16(int32_t numRows)
static void FFTLIB_UTIL_SE0AloadComputeC_unroll32(int32_t numRows)
static void FFTLIB_UTIL_SE0AloadComputeC_unroll64(int32_t numRows)