55 #if (__C7X_VEC_SIZE_BITS__ == 512)
// Primary template of v_process_1st_iter, parameterized on the element type
// of U; explicit specializations for float and double follow.
58 template <
typename dataType>
static inline void v_process_1st_iter(dataType *U,
// Explicit specialization of v_process_1st_iter for float.
63 template <>
inline void v_process_1st_iter<float>(
float *U,
// Explicit specialization of v_process_1st_iter for double: rewrites rows 0
// and 1 of V using lane 1 of U and the scalar *s.
//   U: source vector; only lane 1 is loaded (see pred_2_elem_u).
//   V: buffer read and rewritten in place; row k starts at V[k * colVStride].
//   s: pointer to a scalar multiplied with U lane 1 to form the norm factor.
71 template <>
inline void v_process_1st_iter<double>(
double *U,
81 typedef typename c7x::make_full_vector<double>::type vec;
// Predicate masks. NOTE(review): presumably one predicate bit per byte, so
// 0xFFFF enables doubles 0-1 and 0xFF00 only double 1 - confirm in C7x docs.
82 __vpred pred_2_elem = __create_vpred(0x000000000000FFFFU);
83 __vpred pred_2_elem_u = __create_vpred(0x000000000000FF00U);
// Row 0 of V is overwritten with 1.0 in lane 0 and 0.0 in the other lanes.
86 vec vec_top_row = (vec)0.0;
87 vec_top_row.s[0] = 1.0;
// Masked load of V row 1; NOTE(review): masked-off lanes are presumably
// zero-filled by __vload_pred - verify.
89 vec vec_in0 = __vload_pred(pred_2_elem_u, (vec *)&V[0 + 1 * colVStride]);
91 vec vec_u_row = __vload_pred(pred_2_elem_u, (vec *)U);
// Scale U by getRecip(U[1] * *s), broadcast to every lane.
// NOTE(review): getRecip presumably returns the reciprocal - confirm.
93 double normFactor = vec_u_row.s[1] * (*s);
94 vec vec_invNorm = (vec)
getRecip(normFactor);
96 vec vec_v_col = vec_u_row * vec_invNorm;
// Lane-wise products: unlike the later-iteration variants, which multiply by
// single lanes (.s[k]), this variant multiplies whole vectors.
98 vec vec_si = (vec)0.0;
99 vec_si += vec_u_row * vec_in0;
101 vec_in0 += vec_si * vec_v_col;
// Write back rows 0 and 1 under the pred_2_elem mask.
103 __vstore_pred(pred_2_elem, (vec *)&V[0 + 0 * colVStride], vec_top_row);
104 __vstore_pred(pred_2_elem, (vec *)&V[0 + 1 * colVStride], vec_in0);
// Primary template of v_process_2nd_iter, parameterized on the element type
// of U; explicit specializations for float and double follow.
113 template <
typename dataType>
static inline void v_process_2nd_iter(dataType *U,
115 const int colUStride,
116 const int colVStride,
// Explicit specialization of v_process_2nd_iter for float.
118 template <>
inline void v_process_2nd_iter<float>(
float *U,
120 const int colUStride,
121 const int colVStride,
// Explicit specialization of v_process_2nd_iter for double: rewrites rows
// 0-2 of V using lanes 1-2 of U and the scalar *s.
//   U: source vector; lanes 1-2 are loaded (see pred_3_elem_u).
//   colUStride: not referenced by the statements shown in this block.
//   colVStride: element stride between rows of V (row k at V[k * colVStride]).
//   V: buffer read and rewritten in place.
//   s: pointer to a scalar multiplied with U lane 1 to form the norm factor.
126 template <>
inline void v_process_2nd_iter<double>(
double *U,
128 const int colUStride,
129 const int colVStride,
136 typedef typename c7x::make_full_vector<double>::type vec;
// Predicate masks. NOTE(review): presumably one predicate bit per byte, so
// 0xFFFFFF enables doubles 0-2 and 0xFFFF00 doubles 1-2 - confirm in C7x docs.
137 __vpred pred_3_elem = __create_vpred(0x0000000000FFFFFFU);
138 __vpred pred_3_elem_u = __create_vpred(0x0000000000FFFF00U);
// Row 0 of V is overwritten with 1.0 in lane 0 and 0.0 in the other lanes.
141 vec vec_top_row = (vec)0.0;
142 vec_top_row.s[0] = 1.0;
// Masked loads of V rows 1 and 2, then of U.
144 vec vec_in0 = __vload_pred(pred_3_elem_u, (vec *)&V[0 + 1 * colVStride]);
145 vec vec_in1 = __vload_pred(pred_3_elem_u, (vec *)&V[0 + 2 * colVStride]);
147 vec vec_u_row = __vload_pred(pred_3_elem_u, (vec *)U);
// Scale U by getRecip(U[1] * *s), broadcast to every lane.
// NOTE(review): getRecip presumably returns the reciprocal - confirm.
149 double normFactor = vec_u_row.s[1] * (*s);
150 vec vec_invNorm = (vec)
getRecip(normFactor);
152 vec vec_v_col = vec_u_row * vec_invNorm;
// vec_si = U[1] * row1 + U[2] * row2 (each row weighted by one lane of U).
154 vec vec_si = (vec)0.0;
155 vec_si += vec_u_row.s[1] * vec_in0;
156 vec_si += vec_u_row.s[2] * vec_in1;
// Row k gains vec_si scaled by lane k of the normalized U vector.
158 vec_in0 += vec_si * vec_v_col.s[1];
159 vec_in1 += vec_si * vec_v_col.s[2];
// Write back rows 0-2 under the pred_3_elem mask.
161 __vstore_pred(pred_3_elem, (vec *)&V[0 + 0 * colVStride], vec_top_row);
162 __vstore_pred(pred_3_elem, (vec *)&V[0 + 1 * colVStride], vec_in0);
163 __vstore_pred(pred_3_elem, (vec *)&V[0 + 2 * colVStride], vec_in1);
// Primary template of v_process_3rd_iter, parameterized on the element type
// of U; explicit specializations for float and double follow.
172 template <
typename dataType>
static inline void v_process_3rd_iter(dataType *U,
174 const int colUStride,
175 const int colVStride,
// Explicit specialization of v_process_3rd_iter for float.
177 template <>
inline void v_process_3rd_iter<float>(
float *U,
179 const int colUStride,
180 const int colVStride,
// Explicit specialization of v_process_3rd_iter for double: rewrites rows
// 0-3 of V using lanes 1-3 of U and the scalar *s.
//   U: source vector; lanes 1-3 are loaded (see pred_4_elem_u).
//   colUStride: not referenced by the statements shown in this block.
//   colVStride: element stride between rows of V (row k at V[k * colVStride]).
//   V: buffer read and rewritten in place.
//   s: pointer to a scalar multiplied with U lane 1 to form the norm factor.
185 template <>
inline void v_process_3rd_iter<double>(
double *U,
187 const int colUStride,
188 const int colVStride,
195 typedef typename c7x::make_full_vector<double>::type vec;
// Predicate masks. NOTE(review): presumably one predicate bit per byte, so
// 0xFFFFFFFF enables doubles 0-3 and 0xFFFFFF00 doubles 1-3 - confirm.
196 __vpred pred_4_elem = __create_vpred(0x00000000FFFFFFFFU);
197 __vpred pred_4_elem_u = __create_vpred(0x00000000FFFFFF00U);
// Row 0 of V is overwritten with 1.0 in lane 0 and 0.0 in the other lanes.
200 vec vec_top_row = (vec)0.0;
201 vec_top_row.s[0] = 1.0;
// Masked loads of V rows 1-3, then of U.
203 vec vec_in0 = __vload_pred(pred_4_elem_u, (vec *)&V[0 + 1 * colVStride]);
204 vec vec_in1 = __vload_pred(pred_4_elem_u, (vec *)&V[0 + 2 * colVStride]);
205 vec vec_in2 = __vload_pred(pred_4_elem_u, (vec *)&V[0 + 3 * colVStride]);
207 vec vec_u_row = __vload_pred(pred_4_elem_u, (vec *)U);
// Scale U by getRecip(U[1] * *s), broadcast to every lane.
// NOTE(review): getRecip presumably returns the reciprocal - confirm.
209 double normFactor = vec_u_row.s[1] * (*s);
210 vec vec_invNorm = (vec)
getRecip(normFactor);
212 vec vec_v_col = vec_u_row * vec_invNorm;
// vec_si = sum over k = 1..3 of U[k] * row k.
214 vec vec_si = (vec)0.0;
215 vec_si += vec_u_row.s[1] * vec_in0;
216 vec_si += vec_u_row.s[2] * vec_in1;
217 vec_si += vec_u_row.s[3] * vec_in2;
// Row k gains vec_si scaled by lane k of the normalized U vector.
219 vec_in0 += vec_si * vec_v_col.s[1];
220 vec_in1 += vec_si * vec_v_col.s[2];
221 vec_in2 += vec_si * vec_v_col.s[3];
// Write back rows 0-3 under the pred_4_elem mask.
224 __vstore_pred(pred_4_elem, (vec *)&V[0 + 0 * colVStride], vec_top_row);
225 __vstore_pred(pred_4_elem, (vec *)&V[0 + 1 * colVStride], vec_in0);
226 __vstore_pred(pred_4_elem, (vec *)&V[0 + 2 * colVStride], vec_in1);
227 __vstore_pred(pred_4_elem, (vec *)&V[0 + 3 * colVStride], vec_in2);
// Primary template of v_process_4th_iter, parameterized on the element type
// of U; explicit specializations for float and double follow.
234 template <
typename dataType>
static inline void v_process_4th_iter(dataType *U,
236 const int colUStride,
237 const int colVStride,
// Explicit specialization of v_process_4th_iter for float.
239 template <>
inline void v_process_4th_iter<float>(
float *U,
241 const int colUStride,
242 const int colVStride,
// Explicit specialization of v_process_4th_iter for double: rewrites rows
// 0-4 of V using lanes 1-4 of U and the scalar *s.
//   U: source vector; lanes 1-4 are loaded (see pred_5_elem_u).
//   colUStride: not referenced by the statements shown in this block.
//   colVStride: element stride between rows of V (row k at V[k * colVStride]).
//   V: buffer read and rewritten in place.
//   s: pointer to a scalar multiplied with U lane 1 to form the norm factor.
247 template <>
inline void v_process_4th_iter<double>(
double *U,
249 const int colUStride,
250 const int colVStride,
257 typedef typename c7x::make_full_vector<double>::type vec;
// Predicate masks. NOTE(review): presumably one predicate bit per byte, so
// the 40-bit mask enables doubles 0-4 and the _u mask doubles 1-4 - confirm.
258 __vpred pred_5_elem = __create_vpred(0x000000FFFFFFFFFFU);
259 __vpred pred_5_elem_u = __create_vpred(0x000000FFFFFFFF00U);
// Row 0 of V is overwritten with 1.0 in lane 0 and 0.0 in the other lanes.
262 vec vec_top_row = (vec)0.0;
263 vec_top_row.s[0] = 1.0;
// Masked loads of V rows 1-4, then of U.
265 vec vec_in0 = __vload_pred(pred_5_elem_u, (vec *)&V[0 + 1 * colVStride]);
266 vec vec_in1 = __vload_pred(pred_5_elem_u, (vec *)&V[0 + 2 * colVStride]);
267 vec vec_in2 = __vload_pred(pred_5_elem_u, (vec *)&V[0 + 3 * colVStride]);
268 vec vec_in3 = __vload_pred(pred_5_elem_u, (vec *)&V[0 + 4 * colVStride]);
270 vec vec_u_row = __vload_pred(pred_5_elem_u, (vec *)U);
// Scale U by getRecip(U[1] * *s), broadcast to every lane.
// NOTE(review): getRecip presumably returns the reciprocal - confirm.
272 double normFactor = vec_u_row.s[1] * (*s);
273 vec vec_invNorm = (vec)
getRecip(normFactor);
275 vec vec_v_col = vec_u_row * vec_invNorm;
// vec_si = sum over k = 1..4 of U[k] * row k.
277 vec vec_si = (vec)0.0;
278 vec_si += vec_u_row.s[1] * vec_in0;
279 vec_si += vec_u_row.s[2] * vec_in1;
280 vec_si += vec_u_row.s[3] * vec_in2;
281 vec_si += vec_u_row.s[4] * vec_in3;
// Row k gains vec_si scaled by lane k of the normalized U vector.
283 vec_in0 += vec_si * vec_v_col.s[1];
284 vec_in1 += vec_si * vec_v_col.s[2];
285 vec_in2 += vec_si * vec_v_col.s[3];
286 vec_in3 += vec_si * vec_v_col.s[4];
// Write back rows 0-4 under the pred_5_elem mask.
289 __vstore_pred(pred_5_elem, (vec *)&V[0 + 0 * colVStride], vec_top_row);
290 __vstore_pred(pred_5_elem, (vec *)&V[0 + 1 * colVStride], vec_in0);
291 __vstore_pred(pred_5_elem, (vec *)&V[0 + 2 * colVStride], vec_in1);
292 __vstore_pred(pred_5_elem, (vec *)&V[0 + 3 * colVStride], vec_in2);
293 __vstore_pred(pred_5_elem, (vec *)&V[0 + 4 * colVStride], vec_in3);
// Primary template of v_process_5th_iter, parameterized on the element type
// of U; explicit specializations for float and double follow.
302 template <
typename dataType>
static inline void v_process_5th_iter(dataType *U,
304 const int colUStride,
305 const int colVStride,
// Explicit specialization of v_process_5th_iter for float.
307 template <>
inline void v_process_5th_iter<float>(
float *U,
309 const int colUStride,
310 const int colVStride,
// Explicit specialization of v_process_5th_iter for double: rewrites rows
// 0-5 of V using lanes 1-5 of U and the scalar *s.
//   U: source vector; lanes 1-5 are loaded (see pred_6_elem_u).
//   colUStride: not referenced by the statements shown in this block.
//   colVStride: element stride between rows of V (row k at V[k * colVStride]).
//   V: buffer read and rewritten in place.
//   s: pointer to a scalar multiplied with U lane 1 to form the norm factor.
315 template <>
inline void v_process_5th_iter<double>(
double *U,
317 const int colUStride,
318 const int colVStride,
325 typedef typename c7x::make_full_vector<double>::type vec;
// Predicate masks. NOTE(review): presumably one predicate bit per byte, so
// the 48-bit mask enables doubles 0-5 and the _u mask doubles 1-5 - confirm.
326 __vpred pred_6_elem = __create_vpred(0x0000FFFFFFFFFFFFU);
327 __vpred pred_6_elem_u = __create_vpred(0x0000FFFFFFFFFF00U);
// Row 0 of V is overwritten with 1.0 in lane 0 and 0.0 in the other lanes.
330 vec vec_top_row = (vec)0.0;
331 vec_top_row.s[0] = 1.0;
// Masked loads of V rows 1-5, then of U.
333 vec vec_in0 = __vload_pred(pred_6_elem_u, (vec *)&V[0 + 1 * colVStride]);
334 vec vec_in1 = __vload_pred(pred_6_elem_u, (vec *)&V[0 + 2 * colVStride]);
335 vec vec_in2 = __vload_pred(pred_6_elem_u, (vec *)&V[0 + 3 * colVStride]);
336 vec vec_in3 = __vload_pred(pred_6_elem_u, (vec *)&V[0 + 4 * colVStride]);
337 vec vec_in4 = __vload_pred(pred_6_elem_u, (vec *)&V[0 + 5 * colVStride]);
339 vec vec_u_row = __vload_pred(pred_6_elem_u, (vec *)U);
// Scale U by getRecip(U[1] * *s), broadcast to every lane.
// NOTE(review): getRecip presumably returns the reciprocal - confirm.
341 double normFactor = vec_u_row.s[1] * (*s);
342 vec vec_invNorm = (vec)
getRecip(normFactor);
344 vec vec_v_col = vec_u_row * vec_invNorm;
// vec_si = sum over k = 1..5 of U[k] * row k.
346 vec vec_si = (vec)0.0;
347 vec_si += vec_u_row.s[1] * vec_in0;
348 vec_si += vec_u_row.s[2] * vec_in1;
349 vec_si += vec_u_row.s[3] * vec_in2;
350 vec_si += vec_u_row.s[4] * vec_in3;
351 vec_si += vec_u_row.s[5] * vec_in4;
// Row k gains vec_si scaled by lane k of the normalized U vector.
353 vec_in0 += vec_si * vec_v_col.s[1];
354 vec_in1 += vec_si * vec_v_col.s[2];
355 vec_in2 += vec_si * vec_v_col.s[3];
356 vec_in3 += vec_si * vec_v_col.s[4];
357 vec_in4 += vec_si * vec_v_col.s[5];
// Write back rows 0-5 under the pred_6_elem mask.
360 __vstore_pred(pred_6_elem, (vec *)&V[0 + 0 * colVStride], vec_top_row);
361 __vstore_pred(pred_6_elem, (vec *)&V[0 + 1 * colVStride], vec_in0);
362 __vstore_pred(pred_6_elem, (vec *)&V[0 + 2 * colVStride], vec_in1);
363 __vstore_pred(pred_6_elem, (vec *)&V[0 + 3 * colVStride], vec_in2);
364 __vstore_pred(pred_6_elem, (vec *)&V[0 + 4 * colVStride], vec_in3);
365 __vstore_pred(pred_6_elem, (vec *)&V[0 + 5 * colVStride], vec_in4);
dataType getRecip(dataType value)
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_svd.