56 #if (__C7X_VEC_SIZE_BITS__ == 512)
60 template <
typename V,
typename W>
inline void c7x_horizontal_add_6_elems(V inVec, W *horizontalSum);
61 template <>
inline void c7x_horizontal_add_6_elems(c7x::double_vec inVec,
double *horizontalSum)
63 double s0 = inVec.s[0];
64 double s1 = inVec.s[1];
65 double s2 = inVec.s[2];
66 double s3 = inVec.s[3];
67 double s4 = inVec.s[4];
68 double s5 = inVec.s[5];
72 double sss0 = ss0 + ss1 + ss2;
73 *horizontalSum = sss0;
79 template <
typename V,
typename W>
inline void c7x_horizontal_add_5_elems(V inVec, W *horizontalSum);
80 template <>
inline void c7x_horizontal_add_5_elems(c7x::double_vec inVec,
double *horizontalSum)
82 double s0 = inVec.s[0];
83 double s1 = inVec.s[1];
84 double s2 = inVec.s[2];
85 double s3 = inVec.s[3];
86 double s4 = inVec.s[4];
88 double ss1 = s2 + s3 + s4;
89 double sss0 = ss0 + ss1;
90 *horizontalSum = sss0;
95 template <
typename V,
typename W>
inline void c7x_horizontal_add_4_elems(V inVec, W *horizontalSum);
96 template <>
inline void c7x_horizontal_add_4_elems(c7x::double_vec inVec,
double *horizontalSum)
98 double s0 = inVec.s[0];
99 double s1 = inVec.s[1];
100 double s2 = inVec.s[2];
101 double s3 = inVec.s[3];
102 double ss0 = s0 + s1;
103 double ss1 = s2 + s3 ;
104 double sss0 = ss0 + ss1;
105 *horizontalSum = sss0;
109 template <
typename V,
typename W>
inline void c7x_horizontal_add_3_elems(V inVec, W *horizontalSum);
110 template <>
inline void c7x_horizontal_add_3_elems(c7x::double_vec inVec,
double *horizontalSum)
112 double s0 = inVec.s[0];
113 double s1 = inVec.s[1];
114 double s2 = inVec.s[2];
115 double ss0 = s0 + s1;
117 double sss0 = ss0 + ss1;
118 *horizontalSum = sss0;
122 template <
typename V,
typename W>
inline void c7x_horizontal_add_2_elems(V inVec, W *horizontalSum);
123 template <>
inline void c7x_horizontal_add_2_elems(c7x::double_vec inVec,
double *horizontalSum)
125 double s0 = inVec.s[0];
126 double s1 = inVec.s[1];
127 double ss0 = s0 + s1;
128 *horizontalSum = ss0;
134 template <
typename X,
typename Y>
inline Y vec_put(uint8_t idx, X val, Y vecIn);
136 template <>
inline c7x::float_vec vec_put<float, c7x::float_vec>(uint8_t idx,
float val, c7x::float_vec vecIn)
138 return __vputw_rkv(val, idx, vecIn);
141 template <>
inline c7x::double_vec vec_put<double, c7x::double_vec>(uint8_t idx,
double val, c7x::double_vec vecIn)
143 return __vputd_dkv(val, idx, vecIn);
146 #if (__C7X_VEC_SIZE_BITS__ == 512)
147 template <
typename X>
inline X gen_u_row_process_mask();
148 template <>
inline c7x::float_vec gen_u_row_process_mask<c7x::float_vec>()
150 c7x::float_vec ret_mask = c7x::float_vec(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
153 template <>
inline c7x::double_vec gen_u_row_process_mask<c7x::double_vec>()
155 c7x::double_vec ret_mask = c7x::double_vec(0, 1, 1, 1, 1, 1, 1, 1);
159 template <
typename X>
inline X gen_u_row_process_rev_mask();
160 template <>
inline c7x::float_vec gen_u_row_process_rev_mask<c7x::float_vec>()
162 c7x::float_vec ret_mask = c7x::float_vec(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
165 template <>
inline c7x::double_vec gen_u_row_process_rev_mask<c7x::double_vec>()
167 c7x::double_vec ret_mask = c7x::double_vec(1, 0, 0, 0, 0, 0, 0, 0);
174 #define U_PROC_SKIP_VLOAD 0
176 #if U_PROC_SKIP_VLOAD
177 c7x::double_vec vec_in0_u_proc = (c7x::double_vec) 0.0;
178 c7x::double_vec vec_in1_u_proc = (c7x::double_vec) 0.0;
179 c7x::double_vec vec_in2_u_proc = (c7x::double_vec) 0.0;
180 c7x::double_vec vec_in3_u_proc = (c7x::double_vec) 0.0;
181 c7x::double_vec vec_in4_u_proc = (c7x::double_vec) 0.0;
182 c7x::double_vec vec_in5_u_proc = (c7x::double_vec) 0.0;
185 template <
typename dataType>
186 static inline void u_process_0th_iter(dataType *U,
187 const int colUStride,
192 const uint32_t num_cols,
193 const uint32_t diag_idx);
195 inline void u_process_0th_iter<float>(
float *U,
196 const int colUStride,
201 const uint32_t num_cols,
202 const uint32_t diag_idx)
207 inline void u_process_0th_iter<double>(
double *U,
208 const int colUStride,
213 const uint32_t num_cols,
214 const uint32_t diag_idx)
216 double s2 = 0, half_norm_squared = 0;
222 typedef typename c7x::make_full_vector<double>::type vec;
225 __vpred pred_col_ele = __mask_long(num_cols);
227 #if U_PROC_SKIP_VLOAD
228 vec vec_in0 = vec_in0_u_proc;
229 vec vec_in1 = vec_in1_u_proc;
230 vec vec_in2 = vec_in2_u_proc;
231 vec vec_in3 = vec_in3_u_proc;
232 vec vec_in4 = vec_in4_u_proc;
233 vec vec_in5 = vec_in5_u_proc;
235 vec vec_in0 = __vload_pred(pred_col_ele, (vec *) (&U[0 + 0 * colUStride]));
236 vec vec_in1 = __vload_pred(pred_col_ele, (vec *) (&U[0 + 1 * colUStride]));
237 vec vec_in2 = __vload_pred(pred_col_ele, (vec *) (&U[0 + 2 * colUStride]));
238 vec vec_in3 = __vload_pred(pred_col_ele, (vec *) (&U[0 + 3 * colUStride]));
239 vec vec_in4 = __vload_pred(pred_col_ele, (vec *) (&U[0 + 4 * colUStride]));
240 vec vec_in5 = __vload_pred(pred_col_ele, (vec *) (&U[0 + 5 * colUStride]));
241 vec vec_in6 = __vload_pred(pred_col_ele, (vec *) (&U[0 + 6 * colUStride]));
245 superdiag[diag_idx] = (*scale) * (*s);
253 double in0_s0 = vec_in0.s[0];
254 double in1_s0 = vec_in1.s[0];
255 double in2_s0 = vec_in2.s[0];
256 double in3_s0 = vec_in3.s[0];
257 double in4_s0 = vec_in4.s[0];
258 double in5_s0 = vec_in5.s[0];
259 double in6_s0 = vec_in6.s[0];
262 vec vec_first_col = (vec) 0.0;
263 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in0_s0), 0, c7x::as_long_vec(vec_first_col)));
264 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in1_s0), 1, c7x::as_long_vec(vec_first_col)));
265 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in2_s0), 2, c7x::as_long_vec(vec_first_col)));
266 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in3_s0), 3, c7x::as_long_vec(vec_first_col)));
267 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in4_s0), 4, c7x::as_long_vec(vec_first_col)));
268 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in5_s0), 5, c7x::as_long_vec(vec_first_col)));
269 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in6_s0), 6, c7x::as_long_vec(vec_first_col)));
271 (*scale) += __abs(in0_s0);
272 (*scale) += __abs(in1_s0);
273 (*scale) += __abs(in2_s0);
274 (*scale) += __abs(in3_s0);
275 (*scale) += __abs(in4_s0);
276 (*scale) += __abs(in5_s0);
277 (*scale) += __abs(in6_s0);
284 vec_first_col = vec_first_col * (vec)
getRecip((*scale));
285 vec vec_first_col_sq = vec_first_col * vec_first_col;
286 c7x_horizontal_add(vec_first_col_sq, &s2);
290 if (vec_first_col.s[0] >= 0) {
294 half_norm_squared = vec_first_col.s[0] * (*s) - s2;
295 vec inv_hnsq = (vec)
getRecip(half_norm_squared);
296 vec_first_col.s[0] -= (*s);
298 vec vec_si = (vec) 0;
300 vec_si += (vec_first_col.s[0] * vec_in0);
301 vec_si += (vec_first_col.s[1] * vec_in1);
302 vec_si += (vec_first_col.s[2] * vec_in2);
303 vec_si += (vec_first_col.s[3] * vec_in3);
304 vec_si += (vec_first_col.s[4] * vec_in4);
305 vec_si += (vec_first_col.s[5] * vec_in5);
306 vec_si += (vec_first_col.s[6] * vec_in6);
308 vec_si = vec_si * inv_hnsq;
310 vec_in0 += vec_si * vec_first_col.s[0];
311 vec_in1 += vec_si * vec_first_col.s[1];
312 vec_in2 += vec_si * vec_first_col.s[2];
313 vec_in3 += vec_si * vec_first_col.s[3];
314 vec_in4 += vec_si * vec_first_col.s[4];
315 vec_in5 += vec_si * vec_first_col.s[5];
316 vec_in6 += vec_si * vec_first_col.s[6];
319 vec_first_col *= (*scale);
320 vec_in0.s[0] = vec_first_col.s[0];
321 vec_in1.s[0] = vec_first_col.s[1];
322 vec_in2.s[0] = vec_first_col.s[2];
323 vec_in3.s[0] = vec_first_col.s[3];
324 vec_in4.s[0] = vec_first_col.s[4];
325 vec_in5.s[0] = vec_first_col.s[5];
326 vec_in6.s[0] = vec_first_col.s[6];
328 diag[diag_idx] = (*s) * (*scale);
333 vec vec_row_mask = gen_u_row_process_mask<vec>();
335 vec vec_first_row = vec_row_mask * vec_in0;
336 c7x_horizontal_add_6_elems(__abs(vec_first_row), scale);
342 vec_first_row = vec_first_row * (vec)
getRecip((*scale));
343 vec vec_first_row_sq = vec_first_row * vec_first_row;
344 c7x_horizontal_add_6_elems(vec_first_row_sq, &s2);
347 if (vec_first_row.s[(num_cols - 1)] >= 0) {
351 half_norm_squared = vec_first_row.s[1] * (*s) - s2;
352 vec_first_row.s[1] -= (*s);
354 vec vec_superdiag_temp = vec_first_row * (vec)
getRecip(half_norm_squared);
356 vec vec_si = (vec) 0;
358 vec vec_temp1 = vec_first_row * vec_in1;
359 vec vec_temp2 = vec_first_row * vec_in2;
360 vec vec_temp3 = vec_first_row * vec_in3;
361 vec vec_temp4 = vec_first_row * vec_in4;
362 vec vec_temp5 = vec_first_row * vec_in5;
363 vec vec_temp6 = vec_first_row * vec_in6;
365 c7x_horizontal_add_6_elems(vec_temp1, &vec_si.s[1]);
366 c7x_horizontal_add_6_elems(vec_temp2, &vec_si.s[2]);
367 c7x_horizontal_add_6_elems(vec_temp3, &vec_si.s[3]);
368 c7x_horizontal_add_6_elems(vec_temp4, &vec_si.s[4]);
369 c7x_horizontal_add_6_elems(vec_temp5, &vec_si.s[5]);
370 c7x_horizontal_add_6_elems(vec_temp6, &vec_si.s[6]);
372 vec_in1 += vec_si.s[1] * vec_superdiag_temp;
373 vec_in2 += vec_si.s[2] * vec_superdiag_temp;
374 vec_in3 += vec_si.s[3] * vec_superdiag_temp;
375 vec_in4 += vec_si.s[4] * vec_superdiag_temp;
376 vec_in5 += vec_si.s[5] * vec_superdiag_temp;
377 vec_in6 += vec_si.s[6] * vec_superdiag_temp;
379 vec vec_row_rev_mask = gen_u_row_process_rev_mask<vec>();
380 vec_in0 *= vec_row_rev_mask;
381 vec_in0 += vec_first_row * (*scale);
384 #if U_PROC_SKIP_VLOAD
385 __vstore_pred(pred_col_ele, (vec *) (&U[0 + 0 * colUStride]), vec_in0);
386 U[0 + 1 * colUStride] = vec_in1.s[0];
387 U[0 + 2 * colUStride] = vec_in2.s[0];
388 U[0 + 3 * colUStride] = vec_in3.s[0];
389 U[0 + 4 * colUStride] = vec_in4.s[0];
390 U[0 + 5 * colUStride] = vec_in5.s[0];
391 vec_in0_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in1), 64u));
392 vec_in1_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in2), 64u));
393 vec_in2_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in3), 64u));
394 vec_in3_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in4), 64u));
395 vec_in4_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in5), 64u));
399 __vstore_pred(pred_col_ele, (vec *) (&U[0 + 0 * colUStride]), vec_in0);
400 __vstore_pred(pred_col_ele, (vec *) (&U[0 + 1 * colUStride]), vec_in1);
401 __vstore_pred(pred_col_ele, (vec *) (&U[0 + 2 * colUStride]), vec_in2);
402 __vstore_pred(pred_col_ele, (vec *) (&U[0 + 3 * colUStride]), vec_in3);
403 __vstore_pred(pred_col_ele, (vec *) (&U[0 + 4 * colUStride]), vec_in4);
404 __vstore_pred(pred_col_ele, (vec *) (&U[0 + 5 * colUStride]), vec_in5);
405 __vstore_pred(pred_col_ele, (vec *) (&U[0 + 6 * colUStride]), vec_in6);
414 template <
typename dataType>
415 static inline void u_process_1st_iter(dataType *U,
416 const int colUStride,
421 const uint32_t num_cols,
422 const uint32_t diag_idx);
424 inline void u_process_1st_iter<float>(
float *U,
425 const int colUStride,
430 const uint32_t num_cols,
431 const uint32_t diag_idx)
436 inline void u_process_1st_iter<double>(
double *U,
437 const int colUStride,
442 const uint32_t num_cols,
443 const uint32_t diag_idx)
445 double s2 = 0, half_norm_squared = 0;
452 typedef typename c7x::make_full_vector<double>::type vec;
455 __vpred pred_6_elem = __mask_long(num_cols);
457 #if U_PROC_SKIP_VLOAD
458 vec vec_in0 = vec_in0_u_proc;
459 vec vec_in1 = vec_in1_u_proc;
460 vec vec_in2 = vec_in2_u_proc;
461 vec vec_in3 = vec_in3_u_proc;
462 vec vec_in4 = vec_in4_u_proc;
463 vec vec_in5 = vec_in5_u_proc;
465 vec vec_in0 = __vload_pred(pred_6_elem, (vec *) (&U[0 + 0 * colUStride]));
466 vec vec_in1 = __vload_pred(pred_6_elem, (vec *) (&U[0 + 1 * colUStride]));
467 vec vec_in2 = __vload_pred(pred_6_elem, (vec *) (&U[0 + 2 * colUStride]));
468 vec vec_in3 = __vload_pred(pred_6_elem, (vec *) (&U[0 + 3 * colUStride]));
469 vec vec_in4 = __vload_pred(pred_6_elem, (vec *) (&U[0 + 4 * colUStride]));
470 vec vec_in5 = __vload_pred(pred_6_elem, (vec *) (&U[0 + 5 * colUStride]));
473 superdiag[diag_idx] = (*scale) * (*s);
481 double in0_s0 = vec_in0.s[0];
482 double in1_s0 = vec_in1.s[0];
483 double in2_s0 = vec_in2.s[0];
484 double in3_s0 = vec_in3.s[0];
485 double in4_s0 = vec_in4.s[0];
486 double in5_s0 = vec_in5.s[0];
489 vec vec_first_col = (vec) 0.0;
490 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in0_s0), 0, c7x::as_long_vec(vec_first_col)));
491 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in1_s0), 1, c7x::as_long_vec(vec_first_col)));
492 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in2_s0), 2, c7x::as_long_vec(vec_first_col)));
493 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in3_s0), 3, c7x::as_long_vec(vec_first_col)));
494 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in4_s0), 4, c7x::as_long_vec(vec_first_col)));
495 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in5_s0), 5, c7x::as_long_vec(vec_first_col)));
497 (*scale) += __abs(in0_s0);
498 (*scale) += __abs(in1_s0);
499 (*scale) += __abs(in2_s0);
500 (*scale) += __abs(in3_s0);
501 (*scale) += __abs(in4_s0);
502 (*scale) += __abs(in5_s0);
509 vec_first_col = vec_first_col * (vec)
getRecip((*scale));
510 vec vec_first_col_sq = vec_first_col * vec_first_col;
511 c7x_horizontal_add_6_elems(vec_first_col_sq, &s2);
515 if (vec_first_col.s[0] >= 0) {
519 half_norm_squared = vec_first_col.s[0] * (*s) - s2;
520 vec inv_hnsq = (vec)
getRecip(half_norm_squared);
521 vec_first_col.s[0] -= (*s);
523 vec vec_si = (vec) 0;
525 vec_si += (vec_first_col.s[0] * vec_in0);
526 vec_si += (vec_first_col.s[1] * vec_in1);
527 vec_si += (vec_first_col.s[2] * vec_in2);
528 vec_si += (vec_first_col.s[3] * vec_in3);
529 vec_si += (vec_first_col.s[4] * vec_in4);
530 vec_si += (vec_first_col.s[5] * vec_in5);
532 vec_si = vec_si * inv_hnsq;
534 vec_in0 += vec_si * vec_first_col.s[0];
535 vec_in1 += vec_si * vec_first_col.s[1];
536 vec_in2 += vec_si * vec_first_col.s[2];
537 vec_in3 += vec_si * vec_first_col.s[3];
538 vec_in4 += vec_si * vec_first_col.s[4];
539 vec_in5 += vec_si * vec_first_col.s[5];
542 vec_first_col *= (*scale);
543 vec_in0.s[0] = vec_first_col.s[0];
544 vec_in1.s[0] = vec_first_col.s[1];
545 vec_in2.s[0] = vec_first_col.s[2];
546 vec_in3.s[0] = vec_first_col.s[3];
547 vec_in4.s[0] = vec_first_col.s[4];
548 vec_in5.s[0] = vec_first_col.s[5];
550 diag[diag_idx] = (*s) * (*scale);
555 vec vec_row_mask = gen_u_row_process_mask<vec>();
557 vec vec_first_row = vec_row_mask * vec_in0;
558 c7x_horizontal_add_6_elems(__abs(vec_first_row), scale);
564 vec_first_row = vec_first_row * (vec)
getRecip((*scale));
565 vec vec_first_row_sq = vec_first_row * vec_first_row;
566 c7x_horizontal_add_6_elems(vec_first_row_sq, &s2);
569 if (vec_first_row.s[(num_cols - 1)] >= 0) {
573 half_norm_squared = vec_first_row.s[1] * (*s) - s2;
574 vec_first_row.s[1] -= (*s);
576 vec vec_superdiag_temp = vec_first_row * (vec)
getRecip(half_norm_squared);
578 vec vec_si = (vec) 0;
580 vec vec_temp1 = vec_first_row * vec_in1;
581 vec vec_temp2 = vec_first_row * vec_in2;
582 vec vec_temp3 = vec_first_row * vec_in3;
583 vec vec_temp4 = vec_first_row * vec_in4;
584 vec vec_temp5 = vec_first_row * vec_in5;
586 c7x_horizontal_add_6_elems(vec_temp1, &vec_si.s[1]);
587 c7x_horizontal_add_6_elems(vec_temp2, &vec_si.s[2]);
588 c7x_horizontal_add_6_elems(vec_temp3, &vec_si.s[3]);
589 c7x_horizontal_add_6_elems(vec_temp4, &vec_si.s[4]);
590 c7x_horizontal_add_6_elems(vec_temp5, &vec_si.s[5]);
592 vec_in1 += vec_si.s[1] * vec_superdiag_temp;
593 vec_in2 += vec_si.s[2] * vec_superdiag_temp;
594 vec_in3 += vec_si.s[3] * vec_superdiag_temp;
595 vec_in4 += vec_si.s[4] * vec_superdiag_temp;
596 vec_in5 += vec_si.s[5] * vec_superdiag_temp;
598 vec vec_row_rev_mask = gen_u_row_process_rev_mask<vec>();
599 vec_in0 *= vec_row_rev_mask;
600 vec_in0 += vec_first_row * (*scale);
603 #if U_PROC_SKIP_VLOAD
604 __vstore_pred(pred_6_elem, (vec *) (&U[0 + 0 * colUStride]), vec_in0);
605 U[0 + 1 * colUStride] = vec_in1.s[0];
606 U[0 + 2 * colUStride] = vec_in2.s[0];
607 U[0 + 3 * colUStride] = vec_in3.s[0];
608 U[0 + 4 * colUStride] = vec_in4.s[0];
609 U[0 + 5 * colUStride] = vec_in5.s[0];
610 vec_in0_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in1), 64u));
611 vec_in1_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in2), 64u));
612 vec_in2_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in3), 64u));
613 vec_in3_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in4), 64u));
614 vec_in4_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in5), 64u));
618 __vstore_pred(pred_6_elem, (vec *) (&U[0 + 0 * colUStride]), vec_in0);
619 __vstore_pred(pred_6_elem, (vec *) (&U[0 + 1 * colUStride]), vec_in1);
620 __vstore_pred(pred_6_elem, (vec *) (&U[0 + 2 * colUStride]), vec_in2);
621 __vstore_pred(pred_6_elem, (vec *) (&U[0 + 3 * colUStride]), vec_in3);
622 __vstore_pred(pred_6_elem, (vec *) (&U[0 + 4 * colUStride]), vec_in4);
623 __vstore_pred(pred_6_elem, (vec *) (&U[0 + 5 * colUStride]), vec_in5);
630 template <
typename dataType>
631 static inline void u_process_2nd_iter(dataType *U,
632 const int colUStride,
637 const uint32_t num_cols,
638 const uint32_t diag_idx);
640 inline void u_process_2nd_iter<float>(
float *U,
641 const int colUStride,
646 const uint32_t num_cols,
647 const uint32_t diag_idx)
652 inline void u_process_2nd_iter<double>(
double *U,
653 const int colUStride,
658 const uint32_t num_cols,
659 const uint32_t diag_idx)
661 double s2 = 0, half_norm_squared = 0;
666 typedef typename c7x::make_full_vector<double>::type vec;
669 __vpred pred_5_elem = __mask_long(num_cols);
671 #if U_PROC_SKIP_VLOAD
672 vec vec_in0 = vec_in0_u_proc;
673 vec vec_in1 = vec_in1_u_proc;
674 vec vec_in2 = vec_in2_u_proc;
675 vec vec_in3 = vec_in3_u_proc;
676 vec vec_in4 = vec_in4_u_proc;
678 vec vec_in0 = __vload_pred(pred_5_elem, (vec *) (&U[0 + 0 * colUStride]));
679 vec vec_in1 = __vload_pred(pred_5_elem, (vec *) (&U[0 + 1 * colUStride]));
680 vec vec_in2 = __vload_pred(pred_5_elem, (vec *) (&U[0 + 2 * colUStride]));
681 vec vec_in3 = __vload_pred(pred_5_elem, (vec *) (&U[0 + 3 * colUStride]));
682 vec vec_in4 = __vload_pred(pred_5_elem, (vec *) (&U[0 + 4 * colUStride]));
685 superdiag[diag_idx] = (*scale) * (*s);
693 double in0_s0 = vec_in0.s[0];
694 double in1_s0 = vec_in1.s[0];
695 double in2_s0 = vec_in2.s[0];
696 double in3_s0 = vec_in3.s[0];
697 double in4_s0 = vec_in4.s[0];
700 vec vec_first_col = (vec) 0.0;
701 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in0_s0), 0, c7x::as_long_vec(vec_first_col)));
702 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in1_s0), 1, c7x::as_long_vec(vec_first_col)));
703 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in2_s0), 2, c7x::as_long_vec(vec_first_col)));
704 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in3_s0), 3, c7x::as_long_vec(vec_first_col)));
705 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in4_s0), 4, c7x::as_long_vec(vec_first_col)));
707 (*scale) += __abs(in0_s0);
708 (*scale) += __abs(in1_s0);
709 (*scale) += __abs(in2_s0);
710 (*scale) += __abs(in3_s0);
711 (*scale) += __abs(in4_s0);
716 vec_first_col = vec_first_col * (vec)
getRecip((*scale));
717 vec vec_first_col_sq = vec_first_col * vec_first_col;
718 c7x_horizontal_add_5_elems(vec_first_col_sq, &s2);
721 if (vec_first_col.s[0] >= 0) {
725 half_norm_squared = vec_first_col.s[0] * (*s) - s2;
726 vec_first_col.s[0] -= (*s);
728 vec vec_si = (vec) 0;
730 vec_si += (vec_first_col.s[0] * vec_in0);
731 vec_si += (vec_first_col.s[1] * vec_in1);
732 vec_si += (vec_first_col.s[2] * vec_in2);
733 vec_si += (vec_first_col.s[3] * vec_in3);
734 vec_si += (vec_first_col.s[4] * vec_in4);
736 vec_si = vec_si * (vec)
getRecip(half_norm_squared);
738 vec_in0 += vec_si * vec_first_col.s[0];
739 vec_in1 += vec_si * vec_first_col.s[1];
740 vec_in2 += vec_si * vec_first_col.s[2];
741 vec_in3 += vec_si * vec_first_col.s[3];
742 vec_in4 += vec_si * vec_first_col.s[4];
744 vec_first_col *= (*scale);
745 vec_in0.s[0] = vec_first_col.s[0];
746 vec_in1.s[0] = vec_first_col.s[1];
747 vec_in2.s[0] = vec_first_col.s[2];
748 vec_in3.s[0] = vec_first_col.s[3];
749 vec_in4.s[0] = vec_first_col.s[4];
751 diag[diag_idx] = (*s) * (*scale);
756 vec vec_row_mask = gen_u_row_process_mask<vec>();
758 vec vec_first_row = vec_row_mask * vec_in0;
759 c7x_horizontal_add_5_elems(__abs(vec_first_row), scale);
763 vec_first_row = vec_first_row * (vec)
getRecip((*scale));
764 vec vec_first_row_sq = vec_first_row * vec_first_row;
765 c7x_horizontal_add_5_elems(vec_first_row_sq, &s2);
768 if (vec_first_row.s[(num_cols - 1)] >= 0) {
772 half_norm_squared = vec_first_row.s[1] * (*s) - s2;
773 vec_first_row.s[1] -= (*s);
775 vec vec_superdiag_temp = vec_first_row * (vec)
getRecip(half_norm_squared);
777 vec vec_si = (vec) 0;
779 vec vec_temp1 = vec_first_row * vec_in1;
780 vec vec_temp2 = vec_first_row * vec_in2;
781 vec vec_temp3 = vec_first_row * vec_in3;
782 vec vec_temp4 = vec_first_row * vec_in4;
784 c7x_horizontal_add_5_elems(vec_temp1, &vec_si.s[1]);
785 c7x_horizontal_add_5_elems(vec_temp2, &vec_si.s[2]);
786 c7x_horizontal_add_5_elems(vec_temp3, &vec_si.s[3]);
787 c7x_horizontal_add_5_elems(vec_temp4, &vec_si.s[4]);
789 vec_in1 += vec_si.s[1] * vec_superdiag_temp;
790 vec_in2 += vec_si.s[2] * vec_superdiag_temp;
791 vec_in3 += vec_si.s[3] * vec_superdiag_temp;
792 vec_in4 += vec_si.s[4] * vec_superdiag_temp;
794 vec vec_row_rev_mask = gen_u_row_process_rev_mask<vec>();
795 vec_in0 *= vec_row_rev_mask;
796 vec_in0 += vec_first_row * (*scale);
799 #if U_PROC_SKIP_VLOAD
800 __vstore_pred(pred_5_elem, (vec *) (&U[0 + 0 * colUStride]), vec_in0);
801 U[0 + 1 * colUStride] = vec_in1.s[0];
802 U[0 + 2 * colUStride] = vec_in2.s[0];
803 U[0 + 3 * colUStride] = vec_in3.s[0];
804 U[0 + 4 * colUStride] = vec_in4.s[0];
806 vec_in0_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in1), 64u));
807 vec_in1_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in2), 64u));
808 vec_in2_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in3), 64u));
809 vec_in3_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in4), 64u));
811 __vstore_pred(pred_5_elem, (vec *) (&U[0 + 0 * colUStride]), vec_in0);
812 __vstore_pred(pred_5_elem, (vec *) (&U[0 + 1 * colUStride]), vec_in1);
813 __vstore_pred(pred_5_elem, (vec *) (&U[0 + 2 * colUStride]), vec_in2);
814 __vstore_pred(pred_5_elem, (vec *) (&U[0 + 3 * colUStride]), vec_in3);
815 __vstore_pred(pred_5_elem, (vec *) (&U[0 + 4 * colUStride]), vec_in4);
821 template <
typename dataType>
822 static inline void u_process_3rd_iter(dataType *U,
823 const int colUStride,
828 const uint32_t num_cols,
829 const uint32_t diag_idx);
831 inline void u_process_3rd_iter<float>(
float *U,
832 const int colUStride,
837 const uint32_t num_cols,
838 const uint32_t diag_idx)
843 inline void u_process_3rd_iter<double>(
double *U,
844 const int colUStride,
849 const uint32_t num_cols,
850 const uint32_t diag_idx)
852 double s2 = 0, half_norm_squared = 0;
857 typedef typename c7x::make_full_vector<double>::type vec;
860 __vpred pred_4_elem = __mask_long(num_cols);
862 #if U_PROC_SKIP_VLOAD
863 vec vec_in0 = vec_in0_u_proc;
864 vec vec_in1 = vec_in1_u_proc;
865 vec vec_in2 = vec_in2_u_proc;
866 vec vec_in3 = vec_in3_u_proc;
868 vec vec_in0 = __vload_pred(pred_4_elem, (vec *) (&U[0 + 0 * colUStride]));
869 vec vec_in1 = __vload_pred(pred_4_elem, (vec *) (&U[0 + 1 * colUStride]));
870 vec vec_in2 = __vload_pred(pred_4_elem, (vec *) (&U[0 + 2 * colUStride]));
871 vec vec_in3 = __vload_pred(pred_4_elem, (vec *) (&U[0 + 3 * colUStride]));
874 superdiag[diag_idx] = (*scale) * (*s);
882 double in0_s0 = vec_in0.s[0];
883 double in1_s0 = vec_in1.s[0];
884 double in2_s0 = vec_in2.s[0];
885 double in3_s0 = vec_in3.s[0];
888 vec vec_first_col = (vec) 0.0;
889 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in0_s0), 0, c7x::as_long_vec(vec_first_col)));
890 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in1_s0), 1, c7x::as_long_vec(vec_first_col)));
891 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in2_s0), 2, c7x::as_long_vec(vec_first_col)));
892 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in3_s0), 3, c7x::as_long_vec(vec_first_col)));
894 (*scale) += __abs(in0_s0);
895 (*scale) += __abs(in1_s0);
896 (*scale) += __abs(in2_s0);
897 (*scale) += __abs(in3_s0);
902 vec_first_col = vec_first_col * (vec)
getRecip((*scale));
903 vec vec_first_col_sq = vec_first_col * vec_first_col;
904 c7x_horizontal_add_4_elems(vec_first_col_sq, &s2);
907 if (vec_first_col.s[0] >= 0) {
911 half_norm_squared = vec_first_col.s[0] * (*s) - s2;
912 vec_first_col.s[0] -= (*s);
914 vec vec_si = (vec) 0;
916 vec_si += (vec_first_col.s[0] * vec_in0);
917 vec_si += (vec_first_col.s[1] * vec_in1);
918 vec_si += (vec_first_col.s[2] * vec_in2);
919 vec_si += (vec_first_col.s[3] * vec_in3);
921 vec_si = vec_si * (vec)
getRecip(half_norm_squared);
923 vec_in0 += vec_si * vec_first_col.s[0];
924 vec_in1 += vec_si * vec_first_col.s[1];
925 vec_in2 += vec_si * vec_first_col.s[2];
926 vec_in3 += vec_si * vec_first_col.s[3];
928 vec_first_col *= (*scale);
929 vec_in0.s[0] = vec_first_col.s[0];
930 vec_in1.s[0] = vec_first_col.s[1];
931 vec_in2.s[0] = vec_first_col.s[2];
932 vec_in3.s[0] = vec_first_col.s[3];
934 diag[diag_idx] = (*s) * (*scale);
939 vec vec_row_mask = gen_u_row_process_mask<vec>();
941 vec vec_first_row = vec_row_mask * vec_in0;
942 c7x_horizontal_add_4_elems(__abs(vec_first_row), scale);
946 vec_first_row = vec_first_row * (vec)
getRecip((*scale));
947 vec vec_first_row_sq = vec_first_row * vec_first_row;
948 c7x_horizontal_add_4_elems(vec_first_row_sq, &s2);
951 if (vec_first_row.s[(num_cols - 1)] >= 0) {
955 half_norm_squared = vec_first_row.s[1] * (*s) - s2;
956 vec_first_row.s[1] -= (*s);
958 vec vec_superdiag_temp = vec_first_row * (vec)
getRecip(half_norm_squared);
960 vec vec_si = (vec) 0;
962 vec vec_temp1 = vec_first_row * vec_in1;
963 vec vec_temp2 = vec_first_row * vec_in2;
964 vec vec_temp3 = vec_first_row * vec_in3;
966 c7x_horizontal_add_4_elems(vec_temp1, &vec_si.s[1]);
967 c7x_horizontal_add_4_elems(vec_temp2, &vec_si.s[2]);
968 c7x_horizontal_add_4_elems(vec_temp3, &vec_si.s[3]);
970 vec_in1 += vec_si.s[1] * vec_superdiag_temp;
971 vec_in2 += vec_si.s[2] * vec_superdiag_temp;
972 vec_in3 += vec_si.s[3] * vec_superdiag_temp;
974 vec vec_row_rev_mask = gen_u_row_process_rev_mask<vec>();
975 vec_in0 *= vec_row_rev_mask;
976 vec_in0 += vec_first_row * (*scale);
979 #if U_PROC_SKIP_VLOAD
980 __vstore_pred(pred_4_elem, (vec *) (&U[0 + 0 * colUStride]), vec_in0);
981 U[0 + 1 * colUStride] = vec_in1.s[0];
982 U[0 + 2 * colUStride] = vec_in2.s[0];
983 U[0 + 3 * colUStride] = vec_in3.s[0];
985 vec_in0_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in1), 64u));
986 vec_in1_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in2), 64u));
987 vec_in2_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in3), 64u));
989 __vstore_pred(pred_4_elem, (vec *) (&U[0 + 0 * colUStride]), vec_in0);
990 __vstore_pred(pred_4_elem, (vec *) (&U[0 + 1 * colUStride]), vec_in1);
991 __vstore_pred(pred_4_elem, (vec *) (&U[0 + 2 * colUStride]), vec_in2);
992 __vstore_pred(pred_4_elem, (vec *) (&U[0 + 3 * colUStride]), vec_in3);
998 template <
typename dataType>
999 static inline void u_process_4th_iter(dataType *U,
1000 const int colUStride,
1002 dataType *superdiag,
1005 const uint32_t num_cols,
1006 const uint32_t diag_idx);
1008 inline void u_process_4th_iter<float>(
float *U,
1009 const int colUStride,
1014 const uint32_t num_cols,
1015 const uint32_t diag_idx)
1020 inline void u_process_4th_iter<double>(
double *U,
1021 const int colUStride,
1026 const uint32_t num_cols,
1027 const uint32_t diag_idx)
1029 double s2 = 0, half_norm_squared = 0;
1034 typedef typename c7x::make_full_vector<double>::type vec;
1036 __vpred pred_3_elem = __mask_long(num_cols);
1038 #if U_PROC_SKIP_VLOAD
1039 vec vec_in0 = vec_in0_u_proc;
1040 vec vec_in1 = vec_in1_u_proc;
1041 vec vec_in2 = vec_in2_u_proc;
1043 vec vec_in0 = __vload_pred(pred_3_elem, (vec *) (&U[0 + 0 * colUStride]));
1044 vec vec_in1 = __vload_pred(pred_3_elem, (vec *) (&U[0 + 1 * colUStride]));
1045 vec vec_in2 = __vload_pred(pred_3_elem, (vec *) (&U[0 + 2 * colUStride]));
1047 superdiag[diag_idx] = (*scale) * (*s);
1055 double in0_s0 = vec_in0.s[0];
1056 double in1_s0 = vec_in1.s[0];
1057 double in2_s0 = vec_in2.s[0];
1060 vec vec_first_col = (vec) 0.0;
1061 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in0_s0), 0, c7x::as_long_vec(vec_first_col)));
1062 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in1_s0), 1, c7x::as_long_vec(vec_first_col)));
1063 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in2_s0), 2, c7x::as_long_vec(vec_first_col)));
1065 (*scale) += __abs(in0_s0);
1066 (*scale) += __abs(in1_s0);
1067 (*scale) += __abs(in2_s0);
1072 vec_first_col = vec_first_col * (vec)
getRecip((*scale));
1073 vec vec_first_col_sq = vec_first_col * vec_first_col;
1074 c7x_horizontal_add_3_elems(vec_first_col_sq, &s2);
1077 if (vec_first_col.s[0] >= 0) {
1081 half_norm_squared = vec_first_col.s[0] * (*s) - s2;
1082 vec_first_col.s[0] -= (*s);
1084 vec vec_si = (vec) 0;
1086 vec_si += (vec_first_col.s[0] * vec_in0);
1087 vec_si += (vec_first_col.s[1] * vec_in1);
1088 vec_si += (vec_first_col.s[2] * vec_in2);
1090 vec_si = vec_si * (vec)
getRecip(half_norm_squared);
1092 vec_in0 += vec_si * vec_first_col.s[0];
1093 vec_in1 += vec_si * vec_first_col.s[1];
1094 vec_in2 += vec_si * vec_first_col.s[2];
1096 vec_first_col *= (*scale);
1097 vec_in0.s[0] = vec_first_col.s[0];
1098 vec_in1.s[0] = vec_first_col.s[1];
1099 vec_in2.s[0] = vec_first_col.s[2];
1101 diag[diag_idx] = (*s) * (*scale);
1106 vec vec_row_mask = gen_u_row_process_mask<vec>();
1108 vec vec_first_row = vec_row_mask * vec_in0;
1109 c7x_horizontal_add_3_elems(__abs(vec_first_row), scale);
1113 vec_first_row = vec_first_row * (vec)
getRecip((*scale));
1114 vec vec_first_row_sq = vec_first_row * vec_first_row;
1115 c7x_horizontal_add_3_elems(vec_first_row_sq, &s2);
1118 if (vec_first_row.s[(num_cols - 1)] >= 0) {
1122 half_norm_squared = vec_first_row.s[1] * (*s) - s2;
1123 vec_first_row.s[1] -= (*s);
1125 vec vec_superdiag_temp = vec_first_row * (vec)
getRecip(half_norm_squared);
1127 vec vec_si = (vec) 0;
1129 vec vec_temp1 = vec_first_row * vec_in1;
1130 vec vec_temp2 = vec_first_row * vec_in2;
1132 c7x_horizontal_add_3_elems(vec_temp1, &vec_si.s[1]);
1133 c7x_horizontal_add_3_elems(vec_temp2, &vec_si.s[2]);
1135 vec_in1 += vec_si.s[1] * vec_superdiag_temp;
1136 vec_in2 += vec_si.s[2] * vec_superdiag_temp;
1138 vec vec_row_rev_mask = gen_u_row_process_rev_mask<vec>();
1139 vec_in0 *= vec_row_rev_mask;
1140 vec_in0 += vec_first_row * (*scale);
1143 #if U_PROC_SKIP_VLOAD
1144 __vstore_pred(pred_3_elem, (vec *) (&U[0 + 0 * colUStride]), vec_in0);
1145 U[0 + 1 * colUStride] = vec_in1.s[0];
1146 U[0 + 2 * colUStride] = vec_in2.s[0];
1148 vec_in0_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in1), 64u));
1149 vec_in1_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in2), 64u));
1151 __vstore_pred(pred_3_elem, (vec *) (&U[0 + 0 * colUStride]), vec_in0);
1152 __vstore_pred(pred_3_elem, (vec *) (&U[0 + 1 * colUStride]), vec_in1);
1153 __vstore_pred(pred_3_elem, (vec *) (&U[0 + 2 * colUStride]), vec_in2);
1159 template <
typename dataType>
1160 static inline void u_process_5th_iter(dataType *U,
1161 const int colUStride,
1163 dataType *superdiag,
1166 const uint32_t num_cols,
1167 const uint32_t diag_idx);
1169 inline void u_process_5th_iter<float>(
float *U,
1170 const int colUStride,
1175 const uint32_t num_cols,
1176 const uint32_t diag_idx)
1181 inline void u_process_5th_iter<double>(
double *U,
1182 const int colUStride,
1187 const uint32_t num_cols,
1188 const uint32_t diag_idx)
1190 double s2 = 0, half_norm_squared = 0;
1196 typedef typename c7x::make_full_vector<double>::type vec;
1199 __vpred pred_2_elem = __mask_long(num_cols);
1201 #if U_PROC_SKIP_VLOAD
1202 vec vec_in0 = vec_in0_u_proc;
1203 vec vec_in1 = vec_in1_u_proc;
1205 vec vec_in0 = __vload_pred(pred_2_elem, (vec *) (&U[0 + 0 * colUStride]));
1206 vec vec_in1 = __vload_pred(pred_2_elem, (vec *) (&U[0 + 1 * colUStride]));
1208 superdiag[diag_idx] = (*scale) * (*s);
1216 double in0_s0 = vec_in0.s[0];
1217 double in1_s0 = vec_in1.s[0];
1220 vec vec_first_col = (vec) 0.0;
1221 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in0_s0), 0, c7x::as_long_vec(vec_first_col)));
1222 vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in1_s0), 1, c7x::as_long_vec(vec_first_col)));
1224 (*scale) += __abs(in0_s0);
1225 (*scale) += __abs(in1_s0);
1230 vec_first_col = vec_first_col * (vec)
getRecip((*scale));
1231 vec vec_first_col_sq = vec_first_col * vec_first_col;
1232 c7x_horizontal_add_2_elems(vec_first_col_sq, &s2);
1235 if (vec_first_col.s[0] >= 0) {
1239 half_norm_squared = vec_first_col.s[0] * (*s) - s2;
1240 vec_first_col.s[0] -= (*s);
1242 vec vec_si = (vec) 0;
1244 vec_si += (vec_first_col.s[0] * vec_in0);
1245 vec_si += (vec_first_col.s[1] * vec_in1);
1247 vec_si = vec_si * (vec)
getRecip(half_norm_squared);
1249 vec_in0 += vec_si * vec_first_col.s[0];
1250 vec_in1 += vec_si * vec_first_col.s[1];
1252 vec_first_col *= (*scale);
1253 vec_in0.s[0] = vec_first_col.s[0];
1254 vec_in1.s[0] = vec_first_col.s[1];
1256 diag[diag_idx] = (*s) * (*scale);
1261 vec vec_row_mask = gen_u_row_process_mask<vec>();
1263 vec vec_first_row = vec_row_mask * vec_in0;
1264 c7x_horizontal_add_2_elems(__abs(vec_first_row), scale);
1268 vec_first_row = vec_first_row * (vec)
getRecip((*scale));
1269 vec vec_first_row_sq = vec_first_row * vec_first_row;
1270 c7x_horizontal_add_2_elems(vec_first_row_sq, &s2);
1273 if (vec_first_row.s[(num_cols - 1)] >= 0) {
1277 half_norm_squared = vec_first_row.s[1] * (*s) - s2;
1278 vec_first_row.s[1] -= (*s);
1280 vec vec_superdiag_temp = vec_first_row * (vec)
getRecip(half_norm_squared);
1282 vec vec_si = (vec) 0;
1284 vec vec_temp1 = vec_first_row * vec_in1;
1286 c7x_horizontal_add_2_elems(vec_temp1, &vec_si.s[1]);
1288 vec_in1 += vec_si.s[1] * vec_superdiag_temp;
1290 vec vec_row_rev_mask = gen_u_row_process_rev_mask<vec>();
1291 vec_in0 *= vec_row_rev_mask;
1292 vec_in0 += vec_first_row * (*scale);
1295 #if U_PROC_SKIP_VLOAD
1296 __vstore_pred(pred_2_elem, (vec *) (&U[0 + 0 * colUStride]), vec_in0);
1297 __vstore_pred(pred_2_elem, (vec *) (&U[0 + 1 * colUStride]), vec_in1);
1299 vec_in0_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in1), 64u));
1301 __vstore_pred(pred_2_elem, (vec *) (&U[0 + 0 * colUStride]), vec_in0);
1302 __vstore_pred(pred_2_elem, (vec *) (&U[0 + 1 * colUStride]), vec_in1);
1308 template <
typename dataType>
1309 static inline void u_process_6th_iter(dataType *U,
1310 const int colUStride,
1312 dataType *superdiag,
1315 const uint32_t num_cols,
1316 const uint32_t diag_idx);
1318 inline void u_process_6th_iter<float>(
float *U,
1319 const int colUStride,
1324 const uint32_t num_cols,
1325 const uint32_t diag_idx)
1330 inline void u_process_6th_iter<double>(
double *U,
1331 const int colUStride,
1336 const uint32_t num_cols,
1337 const uint32_t diag_idx)
1339 double s2 = 0, half_norm_squared = 0;
1344 double vec_in0 = U[0 + 0 * colUStride];
1346 superdiag[5] = (*scale) * (*s);
1354 double in0_s0 = vec_in0;
1357 double vec_first_col = in0_s0;
1359 (*scale) += __abs(in0_s0);
1361 if ((*scale) > 0.0) {
1364 vec_first_col = vec_first_col * (double)
getRecip((*scale));
1365 double vec_first_col_sq = vec_first_col * vec_first_col;
1366 s2 = vec_first_col_sq;
1369 if (vec_first_col >= 0) {
1373 half_norm_squared = vec_first_col * (*s) - s2;
1374 vec_first_col -= (*s);
1376 double vec_si = (double) 0.0;
1378 vec_si += (vec_first_col * vec_in0);
1380 vec_si = vec_si * (double)
getRecip(half_norm_squared);
1382 vec_in0 += vec_si * vec_first_col;
1384 vec_first_col *= (*scale);
1385 vec_in0 = vec_first_col;
1387 diag[5] = (*s) * (*scale);
1392 U[0 + 0 * colUStride] = vec_in0;
1404 #if (__C7X_VEC_SIZE_BITS__ == 512)
1405 template <
typename X>
static inline X gen_plusOneVec();
1407 template <>
inline c7x::float_vec gen_plusOneVec<c7x::float_vec>()
1409 c7x::float_vec ret_vec = c7x::float_vec(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
1413 template <>
inline c7x::double_vec gen_plusOneVec<c7x::double_vec>()
1415 c7x::double_vec ret_vec = c7x::double_vec(1, 0, 0, 0, 0, 0, 0, 0);
1418 #elif (__C7X_VEC_SIZE_BITS__ == 256)
1419 template <
typename X>
static inline X gen_plusOneVec();
1421 template <>
inline c7x::float_vec gen_plusOneVec<c7x::float_vec>()
1423 c7x::float_vec ret_vec = c7x::float_vec(1, 0, 0, 0, 0, 0, 0, 0);
1427 template <>
inline c7x::double_vec gen_plusOneVec<c7x::double_vec>()
1429 c7x::double_vec ret_vec = c7x::double_vec(1, 0, 0, 0);
1436 template <
typename dataType>
1437 static inline void u_update_1st_iter(dataType *U,
const int colUStride, dataType *diag, dataType s);
1438 template <>
inline void u_update_1st_iter<float>(
float *U,
const int colUStride,
float *diag,
float s) {
return; }
1439 template <>
inline void u_update_1st_iter<double>(
double *U,
const int colUStride,
double *diag,
double s)
1442 typedef typename c7x::make_full_vector<double>::type vec;
1444 __vpred pred_1_elem = __create_vpred(0x00000000000000FFU);
1445 vec vec_in0 = (vec) 0;
1446 vec vec_mask_first_row = gen_plusOneVec<vec>();
1448 vec_in0 = __vload_pred(pred_1_elem, (vec *) U);
1449 vec_in0 = vec_in0 * (vec)
getRecip(s) + vec_mask_first_row;
1450 __vstore_pred(pred_1_elem, (vec *) U, vec_in0);
1453 __vstore_pred(pred_1_elem, (vec *) U, (vec_in0 + vec_mask_first_row));
1459 template <
typename dataType>
1460 static inline void u_update_2nd_iter(dataType *U,
const int colUStride, dataType *diag, dataType s);
1461 template <>
inline void u_update_2nd_iter<float>(
float *U,
const int colUStride,
float *diag,
float s) {
return; }
1462 template <>
inline void u_update_2nd_iter<double>(
double *U,
const int colUStride,
double *diag,
double s)
1465 typedef typename c7x::make_full_vector<double>::type vec;
1467 vec vec_mask_first_row = gen_plusOneVec<vec>();
1468 __vpred pred_2_elem = __create_vpred(0x000000000000FFFFU);
1470 vec vec_first_col = (vec) 0;
1471 vec vec_si = (vec) 0.0;
1472 vec vec_in0 = __vload_pred(pred_2_elem, (vec *) &U[0 + 0 * colUStride]);
1473 vec_first_col.s[0] = vec_in0.s[0];
1474 vec_in0 = vec_in0 * vec_mask_first_row;
1475 vec norm_factor = (vec) (vec_in0.s[0] * s);
1476 vec inv_norm_factor =
getRecip(norm_factor);
1479 vec vec_in1 = __vload_pred(pred_2_elem, (vec *) &U[0 + 1 * colUStride]);
1480 vec_first_col.s[1] = vec_in1.s[0];
1482 vec_si += vec_first_col.s[1] * vec_in1;
1484 vec_si = vec_si * inv_norm_factor;
1486 vec_in0 += vec_si * vec_first_col.s[0];
1487 vec_in1 += vec_si * vec_first_col.s[1];
1489 vec_first_col = vec_first_col * (vec)
getRecip(s);
1491 vec_in0.s[0] = vec_first_col.s[0];
1492 vec_in1.s[0] = vec_first_col.s[1];
1494 vec_in0 += vec_mask_first_row;
1496 __vstore_pred(pred_2_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1497 __vstore_pred(pred_2_elem, (vec *) &U[0 + 1 * colUStride], vec_in1);
1500 vec_in0 += vec_mask_first_row;
1501 __vstore_pred(pred_2_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1502 U[0 + 1 * colUStride] = 0.0;
1508 template <
typename dataType>
1509 static inline void u_update_3rd_iter(dataType *U,
const int colUStride, dataType *diag, dataType s);
1510 template <>
inline void u_update_3rd_iter<float>(
float *U,
const int colUStride,
float *diag,
float s) {
return; }
1511 template <>
inline void u_update_3rd_iter<double>(
double *U,
const int colUStride,
double *diag,
double s)
1514 typedef typename c7x::make_full_vector<double>::type vec;
1516 vec vec_mask_first_row = gen_plusOneVec<vec>();
1517 __vpred pred_3_elem = __create_vpred(0x0000000000FFFFFFU);
1519 vec vec_first_col = (vec) 0;
1520 vec vec_si = (vec) 0.0;
1521 vec vec_in0 = __vload_pred(pred_3_elem, (vec *) &U[0 + 0 * colUStride]);
1522 vec_first_col.s[0] = vec_in0.s[0];
1523 vec_in0 = vec_in0 * vec_mask_first_row;
1524 vec norm_factor = (vec) (vec_in0.s[0] * s);
1525 vec inv_norm_factor =
getRecip(norm_factor);
1528 vec vec_in1 = __vload_pred(pred_3_elem, (vec *) &U[0 + 1 * colUStride]);
1529 vec vec_in2 = __vload_pred(pred_3_elem, (vec *) &U[0 + 2 * colUStride]);
1531 vec_first_col.s[1] = vec_in1.s[0];
1532 vec_first_col.s[2] = vec_in2.s[0];
1534 vec_si += vec_first_col.s[1] * vec_in1;
1535 vec_si += vec_first_col.s[2] * vec_in2;
1537 vec_si = vec_si * inv_norm_factor;
1539 vec_in0 += vec_si * vec_first_col.s[0];
1540 vec_in1 += vec_si * vec_first_col.s[1];
1541 vec_in2 += vec_si * vec_first_col.s[2];
1543 vec_first_col = vec_first_col * (vec)
getRecip(s);
1545 vec_in0.s[0] = vec_first_col.s[0];
1546 vec_in1.s[0] = vec_first_col.s[1];
1547 vec_in2.s[0] = vec_first_col.s[2];
1549 vec_in0 += vec_mask_first_row;
1551 __vstore_pred(pred_3_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1552 __vstore_pred(pred_3_elem, (vec *) &U[0 + 1 * colUStride], vec_in1);
1553 __vstore_pred(pred_3_elem, (vec *) &U[0 + 2 * colUStride], vec_in2);
1556 vec_in0 += vec_mask_first_row;
1557 __vstore_pred(pred_3_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1558 U[0 + 1 * colUStride] = 0.0;
1559 U[0 + 2 * colUStride] = 0.0;
1565 template <
typename dataType>
1566 static inline void u_update_4th_iter(dataType *U,
const int colUStride, dataType *diag, dataType s);
1567 template <>
inline void u_update_4th_iter<float>(
float *U,
const int colUStride,
float *diag,
float s) {
return; }
1568 template <>
inline void u_update_4th_iter<double>(
double *U,
const int colUStride,
double *diag,
double s)
1571 typedef typename c7x::make_full_vector<double>::type vec;
1573 vec vec_mask_first_row = gen_plusOneVec<vec>();
1574 __vpred pred_4_elem = __create_vpred(0x00000000FFFFFFFFU);
1576 vec vec_first_col = (vec) 0;
1577 vec vec_si = (vec) 0.0;
1578 vec vec_in0 = __vload_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride]);
1579 vec_first_col.s[0] = vec_in0.s[0];
1580 vec_in0 = vec_in0 * vec_mask_first_row;
1581 vec norm_factor = (vec) (vec_in0.s[0] * s);
1582 vec inv_norm_factor =
getRecip(norm_factor);
1585 vec vec_in1 = __vload_pred(pred_4_elem, (vec *) &U[0 + 1 * colUStride]);
1586 vec vec_in2 = __vload_pred(pred_4_elem, (vec *) &U[0 + 2 * colUStride]);
1587 vec vec_in3 = __vload_pred(pred_4_elem, (vec *) &U[0 + 3 * colUStride]);
1589 vec_first_col.s[1] = vec_in1.s[0];
1590 vec_first_col.s[2] = vec_in2.s[0];
1591 vec_first_col.s[3] = vec_in3.s[0];
1593 vec_si += vec_first_col.s[1] * vec_in1;
1594 vec_si += vec_first_col.s[2] * vec_in2;
1595 vec_si += vec_first_col.s[3] * vec_in3;
1597 vec_si = vec_si * inv_norm_factor;
1599 vec_in0 += vec_si * vec_first_col.s[0];
1600 vec_in1 += vec_si * vec_first_col.s[1];
1601 vec_in2 += vec_si * vec_first_col.s[2];
1602 vec_in3 += vec_si * vec_first_col.s[3];
1604 vec_first_col = vec_first_col * (vec)
getRecip(s);
1606 vec_in0.s[0] = vec_first_col.s[0];
1607 vec_in1.s[0] = vec_first_col.s[1];
1608 vec_in2.s[0] = vec_first_col.s[2];
1609 vec_in3.s[0] = vec_first_col.s[3];
1611 vec_in0 += vec_mask_first_row;
1613 __vstore_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1614 __vstore_pred(pred_4_elem, (vec *) &U[0 + 1 * colUStride], vec_in1);
1615 __vstore_pred(pred_4_elem, (vec *) &U[0 + 2 * colUStride], vec_in2);
1616 __vstore_pred(pred_4_elem, (vec *) &U[0 + 3 * colUStride], vec_in3);
1619 vec_in0 += vec_mask_first_row;
1620 __vstore_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1621 U[0 + 1 * colUStride] = 0.0;
1622 U[0 + 2 * colUStride] = 0.0;
1623 U[0 + 3 * colUStride] = 0.0;
1629 template <
typename dataType>
1630 static inline void u_update_5th_iter(dataType *U,
const int colUStride, dataType *diag, dataType s);
1631 template <>
inline void u_update_5th_iter<float>(
float *U,
const int colUStride,
float *diag,
float s) {
return; }
1632 template <>
inline void u_update_5th_iter<double>(
double *U,
const int colUStride,
double *diag,
double s)
1635 typedef typename c7x::make_full_vector<double>::type vec;
1637 vec vec_mask_first_row = gen_plusOneVec<vec>();
1638 __vpred pred_4_elem = __create_vpred(0x000000FFFFFFFFFFU);
1640 vec vec_first_col = (vec) 0;
1641 vec vec_si = (vec) 0.0;
1642 vec vec_in0 = __vload_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride]);
1643 vec_first_col.s[0] = vec_in0.s[0];
1644 vec_in0 = vec_in0 * vec_mask_first_row;
1645 vec norm_factor = (vec) (vec_in0.s[0] * s);
1646 vec inv_norm_factor =
getRecip(norm_factor);
1649 vec vec_in1 = __vload_pred(pred_4_elem, (vec *) &U[0 + 1 * colUStride]);
1650 vec vec_in2 = __vload_pred(pred_4_elem, (vec *) &U[0 + 2 * colUStride]);
1651 vec vec_in3 = __vload_pred(pred_4_elem, (vec *) &U[0 + 3 * colUStride]);
1652 vec vec_in4 = __vload_pred(pred_4_elem, (vec *) &U[0 + 4 * colUStride]);
1654 vec_first_col.s[1] = vec_in1.s[0];
1655 vec_first_col.s[2] = vec_in2.s[0];
1656 vec_first_col.s[3] = vec_in3.s[0];
1657 vec_first_col.s[4] = vec_in4.s[0];
1659 vec_si += vec_first_col.s[1] * vec_in1;
1660 vec_si += vec_first_col.s[2] * vec_in2;
1661 vec_si += vec_first_col.s[3] * vec_in3;
1662 vec_si += vec_first_col.s[4] * vec_in4;
1664 vec_si = vec_si * inv_norm_factor;
1666 vec_in0 += vec_si * vec_first_col.s[0];
1667 vec_in1 += vec_si * vec_first_col.s[1];
1668 vec_in2 += vec_si * vec_first_col.s[2];
1669 vec_in3 += vec_si * vec_first_col.s[3];
1670 vec_in4 += vec_si * vec_first_col.s[4];
1672 vec_first_col = vec_first_col * (vec)
getRecip(s);
1674 vec_in0.s[0] = vec_first_col.s[0];
1675 vec_in1.s[0] = vec_first_col.s[1];
1676 vec_in2.s[0] = vec_first_col.s[2];
1677 vec_in3.s[0] = vec_first_col.s[3];
1678 vec_in4.s[0] = vec_first_col.s[4];
1680 vec_in0 += vec_mask_first_row;
1682 __vstore_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1683 __vstore_pred(pred_4_elem, (vec *) &U[0 + 1 * colUStride], vec_in1);
1684 __vstore_pred(pred_4_elem, (vec *) &U[0 + 2 * colUStride], vec_in2);
1685 __vstore_pred(pred_4_elem, (vec *) &U[0 + 3 * colUStride], vec_in3);
1686 __vstore_pred(pred_4_elem, (vec *) &U[0 + 4 * colUStride], vec_in4);
1689 vec_in0 += vec_mask_first_row;
1690 __vstore_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1691 U[0 + 1 * colUStride] = 0.0;
1692 U[0 + 2 * colUStride] = 0.0;
1693 U[0 + 3 * colUStride] = 0.0;
1694 U[0 + 4 * colUStride] = 0.0;
1700 template <
typename dataType>
1701 static inline void u_update_6th_iter(dataType *U,
const int colUStride, dataType *diag, dataType s);
1702 template <>
inline void u_update_6th_iter<float>(
float *U,
const int colUStride,
float *diag,
float s) {
return; }
1703 template <>
inline void u_update_6th_iter<double>(
double *U,
const int colUStride,
double *diag,
double s)
1707 typedef typename c7x::make_full_vector<double>::type vec;
1709 vec vec_mask_first_row = gen_plusOneVec<vec>();
1710 __vpred pred_4_elem = __create_vpred(0x0000FFFFFFFFFFFFU);
1712 vec vec_first_col = (vec) 0;
1713 vec vec_si = (vec) 0.0;
1714 vec vec_in0 = __vload_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride]);
1715 vec_first_col.s[0] = vec_in0.s[0];
1716 vec_in0 = vec_in0 * vec_mask_first_row;
1717 vec norm_factor = (vec) (vec_in0.s[0] * s);
1718 vec inv_norm_factor =
getRecip(norm_factor);
1721 vec vec_in1 = __vload_pred(pred_4_elem, (vec *) &U[0 + 1 * colUStride]);
1722 vec vec_in2 = __vload_pred(pred_4_elem, (vec *) &U[0 + 2 * colUStride]);
1723 vec vec_in3 = __vload_pred(pred_4_elem, (vec *) &U[0 + 3 * colUStride]);
1724 vec vec_in4 = __vload_pred(pred_4_elem, (vec *) &U[0 + 4 * colUStride]);
1725 vec vec_in5 = __vload_pred(pred_4_elem, (vec *) &U[0 + 5 * colUStride]);
1727 vec_first_col.s[1] = vec_in1.s[0];
1728 vec_first_col.s[2] = vec_in2.s[0];
1729 vec_first_col.s[3] = vec_in3.s[0];
1730 vec_first_col.s[4] = vec_in4.s[0];
1731 vec_first_col.s[5] = vec_in5.s[0];
1733 vec_si += vec_first_col.s[1] * vec_in1;
1734 vec_si += vec_first_col.s[2] * vec_in2;
1735 vec_si += vec_first_col.s[3] * vec_in3;
1736 vec_si += vec_first_col.s[4] * vec_in4;
1737 vec_si += vec_first_col.s[5] * vec_in5;
1739 vec_si = vec_si * inv_norm_factor;
1741 vec_in0 += vec_si * vec_first_col.s[0];
1742 vec_in1 += vec_si * vec_first_col.s[1];
1743 vec_in2 += vec_si * vec_first_col.s[2];
1744 vec_in3 += vec_si * vec_first_col.s[3];
1745 vec_in4 += vec_si * vec_first_col.s[4];
1746 vec_in5 += vec_si * vec_first_col.s[5];
1748 vec_first_col = vec_first_col * (vec)
getRecip(s);
1750 vec_in0.s[0] = vec_first_col.s[0];
1751 vec_in1.s[0] = vec_first_col.s[1];
1752 vec_in2.s[0] = vec_first_col.s[2];
1753 vec_in3.s[0] = vec_first_col.s[3];
1754 vec_in4.s[0] = vec_first_col.s[4];
1755 vec_in5.s[0] = vec_first_col.s[5];
1757 vec_in0 += vec_mask_first_row;
1759 __vstore_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1760 __vstore_pred(pred_4_elem, (vec *) &U[0 + 1 * colUStride], vec_in1);
1761 __vstore_pred(pred_4_elem, (vec *) &U[0 + 2 * colUStride], vec_in2);
1762 __vstore_pred(pred_4_elem, (vec *) &U[0 + 3 * colUStride], vec_in3);
1763 __vstore_pred(pred_4_elem, (vec *) &U[0 + 4 * colUStride], vec_in4);
1764 __vstore_pred(pred_4_elem, (vec *) &U[0 + 5 * colUStride], vec_in5);
1767 vec_in0 += vec_mask_first_row;
1768 __vstore_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1769 U[0 + 1 * colUStride] = 0.0;
1770 U[0 + 2 * colUStride] = 0.0;
1771 U[0 + 3 * colUStride] = 0.0;
1772 U[0 + 4 * colUStride] = 0.0;
1773 U[0 + 5 * colUStride] = 0.0;
1785 template <
typename dataType>
1787 u_update_6X7_NR_1st_iter(dataType *U,
const int colUStride, dataType *diag, dataType s,
const uint32_t col_elems);
1790 u_update_6X7_NR_1st_iter<float>(
float *U,
const int colUStride,
float *diag,
float s,
const uint32_t col_elems)
1796 u_update_6X7_NR_1st_iter<double>(
double *U,
const int colUStride,
double *diag,
double s,
const uint32_t col_elems)
1799 typedef typename c7x::make_full_vector<double>::type vec;
1801 __vpred pred_1_elem = __mask_long(col_elems);
1802 vec vec_in0, vec_in1;
1803 vec vec_mask_first_row = gen_plusOneVec<vec>();
1806 vec_in0 = __vload_pred(pred_1_elem, (vec *) &U[0 + 0 * colUStride]);
1807 vec_in1 = __vload_pred(pred_1_elem, (vec *) &U[0 + 1 * colUStride]);
1809 vec_in0 = vec_in0 * recip_s + vec_mask_first_row;
1810 vec_in1 = vec_in1 * recip_s;
1812 __vstore_pred(pred_1_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1813 __vstore_pred(pred_1_elem, (vec *) &U[0 + 1 * colUStride], vec_in1);
1816 __vstore_pred(pred_1_elem, (vec *) U, (vec_in0 + vec_mask_first_row));
1817 U[0 + 1 * colUStride] = 0.0;
1823 template <
typename dataType>
1825 u_update_6X7_NR_2nd_iter(dataType *U,
const int colUStride, dataType *diag, dataType s,
const uint32_t col_elems);
1828 u_update_6X7_NR_2nd_iter<float>(
float *U,
const int colUStride,
float *diag,
float s,
const uint32_t col_elems)
1834 u_update_6X7_NR_2nd_iter<double>(
double *U,
const int colUStride,
double *diag,
double s,
const uint32_t col_elems)
1838 typedef typename c7x::make_full_vector<double>::type vec;
1840 vec vec_mask_first_row = gen_plusOneVec<vec>();
1841 __vpred pred_2_elem = __mask_long(col_elems);
1843 vec vec_first_col = (vec) 0;
1844 vec vec_si = (vec) 0.0;
1845 vec vec_in0 = __vload_pred(pred_2_elem, (vec *) &U[0 + 0 * colUStride]);
1846 vec_first_col.s[0] = vec_in0.s[0];
1847 vec_in0 = vec_in0 * vec_mask_first_row;
1848 vec norm_factor = (vec) (vec_in0.s[0] * s);
1849 vec inv_norm_factor =
getRecip(norm_factor);
1853 vec vec_in1 = __vload_pred(pred_2_elem, (vec *) &U[0 + 1 * colUStride]);
1854 vec vec_in2 = __vload_pred(pred_2_elem, (vec *) &U[0 + 2 * colUStride]);
1856 vec_first_col.s[1] = vec_in1.s[0];
1857 vec_first_col.s[2] = vec_in2.s[0];
1859 vec_si += vec_first_col.s[1] * vec_in1;
1860 vec_si += vec_first_col.s[2] * vec_in2;
1862 vec_si = vec_si * inv_norm_factor;
1864 vec_in0 += vec_si * vec_first_col.s[0];
1865 vec_in1 += vec_si * vec_first_col.s[1];
1866 vec_in2 += vec_si * vec_first_col.s[2];
1868 vec_first_col = vec_first_col * recip_s;
1870 vec_in0.s[0] = vec_first_col.s[0];
1871 vec_in1.s[0] = vec_first_col.s[1];
1872 vec_in2.s[0] = vec_first_col.s[2];
1874 vec_in0 += vec_mask_first_row;
1876 __vstore_pred(pred_2_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1877 __vstore_pred(pred_2_elem, (vec *) &U[0 + 1 * colUStride], vec_in1);
1878 __vstore_pred(pred_2_elem, (vec *) &U[0 + 2 * colUStride], vec_in2);
1881 vec_in0 += vec_mask_first_row;
1882 __vstore_pred(pred_2_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1883 U[0 + 1 * colUStride] = 0.0;
1884 U[0 + 2 * colUStride] = 0.0;
1890 template <
typename dataType>
1892 u_update_6X7_NR_3rd_iter(dataType *U,
const int colUStride, dataType *diag, dataType s,
const uint32_t col_elems);
1895 u_update_6X7_NR_3rd_iter<float>(
float *U,
const int colUStride,
float *diag,
float s,
const uint32_t col_elems)
1901 u_update_6X7_NR_3rd_iter<double>(
double *U,
const int colUStride,
double *diag,
double s,
const uint32_t col_elems)
1904 typedef typename c7x::make_full_vector<double>::type vec;
1906 vec vec_mask_first_row = gen_plusOneVec<vec>();
1907 __vpred pred_3_elem = __mask_long(col_elems);
1909 vec vec_first_col = (vec) 0;
1910 vec vec_si = (vec) 0.0;
1911 vec vec_in0 = __vload_pred(pred_3_elem, (vec *) &U[0 + 0 * colUStride]);
1912 vec_first_col.s[0] = vec_in0.s[0];
1913 vec_in0 = vec_in0 * vec_mask_first_row;
1914 vec norm_factor = (vec) (vec_in0.s[0] * s);
1915 vec inv_norm_factor =
getRecip(norm_factor);
1919 vec vec_in1 = __vload_pred(pred_3_elem, (vec *) &U[0 + 1 * colUStride]);
1920 vec vec_in2 = __vload_pred(pred_3_elem, (vec *) &U[0 + 2 * colUStride]);
1921 vec vec_in3 = __vload_pred(pred_3_elem, (vec *) &U[0 + 3 * colUStride]);
1923 vec_first_col.s[1] = vec_in1.s[0];
1924 vec_first_col.s[2] = vec_in2.s[0];
1925 vec_first_col.s[3] = vec_in3.s[0];
1927 vec_si += vec_first_col.s[1] * vec_in1;
1928 vec_si += vec_first_col.s[2] * vec_in2;
1929 vec_si += vec_first_col.s[3] * vec_in3;
1931 vec_si = vec_si * inv_norm_factor;
1933 vec_in0 += vec_si * vec_first_col.s[0];
1934 vec_in1 += vec_si * vec_first_col.s[1];
1935 vec_in2 += vec_si * vec_first_col.s[2];
1936 vec_in3 += vec_si * vec_first_col.s[3];
1938 vec_first_col = vec_first_col * recip_s;
1940 vec_in0.s[0] = vec_first_col.s[0];
1941 vec_in1.s[0] = vec_first_col.s[1];
1942 vec_in2.s[0] = vec_first_col.s[2];
1943 vec_in3.s[0] = vec_first_col.s[3];
1945 vec_in0 += vec_mask_first_row;
1947 __vstore_pred(pred_3_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1948 __vstore_pred(pred_3_elem, (vec *) &U[0 + 1 * colUStride], vec_in1);
1949 __vstore_pred(pred_3_elem, (vec *) &U[0 + 2 * colUStride], vec_in2);
1950 __vstore_pred(pred_3_elem, (vec *) &U[0 + 3 * colUStride], vec_in3);
1953 vec_in0 += vec_mask_first_row;
1954 __vstore_pred(pred_3_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1955 U[0 + 1 * colUStride] = 0.0;
1956 U[0 + 2 * colUStride] = 0.0;
1957 U[0 + 3 * colUStride] = 0.0;
1963 template <
typename dataType>
1965 u_update_6X7_NR_4th_iter(dataType *U,
const int colUStride, dataType *diag, dataType s,
const uint32_t col_elems);
1968 u_update_6X7_NR_4th_iter<float>(
float *U,
const int colUStride,
float *diag,
float s,
const uint32_t col_elems)
1974 u_update_6X7_NR_4th_iter<double>(
double *U,
const int colUStride,
double *diag,
double s,
const uint32_t col_elems)
1977 typedef typename c7x::make_full_vector<double>::type vec;
1979 vec vec_mask_first_row = gen_plusOneVec<vec>();
1980 __vpred pred_4_elem = __mask_long(col_elems);
1982 vec vec_first_col = (vec) 0;
1983 vec vec_si = (vec) 0.0;
1984 vec vec_in0 = __vload_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride]);
1985 vec_first_col.s[0] = vec_in0.s[0];
1986 vec_in0 = vec_in0 * vec_mask_first_row;
1987 vec norm_factor = (vec) (vec_in0.s[0] * s);
1988 vec inv_norm_factor =
getRecip(norm_factor);
1992 vec vec_in1 = __vload_pred(pred_4_elem, (vec *) &U[0 + 1 * colUStride]);
1993 vec vec_in2 = __vload_pred(pred_4_elem, (vec *) &U[0 + 2 * colUStride]);
1994 vec vec_in3 = __vload_pred(pred_4_elem, (vec *) &U[0 + 3 * colUStride]);
1995 vec vec_in4 = __vload_pred(pred_4_elem, (vec *) &U[0 + 4 * colUStride]);
1997 vec_first_col.s[1] = vec_in1.s[0];
1998 vec_first_col.s[2] = vec_in2.s[0];
1999 vec_first_col.s[3] = vec_in3.s[0];
2000 vec_first_col.s[4] = vec_in4.s[0];
2002 vec_si += vec_first_col.s[1] * vec_in1;
2003 vec_si += vec_first_col.s[2] * vec_in2;
2004 vec_si += vec_first_col.s[3] * vec_in3;
2005 vec_si += vec_first_col.s[4] * vec_in4;
2007 vec_si = vec_si * inv_norm_factor;
2009 vec_in0 += vec_si * vec_first_col.s[0];
2010 vec_in1 += vec_si * vec_first_col.s[1];
2011 vec_in2 += vec_si * vec_first_col.s[2];
2012 vec_in3 += vec_si * vec_first_col.s[3];
2013 vec_in4 += vec_si * vec_first_col.s[4];
2015 vec_first_col = vec_first_col * recip_s;
2017 vec_in0.s[0] = vec_first_col.s[0];
2018 vec_in1.s[0] = vec_first_col.s[1];
2019 vec_in2.s[0] = vec_first_col.s[2];
2020 vec_in3.s[0] = vec_first_col.s[3];
2021 vec_in4.s[0] = vec_first_col.s[4];
2023 vec_in0 += vec_mask_first_row;
2025 __vstore_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
2026 __vstore_pred(pred_4_elem, (vec *) &U[0 + 1 * colUStride], vec_in1);
2027 __vstore_pred(pred_4_elem, (vec *) &U[0 + 2 * colUStride], vec_in2);
2028 __vstore_pred(pred_4_elem, (vec *) &U[0 + 3 * colUStride], vec_in3);
2029 __vstore_pred(pred_4_elem, (vec *) &U[0 + 4 * colUStride], vec_in4);
2032 vec_in0 += vec_mask_first_row;
2033 __vstore_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
2034 U[0 + 1 * colUStride] = 0.0;
2035 U[0 + 2 * colUStride] = 0.0;
2036 U[0 + 3 * colUStride] = 0.0;
2037 U[0 + 4 * colUStride] = 0.0;
2044 template <
typename dataType>
2046 u_update_6X7_NR_5th_iter(dataType *U,
const int colUStride, dataType *diag, dataType s,
const uint32_t col_elems);
2049 u_update_6X7_NR_5th_iter<float>(
float *U,
const int colUStride,
float *diag,
float s,
const uint32_t col_elems)
2055 u_update_6X7_NR_5th_iter<double>(
double *U,
const int colUStride,
double *diag,
double s,
const uint32_t col_elems)
2059 typedef typename c7x::make_full_vector<double>::type vec;
2061 vec vec_mask_first_row = gen_plusOneVec<vec>();
2062 __vpred pred_4_elem = __mask_long(col_elems);
2064 vec vec_first_col = (vec) 0;
2065 vec vec_si = (vec) 0.0;
2066 vec vec_in0 = __vload_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride]);
2067 vec_first_col.s[0] = vec_in0.s[0];
2068 vec_in0 = vec_in0 * vec_mask_first_row;
2069 vec norm_factor = (vec) (vec_in0.s[0] * s);
2070 vec inv_norm_factor =
getRecip(norm_factor);
2074 vec vec_in1 = __vload_pred(pred_4_elem, (vec *) &U[0 + 1 * colUStride]);
2075 vec vec_in2 = __vload_pred(pred_4_elem, (vec *) &U[0 + 2 * colUStride]);
2076 vec vec_in3 = __vload_pred(pred_4_elem, (vec *) &U[0 + 3 * colUStride]);
2077 vec vec_in4 = __vload_pred(pred_4_elem, (vec *) &U[0 + 4 * colUStride]);
2078 vec vec_in5 = __vload_pred(pred_4_elem, (vec *) &U[0 + 5 * colUStride]);
2080 vec_first_col.s[1] = vec_in1.s[0];
2081 vec_first_col.s[2] = vec_in2.s[0];
2082 vec_first_col.s[3] = vec_in3.s[0];
2083 vec_first_col.s[4] = vec_in4.s[0];
2084 vec_first_col.s[5] = vec_in5.s[0];
2086 vec_si += vec_first_col.s[1] * vec_in1;
2087 vec_si += vec_first_col.s[2] * vec_in2;
2088 vec_si += vec_first_col.s[3] * vec_in3;
2089 vec_si += vec_first_col.s[4] * vec_in4;
2090 vec_si += vec_first_col.s[5] * vec_in5;
2092 vec_si = vec_si * inv_norm_factor;
2094 vec_in0 += vec_si * vec_first_col.s[0];
2095 vec_in1 += vec_si * vec_first_col.s[1];
2096 vec_in2 += vec_si * vec_first_col.s[2];
2097 vec_in3 += vec_si * vec_first_col.s[3];
2098 vec_in4 += vec_si * vec_first_col.s[4];
2099 vec_in5 += vec_si * vec_first_col.s[5];
2101 vec_first_col = vec_first_col * recip_s;
2103 vec_in0.s[0] = vec_first_col.s[0];
2104 vec_in1.s[0] = vec_first_col.s[1];
2105 vec_in2.s[0] = vec_first_col.s[2];
2106 vec_in3.s[0] = vec_first_col.s[3];
2107 vec_in4.s[0] = vec_first_col.s[4];
2108 vec_in5.s[0] = vec_first_col.s[5];
2110 vec_in0 += vec_mask_first_row;
2112 __vstore_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
2113 __vstore_pred(pred_4_elem, (vec *) &U[0 + 1 * colUStride], vec_in1);
2114 __vstore_pred(pred_4_elem, (vec *) &U[0 + 2 * colUStride], vec_in2);
2115 __vstore_pred(pred_4_elem, (vec *) &U[0 + 3 * colUStride], vec_in3);
2116 __vstore_pred(pred_4_elem, (vec *) &U[0 + 4 * colUStride], vec_in4);
2117 __vstore_pred(pred_4_elem, (vec *) &U[0 + 5 * colUStride], vec_in5);
2120 vec_in0 += vec_mask_first_row;
2121 __vstore_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
2122 U[0 + 1 * colUStride] = 0.0;
2123 U[0 + 2 * colUStride] = 0.0;
2124 U[0 + 3 * colUStride] = 0.0;
2125 U[0 + 4 * colUStride] = 0.0;
2126 U[0 + 5 * colUStride] = 0.0;
2132 template <
typename dataType>
2134 u_update_6X7_NR_6th_iter(dataType *U,
const int colUStride, dataType *diag, dataType s,
const uint32_t col_elems);
2137 u_update_6X7_NR_6th_iter<float>(
float *U,
const int colUStride,
float *diag,
float s,
const uint32_t col_elems)
2143 u_update_6X7_NR_6th_iter<double>(
double *U,
const int colUStride,
double *diag,
double s,
const uint32_t col_elems)
2146 typedef typename c7x::make_full_vector<double>::type vec;
2148 vec vec_mask_first_row = gen_plusOneVec<vec>();
2149 __vpred pred_4_elem = __mask_long(col_elems);
2151 vec vec_first_col = (vec) 0;
2152 vec vec_si = (vec) 0.0;
2153 vec vec_in0 = __vload_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride]);
2154 vec_first_col.s[0] = vec_in0.s[0];
2155 vec_in0 = vec_in0 * vec_mask_first_row;
2156 vec norm_factor = (vec) (vec_in0.s[0] * s);
2157 vec inv_norm_factor =
getRecip(norm_factor);
2161 vec vec_in1 = __vload_pred(pred_4_elem, (vec *) &U[0 + 1 * colUStride]);
2162 vec vec_in2 = __vload_pred(pred_4_elem, (vec *) &U[0 + 2 * colUStride]);
2163 vec vec_in3 = __vload_pred(pred_4_elem, (vec *) &U[0 + 3 * colUStride]);
2164 vec vec_in4 = __vload_pred(pred_4_elem, (vec *) &U[0 + 4 * colUStride]);
2165 vec vec_in5 = __vload_pred(pred_4_elem, (vec *) &U[0 + 5 * colUStride]);
2166 vec vec_in6 = __vload_pred(pred_4_elem, (vec *) &U[0 + 6 * colUStride]);
2168 vec_first_col.s[1] = vec_in1.s[0];
2169 vec_first_col.s[2] = vec_in2.s[0];
2170 vec_first_col.s[3] = vec_in3.s[0];
2171 vec_first_col.s[4] = vec_in4.s[0];
2172 vec_first_col.s[5] = vec_in5.s[0];
2173 vec_first_col.s[6] = vec_in6.s[0];
2175 vec_si += vec_first_col.s[1] * vec_in1;
2176 vec_si += vec_first_col.s[2] * vec_in2;
2177 vec_si += vec_first_col.s[3] * vec_in3;
2178 vec_si += vec_first_col.s[4] * vec_in4;
2179 vec_si += vec_first_col.s[5] * vec_in5;
2180 vec_si += vec_first_col.s[6] * vec_in6;
2182 vec_si = vec_si * inv_norm_factor;
2184 vec_in0 += vec_si * vec_first_col.s[0];
2185 vec_in1 += vec_si * vec_first_col.s[1];
2186 vec_in2 += vec_si * vec_first_col.s[2];
2187 vec_in3 += vec_si * vec_first_col.s[3];
2188 vec_in4 += vec_si * vec_first_col.s[4];
2189 vec_in5 += vec_si * vec_first_col.s[5];
2190 vec_in6 += vec_si * vec_first_col.s[6];
2192 vec_first_col = vec_first_col * recip_s;
2194 vec_in0.s[0] = vec_first_col.s[0];
2195 vec_in1.s[0] = vec_first_col.s[1];
2196 vec_in2.s[0] = vec_first_col.s[2];
2197 vec_in3.s[0] = vec_first_col.s[3];
2198 vec_in4.s[0] = vec_first_col.s[4];
2199 vec_in5.s[0] = vec_first_col.s[5];
2200 vec_in6.s[0] = vec_first_col.s[6];
2202 vec_in0 += vec_mask_first_row;
2204 __vstore_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
2205 __vstore_pred(pred_4_elem, (vec *) &U[0 + 1 * colUStride], vec_in1);
2206 __vstore_pred(pred_4_elem, (vec *) &U[0 + 2 * colUStride], vec_in2);
2207 __vstore_pred(pred_4_elem, (vec *) &U[0 + 3 * colUStride], vec_in3);
2208 __vstore_pred(pred_4_elem, (vec *) &U[0 + 4 * colUStride], vec_in4);
2209 __vstore_pred(pred_4_elem, (vec *) &U[0 + 5 * colUStride], vec_in5);
2210 __vstore_pred(pred_4_elem, (vec *) &U[0 + 6 * colUStride], vec_in6);
2213 vec_in0 += vec_mask_first_row;
2214 __vstore_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
2215 U[0 + 1 * colUStride] = 0.0;
2216 U[0 + 2 * colUStride] = 0.0;
2217 U[0 + 3 * colUStride] = 0.0;
2218 U[0 + 4 * colUStride] = 0.0;
2219 U[0 + 5 * colUStride] = 0.0;
2220 U[0 + 6 * colUStride] = 0.0;
2226 template <
typename dataType>
2228 u_update_6X7_R_1st_iter(dataType *U,
const int colUStride, dataType *diag, dataType s,
const uint32_t col_elems);
2231 u_update_6X7_R_1st_iter<float>(
float *U,
const int colUStride,
float *diag,
float s,
const uint32_t col_elems)
2237 u_update_6X7_R_1st_iter<double>(
double *U,
const int colUStride,
double *diag,
double s,
const uint32_t col_elems)
2241 double u_0_0 = U[0 + 0 * colUStride];
2242 double u_1_0 = U[0 + 1 * colUStride];
2244 double norm_factor = (u_0_0 * s);
2245 double inv_norm_factor =
getRecip(norm_factor);
2251 si = u_1_0 * U[1 + 1 * colUStride];
2252 si = si * inv_norm_factor;
2253 U[1 + 1 * colUStride] += si * u_1_0;
2255 U[1 + 0 * colUStride] = u_0_0 * u_1_0 * inv_norm_factor;
2256 U[1 + 1 * colUStride] = (u_1_0 * u_1_0 * inv_norm_factor) + 1;
2258 U[0 + 0 * colUStride] = (u_0_0 * recip_s) + 1;
2259 U[0 + 1 * colUStride] = u_1_0 * recip_s;
2262 U[1 + 1 * colUStride] = 1;
2263 U[0 + 0 * colUStride] = 1;
2264 U[0 + 1 * colUStride] = 0;
dataType getRecip(dataType value)
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_svd.
dataType getSqrt(dataType a)