DSPLIB User Guide
DSPLIB_svd_small_u_process.h
Go to the documentation of this file.
1 
2 /******************************************************************************/
6 /* Copyright (C) 2017 Texas Instruments Incorporated - https://www.ti.com/
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * Redistributions of source code must retain the above copyright
13  * notice, this list of conditions and the following disclaimer.
14  *
15  * Redistributions in binary form must reproduce the above copyright
16  * notice, this list of conditions and the following disclaimer in the
17  * documentation and/or other materials provided with the
18  * distribution.
19  *
20  * Neither the name of Texas Instruments Incorporated nor the names of
21  * its contributors may be used to endorse or promote products derived
22  * from this software without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
27  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
28  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
29  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
30  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
31  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
32  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
33  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
34  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35  *
36  ******************************************************************************/
37 
38 /******************************************************************************
39  * Version 1.0 Date Aug 2023 Author: Asheesh Bhardwaj
40  *****************************************************************************/
41 
42 /*******************************************************************************
43  *
44  * INCLUDES
45  *
46  ******************************************************************************/
47 
48 #include "DSPLIB_svd_small_priv.h"
49 
50 /* *****************************************************************************
51  *
52  * IMPLEMENTATION
53  *
54  ***************************************************************************** */
55 
56 #if (__C7X_VEC_SIZE_BITS__ == 512)
57 
58 
59 /* Following horizontal addition are used for DSPLIB_svd_small */
60 template <typename V, typename W> inline void c7x_horizontal_add_6_elems(V inVec, W *horizontalSum);
61 template <> inline void c7x_horizontal_add_6_elems(c7x::double_vec inVec, double *horizontalSum)
62 {
63  double s0 = inVec.s[0];
64  double s1 = inVec.s[1];
65  double s2 = inVec.s[2];
66  double s3 = inVec.s[3];
67  double s4 = inVec.s[4];
68  double s5 = inVec.s[5];
69  double ss0 = s0 + s1;
70  double ss1 = s2 + s3;
71  double ss2 = s4 + s5;
72  double sss0 = ss0 + ss1 + ss2;
73  *horizontalSum = sss0;
74 }
75 
76 
77 
78 
79 template <typename V, typename W> inline void c7x_horizontal_add_5_elems(V inVec, W *horizontalSum);
80 template <> inline void c7x_horizontal_add_5_elems(c7x::double_vec inVec, double *horizontalSum)
81 {
82  double s0 = inVec.s[0];
83  double s1 = inVec.s[1];
84  double s2 = inVec.s[2];
85  double s3 = inVec.s[3];
86  double s4 = inVec.s[4];
87  double ss0 = s0 + s1;
88  double ss1 = s2 + s3 + s4;
89  double sss0 = ss0 + ss1;
90  *horizontalSum = sss0;
91 }
92 
93 
94 
95 template <typename V, typename W> inline void c7x_horizontal_add_4_elems(V inVec, W *horizontalSum);
96 template <> inline void c7x_horizontal_add_4_elems(c7x::double_vec inVec, double *horizontalSum)
97 {
98  double s0 = inVec.s[0];
99  double s1 = inVec.s[1];
100  double s2 = inVec.s[2];
101  double s3 = inVec.s[3];
102  double ss0 = s0 + s1;
103  double ss1 = s2 + s3 ;
104  double sss0 = ss0 + ss1;
105  *horizontalSum = sss0;
106 }
107 
108 
109 template <typename V, typename W> inline void c7x_horizontal_add_3_elems(V inVec, W *horizontalSum);
110 template <> inline void c7x_horizontal_add_3_elems(c7x::double_vec inVec, double *horizontalSum)
111 {
112  double s0 = inVec.s[0];
113  double s1 = inVec.s[1];
114  double s2 = inVec.s[2];
115  double ss0 = s0 + s1;
116  double ss1 = s2;
117  double sss0 = ss0 + ss1;
118  *horizontalSum = sss0;
119 }
120 
121 
122 template <typename V, typename W> inline void c7x_horizontal_add_2_elems(V inVec, W *horizontalSum);
123 template <> inline void c7x_horizontal_add_2_elems(c7x::double_vec inVec, double *horizontalSum)
124 {
125  double s0 = inVec.s[0];
126  double s1 = inVec.s[1];
127  double ss0 = s0 + s1;
128  *horizontalSum = ss0;
129 }
130 
131 
132 
133 
134 template <typename X, typename Y> inline Y vec_put(uint8_t idx, X val, Y vecIn);
135 
136 template <> inline c7x::float_vec vec_put<float, c7x::float_vec>(uint8_t idx, float val, c7x::float_vec vecIn)
137 {
138  return __vputw_rkv(val, idx, vecIn);
139 }
140 
141 template <> inline c7x::double_vec vec_put<double, c7x::double_vec>(uint8_t idx, double val, c7x::double_vec vecIn)
142 {
143  return __vputd_dkv(val, idx, vecIn);
144 }
145 
146 #if (__C7X_VEC_SIZE_BITS__ == 512)
147 template <typename X> inline X gen_u_row_process_mask();
148 template <> inline c7x::float_vec gen_u_row_process_mask<c7x::float_vec>()
149 {
150  c7x::float_vec ret_mask = c7x::float_vec(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
151  return ret_mask;
152 }
153 template <> inline c7x::double_vec gen_u_row_process_mask<c7x::double_vec>()
154 {
155  c7x::double_vec ret_mask = c7x::double_vec(0, 1, 1, 1, 1, 1, 1, 1);
156  return ret_mask;
157 }
158 
159 template <typename X> inline X gen_u_row_process_rev_mask();
160 template <> inline c7x::float_vec gen_u_row_process_rev_mask<c7x::float_vec>()
161 {
162  c7x::float_vec ret_mask = c7x::float_vec(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
163  return ret_mask;
164 }
165 template <> inline c7x::double_vec gen_u_row_process_rev_mask<c7x::double_vec>()
166 {
167  c7x::double_vec ret_mask = c7x::double_vec(1, 0, 0, 0, 0, 0, 0, 0);
168  return ret_mask;
169 }
170 
171 #endif
172 
173 
174 #define U_PROC_SKIP_VLOAD 0
175 
176 #if U_PROC_SKIP_VLOAD
177 c7x::double_vec vec_in0_u_proc = (c7x::double_vec) 0.0;
178 c7x::double_vec vec_in1_u_proc = (c7x::double_vec) 0.0;
179 c7x::double_vec vec_in2_u_proc = (c7x::double_vec) 0.0;
180 c7x::double_vec vec_in3_u_proc = (c7x::double_vec) 0.0;
181 c7x::double_vec vec_in4_u_proc = (c7x::double_vec) 0.0;
182 c7x::double_vec vec_in5_u_proc = (c7x::double_vec) 0.0;
183 #endif
184 
185 template <typename dataType>
186 static inline void u_process_0th_iter(dataType *U,
187  const int colUStride,
188  dataType *diag,
189  dataType *superdiag,
190  dataType *scale,
191  dataType *s,
192  const uint32_t num_cols,
193  const uint32_t diag_idx);
194 template <>
195 inline void u_process_0th_iter<float>(float *U,
196  const int colUStride,
197  float *diag,
198  float *superdiag,
199  float *scale,
200  float *s,
201  const uint32_t num_cols,
202  const uint32_t diag_idx)
203 {
204  return;
205 }
206 template <>
207 inline void u_process_0th_iter<double>(double *U,
208  const int colUStride,
209  double *diag,
210  double *superdiag,
211  double *scale,
212  double *s,
213  const uint32_t num_cols,
214  const uint32_t diag_idx)
215 {
216  double s2 = 0, half_norm_squared = 0;
217  /* -------------------------------------------------- */
218  /* For i = 0 */
219  /* -------------------------------------------------- */
220  {
221  /* section 1- 54 */
222  typedef typename c7x::make_full_vector<double>::type vec;
223 
224  // __vpred pred_6_elem = __create_vpred(0x0000FFFFFFFFFFFFU);
225  __vpred pred_col_ele = __mask_long(num_cols);
226 
227 #if U_PROC_SKIP_VLOAD
228  vec vec_in0 = vec_in0_u_proc;
229  vec vec_in1 = vec_in1_u_proc;
230  vec vec_in2 = vec_in2_u_proc;
231  vec vec_in3 = vec_in3_u_proc;
232  vec vec_in4 = vec_in4_u_proc;
233  vec vec_in5 = vec_in5_u_proc;
234 #else
235  vec vec_in0 = __vload_pred(pred_col_ele, (vec *) (&U[0 + 0 * colUStride]));
236  vec vec_in1 = __vload_pred(pred_col_ele, (vec *) (&U[0 + 1 * colUStride]));
237  vec vec_in2 = __vload_pred(pred_col_ele, (vec *) (&U[0 + 2 * colUStride]));
238  vec vec_in3 = __vload_pred(pred_col_ele, (vec *) (&U[0 + 3 * colUStride]));
239  vec vec_in4 = __vload_pred(pred_col_ele, (vec *) (&U[0 + 4 * colUStride]));
240  vec vec_in5 = __vload_pred(pred_col_ele, (vec *) (&U[0 + 5 * colUStride]));
241  vec vec_in6 = __vload_pred(pred_col_ele, (vec *) (&U[0 + 6 * colUStride]));
242 
243 #endif
244 
245  superdiag[diag_idx] = (*scale) * (*s);
246  *scale = 0;
247 
248  /* -------------------------------------------------- */
249  /* U COLUMN PROCESS FOR i = 0 */
250  /* -------------------------------------------------- */
251 
252  /* Accumulate i = 0 th column absolute values to get 'scale' */
253  double in0_s0 = vec_in0.s[0]; // __as_double(__vgetd_vrd(c7x::as_long_vec(vec_in0), 0u));
254  double in1_s0 = vec_in1.s[0]; // __as_double(__vgetd_vrd(c7x::as_long_vec(vec_in1), 0u));
255  double in2_s0 = vec_in2.s[0]; // __as_double(__vgetd_vrd(c7x::as_long_vec(vec_in2), 0u));
256  double in3_s0 = vec_in3.s[0]; // __as_double(__vgetd_vrd(c7x::as_long_vec(vec_in3), 0u));
257  double in4_s0 = vec_in4.s[0]; // __as_double(__vgetd_vrd(c7x::as_long_vec(vec_in4), 0u));
258  double in5_s0 = vec_in5.s[0]; // __as_double(__vgetd_vrd(c7x::as_long_vec(vec_in5), 0u));
259  double in6_s0 = vec_in6.s[0]; // __as_double(__vgetd_vrd(c7x::as_long_vec(vec_in5), 0u));
260 
261  /* Store the first columns values in another vector */
262  vec vec_first_col = (vec) 0.0;
263  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in0_s0), 0, c7x::as_long_vec(vec_first_col)));
264  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in1_s0), 1, c7x::as_long_vec(vec_first_col)));
265  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in2_s0), 2, c7x::as_long_vec(vec_first_col)));
266  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in3_s0), 3, c7x::as_long_vec(vec_first_col)));
267  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in4_s0), 4, c7x::as_long_vec(vec_first_col)));
268  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in5_s0), 5, c7x::as_long_vec(vec_first_col)));
269  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in6_s0), 6, c7x::as_long_vec(vec_first_col)));
270 
271  (*scale) += __abs(in0_s0);
272  (*scale) += __abs(in1_s0);
273  (*scale) += __abs(in2_s0);
274  (*scale) += __abs(in3_s0);
275  (*scale) += __abs(in4_s0);
276  (*scale) += __abs(in5_s0);
277  (*scale) += __abs(in6_s0);
278 
279  /* section 2- 132 */
280 
281  if ((*scale) > 0) {
282  s2 = 0;
283  /* normalize the 'vec_first_col' */
284  vec_first_col = vec_first_col * (vec) getRecip((*scale)); // / scale;
285  vec vec_first_col_sq = vec_first_col * vec_first_col;
286  c7x_horizontal_add(vec_first_col_sq, &s2);
287 
288  *s = getSqrt(s2);
289 
290  if (vec_first_col.s[0] >= 0) {
291  *s = -(*s);
292  }
293 
294  half_norm_squared = vec_first_col.s[0] * (*s) - s2;
295  vec inv_hnsq = (vec) getRecip(half_norm_squared);
296  vec_first_col.s[0] -= (*s);
297 
298  vec vec_si = (vec) 0;
299 
300  vec_si += (vec_first_col.s[0] * vec_in0);
301  vec_si += (vec_first_col.s[1] * vec_in1);
302  vec_si += (vec_first_col.s[2] * vec_in2);
303  vec_si += (vec_first_col.s[3] * vec_in3);
304  vec_si += (vec_first_col.s[4] * vec_in4);
305  vec_si += (vec_first_col.s[5] * vec_in5);
306  vec_si += (vec_first_col.s[6] * vec_in6);
307 
308  vec_si = vec_si * inv_hnsq; //(vec) getRecip(half_norm_squared); // / half_norm_squared;
309 
310  vec_in0 += vec_si * vec_first_col.s[0];
311  vec_in1 += vec_si * vec_first_col.s[1];
312  vec_in2 += vec_si * vec_first_col.s[2];
313  vec_in3 += vec_si * vec_first_col.s[3];
314  vec_in4 += vec_si * vec_first_col.s[4];
315  vec_in5 += vec_si * vec_first_col.s[5];
316  vec_in6 += vec_si * vec_first_col.s[6];
317  }
318  /* section 3- 69 */
319  vec_first_col *= (*scale);
320  vec_in0.s[0] = vec_first_col.s[0];
321  vec_in1.s[0] = vec_first_col.s[1];
322  vec_in2.s[0] = vec_first_col.s[2];
323  vec_in3.s[0] = vec_first_col.s[3];
324  vec_in4.s[0] = vec_first_col.s[4];
325  vec_in5.s[0] = vec_first_col.s[5];
326  vec_in6.s[0] = vec_first_col.s[6];
327 
328  diag[diag_idx] = (*s) * (*scale);
329 
330  *s = 0;
331  *scale = 0;
332 
333  vec vec_row_mask = gen_u_row_process_mask<vec>();
334 
335  vec vec_first_row = vec_row_mask * vec_in0;
336  c7x_horizontal_add_6_elems(__abs(vec_first_row), scale);
337 
338  /* section 4- 142 */
339 
340  if ((*scale) > 0) {
341  s2 = 0;
342  vec_first_row = vec_first_row * (vec) getRecip((*scale)); // / scale;
343  vec vec_first_row_sq = vec_first_row * vec_first_row;
344  c7x_horizontal_add_6_elems(vec_first_row_sq, &s2);
345 
346  *s = getSqrt(s2);
347  if (vec_first_row.s[(num_cols - 1)] >= 0) {
348  *s = -(*s);
349  }
350 
351  half_norm_squared = vec_first_row.s[1] * (*s) - s2;
352  vec_first_row.s[1] -= (*s);
353 
354  vec vec_superdiag_temp = vec_first_row * (vec) getRecip(half_norm_squared); // / half_norm_squared;
355 
356  vec vec_si = (vec) 0;
357 
358  vec vec_temp1 = vec_first_row * vec_in1;
359  vec vec_temp2 = vec_first_row * vec_in2;
360  vec vec_temp3 = vec_first_row * vec_in3;
361  vec vec_temp4 = vec_first_row * vec_in4;
362  vec vec_temp5 = vec_first_row * vec_in5;
363  vec vec_temp6 = vec_first_row * vec_in6;
364 
365  c7x_horizontal_add_6_elems(vec_temp1, &vec_si.s[1]);
366  c7x_horizontal_add_6_elems(vec_temp2, &vec_si.s[2]);
367  c7x_horizontal_add_6_elems(vec_temp3, &vec_si.s[3]);
368  c7x_horizontal_add_6_elems(vec_temp4, &vec_si.s[4]);
369  c7x_horizontal_add_6_elems(vec_temp5, &vec_si.s[5]);
370  c7x_horizontal_add_6_elems(vec_temp6, &vec_si.s[6]);
371 
372  vec_in1 += vec_si.s[1] * vec_superdiag_temp;
373  vec_in2 += vec_si.s[2] * vec_superdiag_temp;
374  vec_in3 += vec_si.s[3] * vec_superdiag_temp;
375  vec_in4 += vec_si.s[4] * vec_superdiag_temp;
376  vec_in5 += vec_si.s[5] * vec_superdiag_temp;
377  vec_in6 += vec_si.s[6] * vec_superdiag_temp;
378 
379  vec vec_row_rev_mask = gen_u_row_process_rev_mask<vec>();
380  vec_in0 *= vec_row_rev_mask;
381  vec_in0 += vec_first_row * (*scale);
382  }
383 
384 #if U_PROC_SKIP_VLOAD
385  __vstore_pred(pred_col_ele, (vec *) (&U[0 + 0 * colUStride]), vec_in0);
386  U[0 + 1 * colUStride] = vec_in1.s[0];
387  U[0 + 2 * colUStride] = vec_in2.s[0];
388  U[0 + 3 * colUStride] = vec_in3.s[0];
389  U[0 + 4 * colUStride] = vec_in4.s[0];
390  U[0 + 5 * colUStride] = vec_in5.s[0];
391  vec_in0_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in1), 64u));
392  vec_in1_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in2), 64u));
393  vec_in2_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in3), 64u));
394  vec_in3_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in4), 64u));
395  vec_in4_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in5), 64u));
396 
397 #else
398  /* section 5- 8 */
399  __vstore_pred(pred_col_ele, (vec *) (&U[0 + 0 * colUStride]), vec_in0);
400  __vstore_pred(pred_col_ele, (vec *) (&U[0 + 1 * colUStride]), vec_in1);
401  __vstore_pred(pred_col_ele, (vec *) (&U[0 + 2 * colUStride]), vec_in2);
402  __vstore_pred(pred_col_ele, (vec *) (&U[0 + 3 * colUStride]), vec_in3);
403  __vstore_pred(pred_col_ele, (vec *) (&U[0 + 4 * colUStride]), vec_in4);
404  __vstore_pred(pred_col_ele, (vec *) (&U[0 + 5 * colUStride]), vec_in5);
405  __vstore_pred(pred_col_ele, (vec *) (&U[0 + 6 * colUStride]), vec_in6);
406 #endif
407  // print_matrix("OPTC U matrix after row process:", U, Nrows, Ncols, colUStride);
408  }
409 
410 
411  return;
412 }
413 
414 template <typename dataType>
415 static inline void u_process_1st_iter(dataType *U,
416  const int colUStride,
417  dataType *diag,
418  dataType *superdiag,
419  dataType *scale,
420  dataType *s,
421  const uint32_t num_cols,
422  const uint32_t diag_idx);
423 template <>
424 inline void u_process_1st_iter<float>(float *U,
425  const int colUStride,
426  float *diag,
427  float *superdiag,
428  float *scale,
429  float *s,
430  const uint32_t num_cols,
431  const uint32_t diag_idx)
432 {
433  return;
434 }
435 template <>
436 inline void u_process_1st_iter<double>(double *U,
437  const int colUStride,
438  double *diag,
439  double *superdiag,
440  double *scale,
441  double *s,
442  const uint32_t num_cols,
443  const uint32_t diag_idx)
444 {
445  double s2 = 0, half_norm_squared = 0;
446  /* -------------------------------------------------- */
447  /* For i = 0 */
448  /* -------------------------------------------------- */
449  {
450  /* section 1- 54 */
451  // printf("ENTER OPT CODE:\n");
452  typedef typename c7x::make_full_vector<double>::type vec;
453 
454  // __vpred pred_6_elem = __create_vpred(0x0000FFFFFFFFFFFFU);
455  __vpred pred_6_elem = __mask_long(num_cols);
456 
457 #if U_PROC_SKIP_VLOAD
458  vec vec_in0 = vec_in0_u_proc;
459  vec vec_in1 = vec_in1_u_proc;
460  vec vec_in2 = vec_in2_u_proc;
461  vec vec_in3 = vec_in3_u_proc;
462  vec vec_in4 = vec_in4_u_proc;
463  vec vec_in5 = vec_in5_u_proc;
464 #else
465  vec vec_in0 = __vload_pred(pred_6_elem, (vec *) (&U[0 + 0 * colUStride]));
466  vec vec_in1 = __vload_pred(pred_6_elem, (vec *) (&U[0 + 1 * colUStride]));
467  vec vec_in2 = __vload_pred(pred_6_elem, (vec *) (&U[0 + 2 * colUStride]));
468  vec vec_in3 = __vload_pred(pred_6_elem, (vec *) (&U[0 + 3 * colUStride]));
469  vec vec_in4 = __vload_pred(pred_6_elem, (vec *) (&U[0 + 4 * colUStride]));
470  vec vec_in5 = __vload_pred(pred_6_elem, (vec *) (&U[0 + 5 * colUStride]));
471 #endif
472 
473  superdiag[diag_idx] = (*scale) * (*s);
474  *scale = 0;
475 
476  /* -------------------------------------------------- */
477  /* U COLUMN PROCESS FOR i = 0 */
478  /* -------------------------------------------------- */
479 
480  /* Accumulate i = 0 th column absolute values to get 'scale' */
481  double in0_s0 = vec_in0.s[0]; // __as_double(__vgetd_vrd(c7x::as_long_vec(vec_in0), 0u));
482  double in1_s0 = vec_in1.s[0]; // __as_double(__vgetd_vrd(c7x::as_long_vec(vec_in1), 0u));
483  double in2_s0 = vec_in2.s[0]; // __as_double(__vgetd_vrd(c7x::as_long_vec(vec_in2), 0u));
484  double in3_s0 = vec_in3.s[0]; // __as_double(__vgetd_vrd(c7x::as_long_vec(vec_in3), 0u));
485  double in4_s0 = vec_in4.s[0]; // __as_double(__vgetd_vrd(c7x::as_long_vec(vec_in4), 0u));
486  double in5_s0 = vec_in5.s[0]; // __as_double(__vgetd_vrd(c7x::as_long_vec(vec_in5), 0u));
487 
488  /* Store the first columns values in another vector */
489  vec vec_first_col = (vec) 0.0;
490  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in0_s0), 0, c7x::as_long_vec(vec_first_col)));
491  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in1_s0), 1, c7x::as_long_vec(vec_first_col)));
492  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in2_s0), 2, c7x::as_long_vec(vec_first_col)));
493  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in3_s0), 3, c7x::as_long_vec(vec_first_col)));
494  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in4_s0), 4, c7x::as_long_vec(vec_first_col)));
495  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in5_s0), 5, c7x::as_long_vec(vec_first_col)));
496 
497  (*scale) += __abs(in0_s0);
498  (*scale) += __abs(in1_s0);
499  (*scale) += __abs(in2_s0);
500  (*scale) += __abs(in3_s0);
501  (*scale) += __abs(in4_s0);
502  (*scale) += __abs(in5_s0);
503 
504  /* section 2- 132 */
505 
506  if ((*scale) > 0) {
507  s2 = 0;
508  /* normalize the 'vec_first_col' */
509  vec_first_col = vec_first_col * (vec) getRecip((*scale)); // / scale;
510  vec vec_first_col_sq = vec_first_col * vec_first_col;
511  c7x_horizontal_add_6_elems(vec_first_col_sq, &s2);
512 
513  *s = getSqrt(s2);
514 
515  if (vec_first_col.s[0] >= 0) {
516  *s = -(*s);
517  }
518 
519  half_norm_squared = vec_first_col.s[0] * (*s) - s2;
520  vec inv_hnsq = (vec) getRecip(half_norm_squared);
521  vec_first_col.s[0] -= (*s);
522 
523  vec vec_si = (vec) 0;
524 
525  vec_si += (vec_first_col.s[0] * vec_in0);
526  vec_si += (vec_first_col.s[1] * vec_in1);
527  vec_si += (vec_first_col.s[2] * vec_in2);
528  vec_si += (vec_first_col.s[3] * vec_in3);
529  vec_si += (vec_first_col.s[4] * vec_in4);
530  vec_si += (vec_first_col.s[5] * vec_in5);
531 
532  vec_si = vec_si * inv_hnsq; //(vec) getRecip(half_norm_squared); // / half_norm_squared;
533 
534  vec_in0 += vec_si * vec_first_col.s[0];
535  vec_in1 += vec_si * vec_first_col.s[1];
536  vec_in2 += vec_si * vec_first_col.s[2];
537  vec_in3 += vec_si * vec_first_col.s[3];
538  vec_in4 += vec_si * vec_first_col.s[4];
539  vec_in5 += vec_si * vec_first_col.s[5];
540  }
541  /* section 3- 69 */
542  vec_first_col *= (*scale);
543  vec_in0.s[0] = vec_first_col.s[0];
544  vec_in1.s[0] = vec_first_col.s[1];
545  vec_in2.s[0] = vec_first_col.s[2];
546  vec_in3.s[0] = vec_first_col.s[3];
547  vec_in4.s[0] = vec_first_col.s[4];
548  vec_in5.s[0] = vec_first_col.s[5];
549 
550  diag[diag_idx] = (*s) * (*scale);
551 
552  *s = 0;
553  *scale = 0;
554 
555  vec vec_row_mask = gen_u_row_process_mask<vec>();
556 
557  vec vec_first_row = vec_row_mask * vec_in0;
558  c7x_horizontal_add_6_elems(__abs(vec_first_row), scale);
559 
560  /* section 4- 142 */
561 
562  if ((*scale) > 0) {
563  s2 = 0;
564  vec_first_row = vec_first_row * (vec) getRecip((*scale)); // / scale;
565  vec vec_first_row_sq = vec_first_row * vec_first_row;
566  c7x_horizontal_add_6_elems(vec_first_row_sq, &s2);
567 
568  *s = getSqrt(s2);
569  if (vec_first_row.s[(num_cols - 1)] >= 0) {
570  *s = -(*s);
571  }
572 
573  half_norm_squared = vec_first_row.s[1] * (*s) - s2;
574  vec_first_row.s[1] -= (*s);
575 
576  vec vec_superdiag_temp = vec_first_row * (vec) getRecip(half_norm_squared); // / half_norm_squared;
577 
578  vec vec_si = (vec) 0;
579 
580  vec vec_temp1 = vec_first_row * vec_in1;
581  vec vec_temp2 = vec_first_row * vec_in2;
582  vec vec_temp3 = vec_first_row * vec_in3;
583  vec vec_temp4 = vec_first_row * vec_in4;
584  vec vec_temp5 = vec_first_row * vec_in5;
585 
586  c7x_horizontal_add_6_elems(vec_temp1, &vec_si.s[1]);
587  c7x_horizontal_add_6_elems(vec_temp2, &vec_si.s[2]);
588  c7x_horizontal_add_6_elems(vec_temp3, &vec_si.s[3]);
589  c7x_horizontal_add_6_elems(vec_temp4, &vec_si.s[4]);
590  c7x_horizontal_add_6_elems(vec_temp5, &vec_si.s[5]);
591 
592  vec_in1 += vec_si.s[1] * vec_superdiag_temp;
593  vec_in2 += vec_si.s[2] * vec_superdiag_temp;
594  vec_in3 += vec_si.s[3] * vec_superdiag_temp;
595  vec_in4 += vec_si.s[4] * vec_superdiag_temp;
596  vec_in5 += vec_si.s[5] * vec_superdiag_temp;
597 
598  vec vec_row_rev_mask = gen_u_row_process_rev_mask<vec>();
599  vec_in0 *= vec_row_rev_mask;
600  vec_in0 += vec_first_row * (*scale);
601  }
602 
603 #if U_PROC_SKIP_VLOAD
604  __vstore_pred(pred_6_elem, (vec *) (&U[0 + 0 * colUStride]), vec_in0);
605  U[0 + 1 * colUStride] = vec_in1.s[0];
606  U[0 + 2 * colUStride] = vec_in2.s[0];
607  U[0 + 3 * colUStride] = vec_in3.s[0];
608  U[0 + 4 * colUStride] = vec_in4.s[0];
609  U[0 + 5 * colUStride] = vec_in5.s[0];
610  vec_in0_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in1), 64u));
611  vec_in1_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in2), 64u));
612  vec_in2_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in3), 64u));
613  vec_in3_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in4), 64u));
614  vec_in4_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in5), 64u));
615 
616 #else
617  /* section 5- 8 */
618  __vstore_pred(pred_6_elem, (vec *) (&U[0 + 0 * colUStride]), vec_in0);
619  __vstore_pred(pred_6_elem, (vec *) (&U[0 + 1 * colUStride]), vec_in1);
620  __vstore_pred(pred_6_elem, (vec *) (&U[0 + 2 * colUStride]), vec_in2);
621  __vstore_pred(pred_6_elem, (vec *) (&U[0 + 3 * colUStride]), vec_in3);
622  __vstore_pred(pred_6_elem, (vec *) (&U[0 + 4 * colUStride]), vec_in4);
623  __vstore_pred(pred_6_elem, (vec *) (&U[0 + 5 * colUStride]), vec_in5);
624 #endif
625  }
626 
627  return;
628 }
629 
630 template <typename dataType>
631 static inline void u_process_2nd_iter(dataType *U,
632  const int colUStride,
633  dataType *diag,
634  dataType *superdiag,
635  dataType *scale,
636  dataType *s,
637  const uint32_t num_cols,
638  const uint32_t diag_idx);
639 template <>
640 inline void u_process_2nd_iter<float>(float *U,
641  const int colUStride,
642  float *diag,
643  float *superdiag,
644  float *scale,
645  float *s,
646  const uint32_t num_cols,
647  const uint32_t diag_idx)
648 {
649  return;
650 }
651 template <>
652 inline void u_process_2nd_iter<double>(double *U,
653  const int colUStride,
654  double *diag,
655  double *superdiag,
656  double *scale,
657  double *s,
658  const uint32_t num_cols,
659  const uint32_t diag_idx)
660 {
661  double s2 = 0, half_norm_squared = 0;
662  /* -------------------------------------------------- */
663  /* For i = 1 */
664  /* -------------------------------------------------- */
665  {
666  typedef typename c7x::make_full_vector<double>::type vec;
667 
668  // __vpred pred_5_elem = __create_vpred(0x000000FFFFFFFFFFU);
669  __vpred pred_5_elem = __mask_long(num_cols);
670 
671 #if U_PROC_SKIP_VLOAD
672  vec vec_in0 = vec_in0_u_proc;
673  vec vec_in1 = vec_in1_u_proc;
674  vec vec_in2 = vec_in2_u_proc;
675  vec vec_in3 = vec_in3_u_proc;
676  vec vec_in4 = vec_in4_u_proc;
677 #else
678  vec vec_in0 = __vload_pred(pred_5_elem, (vec *) (&U[0 + 0 * colUStride]));
679  vec vec_in1 = __vload_pred(pred_5_elem, (vec *) (&U[0 + 1 * colUStride]));
680  vec vec_in2 = __vload_pred(pred_5_elem, (vec *) (&U[0 + 2 * colUStride]));
681  vec vec_in3 = __vload_pred(pred_5_elem, (vec *) (&U[0 + 3 * colUStride]));
682  vec vec_in4 = __vload_pred(pred_5_elem, (vec *) (&U[0 + 4 * colUStride]));
683 #endif
684 
685  superdiag[diag_idx] = (*scale) * (*s);
686  *scale = 0;
687 
688  /* -------------------------------------------------- */
689  /* U COLUMN PROCESS FOR i = 0 */
690  /* -------------------------------------------------- */
691 
692  /* Accumulate i = 0 th column absolute values to get 'scale' */
693  double in0_s0 = vec_in0.s[0];
694  double in1_s0 = vec_in1.s[0];
695  double in2_s0 = vec_in2.s[0];
696  double in3_s0 = vec_in3.s[0];
697  double in4_s0 = vec_in4.s[0];
698 
699  /* Store the first columns values in another vector */
700  vec vec_first_col = (vec) 0.0;
701  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in0_s0), 0, c7x::as_long_vec(vec_first_col)));
702  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in1_s0), 1, c7x::as_long_vec(vec_first_col)));
703  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in2_s0), 2, c7x::as_long_vec(vec_first_col)));
704  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in3_s0), 3, c7x::as_long_vec(vec_first_col)));
705  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in4_s0), 4, c7x::as_long_vec(vec_first_col)));
706 
707  (*scale) += __abs(in0_s0);
708  (*scale) += __abs(in1_s0);
709  (*scale) += __abs(in2_s0);
710  (*scale) += __abs(in3_s0);
711  (*scale) += __abs(in4_s0);
712 
713  if ((*scale) > 0) {
714  s2 = 0;
715  /* normalize the 'vec_first_col' */
716  vec_first_col = vec_first_col * (vec) getRecip((*scale)); // / scale;
717  vec vec_first_col_sq = vec_first_col * vec_first_col;
718  c7x_horizontal_add_5_elems(vec_first_col_sq, &s2);
719 
720  *s = getSqrt(s2);
721  if (vec_first_col.s[0] >= 0) {
722  *s = -(*s);
723  }
724 
725  half_norm_squared = vec_first_col.s[0] * (*s) - s2;
726  vec_first_col.s[0] -= (*s);
727 
728  vec vec_si = (vec) 0;
729 
730  vec_si += (vec_first_col.s[0] * vec_in0);
731  vec_si += (vec_first_col.s[1] * vec_in1);
732  vec_si += (vec_first_col.s[2] * vec_in2);
733  vec_si += (vec_first_col.s[3] * vec_in3);
734  vec_si += (vec_first_col.s[4] * vec_in4);
735 
736  vec_si = vec_si * (vec) getRecip(half_norm_squared); // / half_norm_squared;
737 
738  vec_in0 += vec_si * vec_first_col.s[0];
739  vec_in1 += vec_si * vec_first_col.s[1];
740  vec_in2 += vec_si * vec_first_col.s[2];
741  vec_in3 += vec_si * vec_first_col.s[3];
742  vec_in4 += vec_si * vec_first_col.s[4];
743  }
744  vec_first_col *= (*scale);
745  vec_in0.s[0] = vec_first_col.s[0];
746  vec_in1.s[0] = vec_first_col.s[1];
747  vec_in2.s[0] = vec_first_col.s[2];
748  vec_in3.s[0] = vec_first_col.s[3];
749  vec_in4.s[0] = vec_first_col.s[4];
750 
751  diag[diag_idx] = (*s) * (*scale);
752 
753  *s = 0;
754  *scale = 0;
755 
756  vec vec_row_mask = gen_u_row_process_mask<vec>();
757 
758  vec vec_first_row = vec_row_mask * vec_in0;
759  c7x_horizontal_add_5_elems(__abs(vec_first_row), scale);
760 
761  if ((*scale) > 0) {
762  s2 = 0;
763  vec_first_row = vec_first_row * (vec) getRecip((*scale)); // / scale;
764  vec vec_first_row_sq = vec_first_row * vec_first_row;
765  c7x_horizontal_add_5_elems(vec_first_row_sq, &s2);
766 
767  *s = getSqrt(s2);
768  if (vec_first_row.s[(num_cols - 1)] >= 0) {
769  *s = -(*s);
770  }
771 
772  half_norm_squared = vec_first_row.s[1] * (*s) - s2;
773  vec_first_row.s[1] -= (*s);
774 
775  vec vec_superdiag_temp = vec_first_row * (vec) getRecip(half_norm_squared); // / half_norm_squared;
776 
777  vec vec_si = (vec) 0;
778 
779  vec vec_temp1 = vec_first_row * vec_in1;
780  vec vec_temp2 = vec_first_row * vec_in2;
781  vec vec_temp3 = vec_first_row * vec_in3;
782  vec vec_temp4 = vec_first_row * vec_in4;
783 
784  c7x_horizontal_add_5_elems(vec_temp1, &vec_si.s[1]);
785  c7x_horizontal_add_5_elems(vec_temp2, &vec_si.s[2]);
786  c7x_horizontal_add_5_elems(vec_temp3, &vec_si.s[3]);
787  c7x_horizontal_add_5_elems(vec_temp4, &vec_si.s[4]);
788 
789  vec_in1 += vec_si.s[1] * vec_superdiag_temp;
790  vec_in2 += vec_si.s[2] * vec_superdiag_temp;
791  vec_in3 += vec_si.s[3] * vec_superdiag_temp;
792  vec_in4 += vec_si.s[4] * vec_superdiag_temp;
793 
794  vec vec_row_rev_mask = gen_u_row_process_rev_mask<vec>();
795  vec_in0 *= vec_row_rev_mask;
796  vec_in0 += vec_first_row * (*scale);
797  }
798 
799 #if U_PROC_SKIP_VLOAD
800  __vstore_pred(pred_5_elem, (vec *) (&U[0 + 0 * colUStride]), vec_in0);
801  U[0 + 1 * colUStride] = vec_in1.s[0];
802  U[0 + 2 * colUStride] = vec_in2.s[0];
803  U[0 + 3 * colUStride] = vec_in3.s[0];
804  U[0 + 4 * colUStride] = vec_in4.s[0];
805 
806  vec_in0_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in1), 64u));
807  vec_in1_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in2), 64u));
808  vec_in2_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in3), 64u));
809  vec_in3_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in4), 64u));
810 #else
811  __vstore_pred(pred_5_elem, (vec *) (&U[0 + 0 * colUStride]), vec_in0);
812  __vstore_pred(pred_5_elem, (vec *) (&U[0 + 1 * colUStride]), vec_in1);
813  __vstore_pred(pred_5_elem, (vec *) (&U[0 + 2 * colUStride]), vec_in2);
814  __vstore_pred(pred_5_elem, (vec *) (&U[0 + 3 * colUStride]), vec_in3);
815  __vstore_pred(pred_5_elem, (vec *) (&U[0 + 4 * colUStride]), vec_in4);
816 #endif
817  }
818  return;
819 }
820 
821 template <typename dataType>
822 static inline void u_process_3rd_iter(dataType *U,
823  const int colUStride,
824  dataType *diag,
825  dataType *superdiag,
826  dataType *scale,
827  dataType *s,
828  const uint32_t num_cols,
829  const uint32_t diag_idx);
830 template <>
831 inline void u_process_3rd_iter<float>(float *U,
832  const int colUStride,
833  float *diag,
834  float *superdiag,
835  float *scale,
836  float *s,
837  const uint32_t num_cols,
838  const uint32_t diag_idx)
839 {
840  return;
841 }
842 template <>
843 inline void u_process_3rd_iter<double>(double *U,
844  const int colUStride,
845  double *diag,
846  double *superdiag,
847  double *scale,
848  double *s,
849  const uint32_t num_cols,
850  const uint32_t diag_idx)
851 {
852  double s2 = 0, half_norm_squared = 0;
853  /* -------------------------------------------------- */
854  /* For i = 2 */
855  /* -------------------------------------------------- */
856  {
857  typedef typename c7x::make_full_vector<double>::type vec;
858 
859  // __vpred pred_4_elem = __create_vpred(0x00000000FFFFFFFFU);
860  __vpred pred_4_elem = __mask_long(num_cols);
861 
862 #if U_PROC_SKIP_VLOAD
863  vec vec_in0 = vec_in0_u_proc;
864  vec vec_in1 = vec_in1_u_proc;
865  vec vec_in2 = vec_in2_u_proc;
866  vec vec_in3 = vec_in3_u_proc;
867 #else
868  vec vec_in0 = __vload_pred(pred_4_elem, (vec *) (&U[0 + 0 * colUStride]));
869  vec vec_in1 = __vload_pred(pred_4_elem, (vec *) (&U[0 + 1 * colUStride]));
870  vec vec_in2 = __vload_pred(pred_4_elem, (vec *) (&U[0 + 2 * colUStride]));
871  vec vec_in3 = __vload_pred(pred_4_elem, (vec *) (&U[0 + 3 * colUStride]));
872 #endif
873 
874  superdiag[diag_idx] = (*scale) * (*s);
875  *scale = 0;
876 
877  /* -------------------------------------------------- */
878  /* U COLUMN PROCESS FOR i = 0 */
879  /* -------------------------------------------------- */
880 
881  /* Accumulate i = 0 th column absolute values to get 'scale' */
882  double in0_s0 = vec_in0.s[0];
883  double in1_s0 = vec_in1.s[0];
884  double in2_s0 = vec_in2.s[0];
885  double in3_s0 = vec_in3.s[0];
886 
887  /* Store the first columns values in another vector */
888  vec vec_first_col = (vec) 0.0;
889  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in0_s0), 0, c7x::as_long_vec(vec_first_col)));
890  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in1_s0), 1, c7x::as_long_vec(vec_first_col)));
891  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in2_s0), 2, c7x::as_long_vec(vec_first_col)));
892  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in3_s0), 3, c7x::as_long_vec(vec_first_col)));
893 
894  (*scale) += __abs(in0_s0);
895  (*scale) += __abs(in1_s0);
896  (*scale) += __abs(in2_s0);
897  (*scale) += __abs(in3_s0);
898 
899  if ((*scale) > 0) {
900  s2 = 0;
901  /* normalize the 'vec_first_col' */
902  vec_first_col = vec_first_col * (vec) getRecip((*scale)); // / scale;
903  vec vec_first_col_sq = vec_first_col * vec_first_col;
904  c7x_horizontal_add_4_elems(vec_first_col_sq, &s2);
905 
906  *s = getSqrt(s2);
907  if (vec_first_col.s[0] >= 0) {
908  *s = -(*s);
909  }
910 
911  half_norm_squared = vec_first_col.s[0] * (*s) - s2;
912  vec_first_col.s[0] -= (*s);
913 
914  vec vec_si = (vec) 0;
915 
916  vec_si += (vec_first_col.s[0] * vec_in0);
917  vec_si += (vec_first_col.s[1] * vec_in1);
918  vec_si += (vec_first_col.s[2] * vec_in2);
919  vec_si += (vec_first_col.s[3] * vec_in3);
920 
921  vec_si = vec_si * (vec) getRecip(half_norm_squared); // / half_norm_squared;
922 
923  vec_in0 += vec_si * vec_first_col.s[0];
924  vec_in1 += vec_si * vec_first_col.s[1];
925  vec_in2 += vec_si * vec_first_col.s[2];
926  vec_in3 += vec_si * vec_first_col.s[3];
927  }
928  vec_first_col *= (*scale);
929  vec_in0.s[0] = vec_first_col.s[0];
930  vec_in1.s[0] = vec_first_col.s[1];
931  vec_in2.s[0] = vec_first_col.s[2];
932  vec_in3.s[0] = vec_first_col.s[3];
933 
934  diag[diag_idx] = (*s) * (*scale);
935 
936  *s = 0;
937  *scale = 0;
938 
939  vec vec_row_mask = gen_u_row_process_mask<vec>();
940 
941  vec vec_first_row = vec_row_mask * vec_in0;
942  c7x_horizontal_add_4_elems(__abs(vec_first_row), scale);
943 
944  if ((*scale) > 0) {
945  s2 = 0;
946  vec_first_row = vec_first_row * (vec) getRecip((*scale)); // / scale;
947  vec vec_first_row_sq = vec_first_row * vec_first_row;
948  c7x_horizontal_add_4_elems(vec_first_row_sq, &s2);
949 
950  *s = getSqrt(s2);
951  if (vec_first_row.s[(num_cols - 1)] >= 0) {
952  *s = -(*s);
953  }
954 
955  half_norm_squared = vec_first_row.s[1] * (*s) - s2;
956  vec_first_row.s[1] -= (*s);
957 
958  vec vec_superdiag_temp = vec_first_row * (vec) getRecip(half_norm_squared); // / half_norm_squared;
959 
960  vec vec_si = (vec) 0;
961 
962  vec vec_temp1 = vec_first_row * vec_in1;
963  vec vec_temp2 = vec_first_row * vec_in2;
964  vec vec_temp3 = vec_first_row * vec_in3;
965 
966  c7x_horizontal_add_4_elems(vec_temp1, &vec_si.s[1]);
967  c7x_horizontal_add_4_elems(vec_temp2, &vec_si.s[2]);
968  c7x_horizontal_add_4_elems(vec_temp3, &vec_si.s[3]);
969 
970  vec_in1 += vec_si.s[1] * vec_superdiag_temp;
971  vec_in2 += vec_si.s[2] * vec_superdiag_temp;
972  vec_in3 += vec_si.s[3] * vec_superdiag_temp;
973 
974  vec vec_row_rev_mask = gen_u_row_process_rev_mask<vec>();
975  vec_in0 *= vec_row_rev_mask;
976  vec_in0 += vec_first_row * (*scale);
977  }
978 
979 #if U_PROC_SKIP_VLOAD
980  __vstore_pred(pred_4_elem, (vec *) (&U[0 + 0 * colUStride]), vec_in0);
981  U[0 + 1 * colUStride] = vec_in1.s[0];
982  U[0 + 2 * colUStride] = vec_in2.s[0];
983  U[0 + 3 * colUStride] = vec_in3.s[0];
984 
985  vec_in0_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in1), 64u));
986  vec_in1_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in2), 64u));
987  vec_in2_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in3), 64u));
988 #else
989  __vstore_pred(pred_4_elem, (vec *) (&U[0 + 0 * colUStride]), vec_in0);
990  __vstore_pred(pred_4_elem, (vec *) (&U[0 + 1 * colUStride]), vec_in1);
991  __vstore_pred(pred_4_elem, (vec *) (&U[0 + 2 * colUStride]), vec_in2);
992  __vstore_pred(pred_4_elem, (vec *) (&U[0 + 3 * colUStride]), vec_in3);
993 #endif
994  }
995  return;
996 }
997 
998 template <typename dataType>
999 static inline void u_process_4th_iter(dataType *U,
1000  const int colUStride,
1001  dataType *diag,
1002  dataType *superdiag,
1003  dataType *scale,
1004  dataType *s,
1005  const uint32_t num_cols,
1006  const uint32_t diag_idx);
1007 template <>
1008 inline void u_process_4th_iter<float>(float *U,
1009  const int colUStride,
1010  float *diag,
1011  float *superdiag,
1012  float *scale,
1013  float *s,
1014  const uint32_t num_cols,
1015  const uint32_t diag_idx)
1016 {
1017  return;
1018 }
1019 template <>
1020 inline void u_process_4th_iter<double>(double *U,
1021  const int colUStride,
1022  double *diag,
1023  double *superdiag,
1024  double *scale,
1025  double *s,
1026  const uint32_t num_cols,
1027  const uint32_t diag_idx)
1028 {
1029  double s2 = 0, half_norm_squared = 0;
1030  /* -------------------------------------------------- */
1031  /* For i = 3 */
1032  /* -------------------------------------------------- */
1033  {
1034  typedef typename c7x::make_full_vector<double>::type vec;
1035  // __vpred pred_3_elem = __create_vpred(0x0000000000FFFFFFU);
1036  __vpred pred_3_elem = __mask_long(num_cols);
1037 
1038 #if U_PROC_SKIP_VLOAD
1039  vec vec_in0 = vec_in0_u_proc;
1040  vec vec_in1 = vec_in1_u_proc;
1041  vec vec_in2 = vec_in2_u_proc;
1042 #else
1043  vec vec_in0 = __vload_pred(pred_3_elem, (vec *) (&U[0 + 0 * colUStride]));
1044  vec vec_in1 = __vload_pred(pred_3_elem, (vec *) (&U[0 + 1 * colUStride]));
1045  vec vec_in2 = __vload_pred(pred_3_elem, (vec *) (&U[0 + 2 * colUStride]));
1046 #endif
1047  superdiag[diag_idx] = (*scale) * (*s);
1048  *scale = 0;
1049 
1050  /* -------------------------------------------------- */
1051  /* U COLUMN PROCESS FOR i = 0 */
1052  /* -------------------------------------------------- */
1053 
1054  /* Accumulate i = 0 th column absolute values to get 'scale' */
1055  double in0_s0 = vec_in0.s[0];
1056  double in1_s0 = vec_in1.s[0];
1057  double in2_s0 = vec_in2.s[0];
1058 
1059  /* Store the first columns values in another vector */
1060  vec vec_first_col = (vec) 0.0;
1061  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in0_s0), 0, c7x::as_long_vec(vec_first_col)));
1062  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in1_s0), 1, c7x::as_long_vec(vec_first_col)));
1063  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in2_s0), 2, c7x::as_long_vec(vec_first_col)));
1064 
1065  (*scale) += __abs(in0_s0);
1066  (*scale) += __abs(in1_s0);
1067  (*scale) += __abs(in2_s0);
1068 
1069  if ((*scale) > 0) {
1070  s2 = 0;
1071  /* normalize the 'vec_first_col' */
1072  vec_first_col = vec_first_col * (vec) getRecip((*scale)); // / scale;
1073  vec vec_first_col_sq = vec_first_col * vec_first_col;
1074  c7x_horizontal_add_3_elems(vec_first_col_sq, &s2);
1075 
1076  *s = getSqrt(s2);
1077  if (vec_first_col.s[0] >= 0) {
1078  *s = -(*s);
1079  }
1080 
1081  half_norm_squared = vec_first_col.s[0] * (*s) - s2;
1082  vec_first_col.s[0] -= (*s);
1083 
1084  vec vec_si = (vec) 0;
1085 
1086  vec_si += (vec_first_col.s[0] * vec_in0);
1087  vec_si += (vec_first_col.s[1] * vec_in1);
1088  vec_si += (vec_first_col.s[2] * vec_in2);
1089 
1090  vec_si = vec_si * (vec) getRecip(half_norm_squared); // / half_norm_squared;
1091 
1092  vec_in0 += vec_si * vec_first_col.s[0];
1093  vec_in1 += vec_si * vec_first_col.s[1];
1094  vec_in2 += vec_si * vec_first_col.s[2];
1095  }
1096  vec_first_col *= (*scale);
1097  vec_in0.s[0] = vec_first_col.s[0];
1098  vec_in1.s[0] = vec_first_col.s[1];
1099  vec_in2.s[0] = vec_first_col.s[2];
1100 
1101  diag[diag_idx] = (*s) * (*scale);
1102 
1103  *s = 0;
1104  *scale = 0;
1105 
1106  vec vec_row_mask = gen_u_row_process_mask<vec>();
1107 
1108  vec vec_first_row = vec_row_mask * vec_in0;
1109  c7x_horizontal_add_3_elems(__abs(vec_first_row), scale);
1110 
1111  if ((*scale) > 0) {
1112  s2 = 0;
1113  vec_first_row = vec_first_row * (vec) getRecip((*scale)); // / scale;
1114  vec vec_first_row_sq = vec_first_row * vec_first_row;
1115  c7x_horizontal_add_3_elems(vec_first_row_sq, &s2);
1116 
1117  *s = getSqrt(s2);
1118  if (vec_first_row.s[(num_cols - 1)] >= 0) {
1119  *s = -(*s);
1120  }
1121 
1122  half_norm_squared = vec_first_row.s[1] * (*s) - s2;
1123  vec_first_row.s[1] -= (*s);
1124 
1125  vec vec_superdiag_temp = vec_first_row * (vec) getRecip(half_norm_squared); // / half_norm_squared;
1126 
1127  vec vec_si = (vec) 0;
1128 
1129  vec vec_temp1 = vec_first_row * vec_in1;
1130  vec vec_temp2 = vec_first_row * vec_in2;
1131 
1132  c7x_horizontal_add_3_elems(vec_temp1, &vec_si.s[1]);
1133  c7x_horizontal_add_3_elems(vec_temp2, &vec_si.s[2]);
1134 
1135  vec_in1 += vec_si.s[1] * vec_superdiag_temp;
1136  vec_in2 += vec_si.s[2] * vec_superdiag_temp;
1137 
1138  vec vec_row_rev_mask = gen_u_row_process_rev_mask<vec>();
1139  vec_in0 *= vec_row_rev_mask;
1140  vec_in0 += vec_first_row * (*scale);
1141  }
1142 
1143 #if U_PROC_SKIP_VLOAD
1144  __vstore_pred(pred_3_elem, (vec *) (&U[0 + 0 * colUStride]), vec_in0);
1145  U[0 + 1 * colUStride] = vec_in1.s[0];
1146  U[0 + 2 * colUStride] = vec_in2.s[0];
1147 
1148  vec_in0_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in1), 64u));
1149  vec_in1_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in2), 64u));
1150 #else
1151  __vstore_pred(pred_3_elem, (vec *) (&U[0 + 0 * colUStride]), vec_in0);
1152  __vstore_pred(pred_3_elem, (vec *) (&U[0 + 1 * colUStride]), vec_in1);
1153  __vstore_pred(pred_3_elem, (vec *) (&U[0 + 2 * colUStride]), vec_in2);
1154 #endif
1155  }
1156  return;
1157 }
1158 
1159 template <typename dataType>
1160 static inline void u_process_5th_iter(dataType *U,
1161  const int colUStride,
1162  dataType *diag,
1163  dataType *superdiag,
1164  dataType *scale,
1165  dataType *s,
1166  const uint32_t num_cols,
1167  const uint32_t diag_idx);
1168 template <>
1169 inline void u_process_5th_iter<float>(float *U,
1170  const int colUStride,
1171  float *diag,
1172  float *superdiag,
1173  float *scale,
1174  float *s,
1175  const uint32_t num_cols,
1176  const uint32_t diag_idx)
1177 {
1178  return;
1179 }
1180 template <>
1181 inline void u_process_5th_iter<double>(double *U,
1182  const int colUStride,
1183  double *diag,
1184  double *superdiag,
1185  double *scale,
1186  double *s,
1187  const uint32_t num_cols,
1188  const uint32_t diag_idx)
1189 {
1190  double s2 = 0, half_norm_squared = 0;
1191  /* -------------------------------------------------- */
1192  /* For i = 4 */
1193  /* -------------------------------------------------- */
1194 
1195  {
1196  typedef typename c7x::make_full_vector<double>::type vec;
1197 
1198  // __vpred pred_2_elem = __create_vpred(0x000000000000FFFFU);
1199  __vpred pred_2_elem = __mask_long(num_cols);
1200 
1201 #if U_PROC_SKIP_VLOAD
1202  vec vec_in0 = vec_in0_u_proc;
1203  vec vec_in1 = vec_in1_u_proc;
1204 #else
1205  vec vec_in0 = __vload_pred(pred_2_elem, (vec *) (&U[0 + 0 * colUStride]));
1206  vec vec_in1 = __vload_pred(pred_2_elem, (vec *) (&U[0 + 1 * colUStride]));
1207 #endif
1208  superdiag[diag_idx] = (*scale) * (*s);
1209  *scale = 0;
1210 
1211  /* -------------------------------------------------- */
1212  /* U COLUMN PROCESS FOR i = 0 */
1213  /* -------------------------------------------------- */
1214 
1215  /* Accumulate i = 0 th column absolute values to get 'scale' */
1216  double in0_s0 = vec_in0.s[0];
1217  double in1_s0 = vec_in1.s[0];
1218 
1219  /* Store the first columns values in another vector */
1220  vec vec_first_col = (vec) 0.0;
1221  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in0_s0), 0, c7x::as_long_vec(vec_first_col)));
1222  vec_first_col = c7x::reinterpret<vec>(__vputd_dkv(__as_long(in1_s0), 1, c7x::as_long_vec(vec_first_col)));
1223 
1224  (*scale) += __abs(in0_s0);
1225  (*scale) += __abs(in1_s0);
1226 
1227  if ((*scale) > 0) {
1228  s2 = 0;
1229  /* normalize the 'vec_first_col' */
1230  vec_first_col = vec_first_col * (vec) getRecip((*scale)); // / scale;
1231  vec vec_first_col_sq = vec_first_col * vec_first_col;
1232  c7x_horizontal_add_2_elems(vec_first_col_sq, &s2);
1233 
1234  *s = getSqrt(s2);
1235  if (vec_first_col.s[0] >= 0) {
1236  *s = -(*s);
1237  }
1238 
1239  half_norm_squared = vec_first_col.s[0] * (*s) - s2;
1240  vec_first_col.s[0] -= (*s);
1241 
1242  vec vec_si = (vec) 0;
1243 
1244  vec_si += (vec_first_col.s[0] * vec_in0);
1245  vec_si += (vec_first_col.s[1] * vec_in1);
1246 
1247  vec_si = vec_si * (vec) getRecip(half_norm_squared); // / half_norm_squared;
1248 
1249  vec_in0 += vec_si * vec_first_col.s[0];
1250  vec_in1 += vec_si * vec_first_col.s[1];
1251  }
1252  vec_first_col *= (*scale);
1253  vec_in0.s[0] = vec_first_col.s[0];
1254  vec_in1.s[0] = vec_first_col.s[1];
1255 
1256  diag[diag_idx] = (*s) * (*scale);
1257 
1258  *s = 0;
1259  *scale = 0;
1260 
1261  vec vec_row_mask = gen_u_row_process_mask<vec>();
1262 
1263  vec vec_first_row = vec_row_mask * vec_in0;
1264  c7x_horizontal_add_2_elems(__abs(vec_first_row), scale);
1265 
1266  if ((*scale) > 0) {
1267  s2 = 0;
1268  vec_first_row = vec_first_row * (vec) getRecip((*scale)); // / scale;
1269  vec vec_first_row_sq = vec_first_row * vec_first_row;
1270  c7x_horizontal_add_2_elems(vec_first_row_sq, &s2);
1271 
1272  *s = getSqrt(s2);
1273  if (vec_first_row.s[(num_cols - 1)] >= 0) {
1274  *s = -(*s);
1275  }
1276 
1277  half_norm_squared = vec_first_row.s[1] * (*s) - s2;
1278  vec_first_row.s[1] -= (*s);
1279 
1280  vec vec_superdiag_temp = vec_first_row * (vec) getRecip(half_norm_squared); // / half_norm_squared;
1281 
1282  vec vec_si = (vec) 0;
1283 
1284  vec vec_temp1 = vec_first_row * vec_in1;
1285 
1286  c7x_horizontal_add_2_elems(vec_temp1, &vec_si.s[1]);
1287 
1288  vec_in1 += vec_si.s[1] * vec_superdiag_temp;
1289 
1290  vec vec_row_rev_mask = gen_u_row_process_rev_mask<vec>();
1291  vec_in0 *= vec_row_rev_mask;
1292  vec_in0 += vec_first_row * (*scale);
1293  }
1294 
1295 #if U_PROC_SKIP_VLOAD
1296  __vstore_pred(pred_2_elem, (vec *) (&U[0 + 0 * colUStride]), vec_in0);
1297  __vstore_pred(pred_2_elem, (vec *) (&U[0 + 1 * colUStride]), vec_in1);
1298 
1299  vec_in0_u_proc = c7x::as_double_vec(__shift_right_full(c7x::as_long_vec(vec_in1), 64u));
1300 #else
1301  __vstore_pred(pred_2_elem, (vec *) (&U[0 + 0 * colUStride]), vec_in0);
1302  __vstore_pred(pred_2_elem, (vec *) (&U[0 + 1 * colUStride]), vec_in1);
1303 #endif
1304  }
1305  return;
1306 }
1307 
1308 template <typename dataType>
1309 static inline void u_process_6th_iter(dataType *U,
1310  const int colUStride,
1311  dataType *diag,
1312  dataType *superdiag,
1313  dataType *scale,
1314  dataType *s,
1315  const uint32_t num_cols,
1316  const uint32_t diag_idx);
1317 template <>
1318 inline void u_process_6th_iter<float>(float *U,
1319  const int colUStride,
1320  float *diag,
1321  float *superdiag,
1322  float *scale,
1323  float *s,
1324  const uint32_t num_cols,
1325  const uint32_t diag_idx)
1326 {
1327  return;
1328 }
1329 template <>
1330 inline void u_process_6th_iter<double>(double *U,
1331  const int colUStride,
1332  double *diag,
1333  double *superdiag,
1334  double *scale,
1335  double *s,
1336  const uint32_t num_cols,
1337  const uint32_t diag_idx)
1338 {
1339  double s2 = 0, half_norm_squared = 0;
1340  /* -------------------------------------------------- */
1341  /* For i = 5 */
1342  /* -------------------------------------------------- */
1343  {
1344  double vec_in0 = U[0 + 0 * colUStride];
1345 
1346  superdiag[5] = (*scale) * (*s);
1347  *scale = 0;
1348 
1349  /* -------------------------------------------------- */
1350  /* U COLUMN PROCESS FOR i = 0 */
1351  /* -------------------------------------------------- */
1352 
1353  /* Accumulate i = 0 th column absolute values to get 'scale' */
1354  double in0_s0 = vec_in0;
1355 
1356  /* Store the first columns values in another vector */
1357  double vec_first_col = in0_s0;
1358 
1359  (*scale) += __abs(in0_s0);
1360 
1361  if ((*scale) > 0.0) {
1362  s2 = 0.0;
1363  /* normalize the 'vec_first_col' */
1364  vec_first_col = vec_first_col * (double) getRecip((*scale)); // / scale;
1365  double vec_first_col_sq = vec_first_col * vec_first_col;
1366  s2 = vec_first_col_sq;
1367 
1368  *s = getSqrt(s2);
1369  if (vec_first_col >= 0) {
1370  *s = -(*s);
1371  }
1372 
1373  half_norm_squared = vec_first_col * (*s) - s2;
1374  vec_first_col -= (*s);
1375 
1376  double vec_si = (double) 0.0;
1377 
1378  vec_si += (vec_first_col * vec_in0);
1379 
1380  vec_si = vec_si * (double) getRecip(half_norm_squared); // / half_norm_squared;
1381 
1382  vec_in0 += vec_si * vec_first_col;
1383  }
1384  vec_first_col *= (*scale);
1385  vec_in0 = vec_first_col;
1386 
1387  diag[5] = (*s) * (*scale);
1388 
1389  *s = 0.0;
1390  *scale = 0.0;
1391 
1392  U[0 + 0 * colUStride] = vec_in0;
1393  }
1394  return;
1395 }
1396 
1397 
1398 /* ********************************************************************************** */
1399 /* ********************************************************************************** */
1400 /* ************************* U UPDATE PROCESS FOR 6X6 **************************** */
1401 /* ********************************************************************************** */
1402 /* ********************************************************************************** */
1403 
1404 #if (__C7X_VEC_SIZE_BITS__ == 512)
1405 template <typename X> static inline X gen_plusOneVec();
1406 
1407 template <> inline c7x::float_vec gen_plusOneVec<c7x::float_vec>()
1408 {
1409  c7x::float_vec ret_vec = c7x::float_vec(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
1410  return ret_vec;
1411 }
1412 
1413 template <> inline c7x::double_vec gen_plusOneVec<c7x::double_vec>()
1414 {
1415  c7x::double_vec ret_vec = c7x::double_vec(1, 0, 0, 0, 0, 0, 0, 0);
1416  return ret_vec;
1417 }
1418 #elif (__C7X_VEC_SIZE_BITS__ == 256)
1419 template <typename X> static inline X gen_plusOneVec();
1420 
1421 template <> inline c7x::float_vec gen_plusOneVec<c7x::float_vec>()
1422 {
1423  c7x::float_vec ret_vec = c7x::float_vec(1, 0, 0, 0, 0, 0, 0, 0);
1424  return ret_vec;
1425 }
1426 
1427 template <> inline c7x::double_vec gen_plusOneVec<c7x::double_vec>()
1428 {
1429  c7x::double_vec ret_vec = c7x::double_vec(1, 0, 0, 0);
1430  return ret_vec;
1431 }
1432 
1433 
1434 #endif
1435 
1436 template <typename dataType>
1437 static inline void u_update_1st_iter(dataType *U, const int colUStride, dataType *diag, dataType s);
1438 template <> inline void u_update_1st_iter<float>(float *U, const int colUStride, float *diag, float s) { return; }
1439 template <> inline void u_update_1st_iter<double>(double *U, const int colUStride, double *diag, double s)
1440 {
1441  {
1442  typedef typename c7x::make_full_vector<double>::type vec;
1443 
1444  __vpred pred_1_elem = __create_vpred(0x00000000000000FFU);
1445  vec vec_in0 = (vec) 0;
1446  vec vec_mask_first_row = gen_plusOneVec<vec>();
1447  if (s != 0) {
1448  vec_in0 = __vload_pred(pred_1_elem, (vec *) U);
1449  vec_in0 = vec_in0 * (vec) getRecip(s) + vec_mask_first_row;
1450  __vstore_pred(pred_1_elem, (vec *) U, vec_in0);
1451  }
1452  else {
1453  __vstore_pred(pred_1_elem, (vec *) U, (vec_in0 + vec_mask_first_row));
1454  }
1455  }
1456  return;
1457 }
1458 
1459 template <typename dataType>
1460 static inline void u_update_2nd_iter(dataType *U, const int colUStride, dataType *diag, dataType s);
1461 template <> inline void u_update_2nd_iter<float>(float *U, const int colUStride, float *diag, float s) { return; }
1462 template <> inline void u_update_2nd_iter<double>(double *U, const int colUStride, double *diag, double s)
1463 {
1464  {
1465  typedef typename c7x::make_full_vector<double>::type vec;
1466 
1467  vec vec_mask_first_row = gen_plusOneVec<vec>();
1468  __vpred pred_2_elem = __create_vpred(0x000000000000FFFFU);
1469 
1470  vec vec_first_col = (vec) 0;
1471  vec vec_si = (vec) 0.0;
1472  vec vec_in0 = __vload_pred(pred_2_elem, (vec *) &U[0 + 0 * colUStride]);
1473  vec_first_col.s[0] = vec_in0.s[0];
1474  vec_in0 = vec_in0 * vec_mask_first_row;
1475  vec norm_factor = (vec) (vec_in0.s[0] * s);
1476  vec inv_norm_factor = getRecip(norm_factor);
1477 
1478  if (s != 0) {
1479  vec vec_in1 = __vload_pred(pred_2_elem, (vec *) &U[0 + 1 * colUStride]);
1480  vec_first_col.s[1] = vec_in1.s[0];
1481 
1482  vec_si += vec_first_col.s[1] * vec_in1;
1483 
1484  vec_si = vec_si * inv_norm_factor;
1485 
1486  vec_in0 += vec_si * vec_first_col.s[0];
1487  vec_in1 += vec_si * vec_first_col.s[1];
1488 
1489  vec_first_col = vec_first_col * (vec) getRecip(s);
1490 
1491  vec_in0.s[0] = vec_first_col.s[0];
1492  vec_in1.s[0] = vec_first_col.s[1];
1493 
1494  vec_in0 += vec_mask_first_row;
1495 
1496  __vstore_pred(pred_2_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1497  __vstore_pred(pred_2_elem, (vec *) &U[0 + 1 * colUStride], vec_in1);
1498  }
1499  else {
1500  vec_in0 += vec_mask_first_row;
1501  __vstore_pred(pred_2_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1502  U[0 + 1 * colUStride] = 0.0;
1503  }
1504  }
1505  return;
1506 }
1507 
1508 template <typename dataType>
1509 static inline void u_update_3rd_iter(dataType *U, const int colUStride, dataType *diag, dataType s);
1510 template <> inline void u_update_3rd_iter<float>(float *U, const int colUStride, float *diag, float s) { return; }
1511 template <> inline void u_update_3rd_iter<double>(double *U, const int colUStride, double *diag, double s)
1512 {
1513  {
1514  typedef typename c7x::make_full_vector<double>::type vec;
1515 
1516  vec vec_mask_first_row = gen_plusOneVec<vec>();
1517  __vpred pred_3_elem = __create_vpred(0x0000000000FFFFFFU);
1518 
1519  vec vec_first_col = (vec) 0;
1520  vec vec_si = (vec) 0.0;
1521  vec vec_in0 = __vload_pred(pred_3_elem, (vec *) &U[0 + 0 * colUStride]);
1522  vec_first_col.s[0] = vec_in0.s[0];
1523  vec_in0 = vec_in0 * vec_mask_first_row;
1524  vec norm_factor = (vec) (vec_in0.s[0] * s);
1525  vec inv_norm_factor = getRecip(norm_factor);
1526 
1527  if (s != 0) {
1528  vec vec_in1 = __vload_pred(pred_3_elem, (vec *) &U[0 + 1 * colUStride]);
1529  vec vec_in2 = __vload_pred(pred_3_elem, (vec *) &U[0 + 2 * colUStride]);
1530 
1531  vec_first_col.s[1] = vec_in1.s[0];
1532  vec_first_col.s[2] = vec_in2.s[0];
1533 
1534  vec_si += vec_first_col.s[1] * vec_in1;
1535  vec_si += vec_first_col.s[2] * vec_in2;
1536 
1537  vec_si = vec_si * inv_norm_factor;
1538 
1539  vec_in0 += vec_si * vec_first_col.s[0];
1540  vec_in1 += vec_si * vec_first_col.s[1];
1541  vec_in2 += vec_si * vec_first_col.s[2];
1542 
1543  vec_first_col = vec_first_col * (vec) getRecip(s);
1544 
1545  vec_in0.s[0] = vec_first_col.s[0];
1546  vec_in1.s[0] = vec_first_col.s[1];
1547  vec_in2.s[0] = vec_first_col.s[2];
1548 
1549  vec_in0 += vec_mask_first_row;
1550 
1551  __vstore_pred(pred_3_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1552  __vstore_pred(pred_3_elem, (vec *) &U[0 + 1 * colUStride], vec_in1);
1553  __vstore_pred(pred_3_elem, (vec *) &U[0 + 2 * colUStride], vec_in2);
1554  }
1555  else {
1556  vec_in0 += vec_mask_first_row;
1557  __vstore_pred(pred_3_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1558  U[0 + 1 * colUStride] = 0.0;
1559  U[0 + 2 * colUStride] = 0.0;
1560  }
1561  }
1562  return;
1563 }
1564 
1565 template <typename dataType>
1566 static inline void u_update_4th_iter(dataType *U, const int colUStride, dataType *diag, dataType s);
1567 template <> inline void u_update_4th_iter<float>(float *U, const int colUStride, float *diag, float s) { return; }
1568 template <> inline void u_update_4th_iter<double>(double *U, const int colUStride, double *diag, double s)
1569 {
1570  {
1571  typedef typename c7x::make_full_vector<double>::type vec;
1572 
1573  vec vec_mask_first_row = gen_plusOneVec<vec>();
1574  __vpred pred_4_elem = __create_vpred(0x00000000FFFFFFFFU);
1575 
1576  vec vec_first_col = (vec) 0;
1577  vec vec_si = (vec) 0.0;
1578  vec vec_in0 = __vload_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride]);
1579  vec_first_col.s[0] = vec_in0.s[0];
1580  vec_in0 = vec_in0 * vec_mask_first_row;
1581  vec norm_factor = (vec) (vec_in0.s[0] * s);
1582  vec inv_norm_factor = getRecip(norm_factor);
1583 
1584  if (s != 0) {
1585  vec vec_in1 = __vload_pred(pred_4_elem, (vec *) &U[0 + 1 * colUStride]);
1586  vec vec_in2 = __vload_pred(pred_4_elem, (vec *) &U[0 + 2 * colUStride]);
1587  vec vec_in3 = __vload_pred(pred_4_elem, (vec *) &U[0 + 3 * colUStride]);
1588 
1589  vec_first_col.s[1] = vec_in1.s[0];
1590  vec_first_col.s[2] = vec_in2.s[0];
1591  vec_first_col.s[3] = vec_in3.s[0];
1592 
1593  vec_si += vec_first_col.s[1] * vec_in1;
1594  vec_si += vec_first_col.s[2] * vec_in2;
1595  vec_si += vec_first_col.s[3] * vec_in3;
1596 
1597  vec_si = vec_si * inv_norm_factor;
1598 
1599  vec_in0 += vec_si * vec_first_col.s[0];
1600  vec_in1 += vec_si * vec_first_col.s[1];
1601  vec_in2 += vec_si * vec_first_col.s[2];
1602  vec_in3 += vec_si * vec_first_col.s[3];
1603 
1604  vec_first_col = vec_first_col * (vec) getRecip(s);
1605 
1606  vec_in0.s[0] = vec_first_col.s[0];
1607  vec_in1.s[0] = vec_first_col.s[1];
1608  vec_in2.s[0] = vec_first_col.s[2];
1609  vec_in3.s[0] = vec_first_col.s[3];
1610 
1611  vec_in0 += vec_mask_first_row;
1612 
1613  __vstore_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1614  __vstore_pred(pred_4_elem, (vec *) &U[0 + 1 * colUStride], vec_in1);
1615  __vstore_pred(pred_4_elem, (vec *) &U[0 + 2 * colUStride], vec_in2);
1616  __vstore_pred(pred_4_elem, (vec *) &U[0 + 3 * colUStride], vec_in3);
1617  }
1618  else {
1619  vec_in0 += vec_mask_first_row;
1620  __vstore_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1621  U[0 + 1 * colUStride] = 0.0;
1622  U[0 + 2 * colUStride] = 0.0;
1623  U[0 + 3 * colUStride] = 0.0;
1624  }
1625  }
1626  return;
1627 }
1628 
1629 template <typename dataType>
1630 static inline void u_update_5th_iter(dataType *U, const int colUStride, dataType *diag, dataType s);
1631 template <> inline void u_update_5th_iter<float>(float *U, const int colUStride, float *diag, float s) { return; }
1632 template <> inline void u_update_5th_iter<double>(double *U, const int colUStride, double *diag, double s)
1633 {
1634  {
1635  typedef typename c7x::make_full_vector<double>::type vec;
1636 
1637  vec vec_mask_first_row = gen_plusOneVec<vec>();
1638  __vpred pred_4_elem = __create_vpred(0x000000FFFFFFFFFFU);
1639 
1640  vec vec_first_col = (vec) 0;
1641  vec vec_si = (vec) 0.0;
1642  vec vec_in0 = __vload_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride]);
1643  vec_first_col.s[0] = vec_in0.s[0];
1644  vec_in0 = vec_in0 * vec_mask_first_row;
1645  vec norm_factor = (vec) (vec_in0.s[0] * s);
1646  vec inv_norm_factor = getRecip(norm_factor);
1647 
1648  if (s != 0) {
1649  vec vec_in1 = __vload_pred(pred_4_elem, (vec *) &U[0 + 1 * colUStride]);
1650  vec vec_in2 = __vload_pred(pred_4_elem, (vec *) &U[0 + 2 * colUStride]);
1651  vec vec_in3 = __vload_pred(pred_4_elem, (vec *) &U[0 + 3 * colUStride]);
1652  vec vec_in4 = __vload_pred(pred_4_elem, (vec *) &U[0 + 4 * colUStride]);
1653 
1654  vec_first_col.s[1] = vec_in1.s[0];
1655  vec_first_col.s[2] = vec_in2.s[0];
1656  vec_first_col.s[3] = vec_in3.s[0];
1657  vec_first_col.s[4] = vec_in4.s[0];
1658 
1659  vec_si += vec_first_col.s[1] * vec_in1;
1660  vec_si += vec_first_col.s[2] * vec_in2;
1661  vec_si += vec_first_col.s[3] * vec_in3;
1662  vec_si += vec_first_col.s[4] * vec_in4;
1663 
1664  vec_si = vec_si * inv_norm_factor;
1665 
1666  vec_in0 += vec_si * vec_first_col.s[0];
1667  vec_in1 += vec_si * vec_first_col.s[1];
1668  vec_in2 += vec_si * vec_first_col.s[2];
1669  vec_in3 += vec_si * vec_first_col.s[3];
1670  vec_in4 += vec_si * vec_first_col.s[4];
1671 
1672  vec_first_col = vec_first_col * (vec) getRecip(s);
1673 
1674  vec_in0.s[0] = vec_first_col.s[0];
1675  vec_in1.s[0] = vec_first_col.s[1];
1676  vec_in2.s[0] = vec_first_col.s[2];
1677  vec_in3.s[0] = vec_first_col.s[3];
1678  vec_in4.s[0] = vec_first_col.s[4];
1679 
1680  vec_in0 += vec_mask_first_row;
1681 
1682  __vstore_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1683  __vstore_pred(pred_4_elem, (vec *) &U[0 + 1 * colUStride], vec_in1);
1684  __vstore_pred(pred_4_elem, (vec *) &U[0 + 2 * colUStride], vec_in2);
1685  __vstore_pred(pred_4_elem, (vec *) &U[0 + 3 * colUStride], vec_in3);
1686  __vstore_pred(pred_4_elem, (vec *) &U[0 + 4 * colUStride], vec_in4);
1687  }
1688  else {
1689  vec_in0 += vec_mask_first_row;
1690  __vstore_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1691  U[0 + 1 * colUStride] = 0.0;
1692  U[0 + 2 * colUStride] = 0.0;
1693  U[0 + 3 * colUStride] = 0.0;
1694  U[0 + 4 * colUStride] = 0.0;
1695  }
1696  }
1697  return;
1698 }
1699 
1700 template <typename dataType>
1701 static inline void u_update_6th_iter(dataType *U, const int colUStride, dataType *diag, dataType s);
1702 template <> inline void u_update_6th_iter<float>(float *U, const int colUStride, float *diag, float s) { return; }
1703 template <> inline void u_update_6th_iter<double>(double *U, const int colUStride, double *diag, double s)
1704 {
1705 
1706  {
1707  typedef typename c7x::make_full_vector<double>::type vec;
1708 
1709  vec vec_mask_first_row = gen_plusOneVec<vec>();
1710  __vpred pred_4_elem = __create_vpred(0x0000FFFFFFFFFFFFU);
1711 
1712  vec vec_first_col = (vec) 0;
1713  vec vec_si = (vec) 0.0;
1714  vec vec_in0 = __vload_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride]);
1715  vec_first_col.s[0] = vec_in0.s[0];
1716  vec_in0 = vec_in0 * vec_mask_first_row;
1717  vec norm_factor = (vec) (vec_in0.s[0] * s);
1718  vec inv_norm_factor = getRecip(norm_factor);
1719 
1720  if (s != 0) {
1721  vec vec_in1 = __vload_pred(pred_4_elem, (vec *) &U[0 + 1 * colUStride]);
1722  vec vec_in2 = __vload_pred(pred_4_elem, (vec *) &U[0 + 2 * colUStride]);
1723  vec vec_in3 = __vload_pred(pred_4_elem, (vec *) &U[0 + 3 * colUStride]);
1724  vec vec_in4 = __vload_pred(pred_4_elem, (vec *) &U[0 + 4 * colUStride]);
1725  vec vec_in5 = __vload_pred(pred_4_elem, (vec *) &U[0 + 5 * colUStride]);
1726 
1727  vec_first_col.s[1] = vec_in1.s[0];
1728  vec_first_col.s[2] = vec_in2.s[0];
1729  vec_first_col.s[3] = vec_in3.s[0];
1730  vec_first_col.s[4] = vec_in4.s[0];
1731  vec_first_col.s[5] = vec_in5.s[0];
1732 
1733  vec_si += vec_first_col.s[1] * vec_in1;
1734  vec_si += vec_first_col.s[2] * vec_in2;
1735  vec_si += vec_first_col.s[3] * vec_in3;
1736  vec_si += vec_first_col.s[4] * vec_in4;
1737  vec_si += vec_first_col.s[5] * vec_in5;
1738 
1739  vec_si = vec_si * inv_norm_factor;
1740 
1741  vec_in0 += vec_si * vec_first_col.s[0];
1742  vec_in1 += vec_si * vec_first_col.s[1];
1743  vec_in2 += vec_si * vec_first_col.s[2];
1744  vec_in3 += vec_si * vec_first_col.s[3];
1745  vec_in4 += vec_si * vec_first_col.s[4];
1746  vec_in5 += vec_si * vec_first_col.s[5];
1747 
1748  vec_first_col = vec_first_col * (vec) getRecip(s);
1749 
1750  vec_in0.s[0] = vec_first_col.s[0];
1751  vec_in1.s[0] = vec_first_col.s[1];
1752  vec_in2.s[0] = vec_first_col.s[2];
1753  vec_in3.s[0] = vec_first_col.s[3];
1754  vec_in4.s[0] = vec_first_col.s[4];
1755  vec_in5.s[0] = vec_first_col.s[5];
1756 
1757  vec_in0 += vec_mask_first_row;
1758 
1759  __vstore_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1760  __vstore_pred(pred_4_elem, (vec *) &U[0 + 1 * colUStride], vec_in1);
1761  __vstore_pred(pred_4_elem, (vec *) &U[0 + 2 * colUStride], vec_in2);
1762  __vstore_pred(pred_4_elem, (vec *) &U[0 + 3 * colUStride], vec_in3);
1763  __vstore_pred(pred_4_elem, (vec *) &U[0 + 4 * colUStride], vec_in4);
1764  __vstore_pred(pred_4_elem, (vec *) &U[0 + 5 * colUStride], vec_in5);
1765  }
1766  else {
1767  vec_in0 += vec_mask_first_row;
1768  __vstore_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1769  U[0 + 1 * colUStride] = 0.0;
1770  U[0 + 2 * colUStride] = 0.0;
1771  U[0 + 3 * colUStride] = 0.0;
1772  U[0 + 4 * colUStride] = 0.0;
1773  U[0 + 5 * colUStride] = 0.0;
1774  }
1775  }
1776  return;
1777 }
1778 
1779 /* ********************************************************************************** */
1780 /* ********************************************************************************** */
1781 /* ************************* U UPDATE PROCESS FOR 6X7 REDUCE FORM ***************** */
1782 /* ********************************************************************************** */
1783 /* ********************************************************************************** */
1784 
1785 template <typename dataType>
1786 static inline void
1787 u_update_6X7_NR_1st_iter(dataType *U, const int colUStride, dataType *diag, dataType s, const uint32_t col_elems);
1788 template <>
1789 inline void
1790 u_update_6X7_NR_1st_iter<float>(float *U, const int colUStride, float *diag, float s, const uint32_t col_elems)
1791 {
1792  return;
1793 }
1794 template <>
1795 inline void
1796 u_update_6X7_NR_1st_iter<double>(double *U, const int colUStride, double *diag, double s, const uint32_t col_elems)
1797 {
1798  {
1799  typedef typename c7x::make_full_vector<double>::type vec;
1800 
1801  __vpred pred_1_elem = __mask_long(col_elems);
1802  vec vec_in0, vec_in1;
1803  vec vec_mask_first_row = gen_plusOneVec<vec>();
1804  if (s != 0) {
1805  vec recip_s = (vec) getRecip(s);
1806  vec_in0 = __vload_pred(pred_1_elem, (vec *) &U[0 + 0 * colUStride]);
1807  vec_in1 = __vload_pred(pred_1_elem, (vec *) &U[0 + 1 * colUStride]);
1808 
1809  vec_in0 = vec_in0 * recip_s + vec_mask_first_row;
1810  vec_in1 = vec_in1 * recip_s;
1811 
1812  __vstore_pred(pred_1_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1813  __vstore_pred(pred_1_elem, (vec *) &U[0 + 1 * colUStride], vec_in1);
1814  }
1815  else {
1816  __vstore_pred(pred_1_elem, (vec *) U, (vec_in0 + vec_mask_first_row));
1817  U[0 + 1 * colUStride] = 0.0;
1818  }
1819  }
1820  return;
1821 }
1822 
1823 template <typename dataType>
1824 static inline void
1825 u_update_6X7_NR_2nd_iter(dataType *U, const int colUStride, dataType *diag, dataType s, const uint32_t col_elems);
1826 template <>
1827 inline void
1828 u_update_6X7_NR_2nd_iter<float>(float *U, const int colUStride, float *diag, float s, const uint32_t col_elems)
1829 {
1830  return;
1831 }
1832 template <>
1833 inline void
1834 u_update_6X7_NR_2nd_iter<double>(double *U, const int colUStride, double *diag, double s, const uint32_t col_elems)
1835 {
1836 
1837  {
1838  typedef typename c7x::make_full_vector<double>::type vec;
1839 
1840  vec vec_mask_first_row = gen_plusOneVec<vec>();
1841  __vpred pred_2_elem = __mask_long(col_elems);
1842 
1843  vec vec_first_col = (vec) 0;
1844  vec vec_si = (vec) 0.0;
1845  vec vec_in0 = __vload_pred(pred_2_elem, (vec *) &U[0 + 0 * colUStride]);
1846  vec_first_col.s[0] = vec_in0.s[0];
1847  vec_in0 = vec_in0 * vec_mask_first_row;
1848  vec norm_factor = (vec) (vec_in0.s[0] * s);
1849  vec inv_norm_factor = getRecip(norm_factor);
1850 
1851  if (s != 0) {
1852  vec recip_s = (vec) getRecip(s);
1853  vec vec_in1 = __vload_pred(pred_2_elem, (vec *) &U[0 + 1 * colUStride]);
1854  vec vec_in2 = __vload_pred(pred_2_elem, (vec *) &U[0 + 2 * colUStride]);
1855 
1856  vec_first_col.s[1] = vec_in1.s[0];
1857  vec_first_col.s[2] = vec_in2.s[0];
1858 
1859  vec_si += vec_first_col.s[1] * vec_in1;
1860  vec_si += vec_first_col.s[2] * vec_in2;
1861 
1862  vec_si = vec_si * inv_norm_factor;
1863 
1864  vec_in0 += vec_si * vec_first_col.s[0];
1865  vec_in1 += vec_si * vec_first_col.s[1];
1866  vec_in2 += vec_si * vec_first_col.s[2];
1867 
1868  vec_first_col = vec_first_col * recip_s;
1869 
1870  vec_in0.s[0] = vec_first_col.s[0];
1871  vec_in1.s[0] = vec_first_col.s[1];
1872  vec_in2.s[0] = vec_first_col.s[2];
1873 
1874  vec_in0 += vec_mask_first_row;
1875 
1876  __vstore_pred(pred_2_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1877  __vstore_pred(pred_2_elem, (vec *) &U[0 + 1 * colUStride], vec_in1);
1878  __vstore_pred(pred_2_elem, (vec *) &U[0 + 2 * colUStride], vec_in2);
1879  }
1880  else {
1881  vec_in0 += vec_mask_first_row;
1882  __vstore_pred(pred_2_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1883  U[0 + 1 * colUStride] = 0.0;
1884  U[0 + 2 * colUStride] = 0.0;
1885  }
1886  }
1887  return;
1888 }
1889 
1890 template <typename dataType>
1891 static inline void
1892 u_update_6X7_NR_3rd_iter(dataType *U, const int colUStride, dataType *diag, dataType s, const uint32_t col_elems);
1893 template <>
1894 inline void
1895 u_update_6X7_NR_3rd_iter<float>(float *U, const int colUStride, float *diag, float s, const uint32_t col_elems)
1896 {
1897  return;
1898 }
1899 template <>
1900 inline void
1901 u_update_6X7_NR_3rd_iter<double>(double *U, const int colUStride, double *diag, double s, const uint32_t col_elems)
1902 {
1903  {
1904  typedef typename c7x::make_full_vector<double>::type vec;
1905 
1906  vec vec_mask_first_row = gen_plusOneVec<vec>();
1907  __vpred pred_3_elem = __mask_long(col_elems);
1908 
1909  vec vec_first_col = (vec) 0;
1910  vec vec_si = (vec) 0.0;
1911  vec vec_in0 = __vload_pred(pred_3_elem, (vec *) &U[0 + 0 * colUStride]);
1912  vec_first_col.s[0] = vec_in0.s[0];
1913  vec_in0 = vec_in0 * vec_mask_first_row;
1914  vec norm_factor = (vec) (vec_in0.s[0] * s);
1915  vec inv_norm_factor = getRecip(norm_factor);
1916 
1917  if (s != 0) {
1918  vec recip_s = (vec) getRecip(s);
1919  vec vec_in1 = __vload_pred(pred_3_elem, (vec *) &U[0 + 1 * colUStride]);
1920  vec vec_in2 = __vload_pred(pred_3_elem, (vec *) &U[0 + 2 * colUStride]);
1921  vec vec_in3 = __vload_pred(pred_3_elem, (vec *) &U[0 + 3 * colUStride]);
1922 
1923  vec_first_col.s[1] = vec_in1.s[0];
1924  vec_first_col.s[2] = vec_in2.s[0];
1925  vec_first_col.s[3] = vec_in3.s[0];
1926 
1927  vec_si += vec_first_col.s[1] * vec_in1;
1928  vec_si += vec_first_col.s[2] * vec_in2;
1929  vec_si += vec_first_col.s[3] * vec_in3;
1930 
1931  vec_si = vec_si * inv_norm_factor;
1932 
1933  vec_in0 += vec_si * vec_first_col.s[0];
1934  vec_in1 += vec_si * vec_first_col.s[1];
1935  vec_in2 += vec_si * vec_first_col.s[2];
1936  vec_in3 += vec_si * vec_first_col.s[3];
1937 
1938  vec_first_col = vec_first_col * recip_s;
1939 
1940  vec_in0.s[0] = vec_first_col.s[0];
1941  vec_in1.s[0] = vec_first_col.s[1];
1942  vec_in2.s[0] = vec_first_col.s[2];
1943  vec_in3.s[0] = vec_first_col.s[3];
1944 
1945  vec_in0 += vec_mask_first_row;
1946 
1947  __vstore_pred(pred_3_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1948  __vstore_pred(pred_3_elem, (vec *) &U[0 + 1 * colUStride], vec_in1);
1949  __vstore_pred(pred_3_elem, (vec *) &U[0 + 2 * colUStride], vec_in2);
1950  __vstore_pred(pred_3_elem, (vec *) &U[0 + 3 * colUStride], vec_in3);
1951  }
1952  else {
1953  vec_in0 += vec_mask_first_row;
1954  __vstore_pred(pred_3_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
1955  U[0 + 1 * colUStride] = 0.0;
1956  U[0 + 2 * colUStride] = 0.0;
1957  U[0 + 3 * colUStride] = 0.0;
1958  }
1959  }
1960  return;
1961 }
1962 
1963 template <typename dataType>
1964 static inline void
1965 u_update_6X7_NR_4th_iter(dataType *U, const int colUStride, dataType *diag, dataType s, const uint32_t col_elems);
1966 template <>
1967 inline void
1968 u_update_6X7_NR_4th_iter<float>(float *U, const int colUStride, float *diag, float s, const uint32_t col_elems)
1969 {
1970  return;
1971 }
1972 template <>
1973 inline void
1974 u_update_6X7_NR_4th_iter<double>(double *U, const int colUStride, double *diag, double s, const uint32_t col_elems)
1975 {
1976  {
1977  typedef typename c7x::make_full_vector<double>::type vec;
1978 
1979  vec vec_mask_first_row = gen_plusOneVec<vec>();
1980  __vpred pred_4_elem = __mask_long(col_elems);
1981 
1982  vec vec_first_col = (vec) 0;
1983  vec vec_si = (vec) 0.0;
1984  vec vec_in0 = __vload_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride]);
1985  vec_first_col.s[0] = vec_in0.s[0];
1986  vec_in0 = vec_in0 * vec_mask_first_row;
1987  vec norm_factor = (vec) (vec_in0.s[0] * s);
1988  vec inv_norm_factor = getRecip(norm_factor);
1989 
1990  if (s != 0) {
1991  vec recip_s = (vec) getRecip(s);
1992  vec vec_in1 = __vload_pred(pred_4_elem, (vec *) &U[0 + 1 * colUStride]);
1993  vec vec_in2 = __vload_pred(pred_4_elem, (vec *) &U[0 + 2 * colUStride]);
1994  vec vec_in3 = __vload_pred(pred_4_elem, (vec *) &U[0 + 3 * colUStride]);
1995  vec vec_in4 = __vload_pred(pred_4_elem, (vec *) &U[0 + 4 * colUStride]);
1996 
1997  vec_first_col.s[1] = vec_in1.s[0];
1998  vec_first_col.s[2] = vec_in2.s[0];
1999  vec_first_col.s[3] = vec_in3.s[0];
2000  vec_first_col.s[4] = vec_in4.s[0];
2001 
2002  vec_si += vec_first_col.s[1] * vec_in1;
2003  vec_si += vec_first_col.s[2] * vec_in2;
2004  vec_si += vec_first_col.s[3] * vec_in3;
2005  vec_si += vec_first_col.s[4] * vec_in4;
2006 
2007  vec_si = vec_si * inv_norm_factor;
2008 
2009  vec_in0 += vec_si * vec_first_col.s[0];
2010  vec_in1 += vec_si * vec_first_col.s[1];
2011  vec_in2 += vec_si * vec_first_col.s[2];
2012  vec_in3 += vec_si * vec_first_col.s[3];
2013  vec_in4 += vec_si * vec_first_col.s[4];
2014 
2015  vec_first_col = vec_first_col * recip_s;
2016 
2017  vec_in0.s[0] = vec_first_col.s[0];
2018  vec_in1.s[0] = vec_first_col.s[1];
2019  vec_in2.s[0] = vec_first_col.s[2];
2020  vec_in3.s[0] = vec_first_col.s[3];
2021  vec_in4.s[0] = vec_first_col.s[4];
2022 
2023  vec_in0 += vec_mask_first_row;
2024 
2025  __vstore_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
2026  __vstore_pred(pred_4_elem, (vec *) &U[0 + 1 * colUStride], vec_in1);
2027  __vstore_pred(pred_4_elem, (vec *) &U[0 + 2 * colUStride], vec_in2);
2028  __vstore_pred(pred_4_elem, (vec *) &U[0 + 3 * colUStride], vec_in3);
2029  __vstore_pred(pred_4_elem, (vec *) &U[0 + 4 * colUStride], vec_in4);
2030  }
2031  else {
2032  vec_in0 += vec_mask_first_row;
2033  __vstore_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
2034  U[0 + 1 * colUStride] = 0.0;
2035  U[0 + 2 * colUStride] = 0.0;
2036  U[0 + 3 * colUStride] = 0.0;
2037  U[0 + 4 * colUStride] = 0.0;
2038  }
2039  }
2040 
2041  return;
2042 }
2043 
2044 template <typename dataType>
2045 static inline void
2046 u_update_6X7_NR_5th_iter(dataType *U, const int colUStride, dataType *diag, dataType s, const uint32_t col_elems);
2047 template <>
2048 inline void
2049 u_update_6X7_NR_5th_iter<float>(float *U, const int colUStride, float *diag, float s, const uint32_t col_elems)
2050 {
2051  return;
2052 }
2053 template <>
2054 inline void
2055 u_update_6X7_NR_5th_iter<double>(double *U, const int colUStride, double *diag, double s, const uint32_t col_elems)
2056 {
2057 
2058  {
2059  typedef typename c7x::make_full_vector<double>::type vec;
2060 
2061  vec vec_mask_first_row = gen_plusOneVec<vec>();
2062  __vpred pred_4_elem = __mask_long(col_elems);
2063 
2064  vec vec_first_col = (vec) 0;
2065  vec vec_si = (vec) 0.0;
2066  vec vec_in0 = __vload_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride]);
2067  vec_first_col.s[0] = vec_in0.s[0];
2068  vec_in0 = vec_in0 * vec_mask_first_row;
2069  vec norm_factor = (vec) (vec_in0.s[0] * s);
2070  vec inv_norm_factor = getRecip(norm_factor);
2071 
2072  if (s != 0) {
2073  vec recip_s = (vec) getRecip(s);
2074  vec vec_in1 = __vload_pred(pred_4_elem, (vec *) &U[0 + 1 * colUStride]);
2075  vec vec_in2 = __vload_pred(pred_4_elem, (vec *) &U[0 + 2 * colUStride]);
2076  vec vec_in3 = __vload_pred(pred_4_elem, (vec *) &U[0 + 3 * colUStride]);
2077  vec vec_in4 = __vload_pred(pred_4_elem, (vec *) &U[0 + 4 * colUStride]);
2078  vec vec_in5 = __vload_pred(pred_4_elem, (vec *) &U[0 + 5 * colUStride]);
2079 
2080  vec_first_col.s[1] = vec_in1.s[0];
2081  vec_first_col.s[2] = vec_in2.s[0];
2082  vec_first_col.s[3] = vec_in3.s[0];
2083  vec_first_col.s[4] = vec_in4.s[0];
2084  vec_first_col.s[5] = vec_in5.s[0];
2085 
2086  vec_si += vec_first_col.s[1] * vec_in1;
2087  vec_si += vec_first_col.s[2] * vec_in2;
2088  vec_si += vec_first_col.s[3] * vec_in3;
2089  vec_si += vec_first_col.s[4] * vec_in4;
2090  vec_si += vec_first_col.s[5] * vec_in5;
2091 
2092  vec_si = vec_si * inv_norm_factor;
2093 
2094  vec_in0 += vec_si * vec_first_col.s[0];
2095  vec_in1 += vec_si * vec_first_col.s[1];
2096  vec_in2 += vec_si * vec_first_col.s[2];
2097  vec_in3 += vec_si * vec_first_col.s[3];
2098  vec_in4 += vec_si * vec_first_col.s[4];
2099  vec_in5 += vec_si * vec_first_col.s[5];
2100 
2101  vec_first_col = vec_first_col * recip_s;
2102 
2103  vec_in0.s[0] = vec_first_col.s[0];
2104  vec_in1.s[0] = vec_first_col.s[1];
2105  vec_in2.s[0] = vec_first_col.s[2];
2106  vec_in3.s[0] = vec_first_col.s[3];
2107  vec_in4.s[0] = vec_first_col.s[4];
2108  vec_in5.s[0] = vec_first_col.s[5];
2109 
2110  vec_in0 += vec_mask_first_row;
2111 
2112  __vstore_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
2113  __vstore_pred(pred_4_elem, (vec *) &U[0 + 1 * colUStride], vec_in1);
2114  __vstore_pred(pred_4_elem, (vec *) &U[0 + 2 * colUStride], vec_in2);
2115  __vstore_pred(pred_4_elem, (vec *) &U[0 + 3 * colUStride], vec_in3);
2116  __vstore_pred(pred_4_elem, (vec *) &U[0 + 4 * colUStride], vec_in4);
2117  __vstore_pred(pred_4_elem, (vec *) &U[0 + 5 * colUStride], vec_in5);
2118  }
2119  else {
2120  vec_in0 += vec_mask_first_row;
2121  __vstore_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
2122  U[0 + 1 * colUStride] = 0.0;
2123  U[0 + 2 * colUStride] = 0.0;
2124  U[0 + 3 * colUStride] = 0.0;
2125  U[0 + 4 * colUStride] = 0.0;
2126  U[0 + 5 * colUStride] = 0.0;
2127  }
2128  }
2129  return;
2130 }
2131 
2132 template <typename dataType>
2133 static inline void
2134 u_update_6X7_NR_6th_iter(dataType *U, const int colUStride, dataType *diag, dataType s, const uint32_t col_elems);
2135 template <>
2136 inline void
2137 u_update_6X7_NR_6th_iter<float>(float *U, const int colUStride, float *diag, float s, const uint32_t col_elems)
2138 {
2139  return;
2140 }
2141 template <>
2142 inline void
2143 u_update_6X7_NR_6th_iter<double>(double *U, const int colUStride, double *diag, double s, const uint32_t col_elems)
2144 {
2145  {
2146  typedef typename c7x::make_full_vector<double>::type vec;
2147 
2148  vec vec_mask_first_row = gen_plusOneVec<vec>();
2149  __vpred pred_4_elem = __mask_long(col_elems);
2150 
2151  vec vec_first_col = (vec) 0;
2152  vec vec_si = (vec) 0.0;
2153  vec vec_in0 = __vload_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride]);
2154  vec_first_col.s[0] = vec_in0.s[0];
2155  vec_in0 = vec_in0 * vec_mask_first_row;
2156  vec norm_factor = (vec) (vec_in0.s[0] * s);
2157  vec inv_norm_factor = getRecip(norm_factor);
2158 
2159  if (s != 0) {
2160  vec recip_s = (vec) getRecip(s);
2161  vec vec_in1 = __vload_pred(pred_4_elem, (vec *) &U[0 + 1 * colUStride]);
2162  vec vec_in2 = __vload_pred(pred_4_elem, (vec *) &U[0 + 2 * colUStride]);
2163  vec vec_in3 = __vload_pred(pred_4_elem, (vec *) &U[0 + 3 * colUStride]);
2164  vec vec_in4 = __vload_pred(pred_4_elem, (vec *) &U[0 + 4 * colUStride]);
2165  vec vec_in5 = __vload_pred(pred_4_elem, (vec *) &U[0 + 5 * colUStride]);
2166  vec vec_in6 = __vload_pred(pred_4_elem, (vec *) &U[0 + 6 * colUStride]);
2167 
2168  vec_first_col.s[1] = vec_in1.s[0];
2169  vec_first_col.s[2] = vec_in2.s[0];
2170  vec_first_col.s[3] = vec_in3.s[0];
2171  vec_first_col.s[4] = vec_in4.s[0];
2172  vec_first_col.s[5] = vec_in5.s[0];
2173  vec_first_col.s[6] = vec_in6.s[0];
2174 
2175  vec_si += vec_first_col.s[1] * vec_in1;
2176  vec_si += vec_first_col.s[2] * vec_in2;
2177  vec_si += vec_first_col.s[3] * vec_in3;
2178  vec_si += vec_first_col.s[4] * vec_in4;
2179  vec_si += vec_first_col.s[5] * vec_in5;
2180  vec_si += vec_first_col.s[6] * vec_in6;
2181 
2182  vec_si = vec_si * inv_norm_factor;
2183 
2184  vec_in0 += vec_si * vec_first_col.s[0];
2185  vec_in1 += vec_si * vec_first_col.s[1];
2186  vec_in2 += vec_si * vec_first_col.s[2];
2187  vec_in3 += vec_si * vec_first_col.s[3];
2188  vec_in4 += vec_si * vec_first_col.s[4];
2189  vec_in5 += vec_si * vec_first_col.s[5];
2190  vec_in6 += vec_si * vec_first_col.s[6];
2191 
2192  vec_first_col = vec_first_col * recip_s;
2193 
2194  vec_in0.s[0] = vec_first_col.s[0];
2195  vec_in1.s[0] = vec_first_col.s[1];
2196  vec_in2.s[0] = vec_first_col.s[2];
2197  vec_in3.s[0] = vec_first_col.s[3];
2198  vec_in4.s[0] = vec_first_col.s[4];
2199  vec_in5.s[0] = vec_first_col.s[5];
2200  vec_in6.s[0] = vec_first_col.s[6];
2201 
2202  vec_in0 += vec_mask_first_row;
2203 
2204  __vstore_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
2205  __vstore_pred(pred_4_elem, (vec *) &U[0 + 1 * colUStride], vec_in1);
2206  __vstore_pred(pred_4_elem, (vec *) &U[0 + 2 * colUStride], vec_in2);
2207  __vstore_pred(pred_4_elem, (vec *) &U[0 + 3 * colUStride], vec_in3);
2208  __vstore_pred(pred_4_elem, (vec *) &U[0 + 4 * colUStride], vec_in4);
2209  __vstore_pred(pred_4_elem, (vec *) &U[0 + 5 * colUStride], vec_in5);
2210  __vstore_pred(pred_4_elem, (vec *) &U[0 + 6 * colUStride], vec_in6);
2211  }
2212  else {
2213  vec_in0 += vec_mask_first_row;
2214  __vstore_pred(pred_4_elem, (vec *) &U[0 + 0 * colUStride], vec_in0);
2215  U[0 + 1 * colUStride] = 0.0;
2216  U[0 + 2 * colUStride] = 0.0;
2217  U[0 + 3 * colUStride] = 0.0;
2218  U[0 + 4 * colUStride] = 0.0;
2219  U[0 + 5 * colUStride] = 0.0;
2220  U[0 + 6 * colUStride] = 0.0;
2221  }
2222  }
2223  return;
2224 }
2225 
2226 template <typename dataType>
2227 static inline void
2228 u_update_6X7_R_1st_iter(dataType *U, const int colUStride, dataType *diag, dataType s, const uint32_t col_elems);
2229 template <>
2230 inline void
2231 u_update_6X7_R_1st_iter<float>(float *U, const int colUStride, float *diag, float s, const uint32_t col_elems)
2232 {
2233  return;
2234 }
2235 template <>
2236 inline void
2237 u_update_6X7_R_1st_iter<double>(double *U, const int colUStride, double *diag, double s, const uint32_t col_elems)
2238 {
2239  {
2240 
2241  double u_0_0 = U[0 + 0 * colUStride];
2242  double u_1_0 = U[0 + 1 * colUStride];
2243 
2244  double norm_factor = (u_0_0 * s);
2245  double inv_norm_factor = getRecip(norm_factor);
2246 
2247  if (s != 0) {
2248  double si = 0;
2249  double recip_s = getRecip(s);
2250 
2251  si = u_1_0 * U[1 + 1 * colUStride];
2252  si = si * inv_norm_factor;
2253  U[1 + 1 * colUStride] += si * u_1_0;
2254 
2255  U[1 + 0 * colUStride] = u_0_0 * u_1_0 * inv_norm_factor;
2256  U[1 + 1 * colUStride] = (u_1_0 * u_1_0 * inv_norm_factor) + 1;
2257 
2258  U[0 + 0 * colUStride] = (u_0_0 * recip_s) + 1;
2259  U[0 + 1 * colUStride] = u_1_0 * recip_s;
2260  }
2261  else {
2262  U[1 + 1 * colUStride] = 1;
2263  U[0 + 0 * colUStride] = 1;
2264  U[0 + 1 * colUStride] = 0;
2265  }
2266  }
2267  return;
2268 }
2269 
2270 #endif /* #if (__C7X_VEC_SIZE_BITS__ == 512) */
2271 
2272 /* ======================================================================== */
2273 /* End of file: DSPLIB_svd_small_u_process.h */
2274 /* ======================================================================== */
dataType getRecip(dataType value)
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_svd.
dataType getSqrt(dataType a)