59 #define MAX_ITERATION_COUNT 30
67 template <
typename dataType>
103 #if (__C7X_VEC_SIZE_BITS__ == 512)
105 template <
typename dataType>
106 static inline int DSPF_sp_convert_to_bidiag_ci(
const int Nrows,
112 const int colUStride,
113 const int colVStride,
114 uint32_t enableReducedForm)
125 u_process_1st_iter(&U[0 + 0 * colUStride], colUStride, diag, superdiag, &scale, &s, 6, 0);
126 u_process_2nd_iter(&U[1 + 1 * colUStride], colUStride, diag, superdiag, &scale, &s, 5, 1);
127 u_process_3rd_iter(&U[2 + 2 * colUStride], colUStride, diag, superdiag, &scale, &s, 4, 2);
128 u_process_4th_iter(&U[3 + 3 * colUStride], colUStride, diag, superdiag, &scale, &s, 3, 3);
129 u_process_5th_iter(&U[4 + 4 * colUStride], colUStride, diag, superdiag, &scale, &s, 2, 4);
130 u_process_6th_iter(&U[5 + 5 * colUStride], colUStride, diag, superdiag, &scale, &s, 1, 5);
133 u_process_0th_iter(&U[0 + 0 * colUStride], colUStride, diag, superdiag, &scale, &s, 6, 0);
134 u_process_1st_iter(&U[1 + 1 * colUStride], colUStride, diag, superdiag, &scale, &s, 5, 1);
135 u_process_2nd_iter(&U[2 + 2 * colUStride], colUStride, diag, superdiag, &scale, &s, 4, 2);
136 u_process_3rd_iter(&U[3 + 3 * colUStride], colUStride, diag, superdiag, &scale, &s, 3, 3);
137 u_process_4th_iter(&U[4 + 4 * colUStride], colUStride, diag, superdiag, &scale, &s, 2, 4);
138 u_process_5th_iter(&U[5 + 5 * colUStride], colUStride, diag, superdiag, &scale, &s, 1, 5);
142 int diag_index_V = (Ncols - 1) + (Ncols - 1) * colVStride;
144 s = superdiag[Ncols - 1];
145 int diag_index_U = (Ncols - 2) + (Ncols - 2) * colUStride;
146 diag_index_V = (Ncols - 2) + (Ncols - 2) * colVStride;
147 v_process_1st_iter(&U[diag_index_U], &V[diag_index_V], colUStride, colVStride, &s);
150 s = superdiag[Ncols - 2];
151 diag_index_U = (Ncols - 3) + (Ncols - 3) * colUStride;
152 diag_index_V = (Ncols - 3) + (Ncols - 3) * colVStride;
153 v_process_2nd_iter(&U[diag_index_U], &V[diag_index_V], colUStride, colVStride, &s);
156 s = superdiag[Ncols - 3];
157 diag_index_U = (Ncols - 4) + (Ncols - 4) * colUStride;
158 diag_index_V = (Ncols - 4) + (Ncols - 4) * colVStride;
159 v_process_3rd_iter(&U[diag_index_U], &V[diag_index_V], colUStride, colVStride, &s);
162 s = superdiag[Ncols - 4];
163 diag_index_U = (Ncols - 5) + (Ncols - 5) * colUStride;
164 diag_index_V = (Ncols - 5) + (Ncols - 5) * colVStride;
165 v_process_4th_iter(&U[diag_index_U], &V[diag_index_V], colUStride, colVStride, &s);
168 s = superdiag[Ncols - 5];
169 diag_index_U = (Ncols - 6) + (Ncols - 6) * colUStride;
170 diag_index_V = (Ncols - 6) + (Ncols - 6) * colVStride;
171 v_process_5th_iter(&U[diag_index_U], &V[diag_index_V], colUStride, colVStride, &s);
174 dataType *U_diag = &U[(Ncols - 1) + (Ncols - 1) * colUStride];
176 u_update_1st_iter(U_diag, colUStride, diag, s);
178 U_diag = &U[(Ncols - 2) + (Ncols - 2) * colUStride];
180 u_update_2nd_iter(U_diag, colUStride, diag, s);
182 U_diag = &U[(Ncols - 3) + (Ncols - 3) * colUStride];
184 u_update_3rd_iter(U_diag, colUStride, diag, s);
186 U_diag = &U[(Ncols - 4) + (Ncols - 4) * colUStride];
188 u_update_4th_iter(U_diag, colUStride, diag, s);
190 U_diag = &U[(Ncols - 5) + (Ncols - 5) * colUStride];
192 u_update_5th_iter(U_diag, colUStride, diag, s);
194 U_diag = &U[(Ncols - 6) + (Ncols - 6) * colUStride];
196 u_update_6th_iter(U_diag, colUStride, diag, s);
199 if (enableReducedForm == 0) {
200 dataType *U_diag = &U[(Ncols - 1) + (Ncols - 1) * colUStride];
202 u_update_6X7_R_1st_iter(U_diag, colUStride, diag, s, 2U);
204 U_diag = &U[(Ncols - 2) + (Ncols - 2) * colUStride];
206 u_update_6X7_NR_2nd_iter(U_diag, colUStride, diag, s, 3U);
208 U_diag = &U[(Ncols - 3) + (Ncols - 3) * colUStride];
210 u_update_6X7_NR_3rd_iter(U_diag, colUStride, diag, s, 4U);
212 U_diag = &U[(Ncols - 4) + (Ncols - 4) * colUStride];
214 u_update_6X7_NR_4th_iter(U_diag, colUStride, diag, s, 5U);
216 U_diag = &U[(Ncols - 5) + (Ncols - 5) * colUStride];
218 u_update_6X7_NR_5th_iter(U_diag, colUStride, diag, s, 6U);
220 U_diag = &U[(Ncols - 6) + (Ncols - 6) * colUStride];
222 u_update_6X7_NR_6th_iter(U_diag, colUStride, diag, s, 7U);
225 dataType *U_diag = &U[(Ncols - 1) + (Ncols - 1) * colUStride];
227 u_update_6X7_NR_1st_iter(U_diag, colUStride, diag, s, 1U);
229 U_diag = &U[(Ncols - 2) + (Ncols - 2) * colUStride];
231 u_update_6X7_NR_2nd_iter(U_diag, colUStride, diag, s, 2U);
233 U_diag = &U[(Ncols - 3) + (Ncols - 3) * colUStride];
235 u_update_6X7_NR_3rd_iter(U_diag, colUStride, diag, s, 3U);
237 U_diag = &U[(Ncols - 4) + (Ncols - 4) * colUStride];
239 u_update_6X7_NR_4th_iter(U_diag, colUStride, diag, s, 4U);
241 U_diag = &U[(Ncols - 5) + (Ncols - 5) * colUStride];
243 u_update_6X7_NR_5th_iter(U_diag, colUStride, diag, s, 5U);
245 U_diag = &U[(Ncols - 6) + (Ncols - 6) * colUStride];
247 u_update_6X7_NR_6th_iter(U_diag, colUStride, diag, s, 6U);
262 template int DSPF_sp_convert_to_bidiag_ci<double>(
const int Nrows,
268 const int colUStride,
269 const int colVStride,
270 uint32_t enableReducedForm);
272 template <
typename dataType>
273 static inline int DSPF_sp_bidiag_to_diag_ci(
const int Nrows,
279 const int colUStride,
280 const int colVStride,
281 uint32_t enableReducedForm)
285 int i, k, rotation_test, iter, total_iter;
286 dataType x, y, z, epsilon;
287 dataType c, s, f, g, h;
294 typedef typename c7x::make_full_vector<double>::type vec;
296 epsilon = set_epsilon(diag, superdiag);
298 for (k = Ncols - 1; k >= 0; k--) {
303 rotation_test = rotation_test_check(diag, superdiag, &m, epsilon, (uint32_t) (k + 1));
308 __vpred pred_Z = __mask_long((uint32_t) Nrows);
309 for (i = m; i <= k; i++) {
310 vec *ptr_Uy = (vec *) &U[(m - 1) * colVStride];
311 vec *ptr_Uz = (vec *) &U[i * colVStride];
314 vec vec_Uy = __vload_pred(pred_Z, ptr_Uy);
315 vec vec_Uz = __vload_pred(pred_Z, ptr_Uz);
317 f = s * superdiag[i];
318 superdiag[i] = c * superdiag[i];
319 #if !defined(ENABLE_LDRA_COVERAGE)
323 if (__abs(f) <= epsilon) {
331 double f_g_sq = f_sq + g_sq;
338 vec temp_Uy = vec_Uy * c + vec_Uz * s;
339 vec temp_Uz = -vec_Uy * s + vec_Uz * c;
342 __vstore_pred(pred_Z, ptr_Uy, temp_Uy);
343 __vstore_pred(pred_Z, ptr_Uz, temp_Uz);
351 __vpred pred_Z = __mask_long(6u);
352 vec *p_vec_Z = (vec *) (&V[k * colVStride]);
354 vec vec_Z = __vload_pred(pred_Z, p_vec_Z);
355 __vstore_pred(pred_Z, p_vec_Z, -vec_Z);
360 #if !defined(ENABLE_LDRA_COVERAGE)
371 dataType z_sq, x_sq, x_recip, y_sq, g_sq, h_sq, h_y, h_y_2, f_g_recip, f_sq, f_h_sq;
382 g = superdiag[k - 1];
392 f = ((y_sq - z_sq) + (g_sq - h_sq)) *
getRecip((h_y_2));
400 f = ((x_sq - z_sq) + h * y * f_g_recip - h_sq) * x_recip;
406 __vpred pred_Z = __mask_long((uint32_t) Nrows);
408 for (i = m + 1; i <= k; i++) {
409 vec *ptr_Vx = (vec *) &V[(i - 1) * colVStride];
410 vec *ptr_Vz = (vec *) &V[i * colVStride];
413 vec vec_Vx = __vload_pred(pred_Z, ptr_Vx);
414 vec vec_Vz = __vload_pred(pred_Z, ptr_Vz);
416 vec *ptr_Uy = (vec *) &U[(i - 1) * colUStride];
417 vec *ptr_Uz = (vec *) &U[i * colUStride];
420 vec vec_Uy = __vload_pred(pred_Z, ptr_Uy);
421 vec vec_Uz = __vload_pred(pred_Z, ptr_Uz);
428 f_h_sq = f_sq + h_sq;
431 superdiag[i - 1] = z * f_h_sq;
437 vec temp_Vx = vec_Vx * c + vec_Vz * s;
438 vec temp_Vz = -vec_Vx * s + vec_Vz * c;
441 __vstore_pred(pred_Z, ptr_Vx, temp_Vx);
442 __vstore_pred(pred_Z, ptr_Vz, temp_Vz);
449 f_h_sq = f_sq + h_sq;
453 diag[i - 1] = z * f_h_sq;
462 vec temp_Uy = vec_Uy * c + vec_Uz * s;
463 vec temp_Uz = -vec_Uy * s + vec_Uz * c;
466 __vstore_pred(pred_Z, ptr_Uy, temp_Uy);
467 __vstore_pred(pred_Z, ptr_Uz, temp_Uz);
482 template int DSPF_sp_bidiag_to_diag_ci<float>(
const int Nrows,
488 const int colUStride,
489 const int colVStride,
490 uint32_t enableReducedForm);
491 template int DSPF_sp_bidiag_to_diag_ci<double>(
const int Nrows,
497 const int colUStride,
498 const int colVStride,
499 uint32_t enableReducedForm);
501 template <
typename dataType>
502 static inline int DSPF_sp_sort_singular_values_ci(
const int Nrows,
506 dataType *singular_values,
507 const int colUStride,
508 const int colVStride,
509 uint32_t enableReducedForm)
512 sort_singular_vals(singular_values, U, V, Nrows, Ncols, colUStride, colVStride);
524 template int DSPF_sp_sort_singular_values_ci<double>(
const int Nrows,
528 double *singular_values,
529 const int colUStride,
530 const int colVStride,
531 uint32_t enableReducedForm);
533 template <
typename dataType>
543 const int32_t strideIn,
544 const int32_t strideU,
545 const int32_t strideV,
546 uint32_t enableReducedForm)
550 int row, col, Nrows1, Ncols1, status;
555 if (Nrows >= Ncols) {
563 int32_t dataSize =
sizeof(dataType);
564 int32_t colUStride = strideU / dataSize;
565 int32_t colVStride = strideV / dataSize;
566 int32_t colAStride = strideIn / dataSize;
568 if (Nrows >= Ncols) {
570 for (row = 0; row < Nrows1; row++) {
571 for (col = 0; col < Ncols1; col++) {
572 U[col + row * colUStride] = A[col + row * colAStride];
578 for (row = 0; row < Nrows1; row++) {
579 for (col = 0; col < Ncols1; col++) {
580 U[col + row * colUStride] = A[row + col * colAStride];
588 DSPF_sp_convert_to_bidiag_ci<dataType>(Nrows1, Ncols1, U, V, diag, superdiag, colUStride, colVStride,
595 transpose_vec_mat(V, Ncols, Ncols, colVStride);
596 transpose_vec_mat(U, Ncols, Ncols, colUStride);
597 status = DSPF_sp_bidiag_to_diag_ci<dataType>(Nrows1, Ncols1, U, V, diag, superdiag, colUStride, colVStride,
600 transpose_vec_mat(V, Ncols, Ncols, colVStride);
601 transpose_vec_mat(U, Ncols, Ncols, colUStride);
606 DSPF_sp_sort_singular_values_ci<dataType>(Nrows1, Ncols1, U, V, diag, colUStride, colVStride, enableReducedForm);
612 if (enableReducedForm == 0u) {
613 memcpy(U1, V,
sizeof(dataType) * Nrows * colVStride);
614 memcpy(V, U,
sizeof(dataType) * Ncols * colUStride);
615 memcpy(U, U1,
sizeof(dataType) * Nrows * colUStride);
618 memcpy(U1, V,
sizeof(dataType) * Ncols * colVStride);
619 memcpy(V, U,
sizeof(dataType) * Ncols * colUStride);
620 memcpy(U, U1,
sizeof(dataType) * Nrows * colUStride);
649 const int32_t strideIn,
650 const int32_t strideU,
651 const int32_t strideV,
652 uint32_t enableReducedForm);
656 template <
typename dataType>
661 void *restrict pDiag,
662 void *restrict pSuperDiag,
669 #if (__C7X_VEC_SIZE_BITS__ == 512)
671 uint32_t heightIn = pKerPrivArgs->
heightIn;
672 uint32_t widthIn = pKerPrivArgs->
widthIn;
673 int32_t strideIn = pKerPrivArgs->
strideIn;
674 int32_t strideU = pKerPrivArgs->
strideU;
675 int32_t strideV = pKerPrivArgs->
strideV;
679 dataType *pALocal = (dataType *) pA;
680 dataType *pULocal = (dataType *) pU;
681 dataType *pVLocal = (dataType *) pV;
682 dataType *pDiagLocal = (dataType *) pDiag;
683 dataType *pSuperDiagLocal = (dataType *) pSuperDiag;
684 dataType *pU1Local = (dataType *) pU1;
686 DSPLIB_DEBUGPRINTFN(0,
"pALocal: %p pOutLocal: %p widthIn: %d heightIn: %d\n", pALocal, pULocal, widthIn, heightIn);
688 #if !defined(ENABLE_LDRA_COVERAGE)
690 DSPF_sp_svd_ci<dataType>(pKerPrivArgs, heightIn, widthIn, pALocal, pULocal, pVLocal, pU1Local, pDiagLocal,
691 pSuperDiagLocal, strideIn, strideU, strideV, enableReducedForm);
693 if (svd_status < 0) {
697 DSPF_sp_svd_ci<dataType>(pKerPrivArgs, heightIn, widthIn, pALocal, pULocal, pVLocal, pU1Local, pDiagLocal,
698 pSuperDiagLocal, strideIn, strideU, strideV, enableReducedForm);
702 DSPLIB_DEBUGPRINTFN(0,
"%s\n",
"The code is only implemented for __C7X_VEC_SIZE_BITS__ == 512 ");
723 void *restrict pDiag,
724 void *restrict pSuperDiag,
dataType getRecipSqrt(dataType a)
dataType getRecip(dataType value)
DSPLIB_STATUS DSPLIB_svd_small_init_ci(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams2D_t *bufParamsU, const DSPLIB_bufParams2D_t *bufParamsV, const DSPLIB_bufParams1D_t *bufParamsDiag, const DSPLIB_bufParams1D_t *bufParamsSuperDiag, const DSPLIB_svd_small_InitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel.
template DSPLIB_STATUS DSPLIB_svd_small_init_ci< double >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams2D_t *bufParamsU, const DSPLIB_bufParams2D_t *bufParamsV, const DSPLIB_bufParams1D_t *bufParamsDiag, const DSPLIB_bufParams1D_t *bufParamsSuperDiag, const DSPLIB_svd_small_InitArgs *pKerInitArgs)
DSPLIB_STATUS DSPLIB_svd_small_exec_ci(DSPLIB_kernelHandle handle, void *restrict pA, void *restrict pU, void *restrict pV, void *restrict pDiag, void *restrict pSuperDiag, void *restrict pU1)
This function is the main execution function for the C7x implementation of the kernel.
template DSPLIB_STATUS DSPLIB_svd_small_exec_ci< double >(DSPLIB_kernelHandle handle, void *restrict pA, void *restrict pU, void *restrict pV, void *restrict pDiag, void *restrict pSuperDiag, void *restrict pU1)
#define MAX_ITERATION_COUNT
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_svd.
dataType getSqrt(dataType a)
#define DSPLIB_DEBUGPRINTFN(N, fmt,...)
DSPLIB_STATUS_NAME
The enumeration of all status codes.
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
@ DSPLIB_ERR_NOT_IMPLEMENTED
A structure for a 1 dimensional buffer descriptor.
A structure for a 2 dimensional buffer descriptor.
Structure containing the parameters to initialize the kernel.
Structure that is reserved for internal use by the kernel.
uint32_t enableReducedForm
Flag for enabling the calculation of the reduced form: enableReducedForm = 1 enables the reduced-form SVD calculation; enableReducedForm = 0 selects the non-reduced (full) form.
uint32_t strideU
Stride between rows of U matrix
uint32_t strideV
Stride between rows of V matrix
int32_t strideIn
Stride between rows of input data matrix
uint32_t widthIn
Width of input data matrix
uint32_t heightIn
Height of input data matrix