/*------------------------------------------------------------------------ * Check the factorization of the matrix A2 */ static int check_factorization(int N, PLASMA_Complex64_t *A1, PLASMA_Complex64_t *A2, int LDA, int uplo, double eps) { double Anorm, Rnorm; PLASMA_Complex64_t alpha; int info_factorization; int i,j; PLASMA_Complex64_t *Residual = (PLASMA_Complex64_t *)malloc(N*N*sizeof(PLASMA_Complex64_t)); PLASMA_Complex64_t *L1 = (PLASMA_Complex64_t *)malloc(N*N*sizeof(PLASMA_Complex64_t)); PLASMA_Complex64_t *L2 = (PLASMA_Complex64_t *)malloc(N*N*sizeof(PLASMA_Complex64_t)); double *work = (double *)malloc(N*sizeof(double)); memset((void*)L1, 0, N*N*sizeof(PLASMA_Complex64_t)); memset((void*)L2, 0, N*N*sizeof(PLASMA_Complex64_t)); alpha= 1.0; LAPACKE_zlacpy_work(LAPACK_COL_MAJOR,' ', N, N, A1, LDA, Residual, N); /* Dealing with L'L or U'U */ if (uplo == PlasmaUpper){ LAPACKE_zlacpy_work(LAPACK_COL_MAJOR,'u', N, N, A2, LDA, L1, N); LAPACKE_zlacpy_work(LAPACK_COL_MAJOR,'u', N, N, A2, LDA, L2, N); cblas_ztrmm(CblasColMajor, CblasLeft, CblasUpper, CblasConjTrans, CblasNonUnit, N, N, CBLAS_SADDR(alpha), L1, N, L2, N); } else{ LAPACKE_zlacpy_work(LAPACK_COL_MAJOR,'l', N, N, A2, LDA, L1, N); LAPACKE_zlacpy_work(LAPACK_COL_MAJOR,'l', N, N, A2, LDA, L2, N); cblas_ztrmm(CblasColMajor, CblasRight, CblasLower, CblasConjTrans, CblasNonUnit, N, N, CBLAS_SADDR(alpha), L1, N, L2, N); } /* Compute the Residual || A -L'L|| */ for (i = 0; i < N; i++) for (j = 0; j < N; j++) Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i]; BLAS_zge_norm( blas_colmajor, blas_inf_norm, N, N, Residual, N, &Rnorm ); BLAS_zge_norm( blas_colmajor, blas_inf_norm, N, N, A1, LDA, &Anorm ); printf("============\n"); printf("Checking the Cholesky Factorization \n"); printf("-- ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm/(Anorm*N*eps)); if ( isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0) ){ printf("-- Factorization is suspicious ! \n"); info_factorization = 1; } else{ printf("-- Factorization is CORRECT ! \n"); info_factorization = 0; } free(Residual); free(L1); free(L2); free(work); return info_factorization; }
/*------------------------------------------------------------------------ * Check the factorization of the matrix A2 */ static int check_factorization(int N, PLASMA_Complex64_t *A1, PLASMA_Complex64_t *A2, int LDA, int uplo, double eps) { PLASMA_Complex64_t alpha = 1.0; double Anorm, Rnorm, result; int info_factorization; int i,j; PLASMA_Complex64_t *Residual = (PLASMA_Complex64_t *)malloc(N*N*sizeof(PLASMA_Complex64_t)); PLASMA_Complex64_t *L1 = (PLASMA_Complex64_t *)malloc(N*N*sizeof(PLASMA_Complex64_t)); PLASMA_Complex64_t *L2 = (PLASMA_Complex64_t *)malloc(N*N*sizeof(PLASMA_Complex64_t)); double *work = (double *)malloc(N*sizeof(double)); memset((void*)L1, 0, N*N*sizeof(PLASMA_Complex64_t)); memset((void*)L2, 0, N*N*sizeof(PLASMA_Complex64_t)); LAPACKE_zlacpy_work(LAPACK_COL_MAJOR,' ', N, N, A1, LDA, Residual, N); /* Dealing with L'L or U'U */ LAPACKE_zlacpy_work(LAPACK_COL_MAJOR, lapack_const(uplo), N, N, A2, LDA, L1, N); LAPACKE_zlacpy_work(LAPACK_COL_MAJOR, lapack_const(uplo), N, N, A2, LDA, L2, N); if (uplo == PlasmaUpper) cblas_ztrmm(CblasColMajor, CblasLeft, (CBLAS_UPLO)uplo, CblasConjTrans, CblasNonUnit, N, N, CBLAS_SADDR(alpha), L1, N, L2, N); else cblas_ztrmm(CblasColMajor, CblasRight, (CBLAS_UPLO)uplo, CblasConjTrans, CblasNonUnit, N, N, CBLAS_SADDR(alpha), L1, N, L2, N); /* Compute the Residual || A - L'L|| */ for (i = 0; i < N; i++) for (j = 0; j < N; j++) Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i]; Rnorm = LAPACKE_zlange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), N, N, Residual, N, work); Anorm = LAPACKE_zlange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), N, N, A1, LDA, work); result = Rnorm / ( Anorm * N * eps ); printf("============\n"); printf("Checking the Cholesky Factorization \n"); printf("-- ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n", result); if ( isnan(result) || isinf(result) || (result > 60.0) ){ printf("-- Factorization is suspicious ! \n"); info_factorization = 1; } else{ printf("-- Factorization is CORRECT ! \n"); info_factorization = 0; } free(Residual); free(L1); free(L2); free(work); return info_factorization; }
lapack_int LAPACKE_zlacpy( int matrix_order, char uplo, lapack_int m, lapack_int n, const lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb ) { if( matrix_order != LAPACK_COL_MAJOR && matrix_order != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_zlacpy", -1 ); return -1; } #ifndef LAPACK_DISABLE_NAN_CHECK /* Optionally check input matrices for NaNs */ if( LAPACKE_zge_nancheck( matrix_order, m, n, a, lda ) ) { return -5; } #endif return LAPACKE_zlacpy_work( matrix_order, uplo, m, n, a, lda, b, ldb ); }
static int RunTest(int *iparam, double *dparam, real_Double_t *t_) { PLASMA_Complex64_t *A, *Acpy = NULL, *b = NULL, *x; real_Double_t t; int n = iparam[TIMING_N]; int nrhs = iparam[TIMING_NRHS]; int check = iparam[TIMING_CHECK]; int lda = n; int ldb = n; /* Allocate Data */ A = (PLASMA_Complex64_t *)malloc(lda*n* sizeof(PLASMA_Complex64_t)); x = (PLASMA_Complex64_t *)malloc(ldb*nrhs*sizeof(PLASMA_Complex64_t)); /* Check if unable to allocate memory */ if ( (!A) || (!x) ) { printf("Out of Memory \n "); exit(0); } /* Initialize Plasma */ PLASMA_Init( iparam[TIMING_THRDNBR] ); if ( iparam[TIMING_SCHEDULER] ) PLASMA_Set(PLASMA_SCHEDULING_MODE, PLASMA_DYNAMIC_SCHEDULING ); else PLASMA_Set(PLASMA_SCHEDULING_MODE, PLASMA_STATIC_SCHEDULING ); /*if ( !iparam[TIMING_AUTOTUNING] ) {*/ PLASMA_Disable(PLASMA_AUTOTUNING); PLASMA_Set(PLASMA_TILE_SIZE, iparam[TIMING_NB] ); /* } */ /* Initialiaze Data */ PLASMA_zplghe((double)n, n, A, lda, 51 ); LAPACKE_zlarnv_work(1, ISEED, n*nrhs, x); /* Save A and b */ if (check) { Acpy = (PLASMA_Complex64_t *)malloc(lda*n* sizeof(PLASMA_Complex64_t)); b = (PLASMA_Complex64_t *)malloc(ldb*nrhs*sizeof(PLASMA_Complex64_t)); LAPACKE_zlacpy_work(LAPACK_COL_MAJOR,' ', n, n, A, lda, Acpy, lda); LAPACKE_zlacpy_work(LAPACK_COL_MAJOR,' ', n, nrhs, x, ldb, b, ldb); } /* PLASMA ZPOSV */ t = -cWtime(); PLASMA_zposv(PlasmaUpper, n, nrhs, A, lda, x, ldb); t += cWtime(); *t_ = t; /* Check the solution */ if (check) { dparam[TIMING_RES] = z_check_solution(n, n, nrhs, Acpy, lda, b, x, ldb, &(dparam[TIMING_ANORM]), &(dparam[TIMING_BNORM]), &(dparam[TIMING_XNORM])); free(Acpy); free(b); } free(A); free(x); PLASMA_Finalize(); return 0; }
static int RunTest(int *iparam, double *dparam, real_Double_t *t_) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; PLASMA_Complex64_t *A, *A2 = NULL; real_Double_t t; int *ipiv, *ipiv2 = NULL; int i; int m = iparam[TIMING_N]; int n = iparam[TIMING_NRHS]; int check = iparam[TIMING_CHECK]; int lda = m; PLASMA_sequence *sequence = NULL; PLASMA_request request = PLASMA_REQUEST_INITIALIZER; /* Initialize Plasma */ PLASMA_Init( iparam[TIMING_THRDNBR] ); PLASMA_Set(PLASMA_SCHEDULING_MODE, PLASMA_DYNAMIC_SCHEDULING ); PLASMA_Disable(PLASMA_AUTOTUNING); PLASMA_Set(PLASMA_TILE_SIZE, iparam[TIMING_NB] ); PLASMA_Set(PLASMA_INNER_BLOCK_SIZE, iparam[TIMING_IB] ); /* Allocate Data */ A = (PLASMA_Complex64_t *)malloc(lda*n*sizeof(PLASMA_Complex64_t)); /* Check if unable to allocate memory */ if ( (! A) ) { printf("Out of Memory \n "); return -1; } /* Initialiaze Data */ LAPACKE_zlarnv_work(1, ISEED, lda*n, A); /* Allocate Workspace */ ipiv = (int *)malloc( n*sizeof(int) ); /* Save A in lapack layout for check */ if ( check ) { A2 = (PLASMA_Complex64_t *)malloc(lda*n*sizeof(PLASMA_Complex64_t)); ipiv2 = (int *)malloc( n*sizeof(int) ); LAPACKE_zlacpy_work(LAPACK_COL_MAJOR,' ', m, n, A, lda, A2, lda); LAPACKE_zgetrf_work(LAPACK_COL_MAJOR, m, n, A2, lda, ipiv2 ); } plasma = plasma_context_self(); PLASMA_Sequence_Create(&sequence); QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); QUARK_Task_Flag_Set(&task_flags, TASK_THREAD_COUNT, iparam[TIMING_THRDNBR] ); plasma_dynamic_spawn(); CORE_zgetrf_reclap_init(); t = -cWtime(); QUARK_CORE_zgetrf_reclap(plasma->quark, &task_flags, m, n, n, A, lda, ipiv, sequence, &request, 0, 0, iparam[TIMING_THRDNBR]); PLASMA_Sequence_Wait(sequence); t += cWtime(); *t_ = t; PLASMA_Sequence_Destroy(sequence); /* Check the solution */ if ( check ) { double *work = (double *)malloc(max(m,n)*sizeof(double)); /* Check ipiv */ for(i=0; i<n; i++) { if( ipiv[i] != ipiv2[i] ) { fprintf(stderr, "\nPLASMA (ipiv[%d] = %d, A[%d] = %e) / LAPACK (ipiv[%d] = %d, A[%d] = [%e])\n", i, ipiv[i], i, creal(A[ i * lda + i ]), i, ipiv2[i], i, creal(A2[ i * lda + i ])); break; } } dparam[TIMING_ANORM] = LAPACKE_zlange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaMaxNorm), m, n, A, lda, work); dparam[TIMING_XNORM] = LAPACKE_zlange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaMaxNorm), m, n, A2, lda, work); dparam[TIMING_BNORM] = 0.0; CORE_zaxpy( m, n, -1.0, A, lda, A2, lda); dparam[TIMING_RES] = LAPACKE_zlange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaMaxNorm), m, n, A2, lda, work); free( A2 ); free( ipiv2 ); free( work ); } free( A ); free( ipiv ); PLASMA_Finalize(); return 0; }
int CORE_zpltmg_chebvand( int M, int N, PLASMA_Complex64_t *A, int LDA, int gN, int m0, int n0, PLASMA_Complex64_t *W ) { PLASMA_Complex64_t step; int i, j, jj; /* Check input arguments */ if (M < 0) { coreblas_error(1, "Illegal value of M"); return -1; } if (N < 0) { coreblas_error(2, "Illegal value of N"); return -2; } if ((LDA < max(1,M)) && (M > 0)) { coreblas_error(4, "Illegal value of LDA"); return -4; } if (m0 < 0) { coreblas_error(6, "Illegal value of m0"); return -6; } if (n0 < 0) { coreblas_error(7, "Illegal value of n0"); return -7; } if (gN < n0+N) { coreblas_error(5, "Illegal value of gN"); return -5; } step = (PLASMA_Complex64_t)1. / (gN - 1.); /* Initialize W if required */ if (m0 == 0) { for (j=0, jj=n0; j<N; j++, jj++) { W[2*j ] = 1.; W[2*j+1] = jj * step; } if ( M == 1 ) { LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A', 1, N, W, 2, A, LDA ); return PLASMA_SUCCESS; } LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A', 2, N, W, 2, A, LDA ); M -= 2; A += 2; } /* Case NB=1, W contains row 0 and 1 and M should be 1 */ if (m0 == 1) { if (M != 1) { coreblas_error(1, "Illegal value of M for m0 = 1"); return -1; } LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A', 1, N, W+1, 2, A, LDA ); return PLASMA_SUCCESS; } for (j=0, jj=n0; j<N; j++, jj++) { /* First line */ if (M > 0) { A[LDA*j] = 2. * jj * step * W[j*2 + 1] - W[j*2 ]; } /* Second line */ if (M > 1) { A[LDA*j+1] = 2. * jj * step * A[LDA*j ] - W[j*2+1]; } for (i=2; i<M; i++) { A[LDA*j+i] = 2. * jj * step * A[LDA*j+i-1] - A[LDA*j+i-2]; } } if ( M == 1 ) { cblas_zcopy( N, W+1, 2, W, 2 ); cblas_zcopy( N, A+M-1, LDA, W+1, 2 ); } else { LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A', 2, N, A + M - 2, LDA, W, 2 ); } return PLASMA_SUCCESS; }
int testing_zhegst(int argc, char **argv) { /* Check for number of arguments*/ if (argc != 3) { USAGE("HEGST", "N LDA LDB", " - N : size of the matrices A and B\n" " - LDA : leading dimension of the matrix A\n" " - LDB : leading dimension of the matrix B\n"); return -1; } double eps = LAPACKE_dlamch_work('e'); int N = atoi(argv[0]); int LDA = atoi(argv[1]); int LDB = atoi(argv[2]); int info_transformation, info_factorization; int i, u; int LDAxN = LDA*N; int LDBxN = LDB*N; PLASMA_Complex64_t *A1 = (PLASMA_Complex64_t *)malloc(LDAxN*sizeof(PLASMA_Complex64_t)); PLASMA_Complex64_t *A2 = (PLASMA_Complex64_t *)malloc(LDAxN*sizeof(PLASMA_Complex64_t)); PLASMA_Complex64_t *B1 = (PLASMA_Complex64_t *)malloc(LDBxN*sizeof(PLASMA_Complex64_t)); PLASMA_Complex64_t *B2 = (PLASMA_Complex64_t *)malloc(LDBxN*sizeof(PLASMA_Complex64_t)); PLASMA_Complex64_t *Ainit = (PLASMA_Complex64_t *)malloc(LDAxN*sizeof(PLASMA_Complex64_t)); PLASMA_Complex64_t *Binit = (PLASMA_Complex64_t *)malloc(LDBxN*sizeof(PLASMA_Complex64_t)); /* Check if unable to allocate memory */ if ((!A1)||(!A2)||(!B1)||(!B2)||(!Ainit)||(!Binit)){ printf("Out of Memory \n "); return -2; } /*---------------------------------------------------------- * TESTING ZHEGST */ /* Initialize A1 and A2 */ PLASMA_zplghe(0., N, A1, LDA, 5198); LAPACKE_zlacpy_work(LAPACK_COL_MAJOR, 'A', N, N, A1, LDA, Ainit, LDA); /* Initialize B1 and B2 */ PLASMA_zplghe((double)N, N, B1, LDB, 4231); LAPACKE_zlacpy_work(LAPACK_COL_MAJOR, 'A', N, N, B1, LDB, Binit, LDB); printf("\n"); printf("------ TESTS FOR PLASMA ZHEGST ROUTINE ------- \n"); printf(" Size of the Matrix %d by %d\n", N, N); printf("\n"); printf(" The matrices A and B are randomly generated for each test.\n"); printf("============\n"); printf(" The relative machine precision (eps) is to be %e \n",eps); printf(" Computational tests pass if scaled residuals are less than 60.\n"); /*---------------------------------------------------------- * TESTING ZHEGST */ for (i=0; i<3; i++) { for (u=0; u<2; u++) { memcpy(A2, Ainit, LDAxN*sizeof(PLASMA_Complex64_t)); memcpy(B2, Binit, LDBxN*sizeof(PLASMA_Complex64_t)); PLASMA_zpotrf(uplo[u], N, B2, LDB); PLASMA_zhegst(itype[i], uplo[u], N, A2, LDA, B2, LDB); /* Check the Cholesky factorization and the transformation */ info_factorization = check_factorization(N, B1, B2, LDB, uplo[u], eps); info_transformation = check_transformation(itype[i], uplo[u], N, A1, A2, LDA, B2, LDB, eps); if ( (info_transformation == 0) && (info_factorization == 0) ) { printf("***************************************************\n"); printf(" ---- TESTING ZHEGST (%s, %s) ....... PASSED !\n", itypestr[i], uplostr[u]); printf("***************************************************\n"); } else { printf("************************************************\n"); printf(" - TESTING ZHEGST (%s, %s) ... FAILED !\n", itypestr[i], uplostr[u]); printf("************************************************\n"); } } } free(A1); free(A2); free(B1); free(B2); free(Ainit); free(Binit); return 0; }
static int check_transformation(int itype, int uplo, int N, PLASMA_Complex64_t *A1, PLASMA_Complex64_t *A2, int LDA, PLASMA_Complex64_t *B2, int LDB, double eps) { PLASMA_Complex64_t alpha = 1.0; double Anorm, Rnorm, result; int info_transformation; int i, j; char *str; PLASMA_Complex64_t *Residual = (PLASMA_Complex64_t *)malloc(N*N*sizeof(PLASMA_Complex64_t)); PLASMA_Complex64_t *Aorig = (PLASMA_Complex64_t *)malloc(N*N*sizeof(PLASMA_Complex64_t)); double *work = (double *)malloc(N*sizeof(double)); LAPACKE_zlacpy_work(LAPACK_COL_MAJOR, 'a', N, N, A1, LDA, Residual, N); LAPACKE_zlacpy_work(LAPACK_COL_MAJOR, lapack_const(uplo), N, N, A2, LDA, Aorig, N); /* Rebuild the symmetry of A2 */ if (uplo == PlasmaLower) { for (j = 0; j < N; j++) for (i = j+1; i < N; i++) Aorig[j+i*N] = conj(Aorig[i+j*N]); } else { for (i = 0; i < N; i++) for (j = i+1; j < N; j++) Aorig[j+i*N] = conj(Aorig[i+j*N]); } if (itype == 1) { if (uplo == PlasmaLower) { str = "L*A2*L'"; cblas_ztrmm(CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, N, N, CBLAS_SADDR(alpha), B2, LDB, Aorig, N); cblas_ztrmm(CblasColMajor, CblasRight, CblasLower, CblasConjTrans, CblasNonUnit, N, N, CBLAS_SADDR(alpha), B2, LDB, Aorig, N); } else{ str = "U'*A2*U"; cblas_ztrmm(CblasColMajor, CblasLeft, CblasUpper, CblasConjTrans, CblasNonUnit, N, N, CBLAS_SADDR(alpha), B2, LDB, Aorig, N); cblas_ztrmm(CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, N, N, CBLAS_SADDR(alpha), B2, LDB, Aorig, N); } } else { if (uplo == PlasmaLower) { str = "inv(L')*A2*inv(L)"; cblas_ztrsm(CblasColMajor, CblasLeft, CblasLower, CblasConjTrans, CblasNonUnit, N, N, CBLAS_SADDR(alpha), B2, LDB, Aorig, N); cblas_ztrsm(CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, N, N, CBLAS_SADDR(alpha), B2, LDB, Aorig, N); } else{ str = "inv(U)*A2*inv(U')"; cblas_ztrsm(CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, N, N, CBLAS_SADDR(alpha), B2, LDB, Aorig, N); cblas_ztrsm(CblasColMajor, CblasRight, CblasUpper, CblasConjTrans, CblasNonUnit, N, N, CBLAS_SADDR(alpha), B2, LDB, Aorig, N); } } /* Compute the Residual ( A1 - W ) */ for (i = 0; i < N; i++) for (j = 0; j < N; j++) Residual[j*N+i] = Aorig[j*N+i] - Residual[j*N+i]; Rnorm = LAPACKE_zlange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), N, N, Residual, N, work); Anorm = LAPACKE_zlange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), N, N, A1, LDA, work); result = Rnorm / (Anorm * N *eps); printf("============\n"); printf("Checking the global transformation \n"); printf("-- ||A1-%s||_oo/(||A1||_oo.N.eps) = %e \n", str, result ); if (isnan(result) || isinf(result) || (result > 60.0) ) { printf("-- Transformation is suspicious ! \n"); info_transformation = 1; } else { printf("-- Transformation is CORRECT ! \n"); info_transformation = 0; } free(Residual); free(Aorig); free(work); return info_transformation; }
int check_factorization(int M, int N, PLASMA_Complex64_t *A1, PLASMA_Complex64_t *A2, int LDA, PLASMA_Complex64_t *Q) { double Anorm, Rnorm; PLASMA_Complex64_t alpha, beta; int info_factorization; int i,j; double eps; eps = LAPACKE_dlamch_work('e'); PLASMA_Complex64_t *Ql = (PLASMA_Complex64_t *)malloc(M*N*sizeof(PLASMA_Complex64_t)); PLASMA_Complex64_t *Residual = (PLASMA_Complex64_t *)malloc(M*N*sizeof(PLASMA_Complex64_t)); double *work = (double *)malloc(max(M,N)*sizeof(double)); alpha=1.0; beta=0.0; if (M >= N) { /* Extract the R */ PLASMA_Complex64_t *R = (PLASMA_Complex64_t *)malloc(N*N*sizeof(PLASMA_Complex64_t)); memset((void*)R, 0, N*N*sizeof(PLASMA_Complex64_t)); LAPACKE_zlacpy_work(LAPACK_COL_MAJOR,'u', M, N, A2, LDA, R, N); /* Perform Ql=Q*R */ memset((void*)Ql, 0, M*N*sizeof(PLASMA_Complex64_t)); cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M, N, N, CBLAS_SADDR(alpha), Q, LDA, R, N, CBLAS_SADDR(beta), Ql, M); free(R); } else { /* Extract the L */ PLASMA_Complex64_t *L = (PLASMA_Complex64_t *)malloc(M*M*sizeof(PLASMA_Complex64_t)); memset((void*)L, 0, M*M*sizeof(PLASMA_Complex64_t)); LAPACKE_zlacpy_work(LAPACK_COL_MAJOR,'l', M, N, A2, LDA, L, M); /* Perform Ql=LQ */ memset((void*)Ql, 0, M*N*sizeof(PLASMA_Complex64_t)); cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M, N, M, CBLAS_SADDR(alpha), L, M, Q, LDA, CBLAS_SADDR(beta), Ql, M); free(L); } /* Compute the Residual */ for (i = 0; i < M; i++) for (j = 0 ; j < N; j++) Residual[j*M+i] = A1[j*LDA+i]-Ql[j*M+i]; Rnorm = LAPACKE_zlange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), M, N, Residual, M, work); Anorm = LAPACKE_zlange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), M, N, A2, LDA, work); if (M >= N) { printf("============\n"); printf("Checking the QR Factorization \n"); printf("-- ||A-QR||_oo/(||A||_oo.N.eps) = %e \n",Rnorm/(Anorm*N*eps)); } else { printf("============\n"); printf("Checking the LQ Factorization \n"); printf("-- ||A-LQ||_oo/(||A||_oo.N.eps) = %e \n",Rnorm/(Anorm*N*eps)); } if (isnan(Rnorm / (Anorm * N *eps)) || (Rnorm / (Anorm * N * eps) > 10.0) ) { printf("-- Factorization is suspicious ! \n"); info_factorization = 1; } else { printf("-- Factorization is CORRECT ! \n"); info_factorization = 0; } free(work); free(Ql); free(Residual); return info_factorization; }
int CORE_zttrfb(int side, int trans, int direct, int storev, int M1, int N1, int M2, int N2, int K, PLASMA_Complex64_t *A1, int LDA1, PLASMA_Complex64_t *A2, int LDA2, PLASMA_Complex64_t *V, int LDV, PLASMA_Complex64_t *T, int LDT, PLASMA_Complex64_t *WORK, int LDWORK) { static PLASMA_Complex64_t zone = 1.0; static PLASMA_Complex64_t mzone = -1.0; int j, vi; /* Check input arguments */ if (M1 < 0) { coreblas_error(5, "Illegal value of M1"); return -5; } if (N1 < 0) { coreblas_error(6, "Illegal value of N1"); return -6; } if ((M2 < 0) || ( (side == PlasmaRight) && (M1 != M2) ) ) { coreblas_error(7, "Illegal value of M2"); return -7; } if ((N2 < 0) || ( (side == PlasmaLeft) && (N1 != N2) ) ) { coreblas_error(8, "Illegal value of N2"); return -8; } if (K < 0) { coreblas_error(9, "Illegal value of K"); return -9; } /* Quick return */ if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0)) return PLASMA_SUCCESS; if (storev == PlasmaColumnwise) { if (direct == PlasmaForward) { /* * Let V = ( V1 ) (first K rows) * ( V2 ) * where V2 is non-unit upper triangular */ if (side == PlasmaLeft) { /* * Colwise / Forward / Left * ------------------------- * * Form H * A or H' * A where A = ( A1 ) * ( A2 ) * where A2 = ( A2_1 ) * ( A2_2 ) */ /* * W = A1 + V' * A2 */ /* * W = A2_2 */ LAPACKE_zlacpy_work(LAPACK_COL_MAJOR, lapack_const(PlasmaUpperLower), K, N2, &A2[M2-K], LDA2, WORK, LDWORK); /* * W = V2' * A2_2 */ cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasConjTrans, CblasNonUnit, K, N2, CBLAS_SADDR(zone), &V[M2-K], LDV, WORK, LDWORK); if (M2 > K) { /* * W = W + V1' * A2_1 */ cblas_zgemm( CblasColMajor, CblasConjTrans, CblasNoTrans, K, N2, M2-K, CBLAS_SADDR(zone), V, LDV, A2, LDA2, CBLAS_SADDR(zone), WORK, LDWORK); } /* * W = A1 + W */ for(j = 0; j < N1; j++) { cblas_zaxpy(K, CBLAS_SADDR(zone), &A1[LDA1*j], 1, &WORK[LDWORK*j], 1); } /* * A2 = A2 - V * T * W -> W = T * W, A2 = A2 - V * W */ /* * W = T * W */ cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, (CBLAS_TRANSPOSE)trans, CblasNonUnit, K, N2, CBLAS_SADDR(zone), T, LDT, WORK, LDWORK); /* * A1 = A1 - W */ for(j = 0; j < N1; j++) { cblas_zaxpy(K, CBLAS_SADDR(mzone), &WORK[LDWORK*j], 1, &A1[LDA1*j], 1); } /* * A2_1 = A2_1 - V1 * W */ if (M2 > K) { cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, M2-K, N2, K, CBLAS_SADDR(mzone), V, LDV, WORK, LDWORK, CBLAS_SADDR(zone), A2, LDA2); } /* * W = - V2 * W */ cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, K, N2, CBLAS_SADDR(mzone), &V[M2-K], LDV, WORK, LDWORK); /* * A2_2 = A2_2 + W */ for(j = 0; j < N2; j++) { cblas_zaxpy( K, CBLAS_SADDR(zone), &WORK[LDWORK*j], 1, &A2[LDA2*j+(M2-K)], 1); } } else { /* * Colwise / Forward / Right * ------------------------- * * Form H * A or H' * A where: * * A = ( A1 A2 ) * * A2 = ( A2_1 : A2_2 ) * * A2_1 is M2 x (M2-K) * A2_2 is M2 x K * * V = ( V_1 ) * ( V_2 ) * * V_1 is full and (N2-K) x K * V_2 is upper triangular and K x K */ /* * W = ( A1 + A2_1*V_1 + A2_2*V_2 ) * op(T) * * W is M x K * A1 is M x K * A2 is M x N2 split as (A2_1 A2_2) such as * A2_1 is (N2-K) x K * A2_2 is M x K */ /* W = A2_2 */ LAPACKE_zlacpy_work(LAPACK_COL_MAJOR, lapack_const(PlasmaUpperLower), M2, K, &A2[LDA2*(N2-K)], LDA2, WORK, LDWORK); /* W = W * V_2 --> W = A2_2 * V_2 */ cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, M2, K, CBLAS_SADDR(zone), &V[N2-K], LDV, WORK, LDWORK); /* W = W + A2_1 * V_1 */ if (N2 > K) { cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, M2, K, N2-K, CBLAS_SADDR(zone), A2, LDA2, V, LDV, CBLAS_SADDR(zone), WORK, LDWORK); } /* W = A1 + W */ for (j = 0; j < K; j++) { cblas_zaxpy(M1, CBLAS_SADDR(zone), &A1[LDA1*j], 1, &WORK[LDWORK*j], 1); } /* W = W * T --> ( A1 + A2_1*V_1 + A2_2*V_2 ) * op(T) */ cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, (CBLAS_TRANSPOSE)trans, CblasNonUnit, M2, K, CBLAS_SADDR(zone), T, LDT, WORK, LDWORK); /* * A1 = A1 - W */ for(j = 0; j < K; j++) { cblas_zaxpy(M1, CBLAS_SADDR(mzone), &WORK[LDWORK*j], 1, &A1[LDA1*j], 1); } /* * A2 = A2 - W * V' --> A2 - W*V_1' - W*V_2' */ /* A2 = A2 - W * V_1' */ if (N2 > K) { cblas_zgemm( CblasColMajor, CblasNoTrans, CblasConjTrans, M2, N2-K, K, CBLAS_SADDR(mzone), WORK, LDWORK, V, LDV, CBLAS_SADDR(zone), A2, LDA2); } /* A2 = A2 - W * V_2' */ cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasConjTrans, CblasNonUnit, M2, K, CBLAS_SADDR(mzone), &V[N2-K], LDV, WORK, LDWORK); for(j = 0; j < K; j++) { cblas_zaxpy( M2, CBLAS_SADDR(zone), &WORK[LDWORK*j], 1, &A2[LDA2*(j+N2-K)], 1); } } } else { coreblas_error(3, "Not implemented (ColWise / Backward / Left or Right)"); return PLASMA_ERR_NOT_SUPPORTED; } } else { /* * Rowwise */ if (direct == PlasmaForward) { /* * Let V = ( V1 V2 ) (V1: first K cols) * * where V2 is non-unit lower triangular */ if (side == PlasmaLeft) { /* * Rowwise / Forward / Left * ------------------------- * * Form H * A or H' * A where A = ( A1 ) * ( A2 ) * where A2 = ( A2_1 ) * ( A2_2 ) */ /* V_2 first element */ vi = LDV*(M2-K); /* * W = A1 + V * A2 */ /* * W = A2_2 */ LAPACKE_zlacpy_work(LAPACK_COL_MAJOR, lapack_const(PlasmaUpperLower), K, N2, &A2[M2-K], LDA2, WORK, LDWORK); /* * W = V2 * A2_2 */ //**DB CblasColMajor, CblasLeft, CblasUpper, cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, K, N2, CBLAS_SADDR(zone), &V[vi], LDV, WORK, LDWORK); if (M2 > K) { /* * W = W + V1 * A2_1 */ cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, K, N2, M2-K, CBLAS_SADDR(zone), V, LDV, A2, LDA2, CBLAS_SADDR(zone), WORK, LDWORK); } /* * W = A1 + W */ for(j = 0; j < N1; j++) { cblas_zaxpy( K, CBLAS_SADDR(zone), &A1[LDA1*j], 1, &WORK[LDWORK*j], 1); } /* * W = T * W */ cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, (CBLAS_TRANSPOSE)trans, CblasNonUnit, K, N2, CBLAS_SADDR(zone), T, LDT, WORK, LDWORK); /* * A1 = A1 - W */ for(j = 0; j < N1; j++) { cblas_zaxpy( K, CBLAS_SADDR(mzone), &WORK[LDWORK*j], 1, &A1[LDA1*j], 1); } /* * A2 = A2 - V' * T * W -> A2 = A2 - V' * W */ /* * A2_1 = A2_1 - V1' * W */ if (M2 > K) { cblas_zgemm( CblasColMajor, CblasConjTrans, CblasNoTrans, M2-K, N2, K, CBLAS_SADDR(mzone), V, LDV, WORK, LDWORK, CBLAS_SADDR(zone), A2, LDA2); } /* * W = - V2' * W */ cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasConjTrans, CblasNonUnit, K, N2, CBLAS_SADDR(mzone), &V[vi], LDV, WORK, LDWORK); /* * A2_2 = A2_2 + W */ for(j = 0; j < N2; j++) { cblas_zaxpy( K, CBLAS_SADDR(zone), &WORK[LDWORK*j], 1, &A2[LDA2*j+(M2-K)], 1); } } else { /* * Rowwise / Forward / Right * ------------------------- * * Form H * A or H' * A where: * * A = ( A1 A2 ) * * A2 = ( A2_1 : A2_2 ) * * A2_1 is M2 x (M2-K) * A2_2 is M2 x K * * V = ( V_1 ) * ( V_2 ) * * V_1 is full and (N2-K) x K * V_2 is lower triangular and K x K */ /* * W = ( A1 + A2_1*V_1 + A2_2*V_2 ) * op(T) * * W is M x K * A1 is M x K * A2 is M x N2 split as (A2_1 A2_2) such as * A2_1 is (N2-K) x K * A2_2 is M x K */ /* V_2 and A2_2 first element */ vi = LDV*(N2-K); /* W = A2_2 */ LAPACKE_zlacpy_work(LAPACK_COL_MAJOR, lapack_const(PlasmaUpperLower), M2, K, &A2[LDA2*(N2-K)], LDA2, WORK, LDWORK); /* W = W * V_2' --> W = A2_2 * V_2' */ cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasConjTrans, CblasNonUnit, M2, K, CBLAS_SADDR(zone), &V[vi], LDV, WORK, LDWORK); /* W = W + A2_1 * V_1' */ if (N2 > K) { cblas_zgemm( CblasColMajor, CblasNoTrans, CblasConjTrans, M2, K, N2-K, CBLAS_SADDR(zone), A2, LDA2, V, LDV, CBLAS_SADDR(zone), WORK, LDWORK); } /* W = A1 + W */ for (j = 0; j < K; j++) { cblas_zaxpy(M1, CBLAS_SADDR(zone), &A1[LDA1*j], 1, &WORK[LDWORK*j], 1); } /* W = W * op(T) --> ( A1 + A2_1*V_1 + A2_2*V_2 ) * op(T) */ cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, (CBLAS_TRANSPOSE)trans, CblasNonUnit, M2, K, CBLAS_SADDR(zone), T, LDT, WORK, LDWORK); /* * A1 = A1 - W */ for(j = 0; j < K; j++) { cblas_zaxpy(M1, CBLAS_SADDR(mzone), &WORK[LDWORK*j], 1, &A1[LDA1*j], 1); } /* * A2 = A2 - W * V --> A2 - W*V_1 - W*V_2 */ /* A2 = A2 - W * V_1 */ if (N2 > K) { cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, M2, N2-K, K, CBLAS_SADDR(mzone), WORK, LDWORK, V, LDV, CBLAS_SADDR(zone), A2, LDA2); } /* A2 = A2 - W * V_2 */ cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, M2, K, CBLAS_SADDR(mzone), &V[vi], LDV, WORK, LDWORK); for(j = 0; j < K; j++) { cblas_zaxpy( M2, CBLAS_SADDR(zone), &WORK[LDWORK*j], 1, &A2[LDA2*(N2-K+j)], 1); } } } else { coreblas_error(3, "Not implemented (Rowwise / Backward / Left or Right)"); return PLASMA_ERR_NOT_SUPPORTED; } } return PLASMA_SUCCESS; }