int check_solution(int M, int N, int NRHS, PLASMA_Complex32_t *A1, int LDA, PLASMA_Complex32_t *B1, PLASMA_Complex32_t *B2, int LDB) { int info_solution; float Rnorm, Anorm, Xnorm, Bnorm; PLASMA_Complex32_t alpha, beta; float *work = (float *)malloc(max(M, N)* sizeof(float)); float eps; eps = LAPACKE_slamch_work('e'); alpha = 1.0; beta = -1.0; Anorm = LAPACKE_clange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), M, N, A1, LDA, work); Xnorm = LAPACKE_clange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), M, NRHS, B2, LDB, work); Bnorm = LAPACKE_clange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), N, NRHS, B1, LDB, work); cblas_cgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M, NRHS, N, CBLAS_SADDR(alpha), A1, LDA, B2, LDB, CBLAS_SADDR(beta), B1, LDB); if (M >= N) { PLASMA_Complex32_t *Residual = (PLASMA_Complex32_t *)malloc(M*NRHS*sizeof(PLASMA_Complex32_t)); memset((void*)Residual, 0, M*NRHS*sizeof(PLASMA_Complex32_t)); cblas_cgemm(CblasColMajor, CblasConjTrans, CblasNoTrans, N, NRHS, M, CBLAS_SADDR(alpha), A1, LDA, B1, LDB, CBLAS_SADDR(beta), Residual, M); Rnorm = LAPACKE_clange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), M, NRHS, Residual, M, work); free(Residual); } else { PLASMA_Complex32_t *Residual = (PLASMA_Complex32_t *)malloc(N*NRHS*sizeof(PLASMA_Complex32_t)); memset((void*)Residual, 0, N*NRHS*sizeof(PLASMA_Complex32_t)); cblas_cgemm(CblasColMajor, CblasConjTrans, CblasNoTrans, N, NRHS, M, CBLAS_SADDR(alpha), A1, LDA, B1, LDB, CBLAS_SADDR(beta), Residual, N); Rnorm = LAPACKE_clange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), N, NRHS, Residual, N, work); free(Residual); } printf("============\n"); printf("Checking the Residual of the solution \n"); printf("-- ||Ax-B||_oo/((||A||_oo||x||_oo+||B||)_oo.N.eps) = %e \n",Rnorm/((Anorm*Xnorm+Bnorm)*N*eps)); if (isnan(Rnorm / ((Anorm * Xnorm + Bnorm) * N * eps)) || (Rnorm / ((Anorm * Xnorm + Bnorm) * N * eps) > 10.0) ) { printf("-- The solution is suspicious ! \n"); info_solution = 1; } else { printf("-- The solution is CORRECT ! \n"); info_solution= 0 ; } free(work); return info_solution; }
static int check_solution(int M, int N, int NRHS, PLASMA_Complex32_t *A1, int LDA, PLASMA_Complex32_t *B1, PLASMA_Complex32_t *B2, int LDB, float eps) { int info_solution; float Rnorm, Anorm, Xnorm, Bnorm; PLASMA_Complex32_t alpha, beta; float result; float *work = (float *)malloc(max(M, N)* sizeof(float)); alpha = 1.0; beta = -1.0; BLAS_cge_norm( blas_colmajor, blas_inf_norm, M, N, A1, LDA, &Anorm ); BLAS_cge_norm( blas_colmajor, blas_inf_norm, M, NRHS, B2, LDB, &Xnorm ); BLAS_cge_norm( blas_colmajor, blas_inf_norm, N, NRHS, B1, LDB, &Bnorm ); cblas_cgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M, NRHS, N, CBLAS_SADDR(alpha), A1, LDA, B2, LDB, CBLAS_SADDR(beta), B1, LDB); if (M >= N) { PLASMA_Complex32_t *Residual = (PLASMA_Complex32_t *)malloc(M*NRHS*sizeof(PLASMA_Complex32_t)); memset((void*)Residual, 0, M*NRHS*sizeof(PLASMA_Complex32_t)); cblas_cgemm(CblasColMajor, CblasConjTrans, CblasNoTrans, N, NRHS, M, CBLAS_SADDR(alpha), A1, LDA, B1, LDB, CBLAS_SADDR(beta), Residual, M); BLAS_cge_norm( blas_colmajor, blas_inf_norm, M, NRHS, Residual, M, &Rnorm ); free(Residual); } else { PLASMA_Complex32_t *Residual = (PLASMA_Complex32_t *)malloc(N*NRHS*sizeof(PLASMA_Complex32_t)); memset((void*)Residual, 0, N*NRHS*sizeof(PLASMA_Complex32_t)); cblas_cgemm(CblasColMajor, CblasConjTrans, CblasNoTrans, N, NRHS, M, CBLAS_SADDR(alpha), A1, LDA, B1, LDB, CBLAS_SADDR(beta), Residual, N); BLAS_cge_norm( blas_colmajor, blas_inf_norm, N, NRHS, Residual, N, &Rnorm ); free(Residual); } if (getenv("PLASMA_TESTING_VERBOSE")) printf( "||A||_oo=%f\n||X||_oo=%f\n||B||_oo=%f\n||A X - B||_oo=%e\n", Anorm, Xnorm, Bnorm, Rnorm ); result = Rnorm / ( (Anorm*Xnorm+Bnorm)*N*eps ) ; printf("============\n"); printf("Checking the Residual of the solution \n"); printf("-- ||Ax-B||_oo/((||A||_oo||x||_oo+||B||_oo).N.eps) = %e \n", result); if ( isnan(Xnorm) || isinf(Xnorm) || isnan(result) || isinf(result) || (result > 60.0) ) { printf("-- The solution is suspicious ! \n"); info_solution = 1; } else{ printf("-- The solution is CORRECT ! \n"); info_solution = 0; } free(work); return info_solution; }
void CORE_cgemm_quark(Quark *quark) { int transA; int transB; int m; int n; int k; PLASMA_Complex32_t alpha; PLASMA_Complex32_t *A; int lda; PLASMA_Complex32_t *B; int ldb; PLASMA_Complex32_t beta; PLASMA_Complex32_t *C; int ldc; quark_unpack_args_13(quark, transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); cblas_cgemm( CblasColMajor, (CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB, m, n, k, CBLAS_SADDR(alpha), A, lda, B, ldb, CBLAS_SADDR(beta), C, ldc); }
///////////////////////////////////////////////////////////////////// // REPLACEMENT FUNCTIONS IN CASE BLAS NOT AVAILABLE // // THESE ARE SLOW - NO ATTEMPTS AT OPTIMISATION WERE MADE // // THESE ARE *NOT* THE GENERAL BLAS FUNCTIONS: // // THEY **ONLY** APPLY TO SQUARE ROW-MAJOR MATRICES!!! // ///////////////////////////////////////////////////////////////////// //void phi_gemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, // const enum CBLAS_TRANSPOSE TransB, const int M, const int N, // const int K, const void *alpha, const void *A, // const int lda, const void *B, const int ldb, // const void *beta, void *C, const int ldc) { void phi_gemm(const int N, const Complex *alpha, const Complex *A, const Complex *B, const Complex *beta, Complex *C) { #ifndef NOBLAS #ifdef SINGLEPRECISION cblas_cgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,N,N,N,alpha,A,N,B,N,beta,C,N); #else cblas_zgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,N,N,N,alpha,A,N,B,N,beta,C,N); #endif #else int i,j,n; //multiply beta: //for(i = 0; i < N*N; ++i) // C[i] *= (*beta); //cout << "beta= " << *beta << " alpha=" << *alpha << "\n"; for(i = 0; i < N; ++i) { for(j = 0; j < N; ++j) { C[i+j*N] *= (*beta); for (n = 0; n < N; ++n) C[i+j*N] += (*alpha)*(A[i+n*N]*B[j*N+n]); } } #endif }
void CORE_cgemm_p2f1_quark(Quark* quark) { int transA; int transB; int M; int N; int K; PLASMA_Complex32_t alpha; PLASMA_Complex32_t *A; int LDA; PLASMA_Complex32_t **B; int LDB; PLASMA_Complex32_t beta; PLASMA_Complex32_t *C; int LDC; void *fake1; quark_unpack_args_14(quark, transA, transB, M, N, K, alpha, A, LDA, B, LDB, beta, C, LDC, fake1); cblas_cgemm( CblasColMajor, (CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB, M, N, K, CBLAS_SADDR(alpha), A, LDA, *B, LDB, CBLAS_SADDR(beta), C, LDC); }
static inline void CORE_cgetrf_reclap_update(const int M, const int column, const int n1, const int n2, PLASMA_Complex32_t *A, const int LDA, int *IPIV, const int thidx, const int thcnt) { static PLASMA_Complex32_t posone = 1.0; static PLASMA_Complex32_t negone = -1.0; PLASMA_Complex32_t *Atop = A + column*LDA; PLASMA_Complex32_t *Atop2 = Atop + n1 *LDA; int coff, ccnt, lm, loff; CORE_cbarrier_thread( thidx, thcnt ); psplit( n2, thidx, thcnt, &coff, &ccnt ); if (ccnt > 0) { CORE_claswap1( ccnt, Atop2 + coff*LDA, LDA, column, n1 + column, IPIV ); /* swap to the right */ cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit, n1, ccnt, CBLAS_SADDR(posone), Atop + column, LDA, Atop2 + coff*LDA + column, LDA ); } /* __sync_synchronize(); */ /* hopefully we will not need memory fences */ /* need to wait for pivoting and triangular solve to finish */ CORE_cbarrier_thread( thidx, thcnt ); psplit( M, thidx, thcnt, &loff, &lm ); if (thidx == 0) { loff = column + n1; lm -= column + n1; }; cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, lm, n2, n1, CBLAS_SADDR(negone), Atop+loff, LDA, Atop2 + column, LDA, CBLAS_SADDR(posone), Atop2+loff, LDA ); }
inline void gemm( const Order order, const TransA transa, const TransB transb, const int m, const int n, const int k, const std::complex<float> alpha, const std::complex<float>* a, const int lda, const std::complex<float>* b, const int ldb, const std::complex<float> beta, std::complex<float>* c, const int ldc ) { cblas_cgemm( cblas_option< Order >::value, cblas_option< TransA >::value, cblas_option< TransB >::value, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc ); }
VrArrayPtrCF32 BlasComplexSingle::mmult(CBLAS_ORDER order, CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, VrArrayPtrCF32 A, VrArrayPtrCF32 B,float complex alpha,float complex beta) { int dims[2]; dims[0]=VR_GET_DIMS_CF32(A)[0]; dims[1]=VR_GET_DIMS_CF32(B)[1]; VrArrayPtrCF32 C=vrAllocArrayF32CM(2,0,(int*)dims); //float alph[]={1,0}; //float bet[]={0,0}; cblas_cgemm(order, transA, transB,VR_GET_DIMS_CF32(A)[0],VR_GET_DIMS_CF32(B)[1],VR_GET_DIMS_CF32(A)[1],reinterpret_cast<float*>(&alpha),(float*)VR_GET_DATA_CF32(A),VR_GET_DIMS_CF32(A)[0], (float*)VR_GET_DATA_CF32(B),VR_GET_DIMS_CF32(B)[0],reinterpret_cast<float*>(&beta),(float*)VR_GET_DATA_CF32(C),VR_GET_DIMS_CF32(C)[0]); return C; }
void CORE_cgemm(int transA, int transB, int M, int N, int K, PLASMA_Complex32_t alpha, PLASMA_Complex32_t *A, int LDA, PLASMA_Complex32_t *B, int LDB, PLASMA_Complex32_t beta, PLASMA_Complex32_t *C, int LDC) { cblas_cgemm( CblasColMajor, (CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB, M, N, K, CBLAS_SADDR(alpha), A, LDA, B, LDB, CBLAS_SADDR(beta), C, LDC); }
void blas_cgemm(char transa, char transb, long M, long N, long K, const complex float alpha, long lda, const complex float A[K][lda], long ldb, const complex float B[N][ldb], const complex float beta, long ldc, complex float C[N][ldc]) { #ifdef USE_CUDA #define CUCOMPLEX(x) (((union { cuComplex cu; complex float std; }){ .std = (x) }).cu) if (cuda_ondevice(A)) { cublasCgemm(transa, transb, M, N, K, CUCOMPLEX(alpha), (const cuComplex*)A, lda, (const cuComplex*)B, ldb, CUCOMPLEX(beta), (cuComplex*)C, ldc); } else #endif cblas_cgemm(CblasColMajor, transa, transb, M, N, K, (void*)&alpha, (void*)A, lda, (void*)B, ldb, (void*)&beta, (void*)C, ldc); }
JNIEXPORT void JNICALL Java_edu_berkeley_bid_CBLAS_cgemm (JNIEnv * env, jobject calling_obj, jint order, jint transA, jint transB, jint M, jint N, jint K, jfloatArray jAlpha, jfloatArray jA, jint lda, jfloatArray jB, jint ldb, jfloatArray jBeta, jfloatArray jC, jint ldc){ jfloat * A = (*env)->GetPrimitiveArrayCritical(env, jA, JNI_FALSE); jfloat * B = (*env)->GetPrimitiveArrayCritical(env, jB, JNI_FALSE); jfloat * C = (*env)->GetPrimitiveArrayCritical(env, jC, JNI_FALSE); jfloat * alpha = (*env)->GetPrimitiveArrayCritical(env, jAlpha, JNI_FALSE); jfloat * beta = (*env)->GetPrimitiveArrayCritical(env, jBeta, JNI_FALSE); cblas_cgemm((CBLAS_ORDER)order, (CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); (*env)->ReleasePrimitiveArrayCritical(env, jC, C, 0); (*env)->ReleasePrimitiveArrayCritical(env, jB, B, 0); (*env)->ReleasePrimitiveArrayCritical(env, jA, A, 0); }
inline void gemm (CBLAS_ORDER const Order, CBLAS_TRANSPOSE const TransA, CBLAS_TRANSPOSE const TransB, int const M, int const N, int const K, traits::complex_f const& alpha, traits::complex_f const* A, int const lda, traits::complex_f const* B, int const ldb, traits::complex_f const& beta, traits::complex_f* C, int const ldc) { cblas_cgemm (Order, TransA, TransB, M, N, K, static_cast<void const*> (&alpha), static_cast<void const*> (A), lda, static_cast<void const*> (B), ldb, static_cast<void const*> (&beta), static_cast<void*> (C), ldc); }
inline void gemm(CBLAS_ORDER const Order, CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB, int M, int N, int K, float alpha, std::complex<float> const *A, int lda, std::complex<float> const *B, int ldb, float beta, std::complex<float>* C, int ldc ) { std::complex<float> alphaArg(alpha,0); std::complex<float> betaArg(beta,0); cblas_cgemm( Order, TransA, TransB, M, N, K, static_cast<void const *>(&alphaArg), static_cast<void const *>(A), lda, static_cast<void const *>(B), ldb, static_cast<void const *>(&betaArg), static_cast<void *>(C), ldc ); }
void bl1_cgemm_blas( trans1_t transa, trans1_t transb, int m, int n, int k, scomplex* alpha, scomplex* a, int lda, scomplex* b, int ldb, scomplex* beta, scomplex* c, int ldc ) { #ifdef BLIS1_ENABLE_CBLAS_INTERFACES enum CBLAS_ORDER cblas_order = CblasColMajor; enum CBLAS_TRANSPOSE cblas_transa; enum CBLAS_TRANSPOSE cblas_transb; bl1_param_map_to_netlib_trans( transa, &cblas_transa ); bl1_param_map_to_netlib_trans( transb, &cblas_transb ); cblas_cgemm( cblas_order, cblas_transa, cblas_transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc ); #else char blas_transa; char blas_transb; bl1_param_map_to_netlib_trans( transa, &blas_transa ); bl1_param_map_to_netlib_trans( transb, &blas_transb ); F77_cgemm( &blas_transa, &blas_transb, &m, &n, &k, alpha, a, &lda, b, &ldb, beta, c, &ldc ); #endif }
int check_solution(int N, int NRHS, PLASMA_Complex32_t *A1, int LDA, PLASMA_Complex32_t *B1, PLASMA_Complex32_t *B2, int LDB ) { int info_solution; float Rnorm, Anorm, Xnorm, Bnorm; PLASMA_Complex32_t alpha, beta; float *work = (float *)malloc(N*sizeof(float)); float eps; eps = LAPACKE_slamch_work('e'); /* Initialize A1 and A2 for Symmetric Positive Matrix */ alpha = 1.0; beta = -1.0; Xnorm = LAPACKE_clange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), N, NRHS, B2, LDB, work); Anorm = LAPACKE_clange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), N, N, A1, LDA, work); Bnorm = LAPACKE_clange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), N, NRHS, B1, LDB, work); cblas_cgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, N, NRHS, N, CBLAS_SADDR(alpha), A1, LDA, B2, LDB, CBLAS_SADDR(beta), B1, LDB); Rnorm = LAPACKE_clange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), N, NRHS, B1, LDB, work); printf("============\n"); printf("Checking the Residual of the solution \n"); printf("-- ||Ax-B||_oo/((||A||_oo||x||_oo+||B||_oo).N.eps) = %e \n",Rnorm/((Anorm*Xnorm+Bnorm)*N*eps)); if (Rnorm/((Anorm*Xnorm+Bnorm)*N*eps) > 10.0){ printf("-- The solution is suspicious ! \n"); info_solution = 1; } else{ printf("-- The solution is CORRECT ! \n"); info_solution = 0; } free(work); return info_solution; }
unsigned long long benchmark_gemm( const void* memory, size_t cache_size, enum gemm_type type, size_t m, size_t n, size_t k, const float a[], const float b[], float c[], size_t max_iterations) { unsigned long long computation_time[max_iterations]; size_t computation_samples = 0; for (size_t iteration = 0; iteration < max_iterations; iteration++) { read_memory(memory, cache_size); unsigned long long start_time, end_time; if (!read_timer(&start_time)) continue; switch (type) { case gemm_type_sgemm: { #if defined(USE_MKL) || defined(USE_OPENBLAS) cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, k, 1.0f, a, k, b, k, 0.0f, c, n); #elif defined(USE_BLIS) float alpha = 1.0f; float beta = 0.0f; bli_sgemm(BLIS_NO_TRANSPOSE, BLIS_TRANSPOSE, m, n, k, &alpha, (float*) a, k, 1, (float*) b, k, 1, &beta, c, n, 1); #endif break; } case gemm_type_cgemm: { #if defined(USE_MKL) || defined(USE_OPENBLAS) float alpha[2] = { 1.0f, 0.0f }; float beta[2] = { 0.0f, 0.0f }; cblas_cgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, k, alpha, a, k, b, k, beta, c, n); #elif defined(USE_BLIS) scomplex alpha = { 1.0f, 0.0f }; scomplex beta = { 0.0f, 0.0f }; bli_cgemm(BLIS_NO_TRANSPOSE, BLIS_TRANSPOSE, m, n, k, &alpha, (scomplex*) a, k, 1, (scomplex*) b, k, 1, &beta, (scomplex*) c, n, 1); #endif break; } } if (!read_timer(&end_time)) continue; computation_time[computation_samples++] = end_time - start_time; } return median(computation_time, computation_samples); }
DLLEXPORT void c_matrix_multiply(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, const blasint m, const blasint n, const blasint k, const openblas_complex_float alpha, const openblas_complex_float x[], const openblas_complex_float y[], const openblas_complex_float beta, openblas_complex_float c[]) { blasint lda = transA == CblasNoTrans ? m : k; blasint ldb = transB == CblasNoTrans ? k : n; cblas_cgemm(CblasColMajor, transA, transB, m, n, k, (float*)&alpha, (float*)x, lda, (float*)y, ldb, (float*)&beta, (float*)c, m); }
template <typename fptype> static inline int lapack_gemm_c(const fptype *src1, size_t src1_step, const fptype *src2, size_t src2_step, fptype alpha, const fptype *src3, size_t src3_step, fptype beta, fptype *dst, size_t dst_step, int a_m, int a_n, int d_n, int flags) { int ldsrc1 = src1_step / sizeof(std::complex<fptype>); int ldsrc2 = src2_step / sizeof(std::complex<fptype>); int ldsrc3 = src3_step / sizeof(std::complex<fptype>); int lddst = dst_step / sizeof(std::complex<fptype>); int c_m, c_n, d_m; CBLAS_TRANSPOSE transA, transB; std::complex<fptype> cAlpha(alpha, 0.0); std::complex<fptype> cBeta(beta, 0.0); if(flags & CV_HAL_GEMM_2_T) { transB = CblasTrans; if(flags & CV_HAL_GEMM_1_T ) { d_m = a_n; } else { d_m = a_m; } } else { transB = CblasNoTrans; if(flags & CV_HAL_GEMM_1_T ) { d_m = a_n; } else { d_m = a_m; } } if(flags & CV_HAL_GEMM_3_T) { c_m = d_n; c_n = d_m; } else { c_m = d_m; c_n = d_n; } if(flags & CV_HAL_GEMM_1_T ) { transA = CblasTrans; std::swap(a_n, a_m); } else { transA = CblasNoTrans; } if(src3 != dst && beta != 0.0 && src3_step != 0) { if(flags & CV_HAL_GEMM_3_T) transpose((std::complex<fptype>*)src3, ldsrc3, (std::complex<fptype>*)dst, lddst, c_m, c_n); else copy_matrix((std::complex<fptype>*)src3, ldsrc3, (std::complex<fptype>*)dst, lddst, c_m, c_n); } else if (src3 == dst && (flags & CV_HAL_GEMM_3_T)) //actually transposing C in this case done by openCV return CV_HAL_ERROR_NOT_IMPLEMENTED; else if(src3_step == 0 && beta != 0.0) set_value((std::complex<fptype>*)dst, lddst, std::complex<fptype>(0.0, 0.0), d_m, d_n); if(typeid(fptype) == typeid(float)) cblas_cgemm(CblasRowMajor, transA, transB, a_m, d_n, a_n, &cAlpha, (void*)src1, ldsrc1, (void*)src2, ldsrc2, &cBeta, (void*)dst, lddst); else if(typeid(fptype) == typeid(double)) cblas_zgemm(CblasRowMajor, transA, transB, a_m, d_n, a_n, &cAlpha, (void*)src1, ldsrc1, (void*)src2, ldsrc2, &cBeta, (void*)dst, lddst); return CV_HAL_ERROR_OK; }
int CORE_cgessm(int M, int N, int K, int IB, int *IPIV, PLASMA_Complex32_t *L, int LDL, PLASMA_Complex32_t *A, int LDA) { static PLASMA_Complex32_t zone = 1.0; static PLASMA_Complex32_t mzone = -1.0; static int ione = 1; int i, sb; int tmp, tmp2; /* Check input arguments */ if (M < 0) { coreblas_error(1, "Illegal value of M"); return -1; } if (N < 0) { coreblas_error(2, "Illegal value of N"); return -2; } if (K < 0) { coreblas_error(3, "Illegal value of K"); return -3; } if (IB < 0) { coreblas_error(4, "Illegal value of IB"); return -4; } if ((LDL < max(1,M)) && (M > 0)) { coreblas_error(7, "Illegal value of LDL"); return -7; } if ((LDA < max(1,M)) && (M > 0)) { coreblas_error(9, "Illegal value of LDA"); return -9; } /* Quick return */ if ((M == 0) || (N == 0) || (K == 0) || (IB == 0)) return PLASMA_SUCCESS; for(i = 0; i < K; i += IB) { sb = min(IB, K-i); /* * Apply interchanges to columns I*IB+1:IB*( I+1 )+1. */ tmp = i+1; tmp2 = i+sb; LAPACKE_claswp_work(LAPACK_COL_MAJOR, N, A, LDA, tmp, tmp2, IPIV, ione); /* * Compute block row of U. */ cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit, sb, N, CBLAS_SADDR(zone), &L[LDL*i+i], LDL, &A[i], LDA ); if (i+sb < M) { /* * Update trailing submatrix. */ cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, M-(i+sb), N, sb, CBLAS_SADDR(mzone), &L[LDL*i+(i+sb)], LDL, &A[i], LDA, CBLAS_SADDR(zone), &A[i+sb], LDA ); } } return PLASMA_SUCCESS; }
void F77_cgemm(int *layout, char *transpa, char *transpb, int *m, int *n, int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { CBLAS_TEST_COMPLEX *A, *B, *C; int i,j,LDA, LDB, LDC; CBLAS_TRANSPOSE transa, transb; get_transpose_type(transpa, &transa); get_transpose_type(transpb, &transb); if (*layout == TEST_ROW_MJR) { if (transa == CblasNoTrans) { LDA = *k+1; A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*k; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else { LDA = *m+1; A=(CBLAS_TEST_COMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*k; i++ ) for( j=0; j<*m; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } if (transb == CblasNoTrans) { LDB = *n+1; B=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_COMPLEX) ); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ) { B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } else { LDB = *k+1; B=(CBLAS_TEST_COMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) { B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } LDC = *n+1; C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX)); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_cgemm( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, B, LDB, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(B); free(C); } else if (*layout == TEST_COL_MJR) cblas_cgemm( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, b, *ldb, beta, c, *ldc ); else cblas_cgemm( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, b, *ldb, beta, c, *ldc ); }
static int check_factorization(int M, int N, PLASMA_Complex32_t *A1, PLASMA_Complex32_t *A2, int LDA, PLASMA_Complex32_t *Q, float eps ) { float Anorm, Rnorm; PLASMA_Complex32_t alpha, beta; int info_factorization; int i,j; PLASMA_Complex32_t *Ql = (PLASMA_Complex32_t *)malloc(M*N*sizeof(PLASMA_Complex32_t)); PLASMA_Complex32_t *Residual = (PLASMA_Complex32_t *)malloc(M*N*sizeof(PLASMA_Complex32_t)); float *work = (float *)malloc(max(M,N)*sizeof(float)); alpha=1.0; beta=0.0; if (M >= N) { /* Extract the R */ PLASMA_Complex32_t *R = (PLASMA_Complex32_t *)malloc(N*N*sizeof(PLASMA_Complex32_t)); memset((void*)R, 0, N*N*sizeof(PLASMA_Complex32_t)); LAPACKE_clacpy_work(LAPACK_COL_MAJOR,'u', M, N, A2, LDA, R, N); /* Perform Ql=Q*R */ memset((void*)Ql, 0, M*N*sizeof(PLASMA_Complex32_t)); cblas_cgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M, N, N, CBLAS_SADDR(alpha), Q, LDA, R, N, CBLAS_SADDR(beta), Ql, M); free(R); } else { /* Extract the L */ PLASMA_Complex32_t *L = (PLASMA_Complex32_t *)malloc(M*M*sizeof(PLASMA_Complex32_t)); memset((void*)L, 0, M*M*sizeof(PLASMA_Complex32_t)); LAPACKE_clacpy_work(LAPACK_COL_MAJOR,'l', M, N, A2, LDA, L, M); /* Perform Ql=LQ */ memset((void*)Ql, 0, M*N*sizeof(PLASMA_Complex32_t)); cblas_cgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M, N, M, CBLAS_SADDR(alpha), L, M, Q, LDA, CBLAS_SADDR(beta), Ql, M); free(L); } /* Compute the Residual */ for (i = 0; i < M; i++) for (j = 0 ; j < N; j++) Residual[j*M+i] = A1[j*LDA+i]-Ql[j*M+i]; BLAS_cge_norm( blas_colmajor, blas_inf_norm, M, N, Residual, M, &Rnorm ); BLAS_cge_norm( blas_colmajor, blas_inf_norm, M, N, A2, LDA, &Anorm ); if (M >= N) { printf("============\n"); printf("Checking the QR Factorization \n"); printf("-- ||A-QR||_oo/(||A||_oo.N.eps) = %e \n",Rnorm/(Anorm*N*eps)); } else { printf("============\n"); printf("Checking the LQ Factorization \n"); printf("-- ||A-LQ||_oo/(||A||_oo.N.eps) = %e \n",Rnorm/(Anorm*N*eps)); } if (isnan(Rnorm / (Anorm * N *eps)) || isinf(Rnorm / (Anorm * N *eps)) || (Rnorm / (Anorm * N * eps) > 60.0) ) { printf("-- Factorization is suspicious ! \n"); info_factorization = 1; } else { printf("-- Factorization is CORRECT ! \n"); info_factorization = 0; } free(work); free(Ql); free(Residual); return info_factorization; }
void wrapper_cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc) { cblas_cgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); }