int main() { double *A, *B, *C; int i,j,r,max_threads,size; double alpha, beta; double s_initial, s_elapsed; printf("Intializing data for matrix multiplication C=A*B for matrix\n\n" " A(%i*%i) and matrix B(%i*%i)\n",M,P,P,N); alpha = 1.0; beta = 0.0; printf("Allocating memory for matrices aligned on 64-byte boundary for better performance \n\n"); A = ( double *)mkl_malloc(M*P*sizeof( double ),64); B = ( double *)mkl_malloc(N*P*sizeof( double ),64); C = ( double *)mkl_malloc(M*N*sizeof( double ),64); if (A == NULL || B == NULL || C == NULL) { printf("Error: can`t allocate memory for matrices.\n\n"); mkl_free(A); mkl_free(B); mkl_free(C); return 1; } printf("Intializing matrix data\n\n"); size = M*P; for (i = 0; i < size; ++i) { A[i] = ( double )(i+1); } size = N*P; for (i = 0; i < size; ++i) { B[i] = ( double )(i-1); } printf("Finding max number of threads can use for parallel runs \n\n"); max_threads = mkl_get_max_threads(); printf("Running from 1 to %i threads \n\n",max_threads); for (i = 1; i <= max_threads; ++i) { size = M*N; for (j = 0; j < size; ++j) { C[j] = 0.0; } printf("Requesting to use %i threads \n\n",i); mkl_set_num_threads(i); printf("Measuring performance of matrix product using dgemm function\n" " via CBLAS interface on %i threads \n\n",i); s_initial = dsecnd(); for (r = 0; r < LOOP_COUNT; ++r) { cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, P, alpha, A, P, B, N, beta, C, N); // multiply matrices with cblas_dgemm; } s_elapsed = (dsecnd() - s_initial) / LOOP_COUNT; printf("Matrix multiplication using dgemm completed \n" " at %.5f milliseconds using %d threads \n\n", (s_elapsed * 1000),i); printf("Output the result: \n"); size = M*N; for (i = 0; i < size; ++i) { printf("%i\t",(int)C[i]); if (i % N == N - 1) printf("\n"); } } printf("Dellocating memory\n"); mkl_free(A); mkl_free(B); mkl_free(C); return 0; }
int check_factorization(int M, int N, double *A1, double *A2, int LDA, double *Q) { double Anorm, Rnorm; double alpha, beta; int info_factorization; int i,j; double eps; eps = LAPACKE_dlamch_work('e'); double *Ql = (double *)malloc(M*N*sizeof(double)); double *Residual = (double *)malloc(M*N*sizeof(double)); double *work = (double *)malloc(max(M,N)*sizeof(double)); alpha=1.0; beta=0.0; if (M >= N) { /* Extract the R */ double *R = (double *)malloc(N*N*sizeof(double)); memset((void*)R, 0, N*N*sizeof(double)); LAPACKE_dlacpy_work(LAPACK_COL_MAJOR,'u', M, N, A2, LDA, R, N); /* Perform Ql=Q*R */ memset((void*)Ql, 0, M*N*sizeof(double)); cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M, N, N, (alpha), Q, LDA, R, N, (beta), Ql, M); free(R); } else { /* Extract the L */ double *L = (double *)malloc(M*M*sizeof(double)); memset((void*)L, 0, M*M*sizeof(double)); LAPACKE_dlacpy_work(LAPACK_COL_MAJOR,'l', M, N, A2, LDA, L, M); /* Perform Ql=LQ */ memset((void*)Ql, 0, M*N*sizeof(double)); cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M, N, M, (alpha), L, M, Q, LDA, (beta), Ql, M); free(L); } /* Compute the Residual */ for (i = 0; i < M; i++) for (j = 0 ; j < N; j++) Residual[j*M+i] = A1[j*LDA+i]-Ql[j*M+i]; Rnorm = LAPACKE_dlange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), M, N, Residual, M, work); Anorm = LAPACKE_dlange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), M, N, A2, LDA, work); if (M >= N) { printf("============\n"); printf("Checking the QR Factorization \n"); printf("-- ||A-QR||_oo/(||A||_oo.N.eps) = %e \n",Rnorm/(Anorm*N*eps)); } else { printf("============\n"); printf("Checking the LQ Factorization \n"); printf("-- ||A-LQ||_oo/(||A||_oo.N.eps) = %e \n",Rnorm/(Anorm*N*eps)); } if (isnan(Rnorm / (Anorm * N *eps)) || (Rnorm / (Anorm * N * eps) > 10.0) ) { printf("-- Factorization is suspicious ! \n"); info_factorization = 1; } else { printf("-- Factorization is CORRECT ! \n"); info_factorization = 0; } free(work); free(Ql); free(Residual); return info_factorization; }
static void CORE_dgetrf_rectil_rec(const PLASMA_desc A, int *IPIV, int *info, double *pivot, int thidx, int thcnt, int column, int width, int ft, int lt) { int ld, jp, n1, n2, lm, tmpM, piv_sf; int ip, j, it, i, ldft; int max_i, max_it, thwin; double zone = 1.0; double mzone = -1.0; double tmp1; double tmp2 = 0.; double pivval; double *Atop, *Atop2, *U, *L; double abstmp1; int offset = A.i; ldft = BLKLDD(A, 0); Atop = A(0, 0) + column * ldft; if ( width > 1 ) { /* Assumption: N = min( M, N ); */ n1 = width / 2; n2 = width - n1; Atop2 = Atop + n1 * ldft; CORE_dgetrf_rectil_rec( A, IPIV, info, pivot, thidx, thcnt, column, n1, ft, lt ); if ( *info != 0 ) return; if (thidx == 0) { /* Swap to the right */ int *lipiv = IPIV+column; int idxMax = column+n1; for (j = column; j < idxMax; ++j, ++lipiv) { ip = (*lipiv) - offset - 1; if ( ip != j ) { it = ip / A.mb; i = ip % A.mb; ld = BLKLDD(A, it); cblas_dswap(n2, Atop2 + j, ldft, A(it, 0) + (column+n1)*ld + i, ld ); } } /* Trsm on the upper part */ U = Atop2 + column; cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit, n1, n2, (zone), Atop + column, ldft, U, ldft ); /* SIgnal to other threads that they can start update */ CORE_dbarrier_thread( thidx, thcnt ); pivval = *pivot; if ( pivval == 0.0 ) { *info = column+n1; return; } else { if ( fabs(pivval) >= sfmin ) { piv_sf = 1; pivval = 1.0 / pivval; } else { piv_sf = 0; } } /* First tile */ { L = Atop + column + n1; tmpM = min(ldft, A.m) - column - n1; /* Scale last column of L */ if ( piv_sf ) { cblas_dscal( tmpM, (pivval), L+(n1-1)*ldft, 1 ); } else { int i; Atop2 = L+(n1-1)*ldft; for( i=0; i < tmpM; i++, Atop2++) *Atop2 = *Atop2 / pivval; } /* Apply the GEMM */ cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, tmpM, n2, n1, (mzone), L, ldft, U, ldft, (zone), U + n1, ldft ); /* Search Max in first column of U+n1 */ tmp2 = U[n1]; max_it = ft; max_i = cblas_idamax( tmpM, U+n1, 1 ) + n1; tmp1 = U[max_i]; abstmp1 = fabs(tmp1); max_i += column; } } else { pivval = *pivot; if ( pivval == 0.0 ) { *info = column+n1; return; } else { if ( fabs(pivval) >= sfmin ) { piv_sf = 1; pivval = 1.0 / pivval; } else { piv_sf = 0; } } ld = BLKLDD( A, ft ); L = A( ft, 0 ) + column * ld; lm = ft == A.mt-1 ? A.m - ft * A.mb : A.mb; U = Atop2 + column; /* First tile */ /* Scale last column of L */ if ( piv_sf ) { cblas_dscal( lm, (pivval), L+(n1-1)*ld, 1 ); } else { int i; Atop2 = L+(n1-1)*ld; for( i=0; i < lm; i++, Atop2++) *Atop2 = *Atop2 / pivval; } /* Wait for pivoting and triangular solve to be finished * before to really start the update */ CORE_dbarrier_thread( thidx, thcnt ); /* Apply the GEMM */ cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, lm, n2, n1, (mzone), L, ld, U, ldft, (zone), L + n1*ld, ld ); /* Search Max in first column of L+n1*ld */ max_it = ft; max_i = cblas_idamax( lm, L+n1*ld, 1 ); tmp1 = L[n1*ld+max_i]; abstmp1 = fabs(tmp1); } /* Update the other blocks */ for( it = ft+1; it < lt; it++) { ld = BLKLDD( A, it ); L = A( it, 0 ) + column * ld; lm = it == A.mt-1 ? A.m - it * A.mb : A.mb; /* Scale last column of L */ if ( piv_sf ) { cblas_dscal( lm, (pivval), L+(n1-1)*ld, 1 ); } else { int i; Atop2 = L+(n1-1)*ld; for( i=0; i < lm; i++, Atop2++) *Atop2 = *Atop2 / pivval; } /* Apply the GEMM */ cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, lm, n2, n1, (mzone), L, ld, U, ldft, (zone), L + n1*ld, ld ); /* Search the max on the first column of L+n1*ld */ jp = cblas_idamax( lm, L+n1*ld, 1 ); if ( fabs( L[n1*ld+jp] ) > abstmp1 ) { tmp1 = L[n1*ld+jp]; abstmp1 = fabs(tmp1); max_i = jp; max_it = it; } } jp = offset + max_it*A.mb + max_i; CORE_damax1_thread( tmp1, thidx, thcnt, &thwin, &tmp2, pivot, jp + 1, IPIV + column + n1 ); if ( thidx == 0 ) { U[n1] = *pivot; /* all threads have the pivot element: no need for synchronization */ } if (thwin == thidx) { /* the thread that owns the best pivot */ if ( jp-offset != column+n1 ) /* if there is a need to exchange the pivot */ { ld = BLKLDD(A, max_it); Atop2 = A( max_it, 0 ) + (column + n1 )* ld + max_i; *Atop2 = tmp2; } } CORE_dgetrf_rectil_rec( A, IPIV, info, pivot, thidx, thcnt, column+n1, n2, ft, lt ); if ( *info != 0 ) return; if ( thidx == 0 ) { /* Swap to the left */ int *lipiv = IPIV+column+n1; int idxMax = column+width; for (j = column+n1; j < idxMax; ++j, ++lipiv) { ip = (*lipiv) - offset - 1; if ( ip != j ) { it = ip / A.mb; i = ip % A.mb; ld = BLKLDD(A, it); cblas_dswap(n1, Atop + j, ldft, A(it, 0) + column*ld + i, ld ); } } } } else if ( width == 1 ) { /* Search maximum for column 0 */ if ( column == 0 ) { if ( thidx == 0 ) tmp2 = Atop[column]; /* First tmp1 */ ld = BLKLDD(A, ft); Atop2 = A( ft, 0 ); lm = ft == A.mt-1 ? A.m - ft * A.mb : A.mb; max_it = ft; max_i = cblas_idamax( lm, Atop2, 1 ); tmp1 = Atop2[max_i]; abstmp1 = fabs(tmp1); /* Update */ for( it = ft+1; it < lt; it++) { Atop2= A( it, 0 ); lm = it == A.mt-1 ? A.m - it * A.mb : A.mb; jp = cblas_idamax( lm, Atop2, 1 ); if ( fabs(Atop2[jp]) > abstmp1 ) { tmp1 = Atop2[jp]; abstmp1 = fabs(tmp1); max_i = jp; max_it = it; } } jp = offset + max_it*A.mb + max_i; CORE_damax1_thread( tmp1, thidx, thcnt, &thwin, &tmp2, pivot, jp + 1, IPIV + column ); if ( thidx == 0 ) { Atop[0] = *pivot; /* all threads have the pivot element: no need for synchronization */ } if (thwin == thidx) { /* the thread that owns the best pivot */ if ( jp-offset != 0 ) /* if there is a need to exchange the pivot */ { Atop2 = A( max_it, 0 ) + max_i; *Atop2 = tmp2; } } } CORE_dbarrier_thread( thidx, thcnt ); /* If it is the last column, we just scale */ if ( column == (min(A.m, A.n))-1 ) { pivval = *pivot; if ( pivval != 0.0 ) { if ( thidx == 0 ) { if ( fabs(pivval) >= sfmin ) { pivval = 1.0 / pivval; /* * We guess than we never enter the function with m == A.mt-1 * because it means that there is only one thread */ lm = ft == A.mt-1 ? A.m - ft * A.mb : A.mb; cblas_dscal( lm - column - 1, (pivval), Atop+column+1, 1 ); for( it = ft+1; it < lt; it++) { ld = BLKLDD(A, it); Atop2 = A( it, 0 ) + column * ld; lm = it == A.mt-1 ? A.m - it * A.mb : A.mb; cblas_dscal( lm, (pivval), Atop2, 1 ); } } else { /* * We guess than we never enter the function with m == A.mt-1 * because it means that there is only one thread */ int i; Atop2 = Atop + column + 1; lm = ft == A.mt-1 ? A.m - ft * A.mb : A.mb; for( i=0; i < lm-column-1; i++, Atop2++) *Atop2 = *Atop2 / pivval; for( it = ft+1; it < lt; it++) { ld = BLKLDD(A, it); Atop2 = A( it, 0 ) + column * ld; lm = it == A.mt-1 ? A.m - it * A.mb : A.mb; for( i=0; i < lm; i++, Atop2++) *Atop2 = *Atop2 / pivval; } } } else { if ( fabs(pivval) >= sfmin ) { pivval = 1.0 / pivval; for( it = ft; it < lt; it++) { ld = BLKLDD(A, it); Atop2 = A( it, 0 ) + column * ld; lm = it == A.mt-1 ? A.m - it * A.mb : A.mb; cblas_dscal( lm, (pivval), Atop2, 1 ); } } else { /* * We guess than we never enter the function with m == A.mt-1 * because it means that there is only one thread */ int i; for( it = ft; it < lt; it++) { ld = BLKLDD(A, it); Atop2 = A( it, 0 ) + column * ld; lm = it == A.mt-1 ? A.m - it * A.mb : A.mb; for( i=0; i < lm; i++, Atop2++) *Atop2 = *Atop2 / pivval; } } } } else { *info = column + 1; return; } } } }
int main(int argc, char **argv){ printf("Computing dsyev...\n"); int n, lda; double *A, *Acopy, *work, *w; int info, lwork; int i,j; double t1,t2,elapsed; struct timeval tp; int rtn; double normr, normb; n = 100; lda = 100; A = (double *)malloc(lda*n*sizeof(double)) ; if (A==NULL){ printf("error of memory allocation\n"); exit(0); } Acopy = (double *)malloc(lda*n*sizeof(double)) ; if (Acopy==NULL){ printf("error of memory allocation\n"); exit(0); } w=(double*)malloc(n*sizeof(double)); for(i=0;i<lda*n;i++) A[i] = ((double) rand()) / ((double) RAND_MAX) - 0.5; for(i=0;i<n;i++) { for(j=0;j<n;j++) A[i+lda*j]=A[j+lda*i]; } cblas_dcopy(lda*n,A,1,Acopy,1); work=malloc(sizeof(double)); lwork = -1; lapack_dsyev( lapack_compute_vectors, lapack_upper, n, A, lda, w, work, lwork, &info); lwork=work[0]; free(work); work=malloc(lwork*sizeof(double)); lapack_dsyev( lapack_compute_vectors, lapack_upper, n, A, lda, w, work, lwork, &info); double *tmp; tmp=(double*)malloc(n*lda*sizeof(double)); for(i=0;i<lda*n;i++) tmp[i]=0; for(i=0;i<n;i++) tmp[i+lda*i]=1.0e0; cblas_dgemm ( CblasColMajor, CblasNoTrans, CblasTrans, n, n, n, 1.0e0, A, lda, A, lda, -1.0e0, tmp, lda); double ortho = 0.0e0; double* v; v=malloc(n*sizeof(double)); double* x; x=malloc(n*sizeof(double)); int* isgn; isgn=malloc(n*sizeof(int)); double est; int kase; double *work_dlange; work_dlange=malloc(n*sizeof(double)); ortho = lapack_dlange( lapack_one_norm, n, n, tmp, lda, work_dlange); free(work_dlange); printf("Orthogonality error : %e\n",ortho); for(i=0;i<lda*n;i++) tmp[i]=0; for(i=0;i<n;i++) tmp[i+lda*i]=w[i]; double *tmp2; tmp2=(double*)malloc(n*lda*sizeof(double)); cblas_dgemm ( CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0e0, A, lda, tmp, lda, 0.0e0, tmp2, lda); for(i=0;i<lda*n;i++) tmp[i]=Acopy[i]; cblas_dgemm ( CblasColMajor, CblasNoTrans, CblasTrans, n, n, n, -1.0e0, tmp2, lda, A, lda, 1.0e0, tmp, lda); double normA; work_dlange=malloc(n*sizeof(double)); normA = lapack_dlange( lapack_one_norm, n, n, A, lda, work_dlange); free(work_dlange); double repr = 0.0e0; work_dlange=malloc(n*sizeof(double)); repr = lapack_dlange( lapack_one_norm, n, n, tmp, lda, work_dlange); free(work_dlange); printf("Reprentativity error : %e\n",repr); free(A); free(Acopy); free(work); free(tmp); free(tmp2); printf("*******************************************************\n"); printf("Computing zheev...\n"); n = 300; lda = 300; A = (double *)malloc(2*lda*n*sizeof(double)) ; if (A==NULL){ printf("error of memory allocation\n"); exit(0); } Acopy = (double *)malloc(2*lda*n*sizeof(double)) ; if (Acopy==NULL){ printf("error of memory allocation\n"); exit(0); } w=(double*)malloc(n*sizeof(double)); for(i=0;i<2*lda*n;i++) A[i] = ((double) rand()) / ((double) RAND_MAX) - 0.5; for (i=0;i<n;i++) for (j=0;j<n;j++) { A[2*(i+lda*j)+1] = -A[2*(j+lda*i)+1]; A[2*(i+lda*j)] = A[2*(j+lda*i)]; } for (i=0;i<n;i++) A[2*(i+lda*i)+1]=0; cblas_zcopy(lda*n,A,1,Acopy,1); double *rwork; rwork=malloc((3*n-2)*sizeof(double)); work=malloc(2*sizeof(double)); lwork = -1; lapack_zheev( lapack_compute_vectors, lapack_upper, n, A, lda, w, work, lwork, rwork, &info); lwork=work[0]; free(work); work=malloc(2*lwork*sizeof(double)); lapack_zheev( lapack_compute_vectors, lapack_upper, n, A, lda, w, work, lwork, rwork, &info); tmp=(double*)malloc(2*n*lda*sizeof(double)); double alpha[2]; double beta[2]; tmp2=(double*)malloc(2*n*lda*sizeof(double)); alpha[0]=1.0e0; alpha[1]=0.0e0; beta[0]=-1.0e0; beta[1]=0.0e0; for (i=0;i<2*n*lda;i++) tmp[i]=0; for (i=0;i<n;i++) tmp[2*(i+lda*i)]=1; cblas_zgemm ( CblasColMajor, CblasNoTrans, CblasConjTrans, n, n, n, alpha, A, lda, A, lda, beta, tmp, lda); ortho=cblas_dnrm2(2*n*n,tmp,1); printf("Orthogonality error : %e\n",ortho); for (i=0;i<n;i++) { for (j=0;j<n;j++) { tmp[2*(i+lda*j)]=A[2*(i+lda*j)]*w[j]; tmp[2*(i+lda*j)+1]=A[2*(i+lda*j)+1]*w[j]; } } cblas_zcopy(lda*n,Acopy,1,tmp2,1); cblas_zgemm ( CblasColMajor, CblasNoTrans, CblasConjTrans, n, n, n, alpha, tmp, lda, A, lda, beta, tmp2, lda); repr=cblas_dnrm2(2*n*n,tmp2,1); printf("Reprentativity error : %e\n",repr); free(A); free(Acopy); free(work); free(tmp); free(tmp2); exit(0); }
struct double_pair randmatstat(int t) { int n = 5; struct double_pair r; double *v = (double*)calloc(t,sizeof(double)); double *w = (double*)calloc(t,sizeof(double)); double *a = (double*)malloc((n)*(n)*sizeof(double)); double *b = (double*)malloc((n)*(n)*sizeof(double)); double *c = (double*)malloc((n)*(n)*sizeof(double)); double *d = (double*)malloc((n)*(n)*sizeof(double)); double *P = (double*)malloc((n)*(4*n)*sizeof(double)); double *Q = (double*)malloc((2*n)*(2*n)*sizeof(double)); double *PtP1 = (double*)malloc((4*n)*(4*n)*sizeof(double)); double *PtP2 = (double*)malloc((4*n)*(4*n)*sizeof(double)); double *QtQ1 = (double*)malloc((2*n)*(2*n)*sizeof(double)); double *QtQ2 = (double*)malloc((2*n)*(2*n)*sizeof(double)); for (int i=0; i < t; i++) { randmtzig_fill_randn(a, n*n); randmtzig_fill_randn(b, n*n); randmtzig_fill_randn(c, n*n); randmtzig_fill_randn(d, n*n); memcpy(P+0*n*n, a, n*n*sizeof(double)); memcpy(P+1*n*n, b, n*n*sizeof(double)); memcpy(P+2*n*n, c, n*n*sizeof(double)); memcpy(P+3*n*n, d, n*n*sizeof(double)); for (int j=0; j < n; j++) { for (int k=0; k < n; k++) { Q[2*n*j+k] = a[k]; Q[2*n*j+n+k] = b[k]; Q[2*n*(n+j)+k] = c[k]; Q[2*n*(n+j)+n+k] = d[k]; } } cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, n, n, 4*n, 1.0, P, 4*n, P, 4*n, 0.0, PtP1, 4*n); cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, 4*n, 4*n, 4*n, 1.0, PtP1, 4*n, PtP1, 4*n, 0.0, PtP2, 4*n); cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, 4*n, 4*n, 4*n, 1.0, PtP2, 4*n, PtP2, 4*n, 0.0, PtP1, 4*n); for (int j=0; j < n; j++) { v[i] += PtP1[(n+1)*j]; } cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, 2*n, 2*n, 2*n, 1.0, Q, 2*n, Q, 2*n, 0.0, QtQ1, 2*n); cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, 2*n, 2*n, 2*n, 1.0, QtQ1, 2*n, QtQ1, 2*n, 0.0, QtQ2, 2*n); cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, 2*n, 2*n, 2*n, 1.0, QtQ2, 2*n, QtQ2, 2*n, 0.0, QtQ1, 2*n); for (int j=0; j < 2*n; j++) { w[i] += QtQ1[(2*n+1)*j]; } } free(PtP1); free(PtP2); free(QtQ1); free(QtQ2); free(P); free(Q); free(a); free(b); free(c); free(d); double v1=0.0, v2=0.0, w1=0.0, w2=0.0; for (int i=0; i < t; i++) { v1 += v[i]; v2 += v[i]*v[i]; w1 += w[i]; w2 += w[i]*w[i]; } free(v); free(w); r.s1 = sqrt((t*(t*v2-v1*v1))/((t-1)*v1*v1)); r.s2 = sqrt((t*(t*w2-w1*w1))/((t-1)*w1*w1)); return r; }
void dgetrf( long m, long n, double a[], long lda, long ipiv[], long *info ) { /** * -- LAPACK routine (version 1.1) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * March 31, 1993 * * .. Scalar Arguments ..*/ /** .. * .. Array Arguments ..*/ #undef ipiv_1 #define ipiv_1(a1) ipiv[a1-1] #undef a_2 #define a_2(a1,a2) a[a1-1+lda*(a2-1)] /** .. * * Purpose * ======= * * DGETRF computes an LU factorization of a general M-by-N matrix A * using partial pivoting with row interchanges. * * The factorization has the form * A = P * L * U * where P is a permutation matrix, L is lower triangular with unit * diagonal elements (lower trapezoidal if m > n), and U is upper * triangular (upper trapezoidal if m < n). * * This is the right-looking Level 3 BLAS version of the algorithm. * * Arguments * ========= * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * A (input/output) DOUBLE PRECISION array, dimension (LDA,N) * On entry, the M-by-N matrix to be factored. * On exit, the factors L and U from the factorization * A = P*L*U; the unit diagonal elements of L are not stored. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,M). * * IPIV (output) INTEGER array, dimension (min(M,N)) * The pivot indices; for 1 <= i <= min(M,N), row i of the * matrix was interchanged with row IPIV(i). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, U(i,i) is exactly zero. The factorization * has been completed, but the factor U is exactly * singular, and division by zero will occur if it is used * to solve a system of equations. * * ===================================================================== * * .. Parameters ..*/ #undef one #define one 1.0e+0 /** .. * .. Local Scalars ..*/ long i, iinfo, j, jb, nb; /** .. * .. Intrinsic Functions ..*/ /* intrinsic max, min;*/ /** .. * .. Executable Statements .. * * Test the input parameters. **/ /*-----implicit-declarations-----*/ /*-----end-of-declarations-----*/ *info = 0; if( m<0 ) { *info = -1; } else if( n<0 ) { *info = -2; } else if( lda<max( 1, m ) ) { *info = -4; } if( *info!=0 ) { xerbla( "dgetrf", -*info ); return; } /** * Quick return if possible **/ if( m==0 || n==0 ) return; /** * Determine the block size for this environment. **/ nb = ilaenv( 1, "dgetrf", " ", m, n, -1, -1 ); if( nb<=1 || nb>=min( m, n ) ) { /** * Use unblocked code. **/ dgetf2( m, n, a, lda, ipiv, info ); } else { /** * Use blocked code. **/ for (j=1 ; nb>0?j<=min( m, n ):j>=min( m, n ) ; j+=nb) { jb = min( min( m, n )-j+1, nb ); /** * Factor diagonal and subdiagonal blocks and test for exact * singularity. **/ dgetf2( m-j+1, jb, &a_2( j, j ), lda, &ipiv_1( j ), &iinfo ); /** * Adjust INFO and the pivot indices. **/ if( *info==0 && iinfo>0 ) *info = iinfo + j - 1; for (i=j ; i<=min( m, j+jb-1 ) ; i+=1) { ipiv_1( i ) = j - 1 + ipiv_1( i ); } /** * Apply interchanges to columns 1:J-1. **/ dlaswp( j-1, a, lda, j, j+jb-1, ipiv, 1 ); if( j+jb<=n ) { /** * Apply interchanges to columns J+JB:N. **/ dlaswp( n-j-jb+1, &a_2( 1, j+jb ), lda, j, j+jb-1, ipiv, 1 ); /** * Compute block row of U. **/ cblas_dtrsm(CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit, jb, n-j-jb+1, one, &a_2( j, j ), lda, &a_2( j, j+jb ), lda ); if( j+jb<=m ) { /** * Update trailing submatrix. **/ cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m-j-jb+1, n-j-jb+1, jb, -one, &a_2( j+jb, j ), lda, &a_2( j, j+jb ), lda, one, &a_2( j+jb, j+jb ), lda ); } } } } return; /** * End of DGETRF **/ }
/* * Use DIIS to help SCF */ void calculateSCFDIIS(molecule_t *molecule) { #define EPS 0.0000000000001 #define DEL 0.0000000000001 double **fs[6], **es[6], **b, *c; int **piv; hamiltonian(molecule); sqrtMolecule(molecule); int n = molecule->orbitals; //So that the same thing does not need to be typed repeatedly. int count = 0; double elec, energy = 0, elast, rms; double **f0, **f1, **f2, **c0, **c1, **d0, **d1, **work1, **work2, **work3, **ham, **shalf, **s; double **sort; f0 = calloc_contiguous(2, sizeof(double), n, n); f1 = calloc_contiguous(2, sizeof(double), n, n); f2 = calloc_contiguous(2, sizeof(double), n, n); c0 = calloc_contiguous(2, sizeof(double), n, n); c1 = calloc_contiguous(2, sizeof(double), n, n); d0 = calloc_contiguous(2, sizeof(double), n, n); d1 = calloc_contiguous(2, sizeof(double), n, n); work1 = calloc_contiguous(2, sizeof(double), n, n); work2 = calloc_contiguous(2, sizeof(double), n, n); work3 = calloc_contiguous(2, sizeof(double), n, n); ham = calloc_contiguous(2, sizeof(double), n, n); shalf = calloc_contiguous(2, sizeof(double), n, n); sort = calloc_contiguous(2, sizeof(double), n, n); b = calloc_contiguous(2, sizeof(double), 7, 7); c = calloc(7, sizeof(double)); s = calloc_contiguous(2, sizeof(double), n, n); piv = calloc_contiguous(2, sizeof(double), 7, 7); for(int i = 0; i < 6; i++) { fs[i] = calloc_contiguous(2, sizeof(double), n, n); es[i] = calloc_contiguous(2, sizeof(double), n, n); } for(int i = 0; i < n; i++) { for(int j = 0; j < n; j++) { s[i][j] = molecule->overlap[i][j]; shalf[i][j] = molecule->symmetric[i][j]; } } printf("\nElec\t\tEnergy\t\tDiff\t\tRMS\n"); do { elast = energy; if(count == 0) { for(int i = 0; i < n; i++) { for(int j = 0; j < n; j++) { //Find the initial Fock guess. f0[i][j] = ham[i][j] = molecule->hamiltonian[i][j]; } } } else { memcpy(*d1, *d0, n * n * sizeof(double)); for(int i = 0; i < n; i++) { for(int j = 0; j < n; j++) { f0[i][j] = ham[i][j]; for(int k = 0; k < n; k++) { for(int l = 0; l < n; l++) { f0[i][j] += d0[k][l] * (2 * molecule->two_electron[TEI(i, j, k, l)] - molecule->two_electron[TEI(i, k, j, l)]); } } } } } //DIIS extrapolation. memcpy(*(fs[count % 6]), *f0, n * n * sizeof(double)); cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, *s, n, *d0, n, 0, *work1, n); cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, *work1, n, *f0, n, 0, *work2, n); cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, *f0, n, *d0, n, 0, *work1, n); cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, *work1, n, *s, n, 0, *work3, n); for(int i = 0; i < n; i++) { for(int j = 0; j < n; j++) { es[count % 6][i][j] = work3[i][j] - work2[i][j]; } } if(count >= 6) { for(int i = 0; i < ((count > 6)? 6: count); i++) { for(int j = 0; j < ((count > 6)? 6: count); j++) { b[i][j] = 0; for(int k = 0; k < n; k++) { for(int l = 0; l < n; l++) { b[i][j] += es[i][k][l] * es[j][k][l]; } } } } if(count < 6) { for(int i = 0; i < 6; i++) { for(int j = 0; j < 6; j++) { if(i < count && j < count) { continue; } if(i == j) { b[i][j] = 1; } else { b[i][j] = 0; } } } } for(int i = 0; i < 6; i++) { b[6][i] = -1; b[i][6] = -1; c[i] = 0; } b[6][6] = 0; c[6] = -1; LAPACKE_dgesv(LAPACK_ROW_MAJOR, 7, 1, *b, 7, *piv, c, 1); for(int i = 0; i < n; i++) { for(int j = 0; j < n; j++) { f2[i][j] = 0; for(int m = 0; m < 6; m++) { f2[i][j] += c[m] * fs[m][i][j]; } } } cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, n, n, n, 1.0, *shalf, n, *f2, n, 0, *work1, n); cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, *work1, n, *shalf, n, 0, *f1, n); } else { cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, n, n, n, 1.0, *shalf, n, *f0, n, 0, *work1, n); cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, *work1, n, *shalf, n, 0, *f1, n); } memset(work1[0], 0, n * n * sizeof(double)); memset(work2[0], 0, n * n * sizeof(double)); memset(work3[0], 0, n * n * sizeof(double)); LAPACKE_dgeev(LAPACK_ROW_MAJOR, 'N', 'V', n, *f1, n, *work1, *work2, *work3, n, *c1, n); //Prepare for sorting. for(int i = 0; i < n; i++) { for(int j = 0; j < n; j++) { work2[i][j] = c1[i][j]; } } //Sort for(int i = 0; i < n; i++) { sort[i] = work1[0] + i; } qsort(sort, n, sizeof(double *), comparedd); //Sift through data. for(int i = 0; i < n; i++) { unsigned long off = ((unsigned long) sort[i] - (unsigned long) work1[0]); off /= sizeof(double); for(int j = 0; j < n; j++) { c1[j][i] = work2[j][off]; } } cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, *shalf, n, *c1, n, 0, *c0, n); for(int i = 0; i < n; i++) { for(int j = 0; j < n; j++) { d0[i][j] = 0; for(int k = 0; k < molecule->electrons / 2; k++) { d0[i][j] += c0[i][k] * c0[j][k]; } } } elec = 0; for(int i = 0; i < n; i++) { for(int j = 0; j < n; j++) { elec += d0[i][j] * (ham[i][j] + f0[i][j]); } } energy = elec + molecule->enuc; rms = 0; for(int i = 0; i < n; i++) { for(int j = 0; j < n; j++) { rms += (d0[i][j] - d1[i][j]) * (d0[i][j] - d1[i][j]); } } rms = sqrt(rms); count++; printf("%d\t%.15f\t%.15f\t%.15f\t%.15f\n", count, elec, energy, fabs(elast - energy), rms); } while(count < 100 && (fabs(elast - energy) > EPS && rms > DEL)); molecule->scf_energy = energy; for(int i = 0; i < n; i++) { for(int j = 0; j < n; j++) { molecule->density[i][j] = d0[i][j]; molecule->fock[i][j] = f0[i][j]; molecule->molecular_orbitals[i][j] = c0[i][j]; molecule->molecular_eigs[i][j] = ((i == j)? sort[i][0]: 0); } } free_mult_contig(16, c0, c1, d0, d1, f0, f1, f2, ham, shalf, work1, work2, work3, sort, b, c, s); for(int i = 0; i < 6; i++) { free(fs[i]); free(es[i]); } }
// This dMList function creates a list of the dM values giving the probabilities of mutations static PyObject *dMList(PyObject *self, PyObject *args) { // Calling variables are (in order): uts, only_need, n_aa, length, grs, dmlist, residue_to_compute, iwt, brs PyObject *uts, *only_need, *grs, *r_grs_tuple, *gr_diag, *p, *p_inv, *dmlist, *brs, *brz; long n_aa, length, n_aa2, n_uts, residue_to_compute, i_ut, x, y, index, only_need_index, only_need_i, only_need_i_n_aa, index2, iwt, n_aa3, z; double *arr_dmlist, *cp_inv, *cp, *cgr_diag, *cexpd, *cbrz, *cvrz, *cvrz_p_inv; complex double *complex_cp_inv, *complex_cp, *complex_cgr_diag, *complex_cexpd, *complex_naa2_list, *complex_naa_list, *complex_cvrz, *complex_cvrz_p_inv, *complex_cbrz; double ut, exp_utdx, exp_utdy, dx, dy; complex double complex_exp_utdx, complex_exp_utdy, complex_dx, complex_dy; int array_type; #ifdef USE_ACCELERATE_CBLAS complex double complex_one = 1, complex_zero = 0; #else complex double complex_dmxy, complex_v_p_inv_xy; double dmxy, v_p_inv_xy; long yindex, irowcolumn; #endif // Parse the arguments. if (! PyArg_ParseTuple( args, "O!O!llO!O!llO!", &PyList_Type, &uts, &PyList_Type, &only_need, &n_aa, &length, &PyList_Type, &grs, &PyArray_Type, &dmlist, &residue_to_compute, &iwt, &PyList_Type, &brs)) { PyErr_SetString(PyExc_TypeError, "Invalid calling arguments to dMList."); return NULL; } // Error checking on arguments if (length < 1) { // length of the protein PyErr_SetString(PyExc_ValueError, "length is less than one."); return NULL; } if (n_aa < 1) { // number of amino acids. Normally will be 20. PyErr_SetString(PyExc_ValueError, "n_aa is less than one."); return NULL; } if (PyList_GET_SIZE(grs) != length) { // make sure grs is of the same size as length PyErr_SetString(PyExc_ValueError, "grs is not of the same size as length."); return NULL; } n_uts = PyList_GET_SIZE(uts); // number of entries in uts if (n_uts < 1) { // make sure there are entries in uts PyErr_SetString(PyExc_ValueError, "uts has no entries."); return NULL; } if (PyList_GET_SIZE(only_need) != n_uts * length) { // make sure only_need is of correct size PyErr_SetString(PyExc_ValueError, "only_need is of wrong size."); return NULL; } if (! ((residue_to_compute >= 0) && (residue_to_compute < length))) { PyErr_SetString(PyExc_ValueError, "Invalid value for residue_to_compute."); return NULL; } if (! ((iwt >= 0) && (iwt < n_aa))) { PyErr_SetString(PyExc_ValueError, "Invalid value for iwt."); return NULL; } if (PyList_GET_SIZE(brs) != n_aa) { // make sure brs has one entry for each amino acid PyErr_SetString(PyExc_ValueError, "brs is not of length equal to n_aa"); return NULL; } n_aa2 = n_aa * n_aa; // square of the number of amino acids n_aa3 = n_aa2 * n_aa; // cube of the number of amino acids // The results will be returned in a numpy ndarray 'float_' (C type double) array called dmlist. // This array will be of size length * n_uts * n_aa3. arr_dmlist = (double *) PyArray_DATA(dmlist); // this is the data array of dmlist long const sizeof_cexpd = n_aa2 * sizeof(double); long const complex_sizeof_cexpd = n_aa2 * sizeof(complex double); // gr_diag, p, and p_inv are the eigenvalues, left, and right diagonalizing matrices of gr r_grs_tuple = PyList_GET_ITEM(grs, residue_to_compute); gr_diag = PyTuple_GET_ITEM(r_grs_tuple, 0); p = PyTuple_GET_ITEM(r_grs_tuple, 1); p_inv = PyTuple_GET_ITEM(r_grs_tuple, 2); // Now begin filling arr_dmlist with the appropriate values index = 0; only_need_index = residue_to_compute * n_uts; // determine if these arrays are complex double or real doubles array_type = PyArray_TYPE(gr_diag); if (array_type == NPY_DOUBLE) { // array is of doubles, real not complex // Note that these next assignments assume that the arrays are C-style contiguous cp = PyArray_DATA(p); cp_inv = PyArray_DATA(p_inv); cgr_diag = PyArray_DATA(gr_diag); cexpd = (double *) malloc(sizeof_cexpd); // allocate memory cvrz = (double *) malloc(sizeof_cexpd); // allocate memory cvrz_p_inv = (double *) malloc(sizeof_cexpd); // allocate memory for (i_ut = 0; i_ut < n_uts; i_ut++) { // loop over ut values only_need_i = PyInt_AS_LONG(PyList_GET_ITEM(only_need, only_need_index)); // value of only_need only_need_index++; if (only_need_i == -1) { // we don't need to do anything for these entries in dmlist index += n_aa3; } else { // we need to compute at least some entries in dmlist ut = PyFloat_AS_DOUBLE(PyList_GET_ITEM(uts, i_ut)); // ut value // Entries of cexpd are defined by D_xy = (exp(ut d_x) - exp(ut d_y)) / (d_x - d_y) // for x != y, and D_yy = ut exp(d_x ut) index2 = 0; for (x = 0; x < n_aa; x++) { dx = cgr_diag[x]; exp_utdx = exp(ut * dx); for (y = 0; y < n_aa; y++) { if (y == x) { cexpd[index2] = ut * exp_utdx; } else { dy = cgr_diag[y]; exp_utdy = exp(ut * dy); cexpd[index2] = (exp_utdx - exp_utdy) / (dx - dy); } index2++; } } for (z = 0; z < n_aa; z++) { // compute derivative with respect to z if (z == iwt) { // don't compute values with respect to wildtype index += n_aa2; continue; } brz = PyList_GET_ITEM(brs, z); cbrz = PyArray_DATA(brz); // cvrz is the element-by-element product of cbrz and cexpd for (index2 = 0; index2 < n_aa2; index2++) { cvrz[index2] = cbrz[index2] * cexpd[index2]; } #ifdef USE_ACCELERATE_CBLAS // multiply the matrices using cblas_dgemm cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n_aa, n_aa, n_aa, (double) 1.0, cvrz, n_aa, cp_inv, n_aa, (double) 0.0, cvrz_p_inv, n_aa); // multiply cvrz and cp_inv into cvrz_p_inv #else // multiply the matrices in pure C code // multiply cvrz and cp_inv into cvrz_p_inv index2 = 0; for (x = 0; x < n_aa2; x += n_aa) { for (y = 0; y < n_aa; y++) { v_p_inv_xy = 0.0; yindex = y; for (irowcolumn = x; irowcolumn < x + n_aa; irowcolumn++) { v_p_inv_xy += cvrz[irowcolumn] * cp_inv[yindex]; yindex += n_aa; } cvrz_p_inv[index2++] = v_p_inv_xy; } } #endif if (only_need_i == -2) { // we need to compute all of these entries in dmlist #ifdef USE_ACCELERATE_CBLAS cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n_aa, n_aa, n_aa, (double) 1.0, cp, n_aa, cvrz_p_inv, n_aa, (double) 0.0, &arr_dmlist[index], n_aa); // multiply cp and cvrz_p_inv into arr_dmlist[index : index + n_aa2] index += n_aa2; #else // multiply the matrices in pure C code, and fill dmlist with the results // multiply cp and cvrz_p_inv into arr_dmlist[index : index + n_aa2] for (x = 0; x < n_aa2; x += n_aa) { for (y = 0; y < n_aa; y++) { dmxy = 0.0; yindex = y; for (irowcolumn = x; irowcolumn < x + n_aa; irowcolumn++) { dmxy += cp[irowcolumn] * cvrz_p_inv[yindex]; yindex += n_aa; } arr_dmlist[index++] = dmxy; } } #endif } else { // we need to compute entries in dmlist only for x = only_need_i only_need_i_n_aa = only_need_i * n_aa; index += only_need_i_n_aa; #ifdef USE_ACCELERATE_CBLAS // do the matrix vector multiplication using cblas, and put results in dmlist cblas_dgemv(CblasRowMajor, CblasTrans, n_aa, n_aa, (double) 1.0, cvrz_p_inv, n_aa, &cp[only_need_i_n_aa], 1, (double) 0.0, &arr_dmlist[index], 1); index += n_aa2 - only_need_i_n_aa; #else // do the matrix vector multiplication in pure C code, and put results in mlist for (y = 0; y < n_aa; y++) { dmxy = 0.0; yindex = y; for (irowcolumn = only_need_i_n_aa; irowcolumn < only_need_i_n_aa + n_aa; irowcolumn++) { dmxy += cp[irowcolumn] * cvrz_p_inv[yindex]; yindex += n_aa; } arr_dmlist[index++] = dmxy; } index += n_aa2 - only_need_i_n_aa - n_aa; #endif } } } } free(cexpd); free(cvrz); free(cvrz_p_inv); } else if (array_type == NPY_CDOUBLE) { // array is of complex doubles // Note that these next assignments assume that the arrays are C-style contiguous complex_cp = PyArray_DATA(p); complex_cp_inv = PyArray_DATA(p_inv); complex_cgr_diag = PyArray_DATA(gr_diag); complex_cexpd = (complex double *) malloc(complex_sizeof_cexpd); // allocate memory complex_cvrz = (complex double *) malloc(complex_sizeof_cexpd); // allocate memory complex_cvrz_p_inv = (complex double *) malloc(complex_sizeof_cexpd); // allocate memory complex_naa2_list = (complex double *) malloc(n_aa2 * sizeof(complex double)); // allocate memory complex_naa_list = (complex double *) malloc(n_aa * sizeof(complex double)); // allocate memory for (i_ut = 0; i_ut < n_uts; i_ut++) { // loop over ut values only_need_i = PyInt_AS_LONG(PyList_GET_ITEM(only_need, only_need_index)); // value of only_need only_need_index++; if (only_need_i == -1) { // we don't need to do anything for these entries in dmlist index += n_aa3; } else { // we need to compute at least some entries in dmlist ut = PyFloat_AS_DOUBLE(PyList_GET_ITEM(uts, i_ut)); // ut value // Entries of cexpd are defined by D_xy = (exp(ut d_x) - exp(ut d_y)) / (d_x - d_y) // for x != y, and D_yy = ut exp(d_x ut) index2 = 0; for (x = 0; x < n_aa; x++) { complex_dx = complex_cgr_diag[x]; complex_exp_utdx = cexp(ut * complex_dx); for (y = 0; y < n_aa; y++) { if (y == x) { complex_cexpd[index2] = ut * complex_exp_utdx; } else { complex_dy = complex_cgr_diag[y]; complex_exp_utdy = cexp(ut * complex_dy); complex_cexpd[index2] = (complex_exp_utdx - complex_exp_utdy) / (complex_dx - complex_dy); } index2++; } } for (z = 0; z < n_aa; z++) { // compute derivative with respect to z if (z == iwt) { // don't compute values with respect to wildtype index += n_aa2; continue; } brz = PyList_GET_ITEM(brs, z); complex_cbrz = PyArray_DATA(brz); // cvrz is the element-by-element product of cbrz and cexpd for (index2 = 0; index2 < n_aa2; index2++) { complex_cvrz[index2] = complex_cbrz[index2] * complex_cexpd[index2]; } #ifdef USE_ACCELERATE_CBLAS // multiply the matrices using cblas_zgemm cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n_aa, n_aa, n_aa, &complex_one, complex_cvrz, n_aa, complex_cp_inv, n_aa, &complex_zero, complex_cvrz_p_inv, n_aa); // multiply cvrz and cp_inv into cvrz_p_inv #else // multiply the matrices in pure C code // multiply cvrz and cp_inv into cvrz_p_inv index2 = 0; for (x = 0; x < n_aa2; x += n_aa) { for (y = 0; y < n_aa; y++) { complex_v_p_inv_xy = 0.0; yindex = y; for (irowcolumn = x; irowcolumn < x + n_aa; irowcolumn++) { complex_v_p_inv_xy += complex_cvrz[irowcolumn] * complex_cp_inv[yindex]; yindex += n_aa; } complex_cvrz_p_inv[index2++] = complex_v_p_inv_xy; } } #endif if (only_need_i == -2) { // we need to compute all of these entries in dmlist #ifdef USE_ACCELERATE_CBLAS cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n_aa, n_aa, n_aa, &complex_one, complex_cp, n_aa, complex_cvrz_p_inv, n_aa, &complex_zero, complex_naa2_list, n_aa); // multiply cp and cvrz_p_inv into arr_dmlist[index : index + n_aa2] for (index2 = 0; index2 < n_aa2; index2++) { arr_dmlist[index + index2] = creal(complex_naa2_list[index2]); } index += n_aa2; #else // multiply the matrices in pure C code, and fill dmlist with the results // multiply cp and cvrz_p_inv into arr_dmlist[index : index + n_aa2] for (x = 0; x < n_aa2; x += n_aa) { for (y = 0; y < n_aa; y++) { complex_dmxy = 0.0; yindex = y; for (irowcolumn = x; irowcolumn < x + n_aa; irowcolumn++) { complex_dmxy += complex_cp[irowcolumn] * complex_cvrz_p_inv[yindex]; yindex += n_aa; } arr_dmlist[index++] = creal(complex_dmxy); } } #endif } else { // we need to compute entries in dmlist only for x = only_need_i only_need_i_n_aa = only_need_i * n_aa; index += only_need_i_n_aa; #ifdef USE_ACCELERATE_CBLAS // do the matrix vector multiplication using cblas, and put results in dmlist cblas_zgemv(CblasRowMajor, CblasTrans, n_aa, n_aa, &complex_one, complex_cvrz_p_inv, n_aa, &complex_cp[only_need_i_n_aa], 1, &complex_zero, complex_naa_list, 1); for (index2 = 0; index2 < n_aa; index2++) { arr_dmlist[index + index2] = creal(complex_naa_list[index2]); } index += n_aa2 - only_need_i_n_aa; #else // do the matrix vector multiplication in pure C code, and put results in mlist for (y = 0; y < n_aa; y++) { complex_dmxy = 0.0; yindex = y; for (irowcolumn = only_need_i_n_aa; irowcolumn < only_need_i_n_aa + n_aa; irowcolumn++) { complex_dmxy += complex_cp[irowcolumn] * complex_cvrz_p_inv[yindex]; yindex += n_aa; } arr_dmlist[index++] = creal(complex_dmxy); } index += n_aa2 - only_need_i_n_aa - n_aa; #endif } } } } free(complex_cexpd); free(complex_cvrz); free(complex_cvrz_p_inv); free(complex_naa2_list); free(complex_naa_list); } else { // array is of neither real nor complex doubles PyErr_SetString(PyExc_ValueError, "matrices are neither double nor complex doubles."); return NULL; } return PyInt_FromLong((long) 1); }
template <typename fptype> static inline int lapack_gemm(const fptype *src1, size_t src1_step, const fptype *src2, size_t src2_step, fptype alpha, const fptype *src3, size_t src3_step, fptype beta, fptype *dst, size_t dst_step, int a_m, int a_n, int d_n, int flags) { int ldsrc1 = src1_step / sizeof(fptype); int ldsrc2 = src2_step / sizeof(fptype); int ldsrc3 = src3_step / sizeof(fptype); int lddst = dst_step / sizeof(fptype); int c_m, c_n, d_m; CBLAS_TRANSPOSE transA, transB; if(flags & CV_HAL_GEMM_2_T) { transB = CblasTrans; if(flags & CV_HAL_GEMM_1_T ) { d_m = a_n; } else { d_m = a_m; } } else { transB = CblasNoTrans; if(flags & CV_HAL_GEMM_1_T ) { d_m = a_n; } else { d_m = a_m; } } if(flags & CV_HAL_GEMM_3_T) { c_m = d_n; c_n = d_m; } else { c_m = d_m; c_n = d_n; } if(flags & CV_HAL_GEMM_1_T ) { transA = CblasTrans; std::swap(a_n, a_m); } else { transA = CblasNoTrans; } if(src3 != dst && beta != 0.0 && src3_step != 0) { if(flags & CV_HAL_GEMM_3_T) transpose(src3, ldsrc3, dst, lddst, c_m, c_n); else copy_matrix(src3, ldsrc3, dst, lddst, c_m, c_n); } else if (src3 == dst && (flags & CV_HAL_GEMM_3_T)) //actually transposing C in this case done by openCV return CV_HAL_ERROR_NOT_IMPLEMENTED; else if(src3_step == 0 && beta != 0.0) set_value(dst, lddst, (fptype)0.0, d_m, d_n); if(typeid(fptype) == typeid(float)) cblas_sgemm(CblasRowMajor, transA, transB, a_m, d_n, a_n, (float)alpha, (float*)src1, ldsrc1, (float*)src2, ldsrc2, (float)beta, (float*)dst, lddst); else if(typeid(fptype) == typeid(double)) cblas_dgemm(CblasRowMajor, transA, transB, a_m, d_n, a_n, (double)alpha, (double*)src1, ldsrc1, (double*)src2, ldsrc2, (double)beta, (double*)dst, lddst); return CV_HAL_ERROR_OK; }
int main(int argc, char * argv[]) { std::string file; int RDN = 0; char c; while ((c = getopt(argc, argv, "A:R:")) != -1) { switch (c) { case 'A': file = optarg; break; case 'R': RDN = atoi(optarg); break; } } // file= "../../SVM/epsilon_normalized"; // file = "data/dense.svm"; // file = "data/a1a"; // file = "../../SVM/random_dense"; stringstream ss(""); ss << file << "_log"; ofstream logFile; logFile.open(ss.str().c_str()); omp_set_num_threads(1); const int MAXIMUM_THREADS = 64; std::vector<gsl_rng *> rs = randomNumberUtil::inittializeRandomSeeds( MAXIMUM_THREADS); int n = 1000; // this value was used for experiments int m = n * 2; randomNumberUtil::init_random_seeds(rs); //--------------------- run experiment - one can change precission here // string inputFile, int file, int totalFiles, // ProblemData<L, D> & part, bool zeroBased ProblemData<int, double> part; std::vector<double> A; std::vector<double> b; if (RDN == 0) { loadDistributedSparseSVMRowData(file, -1, -1, part, false); m = part.m; n = part.n; // std::vector<double> &A = part.A_csr_values; // std::vector<double> &b = part.b; } else { switch (RDN) { case 1: n = 2048; m = n / 2; break; case 2: n = 2048; m = n * 2; break; case 3: n = 2048; m = n; break; default: break; } A.resize(m * n, 0); for (int i = 0; i < n; i++) { for (int j = 0; j < m; j++) { A[i * m + j] = -1 + 2 * rand() / (0.0 + RAND_MAX); } double norm = cblas_l2_norm(m, &A[m * i], 1); cblas_vector_scale(m, &A[m * i], 1 / norm); } b.resize(n); for (int i = 0; i < b.size(); i++) { b[i] = -1 + 2 * round(rand() / (0.0 + RAND_MAX)); } } std::vector<double> Li(n, 0); std::vector<double> LiSqInv(n, 0); bool dense = (m * n == A.size()); if (dense) { cout << "Input data is dense!!!" << endl; for (int i = 0; i < n; i++) { Li[i] = 0; LiSqInv[i] = 0; for (int j = 0; j < m; j++) { Li[i] += A[i * m + j] * A[i * m + j]; } if (Li[i] > 0) { LiSqInv[i] = 1 / sqrt(Li[i]); } } } else { cout << "Input data is sparse!!! " << part.A_csr_row_ptr.size() << endl; for (int i = 0; i < n; i++) { Li[i] = 0; LiSqInv[i] = 0; for (int j = part.A_csr_row_ptr[i]; j < part.A_csr_row_ptr[i + 1]; j++) { Li[i] += part.A_csr_values[j] * part.A_csr_values[j]; } if (Li[i] > 0) { LiSqInv[i] = 1 / sqrt(Li[i]); } } } cout << "Stage 2" << endl; std::vector<double> x(n); std::vector<double> y(m); for (int i = 0; i < n; i++) { x[i] = rand() / (0.0 + RAND_MAX); } double norm = cblas_l2_norm(n, &x[0], 1); cblas_vector_scale(n, &x[0], 1 / norm); double maxEig = 1; for (int PM = 0; PM < 20; PM++) { if (dense) { for (int j = 0; j < m; j++) { y[j] = 0; for (int i = 0; i < n; i++) { y[j] += A[i * m + j] * LiSqInv[i] * x[i]; } } } else { for (int j = 0; j < m; j++) { y[j] = 0; } for (int i = 0; i < n; i++) { for (int j = part.A_csr_row_ptr[i]; j < part.A_csr_row_ptr[i + 1]; j++) { y[part.A_csr_col_idx[j]] += part.A_csr_values[j] * LiSqInv[i] * x[i]; } } } for (int i = 0; i < n; i++) { x[i] = 0; if (dense) { for (int j = 0; j < m; j++) { x[i] += A[i * m + j] * LiSqInv[i] * y[j]; } } else { for (int j = part.A_csr_row_ptr[i]; j < part.A_csr_row_ptr[i + 1]; j++) { x[i] += part.A_csr_values[j] * LiSqInv[i] * y[part.A_csr_col_idx[j]]; } } } maxEig = cblas_l2_norm(n, &x[0], 1); cout << maxEig << endl; cblas_vector_scale(n, &x[0], 1 / maxEig); } cout << "Max eigenvalue estimated " << endl; x.resize(0); y.resize(0); LiSqInv.resize(0); double lambda = 1 / (n + 0.0); double maxTime = 1000; std::vector<double> Hessian; if (n < 10000) { Hessian.resize(n * n); if (dense) { cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, n, n, m, 1.0, &A[0], m, &A[0], m, 0, &Hessian[0], n); } else { std::vector<double>& vals = part.A_csr_values; std::vector<int> &rowPtr = part.A_csr_row_ptr; std::vector<int> &colIdx = part.A_csr_col_idx; for (int row = 0; row < n; row++) { for (int col = row; col < n; col++) { double tmp = 0; int id1 = rowPtr[row]; int id2 = rowPtr[col]; while (id1 < rowPtr[row + 1] && id2 < rowPtr[col + 1]) { if (colIdx[id1] == colIdx[id2]) { tmp += vals[id1] * vals[id2]; id1++; id2++; } else if (colIdx[id1] < colIdx[id2]) { id1++; } else { id2++; } } Hessian[row * n + col] = tmp; Hessian[col * n + row] = tmp; } } } } int MAXTAU = n / 2; if (MAXTAU > 1024 * 8) { MAXTAU = 1024 * 8; } for (int tau = 1; tau <= MAXTAU; tau = tau * 2) { // int tau = 1; // omp_set_num_threads(tau); double sigma = 1 + (tau - 1) * (maxEig - 1) / (n - 1.0); if (dense) { randomNumberUtil::init_random_seeds(rs); runPCDMExperiment(m, n, A, b, lambda, tau, logFile, Li, sigma, maxTime); randomNumberUtil::init_random_seeds(rs); runCDNExperiment(m, n, A, b, lambda, tau, logFile, 1, maxTime, Hessian); randomNumberUtil::init_random_seeds(rs); runCDNExperiment(m, n, A, b, lambda, tau, logFile, 2, maxTime, Hessian); } else { randomNumberUtil::init_random_seeds(rs); runPCDMExperimentSparse(m, n, part, part.b, lambda, tau, logFile, Li, sigma, maxTime); randomNumberUtil::init_random_seeds(rs); runCDNExperimentSparse(m, n, part, part.b, lambda, tau, logFile, 1, maxTime, Hessian); randomNumberUtil::init_random_seeds(rs); runCDNExperimentSparse(m, n, part, part.b, lambda, tau, logFile, 2, maxTime, Hessian); } } logFile.close(); // histogramLogFile.close(); // experimentLogFile.close(); return 0; }
// This MList function creates a list of the M values giving the probabilities of mutations static PyObject *MList(PyObject *self, PyObject *args) { // Calling variables are (in order): uts, only_need, n_aa, length, grs, mlist, residue_to_compute PyObject *uts, *only_need, *grs, *r_grs_tuple, *gr_diag, *p, *p_inv, *mlist; long n_aa, length, n_aa2, n_uts, residue_to_compute, i_ut, y, index, irowcolumn, only_need_index, only_need_i, only_need_i_n_aa; double *arr_mlist, *cp_inv, *cp, *cgr_diag, *cp_inv_exp; complex double *complex_cp_inv, *complex_cp, *complex_cgr_diag, *complex_cp_inv_exp, *complex_naa2_list, *complex_naa_list; double ut, exp_ut; complex double complex_exp_ut; int array_type; #ifdef USE_ACCELERATE_CBLAS complex double complex_one = 1, complex_zero = 0; long index2; #else complex double complex_mxy; double mxy; long x, yindex; #endif // Parse the arguments. if (! PyArg_ParseTuple( args, "O!O!llO!O!l", &PyList_Type, &uts, &PyList_Type, &only_need, &n_aa, &length, &PyList_Type, &grs, &PyArray_Type, &mlist, &residue_to_compute)) { PyErr_SetString(PyExc_TypeError, "Invalid calling arguments to MList."); return NULL; } // Error checking on arguments if (length < 1) { // length of the protein PyErr_SetString(PyExc_ValueError, "length is less than one."); return NULL; } if (n_aa < 1) { // number of amino acids. Normally will be 20. PyErr_SetString(PyExc_ValueError, "n_aa is less than one."); return NULL; } if (PyList_GET_SIZE(grs) != length) { // make sure grs is of the same size as length // PyErr_SetString(PyExc_ValueError, "grs is not of the same size as length."); char errstring[200]; sprintf(errstring, "grs is not of the same size as length: %ld, %ld", PyList_GET_SIZE(grs), length); PyErr_SetString(PyExc_ValueError, errstring); return NULL; } n_uts = PyList_GET_SIZE(uts); // number of entries in uts if (n_uts < 1) { // make sure there are entries in uts PyErr_SetString(PyExc_ValueError, "uts has no entries."); return NULL; } if (PyList_GET_SIZE(only_need) != n_uts * length) { // make sure only_need is of correct size PyErr_SetString(PyExc_ValueError, "only_need is of wrong size."); return NULL; } if (! ((residue_to_compute >= 0) && (residue_to_compute < length))) { char errstring[200]; sprintf(errstring, "Invalid value for residue_to_compute: %ld, %ld", residue_to_compute, length); PyErr_SetString(PyExc_ValueError, errstring); return NULL; } n_aa2 = n_aa * n_aa; // square of the number of amino acids // The results will be returned in a numpy ndarray 'float_' (C type double) array called mlist. // This array will be of size length * n_uts * n_aa2. arr_mlist = (double *) PyArray_DATA(mlist); // this is the data array of mlist long const sizeof_cp_inv = n_aa2 * sizeof(double); long const complex_sizeof_cp_inv = n_aa2 * sizeof(complex double); // Now begin filling arr_mlist with the appropriate values index = 0; only_need_index = residue_to_compute * n_uts; r_grs_tuple = PyList_GET_ITEM(grs, residue_to_compute); // gr_diag, p, and p_inv are the eigenvalues, left, and right diagonalizing matrices of gr gr_diag = PyTuple_GET_ITEM(r_grs_tuple, 0); p = PyTuple_GET_ITEM(r_grs_tuple, 1); p_inv = PyTuple_GET_ITEM(r_grs_tuple, 2); // determine if these arrays are complex double or real doubles array_type = PyArray_TYPE(gr_diag); if (array_type == NPY_DOUBLE) { // array is of doubles, real not complex cp_inv_exp = (double *) malloc(sizeof_cp_inv); // allocate memory // Note that these next assignments assume that the arrays are C-style contiguous cp = PyArray_DATA(p); cp_inv = PyArray_DATA(p_inv); cgr_diag = PyArray_DATA(gr_diag); for (i_ut = 0; i_ut < n_uts; i_ut++) { // loop over ut values only_need_i = PyInt_AS_LONG(PyList_GET_ITEM(only_need, only_need_index)); // value of only_need only_need_index++; if (only_need_i == -1) { // we don't need to do anything for these entries in mlist index += n_aa2; } else { // we need to compute at least some entries in mlist ut = PyFloat_AS_DOUBLE(PyList_GET_ITEM(uts, i_ut)); // ut value for (irowcolumn = 0; irowcolumn < n_aa; irowcolumn++) { exp_ut = exp(ut * cgr_diag[irowcolumn]); for (y = irowcolumn * n_aa; y < (irowcolumn + 1) * n_aa; y++) { cp_inv_exp[y] = cp_inv[y] * exp_ut; } } if (only_need_i == -2) { // we need to compute all of these entries in mlist #ifdef USE_ACCELERATE_CBLAS // multiply the matrices using cblas_dgemm, and fill mlist with the results cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n_aa, n_aa, n_aa, (double) 1.0, cp, n_aa, cp_inv_exp, n_aa, (double) 0.0, &arr_mlist[index], n_aa); for (index2 = index; index2 < index + n_aa2; index2++) { arr_mlist[index2] = fabs(arr_mlist[index2]); } index += n_aa2; #else // multiply the matrices in pure C code, and fill mlist with the results for (x = 0; x < n_aa2; x += n_aa) { for (y = 0; y < n_aa; y++) { mxy = 0.0; yindex = y; for (irowcolumn = x; irowcolumn < x + n_aa; irowcolumn++) { mxy += cp[irowcolumn] * cp_inv_exp[yindex]; yindex += n_aa; } arr_mlist[index++] = fabs(mxy); } } #endif } else { // we need to compute entries in mlist only for x = only_need_i only_need_i_n_aa = only_need_i * n_aa; index += only_need_i_n_aa; #ifdef USE_ACCELERATE_CBLAS // do the matrix vector multiplication using cblas, and put results in mlist cblas_dgemv(CblasRowMajor, CblasTrans, n_aa, n_aa, (double) 1.0, cp_inv_exp, n_aa, &cp[only_need_i_n_aa], 1, (double) 0.0, &arr_mlist[index], 1); for (index2 = index; index2 < index + n_aa2 - only_need_i_n_aa; index2++) { arr_mlist[index2] = fabs(arr_mlist[index2]); } index += n_aa2 - only_need_i_n_aa; #else // do the matrix vector multiplication in pure C code, and put results in mlist for (y = 0; y < n_aa; y++) { mxy = 0.0; yindex = y; for (irowcolumn = only_need_i_n_aa; irowcolumn < only_need_i_n_aa + n_aa; irowcolumn++) { mxy += cp[irowcolumn] * cp_inv_exp[yindex]; yindex += n_aa; } arr_mlist[index++] = fabs(mxy); } index += n_aa2 - only_need_i_n_aa - n_aa; #endif } } } free(cp_inv_exp); } else if (array_type == NPY_CDOUBLE) { // array is complex doubles complex_cp_inv_exp = (complex double *) malloc(complex_sizeof_cp_inv); // allocate memory complex_naa2_list = (complex double *) malloc(n_aa2 * sizeof(complex double)); // allocate memory complex_naa_list = (complex double *) malloc(n_aa * sizeof(complex double)); // allocate memory // Note that these next assignments assume that the arrays are C-style contiguous complex_cp = PyArray_DATA(p); complex_cp_inv = PyArray_DATA(p_inv); complex_cgr_diag = PyArray_DATA(gr_diag); for (i_ut = 0; i_ut < n_uts; i_ut++) { // loop over ut values only_need_i = PyInt_AS_LONG(PyList_GET_ITEM(only_need, only_need_index)); // value of only_need only_need_index++; if (only_need_i == -1) { // we don't need to do anything for these entries in mlist index += n_aa2; } else { // we need to compute at least some entries in mlist ut = PyFloat_AS_DOUBLE(PyList_GET_ITEM(uts, i_ut)); // ut value for (irowcolumn = 0; irowcolumn < n_aa; irowcolumn++) { complex_exp_ut = cexp(ut * complex_cgr_diag[irowcolumn]); for (y = irowcolumn * n_aa; y < (irowcolumn + 1) * n_aa; y++) { complex_cp_inv_exp[y] = complex_cp_inv[y] * complex_exp_ut; } } if (only_need_i == -2) { // we need to compute all of these entries in mlist #ifdef USE_ACCELERATE_CBLAS // multiply the matrices using cblas_zgemm, and fill mlist with the results cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n_aa, n_aa, n_aa, &complex_one, complex_cp, n_aa, complex_cp_inv_exp, n_aa, &complex_zero, complex_naa2_list, n_aa); for (index2 = 0; index2 < n_aa2; index2++) { arr_mlist[index + index2] = cabs(complex_naa2_list[index2]); } index += n_aa2; #else // multiply the matrices in pure C code, and fill mlist with the results for (x = 0; x < n_aa2; x += n_aa) { for (y = 0; y < n_aa; y++) { complex_mxy = (complex double) 0.0; yindex = y; for (irowcolumn = x; irowcolumn < x + n_aa; irowcolumn++) { complex_mxy += complex_cp[irowcolumn] * complex_cp_inv_exp[yindex]; yindex += n_aa; } arr_mlist[index++] = cabs(complex_mxy); } } #endif } else { // we need to compute entries in mlist only for x = only_need_i only_need_i_n_aa = only_need_i * n_aa; index += only_need_i_n_aa; #ifdef USE_ACCELERATE_CBLAS // do the matrix vector multiplication using cblas, and put results in mlist cblas_zgemv(CblasRowMajor, CblasTrans, n_aa, n_aa, &complex_one, complex_cp_inv_exp, n_aa, &complex_cp[only_need_i_n_aa], 1, &complex_zero, complex_naa_list, 1); for (index2 = 0; index2 < n_aa; index2++) { arr_mlist[index + index2] = cabs(complex_naa_list[index2]); } index += n_aa2 - only_need_i_n_aa; #else // do the matrix vector multiplication in pure C code, and put results in mlist for (y = 0; y < n_aa; y++) { complex_mxy = (complex double) 0.0; yindex = y; for (irowcolumn = only_need_i_n_aa; irowcolumn < only_need_i_n_aa + n_aa; irowcolumn++) { complex_mxy += complex_cp[irowcolumn] * complex_cp_inv_exp[yindex]; yindex += n_aa; } arr_mlist[index++] = cabs(complex_mxy); } index += n_aa2 - only_need_i_n_aa - n_aa; #endif } } } free(complex_cp_inv_exp); free(complex_naa2_list); free(complex_naa_list); } else { // array is of neither real nor complex doubles PyErr_SetString(PyExc_ValueError, "gr entries are neither double nor complex doubles."); return NULL; } return PyInt_FromLong((long) 1); }
static inline int CORE_dpamm_a2(PLASMA_enum side, PLASMA_enum trans, PLASMA_enum uplo, int M, int N, int K, int L, int vi2, int vi3, double *A2, int LDA2, const double *V, int LDV, double *W, int LDW) { /* * A2 = A2 + op(V) * W or A2 = A2 + W * op(V) */ int j; static double zone = 1.0; static double mzone = -1.0; if (side == PlasmaLeft) { if (((trans == PlasmaTrans) && (uplo == CblasUpper)) || ((trans == PlasmaNoTrans) && (uplo == CblasLower))) { printf("Left Upper/ConjTrans & Lower/NoTrans not implemented yet\n"); return PLASMA_ERR_NOT_SUPPORTED; } else { //trans /* * A2 = A2 - V * W */ /* A2_1 = A2_1 - V_1 * W_1 */ if (M > L) { cblas_dgemm( CblasColMajor, (CBLAS_TRANSPOSE)trans, CblasNoTrans, M-L, N, L, (mzone), V, LDV, W, LDW, (zone), A2, LDA2); } /* W_1 = V_2 * W_1 */ cblas_dtrmm( CblasColMajor, CblasLeft, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)trans, CblasNonUnit, L, N, (zone), &V[vi2], LDV, W, LDW); /* A2_2 = A2_2 - W_1 */ for(j = 0; j < N; j++) { cblas_daxpy( L, (mzone), &W[LDW*j], 1, &A2[LDA2*j+(M-L)], 1); } /* A2 = A2 - V_3 * W_2 */ if (K > L) { cblas_dgemm( CblasColMajor, (CBLAS_TRANSPOSE)trans, CblasNoTrans, M, N, (K-L), (mzone), &V[vi3], LDV, &W[L], LDW, (zone), A2, LDA2); } } } else { //side right if (((trans == PlasmaTrans) && (uplo == CblasUpper)) || ((trans == PlasmaNoTrans) && (uplo == CblasLower))) { /* * A2 = A2 - W * V' */ /* A2 = A2 - W_2 * V_3' */ if (K > L) { cblas_dgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans, M, N, K-L, (mzone), &W[LDW*L], LDW, &V[vi3], LDV, (zone), A2, LDA2); } /* A2_1 = A2_1 - W_1 * V_1' */ if (N > L) { cblas_dgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans, M, N-L, L, (mzone), W, LDW, V, LDV, (zone), A2, LDA2); } /* A2_2 = A2_2 - W_1 * V_2' */ if (L > 0) { cblas_dtrmm( CblasColMajor, CblasRight, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)trans, CblasNonUnit, M, L, (mzone), &V[vi2], LDV, W, LDW); for (j = 0; j < L; j++) { cblas_daxpy( M, (zone), &W[LDW*j], 1, &A2[LDA2*(N-L+j)], 1); } } } else { printf("Right Upper/NoTrans & Lower/ConjTrans not implemented yet\n"); return PLASMA_ERR_NOT_SUPPORTED; } } return PLASMA_SUCCESS; }
static inline int CORE_dpamm_w(PLASMA_enum side, PLASMA_enum trans, PLASMA_enum uplo, int M, int N, int K, int L, int vi2, int vi3, const double *A1, int LDA1, double *A2, int LDA2, const double *V, int LDV, double *W, int LDW) { /* * W = A1 + op(V) * A2 or W = A1 + A2 * op(V) */ int j; static double zone = 1.0; static double zzero = 0.0; if (side == PlasmaLeft) { if (((trans == PlasmaTrans) && (uplo == CblasUpper)) || ((trans == PlasmaNoTrans) && (uplo == CblasLower))) { /* * W = A1 + V' * A2 */ /* W = A2_2 */ LAPACKE_dlacpy_work(LAPACK_COL_MAJOR, lapack_const(PlasmaUpperLower), L, N, &A2[K-L], LDA2, W, LDW); /* W = V_2' * W + V_1' * A2_1 (ge+tr, top L rows of V') */ if (L > 0) { /* W = V_2' * W */ cblas_dtrmm( CblasColMajor, CblasLeft, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)trans, CblasNonUnit, L, N, (zone), &V[vi2], LDV, W, LDW); /* W = W + V_1' * A2_1 */ if (K > L) { cblas_dgemm( CblasColMajor, (CBLAS_TRANSPOSE)trans, CblasNoTrans, L, N, K-L, (zone), V, LDV, A2, LDA2, (zone), W, LDW); } } /* W_2 = V_3' * A2: (ge, bottom M-L rows of V') */ if (M > L) { cblas_dgemm( CblasColMajor, (CBLAS_TRANSPOSE)trans, CblasNoTrans, (M-L), N, K, (zone), &V[vi3], LDV, A2, LDA2, (zzero), &W[L], LDW); } /* W = A1 + W */ for(j = 0; j < N; j++) { cblas_daxpy( M, (zone), &A1[LDA1*j], 1, &W[LDW*j], 1); } } else { printf("Left Upper/NoTrans & Lower/ConjTrans not implemented yet\n"); return PLASMA_ERR_NOT_SUPPORTED; } } else { //side right if (((trans == PlasmaTrans) && (uplo == CblasUpper)) || ((trans == PlasmaNoTrans) && (uplo == CblasLower))) { printf("Right Upper/ConjTrans & Lower/NoTrans not implemented yet\n"); return PLASMA_ERR_NOT_SUPPORTED; } else { /* * W = A1 + A2 * V */ if (L > 0) { /* W = A2_2 */ LAPACKE_dlacpy_work(LAPACK_COL_MAJOR, lapack_const(PlasmaUpperLower), M, L, &A2[LDA2*(K-L)], LDA2, W, LDW); /* W = W * V_2 --> W = A2_2 * V_2 */ cblas_dtrmm( CblasColMajor, CblasRight, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)trans, CblasNonUnit, M, L, (zone), &V[vi2], LDV, W, LDW); /* W = W + A2_1 * V_1 */ if (K > L) { cblas_dgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans, M, L, K-L, (zone), A2, LDA2, V, LDV, (zone), W, LDW); } } /* W = W + A2 * V_3 */ if (N > L) { cblas_dgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans, M, N-L, K, (zone), A2, LDA2, &V[vi3], LDV, (zzero), &W[LDW*L], LDW); } /* W = A1 + W */ for (j = 0; j < N; j++) { cblas_daxpy( M, (zone), &A1[LDA1*j], 1, &W[LDW*j], 1); } } } return PLASMA_SUCCESS; }
/* Square matrix-matrix multiplication */ void matrix_multiply(int M, int N, int K, int blockX_len, int blockY_len) { /* Local buffers and Global arrays declaration */ double *a=NULL, *b=NULL, *c=NULL; int dims[NDIMS], ld[NDIMS], chunks[NDIMS]; int lo[NDIMS], hi[NDIMS], cdims[NDIMS]; /* dim of blocks */ int g_a, g_b, g_c, g_cnt, g_cnt2; int offset; double alpha = 1.0, beta=0.0; int count_p = 0, next_p = 0; int count_gac = 0, next_gac = 0; double t1,t2,seconds; ga_nbhdl_t nbh; int count_acc = 0; /* Find local processor ID and the number of processes */ int proc=GA_Nodeid(), nprocs=GA_Nnodes(); if ((M % blockX_len) != 0 || (M % blockY_len) != 0 || (N % blockX_len) != 0 || (N % blockY_len) != 0 || (K % blockX_len) != 0 || (K % blockY_len) != 0) GA_Error("Dimension size M/N/K is not divisible by X/Y block sizes", 101); /* Allocate/Set process local buffers */ a = malloc (blockX_len * blockY_len * sizeof(double)); b = malloc (blockX_len * blockY_len * sizeof(double)); c = malloc (blockX_len * blockY_len * sizeof(double)); cdims[0] = blockX_len; cdims[1] = blockY_len; /* Configure array dimensions */ for(int i = 0; i < NDIMS; i++) { dims[i] = N; chunks[i] = -1; ld[i] = cdims[i]; /* leading dimension/stride of the local buffer */ } /* create a global array g_a and duplicate it to get g_b and g_c*/ g_a = NGA_Create(C_DBL, NDIMS, dims, "array A", chunks); if (!g_a) GA_Error("NGA_Create failed: A", NDIMS); #if DEBUG>1 if (proc == 0) printf(" Created Array A\n"); #endif /* Ditto for C and B */ g_b = GA_Duplicate(g_a, "array B"); g_c = GA_Duplicate(g_a, "array C"); if (!g_b || !g_c) GA_Error("GA_Duplicate failed",NDIMS); if (proc == 0) printf("Created Arrays B and C\n"); /* Subscript array for read-incr, which is nothing but proc */ int * rdcnt = malloc (nprocs * sizeof(int)); memset (rdcnt, 0, nprocs * sizeof(int)); int * rdcnt2 = malloc (nprocs * sizeof(int)); memset (rdcnt2, 0, nprocs * sizeof(int)); /* Create global array of nprocs elements for nxtval */ int counter_dim[1]; counter_dim[0] = nprocs; g_cnt = NGA_Create(C_INT, 1, counter_dim, "Shared counter", NULL); if (!g_cnt) GA_Error("Shared counter failed",1); g_cnt2 = GA_Duplicate(g_cnt, "another shared counter"); if (!g_cnt2) GA_Error("Another shared counter failed",1); GA_Zero(g_cnt); GA_Zero(g_cnt2); #if DEBUG>1 /* initialize data in matrices a and b */ if(proc == 0) printf("Initializing local buffers - a and b\n"); #endif int w = 0; int l = 7; for(int i = 0; i < cdims[0]; i++) { for(int j = 0; j < cdims[1]; j++) { a[i*cdims[1] + j] = (double)(++w%29); b[i*cdims[1] + j] = (double)(++l%37); } } /* Copy data to global arrays g_a and g_b from local buffers */ next_p = NGA_Read_inc(g_cnt2,&rdcnt[proc],(long)1); for (int i = 0; i < N; i+=cdims[0]) { if (next_p == count_p) { for (int j = 0; j < N; j+=cdims[1]) { /* Indices of patch */ lo[0] = i; lo[1] = j; hi[0] = lo[0] + cdims[0]; hi[1] = lo[1] + cdims[1]; hi[0] = hi[0]-1; hi[1] = hi[1]-1; #if DEBUG>1 printf ("%d: PUT_GA_A_B: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]); #endif NGA_Put(g_a, lo, hi, a, ld); NGA_Put(g_b, lo, hi, b, ld); } next_p = NGA_Read_inc(g_cnt2,&rdcnt[proc],(long)1); } count_p++; } #if DEBUG>1 printf ("After NGA_PUT to global - A and B arrays\n"); #endif /* Synchronize all processors to make sure puts from nprocs has finished before proceeding with dgemm */ GA_Sync(); t1 = GA_Wtime(); next_gac = NGA_Read_inc(g_cnt,&rdcnt2[proc],(long)1); for (int m = 0; m < N; m+=cdims[0]) { for (int k = 0; k < N; k+=cdims[0]) { if (next_gac == count_gac) { /* A = m x k */ lo[0] = m; lo[1] = k; hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1]; hi[0] = hi[0]-1; hi[1] = hi[1]-1; #if DEBUG>3 printf ("%d: GET GA_A: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]); #endif NGA_Get(g_a, lo, hi, a, ld); for (int n = 0; n < N; n+=cdims[1]) { memset (c, 0, sizeof(double) * cdims[0] * cdims[1]); /* B = k x n */ lo[0] = k; lo[1] = n; hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1]; hi[0] = hi[0]-1; hi[1] = hi[1]-1; #if DEBUG>3 printf ("%d: GET_GA_B: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]); #endif NGA_Get(g_b, lo, hi, b, ld); //_my_dgemm_ (a, local_N, b, local_N, c, local_N, local_N, local_N, local_N, alpha, beta=1.0); /* TODO I am assuming square matrix blocks, further testing/work required for rectangular matrices */ cblas_dgemm ( CblasRowMajor, CblasNoTrans, /* TransA */CblasNoTrans, /* TransB */ cdims[0] /* M */, cdims[1] /* N */, cdims[0] /* K */, alpha, a, cdims[0], /* lda */ b, cdims[1], /* ldb */ beta=1.0, c, cdims[0] /* ldc */); NGA_NbWait(&nbh); /* C = m x n */ lo[0] = m; lo[1] = n; hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1]; hi[0] = hi[0]-1; hi[1] = hi[1]-1; #if DEBUG>3 printf ("%d: ACC_GA_C: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]); #endif NGA_NbAcc(g_c, lo, hi, c, ld, &alpha, &nbh); count_acc += 1; } /* END LOOP N */ next_gac = NGA_Read_inc(g_cnt,&rdcnt2[proc],(long)1); } /* ENDIF if count == next */ count_gac++; } /* END LOOP K */ } /* END LOOP M */ GA_Sync(); t2 = GA_Wtime(); seconds = t2 - t1; if (proc == 0) printf("Time taken for MM (secs):%lf \n", seconds); printf("Number of ACC: %d\n", count_acc); /* Correctness test - modify data again before this function */ for (int i = 0; i < NDIMS; i++) { lo[i] = 0; hi[i] = dims[i]-1; ld[i] = dims[i]; } verify(g_a, g_b, g_c, lo, hi, ld, N); /* Clear local buffers */ free(a); free(b); free(c); free(rdcnt); free(rdcnt2); GA_Sync(); /* Deallocate arrays */ GA_Destroy(g_a); GA_Destroy(g_b); GA_Destroy(g_c); GA_Destroy(g_cnt); GA_Destroy(g_cnt2); }
int main(int argc, char* argv[]) { char dummy[L2_CACHE_SIZE]; // Tests de performances de ddot int size = 50; blas_t *matriceD, *matriceE; alloc_vecteur(&matriceD, size); alloc_vecteur(&matriceE, size); printf("Tests de performance de la fonction ddot\n"); perf_t *t1, *t2,*t3, *t4,*t5, *t6,*t7, *t8, *t9, *t10; t1 = malloc(sizeof(perf_t)); t2 = malloc(sizeof(perf_t)); t3 = malloc(sizeof(perf_t)); t4 = malloc(sizeof(perf_t)); t5 = malloc(sizeof(perf_t)); t6 = malloc(sizeof(perf_t)); t7 = malloc(sizeof(perf_t)); t8 = malloc(sizeof(perf_t)); t9 = malloc(sizeof(perf_t)); t10 = malloc(sizeof(perf_t)); double mflops, mflops1,mflops2,mflops3,mflops4, mflops5; char command[200]; system("rm results/ddot_perf.txt"); for(size = 50; size < 100000000; size += size/4) { printf("M: %d ", size); if(size != 50) { free(matriceD); free(matriceE); alloc_vecteur(&matriceD, size); alloc_vecteur(&matriceE, size); } memset(dummy, 0, sizeof(dummy)); perf(t1); blas_t res = cblas_ddot(size, matriceD, 1, matriceE, 1); perf(t2); perf_diff(t1, t2); mflops = perf_mflops(t2, 2 * size); printf("Mflops/s: %le\n", mflops); sprintf(command, "echo %d %lf >> results/ddot_perf.txt", size, mflops); system(command); } // Test de performance dgemm ////////////////////////////////////////// long m = 100; blas_t *matriceA, *matriceB, *matriceC; alloc_matrice(&matriceA, m, m); alloc_matrice(&matriceB, m, m); matriceC = calloc(m*m,sizeof(blas_t)); system("rm results/dgemm_perf.txt"); for(; m< 1000; m+=20) { printf("M: %d ", m); if(m != 100) { free(matriceA); free(matriceB); free(matriceC); alloc_matrice(&matriceA, m, m); alloc_matrice(&matriceB, m, m); alloc_matrice(&matriceC, m, m); } memset(dummy, 0, sizeof(dummy)); perf(t1); cblas_dgemm_scalaire( CblasNoTrans, CblasNoTrans ,m, m, m, 1, matriceA, m, matriceB, m, 1, matriceC, m); perf(t2); perf_diff(t1, t2); mflops1 = perf_mflops(t2, m * m * m * 3 + m * m ); perf(t3); cblas_dgemm_scalaire1(matriceC, m, matriceA, m, matriceB, m, m); perf(t4); perf_diff(t3, t4); mflops2 = perf_mflops(t4, m * m * m * 3); perf(t5); cblas_dgemm_scalaire2(matriceC, m, matriceA, m, matriceB, m, m); perf(t6); perf_diff(t5, t6); mflops3 = perf_mflops(t6, m * m * m * 3); perf(t7); cblas_dgemm_scalaire3(matriceC, m, matriceA, m, matriceB, m, m); perf(t8); perf_diff(t7, t8); mflops4 = perf_mflops(t8, m * m * m * 3); perf(t9); cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, m, m,m, 1, matriceA, m, matriceB, m, 1, matriceC, m); perf(t10); perf_diff(t9, t10); mflops5 = perf_mflops(t10, m * m * m * 3); sprintf(command, "echo %d %lf %lf %lf %lf %lf >> results/dgemm_perf.txt", m * m, mflops1, mflops2, mflops3, mflops4, mflops5); system(command); printf("Mflops/s : %d %lf %lf %lf %lf %lf\n", m * m, mflops1, mflops2, mflops3, mflops4, mflops5 ); } free(matriceA); free(matriceB); free(matriceC); free(matriceD); free(matriceE); return EXIT_SUCCESS; }
static inline void compute_gemm_blas3(CMDOptions * options, double * C, double * A, double * B) { cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, options->m, options->n, options->k, 1.0, A, options->k, B, options->n, 1.0, C, options->n); }
int sscf (basis_set_t *basis, erd_t *erd_inp, double *H, double * S, double *S_sinv, int n, int n_ele, int maxit, int diis_lim, double *D_old, double *D_new, double *F) { double *int_buffer; double *tmp; double *tmp2; double *F_tt; double *F_t; double *D_t; double *delta_D; double err; int conv = 0; int iter = 0; double trace; double s; double c; double lambda; int i; int max_funcs; int max_buffer_dim; max_funcs = 2 * basis->max_momentum + 1; max_buffer_dim = max_funcs * max_funcs * max_funcs * max_funcs; int_buffer = (double *)malloc (max_buffer_dim * sizeof(double)); tmp = (double *)malloc (n * n * sizeof(double)); F_t = (double *)malloc (n * n * sizeof(double)); D_t = (double *)malloc (n * n * sizeof(double)); delta_D = (double *)malloc (n * n * sizeof(double)); tmp2 = (double *)malloc (n * n * sizeof(double)); F_tt = (double *)malloc (n * n * sizeof(double)); memset (int_buffer, 0, max_buffer_dim * sizeof(double)); memset (tmp, 0, n * n * sizeof(double)); memset (F_t, 0, n * n * sizeof(double)); memset (D_old, 0, n * n * sizeof(double)); memset (D_new, 0, n * n * sizeof(double)); memset (D_t, 0, n * n * sizeof(double)); memset (delta_D, 0, n * n * sizeof(double)); memset (tmp2, 0, n * n * sizeof(double)); memset (F_tt, 0, n * n * sizeof(double)); memcpy (F, H, n * n * sizeof(double)); memcpy (F_t, H, n * n * sizeof(double)); do { #if ODA /*1. D <- Diagonalize(F_t) */ cblas_dgemm (CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, S_sinv, n, F_t, n, 0.0, tmp, n); cblas_dgemm (CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, tmp, n, S_sinv, n, 0.0, F_tt, n); /*Compute D*/ compute_D (n, n_ele, F_tt, D_new); /*Transform D*/ cblas_dgemm (CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, S_sinv, n, D_new, n, 0.0, tmp, n); cblas_dgemm (CblasColMajor, CblasNoTrans, CblasTrans, n, n, n, 1.0, tmp, n, S_sinv, n, 0.0, D_new, n); /*2. conv = Check (D-D') */ /*3. F = Fock (D)*/ memcpy (F, H, n * n * sizeof(double)); build_fock (basis, erd_inp, int_buffer, D_new, F); /* delta_D = D - D_t*/ memset (delta_D, 0, n * n * sizeof(double)); cblas_daxpy (n * n, -1.0, D_t, 1, delta_D, 1); cblas_daxpy (n * n, 1.0, D_new, 1, delta_D, 1); /* s = trace(F_t * delta_D) */ cblas_dgemm (CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, F_t, n, delta_D, n, 0.0, tmp, n); s = compute_trace (tmp, n); /*tmp = F - F_t*/ memset (tmp, 0, n * n * sizeof(double)); cblas_daxpy (n * n, -1.0, F_t, 1, tmp, 1); cblas_daxpy (n * n, 1.0, F, 1, tmp, 1); /* c = trace (tmp * delta_D) */ cblas_dgemm (CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, tmp, n, delta_D, n, 0.0, tmp2, n); c = compute_trace (tmp2, n); /* set lambda */ if (c < -s/2.0) { lambda = 1.0; } else { lambda = -s / (2.0 * c); } memcpy (D_old, D_t, n * n * sizeof (double)); memcpy (F_tt, F_t, n * n * sizeof (double)); /* D_t = (1-lambda) * D_t + lambda * D */ memset (tmp, 0, n * n * sizeof(double)); cblas_daxpy (n * n, (1.0 - lambda), D_t, 1, tmp, 1); cblas_daxpy (n * n, lambda, D_new, 1, tmp, 1); memset (D_t, 0, n * n * sizeof(double)); cblas_daxpy (n * n, 1.0, tmp, 1, D_t, 1); /* F_t = (1-lambda) * F_t + lambda * F */ memset (tmp, 0, n * n * sizeof(double)); cblas_daxpy (n * n, (1.0 - lambda), F_t, 1, tmp, 1); cblas_daxpy (n * n, lambda, F, 1, tmp, 1); memset (F_t, 0, n * n * sizeof(double)); cblas_daxpy (n * n, 1.0, tmp, 1, F_t, 1); /* print energy at each iteration */ err = fabs (calc_hf_ene (D_new, F, H, n) - calc_hf_ene (D_old, F_tt, H, n)); /* fprintf (stderr, "\n iteration ene %d: %lf", iter, calc_hf_ene(D_new, F, H, n)); */ /* fprintf (stderr, "\n iteration %d: %10.6e", iter, err); */ /* fprintf (stderr, "\n lambda %d: %lf",iter, lambda); */ fprintf (stdout, "\n %d, %lf, %10.6e", iter, calc_hf_ene(D_new, F, H, n), err); iter++; #endif #if NORM memcpy (D_old, D_new, n * n * sizeof(double)); /*Build F*/ memcpy (F, H, n * n * sizeof(double)); build_fock (basis, erd_inp, int_buffer, D_new, F); /*Transform F*/ cblas_dgemm (CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, S_sinv, n, F, n, 0.0, tmp, n); cblas_dgemm (CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, tmp, n, S_sinv, n, 0.0, F_t, n); /*Compute D*/ compute_D (n, n_ele, F_t, D_new); /*Transform D*/ cblas_dgemm (CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, S_sinv, n, D_new, n, 0.0, tmp, n); cblas_dgemm (CblasColMajor, CblasNoTrans, CblasTrans, n, n, n, 1.0, tmp, n, S_sinv, n, 0.0, D_new, n); iter++; /*Check energy convergence*/ err = fabs (calc_hf_ene (D_new, F, H, n) - calc_hf_ene (D_old, F, H, n)); /* fprintf (stderr, "\n iteration ene %d: %lf", iter, calc_hf_ene(D_new, F, H, n)); */ /* fprintf (stderr, "\n iteration %d: %10.6e", iter, err); */ fprintf (stdout, "\n %d, %lf, %10.6e", iter, calc_hf_ene(D_new, F, H, n), err); #endif } while ((iter < maxit)); fprintf (stderr, "\n Final Energy: %lf \n", calc_hf_ene (D_new, F, H, n)); /* printmatCM ("Final D", D_new, n, n); */ /* printmatCM ("Final F", F, n, n); */ free (D_t); free (delta_D); free (tmp2); free (tmp); free (F_t); free (F_tt); free (int_buffer); return 0; }
/***********************************************************//** Perform a left-right dmrg sweep \param[in] z - initial guess \param[in] f - specialized function to multiply core by matrix \param[in] args - arguments to f \param[in,out] phil - left multipliers \param[in] psir - right multiplies \param[in] epsilon - splitting tolerance \param[in] opts - approximation options \return na - a new approximation ***************************************************************/ struct FunctionTrain * dmrg_sweep_lr(struct FunctionTrain * z, void (*f)(char,size_t,size_t, double *, struct Qmarray **, void *), void * args, struct QR ** phil, struct QR ** psir, double epsilon, struct MultiApproxOpts * opts) { double * RL = NULL; size_t dim = z->dim; struct FunctionTrain * na = function_train_alloc(dim); struct OneApproxOpts * o = NULL; na->ranks[0] = 1; na->ranks[na->dim] = 1; if (phil[0] == NULL){ struct Qmarray * temp0 = NULL; f('L',0,1,NULL,&temp0,args); /* printf("temp0 size(%zu,%zu) \n",temp0->nrows,temp0->ncols); */ o = multi_approx_opts_get_aopts(opts,0); phil[0] = qr_reduced(temp0,1,o); qmarray_free(temp0); temp0 = NULL; } /* exit(1); */ size_t nrows = phil[0]->mr; size_t nmult = phil[0]->mc; size_t ncols = psir[0]->mc; RL = calloc_double(nrows * ncols); cblas_dgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,nrows,ncols, nmult,1.0,phil[0]->mat,nrows,psir[0]->mat,nmult,0.0,RL,nrows); double * u = NULL; double * vt = NULL; double * s = NULL; /* printf("Left-Right sweep\n"); */ /* printf("(nrows,ncols)=(%zu,%zu), epsilon=%G\n",nrows,ncols,epsilon); */ size_t rank = truncated_svd(nrows,ncols,nrows,RL,&u,&s,&vt,epsilon); /* printf("rank=%zu\n",rank); */ na->ranks[1] = rank; na->cores[0] = qmam(phil[0]->Q,u,rank); size_t ii; for (ii = 1; ii < dim-1; ii++){ /* printf("ii = %zu\n",ii); */ double * newphi = calloc_double(rank * phil[ii-1]->mc); cblas_dgemm(CblasColMajor,CblasTrans,CblasNoTrans,rank,nmult, nrows,1.0,u,nrows,phil[ii-1]->mat,nrows,0.0,newphi,rank); //struct Qmarray * temp = mqma(newphi,y->cores[ii],rank); //struct Qmarray * temp = qmarray_mat_kron(rank,newphi,a->cores[ii],b->cores[ii]); struct Qmarray * temp = NULL; f('L',ii,rank,newphi,&temp,args); qr_free(phil[ii]); phil[ii] = NULL; o = multi_approx_opts_get_aopts(opts,ii); phil[ii] = qr_reduced(temp,1,o); free(RL); RL = NULL; free(newphi); newphi = NULL; free(u); u = NULL; free(vt); vt = NULL; free(s); s = NULL; qmarray_free(temp); temp = NULL; nrows = phil[ii]->mr; nmult = phil[ii]->mc; ncols = psir[ii]->mc; RL = calloc_double(nrows * ncols); cblas_dgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,nrows,ncols, nmult,1.0,phil[ii]->mat,nrows,psir[ii]->mat,nmult,0.0,RL,nrows); /* printf("(nrows,ncols)=(%zu,%zu), epsilon=%G\n",nrows,ncols,epsilon); */ rank = truncated_svd(nrows,ncols,nrows,RL,&u,&s,&vt,epsilon); /* dprint(nrows,s); */ /* printf("rank=%zu\n",rank); */ na->ranks[ii+1] = rank; na->cores[ii] = qmam(phil[ii]->Q,u,rank); } /* exit(1); */ size_t kk,jj; for (kk = 0; kk < ncols; kk++){ for (jj = 0; jj < rank; jj++){ vt[kk*rank+jj] = vt[kk*rank+jj]*s[jj]; } } na->cores[dim-1] = mqma(vt,psir[dim-2]->Q,rank); free(RL); RL = NULL; free(u); u = NULL; free(vt); vt = NULL; free(s); s = NULL; return na; }
int pdgemm(MPI_Comm comm, int p, int q, int bm, int bn, int bk, int gm, int gn, int gk, double *a, double *b, double *c) { int rank, size; MPI_Comm_rank (comm, &rank); MPI_Comm_size (comm, &size); MPI_Comm comm_row, comm_col; CreateGemmCommGroups(p, q, comm, &comm_row, &comm_col); int col_rank; MPI_Comm_rank (comm_col, &col_rank); int pm = bm / p, pn = bn / q, pk = bk / q; int m = gm / bm, n = gn / bn, k = gk / bk; int bi, bj, bl; double *b_buffer = malloc(sizeof(double) * pk * pn * n); if (b_buffer == NULL) { fprintf(stderr, "Failed to malloc memory for arrays\n"); exit(1); } int i, j, l; for ( l=0; l<p*k; l++ ) { for (i=0; i<n; i++ ) { int buffer_offset = (i*pm*pn); int col_J = i*q + rank/p; int row_I = (col_J + l) % (p*k); int bcast_origin_rank = row_I % p; if (col_rank == bcast_origin_rank) { bl = row_I / p; bj = col_J / q; for ( j=0; j<pk*pn; j++ ) b_buffer[buffer_offset + j] = b[(bl + bj * k) * (pk * pn) + j]; } //printf("(%d %d) [%d %d] %d ", row_I, col_J, bl, bj, bcast_origin_rank); MPI_Bcast(b_buffer + buffer_offset, pk*pn, MPI_DOUBLE, bcast_origin_rank, comm_col); } /* printf("Rank %d ", rank); for ( i=0; i< pk*pn*n; i++ ) { printf("%2.2f ", b_buffer[i]); } printf("\n"); */ for ( bi=0; bi<m; bi++ ) { for ( bj=0; bj<n; bj++ ) { int a_offset = (bi + bj * m) * (pm * pk); int b_offset = bj * pk * pn; int c_offset = (bi + bj * m) * (pm * pn); cblas_dgemm (CblasColMajor, CblasNoTrans , CblasNoTrans, pm, pn, pk, 1.0, a + a_offset, pm, b_buffer + b_offset, pk, 1.0, c + c_offset, pm); } } ShiftMPIMatrixLeft(comm_row, gm, gn, bm, bn, p, q, a); } free(b_buffer); return 0; }
/***********************************************************//** Perform a right-left dmrg sweep as part of ft-product \param[in] z - initial guess \param[in] f - specialized function to multiply core by matrix \param[in] args - arguments to f \param[in,out] phil - left multipliers \param[in] psir - right multiplies \param[in] epsilon - splitting tolerance \param[in] opts - approximation options \return na - a new approximation ***************************************************************/ struct FunctionTrain * dmrg_sweep_rl(struct FunctionTrain * z, void (*f)(char,size_t,size_t,double *,struct Qmarray **,void *), void * args, struct QR ** phil, struct QR ** psir, double epsilon, struct MultiApproxOpts * opts) { double * RL = NULL; size_t dim = z->dim; struct FunctionTrain * na = function_train_alloc(dim); na->ranks[0] = 1; na->ranks[na->dim] = 1; struct OneApproxOpts * o = NULL; if (psir[dim-2] == NULL){ struct Qmarray * temp0 = NULL; f('R',dim-1,1,NULL,&temp0,args); //qmarray_kron(a->cores[dim-1],b->cores[dim-1]); o = multi_approx_opts_get_aopts(opts,dim-1); psir[dim-2] = qr_reduced(temp0,0,o); qmarray_free(temp0); temp0 = NULL; } size_t nrows = phil[dim-2]->mr; size_t nmult = phil[dim-2]->mc; size_t ncols = psir[dim-2]->mc; RL = calloc_double(nrows * ncols); cblas_dgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,nrows,ncols, nmult,1.0,phil[dim-2]->mat,nrows,psir[dim-2]->mat,nmult,0.0,RL,nrows); double * u = NULL; double * vt = NULL; double * s = NULL; /* printf("Right-Left sweep\n"); */ /* printf("(nrows,ncols)=(%zu,%zu), epsilon=%G\n",nrows,ncols,epsilon); */ size_t rank = truncated_svd(nrows,ncols,nrows,RL,&u,&s,&vt,epsilon); /* printf("rank=%zu\n",rank); */ na->ranks[dim-1] = rank; na->cores[dim-1] = mqma(vt,psir[dim-2]->Q,rank); int ii; for (ii = dim-3; ii > -1; ii--){ double * newpsi = calloc_double( psir[ii+1]->mr * rank); // cblas_dgemm(CblasColMajor,CblasNoTrans,CblasTrans, psir[ii+1]->mr,rank, psir[ii+1]->mc, 1.0,psir[ii+1]->mat,psir[ii+1]->mr,vt,rank, 0.0,newpsi,psir[ii+1]->mr); struct Qmarray * temp = NULL; // qmarray_kron_mat(rank,newpsi,a->cores[ii+1],b->cores[ii+1]); f('R',(size_t)ii+1,rank,newpsi,&temp,args); qr_free(psir[ii]); psir[ii] = NULL; o = multi_approx_opts_get_aopts(opts,(size_t)ii+1); psir[ii] = qr_reduced(temp,0,o); free(RL); RL = NULL; free(newpsi); newpsi = NULL; free(u); u = NULL; free(vt); vt = NULL; free(s); s = NULL; qmarray_free(temp); temp = NULL; nrows = phil[ii]->mr; nmult = phil[ii]->mc; ncols = psir[ii]->mc; RL = calloc_double(nrows * ncols); cblas_dgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,nrows,ncols, nmult,1.0,phil[ii]->mat,nrows,psir[ii]->mat,nmult,0.0,RL,nrows); /* printf("(nrows,ncols)=(%zu,%zu), epsilon=%G\n",nrows,ncols,epsilon); */ rank = truncated_svd(nrows,ncols,nrows,RL,&u,&s,&vt,epsilon); /* printf("rank=%zu\n",rank); */ na->ranks[ii+1] = rank; na->cores[ii+1] = mqma(vt,psir[ii]->Q,rank); } size_t kk,jj; for (jj = 0; jj < rank; jj++){ for (kk = 0; kk < nrows; kk++){ u[jj*nrows+kk] = u[jj*nrows+kk]*s[jj]; } } na->cores[0] = qmam(phil[0]->Q,u,rank); /* exit(1); */ free(RL); RL = NULL; free(u); u = NULL; free(vt); vt = NULL; free(s); s = NULL; return na; }
int main(int argc, char **argv){ int iter, i,ii,j,jj,k,kk,ig,jg,kg; /* dummies */ int iterations; /* number of times the multiplication is done */ double dgemm_time, /* timing parameters */ avgtime; double checksum = 0.0, /* checksum of result */ ref_checksum; double epsilon = 1.e-8; /* error tolerance */ int nthread_input, /* thread parameters */ nthread; int num_error=0; /* flag that signals that requested and obtained numbers of threads are the same */ static double * RESTRICT A, /* input (A,B) and output (C) matrices */ * RESTRICT B, * RESTRICT C; long order; /* number of rows and columns of matrices */ int block; /* tile size of matrices */ int shortcut; /* true if only doing initialization */ printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("OpenMP Dense matrix-matrix multiplication\n"); #if !MKL if (argc != 4 && argc != 5) { printf("Usage: %s <# threads> <# iterations> <matrix order> [tile size]\n",*argv); #else if (argc != 4) { printf("Usage: %s <# threads> <# iterations> <matrix order>\n",*argv); #endif exit(EXIT_FAILURE); } /* Take number of threads to request from command line */ nthread_input = atoi(*++argv); if ((nthread_input < 1) || (nthread_input > MAX_THREADS)) { printf("ERROR: Invalid number of threads: %d\n", nthread_input); exit(EXIT_FAILURE); } omp_set_num_threads(nthread_input); iterations = atoi(*++argv); if (iterations < 1){ printf("ERROR: Iterations must be positive : %d \n", iterations); exit(EXIT_FAILURE); } order = atol(*++argv); if (order < 0) { shortcut = 1; order = -order; } else shortcut = 0; if (order < 1) { printf("ERROR: Matrix order must be positive: %ld\n", order); exit(EXIT_FAILURE); } A = (double *) prk_malloc(order*order*sizeof(double)); B = (double *) prk_malloc(order*order*sizeof(double)); C = (double *) prk_malloc(order*order*sizeof(double)); if (!A || !B || !C) { printf("ERROR: Could not allocate space for global matrices\n"); exit(EXIT_FAILURE); } ref_checksum = (0.25*forder*forder*forder*(forder-1.0)*(forder-1.0)); #pragma omp parallel for private(i,j) for(j = 0; j < order; j++) for(i = 0; i < order; i++) { A_arr(i,j) = B_arr(i,j) = (double) j; C_arr(i,j) = 0.0; } #if !MKL if (argc == 5) { block = atoi(*++argv); } else block = DEFAULTBLOCK; #pragma omp parallel private (i,j,k,ii,jj,kk,ig,jg,kg,iter) { double * RESTRICT AA, * RESTRICT BB, * RESTRICT CC; if (block > 0) { /* matrix blocks for local temporary copies */ AA = (double *) prk_malloc(block*(block+BOFFSET)*3*sizeof(double)); if (!AA) { num_error = 1; printf("Could not allocate space for matrix tiles on thread %d\n", omp_get_thread_num()); } bail_out(num_error); BB = AA + block*(block+BOFFSET); CC = BB + block*(block+BOFFSET); } #pragma omp master { nthread = omp_get_num_threads(); if (nthread != nthread_input) { num_error = 1; printf("ERROR: number of requested threads %d does not equal ", nthread_input); printf("number of spawned threads %d\n", nthread); } else { printf("Matrix order = %ld\n", order); if (shortcut) printf("Only doing initialization\n"); printf("Number of threads = %d\n", nthread_input); if (block>0) printf("Blocking factor = %d\n", block); else printf("No blocking\n"); printf("Block offset = %d\n", BOFFSET); printf("Number of iterations = %d\n", iterations); printf("Using MKL library = off\n"); } } bail_out(num_error); if (shortcut) exit(EXIT_SUCCESS); for (iter=0; iter<=iterations; iter++) { if (iter==1) { #pragma omp barrier #pragma omp master { dgemm_time = wtime(); } } if (block > 0) { #pragma omp for for(jj = 0; jj < order; jj+=block){ for(kk = 0; kk < order; kk+=block) { for (jg=jj,j=0; jg<MIN(jj+block,order); j++,jg++) for (kg=kk,k=0; kg<MIN(kk+block,order); k++,kg++) BB_arr(j,k) = B_arr(kg,jg); for(ii = 0; ii < order; ii+=block){ for (kg=kk,k=0; kg<MIN(kk+block,order); k++,kg++) for (ig=ii,i=0; ig<MIN(ii+block,order); i++,ig++) AA_arr(i,k) = A_arr(ig,kg); for (jg=jj,j=0; jg<MIN(jj+block,order); j++,jg++) for (ig=ii,i=0; ig<MIN(ii+block,order); i++,ig++) CC_arr(i,j) = 0.0; for (kg=kk,k=0; kg<MIN(kk+block,order); k++,kg++) for (jg=jj,j=0; jg<MIN(jj+block,order); j++,jg++) for (ig=ii,i=0; ig<MIN(ii+block,order); i++,ig++) CC_arr(i,j) += AA_arr(i,k)*BB_arr(j,k); for (jg=jj,j=0; jg<MIN(jj+block,order); j++,jg++) for (ig=ii,i=0; ig<MIN(ii+block,order); i++,ig++) C_arr(ig,jg) += CC_arr(i,j); } } } } else { #pragma omp for for (jg=0; jg<order; jg++) for (kg=0; kg<order; kg++) for (ig=0; ig<order; ig++) C_arr(ig,jg) += A_arr(ig,kg)*B_arr(kg,jg); } } /* end of iterations */ #pragma omp barrier #pragma omp master { dgemm_time = wtime() - dgemm_time; } } /* end of parallel region */ #else printf("Matrix size = %ldx%ld\n", order, order); printf("Number of threads = %d\n", nthread_input); printf("Using MKL library = on\n"); printf("Number of iterations = %d\n", iterations); for (iter=0; iter<=iterations; iter++) { if (iter==1) dgemm_time = wtime(); cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, order, order, order, 1.0, &(A_arr(0,0)), order, &(B_arr(0,0)), order, 1.0, &(C_arr(0,0)), order); } dgemm_time = wtime()-dgemm_time; #endif for(checksum=0.0,j = 0; j < order; j++) for(i = 0; i < order; i++) checksum += C_arr(i,j); /* verification test */ ref_checksum *= (iterations+1); if (ABS((checksum - ref_checksum)/ref_checksum) > epsilon) { printf("ERROR: Checksum = %lf, Reference checksum = %lf\n", checksum, ref_checksum); exit(EXIT_FAILURE); } else { printf("Solution validates\n"); #if VERBOSE printf("Reference checksum = %lf, checksum = %lf\n", ref_checksum, checksum); #endif } double nflops = 2.0*forder*forder*forder; avgtime = dgemm_time/iterations; printf("Rate (MFlops/s): %lf Avg time (s): %lf\n", 1.0E-06 *nflops/avgtime, avgtime); exit(EXIT_SUCCESS); }
int main(int argc, char *argv[]) { double *a, *b, *c, *aa ; unsigned int n ; unsigned i, j, k, iInner, jInner, kInner, blockSize ; struct timespec ts1, ts2, ts3, ts4, ts5, ts6, ts7 ; printf("hello code beginning\n") ; n = MATSIZE ; // default settings blockSize = BLOCKSIZE ; if (argc != 3) { printf("input matrix size and blocksize\n") ; exit(0); } n = atoi(argv[1]) ; blockSize = atoi(argv[2]) ; printf("matrix size %d blocksize %d\n", n,blockSize) ; if (n%blockSize) { printf("for this simple example matrix size must be a multiple of the block size.\n Please re-start \n") ; exit(0); } // allocate matrices a = (double *)calloc((n+blockSize)*(n+blockSize), sizeof(double)) ; b = (double *)calloc((n+blockSize)*(n+blockSize), sizeof(double)) ; c = (double *)calloc((n+blockSize)*(n+blockSize), sizeof(double)) ; aa = (double *)calloc((n+blockSize)*(n+blockSize), sizeof(double)) ; if (aa == NULL) // cheap check only the last allocation checked. { printf("insufficient memory \n") ; exit(0) ; } // fill matrices setmat(n, n, a) ; setmat(n, n, aa) ; srand(1) ; // set random seed (change to go off time stamp to make it better fillmat(n,n,b) ; fillmat(n,n,c) ; current_utc_time(&ts1) ; // multiply matrices abasicmm (n,n,a,b,c) ; current_utc_time(&ts2) ; setmat(n, n, a) ; current_utc_time(&ts3) ; abettermm (n,n,a,b,c) ; current_utc_time(&ts4) ; ablockmm (n, n, aa, b, c, blockSize) ; current_utc_time(&ts5) ; cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, b, n, c, n, 0.0, a, n); current_utc_time(&ts6) ; printf("matrix multplies complete \n") ; fflush(stdout) ; /**/ checkmatmult(n,n,a,aa) ; { double t1, t2, t3, t4, tmp ; t1 = ts2.tv_sec-ts1.tv_sec; tmp = ts2.tv_nsec-ts1.tv_nsec; tmp /= 1.0e+09 ; t1 += tmp ; printf("ijk ordering basic time %lf\n",t1) ; t2 = ts4.tv_sec-ts3.tv_sec; tmp = ts4.tv_nsec-ts3.tv_nsec; tmp /= 1.0e+09 ; t2 += tmp ; printf("ikj ordering bette time %lf\n",t2) ; t3 = ts5.tv_sec-ts4.tv_sec; tmp = ts5.tv_nsec-ts4.tv_nsec; tmp /= 1.0e+09 ; t3 += tmp ; printf("ikj blocked time %lf\n",t3) ; t4 = ts6.tv_sec-ts5.tv_sec; tmp = ts6.tv_nsec-ts5.tv_nsec; tmp /= 1.0e+09 ; t4 += tmp ; printf("cblas_dgemm %lf\n",t4) ; } }
Result base_case(Problem p) { cblas_dgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,p.m,p.n,p.k,1,p.A,p.M,p.B,p.K,0,p.C,p.CM); Result r = {p.m, p.n, p.CM, p.C}; return r; }
int main (int argc, char **argv ) { int rout=-1,info=0,m,n,k,lda,ldb,ldc; double A[2] = {0.0,0.0}, B[2] = {0.0,0.0}, C[2] = {0.0,0.0}, ALPHA=0.0, BETA=0.0; if (argc > 2){ rout = atoi(argv[1]); info = atoi(argv[2]); } if (rout == 1) { if (info==0) { printf("Checking if cblas_dgemm fails on parameter 4\n"); cblas_dgemm( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); } if (info==1) { printf("Checking if cblas_dgemm fails on parameter 5\n"); cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); } if (info==2) { printf("Checking if cblas_dgemm fails on parameter 9\n"); cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 2 ); } if (info==3) { printf("Checking if cblas_dgemm fails on parameter 11\n"); cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); } } else { if (info==0) { printf("Checking if F77_dgemm fails on parameter 3\n"); m=INVALID; n=0; k=0; lda=1; ldb=1; ldc=1; F77_dgemm( "T", "N", &m, &n, &k, &ALPHA, A, &lda, B, &ldb, &BETA, C, &ldc ); } if (info==1) { m=0; n=INVALID; k=0; lda=1; ldb=1; ldc=1; printf("Checking if F77_dgemm fails on parameter 4\n"); F77_dgemm( "N", "T", &m, &n, &k, &ALPHA, A, &lda, B, &ldb, &BETA, C, &ldc ); } if (info==2) { printf("Checking if F77_dgemm fails on parameter 8\n"); m=2; n=0; k=0; lda=1; ldb=1; ldc=2; F77_dgemm( "N", "N" , &m, &n, &k, &ALPHA, A, &lda, B, &ldb, &BETA, C, &ldc ); } if (info==3) { printf("Checking if F77_dgemm fails on parameter 10\n"); m=0; n=0; k=2; lda=1; ldb=1; ldc=1; F77_dgemm( "N", "N" , &m, &n, &k, &ALPHA, A, &lda, B, &ldb, &BETA, C, &ldc ); } } return 1; }
double *matmul_aat(int n, double *b) { double *c = (double *) malloc(n*n*sizeof(double)); cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, n, n, n, 1.0, b, n, b, n, 0.0, c, n); return c; }
// DGEMM way of matrix multiply using Intel MKL // Link with Intel MKL library: With MSFT VS and Intel Composer integration: Select build components in the Project context menu. // For command line - check out the Intel® Math Kernel Library Link Line Advisor void multiply5(int msize, int tidx, int numt, TYPE a[][NUM], TYPE b[][NUM], TYPE c[][NUM], TYPE t[][NUM]) { double alpha = 1.0, beta = 0.; cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans,NUM,NUM,NUM,alpha,(const double *)b,NUM,(const double *)a,NUM,beta,(double *)c,NUM); }
static inline void CORE_dgetrf_rectil_update(const PLASMA_desc A, int *IPIV, int column, int n1, int n2, int thidx, int thcnt, int ft, int lt) { int ld, lm, tmpM; int ip, j, it, i, ldft; double zone = 1.0; double mzone = -1.0; double *Atop, *Atop2, *U, *L; int offset = A.i; ldft = BLKLDD(A, 0); Atop = A(0, 0) + column * ldft; Atop2 = Atop + n1 * ldft; if (thidx == 0) { /* Swap to the right */ int *lipiv = IPIV+column; int idxMax = column+n1; for (j = column; j < idxMax; ++j, ++lipiv) { ip = (*lipiv) - offset - 1; if ( ip != j ) { it = ip / A.mb; i = ip % A.mb; ld = BLKLDD(A, it); cblas_dswap(n2, Atop2 + j, ldft, A(it, 0) + (column+n1)*ld + i, ld ); } } /* Trsm on the upper part */ U = Atop2 + column; cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit, n1, n2, (zone), Atop + column, ldft, U, ldft ); /* Signal to other threads that they can start update */ CORE_dbarrier_thread( thidx, thcnt ); /* First tile */ L = Atop + column + n1; tmpM = min(ldft, A.m) - column - n1; /* Apply the GEMM */ cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, tmpM, n2, n1, (mzone), L, ldft, U, ldft, (zone), U + n1, ldft ); } else { ld = BLKLDD( A, ft ); L = A( ft, 0 ) + column * ld; lm = ft == A.mt-1 ? A.m - ft * A.mb : A.mb; U = Atop2 + column; /* Wait for pivoting and triangular solve to be finished * before to really start the update */ CORE_dbarrier_thread( thidx, thcnt ); /* First tile */ /* Apply the GEMM */ cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, lm, n2, n1, (mzone), L, ld, U, ldft, (zone), L + n1*ld, ld ); } /* Update the other blocks */ for( it = ft+1; it < lt; it++) { ld = BLKLDD( A, it ); L = A( it, 0 ) + column * ld; lm = it == A.mt-1 ? A.m - it * A.mb : A.mb; /* Apply the GEMM */ cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, lm, n2, n1, (mzone), L, ld, U, ldft, (zone), L + n1*ld, ld ); } }
int main(int argc, char *argv[]) { //MPI initialize MPI_Init (&argc, &argv); int rank, size, master = 0; MPI_Comm_rank (MPI_COMM_WORLD, &rank); MPI_Comm_size (MPI_COMM_WORLD, &size); MPI_Errhandler_set(MPI_COMM_WORLD, MPI_ERRORS_RETURN); CheckPreprocessorMacros(); /* -------------------------------------------------------------------- */ /* .. Local variables. */ /* -------------------------------------------------------------------- */ timer_t start_t, end_t; const integer_t nrhs = 1; Error_t error; if(rank == master){ fprintf(stderr, "\nShared Memory Spike Solver.\n"); /* -------------------------------------------------------------------- */ /* .. Load and initalize the system Ax=f. */ /* -------------------------------------------------------------------- */ matrix_t* A = matrix_LoadCSR("../Tests/spike/penta_15.bin"); //matrix_t* A = matrix_LoadCSR("../Tests/pentadiagonal/large.bin"); //matrix_t* A = matrix_LoadCSR("../Tests/dummy/tridiagonal.bin"); matrix_PrintAsDense( A, "Original coeffient matrix" ); // Compute matrix bandwidth block_t* x = block_CreateEmptyBlock( A->n, nrhs, 0, 0, _RHS_BLOCK_, _WHOLE_SECTION_ ); block_t* f = block_CreateEmptyBlock( A->n, nrhs, 0, 0, _RHS_BLOCK_, _WHOLE_SECTION_ ); block_InitializeToValue( x, __zero ); // solution of the system block_InitializeToValue( f, __punit ); // rhs of the system start_t = GetReferenceTime(); /* compute an optimal solving strategy */ sm_schedule_t* S = spike_solve_analysis( A, nrhs, size-1 ); /* create the reduced sytem in advanced, based on the solving strategy */ matrix_t* R = matrix_CreateEmptyReducedSystem ( S->p, S->n, S->ku, S->kl); block_t* xr = block_CreateReducedRHS( S->p, S->ku, S->kl, nrhs ); /* -------------------------------------------------------------------- */ /* .. Factorization Phase. */ /* -------------------------------------------------------------------- */ for(integer_t p=0; p < S->p; p++) { sendSchedulePacked(S, p+1); const integer_t r0 = S->n[p]; const integer_t rf = S->n[p+1]; matrix_t* Aij = matrix_ExtractMatrix(A, r0, rf, r0, rf); sendMatrix(Aij, p+1); block_t* fi = block_ExtractBlock( f, r0, rf ); block_t* yi = block_CreateEmptyBlock( rf - r0, nrhs, 0, 0, _RHS_BLOCK_, _WHOLE_SECTION_ ); block_SetBandwidthValues( fi, A->ku, A->kl ); block_SetBandwidthValues( yi, A->ku, A->kl ); sendBlock(fi, p+1); sendBlock(yi, p+1); /* Add the tips of the yi block to the reduced RHS */ block_t* yit = recvBlock(p+1); block_t* yib = recvBlock(p+1); block_AddTipTOReducedRHS( p, S->ku, S->kl, xr, yit ); block_AddTipTOReducedRHS( p, S->ku, S->kl, xr, yib ); /* clean up */ block_Deallocate (fi ); block_Deallocate (yi ); block_Deallocate (yit); block_Deallocate (yib); if(p == 0){ block_t* Vi = block_CreateEmptyBlock ( rf - r0, A->ku, A->ku, A->kl, _V_BLOCK_, _WHOLE_SECTION_ ); block_t* Bi = matrix_ExtractBlock ( A, r0, rf, rf, rf + A->ku, _V_BLOCK_ ); sendBlock(Vi, p+1); sendBlock(Bi, p+1); block_t* Vit = recvBlock(p+1); block_t* Vib = recvBlock(p+1); matrix_AddTipToReducedMatrix( S->p, p, S->n, S->ku, S->kl, R, Vit ); matrix_AddTipToReducedMatrix( S->p, p, S->n, S->ku, S->kl, R, Vib ); block_Deallocate( Bi ); block_Deallocate( Vi ); block_Deallocate( Vit); block_Deallocate( Vib); } else if (p == ( S->p -1)){ block_t* Wi = block_CreateEmptyBlock( rf - r0, A->kl, A->ku, A->kl, _W_BLOCK_, _WHOLE_SECTION_ ); block_t* Ci = matrix_ExtractBlock(A, r0, rf, r0 - A->kl, r0, _W_BLOCK_ ); sendBlock(Wi, p+1); sendBlock(Ci, p+1); block_t* Wit = recvBlock(p+1); block_t* Wib = recvBlock(p+1); matrix_AddTipToReducedMatrix( S->p, p, S->n, S->ku, S->kl, R, Wit ); matrix_AddTipToReducedMatrix( S->p, p, S->n, S->ku, S->kl, R, Wib ); block_Deallocate( Ci ); block_Deallocate( Wi ); block_Deallocate( Wit); block_Deallocate( Wib); } else{ block_t* Vi = block_CreateEmptyBlock ( rf - r0, A->ku, A->ku, A->kl, _V_BLOCK_, _WHOLE_SECTION_ ); block_t* Bi = matrix_ExtractBlock ( A, r0, rf, rf, rf + A->ku, _V_BLOCK_ ); sendBlock(Vi, p+1); sendBlock(Bi, p+1); block_t* Vit = recvBlock(p+1); block_t* Vib = recvBlock(p+1); matrix_AddTipToReducedMatrix( S->p, p, S->n, S->ku, S->kl, R, Vit ); matrix_AddTipToReducedMatrix( S->p, p, S->n, S->ku, S->kl, R, Vib ); block_Deallocate( Bi ); block_Deallocate( Vi ); block_Deallocate( Vit); block_Deallocate( Vib); block_t* Wi = block_CreateEmptyBlock( rf - r0, A->kl, A->ku, A->kl, _W_BLOCK_, _WHOLE_SECTION_ ); block_t* Ci = matrix_ExtractBlock(A, r0, rf, r0 - A->kl, r0, _W_BLOCK_ ); sendBlock(Wi, p+1); sendBlock(Ci, p+1); block_t* Wit = recvBlock(p+1); block_t* Wib = recvBlock(p+1); matrix_AddTipToReducedMatrix( S->p, p, S->n, S->ku, S->kl, R, Wit ); matrix_AddTipToReducedMatrix( S->p, p, S->n, S->ku, S->kl, R, Wib ); block_Deallocate( Ci ); block_Deallocate( Wi ); block_Deallocate( Wit); block_Deallocate( Wib); } matrix_Deallocate( Aij); } MPI_Barrier(MPI_COMM_WORLD); /* -------------------------------------------------------------------- */ /* .. Solution of the reduced system. */ /* -------------------------------------------------------------------- */ block_t* yr = block_CreateEmptyBlock( xr->n, xr->m, 0, 0, _RHS_BLOCK_, _WHOLE_SECTION_ ); fprintf(stderr, "\nSolving reduced linear system\n"); system_solve ( R->colind, R->rowptr, R->aij, yr->aij, xr->aij, R->n, xr->m); block_Print(yr, "Solucion del sistema reducido"); /* Free some memory, yr and R are not needed anymore */ block_Deallocate ( xr ); matrix_Deallocate( R ); /* -------------------------------------------------------------------- */ /* .. Backward substitution phase. */ /* -------------------------------------------------------------------- */ for(integer_t p=0; p < S->p; p++) { fprintf(stderr, "Processing backward solution for the %d-th block\n", p); /* compute the limits of the blocks */ const integer_t obs = S->n[p]; /* original system starting row */ const integer_t obe = S->n[p+1]; /* original system ending row */ const integer_t rbs = S->r[p]; /* reduceed system starting row */ const integer_t rbe = S->r[p+1]; /* reduced system ending row */ const integer_t ni = S->n[p+1] - S->n[p]; /* number of rows in the block */ /* allocate pardiso configuration parameters */ MKL_INT pardiso_conf[64]; /* extract xi sub-block */ block_t* xi = block_ExtractBlock(x, obs, obe ); sendBlock(xi, p+1); /* extract fi sub-block */ block_t* fi = block_ExtractBlock(f, obs, obe ); sendBlock(fi, p+1); printf("Lets go %d\n", p); if ( p == 0 ){ block_t* xt_next = block_ExtractBlock ( yr, rbe, rbe + S->ku[p+1]); sendBlock(xt_next, p+1); block_Deallocate (xt_next); } else if ( p == ( S->p -1)){ block_t* xb_prev = block_ExtractBlock ( yr, rbs - S->kl[p], rbs ); sendBlock(xb_prev, p+1); block_Deallocate (xb_prev); } else{ block_t* xt_next = block_ExtractBlock ( yr, rbe, rbe + S->ku[p+1]); sendBlock(xt_next, p+1); block_Deallocate (xt_next); block_t* xb_prev = block_ExtractBlock ( yr, rbs - S->kl[p], rbs ); sendBlock(xb_prev, p+1); block_Deallocate (xb_prev); } xi = recvBlock(p+1); block_AddBlockToRHS(x, xi, obs, obe); block_Deallocate ( xi ); block_Deallocate ( fi ); } schedule_Destroy( S ); block_Deallocate( yr); end_t = GetReferenceTime(); fprintf(stderr, "\nSPIKE solver took %.6lf seconds", end_t - start_t); block_Print( x, "Solution of the linear system"); ComputeResidualOfLinearSystem( A->colind, A->rowptr, A->aij, x->aij, f->aij, A->n, nrhs); fprintf(stderr, "\nPARDISO REFERENCE SOLUTION...\n"); SolveOriginalSystem( A, x, f); /* -------------------------------------------------------------------- */ /* .. Clean up. */ /* -------------------------------------------------------------------- */ matrix_Deallocate ( A ); block_Deallocate ( x ); block_Deallocate ( f ); /* -------------------------------------------------------------------- */ /* .. Load and initalize the system Ax=f. */ /* -------------------------------------------------------------------- */ fprintf(stderr, "\nProgram finished\n"); debug("Number of malloc() calls %d, number of free() calls %d\n", cnt_alloc, cnt_free ); } else{ //WORKERS /* -------------------------------------------------------------------- */ /* .. Factorization Phase. */ /* -------------------------------------------------------------------- */ //fprintf(stderr, "Solving %d-th block\n", p); sm_schedule_t* S = recvSchedulePacked(master); /* compute the limits of the blocks */ integer_t p = rank -1; const integer_t obs = S->n[p]; /* original system starting row */ const integer_t obe = S->n[p+1]; /* original system ending row */ const integer_t rbs = S->r[p]; /* reduceed system starting row */ const integer_t rbe = S->r[p+1]; /* reduced system ending row */ const integer_t ni = S->n[p+1] - S->n[p]; /* number of rows in the block */ MKL_INT pardiso_conf[64]; /* allocate pardiso configuration parameters */ DirectSolverHander_t *handler = directSolver_CreateHandler(); directSolver_Configure( handler ); /* factorize matrix */ matrix_t* Aij = recvMatrix(master); directSolver_Factorize( handler, Aij->n, Aij->nnz, Aij->colind, Aij->rowptr, Aij->aij, Aij->n); /* -------------------------------------------------------------------- */ /* .. Solve Ai * yi = fi */ /* Extracts the fi portion from f, creates a yi block used as container */ /* for the solution of the system. Then solves the system. */ /* -------------------------------------------------------------------- */ /* solve the system for the RHS value */ block_t* fi = recvBlock(master); block_t* yi = recvBlock(master); /* solve Ai * yi = fi */ directSolver_SolveForRHS( handler, nrhs, yi->aij, fi->aij ); /* Extract the tips of the yi block */ block_t* yit = block_ExtractTip( yi, _TOP_SECTION_ , _COLMAJOR_ ); block_t* yib = block_ExtractTip( yi, _BOTTOM_SECTION_, _COLMAJOR_ ); sendBlock(yit, master); sendBlock(yib, master); /* clean up */ block_Deallocate (fi ); block_Deallocate (yi ); block_Deallocate (yit); block_Deallocate (yib); if ( rank == 1 ){ block_t* Vi = recvBlock(master); block_t* Bi = recvBlock(master); /* solve Ai * Vi = Bi */ directSolver_SolveForRHS( handler, Vi->m, Vi->aij, Bi->aij ); block_t* Vit = block_ExtractTip( Vi, _TOP_SECTION_, _ROWMAJOR_ ); block_t* Vib = block_ExtractTip( Vi, _BOTTOM_SECTION_, _ROWMAJOR_ ); sendBlock(Vit, master); sendBlock(Vib, master); block_t* Bib = block_ExtractTip( Bi, _BOTTOM_SECTION_, _COLMAJOR_ ); //block_Deallocate( Vi ); block_Deallocate( Bi ); block_Deallocate( Vi); block_Deallocate( Vit); block_Deallocate( Vib); //Here Master Resolve Reduced System MPI_Barrier(MPI_COMM_WORLD); block_t* xi = recvBlock(master); block_t* fi = recvBlock(master); block_t* xt_next = recvBlock(master); /* Backward substitution, implicit scheme: xi = -1.0 * Bi * xit + fi */ cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, Bib->n, /* m - number of rows of A */ xt_next->m, /* n - number of columns of B */ Bib->m, /* k - number of columns of A */ __nunit, /* alpha */ Bib->aij, /* A block */ Bib->n, /* lda - first dimension of A */ xt_next->aij, /* B block */ xt_next->n, /* ldb - first dimension of B */ __punit, /* beta */ &fi->aij[ni - S->ku[p]], /* C block */ ni ); /* ldc - first dimension of C */ /* solve Ai * xi = fi */ directSolver_SolveForRHS( handler, xi->m, xi->aij, fi->aij ); sendBlock(xi, master); block_Deallocate ( Bib ); block_Deallocate ( xt_next); block_Deallocate ( xi ); block_Deallocate ( fi ); } else if ( rank == size -1){ block_t* Wi = recvBlock(master); block_t* Ci = recvBlock(master); /* solve Ai * Wi = Ci */ directSolver_SolveForRHS( handler, Wi->m, Wi->aij, Ci->aij ); block_t* Wit = block_ExtractTip( Wi, _TOP_SECTION_, _ROWMAJOR_ ); block_t* Wib = block_ExtractTip( Wi, _BOTTOM_SECTION_, _ROWMAJOR_ ); sendBlock(Wit, master); sendBlock(Wib, master); block_t* Cit = block_ExtractTip(Ci, _TOP_SECTION_, _COLMAJOR_ ); block_Deallocate( Ci ); block_Deallocate( Wi ); block_Deallocate( Wit); block_Deallocate( Wib); //Here Master Resolve Reduced System MPI_Barrier(MPI_COMM_WORLD); block_t* xi = recvBlock(master); block_t* fi = recvBlock(master); block_t* xb_prev = recvBlock(master); /* Backward substitution, implicit scheme: xi = -1.0 * Bi * xit + fi */ cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, Cit->n, /* m - number of rows of A */ xb_prev->m, /* n - number of columns of B */ Cit->m, /* k - number of columns of A */ __nunit, /* alpha */ Cit->aij, /* A block */ Cit->n, /* lda - first dimension of A */ xb_prev->aij, /* B block */ xb_prev->n, /* ldb - first dimension of B */ __punit, /* beta */ fi->aij, /* C block */ ni ); /* ldc - first dimension of C */ /* solve Ai * xi = fi */ directSolver_SolveForRHS( handler, xi->m, xi->aij, fi->aij ); sendBlock(xi, master); block_Deallocate ( Cit ); block_Deallocate ( xb_prev); block_Deallocate ( xi ); block_Deallocate ( fi ); } else{ block_t* Vi = recvBlock(master); block_t* Bi = recvBlock(master); /* solve Ai * Vi = Bi */ directSolver_SolveForRHS( handler, Vi->m, Vi->aij, Bi->aij ); block_t* Vit = block_ExtractTip( Vi, _TOP_SECTION_, _ROWMAJOR_ ); block_t* Vib = block_ExtractTip( Vi, _BOTTOM_SECTION_, _ROWMAJOR_ ); sendBlock(Vit, master); sendBlock(Vib, master); block_t* Bib = block_ExtractTip( Bi, _BOTTOM_SECTION_, _COLMAJOR_ ); block_Deallocate( Bi ); block_Deallocate( Vi ); block_Deallocate( Vit); block_Deallocate( Vib); block_t* Wi = recvBlock(master); block_t* Ci = recvBlock(master); /* solve Ai * Wi = Ci */ directSolver_SolveForRHS( handler, Wi->m, Wi->aij, Ci->aij ); block_t* Wit = block_ExtractTip( Wi, _TOP_SECTION_, _ROWMAJOR_ ); block_t* Wib = block_ExtractTip( Wi, _BOTTOM_SECTION_, _ROWMAJOR_ ); sendBlock(Wit, master); sendBlock(Wib, master); block_t* Cit = block_ExtractTip(Ci, _TOP_SECTION_, _COLMAJOR_ ); block_Deallocate( Ci ); block_Deallocate( Wi ); block_Deallocate( Wit); block_Deallocate( Wib); //Here Master Resolve Reduced System MPI_Barrier(MPI_COMM_WORLD); block_t* xi = recvBlock(master); block_t* fi = recvBlock(master); block_t* xt_next = recvBlock(master); /* Backward substitution, implicit scheme: xi = -1.0 * Bi * xit + fi */ cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, Bib->n, /* m - number of rows of A */ xt_next->m, /* n - number of columns of B */ Bib->m, /* k - number of columns of A */ __nunit, /* alpha */ Bib->aij, /* A block */ Bib->n, /* lda - first dimension of A */ xt_next->aij, /* B block */ xt_next->n, /* ldb - first dimension of B */ __punit, /* beta */ &fi->aij[ni - S->ku[p]], /* C block */ ni ); /* ldc - first dimension of C */ directSolver_ApplyFactorToRHS( Aij->colind, Aij->rowptr, Aij->aij, xi->aij, fi->aij, Aij->n, xi->m, &pardiso_conf ); /* solve Ai * xi = fi */ directSolver_SolveForRHS( handler, xi->m, xi->aij, fi->aij ); block_Deallocate ( Bib ); block_Deallocate ( xt_next); block_t* xb_prev = recvBlock(master); /* Backward substitution, implicit scheme: xi = -1.0 * Bi * xit + fi */ cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, Cit->n, /* m - number of rows of A */ xb_prev->m, /* n - number of columns of B */ Cit->m, /* k - number of columns of A */ __nunit, /* alpha */ Cit->aij, /* A block */ Cit->n, /* lda - first dimension of A */ xb_prev->aij, /* B block */ xb_prev->n, /* ldb - first dimension of B */ __punit, /* beta */ fi->aij, /* C block */ ni ); /* ldc - first dimension of C */ /* solve Ai * xi = fi */ directSolver_SolveForRHS( handler, xi->m, xi->aij, fi->aij ); sendBlock(xi, master); block_Deallocate ( Cit ); block_Deallocate ( xb_prev); block_Deallocate ( xi ); block_Deallocate ( fi ); } /* Show statistics and clean up solver internal memory */ directSolver_ShowStatistics(handler); directSolver_Finalize(handler); schedule_Destroy ( S ); matrix_Deallocate(Aij); debug("Number of malloc() calls %d, number of free() calls %d\n", cnt_alloc, cnt_free ); } debug("Rank %d Finished!\n", rank); MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); return 0; }
void wrapper_cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc) { cblas_dgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); }
/* * test ga_dgemm * Note: - change nummax for large arrays * - turn off "dgemm_verify" for large arrays due to memory * limitations, as dgemm_verify=1 for large arrays produces * segfault, dumps core,or any crap. */ int main(int argc, char **argv) { int num_m; int num_n; int num_k; int i; int ii; double *h0; int g_c; int g_b; int g_a; double a; double t1; double mf; double avg_t[ntrans]; double avg_mf[ntrans]; int itime; int ntimes; int nums_m[/*howmany*/] = {512,1024}; int nums_n[/*howmany*/] = {512,1024}; int nums_k[/*howmany*/] = {512,1024}; char transa[/*ntrans*/] = "ntnt"; char transb[/*ntrans*/] = "nntt"; char ta; char tb; double *tmpa; double *tmpb; double *tmpc; int ndim; int dims[2]; #ifdef BLOCK_CYCLIC int block_size[2]; #endif #if defined(USE_ELEMENTAL) // initialize Elemental (which will initialize MPI) ElInitialize( &argc, &argv ); ElMPICommRank( MPI_COMM_WORLD, &me ); ElMPICommSize( MPI_COMM_WORLD, &nproc ); // instantiate el::global array ElGlobalArraysConstruct_d( &eldga ); // initialize global arrays ElGlobalArraysInitialize_d( eldga ); #else MP_INIT(argc,argv); if (!MA_init(MT_DBL,1,20000000)) { GA_Error("failed: ma_init(MT_DBL,1,20000000)",10); } GA_INIT(argc,argv); me = GA_Nodeid(); #endif h0 = (double*)malloc(sizeof(double) * nummax*nummax); tmpa = (double*)malloc(sizeof(double) * nummax*nummax); tmpb = (double*)malloc(sizeof(double) * nummax*nummax); tmpc = (double*)malloc(sizeof(double) * nummax*nummax); ii = 0; for (i=0; i<nummax*nummax; i++) { ii = ii + 1; if (ii > nummax) { ii = 0; } h0[i] = ii; } /* Compute times assuming 500 mflops and 5 second target time */ /* ntimes = max(3.0d0,5.0d0/(4.0d-9*num**3)); */ ntimes = 5; for (ii=0; ii<howmany; ii++) { num_m = nums_m[ii]; num_n = nums_n[ii]; num_k = nums_k[ii]; a = 0.5/(num_m*num_n); if (num_m > nummax || num_n > nummax || num_k > nummax) { GA_Error("Insufficient memory: check nummax", 1); } #ifndef BLOCK_CYCLIC ndim = 2; /* dims[0] = num_m; dims[1] = num_n; */ dims[1] = num_m; dims[0] = num_n; #if defined(USE_ELEMENTAL) ElGlobalArraysCreate_d( eldga, ndim, dims, "g_c", NULL, &g_c ); #else if (!((g_c = NGA_Create(MT_DBL,ndim,dims,"g_c",NULL)))) { GA_Error("failed: create g_c",20); } #endif /* dims[0] = num_k; dims[1] = num_n; */ dims[1] = num_k; dims[0] = num_n; #if defined(USE_ELEMENTAL) ElGlobalArraysCreate_d( eldga, ndim, dims, "g_b", NULL, &g_b ); #else if (!((g_b = NGA_Create(MT_DBL,ndim,dims,"g_b",NULL)))) { GA_Error("failed: create g_b",30); } #endif /* dims[0] = num_m; dims[1] = num_k; */ dims[1] = num_m; dims[0] = num_k; #if defined(USE_ELEMENTAL) ElGlobalArraysCreate_d( eldga, ndim, dims, "g_a", NULL, &g_a ); #else if (!((g_a = NGA_Create(MT_DBL,ndim,dims,"g_a",NULL)))) { GA_Error("failed: create g_a",40); } #endif #else ndim = 2; block_size[0] = 128; block_size[1] = 128; dims[0] = num_m; dims[1] = num_n; g_c = GA_Create_handle(); GA_Set_data(g_c,ndim,dims,MT_DBL); GA_Set_array_name(g_c,"g_c"); GA_Set_block_cyclic(g_c,block_size); if (!GA_Allocate(g_c)) { GA_Error("failed: create g_c",40); } dims[0] = num_k; dims[1] = num_n; g_b = GA_Create_handle(); GA_Set_data(g_b,ndim,dims,MT_DBL); GA_Set_array_name(g_b,"g_b"); GA_Set_block_cyclic(g_b,block_size); if (!ga_allocate(g_b)) { GA_Error("failed: create g_b",40); } dims[0] = num_m; dims[1] = num_k; g_a = GA_Create_handle(); GA_Set_data(g_a,ndim,dims,MT_DBL); GA_Set_array_name(g_a,"g_a"); GA_Set_block_cyclic(g_a,block_size); if (!ga_allocate(g_a)) { GA_Error('failed: create g_a',40); } #endif /* Initialize matrices A and B */ if (me == 0) { load_ga(g_a, h0, num_m, num_k); load_ga(g_b, h0, num_k, num_n); } #if defined(USE_ELEMENTAL) double zero = 0.0; ElGlobalArraysFill_d( eldga, g_c, &zero ); ElGlobalArraysSync_d( eldga ); #else GA_Zero(g_c); GA_Sync(); #endif #if defined(USE_ELEMENTAL) if (me == 0) { #else if (GA_Nodeid() == 0) { #endif printf("\nMatrix Multiplication on C = A[%ld,%ld]xB[%ld,%ld]\n", (long)num_m, (long)num_k, (long)num_k, (long)num_n); fflush(stdout); } for (i=0; i<ntrans; i++) { avg_t[i] = 0.0; avg_mf[i] = 0.0; } for (itime=0; itime<ntimes; itime++) { for (i=0; i<ntrans; i++) { #if defined(USE_ELEMENTAL) ElGlobalArraysSync_d( eldga ); #else GA_Sync(); #endif ta = transa[i]; tb = transb[i]; t1 = MP_TIMER(); #if defined(USE_ELEMENTAL) ElGlobalArraysDgemm_d( eldga, ta, tb, num_m, num_n, num_k, 1.0, g_a, g_b, 0.0, g_c ); #else GA_Dgemm(ta,tb,num_m,num_n,num_k,1.0, g_a, g_b, 0.0, g_c); #endif t1 = MP_TIMER() - t1; #if defined(USE_ELEMENTAL) if (me == 0) { #else if (GA_Nodeid() == 0) { #endif #if defined(USE_ELEMENTAL) mf = 2e0*num_m*num_n*num_k/t1*1e-6/nproc; #else mf = 2e0*num_m*num_n*num_k/t1*1e-6/GA_Nnodes(); #endif avg_t[i] = avg_t[i]+t1; avg_mf[i] = avg_mf[i] + mf; printf("%15s%2d: %12.4f seconds %12.1f mflops/proc %c %c\n", "Run#", itime, t1, mf, ta, tb); fflush(stdout); if (dgemm_verify && itime == 0) { /* recall the C API swaps the matrix order */ /* we swap it here for the Fortran-based verify */ verify_ga_dgemm(tb, ta, num_n, num_m, num_k, 1.0, g_b, g_a, 0.0, g_c, tmpb, tmpa, tmpc); } } } } #if defined(USE_ELEMENTAL) if (me == 0) { #else if (GA_Nodeid() == 0) { #endif printf("\n"); for (i=0; i<ntrans; i++) { printf("%17s: %12.4f seconds %12.1f mflops/proc %c %c\n", "Average", avg_t[i]/ntimes, avg_mf[i]/ntimes, transa[i], transb[i]); } if(dgemm_verify) { printf("All GA_Dgemms are verified...O.K.\n"); } fflush(stdout); } /* GA_Print(g_a); GA_Print(g_b); GA_Print(g_c); */ #if defined(USE_ELEMENTAL) ElGlobalArraysDestroy_d( eldga, g_a ); ElGlobalArraysDestroy_d( eldga, g_b ); ElGlobalArraysDestroy_d( eldga, g_c ); #else GA_Destroy(g_c); GA_Destroy(g_b); GA_Destroy(g_a); #endif } /* ??? format(a15, i2, ': ', e12.4, ' seconds ',f12.1, . ' mflops/proc ', 3a2) */ #if defined(USE_ELEMENTAL) if (me == 0) { #else if (GA_Nodeid() == 0) { #endif printf("All tests successful\n"); } free(h0); free(tmpa); free(tmpb); free(tmpc); #if defined(USE_ELEMENTAL) // call el::global arrays destructor ElGlobalArraysTerminate_d( eldga ); ElGlobalArraysDestruct_d( eldga ); ElFinalize(); #else GA_Terminate(); MP_FINALIZE(); #endif return 0; } /* * Verify for correctness. Process 0 computes BLAS dgemm * locally. For larger arrays, disbale this test as memory * might not be sufficient */ void verify_ga_dgemm(char xt1, char xt2, int num_m, int num_n, int num_k, double alpha, int g_a, int g_b, double beta, int g_c, double *tmpa, double *tmpb, double *tmpc) { int i,j,type,ndim,dims[2],lo[2],hi[2]; double abs_value; for (i=0; i<num_n; i++) { for (j=0; j<num_m; j++) { tmpc[j+i*num_m] = -1.0; tmpa[j+i*num_m] = -2.0; } } #if defined(USE_ELEMENTAL) ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims ); #else NGA_Inquire(g_a, &type, &ndim, dims); #endif lo[0] = 0; lo[1] = 0; hi[0] = dims[0]-1; hi[1] = dims[1]-1; #if defined(USE_ELEMENTAL) ElGlobalArraysGet_d( eldga, g_a, lo, hi, tmpa, &dims[1] ); #else NGA_Get(g_a, lo, hi, tmpa, &dims[1]); #endif #if defined(USE_ELEMENTAL) ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims ); #else NGA_Inquire(g_a, &type, &ndim, dims); #endif lo[0] = 0; lo[1] = 0; hi[0] = dims[0]-1; hi[1] = dims[1]-1; #if defined(USE_ELEMENTAL) ElGlobalArraysGet_d( eldga, g_b, lo, hi, tmpb, &dims[1] ); #else NGA_Get(g_b, lo, hi, tmpb, &dims[1]); #endif /* compute dgemm sequentially */ #if defined(USE_ELEMENTAL) cblas_dgemm ( CblasRowMajor, ( xt1 == 'n'? CblasNoTrans: CblasTrans ), ( xt2 == 'n'? CblasNoTrans: CblasTrans ), num_m /* M */, num_n /* N */, num_k /* K */, alpha, tmpa, num_m, /* lda */ tmpb, num_k, /* ldb */ beta, tmpc, num_m /* ldc */); #else xb_dgemm(&xt1, &xt2, &num_m, &num_n, &num_k, &alpha, tmpa, &num_m, tmpb, &num_k, &beta, tmpc, &num_m); #endif /* after computing c locally, verify it with the values in g_c */ #if defined(USE_ELEMENTAL) ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims ); #else NGA_Inquire(g_a, &type, &ndim, dims); #endif lo[0] = 0; lo[1] = 0; hi[0] = dims[0]-1; hi[1] = dims[1]-1; #if defined(USE_ELEMENTAL) ElGlobalArraysGet_d( eldga, g_c, lo, hi, tmpa, &dims[1] ); #else NGA_Get(g_c, lo, hi, tmpa, &dims[1]); #endif for (i=0; i<num_n; i++) { for (j=0; j<num_m; j++) { abs_value = fabs(tmpc[j+i*num_m]-tmpa[j+i*num_m]); if(abs_value > 1.0 || abs_value < -1.0) { printf("Values are = %f %f\n", tmpc[j+i*num_m], tmpa[j+i*num_m]); printf("Values are = %f %f\n", fabs(tmpc[j+i*num_m]-tmpa[j*i*num_m]), abs_value); fflush(stdout); GA_Error("verify ga_dgemm failed", 1); } } } } /** * called by process '0' (or your master process ) */ void load_ga(int handle, double *f, int dim1, int dim2) { int lo[2], hi[2]; if (dim1 < 0 || dim2 < 0) { return; } lo[0] = 0; lo[1] = 0; hi[0] = dim1-1; hi[1] = dim2-1; #if defined(USE_ELEMENTAL) ElGlobalArraysPut_d( eldga, handle, lo, hi, f, &dim1 ); #else NGA_Put(handle, lo, hi, f, &dim1); #endif }