void dgather(int ictxt, int n, int numc, int nb, double *A, double *A_d, int *descAd){ int RootNodeic, ione=1, izero=0, isRootNode=0, nru, info; int nprow, npcol, myrow, mycol, descA[9], itemp; int i,k; sl_init_(&RootNodeic, &ione, &ione); Cblacs_gridinfo(ictxt, &nprow,&npcol, &myrow, &mycol); if (myrow==0 && mycol ==0){ isRootNode = 1;} if(isRootNode){ nru = numroc_(&n, &n, &myrow, &izero, &nprow); itemp = max(1,nru); descinit_(descA, &n, &numc, &n, &n, &izero, &izero, &RootNodeic, &itemp, &info ); } else{ k=0; for(i=0;i<9;i++){ descA[k]=0; k++; } descA[1]=-1; } pdgemr2d_(&n,&numc,A_d,&ione, &ione, descAd, A, &ione, &ione, descA, &ictxt ); if (isRootNode){ Cblacs_gridexit(RootNodeic); } }
F_VOID_FUNC blacs_exit_(int *NotDone) #endif { void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); BLACBUFF *bp; extern BLACBUFF *BI_ReadyB, *BI_ActiveQ, BI_AuxBuff; int i; extern int BI_MaxNCtxt, BI_Np; extern BLACSCONTEXT **BI_MyContxts; /* * Destroy all contexts */ for (i=0; i < BI_MaxNCtxt; i++) if (BI_MyContxts[i]) Cblacs_gridexit(i); free(BI_MyContxts); if (BI_ReadyB) free(BI_ReadyB); while (BI_ActiveQ != NULL) { bp = BI_ActiveQ; BI_BuffIsFree(bp, 1); /* wait for async sends to complete */ BI_ActiveQ = bp->next; free(bp); } free (BI_AuxBuff.Aops); /* * Reset parameters to initial values */ BI_MaxNCtxt = 0; BI_MyContxts = NULL; BI_Np = -1; if (!Mpval(NotDone)) { MPI_Finalize(); } BI_ReadyB = NULL; }
/* Test program * created 23/09/2014 * author Alex Bombrun * * icc -O1 -o eigen.exe lapackReadStore.c mpiutil.c normals.c matrixBlockStore.c -mkl * ./eigen.exe 4 4 * */ int main(int argc, char **argv) { FILE* store; FILE* scaStore; int N , M; int i, j; int n_blocks; int scalapack_size; int NB, MB; int i_block, j_block; int dim[4]; double * mat; // local matrix block use for reading int t, t_block; const char* profileG_file_name= "./data/NormalsG/profile.txt"; const char* store_location = "./data/ReducedNormals"; const char* scaStore_location ="./data/DiagCholeskyReducedNormals"; int mp; // number of rows in the processor grid int mla; // number of rows in the local array int mb; // number of rows in a block int np; // number of columns in the processor grid int nla; // number of columns in the local array int nb; // number of columns in a block int mype,npe; // rank and total number of process int idescal[9]; // matrix descriptors double *la; // matrix values: al is the local array int idescbl[9]; double *lb; double normb; int idescxl[9]; double *lx; double normx; int idesczl[9]; // matrix descriptors double *lz; // matrix values: al is the local array double *w; int ierr; // error output int mp_ret, np_ret, myrow, mycol; // to store grid info int zero=0; // value used for the descriptor initialization int one=1; // value used for the descriptor initialization int m,n; // matrix A dimensions double norm, cond; double *work = NULL; double * work2 = NULL; int *iwork = NULL; int lwork, liwork; float ll,mm,cr,cc; int ii,jj,pr,pc,h,g; // ii,jj coordinates of local array element int rsrc=0,csrc=0; // assume that 0,0 element should be stored in the 0,0 process int n_b = 1; int index; int icon; // scalapack cblacs context char normJob, jobz, uplo, trans, diag; double MPIt1, MPIt2, MPIelapsed; jobz= 'N'; uplo='U'; Cblacs_pinfo( &mype, &npe ); if (argc == 3) { //printf("%s %s %s\n", argv[0], argv[1], argv[2]); n_blocks= (int) strtol(argv[1], NULL, 10); scalapack_size= (int) strtol(argv[2], NULL, 10); } else { printf("Usage: expect 2 integers \n"); printf(" 1 : the number of diagonal blocks \n"); printf(" 2 : scalapack number to define block size (assume n is divisible by sqrt(p) and that n/sqrt(p) is divisible by this number)\n"); exit( -1); } printf("%d/%d: read store\n",mype,npe); N = getNumberOfLine(profileG_file_name); // the dimension of the matrix; M = N; // square matrix m=M; //mla*mp; n=N; //nla*np; np = isqrt(npe); // assume that the number of process is a square mp = np; // square grid mla = m/mp; // assume that the matrix dimension if a multiple of the process grid dimension nla = n/np; mb = mla/scalapack_size; // assume that the dimension of the matrix is a multiple of the number of the number of diagonal blocks nb = nla/scalapack_size; // init CBLACS Cblacs_get( -1, 0, &icon ); Cblacs_gridinit( &icon,"c", mp, np ); Cblacs_gridinfo( icon, &mp_ret, &np_ret, &myrow, &mycol); // allocate local matrix la=malloc(sizeof(double)*mla*nla); printf("%d/%d: full matrix (%d,%d), local matrix (%d,%d), processor grid (%d,%d), block (%d,%d) \n", mype, npe, m, n, mla, nla, np, mp, mb, nb); // set identity matrix for(i = 0;i<M;i++){ for(j = i;j<i+1;j++){ cr = (float)( i/mb ); h = rsrc+(int)(cr); pr = h%np; cc = (float)( j/mb ); g = csrc+(int)(cc); pc = g%mp; // check if process should get this element if (myrow == pr && mycol==pc){ // ii = x + l*mb // jj = y + m*nb ll = (float)( ( i/(np*mb) ) ); // thinks seems to be mixed up does not matter as long as the matrix, the block and the grid is symmetric mm = (float)( ( j/(mp*nb) ) ); ii = i%mb + (int)(ll)*mb; jj = j%nb + (int)(mm)*nb; index=jj*mla+ii; // seems to be the transpose !? //if(index<0) printf("%d/%d: negative index (%d,%d) \n",mype,npe,i,j); //if(index>=mla*nla) printf("%d/%d: too large index (%d,%d) \n",mype,npe,i,j); la[index] = 1; } } } /* for(i_block=0;i_block<n_blocks;i_block++){ printf("%d/%d: process store block %d \n", mype, npe, i_block); readStore(&store,i_block,store_location); t_block = 0; while(readNextBlockDimension(dim,store)!=-1) { // loop B over all block tasks j_block = mpi_get_diag_block_id(i_block, t_block, n_blocks); mat = malloc((dim[1]-dim[0])*(dim[3]-dim[2]) * sizeof(double)); readNextBlock(dim[0],dim[1],dim[2],dim[3],mat,store); if (dim[0]==dim[2]){ // process only the diagonal blocks // printf("%d/%d: read block (%d,%d) with global indices (%d,%d,%d,%d) \n",mype, npe, i_block,j_block,dim[0],dim[1],dim[2],dim[3]); NB = dim[1]-dim[0]; MB = dim[3]-dim[2]; for(i = dim[0];i<dim[1];i++){ for(j = dim[2];j<dim[3];j++){ //matA[i*M+j] = mat[(i-dim[0])*MB+(j-dim[2])]; // finding out which pe gets this i,j element cr = (float)( i/mb ); h = rsrc+(int)(cr); pr = h%np; cc = (float)( j/mb ); g = csrc+(int)(cc); pc = g%mp; // check if process should get this element if (myrow == pr && mycol==pc){ // ii = x + l*mb // jj = y + m*nb ll = (float)( ( i/(np*mb) ) ); // thinks seems to be mixed up does not matter as long as the matrix, the block and the grid is symmetric mm = (float)( ( j/(mp*nb) ) ); ii = i%mb + (int)(ll)*mb; jj = j%nb + (int)(mm)*nb; index=jj*mla+ii; // seems to be the transpose !? //if(index<0) printf("%d/%d: negative index (%d,%d) \n",mype,npe,i,j); //if(index>=mla*nla) printf("%d/%d: too large index (%d,%d) \n",mype,npe,i,j); la[index] = mat[(i-dim[0])*MB+(j-dim[2])]; } } } // transpose if(j_block != i_block){ for(i = dim[0];i<dim[1];i++){ for(j = dim[2];j<dim[3];j++){ //matA[j*M+i] = mat[(i-dim[0])*MB+(j-dim[2])]; // finding out which pe gets this j,i element cr = (float)( j/mb ); h = rsrc+(int)(cr); pr = h%np; cc = (float)( i/mb ); g = csrc+(int)(cc); pc = g%mp; // check if process should get this element if (myrow == pr && mycol==pc){ // ii = x + l*mb // jj = y + m*nb ll = (float)( ( j/(np*mb) ) ); // thinks seems to be mixed up does not matter as long as the matrix, the block and the grid is symmetric mm = (float)( ( i/(mp*nb) ) ); ii = j%mb + (int)(ll)*mb; jj = i%nb + (int)(mm)*nb; index=jj*mla+ii; // seems to be the transpose !? //if(index<0) printf("%d/%d: negative index (%d,%d) \n",mype,npe,i,j); //if(index>=mla*nla) printf("%d/%d: too large index (%d,%d) \n",mype,npe,i,j); la[index] = mat[(i-dim[0])*MB+(j-dim[2])]; } } } } } free(mat); t_block++; } closeStore(store); } */ printf("%d/%d: finished scaterring the matrix \n",mype,npe); printf("%d/%d: start computing \n",mype,npe); // set the matrix descriptor ierr=0; descinit_(idescal, &m, &n , &mb, &nb , &zero, &zero, &icon, &mla, &ierr); // processor grip id start at 0 if (mype==0) saveMatrixDescriptor(idescal, scaStore_location); ierr=0; descinit_(idescbl, &m, &one , &mb, &nb , &zero, &zero, &icon, &nla, &ierr); // processor grip id start at 0 lb = calloc(sizeof(double),mla); ierr=0; // set x descinit_(idescxl, &n, &one , &mb, &nb , &zero, &zero, &icon, &nla, &ierr); // processor grip id start at 0 lx = calloc(sizeof(double),mla); for(i=0;i<mla;i++){ lx[i] = 1.0/m; } pddot_(&n,&normx,lx,&one,&one,idescxl,&one,lx,&one,&one,idescxl,&one); // normx <- x'x if (mype==0) printf("%d/%d: normx2 %E \n",mype,npe,normx); ierr=0; // set b double alpha =1.0; double beta =0.0; trans = 'N'; pdgemv_(&trans,&m,&n,&alpha,la,&one,&one,idescal,lx,&one,&one,idescxl,&one,&beta,lb,&one,&one,idescbl,&one); // b <- A x pddot_(&n,&normb,lb,&one,&one,idescbl,&one,lb,&one,&one,idescbl,&one); // norm <- b'b if (mype==0) printf("%d/%d: normb2 %E \n",mype,npe,normb); ierr = 0; // compute norm 1 of the reduced normal matrix /* DO NOT WORK lwork = 2*mla+2*nla; work = malloc(sizeof(double)*lwork); normJob = '1'; norm = pdlansy_(&normJob, &uplo, &n, la, &one, &one, idescal, work); // matrix index start at one printf("%d/%d: norm %f \n",mype,npe,norm); free(work); */ ierr = 0; // compute the cholesky decomposition printf("%d/%d: start computing cholesky factor\n",mype,npe); pdpotrf_(&uplo,&n,la,&one,&one,idescal,&ierr); printf("%d/%d: finish computing cholesky factor\n",mype,npe); openScalapackStore(&scaStore,myrow,mycol,scaStore_location); saveLocalMatrix(la,nla,mla,scaStore); double test=0.0; for(i=0;i<nla*mla;i++){ test += la[i]*la[i]; } printf("%d/%d: finished computing cholesky, test=%f \n",mype,npe,test); ierr =0; // assume x and b set // assume cholesky decomposition // compute the soluation A x = b diag = 'N'; printf("%d/%d: start solving\n",mype,npe); //pdpptrs_(&uplo, &trans , &diag , &n , &one , la , &one , &one , idescal , lb , &one , &one , idescbl , &ierr); // solve triangular system //pdtrtrs (&uplo, &trans , &diag , &n , &n , la , &one , &one , idescal , lb , &one , &one , idescbl , &ierr); pdpotrs_(&uplo, &n , &one , la , &one , &one , idescal , lb , &one , &one , idescbl , &ierr); // b<- A-1 b alpha = -1.0; normb=0; pdaxpy_(&n,&alpha,lx,&one,&one,idescxl,&one,lb,&one,&one,idescbl,&one); // b<-b-x pddot_(&n,&normb,lb,&one,&one,idescbl,&one,lb,&one,&one,idescbl,&one); // norm <- b'b if (mype==0) printf("%d/%d: finish solving, norm2(sol-true) %E \n",mype,npe,normb); ierr = 0; /* // compute the eigen values jobz= 'N'; uplo='U'; // with N z is ignored descinit_(idesczl, &m, &n , &mb, &nb , &zero, &zero, &icon, &mla, &ierr); lz = malloc(sizeof(double)*mla*nla); w = malloc(sizeof(double)*m); lwork = -1; work = malloc(sizeof(double)*2); pdsyev_( &jobz, &uplo, &n, la, &one, &one, idescal, w, lz, &one, &one, idesczl, work, &lwork, &ierr); // only compute lwork //pdsyev_( &jobz, &uplo, &n, A, &ione, &ione, descA, W, Z, &ione, &ione, descZ, work, &lwork, &info ); lwork= (int) work[0]; free(work); work = (double *)calloc(lwork,sizeof(double)) ; //MPIt1 = MPI_Wtime(); pdsyev_( &jobz, &uplo, &n, la, &one, &one, idescal, w, lz, &one, &one, idesczl, work, &lwork, &ierr); // compute the eigen values //MPIt2 = MPI_Wtime(); //MPIelapsed=MPIt2-MPIt1; if (mype == 0) { saveMatrix(n,w,"eigenvalues.txt"); //printf("%d/%d: finished job in %8.2fs\n",mype,npe,MPIelapsed); // not working } */ ierr = 0; // compute the conditioner number assume that the norm and the cholesky decomposition have been computed /* DO NOT WORK lwork = 2*mla+3*nla; printf("%d/%d: lwork=%d @%p\n",mype,npe,lwork,&lwork); work2 = malloc(sizeof(double)*lwork); liwork = 2*mla+3*nla; iwork = malloc(sizeof(int)*liwork); pdpocon_(&uplo,&n,la,&one,&one,idescal,&norm,&cond,work2,&lwork,iwork,&liwork,&ierr); printf("%d/%d: condition number %f \n",mype,npe,cond); */ free(la); Cblacs_gridexit(icon); Cblacs_exit( 0 ); return 0; }
SEXP R_blacs_gridexit(SEXP CONT) { Cblacs_gridexit(INT(CONT)); return R_NilValue; }
void ddistr(int ictxt, int n, int numc, int nb, double *A , double *A_d, int *descAd ){ int RootNodeic, ione=1, izero=0, isRootNode=0, nru, info; int nprow, npcol, myrow, mycol,descA[9], itemp; int i,k; /* #ifdef NOUNDERLAPACK sl_init__(&RootNodeic,&ione, &ione); #else sl_init__(&RootNodeic,&ione, &ione); #endif */ sl_init_(&RootNodeic,&ione, &ione); Cblacs_gridinfo(ictxt, &nprow, &npcol, &myrow, &mycol); //printf("nprow=%d, npcol=%d, myrow=%d, mycol=%d\n",nprow,npcol,myrow,mycol); //printf("nb=%d\n",nb); if (myrow==0 && mycol==0) { isRootNode=1;} if (isRootNode){ //printf("root entro aca...\n"); nru = numroc_(&n, &n, &myrow,&izero, &nprow ); //printf("root paso numroc\n"); itemp = max(1, nru); descinit_(descA, &n, &numc, &n, &n, &izero, &izero, &RootNodeic, &itemp, &info); //printf("root paso descinit\n"); } else{ //printf("yo entre aca\n"); k=0; for(i=0;i<9;i++){ descA[k]=0; k++; } descA[1]=-1; } //printf("inicio de cosas para todos\n"); nru = numroc_(&n, &nb, &myrow, &izero, &nprow); //printf("todos pasan numroc\n"); itemp = max(1,nru); descinit_(descAd, &n, &numc, &nb, &nb, &izero, &izero, &ictxt,&itemp, &info); //printf("todos pasan descinit\n"); pdgemr2d_( &n, &numc, A, &ione, &ione, descA, A_d, &ione, &ione, descAd, &ictxt); //printf("todos pasan pdgemr2d\n"); if (isRootNode){ //printf("RootNodeic=%d\n",RootNodeic); Cblacs_gridexit(RootNodeic); //printf("root paso gridexit\n"); } }
void test_gemr2d(int M, int N) { int repeat = 10; int32_t one = 1; int32_t isrc = 0; int32_t num_ranks; MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); int32_t rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank); int32_t bs_row_A = M / num_ranks + std::min(1, M % num_ranks); int32_t bs_col_A = 1; int32_t bs_row_B = 1; int32_t bs_col_B = 1; int32_t blacs_handler = Csys2blacs_handle(MPI_COMM_WORLD); int32_t context1 = blacs_handler; int32_t context2 = blacs_handler; /* create BLACS context */ Cblacs_gridinit(&context1, "C", num_ranks, 1); /* get row and column ranks */ int32_t rank_row1, rank_col1; Cblacs_gridinfo(context1, &num_ranks, &one, &rank_row1, &rank_col1); /* get local number of rows and columns of a matrix */ int32_t num_rows_local1, num_cols_local1; num_rows_local1 = numroc_(&M, &bs_row_A, &rank_row1, &isrc, &num_ranks); num_cols_local1 = numroc_(&N, &bs_col_A, &rank_col1, &isrc, &one); Cblacs_gridinit(&context2, "C", 1, num_ranks); int32_t rank_row2, rank_col2; Cblacs_gridinfo(context2, &one, &num_ranks, &rank_row2, &rank_col2); int32_t num_rows_local2, num_cols_local2; num_rows_local2 = numroc_(&M, &bs_row_B, &rank_row2, &isrc, &one); num_cols_local2 = numroc_(&N, &bs_col_B, &rank_col2, &isrc, &num_ranks); if (rank == 0) { printf("local dimensions of A: %i x %i\n", num_rows_local1, num_cols_local1); printf("local dimensions of B: %i x %i\n", num_rows_local2, num_cols_local2); } int32_t descA[9], descB[9], info; descinit_(descA, &M, &N, &bs_row_A, &bs_col_A, &isrc, &isrc, &context1, &num_rows_local1, &info); descinit_(descB, &M, &N, &bs_row_B, &bs_col_B, &isrc, &isrc, &context2, &num_rows_local2, &info); std::vector<double_complex> A(num_rows_local1 * num_cols_local1); std::vector<double_complex> B(num_rows_local2 * num_cols_local2, double_complex(0, 0)); std::vector<double_complex> C(num_rows_local1 * num_cols_local1); for (int i = 0; i < num_rows_local1 * num_cols_local1; i++) { A[i] = double_complex(double(rand()) / RAND_MAX, double(rand()) / RAND_MAX); C[i] = A[i]; } double time = -MPI_Wtime(); for (int i = 0; i < repeat; i++) { pzgemr2d_(&M, &N, &A[0], &one, &one, descA, &B[0], &one, &one, descB, &context1); } time += MPI_Wtime(); if (rank == 0) { printf("average time %.4f sec, swap speed: %.4f GB/sec\n", time / repeat, sizeof(double_complex) * repeat * M * N / double(1 << 30) / time); } /* check correctness */ pzgemr2d_(&M, &N, &B[0], &one, &one, descB, &A[0], &one, &one, descA, &context1); for (int i = 0; i < num_rows_local1 * num_cols_local1; i++) { if (std::abs(A[i] - C[i]) > 1e-14) { printf("Fail.\n"); exit(0); } } Cblacs_gridexit(context1); Cblacs_gridexit(context2); Cfree_blacs_system_handle(blacs_handler); }
int main(int argc, char *argv[]){ gettimeofday(&tp, NULL); starttime=(double)tp.tv_sec+(1.e-6)*tp.tv_usec; int id, np, ret; /* dirs, files, tags ...*/ const char* dir="../data/compleib_data/"; const char* dirinit="../data/initial_points/"; const char* code="REA1"; const char* tag="01"; char* solutionQP="solutionQP.dat-s"; /*algorithm vars*/ int max_iter=10000,step_1_fail=20; double romax=10.0,beta=0.9,gamma=0.1,sigma=0.5,tolerancia_ro=10e-4; /* compleib data matrices */ genmatrix A, B1, B,C1, C,D11,D12,D21; /* compleib initial point */ genmatrix F0, Q0, V0; /* compleib matrix dimensions*/ int nx,nw,nu,nz,ny; int i,n,k; double rho=100.0; /* auxiliar matrices */ genmatrix AF, CF, OUT, MF1, MF2; struct blockmatrix diagMF1, diagMF2; /* filter SDP */ filter Fil; //double beta=0.9; //double gamma=0.1; /* sdp problem data */ struct blockmatrix Csdp; double *asdp; struct constraintmatrix *constraintssdp; /* sdp variables */ struct blockmatrix X,Z; double *y; /* sdp value of objectives functions */ double pobj,dobj; genmatrix *null=NULL; /* Initialize the process grid */ struct scalapackpar scapack; struct paramstruc params; int printlevel; /*load compleib data*/ //int size=10974; //load_genmatrix("testeigenvalue/bcsstk17.dat",&A,size,size,0); //load_compleib(code, dir, &A, &B1, &B, &C1, &C, &D11, &D12, &D21, &nx, &nw, &nu, &nz, &ny, id); //load_initial_point(code,tag,dirinit,&F0,&Q0,&V0,nx,nu,ny, id); //initialize_filter(&Fil,500.0,500.0); MPI_Init(&argc,&argv); MPI_Comm_rank (MPI_COMM_WORLD,&id); MPI_Comm_size (MPI_COMM_WORLD,&np); scapack.id = id; scapack.np = np; switch (scapack.np) { case 1: scapack.nprow=1; scapack.npcol=1; break; case 2: scapack.nprow=2; scapack.npcol=1; break; case 4: scapack.nprow=2; scapack.npcol=2; break; case 8: scapack.nprow=4; scapack.npcol=2; break; case 16: scapack.nprow=4; scapack.npcol=4; break; case 32: scapack.nprow=8; scapack.npcol=4; break; case 64: scapack.nprow=8; scapack.npcol=8; break; default: if (scapack.id==0) printf("Can not setup %d processors to a grid.\nPlease use 1,2,4,8,9,16,32 or 64 nodes to run or modify fnlsdp.c file. \n",scapack.np); MPI_Finalize(); return(1); }; sl_init_(&scapack.ic,&scapack.nprow,&scapack.npcol); Cblacs_gridinfo(scapack.ic,&scapack.nprow,&scapack.npcol,&scapack.myrow,&scapack.mycol); /* if(id==0)print_filter(&Fil); if(acceptable(&Fil,10.0,3.0,beta,gamma)) {printf("acceptable!\n");add(&Fil,10.0,3.0);} if(id==0)print_filter(&Fil); if(acceptable(&Fil,8.0,3.1,beta,gamma)) {printf("acceptable!\n");add(&Fil,8.0,3.1);} if(id==0)print_filter(&Fil); if(acceptable(&Fil,6.0,3.2,beta,gamma)) {printf("acceptable!\n");add(&Fil,6.0,3.2);} if(id==0)print_filter(&Fil); extract(&Fil,10.0,3.0); if(id==0)print_filter(&Fil); extract(&Fil,8.0,3.1); if(id==0)print_filter(&Fil); */ //printf("scapack: %d,%d,%d,%d,%d\n",scapack.ic,scapack.npcol,scapack.nprow,scapack.mycol,scapack.myrow); //printf("f=%f\ntheta=%f\n",eval_f(&F0,&Q0, &V0,rho,&A,&B1,&B,&C1,&C,&D11,&D12,&D21,nx,nw,nu,ny,nz,scapack,params,printlevel,id),eval_theta(&F0,&Q0, &V0,rho,&A,&B1,&B,&C1,&C,&D11,&D12,&D21,nx,nw,nu,ny,nz,scapack,params,printlevel,id)); //printf("scapack: %d,%d,%d,%d,%d\n",scapack.ic,scapack.npcol,scapack.nprow,scapack.mycol,scapack.myrow); //test_nelmin(&F0,&Q0, &V0,rho,&A,&B1,&B,&C1,&C,&D11,&D12,&D21,nx,nw,nu,ny,nz,scapack,params,printlevel,id); //fix_genmatrix(&Q0); //print_genmatrix(&V0); /*double ll=lambda1(&A,size,scapack,params,printlevel,id); if(id==0)printf("lambda1=%f\n",ll); MPI_Barrier(MPI_COMM_WORLD); free_mat_gen(&A,0);*/ //printf("scapack: %d,%d,%d,%d,%d\n",scapack.ic,scapack.npcol,scapack.nprow,scapack.mycol,scapack.myrow); algorithm(code,tag,dir,dirinit,max_iter,romax,beta,gamma,sigma,tolerancia_ro,step_1_fail,scapack,params,printlevel,id); /* double *dx=(double *)calloc(nu*ny+nx*(nx+1),sizeof(double)); double *x_current=(double *)calloc(nu*ny+nx*(nx+1),sizeof(double)); printf("holaaaa\n"); mats2vec(dx,&F0,&Q0,&V0,nu,ny,nx); for(i=0;i<nu*ny+nx*(nx+1);i++){ printf("dx[%d]=%f\n",i,dx[i]); } mats2vec(x_current,&F0,&Q0,&V0,nu,ny,nx); for(i=0;i<nu*ny+nx*(nx+1);i++){ printf("x_current[%d]=%f\n",i,x_current[i]); } printf("fobj=%f\n",eval_nabla_f_vec(dx,x_current, rho,&A,&B1,&B,&C1,&C,&D11,&D12,&D21,nx,nw,nu,ny,nz,scapack,params,printlevel,id)); free(dx); free(x_current); */ /*solve qp*/ //ret=0; //fix_genmatrix(&Q0); //ret=solve_qp(code,"01", &F0,&Q0, &V0,rho,&A,&B1,&B,&C1,&C,&D11,&D12,&D21,nx,nw,nu,ny,nz,scapack,params,printlevel,id,solutionQP); //printf("scapack: %d,%d,%d,%d,%d\n",scapack.ic,scapack.npcol,scapack.nprow,scapack.mycol,scapack.myrow); /*if(DEBUG_FNLSDP && id==0){ printf("F1:\n"); print_genmatrix(&F0); printf("Q1:\n"); print_genmatrix(&Q0); printf("V1:\n"); print_genmatrix(&V0); }*/ //free_filter(&Fil); //free_initial_point(&F0,&Q0,&V0, np); //free_compleib(&A, &B1, &B, &C1, &C, &D11, &D12, &D21, np); Cblacs_gridexit(scapack.ic); MPI_Finalize(); gettimeofday(&tp, NULL); endtime=(double)tp.tv_sec+(1.e-6)*tp.tv_usec; totaltime=endtime-starttime; othertime=totaltime-opotime-factortime; if(id==0){ printf("Elements time: %f \n",opotime); printf("Factor time: %f \n",factortime); printf("Other time: %f \n",othertime); printf("Total time: %f \n",totaltime); } return ret; }
int main(int argc, char **argv) { int iam, nprocs; int myrank_mpi, nprocs_mpi; int ictxt, nprow, npcol, myrow, mycol; int nb, m, n; int mpA, nqA, mpU, nqU, mpVT, nqVT; int i, j, k, itemp, min_mn; int descA[9], descU[9], descVT[9]; float *A=NULL; int info, infoNN, infoVV, infoNV, infoVN; float *U_NN=NULL, *U_VV=NULL, *U_NV=NULL, *U_VN=NULL; float *VT_NN=NULL, *VT_VV=NULL, *VT_NV=NULL, *VT_VN=NULL; float *S_NN=NULL, *S_VV=NULL, *S_NV=NULL, *S_VN=NULL; float *S_res_NN=NULL; float orthU_VV, residF, orthVT_VV; float orthU_VN, orthVT_NV; float residS_NN, eps; float res_repres_NV, res_repres_VN; /**/ int izero=0,ione=1; float rtmone=-1.0e+00; /**/ double MPIelapsedVV, MPIelapsedNN, MPIelapsedVN, MPIelapsedNV; char jobU, jobVT; int nbfailure=0, nbtestcase=0,inputfromfile, nbhetereogeneity=0; float threshold=100e+00; char buf[1024]; FILE *fd; char *c; char *t_jobU, *t_jobVT; int *t_m, *t_n, *t_nb, *t_nprow, *t_npcol; int nb_expe, expe; char hetereogeneityVV, hetereogeneityNN, hetereogeneityVN, hetereogeneityNV; int iseed[4], idist; /**/ MPI_Init( &argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi); MPI_Comm_size(MPI_COMM_WORLD, &nprocs_mpi); /**/ m = 100; n = 100; nprow = 1; npcol = 1; nb = 64; jobU='A'; jobVT='A'; inputfromfile = 0; for( i = 1; i < argc; i++ ) { if( strcmp( argv[i], "-f" ) == 0 ) { inputfromfile = 1; } if( strcmp( argv[i], "-jobvt" ) == 0 ) { if (i+1<argc) { if( strcmp( argv[i+1], "V" ) == 0 ){ jobVT = 'V'; i++; } else if( strcmp( argv[i+1], "N" ) == 0 ){ jobVT = 'N'; i++; } else if( strcmp( argv[i+1], "A" ) == 0 ){ jobVT = 'A'; i++; } else printf(" ** warning: jobvt should be set to V, N or A in the command line ** \n"); } else printf(" ** warning: jobvt should be set to V, N or A in the command line ** \n"); } if( strcmp( argv[i], "-jobu" ) == 0 ) { if (i+1<argc) { if( strcmp( argv[i+1], "V" ) == 0 ){ jobU = 'V'; i++; } else if( strcmp( argv[i+1], "N" ) == 0 ){ jobU = 'N'; i++; } else if( strcmp( argv[i+1], "A" ) == 0 ){ jobU = 'A'; i++; } else printf(" ** warning: jobu should be set to V, N or A in the command line ** \n"); } else printf(" ** warning: jobu should be set to V, N or A in the command line ** \n"); } if( strcmp( argv[i], "-m" ) == 0 ) { m = atoi(argv[i+1]); i++; } if( strcmp( argv[i], "-n" ) == 0 ) { n = atoi(argv[i+1]); i++; } if( strcmp( argv[i], "-p" ) == 0 ) { nprow = atoi(argv[i+1]); i++; } if( strcmp( argv[i], "-q" ) == 0 ) { npcol = atoi(argv[i+1]); i++; } if( strcmp( argv[i], "-nb" ) == 0 ) { nb = atoi(argv[i+1]); i++; } } /**/ if (inputfromfile){ nb_expe = 0; fd = fopen("svd.dat", "r"); if (fd == NULL) { printf("File failed to open svd.dat from processor mpirank(%d/%d): \n",myrank_mpi,nprocs_mpi); exit(-1); } do { c = fgets(buf, 1024, fd); /* get one line from the file */ if (c != NULL) if (c[0] != '#') nb_expe++; } while (c != NULL); /* repeat until NULL */ fclose(fd); t_jobU = (char *)calloc(nb_expe,sizeof(char)) ; t_jobVT = (char *)calloc(nb_expe,sizeof(char)) ; t_m = (int *)calloc(nb_expe,sizeof(int )) ; t_n = (int *)calloc(nb_expe,sizeof(int )) ; t_nb = (int *)calloc(nb_expe,sizeof(int )) ; t_nprow = (int *)calloc(nb_expe,sizeof(int )) ; t_npcol = (int *)calloc(nb_expe,sizeof(int )) ; fd = fopen("svd.dat", "r"); expe=0; do { c = fgets(buf, 1024, fd); /* get one line from the file */ if (c != NULL) if (c[0] != '#'){ //printf("NBEXPE = %d\n",expe); sscanf(c,"%c %c %d %d %d %d %d", &(t_jobU[expe]),&(t_jobVT[expe]),&(t_m[expe]),&(t_n[expe]), &(t_nb[expe]),(&t_nprow[expe]),&(t_npcol[expe])); expe++; } } while (c != NULL); /* repeat until NULL */ fclose(fd); } else { nb_expe = 1; t_jobU = (char *)calloc(nb_expe,sizeof(char)) ; t_jobVT = (char *)calloc(nb_expe,sizeof(char)) ; t_m = (int *)calloc(nb_expe,sizeof(int )) ; t_n = (int *)calloc(nb_expe,sizeof(int )) ; t_nb = (int *)calloc(nb_expe,sizeof(int )) ; t_nprow = (int *)calloc(nb_expe,sizeof(int )) ; t_npcol = (int *)calloc(nb_expe,sizeof(int )) ; t_jobU[0] = jobU; t_jobVT[0] = jobVT; t_m[0] = m; t_n[0] = n; t_nb[0] = nb; t_nprow[0] = nprow; t_npcol[0] = npcol; } if (myrank_mpi==0){ printf("\n"); printf("--------------------------------------------------------------------------------------------------------------------\n"); printf(" Testing psgsevd -- float precision SVD ScaLAPACK routine \n"); printf("jobU jobVT m n nb p q || info heter resid orthU orthVT |SNN-SVV| time(s) cond(A) \n"); printf("--------------------------------------------------------------------------------------------------------------------\n"); } /**/ for (expe = 0; expe<nb_expe; expe++){ jobU = t_jobU[expe] ; jobVT = t_jobVT[expe] ; m = t_m[expe] ; n = t_n[expe] ; nb = t_nb[expe] ; nprow = t_nprow[expe] ; npcol = t_npcol[expe] ; if (nb>n) nb = n; if (nprow*npcol>nprocs_mpi){ if (myrank_mpi==0) printf(" **** ERROR : we do not have enough processes available to make a p-by-q process grid ***\n"); printf(" **** Bye-bye ***\n"); MPI_Finalize(); exit(1); } /**/ Cblacs_pinfo( &iam, &nprocs ) ; Cblacs_get( -1, 0, &ictxt ); Cblacs_gridinit( &ictxt, "Row", nprow, npcol ); Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol ); /**/ min_mn = min(m,n); /**/ //if (iam==0) //printf("\tm=%d\tn = %d\t\t(%d,%d)\t%dx%d\n",m,n,nprow,npcol,nb,nb); //printf("Hello World, I am proc %d over %d for MPI, proc %d over %d for BLACS in position (%d,%d) in the process grid\n", //myrank_mpi,nprocs_mpi,iam,nprocs,myrow,mycol); /* * * Work only the process in the process grid * */ //if ((myrow < nprow)&(mycol < npcol)){ if ((myrow>-1)&(mycol>-1)&(myrow<nprow)&(mycol<npcol)){ /* * * Compute the size of the local matrices (thanks to numroc) * */ mpA = numroc_( &m , &nb, &myrow, &izero, &nprow ); nqA = numroc_( &n , &nb, &mycol, &izero, &npcol ); mpU = numroc_( &m , &nb, &myrow, &izero, &nprow ); nqU = numroc_( &min_mn, &nb, &mycol, &izero, &npcol ); mpVT = numroc_( &min_mn, &nb, &myrow, &izero, &nprow ); nqVT = numroc_( &n , &nb, &mycol, &izero, &npcol ); /* * * Allocate and fill the matrices A and B * */ A = (float *)calloc(mpA*nqA,sizeof(float)) ; if (A==NULL){ printf("error of memory allocation A on proc %dx%d\n",myrow,mycol); exit(0); } /**/ // seed = iam*(mpA*nqA*2); srand(seed); idist = 2; iseed[0] = mpA%4096; iseed[1] = iam%4096; iseed[2] = nqA%4096; iseed[3] = 23; /**/ k = 0; for (i = 0; i < mpA; i++) { for (j = 0; j < nqA; j++) { slarnv_( &idist, iseed, &ione, &(A[k]) ); k++; } } /* * * Initialize the array descriptor for the distributed matrices xA, U and VT * */ itemp = max( 1, mpA ); descinit_( descA, &m, &n, &nb, &nb, &izero, &izero, &ictxt, &itemp, &info ); itemp = max( 1, mpA ); descinit_( descU, &m, &min_mn, &nb, &nb, &izero, &izero, &ictxt, &itemp, &info ); itemp = max( 1, mpVT ); descinit_( descVT, &min_mn, &n, &nb, &nb, &izero, &izero, &ictxt, &itemp, &info ); /**/ eps = pslamch_( &ictxt, "Epsilon" ); /**/ if ( ((jobU=='V')&(jobVT=='N')) ||(jobU == 'A' )||(jobVT=='A')){ nbtestcase++; U_VN = (float *)calloc(mpU*nqU,sizeof(float)) ; if (U_VN==NULL){ printf("error of memory allocation U_VN on proc %dx%d\n",myrow,mycol); exit(0); } S_VN = (float *)calloc(min_mn,sizeof(float)) ; if (S_VN==NULL){ printf("error of memory allocation S_VN on proc %dx%d\n",myrow,mycol); exit(0); } infoVN = driver_psgesvd( 'V', 'N', m, n, A, 1, 1, descA, S_VN, U_VN, 1, 1, descU, VT_VN, 1, 1, descVT, &MPIelapsedVN); orthU_VN = verif_orthogonality(m,min_mn,U_VN , 1, 1, descU); res_repres_VN = verif_repres_VN( m, n, A, 1, 1, descA, U_VN, 1, 1, descU, S_VN); if (infoVN==min_mn+1) hetereogeneityVN = 'H'; else hetereogeneityVN = 'N'; if ( iam==0 ) printf(" V N %6d %6d %3d %3d %3d || %3d %c %7.1e %7.1e %8.2f %7.1e\n", m,n,nb,nprow,npcol,infoVN,hetereogeneityVN,res_repres_VN/(S_VN[0]/S_VN[min_mn-1]), orthU_VN,MPIelapsedVN,S_VN[0]/S_VN[min_mn-1]); if (infoVN==min_mn+1) nbhetereogeneity++ ; else if ((res_repres_VN/eps/(S_VN[0]/S_VN[min_mn-1])>threshold)||(orthU_VN/eps>threshold)||(infoVN!=0)) nbfailure++; } /**/ if (((jobU=='N')&(jobVT=='V'))||(jobU == 'A' )||(jobVT=='A')){ nbtestcase++; VT_NV = (float *)calloc(mpVT*nqVT,sizeof(float)) ; if (VT_NV==NULL){ printf("error of memory allocation VT_NV on proc %dx%d\n",myrow,mycol); exit(0); } S_NV = (float *)calloc(min_mn,sizeof(float)) ; if (S_NV==NULL){ printf("error of memory allocation S_NV on proc %dx%d\n",myrow,mycol); exit(0); } infoNV = driver_psgesvd( 'N', 'V', m, n, A, 1, 1, descA, S_NV, U_NV, 1, 1, descU, VT_NV, 1, 1, descVT, &MPIelapsedNV); orthVT_NV = verif_orthogonality(min_mn,n,VT_NV, 1, 1, descVT); res_repres_NV = verif_repres_NV( m, n, A, 1, 1, descA, VT_NV, 1, 1, descVT, S_NV); if (infoNV==min_mn+1) hetereogeneityNV = 'H'; else hetereogeneityNV = 'N'; if ( iam==0 ) printf(" N V %6d %6d %3d %3d %3d || %3d %c %7.1e %7.1e %8.2f %7.1e\n", m,n,nb,nprow,npcol,infoNV,hetereogeneityNV,res_repres_NV/(S_NV[0]/S_NV[min_mn-1]), orthVT_NV,MPIelapsedNV,S_NV[0]/S_NV[min_mn-1]); if (infoNV==min_mn+1) nbhetereogeneity++ ; else if ((res_repres_NV/eps/(S_NV[0]/S_NV[min_mn-1])>threshold)||(orthVT_NV/eps>threshold)||(infoNV!=0)) nbfailure++; } /**/ if ( ((jobU=='N')&(jobVT=='N')) || ((jobU=='V')&(jobVT=='V')) || (jobU == 'A' ) || (jobVT=='A') ) { nbtestcase++; U_VV = (float *)calloc(mpU*nqU,sizeof(float)) ; if (U_VV==NULL){ printf("error of memory allocation U_VV on proc %dx%d\n",myrow,mycol); exit(0); } VT_VV = (float *)calloc(mpVT*nqVT,sizeof(float)) ; if (VT_VV==NULL){ printf("error of memory allocation VT_VV on proc %dx%d\n",myrow,mycol); exit(0); } S_VV = (float *)calloc(min_mn,sizeof(float)) ; if (S_VV==NULL){ printf("error of memory allocation S_VV on proc %dx%d\n",myrow,mycol); exit(0); } infoVV = driver_psgesvd( 'V', 'V', m, n, A, 1, 1, descA, S_VV, U_VV, 1, 1, descU, VT_VV, 1, 1, descVT, &MPIelapsedVV); orthU_VV = verif_orthogonality(m,min_mn,U_VV , 1, 1, descU); orthVT_VV = verif_orthogonality(min_mn,n,VT_VV, 1, 1, descVT); residF = verif_representativity( m, n, A, 1, 1, descA, U_VV, 1, 1, descU, VT_VV, 1, 1, descVT, S_VV); if (infoVV==min_mn+1) hetereogeneityVV = 'H'; else hetereogeneityVV = 'N'; if ( iam==0 ) printf(" V V %6d %6d %3d %3d %3d || %3d %c %7.1e %7.1e %7.1e %8.2f %7.1e\n", m,n,nb,nprow,npcol,infoVV,hetereogeneityVV,residF,orthU_VV,orthVT_VV,MPIelapsedVV,S_VV[0]/S_VV[min_mn-1]); if (infoVV==min_mn+1) nbhetereogeneity++ ; else if ((residF/eps>threshold)||(orthU_VV/eps>threshold)||(orthVT_VV/eps>threshold)||(infoVV!=0)) nbfailure++; } /**/ if (((jobU=='N')&(jobVT=='N'))||(jobU == 'A' )||(jobVT=='A')){ nbtestcase++; S_NN = (float *)calloc(min_mn,sizeof(float)) ; if (S_NN==NULL){ printf("error of memory allocation S_NN on proc %dx%d\n",myrow,mycol); exit(0); } infoNN = driver_psgesvd( 'N', 'N', m, n, A, 1, 1, descA, S_NN, U_NN, 1, 1, descU, VT_NN, 1, 1, descVT, &MPIelapsedNN); S_res_NN = (float *)calloc(min_mn,sizeof(float)) ; if (S_res_NN==NULL){ printf("error of memory allocation S on proc %dx%d\n",myrow,mycol); exit(0); } scopy_(&min_mn,S_VV,&ione,S_res_NN,&ione); saxpy_ (&min_mn,&rtmone,S_NN,&ione,S_res_NN,&ione); residS_NN = snrm2_(&min_mn,S_res_NN,&ione) / snrm2_(&min_mn,S_VV,&ione); free(S_res_NN); if (infoNN==min_mn+1) hetereogeneityNN = 'H'; else hetereogeneityNN = 'N'; if ( iam==0 ) printf(" N N %6d %6d %3d %3d %3d || %3d %c %7.1e %8.2f %7.1e\n", m,n,nb,nprow,npcol,infoNN,hetereogeneityNN,residS_NN,MPIelapsedNN,S_NN[0]/S_NN[min_mn-1]); if (infoNN==min_mn+1) nbhetereogeneity++ ; else if ((residS_NN/eps>threshold)||(infoNN!=0)) nbfailure++; } /**/ if (((jobU=='V')&(jobVT=='N'))||(jobU == 'A' )||(jobVT=='A')){ free(S_VN); free(U_VN); } if (((jobU=='N')&(jobVT=='V'))||(jobU == 'A' )||(jobVT=='A')){ free(VT_NV); free(S_NV); } if (((jobU=='N')&(jobVT=='N'))||(jobU == 'A' )||(jobVT=='A')){ free(S_NN); } if (((jobU=='N')&(jobVT=='N'))||((jobU=='V')&(jobVT=='V'))||(jobU == 'A' )||(jobVT=='A')){ free(U_VV); free(S_VV); free(VT_VV);} free(A); Cblacs_gridexit( 0 ); } /* * Print ending messages */ } if ( iam==0 ){ printf("--------------------------------------------------------------------------------------------------------------------\n"); printf(" [ nbhetereogeneity = %d / %d ]\n",nbhetereogeneity, nbtestcase); printf(" [ nbfailure = %d / %d ]\n",nbfailure, nbtestcase-nbhetereogeneity); printf("--------------------------------------------------------------------------------------------------------------------\n"); printf("\n"); } /**/ free(t_jobU ); free(t_jobVT ); free(t_m ); free(t_n ); free(t_nb ); free(t_nprow ); free(t_npcol ); MPI_Finalize(); exit(0); }
void FreeGrid( int context ) { Cblacs_gridexit( context ); }
int main(int argc, char **argv) { int ictxt, nside, ngrid, nblock, nthread; int rank, size; int ic, ir, nc, nr; int i, j; char *fname; int info, ZERO=0, ONE=1; struct timeval st, et; double dtnn, dtnt, dttn, dttt; double gfpc_nn, gfpc_nt, gfpc_tn, gfpc_tt; /* Initialising MPI stuff */ MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size); printf("Process %i of %i.\n", rank, size); /* Parsing arguments */ if(argc < 6) { exit(-3); } nside = atoi(argv[1]); ngrid = atoi(argv[2]); nblock = atoi(argv[3]); nthread = atoi(argv[4]); fname = argv[5]; if(rank == 0) { printf("Multiplying matrices of size %i x %i\n", nside, nside); printf("Process grid size %i x %i\n", ngrid, ngrid); printf("Block size %i x %i\n", nblock, nblock); printf("Using %i OpenMP threads\n", nthread); } #ifdef _OPENMP if(rank == 0) printf("Setting OMP_NUM_THREADS=%i\n", nthread); omp_set_num_threads(nthread); #endif /* Setting up BLACS */ Cblacs_pinfo( &rank, &size ) ; Cblacs_get(-1, 0, &ictxt ); Cblacs_gridinit(&ictxt, "Row", ngrid, ngrid); Cblacs_gridinfo(ictxt, &nr, &nc, &ir, &ic); int descA[9], descB[9], descC[9]; /* Fetch local array sizes */ int Ar, Ac, Br, Bc, Cr, Cc; Ar = numroc_( &nside, &nblock, &ir, &ZERO, &nr); Ac = numroc_( &nside, &nblock, &ic, &ZERO, &nc); Br = numroc_( &nside, &nblock, &ir, &ZERO, &nr); Bc = numroc_( &nside, &nblock, &ic, &ZERO, &nc); Cr = numroc_( &nside, &nblock, &ir, &ZERO, &nr); Cc = numroc_( &nside, &nblock, &ic, &ZERO, &nc); printf("Local array section %i x %i\n", Ar, Ac); /* Set descriptors */ descinit_(descA, &nside, &nside, &nblock, &nblock, &ZERO, &ZERO, &ictxt, &Ar, &info); descinit_(descB, &nside, &nside, &nblock, &nblock, &ZERO, &ZERO, &ictxt, &Br, &info); descinit_(descC, &nside, &nside, &nblock, &nblock, &ZERO, &ZERO, &ictxt, &Cr, &info); /* Initialise and fill arrays */ double *A = (double *)malloc(Ar*Ac*sizeof(double)); double *B = (double *)malloc(Br*Bc*sizeof(double)); double *C = (double *)malloc(Cr*Cc*sizeof(double)); for(i = 0; i < Ar; i++) { for(j = 0; j < Ac; j++) { A[j*Ar + i] = drand48(); B[j*Br + i] = drand48(); C[j*Cr + i] = 0.0; } } double alpha = 1.0, beta = 0.0; //======================== if(rank == 0) printf("Starting multiplication (NN).\n"); Cblacs_barrier(ictxt,"A"); gettimeofday(&st, NULL); pdgemm_("N", "N", &nside, &nside, &nside, &alpha, A, &ONE, &ONE, descA, B, &ONE, &ONE, descB, &beta, C, &ONE, &ONE, descC ); Cblacs_barrier(ictxt,"A"); gettimeofday(&et, NULL); dtnn = (double)((et.tv_sec-st.tv_sec) + (et.tv_usec-st.tv_usec)*1e-6); gfpc_nn = 2.0*pow(nside, 3) / (dtnn * 1e9 * ngrid * ngrid * nthread); if(rank == 0) printf("Done.\n=========\nTime taken: %g s\nGFlops per core: %g\n=========\n", dtnn, gfpc_nn); //======================== //======================== if(rank == 0) printf("Starting multiplication (NT).\n"); Cblacs_barrier(ictxt,"A"); gettimeofday(&st, NULL); pdgemm_("N", "T", &nside, &nside, &nside, &alpha, A, &ONE, &ONE, descA, B, &ONE, &ONE, descB, &beta, C, &ONE, &ONE, descC ); Cblacs_barrier(ictxt,"A"); gettimeofday(&et, NULL); dtnt = (double)((et.tv_sec-st.tv_sec) + (et.tv_usec-st.tv_usec)*1e-6); gfpc_nt = 2.0*pow(nside, 3) / (dtnt * 1e9 * ngrid * ngrid * nthread); if(rank == 0) printf("Done.\n=========\nTime taken: %g s\nGFlops per core: %g\n=========\n", dtnt, gfpc_nt); //======================== //======================== if(rank == 0) printf("Starting multiplication (TN).\n"); Cblacs_barrier(ictxt,"A"); gettimeofday(&st, NULL); pdgemm_("T", "N", &nside, &nside, &nside, &alpha, A, &ONE, &ONE, descA, B, &ONE, &ONE, descB, &beta, C, &ONE, &ONE, descC ); Cblacs_barrier(ictxt,"A"); gettimeofday(&et, NULL); dttn = (double)((et.tv_sec-st.tv_sec) + (et.tv_usec-st.tv_usec)*1e-6); gfpc_tn = 2.0*pow(nside, 3) / (dttn * 1e9 * ngrid * ngrid * nthread); if(rank == 0) printf("Done.\n=========\nTime taken: %g s\nGFlops per core: %g\n=========\n", dttn, gfpc_tn); //======================== //======================== if(rank == 0) printf("Starting multiplication (TT).\n"); Cblacs_barrier(ictxt,"A"); gettimeofday(&st, NULL); pdgemm_("T", "T", &nside, &nside, &nside, &alpha, A, &ONE, &ONE, descA, B, &ONE, &ONE, descB, &beta, C, &ONE, &ONE, descC ); Cblacs_barrier(ictxt,"A"); gettimeofday(&et, NULL); dttt = (double)((et.tv_sec-st.tv_sec) + (et.tv_usec-st.tv_usec)*1e-6); gfpc_tt = 2.0*pow(nside, 3) / (dttt * 1e9 * ngrid * ngrid * nthread); if(rank == 0) printf("Done.\n=========\nTime taken: %g s\nGFlops per core: %g\n=========\n", dttt, gfpc_tt); //======================== if(rank == 0) { FILE * fd; fd = fopen(fname, "w"); fprintf(fd, "%g %g %g %g %i %i %i %i %g %g %g %g\n", gfpc_nn, gfpc_nt, gfpc_tn, gfpc_tt, nside, ngrid, nblock, nthread, dtnn, dtnt, dttn, dttt); fclose(fd); } Cblacs_gridexit( 0 ); MPI_Finalize(); }