void blacs_gridinit_nektar(int *BLACS_PARAMS, int *DESCA, int *DESCB){ int i,j,k; /* BLACS_PARAMS: [0] = ictxt; [1] = my_proc; [2] = total_procs; [3] = Nproc_row; [4] = Nproc_col; [5] = my_row; [6] = my_col; [7] = Global_rows; [8] = Global_columns; [9] = Block_Size_row; [10] = Block_Size_col; [11] = LOC_rows; [12] = LOC_columns; */ blacs_pinfo_( i, j); BLACS_PARAMS[1] = i; BLACS_PARAMS[2] = j; blacs_get_( -1, 0, k); // <- LG check the arguments of this call BLACS_PARAMS[0] = k; i = BLACS_PARAMS[3]; j = BLACS_PARAMS[4]; blacs_gridinit_( BLACS_PARAMS[0], "Row", i, j); blacs_gridinfo_( BLACS_PARAMS[0] ,BLACS_PARAMS[3], BLACS_PARAMS[4], i, j); BLACS_PARAMS[5] = i; BLACS_PARAMS[6] = j; i = 0; BLACS_PARAMS[11] = numroc_(BLACS_PARAMS[7],BLACS_PARAMS[9], BLACS_PARAMS[5],i,BLACS_PARAMS[3]); BLACS_PARAMS[12] = numroc_(BLACS_PARAMS[8],BLACS_PARAMS[10],BLACS_PARAMS[6],i,BLACS_PARAMS[4]); i = 0; j = 0; descinit_(DESCA, BLACS_PARAMS[7], BLACS_PARAMS[8], BLACS_PARAMS[9], BLACS_PARAMS[10], i, j, BLACS_PARAMS[0],BLACS_PARAMS[11], k); if (k != 0) fprintf(stderr,"blacs_gridinit_nektar: ERROR, descinit(info) = %d \n",k); descinit_(DESCB, BLACS_PARAMS[7], 1, BLACS_PARAMS[9], 1, i, j, BLACS_PARAMS[0],BLACS_PARAMS[11], k); if (k != 0) fprintf(stderr,"blacs_gridinit_nektar: ERROR, descinit(info) = %d \n",k); }
/// /// This code generates a PdgesvdArgs parameter block that can be used to drive /// pdgesvdSlave2() when there is no SciDB application to provide the info. /// It makes up parameters for a pdgesvd call that are appropriate to the /// processor grid and order of matrix being decomposed /// scidb::PdgesvdArgs pdgesvdGenTestArgs(slpp::int_t ICTXT, slpp::int_t NPROW, slpp::int_t NPCOL, slpp::int_t MYPROW, slpp::int_t MYPCOL, slpp::int_t MYPNUM, slpp::int_t order) { scidb::PdgesvdArgs result; // hard-code a problem based on order and fixed block size const slpp::int_t M=order; const slpp::int_t N=order; const slpp::int_t MIN_MN=order; const slpp::int_t BLKSZ=slpp::SCALAPACK_EFFICIENT_BLOCK_SIZE; // we are making up an array descriptor, not receiving one // as is normal for functions in a xxxxSlave.cpp file. It is only because // its a test routine that SCALAPACK_EFFICIENT_BLOCK_SIZE is referenced here // Normally it is only used at the xxxxxPhysical.cpp operator level. const slpp::int_t one = 1 ; const char jobU = 'V'; const char jobVT = 'V'; // create ScaLAPACK array descriptors const slpp::int_t RSRC = 0 ; // LLD(A) slpp::int_t LLD_A = std::max(one, numroc_( order, BLKSZ, MYPROW, RSRC, NPROW )); // LLD(VT) slpp::int_t LLD_VT = std::max(one, numroc_( order, BLKSZ, MYPROW, RSRC, NPROW )); // WARNING -- note I never checked INFO from descinits !! slpp::int_t INFO = 0; slpp::desc_t DESC_A; descinit_(DESC_A, order, order, BLKSZ, BLKSZ, 0, 0, ICTXT, LLD_A, INFO); if (INFO != 0) throw("pdgesvdGenTestArgs: unexpected runtime error"); slpp::desc_t DESC_U; descinit_(DESC_U, order, order, BLKSZ, BLKSZ, 0, 0, ICTXT, LLD_A, INFO); if (INFO != 0) throw("pdgesvdGenTestArgs: unexpected runtime error"); slpp::desc_t DESC_VT; descinit_(DESC_VT, order, order, BLKSZ, BLKSZ, 0, 0, ICTXT, LLD_VT, INFO); if (INFO != 0) throw("pdgesvdGenTestArgs: unexpected runtime error"); // S is different: global, not distributed, so its LLD(S) == LEN(S) slpp::desc_t DESC_S; descinit_(DESC_S, MIN_MN, 1, BLKSZ, BLKSZ, 0, 0, ICTXT, MIN_MN, INFO); if (INFO != 0) throw("pdgesvdGenTestArgs: unexpected runtime error"); pdgesvdMarshallArgs(&result, NPROW, NPCOL, MYPROW, MYPCOL, MYPNUM, jobU, jobVT, M, N, NULL /*A*/, one, one, DESC_A, NULL /*S*/, NULL /*U*/, one, one, DESC_U, NULL /*VT*/, one, one, DESC_VT); return result; }
void dgather(int ictxt, int n, int numc, int nb, double *A, double *A_d, int *descAd){ int RootNodeic, ione=1, izero=0, isRootNode=0, nru, info; int nprow, npcol, myrow, mycol, descA[9], itemp; int i,k; sl_init_(&RootNodeic, &ione, &ione); Cblacs_gridinfo(ictxt, &nprow,&npcol, &myrow, &mycol); if (myrow==0 && mycol ==0){ isRootNode = 1;} if(isRootNode){ nru = numroc_(&n, &n, &myrow, &izero, &nprow); itemp = max(1,nru); descinit_(descA, &n, &numc, &n, &n, &izero, &izero, &RootNodeic, &itemp, &info ); } else{ k=0; for(i=0;i<9;i++){ descA[k]=0; k++; } descA[1]=-1; } pdgemr2d_(&n,&numc,A_d,&ione, &ione, descAd, A, &ione, &ione, descA, &ictxt ); if (isRootNode){ Cblacs_gridexit(RootNodeic); } }
/*=================================================================== * * Simplified wrapper for the ScaLAPACK routine descinit, which * initializes the descrip array associated to each distributed * matrix or vector. */ void Build_descrip(int my_rank, char* name, int* descrip, int m, int n, int row_block_size, int col_block_size, int blacs_grid, int leading_dim) { int first_proc_row = 0; /* Assume all distributed arrays begin */ int first_proc_col = 0; /* in process row 0, process col 0 */ int error_info; extern void descinit_(int* descrip, int* m, int* n, int* row_block_size, int* col_block_size, int* first_proc_row, int* first_proc_col, int* blacs_grid, int* leading_dim, int* error_info); descinit_(descrip, &m, &n, &row_block_size, &col_block_size, &first_proc_row, &first_proc_col, &blacs_grid, &leading_dim, &error_info); if (error_info != 0) { fprintf(stderr, "Process %d > Descinit for b failed.\n", my_rank); fprintf(stderr, "Process %d > error_info = %d\n", my_rank, error_info); fprintf(stderr, "Process %d > Quitting.\n", my_rank); MPI_Abort(MPI_COMM_WORLD, -1); } } /* Build_descrip */
SEXP R_descinit(SEXP DIM, SEXP BLDIM, SEXP ICTXT, SEXP LLD) { R_INIT; int row_col_src = 0; int info = 0; SEXP desc; newRvec(desc, 9, "int"); descinit_(INTP(desc), INTP(DIM), INTP(DIM)+1, INTP(BLDIM), INTP(BLDIM)+1, &row_col_src, &row_col_src, INTP(ICTXT), INTP(LLD), &info); R_END; return desc; }
float verif_repres_VN(int m, int n, float *A, int ia, int ja, int *descA, float *U, int iu, int ju, int *descU, float *S){ float *VTcpy=NULL; int nprow, npcol, myrow, mycol; int min_mn, max_mn, mpA, prow, localcol, i, nqA; int ictxt, nbA, rsrcA, csrcA, mpVT, nqVT, descVTcpy[9], itemp, ivtcpy, jvtcpy; int ctxt_ = 1, nb_ = 5, rsrc_ = 6, csrc_ = 7; int izero = 0, info; float tpone= +1.0e+00, tzero= +0.0e+00; float verif_repres_VN, invStemp; min_mn = min(m,n); max_mn = max(m,n); ictxt = descA[ctxt_]; Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol ); nbA = descA[nb_]; rsrcA = descA[rsrc_] ; csrcA = descA[csrc_] ; mpA = numroc_( &m , &nbA, &myrow, &rsrcA, &nprow ); nqA = numroc_( &n , &nbA, &mycol, &csrcA, &npcol ); mpVT = numroc_( &min_mn, &nbA, &myrow, &rsrcA, &nprow ); nqVT = numroc_( &n , &nbA, &mycol, &csrcA, &npcol ); itemp = max( 1, mpVT ); descinit_( descVTcpy, &min_mn, &n, &nbA, &nbA, &rsrcA, &csrcA, &ictxt, &itemp, &info ); ivtcpy = 1; jvtcpy = 1; VTcpy = (float *)calloc(mpVT*nqVT,sizeof(float)) ; if (VTcpy==NULL){ printf("error of memory allocation VTcpy on proc %dx%d\n",myrow,mycol); exit(0); } psgemm_( "T", "N", &min_mn, &n, &m, &tpone, U, &iu, &ju, descU, A, &ia, &ja, descA, &tzero, VTcpy, &ivtcpy, &jvtcpy, descVTcpy ); for (i=1;i<min_mn+1;i++){ prow = indxg2p_( &i, &nbA, &izero, &izero, &nprow ); localcol = indxg2l_( &i, &nbA, &izero, &izero, &nprow ); invStemp = 1/S[i-1]; if( myrow==prow ) sscal_( &nqA, &invStemp, &(VTcpy[localcol-1]), &mpVT ); } verif_repres_VN = verif_orthogonality(min_mn,n,VTcpy, ivtcpy, jvtcpy, descVTcpy); free(VTcpy); return verif_repres_VN; }
float verif_repres_NV(int m, int n, float *A, int ia, int ja, int *descA, float *VT, int ivt, int jvt, int *descVT, float *S){ float *Ucpy=NULL; int nprow, npcol, myrow, mycol; int min_mn, max_mn, mpA, pcol, localcol, i, nqA; int ictxt, nbA, rsrcA, csrcA, mpU, nqU, descUcpy[9], itemp, iucpy, jucpy; int ctxt_ = 1, nb_ = 5, rsrc_ = 6, csrc_ = 7; int izero = 0, ione = 1, info; float tpone= +1.0e+00, tzero= +0.0e+00; float verif_repres_NV, invStemp; min_mn = min(m,n); max_mn = max(m,n); ictxt = descA[ctxt_]; Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol ); nbA = descA[nb_]; rsrcA = descA[rsrc_] ; csrcA = descA[csrc_] ; mpA = numroc_( &m , &nbA, &myrow, &rsrcA, &nprow ); nqA = numroc_( &n , &nbA, &mycol, &csrcA, &npcol ); itemp = max( 1, mpA ); descinit_( descUcpy, &m, &min_mn, &nbA, &nbA, &rsrcA, &csrcA, &ictxt, &itemp, &info ); iucpy = 1; jucpy = 1; mpU = numroc_( &m , &nbA, &myrow, &rsrcA, &nprow ); nqU = numroc_( &min_mn, &nbA, &mycol, &csrcA, &npcol ); Ucpy = (float *)calloc(mpU*nqU,sizeof(float)) ; if (Ucpy==NULL){ printf("error of memory allocation Ucpy on proc %dx%d\n",myrow,mycol); exit(0); } psgemm_( "N", "T", &m, &min_mn, &n, &tpone, A, &ia, &ja, descA, VT, &ivt, &jvt, descVT, &tzero, Ucpy, &iucpy, &jucpy, descUcpy ); for (i=1;i<min_mn+1;i++){ pcol = indxg2p_( &i, &(descUcpy[5]), &izero, &izero, &npcol ); localcol = indxg2l_( &i, &(descUcpy[5]), &izero, &izero, &npcol ); invStemp = 1/S[i-1]; if( mycol==pcol ) sscal_( &mpA, &invStemp, &(Ucpy[ ( localcol-1 )*descUcpy[8] ]), &ione ); } verif_repres_NV = verif_orthogonality(m,min_mn,Ucpy, iucpy, jucpy, descUcpy); free(Ucpy); return verif_repres_NV; }
float verif_orthogonality(int m, int n, float *U, int iu, int ju, int *descU){ float *R=NULL; int nprow, npcol, myrow, mycol; int mpR, nqR, nb, itemp, descR[9], ictxt, info, min_mn, max_mn; int ctxt_ = 1, nb_ = 5; int izero = 0, ione = 1; float *wwork=NULL; float tmone= -1.0e+00, tpone= +1.0e+00, tzero= +0.0e+00; float orthU; min_mn = min(m,n); max_mn = max(m,n); ictxt = descU[ctxt_]; nb = descU[nb_]; Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol ); mpR = numroc_( &min_mn, &nb, &myrow, &izero, &nprow ); nqR = numroc_( &min_mn, &nb, &mycol, &izero, &npcol ); R = (float *)calloc(mpR*nqR,sizeof(float)) ; if (R==NULL){ printf("error of memory allocation R on proc %dx%d\n",myrow,mycol); exit(0); } itemp = max( 1, mpR ); descinit_( descR, &min_mn, &min_mn, &nb, &nb, &izero, &izero, &ictxt, &itemp, &info ); pslaset_( "F", &min_mn, &min_mn, &tzero, &tpone, R, &ione, &ione, descR ); if (m>n) psgemm_( "T", "N", &min_mn, &min_mn, &m, &tpone, U, &iu, &ju, descU, U, &iu, &ju, descU, &tmone, R, &ione, &ione, descR ); else psgemm_( "N", "T", &min_mn, &min_mn, &n, &tpone, U, &iu, &ju, descU, U, &iu, &ju, descU, &tmone, R, &ione, &ione, descR ); orthU = pslange_( "F", &min_mn, &min_mn, R, &ione, &ione, descR, wwork ); orthU = orthU / ((float) max_mn); free(R); return orthU; }
void ddistr(int ictxt, int n, int numc, int nb, double *A , double *A_d, int *descAd ){ int RootNodeic, ione=1, izero=0, isRootNode=0, nru, info; int nprow, npcol, myrow, mycol,descA[9], itemp; int i,k; /* #ifdef NOUNDERLAPACK sl_init__(&RootNodeic,&ione, &ione); #else sl_init__(&RootNodeic,&ione, &ione); #endif */ sl_init_(&RootNodeic,&ione, &ione); Cblacs_gridinfo(ictxt, &nprow, &npcol, &myrow, &mycol); //printf("nprow=%d, npcol=%d, myrow=%d, mycol=%d\n",nprow,npcol,myrow,mycol); //printf("nb=%d\n",nb); if (myrow==0 && mycol==0) { isRootNode=1;} if (isRootNode){ //printf("root entro aca...\n"); nru = numroc_(&n, &n, &myrow,&izero, &nprow ); //printf("root paso numroc\n"); itemp = max(1, nru); descinit_(descA, &n, &numc, &n, &n, &izero, &izero, &RootNodeic, &itemp, &info); //printf("root paso descinit\n"); } else{ //printf("yo entre aca\n"); k=0; for(i=0;i<9;i++){ descA[k]=0; k++; } descA[1]=-1; } //printf("inicio de cosas para todos\n"); nru = numroc_(&n, &nb, &myrow, &izero, &nprow); //printf("todos pasan numroc\n"); itemp = max(1,nru); descinit_(descAd, &n, &numc, &nb, &nb, &izero, &izero, &ictxt,&itemp, &info); //printf("todos pasan descinit\n"); pdgemr2d_( &n, &numc, A, &ione, &ione, descA, A_d, &ione, &ione, descAd, &ictxt); //printf("todos pasan pdgemr2d\n"); if (isRootNode){ //printf("RootNodeic=%d\n",RootNodeic); Cblacs_gridexit(RootNodeic); //printf("root paso gridexit\n"); } }
int MAIN__(int argc, char** argv) { int num; // number of data int dim; // dimension of each data int nprow=4; // number of row int npcol=1; // number of columnn int zero=0, one=1; // constant value int ictxt,myrow,mycol,pnum,pdim,info; char ifilename[LEN_FILENAME]; char ofilename[LEN_FILENAME]; int myproc, nprocs; Cblacs_pinfo(&myproc, &nprocs); Cblacs_setup(&myproc, &nprocs); Cblacs_get(-1,0,&ictxt); nprow = nprocs; npcol = 1; // fixed char order[] = "Row"; Cblacs_gridinit(&ictxt, order, nprow, npcol); Cblacs_gridinfo(ictxt, &nprow, &npcol, &myrow, &mycol); if (DEBUG_MODE) { printf("ConTxt = %d\n", ictxt); printf("nprocs=%d, nprow=%d, npcol=%d\n", nprocs, nprow, npcol); printf("nprocs=%d, myrow=%d, mycol=%d\n", nprocs, myrow, mycol); } get_option(argc, argv, ifilename, ofilename, &num, &dim); // 0. cosinedist(ij) = 1 - V(i)V(j)/(Length(V(i))*Length(V(j))) // 1. calculate submatrix size int bsize = num / nprow; // blocking factor pnum = num / nprow; pdim = dim; if ( myrow < (num/bsize)%nprow) { pnum += bsize; } else if ( myrow == (num/bsize)%nprow) { pnum += (num % bsize); } else { } if(DEBUG_MODE) printf("myproc=%d: pnum=%d, pdim=%d, bsize=%d\n", myproc, pnum, pdim, bsize); int desc_input[9], desc_v[9], desc_ip[9], desc_n[9], desc_result[9]; descinit_(desc_input, &num, &dim, &num, &dim, &zero, &zero, &ictxt, &num, &info); descinit_(desc_v, &num, &dim, &bsize, &pdim, &zero, &zero, &ictxt, &pnum, &info); descinit_(desc_ip, &num, &num, &bsize, &num, &zero, &zero, &ictxt, &pnum, &info); descinit_(desc_n, &num, &one, &bsize, &one, &zero, &zero, &ictxt, &pnum, &info); descinit_(desc_result, &num, &num, &num, &num, &zero, &zero, &ictxt, &num, &info); // 2. read input data double* input; if (myproc == 0) { input = (double*)malloc(sizeof(double)*num*dim); memset(input, 0, sizeof(double)*num*dim); read_data(ifilename, num, dim, input); printArray("input", myproc, input, num, dim); } // 3. distribute input data array double* V = (double*)malloc(sizeof(double)*pnum*pdim); memset(V, 0, sizeof(double)*pnum*pdim); Cpdgemr2d(num, dim, input, 1, 1, desc_input, V, 1, 1, desc_v, ictxt); printArray("V", myproc, V, pnum, pdim); // 4. InnerProduct = VV' double* InnerProduct = (double*)malloc(sizeof(double)*pnum*num); memset(InnerProduct, 0, sizeof(double)*pnum*num); char transa = 'N', transb = 'T'; int m = num, n = num, k = dim; int lda = num, ldb = num, ldc = num; double alpha = 1.0f, beta = 0.0f; pdgemm_(&transa, &transb, &m, &n, &k, &alpha, V, &one, &one, desc_v, V, &one, &one, desc_v, &beta, InnerProduct, &one, &one, desc_ip); printArray("InnerProduct", myproc, InnerProduct, pnum, num); // 5. Norm of each vector double* Norm = (double*)malloc(sizeof(double)*pnum); for (int i = 0; i < pnum; i++) { int n = ((myproc*bsize)+(i/bsize)*(nprocs-1)*bsize+i)*pnum + i; Norm[i] = sqrt(InnerProduct[n]); } printArray("Norm", myproc, Norm, 1, pnum); // 6. Norm product matrix double* NormProduct = (double*)malloc(sizeof(double)*pnum*num); memset(NormProduct, 0, sizeof(double)*pnum*num); char uplo = 'U'; n = num; alpha = 1.0f; int incx = 1; lda = num; pdsyr_(&uplo, &n, &alpha, Norm, &one, &one, desc_n, &incx, NormProduct, &one, &one, desc_ip); printArray("NormProduct", myproc, NormProduct, pnum, num); // 7. CosineDistance(ij) = 1-InnerProduct(ij)/NormProduct(ij) double* CosineDistance = (double*)malloc(sizeof(double)*pnum*num); memset(CosineDistance, 0, sizeof(double)*pnum*num); for (int j = 0; j < num; j++) { for (int i = 0; i < pnum; i++) { int n = ((myproc*bsize)+i+(i/bsize)*(nprocs-1)*bsize)*pnum+i; int p = i+j*pnum; if (p<=n) { CosineDistance[p] = 0.0; } else { CosineDistance[p] = 1 - InnerProduct[p]/NormProduct[p]; } } } printArray("CosineDistance", myproc, CosineDistance, pnum, num); // 8. gather result double* result; if ( myproc == 0 ) { result = (double*)malloc(sizeof(double)*num*num); memset(result, 0, sizeof(double)*num*num); } Cpdgemr2d(num, num, CosineDistance, 1, 1, desc_ip, result, 1, 1, desc_result, ictxt); // 9. output to file if ( myproc == 0 ) { output_results(ofilename, result, num, num); } // a. cleanup memory free(V); free(InnerProduct); free(Norm); free(NormProduct); free(CosineDistance); if ( myproc == 0 ) { free(input); free(result); } blacs_exit_(&zero); return 0; }
/* Test program * created 23/09/2014 * author Alex Bombrun * * icc -O1 -o eigen.exe lapackReadStore.c mpiutil.c normals.c matrixBlockStore.c -mkl * ./eigen.exe 4 4 * */ int main(int argc, char **argv) { FILE* store; FILE* scaStore; int N , M; int i, j; int n_blocks; int scalapack_size; int NB, MB; int i_block, j_block; int dim[4]; double * mat; // local matrix block use for reading int t, t_block; const char* profileG_file_name= "./data/NormalsG/profile.txt"; const char* store_location = "./data/ReducedNormals"; const char* scaStore_location ="./data/DiagCholeskyReducedNormals"; int mp; // number of rows in the processor grid int mla; // number of rows in the local array int mb; // number of rows in a block int np; // number of columns in the processor grid int nla; // number of columns in the local array int nb; // number of columns in a block int mype,npe; // rank and total number of process int idescal[9]; // matrix descriptors double *la; // matrix values: al is the local array int idescbl[9]; double *lb; double normb; int idescxl[9]; double *lx; double normx; int idesczl[9]; // matrix descriptors double *lz; // matrix values: al is the local array double *w; int ierr; // error output int mp_ret, np_ret, myrow, mycol; // to store grid info int zero=0; // value used for the descriptor initialization int one=1; // value used for the descriptor initialization int m,n; // matrix A dimensions double norm, cond; double *work = NULL; double * work2 = NULL; int *iwork = NULL; int lwork, liwork; float ll,mm,cr,cc; int ii,jj,pr,pc,h,g; // ii,jj coordinates of local array element int rsrc=0,csrc=0; // assume that 0,0 element should be stored in the 0,0 process int n_b = 1; int index; int icon; // scalapack cblacs context char normJob, jobz, uplo, trans, diag; double MPIt1, MPIt2, MPIelapsed; jobz= 'N'; uplo='U'; Cblacs_pinfo( &mype, &npe ); if (argc == 3) { //printf("%s %s %s\n", argv[0], argv[1], argv[2]); n_blocks= (int) strtol(argv[1], NULL, 10); scalapack_size= (int) strtol(argv[2], NULL, 10); } else { printf("Usage: expect 2 integers \n"); printf(" 1 : the number of diagonal blocks \n"); printf(" 2 : scalapack number to define block size (assume n is divisible by sqrt(p) and that n/sqrt(p) is divisible by this number)\n"); exit( -1); } printf("%d/%d: read store\n",mype,npe); N = getNumberOfLine(profileG_file_name); // the dimension of the matrix; M = N; // square matrix m=M; //mla*mp; n=N; //nla*np; np = isqrt(npe); // assume that the number of process is a square mp = np; // square grid mla = m/mp; // assume that the matrix dimension if a multiple of the process grid dimension nla = n/np; mb = mla/scalapack_size; // assume that the dimension of the matrix is a multiple of the number of the number of diagonal blocks nb = nla/scalapack_size; // init CBLACS Cblacs_get( -1, 0, &icon ); Cblacs_gridinit( &icon,"c", mp, np ); Cblacs_gridinfo( icon, &mp_ret, &np_ret, &myrow, &mycol); // allocate local matrix la=malloc(sizeof(double)*mla*nla); printf("%d/%d: full matrix (%d,%d), local matrix (%d,%d), processor grid (%d,%d), block (%d,%d) \n", mype, npe, m, n, mla, nla, np, mp, mb, nb); // set identity matrix for(i = 0;i<M;i++){ for(j = i;j<i+1;j++){ cr = (float)( i/mb ); h = rsrc+(int)(cr); pr = h%np; cc = (float)( j/mb ); g = csrc+(int)(cc); pc = g%mp; // check if process should get this element if (myrow == pr && mycol==pc){ // ii = x + l*mb // jj = y + m*nb ll = (float)( ( i/(np*mb) ) ); // thinks seems to be mixed up does not matter as long as the matrix, the block and the grid is symmetric mm = (float)( ( j/(mp*nb) ) ); ii = i%mb + (int)(ll)*mb; jj = j%nb + (int)(mm)*nb; index=jj*mla+ii; // seems to be the transpose !? //if(index<0) printf("%d/%d: negative index (%d,%d) \n",mype,npe,i,j); //if(index>=mla*nla) printf("%d/%d: too large index (%d,%d) \n",mype,npe,i,j); la[index] = 1; } } } /* for(i_block=0;i_block<n_blocks;i_block++){ printf("%d/%d: process store block %d \n", mype, npe, i_block); readStore(&store,i_block,store_location); t_block = 0; while(readNextBlockDimension(dim,store)!=-1) { // loop B over all block tasks j_block = mpi_get_diag_block_id(i_block, t_block, n_blocks); mat = malloc((dim[1]-dim[0])*(dim[3]-dim[2]) * sizeof(double)); readNextBlock(dim[0],dim[1],dim[2],dim[3],mat,store); if (dim[0]==dim[2]){ // process only the diagonal blocks // printf("%d/%d: read block (%d,%d) with global indices (%d,%d,%d,%d) \n",mype, npe, i_block,j_block,dim[0],dim[1],dim[2],dim[3]); NB = dim[1]-dim[0]; MB = dim[3]-dim[2]; for(i = dim[0];i<dim[1];i++){ for(j = dim[2];j<dim[3];j++){ //matA[i*M+j] = mat[(i-dim[0])*MB+(j-dim[2])]; // finding out which pe gets this i,j element cr = (float)( i/mb ); h = rsrc+(int)(cr); pr = h%np; cc = (float)( j/mb ); g = csrc+(int)(cc); pc = g%mp; // check if process should get this element if (myrow == pr && mycol==pc){ // ii = x + l*mb // jj = y + m*nb ll = (float)( ( i/(np*mb) ) ); // thinks seems to be mixed up does not matter as long as the matrix, the block and the grid is symmetric mm = (float)( ( j/(mp*nb) ) ); ii = i%mb + (int)(ll)*mb; jj = j%nb + (int)(mm)*nb; index=jj*mla+ii; // seems to be the transpose !? //if(index<0) printf("%d/%d: negative index (%d,%d) \n",mype,npe,i,j); //if(index>=mla*nla) printf("%d/%d: too large index (%d,%d) \n",mype,npe,i,j); la[index] = mat[(i-dim[0])*MB+(j-dim[2])]; } } } // transpose if(j_block != i_block){ for(i = dim[0];i<dim[1];i++){ for(j = dim[2];j<dim[3];j++){ //matA[j*M+i] = mat[(i-dim[0])*MB+(j-dim[2])]; // finding out which pe gets this j,i element cr = (float)( j/mb ); h = rsrc+(int)(cr); pr = h%np; cc = (float)( i/mb ); g = csrc+(int)(cc); pc = g%mp; // check if process should get this element if (myrow == pr && mycol==pc){ // ii = x + l*mb // jj = y + m*nb ll = (float)( ( j/(np*mb) ) ); // thinks seems to be mixed up does not matter as long as the matrix, the block and the grid is symmetric mm = (float)( ( i/(mp*nb) ) ); ii = j%mb + (int)(ll)*mb; jj = i%nb + (int)(mm)*nb; index=jj*mla+ii; // seems to be the transpose !? //if(index<0) printf("%d/%d: negative index (%d,%d) \n",mype,npe,i,j); //if(index>=mla*nla) printf("%d/%d: too large index (%d,%d) \n",mype,npe,i,j); la[index] = mat[(i-dim[0])*MB+(j-dim[2])]; } } } } } free(mat); t_block++; } closeStore(store); } */ printf("%d/%d: finished scaterring the matrix \n",mype,npe); printf("%d/%d: start computing \n",mype,npe); // set the matrix descriptor ierr=0; descinit_(idescal, &m, &n , &mb, &nb , &zero, &zero, &icon, &mla, &ierr); // processor grip id start at 0 if (mype==0) saveMatrixDescriptor(idescal, scaStore_location); ierr=0; descinit_(idescbl, &m, &one , &mb, &nb , &zero, &zero, &icon, &nla, &ierr); // processor grip id start at 0 lb = calloc(sizeof(double),mla); ierr=0; // set x descinit_(idescxl, &n, &one , &mb, &nb , &zero, &zero, &icon, &nla, &ierr); // processor grip id start at 0 lx = calloc(sizeof(double),mla); for(i=0;i<mla;i++){ lx[i] = 1.0/m; } pddot_(&n,&normx,lx,&one,&one,idescxl,&one,lx,&one,&one,idescxl,&one); // normx <- x'x if (mype==0) printf("%d/%d: normx2 %E \n",mype,npe,normx); ierr=0; // set b double alpha =1.0; double beta =0.0; trans = 'N'; pdgemv_(&trans,&m,&n,&alpha,la,&one,&one,idescal,lx,&one,&one,idescxl,&one,&beta,lb,&one,&one,idescbl,&one); // b <- A x pddot_(&n,&normb,lb,&one,&one,idescbl,&one,lb,&one,&one,idescbl,&one); // norm <- b'b if (mype==0) printf("%d/%d: normb2 %E \n",mype,npe,normb); ierr = 0; // compute norm 1 of the reduced normal matrix /* DO NOT WORK lwork = 2*mla+2*nla; work = malloc(sizeof(double)*lwork); normJob = '1'; norm = pdlansy_(&normJob, &uplo, &n, la, &one, &one, idescal, work); // matrix index start at one printf("%d/%d: norm %f \n",mype,npe,norm); free(work); */ ierr = 0; // compute the cholesky decomposition printf("%d/%d: start computing cholesky factor\n",mype,npe); pdpotrf_(&uplo,&n,la,&one,&one,idescal,&ierr); printf("%d/%d: finish computing cholesky factor\n",mype,npe); openScalapackStore(&scaStore,myrow,mycol,scaStore_location); saveLocalMatrix(la,nla,mla,scaStore); double test=0.0; for(i=0;i<nla*mla;i++){ test += la[i]*la[i]; } printf("%d/%d: finished computing cholesky, test=%f \n",mype,npe,test); ierr =0; // assume x and b set // assume cholesky decomposition // compute the soluation A x = b diag = 'N'; printf("%d/%d: start solving\n",mype,npe); //pdpptrs_(&uplo, &trans , &diag , &n , &one , la , &one , &one , idescal , lb , &one , &one , idescbl , &ierr); // solve triangular system //pdtrtrs (&uplo, &trans , &diag , &n , &n , la , &one , &one , idescal , lb , &one , &one , idescbl , &ierr); pdpotrs_(&uplo, &n , &one , la , &one , &one , idescal , lb , &one , &one , idescbl , &ierr); // b<- A-1 b alpha = -1.0; normb=0; pdaxpy_(&n,&alpha,lx,&one,&one,idescxl,&one,lb,&one,&one,idescbl,&one); // b<-b-x pddot_(&n,&normb,lb,&one,&one,idescbl,&one,lb,&one,&one,idescbl,&one); // norm <- b'b if (mype==0) printf("%d/%d: finish solving, norm2(sol-true) %E \n",mype,npe,normb); ierr = 0; /* // compute the eigen values jobz= 'N'; uplo='U'; // with N z is ignored descinit_(idesczl, &m, &n , &mb, &nb , &zero, &zero, &icon, &mla, &ierr); lz = malloc(sizeof(double)*mla*nla); w = malloc(sizeof(double)*m); lwork = -1; work = malloc(sizeof(double)*2); pdsyev_( &jobz, &uplo, &n, la, &one, &one, idescal, w, lz, &one, &one, idesczl, work, &lwork, &ierr); // only compute lwork //pdsyev_( &jobz, &uplo, &n, A, &ione, &ione, descA, W, Z, &ione, &ione, descZ, work, &lwork, &info ); lwork= (int) work[0]; free(work); work = (double *)calloc(lwork,sizeof(double)) ; //MPIt1 = MPI_Wtime(); pdsyev_( &jobz, &uplo, &n, la, &one, &one, idescal, w, lz, &one, &one, idesczl, work, &lwork, &ierr); // compute the eigen values //MPIt2 = MPI_Wtime(); //MPIelapsed=MPIt2-MPIt1; if (mype == 0) { saveMatrix(n,w,"eigenvalues.txt"); //printf("%d/%d: finished job in %8.2fs\n",mype,npe,MPIelapsed); // not working } */ ierr = 0; // compute the conditioner number assume that the norm and the cholesky decomposition have been computed /* DO NOT WORK lwork = 2*mla+3*nla; printf("%d/%d: lwork=%d @%p\n",mype,npe,lwork,&lwork); work2 = malloc(sizeof(double)*lwork); liwork = 2*mla+3*nla; iwork = malloc(sizeof(int)*liwork); pdpocon_(&uplo,&n,la,&one,&one,idescal,&norm,&cond,work2,&lwork,iwork,&liwork,&ierr); printf("%d/%d: condition number %f \n",mype,npe,cond); */ free(la); Cblacs_gridexit(icon); Cblacs_exit( 0 ); return 0; }
void CG(StrumpackDensePackage<myscalar,myreal> *sdp, myscalar *X, myscalar *B, int *descVec, int n, int nrhs, int niter, myreal threshold) { /* Conjugate Gradients. Calls sdp.schur_product for matvecs. Fills only the bottom part of X. */ int printit=250; int IA; int ctxt; int nprow, npcol; int myrow, mycol; int rsrc, csrc; int nb; int locr, locc; int i, j; int idummy, ierr; int neff; int it; int desc[BLACSCTXTSIZE], descScal[BLACSCTXTSIZE]; bool ingrid; myreal res; myscalar *x=NULL, *b=NULL; myscalar *r=NULL, *p=NULL, *Ap=NULL; myscalar alpha, rrprev, rrnext; IA=sdp->split_HSS+1; neff=n-IA+1; ctxt=descVec[BLACSctxt]; rsrc=descVec[BLACSrsrc]; csrc=descVec[BLACScsrc]; nb=descVec[BLACSmb]; blacs_gridinfo_(&ctxt,&nprow,&npcol,&myrow,&mycol); ingrid=myrow>=0 && mycol>=0; /* X and B have n rows. We create versions with * the last neff rows only that we pass to SDP. */ if(ingrid) { locr=numroc_(&neff,&nb,&myrow,&rsrc,&nprow); locc=numroc_(&IONE,&nb,&mycol,&csrc,&npcol); idummy=locr>1?locr:1; descinit_(desc,&neff,&IONE,&nb,&nb,&rsrc,&csrc,&ctxt,&idummy,&ierr); x=new myscalar[locr*locc](); b=new myscalar[locr*locc](); r=new myscalar[locr*locc](); p=new myscalar[locr*locc](); Ap=new myscalar[locr*locc](); } else { locr=0; locc=0; descset_(desc,&neff,&IONE,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE); } /* Descriptor for the scalar containing the result of the dot product */ if(ingrid) descinit_(descScal,&IONE,&IONE,&nb,&nb,&IZERO,&IZERO,&ctxt,&IONE,&ierr); else descset_(descScal,&IONE,&IONE,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE); for(j=1;j<=nrhs;j++) { /* One RHS at a time */ if(ingrid) { pgeadd('N',neff,IONE,ONE,B,IA,j,descVec,ZERO,b,IONE,IONE,desc); pgeadd('N',neff,IONE,ONE,X,IA,j,descVec,ZERO,x,IONE,IONE,desc); } /* r = b - A x * If the Schur was explitly in matrix A, the BLAS call would be * pgemm('N','N',neff,IONE,neff,NONE,A,IA,IA,descA,x,IONE,IONE,desc,ONE,r,IONE,IONE,desc); * */ if(ingrid) placpy('N',neff,IONE,b,IONE,IONE,desc,r,IONE,IONE,desc); sdp->schur_product('N',NONE,x,desc,ONE,r,desc); /* p = r */ if(ingrid) placpy('N',neff,IONE,r,IONE,IONE,desc,p,IONE,IONE,desc); /* "Previous" r' * r */ rrprev=ZERO; if(ingrid) pgemm('C','N',IONE,IONE,neff,ONE,r,IONE,IONE,desc,r,IONE,IONE,desc,ZERO,&rrprev,IONE,IONE,descScal); MPI_Bcast((void *)&rrprev,IONE,MY_MPI_REAL,IZERO,MPI_COMM_WORLD); it=1; while(it<=niter) { /* Ap = A * p * If the Schur was explitly in matrix A, the BLAS call would be * pgemm('N','N',neff,IONE,neff,ONE,A,IA,IA,descA,p,IONE,IONE,desc,ZERO,Ap,IONE,IONE,desc); * */ sdp->schur_product('N',ONE,p,desc,ZERO,Ap,desc); /* alpha = r'*r / (p' * A * p) = rrprev/(p' * Ap) */ alpha=ZERO; if(ingrid) pgemm('C','N',IONE,IONE,neff,ONE,p,IONE,IONE,desc,Ap,IONE,IONE,desc,ZERO,&alpha,IONE,IONE,descScal); MPI_Bcast((void *)&alpha,IONE,MY_MPI_REAL,IZERO,MPI_COMM_WORLD); alpha=rrprev/alpha; /* x = x + alpha * p */ for(i=0;i<locr*locc;i++) x[i]+=alpha*p[i]; /* r = r - alpha * Ap */ for(i=0;i<locr*locc;i++) r[i]-=alpha*Ap[i]; /* "Next" r' * r */ rrnext=ZERO; if(ingrid) pgemm('C','N',IONE,IONE,neff,ONE,r,IONE,IONE,desc,r,IONE,IONE,desc,ZERO,&rrnext,IONE,IONE,descScal); MPI_Bcast((void *)&rrnext,IONE,MY_MPI_REAL,IZERO,MPI_COMM_WORLD); /* Residual */ res=sqrt(rrnext.real()); if(it%printit==0) if(!myrow && !mycol) std::cout << "RHS " << j << ": iteration " << it << ", ||Ax-b||/||b||=" << res << std::endl; if(res<threshold) break; /* p = r + rrnext/rrprev * p */ for(i=0;i<locr*locc;i++) p[i]=r[i]+rrnext/rrprev*p[i]; /* "Previous" r' * r */ rrprev=rrnext; it++; } if(it>niter) it=niter; if(it%printit) if(!myrow && !mycol) std::cout << "RHS " << j << ": iteration " << it << ", ||Ax-b||/||b||=" << res << std::endl << std::endl; /* Back to n-sized vector */ if(ingrid) pgeadd('N',neff,IONE,ONE,x,IONE,IONE,desc,ZERO,X,IA,j,descVec); } delete[] b; delete[] x; delete[] r; delete[] p; delete[] Ap; }
void pzgecopy_hd( F_CHAR_T TRANS, int *m_in, int *n_in, double *A, int *ia_in, int *ja_in, int *descA, double *dB, int *ib_in, int *jb_in, int *descB ) { /* Copy m by n distributed submatrix from host to GPU */ int m = *m_in; int n = *n_in; int ia = *ia_in; int ja = *ja_in; int ib = *ib_in; int jb = *jb_in; int nprow = 0; int npcol = 0; int myprow = 0; int mypcol = 0; int mmb = 0; int nnb = 0; int istart = 0; int iend = 0; int isize = 0; int Locp = 0; int Locq = 0; int lld = 0; int jstart = 0; int jend = 0; int jsize = 0; int iib = 0; int jjb = 0; cuDoubleComplex *dBptr = 0; double *Btmp = 0; F_CHAR_T NoTrans = "N"; int descBtmp[DLEN_]; int elmSize = sizeof(cuDoubleComplex); double z_one[2]; double z_zero[2]; /* Tuneable parameters */ int mfactor = 4; int nfactor = 4; int rsrc = 0; int csrc = 0; int irsrc = 0; int jcsrc = 0; int info = 0; int notran = 0; int TrA = 0; int lrindx = 0; int lcindx = 0; int nrow = 0; int ncol = 0; int mm = 0; int nn = 0; int iia = 0; int jja = 0; cublasStatus cu_status; z_one[REAL_PART] = 1; z_one[IMAG_PART] = 0; z_zero[REAL_PART] = 0; z_zero[IMAG_PART] = 0; Cblacs_gridinfo( descA[CTXT_], &nprow, &npcol, &myprow, &mypcol ); notran = ( ( TrA = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); /* * check arguments */ if ((m <= 0) || (n <= 0)) { return; }; assert( (1 <= ia) && ((ia + m-1) <= descA[M_] ) ); assert( (1 <= ja) && ((ja + n-1) <= descA[N_] ) ); assert( (1 <= ib) && ((ib + m-1) <= descB[M_] ) ); assert( (1 <= jb) && ((jb + n-1) <= descB[N_] ) ); /* * Create a temp matrix that is aligned to descB. * Assume size is mmb by nnb */ if (notran) { mmb = MIN( m, descB[MB_] * nprow * mfactor); nnb = MIN( n, descB[NB_] * npcol * nfactor); } else { nnb = MIN( m, descB[MB_] * nprow * mfactor); mmb = MIN( n, descB[NB_] * npcol * nfactor); }; mmb = MAX( mmb,1); nnb = MAX( nnb,1); rsrc = indxg2p_( &ib, &descB[MB_], &myprow, &descB[RSRC_], &nprow); csrc = indxg2p_( &jb, &descB[NB_], &mypcol, &descB[CSRC_], &npcol); Locp = numroc_( &mmb, &descB[MB_], &myprow, &rsrc, &nprow ); Locq = numroc_( &nnb, &descB[NB_], &mypcol, &csrc, &npcol ); Btmp = (double*) malloc( MAX(1,(Locp * Locq))*elmSize ); assert( Btmp != 0 ); lld = MAX(1, Locp); descinit_( descBtmp, &mmb, &nnb, &descB[MB_], &descB[NB_], &rsrc, &csrc, &descB[CTXT_], &lld, &info ); assert( info == 0); for( jstart=ja; jstart <= ja + n-1; jstart = jend + 1) { jend = MIN( ja + n -1, jstart + nnb - 1); jsize = jend - jstart + 1; for( istart=ia; istart <= ia + m-1; istart = iend + 1) { iend = MIN( ia + m-1, istart + mmb -1); isize = iend - istart + 1; iia = ia + (istart-1); jja = ja + (jstart-1); iib = 1; jjb = 1; if (notran) { mm = isize; nn = jsize; } else { mm = jsize; nn = isize; }; pzgeadd_( TRAN, &mm, &nn, z_one, A, &iia, &jja, descA, z_zero, Btmp, &iib, &jjb, descBtmp ); /* * find local extent */ if (notran) { iib = ib + (istart-1); jjb = jb + (jstart-1); } else { iib = ib + (jstart-1); jjb = jb + (istart-1); }; if (notran) { nrow = numroc_( &isize, &descB[MB_], &myprow, &rsrc, &nprow); ncol = numroc_( &jsize, &descB[NB_], &mypcol, &csrc, &npcol); } else { nrow = numroc_( &jsize, &descB[MB_], &myprow, &rsrc, &nprow); ncol = numroc_( &isize, &descB[NB_], &mypcol, &csrc, &npcol); }; /* Perform global dB( iib:(iib+isize-1), jjb:(jjb+jsize-1)) <- B(1:isize,1:jsize) Perform local dB( lrindx:(lrindx+nrow-1), lcindx:(lcindx+ncol-1)) <- B(1:nrow, 1:ncol) */ infog2l_( &iib, &jjb, descB, &nprow, &npcol, &myprow, &mypcol, &lrindx, &lcindx, &irsrc, &jcsrc ); dBptr = (cuDoubleComplex *) dB; dBptr = dBptr + INDX2F( lrindx,lcindx, descB[LLD_]); cu_status = cublasSetMatrix( nrow,ncol,elmSize, (cuDoubleComplex *) Btmp, descBtmp[LLD_], dBptr, descB[LLD_] ); assert( cu_status == CUBLAS_STATUS_SUCCESS ); }; }; free( Btmp ); return; }
int main (int argc, char *argv[]) { myscalar *A=NULL, *B=NULL, *Btrue=NULL; int descA[BLACSCTXTSIZE], descB[BLACSCTXTSIZE]; int n; int nb; int locr, locc; int i, j, ii, jj; int *I, *J; int nI, nJ; int ierr; int dummy; int myid, np; int myrow, mycol, nprow, npcol; int ctxt; myreal err; n=1024; /* Size of the problem */ nb=16; /* Blocksize for the 2D block-cyclic distribution */ /* Initialize MPI */ if((ierr=MPI_Init(&argc,&argv))) return 1; myid=-1; if((ierr=MPI_Comm_rank(MPI_COMM_WORLD,&myid))) return 1; np=-1; if((ierr=MPI_Comm_size(MPI_COMM_WORLD,&np))) return 1; /* Initialize the BLACS grid */ nprow=floor(sqrt((float)np)); npcol=np/nprow; blacs_get_(&IZERO,&IZERO,&ctxt); blacs_gridinit_(&ctxt,"R",&nprow,&npcol); blacs_gridinfo_(&ctxt,&nprow,&npcol,&myrow,&mycol); /* A is a dense n x n distributed Toeplitz matrix */ if(myid<nprow*npcol) { locr=numroc_(&n,&nb,&myrow,&IZERO,&nprow); locc=numroc_(&n,&nb,&mycol,&IZERO,&npcol); A=new myscalar[locr*locc]; dummy=std::max(1,locr); descinit_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr); for(i=1;i<=locr;i++) for(j=1;j<=locc;j++) { ii=indxl2g_(&i,&nb,&myrow,&IZERO,&nprow); jj=indxl2g_(&j,&nb,&mycol,&IZERO,&npcol); // Toeplitz matrix from Quantum Chemistry. myreal pi=3.1416, d=0.1; A[locr*(j-1)+(i-1)]=ii==jj?std::pow(pi,2)/6.0/std::pow(d,2):std::pow(-1.0,ii-jj)/std::pow((myreal)ii-jj,2)/std::pow(d,2); } } else { descset_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE); } /* Initialize the solver and set parameters */ StrumpackDensePackage<myscalar,myreal> sdp(MPI_COMM_WORLD); sdp.use_HSS=true; sdp.levels_HSS=4; sdp.min_rand_HSS=64; sdp.lim_rand_HSS=0; sdp.tol_HSS=1e-6; /* Compression */ sdp.compress(A,descA); /* Accuracy checking */ sdp.check_compression(A,descA); /* Element extraction: a bunch of random indices. * Not that duplicates do not matter (the code works). */ nI=1+rand()%n; MPI_Bcast((void*)&nI,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD); I=new int[nI]; if(!myid) for(i=0;i<nI;i++) I[i]=1+rand()%n; MPI_Bcast((void*)I,nI,MPI_INTEGER,IZERO,MPI_COMM_WORLD); nJ=1+rand()%n; MPI_Bcast((void*)&nJ,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD); J=new int[nJ]; if(!myid) for(j=0;j<nJ;j++) J[j]=1+rand()%n; MPI_Bcast((void*)J,nJ,MPI_INTEGER,IZERO,MPI_COMM_WORLD); /* Extraction for the HSS form */ if(myid<nprow*npcol) { locr=numroc_(&nI,&nb,&myrow,&IZERO,&nprow); locc=numroc_(&nJ,&nb,&mycol,&IZERO,&npcol); B=new myscalar[locr*locc](); dummy=std::max(1,locr); descinit_(descB,&nI,&nJ,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr); } else descset_(descB,&nI,&nJ,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE); sdp.extract(A,descA,B,descB,I,nI,J,nJ); /* Extraction from the original matrix just to compare */ sdp.use_HSS=false; if(myid<nprow*npcol) Btrue=new myscalar[locr*locc]; sdp.extract(A,descA,Btrue,descB,I,nI,J,nJ); /* Comparison with elements of input matrix */ if(myid<nprow*npcol){ err=plange('M',nI,nJ,Btrue,IONE,IONE,descB,(myreal*)NULL); for(i=0;i<locr*locc;i++) Btrue[i]-=B[i]; err=plange('M',nI,nJ,Btrue,IONE,IONE,descB,(myreal*)NULL)/err; } if(!myid) std::cout << "Element extraction (" << nI << "x" << nJ << " submatrix): maximum relative error max ||A(I,J)-HSS(I,J)||//||A(I,J)|| = " << err << std::endl << std::endl; /* Statistics */ sdp.print_statistics(); /* Clean-up */ delete[] A; delete[] B; delete[] Btrue; delete[] I; delete[] J; /* The end */ MPI_Finalize(); return 0; }
int main(int argc, char **argv) { int iam, nprocs; int myrank_mpi, nprocs_mpi; int ictxt, nprow, npcol, myrow, mycol; int nb, m, n; int mpA, nqA, mpU, nqU, mpVT, nqVT; int i, j, k, itemp, min_mn; int descA[9], descU[9], descVT[9]; float *A=NULL; int info, infoNN, infoVV, infoNV, infoVN; float *U_NN=NULL, *U_VV=NULL, *U_NV=NULL, *U_VN=NULL; float *VT_NN=NULL, *VT_VV=NULL, *VT_NV=NULL, *VT_VN=NULL; float *S_NN=NULL, *S_VV=NULL, *S_NV=NULL, *S_VN=NULL; float *S_res_NN=NULL; float orthU_VV, residF, orthVT_VV; float orthU_VN, orthVT_NV; float residS_NN, eps; float res_repres_NV, res_repres_VN; /**/ int izero=0,ione=1; float rtmone=-1.0e+00; /**/ double MPIelapsedVV, MPIelapsedNN, MPIelapsedVN, MPIelapsedNV; char jobU, jobVT; int nbfailure=0, nbtestcase=0,inputfromfile, nbhetereogeneity=0; float threshold=100e+00; char buf[1024]; FILE *fd; char *c; char *t_jobU, *t_jobVT; int *t_m, *t_n, *t_nb, *t_nprow, *t_npcol; int nb_expe, expe; char hetereogeneityVV, hetereogeneityNN, hetereogeneityVN, hetereogeneityNV; int iseed[4], idist; /**/ MPI_Init( &argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi); MPI_Comm_size(MPI_COMM_WORLD, &nprocs_mpi); /**/ m = 100; n = 100; nprow = 1; npcol = 1; nb = 64; jobU='A'; jobVT='A'; inputfromfile = 0; for( i = 1; i < argc; i++ ) { if( strcmp( argv[i], "-f" ) == 0 ) { inputfromfile = 1; } if( strcmp( argv[i], "-jobvt" ) == 0 ) { if (i+1<argc) { if( strcmp( argv[i+1], "V" ) == 0 ){ jobVT = 'V'; i++; } else if( strcmp( argv[i+1], "N" ) == 0 ){ jobVT = 'N'; i++; } else if( strcmp( argv[i+1], "A" ) == 0 ){ jobVT = 'A'; i++; } else printf(" ** warning: jobvt should be set to V, N or A in the command line ** \n"); } else printf(" ** warning: jobvt should be set to V, N or A in the command line ** \n"); } if( strcmp( argv[i], "-jobu" ) == 0 ) { if (i+1<argc) { if( strcmp( argv[i+1], "V" ) == 0 ){ jobU = 'V'; i++; } else if( strcmp( argv[i+1], "N" ) == 0 ){ jobU = 'N'; i++; } else if( strcmp( argv[i+1], "A" ) == 0 ){ jobU = 'A'; i++; } else printf(" ** warning: jobu should be set to V, N or A in the command line ** \n"); } else printf(" ** warning: jobu should be set to V, N or A in the command line ** \n"); } if( strcmp( argv[i], "-m" ) == 0 ) { m = atoi(argv[i+1]); i++; } if( strcmp( argv[i], "-n" ) == 0 ) { n = atoi(argv[i+1]); i++; } if( strcmp( argv[i], "-p" ) == 0 ) { nprow = atoi(argv[i+1]); i++; } if( strcmp( argv[i], "-q" ) == 0 ) { npcol = atoi(argv[i+1]); i++; } if( strcmp( argv[i], "-nb" ) == 0 ) { nb = atoi(argv[i+1]); i++; } } /**/ if (inputfromfile){ nb_expe = 0; fd = fopen("svd.dat", "r"); if (fd == NULL) { printf("File failed to open svd.dat from processor mpirank(%d/%d): \n",myrank_mpi,nprocs_mpi); exit(-1); } do { c = fgets(buf, 1024, fd); /* get one line from the file */ if (c != NULL) if (c[0] != '#') nb_expe++; } while (c != NULL); /* repeat until NULL */ fclose(fd); t_jobU = (char *)calloc(nb_expe,sizeof(char)) ; t_jobVT = (char *)calloc(nb_expe,sizeof(char)) ; t_m = (int *)calloc(nb_expe,sizeof(int )) ; t_n = (int *)calloc(nb_expe,sizeof(int )) ; t_nb = (int *)calloc(nb_expe,sizeof(int )) ; t_nprow = (int *)calloc(nb_expe,sizeof(int )) ; t_npcol = (int *)calloc(nb_expe,sizeof(int )) ; fd = fopen("svd.dat", "r"); expe=0; do { c = fgets(buf, 1024, fd); /* get one line from the file */ if (c != NULL) if (c[0] != '#'){ //printf("NBEXPE = %d\n",expe); sscanf(c,"%c %c %d %d %d %d %d", &(t_jobU[expe]),&(t_jobVT[expe]),&(t_m[expe]),&(t_n[expe]), &(t_nb[expe]),(&t_nprow[expe]),&(t_npcol[expe])); expe++; } } while (c != NULL); /* repeat until NULL */ fclose(fd); } else { nb_expe = 1; t_jobU = (char *)calloc(nb_expe,sizeof(char)) ; t_jobVT = (char *)calloc(nb_expe,sizeof(char)) ; t_m = (int *)calloc(nb_expe,sizeof(int )) ; t_n = (int *)calloc(nb_expe,sizeof(int )) ; t_nb = (int *)calloc(nb_expe,sizeof(int )) ; t_nprow = (int *)calloc(nb_expe,sizeof(int )) ; t_npcol = (int *)calloc(nb_expe,sizeof(int )) ; t_jobU[0] = jobU; t_jobVT[0] = jobVT; t_m[0] = m; t_n[0] = n; t_nb[0] = nb; t_nprow[0] = nprow; t_npcol[0] = npcol; } if (myrank_mpi==0){ printf("\n"); printf("--------------------------------------------------------------------------------------------------------------------\n"); printf(" Testing psgsevd -- float precision SVD ScaLAPACK routine \n"); printf("jobU jobVT m n nb p q || info heter resid orthU orthVT |SNN-SVV| time(s) cond(A) \n"); printf("--------------------------------------------------------------------------------------------------------------------\n"); } /**/ for (expe = 0; expe<nb_expe; expe++){ jobU = t_jobU[expe] ; jobVT = t_jobVT[expe] ; m = t_m[expe] ; n = t_n[expe] ; nb = t_nb[expe] ; nprow = t_nprow[expe] ; npcol = t_npcol[expe] ; if (nb>n) nb = n; if (nprow*npcol>nprocs_mpi){ if (myrank_mpi==0) printf(" **** ERROR : we do not have enough processes available to make a p-by-q process grid ***\n"); printf(" **** Bye-bye ***\n"); MPI_Finalize(); exit(1); } /**/ Cblacs_pinfo( &iam, &nprocs ) ; Cblacs_get( -1, 0, &ictxt ); Cblacs_gridinit( &ictxt, "Row", nprow, npcol ); Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol ); /**/ min_mn = min(m,n); /**/ //if (iam==0) //printf("\tm=%d\tn = %d\t\t(%d,%d)\t%dx%d\n",m,n,nprow,npcol,nb,nb); //printf("Hello World, I am proc %d over %d for MPI, proc %d over %d for BLACS in position (%d,%d) in the process grid\n", //myrank_mpi,nprocs_mpi,iam,nprocs,myrow,mycol); /* * * Work only the process in the process grid * */ //if ((myrow < nprow)&(mycol < npcol)){ if ((myrow>-1)&(mycol>-1)&(myrow<nprow)&(mycol<npcol)){ /* * * Compute the size of the local matrices (thanks to numroc) * */ mpA = numroc_( &m , &nb, &myrow, &izero, &nprow ); nqA = numroc_( &n , &nb, &mycol, &izero, &npcol ); mpU = numroc_( &m , &nb, &myrow, &izero, &nprow ); nqU = numroc_( &min_mn, &nb, &mycol, &izero, &npcol ); mpVT = numroc_( &min_mn, &nb, &myrow, &izero, &nprow ); nqVT = numroc_( &n , &nb, &mycol, &izero, &npcol ); /* * * Allocate and fill the matrices A and B * */ A = (float *)calloc(mpA*nqA,sizeof(float)) ; if (A==NULL){ printf("error of memory allocation A on proc %dx%d\n",myrow,mycol); exit(0); } /**/ // seed = iam*(mpA*nqA*2); srand(seed); idist = 2; iseed[0] = mpA%4096; iseed[1] = iam%4096; iseed[2] = nqA%4096; iseed[3] = 23; /**/ k = 0; for (i = 0; i < mpA; i++) { for (j = 0; j < nqA; j++) { slarnv_( &idist, iseed, &ione, &(A[k]) ); k++; } } /* * * Initialize the array descriptor for the distributed matrices xA, U and VT * */ itemp = max( 1, mpA ); descinit_( descA, &m, &n, &nb, &nb, &izero, &izero, &ictxt, &itemp, &info ); itemp = max( 1, mpA ); descinit_( descU, &m, &min_mn, &nb, &nb, &izero, &izero, &ictxt, &itemp, &info ); itemp = max( 1, mpVT ); descinit_( descVT, &min_mn, &n, &nb, &nb, &izero, &izero, &ictxt, &itemp, &info ); /**/ eps = pslamch_( &ictxt, "Epsilon" ); /**/ if ( ((jobU=='V')&(jobVT=='N')) ||(jobU == 'A' )||(jobVT=='A')){ nbtestcase++; U_VN = (float *)calloc(mpU*nqU,sizeof(float)) ; if (U_VN==NULL){ printf("error of memory allocation U_VN on proc %dx%d\n",myrow,mycol); exit(0); } S_VN = (float *)calloc(min_mn,sizeof(float)) ; if (S_VN==NULL){ printf("error of memory allocation S_VN on proc %dx%d\n",myrow,mycol); exit(0); } infoVN = driver_psgesvd( 'V', 'N', m, n, A, 1, 1, descA, S_VN, U_VN, 1, 1, descU, VT_VN, 1, 1, descVT, &MPIelapsedVN); orthU_VN = verif_orthogonality(m,min_mn,U_VN , 1, 1, descU); res_repres_VN = verif_repres_VN( m, n, A, 1, 1, descA, U_VN, 1, 1, descU, S_VN); if (infoVN==min_mn+1) hetereogeneityVN = 'H'; else hetereogeneityVN = 'N'; if ( iam==0 ) printf(" V N %6d %6d %3d %3d %3d || %3d %c %7.1e %7.1e %8.2f %7.1e\n", m,n,nb,nprow,npcol,infoVN,hetereogeneityVN,res_repres_VN/(S_VN[0]/S_VN[min_mn-1]), orthU_VN,MPIelapsedVN,S_VN[0]/S_VN[min_mn-1]); if (infoVN==min_mn+1) nbhetereogeneity++ ; else if ((res_repres_VN/eps/(S_VN[0]/S_VN[min_mn-1])>threshold)||(orthU_VN/eps>threshold)||(infoVN!=0)) nbfailure++; } /**/ if (((jobU=='N')&(jobVT=='V'))||(jobU == 'A' )||(jobVT=='A')){ nbtestcase++; VT_NV = (float *)calloc(mpVT*nqVT,sizeof(float)) ; if (VT_NV==NULL){ printf("error of memory allocation VT_NV on proc %dx%d\n",myrow,mycol); exit(0); } S_NV = (float *)calloc(min_mn,sizeof(float)) ; if (S_NV==NULL){ printf("error of memory allocation S_NV on proc %dx%d\n",myrow,mycol); exit(0); } infoNV = driver_psgesvd( 'N', 'V', m, n, A, 1, 1, descA, S_NV, U_NV, 1, 1, descU, VT_NV, 1, 1, descVT, &MPIelapsedNV); orthVT_NV = verif_orthogonality(min_mn,n,VT_NV, 1, 1, descVT); res_repres_NV = verif_repres_NV( m, n, A, 1, 1, descA, VT_NV, 1, 1, descVT, S_NV); if (infoNV==min_mn+1) hetereogeneityNV = 'H'; else hetereogeneityNV = 'N'; if ( iam==0 ) printf(" N V %6d %6d %3d %3d %3d || %3d %c %7.1e %7.1e %8.2f %7.1e\n", m,n,nb,nprow,npcol,infoNV,hetereogeneityNV,res_repres_NV/(S_NV[0]/S_NV[min_mn-1]), orthVT_NV,MPIelapsedNV,S_NV[0]/S_NV[min_mn-1]); if (infoNV==min_mn+1) nbhetereogeneity++ ; else if ((res_repres_NV/eps/(S_NV[0]/S_NV[min_mn-1])>threshold)||(orthVT_NV/eps>threshold)||(infoNV!=0)) nbfailure++; } /**/ if ( ((jobU=='N')&(jobVT=='N')) || ((jobU=='V')&(jobVT=='V')) || (jobU == 'A' ) || (jobVT=='A') ) { nbtestcase++; U_VV = (float *)calloc(mpU*nqU,sizeof(float)) ; if (U_VV==NULL){ printf("error of memory allocation U_VV on proc %dx%d\n",myrow,mycol); exit(0); } VT_VV = (float *)calloc(mpVT*nqVT,sizeof(float)) ; if (VT_VV==NULL){ printf("error of memory allocation VT_VV on proc %dx%d\n",myrow,mycol); exit(0); } S_VV = (float *)calloc(min_mn,sizeof(float)) ; if (S_VV==NULL){ printf("error of memory allocation S_VV on proc %dx%d\n",myrow,mycol); exit(0); } infoVV = driver_psgesvd( 'V', 'V', m, n, A, 1, 1, descA, S_VV, U_VV, 1, 1, descU, VT_VV, 1, 1, descVT, &MPIelapsedVV); orthU_VV = verif_orthogonality(m,min_mn,U_VV , 1, 1, descU); orthVT_VV = verif_orthogonality(min_mn,n,VT_VV, 1, 1, descVT); residF = verif_representativity( m, n, A, 1, 1, descA, U_VV, 1, 1, descU, VT_VV, 1, 1, descVT, S_VV); if (infoVV==min_mn+1) hetereogeneityVV = 'H'; else hetereogeneityVV = 'N'; if ( iam==0 ) printf(" V V %6d %6d %3d %3d %3d || %3d %c %7.1e %7.1e %7.1e %8.2f %7.1e\n", m,n,nb,nprow,npcol,infoVV,hetereogeneityVV,residF,orthU_VV,orthVT_VV,MPIelapsedVV,S_VV[0]/S_VV[min_mn-1]); if (infoVV==min_mn+1) nbhetereogeneity++ ; else if ((residF/eps>threshold)||(orthU_VV/eps>threshold)||(orthVT_VV/eps>threshold)||(infoVV!=0)) nbfailure++; } /**/ if (((jobU=='N')&(jobVT=='N'))||(jobU == 'A' )||(jobVT=='A')){ nbtestcase++; S_NN = (float *)calloc(min_mn,sizeof(float)) ; if (S_NN==NULL){ printf("error of memory allocation S_NN on proc %dx%d\n",myrow,mycol); exit(0); } infoNN = driver_psgesvd( 'N', 'N', m, n, A, 1, 1, descA, S_NN, U_NN, 1, 1, descU, VT_NN, 1, 1, descVT, &MPIelapsedNN); S_res_NN = (float *)calloc(min_mn,sizeof(float)) ; if (S_res_NN==NULL){ printf("error of memory allocation S on proc %dx%d\n",myrow,mycol); exit(0); } scopy_(&min_mn,S_VV,&ione,S_res_NN,&ione); saxpy_ (&min_mn,&rtmone,S_NN,&ione,S_res_NN,&ione); residS_NN = snrm2_(&min_mn,S_res_NN,&ione) / snrm2_(&min_mn,S_VV,&ione); free(S_res_NN); if (infoNN==min_mn+1) hetereogeneityNN = 'H'; else hetereogeneityNN = 'N'; if ( iam==0 ) printf(" N N %6d %6d %3d %3d %3d || %3d %c %7.1e %8.2f %7.1e\n", m,n,nb,nprow,npcol,infoNN,hetereogeneityNN,residS_NN,MPIelapsedNN,S_NN[0]/S_NN[min_mn-1]); if (infoNN==min_mn+1) nbhetereogeneity++ ; else if ((residS_NN/eps>threshold)||(infoNN!=0)) nbfailure++; } /**/ if (((jobU=='V')&(jobVT=='N'))||(jobU == 'A' )||(jobVT=='A')){ free(S_VN); free(U_VN); } if (((jobU=='N')&(jobVT=='V'))||(jobU == 'A' )||(jobVT=='A')){ free(VT_NV); free(S_NV); } if (((jobU=='N')&(jobVT=='N'))||(jobU == 'A' )||(jobVT=='A')){ free(S_NN); } if (((jobU=='N')&(jobVT=='N'))||((jobU=='V')&(jobVT=='V'))||(jobU == 'A' )||(jobVT=='A')){ free(U_VV); free(S_VV); free(VT_VV);} free(A); Cblacs_gridexit( 0 ); } /* * Print ending messages */ } if ( iam==0 ){ printf("--------------------------------------------------------------------------------------------------------------------\n"); printf(" [ nbhetereogeneity = %d / %d ]\n",nbhetereogeneity, nbtestcase); printf(" [ nbfailure = %d / %d ]\n",nbfailure, nbtestcase-nbhetereogeneity); printf("--------------------------------------------------------------------------------------------------------------------\n"); printf("\n"); } /**/ free(t_jobU ); free(t_jobVT ); free(t_m ); free(t_n ); free(t_nb ); free(t_nprow ); free(t_npcol ); MPI_Finalize(); exit(0); }
int main() { const MKL_INT m = 1000; const MKL_INT k = 100000; const MKL_INT n = 1000; const MKL_INT nb = 100; const MKL_INT nprow = 2; const MKL_INT npcol = 2; MKL_INT iam, nprocs, ictxt, myrow, mycol; MDESC descA, descB, descC, descA_local, descB_local, descC_local; MKL_INT info; MKL_INT a_m_local, a_n_local, b_m_local, b_n_local, c_m_local, c_n_local; MKL_INT a_lld, b_lld, c_lld; blacs_pinfo_( &iam, &nprocs ); blacs_get_( &i_negone, &i_zero, &ictxt ); blacs_gridinit_( &ictxt, "R", &nprow, &npcol ); blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol ); double *a = 0; double *b = 0; double *c = 0; if (iam==0) { a = gen_a(m, k); b = gen_b(k, n); c = (double*)calloc(m*n, sizeof(double)); puts("a="); print(a, m, k); puts("b="); print(b, k, n); } a_m_local = numroc_( &m, &nb, &myrow, &i_zero, &nprow ); a_n_local = numroc_( &k, &nb, &mycol, &i_zero, &npcol ); b_m_local = numroc_( &k, &nb, &myrow, &i_zero, &nprow ); b_n_local = numroc_( &n, &nb, &mycol, &i_zero, &npcol ); c_m_local = numroc_( &m, &nb, &myrow, &i_zero, &nprow ); c_n_local = numroc_( &n, &nb, &mycol, &i_zero, &npcol ); double *A = (double*) calloc( a_m_local * a_n_local, sizeof( double ) ); double *B = (double*) calloc( b_m_local * b_n_local, sizeof( double ) ); double *C = (double*) calloc( c_m_local * c_n_local, sizeof( double ) ); a_lld = MAX( a_m_local, 1 ); b_lld = MAX( b_m_local, 1 ); c_lld = MAX( c_m_local, 1 ); if (iam==0) { printf("a_m_local = %d\ta_n_local = %d\tb_m_local = %d\tb_n_local = %d\tc_m_local = %d\tc_n_local = %d\n", a_m_local, a_n_local, b_m_local, b_n_local, c_m_local, c_n_local); printf("a_lld = %d\tb_lld = %d\tc_lld = %d\n", a_lld, b_lld, c_lld); } descinit_( descA_local, &m, &k, &m, &k, &i_zero, &i_zero, &ictxt, &m, &info ); descinit_( descB_local, &k, &n, &k, &n, &i_zero, &i_zero, &ictxt, &k, &info ); descinit_( descC_local, &m, &n, &m, &n, &i_zero, &i_zero, &ictxt, &m, &info ); descinit_( descA, &m, &k, &nb, &nb, &i_zero, &i_zero, &ictxt, &a_lld, &info ); descinit_( descB, &k, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &b_lld, &info ); descinit_( descC, &m, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &c_lld, &info ); printf("Rank %d: start distribute data\n", iam); pdgeadd_( &trans, &m, &k, &one, a, &i_one, &i_one, descA_local, &zero, A, &i_one, &i_one, descA ); pdgeadd_( &trans, &k, &n, &one, b, &i_one, &i_one, descB_local, &zero, B, &i_one, &i_one, descB ); printf("Rank %d: finished distribute data\n", iam); if (iam==0) { puts("a"); print(A, a_m_local, a_n_local); puts("b"); print(B, b_m_local, b_n_local); } pdgemm_( "N", "N", &m, &n, &k, &one, A, &i_one, &i_one, descA, B, &i_one, &i_one, descB, &zero, C, &i_one, &i_one, descC ); printf("Rank %d: finished dgemm\n", iam); if (iam == 0) { puts("c"); print(C, c_m_local, c_n_local); } pdgeadd_( &trans, &m, &n, &one, C, &i_one, &i_one, descC, &zero, c, &i_one, &i_one, descC_local); if (iam==0) { puts("global c"); print(c, m, n); } free(A); free(B); free(C); if (iam==0) { free(a); free(b); free(c); } blacs_gridexit_( &ictxt ); blacs_exit_( &i_zero ); }
int main(int argc, char **argv) { int info, i, j, pcol, Adim; double *D; int *DESCD; CSRdouble BT_i, B_j, Xsparse, Zsparse, Btsparse; /*BT_i.allocate(0,0,0); B_j.allocate(0,0,0); Xsparse.allocate(0,0,0); Zsparse.allocate(0,0,0); Btsparse.allocate(0,0,0);*/ //Initialise MPI and some MPI-variables info = MPI_Init ( &argc, &argv ); if ( info != 0 ) { printf ( "Error in MPI initialisation: %d\n",info ); return info; } position= ( int* ) calloc ( 2,sizeof ( int ) ); if ( position==NULL ) { printf ( "unable to allocate memory for processor position coordinate\n" ); return EXIT_FAILURE; } dims= ( int* ) calloc ( 2,sizeof ( int ) ); if ( dims==NULL ) { printf ( "unable to allocate memory for grid dimensions coordinate\n" ); return EXIT_FAILURE; } //BLACS is the interface used by PBLAS and ScaLAPACK on top of MPI blacs_pinfo_ ( &iam,&size ); //determine the number of processes involved info=MPI_Dims_create ( size, 2, dims ); //determine the best 2D cartesian grid with the number of processes if ( info != 0 ) { printf ( "Error in MPI creation of dimensions: %d\n",info ); return info; } //Until now the code can only work with square process grids //So we try to get the biggest square grid possible with the number of processes involved if (*dims != *(dims+1)) { while (*dims * *dims > size) *dims -=1; *(dims+1)= *dims; if (iam==0) printf("WARNING: %d processor(s) unused due to reformatting to a square process grid\n", size - (*dims * *dims)); size = *dims * *dims; //cout << "New size of process grid: " << size << endl; } blacs_get_ ( &i_negone,&i_zero,&ICTXT2D ); //Initialisation of the BLACS process grid, which is referenced as ICTXT2D blacs_gridinit_ ( &ICTXT2D,"R",dims, dims+1 ); if (iam < size) { //The rank (iam) of the process is mapped to a 2D grid: position= (process row, process column) blacs_pcoord_ ( &ICTXT2D,&iam,position, position+1 ); if ( *position ==-1 ) { printf ( "Error in proces grid\n" ); return -1; } //Filenames, dimensions of all matrices and other important variables are read in as global variables (see src/readinput.cpp) info=read_input ( *++argv ); if ( info!=0 ) { printf ( "Something went wrong when reading input file for processor %d\n",iam ); return -1; } //blacs_barrier is used to stop any process of going beyond this point before all processes have made it up to this point. blacs_barrier_ ( &ICTXT2D,"ALL" ); if ( * ( position+1 ) ==0 && *position==0 ) printf ( "Reading of input-file succesful\n" ); if ( * ( position+1 ) ==0 && *position==0 ) { printf("\nA linear mixed model with %d observations, %d genotypes, %d random effects and %d fixed effects\n", n,k,m,l); printf("was analyzed using %d (%d x %d) processors\n",size,*dims,*(dims+1)); } //Dimension of A (sparse matrix) is the number of fixed effects(m) + the sparse random effects (l) Adim=m+l; //Dimension of D (dense matrix) is the number of dense effects (k) Ddim=k; pcol= * ( position+1 ); //Define number of blocks needed to store a complete column/row of D Dblocks= Ddim%blocksize==0 ? Ddim/blocksize : Ddim/blocksize +1; //Define the number of rowblocks needed by the current process to store its part of the dense matrix D Drows= ( Dblocks - *position ) % *dims == 0 ? ( Dblocks- *position ) / *dims : ( Dblocks- *position ) / *dims +1; Drows= Drows<1? 1 : Drows; //Define the number of columnblocks needed by the current process to store its part of the dense matrix D Dcols= ( Dblocks - pcol ) % * ( dims+1 ) == 0 ? ( Dblocks- pcol ) / * ( dims+1 ) : ( Dblocks- pcol ) / * ( dims+1 ) +1; Dcols=Dcols<1? 1 : Dcols; //Define the local leading dimension of D (keeping in mind that matrices are always stored column-wise) lld_D=Drows*blocksize; //Initialise the descriptor of the dense distributed matrix DESCD= ( int* ) malloc ( DLEN_ * sizeof ( int ) ); if ( DESCD==NULL ) { printf ( "unable to allocate memory for descriptor for C\n" ); return -1; } //D with dimensions (Ddim,Ddim) is distributed over all processes in ICTXT2D, with the first element in process (0,0) //D is distributed into blocks of size (blocksize,blocksize), having a local leading dimension lld_D in this specific process descinit_ ( DESCD, &Ddim, &Ddim, &blocksize, &blocksize, &i_zero, &i_zero, &ICTXT2D, &lld_D, &info ); if ( info!=0 ) { printf ( "Descriptor of matrix C returns info: %d\n",info ); return info; } //Allocate the space necessary to store the part of D that is held into memory of this process. D = ( double* ) calloc ( Drows * blocksize * Dcols * blocksize,sizeof ( double ) ); if ( D==NULL ) { printf ( "unable to allocate memory for Matrix D (required: %ld bytes)\n", Drows * blocksize * Dcols * blocksize * sizeof ( double ) ); return EXIT_FAILURE; } blacs_barrier_ ( &ICTXT2D,"ALL" ); if (iam==0) printf ( "Start set up of B & D\n" ); blacs_barrier_ ( &ICTXT2D,"ALL" ); //set_up_BD is declared in readdist.cpp and constructs the parts of matrices B & D in each processor //which are necessary to create the distributed Schur complement of D info = set_up_BD ( DESCD, D, BT_i, B_j, Btsparse ); //printdense(Drows*blocksize, Dcols * blocksize,D,"matrix_D.txt"); blacs_barrier_ ( &ICTXT2D,"ALL" ); if (iam==0) printf ( "Matrices B & D set up\n" ); if(printD_bool) { int array_of_gsizes[2], array_of_distribs[2], array_of_dargs[2], array_of_psize[2] ; int buffersize; MPI_Datatype file_type; MPI_File fh; MPI_Status status; array_of_gsizes[0]=Dblocks * blocksize; array_of_gsizes[1]=Dblocks * blocksize; array_of_distribs[0]=MPI_DISTRIBUTE_CYCLIC; array_of_distribs[1]=MPI_DISTRIBUTE_CYCLIC; array_of_dargs[0]=blocksize; array_of_dargs[1]=blocksize; array_of_psize[0]=*dims; array_of_psize[1]=*(dims + 1); MPI_Type_create_darray(size,iam,2,array_of_gsizes, array_of_distribs, array_of_dargs, array_of_psize, MPI_ORDER_FORTRAN, MPI_DOUBLE, &file_type); MPI_Type_commit(&file_type); info = MPI_File_open(MPI_COMM_WORLD, filenameD, MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh); /*if ( ( Drows-1 ) % *(dims+1) == *position && ( Dcols-1 ) % *(dims) == pcol && Ddim%blocksize !=0 ) buffersize=((Drows-1) * blocksize + Ddim % blocksize) * ((Dcols-1) * blocksize + Ddim % blocksize); else if ( ( Drows-1 ) % *(dims+1) == *position && Ddim%blocksize !=0 ) buffersize=((Drows-1) * blocksize + Ddim % blocksize) * Dcols * blocksize; else if ( ( Dcols-1 ) % *(dims) == *position && Ddim%blocksize !=0 ) buffersize=((Dcols-1) * blocksize + Ddim % blocksize) * Drows * blocksize; else*/ buffersize= Dcols * Drows * blocksize * blocksize; MPI_File_set_view(fh, 0, MPI_DOUBLE, file_type, "native", MPI_INFO_NULL); info =MPI_File_write_all(fh, D,buffersize, MPI_DOUBLE, &status); MPI_File_close(&fh); if(iam==0) { printf("Matrix D (dimension %d) is printed in file %s\n", Dblocks*blocksize,filenameD); } if(filenameD != NULL) free(filenameD); filenameD=NULL; //delete[] array_of_gsizes, delete[] array_of_distribs, delete[] array_of_dargs, delete[] array_of_psize; } //Now every matrix has to set up the sparse matrix A, consisting of X'X, X'Z, Z'X and Z'Z + lambda*I Xsparse.loadFromFile ( filenameX ); Zsparse.loadFromFile ( filenameZ ); if(filenameX != NULL) free(filenameX); filenameX=NULL; if(filenameZ != NULL) free(filenameZ); filenameZ=NULL; smat_t *X_smat, *Z_smat; X_smat= (smat_t *) calloc(1,sizeof(smat_t)); Z_smat= (smat_t *) calloc(1,sizeof(smat_t)); X_smat = smat_new_from ( Xsparse.nrows,Xsparse.ncols,Xsparse.pRows,Xsparse.pCols,Xsparse.pData,0,0 ); Z_smat = smat_new_from ( Zsparse.nrows,Zsparse.ncols,Zsparse.pRows,Zsparse.pCols,Zsparse.pData,0,0 ); smat_t *Xt_smat, *Zt_smat; Xt_smat= (smat_t *) calloc(1,sizeof(smat_t)); Zt_smat= (smat_t *) calloc(1,sizeof(smat_t)); Xt_smat = smat_copy_trans ( X_smat ); Zt_smat = smat_copy_trans ( Z_smat ); CSRdouble Asparse; smat_t *XtX_smat, *XtZ_smat, *ZtZ_smat, *lambda_smat, *ZtZlambda_smat; XtX_smat= (smat_t *) calloc(1,sizeof(smat_t)); XtZ_smat= (smat_t *) calloc(1,sizeof(smat_t)); ZtZ_smat= (smat_t *) calloc(1,sizeof(smat_t)); XtX_smat = smat_matmul ( Xt_smat, X_smat ); XtZ_smat = smat_matmul ( Xt_smat, Z_smat ); ZtZ_smat = smat_matmul ( Zt_smat,Z_smat ); Xsparse.clear(); Zsparse.clear(); smat_free(Xt_smat); smat_free(Zt_smat); /*smat_free(X_smat); smat_free(Z_smat);*/ CSRdouble Imat; makeIdentity ( l, Imat ); lambda_smat= (smat_t *) calloc(1,sizeof(smat_t)); lambda_smat = smat_new_from ( Imat.nrows,Imat.ncols,Imat.pRows,Imat.pCols,Imat.pData,0,0 ); smat_scale_diag ( lambda_smat, -lambda ); ZtZlambda_smat= (smat_t *) calloc(1,sizeof(smat_t)); ZtZlambda_smat = smat_add ( lambda_smat, ZtZ_smat ); smat_free(ZtZ_smat); //smat_free(lambda_smat); smat_to_symmetric_structure ( XtX_smat ); smat_to_symmetric_structure ( ZtZlambda_smat ); CSRdouble XtX_sparse, XtZ_sparse, ZtZ_sparse; XtX_sparse.make2 ( XtX_smat->m,XtX_smat->n,XtX_smat->nnz,XtX_smat->ia,XtX_smat->ja,XtX_smat->a ); XtZ_sparse.make2 ( XtZ_smat->m,XtZ_smat->n,XtZ_smat->nnz,XtZ_smat->ia,XtZ_smat->ja,XtZ_smat->a ); ZtZ_sparse.make2 ( ZtZlambda_smat->m,ZtZlambda_smat->n,ZtZlambda_smat->nnz,ZtZlambda_smat->ia,ZtZlambda_smat->ja,ZtZlambda_smat->a ); /*smat_free(XtX_smat); smat_free(XtZ_smat); smat_free(ZtZlambda_smat);*/ Imat.clear(); if (iam==0) { cout << "*** [ t t ] *** " << endl; cout << "*** [ X X X Z ] *** " << endl; cout << "*** [ ] *** " << endl; cout << "*** G e n e r a t i n g m a t r i x A = [ ] *** " << endl; cout << "*** [ t t ] *** " << endl; cout << "*** [ Z X Z Z ] *** " << endl; } //Sparse matrix A only contains the upper triangular part of A create2x2SymBlockMatrix ( XtX_sparse, XtZ_sparse, ZtZ_sparse, Asparse ); //Asparse.writeToFile("A_sparse.csr"); smat_free(XtX_smat); smat_free(XtZ_smat); smat_free(ZtZlambda_smat); XtX_sparse.clear(); XtZ_sparse.clear(); ZtZ_sparse.clear(); blacs_barrier_ ( &ICTXT2D,"ALL" ); if(printsparseC_bool) { CSRdouble Dmat, Dblock, Csparse; Dblock.nrows=Dblocks * blocksize; Dblock.ncols=Dblocks * blocksize; Dblock.allocate(Dblocks * blocksize, Dblocks * blocksize, 0); Dmat.allocate(0,0,0); for (i=0; i<Drows; ++i) { for(j=0; j<Dcols; ++j) { dense2CSR_sub(D + i * blocksize + j * lld_D * blocksize,blocksize,blocksize,lld_D,Dblock,( * ( dims) * i + *position ) *blocksize, ( * ( dims+1 ) * j + pcol ) *blocksize); if ( Dblock.nonzeros>0 ) { if ( Dmat.nonzeros==0 ) { Dmat.make2 ( Dblock.nrows,Dblock.ncols,Dblock.nonzeros,Dblock.pRows,Dblock.pCols,Dblock.pData ); } else { Dmat.addBCSR ( Dblock ); } } Dblock.clear(); } } blacs_barrier_(&ICTXT2D,"A"); if ( iam!=0 ) { //Each process other than root sends its Dmat to the root process. MPI_Send ( & ( Dmat.nonzeros ),1, MPI_INT,0,iam,MPI_COMM_WORLD ); MPI_Send ( & ( Dmat.pRows[0] ),Dmat.nrows + 1, MPI_INT,0,iam+size,MPI_COMM_WORLD ); MPI_Send ( & ( Dmat.pCols[0] ),Dmat.nonzeros, MPI_INT,0,iam+2*size,MPI_COMM_WORLD ); MPI_Send ( & ( Dmat.pData[0] ),Dmat.nonzeros, MPI_DOUBLE,0,iam+3*size,MPI_COMM_WORLD ); Dmat.clear(); } else { for ( i=1; i<size; ++i ) { // The root process receives parts of Dmat sequentially from all processes and directly adds them together. int nonzeroes, count; MPI_Recv ( &nonzeroes,1,MPI_INT,i,i,MPI_COMM_WORLD,&status ); /*MPI_Get_count(&status, MPI_INT, &count); printf("Process 0 received %d elements of process %d\n",count,i);*/ if(nonzeroes>0) { printf("Nonzeroes : %d\n ",nonzeroes); Dblock.allocate ( Dblocks * blocksize,Dblocks * blocksize,nonzeroes ); MPI_Recv ( & ( Dblock.pRows[0] ), Dblocks * blocksize + 1, MPI_INT,i,i+size,MPI_COMM_WORLD,&status ); /*MPI_Get_count(&status, MPI_INT, &count); printf("Process 0 received %d elements of process %d\n",count,i);*/ MPI_Recv ( & ( Dblock.pCols[0] ),nonzeroes, MPI_INT,i,i+2*size,MPI_COMM_WORLD,&status ); /*MPI_Get_count(&status, MPI_INT, &count); printf("Process 0 received %d elements of process %d\n",count,i);*/ MPI_Recv ( & ( Dblock.pData[0] ),nonzeroes, MPI_DOUBLE,i,i+3*size,MPI_COMM_WORLD,&status ); /*MPI_Get_count(&status, MPI_DOUBLE, &count); printf("Process 0 received %d elements of process %d\n",count,i);*/ Dmat.addBCSR ( Dblock ); } } //Dmat.writeToFile("D_sparse.csr"); Dmat.reduceSymmetric(); Btsparse.transposeIt(1); create2x2SymBlockMatrix(Asparse,Btsparse, Dmat, Csparse); Btsparse.clear(); Dmat.clear(); Csparse.writeToFile(filenameC); Csparse.clear(); if(filenameC != NULL) free(filenameC); filenameC=NULL; } } Btsparse.clear(); blacs_barrier_(&ICTXT2D,"A"); //AB_sol will contain the solution of A*X=B, distributed across the process rows. Processes in the same process row possess the same part of AB_sol double * AB_sol; int * DESCAB_sol; DESCAB_sol= ( int* ) malloc ( DLEN_ * sizeof ( int ) ); if ( DESCAB_sol==NULL ) { printf ( "unable to allocate memory for descriptor for AB_sol\n" ); return -1; } //AB_sol (Adim, Ddim) is distributed across all processes in ICTXT2D starting from process (0,0) into blocks of size (Adim, blocksize) descinit_ ( DESCAB_sol, &Adim, &Ddim, &Adim, &blocksize, &i_zero, &i_zero, &ICTXT2D, &Adim, &info ); if ( info!=0 ) { printf ( "Descriptor of matrix C returns info: %d\n",info ); return info; } AB_sol=(double *) calloc(Adim * Dcols*blocksize,sizeof(double)); // Each process calculates the Schur complement of the part of D at its disposal. (see src/schur.cpp) // The solution of A * Y = B_j is stored in AB_sol (= A^-1 * B_j) blacs_barrier_(&ICTXT2D,"A"); make_Sij_parallel_denseB ( Asparse, BT_i, B_j, D, lld_D, AB_sol ); BT_i.clear(); B_j.clear(); //From here on the Schur complement S of D is stored in D blacs_barrier_ ( &ICTXT2D,"ALL" ); //The Schur complement is factorised (by ScaLAPACK) pdpotrf_ ( "U",&k,D,&i_one,&i_one,DESCD,&info ); if ( info != 0 ) { printf ( "Cholesky decomposition of D was unsuccessful, error returned: %d\n",info ); return -1; } //From here on the factorization of the Schur complement S is stored in D blacs_barrier_ ( &ICTXT2D,"ALL" ); //The Schur complement is inverted (by ScaLAPACK) pdpotri_ ( "U",&k,D,&i_one,&i_one,DESCD,&info ); if ( info != 0 ) { printf ( "Inverse of D was unsuccessful, error returned: %d\n",info ); return -1; } //From here on the inverse of the Schur complement S is stored in D blacs_barrier_(&ICTXT2D,"A"); double* InvD_T_Block = ( double* ) calloc ( Dblocks * blocksize + Adim ,sizeof ( double ) ); //Diagonal elements of the (1,1) block of C^-1 are still distributed and here they are gathered in InvD_T_Block in the root process. if(*position == pcol) { for (i=0; i<Ddim; ++i) { if (pcol == (i/blocksize) % *dims) { int Dpos = i%blocksize + ((i/blocksize) / *dims) * blocksize ; *(InvD_T_Block + Adim +i) = *( D + Dpos + lld_D * Dpos); } } for ( i=0,j=0; i<Dblocks; ++i,++j ) { if ( j==*dims ) j=0; if ( *position==j ) { dgesd2d_ ( &ICTXT2D,&blocksize,&i_one,InvD_T_Block + Adim + i * blocksize,&blocksize,&i_zero,&i_zero ); } if ( *position==0 ) { dgerv2d_ ( &ICTXT2D,&blocksize,&i_one,InvD_T_Block + Adim + blocksize*i,&blocksize,&j,&j ); } } } blacs_barrier_(&ICTXT2D,"A"); //Only the root process performs a selected inversion of A. if (iam==0) { int pardiso_message_level = 1; int pardiso_mtype=-2; ParDiSO pardiso ( pardiso_mtype, pardiso_message_level ); int number_of_processors = 1; char* var = getenv("OMP_NUM_THREADS"); if(var != NULL) { sscanf( var, "%d", &number_of_processors ); } else { printf("Set environment OMP_NUM_THREADS to 1"); exit(1); } pardiso.iparm[2] = 2; pardiso.iparm[3] = number_of_processors; pardiso.iparm[8] = 0; pardiso.iparm[11] = 1; pardiso.iparm[13] = 0; pardiso.iparm[28] = 0; //This function calculates the factorisation of A once again so this might be optimized. pardiso.findInverseOfA ( Asparse ); printf("Processor %d inverted matrix A\n",iam); } blacs_barrier_(&ICTXT2D,"A"); // To minimize memory usage, and because only the diagonal elements of the inverse are needed, Y' * S is calculated row by rowblocks // the diagonal element is calculates as the dot product of this row and the corresponding column of Y. (Y is solution of AY=B) double* YSrow= ( double* ) calloc ( Dcols * blocksize,sizeof ( double ) ); int * DESCYSROW; DESCYSROW= ( int* ) malloc ( DLEN_ * sizeof ( int ) ); if ( DESCYSROW==NULL ) { printf ( "unable to allocate memory for descriptor for AB_sol\n" ); return -1; } //YSrow (1,Ddim) is distributed across processes of ICTXT2D starting from process (0,0) into blocks of size (1,blocksize) descinit_ ( DESCYSROW, &i_one, &Ddim, &i_one,&blocksize, &i_zero, &i_zero, &ICTXT2D, &i_one, &info ); if ( info!=0 ) { printf ( "Descriptor of matrix C returns info: %d\n",info ); return info; } blacs_barrier_(&ICTXT2D,"A"); //Calculating diagonal elements 1 by 1 of the (0,0)-block of C^-1. for (i=1; i<=Adim; ++i) { pdsymm_ ("R","U",&i_one,&Ddim,&d_one,D,&i_one,&i_one,DESCD,AB_sol,&i,&i_one,DESCAB_sol,&d_zero,YSrow,&i_one,&i_one,DESCYSROW); pddot_(&Ddim,InvD_T_Block+i-1,AB_sol,&i,&i_one,DESCAB_sol,&Adim,YSrow,&i_one,&i_one,DESCYSROW,&i_one); /*if(*position==1 && pcol==1) printf("Dot product in process (1,1) is: %g\n", *(InvD_T_Block+i-1)); if(*position==0 && pcol==1) printf("Dot product in process (0,1) is: %g\n",*(InvD_T_Block+i-1));*/ } blacs_barrier_(&ICTXT2D,"A"); if(YSrow != NULL) free(YSrow); YSrow = NULL; if(DESCYSROW != NULL) free(DESCYSROW); DESCYSROW = NULL; if(AB_sol != NULL) free(AB_sol); AB_sol = NULL; if(DESCAB_sol != NULL) free(DESCAB_sol); DESCAB_sol = NULL; if(D != NULL) free(D); D = NULL; if(DESCD != NULL) free(DESCD); DESCD = NULL; //Only in the root process we add the diagonal elements of A^-1 if (iam ==0) { for(i=0; i<Adim; ++i) { j=Asparse.pRows[i]; *(InvD_T_Block+i) += Asparse.pData[j]; } Asparse.clear(); printdense ( Adim+k,1,InvD_T_Block,"diag_inverse_C_parallel.txt" ); } if(InvD_T_Block != NULL) free(InvD_T_Block); InvD_T_Block = NULL; blacs_gridexit_(&ICTXT2D); } //cout << iam << " reached end before MPI_Barrier" << endl; MPI_Barrier(MPI_COMM_WORLD); //MPI_Finalize(); return 0; }
int main (int argc, char *argv[]) { myscalar *A=NULL, *X=NULL, *B=NULL, *Y=NULL, *Xtrue=NULL; myscalar elem; int descA[BLACSCTXTSIZE], descVec[BLACSCTXTSIZE], descelem[BLACSCTXTSIZE]; int n; int nrhs; int nb; int locr, locc; int i, j, ii, jj; int ierr; int dummy; int myid, np; int myrow, mycol, nprow, npcol; int ctxt; myreal rdummy, res; n=1024; /* Size of the problem */ nrhs=3; /* Number of RHS */ nb=16; /* Blocksize for the 2D block-cyclic distribution */ /* Initialize MPI */ if((ierr=MPI_Init(&argc,&argv))) return 1; myid=-1; if((ierr=MPI_Comm_rank(MPI_COMM_WORLD,&myid))) return 1; np=-1; if((ierr=MPI_Comm_size(MPI_COMM_WORLD,&np))) return 1; /* Initialize the BLACS grid */ nprow=floor(sqrt((float)np)); npcol=np/nprow; blacs_get_(&IZERO,&IZERO,&ctxt); blacs_gridinit_(&ctxt,"R",&nprow,&npcol); blacs_gridinfo_(&ctxt,&nprow,&npcol,&myrow,&mycol); /* A is a dense n x n distributed Toeplitz matrix */ if(myid<nprow*npcol) { locr=numroc_(&n,&nb,&myrow,&IZERO,&nprow); locc=numroc_(&n,&nb,&mycol,&IZERO,&npcol); A=new myscalar[locr*locc]; dummy=std::max(1,locr); descinit_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr); myreal pi=3.1416, d=0.1; for(i=1;i<=locr;i++) { for(j=1;j<=locc;j++) { ii=indxl2g_(&i,&nb,&myrow,&IZERO,&nprow); jj=indxl2g_(&j,&nb,&mycol,&IZERO,&npcol); // Toeplitz matrix from Quantum Chemistry. A[locr*(j-1)+(i-1)]=ii==jj?std::pow(pi,2)/6.0/std::pow(d,2):std::pow(-1.0,ii-jj)/std::pow((myreal)ii-jj,2)/std::pow(d,2); } } } else { descset_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE); } /* Initializing solution */ if(myid<nprow*npcol) { locr=numroc_(&n,&nb,&myrow,&IZERO,&nprow); locc=numroc_(&nrhs,&nb,&mycol,&IZERO,&npcol); dummy=std::max(1,locr); descinit_(descVec,&n,&nrhs,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr); Xtrue=new myscalar[locr*locc](); for(i=0;i<locr*locc;i++) Xtrue[i]=static_cast<myscalar>(rand())/(static_cast<myscalar>(RAND_MAX)); } else { descset_(descVec,&n,&nrhs,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE); } /* Initializing solution and intermediate vector space */ if(myid<nprow*npcol) { X=new myscalar[locr*locc](); Y=new myscalar[locr*locc](); } /* Initializing RHS as A*Xtrue */ if(myid<nprow*npcol) { B=new myscalar[locr*locc](); pgemm('N','N',n,nrhs,n,ONE,A,IONE,IONE,descA,Xtrue,IONE,IONE,descVec,ZERO,B,IONE,IONE,descVec); } /* Initialize the solver and set parameters */ StrumpackDensePackage<myscalar,myreal> sdp(MPI_COMM_WORLD); sdp.use_HSS=true; sdp.levels_HSS=4; sdp.min_rand_HSS=64; sdp.lim_rand_HSS=0; sdp.tol_HSS=1e-12; sdp.split_HSS=768; /* Size of A11 */ /* Compression */ sdp.compress(A,descA); /* Accuracy checking */ sdp.check_compression(A,descA); /* Factorization */ sdp.partially_factor(A,descA,sdp.split_HSS); /* Schur complement update */ sdp.compute_schur(); /* Extracting a random element from the Schur complement */ if(myid<nprow*npcol) { descinit_(descelem,&IONE,&IONE,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr); } else { descset_(descelem,&IONE,&IONE,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE); } i=1+rand()%(n-sdp.split_HSS); j=1+rand()%(n-sdp.split_HSS); MPI_Bcast((void *)&i,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD); MPI_Bcast((void *)&j,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD); sdp.extract_schur(&elem,descelem,&i,IONE,&j,IONE); if(!myid) std::cout << "Element (" << i << "," << j << ") of Schur complement = " << elem << std::endl << std::endl; /* Condensation */ sdp.reduce_RHS(Y,descVec,B,descVec); /* Solve the Schur complement system (touches only the bottom part of X)*/ sdp.verbose=false; if(!myid) std::cout << "Solving Schur complement system with Conjugate Gradient..." << std::endl << std::endl; CG(&sdp,X,Y,descVec,n,nrhs,3000,1e-14); sdp.verbose=true; /* Expansion (touches only the top part of X) */ sdp.expand_solution(X,descVec,Y,descVec); /* Accuracy checking */ sdp.check_solution(A,descA,X,descVec,B,descVec); /* Forward error */ if(myid<nprow*npcol) { for(i=0;i<locr*locc;i++) Y[i]=X[i]-Xtrue[i]; res=plange('F',n,nrhs,Y,IONE,IONE,descVec,&rdummy); res/=plange('F',n,nrhs,Xtrue,IONE,IONE,descVec,&rdummy); if(!myid) std::cout << "Forward error = " << res << std::endl; } /* Statistics */ sdp.print_statistics(); /* Clean-up */ delete[] A; delete[] B; delete[] X; delete[] Y; delete[] Xtrue; /* The end */ MPI_Finalize(); return 0; }
int set_up_BD ( int * DESCD, double * Dmat, CSRdouble& BT_i, CSRdouble& B_j, CSRdouble& Btsparse ) { // Read-in of matrices X, Z and T from file (filename[X,Z,T]) // X and Z are read in entrely by every process // T is read in strip by strip (number of rows in each process is at maximum = blocksize) // D is constructed directly in a distributed way // B is first assembled sparse in root process and afterwards the necessary parts // for constructing the distributed Schur complement are sent to each process FILE *fT; int ni, i,j, info; int *DESCT; double *Tblock, *temp; int nTblocks, nstrips, pTblocks, stripcols, lld_T, pcol, colcur,rowcur; CSRdouble Xtsparse, Ztsparse,XtT_sparse,ZtT_sparse,XtT_temp, ZtT_temp; Xtsparse.loadFromFile ( filenameX ); Ztsparse.loadFromFile ( filenameZ ); Xtsparse.transposeIt ( 1 ); Ztsparse.transposeIt ( 1 ); XtT_sparse.allocate ( m,k,0 ); ZtT_sparse.allocate ( l,k,0 ); pcol= * ( position+1 ); // Matrix T is read in by strips of size (blocksize * *(dims+1), k) // Strips of T are read in row-wise and thus it is as if we store strips of T' (transpose) column-wise with dimensions (k, blocksize * *(dims+1)) // However we must then also transpose the process grid to distribute T' correctly // number of strips in which we divide matrix T' nstrips= n % ( blocksize * * ( dims+1 ) ) ==0 ? n / ( blocksize * * ( dims+1 ) ) : ( n / ( blocksize * * ( dims+1 ) ) ) +1; //the number of columns of T' included in each strip stripcols= blocksize * * ( dims+1 ); //number of blocks necessary to store complete column of T' nTblocks= k%blocksize==0 ? k/blocksize : k/blocksize +1; //number of blocks necessary in this process to store complete column of T' pTblocks= ( nTblocks - *position ) % *dims == 0 ? ( nTblocks- *position ) / *dims : ( nTblocks- *position ) / *dims +1; pTblocks= pTblocks <1? 1:pTblocks; //local leading dimension of the strip of T' (different from process to process) lld_T=pTblocks*blocksize; // Initialisation of descriptor of strips of matrix T' DESCT= ( int* ) malloc ( DLEN_ * sizeof ( int ) ); if ( DESCT==NULL ) { printf ( "unable to allocate memory for descriptor for Z\n" ); return -1; } // strip of T (k,stripcols) is distributed across ICTXT2D starting in process (0,0) in blocks of size (blocksize,blocksize) // the local leading dimension in this process is lld_T descinit_ ( DESCT, &k, &stripcols, &blocksize, &blocksize, &i_zero, &i_zero, &ICTXT2D, &lld_T, &info ); if ( info!=0 ) { printf ( "Descriptor of matrix Z returns info: %d\n",info ); return info; } // Allocation of memory for the strip of T' in all processes Tblock= ( double* ) calloc ( pTblocks*blocksize*blocksize, sizeof ( double ) ); if ( Tblock==NULL ) { printf ( "Error in allocating memory for a strip of Z in processor (%d,%d)",*position,* ( position+1 ) ); return -1; } // Initialisation of matrix D (all diagonal elements of D equal to lambda) temp=Dmat; for ( i=0,rowcur=0,colcur=0; i<Dblocks; ++i, ++colcur, ++rowcur ) { if ( rowcur==*dims ) { rowcur=0; temp += blocksize; } if ( colcur==* ( dims+1 ) ) { colcur=0; temp += blocksize*lld_D; } if ( *position==rowcur && * ( position+1 ) == colcur ) { for ( j=0; j<blocksize; ++j ) { * ( temp + j * lld_D +j ) =lambda; } if ( i==Dblocks-1 && Ddim % blocksize != 0 ) { for ( j=blocksize-1; j>= Ddim % blocksize; --j ) { * ( temp + j * lld_D + j ) =0.0; } } } } fT=fopen ( filenameT,"rb" ); if ( fT==NULL ) { printf ( "Error opening file\n" ); return -1; } // Set up of matrix D and B per strip of T' for ( ni=0; ni<nstrips; ++ni ) { if ( ni==nstrips-1 ) { if(Tblock != NULL) free ( Tblock ); Tblock=NULL; Tblock= ( double* ) calloc ( pTblocks*blocksize*blocksize, sizeof ( double ) ); if ( Tblock==NULL ) { printf ( "Error in allocating memory for a strip of Z in processor (%d,%d)\n",*position,* ( position+1 ) ); return -1; } } //Each process only reads in a part of the strip of T' //When k is not a multiple of blocksize, read-in of the last elements of the rows of T is tricky if ( ( nTblocks-1 ) % *dims == *position && k%blocksize !=0 ) { if ( ni==0 ) { info=fseek ( fT, ( long ) ( pcol * blocksize * ( k ) * sizeof ( double ) ),SEEK_SET ); if ( info!=0 ) { printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info ); return -1; } } else { info=fseek ( fT, ( long ) ( blocksize * ( * ( dims+1 )-1 ) * ( k ) * sizeof ( double ) ),SEEK_CUR ); if ( info!=0 ) { printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info ); return -1; } } for ( i=0; i<blocksize; ++i ) { info=fseek ( fT, ( long ) ( blocksize * *position * sizeof ( double ) ),SEEK_CUR ); if ( info!=0 ) { printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info ); return -1; } for ( j=0; j < pTblocks-1; ++j ) { fread ( Tblock + i*pTblocks*blocksize + j*blocksize,sizeof ( double ),blocksize,fT ); info=fseek ( fT, ( long ) ( ( ( *dims ) -1 ) * blocksize * sizeof ( double ) ),SEEK_CUR ); if ( info!=0 ) { printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info ); return -1; } } fread ( Tblock + i*pTblocks*blocksize + j*blocksize,sizeof ( double ),k%blocksize,fT ); } //Normal read-in of the strips of T from a binary file (each time blocksize elements are read in) } else { if ( ni==0 ) { info=fseek ( fT, ( long ) ( pcol * blocksize * ( k ) * sizeof ( double ) ),SEEK_SET ); if ( info!=0 ) { printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info ); return -1; } } else { info=fseek ( fT, ( long ) ( blocksize * ( * ( dims+1 )-1 ) * ( k ) * sizeof ( double ) ),SEEK_CUR ); if ( info!=0 ) { printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info ); return -1; } } for ( i=0; i<blocksize; ++i ) { info=fseek ( fT, ( long ) ( blocksize * *position * sizeof ( double ) ),SEEK_CUR ); if ( info!=0 ) { printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info ); return -1; } for ( j=0; j < pTblocks-1; ++j ) { fread ( Tblock + i*pTblocks*blocksize + j*blocksize,sizeof ( double ),blocksize,fT ); info=fseek ( fT, ( long ) ( ( * ( dims )-1 ) * blocksize * sizeof ( double ) ),SEEK_CUR ); if ( info!=0 ) { printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info ); return -1; } } fread ( Tblock + i*pTblocks*blocksize + j*blocksize,sizeof ( double ),blocksize,fT ); info=fseek ( fT, ( long ) ( ( k - blocksize * ( ( pTblocks-1 ) * *dims + *position +1 ) ) * sizeof ( double ) ),SEEK_CUR ); if ( info!=0 ) { printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info ); return -1; } } } blacs_barrier_ ( &ICTXT2D,"A" ); // End of read-in // Matrix D is the sum of the multiplications of all strips of T' by their transpose // Up unitl now, the entire matrix is stored, not only upper/lower triangular, which is possible since D is symmetric // Be aware, that you akways have to allocate memory for the enitre matrix, even when only dealing with the upper/lower triangular part pdgemm_ ( "N","T",&k,&k,&stripcols,&d_one, Tblock,&i_one, &i_one,DESCT, Tblock,&i_one, &i_one,DESCT, &d_one, Dmat, &i_one, &i_one, DESCD ); //Z'Z //pdsyrk_ ( "U","N",&k,&stripcols,&d_one, Tblock,&i_one, &i_one,DESCT, &d_one, Dmat, &t_plus, &t_plus, DESCD ); // Matrix B consists of X'T and Z'T, since each process only has some parts of T at its disposal, // we need to make sure that the correct columns of Z and X are multiplied with the correct columns of T. for ( i=0; i<pTblocks; ++i ) { XtT_temp.ncols=k; //This function multiplies the correct columns of X' with the blocks of T at the disposal of the process // The result is also stored immediately at the correct positions of X'T. (see src/tools.cpp) XtT_temp.clear(); mult_colsA_colsC ( Xtsparse, Tblock+i*blocksize, lld_T, ( * ( dims+1 ) * ni + pcol ) *blocksize, blocksize, ( *dims * i + *position ) *blocksize, blocksize, XtT_temp, 0 ); if ( XtT_temp.nonzeros>0 ) { if ( XtT_sparse.nonzeros==0 ){ XtT_sparse.clear(); XtT_sparse.make2 ( XtT_temp.nrows,XtT_temp.ncols,XtT_temp.nonzeros,XtT_temp.pRows,XtT_temp.pCols,XtT_temp.pData ); } else { XtT_sparse.addBCSR ( XtT_temp ); } } } //Same as above for calculating Z'T for ( i=0; i<pTblocks; ++i ) { ZtT_temp.ncols=k; ZtT_temp.clear(); mult_colsA_colsC ( Ztsparse, Tblock+i*blocksize, lld_T, ( * ( dims+1 ) * ni + pcol ) *blocksize, blocksize, blocksize * ( *dims * i + *position ), blocksize, ZtT_temp, 0 ); if ( ZtT_temp.nonzeros>0 ) { if ( ZtT_sparse.nonzeros==0 ){ ZtT_sparse.clear(); ZtT_sparse.make2 ( ZtT_temp.nrows,ZtT_temp.ncols,ZtT_temp.nonzeros,ZtT_temp.pRows,ZtT_temp.pCols,ZtT_temp.pData ); } else ZtT_sparse.addBCSR ( ZtT_temp ); } } blacs_barrier_ ( &ICTXT2D,"A" ); } XtT_temp.clear(); ZtT_temp.clear(); Xtsparse.clear(); Ztsparse.clear(); if(DESCT != NULL) free ( DESCT ); DESCT=NULL; if(Tblock != NULL) free ( Tblock ); Tblock=NULL; //printf("T read in\n"); info=fclose ( fT ); if ( info!=0 ) { printf ( "Error in closing open streams" ); return -1; } if(filenameT != NULL) free(filenameT); filenameT=NULL; //Each process only has calculated some parts of B //All parts are collected by the root process (iam==0), which assembles B //Each process then receives BT_i and B_j corresponding to the D_ij available to the process if ( iam!=0 ) { //Each process other than root sends its X' * T and Z' * T to the root process. MPI_Send ( & ( XtT_sparse.nonzeros ),1, MPI_INT,0,iam,MPI_COMM_WORLD ); MPI_Send ( & ( XtT_sparse.pRows[0] ),XtT_sparse.nrows + 1, MPI_INT,0,iam+size,MPI_COMM_WORLD ); MPI_Send ( & ( XtT_sparse.pCols[0] ),XtT_sparse.nonzeros, MPI_INT,0,iam+2*size,MPI_COMM_WORLD ); MPI_Send ( & ( XtT_sparse.pData[0] ),XtT_sparse.nonzeros, MPI_DOUBLE,0,iam+3*size,MPI_COMM_WORLD ); XtT_sparse.clear(); MPI_Send ( & ( ZtT_sparse.nonzeros ),1, MPI_INT,0,iam,MPI_COMM_WORLD ); MPI_Send ( & ( ZtT_sparse.pRows[0] ),ZtT_sparse.nrows + 1, MPI_INT,0,4*size + iam,MPI_COMM_WORLD ); MPI_Send ( & ( ZtT_sparse.pCols[0] ),ZtT_sparse.nonzeros, MPI_INT,0,iam+ 5*size,MPI_COMM_WORLD ); MPI_Send ( & ( ZtT_sparse.pData[0] ),ZtT_sparse.nonzeros, MPI_DOUBLE,0,iam+6*size,MPI_COMM_WORLD ); ZtT_sparse.clear(); //printf("Process %d sent ZtT and XtT\n",iam); // And eventually receives the necessary BT_i and B_j // Blocking sends are used, which is why the order of the receives is critical depending on the coordinates of the process int nonzeroes; if (*position >= pcol) { MPI_Recv ( &nonzeroes,1,MPI_INT,0,iam,MPI_COMM_WORLD,&status ); BT_i.allocate ( blocksize*Drows,m+l,nonzeroes ); MPI_Recv ( & ( BT_i.pRows[0] ),blocksize*Drows + 1, MPI_INT,0,iam + size,MPI_COMM_WORLD,&status ); int count; MPI_Get_count(&status,MPI_INT,&count); BT_i.nrows=count-1; MPI_Recv ( & ( BT_i.pCols[0] ),nonzeroes, MPI_INT,0,iam+2*size,MPI_COMM_WORLD,&status ); MPI_Recv ( & ( BT_i.pData[0] ),nonzeroes, MPI_DOUBLE,0,iam+3*size,MPI_COMM_WORLD,&status ); MPI_Recv ( &nonzeroes,1, MPI_INT,0,iam+4*size,MPI_COMM_WORLD,&status ); B_j.allocate ( blocksize*Dcols,m+l,nonzeroes ); MPI_Recv ( & ( B_j.pRows[0] ),blocksize*Dcols + 1, MPI_INT,0,iam + 5*size,MPI_COMM_WORLD,&status ); MPI_Get_count(&status,MPI_INT,&count); B_j.nrows=count-1; MPI_Recv ( & ( B_j.pCols[0] ),nonzeroes, MPI_INT,0,iam+6*size,MPI_COMM_WORLD,&status ); MPI_Recv ( & ( B_j.pData[0] ),nonzeroes, MPI_DOUBLE,0,iam+7*size,MPI_COMM_WORLD,&status ); //Actually BT_j is sent, so it still needs to be transposed B_j.transposeIt ( 1 ); } else { MPI_Recv ( &nonzeroes,1, MPI_INT,0,iam+4*size,MPI_COMM_WORLD,&status ); B_j.allocate ( blocksize*Dcols,m+l,nonzeroes ); MPI_Recv ( & ( B_j.pRows[0] ),blocksize*Dcols + 1, MPI_INT,0,iam + 5*size,MPI_COMM_WORLD,&status ); int count; MPI_Get_count(&status,MPI_INT,&count); B_j.nrows=count-1; MPI_Recv ( & ( B_j.pCols[0] ),nonzeroes, MPI_INT,0,iam+6*size,MPI_COMM_WORLD,&status ); MPI_Recv ( & ( B_j.pData[0] ),nonzeroes, MPI_DOUBLE,0,iam+7*size,MPI_COMM_WORLD,&status ); B_j.transposeIt ( 1 ); MPI_Recv ( &nonzeroes,1,MPI_INT,0,iam,MPI_COMM_WORLD,&status ); BT_i.allocate ( blocksize*Drows,m+l,nonzeroes ); MPI_Recv ( & ( BT_i.pRows[0] ),blocksize*Drows + 1, MPI_INT,0,iam + size,MPI_COMM_WORLD,&status ); MPI_Get_count(&status,MPI_INT,&count); BT_i.nrows=count-1; MPI_Recv ( & ( BT_i.pCols[0] ),nonzeroes, MPI_INT,0,iam+2*size,MPI_COMM_WORLD,&status ); MPI_Recv ( & ( BT_i.pData[0] ),nonzeroes, MPI_DOUBLE,0,iam+3*size,MPI_COMM_WORLD,&status ); } } else { for ( i=1; i<size; ++i ) { // The root process receives parts of X' * T and Z' * T sequentially from all processes and directly adds them together. int nonzeroes; MPI_Recv ( &nonzeroes,1,MPI_INT,i,i,MPI_COMM_WORLD,&status ); if(nonzeroes>0) { XtT_temp.allocate ( m,k,nonzeroes ); MPI_Recv ( & ( XtT_temp.pRows[0] ),m + 1, MPI_INT,i,i+size,MPI_COMM_WORLD,&status ); MPI_Recv ( & ( XtT_temp.pCols[0] ),nonzeroes, MPI_INT,i,i+2*size,MPI_COMM_WORLD,&status ); MPI_Recv ( & ( XtT_temp.pData[0] ),nonzeroes, MPI_DOUBLE,i,i+3*size,MPI_COMM_WORLD,&status ); XtT_sparse.addBCSR ( XtT_temp ); XtT_temp.clear(); } MPI_Recv ( &nonzeroes,1, MPI_INT,i,i,MPI_COMM_WORLD,&status ); if(nonzeroes>0) { ZtT_temp.allocate ( l,k,nonzeroes ); MPI_Recv ( & ( ZtT_temp.pRows[0] ),l + 1, MPI_INT,i,4*size + i,MPI_COMM_WORLD,&status ); MPI_Recv ( & ( ZtT_temp.pCols[0] ),nonzeroes, MPI_INT,i,i+ 5*size,MPI_COMM_WORLD,&status ); MPI_Recv ( & ( ZtT_temp.pData[0] ),nonzeroes, MPI_DOUBLE,i,i+6*size,MPI_COMM_WORLD,&status ); ZtT_sparse.addBCSR ( ZtT_temp ); ZtT_temp.clear(); } } XtT_sparse.transposeIt ( 1 ); ZtT_sparse.transposeIt ( 1 ); // B' is created by concatening blocks X'T and Z'T create1x2BlockMatrix ( XtT_sparse, ZtT_sparse,Btsparse ); XtT_sparse.clear(); ZtT_sparse.clear(); /*Btsparse.transposeIt(1); Btsparse.writeToFile("B_sparse.csr"); Btsparse.transposeIt(1);*/ // For each process row i BT_i is created which is also sent to processes in column i to become B_j. for ( int rowproc= *dims - 1; rowproc>= 0; --rowproc ) { BT_i.ncols=Btsparse.ncols; BT_i.nrows=0; BT_i.nonzeros=0; int Drows_rowproc; if (rowproc!=0) { Drows_rowproc= ( Dblocks - rowproc ) % *dims == 0 ? ( Dblocks- rowproc ) / *dims : ( Dblocks- rowproc ) / *dims +1; Drows_rowproc= Drows_rowproc<1? 1 : Drows_rowproc; } else Drows_rowproc=Drows; for ( i=0; i<Drows_rowproc; ++i ) { //Each process in row i can hold several blocks of contiguous rows of D for which we need the corresponding rows of B_T // Therefore we use the function extendrows to create BT_i (see src/tools.cpp) BT_i.extendrows ( Btsparse, ( i * *dims + rowproc ) * blocksize,blocksize ); } for ( int colproc= ( rowproc==0 ? 1 : 0 ); colproc < * ( dims+1 ); ++colproc ) { int rankproc; rankproc= blacs_pnum_ (&ICTXT2D, &rowproc,&colproc); MPI_Send ( & ( BT_i.nonzeros ),1, MPI_INT,rankproc,rankproc,MPI_COMM_WORLD ); MPI_Send ( & ( BT_i.pRows[0] ),BT_i.nrows + 1, MPI_INT,rankproc,rankproc+size,MPI_COMM_WORLD ); MPI_Send ( & ( BT_i.pCols[0] ),BT_i.nonzeros, MPI_INT,rankproc,rankproc+2*size,MPI_COMM_WORLD ); MPI_Send ( & ( BT_i.pData[0] ),BT_i.nonzeros, MPI_DOUBLE,rankproc,rankproc+3*size,MPI_COMM_WORLD ); //printf("BT_i's sent to processor %d\n",rankproc); rankproc= blacs_pnum_ (&ICTXT2D, &colproc,&rowproc); MPI_Send ( & ( BT_i.nonzeros ),1, MPI_INT,rankproc,rankproc+4*size,MPI_COMM_WORLD ); MPI_Send ( & ( BT_i.pRows[0] ),BT_i.nrows + 1, MPI_INT,rankproc,rankproc+5*size,MPI_COMM_WORLD ); MPI_Send ( & ( BT_i.pCols[0] ),BT_i.nonzeros, MPI_INT,rankproc,rankproc+6*size,MPI_COMM_WORLD ); MPI_Send ( & ( BT_i.pData[0] ),BT_i.nonzeros, MPI_DOUBLE,rankproc,rankproc+7*size,MPI_COMM_WORLD ); //printf("B_j's sent to processor %d\n",rankproc); } } B_j.make2 ( BT_i.nrows,BT_i.ncols,BT_i.nonzeros,BT_i.pRows,BT_i.pCols,BT_i.pData ); B_j.transposeIt ( 1 ); } return 0; }
/*==== MAIN FUNCTION =================================================*/ int main( int argc, char *argv[] ){ /* ==== Declarations =================================================== */ /* File variables */ FILE *fin; /* Matrix descriptors */ MDESC descA, descB, descC, descA_local, descB_local; /* Local scalars */ MKL_INT iam, nprocs, ictxt, myrow, mycol, nprow, npcol; MKL_INT n, nb, mp, nq, lld, lld_local; MKL_INT i, j, info; int n_int, nb_int, nprow_int, npcol_int; double thresh, diffnorm, anorm, bnorm, residual, eps; /* Local arrays */ double *A_local, *B_local, *A, *B, *C, *work; MKL_INT iwork[ 4 ]; /* ==== Executable statements ========================================== */ /* Get information about how many processes are used for program execution and number of current process */ blacs_pinfo_( &iam, &nprocs ); /* Init temporary 1D process grid */ blacs_get_( &i_negone, &i_zero, &ictxt ); blacs_gridinit_( &ictxt, "C", &nprocs, &i_one ); /* Open input file */ if ( iam == 0 ) { fin = fopen( "../in/pblas3ex.in", "r" ); if ( fin == NULL ) { printf( "Error while open input file." ); return 2; } } /* Read data and send it to all processes */ if ( iam == 0 ) { /* Read parameters */ fscanf( fin, "%d n, dimension of vectors, must be > 0 ", &n_int ); fscanf( fin, "%d nb, size of blocks, must be > 0 ", &nb_int ); fscanf( fin, "%d p, number of rows in the process grid, must be > 0", &nprow_int ); fscanf( fin, "%d q, number of columns in the process grid, must be > 0, p*q = number of processes", &npcol_int ); fscanf( fin, "%lf threshold for residual check (to switch off check set it < 0.0) ", &thresh ); n = (MKL_INT) n_int; nb = (MKL_INT) nb_int; nprow = (MKL_INT) nprow_int; npcol = (MKL_INT) npcol_int; /* Check if all parameters are correct */ if( ( n<=0 )||( nb<=0 )||( nprow<=0 )||( npcol<=0 )||( nprow*npcol != nprocs ) ) { printf( "One or several input parameters has incorrect value. Limitations:\n" ); printf( "n > 0, nb > 0, p > 0, q > 0 - integer\n" ); printf( "p*q = number of processes\n" ); printf( "threshold - double (set to negative to swicth off check)\n"); return 2; } /* Pack data into array and send it to other processes */ iwork[ 0 ] = n; iwork[ 1 ] = nb; iwork[ 2 ] = nprow; iwork[ 3 ] = npcol; igebs2d_( &ictxt, "All", " ", &i_four, &i_one, iwork, &i_four ); dgebs2d_( &ictxt, "All", " ", &i_one, &i_one, &thresh, &i_one ); } else { /* Recieve and unpack data */ igebr2d_( &ictxt, "All", " ", &i_four, &i_one, iwork, &i_four, &i_zero, &i_zero ); dgebr2d_( &ictxt, "All", " ", &i_one, &i_one, &thresh, &i_one, &i_zero, &i_zero ); n = iwork[ 0 ]; nb = iwork[ 1 ]; nprow = iwork[ 2 ]; npcol = iwork[ 3 ]; } if ( iam == 0 ) { fclose( fin ); } /* Destroy temporary process grid */ blacs_gridexit_( &ictxt ); /* Init workind 2D process grid */ blacs_get_( &i_negone, &i_zero, &ictxt ); blacs_gridinit_( &ictxt, "R", &nprow, &npcol ); blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol ); /* Create on process 0 two matrices: A - orthonormal, B -random */ if ( ( myrow == 0 ) && ( mycol == 0 ) ){ /* Allocate arrays */ A_local = (double*) calloc( n*n, sizeof( double ) ); B_local = (double*) calloc( n*n, sizeof( double ) ); /* Set arrays */ for ( i=0; i<n; i++ ){ for ( j=0; j<n; j++ ){ B_local[ i+n*j ] = one*rand()/RAND_MAX; } B_local[ i+n*i ] += two; } for ( j=0; j<n; j++ ){ for ( i=0; i<n; i++ ){ if ( j < n-1 ){ if ( i <= j ){ A_local[ i+n*j ] = one / sqrt( ( double )( (j+1)*(j+2) ) ); } else if ( i == j+1 ) { A_local[ i+n*j ] = -one / sqrt( one + one/( double )(j+1) ); } else { A_local[ i+n*j ] = zero; } } else { A_local[ i+n*(n-1) ] = one / sqrt( ( double )n ); } } } /* Print information of task */ printf( "=== START OF EXAMPLE ===================\n" ); printf( "Matrix-matrix multiplication: A*B = C\n\n" ); printf( "/ 1/q_1 ........ 1/q_n-1 1/q_n \\ \n" ); printf( "| . | \n" ); printf( "| `. : : | \n" ); printf( "| -1/q_1 `. : : | \n" ); printf( "| . `. : : | = A \n" ); printf( "| 0 `. ` | \n" ); printf( "| : `. `. 1/q_n-1 1/q_n | \n" ); printf( "| : `. `. | \n" ); printf( "\\ 0 .... 0 -(n-1)/q_n-1 1/q_n / \n\n" ); printf( "q_i = sqrt( i^2 + i ), i=1..n-1, q_n = sqrt( n )\n\n" ); printf( "A - n*n real matrix (orthonormal) \n" ); printf( "B - random n*n real matrix\n\n" ); printf( "n = %d, nb = %d; %dx%d - process grid\n\n", n, nb, nprow, npcol ); printf( "=== PROGRESS ===========================\n" ); } else { /* Other processes don't contain parts of initial arrays */ A_local = NULL; B_local = NULL; } /* Compute precise length of local pieces and allocate array on each process for parts of distributed vectors */ mp = numroc_( &n, &nb, &myrow, &i_zero, &nprow ); nq = numroc_( &n, &nb, &mycol, &i_zero, &npcol ); A = (double*) calloc( mp*nq, sizeof( double ) ); B = (double*) calloc( mp*nq, sizeof( double ) ); C = (double*) calloc( mp*nq, sizeof( double ) ); /* Compute leading dimensions */ lld_local = MAX( numroc_( &n, &n, &myrow, &i_zero, &nprow ), 1 ); lld = MAX( mp, 1 ); /* Initialize descriptors for initial arrays located on 0 process */ descinit_( descA_local, &n, &n, &n, &n, &i_zero, &i_zero, &ictxt, &lld_local, &info ); descinit_( descB_local, &n, &n, &n, &n, &i_zero, &i_zero, &ictxt, &lld_local, &info ); /* Initialize descriptors for distributed arrays */ descinit_( descA, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info ); descinit_( descB, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info ); descinit_( descC, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info ); /* Distribute matrices from 0 process over process grid */ pdgeadd_( &trans, &n, &n, &one, A_local, &i_one, &i_one, descA_local, &zero, A, &i_one, &i_one, descA ); pdgeadd_( &trans, &n, &n, &one, B_local, &i_one, &i_one, descB_local, &zero, B, &i_one, &i_one, descB ); if( iam == 0 ){ printf( ".. Arrays are distributed ( p?geadd ) ..\n" ); } /* Destroy arrays on 0 process - they are not necessary anymore */ if( ( myrow == 0 ) && ( mycol == 0 ) ){ free( A_local ); free( B_local ); } /* Compute norm of A and B */ work = (double*) calloc( mp, sizeof( double ) ); anorm = pdlange_( "I", &n, &n, A, &i_one, &i_one, descA, work ); bnorm = pdlange_( "I", &n, &n, B, &i_one, &i_one, descB, work ); if( iam == 0 ){ printf( ".. Norms of A and B are computed ( p?lange ) ..\n" ); } /* Compute product C = A*B */ pdgemm_( "N", "N", &n, &n, &n, &one, A, &i_one, &i_one, descA, B, &i_one, &i_one, descB, &zero, C, &i_one, &i_one, descC ); if( iam == 0 ){ printf( ".. Multiplication A*B=C is done ( p?gemm ) ..\n" ); } /* Compute difference B - inv_A*C (inv_A = transpose(A) because A is orthonormal) */ pdgemm_( "T", "N", &n, &n, &n, &one, A, &i_one, &i_one, descA, C, &i_one, &i_one, descC, &negone, B, &i_one, &i_one, descB ); if( iam == 0 ){ printf( ".. Difference is computed ( p?gemm ) ..\n" ); } /* Compute norm of B - inv_A*C (which is contained in B) */ diffnorm = pdlange_( "I", &n, &n, B, &i_one, &i_one, descB, work ); free( work ); if( iam == 0 ){ printf( ".. Norms of the difference B-inv_A*C is computed ( p?lange ) ..\n" ); } /* Print results */ if( iam == 0 ){ printf( ".. Solutions are compared ..\n" ); printf( "== Results ==\n" ); printf( "||A|| = %03.11f\n", anorm ); printf( "||B|| = %03.11f\n", bnorm ); printf( "=== END OF EXAMPLE =====================\n" ); } /* Compute machine epsilon */ eps = pdlamch_( &ictxt, "e" ); /* Compute residual */ residual = diffnorm /( two*anorm*bnorm*eps ); /* Destroy arrays */ free( A ); free( B ); free( C ); /* Destroy process grid */ blacs_gridexit_( &ictxt ); blacs_exit_( &i_zero ); /* Check if residual passed or failed the threshold */ if ( ( iam == 0 ) && ( thresh >= zero ) && !( residual <= thresh ) ){ printf( "FAILED. Residual = %05.16f\n", residual ); return 1; } else { return 0; } /*======================================================================== ====== End of PBLAS Level 3 example ==================================== ======================================================================*/ }
void test_gemr2d(int M, int N) { int repeat = 10; int32_t one = 1; int32_t isrc = 0; int32_t num_ranks; MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); int32_t rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank); int32_t bs_row_A = M / num_ranks + std::min(1, M % num_ranks); int32_t bs_col_A = 1; int32_t bs_row_B = 1; int32_t bs_col_B = 1; int32_t blacs_handler = Csys2blacs_handle(MPI_COMM_WORLD); int32_t context1 = blacs_handler; int32_t context2 = blacs_handler; /* create BLACS context */ Cblacs_gridinit(&context1, "C", num_ranks, 1); /* get row and column ranks */ int32_t rank_row1, rank_col1; Cblacs_gridinfo(context1, &num_ranks, &one, &rank_row1, &rank_col1); /* get local number of rows and columns of a matrix */ int32_t num_rows_local1, num_cols_local1; num_rows_local1 = numroc_(&M, &bs_row_A, &rank_row1, &isrc, &num_ranks); num_cols_local1 = numroc_(&N, &bs_col_A, &rank_col1, &isrc, &one); Cblacs_gridinit(&context2, "C", 1, num_ranks); int32_t rank_row2, rank_col2; Cblacs_gridinfo(context2, &one, &num_ranks, &rank_row2, &rank_col2); int32_t num_rows_local2, num_cols_local2; num_rows_local2 = numroc_(&M, &bs_row_B, &rank_row2, &isrc, &one); num_cols_local2 = numroc_(&N, &bs_col_B, &rank_col2, &isrc, &num_ranks); if (rank == 0) { printf("local dimensions of A: %i x %i\n", num_rows_local1, num_cols_local1); printf("local dimensions of B: %i x %i\n", num_rows_local2, num_cols_local2); } int32_t descA[9], descB[9], info; descinit_(descA, &M, &N, &bs_row_A, &bs_col_A, &isrc, &isrc, &context1, &num_rows_local1, &info); descinit_(descB, &M, &N, &bs_row_B, &bs_col_B, &isrc, &isrc, &context2, &num_rows_local2, &info); std::vector<double_complex> A(num_rows_local1 * num_cols_local1); std::vector<double_complex> B(num_rows_local2 * num_cols_local2, double_complex(0, 0)); std::vector<double_complex> C(num_rows_local1 * num_cols_local1); for (int i = 0; i < num_rows_local1 * num_cols_local1; i++) { A[i] = double_complex(double(rand()) / RAND_MAX, double(rand()) / RAND_MAX); C[i] = A[i]; } double time = -MPI_Wtime(); for (int i = 0; i < repeat; i++) { pzgemr2d_(&M, &N, &A[0], &one, &one, descA, &B[0], &one, &one, descB, &context1); } time += MPI_Wtime(); if (rank == 0) { printf("average time %.4f sec, swap speed: %.4f GB/sec\n", time / repeat, sizeof(double_complex) * repeat * M * N / double(1 << 30) / time); } /* check correctness */ pzgemr2d_(&M, &N, &B[0], &one, &one, descB, &A[0], &one, &one, descA, &context1); for (int i = 0; i < num_rows_local1 * num_cols_local1; i++) { if (std::abs(A[i] - C[i]) > 1e-14) { printf("Fail.\n"); exit(0); } } Cblacs_gridexit(context1); Cblacs_gridexit(context2); Cfree_blacs_system_handle(blacs_handler); }
int main ( int argc, char **argv ) { int info, i, j, pcol; double *D, *AB_sol, *InvD_T_Block, *XSrow; int *DESCD, *DESCAB_sol, *DESCXSROW; bool readAFromFile = false; //CSRdouble BT_i, B_j; double* BT_i; double* B_j; int s_BT_i = 0; int s_B_j = 0; // size of BT_i (#rows) and B_j (#cols) int nx, ny, nz; CSRdouble Asparse, Btsparse; if (argc < 6) { // needed args: nx, ny, nz, Ddim, blocksize; // optional: C.csr - if provided, the output file is written. cout << "Too few arguments." << endl; cout << "Usage: " << argv[0] << " nx ny nz Ddim blocksize [C.csr]" << endl; cout << "Usage: " << argv[0] << " -fileA filename Adim Ddim blocksize [C.csr]" << endl; exit(-1); } if (strcmp(argv[1], "-fileA") == 0) // if (argv[1] == "-fileA") { readAFromFile = true; filenameA = new char[250]; strcpy(filenameA, argv[2]); Adim = atoi(argv[3]); } else { nx = atoi(argv[1]); ny = atoi(argv[2]); nz = atoi(argv[3]); Adim = nx * ny * nz; } Ddim = atoi(argv[4]); blocksize = atoi(argv[5]); //printf("Adim=%d, Ddim=%d, blocksize=%d *** %s", Adim, Ddim, blocksize, outputc); //exit(-123); //Initialise MPI and some MPI-variables info = MPI_Init ( &argc, &argv ); if ( info != 0 ) { printf ( "Error in MPI initialisation: %d\n",info ); return info; } position= ( int* ) calloc ( 2,sizeof ( int ) ); if ( position==NULL ) { printf ( "unable to allocate memory for processor position coordinate\n" ); return EXIT_FAILURE; } dims= ( int* ) calloc ( 2,sizeof ( int ) ); if ( dims==NULL ) { printf ( "unable to allocate memory for grid dimensions coordinate\n" ); return EXIT_FAILURE; } //BLACS is the interface used by PBLAS and ScaLAPACK on top of MPI blacs_pinfo_ ( &iam,&size ); //determine the number of processes involved info=MPI_Dims_create ( size, 2, dims ); //determine the best 2D cartesian grid with the number of processes if ( info != 0 ) { printf ( "Error in MPI creation of dimensions: %d\n",info ); return info; } //Until now the code can only work with square process grids //So we try to get the biggest square grid possible with the number of processes involved if ( *dims != * ( dims+1 ) ) { while ( *dims * *dims > size ) *dims -=1; * ( dims+1 ) = *dims; if ( iam==0 ) printf ( "WARNING: %d processor(s) unused due to reformatting to a square process grid\n", size - ( *dims * *dims ) ); size = *dims * *dims; //cout << "New size of process grid: " << size << endl; } blacs_get_ ( &i_negone,&i_zero,&ICTXT2D ); //Initialisation of the BLACS process grid, which is referenced as ICTXT2D blacs_gridinit_ ( &ICTXT2D,"R",dims, dims+1 ); if ( iam < size ) { //The rank (iam) of the process is mapped to a 2D grid: position= (process row, process column) blacs_pcoord_ ( &ICTXT2D,&iam,position, position+1 ); if ( *position ==-1 ) { printf ( "Error in proces grid\n" ); return -1; } /* //Filenames, dimensions of all matrices and other important variables are read in as global variables (see src/readinput.cpp) info=read_input ( *++argv ); if ( info!=0 ) { printf ( "Something went wrong when reading input file for processor %d\n",iam ); return -1; } */ //blacs_barrier is used to stop any process of going beyond this point before all processes have made it up to this point. blacs_barrier_ ( &ICTXT2D,"ALL" ); if ( * ( position+1 ) ==0 && *position==0 ) printf ( "Reading of input-file succesful\n" ); if ( * ( position+1 ) ==0 && *position==0 ) { printf ( "\nA sparse square matrix of dimension %d with a dense square submatrix with dimension %d \n", Adim+Ddim,Ddim ); printf ( "was analyzed using %d (%d x %d) processors\n",size,*dims,* ( dims+1 ) ); } pcol= * ( position+1 ); //Define number of blocks needed to store a complete column/row of D Dblocks= Ddim%blocksize==0 ? Ddim/blocksize : Ddim/blocksize +1; //Define the number of rowblocks needed by the current process to store its part of the dense matrix D Drows= ( Dblocks - *position ) % *dims == 0 ? ( Dblocks- *position ) / *dims : ( Dblocks- *position ) / *dims +1; Drows= Drows<1? 1 : Drows; //Define the number of columnblocks needed by the current process to store its part of the dense matrix D Dcols= ( Dblocks - pcol ) % * ( dims+1 ) == 0 ? ( Dblocks- pcol ) / * ( dims+1 ) : ( Dblocks- pcol ) / * ( dims+1 ) +1; Dcols=Dcols<1? 1 : Dcols; //Define the local leading dimension of D (keeping in mind that matrices are always stored column-wise) lld_D=Drows*blocksize; // cout << "Hi! I am " << iam << ". My position is ( " << *position << "," << *(position+1) << ") and I have... Dblocks: " << Dblocks << "; Drows: " << Drows << "; Dcols: " << Dcols << "; blocksize: " << blocksize << endl; //Initialise the descriptor of the dense distributed matrix DESCD= ( int* ) malloc ( DLEN_ * sizeof ( int ) ); if ( DESCD==NULL ) { printf ( "unable to allocate memory for descriptor for C\n" ); return -1; } //D with dimensions (Ddim,Ddim) is distributed over all processes in ICTXT2D, with the first element in process (0,0) //D is distributed into blocks of size (blocksize,blocksize), having a local leading dimension lld_D in this specific process descinit_ ( DESCD, &Ddim, &Ddim, &blocksize, &blocksize, &i_zero, &i_zero, &ICTXT2D, &lld_D, &info ); if ( info!=0 ) { printf ( "Descriptor of matrix C returns info: %d\n",info ); return info; } //Allocate the space necessary to store the part of D that is held into memory of this process. D = ( double* ) calloc ( Drows * blocksize * Dcols * blocksize,sizeof ( double ) ); if ( D==NULL ) { printf ( "unable to allocate memory for Matrix D (required: %ld bytes)\n", Drows * blocksize * Dcols * blocksize * sizeof ( double ) ); return EXIT_FAILURE; } blacs_barrier_ ( &ICTXT2D,"ALL" ); //added B_j = new double[Adim * Dcols * blocksize]; BT_i = new double[Adim * Drows * blocksize]; //read_in_BD ( DESCD,D, BT_i, B_j, Btsparse ) ; if ( iam == 0 ) cout << "Generating A, B and D... \n" << endl; generate_BD(D, BT_i, B_j, &s_BT_i, &s_B_j); cout << "- B, D generated." << endl; //Now every process has to read in the sparse matrix A //makeDiagonalPerturbD(Adim, 1000.0, 1e-10, Asparse); cout << "A is a pert. diag." << endl; //makeRandCSRUpper(Adim, 0.001, Asparse); //cout << "nnz(A) = " << Asparse.nonzeros << endl; //Asparse.loadFromFileSym("/users/drosos/simple/matrices/NornePrimaryJacobian.csr"); if (readAFromFile) { Asparse.loadFromFile(filenameA); cout << "A loaded from file" << endl; //Asparse.reduceSymmetric(); } else { make3DLaplace(nx, ny, nz, Asparse); cout << "A is Laplacian" << endl; shiftIndices(Asparse, -1); } cout << "- A generated." << endl; Asparse.matrixType = SYMMETRIC; assert(Asparse.nrows == Adim); assert(Asparse.ncols == Adim); //if (iam == 0) Asparse.writeToFile("A_debug.csr"); exit(-1234); if (argc == 7) // if the name of the output file for C is given as parameter { filenameC = new char[250]; //strcpy(filenameC, argv[6]); sprintf(filenameC, "/scratch/daint/verbof/sparsedense/C_%d_%d.csr", Adim, Ddim); CSRdouble Dmat, Dblock, Csparse, Bblock; Dblock.nrows=Dblocks * blocksize; Dblock.ncols=Dblocks * blocksize; Dblock.allocate ( Dblocks * blocksize, Dblocks * blocksize, 0 ); Dmat.allocate ( 0,0,0 ); for ( i=0; i<Drows; ++i ) { for ( j=0; j<Dcols; ++j ) { dense2CSR_sub ( D + i * blocksize + j * lld_D * blocksize,blocksize,blocksize,lld_D,Dblock, ( * ( dims ) * i + *position ) *blocksize, ( * ( dims+1 ) * j + pcol ) *blocksize ); if ( Dblock.nonzeros>0 ) { if ( Dmat.nonzeros==0 ) { Dmat.make2 ( Dblock.nrows,Dblock.ncols,Dblock.nonzeros,Dblock.pRows,Dblock.pCols,Dblock.pData ); } else { Dmat.addBCSR ( Dblock ); } } Dblock.clear(); } } if ( *position==0 ) { Bblock.nrows=Adim; Bblock.ncols=Dblocks * blocksize; Bblock.allocate ( Adim, Dblocks * blocksize, 0 ); Btsparse.allocate ( 0,0,0 ); for ( j=0; j<Dcols; ++j ) { dense2CSR_sub ( B_j + j * Adim * blocksize,Adim,blocksize,Adim,Bblock,0, ( * ( dims+1 ) * j + pcol ) *blocksize ); if ( Bblock.nonzeros>0 ) { if ( Btsparse.nonzeros==0 ) { Btsparse.make2 ( Bblock.nrows,Bblock.ncols,Bblock.nonzeros,Bblock.pRows,Bblock.pCols,Bblock.pData ); } else { Btsparse.addBCSR ( Bblock ); } } Bblock.clear(); } } blacs_barrier_ ( &ICTXT2D,"A" ); if ( iam!=0 ) { //Each process other than root sends its Dmat to the root process. MPI_Send ( & ( Dmat.nonzeros ),1, MPI_INT,0,iam,MPI_COMM_WORLD ); MPI_Send ( & ( Dmat.pRows[0] ),Dmat.nrows + 1, MPI_INT,0,iam+size,MPI_COMM_WORLD ); MPI_Send ( & ( Dmat.pCols[0] ),Dmat.nonzeros, MPI_INT,0,iam+2*size,MPI_COMM_WORLD ); MPI_Send ( & ( Dmat.pData[0] ),Dmat.nonzeros, MPI_DOUBLE,0,iam+3*size,MPI_COMM_WORLD ); Dmat.clear(); if ( *position==0 ) { MPI_Send ( & ( Btsparse.nonzeros ),1, MPI_INT,0,iam+4*size,MPI_COMM_WORLD ); MPI_Send ( & ( Btsparse.pRows[0] ),Btsparse.nrows + 1, MPI_INT,0,iam+5*size,MPI_COMM_WORLD ); MPI_Send ( & ( Btsparse.pCols[0] ),Btsparse.nonzeros, MPI_INT,0,iam+6*size,MPI_COMM_WORLD ); MPI_Send ( & ( Btsparse.pData[0] ),Btsparse.nonzeros, MPI_DOUBLE,0,iam+7*size,MPI_COMM_WORLD ); Btsparse.clear(); } } else { //Btsparse.writeToFile("Btsparse_pre.csr"); for ( i=1; i<size; ++i ) { // The root process receives parts of Dmat sequentially from all processes and directly adds them together. int nonzeroes, count; MPI_Recv ( &nonzeroes,1,MPI_INT,i,i,MPI_COMM_WORLD,&status ); /*MPI_Get_count(&status, MPI_INT, &count); printf("Process 0 received %d elements of process %d\n",count,i);*/ if ( nonzeroes>0 ) { printf ( "Nonzeroes : %d\n ",nonzeroes ); Dblock.allocate ( Dblocks * blocksize,Dblocks * blocksize,nonzeroes ); MPI_Recv ( & ( Dblock.pRows[0] ), Dblocks * blocksize + 1, MPI_INT,i,i+size,MPI_COMM_WORLD,&status ); /*MPI_Get_count(&status, MPI_INT, &count); printf("Process 0 received %d elements of process %d\n",count,i);*/ MPI_Recv ( & ( Dblock.pCols[0] ),nonzeroes, MPI_INT,i,i+2*size,MPI_COMM_WORLD,&status ); /*MPI_Get_count(&status, MPI_INT, &count); printf("Process 0 received %d elements of process %d\n",count,i);*/ MPI_Recv ( & ( Dblock.pData[0] ),nonzeroes, MPI_DOUBLE,i,i+3*size,MPI_COMM_WORLD,&status ); /*MPI_Get_count(&status, MPI_DOUBLE, &count); printf("Process 0 received %d elements of process %d\n",count,i);*/ Dmat.addBCSR ( Dblock ); Dblock.clear(); } if ( i / *dims == 0 ) { MPI_Recv ( &nonzeroes,1,MPI_INT,i,i+4*size,MPI_COMM_WORLD,&status ); /*MPI_Get_count(&status, MPI_INT, &count); printf("Process 0 received %d elements of process %d\n",count,i);*/ if ( nonzeroes>0 ) { printf ( "Nonzeroes : %d\n ",nonzeroes ); Bblock.allocate ( Adim,Dblocks * blocksize,nonzeroes ); MPI_Recv ( & ( Bblock.pRows[0] ), Adim + 1, MPI_INT,i,i+5*size,MPI_COMM_WORLD,&status ); /*MPI_Get_count(&status, MPI_INT, &count); printf("Process 0 received %d elements of process %d\n",count,i);*/ MPI_Recv ( & ( Bblock.pCols[0] ),nonzeroes, MPI_INT,i,i+6*size,MPI_COMM_WORLD,&status ); /*MPI_Get_count(&status, MPI_INT, &count); printf("Process 0 received %d elements of process %d\n",count,i);*/ MPI_Recv ( & ( Bblock.pData[0] ),nonzeroes, MPI_DOUBLE,i,i+7*size,MPI_COMM_WORLD,&status ); /*MPI_Get_count(&status, MPI_DOUBLE, &count); printf("Process 0 received %d elements of process %d\n",count,i);*/ Btsparse.addBCSR ( Bblock ); Bblock.clear(); } } } //Dmat.writeToFile("D_sparse.csr"); printf ( "Number of nonzeroes in D: %d\n",Dmat.nonzeros ); Dmat.reduceSymmetric(); //Dmat.writeToFile("D_sparse_symm.csr"); //Btsparse.writeToFile("Btsparse.csr"); Dmat.changeCols(Ddim); Dmat.changeRows(Ddim); //Dmat.writeToFile("Dsparse_red.csr"); Btsparse.changeCols(Ddim); create2x2SymBlockMatrix ( Asparse,Btsparse, Dmat, Csparse ); Btsparse.clear(); Dmat.clear(); ParDiSO p(-2,0); p.init(Csparse, 1); p.factorize(Csparse); //Csparse.fillSymmetric(); //Csparse.writeToFilePSelInv(filenameC); //Csparse.writeToFile(filenameC); Csparse.clear(); //double* Cdense = new double[Csparse.nrows * Csparse.ncols]; //CSR2dense(Csparse, Cdense); //printdense(Adim+Ddim, Adim+Ddim, Cdense, "C.txt"); if ( filenameC != NULL ) free ( filenameC ); filenameC=NULL; } if (iam == 0) { cout << "\n - C saved in file " << filenameC << "! Exiting... \n\n" << endl; exit(-12345); } } //AB_sol will contain the solution of A*X=B, distributed across the process rows. Processes in the same process row possess the same part of AB_sol DESCAB_sol= ( int* ) malloc ( DLEN_ * sizeof ( int ) ); if ( DESCAB_sol==NULL ) { printf ( "unable to allocate memory for descriptor for AB_sol\n" ); return -1; } //AB_sol (Adim, Ddim) is distributed across all processes in ICTXT2D starting from process (0,0) into blocks of size (Adim, blocksize) descinit_ ( DESCAB_sol, &Adim, &Ddim, &Adim, &blocksize, &i_zero, &i_zero, &ICTXT2D, &Adim, &info ); if ( info!=0 ) { printf ( "Descriptor of matrix C returns info: %d\n",info ); return info; } AB_sol= ( double * ) calloc ( Adim * s_B_j,sizeof ( double ) ); blacs_barrier_ ( &ICTXT2D,"A" ); /********************** TIMING **********************/ if ( iam == 0 ) watch.tick ( totaltime ); if ( iam == 0 ) watch.tick ( cresctime ); // Each process calculates the Schur complement of the part of D at its disposal. (see src/schur.cpp) // The solution of A * X = B_j is stored in AB_sol (= A^-1 * B_j) /* char * BT_i_debugFile = new char[100]; char * B_j_debugFile = new char[100]; sprintf(BT_i_debugFile, "BT_i_debug_%d.txt", iam); sprintf(B_j_debugFile, "B_j_debug_%d.txt", iam); BT_i.writeToFile(BT_i_debugFile); B_j.writeToFile(B_j_debugFile); */ make_Sij_denseB ( Asparse, BT_i, B_j, s_BT_i, s_B_j, D, lld_D, AB_sol ); /* char * AB_sol_debugFile = new char[100]; char * D_debugFile = new char[100]; sprintf(AB_sol_debugFile, "AB_sol_debug_%d.txt", iam); sprintf(D_debugFile, "D_debug_%d.txt", iam); printDenseDouble(AB_sol_debugFile, ios::out, Drows*blocksize, Dcols*blocksize, AB_sol); printDenseDouble(D_debugFile, ios::out, Ddim, Ddim, D); cout << iam << " just wrote debug stuff... " << endl; */ blacs_barrier_ ( &ICTXT2D,"ALL" ); if ( iam == 0 ) watch.tack ( cresctime ); if ( iam !=0 ) { Asparse.clear(); pardiso_var.clear(); } //BT_i.clear(); //B_j.clear(); delete[] BT_i; delete[] B_j; blacs_barrier_ ( &ICTXT2D,"ALL" ); /********************** TIMING **********************/ if ( iam == 0 ) watch.tick ( facsctime ); //The Schur complement is factorised (by ScaLAPACK) pdpotrf_ ( "U",&Ddim,D,&i_one,&i_one,DESCD,&info ); if ( info != 0 ) { printf ( "Cholesky decomposition of D was unsuccessful, error returned: %d\n",info ); return -1; } blacs_barrier_ ( &ICTXT2D,"ALL" ); /********************** TIMING **********************/ if ( iam == 0 ) watch.tack ( facsctime ); if ( iam == 0 ) watch.tick ( invsctime ); //The Schur complement is inverteded (by ScaLAPACK) pdpotri_ ( "U",&Ddim,D,&i_one,&i_one,DESCD,&info ); if ( info != 0 ) { printf ( "Inverse of D was unsuccessful, error returned: %d\n",info ); return -1; } blacs_barrier_ ( &ICTXT2D,"A" ); /********************** TIMING **********************/ if ( iam == 0 ) watch.tack ( invsctime ); InvD_T_Block = ( double* ) calloc ( Dblocks * blocksize + Adim ,sizeof ( double ) ); if ( iam == 0 ) watch.tick ( gathrtime ); blacs_barrier_ ( &ICTXT2D,"A" ); /********************** TIMING **********************/ if ( iam == 0 ) watch.tick ( sndrctime ); //Diagonal elements of the (1,1) block of C^-1 are still distributed and here they are gathered in InvD_T_Block in the root process. if ( *position == pcol ) { for ( i=0; i<Ddim; ++i ) { if ( pcol == ( i/blocksize ) % *dims ) { int Dpos = i%blocksize + ( ( i/blocksize ) / *dims ) * blocksize ; * ( InvD_T_Block + Adim +i ) = * ( D + Dpos + lld_D * Dpos ); } } for ( i=0,j=0; i<Dblocks; ++i,++j ) { if ( j==*dims ) j=0; if ( *position==j ) { dgesd2d_ ( &ICTXT2D,&blocksize,&i_one,InvD_T_Block + Adim + i * blocksize,&blocksize,&i_zero,&i_zero ); } if ( *position==0 ) { dgerv2d_ ( &ICTXT2D,&blocksize,&i_one,InvD_T_Block + Adim + blocksize*i,&blocksize,&j,&j ); } } } blacs_barrier_ ( &ICTXT2D,"A" ); /********************** TIMING **********************/ if ( iam == 0 ) watch.tack ( sndrctime ); if ( position != NULL ) { free ( position ); position=NULL; } if ( dims != NULL ) { free ( dims ); dims=NULL; } //Only the root process performs a selected inversion of A. if ( iam==0 ) { watch.tick ( invrAtime ); /*int pardiso_message_level = 1; int pardiso_mtype=-2; ParDiSO pardiso ( pardiso_mtype, pardiso_message_level );*/ int number_of_processors = 1; char* var = getenv ( "OMP_NUM_THREADS" ); if ( var != NULL ) sscanf ( var, "%d", &number_of_processors ); else { printf ( "Set environment OMP_NUM_THREADS to 1" ); exit ( 1 ); } pardiso_var.iparm[2] = 2; pardiso_var.iparm[3] = number_of_processors; pardiso_var.iparm[8] = 0; pardiso_var.iparm[11] = 1; pardiso_var.iparm[13] = 0; pardiso_var.iparm[28] = 0; //This function calculates the factorisation of A once again so this might be optimized. pardiso_var.findInverseOfA ( Asparse ); cout << "Memory allocated by pardiso: " << pardiso_var.memoryAllocated() << endl; printf ( "Processor %d inverted matrix A\n",iam ); watch.tack ( invrAtime ); } blacs_barrier_ ( &ICTXT2D,"A" ); // To minimize memory usage, and because only the diagonal elements of the inverse are needed, X' * S is calculated row by row // the diagonal element is calculated as the dot product of this row and the corresponding column of X. (X is solution of AX=B) XSrow= ( double* ) calloc ( Dcols * blocksize,sizeof ( double ) ); DESCXSROW= ( int* ) malloc ( DLEN_ * sizeof ( int ) ); if ( DESCXSROW==NULL ) { printf ( "unable to allocate memory for descriptor for AB_sol\n" ); return -1; } //XSrow (1,Ddim) is distributed acrros processes of ICTXT2D starting from process (0,0) into blocks of size (1,blocksize) descinit_ ( DESCXSROW, &i_one, &Ddim, &i_one,&blocksize, &i_zero, &i_zero, &ICTXT2D, &i_one, &info ); if ( info!=0 ) { printf ( "Descriptor of matrix C returns info: %d\n",info ); return info; } blacs_barrier_ ( &ICTXT2D,"A" ); if ( iam == 0 ) cout << "Calculating diagonal elements of the first block of the inverse... \n" << endl; blacs_barrier_ ( &ICTXT2D,"A" ); /********************** TIMING **********************/ if ( iam == 0 ) watch.tick ( dotprtime ); //Calculating diagonal elements 1 by 1 of the (0,0)-block of C^-1. for ( i=1; i<=Adim; ++i ) { pdsymm_ ( "R","U",&i_one,&Ddim,&d_one,D,&i_one,&i_one,DESCD,AB_sol,&i,&i_one,DESCAB_sol,&d_zero,XSrow,&i_one,&i_one,DESCXSROW ); pddot_ ( &Ddim,InvD_T_Block+i-1,AB_sol,&i,&i_one,DESCAB_sol,&Adim,XSrow,&i_one,&i_one,DESCXSROW,&i_one ); } blacs_barrier_ ( &ICTXT2D,"A" ); /********************** TIMING **********************/ if ( iam == 0 ) watch.tack ( dotprtime ); if ( D!=NULL ) { free ( D ); D=NULL; } if ( AB_sol!=NULL ) { free ( AB_sol ); AB_sol=NULL; } if ( XSrow !=NULL ) { free ( XSrow ); XSrow=NULL; } if ( DESCD!=NULL ) { free ( DESCD ); DESCD=NULL; } if ( DESCAB_sol!=NULL ) { free ( DESCAB_sol ); DESCAB_sol=NULL; } if ( DESCXSROW!=NULL ) { free ( DESCXSROW ); DESCXSROW=NULL; } //Only in the root process we add the diagonal elements of A^-1 if ( iam ==0 ) { for ( i = 0; i < Adim; i++ ) { j = Asparse.pRows[i]; * ( InvD_T_Block+i ) += Asparse.pData[j]; } /********************** TIMING **********************/ watch.tack ( gathrtime ); watch.tack ( totaltime ); /* //cout << "Extraction completed by "; for (i = 0; i < Ddim; i++) { cout << "Extracting row " << i << "/" << Ddim << endl; //cout << setw(3) << std::setfill('0') << int(i*100.0 / (Ddim-1)) << "%" << "\b\b\b\b"; diagonal[Asparse.nrows + i] = InvD_T_Block[i*Ddim + i]; } cout << endl; */ Asparse.clear(); /* cout << "Extracting diagonal... \n" << endl; cout << "Saving diagonal... \n" << endl; char* diagOutFile = new char[50]; sprintf ( diagOutFile, "diag_inverse_C_parallel_%d.txt", size ); printdense ( Adim+Ddim, 1, InvD_T_Block, diagOutFile ); delete[] diagOutFile; */ } if ( InvD_T_Block !=NULL ) { free ( InvD_T_Block ); InvD_T_Block=NULL; } if ( iam == 0 ) { // Conversion milliseconds -> seconds cresctime /= 1000.0; facsctime /= 1000.0; invsctime /= 1000.0; gathrtime /= 1000.0; invrAtime /= 1000.0; totaltime /= 1000.0; cout << "********************************* TIME REPORT ********************************** \n" << endl; cout << " SCHUR COMPLEMENT BUILDING: " << cresctime << " seconds" << endl; cout << " SCHUR COMPLEMENT FACTORIZATION: " << facsctime << " seconds" << endl; cout << " SCHUR COMPLEMENT INVERSION: " << invsctime << " seconds" << endl; cout << " " << endl; cout << " FINAL OPERATIONS (INVERSION OF A INCLUDED): " << gathrtime << " seconds" << endl; cout << " INVERSION OF A: " << invrAtime << " seconds" << endl; cout << " TOTAL TIME: " << totaltime << " seconds" << endl; cout << "******************************************************************************** \n" << endl; /* * double totaltime = 0.0; // Total execution time * double cresctime = 0.0; // Schur-complement (total) * double facsctime = 0.0; // Schur-complement factorization * double invsctime = 0.0; // Schur-complement inversion * double gathrtime = 0.0; // Last operationsdouble invrAtime = 0.0; // Inversion of A * */ char* timingFile = new char[50]; sprintf ( timingFile, "weak_tests.csv" ); std::fstream timeF; timeF.open ( timingFile, std::fstream::out | std::fstream::app ); timeF.setf ( ios::scientific, ios::floatfield ); //timeF << "PROBLEM SIZE: " << Adim/1000 << "k + " << Ddim/1000 << "k" << endl; //timeF << //"#PROCS,SCHUR_BUILD,SCHUR_FACT,SCHUR_INV,INV(A),FINAL_OPS, //SEND_RECV, DOT_PROD,TOTAL" << endl; timeF << size << "," << cresctime << "," << facsctime << "," << invsctime << "," << invrAtime << "," << gathrtime << "," << sndrctime << "," << dotprtime << "," << totaltime << endl; timeF.close(); } blacs_barrier_ ( &ICTXT2D,"A" ); blacs_gridexit_ ( &ICTXT2D ); } //cout << iam << " reached end before MPI_Barrier" << endl; MPI_Barrier ( MPI_COMM_WORLD ); MPI_Finalize(); return 0; }
int main(int argc, char *argv[]) { // Some constants int minusone = -1; int zero = 0; int one = 1; double dzero = 0.0; // ConText int ConTxt = minusone; // order char order = 'R'; char scope = 'A'; // root process int root = zero; // BLACS/SCALAPACK parameters // the size of the blocks the distributed matrix is split into // (applies to both rows and columns) int mb = 32; int nb = mb; // PDSYEVxxx constraint // the number of rows and columns in the processor grid // only square processor grids due to C vs. Fortran ordering int nprow = 2; int npcol = nprow; // only square processor grids, // starting row and column in grid, do not change int rsrc = zero; int csrc = zero; // dimensions of the matrix to diagonalize int m = 1000; int n = m; // only square matrices int info = zero; // Rest of code will only work for: // nprow = npcol // mb = nb; // m = n; // rsrc = crsc; // Paramteres for Trivial Matrix double alpha = 0.1; // off-diagonal double beta = 75.0; // diagonal // For timing: double tdiag0, tdiag, ttotal0, ttotal; // BLACS Communicator MPI_Comm blacs_comm; int nprocs; int iam; int myrow, mycol; MPI_Init(&argc, &argv); MPI_Barrier(MPI_COMM_WORLD); ttotal0 = MPI_Wtime(); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &iam); if (argc > one) { nprow = strtod(argv[1],NULL); m = strtod(argv[2],NULL); npcol = nprow; n = m; } if (iam == root) { printf("world size %d \n",nprocs); printf("n %d \n", n); printf("nprow %d \n", nprow); printf("npcol %d \n", npcol); } // We can do this on any subcommunicator. #ifdef CartComm int dim[2]; int pbc[2]; dim[0] = nprow; dim[1] = npcol; pbc[0] = 0; pbc[1] = 0; MPI_Cart_create(MPI_COMM_WORLD, 2, dim, pbc, 1, &blacs_comm); #else blacs_comm = MPI_COMM_WORLD; #endif // initialize the grid // The lines below are equivalent to the one call to: if (blacs_comm != MPI_COMM_NULL) { ConTxt = Csys2blacs_handle_(blacs_comm); Cblacs_gridinit_(&ConTxt, &order, nprow, npcol); // get information back about the grid Cblacs_gridinfo_(ConTxt, &nprow, &npcol, &myrow, &mycol); } if (ConTxt != minusone) { int desc[9]; // get the size of the distributed matrix int locM = numroc_(&m, &mb, &myrow, &rsrc, &nprow); int locN = numroc_(&n, &nb, &mycol, &csrc, &npcol); // printf ("locM = %d \n", locM); // printf ("locN = %d \n", locN); int lld = MAX(one,locM); // build the descriptor descinit_(desc, &m, &n, &mb, &nb, &rsrc, &csrc, &ConTxt, &lld, &info); // Allocate arrays // eigenvalues double* eigvals = malloc(n * sizeof(double)); // allocate the distributed matrices double* mata = malloc(locM*locN * sizeof(double)); // allocate the distributed matrix of eigenvectors double* z = malloc(locM*locN * sizeof(double)); // Eigensolver parameters int ibtype = one; char jobz = 'V'; // eigenvectors also char range = 'A'; // all eiganvalues char uplo = 'L'; // work with upper double vl, vu; int il, iu; char cmach = 'U'; double abstol = 2.0 * pdlamch_(&ConTxt, &cmach); int eigvalm, nz; double orfac = -1.0; //double orfac = 0.001; int* ifail; ifail = malloc(m * sizeof(int)); int* iclustr; iclustr = malloc(2*nprow*npcol * sizeof(int)); double* gap; gap = malloc(nprow*npcol * sizeof(double)); double* work; work = malloc(3 * sizeof(double)); int querylwork = minusone; int* iwork; iwork = malloc(1 * sizeof(int)); int queryliwork = minusone; // Build a trivial distributed matrix: Diagonal matrix pdlaset_(&uplo, &m, &n, &alpha, &beta, mata, &one, &one, desc); // First there is a workspace query // pdsyevx_(&jobz, &range, &uplo, &n, mata, &one, &one, desc, &vl, // &vu, &il, &iu, &abstol, &eigvalm, &nz, eigvals, &orfac, z, &one, // &one, desc, work, &querylwork, iwork, &queryliwork, ifail, iclustr, gap, &info); pdsyevd_(&jobz, &uplo, &n, mata, &one, &one, desc, eigvals, z, &one, &one, desc, work, &querylwork, iwork, &queryliwork, &info); //pdsyev_(&jobz, &uplo, &m, mata, &one, &one, desc, eigvals, // z, &one, &one, desc, work, &querylwork, &info); int lwork = (int)work[0]; //printf("lwork %d\n", lwork); free(work); int liwork = (int)iwork[0]; //printf("liwork %d\n", liwork); free(iwork); work = (double*)malloc(lwork * sizeof(double)); iwork = (int*)malloc(liwork * sizeof(int)); // This is actually diagonalizes the matrix // pdsyevx_(&jobz, &range, &uplo, &n, mata, &one, &one, desc, &vl, // &vu, &il, &iu, &abstol, &eigvalm, &nz, eigvals, &orfac, z, &one, // &one, desc, work, &lwork, iwork, &liwork, ifail, iclustr, gap, &info); Cblacs_barrier(ConTxt, &scope); tdiag0 = MPI_Wtime(); pdsyevd_(&jobz, &uplo, &n, mata, &one, &one, desc, eigvals, z, &one, &one, desc, work, &lwork, iwork, &liwork, &info); //pdsyev_(&jobz, &uplo, &m, mata, &one, &one, desc, eigvals, // z, &one, &one, desc, work, &lwork, &info); Cblacs_barrier(ConTxt, &scope); tdiag = MPI_Wtime() - tdiag0; free(work); free(iwork); free(gap); free(iclustr); free(ifail); free(z); free(mata); // Destroy BLACS grid Cblacs_gridexit_(ConTxt); // Check eigenvalues if (myrow == zero && mycol == zero) { for (int i = 0; i < n; i++) { if (fabs(eigvals[i] - beta) > 0.0001) printf("Problem: eigval %d != %f5.2 but %f\n", i, beta, eigvals[i]); } if (info != zero) { printf("info = %d \n", info); } printf("Time (s) diag: %f\n", tdiag); } free(eigvals); } MPI_Barrier(MPI_COMM_WORLD); ttotal = MPI_Wtime() - ttotal0; if (iam == 0) printf("Time (s) total: %f\n", ttotal); MPI_Finalize(); }
int main(int argc, char **argv) { int ictxt, nside, ngrid, nblock, nthread; int rank, size; int ic, ir, nc, nr; int i, j; char *fname; int info, ZERO=0, ONE=1; struct timeval st, et; double dtnn, dtnt, dttn, dttt; double gfpc_nn, gfpc_nt, gfpc_tn, gfpc_tt; /* Initialising MPI stuff */ MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size); printf("Process %i of %i.\n", rank, size); /* Parsing arguments */ if(argc < 6) { exit(-3); } nside = atoi(argv[1]); ngrid = atoi(argv[2]); nblock = atoi(argv[3]); nthread = atoi(argv[4]); fname = argv[5]; if(rank == 0) { printf("Multiplying matrices of size %i x %i\n", nside, nside); printf("Process grid size %i x %i\n", ngrid, ngrid); printf("Block size %i x %i\n", nblock, nblock); printf("Using %i OpenMP threads\n", nthread); } #ifdef _OPENMP if(rank == 0) printf("Setting OMP_NUM_THREADS=%i\n", nthread); omp_set_num_threads(nthread); #endif /* Setting up BLACS */ Cblacs_pinfo( &rank, &size ) ; Cblacs_get(-1, 0, &ictxt ); Cblacs_gridinit(&ictxt, "Row", ngrid, ngrid); Cblacs_gridinfo(ictxt, &nr, &nc, &ir, &ic); int descA[9], descB[9], descC[9]; /* Fetch local array sizes */ int Ar, Ac, Br, Bc, Cr, Cc; Ar = numroc_( &nside, &nblock, &ir, &ZERO, &nr); Ac = numroc_( &nside, &nblock, &ic, &ZERO, &nc); Br = numroc_( &nside, &nblock, &ir, &ZERO, &nr); Bc = numroc_( &nside, &nblock, &ic, &ZERO, &nc); Cr = numroc_( &nside, &nblock, &ir, &ZERO, &nr); Cc = numroc_( &nside, &nblock, &ic, &ZERO, &nc); printf("Local array section %i x %i\n", Ar, Ac); /* Set descriptors */ descinit_(descA, &nside, &nside, &nblock, &nblock, &ZERO, &ZERO, &ictxt, &Ar, &info); descinit_(descB, &nside, &nside, &nblock, &nblock, &ZERO, &ZERO, &ictxt, &Br, &info); descinit_(descC, &nside, &nside, &nblock, &nblock, &ZERO, &ZERO, &ictxt, &Cr, &info); /* Initialise and fill arrays */ double *A = (double *)malloc(Ar*Ac*sizeof(double)); double *B = (double *)malloc(Br*Bc*sizeof(double)); double *C = (double *)malloc(Cr*Cc*sizeof(double)); for(i = 0; i < Ar; i++) { for(j = 0; j < Ac; j++) { A[j*Ar + i] = drand48(); B[j*Br + i] = drand48(); C[j*Cr + i] = 0.0; } } double alpha = 1.0, beta = 0.0; //======================== if(rank == 0) printf("Starting multiplication (NN).\n"); Cblacs_barrier(ictxt,"A"); gettimeofday(&st, NULL); pdgemm_("N", "N", &nside, &nside, &nside, &alpha, A, &ONE, &ONE, descA, B, &ONE, &ONE, descB, &beta, C, &ONE, &ONE, descC ); Cblacs_barrier(ictxt,"A"); gettimeofday(&et, NULL); dtnn = (double)((et.tv_sec-st.tv_sec) + (et.tv_usec-st.tv_usec)*1e-6); gfpc_nn = 2.0*pow(nside, 3) / (dtnn * 1e9 * ngrid * ngrid * nthread); if(rank == 0) printf("Done.\n=========\nTime taken: %g s\nGFlops per core: %g\n=========\n", dtnn, gfpc_nn); //======================== //======================== if(rank == 0) printf("Starting multiplication (NT).\n"); Cblacs_barrier(ictxt,"A"); gettimeofday(&st, NULL); pdgemm_("N", "T", &nside, &nside, &nside, &alpha, A, &ONE, &ONE, descA, B, &ONE, &ONE, descB, &beta, C, &ONE, &ONE, descC ); Cblacs_barrier(ictxt,"A"); gettimeofday(&et, NULL); dtnt = (double)((et.tv_sec-st.tv_sec) + (et.tv_usec-st.tv_usec)*1e-6); gfpc_nt = 2.0*pow(nside, 3) / (dtnt * 1e9 * ngrid * ngrid * nthread); if(rank == 0) printf("Done.\n=========\nTime taken: %g s\nGFlops per core: %g\n=========\n", dtnt, gfpc_nt); //======================== //======================== if(rank == 0) printf("Starting multiplication (TN).\n"); Cblacs_barrier(ictxt,"A"); gettimeofday(&st, NULL); pdgemm_("T", "N", &nside, &nside, &nside, &alpha, A, &ONE, &ONE, descA, B, &ONE, &ONE, descB, &beta, C, &ONE, &ONE, descC ); Cblacs_barrier(ictxt,"A"); gettimeofday(&et, NULL); dttn = (double)((et.tv_sec-st.tv_sec) + (et.tv_usec-st.tv_usec)*1e-6); gfpc_tn = 2.0*pow(nside, 3) / (dttn * 1e9 * ngrid * ngrid * nthread); if(rank == 0) printf("Done.\n=========\nTime taken: %g s\nGFlops per core: %g\n=========\n", dttn, gfpc_tn); //======================== //======================== if(rank == 0) printf("Starting multiplication (TT).\n"); Cblacs_barrier(ictxt,"A"); gettimeofday(&st, NULL); pdgemm_("T", "T", &nside, &nside, &nside, &alpha, A, &ONE, &ONE, descA, B, &ONE, &ONE, descB, &beta, C, &ONE, &ONE, descC ); Cblacs_barrier(ictxt,"A"); gettimeofday(&et, NULL); dttt = (double)((et.tv_sec-st.tv_sec) + (et.tv_usec-st.tv_usec)*1e-6); gfpc_tt = 2.0*pow(nside, 3) / (dttt * 1e9 * ngrid * ngrid * nthread); if(rank == 0) printf("Done.\n=========\nTime taken: %g s\nGFlops per core: %g\n=========\n", dttt, gfpc_tt); //======================== if(rank == 0) { FILE * fd; fd = fopen(fname, "w"); fprintf(fd, "%g %g %g %g %i %i %i %i %g %g %g %g\n", gfpc_nn, gfpc_nt, gfpc_tn, gfpc_tt, nside, ngrid, nblock, nthread, dtnn, dtnt, dttn, dttt); fclose(fd); } Cblacs_gridexit( 0 ); MPI_Finalize(); }