void blacs_gridinit_nektar(int *BLACS_PARAMS, int *DESCA, int *DESCB)
{
    int i, j, k;
    int i_zero = 0, i_one = 1, i_negone = -1;

    /* BLACS_PARAMS:
       [0]  = ictxt;          [1]  = my_proc;        [2]  = total_procs;
       [3]  = Nproc_row;      [4]  = Nproc_col;      [5]  = my_row;
       [6]  = my_col;         [7]  = Global_rows;    [8]  = Global_columns;
       [9]  = Block_Size_row; [10] = Block_Size_col; [11] = LOC_rows;
       [12] = LOC_columns; */

    blacs_pinfo_(&i, &j);
    BLACS_PARAMS[1] = i;
    BLACS_PARAMS[2] = j;

    /* ictxt = -1, what = 0: query the default (system) context */
    blacs_get_(&i_negone, &i_zero, &k);
    BLACS_PARAMS[0] = k;

    i = BLACS_PARAMS[3];
    j = BLACS_PARAMS[4];
    blacs_gridinit_(&BLACS_PARAMS[0], "Row", &i, &j);
    blacs_gridinfo_(&BLACS_PARAMS[0], &BLACS_PARAMS[3], &BLACS_PARAMS[4], &i, &j);
    BLACS_PARAMS[5] = i;
    BLACS_PARAMS[6] = j;

    i = 0; /* the first row/column of the global matrix lives in process row/column 0 */
    BLACS_PARAMS[11] = numroc_(&BLACS_PARAMS[7], &BLACS_PARAMS[9],
                               &BLACS_PARAMS[5], &i, &BLACS_PARAMS[3]);
    BLACS_PARAMS[12] = numroc_(&BLACS_PARAMS[8], &BLACS_PARAMS[10],
                               &BLACS_PARAMS[6], &i, &BLACS_PARAMS[4]);

    i = 0;
    j = 0;
    descinit_(DESCA, &BLACS_PARAMS[7], &BLACS_PARAMS[8], &BLACS_PARAMS[9],
              &BLACS_PARAMS[10], &i, &j, &BLACS_PARAMS[0], &BLACS_PARAMS[11], &k);
    if (k != 0)
        fprintf(stderr, "blacs_gridinit_nektar: ERROR, descinit(info) = %d \n", k);

    /* Descriptor for the right-hand side: one column, column block size 1 */
    descinit_(DESCB, &BLACS_PARAMS[7], &i_one, &BLACS_PARAMS[9], &i_one,
              &i, &j, &BLACS_PARAMS[0], &BLACS_PARAMS[11], &k);
    if (k != 0)
        fprintf(stderr, "blacs_gridinit_nektar: ERROR, descinit(info) = %d \n", k);
}
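/* A minimal usage sketch for blacs_gridinit_nektar: the caller fills the grid
   shape, global matrix size and block sizes in BLACS_PARAMS before the call.
   The concrete values (2x2 grid, 1000x1000 matrix, 64x64 blocks) and the
   wrapper function name are illustrative assumptions, not taken from the
   original code. */
void example_grid_setup(void)
{
    int BLACS_PARAMS[13] = {0};
    int DESCA[9], DESCB[9];

    BLACS_PARAMS[3]  = 2;    /* Nproc_row */
    BLACS_PARAMS[4]  = 2;    /* Nproc_col */
    BLACS_PARAMS[7]  = 1000; /* Global_rows */
    BLACS_PARAMS[8]  = 1000; /* Global_columns */
    BLACS_PARAMS[9]  = 64;   /* Block_Size_row */
    BLACS_PARAMS[10] = 64;   /* Block_Size_col */

    blacs_gridinit_nektar(BLACS_PARAMS, DESCA, DESCB);
    /* On return, BLACS_PARAMS[11]/[12] hold the local row/column counts
       computed by numroc_, and DESCA/DESCB are ready for ScaLAPACK calls. */
}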
int main(int argc, char **argv)
{
#define test_A(i,j)  test_A[(size_t)(j)*N+(i)]
#define test_A2(i,j) test_A2[(size_t)(j)*N+(i)]
    int N, NB, w, LDA, BB;
    size_t memsize; /* bytes */
    int iam, nprocs, mydevice;
    int ICTXT, nprow, npcol, myprow, mypcol;
    int i_one = 1, i_zero = 0, i_negone = -1;
    double d_one = 1.0, d_zero = 0.0, d_negone = -1.0;
    int IASEED = 100;

/*
    printf("N=?\n");
    scanf("%d", &N);
    printf("NB=?\n");
    scanf("%d", &NB);
    printf("width of Y panel=?\n");
    scanf("%d", &w);
*/
    if (argc < 4) {
        printf("invalid arguments N NB memsize(M)\n");
        exit(1);
    }
    N = atoi(argv[1]);
    NB = atoi(argv[2]);
    memsize = (size_t)atoi(argv[3]) * 1024 * 1024;
    BB = (N + NB - 1) / NB;
    w = memsize / sizeof(double) / BB / NB / NB - 1;
    assert(w > 0);
    LDA = N + 0; /* padding */

    int do_io = (N <= NSIZE);
    double llttime;
    double gflops;

    nprow = npcol = 1;
    blacs_pinfo_(&iam, &nprocs);
    blacs_get_(&i_negone, &i_zero, &ICTXT);
    blacs_gridinit_(&ICTXT, "R", &nprow, &npcol);
    blacs_gridinfo_(&ICTXT, &nprow, &npcol, &myprow, &mypcol);

#ifdef USE_MIC
  #ifdef __INTEL_OFFLOAD
    printf("offload compilation enabled\ninitialize each MIC\n");
    offload_init(&iam, &mydevice);
    #pragma offload target(mic:0)
    {
        mkl_peak_mem_usage(MKL_PEAK_MEM_ENABLE);
    }
  #else
    if (isroot)
        printf("offload compilation not enabled\n");
    exit(0);
  #endif
#else
  #ifdef USE_CUBLASV2
    {
        cublasStatus_t cuStatus;
        for (int r = 0; r < OOC_NTHREADS; r++) {
            cuStatus = cublasCreate(&worker_handle[r]);
            assert(cuStatus == CUBLAS_STATUS_SUCCESS);
        }
    }
  #else
    cublasInit();
  #endif
#endif

    double *test_A = (double*)memalign(64, (size_t)LDA * N * sizeof(double)); /* for chol */
#ifdef VERIFY
    double *test_A2 = (double*)memalign(64, (size_t)LDA * N * sizeof(double)); /* for verify */
#endif

    /* Initialize A */
    int i, j;
    printf("Initialize A ... ");
    fflush(stdout);
    llttime = MPI_Wtime();
    pdmatgen(&ICTXT, "Symm", "Diag", &N, &N, &NB, &NB, test_A, &LDA,
             &i_zero, &i_zero, &IASEED, &i_zero, &N, &i_zero, &N,
             &myprow, &mypcol, &nprow, &npcol);
    llttime = MPI_Wtime() - llttime;
    printf("time %lf\n", llttime);

    /* print test_A */
    if (do_io) {
        printf("Original A=\n\n");
        matprint(test_A, N, LDA, 'A');
    }

    /* Use directed unblocked Cholesky factorization */
/*
    t1 = clock();
    Test_dpotrf(test_A2, N);
    t2 = clock();
    printf("time for unblocked Cholesky factorization on host %f \n",
           ((float)(t2 - t1)) / CLOCKS_PER_SEC);
*/
    /* print test_A */
/*
    if (do_io) {
        printf("Unblocked result:\n\n");
        matprint(test_A2, N, 'L');
    }
*/

    /* Use tile algorithm */
    Quark *quark = QUARK_New(OOC_NTHREADS);
    QUARK_DOT_DAG_Enable(quark, 0);
#ifdef USE_MIC
    // mklmem(NB);
    printf("QUARK MIC affinity binding\n");
    QUARK_bind(quark);
    printf("offload warm up\n");
    warmup(quark);
#endif
    QUARK_DOT_DAG_Enable(quark, quark_getenv_int("QUARK_DOT_DAG_ENABLE", 0));
    printf("LLT start %lf\n", MPI_Wtime());
    llttime = Cholesky(quark, test_A, N, NB, LDA, memsize);
    printf("LLT end %lf\n", MPI_Wtime());
    QUARK_Delete(quark);

#ifdef USE_MIC
    offload_destroy();
#else
  #ifdef USE_CUBLASV2
    {
        cublasStatus_t cuStatus;
        for (int r = 0; r < OOC_NTHREADS; r++) {
            cuStatus = cublasDestroy(worker_handle[r]);
            assert(cuStatus == CUBLAS_STATUS_SUCCESS);
        }
    }
  #else
    cublasShutdown();
  #endif
#endif

    /* flops(chol) ~ N^3/3 + N^2/2, reported per second */
    gflops = (double)N;
    gflops = gflops / 3.0 + 0.5;
    gflops = gflops * (double)N * (double)N;
    gflops = gflops / llttime / 1024.0 / 1024.0 / 1024.0;
    printf("N NB memsize(MB) quark_pthreads time Gflops\n%d %d %lf %d %lf %lf\n",
           N, NB, (double)memsize / 1024 / 1024, OOC_NTHREADS, llttime, gflops);

#ifdef USE_MIC
    #pragma offload target(mic:0)
    {
        memsize = mkl_peak_mem_usage(MKL_PEAK_MEM_RESET);
    }
    printf("mkl_peak_mem_usage %lf MB\n", (double)memsize / 1024.0 / 1024.0);
#endif

    /* Update and print L */
    if (do_io) {
        printf("L:\n\n");
        matprint(test_A, N, LDA, 'L');
    }

#ifdef VERIFY
    printf("Verify... ");
    llttime = MPI_Wtime();
    /*
     * ------------------------
     * check difference between
     * test_A and test_A2
     * ------------------------
     */
/*
    {
        double maxerr = 0;
        double maxerr2 = 0;
        for (j = 0; j < N; j++) {
            for (i = j; i < N; i++) {
                double err = (test_A(i, j) - test_A2(i, j));
                err = ABS(err);
                maxerr = MAX(err, maxerr);
                maxerr2 = maxerr2 + err * err;
            }
        }
        maxerr2 = sqrt(ABS(maxerr2));
        printf("max difference between test_A and test_A2 %lf \n", maxerr);
        printf("L2 difference between test_A and test_A2 %lf \n", maxerr2);
    }
*/
    /*
     * ------------------
     * over-write test_A2
     * ------------------
     */
    pdmatgen(&ICTXT, "Symm", "Diag", &N, &N, &NB, &NB, test_A2, &LDA,
             &i_zero, &i_zero, &IASEED, &i_zero, &N, &i_zero, &N,
             &myprow, &mypcol, &nprow, &npcol);
    /*
     * ---------------------------------------
     * after solve, test_A2 should be identity
     * ---------------------------------------
     */
    /* test_A  = chol(B) = L;
       test_A2 = B
       solve L*L'*X = B
       if L is correct, X is identity */
    {
        int uplo = 'L';
        const char *uplo_char = ((uplo == (int)'U') || (uplo == (int)'u')) ? "U" : "L";
        int info = 0;
        int nrhs = N;
        int LDA = N;
        int ldb = N;
        dpotrs(uplo_char, &N, &nrhs, test_A, &LDA, test_A2, &ldb, &info);
        assert(info == 0);
    }
    {
        double maxerr = 0;
        double maxerr2 = 0;
        for (j = 0; j < N; j++) {
            for (i = 0; i < N; i++) {
                double eyeij = (i == j) ? 1.0 : 0.0;
                double err = (test_A2(i, j) - eyeij);
                err = ABS(err);
                maxerr = MAX(maxerr, err);
                maxerr2 = maxerr2 + err * err;
            }
        }
        maxerr2 = sqrt(ABS(maxerr2));
        printf("time %lf\n", MPI_Wtime() - llttime);
        printf("max error %lf \n", maxerr);
        printf("max L2 error %lf \n", maxerr2);
    }
#endif

    free(test_A); test_A = NULL;
#ifdef VERIFY
    free(test_A2); test_A2 = NULL;
#endif
    blacs_gridexit_(&ICTXT);
    blacs_exit_(&i_zero);
    return 0;
#undef test_A
#undef test_A2
}
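/* The test above leans on a few helpers defined elsewhere in the repository
   (NSIZE, ABS, MAX, matprint). A minimal sketch of definitions that would
   satisfy these call sites follows; the names come from the code above, the
   bodies and the NSIZE cutoff are assumptions and may differ from the real
   project versions. */
#define NSIZE 32                         /* print matrices only up to this size (assumed) */
#define ABS(x)   ((x) < 0 ? -(x) : (x))
#define MAX(a,b) ((a) > (b) ? (a) : (b))

/* Hypothetical matprint: dump an N x N column-major matrix with leading
   dimension LDA; uplo == 'L' prints only the lower triangle. */
void matprint(const double *A, int N, int LDA, char uplo)
{
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            double v = (uplo == 'L' && j > i) ? 0.0 : A[(size_t)j * LDA + i];
            printf("%10.4f ", v);
        }
        printf("\n");
    }
}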
int main(int argc, char **argv)
{
    int info, i, j, pcol, Adim;
    double *D;
    int *DESCD;
    CSRdouble BT_i, B_j, Xsparse, Zsparse, Btsparse;
    /*BT_i.allocate(0,0,0);
      B_j.allocate(0,0,0);
      Xsparse.allocate(0,0,0);
      Zsparse.allocate(0,0,0);
      Btsparse.allocate(0,0,0);*/

    //Initialise MPI and some MPI variables
    info = MPI_Init(&argc, &argv);
    if (info != 0) {
        printf("Error in MPI initialisation: %d\n", info);
        return info;
    }

    position = (int*)calloc(2, sizeof(int));
    if (position == NULL) {
        printf("unable to allocate memory for processor position coordinate\n");
        return EXIT_FAILURE;
    }
    dims = (int*)calloc(2, sizeof(int));
    if (dims == NULL) {
        printf("unable to allocate memory for grid dimensions coordinate\n");
        return EXIT_FAILURE;
    }

    //BLACS is the interface used by PBLAS and ScaLAPACK on top of MPI
    blacs_pinfo_(&iam, &size);              //determine the number of processes involved
    info = MPI_Dims_create(size, 2, dims);  //determine the best 2D cartesian grid for that many processes
    if (info != 0) {
        printf("Error in MPI creation of dimensions: %d\n", info);
        return info;
    }

    //For now the code can only work with square process grids,
    //so we use the biggest square grid possible with the number of processes involved.
    if (*dims != *(dims+1)) {
        while (*dims * *dims > size)
            *dims -= 1;
        *(dims+1) = *dims;
        if (iam == 0)
            printf("WARNING: %d processor(s) unused due to reformatting to a square process grid\n",
                   size - (*dims * *dims));
        size = *dims * *dims;
        //cout << "New size of process grid: " << size << endl;
    }

    blacs_get_(&i_negone, &i_zero, &ICTXT2D);
    //Initialisation of the BLACS process grid, which is referenced as ICTXT2D
    blacs_gridinit_(&ICTXT2D, "R", dims, dims+1);

    if (iam < size) {
        //The rank (iam) of the process is mapped to a 2D grid:
        //position = (process row, process column)
        blacs_pcoord_(&ICTXT2D, &iam, position, position+1);
        if (*position == -1) {
            printf("Error in process grid\n");
            return -1;
        }

        //Filenames, dimensions of all matrices and other important variables
        //are read in as global variables (see src/readinput.cpp)
        info = read_input(*++argv);
        if (info != 0) {
            printf("Something went wrong when reading input file for processor %d\n", iam);
            return -1;
        }

        //blacs_barrier keeps every process here until all processes have reached this point.
        blacs_barrier_(&ICTXT2D, "ALL");
        if (*(position+1) == 0 && *position == 0)
            printf("Reading of input-file successful\n");

        if (*(position+1) == 0 && *position == 0) {
            printf("\nA linear mixed model with %d observations, %d genotypes, %d random effects and %d fixed effects\n",
                   n, k, m, l);
            printf("was analyzed using %d (%d x %d) processors\n", size, *dims, *(dims+1));
        }

        //Dimension of A (sparse matrix) is the number of fixed effects (m) plus the sparse random effects (l)
        Adim = m + l;
        //Dimension of D (dense matrix) is the number of dense effects (k)
        Ddim = k;
        pcol = *(position+1);

        //Number of blocks needed to store a complete column/row of D
        Dblocks = Ddim % blocksize == 0 ? Ddim / blocksize : Ddim / blocksize + 1;
        //Number of row blocks needed by this process to store its part of the dense matrix D
        Drows = (Dblocks - *position) % *dims == 0 ? (Dblocks - *position) / *dims
                                                   : (Dblocks - *position) / *dims + 1;
        Drows = Drows < 1 ? 1 : Drows;
        //Number of column blocks needed by this process to store its part of the dense matrix D
        Dcols = (Dblocks - pcol) % *(dims+1) == 0 ? (Dblocks - pcol) / *(dims+1)
                                                  : (Dblocks - pcol) / *(dims+1) + 1;
        Dcols = Dcols < 1 ? 1 : Dcols;

        //Local leading dimension of D (matrices are always stored column-wise)
        lld_D = Drows * blocksize;

        //Initialise the descriptor of the dense distributed matrix
        DESCD = (int*)malloc(DLEN_ * sizeof(int));
        if (DESCD == NULL) {
            printf("unable to allocate memory for descriptor for C\n");
            return -1;
        }
        //D with dimensions (Ddim,Ddim) is distributed over all processes in ICTXT2D,
        //with the first element in process (0,0). D is distributed into blocks of size
        //(blocksize,blocksize), with local leading dimension lld_D in this process.
        descinit_(DESCD, &Ddim, &Ddim, &blocksize, &blocksize, &i_zero, &i_zero,
                  &ICTXT2D, &lld_D, &info);
        if (info != 0) {
            printf("Descriptor of matrix C returns info: %d\n", info);
            return info;
        }

        //Allocate the space necessary to store the part of D held in memory by this process.
        D = (double*)calloc(Drows * blocksize * Dcols * blocksize, sizeof(double));
        if (D == NULL) {
            printf("unable to allocate memory for Matrix D (required: %ld bytes)\n",
                   Drows * blocksize * Dcols * blocksize * sizeof(double));
            return EXIT_FAILURE;
        }

        blacs_barrier_(&ICTXT2D, "ALL");
        if (iam == 0)
            printf("Start set up of B & D\n");
        blacs_barrier_(&ICTXT2D, "ALL");

        //set_up_BD is declared in readdist.cpp and constructs, in each process, the parts
        //of matrices B & D necessary to create the distributed Schur complement of D
        info = set_up_BD(DESCD, D, BT_i, B_j, Btsparse);
        //printdense(Drows*blocksize, Dcols*blocksize, D, "matrix_D.txt");

        blacs_barrier_(&ICTXT2D, "ALL");
        if (iam == 0)
            printf("Matrices B & D set up\n");

        if (printD_bool) {
            int array_of_gsizes[2], array_of_distribs[2], array_of_dargs[2], array_of_psize[2];
            int buffersize;
            MPI_Datatype file_type;
            MPI_File fh;
            MPI_Status status;

            array_of_gsizes[0] = Dblocks * blocksize;
            array_of_gsizes[1] = Dblocks * blocksize;
            array_of_distribs[0] = MPI_DISTRIBUTE_CYCLIC;
            array_of_distribs[1] = MPI_DISTRIBUTE_CYCLIC;
            array_of_dargs[0] = blocksize;
            array_of_dargs[1] = blocksize;
            array_of_psize[0] = *dims;
            array_of_psize[1] = *(dims+1);

            MPI_Type_create_darray(size, iam, 2, array_of_gsizes, array_of_distribs,
                                   array_of_dargs, array_of_psize, MPI_ORDER_FORTRAN,
                                   MPI_DOUBLE, &file_type);
            MPI_Type_commit(&file_type);
            info = MPI_File_open(MPI_COMM_WORLD, filenameD,
                                 MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
            /*if ( (Drows-1) % *(dims+1) == *position && (Dcols-1) % *(dims) == pcol && Ddim%blocksize != 0 )
                buffersize = ((Drows-1) * blocksize + Ddim % blocksize) * ((Dcols-1) * blocksize + Ddim % blocksize);
            else if ( (Drows-1) % *(dims+1) == *position && Ddim%blocksize != 0 )
                buffersize = ((Drows-1) * blocksize + Ddim % blocksize) * Dcols * blocksize;
            else if ( (Dcols-1) % *(dims) == *position && Ddim%blocksize != 0 )
                buffersize = ((Dcols-1) * blocksize + Ddim % blocksize) * Drows * blocksize;
            else*/
            buffersize = Dcols * Drows * blocksize * blocksize;

            MPI_File_set_view(fh, 0, MPI_DOUBLE, file_type, "native", MPI_INFO_NULL);
            info = MPI_File_write_all(fh, D, buffersize, MPI_DOUBLE, &status);
            MPI_File_close(&fh);
            if (iam == 0)
                printf("Matrix D (dimension %d) is printed in file %s\n",
                       Dblocks * blocksize, filenameD);
            if (filenameD != NULL)
                free(filenameD);
            filenameD = NULL;
            //delete[] array_of_gsizes, delete[] array_of_distribs, delete[] array_of_dargs, delete[] array_of_psize;
        }

        //Now every process has to set up the sparse matrix A,
        //consisting of X'X, X'Z, Z'X and Z'Z + lambda*I
        Xsparse.loadFromFile(filenameX);
        Zsparse.loadFromFile(filenameZ);
        if (filenameX != NULL)
            free(filenameX);
        filenameX = NULL;
        if (filenameZ != NULL)
            free(filenameZ);
        filenameZ = NULL;

        smat_t *X_smat, *Z_smat;
        X_smat = smat_new_from(Xsparse.nrows, Xsparse.ncols, Xsparse.pRows,
                               Xsparse.pCols, Xsparse.pData, 0, 0);
        Z_smat = smat_new_from(Zsparse.nrows, Zsparse.ncols, Zsparse.pRows,
                               Zsparse.pCols, Zsparse.pData, 0, 0);

        smat_t *Xt_smat, *Zt_smat;
        Xt_smat = smat_copy_trans(X_smat);
        Zt_smat = smat_copy_trans(Z_smat);

        CSRdouble Asparse;
        smat_t *XtX_smat, *XtZ_smat, *ZtZ_smat, *lambda_smat, *ZtZlambda_smat;
        XtX_smat = smat_matmul(Xt_smat, X_smat);
        XtZ_smat = smat_matmul(Xt_smat, Z_smat);
        ZtZ_smat = smat_matmul(Zt_smat, Z_smat);

        Xsparse.clear();
        Zsparse.clear();
        smat_free(Xt_smat);
        smat_free(Zt_smat);
        /*smat_free(X_smat);
          smat_free(Z_smat);*/

        CSRdouble Imat;
        makeIdentity(l, Imat);
        lambda_smat = smat_new_from(Imat.nrows, Imat.ncols, Imat.pRows,
                                    Imat.pCols, Imat.pData, 0, 0);
        smat_scale_diag(lambda_smat, -lambda);
        ZtZlambda_smat = smat_add(lambda_smat, ZtZ_smat);
        smat_free(ZtZ_smat);
        //smat_free(lambda_smat);

        smat_to_symmetric_structure(XtX_smat);
        smat_to_symmetric_structure(ZtZlambda_smat);

        CSRdouble XtX_sparse, XtZ_sparse, ZtZ_sparse;
        XtX_sparse.make2(XtX_smat->m, XtX_smat->n, XtX_smat->nnz,
                         XtX_smat->ia, XtX_smat->ja, XtX_smat->a);
        XtZ_sparse.make2(XtZ_smat->m, XtZ_smat->n, XtZ_smat->nnz,
                         XtZ_smat->ia, XtZ_smat->ja, XtZ_smat->a);
        ZtZ_sparse.make2(ZtZlambda_smat->m, ZtZlambda_smat->n, ZtZlambda_smat->nnz,
                         ZtZlambda_smat->ia, ZtZlambda_smat->ja, ZtZlambda_smat->a);
        /*smat_free(XtX_smat);
          smat_free(XtZ_smat);
          smat_free(ZtZlambda_smat);*/
        Imat.clear();

        if (iam == 0) {
            cout << "***                                          [ t     t ]   ***" << endl;
            cout << "***                                          [X X   X Z]   ***" << endl;
            cout << "***                                          [         ]   ***" << endl;
            cout << "*** G e n e r a t i n g   m a t r i x  A  =  [         ]   ***" << endl;
            cout << "***                                          [ t     t ]   ***" << endl;
            cout << "***                                          [Z X   Z Z]   ***" << endl;
        }

        //Sparse matrix A only contains the upper triangular part of A
        create2x2SymBlockMatrix(XtX_sparse, XtZ_sparse, ZtZ_sparse, Asparse);
        //Asparse.writeToFile("A_sparse.csr");
        smat_free(XtX_smat);
        smat_free(XtZ_smat);
        smat_free(ZtZlambda_smat);
        XtX_sparse.clear();
        XtZ_sparse.clear();
        ZtZ_sparse.clear();

        blacs_barrier_(&ICTXT2D, "ALL");

        if (printsparseC_bool) {
            CSRdouble Dmat, Dblock, Csparse;
            Dblock.nrows = Dblocks * blocksize;
            Dblock.ncols = Dblocks * blocksize;
            Dblock.allocate(Dblocks * blocksize, Dblocks * blocksize, 0);
            Dmat.allocate(0, 0, 0);
            for (i = 0; i < Drows; ++i) {
                for (j = 0; j < Dcols; ++j) {
                    dense2CSR_sub(D + i * blocksize + j * lld_D * blocksize,
                                  blocksize, blocksize, lld_D, Dblock,
                                  (*(dims) * i + *position) * blocksize,
                                  (*(dims+1) * j + pcol) * blocksize);
                    if (Dblock.nonzeros > 0) {
                        if (Dmat.nonzeros == 0) {
                            Dmat.make2(Dblock.nrows, Dblock.ncols, Dblock.nonzeros,
                                       Dblock.pRows, Dblock.pCols, Dblock.pData);
                        } else {
                            Dmat.addBCSR(Dblock);
                        }
                    }
                    Dblock.clear();
                }
            }
            blacs_barrier_(&ICTXT2D, "A");
            if (iam != 0) {
                //Each process other than root sends its Dmat to the root process.
                MPI_Send(&(Dmat.nonzeros), 1, MPI_INT, 0, iam, MPI_COMM_WORLD);
                MPI_Send(&(Dmat.pRows[0]), Dmat.nrows + 1, MPI_INT, 0, iam + size, MPI_COMM_WORLD);
                MPI_Send(&(Dmat.pCols[0]), Dmat.nonzeros, MPI_INT, 0, iam + 2*size, MPI_COMM_WORLD);
                MPI_Send(&(Dmat.pData[0]), Dmat.nonzeros, MPI_DOUBLE, 0, iam + 3*size, MPI_COMM_WORLD);
                Dmat.clear();
            } else {
                for (i = 1; i < size; ++i) {
                    //The root process receives parts of Dmat sequentially from all
                    //processes and directly adds them together.
                    int nonzeroes, count;
                    MPI_Recv(&nonzeroes, 1, MPI_INT, i, i, MPI_COMM_WORLD, &status);
                    /*MPI_Get_count(&status, MPI_INT, &count);
                      printf("Process 0 received %d elements of process %d\n", count, i);*/
                    if (nonzeroes > 0) {
                        printf("Nonzeroes : %d\n", nonzeroes);
                        Dblock.allocate(Dblocks * blocksize, Dblocks * blocksize, nonzeroes);
                        MPI_Recv(&(Dblock.pRows[0]), Dblocks * blocksize + 1, MPI_INT,
                                 i, i + size, MPI_COMM_WORLD, &status);
                        MPI_Recv(&(Dblock.pCols[0]), nonzeroes, MPI_INT,
                                 i, i + 2*size, MPI_COMM_WORLD, &status);
                        MPI_Recv(&(Dblock.pData[0]), nonzeroes, MPI_DOUBLE,
                                 i, i + 3*size, MPI_COMM_WORLD, &status);
                        Dmat.addBCSR(Dblock);
                    }
                }
                //Dmat.writeToFile("D_sparse.csr");
                Dmat.reduceSymmetric();
                Btsparse.transposeIt(1);
                create2x2SymBlockMatrix(Asparse, Btsparse, Dmat, Csparse);
                Btsparse.clear();
                Dmat.clear();
                Csparse.writeToFile(filenameC);
                Csparse.clear();
                if (filenameC != NULL)
                    free(filenameC);
                filenameC = NULL;
            }
        }
        Btsparse.clear();
        blacs_barrier_(&ICTXT2D, "A");

        //AB_sol will contain the solution of A*X=B, distributed across the process rows.
        //Processes in the same process row possess the same part of AB_sol.
        double *AB_sol;
        int *DESCAB_sol;
        DESCAB_sol = (int*)malloc(DLEN_ * sizeof(int));
        if (DESCAB_sol == NULL) {
            printf("unable to allocate memory for descriptor for AB_sol\n");
            return -1;
        }
        //AB_sol (Adim, Ddim) is distributed across all processes in ICTXT2D
        //starting from process (0,0), in blocks of size (Adim, blocksize)
        descinit_(DESCAB_sol, &Adim, &Ddim, &Adim, &blocksize, &i_zero, &i_zero,
                  &ICTXT2D, &Adim, &info);
        if (info != 0) {
            printf("Descriptor of matrix C returns info: %d\n", info);
            return info;
        }
        AB_sol = (double*)calloc(Adim * Dcols * blocksize, sizeof(double));

        //Each process calculates the Schur complement of the part of D at its disposal
        //(see src/schur.cpp). The solution of A * Y = B_j is stored in AB_sol (= A^-1 * B_j).
        blacs_barrier_(&ICTXT2D, "A");
        make_Sij_parallel_denseB(Asparse, BT_i, B_j, D, lld_D, AB_sol);
        BT_i.clear();
        B_j.clear();

        //From here on the Schur complement S of D is stored in D
        blacs_barrier_(&ICTXT2D, "ALL");

        //The Schur complement is factorised (by ScaLAPACK)
        pdpotrf_("U", &k, D, &i_one, &i_one, DESCD, &info);
        if (info != 0) {
            printf("Cholesky decomposition of D was unsuccessful, error returned: %d\n", info);
            return -1;
        }

        //From here on the factorization of the Schur complement S is stored in D
        blacs_barrier_(&ICTXT2D, "ALL");

        //The Schur complement is inverted (by ScaLAPACK)
        pdpotri_("U", &k, D, &i_one, &i_one, DESCD, &info);
        if (info != 0) {
            printf("Inverse of D was unsuccessful, error returned: %d\n", info);
            return -1;
        }

        //From here on the inverse of the Schur complement S is stored in D
        blacs_barrier_(&ICTXT2D, "A");

        double *InvD_T_Block = (double*)calloc(Dblocks * blocksize + Adim, sizeof(double));

        //Diagonal elements of the (1,1) block of C^-1 are still distributed;
        //here they are gathered in InvD_T_Block in the root process.
        if (*position == pcol) {
            for (i = 0; i < Ddim; ++i) {
                if (pcol == (i / blocksize) % *dims) {
                    int Dpos = i % blocksize + ((i / blocksize) / *dims) * blocksize;
                    *(InvD_T_Block + Adim + i) = *(D + Dpos + lld_D * Dpos);
                }
            }
            for (i = 0, j = 0; i < Dblocks; ++i, ++j) {
                if (j == *dims)
                    j = 0;
                if (*position == j) {
                    dgesd2d_(&ICTXT2D, &blocksize, &i_one,
                             InvD_T_Block + Adim + i * blocksize, &blocksize,
                             &i_zero, &i_zero);
                }
                if (*position == 0) {
                    dgerv2d_(&ICTXT2D, &blocksize, &i_one,
                             InvD_T_Block + Adim + blocksize * i, &blocksize, &j, &j);
                }
            }
        }
        blacs_barrier_(&ICTXT2D, "A");

        //Only the root process performs a selected inversion of A.
        if (iam == 0) {
            int pardiso_message_level = 1;
            int pardiso_mtype = -2;
            ParDiSO pardiso(pardiso_mtype, pardiso_message_level);
            int number_of_processors = 1;
            char *var = getenv("OMP_NUM_THREADS");
            if (var != NULL) {
                sscanf(var, "%d", &number_of_processors);
            } else {
                printf("Set environment OMP_NUM_THREADS to 1\n");
                exit(1);
            }
            pardiso.iparm[2] = 2;
            pardiso.iparm[3] = number_of_processors;
            pardiso.iparm[8] = 0;
            pardiso.iparm[11] = 1;
            pardiso.iparm[13] = 0;
            pardiso.iparm[28] = 0;

            //This function calculates the factorisation of A once again, so this might be optimized.
            pardiso.findInverseOfA(Asparse);
            printf("Processor %d inverted matrix A\n", iam);
        }
        blacs_barrier_(&ICTXT2D, "A");

        //To minimize memory usage, and because only the diagonal elements of the inverse
        //are needed, Y' * S is calculated row by row; each diagonal element is the dot
        //product of this row and the corresponding column of Y. (Y is the solution of AY=B.)
        double *YSrow = (double*)calloc(Dcols * blocksize, sizeof(double));
        int *DESCYSROW;
        DESCYSROW = (int*)malloc(DLEN_ * sizeof(int));
        if (DESCYSROW == NULL) {
            printf("unable to allocate memory for descriptor for YSrow\n");
            return -1;
        }
        //YSrow (1,Ddim) is distributed across processes of ICTXT2D starting from
        //process (0,0), in blocks of size (1,blocksize)
        descinit_(DESCYSROW, &i_one, &Ddim, &i_one, &blocksize, &i_zero, &i_zero,
                  &ICTXT2D, &i_one, &info);
        if (info != 0) {
            printf("Descriptor of matrix C returns info: %d\n", info);
            return info;
        }
        blacs_barrier_(&ICTXT2D, "A");

        //Calculating the diagonal elements of the (0,0) block of C^-1, one at a time.
        for (i = 1; i <= Adim; ++i) {
            pdsymm_("R", "U", &i_one, &Ddim, &d_one, D, &i_one, &i_one, DESCD,
                    AB_sol, &i, &i_one, DESCAB_sol, &d_zero,
                    YSrow, &i_one, &i_one, DESCYSROW);
            pddot_(&Ddim, InvD_T_Block + i - 1, AB_sol, &i, &i_one, DESCAB_sol, &Adim,
                   YSrow, &i_one, &i_one, DESCYSROW, &i_one);
            /*if(*position==1 && pcol==1)
                printf("Dot product in process (1,1) is: %g\n", *(InvD_T_Block+i-1));
              if(*position==0 && pcol==1)
                printf("Dot product in process (0,1) is: %g\n", *(InvD_T_Block+i-1));*/
        }
        blacs_barrier_(&ICTXT2D, "A");

        if (YSrow != NULL)      { free(YSrow);      YSrow = NULL; }
        if (DESCYSROW != NULL)  { free(DESCYSROW);  DESCYSROW = NULL; }
        if (AB_sol != NULL)     { free(AB_sol);     AB_sol = NULL; }
        if (DESCAB_sol != NULL) { free(DESCAB_sol); DESCAB_sol = NULL; }
        if (D != NULL)          { free(D);          D = NULL; }
        if (DESCD != NULL)      { free(DESCD);      DESCD = NULL; }

        //Only in the root process we add the diagonal elements of A^-1
        if (iam == 0) {
            for (i = 0; i < Adim; ++i) {
                j = Asparse.pRows[i];
                *(InvD_T_Block + i) += Asparse.pData[j];
            }
            Asparse.clear();
            printdense(Adim + k, 1, InvD_T_Block, "diag_inverse_C_parallel.txt");
        }

        if (InvD_T_Block != NULL) { free(InvD_T_Block); InvD_T_Block = NULL; }
        blacs_gridexit_(&ICTXT2D);
    }
    //cout << iam << " reached end before MPI_Barrier" << endl;
    MPI_Barrier(MPI_COMM_WORLD);
    //MPI_Finalize();
    return 0;
}
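/* Hedged sketch of the file-scope globals this program relies on. The real
   declarations live elsewhere in the project (e.g. src/readinput.cpp); the
   names below come from the code above, but the exact types and the DLEN_
   value are assumptions. */
int iam, size, ICTXT2D;              // BLACS rank, number of processes, grid context
int *position, *dims;                // 2D grid coordinates and grid dimensions
int n, k, m, l;                      // model dimensions read from the input file
int Ddim, Dblocks, Drows, Dcols;     // dense-block bookkeeping for D
int blocksize, lld_D;                // distribution block size, local leading dimension
double lambda;                       // ridge penalty added to Z'Z
char *filenameX, *filenameZ, *filenameD, *filenameC;
bool printD_bool, printsparseC_bool; // output toggles
MPI_Status status;
int i_zero = 0, i_one = 1, i_negone = -1;
double d_one = 1.0, d_zero = 0.0;
#define DLEN_ 9                      // length of a ScaLAPACK descriptor (assumed)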
int main()
{
    const MKL_INT m = 1000;
    const MKL_INT k = 100000;
    const MKL_INT n = 1000;
    const MKL_INT nb = 100;
    const MKL_INT nprow = 2;
    const MKL_INT npcol = 2;

    MKL_INT iam, nprocs, ictxt, myrow, mycol;
    MDESC descA, descB, descC, descA_local, descB_local, descC_local;
    MKL_INT info;
    MKL_INT a_m_local, a_n_local, b_m_local, b_n_local, c_m_local, c_n_local;
    MKL_INT a_lld, b_lld, c_lld;

    blacs_pinfo_(&iam, &nprocs);
    blacs_get_(&i_negone, &i_zero, &ictxt);
    blacs_gridinit_(&ictxt, "R", &nprow, &npcol);
    blacs_gridinfo_(&ictxt, &nprow, &npcol, &myrow, &mycol);

    double *a = 0;
    double *b = 0;
    double *c = 0;
    if (iam == 0) {
        a = gen_a(m, k);
        b = gen_b(k, n);
        c = (double*)calloc(m * n, sizeof(double));
        puts("a=");
        print(a, m, k);
        puts("b=");
        print(b, k, n);
    }

    /* Local sizes of the block-cyclically distributed matrices */
    a_m_local = numroc_(&m, &nb, &myrow, &i_zero, &nprow);
    a_n_local = numroc_(&k, &nb, &mycol, &i_zero, &npcol);
    b_m_local = numroc_(&k, &nb, &myrow, &i_zero, &nprow);
    b_n_local = numroc_(&n, &nb, &mycol, &i_zero, &npcol);
    c_m_local = numroc_(&m, &nb, &myrow, &i_zero, &nprow);
    c_n_local = numroc_(&n, &nb, &mycol, &i_zero, &npcol);

    double *A = (double*)calloc(a_m_local * a_n_local, sizeof(double));
    double *B = (double*)calloc(b_m_local * b_n_local, sizeof(double));
    double *C = (double*)calloc(c_m_local * c_n_local, sizeof(double));

    a_lld = MAX(a_m_local, 1);
    b_lld = MAX(b_m_local, 1);
    c_lld = MAX(c_m_local, 1);

    if (iam == 0) {
        printf("a_m_local = %d\ta_n_local = %d\tb_m_local = %d\tb_n_local = %d\tc_m_local = %d\tc_n_local = %d\n",
               a_m_local, a_n_local, b_m_local, b_n_local, c_m_local, c_n_local);
        printf("a_lld = %d\tb_lld = %d\tc_lld = %d\n", a_lld, b_lld, c_lld);
    }

    /* Descriptors for the matrices that live entirely on process (0,0) ... */
    descinit_(descA_local, &m, &k, &m, &k, &i_zero, &i_zero, &ictxt, &m, &info);
    descinit_(descB_local, &k, &n, &k, &n, &i_zero, &i_zero, &ictxt, &k, &info);
    descinit_(descC_local, &m, &n, &m, &n, &i_zero, &i_zero, &ictxt, &m, &info);
    /* ... and for the distributed copies */
    descinit_(descA, &m, &k, &nb, &nb, &i_zero, &i_zero, &ictxt, &a_lld, &info);
    descinit_(descB, &k, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &b_lld, &info);
    descinit_(descC, &m, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &c_lld, &info);

    printf("Rank %d: start distribute data\n", iam);
    pdgeadd_(&trans, &m, &k, &one, a, &i_one, &i_one, descA_local, &zero, A, &i_one, &i_one, descA);
    pdgeadd_(&trans, &k, &n, &one, b, &i_one, &i_one, descB_local, &zero, B, &i_one, &i_one, descB);
    printf("Rank %d: finished distribute data\n", iam);

    if (iam == 0) {
        puts("a");
        print(A, a_m_local, a_n_local);
        puts("b");
        print(B, b_m_local, b_n_local);
    }

    pdgemm_("N", "N", &m, &n, &k, &one, A, &i_one, &i_one, descA,
            B, &i_one, &i_one, descB, &zero, C, &i_one, &i_one, descC);
    printf("Rank %d: finished dgemm\n", iam);

    if (iam == 0) {
        puts("c");
        print(C, c_m_local, c_n_local);
    }

    /* Gather the distributed C back into the local c on process (0,0) */
    pdgeadd_(&trans, &m, &n, &one, C, &i_one, &i_one, descC, &zero, c, &i_one, &i_one, descC_local);
    if (iam == 0) {
        puts("global c");
        print(c, m, n);
    }

    free(A);
    free(B);
    free(C);
    if (iam == 0) {
        free(a);
        free(b);
        free(c);
    }
    blacs_gridexit_(&ictxt);
    blacs_exit_(&i_zero);
    return 0;
}
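/* This example references several file-scope names that are not declared
   inside main. A hedged sketch of plausible declarations follows: MDESC and
   the scalar constants follow the usual Intel MKL ScaLAPACK example
   conventions, while gen_a, gen_b and print are this example's own helpers,
   so their bodies here are assumptions. */
#include <stdio.h>
#include <stdlib.h>
#include <mkl.h>
#include <mkl_blacs.h>
#include <mkl_pblas.h>
#include <mkl_scalapack.h>

typedef MKL_INT MDESC[9];              /* ScaLAPACK array descriptor */
#define MAX(a,b) ((a) > (b) ? (a) : (b))

MKL_INT i_negone = -1, i_zero = 0, i_one = 1;
double zero = 0.0, one = 1.0;
char trans = 'N';

/* Hypothetical generator: fill an m x n column-major matrix with random values. */
double *gen_a(MKL_INT m, MKL_INT n)
{
    double *x = (double*)malloc((size_t)m * n * sizeof(double));
    for (MKL_INT i = 0; i < m * n; i++)
        x[i] = (double)rand() / RAND_MAX;
    return x;
}

double *gen_b(MKL_INT m, MKL_INT n) { return gen_a(m, n); }

/* Hypothetical printer: dump an m x n column-major matrix. */
void print(const double *x, MKL_INT m, MKL_INT n)
{
    for (MKL_INT i = 0; i < m; i++) {
        for (MKL_INT j = 0; j < n; j++)
            printf("%8.3f ", x[(size_t)j * m + i]);
        printf("\n");
    }
}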
int main(int argc, char **argv)
{
    int info, i, j, pcol;
    double *D, *AB_sol, *InvD_T_Block, *XSrow;
    int *DESCD, *DESCAB_sol, *DESCXSROW;
    bool readAFromFile = false;
    //CSRdouble BT_i, B_j;
    double *BT_i;
    double *B_j;
    int s_BT_i = 0;
    int s_B_j = 0;   // size of BT_i (#rows) and B_j (#cols)
    int nx, ny, nz;
    CSRdouble Asparse, Btsparse;

    if (argc < 6) {
        // needed args: nx, ny, nz, Ddim, blocksize;
        // optional: C.csr - if provided, the output file is written.
        cout << "Too few arguments." << endl;
        cout << "Usage: " << argv[0] << " nx ny nz Ddim blocksize [C.csr]" << endl;
        cout << "Usage: " << argv[0] << " -fileA filename Adim Ddim blocksize [C.csr]" << endl;
        exit(-1);
    }

    if (strcmp(argv[1], "-fileA") == 0) {
        readAFromFile = true;
        filenameA = new char[250];
        strcpy(filenameA, argv[2]);
        Adim = atoi(argv[3]);
    } else {
        nx = atoi(argv[1]);
        ny = atoi(argv[2]);
        nz = atoi(argv[3]);
        Adim = nx * ny * nz;
    }
    Ddim = atoi(argv[4]);
    blocksize = atoi(argv[5]);
    //printf("Adim=%d, Ddim=%d, blocksize=%d *** %s", Adim, Ddim, blocksize, outputc);
    //exit(-123);

    //Initialise MPI and some MPI variables
    info = MPI_Init(&argc, &argv);
    if (info != 0) {
        printf("Error in MPI initialisation: %d\n", info);
        return info;
    }

    position = (int*)calloc(2, sizeof(int));
    if (position == NULL) {
        printf("unable to allocate memory for processor position coordinate\n");
        return EXIT_FAILURE;
    }
    dims = (int*)calloc(2, sizeof(int));
    if (dims == NULL) {
        printf("unable to allocate memory for grid dimensions coordinate\n");
        return EXIT_FAILURE;
    }

    //BLACS is the interface used by PBLAS and ScaLAPACK on top of MPI
    blacs_pinfo_(&iam, &size);              //determine the number of processes involved
    info = MPI_Dims_create(size, 2, dims);  //determine the best 2D cartesian grid for that many processes
    if (info != 0) {
        printf("Error in MPI creation of dimensions: %d\n", info);
        return info;
    }

    //For now the code can only work with square process grids,
    //so we use the biggest square grid possible with the number of processes involved.
    if (*dims != *(dims+1)) {
        while (*dims * *dims > size)
            *dims -= 1;
        *(dims+1) = *dims;
        if (iam == 0)
            printf("WARNING: %d processor(s) unused due to reformatting to a square process grid\n",
                   size - (*dims * *dims));
        size = *dims * *dims;
        //cout << "New size of process grid: " << size << endl;
    }

    blacs_get_(&i_negone, &i_zero, &ICTXT2D);
    //Initialisation of the BLACS process grid, which is referenced as ICTXT2D
    blacs_gridinit_(&ICTXT2D, "R", dims, dims+1);

    if (iam < size) {
        //The rank (iam) of the process is mapped to a 2D grid:
        //position = (process row, process column)
        blacs_pcoord_(&ICTXT2D, &iam, position, position+1);
        if (*position == -1) {
            printf("Error in process grid\n");
            return -1;
        }
        /*
        //Filenames, dimensions of all matrices and other important variables
        //are read in as global variables (see src/readinput.cpp)
        info = read_input(*++argv);
        if (info != 0) {
            printf("Something went wrong when reading input file for processor %d\n", iam);
            return -1;
        }
        */

        //blacs_barrier keeps every process here until all processes have reached this point.
        blacs_barrier_(&ICTXT2D, "ALL");
        if (*(position+1) == 0 && *position == 0)
            printf("Reading of input-file successful\n");
        if (*(position+1) == 0 && *position == 0) {
            printf("\nA sparse square matrix of dimension %d with a dense square submatrix of dimension %d\n",
                   Adim + Ddim, Ddim);
            printf("was analyzed using %d (%d x %d) processors\n", size, *dims, *(dims+1));
        }

        pcol = *(position+1);

        //Number of blocks needed to store a complete column/row of D
        Dblocks = Ddim % blocksize == 0 ? Ddim / blocksize : Ddim / blocksize + 1;
        //Number of row blocks needed by this process to store its part of the dense matrix D
        Drows = (Dblocks - *position) % *dims == 0 ? (Dblocks - *position) / *dims
                                                   : (Dblocks - *position) / *dims + 1;
        Drows = Drows < 1 ? 1 : Drows;
        //Number of column blocks needed by this process to store its part of the dense matrix D
        Dcols = (Dblocks - pcol) % *(dims+1) == 0 ? (Dblocks - pcol) / *(dims+1)
                                                  : (Dblocks - pcol) / *(dims+1) + 1;
        Dcols = Dcols < 1 ? 1 : Dcols;

        //Local leading dimension of D (matrices are always stored column-wise)
        lld_D = Drows * blocksize;
        //cout << "Hi! I am " << iam << ". My position is (" << *position << "," << *(position+1)
        //     << ") and I have... Dblocks: " << Dblocks << "; Drows: " << Drows
        //     << "; Dcols: " << Dcols << "; blocksize: " << blocksize << endl;

        //Initialise the descriptor of the dense distributed matrix
        DESCD = (int*)malloc(DLEN_ * sizeof(int));
        if (DESCD == NULL) {
            printf("unable to allocate memory for descriptor for C\n");
            return -1;
        }
        //D with dimensions (Ddim,Ddim) is distributed over all processes in ICTXT2D,
        //with the first element in process (0,0). D is distributed into blocks of size
        //(blocksize,blocksize), with local leading dimension lld_D in this process.
        descinit_(DESCD, &Ddim, &Ddim, &blocksize, &blocksize, &i_zero, &i_zero,
                  &ICTXT2D, &lld_D, &info);
        if (info != 0) {
            printf("Descriptor of matrix C returns info: %d\n", info);
            return info;
        }

        //Allocate the space necessary to store the part of D held in memory by this process.
        D = (double*)calloc(Drows * blocksize * Dcols * blocksize, sizeof(double));
        if (D == NULL) {
            printf("unable to allocate memory for Matrix D (required: %ld bytes)\n",
                   Drows * blocksize * Dcols * blocksize * sizeof(double));
            return EXIT_FAILURE;
        }

        blacs_barrier_(&ICTXT2D, "ALL");

        //added
        B_j = new double[Adim * Dcols * blocksize];
        BT_i = new double[Adim * Drows * blocksize];
        //read_in_BD(DESCD, D, BT_i, B_j, Btsparse);
        if (iam == 0)
            cout << "Generating A, B and D... \n" << endl;
        generate_BD(D, BT_i, B_j, &s_BT_i, &s_B_j);
        cout << "- B, D generated." << endl;

        //Now every process has to set up the sparse matrix A
        //makeDiagonalPerturbD(Adim, 1000.0, 1e-10, Asparse); cout << "A is a pert. diag." << endl;
        //makeRandCSRUpper(Adim, 0.001, Asparse);
        //cout << "nnz(A) = " << Asparse.nonzeros << endl;
        //Asparse.loadFromFileSym("/users/drosos/simple/matrices/NornePrimaryJacobian.csr");
        if (readAFromFile) {
            Asparse.loadFromFile(filenameA);
            cout << "A loaded from file" << endl;
            //Asparse.reduceSymmetric();
        } else {
            make3DLaplace(nx, ny, nz, Asparse);
            cout << "A is Laplacian" << endl;
            shiftIndices(Asparse, -1);
        }
        cout << "- A generated." << endl;

        Asparse.matrixType = SYMMETRIC;
        assert(Asparse.nrows == Adim);
        assert(Asparse.ncols == Adim);
        //if (iam == 0) { Asparse.writeToFile("A_debug.csr"); exit(-1234); }

        if (argc == 7) {
            //the name of the output file for C is given as a parameter
            filenameC = new char[250];
            //strcpy(filenameC, argv[6]);
            sprintf(filenameC, "/scratch/daint/verbof/sparsedense/C_%d_%d.csr", Adim, Ddim);

            CSRdouble Dmat, Dblock, Csparse, Bblock;
            Dblock.nrows = Dblocks * blocksize;
            Dblock.ncols = Dblocks * blocksize;
            Dblock.allocate(Dblocks * blocksize, Dblocks * blocksize, 0);
            Dmat.allocate(0, 0, 0);
            for (i = 0; i < Drows; ++i) {
                for (j = 0; j < Dcols; ++j) {
                    dense2CSR_sub(D + i * blocksize + j * lld_D * blocksize,
                                  blocksize, blocksize, lld_D, Dblock,
                                  (*(dims) * i + *position) * blocksize,
                                  (*(dims+1) * j + pcol) * blocksize);
                    if (Dblock.nonzeros > 0) {
                        if (Dmat.nonzeros == 0) {
                            Dmat.make2(Dblock.nrows, Dblock.ncols, Dblock.nonzeros,
                                       Dblock.pRows, Dblock.pCols, Dblock.pData);
                        } else {
                            Dmat.addBCSR(Dblock);
                        }
                    }
                    Dblock.clear();
                }
            }

            if (*position == 0) {
                Bblock.nrows = Adim;
                Bblock.ncols = Dblocks * blocksize;
                Bblock.allocate(Adim, Dblocks * blocksize, 0);
                Btsparse.allocate(0, 0, 0);
                for (j = 0; j < Dcols; ++j) {
                    dense2CSR_sub(B_j + j * Adim * blocksize, Adim, blocksize, Adim,
                                  Bblock, 0, (*(dims+1) * j + pcol) * blocksize);
                    if (Bblock.nonzeros > 0) {
                        if (Btsparse.nonzeros == 0) {
                            Btsparse.make2(Bblock.nrows, Bblock.ncols, Bblock.nonzeros,
                                           Bblock.pRows, Bblock.pCols, Bblock.pData);
                        } else {
                            Btsparse.addBCSR(Bblock);
                        }
                    }
                    Bblock.clear();
                }
            }

            blacs_barrier_(&ICTXT2D, "A");
            if (iam != 0) {
                //Each process other than root sends its Dmat to the root process.
                MPI_Send(&(Dmat.nonzeros), 1, MPI_INT, 0, iam, MPI_COMM_WORLD);
                MPI_Send(&(Dmat.pRows[0]), Dmat.nrows + 1, MPI_INT, 0, iam + size, MPI_COMM_WORLD);
                MPI_Send(&(Dmat.pCols[0]), Dmat.nonzeros, MPI_INT, 0, iam + 2*size, MPI_COMM_WORLD);
                MPI_Send(&(Dmat.pData[0]), Dmat.nonzeros, MPI_DOUBLE, 0, iam + 3*size, MPI_COMM_WORLD);
                Dmat.clear();
                if (*position == 0) {
                    MPI_Send(&(Btsparse.nonzeros), 1, MPI_INT, 0, iam + 4*size, MPI_COMM_WORLD);
                    MPI_Send(&(Btsparse.pRows[0]), Btsparse.nrows + 1, MPI_INT, 0, iam + 5*size, MPI_COMM_WORLD);
                    MPI_Send(&(Btsparse.pCols[0]), Btsparse.nonzeros, MPI_INT, 0, iam + 6*size, MPI_COMM_WORLD);
                    MPI_Send(&(Btsparse.pData[0]), Btsparse.nonzeros, MPI_DOUBLE, 0, iam + 7*size, MPI_COMM_WORLD);
                    Btsparse.clear();
                }
            } else {
                //Btsparse.writeToFile("Btsparse_pre.csr");
                for (i = 1; i < size; ++i) {
                    //The root process receives parts of Dmat sequentially from all
                    //processes and directly adds them together.
                    int nonzeroes, count;
                    MPI_Recv(&nonzeroes, 1, MPI_INT, i, i, MPI_COMM_WORLD, &status);
                    /*MPI_Get_count(&status, MPI_INT, &count);
                      printf("Process 0 received %d elements of process %d\n", count, i);*/
                    if (nonzeroes > 0) {
                        printf("Nonzeroes : %d\n", nonzeroes);
                        Dblock.allocate(Dblocks * blocksize, Dblocks * blocksize, nonzeroes);
                        MPI_Recv(&(Dblock.pRows[0]), Dblocks * blocksize + 1, MPI_INT,
                                 i, i + size, MPI_COMM_WORLD, &status);
                        MPI_Recv(&(Dblock.pCols[0]), nonzeroes, MPI_INT,
                                 i, i + 2*size, MPI_COMM_WORLD, &status);
                        MPI_Recv(&(Dblock.pData[0]), nonzeroes, MPI_DOUBLE,
                                 i, i + 3*size, MPI_COMM_WORLD, &status);
                        Dmat.addBCSR(Dblock);
                        Dblock.clear();
                    }
                    if (i / *dims == 0) {
                        MPI_Recv(&nonzeroes, 1, MPI_INT, i, i + 4*size, MPI_COMM_WORLD, &status);
                        if (nonzeroes > 0) {
                            printf("Nonzeroes : %d\n", nonzeroes);
                            Bblock.allocate(Adim, Dblocks * blocksize, nonzeroes);
                            MPI_Recv(&(Bblock.pRows[0]), Adim + 1, MPI_INT,
                                     i, i + 5*size, MPI_COMM_WORLD, &status);
                            MPI_Recv(&(Bblock.pCols[0]), nonzeroes, MPI_INT,
                                     i, i + 6*size, MPI_COMM_WORLD, &status);
                            MPI_Recv(&(Bblock.pData[0]), nonzeroes, MPI_DOUBLE,
                                     i, i + 7*size, MPI_COMM_WORLD, &status);
                            Btsparse.addBCSR(Bblock);
                            Bblock.clear();
                        }
                    }
                }
                //Dmat.writeToFile("D_sparse.csr");
                printf("Number of nonzeroes in D: %d\n", Dmat.nonzeros);
                Dmat.reduceSymmetric();
                //Dmat.writeToFile("D_sparse_symm.csr");
                //Btsparse.writeToFile("Btsparse.csr");
                Dmat.changeCols(Ddim);
                Dmat.changeRows(Ddim);
                //Dmat.writeToFile("Dsparse_red.csr");
                Btsparse.changeCols(Ddim);
                create2x2SymBlockMatrix(Asparse, Btsparse, Dmat, Csparse);
                Btsparse.clear();
                Dmat.clear();

                ParDiSO p(-2, 0);
                p.init(Csparse, 1);
                p.factorize(Csparse);
                //Csparse.fillSymmetric();
                //Csparse.writeToFilePSelInv(filenameC);
                //Csparse.writeToFile(filenameC);
                Csparse.clear();
                //double* Cdense = new double[Csparse.nrows * Csparse.ncols];
                //CSR2dense(Csparse, Cdense);
                //printdense(Adim+Ddim, Adim+Ddim, Cdense, "C.txt");
            }
            if (iam == 0)
                cout << "\n - C saved in file " << filenameC << "! Exiting... \n\n" << endl;
            if (filenameC != NULL)
                delete[] filenameC;   //allocated with new[], so delete[] rather than free()
            filenameC = NULL;
            if (iam == 0)
                exit(-12345);
        }

        //AB_sol will contain the solution of A*X=B, distributed across the process rows.
        //Processes in the same process row possess the same part of AB_sol.
        DESCAB_sol = (int*)malloc(DLEN_ * sizeof(int));
        if (DESCAB_sol == NULL) {
            printf("unable to allocate memory for descriptor for AB_sol\n");
            return -1;
        }
        //AB_sol (Adim, Ddim) is distributed across all processes in ICTXT2D
        //starting from process (0,0), in blocks of size (Adim, blocksize)
        descinit_(DESCAB_sol, &Adim, &Ddim, &Adim, &blocksize, &i_zero, &i_zero,
                  &ICTXT2D, &Adim, &info);
        if (info != 0) {
            printf("Descriptor of matrix C returns info: %d\n", info);
            return info;
        }
        AB_sol = (double*)calloc(Adim * s_B_j, sizeof(double));

        blacs_barrier_(&ICTXT2D, "A");

        /********************** TIMING **********************/
        if (iam == 0) watch.tick(totaltime);
        if (iam == 0) watch.tick(cresctime);

        //Each process calculates the Schur complement of the part of D at its disposal
        //(see src/schur.cpp). The solution of A * X = B_j is stored in AB_sol (= A^-1 * B_j).
        /*
        char *BT_i_debugFile = new char[100];
        char *B_j_debugFile = new char[100];
        sprintf(BT_i_debugFile, "BT_i_debug_%d.txt", iam);
        sprintf(B_j_debugFile, "B_j_debug_%d.txt", iam);
        BT_i.writeToFile(BT_i_debugFile);
        B_j.writeToFile(B_j_debugFile);
        */
        make_Sij_denseB(Asparse, BT_i, B_j, s_BT_i, s_B_j, D, lld_D, AB_sol);
        /*
        char *AB_sol_debugFile = new char[100];
        char *D_debugFile = new char[100];
        sprintf(AB_sol_debugFile, "AB_sol_debug_%d.txt", iam);
        sprintf(D_debugFile, "D_debug_%d.txt", iam);
        printDenseDouble(AB_sol_debugFile, ios::out, Drows*blocksize, Dcols*blocksize, AB_sol);
        printDenseDouble(D_debugFile, ios::out, Ddim, Ddim, D);
        cout << iam << " just wrote debug stuff... " << endl;
        */

        blacs_barrier_(&ICTXT2D, "ALL");
        if (iam == 0) watch.tack(cresctime);

        if (iam != 0) {
            Asparse.clear();
            pardiso_var.clear();
        }
        //BT_i.clear();
        //B_j.clear();
        delete[] BT_i;
        delete[] B_j;

        blacs_barrier_(&ICTXT2D, "ALL");

        /********************** TIMING **********************/
        if (iam == 0) watch.tick(facsctime);

        //The Schur complement is factorised (by ScaLAPACK)
        pdpotrf_("U", &Ddim, D, &i_one, &i_one, DESCD, &info);
        if (info != 0) {
            printf("Cholesky decomposition of D was unsuccessful, error returned: %d\n", info);
            return -1;
        }
        blacs_barrier_(&ICTXT2D, "ALL");

        /********************** TIMING **********************/
        if (iam == 0) watch.tack(facsctime);
        if (iam == 0) watch.tick(invsctime);

        //The Schur complement is inverted (by ScaLAPACK)
        pdpotri_("U", &Ddim, D, &i_one, &i_one, DESCD, &info);
        if (info != 0) {
            printf("Inverse of D was unsuccessful, error returned: %d\n", info);
            return -1;
        }
        blacs_barrier_(&ICTXT2D, "A");

        /********************** TIMING **********************/
        if (iam == 0) watch.tack(invsctime);

        InvD_T_Block = (double*)calloc(Dblocks * blocksize + Adim, sizeof(double));

        if (iam == 0) watch.tick(gathrtime);
        blacs_barrier_(&ICTXT2D, "A");

        /********************** TIMING **********************/
        if (iam == 0) watch.tick(sndrctime);

        //Diagonal elements of the (1,1) block of C^-1 are still distributed;
        //here they are gathered in InvD_T_Block in the root process.
        if (*position == pcol) {
            for (i = 0; i < Ddim; ++i) {
                if (pcol == (i / blocksize) % *dims) {
                    int Dpos = i % blocksize + ((i / blocksize) / *dims) * blocksize;
                    *(InvD_T_Block + Adim + i) = *(D + Dpos + lld_D * Dpos);
                }
            }
            for (i = 0, j = 0; i < Dblocks; ++i, ++j) {
                if (j == *dims)
                    j = 0;
                if (*position == j) {
                    dgesd2d_(&ICTXT2D, &blocksize, &i_one,
                             InvD_T_Block + Adim + i * blocksize, &blocksize,
                             &i_zero, &i_zero);
                }
                if (*position == 0) {
                    dgerv2d_(&ICTXT2D, &blocksize, &i_one,
                             InvD_T_Block + Adim + blocksize * i, &blocksize, &j, &j);
                }
            }
        }
        blacs_barrier_(&ICTXT2D, "A");

        /********************** TIMING **********************/
        if (iam == 0) watch.tack(sndrctime);

        if (position != NULL) { free(position); position = NULL; }
        if (dims != NULL)     { free(dims);     dims = NULL; }

        //Only the root process performs a selected inversion of A.
        if (iam == 0) {
            watch.tick(invrAtime);
            /*int pardiso_message_level = 1;
              int pardiso_mtype = -2;
              ParDiSO pardiso(pardiso_mtype, pardiso_message_level);*/
            int number_of_processors = 1;
            char *var = getenv("OMP_NUM_THREADS");
            if (var != NULL)
                sscanf(var, "%d", &number_of_processors);
            else {
                printf("Set environment OMP_NUM_THREADS to 1\n");
                exit(1);
            }
            pardiso_var.iparm[2] = 2;
            pardiso_var.iparm[3] = number_of_processors;
            pardiso_var.iparm[8] = 0;
            pardiso_var.iparm[11] = 1;
            pardiso_var.iparm[13] = 0;
            pardiso_var.iparm[28] = 0;

            //This function calculates the factorisation of A once again, so this might be optimized.
            pardiso_var.findInverseOfA(Asparse);
            cout << "Memory allocated by pardiso: " << pardiso_var.memoryAllocated() << endl;
            printf("Processor %d inverted matrix A\n", iam);
            watch.tack(invrAtime);
        }
        blacs_barrier_(&ICTXT2D, "A");

        //To minimize memory usage, and because only the diagonal elements of the inverse
        //are needed, X' * S is calculated row by row; each diagonal element is the dot
        //product of this row and the corresponding column of X. (X is the solution of AX=B.)
        XSrow = (double*)calloc(Dcols * blocksize, sizeof(double));
        DESCXSROW = (int*)malloc(DLEN_ * sizeof(int));
        if (DESCXSROW == NULL) {
            printf("unable to allocate memory for descriptor for XSrow\n");
            return -1;
        }
        //XSrow (1,Ddim) is distributed across processes of ICTXT2D starting from
        //process (0,0), in blocks of size (1,blocksize)
        descinit_(DESCXSROW, &i_one, &Ddim, &i_one, &blocksize, &i_zero, &i_zero,
                  &ICTXT2D, &i_one, &info);
        if (info != 0) {
            printf("Descriptor of matrix C returns info: %d\n", info);
            return info;
        }
        blacs_barrier_(&ICTXT2D, "A");
        if (iam == 0)
            cout << "Calculating diagonal elements of the first block of the inverse... \n" << endl;
        blacs_barrier_(&ICTXT2D, "A");

        /********************** TIMING **********************/
        if (iam == 0) watch.tick(dotprtime);

        //Calculating the diagonal elements of the (0,0) block of C^-1, one at a time.
        for (i = 1; i <= Adim; ++i) {
            pdsymm_("R", "U", &i_one, &Ddim, &d_one, D, &i_one, &i_one, DESCD,
                    AB_sol, &i, &i_one, DESCAB_sol, &d_zero,
                    XSrow, &i_one, &i_one, DESCXSROW);
            pddot_(&Ddim, InvD_T_Block + i - 1, AB_sol, &i, &i_one, DESCAB_sol, &Adim,
                   XSrow, &i_one, &i_one, DESCXSROW, &i_one);
        }
        blacs_barrier_(&ICTXT2D, "A");

        /********************** TIMING **********************/
        if (iam == 0) watch.tack(dotprtime);

        if (D != NULL)          { free(D);          D = NULL; }
        if (AB_sol != NULL)     { free(AB_sol);     AB_sol = NULL; }
        if (XSrow != NULL)      { free(XSrow);      XSrow = NULL; }
        if (DESCD != NULL)      { free(DESCD);      DESCD = NULL; }
        if (DESCAB_sol != NULL) { free(DESCAB_sol); DESCAB_sol = NULL; }
        if (DESCXSROW != NULL)  { free(DESCXSROW);  DESCXSROW = NULL; }

        //Only in the root process we add the diagonal elements of A^-1
        if (iam == 0) {
            for (i = 0; i < Adim; i++) {
                j = Asparse.pRows[i];
                *(InvD_T_Block + i) += Asparse.pData[j];
            }

            /********************** TIMING **********************/
            watch.tack(gathrtime);
            watch.tack(totaltime);
            /*
            //cout << "Extraction completed by ";
            for (i = 0; i < Ddim; i++) {
                cout << "Extracting row " << i << "/" << Ddim << endl;
                //cout << setw(3) << std::setfill('0') << int(i*100.0 / (Ddim-1)) << "%" << "\b\b\b\b";
                diagonal[Asparse.nrows + i] = InvD_T_Block[i*Ddim + i];
            }
            cout << endl;
            */
            Asparse.clear();
            /*
            cout << "Extracting diagonal... \n" << endl;
            cout << "Saving diagonal... \n" << endl;
            char *diagOutFile = new char[50];
            sprintf(diagOutFile, "diag_inverse_C_parallel_%d.txt", size);
            printdense(Adim+Ddim, 1, InvD_T_Block, diagOutFile);
            delete[] diagOutFile;
            */
        }
        if (InvD_T_Block != NULL) { free(InvD_T_Block); InvD_T_Block = NULL; }

        if (iam == 0) {
            //Conversion milliseconds -> seconds
            cresctime /= 1000.0;
            facsctime /= 1000.0;
            invsctime /= 1000.0;
            gathrtime /= 1000.0;
            invrAtime /= 1000.0;
            totaltime /= 1000.0;

            cout << "********************************* TIME REPORT **********************************\n" << endl;
            cout << "                  SCHUR COMPLEMENT BUILDING: " << cresctime << " seconds" << endl;
            cout << "             SCHUR COMPLEMENT FACTORIZATION: " << facsctime << " seconds" << endl;
            cout << "                 SCHUR COMPLEMENT INVERSION: " << invsctime << " seconds" << endl;
            cout << "  " << endl;
            cout << " FINAL OPERATIONS (INVERSION OF A INCLUDED): " << gathrtime << " seconds" << endl;
            cout << "                             INVERSION OF A: " << invrAtime << " seconds" << endl;
            cout << "                                 TOTAL TIME: " << totaltime << " seconds" << endl;
            cout << "********************************************************************************\n" << endl;
            /*
             * double totaltime = 0.0;  // Total execution time
             * double cresctime = 0.0;  // Schur complement (total)
             * double facsctime = 0.0;  // Schur complement factorization
             * double invsctime = 0.0;  // Schur complement inversion
             * double gathrtime = 0.0;  // Last operations
             * double invrAtime = 0.0;  // Inversion of A
             */
            char *timingFile = new char[50];
            sprintf(timingFile, "weak_tests.csv");
            std::fstream timeF;
            timeF.open(timingFile, std::fstream::out | std::fstream::app);
            timeF.setf(ios::scientific, ios::floatfield);
            //timeF << "PROBLEM SIZE: " << Adim/1000 << "k + " << Ddim/1000 << "k" << endl;
            //timeF << "#PROCS,SCHUR_BUILD,SCHUR_FACT,SCHUR_INV,INV(A),FINAL_OPS,SEND_RECV,DOT_PROD,TOTAL" << endl;
            timeF << size << "," << cresctime << "," << facsctime << "," << invsctime << ","
                  << invrAtime << "," << gathrtime << "," << sndrctime << ","
                  << dotprtime << "," << totaltime << endl;
            timeF.close();
            delete[] timingFile;
        }
        blacs_barrier_(&ICTXT2D, "A");
        blacs_gridexit_(&ICTXT2D);
    }
    //cout << iam << " reached end before MPI_Barrier" << endl;
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Finalize();
    return 0;
}
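/* The timing sections above rely on a global `watch` object and millisecond
   counters declared elsewhere in the project. A minimal sketch of an
   interface that would satisfy the watch.tick()/watch.tack() call sites
   follows; the class shape is an assumption, only the names come from the
   code above. Per-counter start times are kept so that nested timers
   (e.g. totaltime around cresctime) do not clobber each other. */
#include <chrono>
#include <map>

struct Stopwatch {
    std::map<double*, std::chrono::steady_clock::time_point> t0;
    //tick() records a start time for this counter
    void tick(double &c) { t0[&c] = std::chrono::steady_clock::now(); }
    //tack() adds the elapsed milliseconds since the matching tick()
    void tack(double &c) {
        c += std::chrono::duration<double, std::milli>(
                 std::chrono::steady_clock::now() - t0[&c]).count();
    }
};

Stopwatch watch;  //assumed global used by the timing sections above
double totaltime = 0.0, cresctime = 0.0, facsctime = 0.0, invsctime = 0.0,
       gathrtime = 0.0, invrAtime = 0.0, sndrctime = 0.0, dotprtime = 0.0;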
/*==== MAIN FUNCTION =================================================*/
int main(int argc, char *argv[])
{
/* ==== Declarations =================================================== */

/* File variables */
    FILE *fin;

/* Matrix descriptors */
    MDESC descA, descB, descC, descA_local, descB_local;

/* Local scalars */
    MKL_INT iam, nprocs, ictxt, myrow, mycol, nprow, npcol;
    MKL_INT n, nb, mp, nq, lld, lld_local;
    MKL_INT i, j, info;
    int n_int, nb_int, nprow_int, npcol_int;
    double thresh, diffnorm, anorm, bnorm, residual, eps;

/* Local arrays */
    double *A_local, *B_local, *A, *B, *C, *work;
    MKL_INT iwork[4];

/* ==== Executable statements ========================================== */

/* Get information about how many processes are used for program execution
   and number of current process */
    blacs_pinfo_(&iam, &nprocs);

/* Init temporary 1D process grid */
    blacs_get_(&i_negone, &i_zero, &ictxt);
    blacs_gridinit_(&ictxt, "C", &nprocs, &i_one);

/* Open input file */
    if (iam == 0) {
        fin = fopen("../in/pblas3ex.in", "r");
        if (fin == NULL) {
            printf("Error while opening input file.");
            return 2;
        }
    }

/* Read data and send it to all processes */
    if (iam == 0) {

/* Read parameters */
        fscanf(fin, "%d n, dimension of vectors, must be > 0 ", &n_int);
        fscanf(fin, "%d nb, size of blocks, must be > 0 ", &nb_int);
        fscanf(fin, "%d p, number of rows in the process grid, must be > 0", &nprow_int);
        fscanf(fin, "%d q, number of columns in the process grid, must be > 0, p*q = number of processes", &npcol_int);
        fscanf(fin, "%lf threshold for residual check (to switch off check set it < 0.0) ", &thresh);
        n = (MKL_INT)n_int;
        nb = (MKL_INT)nb_int;
        nprow = (MKL_INT)nprow_int;
        npcol = (MKL_INT)npcol_int;

/* Check if all parameters are correct */
        if ((n <= 0) || (nb <= 0) || (nprow <= 0) || (npcol <= 0) || (nprow * npcol != nprocs)) {
            printf("One or several input parameters has incorrect value. Limitations:\n");
            printf("n > 0, nb > 0, p > 0, q > 0 - integer\n");
            printf("p*q = number of processes\n");
            printf("threshold - double (set to negative to switch off check)\n");
            return 2;
        }

/* Pack data into array and send it to other processes */
        iwork[0] = n;
        iwork[1] = nb;
        iwork[2] = nprow;
        iwork[3] = npcol;
        igebs2d_(&ictxt, "All", " ", &i_four, &i_one, iwork, &i_four);
        dgebs2d_(&ictxt, "All", " ", &i_one, &i_one, &thresh, &i_one);
    } else {

/* Receive and unpack data */
        igebr2d_(&ictxt, "All", " ", &i_four, &i_one, iwork, &i_four, &i_zero, &i_zero);
        dgebr2d_(&ictxt, "All", " ", &i_one, &i_one, &thresh, &i_one, &i_zero, &i_zero);
        n = iwork[0];
        nb = iwork[1];
        nprow = iwork[2];
        npcol = iwork[3];
    }
    if (iam == 0) {
        fclose(fin);
    }

/* Destroy temporary process grid */
    blacs_gridexit_(&ictxt);

/* Init working 2D process grid */
    blacs_get_(&i_negone, &i_zero, &ictxt);
    blacs_gridinit_(&ictxt, "R", &nprow, &npcol);
    blacs_gridinfo_(&ictxt, &nprow, &npcol, &myrow, &mycol);

/* Create on process 0 two matrices: A - orthonormal, B - random */
    if ((myrow == 0) && (mycol == 0)) {

/* Allocate arrays */
        A_local = (double*)calloc(n * n, sizeof(double));
        B_local = (double*)calloc(n * n, sizeof(double));

/* Set arrays */
        for (i = 0; i < n; i++) {
            for (j = 0; j < n; j++) {
                B_local[i + n*j] = one * rand() / RAND_MAX;
            }
            B_local[i + n*i] += two;
        }
        for (j = 0; j < n; j++) {
            for (i = 0; i < n; i++) {
                if (j < n - 1) {
                    if (i <= j) {
                        A_local[i + n*j] = one / sqrt((double)((j+1) * (j+2)));
                    } else if (i == j + 1) {
                        A_local[i + n*j] = -one / sqrt(one + one / (double)(j+1));
                    } else {
                        A_local[i + n*j] = zero;
                    }
                } else {
                    A_local[i + n*(n-1)] = one / sqrt((double)n);
                }
            }
        }

/* Print information of task */
        printf("=== START OF EXAMPLE ===================\n");
        printf("Matrix-matrix multiplication: A*B = C\n\n");
        printf("/  1/q_1 ........   1/q_n-1      1/q_n  \\ \n");
        printf("|        .                              | \n");
        printf("|         `.         :            :     | \n");
        printf("| -1/q_1    `.       :            :     | \n");
        printf("|        .    `.     :            :     |  =  A \n");
        printf("|   0      `.   `                       | \n");
        printf("|   :        `.    ` 1/q_n-1      1/q_n | \n");
        printf("|   :          `.   `.                  | \n");
        printf("\\   0   ....    0   -(n-1)/q_n-1 1/q_n  / \n\n");
        printf("q_i = sqrt( i^2 + i ), i=1..n-1, q_n = sqrt( n )\n\n");
        printf("A - n*n real matrix (orthonormal) \n");
        printf("B - random n*n real matrix\n\n");
        printf("n = %d, nb = %d; %dx%d - process grid\n\n", n, nb, nprow, npcol);
        printf("=== PROGRESS ===========================\n");
    } else {

/* Other processes don't contain parts of initial arrays */
        A_local = NULL;
        B_local = NULL;
    }

/* Compute precise length of local pieces and allocate array on each process
   for parts of distributed vectors */
    mp = numroc_(&n, &nb, &myrow, &i_zero, &nprow);
    nq = numroc_(&n, &nb, &mycol, &i_zero, &npcol);
    A = (double*)calloc(mp * nq, sizeof(double));
    B = (double*)calloc(mp * nq, sizeof(double));
    C = (double*)calloc(mp * nq, sizeof(double));

/* Compute leading dimensions */
    lld_local = MAX(numroc_(&n, &n, &myrow, &i_zero, &nprow), 1);
    lld = MAX(mp, 1);

/* Initialize descriptors for initial arrays located on 0 process */
    descinit_(descA_local, &n, &n, &n, &n, &i_zero, &i_zero, &ictxt, &lld_local, &info);
    descinit_(descB_local, &n, &n, &n, &n, &i_zero, &i_zero, &ictxt, &lld_local, &info);

/* Initialize descriptors for distributed arrays */
    descinit_(descA, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info);
    descinit_(descB, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info);
    descinit_(descC, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info);

/* Distribute matrices from 0 process over process grid */
    pdgeadd_(&trans, &n, &n, &one, A_local, &i_one, &i_one, descA_local, &zero, A, &i_one, &i_one, descA);
    pdgeadd_(&trans, &n, &n, &one, B_local, &i_one, &i_one, descB_local, &zero, B, &i_one, &i_one, descB);
    if (iam == 0) { printf(".. Arrays are distributed ( p?geadd ) ..\n"); }

/* Destroy arrays on 0 process - they are not necessary anymore */
    if ((myrow == 0) && (mycol == 0)) {
        free(A_local);
        free(B_local);
    }

/* Compute norm of A and B */
    work = (double*)calloc(mp, sizeof(double));
    anorm = pdlange_("I", &n, &n, A, &i_one, &i_one, descA, work);
    bnorm = pdlange_("I", &n, &n, B, &i_one, &i_one, descB, work);
    if (iam == 0) { printf(".. Norms of A and B are computed ( p?lange ) ..\n"); }

/* Compute product C = A*B */
    pdgemm_("N", "N", &n, &n, &n, &one, A, &i_one, &i_one, descA,
            B, &i_one, &i_one, descB, &zero, C, &i_one, &i_one, descC);
    if (iam == 0) { printf(".. Multiplication A*B=C is done ( p?gemm ) ..\n"); }

/* Compute difference B - inv_A*C (inv_A = transpose(A) because A is orthonormal) */
    pdgemm_("T", "N", &n, &n, &n, &one, A, &i_one, &i_one, descA,
            C, &i_one, &i_one, descC, &negone, B, &i_one, &i_one, descB);
    if (iam == 0) { printf(".. Difference is computed ( p?gemm ) ..\n"); }

/* Compute norm of B - inv_A*C (which is contained in B) */
    diffnorm = pdlange_("I", &n, &n, B, &i_one, &i_one, descB, work);
    free(work);
    if (iam == 0) { printf(".. Norm of the difference B-inv_A*C is computed ( p?lange ) ..\n"); }

/* Print results */
    if (iam == 0) {
        printf(".. Solutions are compared ..\n");
        printf("== Results ==\n");
        printf("||A|| = %03.11f\n", anorm);
        printf("||B|| = %03.11f\n", bnorm);
        printf("=== END OF EXAMPLE =====================\n");
    }

/* Compute machine epsilon */
    eps = pdlamch_(&ictxt, "e");

/* Compute residual */
    residual = diffnorm / (two * anorm * bnorm * eps);

/* Destroy arrays */
    free(A);
    free(B);
    free(C);

/* Destroy process grid */
    blacs_gridexit_(&ictxt);
    blacs_exit_(&i_zero);

/* Check if residual passed or failed the threshold */
    if ((iam == 0) && (thresh >= zero) && !(residual <= thresh)) {
        printf("FAILED. Residual = %05.16f\n", residual);
        return 1;
    } else {
        return 0;
    }

/*========================================================================
  ====== End of PBLAS Level 3 example ====================================
  ======================================================================*/
}
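/* For completeness, the file-scope definitions this example reads but does
   not declare inside main, following the conventions of Intel's pblas3ex
   sample; treat the exact header names as assumptions for other toolchains. */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <mkl_scalapack.h>

typedef MKL_INT MDESC[9];   /* ScaLAPACK array descriptor */
#define MAX(a,b) ((a) > (b) ? (a) : (b))

const MKL_INT i_negone = -1, i_zero = 0, i_one = 1, i_four = 4;
const double zero = 0.0, one = 1.0, two = 2.0, negone = -1.0;
const char trans = 'N';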