void blacs_gridinit_nektar(int *BLACS_PARAMS, int *DESCA, int *DESCB){ int i,j,k; /* BLACS_PARAMS: [0] = ictxt; [1] = my_proc; [2] = total_procs; [3] = Nproc_row; [4] = Nproc_col; [5] = my_row; [6] = my_col; [7] = Global_rows; [8] = Global_columns; [9] = Block_Size_row; [10] = Block_Size_col; [11] = LOC_rows; [12] = LOC_columns; */ blacs_pinfo_( i, j); BLACS_PARAMS[1] = i; BLACS_PARAMS[2] = j; blacs_get_( -1, 0, k); // <- LG check the arguments of this call BLACS_PARAMS[0] = k; i = BLACS_PARAMS[3]; j = BLACS_PARAMS[4]; blacs_gridinit_( BLACS_PARAMS[0], "Row", i, j); blacs_gridinfo_( BLACS_PARAMS[0] ,BLACS_PARAMS[3], BLACS_PARAMS[4], i, j); BLACS_PARAMS[5] = i; BLACS_PARAMS[6] = j; i = 0; BLACS_PARAMS[11] = numroc_(BLACS_PARAMS[7],BLACS_PARAMS[9], BLACS_PARAMS[5],i,BLACS_PARAMS[3]); BLACS_PARAMS[12] = numroc_(BLACS_PARAMS[8],BLACS_PARAMS[10],BLACS_PARAMS[6],i,BLACS_PARAMS[4]); i = 0; j = 0; descinit_(DESCA, BLACS_PARAMS[7], BLACS_PARAMS[8], BLACS_PARAMS[9], BLACS_PARAMS[10], i, j, BLACS_PARAMS[0],BLACS_PARAMS[11], k); if (k != 0) fprintf(stderr,"blacs_gridinit_nektar: ERROR, descinit(info) = %d \n",k); descinit_(DESCB, BLACS_PARAMS[7], 1, BLACS_PARAMS[9], 1, i, j, BLACS_PARAMS[0],BLACS_PARAMS[11], k); if (k != 0) fprintf(stderr,"blacs_gridinit_nektar: ERROR, descinit(info) = %d \n",k); }
int driver_psgesvd( char jobU, char jobVT, int m, int n, float *A, int ia, int ja, int *descA, float *S_NN, float *U_NN, int iu, int ju, int *descU, float *VT_NN, int ivt, int jvt, int *descVT, double *MPIelapsedNN){ float *Acpy=NULL, *work=NULL; int lwork; /**/ int ione=1; /**/ int nprow, npcol, myrow, mycol; int mpA, nqA; int ictxt, nbA, rsrcA, csrcA; int ctxt_ = 1, nb_ = 5, rsrc_ = 6, csrc_ = 7; int infoNN; double MPIt1, MPIt2; /**/ ictxt = descA[ctxt_]; Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol ); nbA = descA[nb_]; rsrcA = descA[rsrc_] ; csrcA = descA[csrc_] ; mpA = numroc_( &m , &nbA, &myrow, &rsrcA, &nprow ); nqA = numroc_( &n , &nbA, &mycol, &csrcA, &npcol ); Acpy = (float *)calloc(mpA*nqA,sizeof(float)) ; if (Acpy==NULL){ printf("error of memory allocation Acpy on proc %dx%d\n",myrow,mycol); exit(0); } pslacpy_( "All", &m, &n, A, &ione, &ione, descA, Acpy, &ione, &ione, descA ); work = (float *)calloc(1,sizeof(float)) ; if (work==NULL){ printf("error of memory allocation for work on proc %dx%d (1st time)\n",myrow,mycol); exit(0); } lwork=-1; psgesvd_( &jobU, &jobVT, &m, &n, Acpy, &ione, &ione, descA, S_NN, U_NN, &ione, &ione, descU, VT_NN, &ione, &ione, descVT, work, &lwork, &infoNN); lwork = (int) (work[0]); free(work); work = (float *)calloc(lwork,sizeof(float)) ; if (work==NULL){ printf("error of memory allocation work on proc %dx%d\n",myrow,mycol); exit(0); } /**/ MPIt1 = MPI_Wtime(); /**/ psgesvd_( &jobU, &jobVT, &m, &n, Acpy, &ione, &ione, descA, S_NN, U_NN, &ione, &ione, descU, VT_NN, &ione, &ione, descVT, work, &lwork, &infoNN); /**/ MPIt2 = MPI_Wtime(); (*MPIelapsedNN)=MPIt2-MPIt1; /**/ free(work); free(Acpy); return infoNN; }
/// /// This code generates a PdgesvdArgs parameter block that can be used to drive /// pdgesvdSlave2() when there is no SciDB application to provide the info. /// It makes up parameters for a pdgesvd call that are appropriate to the /// processor grid and order of matrix being decomposed /// scidb::PdgesvdArgs pdgesvdGenTestArgs(slpp::int_t ICTXT, slpp::int_t NPROW, slpp::int_t NPCOL, slpp::int_t MYPROW, slpp::int_t MYPCOL, slpp::int_t MYPNUM, slpp::int_t order) { scidb::PdgesvdArgs result; // hard-code a problem based on order and fixed block size const slpp::int_t M=order; const slpp::int_t N=order; const slpp::int_t MIN_MN=order; const slpp::int_t BLKSZ=slpp::SCALAPACK_EFFICIENT_BLOCK_SIZE; // we are making up an array descriptor, not receiving one // as is normal for functions in a xxxxSlave.cpp file. It is only because // its a test routine that SCALAPACK_EFFICIENT_BLOCK_SIZE is referenced here // Normally it is only used at the xxxxxPhysical.cpp operator level. const slpp::int_t one = 1 ; const char jobU = 'V'; const char jobVT = 'V'; // create ScaLAPACK array descriptors const slpp::int_t RSRC = 0 ; // LLD(A) slpp::int_t LLD_A = std::max(one, numroc_( order, BLKSZ, MYPROW, RSRC, NPROW )); // LLD(VT) slpp::int_t LLD_VT = std::max(one, numroc_( order, BLKSZ, MYPROW, RSRC, NPROW )); // WARNING -- note I never checked INFO from descinits !! slpp::int_t INFO = 0; slpp::desc_t DESC_A; descinit_(DESC_A, order, order, BLKSZ, BLKSZ, 0, 0, ICTXT, LLD_A, INFO); if (INFO != 0) throw("pdgesvdGenTestArgs: unexpected runtime error"); slpp::desc_t DESC_U; descinit_(DESC_U, order, order, BLKSZ, BLKSZ, 0, 0, ICTXT, LLD_A, INFO); if (INFO != 0) throw("pdgesvdGenTestArgs: unexpected runtime error"); slpp::desc_t DESC_VT; descinit_(DESC_VT, order, order, BLKSZ, BLKSZ, 0, 0, ICTXT, LLD_VT, INFO); if (INFO != 0) throw("pdgesvdGenTestArgs: unexpected runtime error"); // S is different: global, not distributed, so its LLD(S) == LEN(S) slpp::desc_t DESC_S; descinit_(DESC_S, MIN_MN, 1, BLKSZ, BLKSZ, 0, 0, ICTXT, MIN_MN, INFO); if (INFO != 0) throw("pdgesvdGenTestArgs: unexpected runtime error"); pdgesvdMarshallArgs(&result, NPROW, NPCOL, MYPROW, MYPCOL, MYPNUM, jobU, jobVT, M, N, NULL /*A*/, one, one, DESC_A, NULL /*S*/, NULL /*U*/, one, one, DESC_U, NULL /*VT*/, one, one, DESC_VT); return result; }
float verif_representativity(int m, int n, float *A, int ia, int ja, int *descA, float *U, int iu, int ju, int *descU, float *VT, int ivt, int jvt, int *descVT, float *S){ float *Acpy=NULL, *Ucpy=NULL; int nprow, npcol, myrow, mycol; int min_mn, max_mn, mpA, pcol, localcol, i, nqA; int ictxt, nbA, rsrcA, csrcA, nbU, rsrcU, csrcU, mpU, nqU; int ctxt_ = 1, nb_ = 5, rsrc_ = 6, csrc_ = 7; int izero = 0, ione = 1; float *wwork=NULL; float tmone= -1.0e+00, tpone= +1.0e+00; float residF, AnormF; min_mn = min(m,n); max_mn = max(m,n); ictxt = descA[ctxt_]; Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol ); nbA = descA[nb_]; rsrcA = descA[rsrc_] ; csrcA = descA[csrc_] ; mpA = numroc_( &m , &nbA, &myrow, &rsrcA, &nprow ); nqA = numroc_( &n , &nbA, &mycol, &csrcA, &npcol ); Acpy = (float *)calloc(mpA*nqA,sizeof(float)) ; if (Acpy==NULL){ printf("error of memory allocation Acpy on proc %dx%d\n",myrow,mycol); exit(0); } pslacpy_( "All", &m, &n, A, &ia, &ja, descA, Acpy, &ia, &ja, descA ); nbU = descU[nb_]; rsrcU = descU[rsrc_] ; csrcU = descU[csrc_] ; mpU = numroc_( &m , &nbU, &myrow, &rsrcU, &nprow ); nqU = numroc_( &min_mn, &nbU, &mycol, &csrcU, &npcol ); Ucpy = (float *)calloc(mpU*nqU,sizeof(float)) ; if (Ucpy==NULL){ printf("error of memory allocation Ucpy on proc %dx%d\n",myrow,mycol); exit(0); } pslacpy_( "All", &m, &min_mn, U, &iu, &ju, descU, Ucpy, &iu, &ju, descU ); AnormF = pslange_( "F", &m, &n, A, &ia, &ja, descA, wwork); for (i=1;i<min_mn+1;i++){ pcol = indxg2p_( &i, &(descU[5]), &izero, &izero, &npcol ); localcol = indxg2l_( &i, &(descU[5]), &izero, &izero, &npcol ); if( mycol==pcol ) sscal_( &mpA, &(S[i-1]), &(Ucpy[ ( localcol-1 )*descU[8] ]), &ione ); } psgemm_( "N", "N", &m, &n, &min_mn, &tpone, Ucpy, &iu, &ju, descU, VT, &ivt, &jvt, descVT, &tmone, Acpy, &ia, &ja, descA ); residF = pslange_( "F", &m, &n, Acpy, &ione, &ione, descA, wwork); residF = residF/AnormF/((float) max_mn); free(Ucpy); free(Acpy); return residF; }
float verif_repres_VN(int m, int n, float *A, int ia, int ja, int *descA, float *U, int iu, int ju, int *descU, float *S){ float *VTcpy=NULL; int nprow, npcol, myrow, mycol; int min_mn, max_mn, mpA, prow, localcol, i, nqA; int ictxt, nbA, rsrcA, csrcA, mpVT, nqVT, descVTcpy[9], itemp, ivtcpy, jvtcpy; int ctxt_ = 1, nb_ = 5, rsrc_ = 6, csrc_ = 7; int izero = 0, info; float tpone= +1.0e+00, tzero= +0.0e+00; float verif_repres_VN, invStemp; min_mn = min(m,n); max_mn = max(m,n); ictxt = descA[ctxt_]; Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol ); nbA = descA[nb_]; rsrcA = descA[rsrc_] ; csrcA = descA[csrc_] ; mpA = numroc_( &m , &nbA, &myrow, &rsrcA, &nprow ); nqA = numroc_( &n , &nbA, &mycol, &csrcA, &npcol ); mpVT = numroc_( &min_mn, &nbA, &myrow, &rsrcA, &nprow ); nqVT = numroc_( &n , &nbA, &mycol, &csrcA, &npcol ); itemp = max( 1, mpVT ); descinit_( descVTcpy, &min_mn, &n, &nbA, &nbA, &rsrcA, &csrcA, &ictxt, &itemp, &info ); ivtcpy = 1; jvtcpy = 1; VTcpy = (float *)calloc(mpVT*nqVT,sizeof(float)) ; if (VTcpy==NULL){ printf("error of memory allocation VTcpy on proc %dx%d\n",myrow,mycol); exit(0); } psgemm_( "T", "N", &min_mn, &n, &m, &tpone, U, &iu, &ju, descU, A, &ia, &ja, descA, &tzero, VTcpy, &ivtcpy, &jvtcpy, descVTcpy ); for (i=1;i<min_mn+1;i++){ prow = indxg2p_( &i, &nbA, &izero, &izero, &nprow ); localcol = indxg2l_( &i, &nbA, &izero, &izero, &nprow ); invStemp = 1/S[i-1]; if( myrow==prow ) sscal_( &nqA, &invStemp, &(VTcpy[localcol-1]), &mpVT ); } verif_repres_VN = verif_orthogonality(min_mn,n,VTcpy, ivtcpy, jvtcpy, descVTcpy); free(VTcpy); return verif_repres_VN; }
float verif_repres_NV(int m, int n, float *A, int ia, int ja, int *descA, float *VT, int ivt, int jvt, int *descVT, float *S){ float *Ucpy=NULL; int nprow, npcol, myrow, mycol; int min_mn, max_mn, mpA, pcol, localcol, i, nqA; int ictxt, nbA, rsrcA, csrcA, mpU, nqU, descUcpy[9], itemp, iucpy, jucpy; int ctxt_ = 1, nb_ = 5, rsrc_ = 6, csrc_ = 7; int izero = 0, ione = 1, info; float tpone= +1.0e+00, tzero= +0.0e+00; float verif_repres_NV, invStemp; min_mn = min(m,n); max_mn = max(m,n); ictxt = descA[ctxt_]; Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol ); nbA = descA[nb_]; rsrcA = descA[rsrc_] ; csrcA = descA[csrc_] ; mpA = numroc_( &m , &nbA, &myrow, &rsrcA, &nprow ); nqA = numroc_( &n , &nbA, &mycol, &csrcA, &npcol ); itemp = max( 1, mpA ); descinit_( descUcpy, &m, &min_mn, &nbA, &nbA, &rsrcA, &csrcA, &ictxt, &itemp, &info ); iucpy = 1; jucpy = 1; mpU = numroc_( &m , &nbA, &myrow, &rsrcA, &nprow ); nqU = numroc_( &min_mn, &nbA, &mycol, &csrcA, &npcol ); Ucpy = (float *)calloc(mpU*nqU,sizeof(float)) ; if (Ucpy==NULL){ printf("error of memory allocation Ucpy on proc %dx%d\n",myrow,mycol); exit(0); } psgemm_( "N", "T", &m, &min_mn, &n, &tpone, A, &ia, &ja, descA, VT, &ivt, &jvt, descVT, &tzero, Ucpy, &iucpy, &jucpy, descUcpy ); for (i=1;i<min_mn+1;i++){ pcol = indxg2p_( &i, &(descUcpy[5]), &izero, &izero, &npcol ); localcol = indxg2l_( &i, &(descUcpy[5]), &izero, &izero, &npcol ); invStemp = 1/S[i-1]; if( mycol==pcol ) sscal_( &mpA, &invStemp, &(Ucpy[ ( localcol-1 )*descUcpy[8] ]), &ione ); } verif_repres_NV = verif_orthogonality(m,min_mn,Ucpy, iucpy, jucpy, descUcpy); free(Ucpy); return verif_repres_NV; }
void dgather(int ictxt, int n, int numc, int nb, double *A, double *A_d, int *descAd){ int RootNodeic, ione=1, izero=0, isRootNode=0, nru, info; int nprow, npcol, myrow, mycol, descA[9], itemp; int i,k; sl_init_(&RootNodeic, &ione, &ione); Cblacs_gridinfo(ictxt, &nprow,&npcol, &myrow, &mycol); if (myrow==0 && mycol ==0){ isRootNode = 1;} if(isRootNode){ nru = numroc_(&n, &n, &myrow, &izero, &nprow); itemp = max(1,nru); descinit_(descA, &n, &numc, &n, &n, &izero, &izero, &RootNodeic, &itemp, &info ); } else{ k=0; for(i=0;i<9;i++){ descA[k]=0; k++; } descA[1]=-1; } pdgemr2d_(&n,&numc,A_d,&ione, &ione, descAd, A, &ione, &ione, descA, &ictxt ); if (isRootNode){ Cblacs_gridexit(RootNodeic); } }
PyObject* get_blacs_local_shape(PyObject *self, PyObject *args) { int ConTxt; int m, n, mb, nb, rsrc, csrc; int nprow, npcol, myrow, mycol; int locM, locN; if (!PyArg_ParseTuple(args, "iiiiiii", &ConTxt, &m, &n, &mb, &nb, &rsrc, &csrc)){ return NULL; } Cblacs_gridinfo_(ConTxt, &nprow, &npcol, &myrow, &mycol); locM = numroc_(&m, &mb, &myrow, &rsrc, &nprow); locN = numroc_(&n, &nb, &mycol, &csrc, &npcol); return Py_BuildValue("(ii)", locM, locN); }
void blacs_pdgetri_nektar(int *BLACS_PARAMS, int *DESCA, int *ipvt, double **inva_LOC){ int row_start = 1, col_start = 1; int lwork,liwork,info = 0; double *work; int *iwork; int M; M = BLACS_PARAMS[7] + ((row_start-1) % BLACS_PARAMS[10]); lwork = BLACS_PARAMS[9]*numroc_( M, BLACS_PARAMS[10], BLACS_PARAMS[5], row_start, BLACS_PARAMS[3]); liwork = BLACS_PARAMS[7]; work = dvector(0,lwork-1); iwork = ivector(0,liwork-1); int i = -1, j = -1; pdgetri_(BLACS_PARAMS[7],*inva_LOC, row_start,col_start,DESCA,ipvt, work,i, iwork,j, info); if (info != 0) fprintf(stderr,"blacs_pdgetri_nektar: ERROR - info = %d \n",info); if ( ((int) work[0]) > lwork){ lwork = (int) work[0]; free(work); work = dvector(0,lwork); } if ( iwork[0] > lwork){ liwork = iwork[0]; free(iwork); iwork = ivector(0,liwork); } pdgetri_(BLACS_PARAMS[7],*inva_LOC, row_start,col_start,DESCA,ipvt, work,lwork, iwork,liwork, info); if (info != 0) fprintf(stderr,"blacs_pdgetri_nektar: ERROR - info = %d \n",info); free(work); free(iwork); }
float verif_orthogonality(int m, int n, float *U, int iu, int ju, int *descU){ float *R=NULL; int nprow, npcol, myrow, mycol; int mpR, nqR, nb, itemp, descR[9], ictxt, info, min_mn, max_mn; int ctxt_ = 1, nb_ = 5; int izero = 0, ione = 1; float *wwork=NULL; float tmone= -1.0e+00, tpone= +1.0e+00, tzero= +0.0e+00; float orthU; min_mn = min(m,n); max_mn = max(m,n); ictxt = descU[ctxt_]; nb = descU[nb_]; Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol ); mpR = numroc_( &min_mn, &nb, &myrow, &izero, &nprow ); nqR = numroc_( &min_mn, &nb, &mycol, &izero, &npcol ); R = (float *)calloc(mpR*nqR,sizeof(float)) ; if (R==NULL){ printf("error of memory allocation R on proc %dx%d\n",myrow,mycol); exit(0); } itemp = max( 1, mpR ); descinit_( descR, &min_mn, &min_mn, &nb, &nb, &izero, &izero, &ictxt, &itemp, &info ); pslaset_( "F", &min_mn, &min_mn, &tzero, &tpone, R, &ione, &ione, descR ); if (m>n) psgemm_( "T", "N", &min_mn, &min_mn, &m, &tpone, U, &iu, &ju, descU, U, &iu, &ju, descU, &tmone, R, &ione, &ione, descR ); else psgemm_( "N", "T", &min_mn, &min_mn, &n, &tpone, U, &iu, &ju, descU, U, &iu, &ju, descU, &tmone, R, &ione, &ione, descR ); orthU = pslange_( "F", &min_mn, &min_mn, R, &ione, &ione, descR, wwork ); orthU = orthU / ((float) max_mn); free(R); return orthU; }
void dmat_ldimget(int *desc, int* nrows, int* ncols) { int M = desc[2]; int N = desc[3]; int Mb = desc[4]; int Nb = desc[5]; int ictxt = desc[1]; int rsrc = desc[6]; int csrc = desc[7]; int nprow, npcol, myprow, mypcol; Cblacs_gridinfo(ictxt, &nprow, &npcol, &myprow, &mypcol); *nrows = numroc_(&M, &Mb, &myprow, &rsrc, &nprow); *ncols = numroc_(&N, &Nb, &mypcol, &csrc, &npcol); if (*nrows < 1 || *ncols < 1) { *nrows = 0; *ncols = 0; } }
int BlacsSystem::MSize(int m) { int irsrc=0; return numroc_(&m,&mb_,&myrow_,&irsrc,&nprow_); }
int main (int argc, char *argv[]) { myscalar *A=NULL, *B=NULL, *Btrue=NULL; int descA[BLACSCTXTSIZE], descB[BLACSCTXTSIZE]; int n; int nb; int locr, locc; int i, j, ii, jj; int *I, *J; int nI, nJ; int ierr; int dummy; int myid, np; int myrow, mycol, nprow, npcol; int ctxt; myreal err; n=1024; /* Size of the problem */ nb=16; /* Blocksize for the 2D block-cyclic distribution */ /* Initialize MPI */ if((ierr=MPI_Init(&argc,&argv))) return 1; myid=-1; if((ierr=MPI_Comm_rank(MPI_COMM_WORLD,&myid))) return 1; np=-1; if((ierr=MPI_Comm_size(MPI_COMM_WORLD,&np))) return 1; /* Initialize the BLACS grid */ nprow=floor(sqrt((float)np)); npcol=np/nprow; blacs_get_(&IZERO,&IZERO,&ctxt); blacs_gridinit_(&ctxt,"R",&nprow,&npcol); blacs_gridinfo_(&ctxt,&nprow,&npcol,&myrow,&mycol); /* A is a dense n x n distributed Toeplitz matrix */ if(myid<nprow*npcol) { locr=numroc_(&n,&nb,&myrow,&IZERO,&nprow); locc=numroc_(&n,&nb,&mycol,&IZERO,&npcol); A=new myscalar[locr*locc]; dummy=std::max(1,locr); descinit_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr); for(i=1;i<=locr;i++) for(j=1;j<=locc;j++) { ii=indxl2g_(&i,&nb,&myrow,&IZERO,&nprow); jj=indxl2g_(&j,&nb,&mycol,&IZERO,&npcol); // Toeplitz matrix from Quantum Chemistry. myreal pi=3.1416, d=0.1; A[locr*(j-1)+(i-1)]=ii==jj?std::pow(pi,2)/6.0/std::pow(d,2):std::pow(-1.0,ii-jj)/std::pow((myreal)ii-jj,2)/std::pow(d,2); } } else { descset_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE); } /* Initialize the solver and set parameters */ StrumpackDensePackage<myscalar,myreal> sdp(MPI_COMM_WORLD); sdp.use_HSS=true; sdp.levels_HSS=4; sdp.min_rand_HSS=64; sdp.lim_rand_HSS=0; sdp.tol_HSS=1e-6; /* Compression */ sdp.compress(A,descA); /* Accuracy checking */ sdp.check_compression(A,descA); /* Element extraction: a bunch of random indices. * Not that duplicates do not matter (the code works). */ nI=1+rand()%n; MPI_Bcast((void*)&nI,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD); I=new int[nI]; if(!myid) for(i=0;i<nI;i++) I[i]=1+rand()%n; MPI_Bcast((void*)I,nI,MPI_INTEGER,IZERO,MPI_COMM_WORLD); nJ=1+rand()%n; MPI_Bcast((void*)&nJ,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD); J=new int[nJ]; if(!myid) for(j=0;j<nJ;j++) J[j]=1+rand()%n; MPI_Bcast((void*)J,nJ,MPI_INTEGER,IZERO,MPI_COMM_WORLD); /* Extraction for the HSS form */ if(myid<nprow*npcol) { locr=numroc_(&nI,&nb,&myrow,&IZERO,&nprow); locc=numroc_(&nJ,&nb,&mycol,&IZERO,&npcol); B=new myscalar[locr*locc](); dummy=std::max(1,locr); descinit_(descB,&nI,&nJ,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr); } else descset_(descB,&nI,&nJ,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE); sdp.extract(A,descA,B,descB,I,nI,J,nJ); /* Extraction from the original matrix just to compare */ sdp.use_HSS=false; if(myid<nprow*npcol) Btrue=new myscalar[locr*locc]; sdp.extract(A,descA,Btrue,descB,I,nI,J,nJ); /* Comparison with elements of input matrix */ if(myid<nprow*npcol){ err=plange('M',nI,nJ,Btrue,IONE,IONE,descB,(myreal*)NULL); for(i=0;i<locr*locc;i++) Btrue[i]-=B[i]; err=plange('M',nI,nJ,Btrue,IONE,IONE,descB,(myreal*)NULL)/err; } if(!myid) std::cout << "Element extraction (" << nI << "x" << nJ << " submatrix): maximum relative error max ||A(I,J)-HSS(I,J)||//||A(I,J)|| = " << err << std::endl << std::endl; /* Statistics */ sdp.print_statistics(); /* Clean-up */ delete[] A; delete[] B; delete[] Btrue; delete[] I; delete[] J; /* The end */ MPI_Finalize(); return 0; }
void test_gemr2d(int M, int N) { int repeat = 10; int32_t one = 1; int32_t isrc = 0; int32_t num_ranks; MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); int32_t rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank); int32_t bs_row_A = M / num_ranks + std::min(1, M % num_ranks); int32_t bs_col_A = 1; int32_t bs_row_B = 1; int32_t bs_col_B = 1; int32_t blacs_handler = Csys2blacs_handle(MPI_COMM_WORLD); int32_t context1 = blacs_handler; int32_t context2 = blacs_handler; /* create BLACS context */ Cblacs_gridinit(&context1, "C", num_ranks, 1); /* get row and column ranks */ int32_t rank_row1, rank_col1; Cblacs_gridinfo(context1, &num_ranks, &one, &rank_row1, &rank_col1); /* get local number of rows and columns of a matrix */ int32_t num_rows_local1, num_cols_local1; num_rows_local1 = numroc_(&M, &bs_row_A, &rank_row1, &isrc, &num_ranks); num_cols_local1 = numroc_(&N, &bs_col_A, &rank_col1, &isrc, &one); Cblacs_gridinit(&context2, "C", 1, num_ranks); int32_t rank_row2, rank_col2; Cblacs_gridinfo(context2, &one, &num_ranks, &rank_row2, &rank_col2); int32_t num_rows_local2, num_cols_local2; num_rows_local2 = numroc_(&M, &bs_row_B, &rank_row2, &isrc, &one); num_cols_local2 = numroc_(&N, &bs_col_B, &rank_col2, &isrc, &num_ranks); if (rank == 0) { printf("local dimensions of A: %i x %i\n", num_rows_local1, num_cols_local1); printf("local dimensions of B: %i x %i\n", num_rows_local2, num_cols_local2); } int32_t descA[9], descB[9], info; descinit_(descA, &M, &N, &bs_row_A, &bs_col_A, &isrc, &isrc, &context1, &num_rows_local1, &info); descinit_(descB, &M, &N, &bs_row_B, &bs_col_B, &isrc, &isrc, &context2, &num_rows_local2, &info); std::vector<double_complex> A(num_rows_local1 * num_cols_local1); std::vector<double_complex> B(num_rows_local2 * num_cols_local2, double_complex(0, 0)); std::vector<double_complex> C(num_rows_local1 * num_cols_local1); for (int i = 0; i < num_rows_local1 * num_cols_local1; i++) { A[i] = double_complex(double(rand()) / RAND_MAX, double(rand()) / RAND_MAX); C[i] = A[i]; } double time = -MPI_Wtime(); for (int i = 0; i < repeat; i++) { pzgemr2d_(&M, &N, &A[0], &one, &one, descA, &B[0], &one, &one, descB, &context1); } time += MPI_Wtime(); if (rank == 0) { printf("average time %.4f sec, swap speed: %.4f GB/sec\n", time / repeat, sizeof(double_complex) * repeat * M * N / double(1 << 30) / time); } /* check correctness */ pzgemr2d_(&M, &N, &B[0], &one, &one, descB, &A[0], &one, &one, descA, &context1); for (int i = 0; i < num_rows_local1 * num_cols_local1; i++) { if (std::abs(A[i] - C[i]) > 1e-14) { printf("Fail.\n"); exit(0); } } Cblacs_gridexit(context1); Cblacs_gridexit(context2); Cfree_blacs_system_handle(blacs_handler); }
/// /// @return INFO = the status of the psgemm_() /// slpp::int_t pdgemmSlave(void* bufs[], size_t sizes[], unsigned count) { enum dummy {BUF_ARGS=0, BUF_A, BUF_B, BUF_C, NUM_BUFS }; for(size_t i=0; i < count; i++) { if(DBG) { std::cerr << "pdgemmSlave: buffer at:"<< bufs[i] << std::endl; std::cerr << "pdgemmSlave: bufsize =" << sizes[i] << std::endl; } } if(count < NUM_BUFS) { std::cerr << "pdgemmSlave: master sent " << count << " buffers, but " << NUM_BUFS << " are required." << std::endl; ::exit(99); // something that does not look like a signal } // take a COPY of args (because we will have to patch DESC.CTXT) scidb::PdgemmArgs args = *reinterpret_cast<PdgemmArgs*>(bufs[BUF_ARGS]) ; if(DBG) { std::cerr << "pdgemmSlave: args {" << std::endl ; std::cerr << args << std::endl; std::cerr << "}" << std::endl ; } // set up the scalapack grid if(DBG) std::cerr << "pdgemmSlave: NPROW:"<<args.NPROW<<" NPCOL:"<<args.NPCOL<<std::endl; slpp::int_t ICTXT=-1; // will be overwritten by sl_init // call scalapack tools routine to initialize a scalapack grid and give us its // context sl_init_(ICTXT/*out*/, args.NPROW/*in*/, args.NPCOL/*in*/); slpp::int_t NPROW=-1, NPCOL=-1, MYPROW=-1, MYPCOL=-1, MYPNUM=-1; // illegal vals getSlaveBLACSInfo(ICTXT/*in*/, NPROW, NPCOL, MYPROW, MYPCOL, MYPNUM); if(NPROW != args.NPROW || NPCOL != args.NPCOL || MYPROW != args.MYPROW || MYPCOL != args.MYPCOL || MYPNUM != args.MYPNUM){ if(DBG) { std::cerr << "scalapack general parameter mismatch" << std::endl; std::cerr << "args NPROW:"<<args.NPROW<<" NPCOL:"<<args.NPCOL << "MYPROW:"<<args.MYPROW<<" MYPCOL:"<<args.MYPCOL<<"MYPNUM:"<<MYPNUM << std::endl; std::cerr << "ScaLAPACK NPROW:"<<NPROW<<" NPCOL:"<<NPCOL << "MYPROW:"<<MYPROW<<" MYPCOL:"<<MYPCOL<<"MYPNUM:"<<MYPNUM << std::endl; } } const slpp::int_t one = 1 ; const slpp::int_t LTD_A = std::max(one, numroc_( args.A.DESC.N, args.A.DESC.NB, MYPCOL, /*CSRC_A*/0, NPCOL )); const slpp::int_t LTD_B = std::max(one, numroc_( args.B.DESC.N, args.B.DESC.NB, MYPCOL, /*CSRC_B*/0, NPCOL )); const slpp::int_t LTD_C = std::max(one, numroc_( args.C.DESC.N, args.C.DESC.NB, MYPCOL, /*CSRC_C*/0, NPCOL )); if(DBG) { std::cerr << "##################################################" << std::endl; std::cerr << "####pdgemmSlave##################################" << std::endl; std::cerr << "one:" << one << std::endl; std::cerr << "args.A.DESC.MB:" << args.A.DESC.MB << std::endl; std::cerr << "MYPROW:" << MYPROW << std::endl; std::cerr << "NPROW:" << NPROW << std::endl; } // size check args SLAVE_ASSERT_ALWAYS( sizes[BUF_ARGS] >= sizeof(PdgemmArgs)); // size check A,B,C -- debugs first slpp::int_t SIZE_A = args.A.DESC.LLD * LTD_A ; slpp::int_t SIZE_B = args.B.DESC.LLD * LTD_B ; slpp::int_t SIZE_C = args.C.DESC.LLD * LTD_C ; if(DBG) { if(sizes[BUF_A] != SIZE_A *sizeof(double)) { std::cerr << "sizes[BUF_A]:" << sizes[BUF_A] << " != args.A.DESC.LLD:" << args.A.DESC.LLD << "* LTD_A" << LTD_A << "*" << sizeof(double) << std::endl; } if(sizes[BUF_B] != SIZE_B *sizeof(double)) { std::cerr << "sizes[BUF_B]:" << sizes[BUF_B] << " != args.B.DESC.LLD:" << args.B.DESC.LLD << "* LTD_B" << LTD_B << "*" << sizeof(double) << std::endl; } if(sizes[BUF_C] != SIZE_C *sizeof(double)) { std::cerr << "sizes[BUF_C]:" << sizes[BUF_C] << " != args.C.DESC.LLD:" << args.C.DESC.LLD << "* LTD_C" << LTD_C << "*" << sizeof(double) << std::endl; } } SLAVE_ASSERT_ALWAYS(sizes[BUF_A] >= SIZE_A * sizeof(double)); SLAVE_ASSERT_ALWAYS(sizes[BUF_B] >= SIZE_B * sizeof(double)); SLAVE_ASSERT_ALWAYS(sizes[BUF_C] >= SIZE_C * sizeof(double)); // sizes are correct, give the pointers their names double* A = reinterpret_cast<double*>(bufs[BUF_A]) ; double* B = reinterpret_cast<double*>(bufs[BUF_B]) ; double* C = reinterpret_cast<double*>(bufs[BUF_C]) ; // debug that the input is readable and show its contents if(DBG) { for(int ii=0; ii < SIZE_A; ii++) { std::cerr << "Pgrid("<< MYPROW << "," << MYPCOL << ") A["<<ii<<"] = " << A[ii] << std::endl; } for(int ii=0; ii < SIZE_B; ii++) { std::cerr << "Pgrid("<< MYPROW << "," << MYPCOL << ") B["<<ii<<"] = " << B[ii] << std::endl; } for(int ii=0; ii < SIZE_C; ii++) { std::cerr << "Pgrid("<< MYPROW << "," << MYPCOL << ") C["<<ii<<"] = " << C[ii] << std::endl; } } // ScaLAPACK: the DESCS are complete except for the correct context args.A.DESC.CTXT= ICTXT ; // (no DESC for S) args.B.DESC.CTXT= ICTXT ; args.C.DESC.CTXT= ICTXT ; if(true || DBG) { // we'll leave this on in Cheshire.0 and re-evaluate later std::cerr << "pdgemmSlave: argsBuf is: {" << std::endl; std::cerr << args << std::endl; std::cerr << "}" << std::endl << std::endl; std::cerr << "pdgemmSlave: calling pdgemm_ for computation, with args:" << std::endl ; std::cerr << "TRANSA: " << args.TRANSA << ", TRANSB: " << args.TRANSB << ", M: " << args.M << ", N: " << args.N << ", K: " << args.K << std::endl; std::cerr << "ALPHA: " << args.ALPHA << std::endl; std::cerr << "A: " << (void*)(A) << ", A.I: " << args.A.I << ", A.J: " << args.A.J << std::endl; std::cerr << ", A.DESC: " << args.A.DESC << std::endl; std::cerr << "B: " << (void*)(B) << ", B.I: " << args.B.I << ", B.J: " << args.B.J << std::endl; std::cerr << ", B.DESC: " << args.B.DESC << std::endl; std::cerr << "BETA: " << args.BETA << std::endl; std::cerr << "C: " << (void*)(C) << ", C.I: " << args.C.I << ", C.J: " << args.C.J << std::endl; std::cerr << ", C.DESC: " << args.C.DESC << std::endl; } ////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////// pdgemm_( args.TRANSA, args.TRANSB, args.M, args.N, args.K, &args.ALPHA, A, args.A.I, args.A.J, args.A.DESC, B, args.B.I, args.B.J, args.B.DESC, &args.BETA, C, args.C.I, args.C.J, args.C.DESC); if(true || DBG) { // we'll leave this on in Cheshire.0 and re-evaluate later std::cerr << "pdgemmSlave: pdgemm_ complete (pdgemm_ has no result INFO)" << std::endl; } if (DBG) { std::cerr << "pdgemmSlave outputs: {" << std::endl; // debug prints of the outputs: for(int ii=0; ii < SIZE_C; ii++) { std::cerr << " C["<<ii<<"] = " << C[ii] << std::endl; } std::cerr << "}" << std::endl; } // TODO: what is the check on the pdgemm_ (pblas call) for successful completion? if (DBG) std::cerr << "pdgemmSlave returning successfully:" << std::endl; slpp::int_t INFO = 0 ; return INFO ; }
int pdtrans(char *trans, int *m, int *n, int * mb, int *nb, double *a, int *lda, double *beta, double *c__, int *ldc, int *imrow, int *imcol, double *work, int *iwork) { /* System generated locals */ long a_dim1, a_offset, c_dim1, c_offset; int i__1, i__2, i__3, i__4; /* Local variables */ int j1, k1, k2, ml, nl, mp, mq, np, nq, mb0, mb1, mb2, nb0, nb1, nb2, kia, kja, kic, kjc, lbm, lbn, lcm, ldt, lbm0, lbm1, lbm2, lbn0, lbn1, lbn2, igcd; long ipt; int mcol, info, lcmp, lcmq, item, ncol, kmod1, kmod2; double tbeta; int kpcol, mpcol, npcol, mrcol, mycol, kprow, mprow, nprow, mrrow, myrow; /* -- PUMMA Package routine (version 2.1) -- */ /* Jaeyoung Choi, Oak Ridge National Laboratory. */ /* Jack Dongarra, Univ. of Tennessee, Oak Ridge National Laboratory. */ /* David Walker, Oak Ridge National Laboratory. */ /* October 31, 1994. */ /* Purpose */ /* PDTRANS routine is one of the PUMMA package based on block cyclic */ /* data distribution on 2-D process configuration. */ /* It is used for the following matrix transposition, */ /* Form C := A' + beta*C */ /* where beta is a scalar, and A and C are matrices, with A an M by N */ /* matrix (globally), and C an N by M matrix (globally). */ /* Parameters */ /* TRANS - (input) CHARACTER*1 */ /* TRANS specifies whether A is transposed or conjugate */ /* transposed. */ /* TRANS = 'T', transpose; */ /* TRANS = 'C', conjugate transpose. */ /* M - (input) INTEGER */ /* M specifies the (global) number of rows of the matrix A and */ /* the (global) number of rows of the matrix C. M >= 0. */ /* N - (input) INTEGER */ /* N specifies the (global) number of columns of the matrix A */ /* and columns of the matrix B. N >= 0. */ /* MB - (input) INTEGER */ /* MB specifies the row block size of the matrix A and the */ /* column block of the matrix C. MB >= 1. */ /* NB - (input) INTEGER */ /* NB specifies the column block size of the matrix A and the */ /* row block size of the matrix C. NB >= 1. */ /* A - (input) DOUBLE PRECISION array of DIMENSION ( LDA, Nq ). */ /* The leading Mp by Nq part of the array A must contain the */ /* local matrix A. Mp and Nq are local variables */ /* (see description of local parameters). */ /* LDA - (input) INTEGER */ /* The leading dimension of the (local) array A. */ /* LDA >= MAX( 1, Mp ). */ /* BETA - (input) DOUBLE PRECISION */ /* BETA specifies the scalar beta. When BETA is supplied as */ /* zero then C need not be set on input. */ /* C - (input/ouput) DOUBLE PRECISION array of DIMENSION (LDC, Mq). */ /* On entry the leading Np by Mq part of the array C must */ /* contain the local matrix C, except when beta is zero, */ /* in which case C need not be set on entry. */ /* On exit, the array C is overwritten by the Np by Mq matrix */ /* (A'+bata*C). Np and Mq are local variables */ /* (see description of local parameters). */ /* LDC - (input) INTEGER */ /* The leading dimension of the (local) array C. */ /* LDC >= MAX( 1, Np ). */ /* IMROW - (input) INTEGER */ /* IMROW specifies a row of the process template, which holds */ /* the first block of the matrices. 0 <= IMROW < NPROW. */ /* IMCOL - (input) INTEGER */ /* IMCOL specifies a column of the process template, which */ /* holds the first block of the matrices. 0 <= IMCOL < NPCOL. */ /* WORK - (workspace) DOUBLE PRECISION array */ /* See requirements. */ /* IWORK - (workspace) INTEGER array */ /* See requirements. */ /* Local Parameters */ /* LCM = the lowest common multiple of P and Q */ /* LCMP = LCM/P = number of template rows in LCM block */ /* LCMQ = LCM/Q = number of template columns in LCM block */ /* IGCD = the greatest common divisor (GCD) of P and Q */ /* MpxNq = size of (local) matrix A in the process, iam */ /* NpxMq = size of (local) matrix C in the process, iam */ /* KMOD = Define Group I.D. */ /* item = temporal integer parameter */ /* Two buffers for storing A' and T(= subblock of A') */ /* WORK <== A' */ /* WORK(IPT) <== T */ /* Three interger buffers */ /* IWORK(1,k) <== starting point of row subblock of A to send and */ /* C to receive in K2 loop (rowwise communication) */ /* IWORK(2,k) <== starting point of column subblock of A to send in */ /* J1 loop (columnwise communication) */ /* IWORK(3,k) <== starting point of column subblock of C to receive */ /* in J1 loop (columnwise communication) */ /* Requirements (approximate) */ /* Size(IWORK) = 3 x MAX(P, Q) */ /* Size(WORK) = 2 x Ceil(Ceil(M,MB),LCM)xMB x Ceil(Ceil(N,NB),LCM)xNB */ /* Get grid parameters */ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1; a -= a_offset; c_dim1 = *ldc; c_offset = 1 + c_dim1; c__ -= c_offset; --work; --iwork; /* Function Body */ Cblacs_gridinfo(context_1.ictxt, &nprow, &npcol, &myrow, &mycol); /* Test for the input parameters. */ info = 0; if (*trans != 'T' && *trans != 'C') { info = 1; } else if (*m < 0) { info = 2; } else if (*n < 0) { info = 3; } else if (*mb < 1) { info = 4; } else if (*nb < 1) { info = 5; } else if (*lda < 1) { info = 7; } else if (*ldc < 1) { info = 10; } else if (*imrow < 0 || *imrow >= nprow) { info = 11; } else if (*imcol < 0 || *imcol >= npcol) { info = 12; } L10: if (info != 0) { pxerbla( &context_1.ictxt, "PDTRANS", &info ); return 0; } /* Initialize parameters */ mprow = nprow + myrow; mpcol = npcol + mycol; mrrow = (mprow - *imrow) % nprow; mrcol = (mpcol - *imcol) % npcol; lcm = ilcm_(&nprow, &npcol); lcmp = lcm / nprow; lcmq = lcm / npcol; igcd = nprow / lcmq; mp = numroc_(m, mb, &mrrow, &c__0, &nprow); mq = numroc_(m, mb, &mrcol, &c__0, &npcol); np = numroc_(n, nb, &mrrow, &c__0, &nprow); nq = numroc_(n, nb, &mrcol, &c__0, &npcol); i__1 = iceil_(m, mb); lbm = iceil_(&i__1, &lcm); i__1 = iceil_(n, nb); lbn = iceil_(&i__1, &lcm); /* Test for the input parameters again with local parameters */ if (*lda < mp) { info = 7; } else if (*ldc < np) { info = 10; } if (info != 0) { goto L10; } /* Quick return if possible. */ if (*m == 0 || *n == 0) { return 0; } /* At first, scale C with beta if beta != 0.0 & beta != 1.0 */ tbeta = *beta; if (*beta != 0. && *beta != 1.) { i__1 = mq; for (j1 = 1; j1 <= i__1; ++j1) { HPL_dscal( np, *beta, &c__[j1 * c_dim1 + 1], 1 ); /* L20: */ } tbeta = 1.; } commtrb_1.iaz = lcmp * *mb; commtrb_1.jaz = lcmq * *nb; commtrb_1.itz = lcmp * *nb; commtrb_1.jtz = lcmq * *mb; ml = lbm * *mb; nl = lbn * *nb; ipt = (long)ml * (long)nl + 1; ldt = nl; kprow = mrrow + nprow; kpcol = mrcol + npcol; /* Initialize Parameters -- Compute the positions of subblocks */ i__1 = npcol - 1; for (k1 = 0; k1 <= i__1; ++k1) { ncol = (kpcol - k1) % npcol; i__2 = lcmq - 1; for (j1 = 0; j1 <= i__2; ++j1) { item = npcol * j1 + ncol; if (item % nprow == mrrow) { iwork[ncol * 3 + 1] = item / nprow; } /* L30: */ } } i__2 = lcmq - 1; for (j1 = 0; j1 <= i__2; ++j1) { item = (npcol * j1 + mrcol) % nprow; iwork[item * 3 + 2] = j1; iwork[item * 3 + 3] = j1; i__1 = igcd - 1; for (k1 = 1; k1 <= i__1; ++k1) { iwork[(item + nprow - k1) % nprow * 3 + 2] = j1; iwork[(item + k1) % nprow * 3 + 3] = j1; /* L40: */ } } /* Set parameters for efficient copying */ lbm0 = lbm; lbm1 = lbm; lbm2 = lbm; lbn0 = lbn; lbn1 = lbn; lbn2 = lbn; mb0 = *mb; mb1 = *mb; mb2 = *mb; nb0 = *nb; nb1 = *nb; nb2 = *nb; if (nprow == npcol) { lbm0 = 1; lbn0 = 1; mb0 = mp; nb0 = nq; } if (nprow == lcm) { lbm1 = 1; lbn2 = 1; mb1 = mp; nb2 = np; } if (npcol == lcm) { lbn1 = 1; lbm2 = 1; nb1 = nq; mb2 = mq; } /* For each K2 loop (rowwise), Copy A' to WORK & Send it to KTPROC */ /* then, Receive WORK and Copy WORK to C */ kmod1 = (nprow + mrcol - mrrow) % igcd; kmod2 = (igcd - kmod1) % igcd; i__1 = lcmp - 1; for (k2 = 0; k2 <= i__1; ++k2) { /* Copy A' to WORK in the appropriate order & Send it */ k1 = k2 * igcd + kmod1; mcol = (kpcol - k1) % npcol; kia = iwork[mcol * 3 + 1] * *mb; mcol = (mcol + *imcol) % npcol; ncol = (mrcol + k2 * igcd + kmod2) % npcol; kic = iwork[ncol * 3 + 1] * *nb; ncol = (ncol + *imcol) % npcol; i__2 = lcmq - 1; for (j1 = 0; j1 <= i__2; ++j1) { kja = iwork[(mrrow + igcd * j1) % nprow * 3 + 2] * *nb; if (myrow == (myrow + igcd * j1 + kmod1) % nprow && mycol == mcol) { kjc = iwork[(kprow - igcd * j1) % nprow * 3 + 3] * *mb; i__3 = mp - kia; i__4 = nq - kja; dtr2mx_(&a[kia + 1 + (kja + 1) * a_dim1], lda, &tbeta, &c__[ kic + 1 + (kjc + 1) * c_dim1], ldc, &lbm0, &lbn0, & mb0, &nb0, &i__3, &i__4); } else { i__3 = mp - kia; i__4 = nq - kja; dtr2bf_(&a[kia + 1 + (kja + 1) * a_dim1], lda, &work[1], &ldt, &lbm1, &lbn1, &mb1, &nb1, &i__3, &i__4); if (nprow == npcol && *beta == 0. && *ldc == ldt) { i__3 = (myrow + igcd * j1 + kmod1) % nprow; i__4 = (mprow - igcd * j1 - kmod2) % nprow; kjc = iwork[(kprow - igcd * j1) % nprow * 3 + 3] * *mb; #if 0 Cdgesd2d(context_1.ictxt,nl,ml,&work[1],nl,i__3,mcol); Cdgerv2d(context_1.ictxt,nl,ml,&c__[(kjc + 1) * c_dim1 + 1],*ldc,i__4,ncol); #else Cblacs_dSendrecv( context_1.ictxt, nl, ml, &work[1], nl, i__3, mcol, nl, ml, &c__[(kjc + 1) * c_dim1 + 1], *ldc, i__4, ncol ); #endif } else { i__3 = (myrow + igcd * j1 + kmod1) % nprow; i__4 = (mprow - igcd * j1 - kmod2) % nprow; #if 0 Cdgesd2d(context_1.ictxt,nl,ml,&work[1],nl,i__3,mcol); Cdgerv2d(context_1.ictxt,nl,ml,&work[ipt],nl, i__4,ncol); #else Cblacs_dSendrecv( context_1.ictxt, nl, ml, &work[1], nl, i__3, mcol, nl, ml, &work[ipt], nl, i__4, ncol ); #endif kjc = iwork[(kprow - igcd * j1) % nprow * 3 + 3] * *mb; i__3 = np - kic; i__4 = mq - kjc; dmv2mx_(&work[ipt], &ldt, &tbeta, &c__[kic + 1 + (kjc + 1) * c_dim1], ldc, &lbn2, &lbm2, &nb2, &mb2, &i__3, &i__4); } } } } return 0; } /* pdtrans_ */
int main() { const MKL_INT m = 1000; const MKL_INT k = 100000; const MKL_INT n = 1000; const MKL_INT nb = 100; const MKL_INT nprow = 2; const MKL_INT npcol = 2; MKL_INT iam, nprocs, ictxt, myrow, mycol; MDESC descA, descB, descC, descA_local, descB_local, descC_local; MKL_INT info; MKL_INT a_m_local, a_n_local, b_m_local, b_n_local, c_m_local, c_n_local; MKL_INT a_lld, b_lld, c_lld; blacs_pinfo_( &iam, &nprocs ); blacs_get_( &i_negone, &i_zero, &ictxt ); blacs_gridinit_( &ictxt, "R", &nprow, &npcol ); blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol ); double *a = 0; double *b = 0; double *c = 0; if (iam==0) { a = gen_a(m, k); b = gen_b(k, n); c = (double*)calloc(m*n, sizeof(double)); puts("a="); print(a, m, k); puts("b="); print(b, k, n); } a_m_local = numroc_( &m, &nb, &myrow, &i_zero, &nprow ); a_n_local = numroc_( &k, &nb, &mycol, &i_zero, &npcol ); b_m_local = numroc_( &k, &nb, &myrow, &i_zero, &nprow ); b_n_local = numroc_( &n, &nb, &mycol, &i_zero, &npcol ); c_m_local = numroc_( &m, &nb, &myrow, &i_zero, &nprow ); c_n_local = numroc_( &n, &nb, &mycol, &i_zero, &npcol ); double *A = (double*) calloc( a_m_local * a_n_local, sizeof( double ) ); double *B = (double*) calloc( b_m_local * b_n_local, sizeof( double ) ); double *C = (double*) calloc( c_m_local * c_n_local, sizeof( double ) ); a_lld = MAX( a_m_local, 1 ); b_lld = MAX( b_m_local, 1 ); c_lld = MAX( c_m_local, 1 ); if (iam==0) { printf("a_m_local = %d\ta_n_local = %d\tb_m_local = %d\tb_n_local = %d\tc_m_local = %d\tc_n_local = %d\n", a_m_local, a_n_local, b_m_local, b_n_local, c_m_local, c_n_local); printf("a_lld = %d\tb_lld = %d\tc_lld = %d\n", a_lld, b_lld, c_lld); } descinit_( descA_local, &m, &k, &m, &k, &i_zero, &i_zero, &ictxt, &m, &info ); descinit_( descB_local, &k, &n, &k, &n, &i_zero, &i_zero, &ictxt, &k, &info ); descinit_( descC_local, &m, &n, &m, &n, &i_zero, &i_zero, &ictxt, &m, &info ); descinit_( descA, &m, &k, &nb, &nb, &i_zero, &i_zero, &ictxt, &a_lld, &info ); descinit_( descB, &k, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &b_lld, &info ); descinit_( descC, &m, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &c_lld, &info ); printf("Rank %d: start distribute data\n", iam); pdgeadd_( &trans, &m, &k, &one, a, &i_one, &i_one, descA_local, &zero, A, &i_one, &i_one, descA ); pdgeadd_( &trans, &k, &n, &one, b, &i_one, &i_one, descB_local, &zero, B, &i_one, &i_one, descB ); printf("Rank %d: finished distribute data\n", iam); if (iam==0) { puts("a"); print(A, a_m_local, a_n_local); puts("b"); print(B, b_m_local, b_n_local); } pdgemm_( "N", "N", &m, &n, &k, &one, A, &i_one, &i_one, descA, B, &i_one, &i_one, descB, &zero, C, &i_one, &i_one, descC ); printf("Rank %d: finished dgemm\n", iam); if (iam == 0) { puts("c"); print(C, c_m_local, c_n_local); } pdgeadd_( &trans, &m, &n, &one, C, &i_one, &i_one, descC, &zero, c, &i_one, &i_one, descC_local); if (iam==0) { puts("global c"); print(c, m, n); } free(A); free(B); free(C); if (iam==0) { free(a); free(b); free(c); } blacs_gridexit_( &ictxt ); blacs_exit_( &i_zero ); }
void ddistr(int ictxt, int n, int numc, int nb, double *A , double *A_d, int *descAd ){ int RootNodeic, ione=1, izero=0, isRootNode=0, nru, info; int nprow, npcol, myrow, mycol,descA[9], itemp; int i,k; /* #ifdef NOUNDERLAPACK sl_init__(&RootNodeic,&ione, &ione); #else sl_init__(&RootNodeic,&ione, &ione); #endif */ sl_init_(&RootNodeic,&ione, &ione); Cblacs_gridinfo(ictxt, &nprow, &npcol, &myrow, &mycol); //printf("nprow=%d, npcol=%d, myrow=%d, mycol=%d\n",nprow,npcol,myrow,mycol); //printf("nb=%d\n",nb); if (myrow==0 && mycol==0) { isRootNode=1;} if (isRootNode){ //printf("root entro aca...\n"); nru = numroc_(&n, &n, &myrow,&izero, &nprow ); //printf("root paso numroc\n"); itemp = max(1, nru); descinit_(descA, &n, &numc, &n, &n, &izero, &izero, &RootNodeic, &itemp, &info); //printf("root paso descinit\n"); } else{ //printf("yo entre aca\n"); k=0; for(i=0;i<9;i++){ descA[k]=0; k++; } descA[1]=-1; } //printf("inicio de cosas para todos\n"); nru = numroc_(&n, &nb, &myrow, &izero, &nprow); //printf("todos pasan numroc\n"); itemp = max(1,nru); descinit_(descAd, &n, &numc, &nb, &nb, &izero, &izero, &ictxt,&itemp, &info); //printf("todos pasan descinit\n"); pdgemr2d_( &n, &numc, A, &ione, &ione, descA, A_d, &ione, &ione, descAd, &ictxt); //printf("todos pasan pdgemr2d\n"); if (isRootNode){ //printf("RootNodeic=%d\n",RootNodeic); Cblacs_gridexit(RootNodeic); //printf("root paso gridexit\n"); } }
static int CheckNode(int imrow, int imcol, int nmat, int *mval, int *nval, int nbmat, int *mbval, int *nbval, int myrow, int mycol, int nprow, int npcol, long *maxMem) { int i__, ii, m, n, mb, nb, ierr[1]; int lcm, np0, nq0, mp0, mq0, mg, ng, np, nq, mp, mq; long isw, ipw, ipiw, ipa, ipc; *maxMem = 0; for (i__ = 0; i__ < nmat; ++i__) { m = mval[i__]; n = nval[i__]; /* Make sure matrix information is correct */ ierr[0] = 0; if (m < 1) { ierr[0] = 1; } else if (n < 1) { ierr[0] = 1; } if (ierr[0] > 0) { continue; } for (ii = 0; ii < nbmat; ++ii) { /* Loop over different block sizes */ mb = mbval[ii]; nb = nbval[ii]; /* Make sure blocking sizes are legal */ ierr[0] = 0; if (mb < 1) { ierr[0] = 1; } else if (nb < 1) { ierr[0] = 1; } /* Make sure no one had error */ if (ierr[0] > 0) { continue; } mp = numroc_(&m, &mb, &myrow, &imrow, &nprow); mq = numroc_(&m, &mb, &mycol, &imcol, &npcol); np = numroc_(&n, &nb, &myrow, &imrow, &nprow); nq = numroc_(&n, &nb, &mycol, &imcol, &npcol); mg = iceil_(&m, &mb); ng = iceil_(&n, &nb); mp0 = iceil_(&mg, &nprow) * mb; mq0 = iceil_(&mg, &npcol) * mb; np0 = iceil_(&ng, &nprow) * nb; nq0 = iceil_(&ng, &npcol) * nb; lcm = ilcm_(&nprow, &npcol); ipc = 1; ipa = ipc + (long)np0 * (long)mq0; ipiw = (long)mp0 * (long)nq0 + ipa; ipw = ipiw; isw = ipw + (long)(iceil_(&mg, &lcm) << 1) * (long)mb * (long)iceil_(&ng, &lcm) * (long)nb; if (*maxMem < isw) *maxMem = isw; } } return 0; }
// torus-wrap distribution used in EigenK and Elemental distributed_matrix(int m_global, int n_global, const grid<rokko::eigen_sx>& g_in) : m_global(m_global), n_global(n_global), g(g_in), myrank(g_in.myrank), nprocs(g_in.nprocs), myrow(g_in.myrow), mycol(g_in.mycol), nprow(g_in.nprow), npcol(g_in.npcol) { int n = m_global; int nx = (n-1)/nprow+1; int i1 = 6, i2 = 16 * 2, i3 = 16 * 4, nm; CSTAB_get_optdim(nx, i1, i2, i3, nm); // return an optimized (possiblly) leading dimension of local block-cyclic matrix to nm. //int para_int = 0; eigen_free_wrapper(para_int); int NB = 64 + 32; int nmz = ((n-1)/nprow+1); nmz = ((nmz-1)/NB+1)*NB+1; int nmw = ((n-1)/npcol+1); nmw = ((nmw-1)/NB+1)*NB+1; nme = ((n-1)/2+1) * 2; int nh = (n-1)/4+1; i1 = 4; int nnh; CSTAB_get_optdim(nh, i1, i2, i3, nnh); // return an optimized (possiblly) leading dimension of local block-cyclic matrix to nm. nnh = 4 * nnh; cout << "nnh=" << nnh << endl; cout << "nm=" << nm << endl; cout << "nmz=" << nmz << endl; cout << "nmw=" << nmw << endl; cout << "nme=" << nme << endl; cout << "nnh=" << nnh << endl; int n1x = ((n-1)/nprocs + 1); int larray = std::max(std::max(std::max(nmz, nm), nnh) * nmw, n*n1x); cout << "n1x=" << n1x << endl; cout << "larray=" << larray << endl; // calculate sizes of my proc's local part of distributed matrix mb = 1; nb = 1; const int ZERO=0, ONE=1; lld = nm; m_local = numroc_( m_global, mb, myrow, ZERO, nprow ); //(m_global + nprow - myrow) / nprow; n_local = numroc_( n_global, nb, mycol, ZERO, npcol ); //(n_global + npcol - mycol) / npcol; //m_local = nm; //n_local = (larray + (nm-1)) / nm; for (int proc=0; proc<nprocs; ++proc) { if (proc == g.myrank) { cout << "proc=" << proc << endl; //cout << " mb=" << mb << " nb=" << nb << endl; //cout << " mA=" << m_local << " nprow=" << g.nprow << endl; //cout << " nA=" << n_local << " npcol=" << g.npcol << endl; cout << "nm(lld)=" << lld << endl; cout << " m_local=" << m_local << " n_local=" << n_local << endl; } MPI_Barrier(MPI_COMM_WORLD); } array = new double[larray]; //m_local * n_local]; if (array == NULL) { cerr << "failed to allocate array." << endl; MPI_Abort(MPI_COMM_WORLD, 3); } for (int ii=0; ii<larray; ++ii) array[ii] = -3; }
sl_int_t mpiCopySlave(void* bufs[], size_t sizes[], unsigned count) { enum dummy {DBG=0}; enum dummy2 {BUF_ARGS=0, BUF_IN, BUF_OUT, NUM_BUFS }; if(DBG) { std::cerr << "mpiCopySlave(): entered" << std::endl; for(size_t i=0; i < count; i++) { std::cerr << "mpiCopySlave: buffer at:"<< bufs[i] << std::endl; std::cerr << "mpiCopySlave: bufsize =" << sizes[i] << std::endl; } } if(count < NUM_BUFS) { std::cerr << "mpiCopySlave: master sent " << count << " buffers, but " << NUM_BUFS << " are required." << std::endl; ::exit(99); // something that does not look like a signal } // take a COPY of args (because we will have to patch DESC.CTXT) scidb::MPICopyArgs args = *reinterpret_cast<MPICopyArgs*>(bufs[BUF_ARGS]) ; if(DBG) { std::cerr << "mpiCopySlave: args --------------------------" << std::endl ; std::cerr << args << std::endl; std::cerr << "mpiCopySlave: args end ----------------------" << std::endl ; } // set up the scalapack grid if(DBG) std::cerr << "##### sl_init() NPROW:"<<args.NPROW<<" NPCOL:"<<args.NPCOL<<std::endl; slpp::int_t ICTXT=-1; // will be overwritten by sl_init // call scalapack tools routine to initialize a scalapack grid and give us its // context sl_init_(ICTXT/*out*/, args.NPROW/*in*/, args.NPCOL/*in*/); sl_int_t NPROW=-1, NPCOL=-1, MYPROW=-1, MYPCOL=-1, MYPNUM=-1; // illegal vals getSlaveBLACSInfo(ICTXT/*in*/, NPROW, NPCOL, MYPROW, MYPCOL, MYPNUM); if(NPROW != args.NPROW || NPCOL != args.NPCOL || MYPROW != args.MYPROW || MYPCOL != args.MYPCOL || MYPNUM != args.MYPNUM){ std::cerr << "scalapack general parameter mismatch:" << std::endl; std::cerr << "args:" << std::endl; std::cerr << "NP=("<<args.NPROW<<", "<<args.NPCOL <<")"<< std::endl; std::cerr << "MYP("<<args.MYPROW<<", "<<args.MYPCOL<<")"<< std::endl; std::cerr << "MYPNUM" <<args.MYPNUM << std::endl; std::cerr << "ScaLAPACK:" << std::endl; std::cerr << "NP=("<<NPROW<<", "<<NPCOL <<")"<< std::endl; std::cerr << "MYP("<<MYPROW<<", "<<MYPCOL<<")"<< std::endl; std::cerr << "MYPNUM" <<MYPNUM << std::endl; ::exit(99); // something that does not look like a signal } const sl_int_t& LLD_IN = args.IN.DESC.LLD ; const sl_int_t one = 1 ; const sl_int_t LTD_IN = std::max(one, numroc_( args.IN.DESC.N, args.IN.DESC.NB, MYPCOL, /*CSRC_IN*/0, NPCOL )); const sl_int_t& MP = LLD_IN ; const sl_int_t& NQ = LTD_IN ; // size check args if( sizes[BUF_ARGS] != sizeof(MPICopyArgs)) { assert(false); // TODO: correct way to fail ::exit(99); // something that does not look like a signal } // size check IN sl_int_t SIZE_IN = MP*NQ ; if( sizes[BUF_IN] != SIZE_IN * sizeof(double)) { std::cerr << "slave: error size mismatch:" << std::endl; std::cerr << "sizes[BUF_IN]" << sizes[BUF_IN] << std::endl; std::cerr << "MP * NQ = " << MP <<"*"<<NQ<<"="<< MP*NQ << std::endl; assert(false); // TODO: correct way to fail ::exit(99); // something that does not look like a signal } // size check OUT sl_int_t SIZE_OUT = SIZE_IN; if( sizes[BUF_OUT] != SIZE_OUT *sizeof(double)) { std::cerr << "sizes[BUF_OUT]:"<<sizes[BUF_OUT]; std::cerr << "MP * NQ = " << MP <<"*"<<NQ<<"="<< MP*NQ << std::endl; assert(false); // TODO: correct way to fail ::exit(99); // something that does not look like a signal } // sizes are correct, give the pointers their names double* IN = reinterpret_cast<double*>(bufs[BUF_IN]) ; double* OUT = reinterpret_cast<double*>(bufs[BUF_OUT]) ; // here's the whole thing: copy IN to OUT // TODO: use memcpy instead for(int ii=0; ii < SIZE_OUT; ii++) { OUT[ii] = IN[ii] ; } return 0; }
int BlacsSystem::NSize(int n) { int icsrc=0; return numroc_(&n,&nb_,&mycol_,&icsrc,&npcol_); }
void pzgecopy_hd( F_CHAR_T TRANS, int *m_in, int *n_in, double *A, int *ia_in, int *ja_in, int *descA, double *dB, int *ib_in, int *jb_in, int *descB ) { /* Copy m by n distributed submatrix from host to GPU */ int m = *m_in; int n = *n_in; int ia = *ia_in; int ja = *ja_in; int ib = *ib_in; int jb = *jb_in; int nprow = 0; int npcol = 0; int myprow = 0; int mypcol = 0; int mmb = 0; int nnb = 0; int istart = 0; int iend = 0; int isize = 0; int Locp = 0; int Locq = 0; int lld = 0; int jstart = 0; int jend = 0; int jsize = 0; int iib = 0; int jjb = 0; cuDoubleComplex *dBptr = 0; double *Btmp = 0; F_CHAR_T NoTrans = "N"; int descBtmp[DLEN_]; int elmSize = sizeof(cuDoubleComplex); double z_one[2]; double z_zero[2]; /* Tuneable parameters */ int mfactor = 4; int nfactor = 4; int rsrc = 0; int csrc = 0; int irsrc = 0; int jcsrc = 0; int info = 0; int notran = 0; int TrA = 0; int lrindx = 0; int lcindx = 0; int nrow = 0; int ncol = 0; int mm = 0; int nn = 0; int iia = 0; int jja = 0; cublasStatus cu_status; z_one[REAL_PART] = 1; z_one[IMAG_PART] = 0; z_zero[REAL_PART] = 0; z_zero[IMAG_PART] = 0; Cblacs_gridinfo( descA[CTXT_], &nprow, &npcol, &myprow, &mypcol ); notran = ( ( TrA = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); /* * check arguments */ if ((m <= 0) || (n <= 0)) { return; }; assert( (1 <= ia) && ((ia + m-1) <= descA[M_] ) ); assert( (1 <= ja) && ((ja + n-1) <= descA[N_] ) ); assert( (1 <= ib) && ((ib + m-1) <= descB[M_] ) ); assert( (1 <= jb) && ((jb + n-1) <= descB[N_] ) ); /* * Create a temp matrix that is aligned to descB. * Assume size is mmb by nnb */ if (notran) { mmb = MIN( m, descB[MB_] * nprow * mfactor); nnb = MIN( n, descB[NB_] * npcol * nfactor); } else { nnb = MIN( m, descB[MB_] * nprow * mfactor); mmb = MIN( n, descB[NB_] * npcol * nfactor); }; mmb = MAX( mmb,1); nnb = MAX( nnb,1); rsrc = indxg2p_( &ib, &descB[MB_], &myprow, &descB[RSRC_], &nprow); csrc = indxg2p_( &jb, &descB[NB_], &mypcol, &descB[CSRC_], &npcol); Locp = numroc_( &mmb, &descB[MB_], &myprow, &rsrc, &nprow ); Locq = numroc_( &nnb, &descB[NB_], &mypcol, &csrc, &npcol ); Btmp = (double*) malloc( MAX(1,(Locp * Locq))*elmSize ); assert( Btmp != 0 ); lld = MAX(1, Locp); descinit_( descBtmp, &mmb, &nnb, &descB[MB_], &descB[NB_], &rsrc, &csrc, &descB[CTXT_], &lld, &info ); assert( info == 0); for( jstart=ja; jstart <= ja + n-1; jstart = jend + 1) { jend = MIN( ja + n -1, jstart + nnb - 1); jsize = jend - jstart + 1; for( istart=ia; istart <= ia + m-1; istart = iend + 1) { iend = MIN( ia + m-1, istart + mmb -1); isize = iend - istart + 1; iia = ia + (istart-1); jja = ja + (jstart-1); iib = 1; jjb = 1; if (notran) { mm = isize; nn = jsize; } else { mm = jsize; nn = isize; }; pzgeadd_( TRAN, &mm, &nn, z_one, A, &iia, &jja, descA, z_zero, Btmp, &iib, &jjb, descBtmp ); /* * find local extent */ if (notran) { iib = ib + (istart-1); jjb = jb + (jstart-1); } else { iib = ib + (jstart-1); jjb = jb + (istart-1); }; if (notran) { nrow = numroc_( &isize, &descB[MB_], &myprow, &rsrc, &nprow); ncol = numroc_( &jsize, &descB[NB_], &mypcol, &csrc, &npcol); } else { nrow = numroc_( &jsize, &descB[MB_], &myprow, &rsrc, &nprow); ncol = numroc_( &isize, &descB[NB_], &mypcol, &csrc, &npcol); }; /* Perform global dB( iib:(iib+isize-1), jjb:(jjb+jsize-1)) <- B(1:isize,1:jsize) Perform local dB( lrindx:(lrindx+nrow-1), lcindx:(lcindx+ncol-1)) <- B(1:nrow, 1:ncol) */ infog2l_( &iib, &jjb, descB, &nprow, &npcol, &myprow, &mypcol, &lrindx, &lcindx, &irsrc, &jcsrc ); dBptr = (cuDoubleComplex *) dB; dBptr = dBptr + INDX2F( lrindx,lcindx, descB[LLD_]); cu_status = cublasSetMatrix( nrow,ncol,elmSize, (cuDoubleComplex *) Btmp, descBtmp[LLD_], dBptr, descB[LLD_] ); assert( cu_status == CUBLAS_STATUS_SUCCESS ); }; }; free( Btmp ); return; }
void CG(StrumpackDensePackage<myscalar,myreal> *sdp, myscalar *X, myscalar *B, int *descVec, int n, int nrhs, int niter, myreal threshold) { /* Conjugate Gradients. Calls sdp.schur_product for matvecs. Fills only the bottom part of X. */ int printit=250; int IA; int ctxt; int nprow, npcol; int myrow, mycol; int rsrc, csrc; int nb; int locr, locc; int i, j; int idummy, ierr; int neff; int it; int desc[BLACSCTXTSIZE], descScal[BLACSCTXTSIZE]; bool ingrid; myreal res; myscalar *x=NULL, *b=NULL; myscalar *r=NULL, *p=NULL, *Ap=NULL; myscalar alpha, rrprev, rrnext; IA=sdp->split_HSS+1; neff=n-IA+1; ctxt=descVec[BLACSctxt]; rsrc=descVec[BLACSrsrc]; csrc=descVec[BLACScsrc]; nb=descVec[BLACSmb]; blacs_gridinfo_(&ctxt,&nprow,&npcol,&myrow,&mycol); ingrid=myrow>=0 && mycol>=0; /* X and B have n rows. We create versions with * the last neff rows only that we pass to SDP. */ if(ingrid) { locr=numroc_(&neff,&nb,&myrow,&rsrc,&nprow); locc=numroc_(&IONE,&nb,&mycol,&csrc,&npcol); idummy=locr>1?locr:1; descinit_(desc,&neff,&IONE,&nb,&nb,&rsrc,&csrc,&ctxt,&idummy,&ierr); x=new myscalar[locr*locc](); b=new myscalar[locr*locc](); r=new myscalar[locr*locc](); p=new myscalar[locr*locc](); Ap=new myscalar[locr*locc](); } else { locr=0; locc=0; descset_(desc,&neff,&IONE,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE); } /* Descriptor for the scalar containing the result of the dot product */ if(ingrid) descinit_(descScal,&IONE,&IONE,&nb,&nb,&IZERO,&IZERO,&ctxt,&IONE,&ierr); else descset_(descScal,&IONE,&IONE,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE); for(j=1;j<=nrhs;j++) { /* One RHS at a time */ if(ingrid) { pgeadd('N',neff,IONE,ONE,B,IA,j,descVec,ZERO,b,IONE,IONE,desc); pgeadd('N',neff,IONE,ONE,X,IA,j,descVec,ZERO,x,IONE,IONE,desc); } /* r = b - A x * If the Schur was explitly in matrix A, the BLAS call would be * pgemm('N','N',neff,IONE,neff,NONE,A,IA,IA,descA,x,IONE,IONE,desc,ONE,r,IONE,IONE,desc); * */ if(ingrid) placpy('N',neff,IONE,b,IONE,IONE,desc,r,IONE,IONE,desc); sdp->schur_product('N',NONE,x,desc,ONE,r,desc); /* p = r */ if(ingrid) placpy('N',neff,IONE,r,IONE,IONE,desc,p,IONE,IONE,desc); /* "Previous" r' * r */ rrprev=ZERO; if(ingrid) pgemm('C','N',IONE,IONE,neff,ONE,r,IONE,IONE,desc,r,IONE,IONE,desc,ZERO,&rrprev,IONE,IONE,descScal); MPI_Bcast((void *)&rrprev,IONE,MY_MPI_REAL,IZERO,MPI_COMM_WORLD); it=1; while(it<=niter) { /* Ap = A * p * If the Schur was explitly in matrix A, the BLAS call would be * pgemm('N','N',neff,IONE,neff,ONE,A,IA,IA,descA,p,IONE,IONE,desc,ZERO,Ap,IONE,IONE,desc); * */ sdp->schur_product('N',ONE,p,desc,ZERO,Ap,desc); /* alpha = r'*r / (p' * A * p) = rrprev/(p' * Ap) */ alpha=ZERO; if(ingrid) pgemm('C','N',IONE,IONE,neff,ONE,p,IONE,IONE,desc,Ap,IONE,IONE,desc,ZERO,&alpha,IONE,IONE,descScal); MPI_Bcast((void *)&alpha,IONE,MY_MPI_REAL,IZERO,MPI_COMM_WORLD); alpha=rrprev/alpha; /* x = x + alpha * p */ for(i=0;i<locr*locc;i++) x[i]+=alpha*p[i]; /* r = r - alpha * Ap */ for(i=0;i<locr*locc;i++) r[i]-=alpha*Ap[i]; /* "Next" r' * r */ rrnext=ZERO; if(ingrid) pgemm('C','N',IONE,IONE,neff,ONE,r,IONE,IONE,desc,r,IONE,IONE,desc,ZERO,&rrnext,IONE,IONE,descScal); MPI_Bcast((void *)&rrnext,IONE,MY_MPI_REAL,IZERO,MPI_COMM_WORLD); /* Residual */ res=sqrt(rrnext.real()); if(it%printit==0) if(!myrow && !mycol) std::cout << "RHS " << j << ": iteration " << it << ", ||Ax-b||/||b||=" << res << std::endl; if(res<threshold) break; /* p = r + rrnext/rrprev * p */ for(i=0;i<locr*locc;i++) p[i]=r[i]+rrnext/rrprev*p[i]; /* "Previous" r' * r */ rrprev=rrnext; it++; } if(it>niter) it=niter; if(it%printit) if(!myrow && !mycol) std::cout << "RHS " << j << ": iteration " << it << ", ||Ax-b||/||b||=" << res << std::endl << std::endl; /* Back to n-sized vector */ if(ingrid) pgeadd('N',neff,IONE,ONE,x,IONE,IONE,desc,ZERO,X,IA,j,descVec); } delete[] b; delete[] x; delete[] r; delete[] p; delete[] Ap; }
int Amesos_Scalapack::RedistributeA( ) { if( debug_ == 1 ) std::cout << "Entering `RedistributeA()'" << std::endl; Time_->ResetStartTime(); Epetra_RowMatrix *RowMatrixA = dynamic_cast<Epetra_RowMatrix *>(Problem_->GetOperator()); EPETRA_CHK_ERR( RowMatrixA == 0 ) ; const Epetra_Map &OriginalMap = RowMatrixA->RowMatrixRowMap() ; int NumberOfProcesses = Comm().NumProc() ; // // Compute a uniform distribution as ScaLAPACK would want it // MyFirstElement - The first element which this processor would have // NumExpectedElemetns - The number of elements which this processor would have // int NumRows_ = RowMatrixA->NumGlobalRows() ; int NumColumns_ = RowMatrixA->NumGlobalCols() ; if ( MaxProcesses_ > 0 ) { NumberOfProcesses = EPETRA_MIN( NumberOfProcesses, MaxProcesses_ ) ; } else { int ProcessNumHeuristic = (1+NumRows_/200)*(1+NumRows_/200); NumberOfProcesses = EPETRA_MIN( NumberOfProcesses, ProcessNumHeuristic ); } if ( debug_ == 1) std::cout << "iam_ = " << iam_ << "Amesos_Scalapack.cpp:171" << std::endl; // // Create the ScaLAPACK data distribution. // The TwoD data distribution is created in a completely different // manner and is not transposed (whereas the SaLAPACK 1D data // distribution was transposed) // if ( debug_ == 1) std::cout << "iam_ = " << iam_ << "Amesos_Scalapack.cpp:163" << std::endl; Comm().Barrier(); if ( TwoD_distribution_ ) { if ( debug_ == 1) std::cout << "iam_ = " << iam_ << "Amesos_Scalapack.cpp:166" << std::endl; Comm().Barrier(); npcol_ = EPETRA_MIN( NumberOfProcesses, EPETRA_MAX ( 2, (int) sqrt( NumberOfProcesses * 0.5 ) ) ) ; nprow_ = NumberOfProcesses / npcol_ ; // // Create the map for FatA - our first intermediate matrix // int NumMyElements = RowMatrixA->RowMatrixRowMap().NumMyElements() ; std::vector<int> MyGlobalElements( NumMyElements ); RowMatrixA->RowMatrixRowMap().MyGlobalElements( &MyGlobalElements[0] ) ; int NumMyColumns = RowMatrixA->RowMatrixColMap().NumMyElements() ; std::vector<int> MyGlobalColumns( NumMyColumns ); RowMatrixA->RowMatrixColMap().MyGlobalElements( &MyGlobalColumns[0] ) ; if ( debug_ == 1) std::cout << "iam_ = " << iam_ << "Amesos_Scalapack.cpp:194" << std::endl; std::vector<int> MyFatElements( NumMyElements * npcol_ ); for( int LocalRow=0; LocalRow<NumMyElements; LocalRow++ ) { for (int i = 0 ; i < npcol_; i++ ){ MyFatElements[LocalRow*npcol_+i] = MyGlobalElements[LocalRow]*npcol_+i; } } Epetra_Map FatInMap( npcol_*NumRows_, NumMyElements*npcol_, &MyFatElements[0], 0, Comm() ); // // Create FatIn, our first intermediate matrix // Epetra_CrsMatrix FatIn( Copy, FatInMap, 0 ); std::vector<std::vector<int> > FatColumnIndices(npcol_,std::vector<int>(1)); std::vector<std::vector<double> > FatMatrixValues(npcol_,std::vector<double>(1)); std::vector<int> FatRowPtrs(npcol_); // A FatRowPtrs[i] = the number // of entries in local row LocalRow*npcol_ + i if ( debug_ == 1) std::cout << "iam_ = " << iam_ << "Amesos_Scalapack.cpp:219" << std::endl; // mypcol_ = iam_%npcol_; myprow_ = (iam_/npcol_)%nprow_; if ( iam_ >= nprow_ * npcol_ ) { myprow_ = nprow_; mypcol_ = npcol_; } // Each row is split into npcol_ rows, with each of the // new rows containing only those elements belonging to // its process column (in the ScaLAPACK 2D process grid) // int MaxNumIndices = RowMatrixA->MaxNumEntries(); int NumIndices; std::vector<int> ColumnIndices(MaxNumIndices); std::vector<double> MatrixValues(MaxNumIndices); if ( debug_ == 1) std::cout << "iam_ = " << iam_ << "Amesos_Scalapack.cpp:232 NumMyElements = " << NumMyElements << std::endl; nb_ = grid_nb_; for( int LocalRow=0; LocalRow<NumMyElements; ++LocalRow ) { RowMatrixA->ExtractMyRowCopy( LocalRow, MaxNumIndices, NumIndices, &MatrixValues[0], &ColumnIndices[0] ); for (int i=0; i<npcol_; i++ ) FatRowPtrs[i] = 0 ; // // Deal the individual matrix entries out to the row owned by // the process to which this matrix entry will belong. // for( int i=0 ; i<NumIndices ; ++i ) { int GlobalCol = MyGlobalColumns[ ColumnIndices[i] ]; int pcol_i = pcolnum( GlobalCol, nb_, npcol_ ) ; if ( FatRowPtrs[ pcol_i ]+1 >= FatColumnIndices[ pcol_i ].size() ) { FatColumnIndices[ pcol_i ]. resize( 2 * FatRowPtrs[ pcol_i ]+1 ); FatMatrixValues[ pcol_i ]. resize( 2 * FatRowPtrs[ pcol_i ]+1 ); } FatColumnIndices[pcol_i][FatRowPtrs[pcol_i]] = GlobalCol ; FatMatrixValues[pcol_i][FatRowPtrs[pcol_i]] = MatrixValues[i]; FatRowPtrs[ pcol_i ]++; } // // Insert each of the npcol_ rows individually // for ( int pcol_i = 0 ; pcol_i < npcol_ ; pcol_i++ ) { FatIn.InsertGlobalValues( MyGlobalElements[LocalRow]*npcol_ + pcol_i, FatRowPtrs[ pcol_i ], &FatMatrixValues[ pcol_i ][0], &FatColumnIndices[ pcol_i ][0] ); } } FatIn.FillComplete( false ); if ( debug_ == 1) std::cout << "iam_ = " << iam_ << "Amesos_Scalapack.cpp:260" << std::endl; if ( debug_ == 1) std::cout << "Amesos_Scalapack.cpp:265B" << " iam_ = " << iam_ << " nb_ = " << nb_ << " nprow_ = " << nprow_ << " npcol_ = " << npcol_ << std::endl; // // Compute the map for our second intermediate matrix, FatOut // // Compute directly int UniformRows = ( NumRows_ / ( nprow_ * nb_ ) ) * nb_ ; int AllExcessRows = NumRows_ - UniformRows * nprow_ ; int OurExcessRows = EPETRA_MIN( nb_, AllExcessRows - ( myprow_ * nb_ ) ) ; OurExcessRows = EPETRA_MAX( 0, OurExcessRows ); NumOurRows_ = UniformRows + OurExcessRows ; if ( debug_ == 1) std::cout << "iam_ = " << iam_ << "Amesos_Scalapack.cpp:277" << std::endl; int UniformColumns = ( NumColumns_ / ( npcol_ * nb_ ) ) * nb_ ; int AllExcessColumns = NumColumns_ - UniformColumns * npcol_ ; int OurExcessColumns = EPETRA_MIN( nb_, AllExcessColumns - ( mypcol_ * nb_ ) ) ; if ( debug_ == 1) std::cout << "iam_ = " << iam_ << "Amesos_Scalapack.cpp:281" << std::endl; OurExcessColumns = EPETRA_MAX( 0, OurExcessColumns ); NumOurColumns_ = UniformColumns + OurExcessColumns ; if ( iam_ >= nprow_ * npcol_ ) { UniformRows = 0; NumOurRows_ = 0; NumOurColumns_ = 0; } Comm().Barrier(); if ( debug_ == 1) std::cout << "iam_ = " << iam_ << "Amesos_Scalapack.cpp:295" << std::endl; #if 0 // Compute using ScaLAPACK's numroc routine, assert agreement int izero = 0; // All matrices start at process 0 int NumRocSays = numroc_( &NumRows_, &nb_, &myprow_, &izero, &nprow_ ); assert( NumOurRows_ == NumRocSays ); #endif // // Compute the rows which this process row owns in the ScaLAPACK 2D // process grid. // std::vector<int> AllOurRows(NumOurRows_); int RowIndex = 0 ; int BlockRow = 0 ; for ( ; BlockRow < UniformRows / nb_ ; BlockRow++ ) { for ( int RowOffset = 0; RowOffset < nb_ ; RowOffset++ ) { AllOurRows[RowIndex++] = BlockRow*nb_*nprow_ + myprow_*nb_ + RowOffset ; } } Comm().Barrier(); if ( debug_ == 1) std::cout << "iam_ = " << iam_ << "Amesos_Scalapack.cpp:315" << std::endl; assert ( BlockRow == UniformRows / nb_ ) ; for ( int RowOffset = 0; RowOffset < OurExcessRows ; RowOffset++ ) { AllOurRows[RowIndex++] = BlockRow*nb_*nprow_ + myprow_*nb_ + RowOffset ; } assert( RowIndex == NumOurRows_ ); // // Distribute those rows amongst all the processes in that process row // This is an artificial distribution with the following properties: // 1) It is a 1D data distribution (each row belogs entirely to // a single process // 2) All data which will eventually belong to a given process row, // is entirely contained within the processes in that row. // Comm().Barrier(); if ( debug_ == 1) std::cout << "iam_ = " << iam_ << "Amesos_Scalapack.cpp:312" << std::endl; // // Compute MyRows directly // std::vector<int>MyRows(NumOurRows_); RowIndex = 0 ; BlockRow = 0 ; for ( ; BlockRow < UniformRows / nb_ ; BlockRow++ ) { for ( int RowOffset = 0; RowOffset < nb_ ; RowOffset++ ) { MyRows[RowIndex++] = BlockRow*nb_*nprow_*npcol_ + myprow_*nb_*npcol_ + RowOffset*npcol_ + mypcol_ ; } } Comm().Barrier(); if ( debug_ == 1) std::cout << "iam_ = " << iam_ << "Amesos_Scalapack.cpp:326" << std::endl; assert ( BlockRow == UniformRows / nb_ ) ; for ( int RowOffset = 0; RowOffset < OurExcessRows ; RowOffset++ ) { MyRows[RowIndex++] = BlockRow*nb_*nprow_*npcol_ + myprow_*nb_*npcol_ + RowOffset*npcol_ + mypcol_ ; } Comm().Barrier(); if ( debug_ == 1) std::cout << "iam_ = " << iam_ << "Amesos_Scalapack.cpp:334" << std::endl; Comm().Barrier(); for (int i=0; i < NumOurRows_; i++ ) { assert( MyRows[i] == AllOurRows[i]*npcol_+mypcol_ ); } Comm().Barrier(); if ( debug_ == 1) std::cout << "Amesos_Scalapack.cpp:340" << " iam_ = " << iam_ << " myprow_ = " << myprow_ << " mypcol_ = " << mypcol_ << " NumRows_ = " << NumRows_ << " NumOurRows_ = " << NumOurRows_ << std::endl; Comm().Barrier(); Epetra_Map FatOutMap( npcol_*NumRows_, NumOurRows_, &MyRows[0], 0, Comm() ); if ( debug_ == 1) std::cout << "iam_ = " << iam_ << "Amesos_Scalapack.cpp:344" << std::endl; Comm().Barrier(); if ( FatOut_ ) delete FatOut_ ; FatOut_ = new Epetra_CrsMatrix( Copy, FatOutMap, 0 ) ; if ( debug_ == 1) std::cout << "iam_ = " << iam_ << "Amesos_Scalapack.cpp:348" << std::endl; Epetra_Export ExportToFatOut( FatInMap, FatOutMap ) ; if ( debug_ == 1) std::cout << "iam_ = " << iam_ << "Amesos_Scalapack.cpp:360" << std::endl; FatOut_->Export( FatIn, ExportToFatOut, Add ); FatOut_->FillComplete( false ); // // Create a map to allow us to redistribute the vectors X and B // Epetra_RowMatrix *RowMatrixA = dynamic_cast<Epetra_RowMatrix *>(Problem_->GetOperator()); const Epetra_Map &OriginalMap = RowMatrixA->RowMatrixRowMap() ; assert( NumGlobalElements_ == OriginalMap.NumGlobalElements() ) ; int NumMyVecElements = 0 ; if ( mypcol_ == 0 ) { NumMyVecElements = NumOurRows_; } if ( debug_ == 1) std::cout << "iam_ = " << iam_ << "Amesos_Scalapack.cpp:385" << std::endl; if (VectorMap_) { delete VectorMap_ ; VectorMap_ = 0 ; } VectorMap_ = new Epetra_Map( NumGlobalElements_, NumMyVecElements, &AllOurRows[0], 0, Comm() ); if ( debug_ == 1) std::cout << "iam_ = " << iam_ << " Amesos_Scalapack.cpp:393 debug_ = " << debug_ << std::endl; } else { nprow_ = 1 ; npcol_ = NumberOfProcesses / nprow_ ; assert ( nprow_ * npcol_ == NumberOfProcesses ) ; m_per_p_ = ( NumRows_ + NumberOfProcesses - 1 ) / NumberOfProcesses ; int MyFirstElement = EPETRA_MIN( iam_ * m_per_p_, NumRows_ ) ; int MyFirstNonElement = EPETRA_MIN( (iam_+1) * m_per_p_, NumRows_ ) ; int NumExpectedElements = MyFirstNonElement - MyFirstElement ; assert( NumRows_ == RowMatrixA->NumGlobalRows() ) ; if ( ScaLAPACK1DMap_ ) delete( ScaLAPACK1DMap_ ) ; ScaLAPACK1DMap_ = new Epetra_Map( NumRows_, NumExpectedElements, 0, Comm() ); if ( ScaLAPACK1DMatrix_ ) delete( ScaLAPACK1DMatrix_ ) ; ScaLAPACK1DMatrix_ = new Epetra_CrsMatrix(Copy, *ScaLAPACK1DMap_, 0); Epetra_Export ExportToScaLAPACK1D_( OriginalMap, *ScaLAPACK1DMap_); ScaLAPACK1DMatrix_->Export( *RowMatrixA, ExportToScaLAPACK1D_, Add ); ScaLAPACK1DMatrix_->FillComplete( false ) ; } if ( debug_ == 1) std::cout << "iam_ = " << iam_ << " Amesos_Scalapack.cpp:417 debug_ = " << debug_ << std::endl; if ( debug_ == 1) std::cout << "iam_ = " << iam_ << "Amesos_Scalapack.cpp:402" << " nprow_ = " << nprow_ << " npcol_ = " << npcol_ << std::endl ; int info; const int zero = 0 ; if ( ictxt_ == -1313 ) { ictxt_ = 0 ; if ( debug_ == 1) std::cout << "iam_ = " << iam_ << "Amesos_Scalapack.cpp:408" << std::endl; SL_INIT_F77(&ictxt_, &nprow_, &npcol_) ; } if ( debug_ == 1) std::cout << "iam_ = " << iam_ << "Amesos_Scalapack.cpp:410A" << std::endl; int nprow; int npcol; int myrow; int mycol; BLACS_GRIDINFO_F77(&ictxt_, &nprow, &npcol, &myrow, &mycol) ; if ( debug_ == 1) std::cout << "iam_ = " << iam_ << "iam_ = " << iam_ << " Amesos_Scalapack.cpp:410" << std::endl; if ( iam_ < nprow_ * npcol_ ) { assert( nprow == nprow_ ) ; if ( npcol != npcol_ ) std::cout << "Amesos_Scalapack.cpp:430 npcol = " << npcol << " npcol_ = " << npcol_ << std::endl ; assert( npcol == npcol_ ) ; if ( TwoD_distribution_ ) { assert( myrow == myprow_ ) ; assert( mycol == mypcol_ ) ; lda_ = EPETRA_MAX(1,NumOurRows_) ; } else { assert( myrow == 0 ) ; assert( mycol == iam_ ) ; nb_ = m_per_p_; lda_ = EPETRA_MAX(1,NumGlobalElements_); } if ( debug_ == 1) std::cout << "iam_ = " << iam_ << "Amesos_Scalapack.cpp: " << __LINE__ << " TwoD_distribution_ = " << TwoD_distribution_ << " NumGlobalElements_ = " << NumGlobalElements_ << " debug_ = " << debug_ << " nb_ = " << nb_ << " lda_ = " << lda_ << " nprow_ = " << nprow_ << " npcol_ = " << npcol_ << " myprow_ = " << myprow_ << " mypcol_ = " << mypcol_ << " iam_ = " << iam_ << std::endl ; AMESOS_PRINT( myprow_ ); DESCINIT_F77(DescA_, &NumGlobalElements_, &NumGlobalElements_, &nb_, &nb_, &zero, &zero, &ictxt_, &lda_, &info) ; if ( debug_ == 1) std::cout << "iam_ = " << iam_ << "Amesos_Scalapack.cpp:441" << std::endl; assert( info == 0 ) ; } else { DescA_[0] = -13; if ( debug_ == 1) std::cout << "iam_ = " << iam_ << "Amesos_Scalapack.cpp:458 nprow = " << nprow << std::endl; assert( nprow == -1 ) ; } if ( debug_ == 1) std::cout << "Amesos_Scalapack.cpp:446" << std::endl; MatTime_ += Time_->ElapsedTime(); return 0; }
int main (int argc, char *argv[]) { myscalar *A=NULL, *X=NULL, *B=NULL, *Y=NULL, *Xtrue=NULL; myscalar elem; int descA[BLACSCTXTSIZE], descVec[BLACSCTXTSIZE], descelem[BLACSCTXTSIZE]; int n; int nrhs; int nb; int locr, locc; int i, j, ii, jj; int ierr; int dummy; int myid, np; int myrow, mycol, nprow, npcol; int ctxt; myreal rdummy, res; n=1024; /* Size of the problem */ nrhs=3; /* Number of RHS */ nb=16; /* Blocksize for the 2D block-cyclic distribution */ /* Initialize MPI */ if((ierr=MPI_Init(&argc,&argv))) return 1; myid=-1; if((ierr=MPI_Comm_rank(MPI_COMM_WORLD,&myid))) return 1; np=-1; if((ierr=MPI_Comm_size(MPI_COMM_WORLD,&np))) return 1; /* Initialize the BLACS grid */ nprow=floor(sqrt((float)np)); npcol=np/nprow; blacs_get_(&IZERO,&IZERO,&ctxt); blacs_gridinit_(&ctxt,"R",&nprow,&npcol); blacs_gridinfo_(&ctxt,&nprow,&npcol,&myrow,&mycol); /* A is a dense n x n distributed Toeplitz matrix */ if(myid<nprow*npcol) { locr=numroc_(&n,&nb,&myrow,&IZERO,&nprow); locc=numroc_(&n,&nb,&mycol,&IZERO,&npcol); A=new myscalar[locr*locc]; dummy=std::max(1,locr); descinit_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr); myreal pi=3.1416, d=0.1; for(i=1;i<=locr;i++) { for(j=1;j<=locc;j++) { ii=indxl2g_(&i,&nb,&myrow,&IZERO,&nprow); jj=indxl2g_(&j,&nb,&mycol,&IZERO,&npcol); // Toeplitz matrix from Quantum Chemistry. A[locr*(j-1)+(i-1)]=ii==jj?std::pow(pi,2)/6.0/std::pow(d,2):std::pow(-1.0,ii-jj)/std::pow((myreal)ii-jj,2)/std::pow(d,2); } } } else { descset_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE); } /* Initializing solution */ if(myid<nprow*npcol) { locr=numroc_(&n,&nb,&myrow,&IZERO,&nprow); locc=numroc_(&nrhs,&nb,&mycol,&IZERO,&npcol); dummy=std::max(1,locr); descinit_(descVec,&n,&nrhs,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr); Xtrue=new myscalar[locr*locc](); for(i=0;i<locr*locc;i++) Xtrue[i]=static_cast<myscalar>(rand())/(static_cast<myscalar>(RAND_MAX)); } else { descset_(descVec,&n,&nrhs,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE); } /* Initializing solution and intermediate vector space */ if(myid<nprow*npcol) { X=new myscalar[locr*locc](); Y=new myscalar[locr*locc](); } /* Initializing RHS as A*Xtrue */ if(myid<nprow*npcol) { B=new myscalar[locr*locc](); pgemm('N','N',n,nrhs,n,ONE,A,IONE,IONE,descA,Xtrue,IONE,IONE,descVec,ZERO,B,IONE,IONE,descVec); } /* Initialize the solver and set parameters */ StrumpackDensePackage<myscalar,myreal> sdp(MPI_COMM_WORLD); sdp.use_HSS=true; sdp.levels_HSS=4; sdp.min_rand_HSS=64; sdp.lim_rand_HSS=0; sdp.tol_HSS=1e-12; sdp.split_HSS=768; /* Size of A11 */ /* Compression */ sdp.compress(A,descA); /* Accuracy checking */ sdp.check_compression(A,descA); /* Factorization */ sdp.partially_factor(A,descA,sdp.split_HSS); /* Schur complement update */ sdp.compute_schur(); /* Extracting a random element from the Schur complement */ if(myid<nprow*npcol) { descinit_(descelem,&IONE,&IONE,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr); } else { descset_(descelem,&IONE,&IONE,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE); } i=1+rand()%(n-sdp.split_HSS); j=1+rand()%(n-sdp.split_HSS); MPI_Bcast((void *)&i,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD); MPI_Bcast((void *)&j,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD); sdp.extract_schur(&elem,descelem,&i,IONE,&j,IONE); if(!myid) std::cout << "Element (" << i << "," << j << ") of Schur complement = " << elem << std::endl << std::endl; /* Condensation */ sdp.reduce_RHS(Y,descVec,B,descVec); /* Solve the Schur complement system (touches only the bottom part of X)*/ sdp.verbose=false; if(!myid) std::cout << "Solving Schur complement system with Conjugate Gradient..." << std::endl << std::endl; CG(&sdp,X,Y,descVec,n,nrhs,3000,1e-14); sdp.verbose=true; /* Expansion (touches only the top part of X) */ sdp.expand_solution(X,descVec,Y,descVec); /* Accuracy checking */ sdp.check_solution(A,descA,X,descVec,B,descVec); /* Forward error */ if(myid<nprow*npcol) { for(i=0;i<locr*locc;i++) Y[i]=X[i]-Xtrue[i]; res=plange('F',n,nrhs,Y,IONE,IONE,descVec,&rdummy); res/=plange('F',n,nrhs,Xtrue,IONE,IONE,descVec,&rdummy); if(!myid) std::cout << "Forward error = " << res << std::endl; } /* Statistics */ sdp.print_statistics(); /* Clean-up */ delete[] A; delete[] B; delete[] X; delete[] Y; delete[] Xtrue; /* The end */ MPI_Finalize(); return 0; }
/// /// This is the new standard style for implementing a slave routine for a ScaLAPACK operator, /// in this case, pdgesvd_(). The difference from the old style is that the new style /// requires that the ScaLAPACK context, ICTXT, be provided. Until that requirement can be pushed /// up into the "mpi_slave_xxx" files, the existing pdgesvdSlave() routine will create the context /// and then call this routine. /// slpp::int_t pdgesvdSlave2(const slpp::int_t ICTXT, PdgesvdArgs args, void* bufs[], size_t sizes[], unsigned count) { // find out where I am in the scalapack grid slpp::int_t NPROW, NPCOL, MYPROW, MYPCOL, MYPNUM; getSlInfo(ICTXT/*in*/, NPROW/*in*/, NPCOL/*in*/, MYPROW/*out*/, MYPCOL/*out*/, MYPNUM/*out*/); if(NPROW != args.NPROW || NPCOL != args.NPCOL || MYPROW != args.MYPROW || MYPCOL != args.MYPCOL || MYPNUM != args.MYPNUM){ if(DBG) { std::cerr << "scalapack general parameter mismatch" << std::endl; std::cerr << "args NPROW:"<<args.NPROW<<" NPCOL:"<<args.NPCOL << "MYPROW:"<<args.MYPROW<<" MYPCOL:"<<args.MYPCOL<<"MYPNUM:"<<MYPNUM << std::endl; std::cerr << "ScaLAPACK NPROW:"<<NPROW<<" NPCOL:"<<NPCOL << "MYPROW:"<<MYPROW<<" MYPCOL:"<<MYPCOL<<"MYPNUM:"<<MYPNUM << std::endl; } } // setup MB,NB const slpp::int_t& M = args.A.DESC.M ; const slpp::int_t& N = args.A.DESC.N ; const slpp::int_t& MB = args.A.DESC.MB ; const slpp::int_t& NB = args.A.DESC.NB ; const slpp::int_t& LLD_A = args.A.DESC.LLD ; const slpp::int_t one = 1 ; const slpp::int_t LTD_A = std::max(one, numroc_( N, NB, MYPCOL, /*CSRC_A*/0, NPCOL )); const slpp::int_t& MP = LLD_A ; const slpp::int_t& NQ = LTD_A ; // size check A, S, U, VT slpp::int_t SIZE_A = MP * NQ ; slpp::int_t SIZE_S = std::min(M, N); slpp::int_t size_p = std::max(one, numroc_( SIZE_S, MB, MYPROW, /*RSRC_A*/0, NPROW )); slpp::int_t size_q = std::max(one, numroc_( SIZE_S, NB, MYPCOL, /*RSRC_A*/0, NPCOL )); slpp::int_t SIZE_U = MP * size_q; slpp::int_t SIZE_VT= size_p * NQ; if(DBG) { std::cerr << "##################################################" << std::endl; std::cerr << "####pdgesvdSlave##################################" << std::endl; std::cerr << "one:" << one << std::endl; std::cerr << "SIZE_S:" << SIZE_S << std::endl; std::cerr << "MB:" << MB << std::endl; std::cerr << "MYPROW:" << MYPROW << std::endl; std::cerr << "NPROW:" << NPROW << std::endl; } // TODO: >= because master is permitted to use a larger buffer // to allow to see if rounding up to chunksize eliminates some errors // before we put the roundUp formula everywhere SLAVE_ASSERT_ALWAYS(sizes[BUF_A] >= SIZE_A * sizeof(double)); SLAVE_ASSERT_ALWAYS(sizes[BUF_S] >= SIZE_S * sizeof(double)); if (args.jobU == 'V') { SLAVE_ASSERT_ALWAYS( sizes[BUF_U] >= SIZE_U *sizeof(double)); } if (args.jobVT == 'V') { SLAVE_ASSERT_ALWAYS( sizes[BUF_VT] >= SIZE_VT *sizeof(double)); } // sizes are correct, give the pointers their names double* A = reinterpret_cast<double*>(bufs[BUF_A]) ; double* S = reinterpret_cast<double*>(bufs[BUF_S]) ; double* U = reinterpret_cast<double*>(bufs[BUF_U]) ; double* VT = reinterpret_cast<double*>(bufs[BUF_VT]) ; // debug that the input is readable and show its contents if(DBG) { for(int ii=0; ii < SIZE_A; ii++) { std::cerr << "("<< MYPROW << "," << MYPCOL << ") A["<<ii<<"] = " << A[ii] << std::endl; } } if(false) { // debug that outputs are writeable: for(int ii=0; ii < SIZE_S; ii++) { S[ii] = -9999.0 ; } if (args.jobU == 'V') { for(int ii=0; ii < SIZE_U; ii++) { U[ii] = -9999.0 ; } } if (args.jobVT == 'V') { for(int ii=0; ii < SIZE_VT; ii++) { VT[ii] = -9999.0 ; } } } // ScaLAPACK: the DESCS are complete except for the correct context args.A.DESC.CTXT= ICTXT ; // note: no DESC for S, it is not distributed, all have a copy args.U.DESC.CTXT= ICTXT ; args.VT.DESC.CTXT= ICTXT ; if(DBG) { std::cerr << "pdgesvdSlave: argsBuf is: {" << std::endl; std::cerr << args << std::endl; std::cerr << "}" << std::endl << std::endl; std::cerr << "pdgesvdSlave: calling pdgesvd_ for computation, with args:" << std::endl ; std::cerr << "jobU: " << args.jobU << ", jobVT: " << args.jobVT << ", M: " << args.M << ", N: " << args.N << std::endl; std::cerr << "A: " << (void*)(A) << ", A.I: " << args.A.I << ", A.J: " << args.A.J << std::endl; std::cerr << ", A.DESC: " << args.A.DESC << std::endl; std::cerr << "S: " << (void*)(S) << std::endl; std::cerr << "U: " << (void*)(U) << ", U.I: " << args.U.I << ", U.J: " << args.U.J << std::endl; std::cerr << ", U.DESC: " << args.U.DESC << std::endl; std::cerr << "VT: " << (void*)(VT) << ", VT.I: " << args.VT.I << ", VT.J: " << args.VT.J << std::endl; std::cerr << ", VT.DESC: " << args.VT.DESC << std::endl; } if(DBG) std::cerr << "pdgesvdSlave calling PDGESVD to get work size" << std:: endl; slpp::int_t INFO = 0; double LWORK_DOUBLE; pdgesvd_(args.jobU, args.jobVT, args.M, args.N, A, args.A.I, args.A.J, args.A.DESC, S, U, args.U.I, args.U.J, args.U.DESC, VT, args.VT.I, args.VT.J, args.VT.DESC, &LWORK_DOUBLE, -1, INFO); if(INFO < 0) { // argument error std::cerr << "pdgesvdSlave(r:"<<MYPNUM<<"): " << "ERROR: pdgesvd_() for work size, argument error, argument # " << -INFO << std::endl; } else if(INFO == (std::min(args.M, args.N)+1)) { // should not happen when checking work size // heterogeneity detected (eigenvalues did not match on all nodes) std::cerr << "pdgesvdSlave(r:"<<MYPNUM<<"): " << "WARNING: pdgesvd_() for work size, eigenvalues did not match across all instances" << std::endl; } else if (INFO > 0) { // other + value of INFO // should not happen when checking work size // DBDSQR did not converge std::cerr << "pdgesvdSlave(r:"<<MYPNUM<<"): " << "ERROR: pdgesvd_() for work size, DBDSQR did not converge: " << INFO << std::endl; } if ( LWORK_DOUBLE < 0.0 || LWORK_DOUBLE > double(numeric_limits<slpp::int_t>::max())) { // Houston, we have a problem ... the user wants to do more than 1 instance // can handle through the slpp::int ... the size of which is determined // by which binary for ScaLAPACK/BLAS we are using .. .32-bit or 64-bit FORTRAN INTEGER // noting that 32-bit INTEGER is what is shipped with RHEL, CentOS, etc, even on // 64-bit systems, for some unknown reason. std::cerr << "pdgesvdSlave(r:"<<MYPNUM<<"): " << "ERROR: LWORK_DOUBLE, " << LWORK_DOUBLE << ", is too large for the ScaLAPACK API to accept" << INFO << std::endl; if (INFO >= 0) { // make up our own argument error... -22 (there are 20 arguments) INFO = -22; } return INFO; } slpp::int_t LWORK = int(LWORK_DOUBLE); // get the cast from SVDPhysical.cpp std::cerr << "pdgesvdSlave(): info: LWORK is " << LWORK << std::endl; // ALLOCATE an array WORK size LWORK boost::scoped_array<double> WORKtmp(new double[LWORK]); double* WORK = WORKtmp.get(); ////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////// if(DBG) std::cerr << "pdgesvdSlave: calling pdgesvd_ for computation." << std::endl ; INFO=0; pdgesvd_(args.jobU, args.jobVT, args.M, args.N, A, args.A.I, args.A.J, args.A.DESC, S, U, args.U.I, args.U.J, args.U.DESC, VT, args.VT.I, args.VT.J, args.VT.DESC, WORK, LWORK, INFO); slpp::int_t numToPrintAtStart=4 ; if (MYPNUM==0 && DBG) { int ii; // used in 2nd loop for(ii=0; ii < std::min(SIZE_S, numToPrintAtStart); ii++) { std::cerr << "pdgesvdSlave: S["<<ii<<"] = " << S[ii] << std::endl; } // now skip to numToPrintAtStart before the end, (without repeating) and print to the end // to see if the test cases are producing zero eigenvalues (don't want that) slpp::int_t numToPrintAtEnd=4; for(int ii=std::max(ii, SIZE_S-numToPrintAtEnd); ii < SIZE_S; ii++) { std::cerr << "pdgesvdSlave: S["<<ii<<"] = " << S[ii] << std::endl; } } if (DBG) { if (args.jobU == 'V') { for(int ii=0; ii < std::min(SIZE_U, numToPrintAtStart); ii++) { std::cerr << "pdgesvdSlave: U["<<ii<<"] = " << U[ii] << std::endl; } } if (args.jobVT == 'V') { for(int ii=0; ii < std::min(SIZE_VT, numToPrintAtStart); ii++) { std::cerr << "pdgesvdSlave: VT["<<ii<<"] = " << VT[ii] << std::endl; } } } if(MYPNUM == 0) { if(INFO < 0) { // argument error std::cerr << "pdgesvdSlave(r:"<<MYPNUM<<"): " << "ERROR: argument error, argument # " << -INFO << std::endl; } else if(INFO == (std::min(args.M, args.N)+1)) { // heterogeneity detected (eigenvalues did not match on all nodes) std::cerr << "pdgesvdSlave(r:"<<MYPNUM<<"): " << "WARNING: eigenvalues did not match across all instances" << std::endl; } else if (INFO > 0) { // other + value of INFO // DBDSQR did not converge std::cerr << "pdgesvdSlave(r:"<<MYPNUM<<"): " << "ERROR: DBDSQR did not converge: " << INFO << std::endl; } } return INFO ; }
/*==== MAIN FUNCTION =================================================*/ int main( int argc, char *argv[] ){ /* ==== Declarations =================================================== */ /* File variables */ FILE *fin; /* Matrix descriptors */ MDESC descA, descB, descC, descA_local, descB_local; /* Local scalars */ MKL_INT iam, nprocs, ictxt, myrow, mycol, nprow, npcol; MKL_INT n, nb, mp, nq, lld, lld_local; MKL_INT i, j, info; int n_int, nb_int, nprow_int, npcol_int; double thresh, diffnorm, anorm, bnorm, residual, eps; /* Local arrays */ double *A_local, *B_local, *A, *B, *C, *work; MKL_INT iwork[ 4 ]; /* ==== Executable statements ========================================== */ /* Get information about how many processes are used for program execution and number of current process */ blacs_pinfo_( &iam, &nprocs ); /* Init temporary 1D process grid */ blacs_get_( &i_negone, &i_zero, &ictxt ); blacs_gridinit_( &ictxt, "C", &nprocs, &i_one ); /* Open input file */ if ( iam == 0 ) { fin = fopen( "../in/pblas3ex.in", "r" ); if ( fin == NULL ) { printf( "Error while open input file." ); return 2; } } /* Read data and send it to all processes */ if ( iam == 0 ) { /* Read parameters */ fscanf( fin, "%d n, dimension of vectors, must be > 0 ", &n_int ); fscanf( fin, "%d nb, size of blocks, must be > 0 ", &nb_int ); fscanf( fin, "%d p, number of rows in the process grid, must be > 0", &nprow_int ); fscanf( fin, "%d q, number of columns in the process grid, must be > 0, p*q = number of processes", &npcol_int ); fscanf( fin, "%lf threshold for residual check (to switch off check set it < 0.0) ", &thresh ); n = (MKL_INT) n_int; nb = (MKL_INT) nb_int; nprow = (MKL_INT) nprow_int; npcol = (MKL_INT) npcol_int; /* Check if all parameters are correct */ if( ( n<=0 )||( nb<=0 )||( nprow<=0 )||( npcol<=0 )||( nprow*npcol != nprocs ) ) { printf( "One or several input parameters has incorrect value. Limitations:\n" ); printf( "n > 0, nb > 0, p > 0, q > 0 - integer\n" ); printf( "p*q = number of processes\n" ); printf( "threshold - double (set to negative to swicth off check)\n"); return 2; } /* Pack data into array and send it to other processes */ iwork[ 0 ] = n; iwork[ 1 ] = nb; iwork[ 2 ] = nprow; iwork[ 3 ] = npcol; igebs2d_( &ictxt, "All", " ", &i_four, &i_one, iwork, &i_four ); dgebs2d_( &ictxt, "All", " ", &i_one, &i_one, &thresh, &i_one ); } else { /* Recieve and unpack data */ igebr2d_( &ictxt, "All", " ", &i_four, &i_one, iwork, &i_four, &i_zero, &i_zero ); dgebr2d_( &ictxt, "All", " ", &i_one, &i_one, &thresh, &i_one, &i_zero, &i_zero ); n = iwork[ 0 ]; nb = iwork[ 1 ]; nprow = iwork[ 2 ]; npcol = iwork[ 3 ]; } if ( iam == 0 ) { fclose( fin ); } /* Destroy temporary process grid */ blacs_gridexit_( &ictxt ); /* Init workind 2D process grid */ blacs_get_( &i_negone, &i_zero, &ictxt ); blacs_gridinit_( &ictxt, "R", &nprow, &npcol ); blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol ); /* Create on process 0 two matrices: A - orthonormal, B -random */ if ( ( myrow == 0 ) && ( mycol == 0 ) ){ /* Allocate arrays */ A_local = (double*) calloc( n*n, sizeof( double ) ); B_local = (double*) calloc( n*n, sizeof( double ) ); /* Set arrays */ for ( i=0; i<n; i++ ){ for ( j=0; j<n; j++ ){ B_local[ i+n*j ] = one*rand()/RAND_MAX; } B_local[ i+n*i ] += two; } for ( j=0; j<n; j++ ){ for ( i=0; i<n; i++ ){ if ( j < n-1 ){ if ( i <= j ){ A_local[ i+n*j ] = one / sqrt( ( double )( (j+1)*(j+2) ) ); } else if ( i == j+1 ) { A_local[ i+n*j ] = -one / sqrt( one + one/( double )(j+1) ); } else { A_local[ i+n*j ] = zero; } } else { A_local[ i+n*(n-1) ] = one / sqrt( ( double )n ); } } } /* Print information of task */ printf( "=== START OF EXAMPLE ===================\n" ); printf( "Matrix-matrix multiplication: A*B = C\n\n" ); printf( "/ 1/q_1 ........ 1/q_n-1 1/q_n \\ \n" ); printf( "| . | \n" ); printf( "| `. : : | \n" ); printf( "| -1/q_1 `. : : | \n" ); printf( "| . `. : : | = A \n" ); printf( "| 0 `. ` | \n" ); printf( "| : `. `. 1/q_n-1 1/q_n | \n" ); printf( "| : `. `. | \n" ); printf( "\\ 0 .... 0 -(n-1)/q_n-1 1/q_n / \n\n" ); printf( "q_i = sqrt( i^2 + i ), i=1..n-1, q_n = sqrt( n )\n\n" ); printf( "A - n*n real matrix (orthonormal) \n" ); printf( "B - random n*n real matrix\n\n" ); printf( "n = %d, nb = %d; %dx%d - process grid\n\n", n, nb, nprow, npcol ); printf( "=== PROGRESS ===========================\n" ); } else { /* Other processes don't contain parts of initial arrays */ A_local = NULL; B_local = NULL; } /* Compute precise length of local pieces and allocate array on each process for parts of distributed vectors */ mp = numroc_( &n, &nb, &myrow, &i_zero, &nprow ); nq = numroc_( &n, &nb, &mycol, &i_zero, &npcol ); A = (double*) calloc( mp*nq, sizeof( double ) ); B = (double*) calloc( mp*nq, sizeof( double ) ); C = (double*) calloc( mp*nq, sizeof( double ) ); /* Compute leading dimensions */ lld_local = MAX( numroc_( &n, &n, &myrow, &i_zero, &nprow ), 1 ); lld = MAX( mp, 1 ); /* Initialize descriptors for initial arrays located on 0 process */ descinit_( descA_local, &n, &n, &n, &n, &i_zero, &i_zero, &ictxt, &lld_local, &info ); descinit_( descB_local, &n, &n, &n, &n, &i_zero, &i_zero, &ictxt, &lld_local, &info ); /* Initialize descriptors for distributed arrays */ descinit_( descA, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info ); descinit_( descB, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info ); descinit_( descC, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info ); /* Distribute matrices from 0 process over process grid */ pdgeadd_( &trans, &n, &n, &one, A_local, &i_one, &i_one, descA_local, &zero, A, &i_one, &i_one, descA ); pdgeadd_( &trans, &n, &n, &one, B_local, &i_one, &i_one, descB_local, &zero, B, &i_one, &i_one, descB ); if( iam == 0 ){ printf( ".. Arrays are distributed ( p?geadd ) ..\n" ); } /* Destroy arrays on 0 process - they are not necessary anymore */ if( ( myrow == 0 ) && ( mycol == 0 ) ){ free( A_local ); free( B_local ); } /* Compute norm of A and B */ work = (double*) calloc( mp, sizeof( double ) ); anorm = pdlange_( "I", &n, &n, A, &i_one, &i_one, descA, work ); bnorm = pdlange_( "I", &n, &n, B, &i_one, &i_one, descB, work ); if( iam == 0 ){ printf( ".. Norms of A and B are computed ( p?lange ) ..\n" ); } /* Compute product C = A*B */ pdgemm_( "N", "N", &n, &n, &n, &one, A, &i_one, &i_one, descA, B, &i_one, &i_one, descB, &zero, C, &i_one, &i_one, descC ); if( iam == 0 ){ printf( ".. Multiplication A*B=C is done ( p?gemm ) ..\n" ); } /* Compute difference B - inv_A*C (inv_A = transpose(A) because A is orthonormal) */ pdgemm_( "T", "N", &n, &n, &n, &one, A, &i_one, &i_one, descA, C, &i_one, &i_one, descC, &negone, B, &i_one, &i_one, descB ); if( iam == 0 ){ printf( ".. Difference is computed ( p?gemm ) ..\n" ); } /* Compute norm of B - inv_A*C (which is contained in B) */ diffnorm = pdlange_( "I", &n, &n, B, &i_one, &i_one, descB, work ); free( work ); if( iam == 0 ){ printf( ".. Norms of the difference B-inv_A*C is computed ( p?lange ) ..\n" ); } /* Print results */ if( iam == 0 ){ printf( ".. Solutions are compared ..\n" ); printf( "== Results ==\n" ); printf( "||A|| = %03.11f\n", anorm ); printf( "||B|| = %03.11f\n", bnorm ); printf( "=== END OF EXAMPLE =====================\n" ); } /* Compute machine epsilon */ eps = pdlamch_( &ictxt, "e" ); /* Compute residual */ residual = diffnorm /( two*anorm*bnorm*eps ); /* Destroy arrays */ free( A ); free( B ); free( C ); /* Destroy process grid */ blacs_gridexit_( &ictxt ); blacs_exit_( &i_zero ); /* Check if residual passed or failed the threshold */ if ( ( iam == 0 ) && ( thresh >= zero ) && !( residual <= thresh ) ){ printf( "FAILED. Residual = %05.16f\n", residual ); return 1; } else { return 0; } /*======================================================================== ====== End of PBLAS Level 3 example ==================================== ======================================================================*/ }
int main(int argc, char *argv[]) { // Some constants int minusone = -1; int zero = 0; int one = 1; double dzero = 0.0; // ConText int ConTxt = minusone; // order char order = 'R'; char scope = 'A'; // root process int root = zero; // BLACS/SCALAPACK parameters // the size of the blocks the distributed matrix is split into // (applies to both rows and columns) int mb = 32; int nb = mb; // PDSYEVxxx constraint // the number of rows and columns in the processor grid // only square processor grids due to C vs. Fortran ordering int nprow = 2; int npcol = nprow; // only square processor grids, // starting row and column in grid, do not change int rsrc = zero; int csrc = zero; // dimensions of the matrix to diagonalize int m = 1000; int n = m; // only square matrices int info = zero; // Rest of code will only work for: // nprow = npcol // mb = nb; // m = n; // rsrc = crsc; // Paramteres for Trivial Matrix double alpha = 0.1; // off-diagonal double beta = 75.0; // diagonal // For timing: double tdiag0, tdiag, ttotal0, ttotal; // BLACS Communicator MPI_Comm blacs_comm; int nprocs; int iam; int myrow, mycol; MPI_Init(&argc, &argv); MPI_Barrier(MPI_COMM_WORLD); ttotal0 = MPI_Wtime(); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &iam); if (argc > one) { nprow = strtod(argv[1],NULL); m = strtod(argv[2],NULL); npcol = nprow; n = m; } if (iam == root) { printf("world size %d \n",nprocs); printf("n %d \n", n); printf("nprow %d \n", nprow); printf("npcol %d \n", npcol); } // We can do this on any subcommunicator. #ifdef CartComm int dim[2]; int pbc[2]; dim[0] = nprow; dim[1] = npcol; pbc[0] = 0; pbc[1] = 0; MPI_Cart_create(MPI_COMM_WORLD, 2, dim, pbc, 1, &blacs_comm); #else blacs_comm = MPI_COMM_WORLD; #endif // initialize the grid // The lines below are equivalent to the one call to: if (blacs_comm != MPI_COMM_NULL) { ConTxt = Csys2blacs_handle_(blacs_comm); Cblacs_gridinit_(&ConTxt, &order, nprow, npcol); // get information back about the grid Cblacs_gridinfo_(ConTxt, &nprow, &npcol, &myrow, &mycol); } if (ConTxt != minusone) { int desc[9]; // get the size of the distributed matrix int locM = numroc_(&m, &mb, &myrow, &rsrc, &nprow); int locN = numroc_(&n, &nb, &mycol, &csrc, &npcol); // printf ("locM = %d \n", locM); // printf ("locN = %d \n", locN); int lld = MAX(one,locM); // build the descriptor descinit_(desc, &m, &n, &mb, &nb, &rsrc, &csrc, &ConTxt, &lld, &info); // Allocate arrays // eigenvalues double* eigvals = malloc(n * sizeof(double)); // allocate the distributed matrices double* mata = malloc(locM*locN * sizeof(double)); // allocate the distributed matrix of eigenvectors double* z = malloc(locM*locN * sizeof(double)); // Eigensolver parameters int ibtype = one; char jobz = 'V'; // eigenvectors also char range = 'A'; // all eiganvalues char uplo = 'L'; // work with upper double vl, vu; int il, iu; char cmach = 'U'; double abstol = 2.0 * pdlamch_(&ConTxt, &cmach); int eigvalm, nz; double orfac = -1.0; //double orfac = 0.001; int* ifail; ifail = malloc(m * sizeof(int)); int* iclustr; iclustr = malloc(2*nprow*npcol * sizeof(int)); double* gap; gap = malloc(nprow*npcol * sizeof(double)); double* work; work = malloc(3 * sizeof(double)); int querylwork = minusone; int* iwork; iwork = malloc(1 * sizeof(int)); int queryliwork = minusone; // Build a trivial distributed matrix: Diagonal matrix pdlaset_(&uplo, &m, &n, &alpha, &beta, mata, &one, &one, desc); // First there is a workspace query // pdsyevx_(&jobz, &range, &uplo, &n, mata, &one, &one, desc, &vl, // &vu, &il, &iu, &abstol, &eigvalm, &nz, eigvals, &orfac, z, &one, // &one, desc, work, &querylwork, iwork, &queryliwork, ifail, iclustr, gap, &info); pdsyevd_(&jobz, &uplo, &n, mata, &one, &one, desc, eigvals, z, &one, &one, desc, work, &querylwork, iwork, &queryliwork, &info); //pdsyev_(&jobz, &uplo, &m, mata, &one, &one, desc, eigvals, // z, &one, &one, desc, work, &querylwork, &info); int lwork = (int)work[0]; //printf("lwork %d\n", lwork); free(work); int liwork = (int)iwork[0]; //printf("liwork %d\n", liwork); free(iwork); work = (double*)malloc(lwork * sizeof(double)); iwork = (int*)malloc(liwork * sizeof(int)); // This is actually diagonalizes the matrix // pdsyevx_(&jobz, &range, &uplo, &n, mata, &one, &one, desc, &vl, // &vu, &il, &iu, &abstol, &eigvalm, &nz, eigvals, &orfac, z, &one, // &one, desc, work, &lwork, iwork, &liwork, ifail, iclustr, gap, &info); Cblacs_barrier(ConTxt, &scope); tdiag0 = MPI_Wtime(); pdsyevd_(&jobz, &uplo, &n, mata, &one, &one, desc, eigvals, z, &one, &one, desc, work, &lwork, iwork, &liwork, &info); //pdsyev_(&jobz, &uplo, &m, mata, &one, &one, desc, eigvals, // z, &one, &one, desc, work, &lwork, &info); Cblacs_barrier(ConTxt, &scope); tdiag = MPI_Wtime() - tdiag0; free(work); free(iwork); free(gap); free(iclustr); free(ifail); free(z); free(mata); // Destroy BLACS grid Cblacs_gridexit_(ConTxt); // Check eigenvalues if (myrow == zero && mycol == zero) { for (int i = 0; i < n; i++) { if (fabs(eigvals[i] - beta) > 0.0001) printf("Problem: eigval %d != %f5.2 but %f\n", i, beta, eigvals[i]); } if (info != zero) { printf("info = %d \n", info); } printf("Time (s) diag: %f\n", tdiag); } free(eigvals); } MPI_Barrier(MPI_COMM_WORLD); ttotal = MPI_Wtime() - ttotal0; if (iam == 0) printf("Time (s) total: %f\n", ttotal); MPI_Finalize(); }
int main(int argc, char **argv) { int iam, nprocs; int myrank_mpi, nprocs_mpi; int ictxt, nprow, npcol, myrow, mycol; int nb, m, n; int mpA, nqA, mpU, nqU, mpVT, nqVT; int i, j, k, itemp, min_mn; int descA[9], descU[9], descVT[9]; float *A=NULL; int info, infoNN, infoVV, infoNV, infoVN; float *U_NN=NULL, *U_VV=NULL, *U_NV=NULL, *U_VN=NULL; float *VT_NN=NULL, *VT_VV=NULL, *VT_NV=NULL, *VT_VN=NULL; float *S_NN=NULL, *S_VV=NULL, *S_NV=NULL, *S_VN=NULL; float *S_res_NN=NULL; float orthU_VV, residF, orthVT_VV; float orthU_VN, orthVT_NV; float residS_NN, eps; float res_repres_NV, res_repres_VN; /**/ int izero=0,ione=1; float rtmone=-1.0e+00; /**/ double MPIelapsedVV, MPIelapsedNN, MPIelapsedVN, MPIelapsedNV; char jobU, jobVT; int nbfailure=0, nbtestcase=0,inputfromfile, nbhetereogeneity=0; float threshold=100e+00; char buf[1024]; FILE *fd; char *c; char *t_jobU, *t_jobVT; int *t_m, *t_n, *t_nb, *t_nprow, *t_npcol; int nb_expe, expe; char hetereogeneityVV, hetereogeneityNN, hetereogeneityVN, hetereogeneityNV; int iseed[4], idist; /**/ MPI_Init( &argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi); MPI_Comm_size(MPI_COMM_WORLD, &nprocs_mpi); /**/ m = 100; n = 100; nprow = 1; npcol = 1; nb = 64; jobU='A'; jobVT='A'; inputfromfile = 0; for( i = 1; i < argc; i++ ) { if( strcmp( argv[i], "-f" ) == 0 ) { inputfromfile = 1; } if( strcmp( argv[i], "-jobvt" ) == 0 ) { if (i+1<argc) { if( strcmp( argv[i+1], "V" ) == 0 ){ jobVT = 'V'; i++; } else if( strcmp( argv[i+1], "N" ) == 0 ){ jobVT = 'N'; i++; } else if( strcmp( argv[i+1], "A" ) == 0 ){ jobVT = 'A'; i++; } else printf(" ** warning: jobvt should be set to V, N or A in the command line ** \n"); } else printf(" ** warning: jobvt should be set to V, N or A in the command line ** \n"); } if( strcmp( argv[i], "-jobu" ) == 0 ) { if (i+1<argc) { if( strcmp( argv[i+1], "V" ) == 0 ){ jobU = 'V'; i++; } else if( strcmp( argv[i+1], "N" ) == 0 ){ jobU = 'N'; i++; } else if( strcmp( argv[i+1], "A" ) == 0 ){ jobU = 'A'; i++; } else printf(" ** warning: jobu should be set to V, N or A in the command line ** \n"); } else printf(" ** warning: jobu should be set to V, N or A in the command line ** \n"); } if( strcmp( argv[i], "-m" ) == 0 ) { m = atoi(argv[i+1]); i++; } if( strcmp( argv[i], "-n" ) == 0 ) { n = atoi(argv[i+1]); i++; } if( strcmp( argv[i], "-p" ) == 0 ) { nprow = atoi(argv[i+1]); i++; } if( strcmp( argv[i], "-q" ) == 0 ) { npcol = atoi(argv[i+1]); i++; } if( strcmp( argv[i], "-nb" ) == 0 ) { nb = atoi(argv[i+1]); i++; } } /**/ if (inputfromfile){ nb_expe = 0; fd = fopen("svd.dat", "r"); if (fd == NULL) { printf("File failed to open svd.dat from processor mpirank(%d/%d): \n",myrank_mpi,nprocs_mpi); exit(-1); } do { c = fgets(buf, 1024, fd); /* get one line from the file */ if (c != NULL) if (c[0] != '#') nb_expe++; } while (c != NULL); /* repeat until NULL */ fclose(fd); t_jobU = (char *)calloc(nb_expe,sizeof(char)) ; t_jobVT = (char *)calloc(nb_expe,sizeof(char)) ; t_m = (int *)calloc(nb_expe,sizeof(int )) ; t_n = (int *)calloc(nb_expe,sizeof(int )) ; t_nb = (int *)calloc(nb_expe,sizeof(int )) ; t_nprow = (int *)calloc(nb_expe,sizeof(int )) ; t_npcol = (int *)calloc(nb_expe,sizeof(int )) ; fd = fopen("svd.dat", "r"); expe=0; do { c = fgets(buf, 1024, fd); /* get one line from the file */ if (c != NULL) if (c[0] != '#'){ //printf("NBEXPE = %d\n",expe); sscanf(c,"%c %c %d %d %d %d %d", &(t_jobU[expe]),&(t_jobVT[expe]),&(t_m[expe]),&(t_n[expe]), &(t_nb[expe]),(&t_nprow[expe]),&(t_npcol[expe])); expe++; } } while (c != NULL); /* repeat until NULL */ fclose(fd); } else { nb_expe = 1; t_jobU = (char *)calloc(nb_expe,sizeof(char)) ; t_jobVT = (char *)calloc(nb_expe,sizeof(char)) ; t_m = (int *)calloc(nb_expe,sizeof(int )) ; t_n = (int *)calloc(nb_expe,sizeof(int )) ; t_nb = (int *)calloc(nb_expe,sizeof(int )) ; t_nprow = (int *)calloc(nb_expe,sizeof(int )) ; t_npcol = (int *)calloc(nb_expe,sizeof(int )) ; t_jobU[0] = jobU; t_jobVT[0] = jobVT; t_m[0] = m; t_n[0] = n; t_nb[0] = nb; t_nprow[0] = nprow; t_npcol[0] = npcol; } if (myrank_mpi==0){ printf("\n"); printf("--------------------------------------------------------------------------------------------------------------------\n"); printf(" Testing psgsevd -- float precision SVD ScaLAPACK routine \n"); printf("jobU jobVT m n nb p q || info heter resid orthU orthVT |SNN-SVV| time(s) cond(A) \n"); printf("--------------------------------------------------------------------------------------------------------------------\n"); } /**/ for (expe = 0; expe<nb_expe; expe++){ jobU = t_jobU[expe] ; jobVT = t_jobVT[expe] ; m = t_m[expe] ; n = t_n[expe] ; nb = t_nb[expe] ; nprow = t_nprow[expe] ; npcol = t_npcol[expe] ; if (nb>n) nb = n; if (nprow*npcol>nprocs_mpi){ if (myrank_mpi==0) printf(" **** ERROR : we do not have enough processes available to make a p-by-q process grid ***\n"); printf(" **** Bye-bye ***\n"); MPI_Finalize(); exit(1); } /**/ Cblacs_pinfo( &iam, &nprocs ) ; Cblacs_get( -1, 0, &ictxt ); Cblacs_gridinit( &ictxt, "Row", nprow, npcol ); Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol ); /**/ min_mn = min(m,n); /**/ //if (iam==0) //printf("\tm=%d\tn = %d\t\t(%d,%d)\t%dx%d\n",m,n,nprow,npcol,nb,nb); //printf("Hello World, I am proc %d over %d for MPI, proc %d over %d for BLACS in position (%d,%d) in the process grid\n", //myrank_mpi,nprocs_mpi,iam,nprocs,myrow,mycol); /* * * Work only the process in the process grid * */ //if ((myrow < nprow)&(mycol < npcol)){ if ((myrow>-1)&(mycol>-1)&(myrow<nprow)&(mycol<npcol)){ /* * * Compute the size of the local matrices (thanks to numroc) * */ mpA = numroc_( &m , &nb, &myrow, &izero, &nprow ); nqA = numroc_( &n , &nb, &mycol, &izero, &npcol ); mpU = numroc_( &m , &nb, &myrow, &izero, &nprow ); nqU = numroc_( &min_mn, &nb, &mycol, &izero, &npcol ); mpVT = numroc_( &min_mn, &nb, &myrow, &izero, &nprow ); nqVT = numroc_( &n , &nb, &mycol, &izero, &npcol ); /* * * Allocate and fill the matrices A and B * */ A = (float *)calloc(mpA*nqA,sizeof(float)) ; if (A==NULL){ printf("error of memory allocation A on proc %dx%d\n",myrow,mycol); exit(0); } /**/ // seed = iam*(mpA*nqA*2); srand(seed); idist = 2; iseed[0] = mpA%4096; iseed[1] = iam%4096; iseed[2] = nqA%4096; iseed[3] = 23; /**/ k = 0; for (i = 0; i < mpA; i++) { for (j = 0; j < nqA; j++) { slarnv_( &idist, iseed, &ione, &(A[k]) ); k++; } } /* * * Initialize the array descriptor for the distributed matrices xA, U and VT * */ itemp = max( 1, mpA ); descinit_( descA, &m, &n, &nb, &nb, &izero, &izero, &ictxt, &itemp, &info ); itemp = max( 1, mpA ); descinit_( descU, &m, &min_mn, &nb, &nb, &izero, &izero, &ictxt, &itemp, &info ); itemp = max( 1, mpVT ); descinit_( descVT, &min_mn, &n, &nb, &nb, &izero, &izero, &ictxt, &itemp, &info ); /**/ eps = pslamch_( &ictxt, "Epsilon" ); /**/ if ( ((jobU=='V')&(jobVT=='N')) ||(jobU == 'A' )||(jobVT=='A')){ nbtestcase++; U_VN = (float *)calloc(mpU*nqU,sizeof(float)) ; if (U_VN==NULL){ printf("error of memory allocation U_VN on proc %dx%d\n",myrow,mycol); exit(0); } S_VN = (float *)calloc(min_mn,sizeof(float)) ; if (S_VN==NULL){ printf("error of memory allocation S_VN on proc %dx%d\n",myrow,mycol); exit(0); } infoVN = driver_psgesvd( 'V', 'N', m, n, A, 1, 1, descA, S_VN, U_VN, 1, 1, descU, VT_VN, 1, 1, descVT, &MPIelapsedVN); orthU_VN = verif_orthogonality(m,min_mn,U_VN , 1, 1, descU); res_repres_VN = verif_repres_VN( m, n, A, 1, 1, descA, U_VN, 1, 1, descU, S_VN); if (infoVN==min_mn+1) hetereogeneityVN = 'H'; else hetereogeneityVN = 'N'; if ( iam==0 ) printf(" V N %6d %6d %3d %3d %3d || %3d %c %7.1e %7.1e %8.2f %7.1e\n", m,n,nb,nprow,npcol,infoVN,hetereogeneityVN,res_repres_VN/(S_VN[0]/S_VN[min_mn-1]), orthU_VN,MPIelapsedVN,S_VN[0]/S_VN[min_mn-1]); if (infoVN==min_mn+1) nbhetereogeneity++ ; else if ((res_repres_VN/eps/(S_VN[0]/S_VN[min_mn-1])>threshold)||(orthU_VN/eps>threshold)||(infoVN!=0)) nbfailure++; } /**/ if (((jobU=='N')&(jobVT=='V'))||(jobU == 'A' )||(jobVT=='A')){ nbtestcase++; VT_NV = (float *)calloc(mpVT*nqVT,sizeof(float)) ; if (VT_NV==NULL){ printf("error of memory allocation VT_NV on proc %dx%d\n",myrow,mycol); exit(0); } S_NV = (float *)calloc(min_mn,sizeof(float)) ; if (S_NV==NULL){ printf("error of memory allocation S_NV on proc %dx%d\n",myrow,mycol); exit(0); } infoNV = driver_psgesvd( 'N', 'V', m, n, A, 1, 1, descA, S_NV, U_NV, 1, 1, descU, VT_NV, 1, 1, descVT, &MPIelapsedNV); orthVT_NV = verif_orthogonality(min_mn,n,VT_NV, 1, 1, descVT); res_repres_NV = verif_repres_NV( m, n, A, 1, 1, descA, VT_NV, 1, 1, descVT, S_NV); if (infoNV==min_mn+1) hetereogeneityNV = 'H'; else hetereogeneityNV = 'N'; if ( iam==0 ) printf(" N V %6d %6d %3d %3d %3d || %3d %c %7.1e %7.1e %8.2f %7.1e\n", m,n,nb,nprow,npcol,infoNV,hetereogeneityNV,res_repres_NV/(S_NV[0]/S_NV[min_mn-1]), orthVT_NV,MPIelapsedNV,S_NV[0]/S_NV[min_mn-1]); if (infoNV==min_mn+1) nbhetereogeneity++ ; else if ((res_repres_NV/eps/(S_NV[0]/S_NV[min_mn-1])>threshold)||(orthVT_NV/eps>threshold)||(infoNV!=0)) nbfailure++; } /**/ if ( ((jobU=='N')&(jobVT=='N')) || ((jobU=='V')&(jobVT=='V')) || (jobU == 'A' ) || (jobVT=='A') ) { nbtestcase++; U_VV = (float *)calloc(mpU*nqU,sizeof(float)) ; if (U_VV==NULL){ printf("error of memory allocation U_VV on proc %dx%d\n",myrow,mycol); exit(0); } VT_VV = (float *)calloc(mpVT*nqVT,sizeof(float)) ; if (VT_VV==NULL){ printf("error of memory allocation VT_VV on proc %dx%d\n",myrow,mycol); exit(0); } S_VV = (float *)calloc(min_mn,sizeof(float)) ; if (S_VV==NULL){ printf("error of memory allocation S_VV on proc %dx%d\n",myrow,mycol); exit(0); } infoVV = driver_psgesvd( 'V', 'V', m, n, A, 1, 1, descA, S_VV, U_VV, 1, 1, descU, VT_VV, 1, 1, descVT, &MPIelapsedVV); orthU_VV = verif_orthogonality(m,min_mn,U_VV , 1, 1, descU); orthVT_VV = verif_orthogonality(min_mn,n,VT_VV, 1, 1, descVT); residF = verif_representativity( m, n, A, 1, 1, descA, U_VV, 1, 1, descU, VT_VV, 1, 1, descVT, S_VV); if (infoVV==min_mn+1) hetereogeneityVV = 'H'; else hetereogeneityVV = 'N'; if ( iam==0 ) printf(" V V %6d %6d %3d %3d %3d || %3d %c %7.1e %7.1e %7.1e %8.2f %7.1e\n", m,n,nb,nprow,npcol,infoVV,hetereogeneityVV,residF,orthU_VV,orthVT_VV,MPIelapsedVV,S_VV[0]/S_VV[min_mn-1]); if (infoVV==min_mn+1) nbhetereogeneity++ ; else if ((residF/eps>threshold)||(orthU_VV/eps>threshold)||(orthVT_VV/eps>threshold)||(infoVV!=0)) nbfailure++; } /**/ if (((jobU=='N')&(jobVT=='N'))||(jobU == 'A' )||(jobVT=='A')){ nbtestcase++; S_NN = (float *)calloc(min_mn,sizeof(float)) ; if (S_NN==NULL){ printf("error of memory allocation S_NN on proc %dx%d\n",myrow,mycol); exit(0); } infoNN = driver_psgesvd( 'N', 'N', m, n, A, 1, 1, descA, S_NN, U_NN, 1, 1, descU, VT_NN, 1, 1, descVT, &MPIelapsedNN); S_res_NN = (float *)calloc(min_mn,sizeof(float)) ; if (S_res_NN==NULL){ printf("error of memory allocation S on proc %dx%d\n",myrow,mycol); exit(0); } scopy_(&min_mn,S_VV,&ione,S_res_NN,&ione); saxpy_ (&min_mn,&rtmone,S_NN,&ione,S_res_NN,&ione); residS_NN = snrm2_(&min_mn,S_res_NN,&ione) / snrm2_(&min_mn,S_VV,&ione); free(S_res_NN); if (infoNN==min_mn+1) hetereogeneityNN = 'H'; else hetereogeneityNN = 'N'; if ( iam==0 ) printf(" N N %6d %6d %3d %3d %3d || %3d %c %7.1e %8.2f %7.1e\n", m,n,nb,nprow,npcol,infoNN,hetereogeneityNN,residS_NN,MPIelapsedNN,S_NN[0]/S_NN[min_mn-1]); if (infoNN==min_mn+1) nbhetereogeneity++ ; else if ((residS_NN/eps>threshold)||(infoNN!=0)) nbfailure++; } /**/ if (((jobU=='V')&(jobVT=='N'))||(jobU == 'A' )||(jobVT=='A')){ free(S_VN); free(U_VN); } if (((jobU=='N')&(jobVT=='V'))||(jobU == 'A' )||(jobVT=='A')){ free(VT_NV); free(S_NV); } if (((jobU=='N')&(jobVT=='N'))||(jobU == 'A' )||(jobVT=='A')){ free(S_NN); } if (((jobU=='N')&(jobVT=='N'))||((jobU=='V')&(jobVT=='V'))||(jobU == 'A' )||(jobVT=='A')){ free(U_VV); free(S_VV); free(VT_VV);} free(A); Cblacs_gridexit( 0 ); } /* * Print ending messages */ } if ( iam==0 ){ printf("--------------------------------------------------------------------------------------------------------------------\n"); printf(" [ nbhetereogeneity = %d / %d ]\n",nbhetereogeneity, nbtestcase); printf(" [ nbfailure = %d / %d ]\n",nbfailure, nbtestcase-nbhetereogeneity); printf("--------------------------------------------------------------------------------------------------------------------\n"); printf("\n"); } /**/ free(t_jobU ); free(t_jobVT ); free(t_m ); free(t_n ); free(t_nb ); free(t_nprow ); free(t_npcol ); MPI_Finalize(); exit(0); }