void blacs_gridinit_nektar(int *BLACS_PARAMS, int *DESCA, int *DESCB){
  int i,j,k;

  /*
   BLACS_PARAMS:
   [0]  = ictxt;
   [1]  = my_proc;
   [2]  = total_procs;
   [3]  = Nproc_row;
   [4]  = Nproc_col;
   [5]  = my_row;
   [6]  = my_col;
   [7]  = Global_rows;
   [8]  = Global_columns;
   [9]  = Block_Size_row;
   [10] = Block_Size_col;
   [11] = LOC_rows;
   [12] = LOC_columns;
   */


   blacs_pinfo_( i, j);
   BLACS_PARAMS[1] = i;
   BLACS_PARAMS[2] = j;

   blacs_get_( -1, 0,  k);   // <- LG check the arguments of this call
   BLACS_PARAMS[0] = k;

   i = BLACS_PARAMS[3];
   j = BLACS_PARAMS[4];
   blacs_gridinit_( BLACS_PARAMS[0], "Row", i, j);

   blacs_gridinfo_( BLACS_PARAMS[0] ,BLACS_PARAMS[3], BLACS_PARAMS[4], i, j);
   BLACS_PARAMS[5] = i;
   BLACS_PARAMS[6] = j;

   i = 0;
   BLACS_PARAMS[11] = numroc_(BLACS_PARAMS[7],BLACS_PARAMS[9], BLACS_PARAMS[5],i,BLACS_PARAMS[3]);
   BLACS_PARAMS[12] = numroc_(BLACS_PARAMS[8],BLACS_PARAMS[10],BLACS_PARAMS[6],i,BLACS_PARAMS[4]);

   i = 0;
   j = 0;
   descinit_(DESCA, BLACS_PARAMS[7], BLACS_PARAMS[8],
                    BLACS_PARAMS[9], BLACS_PARAMS[10],
                    i, j,
                    BLACS_PARAMS[0],BLACS_PARAMS[11],
                    k);
   if (k != 0)
     fprintf(stderr,"blacs_gridinit_nektar: ERROR, descinit(info) = %d \n",k);

   descinit_(DESCB, BLACS_PARAMS[7], 1,
                    BLACS_PARAMS[9], 1,
                    i, j,
                    BLACS_PARAMS[0],BLACS_PARAMS[11],
                    k);
  if (k != 0)
     fprintf(stderr,"blacs_gridinit_nektar: ERROR, descinit(info) = %d \n",k);


}
Example #2
0
///
/// This code generates a PdgesvdArgs parameter block that can be used to drive
/// pdgesvdSlave2() when there is no SciDB application to provide the info.
/// It makes up parameters for a pdgesvd call that are appropriate to the
/// processor grid and order of matrix being decomposed
///
scidb::PdgesvdArgs pdgesvdGenTestArgs(slpp::int_t ICTXT, slpp::int_t NPROW, slpp::int_t NPCOL,
                                      slpp::int_t MYPROW, slpp::int_t MYPCOL, slpp::int_t MYPNUM,
                                      slpp::int_t order)
{
    scidb::PdgesvdArgs result;

    // hard-code a problem based on order and fixed block size
    const slpp::int_t M=order;
    const slpp::int_t N=order;
    const slpp::int_t MIN_MN=order;
    const slpp::int_t BLKSZ=slpp::SCALAPACK_EFFICIENT_BLOCK_SIZE; // we are making up an array descriptor, not receiving one
                                                                  // as is normal for functions in a xxxxSlave.cpp file.  It is only because
                                                                  // its a test routine that SCALAPACK_EFFICIENT_BLOCK_SIZE is referenced here
                                                                  // Normally it is only used at the xxxxxPhysical.cpp operator level.
    const slpp::int_t one = 1 ;
    const char jobU = 'V';
    const char jobVT = 'V';

    // create ScaLAPACK array descriptors
    const slpp::int_t RSRC = 0 ;
    // LLD(A)
    slpp::int_t LLD_A = std::max(one, numroc_( order, BLKSZ, MYPROW, RSRC, NPROW ));
    // LLD(VT)
    slpp::int_t LLD_VT = std::max(one, numroc_( order, BLKSZ, MYPROW, RSRC, NPROW ));

    // WARNING -- note I never checked INFO from descinits !!
    slpp::int_t INFO = 0;

    slpp::desc_t DESC_A;
    descinit_(DESC_A, order, order, BLKSZ, BLKSZ, 0, 0, ICTXT, LLD_A, INFO);
    if (INFO != 0) throw("pdgesvdGenTestArgs: unexpected runtime error");

    slpp::desc_t DESC_U;
    descinit_(DESC_U, order, order, BLKSZ, BLKSZ, 0, 0, ICTXT, LLD_A, INFO);
    if (INFO != 0) throw("pdgesvdGenTestArgs: unexpected runtime error");

    slpp::desc_t DESC_VT;
    descinit_(DESC_VT, order, order, BLKSZ, BLKSZ, 0, 0, ICTXT, LLD_VT, INFO);
    if (INFO != 0) throw("pdgesvdGenTestArgs: unexpected runtime error");

    // S is different: global, not distributed, so its LLD(S) == LEN(S)
    slpp::desc_t DESC_S;
    descinit_(DESC_S, MIN_MN, 1, BLKSZ, BLKSZ, 0, 0, ICTXT, MIN_MN, INFO);
    if (INFO != 0) throw("pdgesvdGenTestArgs: unexpected runtime error");

    pdgesvdMarshallArgs(&result, NPROW,  NPCOL, MYPROW, MYPCOL, MYPNUM,
                           jobU, jobVT, M, N,
                           NULL /*A*/,  one, one, DESC_A,
                           NULL /*S*/,
                           NULL /*U*/,  one, one, DESC_U,
                           NULL /*VT*/, one, one, DESC_VT);
    return result;
}
Example #3
0
void 
dgather(int ictxt, int n, int numc, int nb, double *A, double *A_d, int *descAd){

int RootNodeic, ione=1, izero=0, isRootNode=0, nru, info;
int nprow, npcol, myrow, mycol, descA[9], itemp;
int i,k;

   sl_init_(&RootNodeic, &ione, &ione);

   Cblacs_gridinfo(ictxt, &nprow,&npcol, &myrow, &mycol);

   if (myrow==0 && mycol ==0){ isRootNode = 1;}

   if(isRootNode){
      nru = numroc_(&n, &n, &myrow, &izero, &nprow);
      itemp = max(1,nru);
      descinit_(descA, &n, &numc, &n, &n, &izero, &izero, &RootNodeic, &itemp, &info );
   }
   else{
      k=0;
      for(i=0;i<9;i++){
          descA[k]=0;
          k++;
      }
      descA[1]=-1;
    }

   pdgemr2d_(&n,&numc,A_d,&ione, &ione, descAd, A, &ione, &ione, descA, &ictxt );

   if (isRootNode){
       Cblacs_gridexit(RootNodeic);
   }
  
}
Example #4
0
/*===================================================================
 * 
 * Simplified wrapper for the ScaLAPACK routine descinit, which
 * initializes the descrip array associated to each distributed
 * matrix or vector.
 */
void Build_descrip(int my_rank, char* name, int* descrip, 
                   int m, int n, int row_block_size,
                   int col_block_size, int blacs_grid, 
                   int leading_dim) {
    int first_proc_row = 0;  /* Assume all distributed arrays begin */
    int first_proc_col = 0;  /* in process row 0, process col 0     */
    int error_info;

    extern void descinit_(int* descrip, int* m, int* n, 
                int* row_block_size, int* col_block_size, 
                int* first_proc_row, int* first_proc_col, 
                int* blacs_grid, int* leading_dim,
                int* error_info);

    descinit_(descrip, &m, &n, &row_block_size, &col_block_size,
              &first_proc_row, &first_proc_col, &blacs_grid,
              &leading_dim, &error_info);

    if (error_info != 0) {
        fprintf(stderr, "Process %d > Descinit for b failed.\n",
                my_rank);
        fprintf(stderr, "Process %d > error_info = %d\n", 
                my_rank, error_info);
        fprintf(stderr, "Process %d > Quitting.\n", my_rank);
        MPI_Abort(MPI_COMM_WORLD, -1);
    }
}  /* Build_descrip */
Example #5
0
SEXP R_descinit(SEXP DIM, SEXP BLDIM, SEXP ICTXT, SEXP LLD)
{
  R_INIT;
  int row_col_src = 0;
  int info = 0;
  SEXP desc;
  newRvec(desc, 9, "int");
  
  descinit_(INTP(desc), INTP(DIM), INTP(DIM)+1, INTP(BLDIM), INTP(BLDIM)+1, 
    &row_col_src, &row_col_src, INTP(ICTXT), INTP(LLD), &info);
  
  R_END;
  return desc;
}
Example #6
0
float verif_repres_VN(int m, int n, float *A, int ia, int ja, int *descA,
                                       float *U, int iu, int ju, int *descU,
                                       float *S){

        float *VTcpy=NULL;
        int nprow, npcol, myrow, mycol;
        int min_mn, max_mn, mpA, prow, localcol, i, nqA;
        int ictxt, nbA, rsrcA, csrcA, mpVT, nqVT, descVTcpy[9], itemp, ivtcpy, jvtcpy;
        int ctxt_ = 1, nb_ = 5, rsrc_ = 6, csrc_ = 7;
        int izero = 0, info;
        float tpone= +1.0e+00,  tzero= +0.0e+00;
        float verif_repres_VN, invStemp;

        min_mn = min(m,n);
        max_mn = max(m,n);
        ictxt = descA[ctxt_];
        Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );

        nbA = descA[nb_]; rsrcA = descA[rsrc_] ; csrcA = descA[csrc_] ;

        mpA    = numroc_( &m     , &nbA, &myrow, &rsrcA, &nprow );
        nqA    = numroc_( &n     , &nbA, &mycol, &csrcA, &npcol );

        mpVT   = numroc_( &min_mn, &nbA, &myrow, &rsrcA, &nprow );
        nqVT   = numroc_( &n     , &nbA, &mycol, &csrcA, &npcol );

        itemp = max( 1, mpVT );
        descinit_( descVTcpy, &min_mn, &n, &nbA, &nbA, &rsrcA, &csrcA, &ictxt, &itemp, &info );

        ivtcpy = 1; jvtcpy = 1; 
        VTcpy = (float *)calloc(mpVT*nqVT,sizeof(float)) ;
        if (VTcpy==NULL){ printf("error of memory allocation VTcpy on proc %dx%d\n",myrow,mycol); exit(0); }

        psgemm_( "T", "N", &min_mn, &n, &m, &tpone, U, &iu, &ju, descU, A, &ia, &ja, descA,
                        &tzero, VTcpy, &ivtcpy, &jvtcpy, descVTcpy ); 

        for (i=1;i<min_mn+1;i++){
                prow = indxg2p_( &i, &nbA, &izero, &izero, &nprow );
                localcol = indxg2l_( &i, &nbA, &izero, &izero, &nprow );
                invStemp = 1/S[i-1];
                if( myrow==prow )
                        sscal_( &nqA, &invStemp, &(VTcpy[localcol-1]), &mpVT );
        }

        verif_repres_VN = verif_orthogonality(min_mn,n,VTcpy, ivtcpy, jvtcpy, descVTcpy);

        free(VTcpy);

        return verif_repres_VN;
}
Example #7
0
float verif_repres_NV(int m, int n, float *A, int ia, int ja, int *descA,
                                float *VT, int ivt, int jvt, int *descVT,
                                float *S){

        float *Ucpy=NULL;
        int nprow, npcol, myrow, mycol;
        int min_mn, max_mn, mpA, pcol, localcol, i, nqA;
        int ictxt, nbA, rsrcA, csrcA, mpU, nqU, descUcpy[9], itemp, iucpy, jucpy;
        int ctxt_ = 1, nb_ = 5, rsrc_ = 6, csrc_ = 7;
        int izero = 0, ione = 1, info;
        float tpone= +1.0e+00,  tzero= +0.0e+00;
        float verif_repres_NV, invStemp;

        min_mn = min(m,n);
        max_mn = max(m,n);
        ictxt = descA[ctxt_];
        Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );

        nbA = descA[nb_]; rsrcA = descA[rsrc_] ; csrcA = descA[csrc_] ;

        mpA    = numroc_( &m     , &nbA, &myrow, &rsrcA, &nprow );
        nqA    = numroc_( &n     , &nbA, &mycol, &csrcA, &npcol );

        itemp = max( 1, mpA );
        descinit_( descUcpy,  &m, &min_mn, &nbA, &nbA, &rsrcA, &csrcA, &ictxt, &itemp, &info );

        iucpy = 1; jucpy = 1;   
        mpU    = numroc_( &m     , &nbA, &myrow, &rsrcA, &nprow );
        nqU    = numroc_( &min_mn, &nbA, &mycol, &csrcA, &npcol );
        Ucpy = (float *)calloc(mpU*nqU,sizeof(float)) ;
        if (Ucpy==NULL){ printf("error of memory allocation Ucpy on proc %dx%d\n",myrow,mycol); exit(0); }

        psgemm_( "N", "T", &m, &min_mn, &n, &tpone, A, &ia, &ja, descA, VT, &ivt, &jvt, descVT,
                        &tzero, Ucpy, &iucpy, &jucpy, descUcpy ); 

        for (i=1;i<min_mn+1;i++){
                pcol = indxg2p_( &i, &(descUcpy[5]), &izero, &izero, &npcol );
                localcol = indxg2l_( &i, &(descUcpy[5]), &izero, &izero, &npcol );
                invStemp = 1/S[i-1];
                if( mycol==pcol )
                        sscal_( &mpA, &invStemp, &(Ucpy[ ( localcol-1 )*descUcpy[8] ]), &ione );
        }

        verif_repres_NV = verif_orthogonality(m,min_mn,Ucpy, iucpy, jucpy, descUcpy);

        free(Ucpy);

        return verif_repres_NV;
}
Example #8
0
float verif_orthogonality(int m, int n, float *U, int iu, int ju, int *descU){

        float *R=NULL;
        int nprow, npcol, myrow, mycol;
        int mpR, nqR, nb, itemp, descR[9], ictxt, info, min_mn, max_mn;
        int ctxt_ = 1, nb_ = 5;
        int izero = 0, ione = 1;
        float *wwork=NULL;
        float tmone= -1.0e+00,  tpone= +1.0e+00,  tzero= +0.0e+00;
        float orthU;

        min_mn = min(m,n);
        max_mn = max(m,n);
        ictxt = descU[ctxt_];
        nb = descU[nb_];
        Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );

        mpR    = numroc_( &min_mn, &nb, &myrow, &izero, &nprow );
        nqR    = numroc_( &min_mn, &nb, &mycol, &izero, &npcol );
        R = (float *)calloc(mpR*nqR,sizeof(float)) ;
        if (R==NULL){ printf("error of memory allocation R on proc %dx%d\n",myrow,mycol); exit(0); }
        itemp = max( 1, mpR );
        descinit_( descR,  &min_mn, &min_mn, &nb, &nb, &izero, &izero, &ictxt, &itemp, &info );

        pslaset_( "F", &min_mn, &min_mn, &tzero, &tpone, R, &ione, &ione, descR );
        if (m>n)
                psgemm_( "T", "N", &min_mn, &min_mn, &m, &tpone, U, &iu, &ju, descU, U, 
                        &iu, &ju, descU, &tmone, R, &ione, &ione, descR );
        else
                psgemm_( "N", "T", &min_mn, &min_mn, &n, &tpone, U, &iu, &ju, descU, U, 
                        &iu, &ju, descU, &tmone, R, &ione, &ione, descR );
        orthU = pslange_( "F", &min_mn, &min_mn, R, &ione, &ione, descR, wwork );
        orthU = orthU / ((float) max_mn);
        free(R);

        return orthU;

}
Example #9
0
void 
ddistr(int ictxt, int n, int numc, int nb, double *A , double *A_d, int *descAd ){

int RootNodeic, ione=1, izero=0, isRootNode=0, nru, info;
int nprow, npcol, myrow, mycol,descA[9], itemp;
int i,k;

/*
#ifdef NOUNDERLAPACK
 sl_init__(&RootNodeic,&ione, &ione);
#else
  sl_init__(&RootNodeic,&ione, &ione);
#endif
*/



   
  sl_init_(&RootNodeic,&ione, &ione);
  
  Cblacs_gridinfo(ictxt, &nprow, &npcol, &myrow, &mycol);  


	//printf("nprow=%d, npcol=%d, myrow=%d, mycol=%d\n",nprow,npcol,myrow,mycol);
	//printf("nb=%d\n",nb);


  if (myrow==0 && mycol==0) { isRootNode=1;}

  if (isRootNode){
     //printf("root entro aca...\n");
	nru = numroc_(&n, &n, &myrow,&izero, &nprow );
     //printf("root paso numroc\n");
	itemp = max(1, nru);
     descinit_(descA, &n, &numc, &n, &n, &izero, &izero, &RootNodeic, &itemp, &info);
  	//printf("root paso descinit\n");
  } 
  else{
     //printf("yo entre aca\n");
     k=0;
     for(i=0;i<9;i++){ 
       descA[k]=0;
       k++;
     }
     descA[1]=-1;
  }

  //printf("inicio de cosas para todos\n");
  nru = numroc_(&n, &nb, &myrow, &izero, &nprow);
  //printf("todos pasan numroc\n");
  itemp = max(1,nru);
  descinit_(descAd, &n, &numc, &nb, &nb, &izero, &izero, &ictxt,&itemp, &info);  
  //printf("todos pasan descinit\n");

  pdgemr2d_( &n, &numc, A, &ione, &ione, descA, A_d, &ione, &ione, descAd, &ictxt);
  //printf("todos pasan pdgemr2d\n");
  
 if (isRootNode){ 
     	//printf("RootNodeic=%d\n",RootNodeic);
	Cblacs_gridexit(RootNodeic);
	//printf("root paso gridexit\n");
  }
}
Example #10
0
int MAIN__(int argc, char** argv) {
    int num;  // number of data
    int dim;  // dimension of each data
    int nprow=4; // number of row
    int npcol=1;  // number of columnn
    int zero=0, one=1; // constant value
    int ictxt,myrow,mycol,pnum,pdim,info;
    char ifilename[LEN_FILENAME];
    char ofilename[LEN_FILENAME];

    int myproc, nprocs;
    Cblacs_pinfo(&myproc, &nprocs);
    Cblacs_setup(&myproc, &nprocs);
    Cblacs_get(-1,0,&ictxt);
    nprow = nprocs;
    npcol = 1; // fixed

    char order[] = "Row";
    Cblacs_gridinit(&ictxt, order, nprow, npcol);
    Cblacs_gridinfo(ictxt, &nprow, &npcol, &myrow, &mycol);

    if (DEBUG_MODE) {
        printf("ConTxt = %d\n", ictxt);
        printf("nprocs=%d, nprow=%d, npcol=%d\n", nprocs, nprow, npcol);
        printf("nprocs=%d, myrow=%d, mycol=%d\n", nprocs, myrow, mycol);
    }

    get_option(argc, argv, ifilename, ofilename, &num, &dim);

    // 0. cosinedist(ij) = 1 - V(i)V(j)/(Length(V(i))*Length(V(j)))

    // 1. calculate submatrix size
    int bsize = num / nprow; // blocking factor
    pnum = num / nprow;
    pdim = dim;
    if ( myrow < (num/bsize)%nprow) {
        pnum += bsize;
    }
    else if ( myrow == (num/bsize)%nprow) {
        pnum += (num % bsize);
    }
    else {
    }
    if(DEBUG_MODE)
        printf("myproc=%d: pnum=%d, pdim=%d, bsize=%d\n", myproc, pnum, pdim, bsize);

    int desc_input[9], desc_v[9], desc_ip[9], desc_n[9], desc_result[9];
    descinit_(desc_input,  &num, &dim, &num,   &dim,  &zero, &zero, &ictxt, &num,  &info);
    descinit_(desc_v,      &num, &dim, &bsize, &pdim, &zero, &zero, &ictxt, &pnum, &info);
    descinit_(desc_ip,     &num, &num, &bsize, &num,  &zero, &zero, &ictxt, &pnum, &info);
    descinit_(desc_n,      &num, &one, &bsize, &one,  &zero, &zero, &ictxt, &pnum, &info);
    descinit_(desc_result, &num, &num, &num,   &num,  &zero, &zero, &ictxt, &num,  &info);

    // 2. read input data
    double* input;
    if (myproc == 0) {
        input = (double*)malloc(sizeof(double)*num*dim);
        memset(input, 0, sizeof(double)*num*dim);
        read_data(ifilename, num, dim, input);
        printArray("input", myproc, input, num, dim);
    }

    // 3. distribute input data array
    double* V = (double*)malloc(sizeof(double)*pnum*pdim);
    memset(V, 0, sizeof(double)*pnum*pdim);
    Cpdgemr2d(num, dim, input, 1, 1, desc_input, V, 1, 1, desc_v, ictxt);
    printArray("V", myproc, V, pnum, pdim);

    // 4. InnerProduct = VV'
    double* InnerProduct = (double*)malloc(sizeof(double)*pnum*num);
    memset(InnerProduct, 0, sizeof(double)*pnum*num);
    char transa = 'N', transb = 'T';
    int m = num, n = num, k = dim;
    int lda = num, ldb = num, ldc = num;
    double alpha = 1.0f, beta = 0.0f;
    pdgemm_(&transa, &transb, &m, &n, &k, &alpha, V, &one, &one, desc_v, V, &one, &one, desc_v, &beta, InnerProduct, &one, &one, desc_ip);
    printArray("InnerProduct", myproc, InnerProduct, pnum, num);

    // 5. Norm of each vector
    double* Norm = (double*)malloc(sizeof(double)*pnum);
    for (int i = 0; i < pnum; i++) {
        int n = ((myproc*bsize)+(i/bsize)*(nprocs-1)*bsize+i)*pnum + i;
        Norm[i] = sqrt(InnerProduct[n]);
    }
    printArray("Norm", myproc, Norm, 1, pnum);

    // 6. Norm product matrix
    double* NormProduct = (double*)malloc(sizeof(double)*pnum*num);
    memset(NormProduct, 0, sizeof(double)*pnum*num);
    char uplo = 'U';
    n = num;
    alpha = 1.0f;
    int incx = 1;
    lda = num;
    pdsyr_(&uplo, &n, &alpha, Norm, &one, &one, desc_n, &incx, NormProduct, &one, &one, desc_ip);
    printArray("NormProduct", myproc, NormProduct, pnum, num);

    // 7. CosineDistance(ij) = 1-InnerProduct(ij)/NormProduct(ij)
    double* CosineDistance = (double*)malloc(sizeof(double)*pnum*num);
    memset(CosineDistance, 0, sizeof(double)*pnum*num);
    for (int j = 0; j < num; j++) {
        for (int i = 0; i < pnum; i++) {
            int n = ((myproc*bsize)+i+(i/bsize)*(nprocs-1)*bsize)*pnum+i;
            int p = i+j*pnum;
            if (p<=n) {
                CosineDistance[p] = 0.0;
            }
            else {
                CosineDistance[p] = 1 - InnerProduct[p]/NormProduct[p];
            }
        }
    }
    printArray("CosineDistance", myproc, CosineDistance, pnum, num);

    // 8. gather result
    double* result;
    if ( myproc == 0 ) {
        result = (double*)malloc(sizeof(double)*num*num);
        memset(result, 0, sizeof(double)*num*num);
    }
    Cpdgemr2d(num, num, CosineDistance, 1, 1, desc_ip, result, 1, 1, desc_result, ictxt);

    // 9. output to file
    if ( myproc == 0 ) {
        output_results(ofilename, result, num, num);
    }

    // a. cleanup memory
    free(V);
    free(InnerProduct);
    free(Norm);
    free(NormProduct);
    free(CosineDistance);
    if ( myproc == 0 ) {
        free(input);
        free(result);
    }

    blacs_exit_(&zero);

    return 0;
}
/* Test program 
 * created 23/09/2014
 * author Alex Bombrun
 * 
 * icc -O1  -o eigen.exe lapackReadStore.c mpiutil.c normals.c matrixBlockStore.c -mkl
 * ./eigen.exe 4 4
 *
 */
int main(int argc, char **argv) {
  
    FILE* store;
    FILE* scaStore;
    
    int N , M;
    int i, j;
    
    int n_blocks;
    int scalapack_size;
    int NB, MB;
    int i_block, j_block;
    int dim[4];
    double * mat;  // local matrix block use for reading
        
    int t, t_block;
    
    const char* profileG_file_name= "./data/NormalsG/profile.txt";
    const char* store_location = "./data/ReducedNormals";
    const char* scaStore_location ="./data/DiagCholeskyReducedNormals";
    
    int mp;	 // number of rows in the processor grid
    int mla;   // number of rows in the local array
    int mb;    // number of rows in a block
    int np;	 // number of columns in the processor grid
    int nla;   // number of columns in the local array
    int nb;    // number of columns in a block
    
    int mype,npe; // rank and total number of process
    
    int idescal[9]; // matrix descriptors
    double *la; // matrix values: al is the local array
    
    int idescbl[9];
    double *lb;
    double normb;
    
    int idescxl[9];
    double *lx;
    double normx;
    
    int idesczl[9]; // matrix descriptors
    double *lz; // matrix values: al is the local array
    
    double *w;
   
    int ierr; // error output 
    int mp_ret, np_ret, myrow, mycol; // to store grid info
    
    int zero=0; // value used for the descriptor initialization
    int one=1; // value used for the descriptor initialization
    
    int  m,n; // matrix A dimensions
    double norm, cond;
    double *work = NULL;
    double * work2 = NULL;
    int *iwork = NULL;
    int lwork, liwork;


     float ll,mm,cr,cc;
      int ii,jj,pr,pc,h,g; // ii,jj coordinates of local array element
      int rsrc=0,csrc=0; // assume that 0,0 element should be stored in the 0,0 process
      int n_b = 1;
      int index;
    int icon; // scalapack cblacs context
    char normJob, jobz, uplo, trans, diag;
    
    double MPIt1, MPIt2, MPIelapsed;
    
    jobz= 'N'; uplo='U';
    Cblacs_pinfo( &mype, &npe );
    
     if (argc == 3) {
	//printf("%s %s %s\n", argv[0], argv[1], argv[2]);
	n_blocks= (int) strtol(argv[1], NULL, 10);
	scalapack_size= (int) strtol(argv[2], NULL, 10);
     } else {
	printf("Usage: expect 2 integers \n");
	printf(" 1 : the number of diagonal blocks \n");
	printf(" 2 : scalapack number to define block size (assume n is divisible by sqrt(p) and that n/sqrt(p) is divisible by this number)\n");
	exit( -1);
     }
    
  

    printf("%d/%d: read store\n",mype,npe);
   
    N = getNumberOfLine(profileG_file_name); // the dimension of the matrix;
    M = N; // square matrix
    
    m=M; //mla*mp;
    n=N; //nla*np;
   
    np = isqrt(npe); // assume that the number of process is a square
    mp = np; // square grid
    
    mla = m/mp; // assume that the matrix dimension if a multiple of the process grid dimension
    nla = n/np;
    
    mb = mla/scalapack_size; // assume that the dimension of the matrix is a multiple of the number of the number of diagonal blocks
    nb = nla/scalapack_size;
    
    // init CBLACS
    Cblacs_get( -1, 0, &icon );
    Cblacs_gridinit( &icon,"c", mp, np ); 
    Cblacs_gridinfo( icon, &mp_ret, &np_ret, &myrow, &mycol);
    
   

    // allocate local matrix
    la=malloc(sizeof(double)*mla*nla);
    printf("%d/%d: full matrix (%d,%d), local matrix (%d,%d), processor grid (%d,%d), block (%d,%d) \n", mype, npe, m, n, mla, nla, np, mp, mb, nb);

    // set identity matrix
    for(i = 0;i<M;i++){
	for(j = i;j<i+1;j++){
	    cr = (float)( i/mb );
	    h = rsrc+(int)(cr);
	    pr = h%np;
	    cc = (float)( j/mb );
	    g = csrc+(int)(cc);
	    pc = g%mp;
	    // check if process should get this element
	    if (myrow == pr && mycol==pc){
		// ii = x + l*mb
		// jj = y + m*nb
		ll = (float)( ( i/(np*mb) ) );  // thinks seems to be mixed up does not matter as long as the matrix, the block and the grid is symmetric
		mm = (float)( ( j/(mp*nb) ) );
		ii = i%mb + (int)(ll)*mb;
		jj = j%nb + (int)(mm)*nb;
		index=jj*mla+ii;   // seems to be the transpose !?
		//if(index<0) printf("%d/%d: negative index (%d,%d) \n",mype,npe,i,j);
		//if(index>=mla*nla) printf("%d/%d: too large index (%d,%d) \n",mype,npe,i,j);
		la[index] = 1;
	    }
	}
    }



/*
    for(i_block=0;i_block<n_blocks;i_block++){
      printf("%d/%d: process store block %d \n", mype, npe, i_block);
      readStore(&store,i_block,store_location);
      t_block = 0;
      while(readNextBlockDimension(dim,store)!=-1) { // loop B over all block tasks
	j_block = mpi_get_diag_block_id(i_block, t_block, n_blocks);
	mat = malloc((dim[1]-dim[0])*(dim[3]-dim[2]) * sizeof(double));         
	    
	readNextBlock(dim[0],dim[1],dim[2],dim[3],mat,store);
	if (dim[0]==dim[2]){ // process only the diagonal blocks

//	printf("%d/%d: read block (%d,%d) with global indices (%d,%d,%d,%d) \n",mype, npe, i_block,j_block,dim[0],dim[1],dim[2],dim[3]);
	
	NB = dim[1]-dim[0];
	MB = dim[3]-dim[2];
	for(i = dim[0];i<dim[1];i++){
	  for(j = dim[2];j<dim[3];j++){
	      //matA[i*M+j] = mat[(i-dim[0])*MB+(j-dim[2])];
	     // finding out which pe gets this i,j element
              cr = (float)( i/mb );
              h = rsrc+(int)(cr);
              pr = h%np;
              cc = (float)( j/mb );
              g = csrc+(int)(cc);
              pc = g%mp;
	      // check if process should get this element
              if (myrow == pr && mycol==pc){
		  // ii = x + l*mb
		  // jj = y + m*nb
                  ll = (float)( ( i/(np*mb) ) );  // thinks seems to be mixed up does not matter as long as the matrix, the block and the grid is symmetric
                  mm = (float)( ( j/(mp*nb) ) );
                  ii = i%mb + (int)(ll)*mb;
                  jj = j%nb + (int)(mm)*nb;
                  index=jj*mla+ii;   // seems to be the transpose !?
		  //if(index<0) printf("%d/%d: negative index (%d,%d) \n",mype,npe,i,j);
		  //if(index>=mla*nla) printf("%d/%d: too large index (%d,%d) \n",mype,npe,i,j);
                  la[index] = mat[(i-dim[0])*MB+(j-dim[2])];
              }
	  }
	}
	// transpose
	if(j_block != i_block){
	  for(i = dim[0];i<dim[1];i++){
	    for(j = dim[2];j<dim[3];j++){
	      //matA[j*M+i] = mat[(i-dim[0])*MB+(j-dim[2])];
	       // finding out which pe gets this j,i element
              cr = (float)( j/mb );
              h = rsrc+(int)(cr);
              pr = h%np;
              cc = (float)( i/mb );
              g = csrc+(int)(cc);
              pc = g%mp;
	      // check if process should get this element
              if (myrow == pr && mycol==pc){
		  // ii = x + l*mb
		  // jj = y + m*nb
                  ll = (float)( ( j/(np*mb) ) );  // thinks seems to be mixed up does not matter as long as the matrix, the block and the grid is symmetric
                  mm = (float)( ( i/(mp*nb) ) );
                  ii = j%mb + (int)(ll)*mb;
                  jj = i%nb + (int)(mm)*nb;
                  index=jj*mla+ii;   // seems to be the transpose !?
		  //if(index<0) printf("%d/%d: negative index (%d,%d) \n",mype,npe,i,j);
		  //if(index>=mla*nla) printf("%d/%d: too large index (%d,%d) \n",mype,npe,i,j);

                  la[index] = mat[(i-dim[0])*MB+(j-dim[2])];
              }
	    }
	  } 
	}

	}
	
	free(mat);
	t_block++;
      }


      closeStore(store);
    }
*/    
    
    printf("%d/%d: finished scaterring the matrix \n",mype,npe);
    
    printf("%d/%d: start computing \n",mype,npe);
       // set the matrix descriptor
    ierr=0;
    descinit_(idescal, &m, &n  , &mb, &nb , &zero, &zero, &icon, &mla, &ierr); // processor grip id start at 0
    if (mype==0) saveMatrixDescriptor(idescal, scaStore_location);
    
    
    ierr=0;
    descinit_(idescbl, &m, &one  , &mb, &nb , &zero, &zero, &icon, &nla, &ierr); // processor grip id start at 0
    lb = calloc(sizeof(double),mla);
    
    ierr=0;
    // set x
    descinit_(idescxl, &n, &one  , &mb, &nb , &zero, &zero, &icon, &nla, &ierr); // processor grip id start at 0
    lx = calloc(sizeof(double),mla);
    for(i=0;i<mla;i++){
      lx[i] = 1.0/m;
    }
    pddot_(&n,&normx,lx,&one,&one,idescxl,&one,lx,&one,&one,idescxl,&one); // normx <- x'x
    if (mype==0) printf("%d/%d: normx2 %E \n",mype,npe,normx);  
    
    
    ierr=0;
    // set b
    double alpha =1.0;
    double beta =0.0;
    trans = 'N';
    pdgemv_(&trans,&m,&n,&alpha,la,&one,&one,idescal,lx,&one,&one,idescxl,&one,&beta,lb,&one,&one,idescbl,&one); // b <- A x
    pddot_(&n,&normb,lb,&one,&one,idescbl,&one,lb,&one,&one,idescbl,&one); // norm <- b'b
    if (mype==0) printf("%d/%d: normb2 %E \n",mype,npe,normb);  
    
    
    ierr = 0;
    // compute norm 1 of the reduced normal matrix
    /* DO NOT WORK
    lwork = 2*mla+2*nla;
    work = malloc(sizeof(double)*lwork);
    normJob = '1';
    norm = pdlansy_(&normJob, &uplo, &n, la, &one, &one, idescal, work);  // matrix index start at one 
    printf("%d/%d: norm %f \n",mype,npe,norm);
    free(work);
    */
    
    ierr = 0;
    // compute the cholesky decomposition 
    printf("%d/%d: start computing cholesky factor\n",mype,npe);  
    pdpotrf_(&uplo,&n,la,&one,&one,idescal,&ierr);
    printf("%d/%d: finish computing cholesky factor\n",mype,npe);
    openScalapackStore(&scaStore,myrow,mycol,scaStore_location);
    saveLocalMatrix(la,nla,mla,scaStore);
    
    double test=0.0;
    for(i=0;i<nla*mla;i++){
	test += la[i]*la[i];
    }
    printf("%d/%d: finished computing cholesky, test=%f \n",mype,npe,test);
    
    ierr =0;
    // assume x and b set
    // assume cholesky decomposition
    // compute the soluation A x = b
    diag = 'N';
    printf("%d/%d: start solving\n",mype,npe);  
    //pdpptrs_(&uplo, &trans , &diag , &n , &one , la , &one , &one , idescal , lb , &one , &one , idescbl , &ierr); // solve triangular system
    //pdtrtrs (&uplo, &trans , &diag , &n , &n , la , &one , &one , idescal , lb , &one , &one , idescbl , &ierr);
    pdpotrs_(&uplo, &n , &one , la , &one , &one , idescal , lb , &one , &one , idescbl , &ierr); // b<- A-1 b
    
    alpha = -1.0;
    normb=0;
    pdaxpy_(&n,&alpha,lx,&one,&one,idescxl,&one,lb,&one,&one,idescbl,&one); // b<-b-x
    pddot_(&n,&normb,lb,&one,&one,idescbl,&one,lb,&one,&one,idescbl,&one); // norm <- b'b
    if (mype==0) printf("%d/%d: finish solving, norm2(sol-true) %E \n",mype,npe,normb);  
    
    

    ierr = 0;
    /*
    // compute the eigen values
    jobz= 'N'; uplo='U'; // with N z is ignored
    descinit_(idesczl, &m, &n  , &mb, &nb , &zero, &zero, &icon, &mla, &ierr);
    lz = malloc(sizeof(double)*mla*nla);
    w = malloc(sizeof(double)*m);
    lwork = -1;
    work = malloc(sizeof(double)*2);
    pdsyev_( &jobz, &uplo, &n, la, &one, &one, idescal, w, lz, &one, &one, idesczl, work, &lwork, &ierr);   // only compute lwork
    //pdsyev_( &jobz, &uplo, &n, A, &ione, &ione, descA, W, Z, &ione, &ione, descZ, work, &lwork, &info );
    lwork= (int) work[0];
    free(work);
    work = (double *)calloc(lwork,sizeof(double)) ;
    //MPIt1 = MPI_Wtime();
    pdsyev_( &jobz, &uplo, &n, la, &one, &one, idescal, w, lz, &one, &one, idesczl, work, &lwork, &ierr);   // compute the eigen values
    //MPIt2 = MPI_Wtime();
    //MPIelapsed=MPIt2-MPIt1;
    
    if (mype == 0) {
	saveMatrix(n,w,"eigenvalues.txt");
	//printf("%d/%d: finished job in %8.2fs\n",mype,npe,MPIelapsed); // not working
    }
    */
    
    ierr = 0;
    // compute the conditioner number assume that the norm and the cholesky decomposition have been computed
    /* DO NOT WORK
    lwork = 2*mla+3*nla;
    printf("%d/%d: lwork=%d @%p\n",mype,npe,lwork,&lwork);
    work2 = malloc(sizeof(double)*lwork);
    liwork = 2*mla+3*nla;
    iwork = malloc(sizeof(int)*liwork);
    pdpocon_(&uplo,&n,la,&one,&one,idescal,&norm,&cond,work2,&lwork,iwork,&liwork,&ierr);
    printf("%d/%d: condition number %f \n",mype,npe,cond);
    */
    
    free(la);
    Cblacs_gridexit(icon);
    Cblacs_exit( 0 );
    return 0;
}
Example #12
0
void CG(StrumpackDensePackage<myscalar,myreal> *sdp, myscalar *X, myscalar *B, int *descVec, int n, int nrhs, int niter, myreal threshold) {
  /* Conjugate Gradients. Calls sdp.schur_product for matvecs. Fills only the bottom part of X. */
  int printit=250;
  int IA;
  int ctxt;
  int nprow, npcol;
  int myrow, mycol;
  int rsrc, csrc;
  int nb;
  int locr, locc;
  int i, j;
  int idummy, ierr;
  int neff;
  int it;
  int desc[BLACSCTXTSIZE], descScal[BLACSCTXTSIZE];
  bool ingrid;
  myreal res;
  myscalar *x=NULL, *b=NULL;
  myscalar *r=NULL, *p=NULL, *Ap=NULL;
  myscalar alpha, rrprev, rrnext;

  IA=sdp->split_HSS+1;
  neff=n-IA+1;

  ctxt=descVec[BLACSctxt];
  rsrc=descVec[BLACSrsrc];
  csrc=descVec[BLACScsrc];
  nb=descVec[BLACSmb];
  blacs_gridinfo_(&ctxt,&nprow,&npcol,&myrow,&mycol);
  ingrid=myrow>=0 && mycol>=0;

  /* X and B have n rows. We create versions with
   * the last neff rows only that we pass to SDP.
   */
  if(ingrid) {
    locr=numroc_(&neff,&nb,&myrow,&rsrc,&nprow);
    locc=numroc_(&IONE,&nb,&mycol,&csrc,&npcol);
    idummy=locr>1?locr:1;
    descinit_(desc,&neff,&IONE,&nb,&nb,&rsrc,&csrc,&ctxt,&idummy,&ierr);
    x=new myscalar[locr*locc]();
    b=new myscalar[locr*locc]();
    r=new myscalar[locr*locc]();
    p=new myscalar[locr*locc]();
    Ap=new myscalar[locr*locc]();
  } else {
    locr=0;
    locc=0;
    descset_(desc,&neff,&IONE,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE);
  }

  /* Descriptor for the scalar containing the result of the dot product */
  if(ingrid)
    descinit_(descScal,&IONE,&IONE,&nb,&nb,&IZERO,&IZERO,&ctxt,&IONE,&ierr);
  else
    descset_(descScal,&IONE,&IONE,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE);

  for(j=1;j<=nrhs;j++) {
    /* One RHS at a time */
    if(ingrid) {
      pgeadd('N',neff,IONE,ONE,B,IA,j,descVec,ZERO,b,IONE,IONE,desc);
      pgeadd('N',neff,IONE,ONE,X,IA,j,descVec,ZERO,x,IONE,IONE,desc);
    }

    /* r = b - A x
     * If the Schur was explitly in matrix A, the BLAS call would be
     * pgemm('N','N',neff,IONE,neff,NONE,A,IA,IA,descA,x,IONE,IONE,desc,ONE,r,IONE,IONE,desc);
     *
     */
    if(ingrid)
      placpy('N',neff,IONE,b,IONE,IONE,desc,r,IONE,IONE,desc);
    sdp->schur_product('N',NONE,x,desc,ONE,r,desc);

    /* p = r */
    if(ingrid)
      placpy('N',neff,IONE,r,IONE,IONE,desc,p,IONE,IONE,desc);

    /* "Previous" r' * r */
    rrprev=ZERO;
    if(ingrid)
      pgemm('C','N',IONE,IONE,neff,ONE,r,IONE,IONE,desc,r,IONE,IONE,desc,ZERO,&rrprev,IONE,IONE,descScal);
    MPI_Bcast((void *)&rrprev,IONE,MY_MPI_REAL,IZERO,MPI_COMM_WORLD);

    it=1;
    while(it<=niter) {
      /* Ap = A * p
       * If the Schur was explitly in matrix A, the BLAS call would be
       * pgemm('N','N',neff,IONE,neff,ONE,A,IA,IA,descA,p,IONE,IONE,desc,ZERO,Ap,IONE,IONE,desc);
       *
       */
      sdp->schur_product('N',ONE,p,desc,ZERO,Ap,desc);

      /* alpha = r'*r / (p' * A * p) = rrprev/(p' * Ap) */
      alpha=ZERO;
      if(ingrid)
        pgemm('C','N',IONE,IONE,neff,ONE,p,IONE,IONE,desc,Ap,IONE,IONE,desc,ZERO,&alpha,IONE,IONE,descScal);
      MPI_Bcast((void *)&alpha,IONE,MY_MPI_REAL,IZERO,MPI_COMM_WORLD);
      alpha=rrprev/alpha;

      /* x = x + alpha * p */
      for(i=0;i<locr*locc;i++)
        x[i]+=alpha*p[i];

      /* r = r - alpha * Ap */
      for(i=0;i<locr*locc;i++)
        r[i]-=alpha*Ap[i];

      /* "Next" r' * r */
      rrnext=ZERO;
      if(ingrid)
        pgemm('C','N',IONE,IONE,neff,ONE,r,IONE,IONE,desc,r,IONE,IONE,desc,ZERO,&rrnext,IONE,IONE,descScal);
      MPI_Bcast((void *)&rrnext,IONE,MY_MPI_REAL,IZERO,MPI_COMM_WORLD);

      /* Residual */
      res=sqrt(rrnext.real());
      if(it%printit==0)
        if(!myrow && !mycol)
          std::cout << "RHS " << j << ": iteration " << it << ", ||Ax-b||/||b||=" << res << std::endl;
      if(res<threshold)
        break;

      /* p = r + rrnext/rrprev * p */
      for(i=0;i<locr*locc;i++)
        p[i]=r[i]+rrnext/rrprev*p[i];

      /* "Previous" r' * r */
      rrprev=rrnext;

      it++;
    }

    if(it>niter)
      it=niter;

    if(it%printit)
      if(!myrow && !mycol)
        std::cout << "RHS " << j << ": iteration " << it << ", ||Ax-b||/||b||=" << res << std::endl << std::endl;

    /* Back to n-sized vector */
    if(ingrid)
      pgeadd('N',neff,IONE,ONE,x,IONE,IONE,desc,ZERO,X,IA,j,descVec);

  }

  delete[] b;
  delete[] x;
  delete[] r;
  delete[] p;
  delete[] Ap;
}
void pzgecopy_hd( F_CHAR_T TRANS, int *m_in, int *n_in,
    double *A, int *ia_in, int *ja_in, int *descA,
    double *dB, int *ib_in, int *jb_in, int *descB )
{
/*
 Copy m by n distributed submatrix  from host to GPU
 */

int m  = *m_in;
int n  = *n_in;

int ia = *ia_in;
int ja = *ja_in;
int ib = *ib_in;
int jb = *jb_in;

int nprow = 0;
int npcol = 0;
int myprow = 0;
int mypcol = 0;
int mmb = 0;
int nnb = 0;

int istart = 0;
int iend = 0;
int isize = 0;

int Locp = 0;
int Locq = 0;
int lld = 0;

int jstart = 0;
int jend = 0;
int jsize = 0;

int iib = 0;
int jjb = 0;

cuDoubleComplex *dBptr = 0;
double *Btmp = 0;
F_CHAR_T NoTrans = "N";

int descBtmp[DLEN_];
int elmSize = sizeof(cuDoubleComplex);

double z_one[2];
double z_zero[2];

/*
 Tuneable parameters
 */
int mfactor = 4;
int nfactor = 4;


int rsrc = 0;
int csrc = 0;
int irsrc = 0;
int jcsrc = 0;
int info = 0;

int notran = 0;
int TrA = 0;

int lrindx = 0;
int lcindx = 0;
int nrow = 0;
int ncol = 0;
int mm = 0;
int nn = 0;
int iia = 0;
int jja = 0;

cublasStatus cu_status;

z_one[REAL_PART] = 1;
z_one[IMAG_PART] = 0;
z_zero[REAL_PART] = 0;
z_zero[IMAG_PART] = 0;

Cblacs_gridinfo( descA[CTXT_], &nprow, &npcol, &myprow, &mypcol );

notran = ( ( TrA = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN );


/*
 * check arguments
 */
if ((m <= 0) || (n <= 0)) {
   return;
   };

assert( (1 <= ia) && ((ia + m-1) <= descA[M_] ) );
assert( (1 <= ja) && ((ja + n-1) <= descA[N_] ) );
assert( (1 <= ib) && ((ib + m-1) <= descB[M_] ) );
assert( (1 <= jb) && ((jb + n-1) <= descB[N_] ) );

/*
 * Create a temp matrix that is aligned to descB.
 * Assume size is   mmb by nnb
 */

if (notran) {
  mmb = MIN( m, descB[MB_] * nprow * mfactor);
  nnb = MIN( n, descB[NB_] * npcol * nfactor);
  }
else {
  nnb = MIN( m, descB[MB_] * nprow * mfactor);
  mmb = MIN( n, descB[NB_] * npcol * nfactor);
  };
  

mmb = MAX( mmb,1);
nnb = MAX( nnb,1);

rsrc = indxg2p_( &ib, &descB[MB_], &myprow, &descB[RSRC_], &nprow);
csrc = indxg2p_( &jb, &descB[NB_], &mypcol, &descB[CSRC_], &npcol);


Locp = numroc_( &mmb, &descB[MB_], &myprow, &rsrc, &nprow );
Locq = numroc_( &nnb, &descB[NB_], &mypcol, &csrc, &npcol );

Btmp = (double*) malloc( MAX(1,(Locp * Locq))*elmSize );
assert( Btmp != 0 );


lld = MAX(1, Locp);
descinit_( descBtmp, &mmb, &nnb, &descB[MB_], &descB[NB_],
            &rsrc, &csrc, &descB[CTXT_], &lld, &info );

assert( info == 0);



for( jstart=ja; jstart <= ja + n-1; jstart = jend + 1) {
  jend = MIN( ja + n -1, jstart + nnb - 1);
  jsize = jend - jstart + 1;

  for( istart=ia; istart <= ia + m-1; istart = iend + 1) {
     iend = MIN( ia + m-1, istart + mmb -1);
     isize = iend - istart + 1;

     iia = ia + (istart-1);
     jja = ja + (jstart-1);


     iib = 1;
     jjb = 1;
     

     if (notran) {
        mm = isize;
        nn = jsize;
        }
    else {
        mm = jsize;
        nn = isize;
        };


     pzgeadd_( TRAN,  &mm, &nn,   z_one, A, &iia, &jja, descA,  
                         z_zero, Btmp, &iib, &jjb, descBtmp );

     /* 
      * find local extent
      */

     
     if (notran) {
       iib = ib + (istart-1);
       jjb = jb + (jstart-1);
       }
     else {
       iib = ib + (jstart-1);
       jjb = jb + (istart-1);
       };

     if (notran) {
       nrow = numroc_( &isize, &descB[MB_], &myprow, &rsrc, &nprow);
       ncol = numroc_( &jsize, &descB[NB_], &mypcol, &csrc, &npcol);
       }
     else {
       nrow = numroc_( &jsize, &descB[MB_], &myprow, &rsrc, &nprow);
       ncol = numroc_( &isize, &descB[NB_], &mypcol, &csrc, &npcol);

     };

     /*
      Perform global
      dB( iib:(iib+isize-1), jjb:(jjb+jsize-1)) <- B(1:isize,1:jsize)

      Perform local
      dB( lrindx:(lrindx+nrow-1), lcindx:(lcindx+ncol-1)) <-
              B(1:nrow, 1:ncol)
      */


     infog2l_( &iib, &jjb, descB, &nprow, &npcol, &myprow, &mypcol,
               &lrindx, &lcindx,  &irsrc, &jcsrc );


     dBptr = (cuDoubleComplex *) dB;
     dBptr = dBptr + INDX2F( lrindx,lcindx, descB[LLD_]);


     cu_status = cublasSetMatrix( nrow,ncol,elmSize,
               (cuDoubleComplex *) Btmp,  descBtmp[LLD_],
                                   dBptr, descB[LLD_] );
     assert( cu_status == CUBLAS_STATUS_SUCCESS );

     };
  };

  free( Btmp );

  return;
}
int main (int argc, char *argv[]) {
  myscalar *A=NULL, *B=NULL, *Btrue=NULL;
  int descA[BLACSCTXTSIZE], descB[BLACSCTXTSIZE];
  int n;
  int nb;
  int locr, locc;
  int i, j, ii, jj;
  int *I, *J;
  int nI, nJ;
  int ierr;
  int dummy;
  int myid, np;
  int myrow, mycol, nprow, npcol;
  int ctxt;
  myreal err;

  n=1024; /* Size of the problem */
  nb=16;  /* Blocksize for the 2D block-cyclic distribution */

  /* Initialize MPI */
  if((ierr=MPI_Init(&argc,&argv)))
    return 1;
  myid=-1;
  if((ierr=MPI_Comm_rank(MPI_COMM_WORLD,&myid)))
    return 1;
  np=-1;
  if((ierr=MPI_Comm_size(MPI_COMM_WORLD,&np)))
    return 1;

  /* Initialize the BLACS grid */
  nprow=floor(sqrt((float)np));
  npcol=np/nprow;
  blacs_get_(&IZERO,&IZERO,&ctxt);
  blacs_gridinit_(&ctxt,"R",&nprow,&npcol);
  blacs_gridinfo_(&ctxt,&nprow,&npcol,&myrow,&mycol);

  /* A is a dense n x n distributed Toeplitz matrix */
  if(myid<nprow*npcol) {
    locr=numroc_(&n,&nb,&myrow,&IZERO,&nprow);
    locc=numroc_(&n,&nb,&mycol,&IZERO,&npcol);
    A=new myscalar[locr*locc];
    dummy=std::max(1,locr);
    descinit_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr);

    for(i=1;i<=locr;i++)
      for(j=1;j<=locc;j++) {
        ii=indxl2g_(&i,&nb,&myrow,&IZERO,&nprow);
        jj=indxl2g_(&j,&nb,&mycol,&IZERO,&npcol);
        // Toeplitz matrix from Quantum Chemistry.
        myreal pi=3.1416, d=0.1;
        A[locr*(j-1)+(i-1)]=ii==jj?std::pow(pi,2)/6.0/std::pow(d,2):std::pow(-1.0,ii-jj)/std::pow((myreal)ii-jj,2)/std::pow(d,2);
      }
  } else {
    descset_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE);
  }

  /* Initialize the solver and set parameters */
  StrumpackDensePackage<myscalar,myreal> sdp(MPI_COMM_WORLD);
  sdp.use_HSS=true;
  sdp.levels_HSS=4;
  sdp.min_rand_HSS=64;
  sdp.lim_rand_HSS=0;
  sdp.tol_HSS=1e-6;

  /* Compression */
  sdp.compress(A,descA);

  /* Accuracy checking */
  sdp.check_compression(A,descA);

  /* Element extraction: a bunch of random indices.
   * Not that duplicates do not matter (the code works).
   */
  nI=1+rand()%n;
  MPI_Bcast((void*)&nI,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD);
  I=new int[nI];
  if(!myid)
    for(i=0;i<nI;i++)
      I[i]=1+rand()%n;
  MPI_Bcast((void*)I,nI,MPI_INTEGER,IZERO,MPI_COMM_WORLD);
  nJ=1+rand()%n;
  MPI_Bcast((void*)&nJ,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD);
  J=new int[nJ];
  if(!myid)
    for(j=0;j<nJ;j++)
      J[j]=1+rand()%n;
  MPI_Bcast((void*)J,nJ,MPI_INTEGER,IZERO,MPI_COMM_WORLD);

  /* Extraction for the HSS form */
  if(myid<nprow*npcol) {
    locr=numroc_(&nI,&nb,&myrow,&IZERO,&nprow);
    locc=numroc_(&nJ,&nb,&mycol,&IZERO,&npcol);
    B=new myscalar[locr*locc]();
    dummy=std::max(1,locr);
    descinit_(descB,&nI,&nJ,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr);
  } else
    descset_(descB,&nI,&nJ,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE);
  sdp.extract(A,descA,B,descB,I,nI,J,nJ);

  /* Extraction from the original matrix just to compare */
  sdp.use_HSS=false;
  if(myid<nprow*npcol)
    Btrue=new myscalar[locr*locc];
  sdp.extract(A,descA,Btrue,descB,I,nI,J,nJ);

  /* Comparison with elements of input matrix */
  if(myid<nprow*npcol){
    err=plange('M',nI,nJ,Btrue,IONE,IONE,descB,(myreal*)NULL);
    for(i=0;i<locr*locc;i++)
      Btrue[i]-=B[i];
    err=plange('M',nI,nJ,Btrue,IONE,IONE,descB,(myreal*)NULL)/err;
  }
  if(!myid)
    std::cout << "Element extraction (" << nI << "x" << nJ << " submatrix): maximum relative error max ||A(I,J)-HSS(I,J)||//||A(I,J)|| = " << err << std::endl << std::endl;

  /* Statistics */
  sdp.print_statistics();

  /* Clean-up */
  delete[] A;
  delete[] B;
  delete[] Btrue;
  delete[] I;
  delete[] J;

  /* The end */
  MPI_Finalize();
  return 0;

}
Example #15
0
int main(int argc, char **argv) {
        int iam, nprocs;
        int myrank_mpi, nprocs_mpi;
        int ictxt, nprow, npcol, myrow, mycol;
        int nb, m, n;
        int mpA, nqA, mpU, nqU, mpVT, nqVT;
        int i, j, k, itemp, min_mn;
        int descA[9], descU[9], descVT[9];
        float *A=NULL;
        int info, infoNN, infoVV, infoNV, infoVN;
        float *U_NN=NULL,  *U_VV=NULL,  *U_NV=NULL,  *U_VN=NULL;
        float *VT_NN=NULL, *VT_VV=NULL, *VT_NV=NULL, *VT_VN=NULL;
        float *S_NN=NULL,  *S_VV=NULL, *S_NV=NULL, *S_VN=NULL;
        float *S_res_NN=NULL;
        float orthU_VV, residF, orthVT_VV;
        float orthU_VN, orthVT_NV;
        float  residS_NN, eps;
        float  res_repres_NV, res_repres_VN;
/**/
        int izero=0,ione=1;
        float rtmone=-1.0e+00;
/**/
        double MPIelapsedVV, MPIelapsedNN, MPIelapsedVN, MPIelapsedNV;
        char jobU, jobVT;
        int nbfailure=0, nbtestcase=0,inputfromfile, nbhetereogeneity=0;
        float threshold=100e+00;
        char buf[1024];
        FILE *fd;       
        char *c;
        char *t_jobU, *t_jobVT;
        int *t_m, *t_n, *t_nb, *t_nprow, *t_npcol;
        int nb_expe, expe;
        char hetereogeneityVV, hetereogeneityNN, hetereogeneityVN, hetereogeneityNV;
        int iseed[4], idist;
/**/
        MPI_Init( &argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi);
        MPI_Comm_size(MPI_COMM_WORLD, &nprocs_mpi);
/**/
        m = 100; n = 100; nprow = 1; npcol = 1; nb = 64; jobU='A'; jobVT='A'; inputfromfile = 0;
        for( i = 1; i < argc; i++ ) {
                if( strcmp( argv[i], "-f" ) == 0 ) {
                        inputfromfile = 1;
                }
                if( strcmp( argv[i], "-jobvt" ) == 0 ) {
                        if (i+1<argc) {
                                if( strcmp( argv[i+1], "V" ) == 0 ){ jobVT = 'V'; i++; }
                                else if( strcmp( argv[i+1], "N" ) == 0 ){ jobVT = 'N'; i++; }
                                else if( strcmp( argv[i+1], "A" ) == 0 ){ jobVT = 'A'; i++; }
                                else printf(" ** warning: jobvt should be set to V, N or A in the command line ** \n");
                        }
                        else    
                                printf(" ** warning: jobvt should be set to V, N or A in the command line ** \n");
                }
                if( strcmp( argv[i], "-jobu" ) == 0 ) {
                        if (i+1<argc) {
                                if( strcmp( argv[i+1], "V" ) == 0 ){ jobU = 'V'; i++; }
                                else if( strcmp( argv[i+1], "N" ) == 0 ){ jobU = 'N'; i++; }
                                else if( strcmp( argv[i+1], "A" ) == 0 ){ jobU = 'A'; i++; }
                                else printf(" ** warning: jobu should be set to V, N or A in the command line ** \n");
                        }
                        else    
                                printf(" ** warning: jobu should be set to V, N or A in the command line ** \n");
                }
                if( strcmp( argv[i], "-m" ) == 0 ) {
                        m      = atoi(argv[i+1]);
                        i++;
                }
                if( strcmp( argv[i], "-n" ) == 0 ) {
                        n      = atoi(argv[i+1]);
                        i++;
                }
                if( strcmp( argv[i], "-p" ) == 0 ) {
                        nprow  = atoi(argv[i+1]);
                        i++;
                }
                if( strcmp( argv[i], "-q" ) == 0 ) {
                        npcol  = atoi(argv[i+1]);
                        i++;
                }
                if( strcmp( argv[i], "-nb" ) == 0 ) {
                        nb     = atoi(argv[i+1]);
                        i++;
                }
        }
/**/
        if (inputfromfile){
                nb_expe = 0;
                fd = fopen("svd.dat", "r");
                if (fd == NULL) { printf("File failed to open svd.dat from processor mpirank(%d/%d): \n",myrank_mpi,nprocs_mpi); exit(-1); }
                do {    
                        c = fgets(buf, 1024, fd);  /* get one line from the file */
                        if (c != NULL)
                                if (c[0] != '#')
                                        nb_expe++;
                } while (c != NULL);              /* repeat until NULL          */
                fclose(fd);
                t_jobU  = (char *)calloc(nb_expe,sizeof(char)) ;
                t_jobVT = (char *)calloc(nb_expe,sizeof(char)) ;
                t_m     = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_n     = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_nb    = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_nprow = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_npcol = (int  *)calloc(nb_expe,sizeof(int )) ;
                fd = fopen("svd.dat", "r");
                expe=0;
                do {    
                        c = fgets(buf, 1024, fd);  /* get one line from the file */
                        if (c != NULL)
                                if (c[0] != '#'){
                                        //printf("NBEXPE = %d\n",expe);
                                        sscanf(c,"%c %c %d %d %d %d %d",
                                                &(t_jobU[expe]),&(t_jobVT[expe]),&(t_m[expe]),&(t_n[expe]),
                                                &(t_nb[expe]),(&t_nprow[expe]),&(t_npcol[expe]));
                                        expe++;
                                }
                } while (c != NULL);              /* repeat until NULL          */
                fclose(fd);
        }
        else {
                nb_expe = 1;
                t_jobU  = (char *)calloc(nb_expe,sizeof(char)) ;
                t_jobVT = (char *)calloc(nb_expe,sizeof(char)) ;
                t_m     = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_n     = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_nb    = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_nprow = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_npcol = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_jobU[0]  = jobU;
                t_jobVT[0] = jobVT;
                t_m[0]     = m;
                t_n[0]     = n;
                t_nb[0]    = nb;
                t_nprow[0] = nprow;
                t_npcol[0] = npcol;
        }

        if (myrank_mpi==0){
                printf("\n");
                printf("--------------------------------------------------------------------------------------------------------------------\n");
                                printf("                            Testing psgsevd -- float precision SVD ScaLAPACK routine                \n");
                printf("jobU jobVT    m     n     nb   p   q   || info   heter   resid     orthU    orthVT   |SNN-SVV|    time(s)   cond(A) \n");
                printf("--------------------------------------------------------------------------------------------------------------------\n");
        }
/**/
        for (expe = 0; expe<nb_expe; expe++){

        jobU  = t_jobU[expe]  ; 
        jobVT = t_jobVT[expe] ; 
        m     = t_m[expe]     ; 
        n     = t_n[expe]     ; 
        nb    = t_nb[expe]    ; 
        nprow = t_nprow[expe] ; 
        npcol = t_npcol[expe] ; 

        if (nb>n)
                nb = n;
        if (nprow*npcol>nprocs_mpi){
                if (myrank_mpi==0)
                        printf(" **** ERROR : we do not have enough processes available to make a p-by-q process grid ***\n");
                        printf(" **** Bye-bye                                                                         ***\n");
                MPI_Finalize(); exit(1);
        }
/**/
        Cblacs_pinfo( &iam, &nprocs ) ;
        Cblacs_get( -1, 0, &ictxt );
        Cblacs_gridinit( &ictxt, "Row", nprow, npcol );
        Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );
/**/
        min_mn = min(m,n);
/**/
        //if (iam==0)
                //printf("\tm=%d\tn = %d\t\t(%d,%d)\t%dx%d\n",m,n,nprow,npcol,nb,nb);
        //printf("Hello World, I am proc %d over %d for MPI, proc %d over %d for BLACS in position (%d,%d) in the process grid\n", 
                        //myrank_mpi,nprocs_mpi,iam,nprocs,myrow,mycol);
/*
*
*     Work only the process in the process grid
*
*/
        //if ((myrow < nprow)&(mycol < npcol)){
        if ((myrow>-1)&(mycol>-1)&(myrow<nprow)&(mycol<npcol)){

/*
*
*     Compute the size of the local matrices (thanks to numroc)
*
*/ 
                mpA    = numroc_( &m     , &nb, &myrow, &izero, &nprow );
                nqA    = numroc_( &n     , &nb, &mycol, &izero, &npcol );
                mpU    = numroc_( &m     , &nb, &myrow, &izero, &nprow );
                nqU    = numroc_( &min_mn, &nb, &mycol, &izero, &npcol );
                mpVT   = numroc_( &min_mn, &nb, &myrow, &izero, &nprow );
                nqVT   = numroc_( &n     , &nb, &mycol, &izero, &npcol );
/*
*
*     Allocate and fill the matrices A and B
*
*/ 
                A = (float *)calloc(mpA*nqA,sizeof(float)) ;
                if (A==NULL){ printf("error of memory allocation A on proc %dx%d\n",myrow,mycol); exit(0); }
/**/            
//              seed = iam*(mpA*nqA*2); srand(seed);
                idist = 2;
                iseed[0] = mpA%4096;
                iseed[1] = iam%4096;
                iseed[2] = nqA%4096;
                iseed[3] = 23;
/**/            
                k = 0;
                for (i = 0; i < mpA; i++) {
                        for (j = 0; j < nqA; j++) {
                                slarnv_( &idist, iseed, &ione, &(A[k]) );
                                k++;    
                        }
                }
/*
*
*     Initialize the array descriptor for the distributed matrices xA, U and VT
*
*/ 
                itemp = max( 1, mpA );
                descinit_( descA,  &m, &n, &nb, &nb, &izero, &izero, &ictxt, &itemp, &info );
                itemp = max( 1, mpA );
                descinit_( descU,  &m, &min_mn, &nb, &nb, &izero, &izero, &ictxt, &itemp, &info );
                itemp = max( 1, mpVT );
                descinit_( descVT, &min_mn, &n, &nb, &nb, &izero, &izero, &ictxt, &itemp, &info );
/**/
                eps = pslamch_( &ictxt, "Epsilon" );
/**/
                if ( ((jobU=='V')&(jobVT=='N')) ||(jobU == 'A' )||(jobVT=='A')){
                nbtestcase++;   
                U_VN = (float *)calloc(mpU*nqU,sizeof(float)) ;
                if (U_VN==NULL){ printf("error of memory allocation U_VN on proc %dx%d\n",myrow,mycol); exit(0); }
                S_VN = (float *)calloc(min_mn,sizeof(float)) ;
                if (S_VN==NULL){ printf("error of memory allocation S_VN on proc %dx%d\n",myrow,mycol); exit(0); }
                infoVN = driver_psgesvd( 'V', 'N', m, n, A, 1, 1, descA,
                        S_VN, U_VN, 1, 1, descU, VT_VN, 1, 1, descVT,
                        &MPIelapsedVN);
                orthU_VN  = verif_orthogonality(m,min_mn,U_VN , 1, 1, descU);
                res_repres_VN = verif_repres_VN( m, n, A, 1, 1, descA, U_VN, 1, 1, descU, S_VN);
                if (infoVN==min_mn+1) hetereogeneityVN = 'H'; else hetereogeneityVN = 'N';
                if ( iam==0 )
                        printf(" V    N   %6d %6d  %3d  %3d %3d  ||  %3d     %c   %7.1e   %7.1e                        %8.2f    %7.1e\n",
                                m,n,nb,nprow,npcol,infoVN,hetereogeneityVN,res_repres_VN/(S_VN[0]/S_VN[min_mn-1]),
                                orthU_VN,MPIelapsedVN,S_VN[0]/S_VN[min_mn-1]);
                if (infoVN==min_mn+1) nbhetereogeneity++ ;
                else if ((res_repres_VN/eps/(S_VN[0]/S_VN[min_mn-1])>threshold)||(orthU_VN/eps>threshold)||(infoVN!=0)) nbfailure++;
                }
/**/
                if (((jobU=='N')&(jobVT=='V'))||(jobU == 'A' )||(jobVT=='A')){
                nbtestcase++;   
                VT_NV = (float *)calloc(mpVT*nqVT,sizeof(float)) ;
                if (VT_NV==NULL){ printf("error of memory allocation VT_NV on proc %dx%d\n",myrow,mycol); exit(0); }
                S_NV = (float *)calloc(min_mn,sizeof(float)) ;
                if (S_NV==NULL){ printf("error of memory allocation S_NV on proc %dx%d\n",myrow,mycol); exit(0); }
                infoNV = driver_psgesvd( 'N', 'V', m, n, A, 1, 1, descA,
                        S_NV, U_NV, 1, 1, descU, VT_NV, 1, 1, descVT,
                        &MPIelapsedNV);
                orthVT_NV = verif_orthogonality(min_mn,n,VT_NV, 1, 1, descVT);
                res_repres_NV = verif_repres_NV( m, n, A, 1, 1, descA, VT_NV, 1, 1, descVT, S_NV);
                if (infoNV==min_mn+1) hetereogeneityNV = 'H'; else hetereogeneityNV = 'N';
                if ( iam==0 )
                        printf(" N    V   %6d %6d  %3d  %3d %3d  ||  %3d     %c   %7.1e             %7.1e              %8.2f    %7.1e\n",
                                m,n,nb,nprow,npcol,infoNV,hetereogeneityNV,res_repres_NV/(S_NV[0]/S_NV[min_mn-1]),
                                orthVT_NV,MPIelapsedNV,S_NV[0]/S_NV[min_mn-1]);
                if (infoNV==min_mn+1) nbhetereogeneity++ ;
                else if ((res_repres_NV/eps/(S_NV[0]/S_NV[min_mn-1])>threshold)||(orthVT_NV/eps>threshold)||(infoNV!=0)) nbfailure++;
                }
/**/
                if ( ((jobU=='N')&(jobVT=='N')) || ((jobU=='V')&(jobVT=='V')) || (jobU == 'A' ) || (jobVT=='A') ) {
                nbtestcase++;   
                U_VV = (float *)calloc(mpU*nqU,sizeof(float)) ;
                if (U_VV==NULL){ printf("error of memory allocation U_VV on proc %dx%d\n",myrow,mycol); exit(0); }
                VT_VV = (float *)calloc(mpVT*nqVT,sizeof(float)) ;
                if (VT_VV==NULL){ printf("error of memory allocation VT_VV on proc %dx%d\n",myrow,mycol); exit(0); }
                S_VV = (float *)calloc(min_mn,sizeof(float)) ;
                if (S_VV==NULL){ printf("error of memory allocation S_VV on proc %dx%d\n",myrow,mycol); exit(0); }
                infoVV = driver_psgesvd( 'V', 'V', m, n, A, 1, 1, descA,
                        S_VV, U_VV, 1, 1, descU, VT_VV, 1, 1, descVT,
                        &MPIelapsedVV);
                orthU_VV  = verif_orthogonality(m,min_mn,U_VV , 1, 1, descU);
                orthVT_VV = verif_orthogonality(min_mn,n,VT_VV, 1, 1, descVT);
                residF =  verif_representativity( m, n,     A, 1, 1, descA,
                                                         U_VV, 1, 1, descU,
                                                        VT_VV, 1, 1, descVT,
                                                         S_VV);
                if (infoVV==min_mn+1) hetereogeneityVV = 'H'; else hetereogeneityVV = 'N';
                if ( iam==0 )
                        printf(" V    V   %6d %6d  %3d  %3d %3d  ||  %3d     %c   %7.1e   %7.1e   %7.1e              %8.2f    %7.1e\n",
                                m,n,nb,nprow,npcol,infoVV,hetereogeneityVV,residF,orthU_VV,orthVT_VV,MPIelapsedVV,S_VV[0]/S_VV[min_mn-1]);
                if (infoVV==min_mn+1) nbhetereogeneity++ ;
                else if ((residF/eps>threshold)||(orthU_VV/eps>threshold)||(orthVT_VV/eps>threshold)||(infoVV!=0)) nbfailure++;
                }
/**/
                if (((jobU=='N')&(jobVT=='N'))||(jobU == 'A' )||(jobVT=='A')){
                nbtestcase++;   
                S_NN = (float *)calloc(min_mn,sizeof(float)) ;
                if (S_NN==NULL){ printf("error of memory allocation S_NN on proc %dx%d\n",myrow,mycol); exit(0); }
                infoNN = driver_psgesvd( 'N', 'N', m, n, A, 1, 1, descA,
                        S_NN, U_NN, 1, 1, descU, VT_NN, 1, 1, descVT,
                        &MPIelapsedNN);
                S_res_NN = (float *)calloc(min_mn,sizeof(float)) ;
                if (S_res_NN==NULL){ printf("error of memory allocation S on proc %dx%d\n",myrow,mycol); exit(0); }
                scopy_(&min_mn,S_VV,&ione,S_res_NN,&ione);
                saxpy_ (&min_mn,&rtmone,S_NN,&ione,S_res_NN,&ione);
                residS_NN = snrm2_(&min_mn,S_res_NN,&ione) / snrm2_(&min_mn,S_VV,&ione);
                free(S_res_NN);
                if (infoNN==min_mn+1) hetereogeneityNN = 'H'; else hetereogeneityNN = 'N';
                if ( iam==0 )
                        printf(" N    N   %6d %6d  %3d  %3d %3d  ||  %3d     %c                                  %7.1e   %8.2f    %7.1e\n",
                                m,n,nb,nprow,npcol,infoNN,hetereogeneityNN,residS_NN,MPIelapsedNN,S_NN[0]/S_NN[min_mn-1]);
                if (infoNN==min_mn+1) nbhetereogeneity++ ;
                else if ((residS_NN/eps>threshold)||(infoNN!=0)) nbfailure++;
                }
/**/
                if (((jobU=='V')&(jobVT=='N'))||(jobU == 'A' )||(jobVT=='A')){ free(S_VN); free(U_VN); }
                if (((jobU=='N')&(jobVT=='V'))||(jobU == 'A' )||(jobVT=='A')){ free(VT_NV); free(S_NV); }
                if (((jobU=='N')&(jobVT=='N'))||(jobU == 'A' )||(jobVT=='A')){ free(S_NN); }
                if (((jobU=='N')&(jobVT=='N'))||((jobU=='V')&(jobVT=='V'))||(jobU == 'A' )||(jobVT=='A')){ free(U_VV); free(S_VV); free(VT_VV);}
                free(A);
                Cblacs_gridexit( 0 );
        }
/*
*     Print ending messages
*/
        }
        if ( iam==0 ){
                printf("--------------------------------------------------------------------------------------------------------------------\n");
                printf("               [ nbhetereogeneity = %d / %d ]\n",nbhetereogeneity, nbtestcase);
                printf("               [ nbfailure        = %d / %d ]\n",nbfailure, nbtestcase-nbhetereogeneity);
                printf("--------------------------------------------------------------------------------------------------------------------\n");
                printf("\n");
        }
/**/
        free(t_jobU  );
        free(t_jobVT );
        free(t_m     );
        free(t_n     );
        free(t_nb    );
        free(t_nprow );
        free(t_npcol );
        MPI_Finalize();
        exit(0);
}
Example #16
0
int main()
{
	const MKL_INT m = 1000;
	const MKL_INT k = 100000;
	const MKL_INT n = 1000;
	const MKL_INT nb = 100;
	const MKL_INT nprow = 2;
	const MKL_INT npcol = 2;

    MKL_INT iam, nprocs, ictxt, myrow, mycol;
    MDESC   descA, descB, descC, descA_local, descB_local, descC_local;
	MKL_INT info;
	MKL_INT a_m_local, a_n_local, b_m_local, b_n_local, c_m_local, c_n_local;
	MKL_INT a_lld, b_lld, c_lld;

    blacs_pinfo_( &iam, &nprocs );
    blacs_get_( &i_negone, &i_zero, &ictxt );
    blacs_gridinit_( &ictxt, "R", &nprow, &npcol );
    blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol );

    double *a = 0;
    double *b = 0;
	double *c = 0;

    if (iam==0)
    {
		a = gen_a(m, k);
		b = gen_b(k, n);
		c = (double*)calloc(m*n, sizeof(double));

		puts("a=");
		print(a, m, k);

		puts("b=");
		print(b, k, n);
    }

    a_m_local = numroc_( &m, &nb, &myrow, &i_zero, &nprow );
    a_n_local = numroc_( &k, &nb, &mycol, &i_zero, &npcol );

	b_m_local = numroc_( &k, &nb, &myrow, &i_zero, &nprow );
	b_n_local = numroc_( &n, &nb, &mycol, &i_zero, &npcol );

    c_m_local = numroc_( &m, &nb, &myrow, &i_zero, &nprow );
	c_n_local = numroc_( &n, &nb, &mycol, &i_zero, &npcol );

    double *A = (double*) calloc( a_m_local * a_n_local, sizeof( double ) );
    double *B = (double*) calloc( b_m_local * b_n_local, sizeof( double ) );
    double *C = (double*) calloc( c_m_local * c_n_local, sizeof( double ) );

    a_lld = MAX( a_m_local, 1 );
	b_lld = MAX( b_m_local, 1 );
	c_lld = MAX( c_m_local, 1 );

	if (iam==0)
	{
			printf("a_m_local = %d\ta_n_local = %d\tb_m_local = %d\tb_n_local = %d\tc_m_local = %d\tc_n_local = %d\n", a_m_local, a_n_local, b_m_local, b_n_local,
							c_m_local, c_n_local);
			printf("a_lld = %d\tb_lld = %d\tc_lld = %d\n", a_lld, b_lld, c_lld);
	}

    descinit_( descA_local, &m, &k, &m, &k, &i_zero, &i_zero, &ictxt, &m, &info );
    descinit_( descB_local, &k, &n, &k, &n, &i_zero, &i_zero, &ictxt, &k, &info );
    descinit_( descC_local, &m, &n, &m, &n, &i_zero, &i_zero, &ictxt, &m, &info );

    descinit_( descA, &m, &k, &nb, &nb, &i_zero, &i_zero, &ictxt, &a_lld, &info );
    descinit_( descB, &k, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &b_lld, &info );
    descinit_( descC, &m, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &c_lld, &info );

	printf("Rank %d: start distribute data\n", iam);
    pdgeadd_( &trans, &m, &k, &one, a, &i_one, &i_one, descA_local, &zero, A, &i_one, &i_one, descA );
    pdgeadd_( &trans, &k, &n, &one, b, &i_one, &i_one, descB_local, &zero, B, &i_one, &i_one, descB );
	printf("Rank %d: finished distribute data\n", iam);

	if (iam==0)
	{
			puts("a");
			print(A, a_m_local, a_n_local);
			puts("b");
			print(B, b_m_local, b_n_local);
	}

    pdgemm_( "N", "N", &m, &n, &k, &one, A, &i_one, &i_one, descA, B, &i_one, &i_one, descB,
             &zero, C, &i_one, &i_one, descC );
	printf("Rank %d: finished dgemm\n", iam);
	if (iam == 0)
	{
			puts("c");
			print(C, c_m_local, c_n_local);
	}

	pdgeadd_( &trans, &m, &n, &one, C, &i_one, &i_one, descC, &zero, c, &i_one, &i_one, descC_local);

	if (iam==0)
	{
			puts("global c");
			print(c, m, n);
	}

	free(A);
	free(B);
	free(C);
	if (iam==0)
	{
			free(a);
			free(b);
			free(c);
	}

    blacs_gridexit_( &ictxt );
    blacs_exit_( &i_zero );
}
Example #17
0
int main(int argc, char **argv) {
    int info, i, j, pcol, Adim;
    double *D;
    int *DESCD;
    CSRdouble BT_i, B_j, Xsparse, Zsparse, Btsparse;

    /*BT_i.allocate(0,0,0);
    B_j.allocate(0,0,0);
    Xsparse.allocate(0,0,0);
    Zsparse.allocate(0,0,0);
    Btsparse.allocate(0,0,0);*/

    //Initialise MPI and some MPI-variables
    info = MPI_Init ( &argc, &argv );
    if ( info != 0 ) {
        printf ( "Error in MPI initialisation: %d\n",info );
        return info;
    }

    position= ( int* ) calloc ( 2,sizeof ( int ) );
    if ( position==NULL ) {
        printf ( "unable to allocate memory for processor position coordinate\n" );
        return EXIT_FAILURE;
    }

    dims= ( int* ) calloc ( 2,sizeof ( int ) );
    if ( dims==NULL ) {
        printf ( "unable to allocate memory for grid dimensions coordinate\n" );
        return EXIT_FAILURE;
    }

    //BLACS is the interface used by PBLAS and ScaLAPACK on top of MPI

    blacs_pinfo_ ( &iam,&size ); 				//determine the number of processes involved
    info=MPI_Dims_create ( size, 2, dims );			//determine the best 2D cartesian grid with the number of processes
    if ( info != 0 ) {
        printf ( "Error in MPI creation of dimensions: %d\n",info );
        return info;
    }

    //Until now the code can only work with square process grids
    //So we try to get the biggest square grid possible with the number of processes involved
    if (*dims != *(dims+1)) {
        while (*dims * *dims > size)
            *dims -=1;
        *(dims+1)= *dims;
        if (iam==0)
            printf("WARNING: %d processor(s) unused due to reformatting to a square process grid\n", size - (*dims * *dims));
        size = *dims * *dims;
        //cout << "New size of process grid: " << size << endl;
    }

    blacs_get_ ( &i_negone,&i_zero,&ICTXT2D );

    //Initialisation of the BLACS process grid, which is referenced as ICTXT2D
    blacs_gridinit_ ( &ICTXT2D,"R",dims, dims+1 );

    if (iam < size) {

        //The rank (iam) of the process is mapped to a 2D grid: position= (process row, process column)
        blacs_pcoord_ ( &ICTXT2D,&iam,position, position+1 );
        if ( *position ==-1 ) {
            printf ( "Error in proces grid\n" );
            return -1;
        }

        //Filenames, dimensions of all matrices and other important variables are read in as global variables (see src/readinput.cpp)
        info=read_input ( *++argv );
        if ( info!=0 ) {
            printf ( "Something went wrong when reading input file for processor %d\n",iam );
            return -1;
        }

        //blacs_barrier is used to stop any process of going beyond this point before all processes have made it up to this point.
        blacs_barrier_ ( &ICTXT2D,"ALL" );
        if ( * ( position+1 ) ==0 && *position==0 )
            printf ( "Reading of input-file succesful\n" );

        if ( * ( position+1 ) ==0 && *position==0 ) {
            printf("\nA linear mixed model with %d observations, %d genotypes, %d random effects and %d fixed effects\n", n,k,m,l);
            printf("was analyzed using %d (%d x %d) processors\n",size,*dims,*(dims+1));
        }

        //Dimension of A (sparse matrix) is the number of fixed effects(m) + the sparse random effects (l)
        Adim=m+l;

        //Dimension of D (dense matrix) is the number of dense effects (k)
        Ddim=k;

        pcol= * ( position+1 );

        //Define number of blocks needed to store a complete column/row of D
        Dblocks= Ddim%blocksize==0 ? Ddim/blocksize : Ddim/blocksize +1;

        //Define the number of rowblocks needed by the current process to store its part of the dense matrix D
        Drows= ( Dblocks - *position ) % *dims == 0 ? ( Dblocks- *position ) / *dims : ( Dblocks- *position ) / *dims +1;
        Drows= Drows<1? 1 : Drows;

        //Define the number of columnblocks needed by the current process to store its part of the dense matrix D
        Dcols= ( Dblocks - pcol ) % * ( dims+1 ) == 0 ? ( Dblocks- pcol ) / * ( dims+1 ) : ( Dblocks- pcol ) / * ( dims+1 ) +1;
        Dcols=Dcols<1? 1 : Dcols;

        //Define the local leading dimension of D (keeping in mind that matrices are always stored column-wise)
        lld_D=Drows*blocksize;

        //Initialise the descriptor of the dense distributed matrix
        DESCD= ( int* ) malloc ( DLEN_ * sizeof ( int ) );
        if ( DESCD==NULL ) {
            printf ( "unable to allocate memory for descriptor for C\n" );
            return -1;
        }

        //D with dimensions (Ddim,Ddim) is distributed over all processes in ICTXT2D, with the first element in process (0,0)
        //D is distributed into blocks of size (blocksize,blocksize), having a local leading dimension lld_D in this specific process
        descinit_ ( DESCD, &Ddim, &Ddim, &blocksize, &blocksize, &i_zero, &i_zero, &ICTXT2D, &lld_D, &info );
        if ( info!=0 ) {
            printf ( "Descriptor of matrix C returns info: %d\n",info );
            return info;
        }

        //Allocate the space necessary to store the part of D that is held into memory of this process.
        D = ( double* ) calloc ( Drows * blocksize * Dcols * blocksize,sizeof ( double ) );
        if ( D==NULL ) {
            printf ( "unable to allocate memory for Matrix D  (required: %ld bytes)\n", Drows * blocksize * Dcols * blocksize * sizeof ( double ) );
            return EXIT_FAILURE;
        }

        blacs_barrier_ ( &ICTXT2D,"ALL" );
        if (iam==0)
            printf ( "Start set up of B & D\n" );

        blacs_barrier_ ( &ICTXT2D,"ALL" );
        //set_up_BD is declared in readdist.cpp and constructs the parts of matrices B & D in each processor
        //which are necessary to create the distributed Schur complement of D
        info = set_up_BD ( DESCD, D, BT_i, B_j, Btsparse );

        //printdense(Drows*blocksize, Dcols * blocksize,D,"matrix_D.txt");

        blacs_barrier_ ( &ICTXT2D,"ALL" );
        if (iam==0)
            printf ( "Matrices B & D set up\n" );

        if(printD_bool) {

            int array_of_gsizes[2], array_of_distribs[2], array_of_dargs[2], array_of_psize[2] ;
            int buffersize;
            MPI_Datatype file_type;
            MPI_File fh;
            MPI_Status status;
            array_of_gsizes[0]=Dblocks * blocksize;
            array_of_gsizes[1]=Dblocks * blocksize;
            array_of_distribs[0]=MPI_DISTRIBUTE_CYCLIC;
            array_of_distribs[1]=MPI_DISTRIBUTE_CYCLIC;
            array_of_dargs[0]=blocksize;
            array_of_dargs[1]=blocksize;
            array_of_psize[0]=*dims;
            array_of_psize[1]=*(dims + 1);

            MPI_Type_create_darray(size,iam,2,array_of_gsizes, array_of_distribs,
                                   array_of_dargs, array_of_psize, MPI_ORDER_FORTRAN,
                                   MPI_DOUBLE, &file_type);
            MPI_Type_commit(&file_type);
            info = MPI_File_open(MPI_COMM_WORLD, filenameD,
                                 MPI_MODE_CREATE | MPI_MODE_WRONLY,
                                 MPI_INFO_NULL, &fh);
            /*if ( ( Drows-1 ) % *(dims+1) == *position && ( Dcols-1 ) % *(dims) == pcol && Ddim%blocksize !=0 )
                buffersize=((Drows-1) * blocksize + Ddim % blocksize) * ((Dcols-1) * blocksize + Ddim % blocksize);
            else if ( ( Drows-1 ) % *(dims+1) == *position && Ddim%blocksize !=0 )
                buffersize=((Drows-1) * blocksize + Ddim % blocksize) * Dcols * blocksize;
            else if ( ( Dcols-1 ) % *(dims) == *position && Ddim%blocksize !=0 )
                buffersize=((Dcols-1) * blocksize + Ddim % blocksize) * Drows * blocksize;
            else*/
            buffersize= Dcols * Drows * blocksize * blocksize;

            MPI_File_set_view(fh, 0, MPI_DOUBLE, file_type, "native", MPI_INFO_NULL);
            info =MPI_File_write_all(fh, D,buffersize, MPI_DOUBLE,
                                     &status);
	    MPI_File_close(&fh);
            if(iam==0) {
                printf("Matrix D (dimension %d) is printed in file %s\n", Dblocks*blocksize,filenameD);
            }
            if(filenameD != NULL)
                free(filenameD);
            filenameD=NULL;
            //delete[] array_of_gsizes, delete[] array_of_distribs, delete[] array_of_dargs, delete[] array_of_psize;
        }



        //Now every matrix has to set up the sparse matrix A, consisting of X'X, X'Z, Z'X and Z'Z + lambda*I
        Xsparse.loadFromFile ( filenameX );
        Zsparse.loadFromFile ( filenameZ );

        if(filenameX != NULL)
            free(filenameX);
        filenameX=NULL;
        if(filenameZ != NULL)
            free(filenameZ);
        filenameZ=NULL;

        smat_t *X_smat, *Z_smat;

        X_smat= (smat_t *) calloc(1,sizeof(smat_t));
        Z_smat= (smat_t *) calloc(1,sizeof(smat_t));

        X_smat = smat_new_from ( Xsparse.nrows,Xsparse.ncols,Xsparse.pRows,Xsparse.pCols,Xsparse.pData,0,0 );
        Z_smat = smat_new_from ( Zsparse.nrows,Zsparse.ncols,Zsparse.pRows,Zsparse.pCols,Zsparse.pData,0,0 );

        smat_t *Xt_smat, *Zt_smat;
        Xt_smat= (smat_t *) calloc(1,sizeof(smat_t));
        Zt_smat= (smat_t *) calloc(1,sizeof(smat_t));
        Xt_smat = smat_copy_trans ( X_smat );
        Zt_smat = smat_copy_trans ( Z_smat );

        CSRdouble Asparse;
        smat_t *XtX_smat, *XtZ_smat, *ZtZ_smat, *lambda_smat, *ZtZlambda_smat;

        XtX_smat= (smat_t *) calloc(1,sizeof(smat_t));
        XtZ_smat= (smat_t *) calloc(1,sizeof(smat_t));
        ZtZ_smat= (smat_t *) calloc(1,sizeof(smat_t));


        XtX_smat = smat_matmul ( Xt_smat, X_smat );
        XtZ_smat = smat_matmul ( Xt_smat, Z_smat );
        ZtZ_smat = smat_matmul ( Zt_smat,Z_smat );

        Xsparse.clear();
        Zsparse.clear();
        smat_free(Xt_smat);
        smat_free(Zt_smat);
        /*smat_free(X_smat);
        smat_free(Z_smat);*/

        CSRdouble Imat;

        makeIdentity ( l, Imat );

        lambda_smat= (smat_t *) calloc(1,sizeof(smat_t));

        lambda_smat = smat_new_from ( Imat.nrows,Imat.ncols,Imat.pRows,Imat.pCols,Imat.pData,0,0 );

        smat_scale_diag ( lambda_smat, -lambda );

        ZtZlambda_smat= (smat_t *) calloc(1,sizeof(smat_t));

        ZtZlambda_smat = smat_add ( lambda_smat, ZtZ_smat );

        smat_free(ZtZ_smat);
        //smat_free(lambda_smat);


        smat_to_symmetric_structure ( XtX_smat );
        smat_to_symmetric_structure ( ZtZlambda_smat );

        CSRdouble XtX_sparse, XtZ_sparse, ZtZ_sparse;

        XtX_sparse.make2 ( XtX_smat->m,XtX_smat->n,XtX_smat->nnz,XtX_smat->ia,XtX_smat->ja,XtX_smat->a );
        XtZ_sparse.make2 ( XtZ_smat->m,XtZ_smat->n,XtZ_smat->nnz,XtZ_smat->ia,XtZ_smat->ja,XtZ_smat->a );
        ZtZ_sparse.make2 ( ZtZlambda_smat->m,ZtZlambda_smat->n,ZtZlambda_smat->nnz,ZtZlambda_smat->ia,ZtZlambda_smat->ja,ZtZlambda_smat->a );

        /*smat_free(XtX_smat);
        smat_free(XtZ_smat);
        smat_free(ZtZlambda_smat);*/
        Imat.clear();

        if (iam==0) {
            cout << "***                                           [  t     t  ] *** " << endl;
            cout << "***                                           [ X X   X Z ] *** " << endl;
            cout << "***                                           [           ] *** " << endl;
            cout << "*** G e n e r a t i n g    m a t r i x    A = [           ] *** " << endl;
            cout << "***                                           [  t     t  ] *** " << endl;
            cout << "***                                           [ Z X   Z Z ] *** " << endl;
        }

        //Sparse matrix A only contains the upper triangular part of A
        create2x2SymBlockMatrix ( XtX_sparse, XtZ_sparse, ZtZ_sparse, Asparse );
        //Asparse.writeToFile("A_sparse.csr");

        smat_free(XtX_smat);
        smat_free(XtZ_smat);
        smat_free(ZtZlambda_smat);
        XtX_sparse.clear();
        XtZ_sparse.clear();
        ZtZ_sparse.clear();

        blacs_barrier_ ( &ICTXT2D,"ALL" );

        if(printsparseC_bool) {
            CSRdouble Dmat, Dblock, Csparse;
            Dblock.nrows=Dblocks * blocksize;
            Dblock.ncols=Dblocks * blocksize;
            Dblock.allocate(Dblocks * blocksize, Dblocks * blocksize, 0);
            Dmat.allocate(0,0,0);
            for (i=0; i<Drows; ++i) {
                for(j=0; j<Dcols; ++j) {
                    dense2CSR_sub(D + i * blocksize + j * lld_D * blocksize,blocksize,blocksize,lld_D,Dblock,( * ( dims) * i + *position ) *blocksize,
                                  ( * ( dims+1 ) * j + pcol ) *blocksize);
                    if ( Dblock.nonzeros>0 ) {
                        if ( Dmat.nonzeros==0 ) {
                            Dmat.make2 ( Dblock.nrows,Dblock.ncols,Dblock.nonzeros,Dblock.pRows,Dblock.pCols,Dblock.pData );
                        }
                        else {
                            Dmat.addBCSR ( Dblock );
                        }
                    }

                    Dblock.clear();
                }
            }
            blacs_barrier_(&ICTXT2D,"A");
            if ( iam!=0 ) {
                //Each process other than root sends its Dmat to the root process.
                MPI_Send ( & ( Dmat.nonzeros ),1, MPI_INT,0,iam,MPI_COMM_WORLD );
                MPI_Send ( & ( Dmat.pRows[0] ),Dmat.nrows + 1, MPI_INT,0,iam+size,MPI_COMM_WORLD );
                MPI_Send ( & ( Dmat.pCols[0] ),Dmat.nonzeros, MPI_INT,0,iam+2*size,MPI_COMM_WORLD );
                MPI_Send ( & ( Dmat.pData[0] ),Dmat.nonzeros, MPI_DOUBLE,0,iam+3*size,MPI_COMM_WORLD );
                Dmat.clear();
            }
            else {
                for ( i=1; i<size; ++i ) {
                    // The root process receives parts of Dmat sequentially from all processes and directly adds them together.
                    int nonzeroes, count;
                    MPI_Recv ( &nonzeroes,1,MPI_INT,i,i,MPI_COMM_WORLD,&status );
                    /*MPI_Get_count(&status, MPI_INT, &count);
                    printf("Process 0 received %d elements of process %d\n",count,i);*/
                    if(nonzeroes>0) {
                        printf("Nonzeroes : %d\n ",nonzeroes);
                        Dblock.allocate ( Dblocks * blocksize,Dblocks * blocksize,nonzeroes );
                        MPI_Recv ( & ( Dblock.pRows[0] ), Dblocks * blocksize + 1, MPI_INT,i,i+size,MPI_COMM_WORLD,&status );
                        /*MPI_Get_count(&status, MPI_INT, &count);
                        printf("Process 0 received %d elements of process %d\n",count,i);*/
                        MPI_Recv ( & ( Dblock.pCols[0] ),nonzeroes, MPI_INT,i,i+2*size,MPI_COMM_WORLD,&status );
                        /*MPI_Get_count(&status, MPI_INT, &count);
                        printf("Process 0 received %d elements of process %d\n",count,i);*/
                        MPI_Recv ( & ( Dblock.pData[0] ),nonzeroes, MPI_DOUBLE,i,i+3*size,MPI_COMM_WORLD,&status );
                        /*MPI_Get_count(&status, MPI_DOUBLE, &count);
                        printf("Process 0 received %d elements of process %d\n",count,i);*/
                        Dmat.addBCSR ( Dblock );
                    }
                }
                //Dmat.writeToFile("D_sparse.csr");
                Dmat.reduceSymmetric();
                Btsparse.transposeIt(1);
                create2x2SymBlockMatrix(Asparse,Btsparse, Dmat, Csparse);
                Btsparse.clear();
                Dmat.clear();
                Csparse.writeToFile(filenameC);
                Csparse.clear();
                if(filenameC != NULL)
                    free(filenameC);
                filenameC=NULL;
            }
        }
        Btsparse.clear();
        blacs_barrier_(&ICTXT2D,"A");

        //AB_sol will contain the solution of A*X=B, distributed across the process rows. Processes in the same process row possess the same part of AB_sol
        double * AB_sol;
        int * DESCAB_sol;
        DESCAB_sol= ( int* ) malloc ( DLEN_ * sizeof ( int ) );
        if ( DESCAB_sol==NULL ) {
            printf ( "unable to allocate memory for descriptor for AB_sol\n" );
            return -1;
        }
        //AB_sol (Adim, Ddim) is distributed across all processes in ICTXT2D starting from process (0,0) into blocks of size (Adim, blocksize)
        descinit_ ( DESCAB_sol, &Adim, &Ddim, &Adim, &blocksize, &i_zero, &i_zero, &ICTXT2D, &Adim, &info );
        if ( info!=0 ) {
            printf ( "Descriptor of matrix C returns info: %d\n",info );
            return info;
        }

        AB_sol=(double *) calloc(Adim * Dcols*blocksize,sizeof(double));

        // Each process calculates the Schur complement of the part of D at its disposal. (see src/schur.cpp)
        // The solution of A * Y = B_j is stored in AB_sol (= A^-1 * B_j)
        blacs_barrier_(&ICTXT2D,"A");
        make_Sij_parallel_denseB ( Asparse, BT_i, B_j, D, lld_D, AB_sol );
        BT_i.clear();
        B_j.clear();

        //From here on the Schur complement S of D is stored in D

        blacs_barrier_ ( &ICTXT2D,"ALL" );

        //The Schur complement is factorised (by ScaLAPACK)
        pdpotrf_ ( "U",&k,D,&i_one,&i_one,DESCD,&info );
        if ( info != 0 ) {
            printf ( "Cholesky decomposition of D was unsuccessful, error returned: %d\n",info );
            return -1;
        }

        //From here on the factorization of the Schur complement S is stored in D

        blacs_barrier_ ( &ICTXT2D,"ALL" );

        //The Schur complement is inverted (by ScaLAPACK)
        pdpotri_ ( "U",&k,D,&i_one,&i_one,DESCD,&info );
        if ( info != 0 ) {
            printf ( "Inverse of D was unsuccessful, error returned: %d\n",info );
            return -1;
        }

        //From here on the inverse of the Schur complement S is stored in D

        blacs_barrier_(&ICTXT2D,"A");

        double* InvD_T_Block = ( double* ) calloc ( Dblocks * blocksize + Adim ,sizeof ( double ) );

        //Diagonal elements of the (1,1) block of C^-1 are still distributed and here they are gathered in InvD_T_Block in the root process.
        if(*position == pcol) {
            for (i=0; i<Ddim; ++i) {
                if (pcol == (i/blocksize) % *dims) {
                    int Dpos = i%blocksize + ((i/blocksize) / *dims) * blocksize ;
                    *(InvD_T_Block + Adim +i) = *( D + Dpos + lld_D * Dpos);
                }
            }
            for ( i=0,j=0; i<Dblocks; ++i,++j ) {
                if ( j==*dims )
                    j=0;
                if ( *position==j ) {
                    dgesd2d_ ( &ICTXT2D,&blocksize,&i_one,InvD_T_Block + Adim + i * blocksize,&blocksize,&i_zero,&i_zero );
                }
                if ( *position==0 ) {
                    dgerv2d_ ( &ICTXT2D,&blocksize,&i_one,InvD_T_Block + Adim + blocksize*i,&blocksize,&j,&j );
                }
            }
        }

        blacs_barrier_(&ICTXT2D,"A");

        //Only the root process performs a selected inversion of A.
        if (iam==0) {

            int pardiso_message_level = 1;

            int pardiso_mtype=-2;

            ParDiSO pardiso ( pardiso_mtype, pardiso_message_level );
            int number_of_processors = 1;
            char* var = getenv("OMP_NUM_THREADS");
            if(var != NULL) {
                sscanf( var, "%d", &number_of_processors );
            }
            else {
                printf("Set environment OMP_NUM_THREADS to 1");
                exit(1);
            }

            pardiso.iparm[2]  = 2;
            pardiso.iparm[3]  = number_of_processors;
            pardiso.iparm[8]  = 0;
            pardiso.iparm[11] = 1;
            pardiso.iparm[13]  = 0;
            pardiso.iparm[28]  = 0;

            //This function calculates the factorisation of A once again so this might be optimized.
            pardiso.findInverseOfA ( Asparse );

            printf("Processor %d inverted matrix A\n",iam);
        }
        blacs_barrier_(&ICTXT2D,"A");

        // To minimize memory usage, and because only the diagonal elements of the inverse are needed, Y' * S is calculated row by rowblocks
        // the diagonal element is calculates as the dot product of this row and the corresponding column of Y. (Y is solution of AY=B)
        double* YSrow= ( double* ) calloc ( Dcols * blocksize,sizeof ( double ) );
        int * DESCYSROW;
        DESCYSROW= ( int* ) malloc ( DLEN_ * sizeof ( int ) );
        if ( DESCYSROW==NULL ) {
            printf ( "unable to allocate memory for descriptor for AB_sol\n" );
            return -1;
        }
        //YSrow (1,Ddim) is distributed across processes of ICTXT2D starting from process (0,0) into blocks of size (1,blocksize)
        descinit_ ( DESCYSROW, &i_one, &Ddim, &i_one,&blocksize, &i_zero, &i_zero, &ICTXT2D, &i_one, &info );
        if ( info!=0 ) {
            printf ( "Descriptor of matrix C returns info: %d\n",info );
            return info;
        }

        blacs_barrier_(&ICTXT2D,"A");

        //Calculating diagonal elements 1 by 1 of the (0,0)-block of C^-1.
        for (i=1; i<=Adim; ++i) {
            pdsymm_ ("R","U",&i_one,&Ddim,&d_one,D,&i_one,&i_one,DESCD,AB_sol,&i,&i_one,DESCAB_sol,&d_zero,YSrow,&i_one,&i_one,DESCYSROW);
            pddot_(&Ddim,InvD_T_Block+i-1,AB_sol,&i,&i_one,DESCAB_sol,&Adim,YSrow,&i_one,&i_one,DESCYSROW,&i_one);
            /*if(*position==1 && pcol==1)
            printf("Dot product in process (1,1) is: %g\n", *(InvD_T_Block+i-1));
            if(*position==0 && pcol==1)
            printf("Dot product in process (0,1) is: %g\n",*(InvD_T_Block+i-1));*/
        }
        blacs_barrier_(&ICTXT2D,"A");
        if(YSrow != NULL)
            free(YSrow);
        YSrow = NULL;
        if(DESCYSROW != NULL)
            free(DESCYSROW);
        DESCYSROW = NULL;
        if(AB_sol != NULL)
            free(AB_sol);
        AB_sol = NULL;
        if(DESCAB_sol != NULL)
            free(DESCAB_sol);
        DESCAB_sol = NULL;
        if(D != NULL)
            free(D);
        D = NULL;
        if(DESCD != NULL)
            free(DESCD);
        DESCD = NULL;

        //Only in the root process we add the diagonal elements of A^-1
        if (iam ==0) {
            for(i=0; i<Adim; ++i) {
                j=Asparse.pRows[i];
                *(InvD_T_Block+i) += Asparse.pData[j];
            }
            Asparse.clear();
            printdense ( Adim+k,1,InvD_T_Block,"diag_inverse_C_parallel.txt" );
        }
        if(InvD_T_Block != NULL)
            free(InvD_T_Block);
        InvD_T_Block = NULL;
	blacs_gridexit_(&ICTXT2D);
    }
    //cout << iam << " reached end before MPI_Barrier" << endl;
    MPI_Barrier(MPI_COMM_WORLD);
    //MPI_Finalize();

    return 0;
}
Example #18
0
int main (int argc, char *argv[]) {
  myscalar *A=NULL, *X=NULL, *B=NULL, *Y=NULL, *Xtrue=NULL;
  myscalar elem;
  int descA[BLACSCTXTSIZE], descVec[BLACSCTXTSIZE], descelem[BLACSCTXTSIZE];
  int n;
  int nrhs;
  int nb;
  int locr, locc;
  int i, j, ii, jj;
  int ierr;
  int dummy;
  int myid, np;
  int myrow, mycol, nprow, npcol;
  int ctxt;
  myreal rdummy, res;

  n=1024; /* Size of the problem */
  nrhs=3; /* Number of RHS */
  nb=16;  /* Blocksize for the 2D block-cyclic distribution */

  /* Initialize MPI */
  if((ierr=MPI_Init(&argc,&argv)))
    return 1;
  myid=-1;
  if((ierr=MPI_Comm_rank(MPI_COMM_WORLD,&myid)))
    return 1;
  np=-1;
  if((ierr=MPI_Comm_size(MPI_COMM_WORLD,&np)))
    return 1;

  /* Initialize the BLACS grid */
  nprow=floor(sqrt((float)np));
  npcol=np/nprow;
  blacs_get_(&IZERO,&IZERO,&ctxt);
  blacs_gridinit_(&ctxt,"R",&nprow,&npcol);
  blacs_gridinfo_(&ctxt,&nprow,&npcol,&myrow,&mycol);

  /* A is a dense n x n distributed Toeplitz matrix */
  if(myid<nprow*npcol) {
    locr=numroc_(&n,&nb,&myrow,&IZERO,&nprow);
    locc=numroc_(&n,&nb,&mycol,&IZERO,&npcol);
    A=new myscalar[locr*locc];
    dummy=std::max(1,locr);
    descinit_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr);

    myreal pi=3.1416, d=0.1;
    for(i=1;i<=locr;i++) {
      for(j=1;j<=locc;j++) {
        ii=indxl2g_(&i,&nb,&myrow,&IZERO,&nprow);
        jj=indxl2g_(&j,&nb,&mycol,&IZERO,&npcol);
        // Toeplitz matrix from Quantum Chemistry.
        A[locr*(j-1)+(i-1)]=ii==jj?std::pow(pi,2)/6.0/std::pow(d,2):std::pow(-1.0,ii-jj)/std::pow((myreal)ii-jj,2)/std::pow(d,2);
      }
    }
  } else {
    descset_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE);
  }

  /* Initializing solution */
  if(myid<nprow*npcol) {
    locr=numroc_(&n,&nb,&myrow,&IZERO,&nprow);
    locc=numroc_(&nrhs,&nb,&mycol,&IZERO,&npcol);
    dummy=std::max(1,locr);
    descinit_(descVec,&n,&nrhs,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr);
    Xtrue=new myscalar[locr*locc]();
    for(i=0;i<locr*locc;i++)
      Xtrue[i]=static_cast<myscalar>(rand())/(static_cast<myscalar>(RAND_MAX));
  } else {
    descset_(descVec,&n,&nrhs,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE);
  }

  /* Initializing solution and intermediate vector space */
  if(myid<nprow*npcol) {
    X=new myscalar[locr*locc]();
    Y=new myscalar[locr*locc]();
  }

  /* Initializing RHS as A*Xtrue */
  if(myid<nprow*npcol) {
    B=new myscalar[locr*locc]();
    pgemm('N','N',n,nrhs,n,ONE,A,IONE,IONE,descA,Xtrue,IONE,IONE,descVec,ZERO,B,IONE,IONE,descVec);
  }

  /* Initialize the solver and set parameters */
  StrumpackDensePackage<myscalar,myreal> sdp(MPI_COMM_WORLD);
  sdp.use_HSS=true;
  sdp.levels_HSS=4;
  sdp.min_rand_HSS=64;
  sdp.lim_rand_HSS=0;
  sdp.tol_HSS=1e-12;
  sdp.split_HSS=768; /* Size of A11 */

  /* Compression */
  sdp.compress(A,descA);

  /* Accuracy checking */
  sdp.check_compression(A,descA);

  /* Factorization */
  sdp.partially_factor(A,descA,sdp.split_HSS);

  /* Schur complement update */
  sdp.compute_schur();

  /* Extracting a random element from the Schur complement */
  if(myid<nprow*npcol) {
    descinit_(descelem,&IONE,&IONE,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr);
  } else {
    descset_(descelem,&IONE,&IONE,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE);
  }
  i=1+rand()%(n-sdp.split_HSS);
  j=1+rand()%(n-sdp.split_HSS);
  MPI_Bcast((void *)&i,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD);
  MPI_Bcast((void *)&j,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD);
  sdp.extract_schur(&elem,descelem,&i,IONE,&j,IONE);
  if(!myid)
    std::cout << "Element (" << i << "," << j << ") of Schur complement = " << elem << std::endl << std::endl;

  /* Condensation */
  sdp.reduce_RHS(Y,descVec,B,descVec);

  /* Solve the Schur complement system (touches only the bottom part of X)*/
  sdp.verbose=false;
  if(!myid)
    std::cout << "Solving Schur complement system with Conjugate Gradient..." << std::endl << std::endl;
  CG(&sdp,X,Y,descVec,n,nrhs,3000,1e-14);
  sdp.verbose=true;

  /* Expansion (touches only the top part of X) */
  sdp.expand_solution(X,descVec,Y,descVec);

  /* Accuracy checking */
  sdp.check_solution(A,descA,X,descVec,B,descVec);

  /* Forward error */
  if(myid<nprow*npcol) {
    for(i=0;i<locr*locc;i++)
      Y[i]=X[i]-Xtrue[i];
    res=plange('F',n,nrhs,Y,IONE,IONE,descVec,&rdummy);
    res/=plange('F',n,nrhs,Xtrue,IONE,IONE,descVec,&rdummy);
    if(!myid)
      std::cout << "Forward error = " << res << std::endl;
  }

  /* Statistics */
  sdp.print_statistics();

  /* Clean-up */
  delete[] A;
  delete[] B;
  delete[] X;
  delete[] Y;
  delete[] Xtrue;

  /* The end */
  MPI_Finalize();
  return 0;

}
Example #19
0
int set_up_BD ( int * DESCD, double * Dmat, CSRdouble& BT_i, CSRdouble& B_j, CSRdouble& Btsparse ) {

    // Read-in of matrices X, Z and T from file (filename[X,Z,T])
    // X and Z are read in entrely by every process
    // T is read in strip by strip (number of rows in each process is at maximum = blocksize)
    // D is constructed directly in a distributed way
    // B is first assembled sparse in root process and afterwards the necessary parts
    // for constructing the distributed Schur complement are sent to each process

    FILE *fT;
    int ni, i,j, info;
    int *DESCT;
    double *Tblock, *temp;
    int nTblocks, nstrips, pTblocks, stripcols, lld_T, pcol, colcur,rowcur;

    CSRdouble Xtsparse, Ztsparse,XtT_sparse,ZtT_sparse,XtT_temp, ZtT_temp;

    Xtsparse.loadFromFile ( filenameX );
    Ztsparse.loadFromFile ( filenameZ );

    Xtsparse.transposeIt ( 1 );
    Ztsparse.transposeIt ( 1 );

    XtT_sparse.allocate ( m,k,0 );
    ZtT_sparse.allocate ( l,k,0 );



    pcol= * ( position+1 );

    // Matrix T is read in by strips of size (blocksize * *(dims+1), k)
    // Strips of T are read in row-wise and thus it is as if we store strips of T' (transpose) column-wise with dimensions (k, blocksize * *(dims+1))
    // However we must then also transpose the process grid to distribute T' correctly

    // number of strips in which we divide matrix T'
    nstrips= n % ( blocksize * * ( dims+1 ) ) ==0 ?  n / ( blocksize * * ( dims+1 ) ) : ( n / ( blocksize * * ( dims+1 ) ) ) +1;

    //the number of columns of T' included in each strip
    stripcols= blocksize * * ( dims+1 );

    //number of blocks necessary to store complete column of T'
    nTblocks= k%blocksize==0 ? k/blocksize : k/blocksize +1;

    //number of blocks necessary in this process to store complete column of T'
    pTblocks= ( nTblocks - *position ) % *dims == 0 ? ( nTblocks- *position ) / *dims : ( nTblocks- *position ) / *dims +1;
    pTblocks= pTblocks <1? 1:pTblocks;

    //local leading dimension of the strip of T' (different from process to process)
    lld_T=pTblocks*blocksize;

    // Initialisation of descriptor of strips of matrix T'
    DESCT= ( int* ) malloc ( DLEN_ * sizeof ( int ) );
    if ( DESCT==NULL ) {
        printf ( "unable to allocate memory for descriptor for Z\n" );
        return -1;
    }
    // strip of T (k,stripcols) is distributed across ICTXT2D starting in process (0,0) in blocks of size (blocksize,blocksize)
    // the local leading dimension in this process is lld_T
    descinit_ ( DESCT, &k, &stripcols, &blocksize, &blocksize, &i_zero, &i_zero, &ICTXT2D, &lld_T, &info );
    if ( info!=0 ) {
        printf ( "Descriptor of matrix Z returns info: %d\n",info );
        return info;
    }

    // Allocation of memory for the strip of T' in all processes

    Tblock= ( double* ) calloc ( pTblocks*blocksize*blocksize, sizeof ( double ) );
    if ( Tblock==NULL ) {
        printf ( "Error in allocating memory for a strip of Z in processor (%d,%d)",*position,* ( position+1 ) );
        return -1;
    }

    // Initialisation of matrix D (all diagonal elements of D equal to lambda)
    temp=Dmat;
    for ( i=0,rowcur=0,colcur=0; i<Dblocks; ++i, ++colcur, ++rowcur ) {
        if ( rowcur==*dims ) {
            rowcur=0;
            temp += blocksize;
        }
        if ( colcur==* ( dims+1 ) ) {
            colcur=0;
            temp += blocksize*lld_D;
        }
        if ( *position==rowcur && * ( position+1 ) == colcur ) {
            for ( j=0; j<blocksize; ++j ) {
                * ( temp + j  * lld_D +j ) =lambda;
            }
            if ( i==Dblocks-1 && Ddim % blocksize != 0 ) {
                for ( j=blocksize-1; j>= Ddim % blocksize; --j ) {
                    * ( temp + j * lld_D + j ) =0.0;
                }
            }
        }
    }

    fT=fopen ( filenameT,"rb" );
    if ( fT==NULL ) {
        printf ( "Error opening file\n" );
        return -1;
    }

    // Set up of matrix D and B per strip of T'

    for ( ni=0; ni<nstrips; ++ni ) {
        if ( ni==nstrips-1 ) {
            if(Tblock != NULL)
                free ( Tblock );
            Tblock=NULL;

            Tblock= ( double* ) calloc ( pTblocks*blocksize*blocksize, sizeof ( double ) );
            if ( Tblock==NULL ) {
                printf ( "Error in allocating memory for a strip of Z in processor (%d,%d)\n",*position,* ( position+1 ) );
                return -1;
            }
        }

        //Each process only reads in a part of the strip of T'
        //When k is not a multiple of blocksize, read-in of the last elements of the rows of T is tricky
        if ( ( nTblocks-1 ) % *dims == *position && k%blocksize !=0 ) {
            if ( ni==0 ) {
                info=fseek ( fT, ( long ) ( pcol * blocksize * ( k ) * sizeof ( double ) ),SEEK_SET );
                if ( info!=0 ) {
                    printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info );
                    return -1;
                }
            } else {
                info=fseek ( fT, ( long ) ( blocksize * ( * ( dims+1 )-1 ) * ( k ) * sizeof ( double ) ),SEEK_CUR );
                if ( info!=0 ) {
                    printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info );
                    return -1;
                }
            }
            for ( i=0; i<blocksize; ++i ) {
                info=fseek ( fT, ( long ) ( blocksize * *position * sizeof ( double ) ),SEEK_CUR );
                if ( info!=0 ) {
                    printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info );
                    return -1;
                }
                for ( j=0; j < pTblocks-1; ++j ) {
                    fread ( Tblock + i*pTblocks*blocksize + j*blocksize,sizeof ( double ),blocksize,fT );
                    info=fseek ( fT, ( long ) ( ( ( *dims ) -1 ) * blocksize * sizeof ( double ) ),SEEK_CUR );
                    if ( info!=0 ) {
                        printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info );
                        return -1;
                    }
                }
                fread ( Tblock + i*pTblocks*blocksize + j*blocksize,sizeof ( double ),k%blocksize,fT );
            }
            //Normal read-in of the strips of T from a binary file (each time blocksize elements are read in)
        } else {
            if ( ni==0 ) {
                info=fseek ( fT, ( long ) ( pcol * blocksize * ( k ) * sizeof ( double ) ),SEEK_SET );
                if ( info!=0 ) {
                    printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info );
                    return -1;
                }
            } else {
                info=fseek ( fT, ( long ) ( blocksize * ( * ( dims+1 )-1 ) * ( k ) * sizeof ( double ) ),SEEK_CUR );
                if ( info!=0 ) {
                    printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info );
                    return -1;
                }
            }
            for ( i=0; i<blocksize; ++i ) {
                info=fseek ( fT, ( long ) ( blocksize * *position * sizeof ( double ) ),SEEK_CUR );
                if ( info!=0 ) {
                    printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info );
                    return -1;
                }
                for ( j=0; j < pTblocks-1; ++j ) {
                    fread ( Tblock + i*pTblocks*blocksize + j*blocksize,sizeof ( double ),blocksize,fT );
                    info=fseek ( fT, ( long ) ( ( * ( dims )-1 ) * blocksize * sizeof ( double ) ),SEEK_CUR );
                    if ( info!=0 ) {
                        printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info );
                        return -1;
                    }
                }
                fread ( Tblock + i*pTblocks*blocksize + j*blocksize,sizeof ( double ),blocksize,fT );
                info=fseek ( fT, ( long ) ( ( k - blocksize * ( ( pTblocks-1 ) * *dims + *position +1 ) ) * sizeof ( double ) ),SEEK_CUR );
                if ( info!=0 ) {
                    printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info );
                    return -1;
                }
            }
        }

        blacs_barrier_ ( &ICTXT2D,"A" );

        // End of read-in

        // Matrix D is the sum of the multiplications of all strips of T' by their transpose
        // Up unitl now, the entire matrix is stored, not only upper/lower triangular, which is possible since D is symmetric
        // Be aware, that you akways have to allocate memory for the enitre matrix, even when only dealing with the upper/lower triangular part

        pdgemm_ ( "N","T",&k,&k,&stripcols,&d_one, Tblock,&i_one, &i_one,DESCT, Tblock,&i_one, &i_one,DESCT, &d_one, Dmat, &i_one, &i_one, DESCD ); //Z'Z
        //pdsyrk_ ( "U","N",&k,&stripcols,&d_one, Tblock,&i_one, &i_one,DESCT, &d_one, Dmat, &t_plus, &t_plus, DESCD );

        // Matrix B consists of X'T and Z'T, since each process only has some parts of T at its disposal,
        // we need to make sure that the correct columns of Z and X are multiplied with the correct columns of T.
        for ( i=0; i<pTblocks; ++i ) {
            XtT_temp.ncols=k;

            //This function multiplies the correct columns of X' with the blocks of T at the disposal of the process
            // The result is also stored immediately at the correct positions of X'T. (see src/tools.cpp)
	    XtT_temp.clear();
            mult_colsA_colsC ( Xtsparse, Tblock+i*blocksize, lld_T, ( * ( dims+1 ) * ni + pcol ) *blocksize, blocksize,
                               ( *dims * i + *position ) *blocksize, blocksize, XtT_temp, 0 );
            if ( XtT_temp.nonzeros>0 ) {
                if ( XtT_sparse.nonzeros==0 ){
		  XtT_sparse.clear();
                    XtT_sparse.make2 ( XtT_temp.nrows,XtT_temp.ncols,XtT_temp.nonzeros,XtT_temp.pRows,XtT_temp.pCols,XtT_temp.pData );
		}
                else {
                    XtT_sparse.addBCSR ( XtT_temp );
                }
            }
        }
        //Same as above for calculating Z'T
        for ( i=0; i<pTblocks; ++i ) {
            ZtT_temp.ncols=k;
	    ZtT_temp.clear();
            mult_colsA_colsC ( Ztsparse, Tblock+i*blocksize, lld_T, ( * ( dims+1 ) * ni + pcol ) *blocksize, blocksize,
                               blocksize * ( *dims * i + *position ), blocksize, ZtT_temp, 0 );
            if ( ZtT_temp.nonzeros>0 ) {
                if ( ZtT_sparse.nonzeros==0 ){
		  ZtT_sparse.clear();
                    ZtT_sparse.make2 ( ZtT_temp.nrows,ZtT_temp.ncols,ZtT_temp.nonzeros,ZtT_temp.pRows,ZtT_temp.pCols,ZtT_temp.pData );
		}
                else
                    ZtT_sparse.addBCSR ( ZtT_temp );
            }
        }
        blacs_barrier_ ( &ICTXT2D,"A" );
    }
    XtT_temp.clear();
    ZtT_temp.clear();
    Xtsparse.clear();
    Ztsparse.clear();
    if(DESCT != NULL)
        free ( DESCT );
    DESCT=NULL;
    if(Tblock != NULL)
        free ( Tblock );
    Tblock=NULL;

    //printf("T read in\n");

    info=fclose ( fT );
    if ( info!=0 ) {
        printf ( "Error in closing open streams" );
        return -1;
    }
    if(filenameT != NULL)
        free(filenameT);
    filenameT=NULL;

    //Each process only has calculated some parts of B
    //All parts are collected by the root process (iam==0), which assembles B
    //Each process then receives BT_i and B_j corresponding to the D_ij available to the process
    if ( iam!=0 ) {
        //Each process other than root sends its X' * T and Z' * T to the root process.
        MPI_Send ( & ( XtT_sparse.nonzeros ),1, MPI_INT,0,iam,MPI_COMM_WORLD );
        MPI_Send ( & ( XtT_sparse.pRows[0] ),XtT_sparse.nrows + 1, MPI_INT,0,iam+size,MPI_COMM_WORLD );
        MPI_Send ( & ( XtT_sparse.pCols[0] ),XtT_sparse.nonzeros, MPI_INT,0,iam+2*size,MPI_COMM_WORLD );
        MPI_Send ( & ( XtT_sparse.pData[0] ),XtT_sparse.nonzeros, MPI_DOUBLE,0,iam+3*size,MPI_COMM_WORLD );
        XtT_sparse.clear();
        MPI_Send ( & ( ZtT_sparse.nonzeros ),1, MPI_INT,0,iam,MPI_COMM_WORLD );
        MPI_Send ( & ( ZtT_sparse.pRows[0] ),ZtT_sparse.nrows + 1, MPI_INT,0,4*size + iam,MPI_COMM_WORLD );
        MPI_Send ( & ( ZtT_sparse.pCols[0] ),ZtT_sparse.nonzeros, MPI_INT,0,iam+ 5*size,MPI_COMM_WORLD );
        MPI_Send ( & ( ZtT_sparse.pData[0] ),ZtT_sparse.nonzeros, MPI_DOUBLE,0,iam+6*size,MPI_COMM_WORLD );
        ZtT_sparse.clear();
        //printf("Process %d sent ZtT and XtT\n",iam);

        // And eventually receives the necessary BT_i and B_j
        // Blocking sends are used, which is why the order of the receives is critical depending on the coordinates of the process
        int nonzeroes;
        if (*position >= pcol) {
            MPI_Recv ( &nonzeroes,1,MPI_INT,0,iam,MPI_COMM_WORLD,&status );
            BT_i.allocate ( blocksize*Drows,m+l,nonzeroes );
            MPI_Recv ( & ( BT_i.pRows[0] ),blocksize*Drows + 1, MPI_INT,0,iam + size,MPI_COMM_WORLD,&status );
            int count;
            MPI_Get_count(&status,MPI_INT,&count);
            BT_i.nrows=count-1;
            MPI_Recv ( & ( BT_i.pCols[0] ),nonzeroes, MPI_INT,0,iam+2*size,MPI_COMM_WORLD,&status );
            MPI_Recv ( & ( BT_i.pData[0] ),nonzeroes, MPI_DOUBLE,0,iam+3*size,MPI_COMM_WORLD,&status );

            MPI_Recv ( &nonzeroes,1, MPI_INT,0,iam+4*size,MPI_COMM_WORLD,&status );

            B_j.allocate ( blocksize*Dcols,m+l,nonzeroes );

            MPI_Recv ( & ( B_j.pRows[0] ),blocksize*Dcols + 1, MPI_INT,0,iam + 5*size,MPI_COMM_WORLD,&status );
            MPI_Get_count(&status,MPI_INT,&count);
            B_j.nrows=count-1;
            MPI_Recv ( & ( B_j.pCols[0] ),nonzeroes, MPI_INT,0,iam+6*size,MPI_COMM_WORLD,&status );
            MPI_Recv ( & ( B_j.pData[0] ),nonzeroes, MPI_DOUBLE,0,iam+7*size,MPI_COMM_WORLD,&status );

            //Actually BT_j is sent, so it still needs to be transposed
            B_j.transposeIt ( 1 );
        }
        else {
            MPI_Recv ( &nonzeroes,1, MPI_INT,0,iam+4*size,MPI_COMM_WORLD,&status );

            B_j.allocate ( blocksize*Dcols,m+l,nonzeroes );

            MPI_Recv ( & ( B_j.pRows[0] ),blocksize*Dcols + 1, MPI_INT,0,iam + 5*size,MPI_COMM_WORLD,&status );
            int count;
            MPI_Get_count(&status,MPI_INT,&count);
            B_j.nrows=count-1;

            MPI_Recv ( & ( B_j.pCols[0] ),nonzeroes, MPI_INT,0,iam+6*size,MPI_COMM_WORLD,&status );

            MPI_Recv ( & ( B_j.pData[0] ),nonzeroes, MPI_DOUBLE,0,iam+7*size,MPI_COMM_WORLD,&status );

            B_j.transposeIt ( 1 );

            MPI_Recv ( &nonzeroes,1,MPI_INT,0,iam,MPI_COMM_WORLD,&status );
            BT_i.allocate ( blocksize*Drows,m+l,nonzeroes );
            MPI_Recv ( & ( BT_i.pRows[0] ),blocksize*Drows + 1, MPI_INT,0,iam + size,MPI_COMM_WORLD,&status );
            MPI_Get_count(&status,MPI_INT,&count);
            BT_i.nrows=count-1;
            MPI_Recv ( & ( BT_i.pCols[0] ),nonzeroes, MPI_INT,0,iam+2*size,MPI_COMM_WORLD,&status );
            MPI_Recv ( & ( BT_i.pData[0] ),nonzeroes, MPI_DOUBLE,0,iam+3*size,MPI_COMM_WORLD,&status );
        }
    }
    else {
        for ( i=1; i<size; ++i ) {
            // The root process receives parts of X' * T and Z' * T sequentially from all processes and directly adds them together.
            int nonzeroes;
            MPI_Recv ( &nonzeroes,1,MPI_INT,i,i,MPI_COMM_WORLD,&status );
            if(nonzeroes>0) {
                XtT_temp.allocate ( m,k,nonzeroes );
                MPI_Recv ( & ( XtT_temp.pRows[0] ),m + 1, MPI_INT,i,i+size,MPI_COMM_WORLD,&status );
                MPI_Recv ( & ( XtT_temp.pCols[0] ),nonzeroes, MPI_INT,i,i+2*size,MPI_COMM_WORLD,&status );
                MPI_Recv ( & ( XtT_temp.pData[0] ),nonzeroes, MPI_DOUBLE,i,i+3*size,MPI_COMM_WORLD,&status );

                XtT_sparse.addBCSR ( XtT_temp );
                XtT_temp.clear();
            }

            MPI_Recv ( &nonzeroes,1, MPI_INT,i,i,MPI_COMM_WORLD,&status );

            if(nonzeroes>0) {
                ZtT_temp.allocate ( l,k,nonzeroes );

                MPI_Recv ( & ( ZtT_temp.pRows[0] ),l + 1, MPI_INT,i,4*size + i,MPI_COMM_WORLD,&status );
                MPI_Recv ( & ( ZtT_temp.pCols[0] ),nonzeroes, MPI_INT,i,i+ 5*size,MPI_COMM_WORLD,&status );
                MPI_Recv ( & ( ZtT_temp.pData[0] ),nonzeroes, MPI_DOUBLE,i,i+6*size,MPI_COMM_WORLD,&status );

                ZtT_sparse.addBCSR ( ZtT_temp );
                ZtT_temp.clear();
            }
        }
        XtT_sparse.transposeIt ( 1 );
        ZtT_sparse.transposeIt ( 1 );

        // B' is created by concatening blocks X'T and Z'T
        create1x2BlockMatrix ( XtT_sparse, ZtT_sparse,Btsparse );
        XtT_sparse.clear();
        ZtT_sparse.clear();
        /*Btsparse.transposeIt(1);
            Btsparse.writeToFile("B_sparse.csr");
        Btsparse.transposeIt(1);*/

        // For each process row i BT_i is created which is also sent to processes in column i to become B_j.
        for ( int rowproc= *dims - 1; rowproc>= 0; --rowproc ) {
            BT_i.ncols=Btsparse.ncols;
            BT_i.nrows=0;
            BT_i.nonzeros=0;
            int Drows_rowproc;
            if (rowproc!=0) {
                Drows_rowproc= ( Dblocks - rowproc ) % *dims == 0 ? ( Dblocks- rowproc ) / *dims : ( Dblocks- rowproc ) / *dims +1;
                Drows_rowproc= Drows_rowproc<1? 1 : Drows_rowproc;
            }
            else
                Drows_rowproc=Drows;
            for ( i=0; i<Drows_rowproc; ++i ) {
                //Each process in row i can hold several blocks of contiguous rows of D for which we need the corresponding rows of B_T
                // Therefore we use the function extendrows to create BT_i (see src/tools.cpp)
                BT_i.extendrows ( Btsparse, ( i * *dims + rowproc ) * blocksize,blocksize );
            }
            for ( int colproc= ( rowproc==0 ? 1 : 0 ); colproc < * ( dims+1 ); ++colproc ) {
                int rankproc;
                rankproc= blacs_pnum_ (&ICTXT2D, &rowproc,&colproc);

                MPI_Send ( & ( BT_i.nonzeros ),1, MPI_INT,rankproc,rankproc,MPI_COMM_WORLD );
                MPI_Send ( & ( BT_i.pRows[0] ),BT_i.nrows + 1, MPI_INT,rankproc,rankproc+size,MPI_COMM_WORLD );
                MPI_Send ( & ( BT_i.pCols[0] ),BT_i.nonzeros, MPI_INT,rankproc,rankproc+2*size,MPI_COMM_WORLD );
                MPI_Send ( & ( BT_i.pData[0] ),BT_i.nonzeros, MPI_DOUBLE,rankproc,rankproc+3*size,MPI_COMM_WORLD );

                //printf("BT_i's sent to processor %d\n",rankproc);

                rankproc= blacs_pnum_ (&ICTXT2D, &colproc,&rowproc);
                MPI_Send ( & ( BT_i.nonzeros ),1, MPI_INT,rankproc,rankproc+4*size,MPI_COMM_WORLD );
                MPI_Send ( & ( BT_i.pRows[0] ),BT_i.nrows + 1, MPI_INT,rankproc,rankproc+5*size,MPI_COMM_WORLD );
                MPI_Send ( & ( BT_i.pCols[0] ),BT_i.nonzeros, MPI_INT,rankproc,rankproc+6*size,MPI_COMM_WORLD );
                MPI_Send ( & ( BT_i.pData[0] ),BT_i.nonzeros, MPI_DOUBLE,rankproc,rankproc+7*size,MPI_COMM_WORLD );

                //printf("B_j's sent to processor %d\n",rankproc);
            }
        }
        B_j.make2 ( BT_i.nrows,BT_i.ncols,BT_i.nonzeros,BT_i.pRows,BT_i.pCols,BT_i.pData );
        B_j.transposeIt ( 1 );
    }
    return 0;
}
Example #20
0
/*==== MAIN FUNCTION =================================================*/
int main( int argc, char *argv[] ){

/*  ==== Declarations =================================================== */

/*  File variables */
    FILE    *fin;

/*  Matrix descriptors */
    MDESC   descA, descB, descC, descA_local, descB_local;

/*  Local scalars */
    MKL_INT iam, nprocs, ictxt, myrow, mycol, nprow, npcol;
    MKL_INT n, nb, mp, nq, lld, lld_local;
    MKL_INT i, j, info;
    int     n_int, nb_int, nprow_int, npcol_int;
    double  thresh, diffnorm, anorm, bnorm, residual, eps;

/*  Local arrays */
    double  *A_local, *B_local, *A, *B, *C, *work;
    MKL_INT iwork[ 4 ];


/*  ==== Executable statements ========================================== */

/*  Get information about how many processes are used for program execution
    and number of current process */
    blacs_pinfo_( &iam, &nprocs );

/*  Init temporary 1D process grid */
    blacs_get_( &i_negone, &i_zero, &ictxt );
    blacs_gridinit_( &ictxt, "C", &nprocs, &i_one );

/*  Open input file */
    if ( iam == 0 ) {
        fin = fopen( "../in/pblas3ex.in", "r" );
        if ( fin == NULL ) {
            printf( "Error while open input file." );
            return 2;
        }
    }

/*  Read data and send it to all processes */
    if ( iam == 0 ) {

/*      Read parameters */
        fscanf( fin, "%d n, dimension of vectors, must be > 0 ", &n_int );
        fscanf( fin, "%d nb, size of blocks, must be > 0 ", &nb_int );
        fscanf( fin, "%d p, number of rows in the process grid, must be > 0", &nprow_int );
        fscanf( fin, "%d q, number of columns in the process grid, must be > 0, p*q = number of processes", &npcol_int );
        fscanf( fin, "%lf threshold for residual check (to switch off check set it < 0.0) ", &thresh );
        n = (MKL_INT) n_int;
        nb = (MKL_INT) nb_int;
        nprow = (MKL_INT) nprow_int;
        npcol = (MKL_INT) npcol_int;

/*      Check if all parameters are correct */
        if( ( n<=0 )||( nb<=0 )||( nprow<=0 )||( npcol<=0 )||( nprow*npcol != nprocs ) ) {
            printf( "One or several input parameters has incorrect value. Limitations:\n" );
            printf( "n > 0, nb > 0, p > 0, q > 0 - integer\n" );
            printf( "p*q = number of processes\n" );
            printf( "threshold - double (set to negative to swicth off check)\n");
            return 2;
        }

/*      Pack data into array and send it to other processes */
        iwork[ 0 ] = n;
        iwork[ 1 ] = nb;
        iwork[ 2 ] = nprow;
        iwork[ 3 ] = npcol;
        igebs2d_( &ictxt, "All", " ", &i_four, &i_one, iwork, &i_four );
        dgebs2d_( &ictxt, "All", " ", &i_one, &i_one, &thresh, &i_one );
    } else {

/*      Recieve and unpack data */
        igebr2d_( &ictxt, "All", " ", &i_four, &i_one, iwork, &i_four, &i_zero, &i_zero );
        dgebr2d_( &ictxt, "All", " ", &i_one, &i_one, &thresh, &i_one, &i_zero, &i_zero );
        n = iwork[ 0 ];
        nb = iwork[ 1 ];
        nprow = iwork[ 2 ];
        npcol = iwork[ 3 ];
    }
    if ( iam == 0 ) { fclose( fin ); }

/*  Destroy temporary process grid */
    blacs_gridexit_( &ictxt );

/*  Init workind 2D process grid */
    blacs_get_( &i_negone, &i_zero, &ictxt );
    blacs_gridinit_( &ictxt, "R", &nprow, &npcol );
    blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol );

/*  Create on process 0 two matrices: A - orthonormal, B -random */
    if ( ( myrow == 0 ) && ( mycol == 0 ) ){

/*      Allocate arrays */
        A_local = (double*) calloc( n*n, sizeof( double ) );
        B_local = (double*) calloc( n*n, sizeof( double ) );

/*      Set arrays */
        for ( i=0; i<n; i++ ){
            for ( j=0; j<n; j++ ){
                B_local[ i+n*j ] = one*rand()/RAND_MAX;
            }
            B_local[ i+n*i ] += two;
        }
        for ( j=0; j<n; j++ ){
            for ( i=0; i<n; i++ ){
                if ( j < n-1 ){
                    if ( i <= j ){
                        A_local[ i+n*j ] = one / sqrt( ( double )( (j+1)*(j+2) ) );
                    } else if ( i == j+1 ) {
                        A_local[ i+n*j ] = -one / sqrt( one + one/( double )(j+1) );
                    } else {
                        A_local[ i+n*j ] = zero;
                    }
                } else {
                    A_local[ i+n*(n-1) ] = one / sqrt( ( double )n );
                }
            }
        }

/*      Print information of task */
        printf( "=== START OF EXAMPLE ===================\n" );
        printf( "Matrix-matrix multiplication: A*B = C\n\n" );
        printf( "/  1/q_1 ........   1/q_n-1     1/q_n  \\ \n" );
        printf( "|        .                             | \n" );
        printf( "|         `.           :         :     | \n" );
        printf( "| -1/q_1    `.         :         :     | \n" );
        printf( "|        .    `.       :         :     |  =  A \n" );
        printf( "|   0     `.    `                      | \n" );
        printf( "|   : `.    `.      1/q_n-1     1/q_n  | \n" );
        printf( "|   :   `.    `.                       | \n" );
        printf( "\\   0 .... 0     -(n-1)/q_n-1   1/q_n  / \n\n" );
        printf( "q_i = sqrt( i^2 + i ), i=1..n-1, q_n = sqrt( n )\n\n" );
        printf( "A  -  n*n real matrix (orthonormal) \n" );
        printf( "B  -  random n*n real matrix\n\n" );
        printf( "n = %d, nb = %d; %dx%d - process grid\n\n", n, nb, nprow, npcol );
        printf( "=== PROGRESS ===========================\n" );
    } else {

/*      Other processes don't contain parts of initial arrays */
        A_local = NULL;
        B_local = NULL;
    }

/*  Compute precise length of local pieces and allocate array on
    each process for parts of distributed vectors */
    mp = numroc_( &n, &nb, &myrow, &i_zero, &nprow );
    nq = numroc_( &n, &nb, &mycol, &i_zero, &npcol );
    A = (double*) calloc( mp*nq, sizeof( double ) );
    B = (double*) calloc( mp*nq, sizeof( double ) );
    C = (double*) calloc( mp*nq, sizeof( double ) );

/*  Compute leading dimensions */
    lld_local = MAX( numroc_( &n, &n, &myrow, &i_zero, &nprow ), 1 );
    lld = MAX( mp, 1 );

/*  Initialize descriptors for initial arrays located on 0 process */
    descinit_( descA_local, &n, &n, &n, &n, &i_zero, &i_zero, &ictxt, &lld_local, &info );
    descinit_( descB_local, &n, &n, &n, &n, &i_zero, &i_zero, &ictxt, &lld_local, &info );

/*  Initialize descriptors for distributed arrays */
    descinit_( descA, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info );
    descinit_( descB, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info );
    descinit_( descC, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info );

/*  Distribute matrices from 0 process over process grid */
    pdgeadd_( &trans, &n, &n, &one, A_local, &i_one, &i_one, descA_local, &zero, A, &i_one, &i_one, descA );
    pdgeadd_( &trans, &n, &n, &one, B_local, &i_one, &i_one, descB_local, &zero, B, &i_one, &i_one, descB );
    if( iam == 0 ){ printf( ".. Arrays are distributed ( p?geadd ) ..\n" ); }

/*  Destroy arrays on 0 process - they are not necessary anymore */
    if( ( myrow == 0 ) && ( mycol == 0 ) ){
        free( A_local );
        free( B_local );
    }

/*  Compute norm of A and B */
    work = (double*) calloc( mp, sizeof( double ) );
    anorm = pdlange_( "I", &n, &n, A, &i_one, &i_one, descA, work );
    bnorm = pdlange_( "I", &n, &n, B, &i_one, &i_one, descB, work );
    if( iam == 0 ){ printf( ".. Norms of A and B are computed ( p?lange ) ..\n" ); }

/*  Compute product C = A*B */
    pdgemm_( "N", "N", &n, &n, &n, &one, A, &i_one, &i_one, descA, B, &i_one, &i_one, descB,
             &zero, C, &i_one, &i_one, descC );
    if( iam == 0 ){ printf( ".. Multiplication A*B=C is done ( p?gemm ) ..\n" ); }

/*  Compute difference  B - inv_A*C (inv_A = transpose(A) because A is orthonormal) */
    pdgemm_( "T", "N", &n, &n, &n, &one, A, &i_one, &i_one, descA, C, &i_one, &i_one, descC,
             &negone, B, &i_one, &i_one, descB );
    if( iam == 0 ){ printf( ".. Difference is computed ( p?gemm ) ..\n" ); }

/*  Compute norm of B - inv_A*C (which is contained in B) */
    diffnorm = pdlange_( "I", &n, &n, B, &i_one, &i_one, descB, work );
    free( work );
    if( iam == 0 ){ printf( ".. Norms of the difference B-inv_A*C is computed ( p?lange ) ..\n" ); }

/*  Print results */
    if( iam == 0 ){
        printf( ".. Solutions are compared ..\n" );
        printf( "== Results ==\n" );
        printf( "||A|| = %03.11f\n", anorm );
        printf( "||B|| = %03.11f\n", bnorm );
        printf( "=== END OF EXAMPLE =====================\n" );
    }

/*  Compute machine epsilon */
    eps = pdlamch_( &ictxt, "e" );

/*  Compute residual */
    residual = diffnorm /( two*anorm*bnorm*eps );

/*  Destroy arrays */
    free( A );
    free( B );
    free( C );

/*  Destroy process grid */    
    blacs_gridexit_( &ictxt );
    blacs_exit_( &i_zero );
    
/*  Check if residual passed or failed the threshold */
    if ( ( iam == 0 ) && ( thresh >= zero ) && !( residual <= thresh ) ){
        printf( "FAILED. Residual = %05.16f\n", residual );
        return 1;
    } else {
        return 0;
    }

/*========================================================================
  ====== End of PBLAS Level 3 example ====================================
  ======================================================================*/
}
Example #21
0
void test_gemr2d(int M, int N)
{
    int repeat = 10;
    
    int32_t one = 1;
    int32_t isrc = 0;

    int32_t num_ranks;
    MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);
    int32_t rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int32_t bs_row_A = M / num_ranks + std::min(1, M % num_ranks);
    int32_t bs_col_A = 1;

    int32_t bs_row_B = 1;
    int32_t bs_col_B = 1;

    int32_t blacs_handler = Csys2blacs_handle(MPI_COMM_WORLD);
    int32_t context1 = blacs_handler;
    int32_t context2 = blacs_handler;

    /* create BLACS context */
    Cblacs_gridinit(&context1, "C", num_ranks, 1);
    /* get row and column ranks */
    int32_t rank_row1, rank_col1;
    Cblacs_gridinfo(context1, &num_ranks, &one, &rank_row1, &rank_col1);
    /* get local number of rows and columns of a matrix */
    int32_t num_rows_local1, num_cols_local1;
    num_rows_local1 = numroc_(&M, &bs_row_A, &rank_row1, &isrc, &num_ranks);
    num_cols_local1 = numroc_(&N, &bs_col_A, &rank_col1, &isrc, &one);


    Cblacs_gridinit(&context2, "C", 1, num_ranks);
    int32_t rank_row2, rank_col2;
    Cblacs_gridinfo(context2, &one, &num_ranks, &rank_row2, &rank_col2);
    int32_t num_rows_local2, num_cols_local2;
    num_rows_local2 = numroc_(&M, &bs_row_B, &rank_row2, &isrc, &one);
    num_cols_local2 = numroc_(&N, &bs_col_B, &rank_col2, &isrc, &num_ranks);

    if (rank == 0)
    {
        printf("local dimensions of A: %i x %i\n", num_rows_local1, num_cols_local1);
        printf("local dimensions of B: %i x %i\n", num_rows_local2, num_cols_local2);
    } 

    int32_t descA[9], descB[9], info;
    descinit_(descA, &M, &N, &bs_row_A, &bs_col_A, &isrc, &isrc, &context1, &num_rows_local1, &info);
    descinit_(descB, &M, &N, &bs_row_B, &bs_col_B, &isrc, &isrc, &context2, &num_rows_local2, &info);

    std::vector<double_complex> A(num_rows_local1 * num_cols_local1);
    std::vector<double_complex> B(num_rows_local2 * num_cols_local2, double_complex(0, 0));
    std::vector<double_complex> C(num_rows_local1 * num_cols_local1);

    for (int i = 0; i < num_rows_local1 * num_cols_local1; i++)
    {
        A[i] = double_complex(double(rand()) / RAND_MAX, double(rand()) / RAND_MAX);
        C[i] = A[i];
    }

    double time = -MPI_Wtime();
    for (int i = 0; i < repeat; i++)
    {
        pzgemr2d_(&M, &N, &A[0], &one, &one, descA, &B[0], &one, &one, descB, &context1);
    }
    time += MPI_Wtime();

    if (rank == 0)
    {
        printf("average time %.4f sec, swap speed: %.4f GB/sec\n", time / repeat,
               sizeof(double_complex) * repeat * M * N / double(1 << 30) / time);
    }

    /* check correctness */
    pzgemr2d_(&M, &N, &B[0], &one, &one, descB, &A[0], &one, &one, descA, &context1);
    for (int i = 0; i < num_rows_local1 * num_cols_local1; i++)
    {
        if (std::abs(A[i] - C[i]) > 1e-14)
        {
            printf("Fail.\n");
            exit(0);
        }
    }

    Cblacs_gridexit(context1);
    Cblacs_gridexit(context2);
    Cfree_blacs_system_handle(blacs_handler);
}
Example #22
0
int main ( int argc, char **argv ) {
    int info, i, j, pcol;
    double *D, *AB_sol, *InvD_T_Block, *XSrow;
    int *DESCD, *DESCAB_sol, *DESCXSROW;
    bool readAFromFile = false;
    //CSRdouble BT_i, B_j;
    double* BT_i;
    double* B_j;
    int s_BT_i = 0;
    int s_B_j  = 0;              // size of BT_i (#rows) and B_j (#cols)
    int nx, ny, nz;
    CSRdouble Asparse, Btsparse;

    if (argc < 6) 
    {
        // needed args: nx, ny, nz, Ddim, blocksize; 
        // optional: C.csr - if provided, the output file is written.
        cout << "Too few arguments." << endl;
        cout << "Usage: " << argv[0] << "   nx      ny     nz  Ddim blocksize [C.csr]" << endl;
        cout << "Usage: " << argv[0] << " -fileA filename Adim Ddim blocksize [C.csr]" << endl;
        exit(-1);
    }

    if (strcmp(argv[1], "-fileA") == 0)   // if (argv[1] == "-fileA")
    {
        readAFromFile = true;

        filenameA = new char[250];
        strcpy(filenameA, argv[2]);

        Adim = atoi(argv[3]);
        
    }
    else
    {
        nx   = atoi(argv[1]);
        ny   = atoi(argv[2]);
        nz   = atoi(argv[3]);
        Adim = nx * ny * nz;
    }

    Ddim      = atoi(argv[4]);
    blocksize = atoi(argv[5]);



    //printf("Adim=%d, Ddim=%d, blocksize=%d *** %s", Adim, Ddim, blocksize, outputc);
    //exit(-123);

    //Initialise MPI and some MPI-variables
    info = MPI_Init ( &argc, &argv );
    if ( info != 0 ) {
        printf ( "Error in MPI initialisation: %d\n",info );
        return info;
    }

    position= ( int* ) calloc ( 2,sizeof ( int ) );
    if ( position==NULL ) {
        printf ( "unable to allocate memory for processor position coordinate\n" );
        return EXIT_FAILURE;
    }

    dims= ( int* ) calloc ( 2,sizeof ( int ) );
    if ( dims==NULL ) {
        printf ( "unable to allocate memory for grid dimensions coordinate\n" );
        return EXIT_FAILURE;
    }

    //BLACS is the interface used by PBLAS and ScaLAPACK on top of MPI

    blacs_pinfo_ ( &iam,&size ); 				//determine the number of processes involved


    info=MPI_Dims_create ( size, 2, dims );			//determine the best 2D cartesian grid with the number of processes
    if ( info != 0 ) {
        printf ( "Error in MPI creation of dimensions: %d\n",info );
        return info;
    }
//Until now the code can only work with square process grids
    //So we try to get the biggest square grid possible with the number of processes involved
    if ( *dims != * ( dims+1 ) ) {
        while ( *dims * *dims > size )
            *dims -=1;
        * ( dims+1 ) = *dims;
        if ( iam==0 )
            printf ( "WARNING: %d processor(s) unused due to reformatting to a square process grid\n", size - ( *dims * *dims ) );
        size = *dims * *dims;
        //cout << "New size of process grid: " << size << endl;
    }

    blacs_get_ ( &i_negone,&i_zero,&ICTXT2D );

    //Initialisation of the BLACS process grid, which is referenced as ICTXT2D
    blacs_gridinit_ ( &ICTXT2D,"R",dims, dims+1 );



    if ( iam < size ) {

        //The rank (iam) of the process is mapped to a 2D grid: position= (process row, process column)
        blacs_pcoord_ ( &ICTXT2D,&iam,position, position+1 );
        if ( *position ==-1 ) {
            printf ( "Error in proces grid\n" );
            return -1;
        }


        

        /*
        //Filenames, dimensions of all matrices and other important variables are read in as global variables (see src/readinput.cpp)
        
        info=read_input ( *++argv );
        if ( info!=0 ) 
        {
            printf ( "Something went wrong when reading input file for processor %d\n",iam );
            return -1;
        }
        */
        

        //blacs_barrier is used to stop any process of going beyond this point before all processes have made it up to this point.
        blacs_barrier_ ( &ICTXT2D,"ALL" );

        if ( * ( position+1 ) ==0 && *position==0 )
            printf ( "Reading of input-file succesful\n" );

        if ( * ( position+1 ) ==0 && *position==0 ) {
            printf ( "\nA sparse square matrix of dimension %d with a dense square submatrix with dimension %d \n", Adim+Ddim,Ddim );
            printf ( "was analyzed using %d (%d x %d) processors\n",size,*dims,* ( dims+1 ) );
        }

        pcol= * ( position+1 );

        //Define number of blocks needed to store a complete column/row of D
        Dblocks= Ddim%blocksize==0 ? Ddim/blocksize : Ddim/blocksize +1;

        //Define the number of rowblocks needed by the current process to store its part of the dense matrix D
        Drows= ( Dblocks - *position ) % *dims == 0 ? ( Dblocks- *position ) / *dims : ( Dblocks- *position ) / *dims +1;
        Drows= Drows<1? 1 : Drows;

        //Define the number of columnblocks needed by the current process to store its part of the dense matrix D
        Dcols= ( Dblocks - pcol ) % * ( dims+1 ) == 0 ? ( Dblocks- pcol ) / * ( dims+1 ) : ( Dblocks- pcol ) / * ( dims+1 ) +1;
        Dcols=Dcols<1? 1 : Dcols;

        //Define the local leading dimension of D (keeping in mind that matrices are always stored column-wise)
        lld_D=Drows*blocksize;

        // cout << "Hi! I am " << iam << ". My position is ( " << *position << "," << *(position+1) << ") and I have... Dblocks: " << Dblocks << ";   Drows: " << Drows << ";   Dcols: " << Dcols << ";   blocksize: " << blocksize << endl;

        //Initialise the descriptor of the dense distributed matrix
        DESCD= ( int* ) malloc ( DLEN_ * sizeof ( int ) );
        if ( DESCD==NULL ) {
            printf ( "unable to allocate memory for descriptor for C\n" );
            return -1;
        }

        //D with dimensions (Ddim,Ddim) is distributed over all processes in ICTXT2D, with the first element in process (0,0)
        //D is distributed into blocks of size (blocksize,blocksize), having a local leading dimension lld_D in this specific process
        descinit_ ( DESCD, &Ddim, &Ddim, &blocksize, &blocksize, &i_zero, &i_zero, &ICTXT2D, &lld_D, &info );
        if ( info!=0 ) {
            printf ( "Descriptor of matrix C returns info: %d\n",info );
            return info;
        }

        //Allocate the space necessary to store the part of D that is held into memory of this process.
        D = ( double* ) calloc ( Drows * blocksize * Dcols * blocksize,sizeof ( double ) );
        if ( D==NULL ) {
            printf ( "unable to allocate memory for Matrix D  (required: %ld bytes)\n", Drows * blocksize * Dcols * blocksize * sizeof ( double ) );
            return EXIT_FAILURE;
        }


        blacs_barrier_ ( &ICTXT2D,"ALL" ); //added

        B_j  = new double[Adim * Dcols * blocksize];
        BT_i = new double[Adim * Drows * blocksize];

        //read_in_BD ( DESCD,D, BT_i, B_j, Btsparse ) ;
        if ( iam == 0 )
            cout << "Generating A, B and D... \n" << endl;
        generate_BD(D, BT_i, B_j, &s_BT_i, &s_B_j);

        cout << "- B, D generated." << endl;

        //Now every process has to read in the sparse matrix A

        //makeDiagonalPerturbD(Adim, 1000.0, 1e-10, Asparse); cout << "A is a pert. diag." << endl;
        //makeRandCSRUpper(Adim, 0.001, Asparse);
        //cout << "nnz(A) = " << Asparse.nonzeros << endl;
        //Asparse.loadFromFileSym("/users/drosos/simple/matrices/NornePrimaryJacobian.csr");

        if (readAFromFile)
        {
            Asparse.loadFromFile(filenameA);
            cout << "A loaded from file" << endl;
            //Asparse.reduceSymmetric();
        }
        else
        {
            make3DLaplace(nx, ny, nz, Asparse);
            cout << "A is Laplacian" << endl;
            shiftIndices(Asparse, -1);
        }
        cout << "- A generated." << endl;
        Asparse.matrixType = SYMMETRIC;

        assert(Asparse.nrows == Adim);
        assert(Asparse.ncols == Adim);

        //if (iam == 0) Asparse.writeToFile("A_debug.csr"); exit(-1234);
	

        if (argc == 7)        // if the name of the output file for C is given as parameter
        {
            filenameC = new char[250];
            //strcpy(filenameC, argv[6]);
            sprintf(filenameC, "/scratch/daint/verbof/sparsedense/C_%d_%d.csr", Adim, Ddim);

            CSRdouble Dmat, Dblock, Csparse, Bblock;
            Dblock.nrows=Dblocks * blocksize;
            Dblock.ncols=Dblocks * blocksize;
            Dblock.allocate ( Dblocks * blocksize, Dblocks * blocksize, 0 );
            Dmat.allocate ( 0,0,0 );
            for ( i=0; i<Drows; ++i ) {
                for ( j=0; j<Dcols; ++j ) {
                    dense2CSR_sub ( D + i * blocksize + j * lld_D * blocksize,blocksize,blocksize,lld_D,Dblock, ( * ( dims ) * i + *position ) *blocksize,
                                    ( * ( dims+1 ) * j + pcol ) *blocksize );
                    if ( Dblock.nonzeros>0 ) {
                        if ( Dmat.nonzeros==0 ) {
                            Dmat.make2 ( Dblock.nrows,Dblock.ncols,Dblock.nonzeros,Dblock.pRows,Dblock.pCols,Dblock.pData );
                        } else {
                            Dmat.addBCSR ( Dblock );
                        }
                    }

                    Dblock.clear();
                }
            }
            if ( *position==0 ) {
                Bblock.nrows=Adim;
                Bblock.ncols=Dblocks * blocksize;
                Bblock.allocate ( Adim, Dblocks * blocksize, 0 );
                Btsparse.allocate ( 0,0,0 );
                for ( j=0; j<Dcols; ++j ) {
                    dense2CSR_sub ( B_j + j * Adim * blocksize,Adim,blocksize,Adim,Bblock,0, ( * ( dims+1 ) * j + pcol ) *blocksize );
                    if ( Bblock.nonzeros>0 ) {
                        if ( Btsparse.nonzeros==0 ) {
                            Btsparse.make2 ( Bblock.nrows,Bblock.ncols,Bblock.nonzeros,Bblock.pRows,Bblock.pCols,Bblock.pData );
                        } else {
                            Btsparse.addBCSR ( Bblock );
                        }
                    }

                    Bblock.clear();
                }

            }
            blacs_barrier_ ( &ICTXT2D,"A" );
            if ( iam!=0 ) {
                //Each process other than root sends its Dmat to the root process.
                MPI_Send ( & ( Dmat.nonzeros ),1, MPI_INT,0,iam,MPI_COMM_WORLD );
                MPI_Send ( & ( Dmat.pRows[0] ),Dmat.nrows + 1, MPI_INT,0,iam+size,MPI_COMM_WORLD );
                MPI_Send ( & ( Dmat.pCols[0] ),Dmat.nonzeros, MPI_INT,0,iam+2*size,MPI_COMM_WORLD );
                MPI_Send ( & ( Dmat.pData[0] ),Dmat.nonzeros, MPI_DOUBLE,0,iam+3*size,MPI_COMM_WORLD );
                Dmat.clear();
                if ( *position==0 ) {
                    MPI_Send ( & ( Btsparse.nonzeros ),1, MPI_INT,0,iam+4*size,MPI_COMM_WORLD );
                    MPI_Send ( & ( Btsparse.pRows[0] ),Btsparse.nrows + 1, MPI_INT,0,iam+5*size,MPI_COMM_WORLD );
                    MPI_Send ( & ( Btsparse.pCols[0] ),Btsparse.nonzeros, MPI_INT,0,iam+6*size,MPI_COMM_WORLD );
                    MPI_Send ( & ( Btsparse.pData[0] ),Btsparse.nonzeros, MPI_DOUBLE,0,iam+7*size,MPI_COMM_WORLD );
                    Btsparse.clear();
                }
            } else {
	     
		//Btsparse.writeToFile("Btsparse_pre.csr");
                for ( i=1; i<size; ++i ) {
                    // The root process receives parts of Dmat sequentially from all processes and directly adds them together.
                    int nonzeroes, count;
                    MPI_Recv ( &nonzeroes,1,MPI_INT,i,i,MPI_COMM_WORLD,&status );
                    /*MPI_Get_count(&status, MPI_INT, &count);
                    printf("Process 0 received %d elements of process %d\n",count,i);*/
                    if ( nonzeroes>0 ) {
                        printf ( "Nonzeroes : %d\n ",nonzeroes );
                        Dblock.allocate ( Dblocks * blocksize,Dblocks * blocksize,nonzeroes );
                        MPI_Recv ( & ( Dblock.pRows[0] ), Dblocks * blocksize + 1, MPI_INT,i,i+size,MPI_COMM_WORLD,&status );
                        /*MPI_Get_count(&status, MPI_INT, &count);
                        printf("Process 0 received %d elements of process %d\n",count,i);*/
                        MPI_Recv ( & ( Dblock.pCols[0] ),nonzeroes, MPI_INT,i,i+2*size,MPI_COMM_WORLD,&status );
                        /*MPI_Get_count(&status, MPI_INT, &count);
                        printf("Process 0 received %d elements of process %d\n",count,i);*/
                        MPI_Recv ( & ( Dblock.pData[0] ),nonzeroes, MPI_DOUBLE,i,i+3*size,MPI_COMM_WORLD,&status );
                        /*MPI_Get_count(&status, MPI_DOUBLE, &count);
                        printf("Process 0 received %d elements of process %d\n",count,i);*/
                        Dmat.addBCSR ( Dblock );
                        Dblock.clear();
                    }
                     
                    if ( i / *dims == 0 ) {
                        MPI_Recv ( &nonzeroes,1,MPI_INT,i,i+4*size,MPI_COMM_WORLD,&status );
                        /*MPI_Get_count(&status, MPI_INT, &count);
                        printf("Process 0 received %d elements of process %d\n",count,i);*/
                        if ( nonzeroes>0 ) {
                            printf ( "Nonzeroes : %d\n ",nonzeroes );
                            Bblock.allocate ( Adim,Dblocks * blocksize,nonzeroes );
                            MPI_Recv ( & ( Bblock.pRows[0] ), Adim + 1, MPI_INT,i,i+5*size,MPI_COMM_WORLD,&status );
                            /*MPI_Get_count(&status, MPI_INT, &count);
                            printf("Process 0 received %d elements of process %d\n",count,i);*/
                            MPI_Recv ( & ( Bblock.pCols[0] ),nonzeroes, MPI_INT,i,i+6*size,MPI_COMM_WORLD,&status );
                            /*MPI_Get_count(&status, MPI_INT, &count);
                            printf("Process 0 received %d elements of process %d\n",count,i);*/
                            MPI_Recv ( & ( Bblock.pData[0] ),nonzeroes, MPI_DOUBLE,i,i+7*size,MPI_COMM_WORLD,&status );
                            /*MPI_Get_count(&status, MPI_DOUBLE, &count);
                            printf("Process 0 received %d elements of process %d\n",count,i);*/
                            Btsparse.addBCSR ( Bblock );
                            Bblock.clear();
                        }
                    }
                }
                //Dmat.writeToFile("D_sparse.csr");
                printf ( "Number of nonzeroes in D: %d\n",Dmat.nonzeros );
                Dmat.reduceSymmetric();
                //Dmat.writeToFile("D_sparse_symm.csr");

                //Btsparse.writeToFile("Btsparse.csr");
                Dmat.changeCols(Ddim);
                Dmat.changeRows(Ddim);
                //Dmat.writeToFile("Dsparse_red.csr");
                Btsparse.changeCols(Ddim);
                create2x2SymBlockMatrix ( Asparse,Btsparse, Dmat, Csparse );
                Btsparse.clear();
                Dmat.clear();
		


                ParDiSO p(-2,0);

                p.init(Csparse, 1);
                p.factorize(Csparse);

                //Csparse.fillSymmetric();
                //Csparse.writeToFilePSelInv(filenameC);
                //Csparse.writeToFile(filenameC);
                Csparse.clear();

                //double* Cdense = new double[Csparse.nrows * Csparse.ncols];
                //CSR2dense(Csparse, Cdense);
                //printdense(Adim+Ddim, Adim+Ddim, Cdense, "C.txt");

                if ( filenameC != NULL )
                    free ( filenameC );
                filenameC=NULL;
            }

            if (iam == 0)
            {
                cout << "\n - C saved in file " << filenameC << "! Exiting... \n\n" << endl; 
                exit(-12345);
            }
        }

        //AB_sol will contain the solution of A*X=B, distributed across the process rows. Processes in the same process row possess the same part of AB_sol
        DESCAB_sol= ( int* ) malloc ( DLEN_ * sizeof ( int ) );
        if ( DESCAB_sol==NULL ) {
            printf ( "unable to allocate memory for descriptor for AB_sol\n" );
            return -1;
        }
        //AB_sol (Adim, Ddim) is distributed across all processes in ICTXT2D starting from process (0,0) into blocks of size (Adim, blocksize)
        descinit_ ( DESCAB_sol, &Adim, &Ddim, &Adim, &blocksize, &i_zero, &i_zero, &ICTXT2D, &Adim, &info );
        if ( info!=0 ) {
            printf ( "Descriptor of matrix C returns info: %d\n",info );
            return info;
        }

        AB_sol= ( double * ) calloc ( Adim * s_B_j,sizeof ( double ) );
	
	blacs_barrier_ ( &ICTXT2D,"A" );

        /********************** TIMING **********************/
        if ( iam == 0 )
            watch.tick ( totaltime );

        if ( iam == 0 )
            watch.tick ( cresctime );

        // Each process calculates the Schur complement of the part of D at its disposal. (see src/schur.cpp)
        // The solution of A * X = B_j is stored in AB_sol (= A^-1 * B_j)

        /*
        char * BT_i_debugFile = new char[100];
        char * B_j_debugFile  = new char[100];

        sprintf(BT_i_debugFile, "BT_i_debug_%d.txt", iam);
        sprintf(B_j_debugFile,  "B_j_debug_%d.txt",  iam);

        BT_i.writeToFile(BT_i_debugFile);
         B_j.writeToFile(B_j_debugFile);
        */

        make_Sij_denseB ( Asparse, BT_i, B_j, s_BT_i, s_B_j, D, lld_D, AB_sol );

        /*
        char * AB_sol_debugFile = new char[100];
        char * D_debugFile      = new char[100];

        sprintf(AB_sol_debugFile, "AB_sol_debug_%d.txt", iam);
        sprintf(D_debugFile,      "D_debug_%d.txt",      iam);

        printDenseDouble(AB_sol_debugFile, ios::out, Drows*blocksize, Dcols*blocksize, AB_sol);
        printDenseDouble(D_debugFile,      ios::out, Ddim,            Ddim,            D);

        cout << iam << " just wrote debug stuff... " << endl;

        */
	blacs_barrier_ ( &ICTXT2D,"ALL" );
	
	if ( iam == 0 )
            watch.tack ( cresctime );


        if ( iam !=0 ) {
            Asparse.clear();
            pardiso_var.clear();
        }

        //BT_i.clear();
        //B_j.clear();

        delete[] BT_i;
        delete[] B_j;



        blacs_barrier_ ( &ICTXT2D,"ALL" );

        /********************** TIMING **********************/
        

        if ( iam == 0 )
            watch.tick ( facsctime );

        //The Schur complement is factorised (by ScaLAPACK)
        pdpotrf_ ( "U",&Ddim,D,&i_one,&i_one,DESCD,&info );
        if ( info != 0 ) {
            printf ( "Cholesky decomposition of D was unsuccessful, error returned: %d\n",info );
            return -1;
        }

        blacs_barrier_ ( &ICTXT2D,"ALL" );

        /********************** TIMING **********************/
        if ( iam == 0 )
            watch.tack ( facsctime );

        if ( iam == 0 )
            watch.tick ( invsctime );

        //The Schur complement is inverteded (by ScaLAPACK)
        pdpotri_ ( "U",&Ddim,D,&i_one,&i_one,DESCD,&info );
        if ( info != 0 ) {
            printf ( "Inverse of D was unsuccessful, error returned: %d\n",info );
            return -1;
        }

        blacs_barrier_ ( &ICTXT2D,"A" );

        /********************** TIMING **********************/
        if ( iam == 0 )
            watch.tack ( invsctime );
	
	InvD_T_Block = ( double* ) calloc ( Dblocks * blocksize + Adim ,sizeof ( double ) );

        if ( iam == 0 )
            watch.tick ( gathrtime );
        

        blacs_barrier_ ( &ICTXT2D,"A" );
        /********************** TIMING **********************/
        if ( iam == 0 )
            watch.tick ( sndrctime );

        //Diagonal elements of the (1,1) block of C^-1 are still distributed and here they are gathered in InvD_T_Block in the root process.
        if ( *position == pcol ) {
            for ( i=0; i<Ddim; ++i ) {
                if ( pcol == ( i/blocksize ) % *dims ) {
                    int Dpos = i%blocksize + ( ( i/blocksize ) / *dims ) * blocksize ;
                    * ( InvD_T_Block + Adim +i ) = * ( D + Dpos + lld_D * Dpos );
                }
            }
            for ( i=0,j=0; i<Dblocks; ++i,++j ) {
                if ( j==*dims )
                    j=0;
                if ( *position==j ) {
                    dgesd2d_ ( &ICTXT2D,&blocksize,&i_one,InvD_T_Block + Adim + i * blocksize,&blocksize,&i_zero,&i_zero );
                }
                if ( *position==0 ) {
                    dgerv2d_ ( &ICTXT2D,&blocksize,&i_one,InvD_T_Block + Adim + blocksize*i,&blocksize,&j,&j );
                }
            }
        }

        blacs_barrier_ ( &ICTXT2D,"A" );
        /********************** TIMING **********************/
        if ( iam == 0 )
            watch.tack ( sndrctime );

        if ( position != NULL ) {
            free ( position );
            position=NULL;
        }

        if ( dims != NULL ) {
            free ( dims );
            dims=NULL;
        }

        //Only the root process performs a selected inversion of A.
        if ( iam==0 ) {

            watch.tick ( invrAtime );

            /*int pardiso_message_level = 1;

            int pardiso_mtype=-2;

            ParDiSO pardiso ( pardiso_mtype, pardiso_message_level );*/

            int number_of_processors = 1;
            char* var = getenv ( "OMP_NUM_THREADS" );
            if ( var != NULL )
                sscanf ( var, "%d", &number_of_processors );
            else {
                printf ( "Set environment OMP_NUM_THREADS to 1" );
                exit ( 1 );
            }

            pardiso_var.iparm[2]  = 2;
            pardiso_var.iparm[3]  = number_of_processors;
            pardiso_var.iparm[8]  = 0;
            pardiso_var.iparm[11] = 1;
            pardiso_var.iparm[13]  = 0;
            pardiso_var.iparm[28]  = 0;

            //This function calculates the factorisation of A once again so this might be optimized.
            pardiso_var.findInverseOfA ( Asparse );

            cout << "Memory allocated by pardiso: " << pardiso_var.memoryAllocated() << endl;

            printf ( "Processor %d inverted matrix A\n",iam );

            watch.tack ( invrAtime );
        }

        blacs_barrier_ ( &ICTXT2D,"A" );


        // To minimize memory usage, and because only the diagonal elements of the inverse are needed, X' * S is calculated row by row
        // the diagonal element is calculated as the dot product of this row and the corresponding column of X. (X is solution of AX=B)
        XSrow= ( double* ) calloc ( Dcols * blocksize,sizeof ( double ) );
        DESCXSROW= ( int* ) malloc ( DLEN_ * sizeof ( int ) );
        if ( DESCXSROW==NULL ) {
            printf ( "unable to allocate memory for descriptor for AB_sol\n" );
            return -1;
        }
        //XSrow (1,Ddim) is distributed acrros processes of ICTXT2D starting from process (0,0) into blocks of size (1,blocksize)
        descinit_ ( DESCXSROW, &i_one, &Ddim, &i_one,&blocksize, &i_zero, &i_zero, &ICTXT2D, &i_one, &info );
        if ( info!=0 ) {
            printf ( "Descriptor of matrix C returns info: %d\n",info );
            return info;
        }

        blacs_barrier_ ( &ICTXT2D,"A" );

        if ( iam == 0 )
            cout << "Calculating diagonal elements of the first block of the inverse... \n" << endl;



        blacs_barrier_ ( &ICTXT2D,"A" );
        /********************** TIMING **********************/
        if ( iam == 0 )
            watch.tick ( dotprtime );


        //Calculating diagonal elements 1 by 1 of the (0,0)-block of C^-1.
        for ( i=1; i<=Adim; ++i ) {
            pdsymm_ ( "R","U",&i_one,&Ddim,&d_one,D,&i_one,&i_one,DESCD,AB_sol,&i,&i_one,DESCAB_sol,&d_zero,XSrow,&i_one,&i_one,DESCXSROW );
            pddot_ ( &Ddim,InvD_T_Block+i-1,AB_sol,&i,&i_one,DESCAB_sol,&Adim,XSrow,&i_one,&i_one,DESCXSROW,&i_one );
        }


        blacs_barrier_ ( &ICTXT2D,"A" );
        /********************** TIMING **********************/
        if ( iam == 0 )
            watch.tack ( dotprtime );



        if ( D!=NULL ) {
            free ( D );
            D=NULL;
        }
        if ( AB_sol!=NULL ) {
            free ( AB_sol );
            AB_sol=NULL;
        }
        if ( XSrow !=NULL ) {
            free ( XSrow );
            XSrow=NULL;
        }
        if ( DESCD!=NULL ) {
            free ( DESCD );
            DESCD=NULL;
        }
        if ( DESCAB_sol!=NULL ) {
            free ( DESCAB_sol );
            DESCAB_sol=NULL;
        }
        if ( DESCXSROW!=NULL ) {
            free ( DESCXSROW );
            DESCXSROW=NULL;
        }


        //Only in the root process we add the diagonal elements of A^-1
        if ( iam ==0 ) {
            for ( i = 0; i < Adim; i++ ) {
                j                  = Asparse.pRows[i];
                * ( InvD_T_Block+i ) += Asparse.pData[j];
            }


            /********************** TIMING **********************/

            watch.tack ( gathrtime );
            watch.tack ( totaltime );


            /*
            //cout << "Extraction completed by ";
            for (i = 0; i < Ddim; i++)
            {
                cout << "Extracting row " << i << "/" << Ddim << endl;
                //cout << setw(3) << std::setfill('0') << int(i*100.0 / (Ddim-1)) << "%" << "\b\b\b\b";

                diagonal[Asparse.nrows + i] = InvD_T_Block[i*Ddim + i];
            }
            cout << endl;
            */


            Asparse.clear();

            /*
            cout << "Extracting diagonal... \n" << endl;
            cout << "Saving diagonal... \n" << endl;

            char* diagOutFile = new char[50];
            sprintf ( diagOutFile, "diag_inverse_C_parallel_%d.txt", size );

            printdense ( Adim+Ddim, 1, InvD_T_Block, diagOutFile );

            delete[] diagOutFile;
            */
        }

        if ( InvD_T_Block !=NULL ) {
            free ( InvD_T_Block );
            InvD_T_Block=NULL;
        }


        if ( iam == 0 ) {


            // Conversion milliseconds -> seconds
            cresctime /= 1000.0;
            facsctime /= 1000.0;
            invsctime /= 1000.0;
            gathrtime /= 1000.0;
            invrAtime /= 1000.0;
            totaltime /= 1000.0;



            cout << "********************************* TIME REPORT ********************************** \n" << endl;
            cout << "                     SCHUR COMPLEMENT      BUILDING: " << cresctime << " seconds" << endl;
            cout << "                     SCHUR COMPLEMENT FACTORIZATION: " << facsctime << " seconds" << endl;
            cout << "                     SCHUR COMPLEMENT     INVERSION: " << invsctime << " seconds" << endl;
            cout << "                                                                                " << endl;
            cout << "         FINAL OPERATIONS (INVERSION OF A INCLUDED): " << gathrtime << " seconds" << endl;
            cout << "                                     INVERSION OF A: " << invrAtime << " seconds" << endl;
            cout << "                                         TOTAL TIME: " << totaltime << " seconds" << endl;
            cout << "******************************************************************************** \n" << endl;

            /*
             * double totaltime = 0.0;     // Total execution time
             * double cresctime = 0.0;     // Schur-complement (total)
             * double facsctime = 0.0;     // Schur-complement factorization
             * double invsctime = 0.0;     // Schur-complement inversion
             * double gathrtime = 0.0;     // Last operationsdouble invrAtime = 0.0;     // Inversion of A
             * */

            char* timingFile = new char[50];
            sprintf ( timingFile, "weak_tests.csv" );

            std::fstream timeF;
            timeF.open ( timingFile, std::fstream::out | std::fstream::app );
            timeF.setf ( ios::scientific, ios::floatfield );

            //timeF << "PROBLEM SIZE: " << Adim/1000 << "k + " << Ddim/1000 << "k" << endl;
            //timeF <<
            //"#PROCS,SCHUR_BUILD,SCHUR_FACT,SCHUR_INV,INV(A),FINAL_OPS,
            //SEND_RECV, DOT_PROD,TOTAL" << endl;
            timeF << size << "," << cresctime << "," << facsctime << "," << invsctime << "," << invrAtime << "," << gathrtime << "," << sndrctime << "," << dotprtime << "," << totaltime << endl;

            timeF.close();
        }


        blacs_barrier_ ( &ICTXT2D,"A" );
        blacs_gridexit_ ( &ICTXT2D );
    }

    //cout << iam << " reached end before MPI_Barrier" << endl;
    MPI_Barrier ( MPI_COMM_WORLD );
    MPI_Finalize();

    return 0;
}
Example #23
0
File: diag.c Project: qsnake/gpaw
int main(int argc, char *argv[]) {

     // Some constants
     int minusone = -1;
     int zero = 0;
     int one = 1;
     double dzero = 0.0; 

     // ConText
     int ConTxt = minusone;

     // order
     char order = 'R';
     char scope = 'A';

     // root process
     int root = zero;

     // BLACS/SCALAPACK parameters
     // the size of the blocks the distributed matrix is split into
     // (applies to both rows and columns)
     int mb = 32;
     int nb = mb; // PDSYEVxxx constraint

     // the number of rows and columns in the processor grid
     // only square processor grids due to C vs. Fortran ordering
     int nprow = 2;
     int npcol = nprow; // only square processor grids, 

     // starting row and column in grid, do not change
     int rsrc = zero; 
     int csrc = zero;

     // dimensions of the matrix to diagonalize
     int m = 1000;
     int n = m; // only square matrices

     int info = zero;

     // Rest of code will only work for:
     // nprow = npcol
     // mb = nb;
     // m = n;
     // rsrc = crsc;

     // Paramteres for Trivial Matrix
     double alpha = 0.1; // off-diagonal
     double beta = 75.0; // diagonal
     
     // For timing:
     double tdiag0, tdiag, ttotal0, ttotal;

     // BLACS Communicator
     MPI_Comm blacs_comm;
     int nprocs;
     int iam;
     int myrow, mycol;

     MPI_Init(&argc, &argv);
     MPI_Barrier(MPI_COMM_WORLD);
     ttotal0 = MPI_Wtime();
     MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
     MPI_Comm_rank(MPI_COMM_WORLD, &iam);

     if (argc > one) {
       nprow = strtod(argv[1],NULL);
       m = strtod(argv[2],NULL);
       npcol = nprow;
       n = m;
     }
    
     if (iam == root) {
       printf("world size %d \n",nprocs);
       printf("n %d \n", n);
       printf("nprow %d \n", nprow);
       printf("npcol %d \n", npcol);
     }

     // We can do this on any subcommunicator.
#ifdef CartComm
     int dim[2];
     int pbc[2];
     dim[0] = nprow;
     dim[1] = npcol;
     pbc[0] = 0;
     pbc[1] = 0;
     MPI_Cart_create(MPI_COMM_WORLD, 2, dim, pbc, 1, &blacs_comm);
#else
     blacs_comm = MPI_COMM_WORLD;
#endif

     // initialize the grid
     // The lines below are equivalent to the one call to:
     if (blacs_comm != MPI_COMM_NULL) {
       ConTxt = Csys2blacs_handle_(blacs_comm);
       Cblacs_gridinit_(&ConTxt, &order, nprow, npcol);

       // get information back about the grid
       Cblacs_gridinfo_(ConTxt, &nprow, &npcol, &myrow, &mycol);
     }

     if (ConTxt != minusone) {

          int desc[9];

          // get the size of the distributed matrix
          int locM = numroc_(&m, &mb, &myrow, &rsrc, &nprow);
          int locN = numroc_(&n, &nb, &mycol, &csrc, &npcol);

	  // printf ("locM = %d \n", locM);
	  // printf ("locN = %d \n", locN);

          int lld = MAX(one,locM);

          // build the descriptor
          descinit_(desc, &m, &n, &mb, &nb, &rsrc, &csrc, &ConTxt, &lld, &info);
          // Allocate arrays
	  // eigenvalues
	  double* eigvals = malloc(n * sizeof(double));

          // allocate the distributed matrices
          double* mata = malloc(locM*locN * sizeof(double));
          // allocate the distributed matrix of eigenvectors
          double* z = malloc(locM*locN * sizeof(double));

          // Eigensolver parameters
          int ibtype = one;
          char jobz = 'V'; // eigenvectors also
          char range = 'A'; // all eiganvalues
          char uplo = 'L'; // work with upper

          double vl, vu;
          int il, iu;

          char cmach = 'U';

          double abstol = 2.0 * pdlamch_(&ConTxt, &cmach);

          int eigvalm, nz;

          double orfac = -1.0;
          //double orfac = 0.001;

          int* ifail;
          ifail = malloc(m * sizeof(int));

          int* iclustr;
          iclustr =  malloc(2*nprow*npcol * sizeof(int));

          double* gap;
          gap =  malloc(nprow*npcol * sizeof(double));

          double* work;
          work = malloc(3 * sizeof(double));
          int querylwork = minusone;
          int* iwork;
          iwork = malloc(1 * sizeof(int));
          int queryliwork = minusone;

          // Build a trivial distributed matrix: Diagonal matrix
	  pdlaset_(&uplo, &m, &n, &alpha, &beta, mata, &one, &one, desc);

	  // First there is a workspace query

          // pdsyevx_(&jobz, &range, &uplo, &n, mata, &one, &one, desc, &vl,
          //          &vu, &il, &iu, &abstol, &eigvalm, &nz, eigvals, &orfac, z, &one,
          //          &one, desc, work, &querylwork, iwork, &queryliwork, ifail, iclustr, gap, &info);
          pdsyevd_(&jobz, &uplo, &n, mata, &one, &one, desc, eigvals,
		   z, &one, &one, desc,
		   work, &querylwork, iwork, &queryliwork, &info);
          //pdsyev_(&jobz, &uplo, &m, mata, &one, &one, desc, eigvals,
          //        z, &one, &one, desc, work, &querylwork, &info);

          int lwork = (int)work[0];
          //printf("lwork %d\n", lwork);
          free(work);
          int liwork = (int)iwork[0];
          //printf("liwork %d\n", liwork);
          free(iwork);

          work = (double*)malloc(lwork * sizeof(double));
          iwork = (int*)malloc(liwork * sizeof(int));

	  // This is actually diagonalizes the matrix
          // pdsyevx_(&jobz, &range, &uplo, &n, mata, &one, &one, desc, &vl,
          //          &vu, &il, &iu, &abstol, &eigvalm, &nz, eigvals, &orfac, z, &one,
          //          &one, desc, work, &lwork, iwork, &liwork, ifail, iclustr, gap, &info);
  
          Cblacs_barrier(ConTxt, &scope);
          tdiag0 = MPI_Wtime();
          pdsyevd_(&jobz, &uplo, &n, mata, &one, &one, desc, eigvals,
                   z, &one, &one, desc,
                   work, &lwork, iwork, &liwork, &info);

          //pdsyev_(&jobz, &uplo, &m, mata, &one, &one, desc, eigvals,
          //        z, &one, &one, desc, work, &lwork, &info);
          Cblacs_barrier(ConTxt, &scope);
          tdiag = MPI_Wtime() - tdiag0;

          free(work);
          free(iwork);
          free(gap);
          free(iclustr);
          free(ifail);
          free(z);
          free(mata);

          // Destroy BLACS grid
          Cblacs_gridexit_(ConTxt);

	  // Check eigenvalues
	  if (myrow == zero && mycol == zero) {
	    for (int i = 0; i < n; i++)
	      {
                if (fabs(eigvals[i] - beta) > 0.0001) 
		    printf("Problem: eigval %d != %f5.2 but %f\n", 
                            i, beta, eigvals[i]);
	      }
	    
	    if (info != zero) {
	      printf("info = %d \n", info);
	    }
	    
	    printf("Time (s) diag: %f\n", tdiag);
	  }

	  free(eigvals);
     }

     MPI_Barrier(MPI_COMM_WORLD);
     ttotal = MPI_Wtime() - ttotal0;
     if (iam == 0)
          printf("Time (s) total: %f\n", ttotal);
     MPI_Finalize();
}
Example #24
0
int main(int argc, char **argv) {

  int ictxt, nside, ngrid, nblock, nthread;
  int rank, size;
  int ic, ir, nc, nr;

  int i, j;

  char *fname;
  
  int info, ZERO=0, ONE=1;

  struct timeval st, et;

  double dtnn, dtnt, dttn, dttt;
  double gfpc_nn, gfpc_nt, gfpc_tn, gfpc_tt;

  /* Initialising MPI stuff */
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  printf("Process %i of %i.\n", rank, size);


  /* Parsing arguments */
  if(argc < 6) {
    exit(-3);
  }

  nside = atoi(argv[1]);
  ngrid = atoi(argv[2]);
  nblock = atoi(argv[3]);
  nthread = atoi(argv[4]);
  fname = argv[5];

  if(rank == 0) {
    printf("Multiplying matrices of size %i x %i\n", nside, nside);
    printf("Process grid size %i x %i\n", ngrid, ngrid);
    printf("Block size %i x %i\n", nblock, nblock);
    printf("Using %i OpenMP threads\n", nthread);
  }



  #ifdef _OPENMP
  if(rank == 0) printf("Setting OMP_NUM_THREADS=%i\n", nthread);
  omp_set_num_threads(nthread);
  #endif

  /* Setting up BLACS */
  Cblacs_pinfo( &rank, &size ) ;
  Cblacs_get(-1, 0, &ictxt );
  Cblacs_gridinit(&ictxt, "Row", ngrid, ngrid);
  Cblacs_gridinfo(ictxt, &nr, &nc, &ir, &ic);

  int descA[9], descB[9], descC[9];

  /* Fetch local array sizes */
  int Ar, Ac, Br, Bc, Cr, Cc;

  Ar = numroc_( &nside, &nblock, &ir, &ZERO, &nr);
  Ac = numroc_( &nside, &nblock, &ic, &ZERO, &nc);

  Br = numroc_( &nside, &nblock, &ir, &ZERO, &nr);
  Bc = numroc_( &nside, &nblock, &ic, &ZERO, &nc);

  Cr = numroc_( &nside, &nblock, &ir, &ZERO, &nr);
  Cc = numroc_( &nside, &nblock, &ic, &ZERO, &nc);

  printf("Local array section %i x %i\n", Ar, Ac);

  /* Set descriptors */
  descinit_(descA, &nside, &nside, &nblock, &nblock, &ZERO, &ZERO, &ictxt, &Ar, &info);
  descinit_(descB, &nside, &nside, &nblock, &nblock, &ZERO, &ZERO, &ictxt, &Br, &info);
  descinit_(descC, &nside, &nside, &nblock, &nblock, &ZERO, &ZERO, &ictxt, &Cr, &info);

  /* Initialise and fill arrays */
  double *A = (double *)malloc(Ar*Ac*sizeof(double));
  double *B = (double *)malloc(Br*Bc*sizeof(double));
  double *C = (double *)malloc(Cr*Cc*sizeof(double));


  for(i = 0; i < Ar; i++) {
    for(j = 0; j < Ac; j++) {
      A[j*Ar + i] = drand48();
      B[j*Br + i] = drand48();
      C[j*Cr + i] = 0.0;
    }
  }

  double alpha = 1.0, beta = 0.0;

  //========================
  
  if(rank == 0) printf("Starting multiplication (NN).\n");

  Cblacs_barrier(ictxt,"A");
  gettimeofday(&st, NULL);

  pdgemm_("N", "N", &nside, &nside, &nside,
	  &alpha,
	  A, &ONE, &ONE, descA,
	  B, &ONE, &ONE, descB,
	  &beta,
	  C, &ONE, &ONE, descC );

  Cblacs_barrier(ictxt,"A");
  gettimeofday(&et, NULL);
  dtnn = (double)((et.tv_sec-st.tv_sec) + (et.tv_usec-st.tv_usec)*1e-6);
  gfpc_nn = 2.0*pow(nside, 3) / (dtnn * 1e9 * ngrid * ngrid * nthread);

  if(rank == 0) printf("Done.\n=========\nTime taken: %g s\nGFlops per core: %g\n=========\n", dtnn, gfpc_nn);

  //========================



  //========================

  if(rank == 0) printf("Starting multiplication (NT).\n");

  Cblacs_barrier(ictxt,"A");
  gettimeofday(&st, NULL);

  pdgemm_("N", "T", &nside, &nside, &nside,
	  &alpha,
	  A, &ONE, &ONE, descA,
	  B, &ONE, &ONE, descB,
	  &beta,
	  C, &ONE, &ONE, descC );

  Cblacs_barrier(ictxt,"A");
  gettimeofday(&et, NULL);
  dtnt = (double)((et.tv_sec-st.tv_sec) + (et.tv_usec-st.tv_usec)*1e-6);
  gfpc_nt = 2.0*pow(nside, 3) / (dtnt * 1e9 * ngrid * ngrid * nthread);

  if(rank == 0) printf("Done.\n=========\nTime taken: %g s\nGFlops per core: %g\n=========\n", dtnt, gfpc_nt);

  //========================



  //========================

  if(rank == 0) printf("Starting multiplication (TN).\n");

  Cblacs_barrier(ictxt,"A");
  gettimeofday(&st, NULL);

  pdgemm_("T", "N", &nside, &nside, &nside,
	  &alpha,
	  A, &ONE, &ONE, descA,
	  B, &ONE, &ONE, descB,
	  &beta,
	  C, &ONE, &ONE, descC );

  Cblacs_barrier(ictxt,"A");
  gettimeofday(&et, NULL);
  dttn = (double)((et.tv_sec-st.tv_sec) + (et.tv_usec-st.tv_usec)*1e-6);
  gfpc_tn = 2.0*pow(nside, 3) / (dttn * 1e9 * ngrid * ngrid * nthread);

  if(rank == 0) printf("Done.\n=========\nTime taken: %g s\nGFlops per core: %g\n=========\n", dttn, gfpc_tn);

  //========================



  //========================

  if(rank == 0) printf("Starting multiplication (TT).\n");

  Cblacs_barrier(ictxt,"A");
  gettimeofday(&st, NULL);

  pdgemm_("T", "T", &nside, &nside, &nside,
	  &alpha,
	  A, &ONE, &ONE, descA,
	  B, &ONE, &ONE, descB,
	  &beta,
	  C, &ONE, &ONE, descC );

  Cblacs_barrier(ictxt,"A");
  gettimeofday(&et, NULL);
  dttt = (double)((et.tv_sec-st.tv_sec) + (et.tv_usec-st.tv_usec)*1e-6);
  gfpc_tt = 2.0*pow(nside, 3) / (dttt * 1e9 * ngrid * ngrid * nthread);

  if(rank == 0) printf("Done.\n=========\nTime taken: %g s\nGFlops per core: %g\n=========\n", dttt, gfpc_tt);

  //========================




  if(rank == 0) {
    FILE * fd;
    fd = fopen(fname, "w");
    fprintf(fd, "%g %g %g %g %i %i %i %i %g %g %g %g\n", gfpc_nn, gfpc_nt, gfpc_tn, gfpc_tt, nside, ngrid, nblock, nthread, dtnn, dtnt, dttn, dttt);
    fclose(fd);
  }

  Cblacs_gridexit( 0 );
  MPI_Finalize();

}