void blacs_gridinit_nektar(int *BLACS_PARAMS, int *DESCA, int *DESCB){
  int i,j,k;

  /*
   BLACS_PARAMS:
   [0]  = ictxt;
   [1]  = my_proc;
   [2]  = total_procs;
   [3]  = Nproc_row;
   [4]  = Nproc_col;
   [5]  = my_row;
   [6]  = my_col;
   [7]  = Global_rows;
   [8]  = Global_columns;
   [9]  = Block_Size_row;
   [10] = Block_Size_col;
   [11] = LOC_rows;
   [12] = LOC_columns;
   */


   blacs_pinfo_( i, j);
   BLACS_PARAMS[1] = i;
   BLACS_PARAMS[2] = j;

   blacs_get_( -1, 0,  k);   // <- LG check the arguments of this call
   BLACS_PARAMS[0] = k;

   i = BLACS_PARAMS[3];
   j = BLACS_PARAMS[4];
   blacs_gridinit_( BLACS_PARAMS[0], "Row", i, j);

   blacs_gridinfo_( BLACS_PARAMS[0] ,BLACS_PARAMS[3], BLACS_PARAMS[4], i, j);
   BLACS_PARAMS[5] = i;
   BLACS_PARAMS[6] = j;

   i = 0;
   BLACS_PARAMS[11] = numroc_(BLACS_PARAMS[7],BLACS_PARAMS[9], BLACS_PARAMS[5],i,BLACS_PARAMS[3]);
   BLACS_PARAMS[12] = numroc_(BLACS_PARAMS[8],BLACS_PARAMS[10],BLACS_PARAMS[6],i,BLACS_PARAMS[4]);

   i = 0;
   j = 0;
   descinit_(DESCA, BLACS_PARAMS[7], BLACS_PARAMS[8],
                    BLACS_PARAMS[9], BLACS_PARAMS[10],
                    i, j,
                    BLACS_PARAMS[0],BLACS_PARAMS[11],
                    k);
   if (k != 0)
     fprintf(stderr,"blacs_gridinit_nektar: ERROR, descinit(info) = %d \n",k);

   descinit_(DESCB, BLACS_PARAMS[7], 1,
                    BLACS_PARAMS[9], 1,
                    i, j,
                    BLACS_PARAMS[0],BLACS_PARAMS[11],
                    k);
  if (k != 0)
     fprintf(stderr,"blacs_gridinit_nektar: ERROR, descinit(info) = %d \n",k);


}
Example #2
0
int main(int argc, char **argv)
{
    #define test_A(i,j) test_A[(size_t)(j)*N+(i)]
    #define test_A2(i,j) test_A2[(size_t)(j)*N+(i)]
    int N,NB,w,LDA,BB;
    size_t memsize; //bytes
    int iam, nprocs, mydevice;
    int ICTXT, nprow, npcol, myprow, mypcol;
    int i_one = 1, i_zero = 0, i_negone = -1;
    double d_one = 1.0, d_zero = 0.0, d_negone = -1.0;
    int IASEED = 100;
/*  printf("N=?\n");
    scanf("%ld",&N);
    printf("NB=?\n");
    scanf("%d", &NB);
    printf("width of Y panel=?\n");
    scanf("%ld",&w);
*/
    if(argc < 4){
        printf("invalid arguments N NB memsize(M)\n");
        exit(1);
    }
    N = atoi(argv[1]);
    NB = atoi(argv[2]);
    memsize = (size_t)atoi(argv[3])*1024*1024;
    BB = (N + NB - 1) / NB;
    w = memsize/sizeof(double)/BB/NB/NB - 1;
    assert(w > 0);
    LDA = N + 0; //padding

    int do_io = (N <= NSIZE);
    double llttime;
    double gflops;
    
    nprow = npcol = 1;
    blacs_pinfo_(&iam, &nprocs);
    blacs_get_(&i_negone, &i_zero, &ICTXT);
    blacs_gridinit_(&ICTXT, "R", &nprow, &npcol);
    blacs_gridinfo_(&ICTXT, &nprow, &npcol, &myprow, &mypcol);
    #ifdef USE_MIC
        #ifdef __INTEL_OFFLOAD
            printf("offload compilation enabled\ninitialize each MIC\n");
            offload_init(&iam, &mydevice);
            #pragma offload target(mic:0)
            {
                mkl_peak_mem_usage(MKL_PEAK_MEM_ENABLE);
            }
        #else
            if(isroot)
                printf("offload compilation not enabled\n");
            exit(0);
        #endif
    #else
        #ifdef USE_CUBLASV2
        {
            cublasStatus_t cuStatus;
            for(int r = 0; r < OOC_NTHREADS; r++){
                cuStatus = cublasCreate(&worker_handle[r]);
                assert(cuStatus == CUBLAS_STATUS_SUCCESS);
            }
        }
        #else
            cublasInit();
        #endif
    #endif

    double *test_A = (double*)memalign(64,(size_t)LDA*N*sizeof(double)); // for chol
#ifdef VERIFY
    double *test_A2 = (double*)memalign(64,(size_t)LDA*N*sizeof(double)); // for verify
#endif
    
    /*Initialize A */
    int i,j;
    printf("Initialize A ... "); fflush(stdout);
    llttime = MPI_Wtime();
    pdmatgen(&ICTXT, "Symm", "Diag", &N,
         &N, &NB, &NB,
         test_A, &LDA, &i_zero, &i_zero,
         &IASEED, &i_zero, &N, &i_zero, &N,
         &myprow, &mypcol, &nprow, &npcol); 
    llttime = MPI_Wtime() - llttime;
    printf("time %lf\n", llttime);
              
    /*print test_A*/
    if(do_io){
        printf("Original A=\n\n");
        matprint(test_A, N, LDA, 'A');
    }

    /*Use directed unblocked Cholesky factorization*/    
    /*
    t1 = clock();
    Test_dpotrf(test_A2,N);
    t2 = clock();
    printf ("time for unblocked Cholesky factorization on host %f \n",
        ((float) (t2 - t1)) / CLOCKS_PER_SEC);
    */
    
    /*print test_A*/
    /*
    if(do_io){
        printf("Unblocked result:\n\n");
        matprint(test_A2,N,'L');   
    }
    */ 

    /*Use tile algorithm*/
    Quark *quark = QUARK_New(OOC_NTHREADS);
    QUARK_DOT_DAG_Enable(quark, 0);
    #ifdef USE_MIC
//      mklmem(NB);
        printf("QUARK MIC affinity binding\n");
        QUARK_bind(quark);
        printf("offload warm up\n");
        warmup(quark);
    #endif
    QUARK_DOT_DAG_Enable(quark, quark_getenv_int("QUARK_DOT_DAG_ENABLE", 0));
    printf("LLT start %lf\n", MPI_Wtime());
    llttime = Cholesky(quark,test_A,N,NB,LDA,memsize);
    printf("LLT end %lf\n", MPI_Wtime());
    QUARK_Delete(quark);
    #ifdef USE_MIC
        offload_destroy();
    #else
        #ifdef USE_CUBLASV2
        {
            cublasStatus_t cuStatus;
            for(int r = 0; r < OOC_NTHREADS; r++){ 
                cuStatus = cublasDestroy(worker_handle[r]);
                assert(cuStatus == CUBLAS_STATUS_SUCCESS);
            }
        }
        #else
            cublasShutdown();
        #endif
    #endif

    gflops = (double) N;
    gflops = gflops/3.0 + 0.5;
    gflops = gflops*(double)(N)*(double)(N);
    gflops = gflops/llttime/1024.0/1024.0/1024.0;
    printf ("N NB memsize(MB) quark_pthreads time Gflops\n%d %d %lf %d %lf %lf\n",
        N, NB, (double)memsize/1024/1024, OOC_NTHREADS, llttime, gflops);
    #ifdef USE_MIC
        #pragma offload target(mic:0)
        {
            memsize = mkl_peak_mem_usage(MKL_PEAK_MEM_RESET);
        }
        printf("mkl_peak_mem_usage %lf MB\n", (double)memsize/1024.0/1024.0);
    #endif

    /*Update and print L*/             
    if(do_io){
        printf("L:\n\n");
        matprint(test_A,N,LDA,'L');
    }
#ifdef VERIFY
    printf("Verify... ");
    llttime = MPI_Wtime();
  /*
   * ------------------------
   * check difference betwen 
   * test_A and test_A2
   * ------------------------
   */
    /*
    {
    double maxerr = 0;
    double maxerr2 = 0;

    for (j = 0; j < N; j++)
      {
        for (i = j; i < N; i++)
          {
            double err = (test_A (i, j) - test_A2 (i, j));
            err = ABS (err);
            maxerr = MAX (err, maxerr);
            maxerr2 = maxerr2 + err * err;
          };
      };
    maxerr2 = sqrt (ABS (maxerr2));
    printf ("max difference between test_A and test_A2 %lf \n", maxerr);
    printf ("L2 difference between test_A and test_A2 %lf \n", maxerr2);
    };
    */

  /*
   * ------------------
   * over-write test_A2
   * ------------------
   */
   
    pdmatgen(&ICTXT, "Symm", "Diag", &N,
         &N, &NB, &NB,
         test_A2, &LDA, &i_zero,
         &i_zero, &IASEED, &i_zero, &N, &i_zero, &N,
         &myprow, &mypcol, &nprow, &npcol);

  /*
   * ---------------------------------------
   * after solve, test_A2 should be identity
   * ---------------------------------------
   */
  // test_A = chol(B) = L;
  // test_A2 = B
  // solve L*L'*X = B
  // if L is correct, X is identity */
     
    {
    int uplo = 'L';
    const char *uplo_char = ((uplo == (int) 'U')
                    || (uplo == (int) 'u')) ? "U" : "L";
    int info = 0;
    int nrhs = N;
    int LDA = N;
    int ldb = N;
    dpotrs(uplo_char, &N, &nrhs, test_A, &LDA, test_A2, &ldb, &info);
    assert (info == 0);
    }

    {
    double maxerr = 0;
    double maxerr2 = 0;

    for (j = 0; j < N; j++)
      {
        for (i = 0; i < N; i++)
          {
            double eyeij = (i == j) ? 1.0 : 0.0;
            double err = (test_A2 (i, j) - eyeij);
            err = ABS (err);
            maxerr = MAX (maxerr, err);
            maxerr2 = maxerr2 + err * err;
          };
      };

    maxerr2 = sqrt (ABS (maxerr2));
    printf("time %lf\n", MPI_Wtime() - llttime);
    printf ("max error %lf \n", maxerr);
    printf ("max L2 error %lf \n", maxerr2);
    }
#endif

    free(test_A);test_A=NULL;
#ifdef VERIFY
    free(test_A2);test_A2=NULL;
#endif
    blacs_gridexit_(&ICTXT);
    blacs_exit_(&i_zero);
    return 0;
    #undef test_A
    #undef test_A2
}
Example #3
0
int main(int argc, char **argv) {
    int info, i, j, pcol, Adim;
    double *D;
    int *DESCD;
    CSRdouble BT_i, B_j, Xsparse, Zsparse, Btsparse;

    /*BT_i.allocate(0,0,0);
    B_j.allocate(0,0,0);
    Xsparse.allocate(0,0,0);
    Zsparse.allocate(0,0,0);
    Btsparse.allocate(0,0,0);*/

    //Initialise MPI and some MPI-variables
    info = MPI_Init ( &argc, &argv );
    if ( info != 0 ) {
        printf ( "Error in MPI initialisation: %d\n",info );
        return info;
    }

    position= ( int* ) calloc ( 2,sizeof ( int ) );
    if ( position==NULL ) {
        printf ( "unable to allocate memory for processor position coordinate\n" );
        return EXIT_FAILURE;
    }

    dims= ( int* ) calloc ( 2,sizeof ( int ) );
    if ( dims==NULL ) {
        printf ( "unable to allocate memory for grid dimensions coordinate\n" );
        return EXIT_FAILURE;
    }

    //BLACS is the interface used by PBLAS and ScaLAPACK on top of MPI

    blacs_pinfo_ ( &iam,&size ); 				//determine the number of processes involved
    info=MPI_Dims_create ( size, 2, dims );			//determine the best 2D cartesian grid with the number of processes
    if ( info != 0 ) {
        printf ( "Error in MPI creation of dimensions: %d\n",info );
        return info;
    }

    //Until now the code can only work with square process grids
    //So we try to get the biggest square grid possible with the number of processes involved
    if (*dims != *(dims+1)) {
        while (*dims * *dims > size)
            *dims -=1;
        *(dims+1)= *dims;
        if (iam==0)
            printf("WARNING: %d processor(s) unused due to reformatting to a square process grid\n", size - (*dims * *dims));
        size = *dims * *dims;
        //cout << "New size of process grid: " << size << endl;
    }

    blacs_get_ ( &i_negone,&i_zero,&ICTXT2D );

    //Initialisation of the BLACS process grid, which is referenced as ICTXT2D
    blacs_gridinit_ ( &ICTXT2D,"R",dims, dims+1 );

    if (iam < size) {

        //The rank (iam) of the process is mapped to a 2D grid: position= (process row, process column)
        blacs_pcoord_ ( &ICTXT2D,&iam,position, position+1 );
        if ( *position ==-1 ) {
            printf ( "Error in proces grid\n" );
            return -1;
        }

        //Filenames, dimensions of all matrices and other important variables are read in as global variables (see src/readinput.cpp)
        info=read_input ( *++argv );
        if ( info!=0 ) {
            printf ( "Something went wrong when reading input file for processor %d\n",iam );
            return -1;
        }

        //blacs_barrier is used to stop any process of going beyond this point before all processes have made it up to this point.
        blacs_barrier_ ( &ICTXT2D,"ALL" );
        if ( * ( position+1 ) ==0 && *position==0 )
            printf ( "Reading of input-file succesful\n" );

        if ( * ( position+1 ) ==0 && *position==0 ) {
            printf("\nA linear mixed model with %d observations, %d genotypes, %d random effects and %d fixed effects\n", n,k,m,l);
            printf("was analyzed using %d (%d x %d) processors\n",size,*dims,*(dims+1));
        }

        //Dimension of A (sparse matrix) is the number of fixed effects(m) + the sparse random effects (l)
        Adim=m+l;

        //Dimension of D (dense matrix) is the number of dense effects (k)
        Ddim=k;

        pcol= * ( position+1 );

        //Define number of blocks needed to store a complete column/row of D
        Dblocks= Ddim%blocksize==0 ? Ddim/blocksize : Ddim/blocksize +1;

        //Define the number of rowblocks needed by the current process to store its part of the dense matrix D
        Drows= ( Dblocks - *position ) % *dims == 0 ? ( Dblocks- *position ) / *dims : ( Dblocks- *position ) / *dims +1;
        Drows= Drows<1? 1 : Drows;

        //Define the number of columnblocks needed by the current process to store its part of the dense matrix D
        Dcols= ( Dblocks - pcol ) % * ( dims+1 ) == 0 ? ( Dblocks- pcol ) / * ( dims+1 ) : ( Dblocks- pcol ) / * ( dims+1 ) +1;
        Dcols=Dcols<1? 1 : Dcols;

        //Define the local leading dimension of D (keeping in mind that matrices are always stored column-wise)
        lld_D=Drows*blocksize;

        //Initialise the descriptor of the dense distributed matrix
        DESCD= ( int* ) malloc ( DLEN_ * sizeof ( int ) );
        if ( DESCD==NULL ) {
            printf ( "unable to allocate memory for descriptor for C\n" );
            return -1;
        }

        //D with dimensions (Ddim,Ddim) is distributed over all processes in ICTXT2D, with the first element in process (0,0)
        //D is distributed into blocks of size (blocksize,blocksize), having a local leading dimension lld_D in this specific process
        descinit_ ( DESCD, &Ddim, &Ddim, &blocksize, &blocksize, &i_zero, &i_zero, &ICTXT2D, &lld_D, &info );
        if ( info!=0 ) {
            printf ( "Descriptor of matrix C returns info: %d\n",info );
            return info;
        }

        //Allocate the space necessary to store the part of D that is held into memory of this process.
        D = ( double* ) calloc ( Drows * blocksize * Dcols * blocksize,sizeof ( double ) );
        if ( D==NULL ) {
            printf ( "unable to allocate memory for Matrix D  (required: %ld bytes)\n", Drows * blocksize * Dcols * blocksize * sizeof ( double ) );
            return EXIT_FAILURE;
        }

        blacs_barrier_ ( &ICTXT2D,"ALL" );
        if (iam==0)
            printf ( "Start set up of B & D\n" );

        blacs_barrier_ ( &ICTXT2D,"ALL" );
        //set_up_BD is declared in readdist.cpp and constructs the parts of matrices B & D in each processor
        //which are necessary to create the distributed Schur complement of D
        info = set_up_BD ( DESCD, D, BT_i, B_j, Btsparse );

        //printdense(Drows*blocksize, Dcols * blocksize,D,"matrix_D.txt");

        blacs_barrier_ ( &ICTXT2D,"ALL" );
        if (iam==0)
            printf ( "Matrices B & D set up\n" );

        if(printD_bool) {

            int array_of_gsizes[2], array_of_distribs[2], array_of_dargs[2], array_of_psize[2] ;
            int buffersize;
            MPI_Datatype file_type;
            MPI_File fh;
            MPI_Status status;
            array_of_gsizes[0]=Dblocks * blocksize;
            array_of_gsizes[1]=Dblocks * blocksize;
            array_of_distribs[0]=MPI_DISTRIBUTE_CYCLIC;
            array_of_distribs[1]=MPI_DISTRIBUTE_CYCLIC;
            array_of_dargs[0]=blocksize;
            array_of_dargs[1]=blocksize;
            array_of_psize[0]=*dims;
            array_of_psize[1]=*(dims + 1);

            MPI_Type_create_darray(size,iam,2,array_of_gsizes, array_of_distribs,
                                   array_of_dargs, array_of_psize, MPI_ORDER_FORTRAN,
                                   MPI_DOUBLE, &file_type);
            MPI_Type_commit(&file_type);
            info = MPI_File_open(MPI_COMM_WORLD, filenameD,
                                 MPI_MODE_CREATE | MPI_MODE_WRONLY,
                                 MPI_INFO_NULL, &fh);
            /*if ( ( Drows-1 ) % *(dims+1) == *position && ( Dcols-1 ) % *(dims) == pcol && Ddim%blocksize !=0 )
                buffersize=((Drows-1) * blocksize + Ddim % blocksize) * ((Dcols-1) * blocksize + Ddim % blocksize);
            else if ( ( Drows-1 ) % *(dims+1) == *position && Ddim%blocksize !=0 )
                buffersize=((Drows-1) * blocksize + Ddim % blocksize) * Dcols * blocksize;
            else if ( ( Dcols-1 ) % *(dims) == *position && Ddim%blocksize !=0 )
                buffersize=((Dcols-1) * blocksize + Ddim % blocksize) * Drows * blocksize;
            else*/
            buffersize= Dcols * Drows * blocksize * blocksize;

            MPI_File_set_view(fh, 0, MPI_DOUBLE, file_type, "native", MPI_INFO_NULL);
            info =MPI_File_write_all(fh, D,buffersize, MPI_DOUBLE,
                                     &status);
	    MPI_File_close(&fh);
            if(iam==0) {
                printf("Matrix D (dimension %d) is printed in file %s\n", Dblocks*blocksize,filenameD);
            }
            if(filenameD != NULL)
                free(filenameD);
            filenameD=NULL;
            //delete[] array_of_gsizes, delete[] array_of_distribs, delete[] array_of_dargs, delete[] array_of_psize;
        }



        //Now every matrix has to set up the sparse matrix A, consisting of X'X, X'Z, Z'X and Z'Z + lambda*I
        Xsparse.loadFromFile ( filenameX );
        Zsparse.loadFromFile ( filenameZ );

        if(filenameX != NULL)
            free(filenameX);
        filenameX=NULL;
        if(filenameZ != NULL)
            free(filenameZ);
        filenameZ=NULL;

        smat_t *X_smat, *Z_smat;

        X_smat= (smat_t *) calloc(1,sizeof(smat_t));
        Z_smat= (smat_t *) calloc(1,sizeof(smat_t));

        X_smat = smat_new_from ( Xsparse.nrows,Xsparse.ncols,Xsparse.pRows,Xsparse.pCols,Xsparse.pData,0,0 );
        Z_smat = smat_new_from ( Zsparse.nrows,Zsparse.ncols,Zsparse.pRows,Zsparse.pCols,Zsparse.pData,0,0 );

        smat_t *Xt_smat, *Zt_smat;
        Xt_smat= (smat_t *) calloc(1,sizeof(smat_t));
        Zt_smat= (smat_t *) calloc(1,sizeof(smat_t));
        Xt_smat = smat_copy_trans ( X_smat );
        Zt_smat = smat_copy_trans ( Z_smat );

        CSRdouble Asparse;
        smat_t *XtX_smat, *XtZ_smat, *ZtZ_smat, *lambda_smat, *ZtZlambda_smat;

        XtX_smat= (smat_t *) calloc(1,sizeof(smat_t));
        XtZ_smat= (smat_t *) calloc(1,sizeof(smat_t));
        ZtZ_smat= (smat_t *) calloc(1,sizeof(smat_t));


        XtX_smat = smat_matmul ( Xt_smat, X_smat );
        XtZ_smat = smat_matmul ( Xt_smat, Z_smat );
        ZtZ_smat = smat_matmul ( Zt_smat,Z_smat );

        Xsparse.clear();
        Zsparse.clear();
        smat_free(Xt_smat);
        smat_free(Zt_smat);
        /*smat_free(X_smat);
        smat_free(Z_smat);*/

        CSRdouble Imat;

        makeIdentity ( l, Imat );

        lambda_smat= (smat_t *) calloc(1,sizeof(smat_t));

        lambda_smat = smat_new_from ( Imat.nrows,Imat.ncols,Imat.pRows,Imat.pCols,Imat.pData,0,0 );

        smat_scale_diag ( lambda_smat, -lambda );

        ZtZlambda_smat= (smat_t *) calloc(1,sizeof(smat_t));

        ZtZlambda_smat = smat_add ( lambda_smat, ZtZ_smat );

        smat_free(ZtZ_smat);
        //smat_free(lambda_smat);


        smat_to_symmetric_structure ( XtX_smat );
        smat_to_symmetric_structure ( ZtZlambda_smat );

        CSRdouble XtX_sparse, XtZ_sparse, ZtZ_sparse;

        XtX_sparse.make2 ( XtX_smat->m,XtX_smat->n,XtX_smat->nnz,XtX_smat->ia,XtX_smat->ja,XtX_smat->a );
        XtZ_sparse.make2 ( XtZ_smat->m,XtZ_smat->n,XtZ_smat->nnz,XtZ_smat->ia,XtZ_smat->ja,XtZ_smat->a );
        ZtZ_sparse.make2 ( ZtZlambda_smat->m,ZtZlambda_smat->n,ZtZlambda_smat->nnz,ZtZlambda_smat->ia,ZtZlambda_smat->ja,ZtZlambda_smat->a );

        /*smat_free(XtX_smat);
        smat_free(XtZ_smat);
        smat_free(ZtZlambda_smat);*/
        Imat.clear();

        if (iam==0) {
            cout << "***                                           [  t     t  ] *** " << endl;
            cout << "***                                           [ X X   X Z ] *** " << endl;
            cout << "***                                           [           ] *** " << endl;
            cout << "*** G e n e r a t i n g    m a t r i x    A = [           ] *** " << endl;
            cout << "***                                           [  t     t  ] *** " << endl;
            cout << "***                                           [ Z X   Z Z ] *** " << endl;
        }

        //Sparse matrix A only contains the upper triangular part of A
        create2x2SymBlockMatrix ( XtX_sparse, XtZ_sparse, ZtZ_sparse, Asparse );
        //Asparse.writeToFile("A_sparse.csr");

        smat_free(XtX_smat);
        smat_free(XtZ_smat);
        smat_free(ZtZlambda_smat);
        XtX_sparse.clear();
        XtZ_sparse.clear();
        ZtZ_sparse.clear();

        blacs_barrier_ ( &ICTXT2D,"ALL" );

        if(printsparseC_bool) {
            CSRdouble Dmat, Dblock, Csparse;
            Dblock.nrows=Dblocks * blocksize;
            Dblock.ncols=Dblocks * blocksize;
            Dblock.allocate(Dblocks * blocksize, Dblocks * blocksize, 0);
            Dmat.allocate(0,0,0);
            for (i=0; i<Drows; ++i) {
                for(j=0; j<Dcols; ++j) {
                    dense2CSR_sub(D + i * blocksize + j * lld_D * blocksize,blocksize,blocksize,lld_D,Dblock,( * ( dims) * i + *position ) *blocksize,
                                  ( * ( dims+1 ) * j + pcol ) *blocksize);
                    if ( Dblock.nonzeros>0 ) {
                        if ( Dmat.nonzeros==0 ) {
                            Dmat.make2 ( Dblock.nrows,Dblock.ncols,Dblock.nonzeros,Dblock.pRows,Dblock.pCols,Dblock.pData );
                        }
                        else {
                            Dmat.addBCSR ( Dblock );
                        }
                    }

                    Dblock.clear();
                }
            }
            blacs_barrier_(&ICTXT2D,"A");
            if ( iam!=0 ) {
                //Each process other than root sends its Dmat to the root process.
                MPI_Send ( & ( Dmat.nonzeros ),1, MPI_INT,0,iam,MPI_COMM_WORLD );
                MPI_Send ( & ( Dmat.pRows[0] ),Dmat.nrows + 1, MPI_INT,0,iam+size,MPI_COMM_WORLD );
                MPI_Send ( & ( Dmat.pCols[0] ),Dmat.nonzeros, MPI_INT,0,iam+2*size,MPI_COMM_WORLD );
                MPI_Send ( & ( Dmat.pData[0] ),Dmat.nonzeros, MPI_DOUBLE,0,iam+3*size,MPI_COMM_WORLD );
                Dmat.clear();
            }
            else {
                for ( i=1; i<size; ++i ) {
                    // The root process receives parts of Dmat sequentially from all processes and directly adds them together.
                    int nonzeroes, count;
                    MPI_Recv ( &nonzeroes,1,MPI_INT,i,i,MPI_COMM_WORLD,&status );
                    /*MPI_Get_count(&status, MPI_INT, &count);
                    printf("Process 0 received %d elements of process %d\n",count,i);*/
                    if(nonzeroes>0) {
                        printf("Nonzeroes : %d\n ",nonzeroes);
                        Dblock.allocate ( Dblocks * blocksize,Dblocks * blocksize,nonzeroes );
                        MPI_Recv ( & ( Dblock.pRows[0] ), Dblocks * blocksize + 1, MPI_INT,i,i+size,MPI_COMM_WORLD,&status );
                        /*MPI_Get_count(&status, MPI_INT, &count);
                        printf("Process 0 received %d elements of process %d\n",count,i);*/
                        MPI_Recv ( & ( Dblock.pCols[0] ),nonzeroes, MPI_INT,i,i+2*size,MPI_COMM_WORLD,&status );
                        /*MPI_Get_count(&status, MPI_INT, &count);
                        printf("Process 0 received %d elements of process %d\n",count,i);*/
                        MPI_Recv ( & ( Dblock.pData[0] ),nonzeroes, MPI_DOUBLE,i,i+3*size,MPI_COMM_WORLD,&status );
                        /*MPI_Get_count(&status, MPI_DOUBLE, &count);
                        printf("Process 0 received %d elements of process %d\n",count,i);*/
                        Dmat.addBCSR ( Dblock );
                    }
                }
                //Dmat.writeToFile("D_sparse.csr");
                Dmat.reduceSymmetric();
                Btsparse.transposeIt(1);
                create2x2SymBlockMatrix(Asparse,Btsparse, Dmat, Csparse);
                Btsparse.clear();
                Dmat.clear();
                Csparse.writeToFile(filenameC);
                Csparse.clear();
                if(filenameC != NULL)
                    free(filenameC);
                filenameC=NULL;
            }
        }
        Btsparse.clear();
        blacs_barrier_(&ICTXT2D,"A");

        //AB_sol will contain the solution of A*X=B, distributed across the process rows. Processes in the same process row possess the same part of AB_sol
        double * AB_sol;
        int * DESCAB_sol;
        DESCAB_sol= ( int* ) malloc ( DLEN_ * sizeof ( int ) );
        if ( DESCAB_sol==NULL ) {
            printf ( "unable to allocate memory for descriptor for AB_sol\n" );
            return -1;
        }
        //AB_sol (Adim, Ddim) is distributed across all processes in ICTXT2D starting from process (0,0) into blocks of size (Adim, blocksize)
        descinit_ ( DESCAB_sol, &Adim, &Ddim, &Adim, &blocksize, &i_zero, &i_zero, &ICTXT2D, &Adim, &info );
        if ( info!=0 ) {
            printf ( "Descriptor of matrix C returns info: %d\n",info );
            return info;
        }

        AB_sol=(double *) calloc(Adim * Dcols*blocksize,sizeof(double));

        // Each process calculates the Schur complement of the part of D at its disposal. (see src/schur.cpp)
        // The solution of A * Y = B_j is stored in AB_sol (= A^-1 * B_j)
        blacs_barrier_(&ICTXT2D,"A");
        make_Sij_parallel_denseB ( Asparse, BT_i, B_j, D, lld_D, AB_sol );
        BT_i.clear();
        B_j.clear();

        //From here on the Schur complement S of D is stored in D

        blacs_barrier_ ( &ICTXT2D,"ALL" );

        //The Schur complement is factorised (by ScaLAPACK)
        pdpotrf_ ( "U",&k,D,&i_one,&i_one,DESCD,&info );
        if ( info != 0 ) {
            printf ( "Cholesky decomposition of D was unsuccessful, error returned: %d\n",info );
            return -1;
        }

        //From here on the factorization of the Schur complement S is stored in D

        blacs_barrier_ ( &ICTXT2D,"ALL" );

        //The Schur complement is inverted (by ScaLAPACK)
        pdpotri_ ( "U",&k,D,&i_one,&i_one,DESCD,&info );
        if ( info != 0 ) {
            printf ( "Inverse of D was unsuccessful, error returned: %d\n",info );
            return -1;
        }

        //From here on the inverse of the Schur complement S is stored in D

        blacs_barrier_(&ICTXT2D,"A");

        double* InvD_T_Block = ( double* ) calloc ( Dblocks * blocksize + Adim ,sizeof ( double ) );

        //Diagonal elements of the (1,1) block of C^-1 are still distributed and here they are gathered in InvD_T_Block in the root process.
        if(*position == pcol) {
            for (i=0; i<Ddim; ++i) {
                if (pcol == (i/blocksize) % *dims) {
                    int Dpos = i%blocksize + ((i/blocksize) / *dims) * blocksize ;
                    *(InvD_T_Block + Adim +i) = *( D + Dpos + lld_D * Dpos);
                }
            }
            for ( i=0,j=0; i<Dblocks; ++i,++j ) {
                if ( j==*dims )
                    j=0;
                if ( *position==j ) {
                    dgesd2d_ ( &ICTXT2D,&blocksize,&i_one,InvD_T_Block + Adim + i * blocksize,&blocksize,&i_zero,&i_zero );
                }
                if ( *position==0 ) {
                    dgerv2d_ ( &ICTXT2D,&blocksize,&i_one,InvD_T_Block + Adim + blocksize*i,&blocksize,&j,&j );
                }
            }
        }

        blacs_barrier_(&ICTXT2D,"A");

        //Only the root process performs a selected inversion of A.
        if (iam==0) {

            int pardiso_message_level = 1;

            int pardiso_mtype=-2;

            ParDiSO pardiso ( pardiso_mtype, pardiso_message_level );
            int number_of_processors = 1;
            char* var = getenv("OMP_NUM_THREADS");
            if(var != NULL) {
                sscanf( var, "%d", &number_of_processors );
            }
            else {
                printf("Set environment OMP_NUM_THREADS to 1");
                exit(1);
            }

            pardiso.iparm[2]  = 2;
            pardiso.iparm[3]  = number_of_processors;
            pardiso.iparm[8]  = 0;
            pardiso.iparm[11] = 1;
            pardiso.iparm[13]  = 0;
            pardiso.iparm[28]  = 0;

            //This function calculates the factorisation of A once again so this might be optimized.
            pardiso.findInverseOfA ( Asparse );

            printf("Processor %d inverted matrix A\n",iam);
        }
        blacs_barrier_(&ICTXT2D,"A");

        // To minimize memory usage, and because only the diagonal elements of the inverse are needed, Y' * S is calculated row by rowblocks
        // the diagonal element is calculates as the dot product of this row and the corresponding column of Y. (Y is solution of AY=B)
        double* YSrow= ( double* ) calloc ( Dcols * blocksize,sizeof ( double ) );
        int * DESCYSROW;
        DESCYSROW= ( int* ) malloc ( DLEN_ * sizeof ( int ) );
        if ( DESCYSROW==NULL ) {
            printf ( "unable to allocate memory for descriptor for AB_sol\n" );
            return -1;
        }
        //YSrow (1,Ddim) is distributed across processes of ICTXT2D starting from process (0,0) into blocks of size (1,blocksize)
        descinit_ ( DESCYSROW, &i_one, &Ddim, &i_one,&blocksize, &i_zero, &i_zero, &ICTXT2D, &i_one, &info );
        if ( info!=0 ) {
            printf ( "Descriptor of matrix C returns info: %d\n",info );
            return info;
        }

        blacs_barrier_(&ICTXT2D,"A");

        //Calculating diagonal elements 1 by 1 of the (0,0)-block of C^-1.
        for (i=1; i<=Adim; ++i) {
            pdsymm_ ("R","U",&i_one,&Ddim,&d_one,D,&i_one,&i_one,DESCD,AB_sol,&i,&i_one,DESCAB_sol,&d_zero,YSrow,&i_one,&i_one,DESCYSROW);
            pddot_(&Ddim,InvD_T_Block+i-1,AB_sol,&i,&i_one,DESCAB_sol,&Adim,YSrow,&i_one,&i_one,DESCYSROW,&i_one);
            /*if(*position==1 && pcol==1)
            printf("Dot product in process (1,1) is: %g\n", *(InvD_T_Block+i-1));
            if(*position==0 && pcol==1)
            printf("Dot product in process (0,1) is: %g\n",*(InvD_T_Block+i-1));*/
        }
        blacs_barrier_(&ICTXT2D,"A");
        if(YSrow != NULL)
            free(YSrow);
        YSrow = NULL;
        if(DESCYSROW != NULL)
            free(DESCYSROW);
        DESCYSROW = NULL;
        if(AB_sol != NULL)
            free(AB_sol);
        AB_sol = NULL;
        if(DESCAB_sol != NULL)
            free(DESCAB_sol);
        DESCAB_sol = NULL;
        if(D != NULL)
            free(D);
        D = NULL;
        if(DESCD != NULL)
            free(DESCD);
        DESCD = NULL;

        //Only in the root process we add the diagonal elements of A^-1
        if (iam ==0) {
            for(i=0; i<Adim; ++i) {
                j=Asparse.pRows[i];
                *(InvD_T_Block+i) += Asparse.pData[j];
            }
            Asparse.clear();
            printdense ( Adim+k,1,InvD_T_Block,"diag_inverse_C_parallel.txt" );
        }
        if(InvD_T_Block != NULL)
            free(InvD_T_Block);
        InvD_T_Block = NULL;
	blacs_gridexit_(&ICTXT2D);
    }
    //cout << iam << " reached end before MPI_Barrier" << endl;
    MPI_Barrier(MPI_COMM_WORLD);
    //MPI_Finalize();

    return 0;
}
Example #4
0
int main()
{
	const MKL_INT m = 1000;
	const MKL_INT k = 100000;
	const MKL_INT n = 1000;
	const MKL_INT nb = 100;
	const MKL_INT nprow = 2;
	const MKL_INT npcol = 2;

    MKL_INT iam, nprocs, ictxt, myrow, mycol;
    MDESC   descA, descB, descC, descA_local, descB_local, descC_local;
	MKL_INT info;
	MKL_INT a_m_local, a_n_local, b_m_local, b_n_local, c_m_local, c_n_local;
	MKL_INT a_lld, b_lld, c_lld;

    blacs_pinfo_( &iam, &nprocs );
    blacs_get_( &i_negone, &i_zero, &ictxt );
    blacs_gridinit_( &ictxt, "R", &nprow, &npcol );
    blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol );

    double *a = 0;
    double *b = 0;
	double *c = 0;

    if (iam==0)
    {
		a = gen_a(m, k);
		b = gen_b(k, n);
		c = (double*)calloc(m*n, sizeof(double));

		puts("a=");
		print(a, m, k);

		puts("b=");
		print(b, k, n);
    }

    a_m_local = numroc_( &m, &nb, &myrow, &i_zero, &nprow );
    a_n_local = numroc_( &k, &nb, &mycol, &i_zero, &npcol );

	b_m_local = numroc_( &k, &nb, &myrow, &i_zero, &nprow );
	b_n_local = numroc_( &n, &nb, &mycol, &i_zero, &npcol );

    c_m_local = numroc_( &m, &nb, &myrow, &i_zero, &nprow );
	c_n_local = numroc_( &n, &nb, &mycol, &i_zero, &npcol );

    double *A = (double*) calloc( a_m_local * a_n_local, sizeof( double ) );
    double *B = (double*) calloc( b_m_local * b_n_local, sizeof( double ) );
    double *C = (double*) calloc( c_m_local * c_n_local, sizeof( double ) );

    a_lld = MAX( a_m_local, 1 );
	b_lld = MAX( b_m_local, 1 );
	c_lld = MAX( c_m_local, 1 );

	if (iam==0)
	{
			printf("a_m_local = %d\ta_n_local = %d\tb_m_local = %d\tb_n_local = %d\tc_m_local = %d\tc_n_local = %d\n", a_m_local, a_n_local, b_m_local, b_n_local,
							c_m_local, c_n_local);
			printf("a_lld = %d\tb_lld = %d\tc_lld = %d\n", a_lld, b_lld, c_lld);
	}

    descinit_( descA_local, &m, &k, &m, &k, &i_zero, &i_zero, &ictxt, &m, &info );
    descinit_( descB_local, &k, &n, &k, &n, &i_zero, &i_zero, &ictxt, &k, &info );
    descinit_( descC_local, &m, &n, &m, &n, &i_zero, &i_zero, &ictxt, &m, &info );

    descinit_( descA, &m, &k, &nb, &nb, &i_zero, &i_zero, &ictxt, &a_lld, &info );
    descinit_( descB, &k, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &b_lld, &info );
    descinit_( descC, &m, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &c_lld, &info );

	printf("Rank %d: start distribute data\n", iam);
    pdgeadd_( &trans, &m, &k, &one, a, &i_one, &i_one, descA_local, &zero, A, &i_one, &i_one, descA );
    pdgeadd_( &trans, &k, &n, &one, b, &i_one, &i_one, descB_local, &zero, B, &i_one, &i_one, descB );
	printf("Rank %d: finished distribute data\n", iam);

	if (iam==0)
	{
			puts("a");
			print(A, a_m_local, a_n_local);
			puts("b");
			print(B, b_m_local, b_n_local);
	}

    pdgemm_( "N", "N", &m, &n, &k, &one, A, &i_one, &i_one, descA, B, &i_one, &i_one, descB,
             &zero, C, &i_one, &i_one, descC );
	printf("Rank %d: finished dgemm\n", iam);
	if (iam == 0)
	{
			puts("c");
			print(C, c_m_local, c_n_local);
	}

	pdgeadd_( &trans, &m, &n, &one, C, &i_one, &i_one, descC, &zero, c, &i_one, &i_one, descC_local);

	if (iam==0)
	{
			puts("global c");
			print(c, m, n);
	}

	free(A);
	free(B);
	free(C);
	if (iam==0)
	{
			free(a);
			free(b);
			free(c);
	}

    blacs_gridexit_( &ictxt );
    blacs_exit_( &i_zero );
}
Example #5
0
int main ( int argc, char **argv ) {
    int info, i, j, pcol;
    double *D, *AB_sol, *InvD_T_Block, *XSrow;
    int *DESCD, *DESCAB_sol, *DESCXSROW;
    bool readAFromFile = false;
    //CSRdouble BT_i, B_j;
    double* BT_i;
    double* B_j;
    int s_BT_i = 0;
    int s_B_j  = 0;              // size of BT_i (#rows) and B_j (#cols)
    int nx, ny, nz;
    CSRdouble Asparse, Btsparse;

    if (argc < 6) 
    {
        // needed args: nx, ny, nz, Ddim, blocksize; 
        // optional: C.csr - if provided, the output file is written.
        cout << "Too few arguments." << endl;
        cout << "Usage: " << argv[0] << "   nx      ny     nz  Ddim blocksize [C.csr]" << endl;
        cout << "Usage: " << argv[0] << " -fileA filename Adim Ddim blocksize [C.csr]" << endl;
        exit(-1);
    }

    if (strcmp(argv[1], "-fileA") == 0)   // if (argv[1] == "-fileA")
    {
        readAFromFile = true;

        filenameA = new char[250];
        strcpy(filenameA, argv[2]);

        Adim = atoi(argv[3]);
        
    }
    else
    {
        nx   = atoi(argv[1]);
        ny   = atoi(argv[2]);
        nz   = atoi(argv[3]);
        Adim = nx * ny * nz;
    }

    Ddim      = atoi(argv[4]);
    blocksize = atoi(argv[5]);



    //printf("Adim=%d, Ddim=%d, blocksize=%d *** %s", Adim, Ddim, blocksize, outputc);
    //exit(-123);

    //Initialise MPI and some MPI-variables
    info = MPI_Init ( &argc, &argv );
    if ( info != 0 ) {
        printf ( "Error in MPI initialisation: %d\n",info );
        return info;
    }

    position= ( int* ) calloc ( 2,sizeof ( int ) );
    if ( position==NULL ) {
        printf ( "unable to allocate memory for processor position coordinate\n" );
        return EXIT_FAILURE;
    }

    dims= ( int* ) calloc ( 2,sizeof ( int ) );
    if ( dims==NULL ) {
        printf ( "unable to allocate memory for grid dimensions coordinate\n" );
        return EXIT_FAILURE;
    }

    //BLACS is the interface used by PBLAS and ScaLAPACK on top of MPI

    blacs_pinfo_ ( &iam,&size ); 				//determine the number of processes involved


    info=MPI_Dims_create ( size, 2, dims );			//determine the best 2D cartesian grid with the number of processes
    if ( info != 0 ) {
        printf ( "Error in MPI creation of dimensions: %d\n",info );
        return info;
    }
//Until now the code can only work with square process grids
    //So we try to get the biggest square grid possible with the number of processes involved
    if ( *dims != * ( dims+1 ) ) {
        while ( *dims * *dims > size )
            *dims -=1;
        * ( dims+1 ) = *dims;
        if ( iam==0 )
            printf ( "WARNING: %d processor(s) unused due to reformatting to a square process grid\n", size - ( *dims * *dims ) );
        size = *dims * *dims;
        //cout << "New size of process grid: " << size << endl;
    }

    blacs_get_ ( &i_negone,&i_zero,&ICTXT2D );

    //Initialisation of the BLACS process grid, which is referenced as ICTXT2D
    blacs_gridinit_ ( &ICTXT2D,"R",dims, dims+1 );



    if ( iam < size ) {

        //The rank (iam) of the process is mapped to a 2D grid: position= (process row, process column)
        blacs_pcoord_ ( &ICTXT2D,&iam,position, position+1 );
        if ( *position ==-1 ) {
            printf ( "Error in proces grid\n" );
            return -1;
        }


        

        /*
        //Filenames, dimensions of all matrices and other important variables are read in as global variables (see src/readinput.cpp)
        
        info=read_input ( *++argv );
        if ( info!=0 ) 
        {
            printf ( "Something went wrong when reading input file for processor %d\n",iam );
            return -1;
        }
        */
        

        //blacs_barrier is used to stop any process of going beyond this point before all processes have made it up to this point.
        blacs_barrier_ ( &ICTXT2D,"ALL" );

        if ( * ( position+1 ) ==0 && *position==0 )
            printf ( "Reading of input-file succesful\n" );

        if ( * ( position+1 ) ==0 && *position==0 ) {
            printf ( "\nA sparse square matrix of dimension %d with a dense square submatrix with dimension %d \n", Adim+Ddim,Ddim );
            printf ( "was analyzed using %d (%d x %d) processors\n",size,*dims,* ( dims+1 ) );
        }

        pcol= * ( position+1 );

        //Define number of blocks needed to store a complete column/row of D
        Dblocks= Ddim%blocksize==0 ? Ddim/blocksize : Ddim/blocksize +1;

        //Define the number of rowblocks needed by the current process to store its part of the dense matrix D
        Drows= ( Dblocks - *position ) % *dims == 0 ? ( Dblocks- *position ) / *dims : ( Dblocks- *position ) / *dims +1;
        Drows= Drows<1? 1 : Drows;

        //Define the number of columnblocks needed by the current process to store its part of the dense matrix D
        Dcols= ( Dblocks - pcol ) % * ( dims+1 ) == 0 ? ( Dblocks- pcol ) / * ( dims+1 ) : ( Dblocks- pcol ) / * ( dims+1 ) +1;
        Dcols=Dcols<1? 1 : Dcols;

        //Define the local leading dimension of D (keeping in mind that matrices are always stored column-wise)
        lld_D=Drows*blocksize;

        // cout << "Hi! I am " << iam << ". My position is ( " << *position << "," << *(position+1) << ") and I have... Dblocks: " << Dblocks << ";   Drows: " << Drows << ";   Dcols: " << Dcols << ";   blocksize: " << blocksize << endl;

        //Initialise the descriptor of the dense distributed matrix
        DESCD= ( int* ) malloc ( DLEN_ * sizeof ( int ) );
        if ( DESCD==NULL ) {
            printf ( "unable to allocate memory for descriptor for C\n" );
            return -1;
        }

        //D with dimensions (Ddim,Ddim) is distributed over all processes in ICTXT2D, with the first element in process (0,0)
        //D is distributed into blocks of size (blocksize,blocksize), having a local leading dimension lld_D in this specific process
        descinit_ ( DESCD, &Ddim, &Ddim, &blocksize, &blocksize, &i_zero, &i_zero, &ICTXT2D, &lld_D, &info );
        if ( info!=0 ) {
            printf ( "Descriptor of matrix C returns info: %d\n",info );
            return info;
        }

        //Allocate the space necessary to store the part of D that is held into memory of this process.
        D = ( double* ) calloc ( Drows * blocksize * Dcols * blocksize,sizeof ( double ) );
        if ( D==NULL ) {
            printf ( "unable to allocate memory for Matrix D  (required: %ld bytes)\n", Drows * blocksize * Dcols * blocksize * sizeof ( double ) );
            return EXIT_FAILURE;
        }


        blacs_barrier_ ( &ICTXT2D,"ALL" ); //added

        B_j  = new double[Adim * Dcols * blocksize];
        BT_i = new double[Adim * Drows * blocksize];

        //read_in_BD ( DESCD,D, BT_i, B_j, Btsparse ) ;
        if ( iam == 0 )
            cout << "Generating A, B and D... \n" << endl;
        generate_BD(D, BT_i, B_j, &s_BT_i, &s_B_j);

        cout << "- B, D generated." << endl;

        //Now every process has to read in the sparse matrix A

        //makeDiagonalPerturbD(Adim, 1000.0, 1e-10, Asparse); cout << "A is a pert. diag." << endl;
        //makeRandCSRUpper(Adim, 0.001, Asparse);
        //cout << "nnz(A) = " << Asparse.nonzeros << endl;
        //Asparse.loadFromFileSym("/users/drosos/simple/matrices/NornePrimaryJacobian.csr");

        if (readAFromFile)
        {
            Asparse.loadFromFile(filenameA);
            cout << "A loaded from file" << endl;
            //Asparse.reduceSymmetric();
        }
        else
        {
            make3DLaplace(nx, ny, nz, Asparse);
            cout << "A is Laplacian" << endl;
            shiftIndices(Asparse, -1);
        }
        cout << "- A generated." << endl;
        Asparse.matrixType = SYMMETRIC;

        assert(Asparse.nrows == Adim);
        assert(Asparse.ncols == Adim);

        //if (iam == 0) Asparse.writeToFile("A_debug.csr"); exit(-1234);
	

        if (argc == 7)        // if the name of the output file for C is given as parameter
        {
            filenameC = new char[250];
            //strcpy(filenameC, argv[6]);
            sprintf(filenameC, "/scratch/daint/verbof/sparsedense/C_%d_%d.csr", Adim, Ddim);

            CSRdouble Dmat, Dblock, Csparse, Bblock;
            Dblock.nrows=Dblocks * blocksize;
            Dblock.ncols=Dblocks * blocksize;
            Dblock.allocate ( Dblocks * blocksize, Dblocks * blocksize, 0 );
            Dmat.allocate ( 0,0,0 );
            for ( i=0; i<Drows; ++i ) {
                for ( j=0; j<Dcols; ++j ) {
                    dense2CSR_sub ( D + i * blocksize + j * lld_D * blocksize,blocksize,blocksize,lld_D,Dblock, ( * ( dims ) * i + *position ) *blocksize,
                                    ( * ( dims+1 ) * j + pcol ) *blocksize );
                    if ( Dblock.nonzeros>0 ) {
                        if ( Dmat.nonzeros==0 ) {
                            Dmat.make2 ( Dblock.nrows,Dblock.ncols,Dblock.nonzeros,Dblock.pRows,Dblock.pCols,Dblock.pData );
                        } else {
                            Dmat.addBCSR ( Dblock );
                        }
                    }

                    Dblock.clear();
                }
            }
            if ( *position==0 ) {
                Bblock.nrows=Adim;
                Bblock.ncols=Dblocks * blocksize;
                Bblock.allocate ( Adim, Dblocks * blocksize, 0 );
                Btsparse.allocate ( 0,0,0 );
                for ( j=0; j<Dcols; ++j ) {
                    dense2CSR_sub ( B_j + j * Adim * blocksize,Adim,blocksize,Adim,Bblock,0, ( * ( dims+1 ) * j + pcol ) *blocksize );
                    if ( Bblock.nonzeros>0 ) {
                        if ( Btsparse.nonzeros==0 ) {
                            Btsparse.make2 ( Bblock.nrows,Bblock.ncols,Bblock.nonzeros,Bblock.pRows,Bblock.pCols,Bblock.pData );
                        } else {
                            Btsparse.addBCSR ( Bblock );
                        }
                    }

                    Bblock.clear();
                }

            }
            blacs_barrier_ ( &ICTXT2D,"A" );
            if ( iam!=0 ) {
                //Each process other than root sends its Dmat to the root process.
                MPI_Send ( & ( Dmat.nonzeros ),1, MPI_INT,0,iam,MPI_COMM_WORLD );
                MPI_Send ( & ( Dmat.pRows[0] ),Dmat.nrows + 1, MPI_INT,0,iam+size,MPI_COMM_WORLD );
                MPI_Send ( & ( Dmat.pCols[0] ),Dmat.nonzeros, MPI_INT,0,iam+2*size,MPI_COMM_WORLD );
                MPI_Send ( & ( Dmat.pData[0] ),Dmat.nonzeros, MPI_DOUBLE,0,iam+3*size,MPI_COMM_WORLD );
                Dmat.clear();
                if ( *position==0 ) {
                    MPI_Send ( & ( Btsparse.nonzeros ),1, MPI_INT,0,iam+4*size,MPI_COMM_WORLD );
                    MPI_Send ( & ( Btsparse.pRows[0] ),Btsparse.nrows + 1, MPI_INT,0,iam+5*size,MPI_COMM_WORLD );
                    MPI_Send ( & ( Btsparse.pCols[0] ),Btsparse.nonzeros, MPI_INT,0,iam+6*size,MPI_COMM_WORLD );
                    MPI_Send ( & ( Btsparse.pData[0] ),Btsparse.nonzeros, MPI_DOUBLE,0,iam+7*size,MPI_COMM_WORLD );
                    Btsparse.clear();
                }
            } else {
	     
		//Btsparse.writeToFile("Btsparse_pre.csr");
                for ( i=1; i<size; ++i ) {
                    // The root process receives parts of Dmat sequentially from all processes and directly adds them together.
                    int nonzeroes, count;
                    MPI_Recv ( &nonzeroes,1,MPI_INT,i,i,MPI_COMM_WORLD,&status );
                    /*MPI_Get_count(&status, MPI_INT, &count);
                    printf("Process 0 received %d elements of process %d\n",count,i);*/
                    if ( nonzeroes>0 ) {
                        printf ( "Nonzeroes : %d\n ",nonzeroes );
                        Dblock.allocate ( Dblocks * blocksize,Dblocks * blocksize,nonzeroes );
                        MPI_Recv ( & ( Dblock.pRows[0] ), Dblocks * blocksize + 1, MPI_INT,i,i+size,MPI_COMM_WORLD,&status );
                        /*MPI_Get_count(&status, MPI_INT, &count);
                        printf("Process 0 received %d elements of process %d\n",count,i);*/
                        MPI_Recv ( & ( Dblock.pCols[0] ),nonzeroes, MPI_INT,i,i+2*size,MPI_COMM_WORLD,&status );
                        /*MPI_Get_count(&status, MPI_INT, &count);
                        printf("Process 0 received %d elements of process %d\n",count,i);*/
                        MPI_Recv ( & ( Dblock.pData[0] ),nonzeroes, MPI_DOUBLE,i,i+3*size,MPI_COMM_WORLD,&status );
                        /*MPI_Get_count(&status, MPI_DOUBLE, &count);
                        printf("Process 0 received %d elements of process %d\n",count,i);*/
                        Dmat.addBCSR ( Dblock );
                        Dblock.clear();
                    }
                     
                    if ( i / *dims == 0 ) {
                        MPI_Recv ( &nonzeroes,1,MPI_INT,i,i+4*size,MPI_COMM_WORLD,&status );
                        /*MPI_Get_count(&status, MPI_INT, &count);
                        printf("Process 0 received %d elements of process %d\n",count,i);*/
                        if ( nonzeroes>0 ) {
                            printf ( "Nonzeroes : %d\n ",nonzeroes );
                            Bblock.allocate ( Adim,Dblocks * blocksize,nonzeroes );
                            MPI_Recv ( & ( Bblock.pRows[0] ), Adim + 1, MPI_INT,i,i+5*size,MPI_COMM_WORLD,&status );
                            /*MPI_Get_count(&status, MPI_INT, &count);
                            printf("Process 0 received %d elements of process %d\n",count,i);*/
                            MPI_Recv ( & ( Bblock.pCols[0] ),nonzeroes, MPI_INT,i,i+6*size,MPI_COMM_WORLD,&status );
                            /*MPI_Get_count(&status, MPI_INT, &count);
                            printf("Process 0 received %d elements of process %d\n",count,i);*/
                            MPI_Recv ( & ( Bblock.pData[0] ),nonzeroes, MPI_DOUBLE,i,i+7*size,MPI_COMM_WORLD,&status );
                            /*MPI_Get_count(&status, MPI_DOUBLE, &count);
                            printf("Process 0 received %d elements of process %d\n",count,i);*/
                            Btsparse.addBCSR ( Bblock );
                            Bblock.clear();
                        }
                    }
                }
                //Dmat.writeToFile("D_sparse.csr");
                printf ( "Number of nonzeroes in D: %d\n",Dmat.nonzeros );
                Dmat.reduceSymmetric();
                //Dmat.writeToFile("D_sparse_symm.csr");

                //Btsparse.writeToFile("Btsparse.csr");
                Dmat.changeCols(Ddim);
                Dmat.changeRows(Ddim);
                //Dmat.writeToFile("Dsparse_red.csr");
                Btsparse.changeCols(Ddim);
                create2x2SymBlockMatrix ( Asparse,Btsparse, Dmat, Csparse );
                Btsparse.clear();
                Dmat.clear();
		


                ParDiSO p(-2,0);

                p.init(Csparse, 1);
                p.factorize(Csparse);

                //Csparse.fillSymmetric();
                //Csparse.writeToFilePSelInv(filenameC);
                //Csparse.writeToFile(filenameC);
                Csparse.clear();

                //double* Cdense = new double[Csparse.nrows * Csparse.ncols];
                //CSR2dense(Csparse, Cdense);
                //printdense(Adim+Ddim, Adim+Ddim, Cdense, "C.txt");

                if ( filenameC != NULL )
                    free ( filenameC );
                filenameC=NULL;
            }

            if (iam == 0)
            {
                cout << "\n - C saved in file " << filenameC << "! Exiting... \n\n" << endl; 
                exit(-12345);
            }
        }

        //AB_sol will contain the solution of A*X=B, distributed across the process rows. Processes in the same process row possess the same part of AB_sol
        DESCAB_sol= ( int* ) malloc ( DLEN_ * sizeof ( int ) );
        if ( DESCAB_sol==NULL ) {
            printf ( "unable to allocate memory for descriptor for AB_sol\n" );
            return -1;
        }
        //AB_sol (Adim, Ddim) is distributed across all processes in ICTXT2D starting from process (0,0) into blocks of size (Adim, blocksize)
        descinit_ ( DESCAB_sol, &Adim, &Ddim, &Adim, &blocksize, &i_zero, &i_zero, &ICTXT2D, &Adim, &info );
        if ( info!=0 ) {
            printf ( "Descriptor of matrix C returns info: %d\n",info );
            return info;
        }

        AB_sol= ( double * ) calloc ( Adim * s_B_j,sizeof ( double ) );
	
	blacs_barrier_ ( &ICTXT2D,"A" );

        /********************** TIMING **********************/
        if ( iam == 0 )
            watch.tick ( totaltime );

        if ( iam == 0 )
            watch.tick ( cresctime );

        // Each process calculates the Schur complement of the part of D at its disposal. (see src/schur.cpp)
        // The solution of A * X = B_j is stored in AB_sol (= A^-1 * B_j)

        /*
        char * BT_i_debugFile = new char[100];
        char * B_j_debugFile  = new char[100];

        sprintf(BT_i_debugFile, "BT_i_debug_%d.txt", iam);
        sprintf(B_j_debugFile,  "B_j_debug_%d.txt",  iam);

        BT_i.writeToFile(BT_i_debugFile);
         B_j.writeToFile(B_j_debugFile);
        */

        make_Sij_denseB ( Asparse, BT_i, B_j, s_BT_i, s_B_j, D, lld_D, AB_sol );

        /*
        char * AB_sol_debugFile = new char[100];
        char * D_debugFile      = new char[100];

        sprintf(AB_sol_debugFile, "AB_sol_debug_%d.txt", iam);
        sprintf(D_debugFile,      "D_debug_%d.txt",      iam);

        printDenseDouble(AB_sol_debugFile, ios::out, Drows*blocksize, Dcols*blocksize, AB_sol);
        printDenseDouble(D_debugFile,      ios::out, Ddim,            Ddim,            D);

        cout << iam << " just wrote debug stuff... " << endl;

        */
	blacs_barrier_ ( &ICTXT2D,"ALL" );
	
	if ( iam == 0 )
            watch.tack ( cresctime );


        if ( iam !=0 ) {
            Asparse.clear();
            pardiso_var.clear();
        }

        //BT_i.clear();
        //B_j.clear();

        delete[] BT_i;
        delete[] B_j;



        blacs_barrier_ ( &ICTXT2D,"ALL" );

        /********************** TIMING **********************/
        

        if ( iam == 0 )
            watch.tick ( facsctime );

        //The Schur complement is factorised (by ScaLAPACK)
        pdpotrf_ ( "U",&Ddim,D,&i_one,&i_one,DESCD,&info );
        if ( info != 0 ) {
            printf ( "Cholesky decomposition of D was unsuccessful, error returned: %d\n",info );
            return -1;
        }

        blacs_barrier_ ( &ICTXT2D,"ALL" );

        /********************** TIMING **********************/
        if ( iam == 0 )
            watch.tack ( facsctime );

        if ( iam == 0 )
            watch.tick ( invsctime );

        //The Schur complement is inverteded (by ScaLAPACK)
        pdpotri_ ( "U",&Ddim,D,&i_one,&i_one,DESCD,&info );
        if ( info != 0 ) {
            printf ( "Inverse of D was unsuccessful, error returned: %d\n",info );
            return -1;
        }

        blacs_barrier_ ( &ICTXT2D,"A" );

        /********************** TIMING **********************/
        if ( iam == 0 )
            watch.tack ( invsctime );
	
	InvD_T_Block = ( double* ) calloc ( Dblocks * blocksize + Adim ,sizeof ( double ) );

        if ( iam == 0 )
            watch.tick ( gathrtime );
        

        blacs_barrier_ ( &ICTXT2D,"A" );
        /********************** TIMING **********************/
        if ( iam == 0 )
            watch.tick ( sndrctime );

        //Diagonal elements of the (1,1) block of C^-1 are still distributed and here they are gathered in InvD_T_Block in the root process.
        if ( *position == pcol ) {
            for ( i=0; i<Ddim; ++i ) {
                if ( pcol == ( i/blocksize ) % *dims ) {
                    int Dpos = i%blocksize + ( ( i/blocksize ) / *dims ) * blocksize ;
                    * ( InvD_T_Block + Adim +i ) = * ( D + Dpos + lld_D * Dpos );
                }
            }
            for ( i=0,j=0; i<Dblocks; ++i,++j ) {
                if ( j==*dims )
                    j=0;
                if ( *position==j ) {
                    dgesd2d_ ( &ICTXT2D,&blocksize,&i_one,InvD_T_Block + Adim + i * blocksize,&blocksize,&i_zero,&i_zero );
                }
                if ( *position==0 ) {
                    dgerv2d_ ( &ICTXT2D,&blocksize,&i_one,InvD_T_Block + Adim + blocksize*i,&blocksize,&j,&j );
                }
            }
        }

        blacs_barrier_ ( &ICTXT2D,"A" );
        /********************** TIMING **********************/
        if ( iam == 0 )
            watch.tack ( sndrctime );

        if ( position != NULL ) {
            free ( position );
            position=NULL;
        }

        if ( dims != NULL ) {
            free ( dims );
            dims=NULL;
        }

        //Only the root process performs a selected inversion of A.
        if ( iam==0 ) {

            watch.tick ( invrAtime );

            /*int pardiso_message_level = 1;

            int pardiso_mtype=-2;

            ParDiSO pardiso ( pardiso_mtype, pardiso_message_level );*/

            int number_of_processors = 1;
            char* var = getenv ( "OMP_NUM_THREADS" );
            if ( var != NULL )
                sscanf ( var, "%d", &number_of_processors );
            else {
                printf ( "Set environment OMP_NUM_THREADS to 1" );
                exit ( 1 );
            }

            pardiso_var.iparm[2]  = 2;
            pardiso_var.iparm[3]  = number_of_processors;
            pardiso_var.iparm[8]  = 0;
            pardiso_var.iparm[11] = 1;
            pardiso_var.iparm[13]  = 0;
            pardiso_var.iparm[28]  = 0;

            //This function calculates the factorisation of A once again so this might be optimized.
            pardiso_var.findInverseOfA ( Asparse );

            cout << "Memory allocated by pardiso: " << pardiso_var.memoryAllocated() << endl;

            printf ( "Processor %d inverted matrix A\n",iam );

            watch.tack ( invrAtime );
        }

        blacs_barrier_ ( &ICTXT2D,"A" );


        // To minimize memory usage, and because only the diagonal elements of the inverse are needed, X' * S is calculated row by row
        // the diagonal element is calculated as the dot product of this row and the corresponding column of X. (X is solution of AX=B)
        XSrow= ( double* ) calloc ( Dcols * blocksize,sizeof ( double ) );
        DESCXSROW= ( int* ) malloc ( DLEN_ * sizeof ( int ) );
        if ( DESCXSROW==NULL ) {
            printf ( "unable to allocate memory for descriptor for AB_sol\n" );
            return -1;
        }
        //XSrow (1,Ddim) is distributed acrros processes of ICTXT2D starting from process (0,0) into blocks of size (1,blocksize)
        descinit_ ( DESCXSROW, &i_one, &Ddim, &i_one,&blocksize, &i_zero, &i_zero, &ICTXT2D, &i_one, &info );
        if ( info!=0 ) {
            printf ( "Descriptor of matrix C returns info: %d\n",info );
            return info;
        }

        blacs_barrier_ ( &ICTXT2D,"A" );

        if ( iam == 0 )
            cout << "Calculating diagonal elements of the first block of the inverse... \n" << endl;



        blacs_barrier_ ( &ICTXT2D,"A" );
        /********************** TIMING **********************/
        if ( iam == 0 )
            watch.tick ( dotprtime );


        //Calculating diagonal elements 1 by 1 of the (0,0)-block of C^-1.
        for ( i=1; i<=Adim; ++i ) {
            pdsymm_ ( "R","U",&i_one,&Ddim,&d_one,D,&i_one,&i_one,DESCD,AB_sol,&i,&i_one,DESCAB_sol,&d_zero,XSrow,&i_one,&i_one,DESCXSROW );
            pddot_ ( &Ddim,InvD_T_Block+i-1,AB_sol,&i,&i_one,DESCAB_sol,&Adim,XSrow,&i_one,&i_one,DESCXSROW,&i_one );
        }


        blacs_barrier_ ( &ICTXT2D,"A" );
        /********************** TIMING **********************/
        if ( iam == 0 )
            watch.tack ( dotprtime );



        if ( D!=NULL ) {
            free ( D );
            D=NULL;
        }
        if ( AB_sol!=NULL ) {
            free ( AB_sol );
            AB_sol=NULL;
        }
        if ( XSrow !=NULL ) {
            free ( XSrow );
            XSrow=NULL;
        }
        if ( DESCD!=NULL ) {
            free ( DESCD );
            DESCD=NULL;
        }
        if ( DESCAB_sol!=NULL ) {
            free ( DESCAB_sol );
            DESCAB_sol=NULL;
        }
        if ( DESCXSROW!=NULL ) {
            free ( DESCXSROW );
            DESCXSROW=NULL;
        }


        //Only in the root process we add the diagonal elements of A^-1
        if ( iam ==0 ) {
            for ( i = 0; i < Adim; i++ ) {
                j                  = Asparse.pRows[i];
                * ( InvD_T_Block+i ) += Asparse.pData[j];
            }


            /********************** TIMING **********************/

            watch.tack ( gathrtime );
            watch.tack ( totaltime );


            /*
            //cout << "Extraction completed by ";
            for (i = 0; i < Ddim; i++)
            {
                cout << "Extracting row " << i << "/" << Ddim << endl;
                //cout << setw(3) << std::setfill('0') << int(i*100.0 / (Ddim-1)) << "%" << "\b\b\b\b";

                diagonal[Asparse.nrows + i] = InvD_T_Block[i*Ddim + i];
            }
            cout << endl;
            */


            Asparse.clear();

            /*
            cout << "Extracting diagonal... \n" << endl;
            cout << "Saving diagonal... \n" << endl;

            char* diagOutFile = new char[50];
            sprintf ( diagOutFile, "diag_inverse_C_parallel_%d.txt", size );

            printdense ( Adim+Ddim, 1, InvD_T_Block, diagOutFile );

            delete[] diagOutFile;
            */
        }

        if ( InvD_T_Block !=NULL ) {
            free ( InvD_T_Block );
            InvD_T_Block=NULL;
        }


        if ( iam == 0 ) {


            // Conversion milliseconds -> seconds
            cresctime /= 1000.0;
            facsctime /= 1000.0;
            invsctime /= 1000.0;
            gathrtime /= 1000.0;
            invrAtime /= 1000.0;
            totaltime /= 1000.0;



            cout << "********************************* TIME REPORT ********************************** \n" << endl;
            cout << "                     SCHUR COMPLEMENT      BUILDING: " << cresctime << " seconds" << endl;
            cout << "                     SCHUR COMPLEMENT FACTORIZATION: " << facsctime << " seconds" << endl;
            cout << "                     SCHUR COMPLEMENT     INVERSION: " << invsctime << " seconds" << endl;
            cout << "                                                                                " << endl;
            cout << "         FINAL OPERATIONS (INVERSION OF A INCLUDED): " << gathrtime << " seconds" << endl;
            cout << "                                     INVERSION OF A: " << invrAtime << " seconds" << endl;
            cout << "                                         TOTAL TIME: " << totaltime << " seconds" << endl;
            cout << "******************************************************************************** \n" << endl;

            /*
             * double totaltime = 0.0;     // Total execution time
             * double cresctime = 0.0;     // Schur-complement (total)
             * double facsctime = 0.0;     // Schur-complement factorization
             * double invsctime = 0.0;     // Schur-complement inversion
             * double gathrtime = 0.0;     // Last operationsdouble invrAtime = 0.0;     // Inversion of A
             * */

            char* timingFile = new char[50];
            sprintf ( timingFile, "weak_tests.csv" );

            std::fstream timeF;
            timeF.open ( timingFile, std::fstream::out | std::fstream::app );
            timeF.setf ( ios::scientific, ios::floatfield );

            //timeF << "PROBLEM SIZE: " << Adim/1000 << "k + " << Ddim/1000 << "k" << endl;
            //timeF <<
            //"#PROCS,SCHUR_BUILD,SCHUR_FACT,SCHUR_INV,INV(A),FINAL_OPS,
            //SEND_RECV, DOT_PROD,TOTAL" << endl;
            timeF << size << "," << cresctime << "," << facsctime << "," << invsctime << "," << invrAtime << "," << gathrtime << "," << sndrctime << "," << dotprtime << "," << totaltime << endl;

            timeF.close();
        }


        blacs_barrier_ ( &ICTXT2D,"A" );
        blacs_gridexit_ ( &ICTXT2D );
    }

    //cout << iam << " reached end before MPI_Barrier" << endl;
    MPI_Barrier ( MPI_COMM_WORLD );
    MPI_Finalize();

    return 0;
}
Example #6
0
/*==== MAIN FUNCTION =================================================*/
int main( int argc, char *argv[] ){

/*  ==== Declarations =================================================== */

/*  File variables */
    FILE    *fin;

/*  Matrix descriptors */
    MDESC   descA, descB, descC, descA_local, descB_local;

/*  Local scalars */
    MKL_INT iam, nprocs, ictxt, myrow, mycol, nprow, npcol;
    MKL_INT n, nb, mp, nq, lld, lld_local;
    MKL_INT i, j, info;
    int     n_int, nb_int, nprow_int, npcol_int;
    double  thresh, diffnorm, anorm, bnorm, residual, eps;

/*  Local arrays */
    double  *A_local, *B_local, *A, *B, *C, *work;
    MKL_INT iwork[ 4 ];


/*  ==== Executable statements ========================================== */

/*  Get information about how many processes are used for program execution
    and number of current process */
    blacs_pinfo_( &iam, &nprocs );

/*  Init temporary 1D process grid */
    blacs_get_( &i_negone, &i_zero, &ictxt );
    blacs_gridinit_( &ictxt, "C", &nprocs, &i_one );

/*  Open input file */
    if ( iam == 0 ) {
        fin = fopen( "../in/pblas3ex.in", "r" );
        if ( fin == NULL ) {
            printf( "Error while open input file." );
            return 2;
        }
    }

/*  Read data and send it to all processes */
    if ( iam == 0 ) {

/*      Read parameters */
        fscanf( fin, "%d n, dimension of vectors, must be > 0 ", &n_int );
        fscanf( fin, "%d nb, size of blocks, must be > 0 ", &nb_int );
        fscanf( fin, "%d p, number of rows in the process grid, must be > 0", &nprow_int );
        fscanf( fin, "%d q, number of columns in the process grid, must be > 0, p*q = number of processes", &npcol_int );
        fscanf( fin, "%lf threshold for residual check (to switch off check set it < 0.0) ", &thresh );
        n = (MKL_INT) n_int;
        nb = (MKL_INT) nb_int;
        nprow = (MKL_INT) nprow_int;
        npcol = (MKL_INT) npcol_int;

/*      Check if all parameters are correct */
        if( ( n<=0 )||( nb<=0 )||( nprow<=0 )||( npcol<=0 )||( nprow*npcol != nprocs ) ) {
            printf( "One or several input parameters has incorrect value. Limitations:\n" );
            printf( "n > 0, nb > 0, p > 0, q > 0 - integer\n" );
            printf( "p*q = number of processes\n" );
            printf( "threshold - double (set to negative to swicth off check)\n");
            return 2;
        }

/*      Pack data into array and send it to other processes */
        iwork[ 0 ] = n;
        iwork[ 1 ] = nb;
        iwork[ 2 ] = nprow;
        iwork[ 3 ] = npcol;
        igebs2d_( &ictxt, "All", " ", &i_four, &i_one, iwork, &i_four );
        dgebs2d_( &ictxt, "All", " ", &i_one, &i_one, &thresh, &i_one );
    } else {

/*      Recieve and unpack data */
        igebr2d_( &ictxt, "All", " ", &i_four, &i_one, iwork, &i_four, &i_zero, &i_zero );
        dgebr2d_( &ictxt, "All", " ", &i_one, &i_one, &thresh, &i_one, &i_zero, &i_zero );
        n = iwork[ 0 ];
        nb = iwork[ 1 ];
        nprow = iwork[ 2 ];
        npcol = iwork[ 3 ];
    }
    if ( iam == 0 ) { fclose( fin ); }

/*  Destroy temporary process grid */
    blacs_gridexit_( &ictxt );

/*  Init workind 2D process grid */
    blacs_get_( &i_negone, &i_zero, &ictxt );
    blacs_gridinit_( &ictxt, "R", &nprow, &npcol );
    blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol );

/*  Create on process 0 two matrices: A - orthonormal, B -random */
    if ( ( myrow == 0 ) && ( mycol == 0 ) ){

/*      Allocate arrays */
        A_local = (double*) calloc( n*n, sizeof( double ) );
        B_local = (double*) calloc( n*n, sizeof( double ) );

/*      Set arrays */
        for ( i=0; i<n; i++ ){
            for ( j=0; j<n; j++ ){
                B_local[ i+n*j ] = one*rand()/RAND_MAX;
            }
            B_local[ i+n*i ] += two;
        }
        for ( j=0; j<n; j++ ){
            for ( i=0; i<n; i++ ){
                if ( j < n-1 ){
                    if ( i <= j ){
                        A_local[ i+n*j ] = one / sqrt( ( double )( (j+1)*(j+2) ) );
                    } else if ( i == j+1 ) {
                        A_local[ i+n*j ] = -one / sqrt( one + one/( double )(j+1) );
                    } else {
                        A_local[ i+n*j ] = zero;
                    }
                } else {
                    A_local[ i+n*(n-1) ] = one / sqrt( ( double )n );
                }
            }
        }

/*      Print information of task */
        printf( "=== START OF EXAMPLE ===================\n" );
        printf( "Matrix-matrix multiplication: A*B = C\n\n" );
        printf( "/  1/q_1 ........   1/q_n-1     1/q_n  \\ \n" );
        printf( "|        .                             | \n" );
        printf( "|         `.           :         :     | \n" );
        printf( "| -1/q_1    `.         :         :     | \n" );
        printf( "|        .    `.       :         :     |  =  A \n" );
        printf( "|   0     `.    `                      | \n" );
        printf( "|   : `.    `.      1/q_n-1     1/q_n  | \n" );
        printf( "|   :   `.    `.                       | \n" );
        printf( "\\   0 .... 0     -(n-1)/q_n-1   1/q_n  / \n\n" );
        printf( "q_i = sqrt( i^2 + i ), i=1..n-1, q_n = sqrt( n )\n\n" );
        printf( "A  -  n*n real matrix (orthonormal) \n" );
        printf( "B  -  random n*n real matrix\n\n" );
        printf( "n = %d, nb = %d; %dx%d - process grid\n\n", n, nb, nprow, npcol );
        printf( "=== PROGRESS ===========================\n" );
    } else {

/*      Other processes don't contain parts of initial arrays */
        A_local = NULL;
        B_local = NULL;
    }

/*  Compute precise length of local pieces and allocate array on
    each process for parts of distributed vectors */
    mp = numroc_( &n, &nb, &myrow, &i_zero, &nprow );
    nq = numroc_( &n, &nb, &mycol, &i_zero, &npcol );
    A = (double*) calloc( mp*nq, sizeof( double ) );
    B = (double*) calloc( mp*nq, sizeof( double ) );
    C = (double*) calloc( mp*nq, sizeof( double ) );

/*  Compute leading dimensions */
    lld_local = MAX( numroc_( &n, &n, &myrow, &i_zero, &nprow ), 1 );
    lld = MAX( mp, 1 );

/*  Initialize descriptors for initial arrays located on 0 process */
    descinit_( descA_local, &n, &n, &n, &n, &i_zero, &i_zero, &ictxt, &lld_local, &info );
    descinit_( descB_local, &n, &n, &n, &n, &i_zero, &i_zero, &ictxt, &lld_local, &info );

/*  Initialize descriptors for distributed arrays */
    descinit_( descA, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info );
    descinit_( descB, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info );
    descinit_( descC, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info );

/*  Distribute matrices from 0 process over process grid */
    pdgeadd_( &trans, &n, &n, &one, A_local, &i_one, &i_one, descA_local, &zero, A, &i_one, &i_one, descA );
    pdgeadd_( &trans, &n, &n, &one, B_local, &i_one, &i_one, descB_local, &zero, B, &i_one, &i_one, descB );
    if( iam == 0 ){ printf( ".. Arrays are distributed ( p?geadd ) ..\n" ); }

/*  Destroy arrays on 0 process - they are not necessary anymore */
    if( ( myrow == 0 ) && ( mycol == 0 ) ){
        free( A_local );
        free( B_local );
    }

/*  Compute norm of A and B */
    work = (double*) calloc( mp, sizeof( double ) );
    anorm = pdlange_( "I", &n, &n, A, &i_one, &i_one, descA, work );
    bnorm = pdlange_( "I", &n, &n, B, &i_one, &i_one, descB, work );
    if( iam == 0 ){ printf( ".. Norms of A and B are computed ( p?lange ) ..\n" ); }

/*  Compute product C = A*B */
    pdgemm_( "N", "N", &n, &n, &n, &one, A, &i_one, &i_one, descA, B, &i_one, &i_one, descB,
             &zero, C, &i_one, &i_one, descC );
    if( iam == 0 ){ printf( ".. Multiplication A*B=C is done ( p?gemm ) ..\n" ); }

/*  Compute difference  B - inv_A*C (inv_A = transpose(A) because A is orthonormal) */
    pdgemm_( "T", "N", &n, &n, &n, &one, A, &i_one, &i_one, descA, C, &i_one, &i_one, descC,
             &negone, B, &i_one, &i_one, descB );
    if( iam == 0 ){ printf( ".. Difference is computed ( p?gemm ) ..\n" ); }

/*  Compute norm of B - inv_A*C (which is contained in B) */
    diffnorm = pdlange_( "I", &n, &n, B, &i_one, &i_one, descB, work );
    free( work );
    if( iam == 0 ){ printf( ".. Norms of the difference B-inv_A*C is computed ( p?lange ) ..\n" ); }

/*  Print results */
    if( iam == 0 ){
        printf( ".. Solutions are compared ..\n" );
        printf( "== Results ==\n" );
        printf( "||A|| = %03.11f\n", anorm );
        printf( "||B|| = %03.11f\n", bnorm );
        printf( "=== END OF EXAMPLE =====================\n" );
    }

/*  Compute machine epsilon */
    eps = pdlamch_( &ictxt, "e" );

/*  Compute residual */
    residual = diffnorm /( two*anorm*bnorm*eps );

/*  Destroy arrays */
    free( A );
    free( B );
    free( C );

/*  Destroy process grid */    
    blacs_gridexit_( &ictxt );
    blacs_exit_( &i_zero );
    
/*  Check if residual passed or failed the threshold */
    if ( ( iam == 0 ) && ( thresh >= zero ) && !( residual <= thresh ) ){
        printf( "FAILED. Residual = %05.16f\n", residual );
        return 1;
    } else {
        return 0;
    }

/*========================================================================
  ====== End of PBLAS Level 3 example ====================================
  ======================================================================*/
}
int main (int argc, char *argv[]) {
  myscalar *A=NULL, *X=NULL, *B=NULL, *Y=NULL, *Xtrue=NULL;
  myscalar elem;
  int descA[BLACSCTXTSIZE], descVec[BLACSCTXTSIZE], descelem[BLACSCTXTSIZE];
  int n;
  int nrhs;
  int nb;
  int locr, locc;
  int i, j, ii, jj;
  int ierr;
  int dummy;
  int myid, np;
  int myrow, mycol, nprow, npcol;
  int ctxt;
  myreal rdummy, res;

  n=1024; /* Size of the problem */
  nrhs=3; /* Number of RHS */
  nb=16;  /* Blocksize for the 2D block-cyclic distribution */

  /* Initialize MPI */
  if((ierr=MPI_Init(&argc,&argv)))
    return 1;
  myid=-1;
  if((ierr=MPI_Comm_rank(MPI_COMM_WORLD,&myid)))
    return 1;
  np=-1;
  if((ierr=MPI_Comm_size(MPI_COMM_WORLD,&np)))
    return 1;

  /* Initialize the BLACS grid */
  nprow=floor(sqrt((float)np));
  npcol=np/nprow;
  blacs_get_(&IZERO,&IZERO,&ctxt);
  blacs_gridinit_(&ctxt,"R",&nprow,&npcol);
  blacs_gridinfo_(&ctxt,&nprow,&npcol,&myrow,&mycol);

  /* A is a dense n x n distributed Toeplitz matrix */
  if(myid<nprow*npcol) {
    locr=numroc_(&n,&nb,&myrow,&IZERO,&nprow);
    locc=numroc_(&n,&nb,&mycol,&IZERO,&npcol);
    A=new myscalar[locr*locc];
    dummy=std::max(1,locr);
    descinit_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr);

    myreal pi=3.1416, d=0.1;
    for(i=1;i<=locr;i++) {
      for(j=1;j<=locc;j++) {
        ii=indxl2g_(&i,&nb,&myrow,&IZERO,&nprow);
        jj=indxl2g_(&j,&nb,&mycol,&IZERO,&npcol);
        // Toeplitz matrix from Quantum Chemistry.
        A[locr*(j-1)+(i-1)]=ii==jj?std::pow(pi,2)/6.0/std::pow(d,2):std::pow(-1.0,ii-jj)/std::pow((myreal)ii-jj,2)/std::pow(d,2);
      }
    }
  } else {
    descset_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE);
  }

  /* Initializing solution */
  if(myid<nprow*npcol) {
    locr=numroc_(&n,&nb,&myrow,&IZERO,&nprow);
    locc=numroc_(&nrhs,&nb,&mycol,&IZERO,&npcol);
    dummy=std::max(1,locr);
    descinit_(descVec,&n,&nrhs,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr);
    Xtrue=new myscalar[locr*locc]();
    for(i=0;i<locr*locc;i++)
      Xtrue[i]=static_cast<myscalar>(rand())/(static_cast<myscalar>(RAND_MAX));
  } else {
    descset_(descVec,&n,&nrhs,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE);
  }

  /* Initializing solution and intermediate vector space */
  if(myid<nprow*npcol) {
    X=new myscalar[locr*locc]();
    Y=new myscalar[locr*locc]();
  }

  /* Initializing RHS as A*Xtrue */
  if(myid<nprow*npcol) {
    B=new myscalar[locr*locc]();
    pgemm('N','N',n,nrhs,n,ONE,A,IONE,IONE,descA,Xtrue,IONE,IONE,descVec,ZERO,B,IONE,IONE,descVec);
  }

  /* Initialize the solver and set parameters */
  StrumpackDensePackage<myscalar,myreal> sdp(MPI_COMM_WORLD);
  sdp.use_HSS=true;
  sdp.levels_HSS=4;
  sdp.min_rand_HSS=64;
  sdp.lim_rand_HSS=0;
  sdp.tol_HSS=1e-12;
  sdp.split_HSS=768; /* Size of A11 */

  /* Compression */
  sdp.compress(A,descA);

  /* Accuracy checking */
  sdp.check_compression(A,descA);

  /* Factorization */
  sdp.partially_factor(A,descA,sdp.split_HSS);

  /* Schur complement update */
  sdp.compute_schur();

  /* Extracting a random element from the Schur complement */
  if(myid<nprow*npcol) {
    descinit_(descelem,&IONE,&IONE,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr);
  } else {
    descset_(descelem,&IONE,&IONE,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE);
  }
  i=1+rand()%(n-sdp.split_HSS);
  j=1+rand()%(n-sdp.split_HSS);
  MPI_Bcast((void *)&i,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD);
  MPI_Bcast((void *)&j,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD);
  sdp.extract_schur(&elem,descelem,&i,IONE,&j,IONE);
  if(!myid)
    std::cout << "Element (" << i << "," << j << ") of Schur complement = " << elem << std::endl << std::endl;

  /* Condensation */
  sdp.reduce_RHS(Y,descVec,B,descVec);

  /* Solve the Schur complement system (touches only the bottom part of X)*/
  sdp.verbose=false;
  if(!myid)
    std::cout << "Solving Schur complement system with Conjugate Gradient..." << std::endl << std::endl;
  CG(&sdp,X,Y,descVec,n,nrhs,3000,1e-14);
  sdp.verbose=true;

  /* Expansion (touches only the top part of X) */
  sdp.expand_solution(X,descVec,Y,descVec);

  /* Accuracy checking */
  sdp.check_solution(A,descA,X,descVec,B,descVec);

  /* Forward error */
  if(myid<nprow*npcol) {
    for(i=0;i<locr*locc;i++)
      Y[i]=X[i]-Xtrue[i];
    res=plange('F',n,nrhs,Y,IONE,IONE,descVec,&rdummy);
    res/=plange('F',n,nrhs,Xtrue,IONE,IONE,descVec,&rdummy);
    if(!myid)
      std::cout << "Forward error = " << res << std::endl;
  }

  /* Statistics */
  sdp.print_statistics();

  /* Clean-up */
  delete[] A;
  delete[] B;
  delete[] X;
  delete[] Y;
  delete[] Xtrue;

  /* The end */
  MPI_Finalize();
  return 0;

}
int main (int argc, char *argv[]) {
  myscalar *A=NULL, *B=NULL, *Btrue=NULL;
  int descA[BLACSCTXTSIZE], descB[BLACSCTXTSIZE];
  int n;
  int nb;
  int locr, locc;
  int i, j, ii, jj;
  int *I, *J;
  int nI, nJ;
  int ierr;
  int dummy;
  int myid, np;
  int myrow, mycol, nprow, npcol;
  int ctxt;
  myreal err;

  n=1024; /* Size of the problem */
  nb=16;  /* Blocksize for the 2D block-cyclic distribution */

  /* Initialize MPI */
  if((ierr=MPI_Init(&argc,&argv)))
    return 1;
  myid=-1;
  if((ierr=MPI_Comm_rank(MPI_COMM_WORLD,&myid)))
    return 1;
  np=-1;
  if((ierr=MPI_Comm_size(MPI_COMM_WORLD,&np)))
    return 1;

  /* Initialize the BLACS grid */
  nprow=floor(sqrt((float)np));
  npcol=np/nprow;
  blacs_get_(&IZERO,&IZERO,&ctxt);
  blacs_gridinit_(&ctxt,"R",&nprow,&npcol);
  blacs_gridinfo_(&ctxt,&nprow,&npcol,&myrow,&mycol);

  /* A is a dense n x n distributed Toeplitz matrix */
  if(myid<nprow*npcol) {
    locr=numroc_(&n,&nb,&myrow,&IZERO,&nprow);
    locc=numroc_(&n,&nb,&mycol,&IZERO,&npcol);
    A=new myscalar[locr*locc];
    dummy=std::max(1,locr);
    descinit_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr);

    for(i=1;i<=locr;i++)
      for(j=1;j<=locc;j++) {
        ii=indxl2g_(&i,&nb,&myrow,&IZERO,&nprow);
        jj=indxl2g_(&j,&nb,&mycol,&IZERO,&npcol);
        // Toeplitz matrix from Quantum Chemistry.
        myreal pi=3.1416, d=0.1;
        A[locr*(j-1)+(i-1)]=ii==jj?std::pow(pi,2)/6.0/std::pow(d,2):std::pow(-1.0,ii-jj)/std::pow((myreal)ii-jj,2)/std::pow(d,2);
      }
  } else {
    descset_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE);
  }

  /* Initialize the solver and set parameters */
  StrumpackDensePackage<myscalar,myreal> sdp(MPI_COMM_WORLD);
  sdp.use_HSS=true;
  sdp.levels_HSS=4;
  sdp.min_rand_HSS=64;
  sdp.lim_rand_HSS=0;
  sdp.tol_HSS=1e-6;

  /* Compression */
  sdp.compress(A,descA);

  /* Accuracy checking */
  sdp.check_compression(A,descA);

  /* Element extraction: a bunch of random indices.
   * Not that duplicates do not matter (the code works).
   */
  nI=1+rand()%n;
  MPI_Bcast((void*)&nI,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD);
  I=new int[nI];
  if(!myid)
    for(i=0;i<nI;i++)
      I[i]=1+rand()%n;
  MPI_Bcast((void*)I,nI,MPI_INTEGER,IZERO,MPI_COMM_WORLD);
  nJ=1+rand()%n;
  MPI_Bcast((void*)&nJ,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD);
  J=new int[nJ];
  if(!myid)
    for(j=0;j<nJ;j++)
      J[j]=1+rand()%n;
  MPI_Bcast((void*)J,nJ,MPI_INTEGER,IZERO,MPI_COMM_WORLD);

  /* Extraction for the HSS form */
  if(myid<nprow*npcol) {
    locr=numroc_(&nI,&nb,&myrow,&IZERO,&nprow);
    locc=numroc_(&nJ,&nb,&mycol,&IZERO,&npcol);
    B=new myscalar[locr*locc]();
    dummy=std::max(1,locr);
    descinit_(descB,&nI,&nJ,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr);
  } else
    descset_(descB,&nI,&nJ,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE);
  sdp.extract(A,descA,B,descB,I,nI,J,nJ);

  /* Extraction from the original matrix just to compare */
  sdp.use_HSS=false;
  if(myid<nprow*npcol)
    Btrue=new myscalar[locr*locc];
  sdp.extract(A,descA,Btrue,descB,I,nI,J,nJ);

  /* Comparison with elements of input matrix */
  if(myid<nprow*npcol){
    err=plange('M',nI,nJ,Btrue,IONE,IONE,descB,(myreal*)NULL);
    for(i=0;i<locr*locc;i++)
      Btrue[i]-=B[i];
    err=plange('M',nI,nJ,Btrue,IONE,IONE,descB,(myreal*)NULL)/err;
  }
  if(!myid)
    std::cout << "Element extraction (" << nI << "x" << nJ << " submatrix): maximum relative error max ||A(I,J)-HSS(I,J)||//||A(I,J)|| = " << err << std::endl << std::endl;

  /* Statistics */
  sdp.print_statistics();

  /* Clean-up */
  delete[] A;
  delete[] B;
  delete[] Btrue;
  delete[] I;
  delete[] J;

  /* The end */
  MPI_Finalize();
  return 0;

}