void blacs_gridinit_nektar(int *BLACS_PARAMS, int *DESCA, int *DESCB){
  int i,j,k;

  /*
   BLACS_PARAMS:
   [0]  = ictxt;
   [1]  = my_proc;
   [2]  = total_procs;
   [3]  = Nproc_row;
   [4]  = Nproc_col;
   [5]  = my_row;
   [6]  = my_col;
   [7]  = Global_rows;
   [8]  = Global_columns;
   [9]  = Block_Size_row;
   [10] = Block_Size_col;
   [11] = LOC_rows;
   [12] = LOC_columns;
   */


   blacs_pinfo_( i, j);
   BLACS_PARAMS[1] = i;
   BLACS_PARAMS[2] = j;

   blacs_get_( -1, 0,  k);   // <- LG check the arguments of this call
   BLACS_PARAMS[0] = k;

   i = BLACS_PARAMS[3];
   j = BLACS_PARAMS[4];
   blacs_gridinit_( BLACS_PARAMS[0], "Row", i, j);

   blacs_gridinfo_( BLACS_PARAMS[0] ,BLACS_PARAMS[3], BLACS_PARAMS[4], i, j);
   BLACS_PARAMS[5] = i;
   BLACS_PARAMS[6] = j;

   i = 0;
   BLACS_PARAMS[11] = numroc_(BLACS_PARAMS[7],BLACS_PARAMS[9], BLACS_PARAMS[5],i,BLACS_PARAMS[3]);
   BLACS_PARAMS[12] = numroc_(BLACS_PARAMS[8],BLACS_PARAMS[10],BLACS_PARAMS[6],i,BLACS_PARAMS[4]);

   i = 0;
   j = 0;
   descinit_(DESCA, BLACS_PARAMS[7], BLACS_PARAMS[8],
                    BLACS_PARAMS[9], BLACS_PARAMS[10],
                    i, j,
                    BLACS_PARAMS[0],BLACS_PARAMS[11],
                    k);
   if (k != 0)
     fprintf(stderr,"blacs_gridinit_nektar: ERROR, descinit(info) = %d \n",k);

   descinit_(DESCB, BLACS_PARAMS[7], 1,
                    BLACS_PARAMS[9], 1,
                    i, j,
                    BLACS_PARAMS[0],BLACS_PARAMS[11],
                    k);
  if (k != 0)
     fprintf(stderr,"blacs_gridinit_nektar: ERROR, descinit(info) = %d \n",k);


}
Example #2
0
/// TODO: I think there is a version of this in scalapackTools.h to use instead
///
///
/// for a given context ICTXT, return the parameters of the ScaLAPACK
///
/// This is slated to be re-worked during Cheshire m4.  It will probably
/// become a method on ScaLAPACK operator.
///
///
static void getSlInfo(const slpp::int_t ICTXT, slpp::int_t& NPROW, slpp::int_t& NPCOL, slpp::int_t& MYPROW, slpp::int_t& MYPCOL, slpp::int_t& MYPNUM)
{
    if(DBG) std::cerr << "getSlInfo: ICTXT: " << ICTXT << std::endl;

    NPROW=-1 ; NPCOL=-1 ; MYPROW=-1 ; MYPCOL=-1 ; MYPNUM= -1;
    blacs_gridinfo_(ICTXT, NPROW, NPCOL, MYPROW, MYPCOL);
    if(DBG) std::cerr << "getSlInfo: blacs_gridinfo_(ICTXT: "<<ICTXT<<") -> "
                      << "NPROW: " << NPROW << ", NPCOL: " << NPCOL
                      << ", MYPROW: " << MYPROW << ", MYPCOL: " << MYPCOL << std::endl;

    if(NPROW < 1 || NPCOL < 1) {
        std::cerr << "getSlInfo: blacs_gridinfo_ error -- aborting" << std::endl;
        ::blacs_abort_(ICTXT, 99); // something that does not look like a signal
    }

    if(MYPROW < 0 || MYPCOL < 0) {
        std::cerr << "getSlInfo: blacs_gridinfo_ error -- aborting" << std::endl;
        ::blacs_abort_(ICTXT, 99); // something that does not look like a signal
    }

    MYPNUM = blacs_pnum_(ICTXT, MYPROW, MYPCOL);
    if(DBG) std::cerr << "getSlInfo: blacs_pnum() -> MYPNUM: " << MYPNUM <<std::endl;
}
Example #3
0
int main(int argc, char **argv)
{
    #define test_A(i,j) test_A[(size_t)(j)*N+(i)]
    #define test_A2(i,j) test_A2[(size_t)(j)*N+(i)]
    int N,NB,w,LDA,BB;
    size_t memsize; //bytes
    int iam, nprocs, mydevice;
    int ICTXT, nprow, npcol, myprow, mypcol;
    int i_one = 1, i_zero = 0, i_negone = -1;
    double d_one = 1.0, d_zero = 0.0, d_negone = -1.0;
    int IASEED = 100;
/*  printf("N=?\n");
    scanf("%ld",&N);
    printf("NB=?\n");
    scanf("%d", &NB);
    printf("width of Y panel=?\n");
    scanf("%ld",&w);
*/
    if(argc < 4){
        printf("invalid arguments N NB memsize(M)\n");
        exit(1);
    }
    N = atoi(argv[1]);
    NB = atoi(argv[2]);
    memsize = (size_t)atoi(argv[3])*1024*1024;
    BB = (N + NB - 1) / NB;
    w = memsize/sizeof(double)/BB/NB/NB - 1;
    assert(w > 0);
    LDA = N + 0; //padding

    int do_io = (N <= NSIZE);
    double llttime;
    double gflops;
    
    nprow = npcol = 1;
    blacs_pinfo_(&iam, &nprocs);
    blacs_get_(&i_negone, &i_zero, &ICTXT);
    blacs_gridinit_(&ICTXT, "R", &nprow, &npcol);
    blacs_gridinfo_(&ICTXT, &nprow, &npcol, &myprow, &mypcol);
    #ifdef USE_MIC
        #ifdef __INTEL_OFFLOAD
            printf("offload compilation enabled\ninitialize each MIC\n");
            offload_init(&iam, &mydevice);
            #pragma offload target(mic:0)
            {
                mkl_peak_mem_usage(MKL_PEAK_MEM_ENABLE);
            }
        #else
            if(isroot)
                printf("offload compilation not enabled\n");
            exit(0);
        #endif
    #else
        #ifdef USE_CUBLASV2
        {
            cublasStatus_t cuStatus;
            for(int r = 0; r < OOC_NTHREADS; r++){
                cuStatus = cublasCreate(&worker_handle[r]);
                assert(cuStatus == CUBLAS_STATUS_SUCCESS);
            }
        }
        #else
            cublasInit();
        #endif
    #endif

    double *test_A = (double*)memalign(64,(size_t)LDA*N*sizeof(double)); // for chol
#ifdef VERIFY
    double *test_A2 = (double*)memalign(64,(size_t)LDA*N*sizeof(double)); // for verify
#endif
    
    /*Initialize A */
    int i,j;
    printf("Initialize A ... "); fflush(stdout);
    llttime = MPI_Wtime();
    pdmatgen(&ICTXT, "Symm", "Diag", &N,
         &N, &NB, &NB,
         test_A, &LDA, &i_zero, &i_zero,
         &IASEED, &i_zero, &N, &i_zero, &N,
         &myprow, &mypcol, &nprow, &npcol); 
    llttime = MPI_Wtime() - llttime;
    printf("time %lf\n", llttime);
              
    /*print test_A*/
    if(do_io){
        printf("Original A=\n\n");
        matprint(test_A, N, LDA, 'A');
    }

    /*Use directed unblocked Cholesky factorization*/    
    /*
    t1 = clock();
    Test_dpotrf(test_A2,N);
    t2 = clock();
    printf ("time for unblocked Cholesky factorization on host %f \n",
        ((float) (t2 - t1)) / CLOCKS_PER_SEC);
    */
    
    /*print test_A*/
    /*
    if(do_io){
        printf("Unblocked result:\n\n");
        matprint(test_A2,N,'L');   
    }
    */ 

    /*Use tile algorithm*/
    Quark *quark = QUARK_New(OOC_NTHREADS);
    QUARK_DOT_DAG_Enable(quark, 0);
    #ifdef USE_MIC
//      mklmem(NB);
        printf("QUARK MIC affinity binding\n");
        QUARK_bind(quark);
        printf("offload warm up\n");
        warmup(quark);
    #endif
    QUARK_DOT_DAG_Enable(quark, quark_getenv_int("QUARK_DOT_DAG_ENABLE", 0));
    printf("LLT start %lf\n", MPI_Wtime());
    llttime = Cholesky(quark,test_A,N,NB,LDA,memsize);
    printf("LLT end %lf\n", MPI_Wtime());
    QUARK_Delete(quark);
    #ifdef USE_MIC
        offload_destroy();
    #else
        #ifdef USE_CUBLASV2
        {
            cublasStatus_t cuStatus;
            for(int r = 0; r < OOC_NTHREADS; r++){ 
                cuStatus = cublasDestroy(worker_handle[r]);
                assert(cuStatus == CUBLAS_STATUS_SUCCESS);
            }
        }
        #else
            cublasShutdown();
        #endif
    #endif

    gflops = (double) N;
    gflops = gflops/3.0 + 0.5;
    gflops = gflops*(double)(N)*(double)(N);
    gflops = gflops/llttime/1024.0/1024.0/1024.0;
    printf ("N NB memsize(MB) quark_pthreads time Gflops\n%d %d %lf %d %lf %lf\n",
        N, NB, (double)memsize/1024/1024, OOC_NTHREADS, llttime, gflops);
    #ifdef USE_MIC
        #pragma offload target(mic:0)
        {
            memsize = mkl_peak_mem_usage(MKL_PEAK_MEM_RESET);
        }
        printf("mkl_peak_mem_usage %lf MB\n", (double)memsize/1024.0/1024.0);
    #endif

    /*Update and print L*/             
    if(do_io){
        printf("L:\n\n");
        matprint(test_A,N,LDA,'L');
    }
#ifdef VERIFY
    printf("Verify... ");
    llttime = MPI_Wtime();
  /*
   * ------------------------
   * check difference betwen 
   * test_A and test_A2
   * ------------------------
   */
    /*
    {
    double maxerr = 0;
    double maxerr2 = 0;

    for (j = 0; j < N; j++)
      {
        for (i = j; i < N; i++)
          {
            double err = (test_A (i, j) - test_A2 (i, j));
            err = ABS (err);
            maxerr = MAX (err, maxerr);
            maxerr2 = maxerr2 + err * err;
          };
      };
    maxerr2 = sqrt (ABS (maxerr2));
    printf ("max difference between test_A and test_A2 %lf \n", maxerr);
    printf ("L2 difference between test_A and test_A2 %lf \n", maxerr2);
    };
    */

  /*
   * ------------------
   * over-write test_A2
   * ------------------
   */
   
    pdmatgen(&ICTXT, "Symm", "Diag", &N,
         &N, &NB, &NB,
         test_A2, &LDA, &i_zero,
         &i_zero, &IASEED, &i_zero, &N, &i_zero, &N,
         &myprow, &mypcol, &nprow, &npcol);

  /*
   * ---------------------------------------
   * after solve, test_A2 should be identity
   * ---------------------------------------
   */
  // test_A = chol(B) = L;
  // test_A2 = B
  // solve L*L'*X = B
  // if L is correct, X is identity */
     
    {
    int uplo = 'L';
    const char *uplo_char = ((uplo == (int) 'U')
                    || (uplo == (int) 'u')) ? "U" : "L";
    int info = 0;
    int nrhs = N;
    int LDA = N;
    int ldb = N;
    dpotrs(uplo_char, &N, &nrhs, test_A, &LDA, test_A2, &ldb, &info);
    assert (info == 0);
    }

    {
    double maxerr = 0;
    double maxerr2 = 0;

    for (j = 0; j < N; j++)
      {
        for (i = 0; i < N; i++)
          {
            double eyeij = (i == j) ? 1.0 : 0.0;
            double err = (test_A2 (i, j) - eyeij);
            err = ABS (err);
            maxerr = MAX (maxerr, err);
            maxerr2 = maxerr2 + err * err;
          };
      };

    maxerr2 = sqrt (ABS (maxerr2));
    printf("time %lf\n", MPI_Wtime() - llttime);
    printf ("max error %lf \n", maxerr);
    printf ("max L2 error %lf \n", maxerr2);
    }
#endif

    free(test_A);test_A=NULL;
#ifdef VERIFY
    free(test_A2);test_A2=NULL;
#endif
    blacs_gridexit_(&ICTXT);
    blacs_exit_(&i_zero);
    return 0;
    #undef test_A
    #undef test_A2
}
Example #4
0
int main()
{
	const MKL_INT m = 1000;
	const MKL_INT k = 100000;
	const MKL_INT n = 1000;
	const MKL_INT nb = 100;
	const MKL_INT nprow = 2;
	const MKL_INT npcol = 2;

    MKL_INT iam, nprocs, ictxt, myrow, mycol;
    MDESC   descA, descB, descC, descA_local, descB_local, descC_local;
	MKL_INT info;
	MKL_INT a_m_local, a_n_local, b_m_local, b_n_local, c_m_local, c_n_local;
	MKL_INT a_lld, b_lld, c_lld;

    blacs_pinfo_( &iam, &nprocs );
    blacs_get_( &i_negone, &i_zero, &ictxt );
    blacs_gridinit_( &ictxt, "R", &nprow, &npcol );
    blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol );

    double *a = 0;
    double *b = 0;
	double *c = 0;

    if (iam==0)
    {
		a = gen_a(m, k);
		b = gen_b(k, n);
		c = (double*)calloc(m*n, sizeof(double));

		puts("a=");
		print(a, m, k);

		puts("b=");
		print(b, k, n);
    }

    a_m_local = numroc_( &m, &nb, &myrow, &i_zero, &nprow );
    a_n_local = numroc_( &k, &nb, &mycol, &i_zero, &npcol );

	b_m_local = numroc_( &k, &nb, &myrow, &i_zero, &nprow );
	b_n_local = numroc_( &n, &nb, &mycol, &i_zero, &npcol );

    c_m_local = numroc_( &m, &nb, &myrow, &i_zero, &nprow );
	c_n_local = numroc_( &n, &nb, &mycol, &i_zero, &npcol );

    double *A = (double*) calloc( a_m_local * a_n_local, sizeof( double ) );
    double *B = (double*) calloc( b_m_local * b_n_local, sizeof( double ) );
    double *C = (double*) calloc( c_m_local * c_n_local, sizeof( double ) );

    a_lld = MAX( a_m_local, 1 );
	b_lld = MAX( b_m_local, 1 );
	c_lld = MAX( c_m_local, 1 );

	if (iam==0)
	{
			printf("a_m_local = %d\ta_n_local = %d\tb_m_local = %d\tb_n_local = %d\tc_m_local = %d\tc_n_local = %d\n", a_m_local, a_n_local, b_m_local, b_n_local,
							c_m_local, c_n_local);
			printf("a_lld = %d\tb_lld = %d\tc_lld = %d\n", a_lld, b_lld, c_lld);
	}

    descinit_( descA_local, &m, &k, &m, &k, &i_zero, &i_zero, &ictxt, &m, &info );
    descinit_( descB_local, &k, &n, &k, &n, &i_zero, &i_zero, &ictxt, &k, &info );
    descinit_( descC_local, &m, &n, &m, &n, &i_zero, &i_zero, &ictxt, &m, &info );

    descinit_( descA, &m, &k, &nb, &nb, &i_zero, &i_zero, &ictxt, &a_lld, &info );
    descinit_( descB, &k, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &b_lld, &info );
    descinit_( descC, &m, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &c_lld, &info );

	printf("Rank %d: start distribute data\n", iam);
    pdgeadd_( &trans, &m, &k, &one, a, &i_one, &i_one, descA_local, &zero, A, &i_one, &i_one, descA );
    pdgeadd_( &trans, &k, &n, &one, b, &i_one, &i_one, descB_local, &zero, B, &i_one, &i_one, descB );
	printf("Rank %d: finished distribute data\n", iam);

	if (iam==0)
	{
			puts("a");
			print(A, a_m_local, a_n_local);
			puts("b");
			print(B, b_m_local, b_n_local);
	}

    pdgemm_( "N", "N", &m, &n, &k, &one, A, &i_one, &i_one, descA, B, &i_one, &i_one, descB,
             &zero, C, &i_one, &i_one, descC );
	printf("Rank %d: finished dgemm\n", iam);
	if (iam == 0)
	{
			puts("c");
			print(C, c_m_local, c_n_local);
	}

	pdgeadd_( &trans, &m, &n, &one, C, &i_one, &i_one, descC, &zero, c, &i_one, &i_one, descC_local);

	if (iam==0)
	{
			puts("global c");
			print(c, m, n);
	}

	free(A);
	free(B);
	free(C);
	if (iam==0)
	{
			free(a);
			free(b);
			free(c);
	}

    blacs_gridexit_( &ictxt );
    blacs_exit_( &i_zero );
}
Example #5
0
/*==== MAIN FUNCTION =================================================*/
int main( int argc, char *argv[] ){

/*  ==== Declarations =================================================== */

/*  File variables */
    FILE    *fin;

/*  Matrix descriptors */
    MDESC   descA, descB, descC, descA_local, descB_local;

/*  Local scalars */
    MKL_INT iam, nprocs, ictxt, myrow, mycol, nprow, npcol;
    MKL_INT n, nb, mp, nq, lld, lld_local;
    MKL_INT i, j, info;
    int     n_int, nb_int, nprow_int, npcol_int;
    double  thresh, diffnorm, anorm, bnorm, residual, eps;

/*  Local arrays */
    double  *A_local, *B_local, *A, *B, *C, *work;
    MKL_INT iwork[ 4 ];


/*  ==== Executable statements ========================================== */

/*  Get information about how many processes are used for program execution
    and number of current process */
    blacs_pinfo_( &iam, &nprocs );

/*  Init temporary 1D process grid */
    blacs_get_( &i_negone, &i_zero, &ictxt );
    blacs_gridinit_( &ictxt, "C", &nprocs, &i_one );

/*  Open input file */
    if ( iam == 0 ) {
        fin = fopen( "../in/pblas3ex.in", "r" );
        if ( fin == NULL ) {
            printf( "Error while open input file." );
            return 2;
        }
    }

/*  Read data and send it to all processes */
    if ( iam == 0 ) {

/*      Read parameters */
        fscanf( fin, "%d n, dimension of vectors, must be > 0 ", &n_int );
        fscanf( fin, "%d nb, size of blocks, must be > 0 ", &nb_int );
        fscanf( fin, "%d p, number of rows in the process grid, must be > 0", &nprow_int );
        fscanf( fin, "%d q, number of columns in the process grid, must be > 0, p*q = number of processes", &npcol_int );
        fscanf( fin, "%lf threshold for residual check (to switch off check set it < 0.0) ", &thresh );
        n = (MKL_INT) n_int;
        nb = (MKL_INT) nb_int;
        nprow = (MKL_INT) nprow_int;
        npcol = (MKL_INT) npcol_int;

/*      Check if all parameters are correct */
        if( ( n<=0 )||( nb<=0 )||( nprow<=0 )||( npcol<=0 )||( nprow*npcol != nprocs ) ) {
            printf( "One or several input parameters has incorrect value. Limitations:\n" );
            printf( "n > 0, nb > 0, p > 0, q > 0 - integer\n" );
            printf( "p*q = number of processes\n" );
            printf( "threshold - double (set to negative to swicth off check)\n");
            return 2;
        }

/*      Pack data into array and send it to other processes */
        iwork[ 0 ] = n;
        iwork[ 1 ] = nb;
        iwork[ 2 ] = nprow;
        iwork[ 3 ] = npcol;
        igebs2d_( &ictxt, "All", " ", &i_four, &i_one, iwork, &i_four );
        dgebs2d_( &ictxt, "All", " ", &i_one, &i_one, &thresh, &i_one );
    } else {

/*      Recieve and unpack data */
        igebr2d_( &ictxt, "All", " ", &i_four, &i_one, iwork, &i_four, &i_zero, &i_zero );
        dgebr2d_( &ictxt, "All", " ", &i_one, &i_one, &thresh, &i_one, &i_zero, &i_zero );
        n = iwork[ 0 ];
        nb = iwork[ 1 ];
        nprow = iwork[ 2 ];
        npcol = iwork[ 3 ];
    }
    if ( iam == 0 ) { fclose( fin ); }

/*  Destroy temporary process grid */
    blacs_gridexit_( &ictxt );

/*  Init workind 2D process grid */
    blacs_get_( &i_negone, &i_zero, &ictxt );
    blacs_gridinit_( &ictxt, "R", &nprow, &npcol );
    blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol );

/*  Create on process 0 two matrices: A - orthonormal, B -random */
    if ( ( myrow == 0 ) && ( mycol == 0 ) ){

/*      Allocate arrays */
        A_local = (double*) calloc( n*n, sizeof( double ) );
        B_local = (double*) calloc( n*n, sizeof( double ) );

/*      Set arrays */
        for ( i=0; i<n; i++ ){
            for ( j=0; j<n; j++ ){
                B_local[ i+n*j ] = one*rand()/RAND_MAX;
            }
            B_local[ i+n*i ] += two;
        }
        for ( j=0; j<n; j++ ){
            for ( i=0; i<n; i++ ){
                if ( j < n-1 ){
                    if ( i <= j ){
                        A_local[ i+n*j ] = one / sqrt( ( double )( (j+1)*(j+2) ) );
                    } else if ( i == j+1 ) {
                        A_local[ i+n*j ] = -one / sqrt( one + one/( double )(j+1) );
                    } else {
                        A_local[ i+n*j ] = zero;
                    }
                } else {
                    A_local[ i+n*(n-1) ] = one / sqrt( ( double )n );
                }
            }
        }

/*      Print information of task */
        printf( "=== START OF EXAMPLE ===================\n" );
        printf( "Matrix-matrix multiplication: A*B = C\n\n" );
        printf( "/  1/q_1 ........   1/q_n-1     1/q_n  \\ \n" );
        printf( "|        .                             | \n" );
        printf( "|         `.           :         :     | \n" );
        printf( "| -1/q_1    `.         :         :     | \n" );
        printf( "|        .    `.       :         :     |  =  A \n" );
        printf( "|   0     `.    `                      | \n" );
        printf( "|   : `.    `.      1/q_n-1     1/q_n  | \n" );
        printf( "|   :   `.    `.                       | \n" );
        printf( "\\   0 .... 0     -(n-1)/q_n-1   1/q_n  / \n\n" );
        printf( "q_i = sqrt( i^2 + i ), i=1..n-1, q_n = sqrt( n )\n\n" );
        printf( "A  -  n*n real matrix (orthonormal) \n" );
        printf( "B  -  random n*n real matrix\n\n" );
        printf( "n = %d, nb = %d; %dx%d - process grid\n\n", n, nb, nprow, npcol );
        printf( "=== PROGRESS ===========================\n" );
    } else {

/*      Other processes don't contain parts of initial arrays */
        A_local = NULL;
        B_local = NULL;
    }

/*  Compute precise length of local pieces and allocate array on
    each process for parts of distributed vectors */
    mp = numroc_( &n, &nb, &myrow, &i_zero, &nprow );
    nq = numroc_( &n, &nb, &mycol, &i_zero, &npcol );
    A = (double*) calloc( mp*nq, sizeof( double ) );
    B = (double*) calloc( mp*nq, sizeof( double ) );
    C = (double*) calloc( mp*nq, sizeof( double ) );

/*  Compute leading dimensions */
    lld_local = MAX( numroc_( &n, &n, &myrow, &i_zero, &nprow ), 1 );
    lld = MAX( mp, 1 );

/*  Initialize descriptors for initial arrays located on 0 process */
    descinit_( descA_local, &n, &n, &n, &n, &i_zero, &i_zero, &ictxt, &lld_local, &info );
    descinit_( descB_local, &n, &n, &n, &n, &i_zero, &i_zero, &ictxt, &lld_local, &info );

/*  Initialize descriptors for distributed arrays */
    descinit_( descA, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info );
    descinit_( descB, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info );
    descinit_( descC, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info );

/*  Distribute matrices from 0 process over process grid */
    pdgeadd_( &trans, &n, &n, &one, A_local, &i_one, &i_one, descA_local, &zero, A, &i_one, &i_one, descA );
    pdgeadd_( &trans, &n, &n, &one, B_local, &i_one, &i_one, descB_local, &zero, B, &i_one, &i_one, descB );
    if( iam == 0 ){ printf( ".. Arrays are distributed ( p?geadd ) ..\n" ); }

/*  Destroy arrays on 0 process - they are not necessary anymore */
    if( ( myrow == 0 ) && ( mycol == 0 ) ){
        free( A_local );
        free( B_local );
    }

/*  Compute norm of A and B */
    work = (double*) calloc( mp, sizeof( double ) );
    anorm = pdlange_( "I", &n, &n, A, &i_one, &i_one, descA, work );
    bnorm = pdlange_( "I", &n, &n, B, &i_one, &i_one, descB, work );
    if( iam == 0 ){ printf( ".. Norms of A and B are computed ( p?lange ) ..\n" ); }

/*  Compute product C = A*B */
    pdgemm_( "N", "N", &n, &n, &n, &one, A, &i_one, &i_one, descA, B, &i_one, &i_one, descB,
             &zero, C, &i_one, &i_one, descC );
    if( iam == 0 ){ printf( ".. Multiplication A*B=C is done ( p?gemm ) ..\n" ); }

/*  Compute difference  B - inv_A*C (inv_A = transpose(A) because A is orthonormal) */
    pdgemm_( "T", "N", &n, &n, &n, &one, A, &i_one, &i_one, descA, C, &i_one, &i_one, descC,
             &negone, B, &i_one, &i_one, descB );
    if( iam == 0 ){ printf( ".. Difference is computed ( p?gemm ) ..\n" ); }

/*  Compute norm of B - inv_A*C (which is contained in B) */
    diffnorm = pdlange_( "I", &n, &n, B, &i_one, &i_one, descB, work );
    free( work );
    if( iam == 0 ){ printf( ".. Norms of the difference B-inv_A*C is computed ( p?lange ) ..\n" ); }

/*  Print results */
    if( iam == 0 ){
        printf( ".. Solutions are compared ..\n" );
        printf( "== Results ==\n" );
        printf( "||A|| = %03.11f\n", anorm );
        printf( "||B|| = %03.11f\n", bnorm );
        printf( "=== END OF EXAMPLE =====================\n" );
    }

/*  Compute machine epsilon */
    eps = pdlamch_( &ictxt, "e" );

/*  Compute residual */
    residual = diffnorm /( two*anorm*bnorm*eps );

/*  Destroy arrays */
    free( A );
    free( B );
    free( C );

/*  Destroy process grid */    
    blacs_gridexit_( &ictxt );
    blacs_exit_( &i_zero );
    
/*  Check if residual passed or failed the threshold */
    if ( ( iam == 0 ) && ( thresh >= zero ) && !( residual <= thresh ) ){
        printf( "FAILED. Residual = %05.16f\n", residual );
        return 1;
    } else {
        return 0;
    }

/*========================================================================
  ====== End of PBLAS Level 3 example ====================================
  ======================================================================*/
}
int main (int argc, char *argv[]) {
  myscalar *A=NULL, *X=NULL, *B=NULL, *Y=NULL, *Xtrue=NULL;
  myscalar elem;
  int descA[BLACSCTXTSIZE], descVec[BLACSCTXTSIZE], descelem[BLACSCTXTSIZE];
  int n;
  int nrhs;
  int nb;
  int locr, locc;
  int i, j, ii, jj;
  int ierr;
  int dummy;
  int myid, np;
  int myrow, mycol, nprow, npcol;
  int ctxt;
  myreal rdummy, res;

  n=1024; /* Size of the problem */
  nrhs=3; /* Number of RHS */
  nb=16;  /* Blocksize for the 2D block-cyclic distribution */

  /* Initialize MPI */
  if((ierr=MPI_Init(&argc,&argv)))
    return 1;
  myid=-1;
  if((ierr=MPI_Comm_rank(MPI_COMM_WORLD,&myid)))
    return 1;
  np=-1;
  if((ierr=MPI_Comm_size(MPI_COMM_WORLD,&np)))
    return 1;

  /* Initialize the BLACS grid */
  nprow=floor(sqrt((float)np));
  npcol=np/nprow;
  blacs_get_(&IZERO,&IZERO,&ctxt);
  blacs_gridinit_(&ctxt,"R",&nprow,&npcol);
  blacs_gridinfo_(&ctxt,&nprow,&npcol,&myrow,&mycol);

  /* A is a dense n x n distributed Toeplitz matrix */
  if(myid<nprow*npcol) {
    locr=numroc_(&n,&nb,&myrow,&IZERO,&nprow);
    locc=numroc_(&n,&nb,&mycol,&IZERO,&npcol);
    A=new myscalar[locr*locc];
    dummy=std::max(1,locr);
    descinit_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr);

    myreal pi=3.1416, d=0.1;
    for(i=1;i<=locr;i++) {
      for(j=1;j<=locc;j++) {
        ii=indxl2g_(&i,&nb,&myrow,&IZERO,&nprow);
        jj=indxl2g_(&j,&nb,&mycol,&IZERO,&npcol);
        // Toeplitz matrix from Quantum Chemistry.
        A[locr*(j-1)+(i-1)]=ii==jj?std::pow(pi,2)/6.0/std::pow(d,2):std::pow(-1.0,ii-jj)/std::pow((myreal)ii-jj,2)/std::pow(d,2);
      }
    }
  } else {
    descset_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE);
  }

  /* Initializing solution */
  if(myid<nprow*npcol) {
    locr=numroc_(&n,&nb,&myrow,&IZERO,&nprow);
    locc=numroc_(&nrhs,&nb,&mycol,&IZERO,&npcol);
    dummy=std::max(1,locr);
    descinit_(descVec,&n,&nrhs,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr);
    Xtrue=new myscalar[locr*locc]();
    for(i=0;i<locr*locc;i++)
      Xtrue[i]=static_cast<myscalar>(rand())/(static_cast<myscalar>(RAND_MAX));
  } else {
    descset_(descVec,&n,&nrhs,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE);
  }

  /* Initializing solution and intermediate vector space */
  if(myid<nprow*npcol) {
    X=new myscalar[locr*locc]();
    Y=new myscalar[locr*locc]();
  }

  /* Initializing RHS as A*Xtrue */
  if(myid<nprow*npcol) {
    B=new myscalar[locr*locc]();
    pgemm('N','N',n,nrhs,n,ONE,A,IONE,IONE,descA,Xtrue,IONE,IONE,descVec,ZERO,B,IONE,IONE,descVec);
  }

  /* Initialize the solver and set parameters */
  StrumpackDensePackage<myscalar,myreal> sdp(MPI_COMM_WORLD);
  sdp.use_HSS=true;
  sdp.levels_HSS=4;
  sdp.min_rand_HSS=64;
  sdp.lim_rand_HSS=0;
  sdp.tol_HSS=1e-12;
  sdp.split_HSS=768; /* Size of A11 */

  /* Compression */
  sdp.compress(A,descA);

  /* Accuracy checking */
  sdp.check_compression(A,descA);

  /* Factorization */
  sdp.partially_factor(A,descA,sdp.split_HSS);

  /* Schur complement update */
  sdp.compute_schur();

  /* Extracting a random element from the Schur complement */
  if(myid<nprow*npcol) {
    descinit_(descelem,&IONE,&IONE,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr);
  } else {
    descset_(descelem,&IONE,&IONE,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE);
  }
  i=1+rand()%(n-sdp.split_HSS);
  j=1+rand()%(n-sdp.split_HSS);
  MPI_Bcast((void *)&i,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD);
  MPI_Bcast((void *)&j,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD);
  sdp.extract_schur(&elem,descelem,&i,IONE,&j,IONE);
  if(!myid)
    std::cout << "Element (" << i << "," << j << ") of Schur complement = " << elem << std::endl << std::endl;

  /* Condensation */
  sdp.reduce_RHS(Y,descVec,B,descVec);

  /* Solve the Schur complement system (touches only the bottom part of X)*/
  sdp.verbose=false;
  if(!myid)
    std::cout << "Solving Schur complement system with Conjugate Gradient..." << std::endl << std::endl;
  CG(&sdp,X,Y,descVec,n,nrhs,3000,1e-14);
  sdp.verbose=true;

  /* Expansion (touches only the top part of X) */
  sdp.expand_solution(X,descVec,Y,descVec);

  /* Accuracy checking */
  sdp.check_solution(A,descA,X,descVec,B,descVec);

  /* Forward error */
  if(myid<nprow*npcol) {
    for(i=0;i<locr*locc;i++)
      Y[i]=X[i]-Xtrue[i];
    res=plange('F',n,nrhs,Y,IONE,IONE,descVec,&rdummy);
    res/=plange('F',n,nrhs,Xtrue,IONE,IONE,descVec,&rdummy);
    if(!myid)
      std::cout << "Forward error = " << res << std::endl;
  }

  /* Statistics */
  sdp.print_statistics();

  /* Clean-up */
  delete[] A;
  delete[] B;
  delete[] X;
  delete[] Y;
  delete[] Xtrue;

  /* The end */
  MPI_Finalize();
  return 0;

}
void CG(StrumpackDensePackage<myscalar,myreal> *sdp, myscalar *X, myscalar *B, int *descVec, int n, int nrhs, int niter, myreal threshold) {
  /* Conjugate Gradients. Calls sdp.schur_product for matvecs. Fills only the bottom part of X. */
  int printit=250;
  int IA;
  int ctxt;
  int nprow, npcol;
  int myrow, mycol;
  int rsrc, csrc;
  int nb;
  int locr, locc;
  int i, j;
  int idummy, ierr;
  int neff;
  int it;
  int desc[BLACSCTXTSIZE], descScal[BLACSCTXTSIZE];
  bool ingrid;
  myreal res;
  myscalar *x=NULL, *b=NULL;
  myscalar *r=NULL, *p=NULL, *Ap=NULL;
  myscalar alpha, rrprev, rrnext;

  IA=sdp->split_HSS+1;
  neff=n-IA+1;

  ctxt=descVec[BLACSctxt];
  rsrc=descVec[BLACSrsrc];
  csrc=descVec[BLACScsrc];
  nb=descVec[BLACSmb];
  blacs_gridinfo_(&ctxt,&nprow,&npcol,&myrow,&mycol);
  ingrid=myrow>=0 && mycol>=0;

  /* X and B have n rows. We create versions with
   * the last neff rows only that we pass to SDP.
   */
  if(ingrid) {
    locr=numroc_(&neff,&nb,&myrow,&rsrc,&nprow);
    locc=numroc_(&IONE,&nb,&mycol,&csrc,&npcol);
    idummy=locr>1?locr:1;
    descinit_(desc,&neff,&IONE,&nb,&nb,&rsrc,&csrc,&ctxt,&idummy,&ierr);
    x=new myscalar[locr*locc]();
    b=new myscalar[locr*locc]();
    r=new myscalar[locr*locc]();
    p=new myscalar[locr*locc]();
    Ap=new myscalar[locr*locc]();
  } else {
    locr=0;
    locc=0;
    descset_(desc,&neff,&IONE,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE);
  }

  /* Descriptor for the scalar containing the result of the dot product */
  if(ingrid)
    descinit_(descScal,&IONE,&IONE,&nb,&nb,&IZERO,&IZERO,&ctxt,&IONE,&ierr);
  else
    descset_(descScal,&IONE,&IONE,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE);

  for(j=1;j<=nrhs;j++) {
    /* One RHS at a time */
    if(ingrid) {
      pgeadd('N',neff,IONE,ONE,B,IA,j,descVec,ZERO,b,IONE,IONE,desc);
      pgeadd('N',neff,IONE,ONE,X,IA,j,descVec,ZERO,x,IONE,IONE,desc);
    }

    /* r = b - A x
     * If the Schur was explitly in matrix A, the BLAS call would be
     * pgemm('N','N',neff,IONE,neff,NONE,A,IA,IA,descA,x,IONE,IONE,desc,ONE,r,IONE,IONE,desc);
     *
     */
    if(ingrid)
      placpy('N',neff,IONE,b,IONE,IONE,desc,r,IONE,IONE,desc);
    sdp->schur_product('N',NONE,x,desc,ONE,r,desc);

    /* p = r */
    if(ingrid)
      placpy('N',neff,IONE,r,IONE,IONE,desc,p,IONE,IONE,desc);

    /* "Previous" r' * r */
    rrprev=ZERO;
    if(ingrid)
      pgemm('C','N',IONE,IONE,neff,ONE,r,IONE,IONE,desc,r,IONE,IONE,desc,ZERO,&rrprev,IONE,IONE,descScal);
    MPI_Bcast((void *)&rrprev,IONE,MY_MPI_REAL,IZERO,MPI_COMM_WORLD);

    it=1;
    while(it<=niter) {
      /* Ap = A * p
       * If the Schur was explitly in matrix A, the BLAS call would be
       * pgemm('N','N',neff,IONE,neff,ONE,A,IA,IA,descA,p,IONE,IONE,desc,ZERO,Ap,IONE,IONE,desc);
       *
       */
      sdp->schur_product('N',ONE,p,desc,ZERO,Ap,desc);

      /* alpha = r'*r / (p' * A * p) = rrprev/(p' * Ap) */
      alpha=ZERO;
      if(ingrid)
        pgemm('C','N',IONE,IONE,neff,ONE,p,IONE,IONE,desc,Ap,IONE,IONE,desc,ZERO,&alpha,IONE,IONE,descScal);
      MPI_Bcast((void *)&alpha,IONE,MY_MPI_REAL,IZERO,MPI_COMM_WORLD);
      alpha=rrprev/alpha;

      /* x = x + alpha * p */
      for(i=0;i<locr*locc;i++)
        x[i]+=alpha*p[i];

      /* r = r - alpha * Ap */
      for(i=0;i<locr*locc;i++)
        r[i]-=alpha*Ap[i];

      /* "Next" r' * r */
      rrnext=ZERO;
      if(ingrid)
        pgemm('C','N',IONE,IONE,neff,ONE,r,IONE,IONE,desc,r,IONE,IONE,desc,ZERO,&rrnext,IONE,IONE,descScal);
      MPI_Bcast((void *)&rrnext,IONE,MY_MPI_REAL,IZERO,MPI_COMM_WORLD);

      /* Residual */
      res=sqrt(rrnext.real());
      if(it%printit==0)
        if(!myrow && !mycol)
          std::cout << "RHS " << j << ": iteration " << it << ", ||Ax-b||/||b||=" << res << std::endl;
      if(res<threshold)
        break;

      /* p = r + rrnext/rrprev * p */
      for(i=0;i<locr*locc;i++)
        p[i]=r[i]+rrnext/rrprev*p[i];

      /* "Previous" r' * r */
      rrprev=rrnext;

      it++;
    }

    if(it>niter)
      it=niter;

    if(it%printit)
      if(!myrow && !mycol)
        std::cout << "RHS " << j << ": iteration " << it << ", ||Ax-b||/||b||=" << res << std::endl << std::endl;

    /* Back to n-sized vector */
    if(ingrid)
      pgeadd('N',neff,IONE,ONE,x,IONE,IONE,desc,ZERO,X,IA,j,descVec);

  }

  delete[] b;
  delete[] x;
  delete[] r;
  delete[] p;
  delete[] Ap;
}
int main (int argc, char *argv[]) {
  myscalar *A=NULL, *B=NULL, *Btrue=NULL;
  int descA[BLACSCTXTSIZE], descB[BLACSCTXTSIZE];
  int n;
  int nb;
  int locr, locc;
  int i, j, ii, jj;
  int *I, *J;
  int nI, nJ;
  int ierr;
  int dummy;
  int myid, np;
  int myrow, mycol, nprow, npcol;
  int ctxt;
  myreal err;

  n=1024; /* Size of the problem */
  nb=16;  /* Blocksize for the 2D block-cyclic distribution */

  /* Initialize MPI */
  if((ierr=MPI_Init(&argc,&argv)))
    return 1;
  myid=-1;
  if((ierr=MPI_Comm_rank(MPI_COMM_WORLD,&myid)))
    return 1;
  np=-1;
  if((ierr=MPI_Comm_size(MPI_COMM_WORLD,&np)))
    return 1;

  /* Initialize the BLACS grid */
  nprow=floor(sqrt((float)np));
  npcol=np/nprow;
  blacs_get_(&IZERO,&IZERO,&ctxt);
  blacs_gridinit_(&ctxt,"R",&nprow,&npcol);
  blacs_gridinfo_(&ctxt,&nprow,&npcol,&myrow,&mycol);

  /* A is a dense n x n distributed Toeplitz matrix */
  if(myid<nprow*npcol) {
    locr=numroc_(&n,&nb,&myrow,&IZERO,&nprow);
    locc=numroc_(&n,&nb,&mycol,&IZERO,&npcol);
    A=new myscalar[locr*locc];
    dummy=std::max(1,locr);
    descinit_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr);

    for(i=1;i<=locr;i++)
      for(j=1;j<=locc;j++) {
        ii=indxl2g_(&i,&nb,&myrow,&IZERO,&nprow);
        jj=indxl2g_(&j,&nb,&mycol,&IZERO,&npcol);
        // Toeplitz matrix from Quantum Chemistry.
        myreal pi=3.1416, d=0.1;
        A[locr*(j-1)+(i-1)]=ii==jj?std::pow(pi,2)/6.0/std::pow(d,2):std::pow(-1.0,ii-jj)/std::pow((myreal)ii-jj,2)/std::pow(d,2);
      }
  } else {
    descset_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE);
  }

  /* Initialize the solver and set parameters */
  StrumpackDensePackage<myscalar,myreal> sdp(MPI_COMM_WORLD);
  sdp.use_HSS=true;
  sdp.levels_HSS=4;
  sdp.min_rand_HSS=64;
  sdp.lim_rand_HSS=0;
  sdp.tol_HSS=1e-6;

  /* Compression */
  sdp.compress(A,descA);

  /* Accuracy checking */
  sdp.check_compression(A,descA);

  /* Element extraction: a bunch of random indices.
   * Not that duplicates do not matter (the code works).
   */
  nI=1+rand()%n;
  MPI_Bcast((void*)&nI,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD);
  I=new int[nI];
  if(!myid)
    for(i=0;i<nI;i++)
      I[i]=1+rand()%n;
  MPI_Bcast((void*)I,nI,MPI_INTEGER,IZERO,MPI_COMM_WORLD);
  nJ=1+rand()%n;
  MPI_Bcast((void*)&nJ,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD);
  J=new int[nJ];
  if(!myid)
    for(j=0;j<nJ;j++)
      J[j]=1+rand()%n;
  MPI_Bcast((void*)J,nJ,MPI_INTEGER,IZERO,MPI_COMM_WORLD);

  /* Extraction for the HSS form */
  if(myid<nprow*npcol) {
    locr=numroc_(&nI,&nb,&myrow,&IZERO,&nprow);
    locc=numroc_(&nJ,&nb,&mycol,&IZERO,&npcol);
    B=new myscalar[locr*locc]();
    dummy=std::max(1,locr);
    descinit_(descB,&nI,&nJ,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr);
  } else
    descset_(descB,&nI,&nJ,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE);
  sdp.extract(A,descA,B,descB,I,nI,J,nJ);

  /* Extraction from the original matrix just to compare */
  sdp.use_HSS=false;
  if(myid<nprow*npcol)
    Btrue=new myscalar[locr*locc];
  sdp.extract(A,descA,Btrue,descB,I,nI,J,nJ);

  /* Comparison with elements of input matrix */
  if(myid<nprow*npcol){
    err=plange('M',nI,nJ,Btrue,IONE,IONE,descB,(myreal*)NULL);
    for(i=0;i<locr*locc;i++)
      Btrue[i]-=B[i];
    err=plange('M',nI,nJ,Btrue,IONE,IONE,descB,(myreal*)NULL)/err;
  }
  if(!myid)
    std::cout << "Element extraction (" << nI << "x" << nJ << " submatrix): maximum relative error max ||A(I,J)-HSS(I,J)||//||A(I,J)|| = " << err << std::endl << std::endl;

  /* Statistics */
  sdp.print_statistics();

  /* Clean-up */
  delete[] A;
  delete[] B;
  delete[] Btrue;
  delete[] I;
  delete[] J;

  /* The end */
  MPI_Finalize();
  return 0;

}