/* Square matrix-matrix multiplication */
void matrix_multiply(int M, int N, int K, 
		int blockX_len, int blockY_len) 
{
	/* Local buffers and Global arrays declaration */
	double *a=NULL, *b=NULL, *c=NULL;

	int dims[NDIMS], ld[NDIMS], chunks[NDIMS];
	int lo[NDIMS], hi[NDIMS], cdims[NDIMS]; /* dim of blocks */

	int g_a, g_b, g_c, g_cnt, g_cnt2;
	int offset;
	double alpha = 1.0, beta=0.0;
	int count_p = 0, next_p = 0;
	int count_gac = 0, next_gac = 0;
	double t1,t2,seconds;
        ga_nbhdl_t nbh;
        int count_acc = 0;

	/* Find local processor ID and the number of processes */
	int proc=GA_Nodeid(), nprocs=GA_Nnodes();

	if ((M % blockX_len) != 0 || (M % blockY_len) != 0 || (N % blockX_len) != 0 || (N % blockY_len) != 0 
			|| (K % blockX_len) != 0 || (K % blockY_len) != 0)
		GA_Error("Dimension size M/N/K is not divisible by X/Y block sizes", 101);

	/* Allocate/Set process local buffers */
	a = malloc (blockX_len * blockY_len * sizeof(double)); 
	b = malloc (blockX_len * blockY_len * sizeof(double)); 
	c = malloc (blockX_len * blockY_len * sizeof(double));

	cdims[0] = blockX_len;
	cdims[1] = blockY_len;	

	/* Configure array dimensions */
	for(int i = 0; i < NDIMS; i++) {
		dims[i]  = N;
		chunks[i] = -1;
		ld[i]    = cdims[i]; /* leading dimension/stride of the local buffer */
	}

	/* create a global array g_a and duplicate it to get g_b and g_c*/
	g_a = NGA_Create(C_DBL, NDIMS, dims, "array A", chunks);

	if (!g_a) 
		GA_Error("NGA_Create failed: A", NDIMS);

#if DEBUG>1
	if (proc == 0) 
		printf("  Created Array A\n");
#endif
	/* Ditto for C and B */
	g_b = GA_Duplicate(g_a, "array B");
	g_c = GA_Duplicate(g_a, "array C");

	if (!g_b || !g_c) 
		GA_Error("GA_Duplicate failed",NDIMS);
	if (proc == 0) 
		printf("Created Arrays B and C\n");

	/* Subscript array for read-incr, which is nothing but proc */
	int * rdcnt = malloc (nprocs * sizeof(int));
	memset (rdcnt, 0, nprocs * sizeof(int));
	int * rdcnt2 = malloc (nprocs * sizeof(int));
	memset (rdcnt2, 0, nprocs * sizeof(int));

	/* Create global array of nprocs elements for nxtval */	
	int counter_dim[1];
	counter_dim[0] = nprocs;

	g_cnt = NGA_Create(C_INT, 1, counter_dim, "Shared counter", NULL);

	if (!g_cnt) 
		GA_Error("Shared counter failed",1);

	g_cnt2 = GA_Duplicate(g_cnt, "another shared counter");

	if (!g_cnt2) 
		GA_Error("Another shared counter failed",1);

	GA_Zero(g_cnt);
	GA_Zero(g_cnt2);

#if DEBUG>1	
	/* initialize data in matrices a and b */
	if(proc == 0)
		printf("Initializing local buffers - a and b\n");
#endif
	int w = 0; 
	int l = 7;
	for(int i = 0; i < cdims[0]; i++) {
		for(int j = 0; j < cdims[1]; j++) {
			a[i*cdims[1] + j] = (double)(++w%29);
			b[i*cdims[1] + j] = (double)(++l%37);
		}
	}

	/* Copy data to global arrays g_a and g_b from local buffers */
	next_p = NGA_Read_inc(g_cnt2,&rdcnt[proc],(long)1);
	for (int i = 0; i < N; i+=cdims[0]) 
	{
		if (next_p == count_p) {
			for (int j = 0; j < N; j+=cdims[1])
			{
				/* Indices of patch */
				lo[0] = i;
				lo[1] = j;
				hi[0] = lo[0] + cdims[0];
				hi[1] = lo[1] + cdims[1];

				hi[0] = hi[0]-1;
				hi[1] = hi[1]-1;
#if DEBUG>1
				printf ("%d: PUT_GA_A_B: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]);
#endif
				NGA_Put(g_a, lo, hi, a, ld);
				NGA_Put(g_b, lo, hi, b, ld);

			}
			next_p = NGA_Read_inc(g_cnt2,&rdcnt[proc],(long)1);
		}		
		count_p++;
	}


#if DEBUG>1
	printf ("After NGA_PUT to global - A and B arrays\n");
#endif
	/* Synchronize all processors to make sure puts from 
	   nprocs has finished before proceeding with dgemm */
	GA_Sync();

	t1 = GA_Wtime();

	next_gac = NGA_Read_inc(g_cnt,&rdcnt2[proc],(long)1);
	for (int m = 0; m < N; m+=cdims[0])
	{
		for (int k = 0; k < N; k+=cdims[0])
		{
			if (next_gac == count_gac)	
			{
				/* A = m x k */
				lo[0] = m; lo[1] = k;
				hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1];

				hi[0] = hi[0]-1; hi[1] = hi[1]-1;
#if DEBUG>3
				printf ("%d: GET GA_A: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]);
#endif
				NGA_Get(g_a, lo, hi, a, ld);

				for (int n = 0; n < N; n+=cdims[1])
				{
					memset (c, 0, sizeof(double) * cdims[0] * cdims[1]);
					/* B = k x n */
					lo[0] = k; lo[1] = n;
					hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1];				

					hi[0] = hi[0]-1; hi[1] = hi[1]-1;
#if DEBUG>3
					printf ("%d: GET_GA_B: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]);
#endif
					NGA_Get(g_b, lo, hi, b, ld);


					//_my_dgemm_ (a, local_N, b, local_N, c, local_N, local_N, local_N, local_N, alpha, beta=1.0);

					/* TODO I am assuming square matrix blocks, further testing/work 
					   required for rectangular matrices */
					cblas_dgemm ( CblasRowMajor, CblasNoTrans, /* TransA */CblasNoTrans, /* TransB */
							cdims[0] /* M */, cdims[1] /* N */, cdims[0] /* K */, alpha,
							a, cdims[0], /* lda */ b, cdims[1], /* ldb */
							beta=1.0, c, cdims[0] /* ldc */);

					NGA_NbWait(&nbh);

					/* C = m x n */
					lo[0] = m; lo[1] = n;
					hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1];				

					hi[0] = hi[0]-1; hi[1] = hi[1]-1;
#if DEBUG>3
					printf ("%d: ACC_GA_C: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]);
#endif
					NGA_NbAcc(g_c, lo, hi, c, ld, &alpha, &nbh);
					count_acc += 1;
				} /* END LOOP N */
				next_gac = NGA_Read_inc(g_cnt,&rdcnt2[proc],(long)1);
			} /* ENDIF if count == next */
			count_gac++;
		} /* END LOOP K */
	} /* END LOOP M */

	GA_Sync();
	t2 = GA_Wtime();
	seconds = t2 - t1;
	if (proc == 0)
		printf("Time taken for MM (secs):%lf \n", seconds);

        printf("Number of ACC: %d\n", count_acc);

	/* Correctness test - modify data again before this function */
	for (int i = 0; i < NDIMS; i++) {
		lo[i] = 0;
		hi[i] = dims[i]-1;
		ld[i] = dims[i];
	}

	verify(g_a, g_b, g_c, lo, hi, ld, N);

	/* Clear local buffers */
	free(a);
	free(b);
	free(c);
	free(rdcnt);
	free(rdcnt2);

	GA_Sync();

	/* Deallocate arrays */
	GA_Destroy(g_a);
	GA_Destroy(g_b);
	GA_Destroy(g_c);
	GA_Destroy(g_cnt);
	GA_Destroy(g_cnt2);
}
Example #2
0
int main(int argc, char **argv) {
  int nmax, nprocs, me, me_plus;
  int g_a_data, g_a_i, g_a_j, isize;
  int gt_a_data, gt_a_i, gt_a_j;
  int g_b, g_c;
  int i, j, jj, k, one, jcnt;
  int chunk, kp1, ld;
  int *p_i, *p_j;
  double *p_data, *p_b, *p_c;
  double t_beg, t_beg2, t_ga_tot, t_get, t_mult, t_cnstrct, t_mpi_in, t_ga_in;
  double t_hypre_strct, t_ga_trans, t_gp_get;
  double t_get_blk_csr, t_trans_blk_csr, t_trans_blk, t_create_csr_ga, t_beg3;
  double t_gp_tget, t_gp_malloc, t_gp_assign, t_beg4;
  double prdot, dotga, dothypre, tempc;
  double prtot, gatot, hypretot, gatot2, hypretot2;
  double prdot2, prtot2;
  int status;
  int idim, jdim, kdim, idum, memsize;
  int lsize, ntot;
  int heap=200000, fudge=100, stack=200000, ma_heap;
  double *cbuf, *vector;
  int pdi, pdj, pdk, ip, jp, kp, ncells;
  int lo[3],hi[3];
  int blo[3], bhi[3];
  int ld_a, ld_b, ld_c, ld_i, ld_j, irows, ioff, joff, total_procs;
  int iproc, iblock, btot;
  double *amat, *bvec;
  int *ivec, *jvec;
  int *proclist, *proc_inv, *icnt;
  int *voffset, *offset, *mapc;
  int iloop, lo_bl, hi_bl;
  char *buf, **buf_ptr;
  int *iparams, *jval, *ival;
  double *rval, *rvalt;
  int imin, imax, jmin, jmax, irow, icol, nnz;
  int nrows, kmin, kmax, lmin, lmax, jdx;
  int LOOPNUM = 100;
  void **blk_ptr;
  void *blk;
  int blk_size, tsize, zero;
  int *iblk, *jblk, *blkidx;
  int *tblk_ptr;
  int *ivalt, *jvalt, *iparamst;
  int *iblk_t, *jblk_t, *blkidx_t;
/*
   Hypre declarations
*/
  int ierr;
#if USE_HYPRE
  HYPRE_StructGrid grid;
  HYPRE_StructStencil stencil;
  HYPRE_StructMatrix matrix;
  HYPRE_StructVector vec_x, vec_y;
  int i4, j4, ndim, nelems, offsets[7][3];
  int stencil_indices[7], hlo[3], hhi[3];
  double weights[7];
  double *values;
  double alpha, beta;
  int *rows, *cols;
#endif
/*
  ***  Intitialize a message passing library
*/
  zero = 0;
  one = 1;
  ierr = MPI_Init(&argc, &argv);
/*
 ***  Initialize GA
 
      There are 2 choices: ga_initialize or ga_initialize_ltd.
      In the first case, there is no explicit limit on memory usage.
      In the second, user can set limit (per processor) in bytes.
*/
  t_beg = GA_Wtime();
  NGA_Initialize();
  GP_Initialize();
  t_ga_in = GA_Wtime() - t_beg;
  NGA_Dgop(&t_ga_in,one,"+");

  t_ga_tot = 0.0;
  t_ga_trans = 0.0;
  t_get_blk_csr = 0.0;
  t_create_csr_ga = 0.0;
  t_trans_blk_csr = 0.0;
  t_trans_blk = 0.0;
  t_gp_get = 0.0;
  t_gp_malloc = 0.0;
  t_gp_assign = 0.0;
  t_mult = 0.0;
  t_get = 0.0;
  t_gp_tget = 0.0;
  t_hypre_strct = 0.0;
  prtot = 0.0;
  prtot2 = 0.0;
  gatot = 0.0;
  hypretot = 0.0;

  me = NGA_Nodeid();
  me_plus = me + 1;
  nprocs = NGA_Nnodes();
  if (me == 0) {
   printf("Time to initialize GA:                                 %12.4f\n",
          t_ga_in/((double)nprocs));
  }
/*
     we can also use GA_set_memory_limit BEFORE first ga_create call
*/
  ma_heap = heap + fudge;
/*      call GA_set_memory_limit(util_mdtob(ma_heap)) */
 
  if (me == 0) {
    printf("\nNumber of cores used: %d\n\nGA initialized\n\n",nprocs);
  }
/*
 ***  Initialize the MA package
      MA must be initialized before any global array is allocated
*/
  if (!MA_init(MT_DBL, stack, ma_heap)) NGA_Error("ma_init failed",-1);
/*
     create a sparse LMAX x LMAX matrix and two vectors of length
     LMAX. The matrix is stored in compressed row format.
     One of the vectors is filled with random data and the other
     is filled with zeros.
*/
  idim = IMAX;
  jdim = JMAX;
  kdim = KMAX;
  ntot = idim*jdim*kdim;
  if (me == 0) {
    printf("\nDimension of matrix: %d\n\n",ntot);
  }
  t_beg = GA_Wtime();
  grid_factor(nprocs,idim,jdim,kdim,&pdi,&pdj,&pdk);
  if (me == 0) {
    printf("\nProcessor grid configuration\n");
    printf("  PDX: %d\n",pdi);
    printf("  PDY: %d\n",pdj);
    printf("  PDZ: %d\n\n",pdk);
    printf(" Number of Loops: %d\n",LOOPNUM);
  }

  create_laplace_mat(idim,jdim,kdim,pdi,pdj,pdk,&g_a_data,&g_a_j,&g_a_i,&mapc);
  t_cnstrct = GA_Wtime() - t_beg;

  g_b = NGA_Create_handle();
  NGA_Set_data(g_b,one,&ntot,MT_DBL);
  NGA_Set_irreg_distr(g_b,mapc,&nprocs);
  status = NGA_Allocate(g_b);
/*
    fill g_b with random values
*/
  NGA_Distribution(g_b,me,blo,bhi);
  NGA_Access(g_b,blo,bhi,&p_b,&ld);
  ld = bhi[0]-blo[0]+1;
  btot = ld;
  vector = (double*)malloc(ld*sizeof(double));
  for (i=0; i<ld; i++) {
    idum  = 0;
    p_b[i] = ran3(&idum);
    vector[i] = p_b[i];
  }
  NGA_Release(g_b,blo,bhi);
  NGA_Sync();

  g_c = NGA_Create_handle();
  NGA_Set_data(g_c,one,&ntot,MT_DBL);
  NGA_Set_irreg_distr(g_c,mapc,&nprocs);
  status = NGA_Allocate(g_c);
  NGA_Zero(g_c);
#if USE_HYPRE
/*
    Assemble HYPRE grid and use that to create matrix. Start by creating
    grid partition
*/
  ndim = 3;
  i = me;
  ip = i%pdi;
  i = (i-ip)/pdi;
  jp = i%pdj;
  kp = (i-jp)/pdj;
  lo[0] = (int)(((double)idim)*((double)ip)/((double)pdi));
  if (ip < pdi-1) {
    hi[0] = (int)(((double)idim)*((double)(ip+1))/((double)pdi)) - 1;
  } else {
    hi[0] = idim - 1;
  }
  lo[1] = (int)(((double)jdim)*((double)jp)/((double)pdj));
  if (jp < pdj-1) {
    hi[1] = (int)(((double)jdim)*((double)(jp+1))/((double)pdj)) - 1;
  } else {
    hi[1] = jdim - 1;
  }
  lo[2] = (int)(((double)kdim)*((double)kp)/((double)pdk));
  if (kp < pdk-1) {
    hi[2] = (int)(((double)kdim)*((double)(kp+1))/((double)pdk)) - 1;
  } else {
    hi[2] = kdim - 1;
  }
/*
   Create grid
*/
  hlo[0] = lo[0];
  hlo[1] = lo[1];
  hlo[2] = lo[2];
  hhi[0] = hi[0];
  hhi[1] = hi[1];
  hhi[2] = hi[2];
  ierr = HYPRE_StructGridCreate(MPI_COMM_WORLD, ndim, &grid);
  ierr = HYPRE_StructGridSetExtents(grid, hlo, hhi);
  ierr = HYPRE_StructGridAssemble(grid);
/*
   Create stencil
*/
  offsets[0][0] = 0;
  offsets[0][1] = 0;
  offsets[0][2] = 0;

  offsets[1][0] = 1;
  offsets[1][1] = 0;
  offsets[1][2] = 0;

  offsets[2][0] = 0;
  offsets[2][1] = 1;
  offsets[2][2] = 0;

  offsets[3][0] = 0;
  offsets[3][1] = 0;
  offsets[3][2] = 1;

  offsets[4][0] = -1;
  offsets[4][1] = 0;
  offsets[4][2] = 0;

  offsets[5][0] = 0;
  offsets[5][1] = -1;
  offsets[5][2] = 0;

  offsets[6][0] = 0;
  offsets[6][1] = 0;
  offsets[6][2] = -1;

  nelems = 7;
  ierr = HYPRE_StructStencilCreate(ndim, nelems, &stencil);
  for (i=0; i<nelems; i++) {
    ierr = HYPRE_StructStencilSetElement(stencil, i, offsets[i]);
  }

  ncells = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
  jcnt = 7*ncells;
  values = (double*)malloc(jcnt*sizeof(double));
  jcnt = 0;
  weights[0] = 6.0;
  weights[1] = -1.0;
  weights[2] = -1.0;
  weights[3] = -1.0;
  weights[4] = -1.0;
  weights[5] = -1.0;
  weights[6] = -1.0;
  for (i=0; i<ncells; i++) {
    for (j=0; j<7; j++) {
      values[jcnt] = weights[j];
      jcnt++;
    }
  }

  ierr = HYPRE_StructMatrixCreate(MPI_COMM_WORLD, grid, stencil, &matrix);
  ierr = HYPRE_StructMatrixInitialize(matrix);
  for (i=0; i<7; i++) {
    stencil_indices[i] = i;
  }
  ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, 7, stencil_indices, values);
  free(values);
/*
   Check all six sides of current box to see if any are boundaries.
   Set values to zero if they are.
*/
  if (hi[0] == idim-1) {
    ncells = (hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
    hlo[0] = idim-1;
    hhi[0] = idim-1;
    hlo[1] = lo[1];
    hhi[1] = hi[1];
    hlo[2] = lo[2];
    hhi[2] = hi[2];
    values = (double*)malloc(ncells*sizeof(double));
    for (i=0; i<ncells; i++) values[i] = 0.0;
    i4 = 1;
    j4 = 1;
    ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
    free(values);
  }
  if (hi[1] == jdim-1) {
    ncells = (hi[0]-lo[0]+1)*(hi[2]-lo[2]+1);
    hlo[0] = lo[0];
    hhi[0] = hi[0];
    hlo[1] = jdim-1;
    hhi[1] = jdim-1;
    hlo[2] = lo[2];
    hhi[2] = hi[2];
    values = (double*)malloc(ncells*sizeof(double));
    for (i=0; i<ncells; i++) values[i] = 0.0;
    i4 = 1;
    j4 = 2;
    ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
    free(values);
  } 
  if (hi[2] == kdim-1) {
    ncells = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1);
    hlo[0] = lo[0];
    hhi[0] = hi[0];
    hlo[1] = lo[1];
    hhi[1] = hi[1];
    hlo[2] = kdim-1;
    hhi[2] = kdim-1;
    values = (double*)malloc(ncells*sizeof(double));
    for (i=0; i<ncells; i++) values[i] = 0.0;
    i4 = 1;
    j4 = 3;
    ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
    free(values);
  }
  if (lo[0] == 0) {
    ncells = (hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
    hlo[0] = 0;
    hhi[0] = 0;
    hlo[1] = lo[1];
    hhi[1] = hi[1];
    hlo[2] = lo[2];
    hhi[2] = hi[2];
    values = (double*)malloc(ncells*sizeof(double));
    for (i=0; i<ncells; i++) values[i] = 0.0;
    i4 = 1;
    j4 = 4;
    ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
    free(values);
  }
  if (lo[1] == 0) {
    ncells = (hi[0]-lo[0]+1)*(hi[2]-lo[2]+1);
    hlo[0] = lo[0];
    hhi[0] = hi[0];
    hlo[1] = 0;
    hhi[1] = 0;
    hlo[2] = lo[2];
    hhi[2] = hi[2];
    values = (double*)malloc(ncells*sizeof(double));
    for (i=0; i<ncells; i++) values[i] = 0.0;
    i4 = 1;
    j4 = 5;
    ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
    free(values);
  }
  if (lo[2] == 1) {
    ncells = (hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
    hlo[0] = lo[0];
    hhi[0] = hi[0];
    hlo[1] = lo[1];
    hhi[1] = hi[1];
    hlo[2] = 0;
    hhi[2] = 0;
    values = (double*)malloc(ncells*sizeof(double));
    for (i=0; i<ncells; i++) values[i] = 0.0;
    i4 = 1;
    j4 = 6;
    ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
    free(values);
  }
  ierr = HYPRE_StructMatrixAssemble(matrix);
/*
    Create vectors for matrix-vector multiply
*/
  ierr = HYPRE_StructVectorCreate(MPI_COMM_WORLD, grid, &vec_x);
  ierr = HYPRE_StructVectorInitialize(vec_x);
  hlo[0] = lo[0];
  hlo[1] = lo[1];
  hlo[2] = lo[2];
  hhi[0] = hi[0];
  hhi[1] = hi[1];
  hhi[2] = hi[2];
  ierr = HYPRE_StructVectorSetBoxValues(vec_x, hlo, hhi, vector);
  ierr = HYPRE_StructVectorAssemble(vec_x);
  NGA_Distribution(g_a_i,me,blo,bhi);

  if (bhi[1] > ntot-1) {
    bhi[1] = ntot-1;
  }

  btot = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);

  for (i=0; i<btot; i++) vector[i] = 0.0;
  hlo[0] = lo[0];
  hlo[1] = lo[1];
  hlo[2] = lo[2];
  hhi[0] = hi[0];
  hhi[1] = hi[1];
  hhi[2] = hi[2];
  ierr = HYPRE_StructVectorGetBoxValues(vec_x, hlo, hhi, vector);

  for (i=0; i<btot; i++) vector[i] = 0.0;
  ierr = HYPRE_StructVectorCreate(MPI_COMM_WORLD, grid, &vec_y);
  ierr = HYPRE_StructVectorInitialize(vec_y);
  ierr = HYPRE_StructVectorSetBoxValues(vec_y, hlo, hhi, vector);
  ierr = HYPRE_StructVectorAssemble(vec_y);
#endif
/* Multiply sparse matrix. Start by accessing pointers to local portions of
   g_a_data, g_a_j, g_a_i */

  NGA_Sync();
  for (iloop=0; iloop<LOOPNUM; iloop++) {
    t_beg2 = GA_Wtime();

    NGA_Distribution(g_c,me,blo,bhi);
    NGA_Access(g_c,blo,bhi,&p_c,&ld_c);
    for (i = 0; i<bhi[0]-blo[0]+1; i++) {
      p_c[i] = 0.0;
    }

/* get number of matrix blocks coupled to this process */
    NGA_Get(g_a_i,&me,&me,&lo_bl,&one);
#if 1
    NGA_Get(g_a_i,&me_plus,&me_plus,&hi_bl,&one);
    hi_bl--;
    total_procs = hi_bl - lo_bl + 1;
    blk_ptr = (void**)malloc(sizeof(void*));
/* Loop through matrix blocks */
    ioff = 0;
    for (iblock = 0; iblock<total_procs; iblock++) {
      t_beg = GA_Wtime();
      jdx = lo_bl+iblock;
#if 0
      GP_Access_element(g_a_data, &jdx, &blk_ptr[0], &isize);
#endif
#if 1
      GP_Get_size(g_a_data, &jdx, &jdx, &isize);
#endif
      blk = (void*)malloc(isize);
#if 1
      GP_Get(g_a_data, &jdx, &jdx, blk, blk_ptr, &one, &blk_size, &one, &tsize, 0); 
#endif
      t_gp_get = t_gp_get + GA_Wtime() - t_beg;
      iparams = (int*)blk_ptr[0];
      rval = (double*)(iparams+7);
      imin = iparams[0];
      imax = iparams[1];
      jmin = iparams[2];
      jmax = iparams[3];
      irow = iparams[4];
      icol = iparams[5];
      nnz = iparams[6];
      jval = (int*)(rval+nnz);
      ival = (int*)(jval+nnz);
      nrows = imax - imin + 1;
      bvec = (double*)malloc((jmax-jmin+1)*sizeof(double));
      j = 0;
      t_beg = GA_Wtime();
      NGA_Get(g_b,&jmin,&jmax,bvec,&j);
      t_get = t_get + GA_Wtime() - t_beg;
      t_beg = GA_Wtime();
      for (i=0; i<nrows; i++) {
        kmin = ival[i];
        kmax = ival[i+1]-1;
        tempc = 0.0;
        for (j = kmin; j<=kmax; j++) {
          jj = jval[j];
          tempc = tempc + rval[j]*bvec[jj];
        }
        p_c[i] = p_c[i] + tempc;
      }
      t_mult = t_mult + GA_Wtime() - t_beg;
      free(bvec);
      free(blk);
    }
    NGA_Sync();
    t_ga_tot = t_ga_tot + GA_Wtime() - t_beg2;

    NGA_Distribution(g_c,me,blo,bhi);
    NGA_Release(g_c,blo,bhi);

#if USE_HYPRE
    alpha = 1.0;
    beta = 0.0;
    t_beg = GA_Wtime();
    ierr = HYPRE_StructMatrixMatvec(alpha, matrix, vec_x, beta, vec_y);
    t_hypre_strct = t_hypre_strct + GA_Wtime() - t_beg;
    hlo[0] = lo[0];
    hlo[1] = lo[1];
    hlo[2] = lo[2];
    hhi[0] = hi[0];
    hhi[1] = hi[1];
    hhi[2] = hi[2];
    ierr = HYPRE_StructVectorGetBoxValues(vec_y, hlo, hhi, vector);
    NGA_Distribution(g_c,me,hlo,hhi);
    cbuf = (double*)malloc((hhi[0]-hlo[0]+1)*sizeof(double));
    NGA_Get(g_c,hlo,hhi,cbuf,&one);
    prdot = 0.0;
    dotga = 0.0;
    dothypre = 0.0;
    for (i=0; i<(hhi[0]-hlo[0]+1); i++) {
      dothypre = dothypre + vector[i]*vector[i];
      dotga = dotga + cbuf[i]*cbuf[i];
      prdot = prdot + (vector[i]-cbuf[i])*(vector[i]-cbuf[i]);
    }
    NGA_Dgop(&dotga,1,"+");
    NGA_Dgop(&dothypre,1,"+");
    NGA_Dgop(&prdot,1,"+");
    gatot += sqrt(dotga);
    hypretot += sqrt(dothypre);
    prtot += sqrt(prdot);
    free(cbuf);
#endif

/* Transpose matrix. Start by making local copies of ival and jval arrays for
   the sparse matrix of blocks stored in the GP array */
#if 1
    t_beg2 = GA_Wtime();
    t_beg3 = GA_Wtime();
    iblk = (int*)malloc((nprocs+1)*sizeof(int));
    iblk_t = (int*)malloc((nprocs+1)*sizeof(int));
#if 0
    NGA_Get(g_a_i,&zero,&nprocs,iblk,&one);
#else
    if (me == 0) {
      NGA_Get(g_a_i,&zero,&nprocs,iblk,&one);
    } else {
      for (i=0; i<nprocs+1; i++) {
        iblk[i] = 0;
      }
    }
    GA_Igop(iblk,nprocs+1,"+");
#endif
    jblk = (int*)malloc(iblk[nprocs]*sizeof(int));
    jblk_t = (int*)malloc(iblk[nprocs]*sizeof(int));
    iblock = iblk[nprocs]-1;
#if 0
    NGA_Get(g_a_j,&zero,&iblock,jblk,&one);
#else
    if (me == 0) {
      NGA_Get(g_a_j,&zero,&iblock,jblk,&one);
    } else {
      for (i=0; i<iblock+1; i++) {
        jblk[i] = 0;
      }
    }
    GA_Igop(jblk,iblock+1,"+");
#endif
    iblock++;
    blkidx = (int*)malloc(iblk[nprocs]*sizeof(int));
    blkidx_t = (int*)malloc(iblk[nprocs]*sizeof(int));
    for (i=0; i<iblock; i++) {
      blkidx[i] = i;
    }
    iblock = nprocs;
    t_get_blk_csr = t_get_blk_csr + GA_Wtime() - t_beg3;
    t_beg3 = GA_Wtime();
    stran(iblock, iblock, iblk, jblk, blkidx, iblk_t, jblk_t, blkidx_t);
    t_trans_blk_csr = t_trans_blk_csr + GA_Wtime() - t_beg3;
    t_beg3 = GA_Wtime();
    gt_a_data = GP_Create_handle();
    i = iblk_t[nprocs];
    GP_Set_dimensions(gt_a_data, one, &i);
    GP_Set_irreg_distr(gt_a_data, iblk_t, &nprocs);
    GP_Allocate(gt_a_data);

    gt_a_j = NGA_Create_handle();
    i = iblk_t[nprocs];
    NGA_Set_data(gt_a_j, one, &i, C_INT);
    NGA_Set_irreg_distr(gt_a_j, iblk_t, &nprocs);
    NGA_Allocate(gt_a_j);

    gt_a_i = NGA_Create_handle();
    i = nprocs+1;
    NGA_Set_data(gt_a_i,one,&i,C_INT);
    for (i=0; i<nprocs; i++) mapc[i] = i;
    NGA_Set_irreg_distr(gt_a_i, mapc, &nprocs);
    NGA_Allocate(gt_a_i);

    /* copy i and j arrays of transposed matrix into distributed arrays */
    if (me==0) {
      lo_bl = 0;
      hi_bl = nprocs;
      NGA_Put(gt_a_i,&lo_bl,&hi_bl,iblk_t,&one);
      lo_bl = 0;
      hi_bl = iblk_t[nprocs]-1;
      NGA_Put(gt_a_j,&lo_bl,&hi_bl,jblk_t,&one);
    }
    NGA_Sync();
    lo_bl = iblk[me];
    hi_bl = iblk[me+1];
    total_procs = hi_bl - lo_bl + 1;
    total_procs = hi_bl - lo_bl;
    t_create_csr_ga = t_create_csr_ga + GA_Wtime() - t_beg3;
    for (iblock = lo_bl; iblock < hi_bl; iblock++) {
      t_beg4 = GA_Wtime();
      jdx = blkidx_t[iblock];
      GP_Get_size(g_a_data, &jdx, &jdx, &isize);
      blk = (void*)malloc(isize);
      GP_Get(g_a_data, &jdx, &jdx, blk, blk_ptr, &one, &blk_size, &one, &tsize, 0); 
      /* Parameters for original block */
      iparams = (int*)blk_ptr[0];
      rval = (double*)(iparams+7);
      imin = iparams[0];
      imax = iparams[1];
      jmin = iparams[2];
      jmax = iparams[3];
      irow = iparams[4];
      icol = iparams[5];
      nnz = iparams[6];
      jval = (int*)(rval+nnz);
      ival = (int*)(jval+nnz);

      /* Create transposed block */
      isize = 7*sizeof(int) + nnz*(sizeof(double)+sizeof(int))
            + (jmax-jmin+2)*sizeof(int);
      t_gp_tget = t_gp_tget + GA_Wtime() - t_beg4;
      t_beg4 = GA_Wtime();
      tblk_ptr = (int*)GP_Malloc(isize);
      t_gp_malloc = t_gp_malloc + GA_Wtime() - t_beg4;
      t_beg3 = GA_Wtime();
      iparamst = (int*)tblk_ptr;
      rvalt = (double*)(iparamst+7);
      jvalt = (int*)(rvalt+nnz);
      ivalt = (int*)(jvalt+nnz);
      iparamst[0] = jmin;
      iparamst[1] = jmax;
      iparamst[2] = imin;
      iparamst[3] = imax;
      iparamst[4] = icol;
      iparamst[5] = irow;
      iparamst[6] = nnz;
      i = imax-imin+1;
      j = jmax-jmin+1;
      stranr(i, j, ival, jval, rval, ivalt, jvalt, rvalt);
      t_trans_blk = t_trans_blk + GA_Wtime() - t_beg3;
      t_beg4 = GA_Wtime();
      GP_Assign_local_element(gt_a_data, &iblock, (void*)tblk_ptr, isize);
      t_gp_assign = t_gp_assign + GA_Wtime() - t_beg4;
#if 1
      free(blk);
#endif
    }

    /* Clean up after transpose */
#if 1
    free(iblk);
    free(iblk_t);
    free(jblk);
    free(jblk_t);
    free(blkidx);
    free(blkidx_t);
#endif
    NGA_Sync();
    t_ga_trans = t_ga_trans + GA_Wtime() - t_beg2;
#if USE_HYPRE
    alpha = 1.0;
    beta = 0.0;
    ierr = HYPRE_StructMatrixMatvec(alpha, matrix, vec_x, beta, vec_y);
    hlo[0] = lo[0];
    hlo[1] = lo[1];
    hlo[2] = lo[2];
    hhi[0] = hi[0];
    hhi[1] = hi[1];
    hhi[2] = hi[2];
    ierr = HYPRE_StructVectorGetBoxValues(vec_y, hlo, hhi, vector);
    NGA_Distribution(g_c,me,hlo,hhi);
    cbuf = (double*)malloc((hhi[0]-hlo[0]+1)*sizeof(double));
    NGA_Get(g_c,hlo,hhi,cbuf,&one);
    dothypre = 0.0;
    dotga = 0.0;
    prdot2 = 0.0;
    for (i=0; i<(hhi[0]-hlo[0]+1); i++) {
      dothypre = dothypre + vector[i]*vector[i];
      dotga = dotga + cbuf[i]*cbuf[i];
      if (fabs(vector[i]-cbuf[i]) > 1.0e-10) {
        printf("p[%d] i: %d vector: %f cbuf: %f\n",me,i,vector[i],cbuf[i]);
      }
      prdot2 = prdot2 + (vector[i]-cbuf[i])*(vector[i]-cbuf[i]);
    }
    NGA_Dgop(&dotga,1,"+");
    NGA_Dgop(&dothypre,1,"+");
    NGA_Dgop(&prdot2,1,"+");
    prtot2 += sqrt(prdot2);
    gatot2 += sqrt(dotga);
    hypretot2 += sqrt(dothypre);
    free(cbuf);
    free(blk_ptr);
#endif
    /* Clean up transposed matrix */
    GP_Distribution(gt_a_data,me,blo,bhi);
    for (i=blo[0]; i<bhi[0]; i++) {
      GP_Free(GP_Free_local_element(gt_a_data,&i));
    }
    GP_Destroy(gt_a_data);
    NGA_Destroy(gt_a_i);
    NGA_Destroy(gt_a_j);
#endif
#endif
  }
  free(vector);
#if USE_HYPRE
  if (me == 0) {
    printf("Magnitude of GA solution:                         %e\n",
        gatot/((double)LOOPNUM));
    printf("Magnitude of HYPRE solution:                      %e\n",
        hypretot/((double)LOOPNUM));
    printf("Magnitude of GA solution(2):                      %e\n",
        gatot2/((double)LOOPNUM));
    printf("Magnitude of HYPRE solution(2):                   %e\n",
        hypretot2/((double)LOOPNUM));
    printf("Difference between GA and HYPRE (Struct) results: %e\n",
        prtot/((double)LOOPNUM));
    printf("Difference between transpose and HYPRE results:   %e\n",
        prtot2/((double)LOOPNUM));
  }
#endif

/*
   Clean up arrays
*/
  NGA_Destroy(g_b);
  NGA_Destroy(g_c);
  GP_Distribution(g_a_data,me,blo,bhi);
  for (i=blo[0]; i<bhi[0]; i++) {
    GP_Free(GP_Free_local_element(g_a_data,&i));
  }
  GP_Destroy(g_a_data);
  NGA_Destroy(g_a_i);
  NGA_Destroy(g_a_j);
#if USE_HYPRE
  ierr = HYPRE_StructStencilDestroy(stencil);
  ierr = HYPRE_StructGridDestroy(grid);
  ierr = HYPRE_StructMatrixDestroy(matrix);
  ierr = HYPRE_StructVectorDestroy(vec_x);
  ierr = HYPRE_StructVectorDestroy(vec_y);
#endif

  NGA_Dgop(&t_cnstrct,1,"+");
  NGA_Dgop(&t_get,1,"+");
  NGA_Dgop(&t_gp_get,1,"+");
  NGA_Dgop(&t_mult,1,"+");
  NGA_Dgop(&t_ga_tot,1,"+");
  NGA_Dgop(&t_ga_trans,1,"+");
  NGA_Dgop(&t_get_blk_csr,1,"+");
  NGA_Dgop(&t_trans_blk_csr,1,"+");
  NGA_Dgop(&t_trans_blk,1,"+");
  NGA_Dgop(&t_create_csr_ga,1,"+");
  NGA_Dgop(&t_gp_tget,1,"+");
  NGA_Dgop(&t_gp_malloc,1,"+");
  NGA_Dgop(&t_gp_assign,1,"+");
#if USE_HYPRE
  NGA_Dgop(&t_hypre_strct,1,"+");
#endif
  free(mapc);

  if (me == 0) {
    printf("Time to create sparse matrix:                         %12.4f\n",
      t_cnstrct/((double)(nprocs*LOOPNUM)));
    printf("Time to get right hand side vector:                   %12.4f\n",
      t_get/((double)(nprocs*LOOPNUM)));
    printf("Time to get GP blocks:                                %12.4f\n",
      t_gp_get/((double)(nprocs*LOOPNUM)));
    printf("Time for sparse matrix block multiplication:          %12.4f\n",
      t_mult/((double)(nprocs*LOOPNUM)));
    printf("Time for total sparse matrix multiplication:          %12.4f\n",
      t_ga_tot/((double)(nprocs*LOOPNUM)));
#if USE_HYPRE
    printf("Total time for HYPRE (Struct)  matrix-vector multiply:%12.4f\n",
      t_hypre_strct/((double)(nprocs*LOOPNUM)));
#endif
    printf("Time to get block CSR distribution:                   %12.4f\n",
      t_get_blk_csr/((double)(nprocs*LOOPNUM)));
    printf("Time for transposing block CSR distribution:          %12.4f\n",
      t_trans_blk_csr/((double)(nprocs*LOOPNUM)));
    printf("Time for creating transposed block CSR GA:            %12.4f\n",
      t_create_csr_ga/((double)(nprocs*LOOPNUM)));
    printf("Time for transposing blocks:                          %12.4f\n",
      t_trans_blk/((double)(nprocs*LOOPNUM)));
    printf("Time to get GP blocks for transpose:                  %12.4f\n",
      t_gp_tget/((double)(nprocs*LOOPNUM)));
    printf("Time to malloc GP blocks for transpose:               %12.4f\n",
      t_gp_malloc/((double)(nprocs*LOOPNUM)));
    printf("Time to assign GP blocks for transpose:               %12.4f\n",
      t_gp_assign/((double)(nprocs*LOOPNUM)));
    printf("Time for total sparse matrix transpose:               %12.4f\n",
      t_ga_trans/((double)(nprocs*LOOPNUM)));
  }
  if (me==0) {
    printf("Terminating GA library\n");
  }
  NGA_Terminate();
/*
 ***  Tidy up after message-passing library
 */
  ierr = MPI_Finalize();
}