C++ (Cpp) NGA_Get Examples

Example #1

0

Show file

File: ga_transpose3.c Project: jeffhammond/ga

validate_transpose(int g_A, int g_B, int* lo, int* hi, int ld)
{
  int i, j;
  int local_A[SIZE][SIZE], local_B[SIZE][SIZE];
  //  int  **local_A=NULL, **local_B=NULL;    

  /*
  local_A=(int**)malloc(SIZE*sizeof(int*));
  for(i=0; i<SIZE; i++)
    local_value[i]=(int*)malloc(SIZE*sizeof(int));

  local_B=(int**)malloc(SIZE*sizeof(int*));
  for(i=0; i<SIZE; i++)
    local_value[i]=(int*)malloc(SIZE*sizeof(int));
  */

  NGA_Get(g_A, lo, hi, local_A, &ld);
  NGA_Get(g_B, lo, hi, local_B, &ld);
  
  for(i=1; i<SIZE; i++)
    {
      for(j=1; j<SIZE; j++)
	{
	      if(local_B[j][i]!=local_A[i][j])
		GA_Error("ERROR : in passing values", DIM);
	}
    }
  
}

Example #2

0

Show file

File: transp1D.c Project: MihaiLupoiu/TPP

/*
 * Check to see if inversion is correct. Start by copying g_a into local
 * buffer a, and g_b into local buffer b.
 */
void verify(int g_a, int g_b) {

    int i, type, ndim, dims[MAXDIM], lo[MAXDIM], hi[MAXDIM], ld[MAXDIM];
    int a[MAXPROC*TOTALELEMS],b[MAXPROC*TOTALELEMS];
    
    /* Get dimensions of GA */
    NGA_Inquire(g_a, &type, &ndim, dims);

    lo[0] = 0;
    hi[0] = dims[0]-1;
    /* ### copy the block of data described by the arrays "lo" and "hi" from
     * ### the global array "g_a" into the local array "a". Copy the same block
     * ### of data from "g_b" into the local array "b". Use the array of strides
     * ### "ld" to describe the physical layout of "a" and "b". */

    NGA_Get(g_a,lo,hi,a,ld);
    NGA_Get(g_b,lo,hi,b,ld);
    
    for(i=0; i<dims[0]; i++)
       if (a[i] != b[dims[0]-i-1]) 
       {
          printf("Mismatch: a[%d]=%d is not equal to b[%d]=%d\n",
                 i, a[i], dims[0]-i-1, b[dims[0]-i-1]);
          GA_Error("verify failed",0);
       }
    
    printf("  Transpose OK\n");
}

Example #3

0

Show file

File: ga_fillpatch.c Project: jeffhammond/ga

fillandscale(int rank, int nprocs)
{
  int g_A,  val1=5, val2=5, local_A[SIZE][SIZE], i, j; 
  int dims[DIM]={SIZE,SIZE}, alo[DIM]={1,1}, ahi[DIM]={2,2}, ld=5;

  g_A = NGA_Create(C_INT, DIM, dims, "array_A", NULL);
  GA_Zero(g_A);
  NGA_Fill_patch(g_A, alo, ahi, &val1);
  GA_Print(g_A);

  GA_Scale(g_A, &val2);
  GA_Print(g_A);

  NGA_Get(g_A, alo, ahi, local_A, &ld);

  if(rank == 1)
    {
      for(i=0; i<DIM; i++)
	{
	  for(j=0; j<DIM; j++) if(local_A[i][j]!=val1*val2)
	    printf(" GA ERROR: \n");
	}
    }
  GA_Destroy(g_A);
}

Example #4

0

Show file

File: ga_asymmetr.c Project: dmlb2000/nwchem-cml

void FATR 
ga_antisymmetrize_(Integer *g_a) {
  
  DoublePrecision alpha = 0.5;
  int i, me = GA_Nodeid();
  extern void * FATR ga_malloc(Integer nelem, int type, char *name);
  extern void FATR ga_free(void *ptr);
  void FATR gai_subtr(int *lo, int *hi, void *a, void *b, DoublePrecision alpha,
                      int type, Integer nelem, int ndim);

  int alo[GA_MAX_DIM], ahi[GA_MAX_DIM], lda[GA_MAX_DIM];
  int blo[GA_MAX_DIM], bhi[GA_MAX_DIM], ldb[GA_MAX_DIM];
  int ndim, dims[GA_MAX_DIM], type;
  Integer nelem=1;
  Logical have_data;
  void *a_ptr, *b_ptr;

  GA_Sync();


  
  NGA_Inquire((int)(*g_a), &type, &ndim, dims);
  
  if (dims[0] != dims[1]) 
    GA_Error("ga_sym: can only sym square matrix", 0L);
  
  /* Find the local distribution */
  NGA_Distribution((int)(*g_a), me, alo, ahi);
 
 
  have_data = ahi[0]>=0;
  for(i=1; i<ndim; i++) have_data = have_data && ahi[i]>=0;
  
  if(have_data) {
    NGA_Access((int)(*g_a), alo, ahi, &a_ptr, lda); 
    
    for(i=0; i<ndim; i++) nelem *= ahi[i]-alo[i] +1;
    b_ptr = (void *) ga_malloc(nelem, MT_C_DBL, "v");
    
    for(i=2; i<ndim; i++) {bhi[i]=ahi[i]; blo[i]=alo[i]; }
    
    /* switch rows and cols */
    blo[1]=alo[0];
    bhi[1]=ahi[0];
    blo[0]=alo[1];
    bhi[0]=ahi[1];

    for (i=0; i < ndim-1; i++) 
      ldb[i] = bhi[i+1] - blo[i+1] + 1; 
    NGA_Get((int)(*g_a), blo, bhi, b_ptr, ldb);
  }
  GA_Sync(); 

  if(have_data) {
    gai_subtr(alo, ahi, a_ptr, b_ptr, alpha, type, nelem, ndim);
    NGA_Release_update((int)(*g_a), alo, ahi);
    ga_free(b_ptr);
  }
  GA_Sync();
}

Example #5

0

Show file

File: ga_matrix.cpp Project: Anastien/GridPACK

// -------------------------------------------------------------
// MatGetValues_DenseGA
// -------------------------------------------------------------
static 
PetscErrorCode
MatGetValues_DenseGA(Mat mat, 
                    PetscInt m, const PetscInt idxm[], PetscInt n, const PetscInt idxn[], 
                    PetscScalar v[])
{
  PetscErrorCode ierr = 0;
  struct MatGACtx *ctx;  
  int i, j, idx;
  PetscScalar vij;
  int lo[2], hi[2], ld[2] = {1, 1};
  ierr = MatShellGetContext(mat, (void *)&ctx); CHKERRQ(ierr);

  idx = 0;
  for (i = 0; i < m; ++i) {
    for (j = 0; j < n; ++j, ++idx) {
      lo[0] = idxm[i];
      hi[0] = idxm[i];
      lo[1] = idxn[j];
      hi[1] = idxn[j];
      NGA_Get(ctx->ga, lo, hi, (void *)&vij, ld);
      v[idx] = vij;
    }
  }
  return ierr;
}

Example #6

0

Show file

File: lennard-jones_ga.c Project: fuentesdt/tao-1.10.1-p3

int FormFunctionGradient (TAO_GA_APPLICATION gaapp, GAVec ga_X, double *f, GAVec ga_G, void *ptr)
{
  int lo, hi;			//the global coordinates
  AppCtx *user = (AppCtx *) ptr;
  int i,j;
  double *g, *x;
  double xx,yy,zz,temp,rij;
  

  MA_get_pointer(user->memHandle, &x);
  g = x + user->ndim*user->natoms;
    
  lo=0;
  hi=user->n-1; /* range of array indices */

  NGA_Get(ga_X, &lo, &hi, x, &hi);

  *f = 0;
  for (i=0; i < user->n; i++)
    g[i] = 0.0;

  if (user->ndim == 2) {
    for (j=1; j < user->natoms; j++) {
      for (i=0; i<j; i++) {
	xx = x[2*j] - x[2*i];
	yy = x[2*j+1] - x[2*i+1];
	rij = xx*xx + yy*yy;
	temp = 1.0/rij/rij/rij;
	*f += temp*(temp-2.0);
	temp *= 12.0*(temp-1.0)/rij;
	g[2*j] -= xx*temp;
	g[2*j+1] -= yy*temp;
	g[2*i] += xx*temp;
	g[2*i+1] += yy*temp;
      }
    }
  } else if (user->ndim == 3) {
    for (j=1; j < user->natoms; j++) {
      for (i=0; i < j; i++) {
	xx = x[3*j] - x[3*i];
	yy = x[3*j+1] - x[3*i+1];
	zz = x[3*j+2] - x[3*i+2];
	rij = xx*xx + yy*yy + zz*zz;
	temp = 1.0/rij/rij/rij;
	*f += temp*(temp-2.0);
	temp *= 12.0*(temp-1.0)/rij;
	g[3*j] -= xx*temp;
	g[3*j+1] -= yy*temp;
	g[3*j+2] -= zz*temp;
	g[3*i] += xx*temp;
	g[3*i+1] += yy*temp;
	g[3*i+2] += zz*temp;
      }
    }
  }
      
  NGA_Put(ga_G, &lo, &hi, g, &hi);
  return 0;
}

Example #7

0

Show file

File: lennard-jones_ga2.c Project: fuentesdt/tao-1.10.1-p3

int getBlock(GAVec g_x, int taskId, AppCtx *user) 
{
  int lo, hi;
  int size;

  size = user->BlockSize * user->ndim;

  /* get the coordinates of the atoms in the corresponding rows in the block */
  lo = user->btopo[taskId].x*size;
  hi = lo + size -1;
  NGA_Get(g_x, &lo, &hi, user->x1, &hi);

  /* get the coordinates of the atoms in the corresponding cols in the block */
  lo = user->btopo[taskId].y*size;
  hi = lo + size - 1;
  NGA_Get(g_x, &lo, &hi, user->x2, &hi);
  return 0;
}

Example #8

0

Show file

File: ga_elem_maximumpatch.c Project: jeffhammond/ga

main(int argc, char **argv)
{
  int rank, nprocs, i, j;
  int g_A, g_B, g_C, local_C[DIM][DIM], dims[DIM]={5,5};
  int val_A=5, val_B=3, ld=DIM, max; 
  int lo[DIM]={2,2}, hi[DIM]={4,4}, blo[DIM]={0,0}, bhi[DIM]={2,2}, clo[DIM]={1,1}, chi[DIM]={3,3};

  MPI_Init(&argc, &argv);

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  MA_init(C_INT, 1000, 1000);

  GA_Initialize();
  
  g_A = NGA_Create(C_INT, DIM, dims, "array_A", NULL);
  g_B = NGA_Create(C_INT, DIM, dims, "array_B", NULL);
  g_C = NGA_Create(C_INT, DIM, dims, "array_C", NULL);

  GA_Fill(g_A, &val_A);
  GA_Fill(g_B, &val_B);
  GA_Zero(g_C);
  GA_Elem_maximum_patch(g_A, lo, hi, g_B, blo, bhi, g_C, clo, chi);
  GA_Print(g_C);
  GA_Sync();
  
  NGA_Get(g_C, clo, chi, local_C, &ld);
  if(rank==1)
    {
  
      for(i=0; i<DIM; i++)
	{
	  for(j=0; j<DIM; j++)printf("%d ", local_C[i][j]);
	  printf("\n");
	}
      
      if(val_A>val_B) max=val_A;
      else max=val_B;

      for(i=0; i<DIM; i++)
	{
	  for(j=0; j<DIM; j++)
	    if(local_C[i][j]!=max) printf("GA Error : \n");
	}
      
    }
    
  GA_Sync();
  if(rank == 0)
    printf("Test Completed \n");

  GA_Terminate();
  MPI_Finalize();

}

Example #9

0

Show file

File: ga_scale_rows.c Project: jeffhammond/ga

main(int argc, char **argv)
{
  int rank, nprocs;
  int g_A, g_V,  val1=5, val2=5, local_A[SIZE][SIZE], dims_V=SIZE, local_V[dims_V]; 
  int dims[DIM]={SIZE,SIZE}, dims2[DIM], lo[DIM]={1,1}, hi[DIM]={2,2}, ld=5, i, j;
  int loV=0, hiV=dims_V-1;

  MPI_Init(&argc, &argv);

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  MA_init(C_INT, 1000, 1000);

  GA_Initialize();

  g_A = NGA_Create(C_INT, DIM, dims, "array_A", NULL);
  g_V = NGA_Create(C_INT, 1, &dims_V, "array_A", NULL);
  GA_Fill(g_A, &val1);
  GA_Print(g_A);
  printf("\n");

  GA_Scale(g_A, &val2);
  GA_Print(g_A);

  GA_Get_diag(g_A, g_V);
  GA_Print(g_V);
  
  NGA_Get(g_A, lo, hi, local_A, &ld);
  NGA_Get(g_V, &loV, &hiV, local_V, &ld);

  if(rank==1)
    {
      for(i=0; i<dims_V; i++)
	if(local_V[i]!=val1*val2) printf(" GA Error: \n");
    }

  if(rank == 0)
    printf("Test Completed \n");

  GA_Terminate();
  MPI_Finalize();
}

Example #10

0

Show file

File: workq_add.c Project: davidozog/NWChem-mini-app

int workq_append_task_single_(
                     int *task_id,
                     int *tile_dim,
                     int *g_a,
                     int *g_b,
                     int *ld, 
                     double *bufa,
                     double *bufb
    ) {

  struct bench_buf buf;
  int lo[2], hi[2], myld[1], tile_size;
  
  buf.mtype = *task_id+3;
  buf.task_id = *task_id;
  buf.tile_dim = *tile_dim;

  tile_size = *tile_dim * (*tile_dim);
  lo[1] = *task_id*tile_size;
  hi[1] = lo[1] + tile_size - 1;
  lo[0] = 0;
  hi[0] = 0;
  myld[0] = *ld;

//printf("lo={%d,%d}, hi={%d,%d}\n", lo[0], lo[1], hi[0], hi[1]);
  NGA_Get(*g_a, lo, hi, shmdata+offset, myld);
  offset += tile_size;
  NGA_Get(*g_b, lo, hi, shmdata+offset, myld);
  offset += tile_size;
  tot_size += sizeof(double)*(tile_size*2);

//int i,j;
//printf("bufa: ");
//for (i=0; i<*tile_dim; i++) {
//  for (j=0; j<*tile_dim; j++) {
//    printf("%f ", bufa[*tile_dim*i+j]);
//  }
//}

  bench_bufs[num_microtasks++] = buf;

  return 0;
}

Example #11

0

Show file

File: lennard-jones_ga.c Project: fuentesdt/tao-1.10.1-p3

int InitializeVariables(GAVec ga_X, AppCtx *user) 
{
  double *x;
  double xx, yy, zz;
  int isqrtn, icrtn, left, i, j, k, il, jl, ctr;
  int lo, hi;


  lo = 0; hi = user->n - 1; /* range of array indices */
  MA_get_pointer(user->memHandle, &x);

  NGA_Get (ga_X, &lo, &hi, x, &hi);

  if (user->ndim == 2) {
    isqrtn = (int) sqrt( (double) user->natoms);
    left = user->natoms - isqrtn * isqrtn;
    xx = 0.0;
    yy = 0.0;
    for (j=0; j<=isqrtn + left/isqrtn; j++) {
      for (i=0; i < TaoMin(isqrtn, user->natoms - j*isqrtn); i++) {
	ctr = j*isqrtn + i;
	x[2*ctr] = xx;
	x[2*ctr+1] = yy;
	xx += 1.0;
      }
      yy += 1.0;
      xx = 0.0;
    }	     
  }
  else if (user->ndim == 3) {
    icrtn = (int) pow((user->natoms + 0.5),1.0/3.0);
    left = user->natoms - icrtn * icrtn * icrtn;
    xx = yy = zz = 0.0;
    for (k=0; k <= icrtn + left; k++) {
      jl = TaoMin(icrtn, (user->natoms - k*icrtn*icrtn)/icrtn+1);
      for (j=0; j<jl; j++) {
	il = TaoMin(icrtn, user->natoms - k*icrtn*icrtn - j*icrtn);
	for (i=0; i<il; i++) {
	  ctr = k*icrtn*icrtn + j*icrtn + i;
	  x[3*ctr] = xx;
	  x[3*ctr+1] = yy;
	  x[3*ctr+2] = zz;
	  xx += 1.0;
	}
	yy += 1.0;
	xx = 0.0;
      }
      zz += 1.0;
      yy = 0.0;
    }
  }

  NGA_Put(ga_X, &lo, &hi, x, &hi);
  return 0;
}

Example #12

0

Show file

File: gada.c Project: adrielb/DCell

PetscErrorCode testCreate2D()
{
  int ga;
  DA da;
  DALocalInfo info;
  Vec vec;
  PetscErrorCode ierr;
  
  PetscFunctionBegin;
  int d1 = 1453, d2 = 1451;
  ierr = DACreate2d(PETSC_COMM_WORLD,DA_NONPERIODIC,DA_STENCIL_STAR,
              d1,d2,PETSC_DECIDE,PETSC_DECIDE,1,1,0,0, &da); CHKERRQ(ierr);
  ierr = DAGetLocalInfo(da,&info); CHKERRQ(ierr);
  ierr = DACreateGlobalArray( da, &ga, &vec); CHKERRQ(ierr);
  
  PetscReal **v;
  ierr = DAVecGetArray(da,vec,&v); CHKERRQ(ierr);
  int xe = info.xs+info.xm,
      ye = info.ys+info.ym;
  for (int j = info.ys; j < ye; ++j) {
    for (int i = info.xs; i < xe; ++i) {
      v[j][i] = 1.*i + d1 * j;
    }
  }
  ierr = DAVecRestoreArray(da,vec,&v); CHKERRQ(ierr);
  PetscPrintf(PETSC_COMM_WORLD,"Updated local portion with DAVec\n");
  PetscBarrier(0);
  {
    double *da_ptr;
  VecGetArray(vec, &da_ptr);
  double *ptr;
  int low[2],hi[2],ld;
  NGA_Distribution(ga,GA_Nodeid(),low,hi);
  NGA_Access(ga,low,hi,&ptr,&ld);
  printf("[%d] ga:%p\tda:%p\tdiff:%p\n", GA_Nodeid(), ptr, da_ptr, (ptr-da_ptr) );
  NGA_Release_update(ga,low,hi);
  }
  
  int lo[2],ld;
  double val;
  for (int j = 0; j < d2; ++j) {
    for (int i = 0; i < d1; ++i) {
      lo[0] = j;
      lo[1] = i;
      NGA_Get(ga,lo,lo,&val,&ld);
      if( PetscAbs( i + d1*j - val) > .1 )
        printf(".");
//        printf("[%d] (%3.0f,%3.0f)\n", GA_Nodeid(), 1.*i + d1*j, val);
    }
  }
  GA_Print_stats();
  ierr = VecDestroy(vec); CHKERRQ(ierr);
  GA_Destroy(ga);
  PetscFunctionReturn(0);
}

Example #13

0

Show file

File: ga_get.c Project: jeffhammond/ga

main(int argc, char **argv)
{
  int rank, nprocs, i, j;
  int g_A, **local_A=NULL, **local_B=NULL; 
  int dims[DIM]={SIZE,SIZE}, dims2[DIM], lo[DIM]={SIZE-SIZE,SIZE-SIZE}, hi[DIM]={SIZE-1,SIZE-1}, ld=5, value=5;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  MA_init(C_INT, 1000, 1000);
  GA_Initialize();

  local_A=(int**)malloc(SIZE*sizeof(int*));
  for(i=0; i<SIZE; i++)
    {
      local_A[i]=(int*)malloc(SIZE*sizeof(int));
      for(j=0; j<SIZE; j++) local_A[i][j]=rand()%10;
    }

  local_B=(int**)malloc(SIZE*sizeof(int*));
  for(i=0; i<SIZE; i++)
    {
      local_B[i]=(int*)malloc(SIZE*sizeof(int));
      for(j=0; j<SIZE; j++) local_B[i][j]=rand()%10;
    }

  g_A = NGA_Create(C_INT, DIM, dims, "array_A", NULL);
  GA_Zero(g_A);
  
  if(rank==0)
    {
      NGA_Put(g_A, lo, hi, local_A, &ld);
      NGA_Get(g_A, lo, hi, local_B, &ld);

      for(i=0; i<SIZE; i++)
	{
	  for(j=0; j<SIZE; j++)
	    if(local_A[i][j]!=local_B[i][j]) GA_ERROR_MSG();
	}
    }
  
  GA_Sync();
  GA_Destroy(g_A);
  
  if(rank == 0) GA_PRINT_MSG();

  GA_Terminate();
  MPI_Finalize();
}

Example #14

0

Show file

File: test_sum.c Project: niuqingpeng/microbenchmark

void coefs_ga_get_3d(ga_coefs_t *ga_coefs,void *mini_cube,int x,int y,int z)
{
  int nsplines = ga_coefs->nsplines;
  int lo[4],hi[4],ld[4];
  lo[0]=x;lo[1]=y;lo[2]=z;lo[3]=0;
  hi[0]=x+3;hi[1]=y+3;hi[2]=z+3;hi[3]=nsplines-1;
  ld[0]=ld[1]=4;ld[2]=nsplines;
  double begin=MPI_Wtime()*1000;
  NGA_Get(ga_coefs->g_a,lo,hi,mini_cube,ld);
  double end=MPI_Wtime()*1000;
  ga_coefs->amount++;
  ga_coefs->sumt+=end-begin;
  return;
}

Example #15

0

Show file

File: ga_matrix.cpp Project: Anastien/GridPACK

// -------------------------------------------------------------
// MatConvertGAtoDense
// -------------------------------------------------------------
PetscErrorCode
MatConvertGAToDense(Mat A, Mat *B)
{
  PetscErrorCode ierr = 0;
  MPI_Comm comm;
  int nproc;
  struct MatGACtx *ctx;
  PetscInt lrows, grows, lcols, gcols, lo, hi;

  ierr = PetscObjectGetComm((PetscObject)A, &comm); CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm, &nproc); 

  ierr = MatShellGetContext(A, &ctx); CHKERRQ(ierr);

  ierr = MatGetSize(A, &grows, &gcols); CHKERRQ(ierr);
  ierr = MatGetLocalSize(A, &lrows, &lcols); CHKERRQ(ierr);

  ierr = MatCreateDense(comm, lrows, lcols, grows, gcols, NULL, B); CHKERRQ(ierr);
  
  ierr = MatCreate(comm, B); CHKERRXX(ierr);
  ierr = MatSetSizes(*B, lrows, lcols, grows, gcols); CHKERRXX(ierr);
  if (nproc == 1) {
    ierr = MatSetType(*B, MATSEQDENSE); CHKERRXX(ierr);
    ierr = MatSeqDenseSetPreallocation(*B, PETSC_NULL); CHKERRXX(ierr);
  } else {
    ierr = MatSetType(*B, MATDENSE); CHKERRXX(ierr);
    ierr = MatMPIDenseSetPreallocation(*B, PETSC_NULL); CHKERRXX(ierr);
  }
  ierr = MatGetOwnershipRange(*B, &lo, &hi); CHKERRQ(ierr);

  std::vector<PetscInt> cidx(gcols);
  for (PetscInt c = 0; c < gcols; ++c) {
    cidx[c] = c;
  }
  std::vector<PetscScalar> rowvals(gcols);
  for (PetscInt r = lo; r < hi; ++r) {
    int glo[2] = {r, 0};
    int ghi[2] = {r, gcols - 1};
    int ld[2] = {1,1};
    NGA_Get(ctx->ga, glo, ghi, &rowvals[0], ld);
    ierr = MatSetValues(*B, 1, &r, gcols, &cidx[0], &rowvals[0], INSERT_VALUES); CHKERRQ(ierr);
  }

  ierr = MatAssemblyBegin(*B, MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(*B, MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  return ierr;
}

Example #16

0

Show file

File: ga_matrix.cpp Project: Anastien/GridPACK

// -------------------------------------------------------------
// GA2Vec
// -------------------------------------------------------------
static
int
GA2Vec(int ga, Vec x)
{
  int lrows, rows;
  PetscErrorCode ierr = 0;
  
  ierr = VecGetLocalSize(x, &lrows); CHKERRQ(ierr);
  ierr = VecGetSize(x, &rows); CHKERRQ(ierr);
  
  PetscInt vlo, vhi;
  ierr = VecGetOwnershipRange(x, &vlo, &vhi); CHKERRQ(ierr);
  
  PetscScalar *v;
  ierr = VecGetArray(x, &v); CHKERRQ(ierr);
   int lo[2] = {vlo,0}, hi[2] = {vhi-1,0}, ld[2] = {1,1};
  NGA_Get(ga, lo, hi, v, ld);
  ierr = VecRestoreArray(x, &v); CHKERRQ(ierr);
    
  return ierr;
}

Example #17

0

Show file

File: gada.c Project: adrielb/DCell

PetscErrorCode testCreate3D(  )
{
  int ga;
  DA da;
  DALocalInfo info;
  Vec vec;
  PetscErrorCode ierr;
  
  PetscFunctionBegin;
  int d1 = 229, d2 = 229, d3 = 229;
  int rank;
  MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
  ierr = DACreate3d(PETSC_COMM_WORLD,DA_NONPERIODIC,DA_STENCIL_STAR,
              d1,d2,d3,
              PETSC_DECIDE,PETSC_DECIDE,PETSC_DECIDE,
              1,1,
              0,0,0, &da); CHKERRQ(ierr);
  ierr = DAGetLocalInfo(da,&info); CHKERRQ(ierr);
  ierr = DACreateGlobalArray( da, &ga, &vec); CHKERRQ(ierr);
  
  PetscReal ***v;
  ierr = DAVecGetArray(da,vec,&v); CHKERRQ(ierr);
  int xe = info.xs+info.xm,
      ye = info.ys+info.ym,
      ze = info.zs+info.zm;
  for (int k = info.zs; k < ze; ++k) {
    for (int j = info.ys; j < ye; ++j) {
      for (int i = info.xs; i < xe; ++i) {
        v[k][j][i] = 1.*i + d1*j + d1*d2*k;
      }
    }
  }
  ierr = DAVecRestoreArray(da,vec,&v); CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_WORLD, "Sequential values filled in petsc vec.\n"); CHKERRQ(ierr);
  
  ierr = PetscBarrier(0); CHKERRQ(ierr);
  int lo[3],ld, p = 10;
  int patch[10][10][10];
  double val;
  for (int k = 0; k < d3; k+=p) {
    for (int j = 0; j < d2; j+=p) {
      for (int i = 0; i < d1; i+=p) {
        lo[0] = k;
        lo[1] = j;
        lo[2] = i;
        NGA_Get(ga,lo,lo,&val,&ld);
        if( PetscAbs( i + d1*j + d1*d2*k - val) > .1 )
//          printf(".");
          printf("(%3.0f,%3.0f) ", 1.*i + d1*j + d1*d2*k, val);
      }
    }
  }
  ierr = PetscPrintf(PETSC_COMM_WORLD, "Ended NGA_Get() test.\n"); CHKERRQ(ierr);
  
  ierr = PetscBarrier(0); CHKERRQ(ierr);
  if( rank == 0 )
  {
    for (int k = 0; k < d3; ++k) {
      printf(">%d\n",k);
      for (int j = 0; j < d2; ++j) {
        for (int i = 0; i < d1; ++i) {
          lo[0] = k; lo[1] = j; lo[2] = i;
          val = 1.*i + d1*j + d1*d2*k;
          val *= -1;
          NGA_Put(ga,lo,lo,&val,&ld);
        }
      }
    }
  }
  ierr = PetscPrintf(PETSC_COMM_WORLD, "Ended NGA_Put() negative seq values.\n"); CHKERRQ(ierr);
  
  ierr = PetscBarrier(0); CHKERRQ(ierr);
  ierr = DAVecGetArray(da,vec,&v); CHKERRQ(ierr);
  for (int k = info.zs; k < ze; ++k) {
    for (int j = info.ys; j < ye; ++j) {
      for (int i = info.xs; i < xe; ++i) {
        val = -1 * (1.*i + d1*j + d1*d2*k);
        if( PetscAbs( val - v[k][j][i] ) > .1 )
          printf(".");
      }
    }
  }
  ierr = DAVecRestoreArray(da,vec,&v); CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_WORLD, "Ended petsc vec update test.\n"); CHKERRQ(ierr);
  
  if( rank == 0 )
    GA_Print_stats();
  
  ierr = VecDestroy(vec); CHKERRQ(ierr);
  GA_Destroy(ga);
  PetscFunctionReturn(0);
}

Example #18

0

Show file

File: spiral-mm.c Project: pghosh2/GA-Matrix-Multiplication-Benchmark

/* Square matrix-matrix multiplication */
void matrix_multiply(int M, int N, int K, 
		int blockX_len, int blockY_len) 
{
	/* Local buffers and Global arrays declaration */
	double *a=NULL, *b=NULL, *c=NULL;

	int dims[NDIMS], ld[NDIMS], chunks[NDIMS];
	int lo[NDIMS], hi[NDIMS], cdims[NDIMS]; /* dim of blocks */

	int g_a, g_b, g_c, g_cnt, g_cnt2;
	int offset;
	double alpha = 1.0, beta=0.0;
	int count_p = 0, next_p = 0;
	int count_gac = 0, next_gac = 0;
	double t1,t2,seconds;
        ga_nbhdl_t nbh;
        int count_acc = 0;

	/* Find local processor ID and the number of processes */
	int proc=GA_Nodeid(), nprocs=GA_Nnodes();

	if ((M % blockX_len) != 0 || (M % blockY_len) != 0 || (N % blockX_len) != 0 || (N % blockY_len) != 0 
			|| (K % blockX_len) != 0 || (K % blockY_len) != 0)
		GA_Error("Dimension size M/N/K is not divisible by X/Y block sizes", 101);

	/* Allocate/Set process local buffers */
	a = malloc (blockX_len * blockY_len * sizeof(double)); 
	b = malloc (blockX_len * blockY_len * sizeof(double)); 
	c = malloc (blockX_len * blockY_len * sizeof(double));

	cdims[0] = blockX_len;
	cdims[1] = blockY_len;	

	/* Configure array dimensions */
	for(int i = 0; i < NDIMS; i++) {
		dims[i]  = N;
		chunks[i] = -1;
		ld[i]    = cdims[i]; /* leading dimension/stride of the local buffer */
	}

	/* create a global array g_a and duplicate it to get g_b and g_c*/
	g_a = NGA_Create(C_DBL, NDIMS, dims, "array A", chunks);

	if (!g_a) 
		GA_Error("NGA_Create failed: A", NDIMS);

#if DEBUG>1
	if (proc == 0) 
		printf("  Created Array A\n");
#endif
	/* Ditto for C and B */
	g_b = GA_Duplicate(g_a, "array B");
	g_c = GA_Duplicate(g_a, "array C");

	if (!g_b || !g_c) 
		GA_Error("GA_Duplicate failed",NDIMS);
	if (proc == 0) 
		printf("Created Arrays B and C\n");

	/* Subscript array for read-incr, which is nothing but proc */
	int * rdcnt = malloc (nprocs * sizeof(int));
	memset (rdcnt, 0, nprocs * sizeof(int));
	int * rdcnt2 = malloc (nprocs * sizeof(int));
	memset (rdcnt2, 0, nprocs * sizeof(int));

	/* Create global array of nprocs elements for nxtval */	
	int counter_dim[1];
	counter_dim[0] = nprocs;

	g_cnt = NGA_Create(C_INT, 1, counter_dim, "Shared counter", NULL);

	if (!g_cnt) 
		GA_Error("Shared counter failed",1);

	g_cnt2 = GA_Duplicate(g_cnt, "another shared counter");

	if (!g_cnt2) 
		GA_Error("Another shared counter failed",1);

	GA_Zero(g_cnt);
	GA_Zero(g_cnt2);

#if DEBUG>1	
	/* initialize data in matrices a and b */
	if(proc == 0)
		printf("Initializing local buffers - a and b\n");
#endif
	int w = 0; 
	int l = 7;
	for(int i = 0; i < cdims[0]; i++) {
		for(int j = 0; j < cdims[1]; j++) {
			a[i*cdims[1] + j] = (double)(++w%29);
			b[i*cdims[1] + j] = (double)(++l%37);
		}
	}

	/* Copy data to global arrays g_a and g_b from local buffers */
	next_p = NGA_Read_inc(g_cnt2,&rdcnt[proc],(long)1);
	for (int i = 0; i < N; i+=cdims[0]) 
	{
		if (next_p == count_p) {
			for (int j = 0; j < N; j+=cdims[1])
			{
				/* Indices of patch */
				lo[0] = i;
				lo[1] = j;
				hi[0] = lo[0] + cdims[0];
				hi[1] = lo[1] + cdims[1];

				hi[0] = hi[0]-1;
				hi[1] = hi[1]-1;
#if DEBUG>1
				printf ("%d: PUT_GA_A_B: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]);
#endif
				NGA_Put(g_a, lo, hi, a, ld);
				NGA_Put(g_b, lo, hi, b, ld);

			}
			next_p = NGA_Read_inc(g_cnt2,&rdcnt[proc],(long)1);
		}		
		count_p++;
	}


#if DEBUG>1
	printf ("After NGA_PUT to global - A and B arrays\n");
#endif
	/* Synchronize all processors to make sure puts from 
	   nprocs has finished before proceeding with dgemm */
	GA_Sync();

	t1 = GA_Wtime();

	next_gac = NGA_Read_inc(g_cnt,&rdcnt2[proc],(long)1);
	for (int m = 0; m < N; m+=cdims[0])
	{
		for (int k = 0; k < N; k+=cdims[0])
		{
			if (next_gac == count_gac)	
			{
				/* A = m x k */
				lo[0] = m; lo[1] = k;
				hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1];

				hi[0] = hi[0]-1; hi[1] = hi[1]-1;
#if DEBUG>3
				printf ("%d: GET GA_A: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]);
#endif
				NGA_Get(g_a, lo, hi, a, ld);

				for (int n = 0; n < N; n+=cdims[1])
				{
					memset (c, 0, sizeof(double) * cdims[0] * cdims[1]);
					/* B = k x n */
					lo[0] = k; lo[1] = n;
					hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1];				

					hi[0] = hi[0]-1; hi[1] = hi[1]-1;
#if DEBUG>3
					printf ("%d: GET_GA_B: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]);
#endif
					NGA_Get(g_b, lo, hi, b, ld);


					//_my_dgemm_ (a, local_N, b, local_N, c, local_N, local_N, local_N, local_N, alpha, beta=1.0);

					/* TODO I am assuming square matrix blocks, further testing/work 
					   required for rectangular matrices */
					cblas_dgemm ( CblasRowMajor, CblasNoTrans, /* TransA */CblasNoTrans, /* TransB */
							cdims[0] /* M */, cdims[1] /* N */, cdims[0] /* K */, alpha,
							a, cdims[0], /* lda */ b, cdims[1], /* ldb */
							beta=1.0, c, cdims[0] /* ldc */);

					NGA_NbWait(&nbh);

					/* C = m x n */
					lo[0] = m; lo[1] = n;
					hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1];				

					hi[0] = hi[0]-1; hi[1] = hi[1]-1;
#if DEBUG>3
					printf ("%d: ACC_GA_C: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]);
#endif
					NGA_NbAcc(g_c, lo, hi, c, ld, &alpha, &nbh);
					count_acc += 1;
				} /* END LOOP N */
				next_gac = NGA_Read_inc(g_cnt,&rdcnt2[proc],(long)1);
			} /* ENDIF if count == next */
			count_gac++;
		} /* END LOOP K */
	} /* END LOOP M */

	GA_Sync();
	t2 = GA_Wtime();
	seconds = t2 - t1;
	if (proc == 0)
		printf("Time taken for MM (secs):%lf \n", seconds);

        printf("Number of ACC: %d\n", count_acc);

	/* Correctness test - modify data again before this function */
	for (int i = 0; i < NDIMS; i++) {
		lo[i] = 0;
		hi[i] = dims[i]-1;
		ld[i] = dims[i];
	}

	verify(g_a, g_b, g_c, lo, hi, ld, N);

	/* Clear local buffers */
	free(a);
	free(b);
	free(c);
	free(rdcnt);
	free(rdcnt2);

	GA_Sync();

	/* Deallocate arrays */
	GA_Destroy(g_a);
	GA_Destroy(g_b);
	GA_Destroy(g_c);
	GA_Destroy(g_cnt);
	GA_Destroy(g_cnt2);
}

Example #19

0

Show file

File: screening.c Project: SahanGH/psi4public

int schwartz_screening(PFock_t pfock, BasisSet_t basis)
{
    int myrank;
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

    // create shell pairs values
    //ERD_t erd;
    int nthreads = omp_get_max_threads();
    //CInt_createERD(basis, &erd, nthreads);
    int nshells = pfock->nshells;

    // create global arrays for screening
    int nprow = pfock->nprow;
    int npcol = pfock->npcol;
    int dims[2];
    int block[2];
    int map[nprow + npcol];
    for (int i = 0; i < nprow; i++) {
        map[i] = pfock->rowptr_sh[i];
    }
    for (int i = 0; i < npcol; i++) {
        map[i + nprow] = pfock->colptr_sh[i];
    }
    dims[0] = nshells;
    dims[1] = nshells;
    block[0] = nprow;
    block[1] = npcol;
    pfock->ga_screening =
        NGA_Create_irreg(C_DBL, 2, dims, "array Screening", block, map);
    if (0 == pfock->ga_screening) {
        return -1;
    }

    // compute the max shell value
    double *sq_values = (double *)PFOCK_MALLOC(sizeof(double) *
        pfock->nshells_row * pfock->nshells_col);
    if (NULL == sq_values) {
        return -1;
    }
    int startM = pfock->sshell_row;
    int startN = pfock->sshell_col;
    int endM = pfock->eshell_row;
    int endN = pfock->eshell_col;
    double maxtmp = 0.0;
    #pragma omp parallel
    {
        int tid = omp_get_thread_num();
        #pragma omp for reduction(max:maxtmp)
        for (int M = startM; M <= endM; M++) {
            int dimM = CInt_getShellDim(basis, M);
            for (int N = startN; N <= endN; N++) {
                int dimN = CInt_getShellDim(basis, N);
                double *integrals;
                int nints=
                ComputeShellQuartet(basis,tid,M,N,M,N,&integrals);
                //CInt_computeShellQuartet(basis, erd, tid, M, N, M, N,
                //                         &integrals, &nints);
                double maxvalue = 0.0;
                if (nints != 0) {
                    for (int iM = 0; iM < dimM; iM++) {
                        for (int iN = 0; iN < dimN; iN++) {
                            int index =
                                iM * (dimN*dimM*dimN+dimN) + iN * (dimM*dimN+1);
                            if (maxvalue < fabs(integrals[index])) {
                                maxvalue = fabs(integrals[index]);
                            }
                        }
                    }
                }
                sq_values[(M - startM) * (endN - startN + 1)  + (N - startN)]
                    = maxvalue;
                if (maxvalue > maxtmp) {
                    maxtmp = maxvalue;
                }
            }
        }
    }
    int lo[2];
    int hi[2];
    lo[0] = startM;
    hi[0] = endM;
    lo[1] = startN;
    hi[1] = endN;
    int ld = endN - startN + 1;
    NGA_Put(pfock->ga_screening, lo, hi, sq_values, &ld);
    // max value
    MPI_Allreduce(&maxtmp, &(pfock->maxvalue), 1,
                  MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
    //CInt_destroyERD(erd);
    PFOCK_FREE(sq_values);

    // init shellptr
    sq_values = (double *)PFOCK_MALLOC(sizeof(double) * nshells);
    if (NULL == sq_values) {
        return -1;
    }
    int nnz = 0;
    double eta = pfock->tolscr2 / pfock->maxvalue;
    pfock->shellptr = (int *)PFOCK_MALLOC(sizeof(int) * (nshells + 1));
    pfock->mem_cpu += 1.0 * sizeof(int) * (nshells + 1);
    if (NULL == pfock->shellptr) {
        return -1;
    }
    memset(pfock->shellptr, 0, sizeof(int) * (nshells + 1));
    for (int M = 0; M < nshells; M++) {
        pfock->shellptr[M] = nnz;
        lo[0] = M;
        hi[0] = M;
        lo[1] = 0;
        hi[1] = nshells - 1;
        ld = nshells;
        NGA_Get(pfock->ga_screening, lo, hi, sq_values, &ld);
        for (int N = 0; N < nshells; N++) {
            double maxvalue = sq_values[N];
            if (maxvalue > eta) {
                if (M > N && (M + N) % 2 == 1 || M < N && (M + N) % 2 == 0) {
                    continue;
                } else {
                    nnz++;
                }
            }
        }
        pfock->shellptr[M + 1] = nnz;
    }
    pfock->nnz = nnz;

    double maxvalue;
    pfock->shellvalue  = (double *)PFOCK_MALLOC(sizeof(double) * nnz);
    pfock->shellid  = (int *)PFOCK_MALLOC(sizeof(int) * nnz);
    pfock->shellrid  = (int *)PFOCK_MALLOC(sizeof(int) * nnz);
    pfock->mem_cpu += 1.0 * sizeof(double) * nnz + 2.0 * sizeof(int) * nnz;
    nshells = pfock->nshells;
    if (pfock->shellvalue == NULL ||
        pfock->shellid == NULL ||
        pfock->shellrid == NULL) {
        return -1;
    }
    nnz = 0;
    for (int A = 0; A < nshells; A++) {
        pfock->shellptr[A] = nnz;
        lo[0] = A;
        hi[0] = A;
        lo[1] = 0;
        hi[1] = nshells - 1;
        ld = nshells;
        NGA_Get(pfock->ga_screening, lo, hi, sq_values, &ld);
        for (int B = 0; B < nshells; B++) {
            maxvalue = sq_values[B];
            if (maxvalue > eta) {
                if (A > B && (A + B) % 2 == 1 || A < B && (A + B) % 2 == 0)
                    continue;
                if (A == B) {
                    pfock->shellvalue[nnz] = maxvalue;
                } else {
                    pfock->shellvalue[nnz] = -maxvalue;
                }
                pfock->shellid[nnz] = B;
                pfock->shellrid[nnz] = A;
                nnz++;
            }
        }
    }
    PFOCK_FREE(sq_values);
    GA_Destroy(pfock->ga_screening);

    return 0;
}

Example #20

0

Show file

File: ga_add_patch.c Project: jeffhammond/ga

main(int argc, char **argv)
{
  int rank, nprocs, i, j;
  int g_A, g_B, g_C, local_C[DIM][DIM], dims[DIM]={5,5}, val1=5, val2=4, alpha=3, beta=2, ld=5;
  int alo[DIM]={2,2}, ahi[DIM]={3,3}, blo[DIM]={2,2}, bhi[DIM]={3,3}, clo[DIM]={1,1}, chi[DIM]={2,2};

  MPI_Init(&argc, &argv);

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  MA_init(C_INT, 1000, 1000);

  GA_Initialize();
  
  g_A = NGA_Create(C_INT, DIM, dims, "array_A", NULL);

  g_B = GA_Duplicate(g_A, "array_B");
  g_C = GA_Duplicate(g_A, "array_C");

  GA_Fill(g_A, &val1);
  GA_Fill(g_B, &val2);
  GA_Zero(g_C);

  NGA_Add_patch(&alpha, g_A, clo, chi, &beta, g_B, blo, bhi, g_C, clo, chi);

  GA_Sync();
  GA_Print(g_A);
  GA_Print(g_B);
  GA_Print(g_C);

  NGA_Get(g_C, clo, chi, local_C, &ld);

  //printf("check 1 \n");

  for(i=0; i<DIM; i++)
    {
      for(j=0; j<DIM; j++)printf("%d ", local_C[i][j]);
      printf("\n");
    }
  
  if(rank == 0)
    {
      printf("check 2\n");
    
      for(i=0; i<DIM; i++)
	{
	  for(j=0; j<DIM; j++)
	    if(local_C[i][j]!=(alpha*val1)+(beta*val2)) printf("GA Error : \n");
	}
    }
  
  if(rank==0)
    GA_PRINT_MSG();

  GA_Sync();

  /*
  GA_Destroy(g_A);
  GA_Destroy(g_B);
  GA_Destroy(g_C);
  */

  //*******************************************************************

  /* what would be the possible reason for GA_destroy to get failed .., 
   * solve this before consolidate the whole
   */

  GA_Terminate();
  MPI_Finalize();
 
}

Example #21

0

Show file

File: ga-mpi.c Project: jeffhammond/ga

void do_work()
{
int ZERO=0;   /* useful constants */
int g_a, g_b;
int n=N, ndim=2,type=MT_F_DBL,dims[2]={N,N},coord[2];
int me=GA_Nodeid(), nproc=GA_Nnodes();
int row, i, j;
 int lo[2], hi[2];

/* Note: on all current platforms DoublePrecision = double */
DoublePrecision buf[N], *max_row=NULL;

MPI_Comm WORLD_COMM;
MPI_Comm ROW_COMM;
int ilo,ihi, jlo,jhi, ld, prow, pcol;
int root=0, grp_me=-1;

     WORLD_COMM = GA_MPI_Comm_pgroup_default();

     if(me==0)printf("Creating matrix A\n");
     dims[0]=n; dims[1]=n;
     g_a = NGA_Create(type, ndim, dims, "A", NULL);
     if(!g_a) GA_Error("create failed: A",n); 
     if(me==0)printf("OK\n");
     
     if(me==0)printf("Creating matrix B\n");
     dims[0]=n;
     g_b = NGA_Create(type, 1, dims, "B", NULL);
     if(!g_b) GA_Error("create failed: B",n); 
     if(me==0)printf("OK\n");
     
     GA_Zero(g_a);   /* zero the matrix */
     
     if(me==0)printf("Initializing matrix A\n");
     /* fill in matrix A with values: A(i,j) = (i+j) */ 
     for(row=me; row<n; row+= nproc){
    /**
     * simple load balancing: 
     * each process works on a different row in MIMD style 
     */ 
    for(i=0; i<n; i++) buf[i]=(DoublePrecision)(i+row+1); 
    lo[0]=hi[0]=row;
    lo[1]=ZERO;  hi[1]=n-1; 
    NGA_Put(g_a, lo, hi, buf, &n); 
     }
     
     /* GA_print(&g_a);*/
     NGA_Distribution(g_a, me, lo, hi);
     ilo=lo[0]; ihi=hi[0];
     jlo=lo[1]; jhi=hi[1];
     
     GA_Sync(); 
     if(ihi-ilo+1 >0){
        max_row=(DoublePrecision*)malloc(sizeof(DoublePrecision)*(ihi-ilo+1));
        if (!max_row) GA_Error("malloc 3 failed",(ihi-ilo+1));
        for (i=0; i<(ihi-ilo+1); i++) {
            max_row[i] = 0.0;
        }
     }
     NGA_Proc_topology(g_a, me, coord);  /* block coordinates */
     prow = coord[0];
     pcol = coord[1];

     if(me==0)printf("Splitting comm according to distribution of A\n");
     
     /* GA on SP1 requires synchronization before & after message-passing !!*/
     GA_Sync(); 
     
     if(me==0)printf("Computing max row elements\n");
     /* create communicator for processes that 'own' A[:,jlo:jhi] */
     MPI_Barrier(WORLD_COMM);
     if(pcol < 0 || prow <0)
    MPI_Comm_split(WORLD_COMM,MPI_UNDEFINED,MPI_UNDEFINED, &ROW_COMM);
     else
    MPI_Comm_split(WORLD_COMM, (int)pcol, (int)prow, &ROW_COMM);
     
     if(ROW_COMM != MPI_COMM_NULL){
    double *ptr;
    MPI_Comm_rank(ROW_COMM, &grp_me);
    
    /* each process computes max elements in the block it 'owns' */
    lo[0]=ilo; hi[0]=ihi;
    lo[1]=jlo; hi[1]=jhi;
    NGA_Access(g_a, lo, hi, &ptr, &ld);
    for(i=0; i<ihi-ilo+1; i++){
       for(j=0; j<jhi-jlo+1; j++)
          if(max_row[i] < ptr[i*ld + j]){
         max_row[i] = ptr[i*ld + j];
          }
    }
    MPI_Reduce(max_row, buf, ihi-ilo+1, MPI_DOUBLE, MPI_MAX,
           root, ROW_COMM);
    
     }else fprintf(stderr,"process %d not participating\n",me);
     GA_Sync(); 
     
     /* processes with rank=root in ROW_COMM put results into g_b */
     ld = 1;
     if(grp_me == root) {
    lo[0]=ilo;  hi[0]=ihi;
    NGA_Put(g_b, lo, hi, buf, &ld); 
     }
        
     GA_Sync();

     if(me==0)printf("Checking the result\n");
     if(me==0){
    lo[0]=ZERO; hi[0]=n-1;
        NGA_Get(g_b, lo, hi, buf, &n); 
        for(i=0; i< n; i++)if(buf[i] != (double)n+i){
            fprintf(stderr,"error:%d max=%f should be:%d\n",i,buf[i],n+i);
            GA_Error("terminating...",1);
        }
     }
     
     if(me==0)printf("OK\n");

     GA_Destroy(g_a);
     GA_Destroy(g_b);
}

Example #22

0

Show file

File: testmatmultc.c Project: sg0/Elemental

/*
 * test ga_dgemm
 * Note: - change nummax for large arrays
 *       - turn off "dgemm_verify" for large arrays due to memory 
 *         limitations, as dgemm_verify=1 for large arrays produces 
 *         segfault, dumps core,or any crap.
 */
int main(int argc, char **argv)
{
    int num_m;
    int num_n;
    int num_k;
    int i;
    int ii;
    double *h0;
    int g_c;
    int g_b;
    int g_a;
    double a;
    double t1;
    double mf;
    double avg_t[ntrans];
    double avg_mf[ntrans];
    int itime;
    int ntimes;
    int nums_m[/*howmany*/] = {512,1024};
    int nums_n[/*howmany*/] = {512,1024};
    int nums_k[/*howmany*/] = {512,1024};
    char transa[/*ntrans*/] = "ntnt";
    char transb[/*ntrans*/] = "nntt";
    char ta;
    char tb;
    double *tmpa;
    double *tmpb;
    double *tmpc;
    int ndim;
    int dims[2];
#ifdef BLOCK_CYCLIC
    int block_size[2];
#endif

#if defined(USE_ELEMENTAL)
    // initialize Elemental (which will initialize MPI)
    ElInitialize( &argc, &argv );
    ElMPICommRank( MPI_COMM_WORLD, &me );
    ElMPICommSize( MPI_COMM_WORLD, &nproc );
    // instantiate el::global array
    ElGlobalArraysConstruct_d( &eldga );
    // initialize global arrays
    ElGlobalArraysInitialize_d( eldga );
#else
    MP_INIT(argc,argv);
    if (!MA_init(MT_DBL,1,20000000)) {
        GA_Error("failed: ma_init(MT_DBL,1,20000000)",10);
    }
    GA_INIT(argc,argv);
    me = GA_Nodeid();
#endif

    h0 = (double*)malloc(sizeof(double) * nummax*nummax);
    tmpa = (double*)malloc(sizeof(double) * nummax*nummax);
    tmpb = (double*)malloc(sizeof(double) * nummax*nummax);
    tmpc = (double*)malloc(sizeof(double) * nummax*nummax);

    ii = 0;
    for (i=0; i<nummax*nummax; i++) {
        ii = ii + 1;
        if (ii > nummax) {
            ii = 0;
        }
        h0[i] = ii;
    }

    /* Compute times assuming 500 mflops and 5 second target time */
    /* ntimes = max(3.0d0,5.0d0/(4.0d-9*num**3)); */
    ntimes = 5;

    for (ii=0; ii<howmany; ii++) {
        num_m = nums_m[ii];
        num_n = nums_n[ii];
        num_k = nums_k[ii];
        a = 0.5/(num_m*num_n);
        if (num_m > nummax || num_n > nummax || num_k > nummax) {
            GA_Error("Insufficient memory: check nummax", 1);
        }

#ifndef BLOCK_CYCLIC
        ndim = 2;

	/*
        dims[0] = num_m;
        dims[1] = num_n;
	*/
        dims[1] = num_m;
        dims[0] = num_n;

#if defined(USE_ELEMENTAL)
        ElGlobalArraysCreate_d( eldga, ndim, dims, "g_c", NULL, &g_c );
#else
        if (!((g_c = NGA_Create(MT_DBL,ndim,dims,"g_c",NULL)))) {
            GA_Error("failed: create g_c",20);
        }
#endif
	/*
        dims[0] = num_k;
        dims[1] = num_n;
	*/
        dims[1] = num_k;
        dims[0] = num_n;
#if defined(USE_ELEMENTAL)
        ElGlobalArraysCreate_d( eldga, ndim, dims, "g_b", NULL, &g_b );
#else
        if (!((g_b = NGA_Create(MT_DBL,ndim,dims,"g_b",NULL)))) {
            GA_Error("failed: create g_b",30);
        }
#endif
	/*
        dims[0] = num_m;
        dims[1] = num_k;
	*/
        dims[1] = num_m;
        dims[0] = num_k;
#if defined(USE_ELEMENTAL)
        ElGlobalArraysCreate_d( eldga, ndim, dims, "g_a", NULL, &g_a );
#else
        if (!((g_a = NGA_Create(MT_DBL,ndim,dims,"g_a",NULL)))) {
            GA_Error("failed: create g_a",40);
        }
#endif
#else
        ndim = 2;
        block_size[0] = 128;
        block_size[1] = 128;

        dims[0] = num_m;
        dims[1] = num_n;
        g_c = GA_Create_handle();
        GA_Set_data(g_c,ndim,dims,MT_DBL);
        GA_Set_array_name(g_c,"g_c");
        GA_Set_block_cyclic(g_c,block_size);
        if (!GA_Allocate(g_c)) {
            GA_Error("failed: create g_c",40);
        }

        dims[0] = num_k;
        dims[1] = num_n;
        g_b = GA_Create_handle();
        GA_Set_data(g_b,ndim,dims,MT_DBL);
        GA_Set_array_name(g_b,"g_b");
        GA_Set_block_cyclic(g_b,block_size);
        if (!ga_allocate(g_b)) {
            GA_Error("failed: create g_b",40);
        }

        dims[0] = num_m;
        dims[1] = num_k;
        g_a = GA_Create_handle();
        GA_Set_data(g_a,ndim,dims,MT_DBL);
        GA_Set_array_name(g_a,"g_a");
        GA_Set_block_cyclic(g_a,block_size);
        if (!ga_allocate(g_a)) {
            GA_Error('failed: create g_a',40);
        }
#endif         

        /* Initialize matrices A and B */
        if (me == 0) { 
            load_ga(g_a, h0, num_m, num_k);
            load_ga(g_b, h0, num_k, num_n);
        }
#if defined(USE_ELEMENTAL)
        double zero = 0.0;
        ElGlobalArraysFill_d( eldga, g_c, &zero );
	ElGlobalArraysSync_d( eldga );
#else
        GA_Zero(g_c);
        GA_Sync();
#endif
#if defined(USE_ELEMENTAL)
        if (me == 0) {
#else
        if (GA_Nodeid() == 0) {
#endif
            printf("\nMatrix Multiplication on C = A[%ld,%ld]xB[%ld,%ld]\n",
                    (long)num_m, (long)num_k, (long)num_k, (long)num_n);
            fflush(stdout);
        }

        for (i=0; i<ntrans; i++) {
            avg_t[i]  = 0.0;
            avg_mf[i] = 0.0;
        }

        for (itime=0; itime<ntimes; itime++) {
            for (i=0; i<ntrans; i++) {
#if defined(USE_ELEMENTAL)
	        ElGlobalArraysSync_d( eldga );
#else
                GA_Sync();
#endif
                ta = transa[i];
                tb = transb[i];
                t1 = MP_TIMER();
#if defined(USE_ELEMENTAL)
		ElGlobalArraysDgemm_d( eldga, ta, tb, num_m, num_n, num_k, 1.0, g_a, g_b, 0.0, g_c );
#else
                GA_Dgemm(ta,tb,num_m,num_n,num_k,1.0, g_a, g_b, 0.0, g_c);
#endif
                t1 = MP_TIMER() - t1;
#if defined(USE_ELEMENTAL)
                if (me == 0) {
#else
                if (GA_Nodeid() == 0) {
#endif
#if defined(USE_ELEMENTAL)
                    mf = 2e0*num_m*num_n*num_k/t1*1e-6/nproc;
#else
                    mf = 2e0*num_m*num_n*num_k/t1*1e-6/GA_Nnodes();
#endif
                    avg_t[i]  = avg_t[i]+t1;
                    avg_mf[i] = avg_mf[i] + mf;
                    printf("%15s%2d: %12.4f seconds %12.1f mflops/proc  %c %c\n",
                            "Run#", itime, t1, mf, ta, tb);
                    fflush(stdout);
                    if (dgemm_verify && itime == 0) {
                        /* recall the C API swaps the matrix order */
                        /* we swap it here for the Fortran-based verify */
                        verify_ga_dgemm(tb, ta, num_n, num_m, num_k, 1.0,
                                g_b, g_a, 0.0, g_c, tmpb, tmpa, tmpc);
                    }
                }
            }
        }
#if defined(USE_ELEMENTAL)
        if (me == 0) {
#else
        if (GA_Nodeid() == 0) {
#endif
            printf("\n");
            for (i=0; i<ntrans; i++) {
                printf("%17s: %12.4f seconds %12.1f mflops/proc  %c %c\n",
                        "Average", avg_t[i]/ntimes, avg_mf[i]/ntimes,
                        transa[i], transb[i]);
            }
            if(dgemm_verify) {
                printf("All GA_Dgemms are verified...O.K.\n");
            }
            fflush(stdout);
        }

        /*
           GA_Print(g_a);
           GA_Print(g_b);
           GA_Print(g_c);
           */
#if defined(USE_ELEMENTAL)
        ElGlobalArraysDestroy_d( eldga, g_a );
        ElGlobalArraysDestroy_d( eldga, g_b );
        ElGlobalArraysDestroy_d( eldga, g_c );
#else
        GA_Destroy(g_c);
        GA_Destroy(g_b);
        GA_Destroy(g_a);
#endif
    }

    /* ???
       format(a15, i2, ': ', e12.4, ' seconds ',f12.1, 
       .     ' mflops/proc ', 3a2)
       */
#if defined(USE_ELEMENTAL)
    if (me == 0) {
#else
    if (GA_Nodeid() == 0) {
#endif
        printf("All tests successful\n");
    }

    free(h0);
    free(tmpa);
    free(tmpb);
    free(tmpc);
#if defined(USE_ELEMENTAL)
    // call el::global arrays destructor
    ElGlobalArraysTerminate_d( eldga );
    ElGlobalArraysDestruct_d( eldga );
    ElFinalize();
#else
    GA_Terminate();
    MP_FINALIZE();
#endif
    return 0;
}


/*
 * Verify for correctness. Process 0 computes BLAS dgemm 
 * locally. For larger arrays, disbale this test as memory
 * might not be sufficient
 */
void verify_ga_dgemm(char xt1, char xt2, int num_m, int num_n, int num_k,
        double alpha, int g_a, int g_b, double beta, int g_c,
        double *tmpa, double *tmpb, double *tmpc)
{
    int i,j,type,ndim,dims[2],lo[2],hi[2];
    double abs_value;

    for (i=0; i<num_n; i++) {
        for (j=0; j<num_m; j++) {
            tmpc[j+i*num_m] = -1.0;
            tmpa[j+i*num_m] = -2.0;
        }
    }

#if defined(USE_ELEMENTAL)
    ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims );
#else
    NGA_Inquire(g_a, &type, &ndim, dims);
#endif
    lo[0] = 0;
    lo[1] = 0;
    hi[0] = dims[0]-1;
    hi[1] = dims[1]-1;
#if defined(USE_ELEMENTAL)
    ElGlobalArraysGet_d( eldga, g_a, lo, hi, tmpa, &dims[1] );
#else
    NGA_Get(g_a, lo, hi, tmpa, &dims[1]);
#endif

#if defined(USE_ELEMENTAL)
    ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims );
#else
    NGA_Inquire(g_a, &type, &ndim, dims);
#endif
    lo[0] = 0;
    lo[1] = 0;
    hi[0] = dims[0]-1;
    hi[1] = dims[1]-1;
#if defined(USE_ELEMENTAL)
    ElGlobalArraysGet_d( eldga, g_b, lo, hi, tmpb, &dims[1] );
#else
    NGA_Get(g_b, lo, hi, tmpb, &dims[1]);
#endif

    /* compute dgemm sequentially */
#if defined(USE_ELEMENTAL)
    cblas_dgemm ( CblasRowMajor, ( xt1 == 'n'? CblasNoTrans: CblasTrans ), 
	    ( xt2 == 'n'? CblasNoTrans: CblasTrans ), 
	    num_m /* M */, num_n /* N */, num_k /* K */, 
	    alpha, tmpa, num_m, /* lda */ 
	    tmpb, num_k, /* ldb */ beta, 
	    tmpc, num_m /* ldc */);
#else
    xb_dgemm(&xt1, &xt2, &num_m, &num_n, &num_k,
            &alpha, tmpa, &num_m,
            tmpb, &num_k, &beta,
            tmpc, &num_m);
#endif

    /* after computing c locally, verify it with the values in g_c */

#if defined(USE_ELEMENTAL)
    ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims );
#else
    NGA_Inquire(g_a, &type, &ndim, dims);
#endif
    lo[0] = 0;
    lo[1] = 0;
    hi[0] = dims[0]-1;
    hi[1] = dims[1]-1;
#if defined(USE_ELEMENTAL)
    ElGlobalArraysGet_d( eldga, g_c, lo, hi, tmpa, &dims[1] );
#else
    NGA_Get(g_c, lo, hi, tmpa, &dims[1]);
#endif

    for (i=0; i<num_n; i++) {
        for (j=0; j<num_m; j++) {
            abs_value = fabs(tmpc[j+i*num_m]-tmpa[j+i*num_m]);
            if(abs_value > 1.0 || abs_value < -1.0) {
                printf("Values are = %f %f\n",
                        tmpc[j+i*num_m], tmpa[j+i*num_m]);
                printf("Values are = %f %f\n", 
                        fabs(tmpc[j+i*num_m]-tmpa[j*i*num_m]), abs_value);
                fflush(stdout);
                GA_Error("verify ga_dgemm failed", 1);
            }
        }
    }
}

/**
 * called by process '0' (or your master process )
 */
void load_ga(int handle, double *f, int dim1, int dim2)
{
      int lo[2], hi[2];
      
      if (dim1 < 0 || dim2 < 0) {
          return;
      }

      lo[0] = 0;
      lo[1] = 0;
      hi[0] = dim1-1;
      hi[1] = dim2-1;
#if defined(USE_ELEMENTAL)
      ElGlobalArraysPut_d( eldga, handle, lo, hi, f, &dim1 );
#else
      NGA_Put(handle, lo, hi, f, &dim1);
#endif
}

Example #23

0

Show file

File: lock.c Project: wadejong/NWChem-Json

int main(int argc, char **argv)
{
    int me;
    int nproc;
    int status;
    int g_a;
    int dims[NDIM];
    int chunk[NDIM];
    int pg_world;
    size_t num = 10;
    double *p1 = NULL;
    double *p2 = NULL;
    size_t i;
    int num_mutex;
    int lo[1];
    int hi[1];
    int ld[1]={1};
    MPI_Comm comm;

    MP_INIT(argc,argv);
    GA_INIT(argc,argv);

    me = GA_Nodeid();
    nproc = GA_Nnodes();
    comm = GA_MPI_Comm_pgroup_default();

    printf("%d: Hello world!\n",me);

    if (me==0) printf("%d: GA_Initialize\n",me);
    /*if (me==0) printf("%d: ARMCI_Init\n",me);*/
    /*ARMCI_Init();*/
    /*if (me==0) printf("%d: MA_Init\n",me);*/
    /*MA_init(MT_DBL, 8*1024*1024, 2*1024*1024);*/

    if (me==0) printf("%d: GA_Create_handle\n",me);
    g_a = GA_Create_handle();

    if (me==0) printf("%d: GA_Set_array_name\n",me);
    GA_Set_array_name(g_a,"test array A");

    dims[0] = 30;
    if (me==0) printf("%d: GA_Set_data\n",me);
    GA_Set_data(g_a,NDIM,dims,MT_DBL);

    chunk[0] = -1;
    if (me==0) printf("%d: GA_Set_chunk\n",me);
    GA_Set_chunk(g_a,chunk);

    if (me==0) printf("%d: GA_Pgroup_get_world\n",me);
    pg_world = GA_Pgroup_get_world();
    if (me==0) printf("%d: GA_Set_pgroup\n",me);
    GA_Set_pgroup(g_a,pg_world);

    if (me==0) printf("%d: GA_Allocate\n",me);
    status = GA_Allocate(g_a);
    if(0 == status) MPI_Abort(comm,100);

    if (me==0) printf("%d: GA_Zero\n",me);
    GA_Zero(g_a);

    if (me==0) printf("%d: GA_Sync\n",me);
    GA_Sync();

    num = 10;
    p1 = malloc(num*sizeof(double));
    /*double* p1 = ARMCI_Malloc_local(num*sizeof(double));*/
    if (p1==NULL) MPI_Abort(comm,1000);
    p2 = malloc(num*sizeof(double));
    /*double* p2 = ARMCI_Malloc_local(num*sizeof(double));*/
    if (p2==NULL) MPI_Abort(comm,2000);

    for ( i=0 ; i<num ; i++ ) p1[i] = 7.0;
    for ( i=0 ; i<num ; i++ ) p2[i] = 3.0;

    num_mutex = 17;
    status = GA_Create_mutexes(num_mutex);
    if (me==0) printf("%d: GA_Create_mutexes = %d\n",me,status);

/***************************************************************/
    if (me==0) {
        printf("%d: before GA_Lock\n",me);
        GA_Lock(0);
        lo[0] = 0;
        hi[0] = num-1;
        GA_Init_fence();
        NGA_Put(g_a,lo,hi,p1,ld);
        GA_Fence();
        GA_Unlock(0);
        printf("%d: after GA_Unlock\n",me);
    } 
    GA_Print(g_a);
    if (me==1) {
        printf("%d: before GA_Lock\n",me);
        GA_Lock(0);
        lo[0] = 0;
        hi[0] = num-1;
        GA_Init_fence();
        NGA_Get(g_a,lo,hi,p2,ld);
        GA_Fence();
        GA_Unlock(0);
        printf("%d: after GA_Unlock\n",me);
        for ( i=0 ; i<num ; i++ ) printf("p2[%2lu] = %20.10f\n",
                (long unsigned)i,p2[i]);
    }
/***************************************************************/



    status = GA_Destroy_mutexes();
    if (me==0) printf("%d: GA_Destroy_mutexes = %d\n",me,status);

    /*ARMCI_Free(p2);*/
    /*ARMCI_Free(p1);*/
    free(p2);
    free(p1);

    if (me==0) printf("%d: GA_Destroy\n",me);
    GA_Destroy(g_a);

    /*if (me==0) printf("%d: ARMCI_Finalize\n",me);*/
    /*ARMCI_Finalize();*/
    if (me==0) printf("%d: GA_Terminate\n",me);
    GA_Terminate();
    if (me==0) printf("%d: MPI_Finalize\n",me);
    MPI_Finalize();

    return(0);
}

Example #24

0

Show file

File: gemmtest.c Project: jeffhammond/ga

void
test(int data_type) {
  int me=GA_Nodeid();
  int nproc = GA_Nnodes();
  int g_a, g_b, g_c;
  int ndim = 2;
  int dims[2]={N,N};
  int lo[2]={0,0};
  int hi[2]={N-1,N-1};
  int block_size[2]={NB,NB-1};
  int proc_grid[2];
  int i,j,l,k,m,n, ld;

  double alpha_dbl = 1.0, beta_dbl = 0.0;
  double dzero = 0.0;
  double ddiff;

  float alpha_flt = 1.0, beta_flt = 0.0;
  float fzero = 0.0;
  float fdiff;
  float ftmp;
  double dtmp;
  SingleComplex ctmp;
  DoubleComplex ztmp;

  DoubleComplex alpha_dcpl = {1.0, 0.0} , beta_dcpl = {0.0, 0.0}; 
  DoubleComplex zzero = {0.0,0.0};
  DoubleComplex zdiff;

  SingleComplex alpha_scpl = {1.0, 0.0} , beta_scpl = {0.0, 0.0}; 
  SingleComplex czero = {0.0,0.0};
  SingleComplex cdiff;

  void *alpha=NULL, *beta=NULL;
  void *abuf=NULL, *bbuf=NULL, *cbuf=NULL, *c_ptr=NULL;

  switch (data_type) {
  case C_FLOAT:
    alpha  = (void *)&alpha_flt;
    beta   = (void *)&beta_flt;
    abuf = (void*)malloc(N*N*sizeof(float));
    bbuf = (void*)malloc(N*N*sizeof(float));
    cbuf = (void*)malloc(N*N*sizeof(float));
    if(me==0) printf("Single Precision: Testing GA_Sgemm,NGA_Matmul_patch for %d-Dimension", ndim);
    break;      
  case C_DBL:
    alpha  = (void *)&alpha_dbl;
    beta   = (void *)&beta_dbl;
    abuf = (void*)malloc(N*N*sizeof(double));
    bbuf = (void*)malloc(N*N*sizeof(double));
    cbuf = (void*)malloc(N*N*sizeof(double));
    if(me==0) printf("Double Precision: Testing GA_Dgemm,NGA_Matmul_patch for %d-Dimension", ndim); 
    break;    
  case C_DCPL:
    alpha  = (void *)&alpha_dcpl;
    beta   = (void *)&beta_dcpl;
    abuf = (void*)malloc(N*N*sizeof(DoubleComplex));
    bbuf = (void*)malloc(N*N*sizeof(DoubleComplex));
    cbuf = (void*)malloc(N*N*sizeof(DoubleComplex));
    if(me==0) printf("Double Complex:   Testing GA_Zgemm,NGA_Matmul_patch for %d-Dimension", ndim);
    break;
  case C_SCPL:
    alpha  = (void *)&alpha_scpl;
    beta   = (void *)&beta_scpl;
    abuf = (void*)malloc(N*N*sizeof(SingleComplex));
    bbuf = (void*)malloc(N*N*sizeof(SingleComplex));
    cbuf = (void*)malloc(N*N*sizeof(SingleComplex));
    if(me==0) printf("Single Complex:   Testing GA_Cgemm,NGA_Matmul_patch for %d-Dimension", ndim);
    break;
  default:
    GA_Error("wrong data type", data_type);
  }

  if (me==0) printf("\nCreate A, B, C\n");
#ifdef USE_REGULAR
  g_a = NGA_Create(data_type, ndim, dims, "array A", NULL);
#endif
#ifdef USE_SIMPLE_CYCLIC
  g_a = NGA_Create_handle();
  NGA_Set_data(g_a,ndim,dims,data_type);
  NGA_Set_array_name(g_a,"array A");
  NGA_Set_block_cyclic(g_a,block_size);
  if (!GA_Allocate(g_a)) {
    GA_Error("Failed: create: g_a",40);
  }
#endif
#ifdef USE_SCALAPACK
  g_a = NGA_Create_handle();
  NGA_Set_data(g_a,ndim,dims,data_type);
  NGA_Set_array_name(g_a,"array A");
  grid_factor(nproc,&i,&j);
  proc_grid[0] = i;
  proc_grid[1] = j;
  NGA_Set_block_cyclic_proc_grid(g_a,block_size,proc_grid);
  if (!GA_Allocate(g_a)) {
    GA_Error("Failed: create: g_a",40);
  }
#endif
#ifdef USE_TILED
  g_a = NGA_Create_handle();
  NGA_Set_data(g_a,ndim,dims,data_type);
  NGA_Set_array_name(g_a,"array A");
  grid_factor(nproc,&i,&j);
  proc_grid[0] = i;
  proc_grid[1] = j;
  NGA_Set_tiled_proc_grid(g_a,block_size,proc_grid);
  if (!GA_Allocate(g_a)) {
    GA_Error("Failed: create: g_a",40);
  }
#endif
  g_b = GA_Duplicate(g_a, "array B");  
  g_c = GA_Duplicate(g_a, "array C");
  if(!g_a || !g_b || !g_c) GA_Error("Create failed: a, b or c",1);

  ld = N;
  if (me==0) printf("\nInitialize A\n");
  /* Set up matrix A */
  if (me == 0) {
    for (i=0; i<N; i++) {
      for (j=0; j<N; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)abuf)[i*N+j] = (float)(i*N+j);
            break;
          case C_DBL:
            ((double*)abuf)[i*N+j] = (double)(i*N+j);
            break;
          case C_DCPL:
            ((DoubleComplex*)abuf)[i*N+j].real = (double)(i*N+j);
            ((DoubleComplex*)abuf)[i*N+j].imag = 1.0;
            break;
          case C_SCPL:
            ((SingleComplex*)abuf)[i*N+j].real = (float)(i*N+j);
            ((SingleComplex*)abuf)[i*N+j].imag = 1.0;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
    NGA_Put(g_a,lo,hi,abuf,&ld);
  }
  GA_Sync();

  if (me==0) printf("\nInitialize B\n");
  /* Set up matrix B */
  if (me == 0) {
    for (i=0; i<N; i++) {
      for (j=0; j<N; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)bbuf)[i*N+j] = (float)(j*N+i);
            break;
          case C_DBL:
            ((double*)bbuf)[i*N+j] = (double)(j*N+i);
            break;
          case C_DCPL:
            ((DoubleComplex*)bbuf)[i*N+j].real = (double)(j*N+i);
            ((DoubleComplex*)bbuf)[i*N+j].imag = 1.0;
            break;
          case C_SCPL:
            ((SingleComplex*)bbuf)[i*N+j].real = (float)(j*N+i);
            ((SingleComplex*)bbuf)[i*N+j].imag = 1.0;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
    NGA_Put(g_b,lo,hi,bbuf,&ld);
  }
  GA_Sync();

  if (me==0) printf("\nPerform matrix multiply\n");
  switch (data_type) {
    case C_FLOAT:
      NGA_Matmul_patch('N','N',&alpha_flt,&beta_flt,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_DBL:
      NGA_Matmul_patch('N','N',&alpha_dbl,&beta_dbl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_SCPL:
      NGA_Matmul_patch('N','N',&alpha_scpl,&beta_scpl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_DCPL:
      NGA_Matmul_patch('N','N',&alpha_dcpl,&beta_dcpl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
  GA_Sync();
#if 0
  if (me==0) printf("\nCheck answer\n");
  /*
  GA_Print(g_a);
  if (me == 0) printf("\n\n\n\n");
  GA_Print(g_b);
  if (me == 0) printf("\n\n\n\n");
  GA_Print(g_c); 
  */

  /* Check answer */
  NGA_Get(g_a,lo,hi,abuf,&ld);
  NGA_Get(g_b,lo,hi,bbuf,&ld);
  for (i=0; i<N; i++) {
    for (j=0; j<N; j++) {
      switch (data_type) {
        case C_FLOAT:
          ((float*)cbuf)[i*N+j] = fzero;
          break;
        case C_DBL:
          ((double*)cbuf)[i*N+j] = dzero;
          break;
        case C_DCPL:
          ((DoubleComplex*)cbuf)[i*N+j] = zzero;
          break;
        case C_SCPL:
          ((SingleComplex*)cbuf)[i*N+j] = czero;
          break;
        default:
          GA_Error("wrong data type", data_type);
      }
      for (k=0; k<N; k++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)cbuf)[i*N+j] += ((float*)abuf)[i*N+k]
              *((float*)bbuf)[k*N+j];
            break;
          case C_DBL:
            ((double*)cbuf)[i*N+j] += ((double*)abuf)[i*N+k]
              *((double*)bbuf)[k*N+j];
            break;
          case C_DCPL:
            ((DoubleComplex*)cbuf)[i*N+j].real +=
              (((DoubleComplex*)abuf)[i*N+k].real
               *((DoubleComplex*)bbuf)[k*N+j].real
               -(((DoubleComplex*)abuf)[i*N+k].imag
                 *((DoubleComplex*)bbuf)[k*N+j].imag));
            ((DoubleComplex*)cbuf)[i*N+j].imag +=
              (((DoubleComplex*)abuf)[i*N+k].real
               *((DoubleComplex*)bbuf)[k*N+j].imag
               +(((DoubleComplex*)abuf)[i*N+k].imag
                 *((DoubleComplex*)bbuf)[k*N+j].real));
            break;
          case C_SCPL:
            ((SingleComplex*)cbuf)[i*N+j].real +=
              (((SingleComplex*)abuf)[i*N+k].real
               *((SingleComplex*)bbuf)[k*N+j].real
               -(((SingleComplex*)abuf)[i*N+k].imag
                 *((SingleComplex*)bbuf)[k*N+j].imag));
            ((SingleComplex*)cbuf)[i*N+j].imag +=
              (((SingleComplex*)abuf)[i*N+k].real
               *((SingleComplex*)bbuf)[k*N+j].imag
               +(((SingleComplex*)abuf)[i*N+k].imag
                 *((SingleComplex*)bbuf)[k*N+j].real));
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
  }
  GA_Sync();
  if (me == 0) {
    NGA_Get(g_c,lo,hi,abuf,&ld);
    for (i=0; i<N; i++) {
      for (j=0; j<N; j++) {
        switch (data_type) {
          case C_FLOAT:
            fdiff = ((float*)abuf)[i*N+j]-((float*)cbuf)[i*N+j];
            if (((float*)abuf)[i*N+j] != 0.0) {
              fdiff /= ((float*)abuf)[i*N+j];
            }
            if (fabs(fdiff) > TOLERANCE) {
              printf("p[%d] [%d,%d] Actual: %f Expected: %f\n",me,i,j,
                  ((float*)abuf)[i*N+j],((float*)cbuf)[i*N+j]);
            }
            break;
          case C_DBL:
            ddiff = ((double*)abuf)[i*N+j]-((double*)cbuf)[i*N+j];
            if (((double*)abuf)[i*N+j] != 0.0) {
              ddiff /= ((double*)abuf)[i*N+j];
            }
            if (fabs(ddiff) > TOLERANCE) {
              printf("p[%d] [%d,%d] Actual: %f Expected: %f\n",me,i,j,
                  ((double*)abuf)[i*N+j],((double*)cbuf)[i*N+j]);
            }
            break;
          case C_DCPL:
            zdiff.real = ((DoubleComplex*)abuf)[i*N+j].real
              -((DoubleComplex*)cbuf)[i*N+j].real;
            zdiff.imag = ((DoubleComplex*)abuf)[i*N+j].imag
              -((DoubleComplex*)cbuf)[i*N+j].imag;
            if (((DoubleComplex*)abuf)[i*N+j].real != 0.0 ||
                ((DoubleComplex*)abuf)[i*N+j].imag != 0.0) {
              ztmp = ((DoubleComplex*)abuf)[i*N+j];
              ddiff = sqrt((zdiff.real*zdiff.real+zdiff.imag*zdiff.imag)
                  /(ztmp.real*ztmp.real+ztmp.imag*ztmp.imag));
            } else {
              ddiff = sqrt(zdiff.real*zdiff.real+zdiff.imag*zdiff.imag);
            }
            if (fabs(ddiff) > TOLERANCE) {
              printf("p[%d] [%d,%d] Actual: (%f,%f) Expected: (%f,%f)\n",me,i,j,
                  ((DoubleComplex*)abuf)[i*N+j].real,
                  ((DoubleComplex*)abuf)[i*N+j].imag,
                  ((DoubleComplex*)cbuf)[i*N+j].real,
                  ((DoubleComplex*)cbuf)[i*N+j].imag);
            }
            break;
          case C_SCPL:
            cdiff.real = ((SingleComplex*)abuf)[i*N+j].real
              -((SingleComplex*)cbuf)[i*N+j].real;
            cdiff.imag = ((SingleComplex*)abuf)[i*N+j].imag
              -((SingleComplex*)cbuf)[i*N+j].imag;
            if (((SingleComplex*)abuf)[i*N+j].real != 0.0 ||
                ((SingleComplex*)abuf)[i*N+j].imag != 0.0) {
              ctmp = ((SingleComplex*)abuf)[i*N+j];
              fdiff = sqrt((cdiff.real*cdiff.real+cdiff.imag*cdiff.imag)
                  /(ctmp.real*ctmp.real+ctmp.imag*ctmp.imag));
            } else {
              fdiff = sqrt(cdiff.real*cdiff.real+cdiff.imag*cdiff.imag);
            }
            if (fabs(fdiff) > TOLERANCE) {
              printf("p[%d] [%d,%d] Actual: (%f,%f) Expected: (%f,%f)\n",me,i,j,
                  ((SingleComplex*)abuf)[i*N+j].real,
                  ((SingleComplex*)abuf)[i*N+j].imag,
                  ((SingleComplex*)cbuf)[i*N+j].real,
                  ((SingleComplex*)cbuf)[i*N+j].imag);
            }
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
  }
  GA_Sync();

  /* copy cbuf back to g_a */
  if (me == 0) {
    NGA_Put(g_a,lo,hi,cbuf,&ld);
  }
  GA_Sync();

  /* Get norm of g_a */
  switch (data_type) {
    case C_FLOAT:
      ftmp = GA_Fdot(g_a,g_a);
      break;
    case C_DBL:
      dtmp = GA_Ddot(g_a,g_a);
      break;
    case C_DCPL:
      ztmp = GA_Zdot(g_a,g_a);
      break;
    case C_SCPL:
      ctmp = GA_Cdot(g_a,g_a);
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
  /* subtract C from A and put the results in B */
  beta_flt = -1.0;
  beta_dbl = -1.0;
  beta_scpl.real = -1.0;
  beta_dcpl.real = -1.0;
  GA_Zero(g_b);
  GA_Add(alpha,g_a,beta,g_c,g_b);
  /* evaluate the norm of the difference between the two matrices */
  switch (data_type) {
    case C_FLOAT:
      fdiff = GA_Fdot(g_b, g_b);
      if (ftmp != 0.0) {
        fdiff /= ftmp;
      }
      if(fabs(fdiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(fdiff), TOLERANCE);
        GA_Error("GA_Sgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Sgemm OK\n\n");
      }
      break;
    case C_DBL:
      ddiff = GA_Ddot(g_b, g_b);
      if (dtmp != 0.0) {
        ddiff /= dtmp;
      }
      if(fabs(ddiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(ddiff), TOLERANCE);
        GA_Error("GA_Dgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Dgemm OK\n\n");
      }
      break;
    case C_DCPL:
      zdiff = GA_Zdot(g_b, g_b);
      if (ztmp.real != 0.0 || ztmp.imag != 0.0) {
        ddiff = sqrt((zdiff.real*zdiff.real+zdiff.imag*zdiff.imag)
            /(ztmp.real*ztmp.real+ztmp.imag*ztmp.imag));
      } else {
        ddiff = sqrt(zdiff.real*zdiff.real+zdiff.imag*zdiff.imag);
      }
      if(fabs(ddiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(zdiff.real), TOLERANCE);
        GA_Error("GA_Zgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Zgemm OK\n\n");
      }
      break;
    case C_SCPL:
      cdiff = GA_Cdot(g_b, g_b);
      if (ctmp.real != 0.0 || ctmp.imag != 0.0) {
        fdiff = sqrt((cdiff.real*cdiff.real+cdiff.imag*cdiff.imag)
            /(ctmp.real*ctmp.real+ctmp.imag*ctmp.imag));
      } else {
        fdiff = sqrt(cdiff.real*cdiff.real+cdiff.imag*cdiff.imag);
      }
      if(fabs(fdiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(cdiff.real), TOLERANCE);
        GA_Error("GA_Cgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Cgemm OK\n\n");
      }
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
#endif

  free(abuf);
  free(bbuf);
  free(cbuf);

  switch (data_type) {
  case C_FLOAT:
    abuf = (void*)malloc(N*N*sizeof(float)/4);
    bbuf = (void*)malloc(N*N*sizeof(float)/4);
    cbuf = (void*)malloc(N*N*sizeof(float)/4);
    break;      
  case C_DBL:
    abuf = (void*)malloc(N*N*sizeof(double)/4);
    bbuf = (void*)malloc(N*N*sizeof(double)/4);
    cbuf = (void*)malloc(N*N*sizeof(double)/4);
    break;    
  case C_DCPL:
    abuf = (void*)malloc(N*N*sizeof(DoubleComplex)/4);
    bbuf = (void*)malloc(N*N*sizeof(DoubleComplex)/4);
    cbuf = (void*)malloc(N*N*sizeof(DoubleComplex)/4);
    break;
  case C_SCPL:
    abuf = (void*)malloc(N*N*sizeof(SingleComplex)/4);
    bbuf = (void*)malloc(N*N*sizeof(SingleComplex)/4);
    cbuf = (void*)malloc(N*N*sizeof(SingleComplex)/4);
    break;
  default:
    GA_Error("wrong data type", data_type);
  }

  /* Test multiply on a fraction of matrix. Start by reinitializing
   * A and B */
  GA_Zero(g_a);
  GA_Zero(g_b);
  GA_Zero(g_c);

  if (me==0) printf("\nTest patch multiply\n");

  lo[0] = N/4;
  lo[1] = N/4;
  hi[0] = 3*N/4-1;
  hi[1] = 3*N/4-1;
  ld = N/2;

  /* Set up matrix A */
  if (me==0) printf("\nInitialize A\n");
  if (me == 0) {
    for (i=N/4; i<3*N/4; i++) {
      for (j=N/4; j<3*N/4; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)abuf)[(i-N/4)*N/2+(j-N/4)] = (float)(i*N+j);
            break;
          case C_DBL:
            ((double*)abuf)[(i-N/4)*N/2+(j-N/4)] = (double)(i*N+j);
            break;
          case C_DCPL:
            ((DoubleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].real = (double)(i*N+j);
            ((DoubleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0;
            break;
          case C_SCPL:
            ((SingleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].real = (float)(i*N+j);
            ((SingleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
    NGA_Put(g_a,lo,hi,abuf,&ld);
  }
  GA_Sync();

  if (me==0) printf("\nInitialize B\n");
  /* Set up matrix B */
  if (me == 0) {
    for (i=N/4; i<3*N/4; i++) {
      for (j=N/4; j<3*N/4; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)bbuf)[(i-N/4)*N/2+(j-N/4)] = (float)(j*N+i);
            break;
          case C_DBL:
            ((double*)bbuf)[(i-N/4)*N/2+(j-N/4)] = (double)(j*N+i);
            break;
          case C_DCPL:
            ((DoubleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].real = (double)(j*N+i);
            ((DoubleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0;
            break;
          case C_SCPL:
            ((SingleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].real = (float)(j*N+i);
            ((SingleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
    NGA_Put(g_b,lo,hi,bbuf,&ld);
  }
  GA_Sync();

  beta_flt = 0.0;
  beta_dbl = 0.0;
  beta_scpl.real = 0.0;
  beta_dcpl.real = 0.0;
  if (me==0) printf("\nPerform matrix multiply on sub-blocks\n");
  switch (data_type) {
    case C_FLOAT:
      NGA_Matmul_patch('N','N',&alpha_flt,&beta_flt,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_DBL:
      NGA_Matmul_patch('N','N',&alpha_dbl,&beta_dbl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_SCPL:
      NGA_Matmul_patch('N','N',&alpha_scpl,&beta_scpl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_DCPL:
      NGA_Matmul_patch('N','N',&alpha_dcpl,&beta_dcpl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
  GA_Sync();
#if 0
  if (0) {
  /*
  if (data_type != C_SCPL && data_type != C_DCPL) {
  */

  if (me==0) printf("\nCheck answer\n");

  /* Multiply buffers by hand */
  if (me == 0) {
    for (i=0; i<N/2; i++) {
      for (j=0; j<N/2; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)cbuf)[i*N/2+j] = fzero;
            break;
          case C_DBL:
            ((double*)cbuf)[i*N/2+j] = dzero;
            break;
          case C_DCPL:
            ((DoubleComplex*)cbuf)[i*N/2+j] = zzero;
            break;
          case C_SCPL:
            ((SingleComplex*)cbuf)[i*N/2+j] = czero;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
        for (k=0; k<N/2; k++) {
          switch (data_type) {
            case C_FLOAT:
              ((float*)cbuf)[i*N/2+j] += ((float*)abuf)[i*N/2+k]
                *((float*)bbuf)[k*N/2+j];
              break;
            case C_DBL:
              ((double*)cbuf)[i*N/2+j] += ((double*)abuf)[i*N/2+k]
                *((double*)bbuf)[k*N/2+j];
              break;
            case C_DCPL:
              ((DoubleComplex*)cbuf)[i*N/2+j].real +=
                (((DoubleComplex*)abuf)[i*N/2+k].real
                 *((DoubleComplex*)bbuf)[k*N/2+j].real
                 -(((DoubleComplex*)abuf)[i*N/2+k].imag
                   *((DoubleComplex*)bbuf)[k*N/2+j].imag));
              ((DoubleComplex*)cbuf)[i*N/2+j].imag +=
                (((DoubleComplex*)abuf)[i*N/2+k].real
                 *((DoubleComplex*)bbuf)[k*N/2+j].imag
                 +(((DoubleComplex*)abuf)[i*N/2+k].imag
                   *((DoubleComplex*)bbuf)[k*N/2+j].real));
              break;
            case C_SCPL:
              ((SingleComplex*)cbuf)[i*N/2+j].real +=
                (((SingleComplex*)abuf)[i*N/2+k].real
                 *((SingleComplex*)bbuf)[k*N/2+j].real
                 -(((SingleComplex*)abuf)[i*N/2+k].imag
                   *((SingleComplex*)bbuf)[k*N/2+j].imag));
              ((SingleComplex*)cbuf)[i*N/2+j].imag +=
                (((SingleComplex*)abuf)[i*N/2+k].real
                 *((SingleComplex*)bbuf)[k*N/2+j].imag
                 +(((SingleComplex*)abuf)[i*N/2+k].imag
                   *((SingleComplex*)bbuf)[k*N/2+j].real));
              break;
            default:
              GA_Error("wrong data type", data_type);
          }
        }
      }
    }
    NGA_Put(g_a,lo,hi,cbuf,&ld);
  }
  if (me == 0) printf("\n\n\n\n");

  /* Get norm of g_a */
  switch (data_type) {
    case C_FLOAT:
      ftmp = NGA_Fdot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi);
      break;
    case C_DBL:
      dtmp = NGA_Ddot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi);
      break;
    case C_DCPL:
      ztmp = NGA_Zdot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi);
      break;
    case C_SCPL:
      ctmp = NGA_Cdot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi);
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
  /* subtract C from A and put the results in B */
  beta_flt = -1.0;
  beta_dbl = -1.0;
  beta_scpl.real = -1.0;
  beta_dcpl.real = -1.0;
  NGA_Zero_patch(g_b,lo,hi);
  NGA_Add_patch(alpha,g_a,lo,hi,beta,g_c,lo,hi,g_b,lo,hi);
  /* evaluate the norm of the difference between the two matrices */
  switch (data_type) {
    case C_FLOAT:
      fdiff = NGA_Fdot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi);
      if (ftmp != 0.0) {
        fdiff /= ftmp;
      }
      if(fabs(fdiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(fdiff), TOLERANCE);
        GA_Error("GA_Sgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Sgemm OK\n\n");
      }
      break;
    case C_DBL:
      ddiff = NGA_Ddot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi);
      if (dtmp != 0.0) {
        ddiff /= dtmp;
      }
      if(fabs(ddiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(ddiff), TOLERANCE);
        GA_Error("GA_Dgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Dgemm OK\n\n");
      }
      break;
    case C_DCPL:
      zdiff = NGA_Zdot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi);
      if (ztmp.real != 0.0 || ztmp.imag != 0.0) {
        ddiff = sqrt((zdiff.real*zdiff.real+zdiff.imag*zdiff.imag)
            /(ztmp.real*ztmp.real+ztmp.imag*ztmp.imag));
      } else {
        ddiff = sqrt(zdiff.real*zdiff.real+zdiff.imag*zdiff.imag);
      }
      if(fabs(ddiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(zdiff.real), TOLERANCE);
        GA_Error("GA_Zgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Zgemm OK\n\n");
      }
      break;
    case C_SCPL:
      cdiff = NGA_Cdot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi);
      if (ctmp.real != 0.0 || ctmp.imag != 0.0) {
        fdiff = sqrt((cdiff.real*cdiff.real+cdiff.imag*cdiff.imag)
            /(ctmp.real*ctmp.real+ctmp.imag*ctmp.imag));
      } else {
        fdiff = sqrt(cdiff.real*cdiff.real+cdiff.imag*cdiff.imag);
      }
      if(fabs(fdiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(cdiff.real), TOLERANCE);
        GA_Error("GA_Cgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Cgemm OK\n\n");
      }
      break;
    default:
      GA_Error("wrong data type", data_type);
  }

  }
#endif
  free(abuf);
  free(bbuf);
  free(cbuf);

  GA_Destroy(g_a);
  GA_Destroy(g_b);
  GA_Destroy(g_c);
}

Example #25

0

Show file

File: ga_put.c Project: sg0/Elemental

int main(int argc, char **argv)
{
  int rank, nprocs;
  int g_A;
  int *local_A=NULL, *local_B=NULL, *output_A=NULL;
  int dims[DIM]={SIZE,SIZE}, dims2[DIM], lo[DIM]={SIZE-SIZE,SIZE-SIZE}, hi[DIM]={SIZE-1,SIZE-1}, ld=SIZE;
  int value=SIZE;
  //int value=0;

#if defined(USE_ELEMENTAL)
  // initialize Elemental (which will initialize MPI)
  ElInitialize( &argc, &argv );
  ElMPICommRank( MPI_COMM_WORLD, &rank );
  ElMPICommSize( MPI_COMM_WORLD, &nprocs );
  // instantiate el::global array
  ElGlobalArraysConstruct_i( &eliga );
  // initialize global arrays
  ElGlobalArraysInitialize_i( eliga );
#else
  MPI_Init(&argc, &argv);

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  MA_init(C_INT, 1000, 1000);

  GA_Initialize();
#endif

  local_A=(int*)malloc(SIZE*SIZE*sizeof(int));
  output_A=(int*)malloc(SIZE*SIZE*sizeof(int));
  memset (output_A, 0, SIZE*SIZE*sizeof(int));
  for(int j=0; j<SIZE; j++)
      for(int i=0; i<SIZE; i++) local_A[i+j*ld]=(i + j);

  local_B=(int*)malloc(SIZE*SIZE*sizeof(int));
  memset (local_B, 0, SIZE*SIZE*sizeof(int));

#if defined(USE_ELEMENTAL)
  ElGlobalArraysCreate_i( eliga, DIM, dims, "array_A", NULL, NULL, &g_A );
  ElGlobalArraysFill_i( eliga, g_A, &value );
  ElGlobalArraysPrint_i( eliga, g_A );
  // acc data
  ElGlobalArraysPut_i( eliga, g_A, lo, hi, local_A, &ld );
  ElGlobalArraysSync_i( eliga );
  // get
  ElGlobalArraysGet_i( eliga, g_A, lo, hi, local_B, &ld );
  ElGlobalArraysSync_i( eliga );
  ElGlobalArraysPrint_i( eliga, g_A );
#else
  g_A = NGA_Create(C_INT, DIM, dims, "array_A", NULL);
  GA_Fill(g_A, &value);
  GA_Print(g_A);

  NGA_Put(g_A, lo, hi, local_A, &ld);
  
  GA_Sync();
  
  NGA_Get(g_A, lo, hi, local_B, &ld);

  GA_Sync();
  
  GA_Print(g_A);
#endif

  // updated output
  MPI_Reduce (local_A, output_A, SIZE*SIZE, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);

  if(rank==0)
    {
      printf(" Original local buffer to be accumulated: \n");

      for(int i=0; i<SIZE; i++)
	{
	  for(int j=0; j<SIZE; j++)
	    printf("%d ", local_A[i*ld+j]);
	  printf("\n");
	}
      printf("\n");
      printf(" Get returns: \n");
      for(int i=0; i<SIZE; i++)
	{
	  for(int j=0; j<SIZE; j++)
	    printf("%d ", local_B[i*ld + j]);
	  printf("\n");
	}

      printf("\n");
      for(int i=0; i<SIZE; i++)
	{
	  for(int j=0; j<SIZE; j++)
	    {
	      if(local_B[i*ld+j]!=output_A[i*ld+j])
		  GA_Error("ERROR", -99);
	    }
	}
    }
#if defined(USE_ELEMENTAL)
  ElGlobalArraysDestroy_i( eliga, g_A );
#else
  GA_Destroy(g_A);
#endif
  if(rank == 0)
    printf ("OK. Test passed\n");

    free (local_A);
    free (local_B);
    free (output_A);

#if defined(USE_ELEMENTAL)
    ElGlobalArraysTerminate_i( eliga );
    // call el::global arrays destructor
    ElGlobalArraysDestruct_i( eliga );
    ElFinalize();
#else
    GA_Terminate();
    MPI_Finalize();
#endif
}

Example #26

0

Show file

File: matrix1.c Project: MihaiLupoiu/TPP

void matrix_multiply() {
    
    int dims[NDIM], chunk[NDIM], ld[NDIM];
    int lo[NDIM], hi[NDIM], lo1[NDIM], hi1[NDIM];
    int lo2[NDIM], hi2[NDIM], lo3[NDIM], hi3[NDIM];
    int g_a, g_b, g_c, i, j, k, l;
    int me, nprocs;

    /* Find local processor ID and the number of processors */
    /* ### assign processor ID to the int variable "me" and the total number
     * ### of processors to the int variable "nprocs" */
    me     = GA_Nodeid();
    nprocs = GA_Nnodes();
    
    /* Configure array dimensions. Force an unequal data distribution */
    for(i=0; i<NDIM; i++) {
       dims[i]  = TOTALELEMS;
       ld[i]    = dims[i];
       chunk[i] = TOTALELEMS/nprocs-1; /*minimum block size on each process*/
    }
 
    /* create a global array g_a and duplicate it to get g_b and g_c*/
    /* ### create GA of doubles with dimension "NDIM" and size "dims" with
     * ### minimum block size "chunk" and assign the handle to the
     * ### integer variable "g_a". Then create remaining global arrays,
     * ### assigned to the integer handle "g_b" and "g_c" by duplicating
     * ### g_a. Assign the names "Array A", "Array B" and "Array C" to
     * ### "g_a", "g_b", and "g_c". */

    g_a=NGA_Create(C_DBL, NDIM, dims, "array A", chunk);
    

    if (!g_a) GA_Error("create failed: A", NDIM);
    if (me==0) printf("  Created Array A\n");

    g_b=GA_Duplicate(g_a,"array B");
    g_c=GA_Duplicate(g_a,"array C");
    if (!g_b || !g_c) GA_Error("duplicate failed",NDIM);
    if (me==0) printf("  Created Arrays B and C\n");
 
    /* initialize data in matrices a and b */
    if(me==0)printf("  Initializing matrix A and B\n");
    k = 0; l = 7;
    for(i=0; i<dims[0]; i++) {
       for(j=0; j<dims[1]; j++) {
          a[i][j] = (double)(++k%29);
          b[i][j] = (double)(++l%37);
       }
    }

    /*  Copy data to global arrays g_a and g_b */
    lo1[0] = 0;
    lo1[1] = 0;
    hi1[0] = dims[0]-1;
    hi1[1] = dims[1]-1;
    if (me==0) {
      /* ### copy the contents of array "a" into the portion of global array
       * ### "g_a" described by "lo1" and "hi1". Similarly, copy the contents
       * ### of the array "b" into corresponding portion of global array "g_b".
       * ### Use the array of strides "ld" to describe the physical layout of
       * ### arrays "a" and "b". */
      NGA_Put(g_a,lo1,hi1,a,ld);
      NGA_Put(g_b,lo1,hi1,b,ld);

    }
    
    /*  Synchronize all processors to make sure everyone has data */
    /* ### synchronize all processors */

    GA_Sync();


    /* Determine which block of data is locally owned. Note that
       the same block is locally owned for all GAs. */
    /* ### find out which block of data my node ("me") owns for the global
     * ### array "g_c" and store the contents in the integer arrays "lo" and
     * ### "hi". */
    NGA_Distribution(g_c,me,lo,hi);

    
    /* Get the blocks from g_a and g_b needed to compute this block in
       g_c and copy them into the local buffers a and b. */
    lo2[0] = lo[0];
    lo2[1] = 0;
    hi2[0] = hi[0];
    hi2[1] = dims[0]-1;
    /* ### copy the block of data described by the arrays "lo2" and "hi2" from
     * ### the global array "g_a" into the local array "a". Use the array of
     * ### strides "ld" to describe the physical layout of "a". */
    NGA_Get(g_a,lo2,hi2,a,ld);

    lo3[0] = 0;
    lo3[1] = lo[1];
    hi3[0] = dims[1]-1;
    hi3[1] = hi[1];
    /* ### copy the block of data described by the arrays "lo3" and "hi3" from
     * ### the global array "g_b" into the local array "b". Use the array of
     * ### strides "ld" to describe the physical layout of "b". */
     NGA_Get(g_b,lo3,hi3,b,ld);

    /* Do local matrix multiplication and store the result in local
       buffer c. Start by evaluating the transpose of b. */
    for(i=0; i < hi3[0]-lo3[0]+1; i++)
       for(j=0; j < hi3[1]-lo3[1]+1; j++) 
          btrns[j][i] = b[i][j];

    /* Multiply a and b to get c */
    for(i=0; i < hi[0] - lo[0] + 1; i++) {
       for(j=0; j < hi[1] - lo[1] + 1; j++) {
          c[i][j] = 0.0;
          for(k=0; k<dims[0]; k++)
             c[i][j] = c[i][j] + a[i][k]*btrns[j][k];
       }
    }
    
    /* Copy c back to g_c */
    /* ### copy data from the local array "c" into the block of the global
     * ### array "g_c" described by the integer arrays "lo" and "hi". Use
     * ### the array of strides "ld" to describe the physical layout of "c". */
    NGA_Put(g_c,lo,hi,c,ld);

    verify(g_a, g_b, g_c, lo1, hi1, ld);
    
    /* Deallocate arrays */
    /* ### destroy the global arrays "g_a", "g_b", "g_c" */
    GA_Destroy(g_a);
    GA_Destroy(g_b);
    GA_Destroy(g_c);
}

Example #27

0

Show file

File: ga_nbacc.c Project: sg0/Elemental

int main(int argc, char **argv)
{
  int rank, nprocs;
  int g_A;
  int *local_A=NULL, *local_B=NULL, *output_A=NULL;
  int dims[DIM]={SIZE,SIZE}, dims2[DIM], lo[DIM]={SIZE-SIZE,SIZE-SIZE}, hi[DIM]={SIZE-1,SIZE-1}, ld=SIZE;
  int value=SIZE;

#if defined(USE_ELEMENTAL)
  // initialize Elemental (which will initialize MPI)
  ElInitialize( &argc, &argv );
  ElMPICommRank( MPI_COMM_WORLD, &rank );
  ElMPICommSize( MPI_COMM_WORLD, &nprocs );
  // instantiate el::global array
  ElGlobalArraysConstruct_i( &eliga );
  // initialize global arrays
  ElGlobalArraysInitialize_i( eliga );
#else
  MPI_Init(&argc, &argv);

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  MA_init(C_INT, 1000, 1000);

  GA_Initialize();
#endif

  local_A=(int*)malloc(SIZE*SIZE*sizeof(int));
  output_A=(int*)malloc(SIZE*SIZE*sizeof(int));
  memset (output_A, 0, SIZE*SIZE*sizeof(int));
  for(int j=0; j<SIZE; j++)
      for(int i=0; i<SIZE; i++) local_A[i+j*ld]=(i + j);
      //for(int i=0; i<SIZE; i++) local_A[i+j*ld]=(rand()%10);

  local_B=(int*)malloc(SIZE*SIZE*sizeof(int));
  memset (local_B, 0, SIZE*SIZE*sizeof(int));

  // nb handle
#if defined(USE_ELEMENTAL)
  typedef ElInt ga_nbhdl_t;
#endif
  ga_nbhdl_t nbnb;

#if defined(USE_ELEMENTAL)
  ElGlobalArraysCreate_i( eliga, DIM, dims, "array_A", NULL, &g_A );
  ElGlobalArraysFill_i( eliga, g_A, &value );
#else
  g_A = NGA_Create(C_INT, DIM, dims, "array_A", NULL);
  GA_Fill(g_A, &value);
#endif

  if (rank == 0) printf ("Initial global array:\n");
#if defined(USE_ELEMENTAL)
  ElGlobalArraysPrint_i( eliga, g_A );
#else
  GA_Print(g_A);
#endif

  for (int i = 0; i < NITERS; i++)
  {
      // acc data
#if defined(USE_ELEMENTAL)
      ElGlobalArraysNBAccumulate_i( eliga, g_A, lo, hi, local_A, &ld, &value, &nbnb );
#else
      NGA_NbAcc(g_A, lo, hi, local_A, &ld, &value, &nbnb);
#endif
      
      // updated output
      MPI_Reduce (local_A, output_A, SIZE*SIZE, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

#if defined(USE_ELEMENTAL)
      ElGlobalArraysNBWait_i( eliga, &nbnb );
#else
      NGA_NbWait (&nbnb);
#endif 

      // get
      if (rank == 0) printf ("Get in iter #%d\n", i);
#if defined(USE_ELEMENTAL)
      ElGlobalArraysSync_i( eliga );
      ElGlobalArraysGet_i( eliga, g_A, lo, hi, local_B, &ld );
      ElGlobalArraysPrint_i( eliga, g_A );
#else
      GA_Sync();
      NGA_Get(g_A, lo, hi, local_B, &ld);
      GA_Print(g_A);
#endif
  } // end of iters

  if(rank==0)
    {
      printf(" Alpha (multiplier): %d\n", value);
      printf(" Original local buffer (before accumulation): \n");

      for(int i=0; i<SIZE; i++)
	{
	  for(int j=0; j<SIZE; j++)
	    printf("%d ", local_A[i*ld+j]);
	  printf("\n");
	}
      printf("\n");
      printf(" Get returns: \n");
      for(int i=0; i<SIZE; i++)
	{
	  for(int j=0; j<SIZE; j++)
	    printf("%d ", local_B[i*ld + j]);
	  printf("\n");
	}

      printf("\n");
      for(int i=0; i<SIZE; i++)
	{
	  for(int j=0; j<SIZE; j++)
	    {
	      if(local_B[i*ld+j]!=(value + (NITERS * value * (output_A[i*ld+j]))))
		  GA_Error("ERROR", -99);
	    }
	}
    }
#if defined(USE_ELEMENTAL)
  ElGlobalArraysDestroy_i( eliga, g_A );
#else
  GA_Destroy(g_A);
#endif
  if(rank == 0)
    printf ("OK. Test passed\n");

    free (local_A);
    free (local_B);
    free (output_A);

#if defined(USE_ELEMENTAL)
    ElGlobalArraysTerminate_i( eliga );
    // call el::global arrays destructor
    ElGlobalArraysDestruct_i( eliga );
    ElFinalize();
#else
    GA_Terminate();
    MPI_Finalize();
#endif
}

Example #28

0

Show file

File: ga_add.c Project: jeffhammond/ga

main(int argc, char **argv)
{
  int rank, nprocs, i, j;
  int g_A, g_B, g_C, **local_C=NULL, dims[DIM]={SIZE,SIZE}, val1=5, val2=4, alpha=3, beta=2;
  int clo[DIM]={SIZE-SIZE,SIZE-SIZE}, chi[DIM]={SIZE-1,SIZE-1}, ld=SIZE;
  int **local_tm=NULL;

  MPI_Init(&argc, &argv);

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  MA_init(C_INT, 1000, 1000);

  GA_Initialize();

  local_C=(int**)malloc(SIZE*sizeof(int*));
  for(i=0; i<SIZE; i++)
    local_C[i]=(int*)malloc(SIZE*sizeof(int));

  local_tm=(int**)malloc(SIZE*sizeof(int*));
  for(i=0; i<SIZE; i++)
    local_tm[i]=(int*)malloc(SIZE*sizeof(int));
  
  g_A = NGA_Create(C_INT, DIM, dims, "array_A", NULL);

  g_B = GA_Duplicate(g_A, "array_B");
  g_C = GA_Duplicate(g_A, "array_C");

  GA_Fill(g_A, &val1);
  GA_Fill(g_B, &val2);
  
  GA_Add(&alpha, g_A, &beta, g_B, g_C);

  GA_Sync();

  GA_Print(g_A);
  GA_Print(g_B);
  GA_Print(g_C);

  //printf("check 1\n");
  NGA_Get(g_C, clo, chi, local_tm, &ld);
  //printf("check 2\n");
  // GA_Sync();
  if(rank==0)
    {
      for(i=0; i<SIZE; i++)
	{
	  for(j=0; j<SIZE; j++)printf("%d ", local_tm[i][j]);
	  printf("\n");
	}
    }
  /*
  if(rank==0)
    {
      NGA_Get(g_C, clo, chi, local_C, &ld);
      
      printf("check 1 \n");
      
      for(i=0; i<SIZE; i++)
	{
	  for(j=0; j<SIZE; j++)printf("%d ", local_C[i][j]);
	  printf("\n");
	}
      
      printf("check 2\n");
      
      for(i=0; i<SIZE; i++)
	{
	  for(j=0; j<SIZE; j++)
	    if(local_C[i][j]!=(alpha*val1)+(beta*val2)) printf("GA Error : \n");
	}
    }

  */
  //GA_Sync();
  if(rank==0)
    printf("Test Completed \n");

  //  GA_Sync();

  /*
  GA_Destroy(g_A);
  GA_Destroy(g_B);
  GA_Destroy(g_C);
  */

  //*******************************************************************

  /* what would be the possible reason for GA_destroy to get failed .., 
   * solve this before consolidate the whole
   */

  GA_Terminate();
  MPI_Finalize();
 
}

Example #29

0

Show file

File: adjacency_list.cpp Project: Anastien/GridPACK

// -------------------------------------------------------------
// AdjacencyList::ready
// -------------------------------------------------------------
void
AdjacencyList::ready(void)
{
#if 1
  int grp = this->communicator().getGroup();
  int me = GA_Pgroup_nodeid(grp);
  int nprocs = GA_Pgroup_nnodes(grp);
  p_adjacency.clear();
  p_adjacency.resize(p_global_nodes.size());

  // Find total number of nodes and edges. Assume no duplicates
  int nedges = p_edges.size();
  int total_edges = nedges;
  char plus[2];
  strcpy(plus,"+");
  GA_Pgroup_igop(grp,&total_edges, 1, plus);
  int nnodes = p_original_nodes.size();
  int total_nodes = nnodes;
  GA_Pgroup_igop(grp,&total_nodes, 1, plus);

  // Create a global array containing original indices of all nodes and indexed
  // by the global index of the node
  int i, p;
  int dist[nprocs];
  for (p=0; p<nprocs; p++) {
    dist[p] = 0;
  }
  dist[me] = nnodes;
  GA_Pgroup_igop(grp,dist,nprocs,plus);
  int *mapc = new int[nprocs+1];
  mapc[0] = 0;
  for (p=1; p<nprocs; p++) {
    mapc[p] = mapc[p-1] + dist[p-1];
  }
  mapc[nprocs] = total_nodes;
  int g_nodes = GA_Create_handle();
  int dims = total_nodes;
  NGA_Set_data(g_nodes,1,&dims,C_INT);
  NGA_Set_pgroup(g_nodes, grp);
  if (!GA_Allocate(g_nodes)) {
    char buf[256];
    sprintf(buf,"AdjacencyList::ready: Unable to allocate distributed array"
        " for bus indices\n");
    printf(buf);
    throw gridpack::Exception(buf);
  }
  int lo, hi;
  lo = mapc[me];
  hi = mapc[me+1]-1;
  int size = hi - lo + 1;
  int o_idx[size], g_idx[size];
  for (i=0; i<size; i++) o_idx[i] = p_original_nodes[i]; 
  for (i=0; i<size; i++) g_idx[i] = p_global_nodes[i]; 
  int **indices= new int*[size];
  int *iptr = g_idx;
  for (i=0; i<size; i++) {
    indices[i] = iptr;
    iptr++;
  }
  if (size > 0) NGA_Scatter(g_nodes,o_idx,indices,size);
  GA_Pgroup_sync(grp);
  delete [] indices;
  delete [] mapc;

  // Cycle through all nodes and match them up with nodes at end of edges.
  for (p=0; p<nprocs; p++) {
    int iproc = (me+p)%nprocs;
    // Get node data from process iproc
    NGA_Distribution(g_nodes,iproc,&lo,&hi);
    size = hi - lo + 1;
    if (size <= 0) continue;
    int *buf = new int[size];
    int ld = 1;
    NGA_Get(g_nodes,&lo,&hi,buf,&ld);
    // Create a map of the nodes from process p
    std::map<int,int> nmap;
    std::map<int,int>::iterator it;
    std::pair<int,int> pr;
    for (i=lo; i<=hi; i++){
      pr = std::pair<int,int>(buf[i-lo],i);
      nmap.insert(pr);
    }
    delete [] buf;
    // scan through the edges looking for matches. If there is a match, set the
    // global index
    int idx;
    for (i=0; i<nedges; i++) {
      idx = static_cast<int>(p_edges[i].original_conn.first);
      it = nmap.find(idx);
      if (it != nmap.end()) {
        p_edges[i].global_conn.first = static_cast<Index>(it->second);
      }
      idx = static_cast<int>(p_edges[i].original_conn.second);
      it = nmap.find(idx);
      if (it != nmap.end()) {
        p_edges[i].global_conn.second = static_cast<Index>(it->second);
      }
    }
  }
  GA_Destroy(g_nodes);

  // All edges now have global indices assigned to them. Begin constructing
  // adjacency list. Start by creating a global array containing all edges
  dist[0] = 0;
  for (p=1; p<nprocs; p++) {
    double max = static_cast<double>(total_edges);
    max = (static_cast<double>(p))*(max/(static_cast<double>(nprocs)));
    dist[p] = 2*(static_cast<int>(max));
  }
  int g_edges = GA_Create_handle();
  dims = 2*total_edges;
  NGA_Set_data(g_edges,1,&dims,C_INT);
  NGA_Set_irreg_distr(g_edges,dist,&nprocs);
  NGA_Set_pgroup(g_edges, grp);
  if (!GA_Allocate(g_edges)) {
    char buf[256];
    sprintf(buf,"AdjacencyList::ready: Unable to allocate distributed array"
        " for branch indices\n");
    printf(buf);
    throw gridpack::Exception(buf);
  }

  // Add edge information to global array. Start by figuring out how much data
  // is associated with each process
  for (p=0; p<nprocs; p++) {
    dist[p] = 0;
  }
  dist[me] = nedges;
  GA_Pgroup_igop(grp,dist, nprocs, plus);
  int offset[nprocs];
  offset[0] = 0;
  for (p=1; p<nprocs; p++) {
    offset[p] = offset[p-1] + 2*dist[p-1];
  }
  // Figure out where local data goes in GA and then copy it to GA
  lo = offset[me];
  hi = lo + 2*nedges - 1;
  int edge_ids[2*nedges];
  for (i=0; i<nedges; i++) {
    edge_ids[2*i] = static_cast<int>(p_edges[i].global_conn.first);
    edge_ids[2*i+1] = static_cast<int>(p_edges[i].global_conn.second);
  }
  if (lo <= hi) {
    int ld = 1;
    NGA_Put(g_edges,&lo,&hi,edge_ids,&ld);
  }
  GA_Pgroup_sync(grp);

  // Cycle through all edges and find out how many are attached to the nodes on
  // your process. Start by creating a map between the global node indices and
  // the local node indices
  std::map<int,int> gmap;
  std::map<int,int>::iterator it;
  std::pair<int,int> pr;
  for (i=0; i<nnodes; i++){
    pr = std::pair<int,int>(static_cast<int>(p_global_nodes[i]),i);
    gmap.insert(pr);
  }
  // Cycle through edge information on each processor
  for (p=0; p<nprocs; p++) {
    int iproc = (me+p)%nprocs;
    NGA_Distribution(g_edges,iproc,&lo,&hi);
    int size = hi - lo + 1;
    int *buf = new int[size];
    int ld = 1;
    NGA_Get(g_edges,&lo,&hi,buf,&ld);
    BOOST_ASSERT(size%2 == 0);
    size = size/2;
    int idx1, idx2;
    Index idx;
    for (i=0; i<size; i++) {
      idx1 = buf[2*i];
      idx2 = buf[2*i+1];
      it = gmap.find(idx1);
      if (it != gmap.end()) {
        idx = static_cast<Index>(idx2);
        p_adjacency[it->second].push_back(idx);
      }
      it = gmap.find(idx2);
      if (it != gmap.end()) {
        idx = static_cast<Index>(idx1);
        p_adjacency[it->second].push_back(idx);
      }
    }
    delete [] buf;
  }
  GA_Destroy(g_edges);
  GA_Pgroup_sync(grp);
#else
  int me(this->processor_rank());
  int nproc(this->processor_size());

  p_adjacency.clear();
  p_adjacency.resize(p_nodes.size());

  IndexVector current_indexes;
  IndexVector connected_indexes;

  for (int p = 0; p < nproc; ++p) {

    // broadcast the node indexes owned by process p to all processes,
    // all processes work on these at once

    current_indexes.clear();
    if (me == p) {
      std::copy(p_nodes.begin(), p_nodes.end(), 
	  std::back_inserter(current_indexes));
      // std::cout << me << ": node indexes: ";
      // std::copy(current_indexes.begin(), current_indexes.end(),
      //           std::ostream_iterator<Index>(std::cout, ","));
      // std::cout << std::endl;
    }
    boost::mpi::broadcast(this->communicator(), current_indexes, p);

    // make a copy of the local edges in a list (so it's easier to
    // remove those completely accounted for)
    std::list<p_Edge> tmpedges;
    std::copy(p_edges.begin(), p_edges.end(), 
	std::back_inserter(tmpedges));

    // loop over the process p's node index set

    int local_index(0);
    for (IndexVector::iterator n = current_indexes.begin(); 
	n != current_indexes.end(); ++n, ++local_index) {

      // determine the local edges that refer to the current node index

      connected_indexes.clear();
      std::list<p_Edge>::iterator e(tmpedges.begin());
      //      std::cout << me << ": current node index: " << *n 
      //                << ", edges: " << tmpedges.size() 
      //                << std::endl;

      while (e != tmpedges.end()) {
	if (*n == e->conn.first && e->conn.second != bogus) {
	  connected_indexes.push_back(e->conn.second);
	  e->found.first = true;
	  // std::cout << me << ": found connection: edge " << e->index
	  //           << " (" << e->conn.first << ", " << e->conn.second << ")"
	  //           << std::endl;
	}
	if (*n == e->conn.second && e->conn.first != bogus) {
	  connected_indexes.push_back(e->conn.first);
	  e->found.second = true;
	  // std::cout << me << ": found connection: edge " << e->index
	  //           << " (" << e->conn.first << ", " << e->conn.second << ")"
	  //           << std::endl;
	}

	if (e->found.first && e->found.second) {
	  e = tmpedges.erase(e);
	} else if (e->conn.first == bogus || 
	    e->conn.second == bogus) {
	  e = tmpedges.erase(e);
	} else {
	  ++e;
	}
      }

      // gather all connections for the current node index to the
      // node's owner process, we have to gather the vectors because
      // processes will have different numbers of connections

      if (me == p) {
	size_t allsize;
        boost::mpi::reduce(this->communicator(), 
                           connected_indexes.size(), allsize, std::plus<size_t>(), p);

	std::vector<IndexVector> all_connected_indexes;
        boost::mpi::gather(this->communicator(), 
                           connected_indexes, all_connected_indexes, p);
	p_adjacency[local_index].clear();
	for (std::vector<IndexVector>::iterator k = all_connected_indexes.begin();
	    k != all_connected_indexes.end(); ++k) {
	  std::copy(k->begin(), k->end(), 
	      std::back_inserter(p_adjacency[local_index]));
	}
      } else {
	boost::mpi::reduce(this->communicator(), 
                           connected_indexes.size(), std::plus<size_t>(), p);
	boost::mpi::gather(this->communicator(), connected_indexes, p);
      }
      this->communicator().barrier();
    }
    this->communicator().barrier();
  }
#endif
}

Example #30

0

Show file

File: testc.c Project: jeffhammond/ga

void do_work()
{
int ONE=1 ;   /* useful constants */
int g_a, g_b;
int n=N, type=MT_F_DBL;
int me=GA_Nodeid(), nproc=GA_Nnodes();
int i, row;
int dims[2]={N,N};
int lo[2], hi[2], ld;

/* Note: on all current platforms DoublePrecision == double */
double buf[N], err, alpha, beta;

     if(me==0)printf("Creating matrix A\n");
     g_a = NGA_Create(type, 2, dims, "A", NULL);
     if(!g_a) GA_Error("create failed: A",n); 
     if(me==0)printf("OK\n");

     if(me==0)printf("Creating matrix B\n");
     /* create matrix B  so that it has dims and distribution of A*/
     g_b = GA_Duplicate(g_a, "B");
     if(! g_b) GA_Error("duplicate failed",n); 
     if(me==0)printf("OK\n");

     GA_Zero(g_a);   /* zero the matrix */

     if(me==0)printf("Initializing matrix A\n");
     /* fill in matrix A with random values in range 0.. 1 */ 
     lo[1]=0; hi[1]=n-1;
     for(row=me; row<n; row+= nproc){
         /* each process works on a different row in MIMD style */
         lo[0]=hi[0]=row;   
         for(i=0; i<n; i++) buf[i]=sin((double)i + 0.1*(row+1));
         NGA_Put(g_a, lo, hi, buf, &n);
     }


     if(me==0)printf("Symmetrizing matrix A\n");
     GA_Symmetrize(g_a);   /* symmetrize the matrix A = 0.5*(A+A') */
   

     /* check if A is symmetric */ 
     if(me==0)printf("Checking if matrix A is symmetric\n");
     GA_Transpose(g_a, g_b); /* B=A' */
     alpha=1.; beta=-1.;
     GA_Add(&alpha, g_a, &beta, g_b, g_b);  /* B= A - B */
     err= GA_Ddot(g_b, g_b);
     
     if(me==0)printf("Error=%f\n",(double)err);
     
     if(me==0)printf("\nChecking atomic accumulate \n");

     GA_Zero(g_a);   /* zero the matrix */
     for(i=0; i<n; i++) buf[i]=(double)i;

     /* everybody accumulates to the same location/row */
     alpha = 1.0;
     row = n/2;
     lo[0]=hi[0]=row;
     lo[1]=0; hi[1]=n-1;
     ld = hi[1]-lo[1]+1;
     NGA_Acc(g_a, lo, hi, buf, &ld, &alpha );
     GA_Sync();

     if(me==0){ /* node 0 is checking the result */

        NGA_Get(g_a, lo, hi, buf,&ld);
        for(i=0; i<n; i++) if(buf[i] != (double)nproc*i)
           GA_Error("failed: column=",i);
        printf("OK\n\n");

     }
     
     GA_Destroy(g_a);
     GA_Destroy(g_b);
}