C++ (Cpp) NGA_Put примеры использования

Пример #1

0

Показать файл

Файл: ga_matrix.cpp Проект: Anastien/GridPACK

// -------------------------------------------------------------
// MatSetValues_DenseGA
// -------------------------------------------------------------
static 
PetscErrorCode
MatSetValues_DenseGA(Mat mat, 
                   PetscInt m, const PetscInt idxm[], PetscInt n, const PetscInt idxn[], 
                   const PetscScalar v[],InsertMode addv)
{
  PetscErrorCode ierr = 0;
  struct MatGACtx *ctx;  
  int i, j, idx;
  PetscScalar vij, one(1.0);
  int lo[2], hi[2], ld[2] = {1, 1};
  ierr = MatShellGetContext(mat, (void *)&ctx); CHKERRQ(ierr);

  idx = 0;
  for (i = 0; i < m; ++i) {
    for (j = 0; j < n; ++j, ++idx) {
      lo[0] = idxm[i];
      hi[0] = idxm[i];
      lo[1] = idxn[j];
      hi[1] = idxn[j];
      vij = v[idx];
      switch (addv) {
      case INSERT_VALUES:
        NGA_Put(ctx->ga, lo, hi, (void *)&vij, ld);
        break;
      case ADD_VALUES:
        NGA_Acc(ctx->ga, lo, hi, (void *)&vij, ld, &one);
        break;
      default:
        BOOST_ASSERT_MSG(false, "Unknown set operation");
      }
    }
  }
  return ierr;
}

Пример #2

0

Показать файл

Файл: ga_matrix.cpp Проект: Anastien/GridPACK

// -------------------------------------------------------------
// Vec2GA
// -------------------------------------------------------------
static
PetscErrorCode
Vec2GA(Vec x, int pgroup, int *ga, bool trans = false)
{
  int lrows, rows;
  PetscErrorCode ierr = 0;
  
  ierr = VecGetLocalSize(x, &lrows); CHKERRQ(ierr);
  ierr = VecGetSize(x, &rows); CHKERRQ(ierr);
  
  PetscInt vlo, vhi;
  ierr = VecGetOwnershipRange(x, &vlo, &vhi); CHKERRQ(ierr);
  
  PetscScalar *v;
  ierr = VecGetArray(x, &v); CHKERRQ(ierr);

  int lo[2] = {0,0}, hi[2] = {0,0}, ld[2] = {1,1};
  if (!trans) {
    ierr = CreateMatGA(pgroup, lrows, 1, rows, 1, ga); CHKERRQ(ierr);
    lo[0] = vlo; 
    hi[0] = vhi-1;
  } else {
    ierr = CreateMatGA(pgroup, 1, lrows, 1, rows, ga); CHKERRQ(ierr);
    lo[1] = vlo; 
    hi[1] = vhi-1;
  }
  NGA_Put(*ga, lo, hi, v, ld);
  // GA_Print(*ga);
  ierr = VecRestoreArray(x, &v); CHKERRQ(ierr);

  GA_Pgroup_sync(pgroup);
  return ierr;
}

Пример #3

0

Показать файл

Файл: util_gnxtval.c Проект: jnafziger/pdft-nwchem

Integer util_gnxtval_(Integer *val) {

    if(*val > 0) {
       if(!initialized) ga_error("nxtval: not yet initialized", 0L);
       return (Integer) NGA_Read_inc(g_T, &subscript, 1);
    }
    else if(*val==0) {
       int n = 1;
       initialized=1;

       /* create task array */
       g_T = NGA_Create(C_LONG, 1, &n,"Atomic Task", NULL);
       
       /* Initialize the task array */
       if(GA_Nodeid()==0) {
	  int lo=0, hi=0;
	  NGA_Put (g_T, &lo, &hi, &initval, &hi);
	  initval=0;
       }
              GA_Sync();
       return 0;
    }
    else if (*val < 0) { GA_Destroy(g_T); initialized=0; initval=0; return 0;}
    
    ga_error("nxtval: invalid value passed", 0L);
    return -1;
}

Пример #4

0

Показать файл

Файл: lennard-jones_ga.c Проект: fuentesdt/tao-1.10.1-p3

int FormFunctionGradient (TAO_GA_APPLICATION gaapp, GAVec ga_X, double *f, GAVec ga_G, void *ptr)
{
  int lo, hi;			//the global coordinates
  AppCtx *user = (AppCtx *) ptr;
  int i,j;
  double *g, *x;
  double xx,yy,zz,temp,rij;
  

  MA_get_pointer(user->memHandle, &x);
  g = x + user->ndim*user->natoms;
    
  lo=0;
  hi=user->n-1; /* range of array indices */

  NGA_Get(ga_X, &lo, &hi, x, &hi);

  *f = 0;
  for (i=0; i < user->n; i++)
    g[i] = 0.0;

  if (user->ndim == 2) {
    for (j=1; j < user->natoms; j++) {
      for (i=0; i<j; i++) {
	xx = x[2*j] - x[2*i];
	yy = x[2*j+1] - x[2*i+1];
	rij = xx*xx + yy*yy;
	temp = 1.0/rij/rij/rij;
	*f += temp*(temp-2.0);
	temp *= 12.0*(temp-1.0)/rij;
	g[2*j] -= xx*temp;
	g[2*j+1] -= yy*temp;
	g[2*i] += xx*temp;
	g[2*i+1] += yy*temp;
      }
    }
  } else if (user->ndim == 3) {
    for (j=1; j < user->natoms; j++) {
      for (i=0; i < j; i++) {
	xx = x[3*j] - x[3*i];
	yy = x[3*j+1] - x[3*i+1];
	zz = x[3*j+2] - x[3*i+2];
	rij = xx*xx + yy*yy + zz*zz;
	temp = 1.0/rij/rij/rij;
	*f += temp*(temp-2.0);
	temp *= 12.0*(temp-1.0)/rij;
	g[3*j] -= xx*temp;
	g[3*j+1] -= yy*temp;
	g[3*j+2] -= zz*temp;
	g[3*i] += xx*temp;
	g[3*i+1] += yy*temp;
	g[3*i+2] += zz*temp;
      }
    }
  }
      
  NGA_Put(ga_G, &lo, &hi, g, &hi);
  return 0;
}

Пример #5

0

Показать файл

Файл: lennard-jones_ga.c Проект: fuentesdt/tao-1.10.1-p3

int InitializeVariables(GAVec ga_X, AppCtx *user) 
{
  double *x;
  double xx, yy, zz;
  int isqrtn, icrtn, left, i, j, k, il, jl, ctr;
  int lo, hi;


  lo = 0; hi = user->n - 1; /* range of array indices */
  MA_get_pointer(user->memHandle, &x);

  NGA_Get (ga_X, &lo, &hi, x, &hi);

  if (user->ndim == 2) {
    isqrtn = (int) sqrt( (double) user->natoms);
    left = user->natoms - isqrtn * isqrtn;
    xx = 0.0;
    yy = 0.0;
    for (j=0; j<=isqrtn + left/isqrtn; j++) {
      for (i=0; i < TaoMin(isqrtn, user->natoms - j*isqrtn); i++) {
	ctr = j*isqrtn + i;
	x[2*ctr] = xx;
	x[2*ctr+1] = yy;
	xx += 1.0;
      }
      yy += 1.0;
      xx = 0.0;
    }	     
  }
  else if (user->ndim == 3) {
    icrtn = (int) pow((user->natoms + 0.5),1.0/3.0);
    left = user->natoms - icrtn * icrtn * icrtn;
    xx = yy = zz = 0.0;
    for (k=0; k <= icrtn + left; k++) {
      jl = TaoMin(icrtn, (user->natoms - k*icrtn*icrtn)/icrtn+1);
      for (j=0; j<jl; j++) {
	il = TaoMin(icrtn, user->natoms - k*icrtn*icrtn - j*icrtn);
	for (i=0; i<il; i++) {
	  ctr = k*icrtn*icrtn + j*icrtn + i;
	  x[3*ctr] = xx;
	  x[3*ctr+1] = yy;
	  x[3*ctr+2] = zz;
	  xx += 1.0;
	}
	yy += 1.0;
	xx = 0.0;
      }
      zz += 1.0;
      yy = 0.0;
    }
  }

  NGA_Put(ga_X, &lo, &hi, x, &hi);
  return 0;
}

Пример #6

0

Показать файл

Файл: lennard-jones_ga2.c Проект: fuentesdt/tao-1.10.1-p3

int FormFunctionGradient (TAO_GA_APPLICATION gaapp, GAVec ga_X, double *f, GAVec ga_G, void *ptr)
{
  AppCtx *user = (AppCtx *) ptr;
  int lo, hi;		
  int taskId=user->me;          //Which task am I running
  int i;
  int zero = 0;

  /* reset atomicTask to nproc */
  if (user->me == 0) 
    NGA_Put(user->atomicTask, &zero, &zero, &user->nproc, &zero);


  for (i=0;i<user->natoms*user->ndim; i++)
    user->grad[i] = 0.0;
  *f = 0.0;

  while (taskId < user->nBlocks) {
    getBlock(ga_X, taskId, user);
    if (user->ndim == 2)
      LJFG_2D(taskId,f, user);
    else
      LJFG_3D(taskId,f, user);

    /* Get next block */
    taskId += user->nproc;
    //NGA_Read_inc(user->atomicTask, &zero, 1); 
  }


  /* gather function */
  GA_Dgop(f, 1, "+");

  /* gather gradient */
  GA_Dgop(user->grad, user->natoms*user->ndim, "+");
  NGA_Distribution(ga_G, user->me, &lo, &hi);
  NGA_Put(ga_G, &lo, &hi, user->grad+lo, &hi);

  GA_Sync();
  
  return 0;
}

Пример #7

0

Показать файл

Файл: lennard-jones_ga2.c Проект: fuentesdt/tao-1.10.1-p3

/**
 * Block Topology (of Force Matrix): 
 * Say for example: If there are 4 block and 100 atoms, the size of 
 * the force matrix is 100x100 and each block size is 50x50. 
 * 
 *  -----------
 * |     |     |
 * | 0,0 | 0,1 |
 *  -----------
 * | 1,0 | 1,1 |
 * |     |     |
 *  -----------
 */
int SetupBlocks(AppCtx *user)
{
  int i,j,k=0;
  int n;
  int zero = 0;
  int x_space, g_space;

  if (user->natoms % user->BlockSize) {
    GA_Error("Number of atoms should be a multiple of block size. Choose a different block size.", 0L);
  }

  n = user->natoms / user->BlockSize;
  user->nBlocks = n*n;

  if (user->nBlocks > MAX_BLOCKS) 
    GA_Error("Number of blocks is greater that MAX_BLOCKS: Solution is either to increase the defined MAX_BLOCKS or increase your block size",0L);

  if (user->nBlocks < user->nproc)
    GA_Error("Number of blocks should be greater than or equal to the number of processors",0L);

  
  for (i=0;i<n;i++)
    for (j=0;j<n;j++,k++) {
      user->btopo[k].x = i;
      user->btopo[k].y = j;
    }
  
  /* Create task array */
  n = 1;
  user->atomicTask = NGA_Create(C_INT, 1, &n, "Atomic Task", NULL);
  if (!user->atomicTask)
    GA_Error("NGA_Create failed for Atomic Task",0);

  if (user->me == 0) 
    NGA_Put(user->atomicTask, &zero, &zero, &user->nproc, &zero);
  
  
  /* space for x values from two processors */
  x_space = 2 * user->BlockSize * user->ndim;
  /* space for ALL gradient value */
  g_space = user->natoms * user->ndim; 
             

  if (MA_push_stack(C_DBL, x_space + g_space+3, "GA LJ bufs", &user->memHandle))
    MA_get_pointer(user->memHandle, &user->x1);
  else
    GA_Error("ma_alloc_get failed",x_space + g_space);
  
  user->x2  = user->x1 + x_space/2 + 1;
  user->grad = user->x2 + x_space/2 + 1;
  GA_Sync();
  return 0;
}

Пример #8

0

Показать файл

Файл: ga_get.c Проект: jeffhammond/ga

main(int argc, char **argv)
{
  int rank, nprocs, i, j;
  int g_A, **local_A=NULL, **local_B=NULL; 
  int dims[DIM]={SIZE,SIZE}, dims2[DIM], lo[DIM]={SIZE-SIZE,SIZE-SIZE}, hi[DIM]={SIZE-1,SIZE-1}, ld=5, value=5;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  MA_init(C_INT, 1000, 1000);
  GA_Initialize();

  local_A=(int**)malloc(SIZE*sizeof(int*));
  for(i=0; i<SIZE; i++)
    {
      local_A[i]=(int*)malloc(SIZE*sizeof(int));
      for(j=0; j<SIZE; j++) local_A[i][j]=rand()%10;
    }

  local_B=(int**)malloc(SIZE*sizeof(int*));
  for(i=0; i<SIZE; i++)
    {
      local_B[i]=(int*)malloc(SIZE*sizeof(int));
      for(j=0; j<SIZE; j++) local_B[i][j]=rand()%10;
    }

  g_A = NGA_Create(C_INT, DIM, dims, "array_A", NULL);
  GA_Zero(g_A);
  
  if(rank==0)
    {
      NGA_Put(g_A, lo, hi, local_A, &ld);
      NGA_Get(g_A, lo, hi, local_B, &ld);

      for(i=0; i<SIZE; i++)
	{
	  for(j=0; j<SIZE; j++)
	    if(local_A[i][j]!=local_B[i][j]) GA_ERROR_MSG();
	}
    }
  
  GA_Sync();
  GA_Destroy(g_A);
  
  if(rank == 0) GA_PRINT_MSG();

  GA_Terminate();
  MPI_Finalize();
}

Пример #9

0

Показать файл

Файл: tce_mrcc_sublock.c Проект: twang15/lotsofcoresbook1code

Integer util_tcesublock_(Integer *val,Integer *p_handle) {

//    if(*p_handle==0) exit(1);
//ga_error("nxtask: p_handle is zero", 1);

    if(*val > 0) {
//       if(!initialized) exit(1);
//ga_error("nxtask: not yet initialized", 1);
       return (Integer) NGA_Read_inc(g_T, &subscript, 1);
    }
    else if(*val==0) {
       int n = 1;
       initialized=1;
       int p_h = (int)*p_handle;

       /* create task array */
//       g_T = NGA_Create(C_LONG, 1, &n,"Atomic Task", NULL);

       g_T = NGA_Create_config(C_LONG,1,&n,"Atomic Task",NULL,p_h);

       /* Initialize the task array */
       if(GA_Pgroup_nodeid(p_h)==0) {
	  int lo=0, hi=0;
	  NGA_Put (g_T, &lo, &hi, &initval, &hi);
//          printf("PUT %i %i %i\n",sizeof(*p_handle),sizeof(Integer),sizeof(int));
	  initval=0;
       }

       GA_Pgroup_sync(p_h);
//       printf("CREATE %i %i \n",*p_handle,g_T);
       return 0;
    }
    else if (*val < 0) {
        GA_Destroy(g_T);
//        printf("DELETE %i %i \n",*p_handle,g_T);
//        ga_pgroup_sync_(p_handle);
        initialized=0; 
        initval=0; 
        return 0;
   }
    
//    ga_error("nxtval: invalid value passed", 0L);
    return -1;
}

Пример #10

0

Показать файл

Файл: ga_transpose3.c Проект: jeffhammond/ga

main(int argc, char **argv)
{
  int rank, nprocs, i, j;
  int g_A, g_B, **local_value=NULL;

  int dims[DIM]={SIZE,SIZE}, lo[DIM]={SIZE-SIZE,SIZE-SIZE}, hi[DIM]={SIZE-1,SIZE-1}, ld=SIZE;

  local_value=(int**)malloc(SIZE*sizeof(int*));
  for(i=0; i<SIZE; i++)
    local_value[i]=(int*)malloc(SIZE*sizeof(int));

  MPI_Init(&argc, &argv);

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  MA_init(C_INT, 1000, 1000);

  GA_Initialize();

  g_A = NGA_Create(C_INT, DIM, dims, "array_A", NULL);
  g_B = NGA_Create(C_INT, DIM, dims, "array_B", NULL);

  for(i=0; i<SIZE; i++)
    for(j=0; j<SIZE; j++)
      local_value[i][j]=rand()%10;
	
  if(rank==0) NGA_Put(g_A, lo, hi, local_value, &ld);
  GA_Transpose(g_A, g_B);
  if(rank==0) validate_transpose(g_A, g_B, lo, hi, ld);

  GA_Sync();
  if(rank == 1) GA_PRINT_MSG();
  
  GA_Terminate();
  MPI_Finalize();
}

Пример #11

0

Показать файл

Файл: gada.c Проект: adrielb/DCell

PetscErrorCode vizGA2DA()
{
  PetscErrorCode  ierr;
  int rank;
  MPI_Comm_rank(PETSC_COMM_WORLD,&rank);  
  int d1 = 40, d2 = 50;
  
  DA da;
  Vec vec;
  const PetscInt *lx, *ly, *lz;
  PetscInt m,n,p;
  DALocalInfo info;
  ierr = DACreate2d(PETSC_COMM_WORLD,DA_NONPERIODIC,DA_STENCIL_STAR,
            d1,d2,PETSC_DECIDE,PETSC_DECIDE,1,1,0,0, &da); CHKERRQ(ierr);
  ierr = DACreateGlobalVector(da, &vec); CHKERRQ(ierr);
  ierr = DAGetOwnershipRanges(da, &lx, &ly, &lz); CHKERRQ(ierr);
  ierr = DAGetLocalInfo(da,&info); CHKERRQ(ierr);
  ierr = DAGetInfo(da,0,0,0,0,&m,&n,&p,0,0,0,0); CHKERRQ(ierr);
  /**/
  ierr = DAView(da, PETSC_VIEWER_STDOUT_WORLD); CHKERRQ(ierr);
  for (int i = 0; i < m; ++i) {
    PetscPrintf(PETSC_COMM_WORLD,"%d\tlx: %d\n",i,lx[i]);
  }
  for (int i = 0; i < n; ++i) {
    PetscPrintf(PETSC_COMM_WORLD,"%d\tly: %d\n",i,ly[i]);
  }
  /**/
 
  
  int ga = GA_Create_handle();
  int ndim = 2;
  int dims[2] = {d2,d1};
  GA_Set_data(ga,2,dims,MT_DBL);
  int *map;
  PetscMalloc( sizeof(int)*(m+n), &map);
  map[0] = 0;
  for( int i = 1; i < n; i++ )
  {
    map[i] = ly[i-1] + map[i-1];
  }
  map[n] = 0;
  for( int i = n+1; i < m+n; i++ )
  {
    map[i] = lx[i-n-1] + map[i-1];
  }
  /* correct ordering, but nodeid's dont line up with mpi rank for petsc's da
   * DA: +---+---+   GA: +---+---+   
   *     +-2-+-3-+       +-1-+-3-+
   *     +---+---+       +---+---+
   *     +-0-+-1-+       +-0-+-2-+
   *     +---+---+       +---+---+
  int *map;
  PetscMalloc( sizeof(int)*(m+n), &map);
  map[0] = 0;
  for( int i = 1; i < m; i++ )
  {
    map[i] = lx[i] + map[i-1];
  }
  map[m] = 0;
  for( int i = m+1; i < m+n; i++ )
  {
    map[i] = ly[i-m] + map[i-1];
  }
  */
  int block[2] = {n,m};  
  GA_Set_irreg_distr(ga,map,block);
  ierr = GA_Allocate( ga );
  if( !ierr ) GA_Error("\n\n\nga allocaltion failed\n\n",ierr);
  if( !ga ) GA_Error("\n\n\n ga null \n\n",ierr); 
  if( rank != GA_Nodeid() ) GA_Error("MPI rank does not match GA_Nodeid()",1);
  GA_Print_distribution(ga);  
  
  int lo[2], hi[2];
  NGA_Distribution(ga,rank,lo,hi);
  if( lo[1] != info.xs || hi[1] != info.xs+info.xm-1 ||
      lo[0] != info.ys || hi[0] != info.ys+info.ym-1 )
  {
    PetscSynchronizedPrintf(PETSC_COMM_SELF,"[%d] lo:(%2d,%2d)  hi:(%2d,%2d) \t DA: (%2d,%2d), (%2d, %2d)\n",
        rank, lo[1], lo[0], hi[1], hi[0], info.xs, info.ys, info.xs+info.xm-1, info.ys+info.ym-1);
  }
  PetscBarrier(0);
  PetscSynchronizedFlush(PETSC_COMM_WORLD);

  AO ao;
  DAGetAO(da,&ao);
  if( rank == 0 )
  {
    int *idx, len = d1*d2;
    PetscReal *val;
    PetscMalloc(sizeof(PetscReal)*len, &val);
    PetscMalloc(sizeof(int)*len, &idx);
    for (int j = 0; j < d2; ++j)
    {
      for (int i = 0; i < d1; ++i)
      {
        idx[i + d1*j] = i + d1*j;
        val[i + d1*j] = i + d1*j;
      }
    }
    AOApplicationToPetsc(ao,len,idx);
    VecSetValues(vec,len,idx,val,INSERT_VALUES);

    int a[2], b[2],ld[1]={0};
    double c = 0;
    for (int j = 0; j < d2; ++j)
    {
      for (int i = 0; i < d1; ++i)
      {
        a[0] = j;
        a[1] = i;
//        printf("%5.0f ",c);
        NGA_Put(ga,a,a,&c,ld);
        c++;
      }
    }
  }
//  GA_Print(ga);
  VecAssemblyBegin(vec);
  VecAssemblyEnd(vec);
  
  int ld;
  double *ptr;
  NGA_Access(ga,lo,hi,&ptr,&ld);
  PetscReal **d;
  int c=0;
  ierr = DAVecGetArray(da,vec,&d); CHKERRQ(ierr);
  for (int j = info.ys; j < info.ys+info.ym; ++j)
  {
    for (int i = info.xs; i < info.xs+info.xm; ++i)
    {
      if( d[j][i] != ptr[(i-info.xs)+ld*(j-info.ys)] )
        GA_Error("DA array is not equal to GA array",1);
//      printf("%d (%d,%d):\t%3.0f\t%3.0f\n", c, i, j, d[j][i], ptr[(i-info.xs)+ld*(j-info.ys)]);
      c++;
    }
  }
  ierr = DAVecRestoreArray(da,vec,&d); CHKERRQ(ierr);
  
  c=0;
  PetscReal *v;
  int start, end;
  VecGetOwnershipRange(vec, &start, &end);
  VecGetArray( vec, &v );
  for( int i = start; i < end; i++)
  {
//    printf("%d:\t%3.0f\t%3.0f\t%s\n", start, v[i-start], ptr[i-start], (v[i-start]-ptr[i-start]==0?"":"NO") );
  }
  VecRestoreArray( vec, &v );
  
  NGA_Release_update(ga,lo,hi);

  Vec gada;
  VecCreateMPIWithArray(((PetscObject)da)->comm,da->Nlocal,PETSC_DETERMINE,ptr,&gada);
  VecView(gada,PETSC_VIEWER_STDOUT_SELF);
  
  GA_Destroy(ga);
  
  
  
  ierr = VecDestroy(vec); CHKERRQ(ierr);
  ierr = DADestroy(da); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

Пример #12

0

Показать файл

Файл: transpose.c Проект: jeffhammond/ga

/*
    create a random sparse matrix in compressed row form corresponding to a
    7-point stencil for a grid on a lattice of dimension idim X jdim X kdim grid
    points
*/
void create_laplace_mat(int idim, int jdim, int kdim, int pdi, int pdj, int pdk,
                        int *gp_block, int *g_j, int *g_i, int **imapc) {
/*
    idim: i-dimension of grid
    jdim: j-dimension of grid
    kdim: k-dimension of grid
    pdi: i-dimension of processor grid
    pdj: j-dimension of processor grid
    pdk: k-dimension of processor grid
!    g_data: global array of values
!    g_j: global array containing j indices (using local indices)
!    g_i: global array containing starting location of each row in g_j
!         (using local indices)
    gp_block: global pointer array containing non-zero sparse sub-blocks of
              matrix
    g_j: global array containing j indices of sub-blocks
    g_i: global array containing starting location of each row in g_j
    tsize: total number of non-zero elements in matrix
    imapc: map array for vectors
*/
  int ltotal_procs;
  int *lproclist, *lproc_inv,  *lvoffset, *lnsize, *loffset, *licnt, *limapc;
  int *nnz_list;
  int nnz, offset, b_nnz;
  int nprocs, me, imin, imax, jcnt;
  int *jmin, *jmax;
  int ix, iy, iz, idx;
  double x, dr;
  double *rval, *gp_rval;
  int isize, idbg;
  int *jval, *gp_jval, *ival, *gp_ival, *ivalt;
  int i, j, k, itmp, one, tlo, thi, ld;
  int idum, ntot, indx, nghbrs[7], ncnt, nsave;
  int ixn[7],iyn[7],izn[7], procid[7];
  int status;
  int lo[3], hi[3], ip, jp, kp, ldi, ldj, jdx, joff;
  int il, jl, kl, ldmi, ldpi, ldmj, ldpj;
  int *xld, *yld, *zld, *tmapc;
  int *ecnt, *total_distr;
  int total_max, toffset;
  int *iparams, *blk_ptr;
  int *iparamst, *jvalt;
  double *rvalt;
  FILE *fp, *fopen();

  me = NGA_Nodeid();
  nprocs = NGA_Nnodes();
  idum = -(12345+me);
  x = ran3(&idum);
  one = 1;

  if (me == 0) {
    printf("\n Dimension of grid: \n\n");
    printf(" I Dimension: %d\n",idim);
    printf(" J Dimension: %d\n",jdim);
    printf(" K Dimension: %d\n\n",kdim);
  }
/*
   Find position of processor in processor grid and calulate minimum
   and maximum values of indices
*/
  i = me;
  ip = i%pdi;
  i = (i-ip)/pdi;
  jp = i%pdj;
  kp = (i-jp)/pdj;
 
  lo[0] = (int)((((double)idim)*((double)ip))/((double)pdi));
  if (ip < pdi-1) {
    hi[0] = (int)((((double)idim)*((double)(ip+1)))/((double)pdi))-1;
  } else {
    hi[0] = idim - 1;
  } 

  lo[1] = (int)((((double)jdim)*((double)jp))/((double)pdj));
  if (jp < pdj-1) {
    hi[1] = (int)((((double)jdim)*((double)(jp+1)))/((double)pdj))-1;
  } else {
    hi[1] = jdim - 1;
  } 

  lo[2] = (int)((((double)kdim)*((double)kp))/((double)pdk));
  if (kp < pdk-1) {
    hi[2] = (int)((((double)kdim)*((double)(kp+1)))/((double)pdk))-1;
  } else {
    hi[2] = kdim - 1;
  } 
 
  ldi = hi[0]-lo[0]+1;
  ldj = hi[1]-lo[1]+1;
 
  /* Evaluate xld, yld, zld. These contain the number of elements in each
     division along the x, y, z axes */
  xld = (int*)malloc(pdi*sizeof(int));
  for (i=0; i<pdi; i++) {
    if (i<pdi-1) {
      xld[i] = (int)((((double)idim)*((double)(i+1)))/((double)pdi));
    } else {
      xld[i] = idim;
    }
    xld[i] = xld[i] - (int)((((double)idim)*((double)(i)))/((double)pdi));
  }

  yld = (int*)malloc(pdj*sizeof(int));
  for (i=0; i<pdj; i++) {
    if (i<pdj-1) {
      yld[i] = (int)((((double)jdim)*((double)(i+1)))/((double)pdj));
    } else {
      yld[i] = jdim;
    }
    yld[i] = yld[i] - (int)((((double)jdim)*((double)(i)))/((double)pdj));
  }

  zld = (int*)malloc(pdk*sizeof(int));
  for (i=0; i<pdk; i++) {
    if (i<pdk-1) {
      zld[i] = (int)((((double)kdim)*((double)(i+1)))/((double)pdk));
    } else {
      zld[i] = jdim;
    }
    zld[i] = zld[i] - (int)((((double)kdim)*((double)(i)))/((double)pdk));
  }

/* Determine number of rows per processor
   lnsize[i]: number of rows associated with process i
   loffset[i]: global offset to location of first row associated
               with process i */

  lnsize = (int*)malloc(nprocs*sizeof(int));
  loffset = (int*)malloc(nprocs*sizeof(int));
  for (i=0; i<nprocs; i++) {
    lnsize[i] = 0;
    loffset[i] = 0;
  }
  lnsize[me] = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
  NGA_Igop(lnsize,nprocs,"+");
  loffset[0] = 0;
  for (i=1; i<nprocs; i++) {
    loffset[i] = loffset[i-1] + lnsize[i-1];
  }
 
  ntot = idim*jdim*kdim;
  NGA_Sync();
/*
    scan over rows of lattice
    imin: minimum global index of rows associated with this process (me)
    imax: maximum global index of rows associated with this process (me)
*/
  imin = loffset[me];
  imax = loffset[me]+lnsize[me]-1;
  free(loffset);
/*
    find out how many other processors couple to this row of blocks
    ecnt[i]: the number of columns on processor i that are coupled to this
    process
*/
  ecnt = (int*)malloc(nprocs*sizeof(int));
  for (i=0; i<nprocs; i++) {
    ecnt[i] = 0;
  }

  for (i=imin; i<=imax; i++) {
/*
    compute local indices of grid point corresponding to row i
*/
    indx = i - imin;
    ix = indx%ldi;
    indx = (indx - ix)/ldi;
    iy = indx%ldj;
    iz = (indx - iy)/ldj;
    ix = ix + lo[0];
    iy = iy + lo[1];
    iz = iz + lo[2];
 
    ecnt[me] = ecnt[me] + 1;
    if (ix+1 <= idim-1) {
      if (ix+1 > hi[0]) {
        jdx = kp*pdi*pdj + jp*pdi + ip + 1;
        ecnt[jdx] = ecnt[jdx] + 1;
      } else {
        ecnt[me] = ecnt[me] + 1;
      }
    }
    if (ix-1 >= 0) {
      if (ix-1 < lo[0]) {
        jdx = kp*pdi*pdj + jp*pdi + ip - 1;
        ecnt[jdx] = ecnt[jdx] + 1;
      } else {
        ecnt[me] = ecnt[me] + 1;
      }
    }
    if (iy+1 <= jdim-1) {
      if (iy+1 > hi[1]) {
        jdx = kp*pdi*pdj + (jp+1)*pdi + ip;
        ecnt[jdx] = ecnt[jdx] + 1;
      } else {
        ecnt[me] = ecnt[me] + 1;
      }
    }
    if (iy-1 >= 0) {
      if (iy-1 < lo[1]) {
        jdx = kp*pdi*pdj + (jp-1)*pdi + ip;
        ecnt[jdx] = ecnt[jdx] + 1;
      } else {
        ecnt[me] = ecnt[me] + 1;
      }
    }
    if (iz+1 <= kdim-1) {
      if (iz+1 > hi[2]) {
        jdx = (kp+1)*pdi*pdj + jp*pdi + ip;
        ecnt[jdx] = ecnt[jdx] + 1;
      } else {
        ecnt[me] = ecnt[me] + 1;
      }
    }
    if (iz-1 >= 0) {
      if (iz-1 < lo[2]) {
        jdx = (kp-1)*pdi*pdj + jp*pdi + ip;
        ecnt[jdx] = ecnt[jdx] + 1;
      } else {
        ecnt[me] = ecnt[me] + 1;
      }
    }
  }

/* Create list of processors that this processor is coupled to.
   If ecnt[i] is greater than zero then process i is coupled to this process.
   ltotal_procs: the total number of other processor that this process is coupled
                 to. This includes this process (the diagonal term).
   lproclist[i]: the IDs of the processor that this processor is coupled to
   lproc_inv[i]: the location in lproclist of processor i. If processor i is not
                 coupled to this process, the lproc_inv[i] = -1
   ncnt: total number of non-zero elements held by this process
   nnz_list[i]: number of processes coupled to process i by sparse blocks
   nnz: total number of sparse blocks */

  ltotal_procs = 0;
  ncnt = 0;
  for (i=0; i<nprocs; i++) {
    if (ecnt[i] > 0) {
      ltotal_procs++;
      ncnt += ecnt[i];
    }
  }
  nsave = ncnt;

  lproclist = (int*)malloc(ltotal_procs*sizeof(int));
  lproc_inv = (int*)malloc(nprocs*sizeof(int));
  licnt = (int*)malloc(ltotal_procs*sizeof(int));
  for (i=0; i<ltotal_procs; i++) {
    licnt[i] = 0;
  }

  rval = (double*)malloc(ncnt*sizeof(double));
  idbg = ncnt;
  jval = (int*)malloc(ncnt*sizeof(int));
  ival = (int*)malloc((imax-imin+2)*ltotal_procs*sizeof(int));
  ivalt = (int*)malloc((imax-imin+2)*ltotal_procs*sizeof(int));

  for (i=0; i<ncnt; i++) {
    rval[i] = 0.0;
    jval[i] = 0;
  }

  j = (imax-imin+2)*ltotal_procs;
  for (i=0; i<j; i++) {
    ival[i] = 0;
    ivalt[i] = 0;
  }

  nnz_list = (int*)malloc(nprocs*sizeof(int));
  for (i=0; i<nprocs; i++) {
    nnz_list[i] = 0;
  }

  /* nnz is total number of non-zero sparse blocks */
  nnz_list[me] = ltotal_procs;
  NGA_Igop(nnz_list, nprocs, "+");
  nnz = 0;
  for (i=0; i<nprocs; i++) {
    nnz += nnz_list[i];
  }

/*  lvoffset[i]: local offset into array ival[i] to get to elements associated
    with block i (i runs from 0 to ltotal_procs-1)
    isize: number of rows (plus 1) that reside on this processor */
  isize = (imax-imin+2);
  for (i=0; i<nprocs; i++) {
    lproc_inv[i] = -1;
  }
  lvoffset = (int*)malloc(ltotal_procs*sizeof(int));
  lvoffset[0] = 0;
  j = 0;
  for (i=0; i<nprocs; i++) {
    if (ecnt[i] > 0) {
      lproclist[j] = i;
      if (j > 0) {
        lvoffset[j] = ecnt[lproclist[j-1]]+lvoffset[j-1];
      }
      lproc_inv[i] = j;
      j++;
    }
  }

/* Create arrays the hold the sparse block representation of the sparse matrix
   gp_block[nnz]: Global Pointer array holding the sparse sub-matrices
   g_j[nnz]: column block indices for the element in gp_block
   g_i[nprocs]: row block indices for the elements in g_j */

  tmapc = (int*)malloc((nprocs+1)*sizeof(int));
  tmapc[0] = 0;
  for (i=1; i<=nprocs; i++) {
    tmapc[i] = tmapc[i-1]+nnz_list[i-1];
  }
  *gp_block = GP_Create_handle();
  GP_Set_dimensions(*gp_block,one,&nnz);
  GP_Set_irreg_distr(*gp_block, tmapc, &nprocs);
  GP_Allocate(*gp_block);

  *g_j = NGA_Create_handle();
  NGA_Set_data(*g_j,one,&nnz,C_INT);
  NGA_Set_irreg_distr(*g_j, tmapc, &nprocs);
  NGA_Allocate(*g_j);

  for (i=0; i<nprocs; i++) {
    tmapc[i] = i;
  }
  *g_i = NGA_Create_handle();
  i = nprocs+1;
  NGA_Set_data(*g_i,one,&i,C_INT);
  NGA_Set_irreg_distr(*g_i, tmapc, &nprocs);
  NGA_Allocate(*g_i);
  free(tmapc);

  jmin = (int*)malloc(nprocs*sizeof(int));
  jmax = (int*)malloc(nprocs*sizeof(int));
  for (i=0; i<nprocs; i++) {
    jmin[i] = 0;
    jmax[i] = 0;
  }
  jmin[me] = imin;
  jmax[me] = imax;
  NGA_Igop(jmin, nprocs, "+");
  NGA_Igop(jmax, nprocs, "+");

/*
   Create the sparse blocks holding actual data. All the elements within each block
   couple this processor to one other processor
   rval[i]: values of matrix elements
   jval[i]: column indices of matrix elements
   ival[i]: index of first elements in rval and jval for the row represented by
            the index i.
   ivalt[i]: temporary array used in the construction of ival[i]
*/
  for (i=imin; i<=imax; i++) {
    /*
    compute local indices of grid point corresponding to row i
     */
    indx = i - imin;
    ix = indx%ldi;
    indx = (indx - ix)/ldi;
    iy = indx%ldj;
    iz = (indx - iy)/ldj;
    ix = ix + lo[0];
    iy = iy + lo[1];
    iz = iz + lo[2];
    /*
    find locations of neighbors in 7-point stencil (if they are on the grid)
     */
    ncnt = 0;
    ixn[ncnt] = ix;
    iyn[ncnt] = iy;
    izn[ncnt] = iz;
    il = ix - lo[0];
    jl = iy - lo[1];
    kl = iz - lo[2];
    idx = kl*ldi*ldj + jl*ldi + il;
    nghbrs[ncnt] = idx;
    procid[ncnt] = me;
    if (ix+1 <= idim - 1) {
      ncnt++;
      ixn[ncnt] = ix + 1;
      iyn[ncnt] = iy;
      izn[ncnt] = iz;
      if (ix+1 > hi[0]) {
        jdx = kp*pdi*pdj + jp*pdi + ip + 1;
        il = 0;
        jl = iy - lo[1];
        kl = iz - lo[2];
        ldpi = xld[ip+1];
      } else {
        jdx = me;
        il = ix - lo[0] + 1;
        jl = iy - lo[1];
        kl = iz - lo[2];
        ldpi = ldi;
      }
      idx = kl*ldpi*ldj + jl*ldpi + il;
      nghbrs[ncnt] = idx;
      procid[ncnt] = jdx;
    }
    if (ix-1 >= 0) {
      ncnt++;
      ixn[ncnt] = ix - 1;
      iyn[ncnt] = iy;
      izn[ncnt] = iz;
      if (ix-1 < lo[0]) {
        jdx = kp*pdi*pdj + jp*pdi + ip - 1;
        il = xld[ip-1] - 1;
        jl = iy - lo[1];
        kl = iz - lo[2];
        ldmi = xld[ip-1];
      } else {
        jdx = me;
        il = ix - lo[0] - 1;
        jl = iy - lo[1];
        kl = iz - lo[2];
        ldmi = ldi;
      }
      idx = kl*ldmi*ldj + jl*ldmi + il;
      nghbrs[ncnt] = idx;
      procid[ncnt] = jdx;
    }
    if (iy+1 <= jdim-1) {
      ncnt++;
      ixn[ncnt] = ix; 
      iyn[ncnt] = iy + 1;
      izn[ncnt] = iz;
      if (iy+1 > hi[1]) {
        jdx = kp*pdi*pdj + (jp+1)*pdi + ip;
        il = ix - lo[0];
        jl = 0;
        kl = iz - lo[2];
        ldpj = yld[jp+1];
      } else {
        jdx = me;
        il = ix - lo[0];
        jl = iy - lo[1] + 1;
        kl = iz - lo[2];
        ldpj = ldj;
      }
      idx = kl*ldi*ldpj + jl*ldi + il;
      nghbrs[ncnt] = idx;
      procid[ncnt] = jdx;
    }
    if (iy-1 >= 0) {
      ncnt++;
      ixn[ncnt] = ix;
      iyn[ncnt] = iy - 1;
      izn[ncnt] = iz;
      if (iy-1 < lo[1]) {
        jdx = kp*pdi*pdj + (jp-1)*pdi + ip;
        il = ix - lo[0];
        jl = yld[jp-1] - 1;
        kl = iz - lo[2];
        ldmj = yld[jp-1];
      } else {
        jdx = me;
        il = ix - lo[0];
        jl = iy - lo[1] - 1;
        kl = iz - lo[2];
        ldmj = ldj;
      }
      idx = kl*ldi*ldmj + jl*ldi + il;
      nghbrs[ncnt] = idx;
      procid[ncnt] = jdx;
    }
    if (iz+1 <= kdim-1) {
      ncnt++;
      ixn[ncnt] = ix;
      iyn[ncnt] = iy;
      izn[ncnt] = iz + 1;
      if (iz+1 > hi[2]) {
        jdx = (kp+1)*pdi*pdj + jp*pdi + ip;
        il = ix - lo[0];
        jl = iy - lo[1];
        kl = 0;
      } else {
        jdx = me;
        il = ix - lo[0];
        jl = iy - lo[1];
        kl = iz - lo[2] + 1;
      }
      idx = kl*ldi*ldj + jl*ldi + il;
      nghbrs[ncnt] = idx;
      procid[ncnt] = jdx;
    }
    if (iz-1 >= 0) {
      ncnt++;
      ixn[ncnt] = ix;
      iyn[ncnt] = iy;
      izn[ncnt] = iz - 1;
      if (iz-1 < lo[2]) {
        jdx = (kp-1)*pdi*pdj + jp*pdi + ip;
        il = ix - lo[0];
        jl = iy - lo[1];
        kl = zld[kp-1] - 1;
      } else {
        jdx = me;
        il = ix - lo[0];
        jl = iy - lo[1];
        kl = iz - lo[2] - 1;
      }
      idx = kl*ldi*ldj + jl*ldi + il;
      nghbrs[ncnt] = idx;
      procid[ncnt] = jdx;
    }
    /*
    sort indices so that neighbors run from lowest to highest local index. This sort
    is not particularly efficient but ncnt is generally small
     */
    ncnt++;
    for (j=0; j<ncnt; j++) {
      for (k=j+1; k<ncnt; k++) {
        if (nghbrs[j] > nghbrs[k]) {
          itmp = nghbrs[j];
          nghbrs[j] = nghbrs[k];
          nghbrs[k] = itmp;
          itmp = ixn[j];
          ixn[j] = ixn[k];
          ixn[k] = itmp;
          itmp = iyn[j];
          iyn[j] = iyn[k];
          iyn[k] = itmp;
          itmp = izn[j];
          izn[j] = izn[k];
          izn[k] = itmp;
          itmp = procid[j];
          procid[j] = procid[k];
          procid[k] = itmp;
        }
      }
    }
    for (k=0; k<ncnt; k++) {
      if (nghbrs[k] < 0 || nghbrs[k] >= ntot) {
        printf("p[%d] Invalid neighbor %d\n",me,nghbrs[k]);
      }
    }

/* set weights corresponding to a finite difference Laplacian on a 7-point
   stencil */

    for (j=0; j<ncnt; j++) {
      jdx = procid[j];
      idx = lproc_inv[jdx];
      if (ix == ixn[j] && iy == iyn[j] && iz == izn[j]) {
        rval[lvoffset[idx]+licnt[idx]] = 6.0;
      } else {
        rval[lvoffset[idx]+licnt[idx]] = -1.0;
      }
      if (lvoffset[idx]+licnt[idx] < 0 || lvoffset[idx]+licnt[idx] >= nsave) {
        printf("p[%d] Out of bounds (lvoffset+licnt)[%d]: %d\n",me,idx,lvoffset[idx]+licnt[idx]);
      }
      if (lvoffset[idx]+licnt[idx]>=idbg) {
      }
      /* TODO: Check this carefully */
      jval[lvoffset[idx]+licnt[idx]] = nghbrs[j];
      ivalt[idx*isize+i-imin] = ivalt[idx*isize+i-imin]+1;
      licnt[idx]++;
    }
  }

/* finish evaluating ival array */

  for (i=0; i<ltotal_procs; i++) {
    ival[i*isize] = lvoffset[i];
    for (j=1; j<isize; j++) {
      ival[i*isize+j] = ival[i*isize+j-1] + ivalt[i*isize+j-1];
    }
  }
  isize = 0;
  for (i=0; i<ltotal_procs; i++) {
    isize = isize + licnt[i];
  }
  if (isize > MAXVEC)
    NGA_Error("ISIZE exceeds MAXVEC in local arrays ",isize);

/* Local portion of sparse matrix has been evaluated and decomposed into blocks
   that match partitioning of right hand side across processors. The following
   data is available at this point:
      1) ltotal_procs: the number of processors that are coupled to this one via
         the sparse matrix
      2) lproclist(ltotal_procs): a list of processor IDs that are coupled to
         this processor
      3) lproc_inv(nprocs): The entry in proc_list that corresponds to a given
         processor. If the entry is -1 then that processor does not couple to
         this processor.
      4) licnt(ltotal_procs): The number of non-zero entries in the sparse matrix
         that couple the process represented by proc_list(j) to this process
      5) lvoffset(ltotal_procs): The offsets for the non-zero data in the arrays
         rval and jval for the blocks that couple this processor to other
         processes in proc_list
      6) offset(nprocs): the offset array for the distributed right hand side
         vector
    These arrays describe how the sparse matrix is layed out both locally and
    across processors. In addition, the actual data for the distributed sparse
    matrix is found in the following arrays:
      1) rval: values of matrix for all blocks on this processor
      2) jval: j-indices of matrix for all blocks on this processor
      3) ival(ltotal_procs*(lnsize(me)+1)): starting index in rval and
         jval for each row in each block */
 
  NGA_Sync();

/* Create a sparse array of sparse blocks.
   Each block element is divided into for sections.
   The first section consists of 7 ints and contains the parameters
     imin: minimum i index represented by block
     imin: maximum i index represented by block
     jmin: minimum j index represented by block
     jmin: maximum j index represented by block
     iblock: row index of block
     jblock: column index of block
     nnz: number of non-zero elements in block
   The next section consists of nnz doubles that represent the non-zero values
   in the block. The third section consists of nnz ints and contains the local
   j indices of all values. The final section consists of (imax-imin+2) ints
   and contains the starting index in jval and rval for the each row between
   imin and imax. An extra value is included at the end and is set equal to
   nnz+1. This is included to simplify some coding.
 */

  offset = 0;
  for (i=0; i<me; i++) {
    offset += nnz_list[i];
  }
  NGA_Put(*g_i, &me, &me, &offset, &one);
  if (me==nprocs-1) {
    NGA_Put(*g_i, &nprocs, &nprocs, &nnz, &one);
  }
  NGA_Sync();
  for (i = 0; i<ltotal_procs; i++) {
    /* evaluate total size of block */
    b_nnz = ecnt[lproclist[i]];
    isize = 7*sizeof(int) + b_nnz*(sizeof(double)+sizeof(int))
          + (imax-imin+2)*sizeof(int);
    blk_ptr = (int*)GP_Malloc(isize);

    iparams = blk_ptr;
    gp_rval = (double*)(iparams+7);
    gp_jval = (int*)(gp_rval+b_nnz);
    gp_ival = (gp_jval+b_nnz);

    iparams[0] = imin;
    iparams[1] = imax;
    iparams[2] = jmin[lproclist[i]];
    iparams[3] = jmax[lproclist[i]];
    iparams[4] = me;
    iparams[5] = lproclist[i];
    iparams[6] = b_nnz;

    ldj = (imax-imin+2);
    k = 0;
    toffset = lvoffset[i];
    for (j=0; j<b_nnz; j++) {
      gp_jval[j] = jval[toffset+j];
      gp_rval[j] = rval[toffset+j];
    }

    toffset = ival[i*ldj];
    for (k=0; k<ldj; k++) {
      gp_ival[k] = ival[i*ldj+k]-toffset;
    }

    /* Assign blk_ptr to GP array element */
    GP_Assign_local_element(*gp_block, &offset, (void*)blk_ptr, isize);
    j = 1;
    NGA_Put(*g_j,&offset,&offset,&lproclist[i],&j);
    offset++;
  }
  NGA_Sync();

  tmapc = (int*)malloc(nprocs*sizeof(int));
  tmapc[0] = 0;
  for (i=1; i<nprocs; i++) {
    tmapc[i] = tmapc[i-1] + lnsize[i-1];
  }
    i = nprocs-1;
  *imapc = tmapc;

  free(rval);
  free(jval);
  free(ival);
  free(ivalt);
  free(xld);
  free(yld);
  free(zld);
  free(lnsize);
  free(lvoffset);
  free(ecnt);
  free(licnt);
  free(lproclist);
  free(lproc_inv);
  free(jmin);
  free(jmax);
  free(nnz_list);
  return;
}

Пример #13

0

Показать файл

Файл: screening.c Проект: SahanGH/psi4public

int schwartz_screening(PFock_t pfock, BasisSet_t basis)
{
    int myrank;
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

    // create shell pairs values
    //ERD_t erd;
    int nthreads = omp_get_max_threads();
    //CInt_createERD(basis, &erd, nthreads);
    int nshells = pfock->nshells;

    // create global arrays for screening
    int nprow = pfock->nprow;
    int npcol = pfock->npcol;
    int dims[2];
    int block[2];
    int map[nprow + npcol];
    for (int i = 0; i < nprow; i++) {
        map[i] = pfock->rowptr_sh[i];
    }
    for (int i = 0; i < npcol; i++) {
        map[i + nprow] = pfock->colptr_sh[i];
    }
    dims[0] = nshells;
    dims[1] = nshells;
    block[0] = nprow;
    block[1] = npcol;
    pfock->ga_screening =
        NGA_Create_irreg(C_DBL, 2, dims, "array Screening", block, map);
    if (0 == pfock->ga_screening) {
        return -1;
    }

    // compute the max shell value
    double *sq_values = (double *)PFOCK_MALLOC(sizeof(double) *
        pfock->nshells_row * pfock->nshells_col);
    if (NULL == sq_values) {
        return -1;
    }
    int startM = pfock->sshell_row;
    int startN = pfock->sshell_col;
    int endM = pfock->eshell_row;
    int endN = pfock->eshell_col;
    double maxtmp = 0.0;
    #pragma omp parallel
    {
        int tid = omp_get_thread_num();
        #pragma omp for reduction(max:maxtmp)
        for (int M = startM; M <= endM; M++) {
            int dimM = CInt_getShellDim(basis, M);
            for (int N = startN; N <= endN; N++) {
                int dimN = CInt_getShellDim(basis, N);
                double *integrals;
                int nints=
                ComputeShellQuartet(basis,tid,M,N,M,N,&integrals);
                //CInt_computeShellQuartet(basis, erd, tid, M, N, M, N,
                //                         &integrals, &nints);
                double maxvalue = 0.0;
                if (nints != 0) {
                    for (int iM = 0; iM < dimM; iM++) {
                        for (int iN = 0; iN < dimN; iN++) {
                            int index =
                                iM * (dimN*dimM*dimN+dimN) + iN * (dimM*dimN+1);
                            if (maxvalue < fabs(integrals[index])) {
                                maxvalue = fabs(integrals[index]);
                            }
                        }
                    }
                }
                sq_values[(M - startM) * (endN - startN + 1)  + (N - startN)]
                    = maxvalue;
                if (maxvalue > maxtmp) {
                    maxtmp = maxvalue;
                }
            }
        }
    }
    int lo[2];
    int hi[2];
    lo[0] = startM;
    hi[0] = endM;
    lo[1] = startN;
    hi[1] = endN;
    int ld = endN - startN + 1;
    NGA_Put(pfock->ga_screening, lo, hi, sq_values, &ld);
    // max value
    MPI_Allreduce(&maxtmp, &(pfock->maxvalue), 1,
                  MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
    //CInt_destroyERD(erd);
    PFOCK_FREE(sq_values);

    // init shellptr
    sq_values = (double *)PFOCK_MALLOC(sizeof(double) * nshells);
    if (NULL == sq_values) {
        return -1;
    }
    int nnz = 0;
    double eta = pfock->tolscr2 / pfock->maxvalue;
    pfock->shellptr = (int *)PFOCK_MALLOC(sizeof(int) * (nshells + 1));
    pfock->mem_cpu += 1.0 * sizeof(int) * (nshells + 1);
    if (NULL == pfock->shellptr) {
        return -1;
    }
    memset(pfock->shellptr, 0, sizeof(int) * (nshells + 1));
    for (int M = 0; M < nshells; M++) {
        pfock->shellptr[M] = nnz;
        lo[0] = M;
        hi[0] = M;
        lo[1] = 0;
        hi[1] = nshells - 1;
        ld = nshells;
        NGA_Get(pfock->ga_screening, lo, hi, sq_values, &ld);
        for (int N = 0; N < nshells; N++) {
            double maxvalue = sq_values[N];
            if (maxvalue > eta) {
                if (M > N && (M + N) % 2 == 1 || M < N && (M + N) % 2 == 0) {
                    continue;
                } else {
                    nnz++;
                }
            }
        }
        pfock->shellptr[M + 1] = nnz;
    }
    pfock->nnz = nnz;

    double maxvalue;
    pfock->shellvalue  = (double *)PFOCK_MALLOC(sizeof(double) * nnz);
    pfock->shellid  = (int *)PFOCK_MALLOC(sizeof(int) * nnz);
    pfock->shellrid  = (int *)PFOCK_MALLOC(sizeof(int) * nnz);
    pfock->mem_cpu += 1.0 * sizeof(double) * nnz + 2.0 * sizeof(int) * nnz;
    nshells = pfock->nshells;
    if (pfock->shellvalue == NULL ||
        pfock->shellid == NULL ||
        pfock->shellrid == NULL) {
        return -1;
    }
    nnz = 0;
    for (int A = 0; A < nshells; A++) {
        pfock->shellptr[A] = nnz;
        lo[0] = A;
        hi[0] = A;
        lo[1] = 0;
        hi[1] = nshells - 1;
        ld = nshells;
        NGA_Get(pfock->ga_screening, lo, hi, sq_values, &ld);
        for (int B = 0; B < nshells; B++) {
            maxvalue = sq_values[B];
            if (maxvalue > eta) {
                if (A > B && (A + B) % 2 == 1 || A < B && (A + B) % 2 == 0)
                    continue;
                if (A == B) {
                    pfock->shellvalue[nnz] = maxvalue;
                } else {
                    pfock->shellvalue[nnz] = -maxvalue;
                }
                pfock->shellid[nnz] = B;
                pfock->shellrid[nnz] = A;
                nnz++;
            }
        }
    }
    PFOCK_FREE(sq_values);
    GA_Destroy(pfock->ga_screening);

    return 0;
}

Пример #14

0

Показать файл

Файл: ga-mpi.c Проект: jeffhammond/ga

void do_work()
{
int ZERO=0;   /* useful constants */
int g_a, g_b;
int n=N, ndim=2,type=MT_F_DBL,dims[2]={N,N},coord[2];
int me=GA_Nodeid(), nproc=GA_Nnodes();
int row, i, j;
 int lo[2], hi[2];

/* Note: on all current platforms DoublePrecision = double */
DoublePrecision buf[N], *max_row=NULL;

MPI_Comm WORLD_COMM;
MPI_Comm ROW_COMM;
int ilo,ihi, jlo,jhi, ld, prow, pcol;
int root=0, grp_me=-1;

     WORLD_COMM = GA_MPI_Comm_pgroup_default();

     if(me==0)printf("Creating matrix A\n");
     dims[0]=n; dims[1]=n;
     g_a = NGA_Create(type, ndim, dims, "A", NULL);
     if(!g_a) GA_Error("create failed: A",n); 
     if(me==0)printf("OK\n");
     
     if(me==0)printf("Creating matrix B\n");
     dims[0]=n;
     g_b = NGA_Create(type, 1, dims, "B", NULL);
     if(!g_b) GA_Error("create failed: B",n); 
     if(me==0)printf("OK\n");
     
     GA_Zero(g_a);   /* zero the matrix */
     
     if(me==0)printf("Initializing matrix A\n");
     /* fill in matrix A with values: A(i,j) = (i+j) */ 
     for(row=me; row<n; row+= nproc){
    /**
     * simple load balancing: 
     * each process works on a different row in MIMD style 
     */ 
    for(i=0; i<n; i++) buf[i]=(DoublePrecision)(i+row+1); 
    lo[0]=hi[0]=row;
    lo[1]=ZERO;  hi[1]=n-1; 
    NGA_Put(g_a, lo, hi, buf, &n); 
     }
     
     /* GA_print(&g_a);*/
     NGA_Distribution(g_a, me, lo, hi);
     ilo=lo[0]; ihi=hi[0];
     jlo=lo[1]; jhi=hi[1];
     
     GA_Sync(); 
     if(ihi-ilo+1 >0){
        max_row=(DoublePrecision*)malloc(sizeof(DoublePrecision)*(ihi-ilo+1));
        if (!max_row) GA_Error("malloc 3 failed",(ihi-ilo+1));
        for (i=0; i<(ihi-ilo+1); i++) {
            max_row[i] = 0.0;
        }
     }
     NGA_Proc_topology(g_a, me, coord);  /* block coordinates */
     prow = coord[0];
     pcol = coord[1];

     if(me==0)printf("Splitting comm according to distribution of A\n");
     
     /* GA on SP1 requires synchronization before & after message-passing !!*/
     GA_Sync(); 
     
     if(me==0)printf("Computing max row elements\n");
     /* create communicator for processes that 'own' A[:,jlo:jhi] */
     MPI_Barrier(WORLD_COMM);
     if(pcol < 0 || prow <0)
    MPI_Comm_split(WORLD_COMM,MPI_UNDEFINED,MPI_UNDEFINED, &ROW_COMM);
     else
    MPI_Comm_split(WORLD_COMM, (int)pcol, (int)prow, &ROW_COMM);
     
     if(ROW_COMM != MPI_COMM_NULL){
    double *ptr;
    MPI_Comm_rank(ROW_COMM, &grp_me);
    
    /* each process computes max elements in the block it 'owns' */
    lo[0]=ilo; hi[0]=ihi;
    lo[1]=jlo; hi[1]=jhi;
    NGA_Access(g_a, lo, hi, &ptr, &ld);
    for(i=0; i<ihi-ilo+1; i++){
       for(j=0; j<jhi-jlo+1; j++)
          if(max_row[i] < ptr[i*ld + j]){
         max_row[i] = ptr[i*ld + j];
          }
    }
    MPI_Reduce(max_row, buf, ihi-ilo+1, MPI_DOUBLE, MPI_MAX,
           root, ROW_COMM);
    
     }else fprintf(stderr,"process %d not participating\n",me);
     GA_Sync(); 
     
     /* processes with rank=root in ROW_COMM put results into g_b */
     ld = 1;
     if(grp_me == root) {
    lo[0]=ilo;  hi[0]=ihi;
    NGA_Put(g_b, lo, hi, buf, &ld); 
     }
        
     GA_Sync();

     if(me==0)printf("Checking the result\n");
     if(me==0){
    lo[0]=ZERO; hi[0]=n-1;
        NGA_Get(g_b, lo, hi, buf, &n); 
        for(i=0; i< n; i++)if(buf[i] != (double)n+i){
            fprintf(stderr,"error:%d max=%f should be:%d\n",i,buf[i],n+i);
            GA_Error("terminating...",1);
        }
     }
     
     if(me==0)printf("OK\n");

     GA_Destroy(g_a);
     GA_Destroy(g_b);
}

Пример #15

0

Показать файл

Файл: check_sum_ghost.c Проект: niuqingpeng/microbenchmark

int main(int argc, char**argv)
{
  int nprocs, me;
  int i,j;
  MPI_Init(&argc,&argv);
  MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
  MPI_Comm_rank(MPI_COMM_WORLD,&me);
  GA_Initialize();
  const int heap=3000000, stack=300000;
  if(! MA_init(C_INT,stack,heap) ) GA_Error((char *) "MA_init failed",stack+heap /*error code*/);
  int Nx=97; int Ny=97; int Nz = 97;
  Nx+=3; Ny+=3; Nz+=3;
  int data[Nx*Ny*Nz];
  int num_splines = 32;
  int g_a,dims[4]={Nx,Ny,Nz,num_splines},chunk[4]={-1,-1,-1,num_splines};
  int width[4] = {3, 3, 3, 0};
  int type=C_INT;
  //g_a=NGA_Create(type,4,dims,"Coefs",chunk);
  g_a=NGA_Create_ghosts(type, 4, dims, width, "Coefs", chunk);
  int lo[4],hi[4],ld[3];
  //double value=9.0; GA_Fill(g_a,&value);
  GA_Print_distribution(g_a);
  fflush(stdout);
  if(me==0)
  {
      for (i=0; i<num_splines; i++)
      {
          int x, y, z;
          for (x=0; x<Nx; x++)
              for (y=0; y<Ny; y++)
                  for (z=0; z<Nz; z++)
                  {   j=x*(Ny*Nz)+y*Nz+z;
              data[j] = (x*100*100+y*100+z)*100+i;}
          lo[0]=lo[1]=lo[2]=0;
          hi[0]=Nx-1;hi[1]=Ny-1;hi[2]=Nz-1;
          lo[3]=hi[3]=i%num_splines;
          ld[0]=Ny;ld[1]=Nz;ld[2]=1;
          NGA_Put(g_a,lo,hi,data,ld);
      }
  }
  GA_Update_ghosts(g_a);
  GA_Sync();
  printf("done\n"),fflush(stdout);
  ga_coefs_t *ga_coefs = malloc(sizeof(ga_coefs_t));
  ga_coefs->Mx = Nx;
  ga_coefs->My = Ny;
  ga_coefs->Mz = Nz;
  ga_coefs->nsplines = num_splines;
  ga_coefs->g_a=g_a;
  int *coefs1 = (int*)malloc((size_t)1*sizeof(int)*4*4*4*num_splines);
  int ix,iy,iz;
  Nx-=3; Ny-=3; Nz-=3;
  ga_coefs->sumt=ga_coefs->amount=0;
  NGA_Distribution(g_a,me,lo,hi);
  GA_Print_distribution(g_a);
  int low[16][4],high[16][4];
  for(i=0;i<nprocs;i++)
      NGA_Distribution(g_a,i,low[i],high[i]);
  srand ( time(NULL) );
  int k=GA_Nodeid();
  printf("%d: low[k]=%d high[k]=%d\n", GA_Nodeid(), low[k][2], high[k][2]);
  int unequal=0;
  for(i=0;i<1000;i++) {
      ix=rand_index(low[k][0],high[k][0]);
      if(ix+3>=dims[0]) ix=low[k][0];
      iy=rand_index(low[k][1],high[k][1]);
      if(iy+3>=dims[1]) iy=low[k][1];
      iz=rand_index(low[k][2],high[k][2]);
      if(iz+3>=dims[2]) iz=low[k][2];
      coefs_ga_get_3d(ga_coefs,coefs1,ix,iy,iz);
      long get_sum=mini_cube_sum(coefs1, ga_coefs->nsplines);
      long ghost_sum=coefs_ghost_access_3d(ga_coefs->g_a, ix, iy, iz, ga_coefs->nsplines);
      if(get_sum!=ghost_sum) {
      printf("ixyz=\t%d\t%d\t%d\t", ix, iy, iz);
          printf("get_sum=%ld ghost_sum=%ld\n", get_sum, ghost_sum);
          unequal++;
      }
  }
  printf("unequal count=%d\n", unequal);
  free(coefs1);
  GA_Terminate();
  MPI_Finalize();
  return 0;
}

Пример #16

0

Показать файл

Файл: spiral-mm.c Проект: pghosh2/GA-Matrix-Multiplication-Benchmark

/* Square matrix-matrix multiplication */
void matrix_multiply(int M, int N, int K, 
		int blockX_len, int blockY_len) 
{
	/* Local buffers and Global arrays declaration */
	double *a=NULL, *b=NULL, *c=NULL;

	int dims[NDIMS], ld[NDIMS], chunks[NDIMS];
	int lo[NDIMS], hi[NDIMS], cdims[NDIMS]; /* dim of blocks */

	int g_a, g_b, g_c, g_cnt, g_cnt2;
	int offset;
	double alpha = 1.0, beta=0.0;
	int count_p = 0, next_p = 0;
	int count_gac = 0, next_gac = 0;
	double t1,t2,seconds;
        ga_nbhdl_t nbh;
        int count_acc = 0;

	/* Find local processor ID and the number of processes */
	int proc=GA_Nodeid(), nprocs=GA_Nnodes();

	if ((M % blockX_len) != 0 || (M % blockY_len) != 0 || (N % blockX_len) != 0 || (N % blockY_len) != 0 
			|| (K % blockX_len) != 0 || (K % blockY_len) != 0)
		GA_Error("Dimension size M/N/K is not divisible by X/Y block sizes", 101);

	/* Allocate/Set process local buffers */
	a = malloc (blockX_len * blockY_len * sizeof(double)); 
	b = malloc (blockX_len * blockY_len * sizeof(double)); 
	c = malloc (blockX_len * blockY_len * sizeof(double));

	cdims[0] = blockX_len;
	cdims[1] = blockY_len;	

	/* Configure array dimensions */
	for(int i = 0; i < NDIMS; i++) {
		dims[i]  = N;
		chunks[i] = -1;
		ld[i]    = cdims[i]; /* leading dimension/stride of the local buffer */
	}

	/* create a global array g_a and duplicate it to get g_b and g_c*/
	g_a = NGA_Create(C_DBL, NDIMS, dims, "array A", chunks);

	if (!g_a) 
		GA_Error("NGA_Create failed: A", NDIMS);

#if DEBUG>1
	if (proc == 0) 
		printf("  Created Array A\n");
#endif
	/* Ditto for C and B */
	g_b = GA_Duplicate(g_a, "array B");
	g_c = GA_Duplicate(g_a, "array C");

	if (!g_b || !g_c) 
		GA_Error("GA_Duplicate failed",NDIMS);
	if (proc == 0) 
		printf("Created Arrays B and C\n");

	/* Subscript array for read-incr, which is nothing but proc */
	int * rdcnt = malloc (nprocs * sizeof(int));
	memset (rdcnt, 0, nprocs * sizeof(int));
	int * rdcnt2 = malloc (nprocs * sizeof(int));
	memset (rdcnt2, 0, nprocs * sizeof(int));

	/* Create global array of nprocs elements for nxtval */	
	int counter_dim[1];
	counter_dim[0] = nprocs;

	g_cnt = NGA_Create(C_INT, 1, counter_dim, "Shared counter", NULL);

	if (!g_cnt) 
		GA_Error("Shared counter failed",1);

	g_cnt2 = GA_Duplicate(g_cnt, "another shared counter");

	if (!g_cnt2) 
		GA_Error("Another shared counter failed",1);

	GA_Zero(g_cnt);
	GA_Zero(g_cnt2);

#if DEBUG>1	
	/* initialize data in matrices a and b */
	if(proc == 0)
		printf("Initializing local buffers - a and b\n");
#endif
	int w = 0; 
	int l = 7;
	for(int i = 0; i < cdims[0]; i++) {
		for(int j = 0; j < cdims[1]; j++) {
			a[i*cdims[1] + j] = (double)(++w%29);
			b[i*cdims[1] + j] = (double)(++l%37);
		}
	}

	/* Copy data to global arrays g_a and g_b from local buffers */
	next_p = NGA_Read_inc(g_cnt2,&rdcnt[proc],(long)1);
	for (int i = 0; i < N; i+=cdims[0]) 
	{
		if (next_p == count_p) {
			for (int j = 0; j < N; j+=cdims[1])
			{
				/* Indices of patch */
				lo[0] = i;
				lo[1] = j;
				hi[0] = lo[0] + cdims[0];
				hi[1] = lo[1] + cdims[1];

				hi[0] = hi[0]-1;
				hi[1] = hi[1]-1;
#if DEBUG>1
				printf ("%d: PUT_GA_A_B: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]);
#endif
				NGA_Put(g_a, lo, hi, a, ld);
				NGA_Put(g_b, lo, hi, b, ld);

			}
			next_p = NGA_Read_inc(g_cnt2,&rdcnt[proc],(long)1);
		}		
		count_p++;
	}


#if DEBUG>1
	printf ("After NGA_PUT to global - A and B arrays\n");
#endif
	/* Synchronize all processors to make sure puts from 
	   nprocs has finished before proceeding with dgemm */
	GA_Sync();

	t1 = GA_Wtime();

	next_gac = NGA_Read_inc(g_cnt,&rdcnt2[proc],(long)1);
	for (int m = 0; m < N; m+=cdims[0])
	{
		for (int k = 0; k < N; k+=cdims[0])
		{
			if (next_gac == count_gac)	
			{
				/* A = m x k */
				lo[0] = m; lo[1] = k;
				hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1];

				hi[0] = hi[0]-1; hi[1] = hi[1]-1;
#if DEBUG>3
				printf ("%d: GET GA_A: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]);
#endif
				NGA_Get(g_a, lo, hi, a, ld);

				for (int n = 0; n < N; n+=cdims[1])
				{
					memset (c, 0, sizeof(double) * cdims[0] * cdims[1]);
					/* B = k x n */
					lo[0] = k; lo[1] = n;
					hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1];				

					hi[0] = hi[0]-1; hi[1] = hi[1]-1;
#if DEBUG>3
					printf ("%d: GET_GA_B: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]);
#endif
					NGA_Get(g_b, lo, hi, b, ld);


					//_my_dgemm_ (a, local_N, b, local_N, c, local_N, local_N, local_N, local_N, alpha, beta=1.0);

					/* TODO I am assuming square matrix blocks, further testing/work 
					   required for rectangular matrices */
					cblas_dgemm ( CblasRowMajor, CblasNoTrans, /* TransA */CblasNoTrans, /* TransB */
							cdims[0] /* M */, cdims[1] /* N */, cdims[0] /* K */, alpha,
							a, cdims[0], /* lda */ b, cdims[1], /* ldb */
							beta=1.0, c, cdims[0] /* ldc */);

					NGA_NbWait(&nbh);

					/* C = m x n */
					lo[0] = m; lo[1] = n;
					hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1];				

					hi[0] = hi[0]-1; hi[1] = hi[1]-1;
#if DEBUG>3
					printf ("%d: ACC_GA_C: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]);
#endif
					NGA_NbAcc(g_c, lo, hi, c, ld, &alpha, &nbh);
					count_acc += 1;
				} /* END LOOP N */
				next_gac = NGA_Read_inc(g_cnt,&rdcnt2[proc],(long)1);
			} /* ENDIF if count == next */
			count_gac++;
		} /* END LOOP K */
	} /* END LOOP M */

	GA_Sync();
	t2 = GA_Wtime();
	seconds = t2 - t1;
	if (proc == 0)
		printf("Time taken for MM (secs):%lf \n", seconds);

        printf("Number of ACC: %d\n", count_acc);

	/* Correctness test - modify data again before this function */
	for (int i = 0; i < NDIMS; i++) {
		lo[i] = 0;
		hi[i] = dims[i]-1;
		ld[i] = dims[i];
	}

	verify(g_a, g_b, g_c, lo, hi, ld, N);

	/* Clear local buffers */
	free(a);
	free(b);
	free(c);
	free(rdcnt);
	free(rdcnt2);

	GA_Sync();

	/* Deallocate arrays */
	GA_Destroy(g_a);
	GA_Destroy(g_b);
	GA_Destroy(g_c);
	GA_Destroy(g_cnt);
	GA_Destroy(g_cnt2);
}

Пример #17

0

Показать файл

Файл: lock.c Проект: wadejong/NWChem-Json

int main(int argc, char **argv)
{
    int me;
    int nproc;
    int status;
    int g_a;
    int dims[NDIM];
    int chunk[NDIM];
    int pg_world;
    size_t num = 10;
    double *p1 = NULL;
    double *p2 = NULL;
    size_t i;
    int num_mutex;
    int lo[1];
    int hi[1];
    int ld[1]={1};
    MPI_Comm comm;

    MP_INIT(argc,argv);
    GA_INIT(argc,argv);

    me = GA_Nodeid();
    nproc = GA_Nnodes();
    comm = GA_MPI_Comm_pgroup_default();

    printf("%d: Hello world!\n",me);

    if (me==0) printf("%d: GA_Initialize\n",me);
    /*if (me==0) printf("%d: ARMCI_Init\n",me);*/
    /*ARMCI_Init();*/
    /*if (me==0) printf("%d: MA_Init\n",me);*/
    /*MA_init(MT_DBL, 8*1024*1024, 2*1024*1024);*/

    if (me==0) printf("%d: GA_Create_handle\n",me);
    g_a = GA_Create_handle();

    if (me==0) printf("%d: GA_Set_array_name\n",me);
    GA_Set_array_name(g_a,"test array A");

    dims[0] = 30;
    if (me==0) printf("%d: GA_Set_data\n",me);
    GA_Set_data(g_a,NDIM,dims,MT_DBL);

    chunk[0] = -1;
    if (me==0) printf("%d: GA_Set_chunk\n",me);
    GA_Set_chunk(g_a,chunk);

    if (me==0) printf("%d: GA_Pgroup_get_world\n",me);
    pg_world = GA_Pgroup_get_world();
    if (me==0) printf("%d: GA_Set_pgroup\n",me);
    GA_Set_pgroup(g_a,pg_world);

    if (me==0) printf("%d: GA_Allocate\n",me);
    status = GA_Allocate(g_a);
    if(0 == status) MPI_Abort(comm,100);

    if (me==0) printf("%d: GA_Zero\n",me);
    GA_Zero(g_a);

    if (me==0) printf("%d: GA_Sync\n",me);
    GA_Sync();

    num = 10;
    p1 = malloc(num*sizeof(double));
    /*double* p1 = ARMCI_Malloc_local(num*sizeof(double));*/
    if (p1==NULL) MPI_Abort(comm,1000);
    p2 = malloc(num*sizeof(double));
    /*double* p2 = ARMCI_Malloc_local(num*sizeof(double));*/
    if (p2==NULL) MPI_Abort(comm,2000);

    for ( i=0 ; i<num ; i++ ) p1[i] = 7.0;
    for ( i=0 ; i<num ; i++ ) p2[i] = 3.0;

    num_mutex = 17;
    status = GA_Create_mutexes(num_mutex);
    if (me==0) printf("%d: GA_Create_mutexes = %d\n",me,status);

/***************************************************************/
    if (me==0) {
        printf("%d: before GA_Lock\n",me);
        GA_Lock(0);
        lo[0] = 0;
        hi[0] = num-1;
        GA_Init_fence();
        NGA_Put(g_a,lo,hi,p1,ld);
        GA_Fence();
        GA_Unlock(0);
        printf("%d: after GA_Unlock\n",me);
    } 
    GA_Print(g_a);
    if (me==1) {
        printf("%d: before GA_Lock\n",me);
        GA_Lock(0);
        lo[0] = 0;
        hi[0] = num-1;
        GA_Init_fence();
        NGA_Get(g_a,lo,hi,p2,ld);
        GA_Fence();
        GA_Unlock(0);
        printf("%d: after GA_Unlock\n",me);
        for ( i=0 ; i<num ; i++ ) printf("p2[%2lu] = %20.10f\n",
                (long unsigned)i,p2[i]);
    }
/***************************************************************/



    status = GA_Destroy_mutexes();
    if (me==0) printf("%d: GA_Destroy_mutexes = %d\n",me,status);

    /*ARMCI_Free(p2);*/
    /*ARMCI_Free(p1);*/
    free(p2);
    free(p1);

    if (me==0) printf("%d: GA_Destroy\n",me);
    GA_Destroy(g_a);

    /*if (me==0) printf("%d: ARMCI_Finalize\n",me);*/
    /*ARMCI_Finalize();*/
    if (me==0) printf("%d: GA_Terminate\n",me);
    GA_Terminate();
    if (me==0) printf("%d: MPI_Finalize\n",me);
    MPI_Finalize();

    return(0);
}

Пример #18

0

Показать файл

Файл: ga_put.c Проект: sg0/Elemental

int main(int argc, char **argv)
{
  int rank, nprocs;
  int g_A;
  int *local_A=NULL, *local_B=NULL, *output_A=NULL;
  int dims[DIM]={SIZE,SIZE}, dims2[DIM], lo[DIM]={SIZE-SIZE,SIZE-SIZE}, hi[DIM]={SIZE-1,SIZE-1}, ld=SIZE;
  int value=SIZE;
  //int value=0;

#if defined(USE_ELEMENTAL)
  // initialize Elemental (which will initialize MPI)
  ElInitialize( &argc, &argv );
  ElMPICommRank( MPI_COMM_WORLD, &rank );
  ElMPICommSize( MPI_COMM_WORLD, &nprocs );
  // instantiate el::global array
  ElGlobalArraysConstruct_i( &eliga );
  // initialize global arrays
  ElGlobalArraysInitialize_i( eliga );
#else
  MPI_Init(&argc, &argv);

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  MA_init(C_INT, 1000, 1000);

  GA_Initialize();
#endif

  local_A=(int*)malloc(SIZE*SIZE*sizeof(int));
  output_A=(int*)malloc(SIZE*SIZE*sizeof(int));
  memset (output_A, 0, SIZE*SIZE*sizeof(int));
  for(int j=0; j<SIZE; j++)
      for(int i=0; i<SIZE; i++) local_A[i+j*ld]=(i + j);

  local_B=(int*)malloc(SIZE*SIZE*sizeof(int));
  memset (local_B, 0, SIZE*SIZE*sizeof(int));

#if defined(USE_ELEMENTAL)
  ElGlobalArraysCreate_i( eliga, DIM, dims, "array_A", NULL, NULL, &g_A );
  ElGlobalArraysFill_i( eliga, g_A, &value );
  ElGlobalArraysPrint_i( eliga, g_A );
  // acc data
  ElGlobalArraysPut_i( eliga, g_A, lo, hi, local_A, &ld );
  ElGlobalArraysSync_i( eliga );
  // get
  ElGlobalArraysGet_i( eliga, g_A, lo, hi, local_B, &ld );
  ElGlobalArraysSync_i( eliga );
  ElGlobalArraysPrint_i( eliga, g_A );
#else
  g_A = NGA_Create(C_INT, DIM, dims, "array_A", NULL);
  GA_Fill(g_A, &value);
  GA_Print(g_A);

  NGA_Put(g_A, lo, hi, local_A, &ld);
  
  GA_Sync();
  
  NGA_Get(g_A, lo, hi, local_B, &ld);

  GA_Sync();
  
  GA_Print(g_A);
#endif

  // updated output
  MPI_Reduce (local_A, output_A, SIZE*SIZE, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);

  if(rank==0)
    {
      printf(" Original local buffer to be accumulated: \n");

      for(int i=0; i<SIZE; i++)
	{
	  for(int j=0; j<SIZE; j++)
	    printf("%d ", local_A[i*ld+j]);
	  printf("\n");
	}
      printf("\n");
      printf(" Get returns: \n");
      for(int i=0; i<SIZE; i++)
	{
	  for(int j=0; j<SIZE; j++)
	    printf("%d ", local_B[i*ld + j]);
	  printf("\n");
	}

      printf("\n");
      for(int i=0; i<SIZE; i++)
	{
	  for(int j=0; j<SIZE; j++)
	    {
	      if(local_B[i*ld+j]!=output_A[i*ld+j])
		  GA_Error("ERROR", -99);
	    }
	}
    }
#if defined(USE_ELEMENTAL)
  ElGlobalArraysDestroy_i( eliga, g_A );
#else
  GA_Destroy(g_A);
#endif
  if(rank == 0)
    printf ("OK. Test passed\n");

    free (local_A);
    free (local_B);
    free (output_A);

#if defined(USE_ELEMENTAL)
    ElGlobalArraysTerminate_i( eliga );
    // call el::global arrays destructor
    ElGlobalArraysDestruct_i( eliga );
    ElFinalize();
#else
    GA_Terminate();
    MPI_Finalize();
#endif
}

Пример #19

0

Показать файл

Файл: transpose.c Проект: jeffhammond/ga

int main(int argc, char **argv) {
  int nmax, nprocs, me, me_plus;
  int g_a_data, g_a_i, g_a_j, isize;
  int gt_a_data, gt_a_i, gt_a_j;
  int g_b, g_c;
  int i, j, jj, k, one, jcnt;
  int chunk, kp1, ld;
  int *p_i, *p_j;
  double *p_data, *p_b, *p_c;
  double t_beg, t_beg2, t_ga_tot, t_get, t_mult, t_cnstrct, t_mpi_in, t_ga_in;
  double t_hypre_strct, t_ga_trans, t_gp_get;
  double t_get_blk_csr, t_trans_blk_csr, t_trans_blk, t_create_csr_ga, t_beg3;
  double t_gp_tget, t_gp_malloc, t_gp_assign, t_beg4;
  double prdot, dotga, dothypre, tempc;
  double prtot, gatot, hypretot, gatot2, hypretot2;
  double prdot2, prtot2;
  int status;
  int idim, jdim, kdim, idum, memsize;
  int lsize, ntot;
  int heap=200000, fudge=100, stack=200000, ma_heap;
  double *cbuf, *vector;
  int pdi, pdj, pdk, ip, jp, kp, ncells;
  int lo[3],hi[3];
  int blo[3], bhi[3];
  int ld_a, ld_b, ld_c, ld_i, ld_j, irows, ioff, joff, total_procs;
  int iproc, iblock, btot;
  double *amat, *bvec;
  int *ivec, *jvec;
  int *proclist, *proc_inv, *icnt;
  int *voffset, *offset, *mapc;
  int iloop, lo_bl, hi_bl;
  char *buf, **buf_ptr;
  int *iparams, *jval, *ival;
  double *rval, *rvalt;
  int imin, imax, jmin, jmax, irow, icol, nnz;
  int nrows, kmin, kmax, lmin, lmax, jdx;
  int LOOPNUM = 100;
  void **blk_ptr;
  void *blk;
  int blk_size, tsize, zero;
  int *iblk, *jblk, *blkidx;
  int *tblk_ptr;
  int *ivalt, *jvalt, *iparamst;
  int *iblk_t, *jblk_t, *blkidx_t;
/*
   Hypre declarations
*/
  int ierr;
#if USE_HYPRE
  HYPRE_StructGrid grid;
  HYPRE_StructStencil stencil;
  HYPRE_StructMatrix matrix;
  HYPRE_StructVector vec_x, vec_y;
  int i4, j4, ndim, nelems, offsets[7][3];
  int stencil_indices[7], hlo[3], hhi[3];
  double weights[7];
  double *values;
  double alpha, beta;
  int *rows, *cols;
#endif
/*
  ***  Intitialize a message passing library
*/
  zero = 0;
  one = 1;
  ierr = MPI_Init(&argc, &argv);
/*
 ***  Initialize GA
 
      There are 2 choices: ga_initialize or ga_initialize_ltd.
      In the first case, there is no explicit limit on memory usage.
      In the second, user can set limit (per processor) in bytes.
*/
  t_beg = GA_Wtime();
  NGA_Initialize();
  GP_Initialize();
  t_ga_in = GA_Wtime() - t_beg;
  NGA_Dgop(&t_ga_in,one,"+");

  t_ga_tot = 0.0;
  t_ga_trans = 0.0;
  t_get_blk_csr = 0.0;
  t_create_csr_ga = 0.0;
  t_trans_blk_csr = 0.0;
  t_trans_blk = 0.0;
  t_gp_get = 0.0;
  t_gp_malloc = 0.0;
  t_gp_assign = 0.0;
  t_mult = 0.0;
  t_get = 0.0;
  t_gp_tget = 0.0;
  t_hypre_strct = 0.0;
  prtot = 0.0;
  prtot2 = 0.0;
  gatot = 0.0;
  hypretot = 0.0;

  me = NGA_Nodeid();
  me_plus = me + 1;
  nprocs = NGA_Nnodes();
  if (me == 0) {
   printf("Time to initialize GA:                                 %12.4f\n",
          t_ga_in/((double)nprocs));
  }
/*
     we can also use GA_set_memory_limit BEFORE first ga_create call
*/
  ma_heap = heap + fudge;
/*      call GA_set_memory_limit(util_mdtob(ma_heap)) */
 
  if (me == 0) {
    printf("\nNumber of cores used: %d\n\nGA initialized\n\n",nprocs);
  }
/*
 ***  Initialize the MA package
      MA must be initialized before any global array is allocated
*/
  if (!MA_init(MT_DBL, stack, ma_heap)) NGA_Error("ma_init failed",-1);
/*
     create a sparse LMAX x LMAX matrix and two vectors of length
     LMAX. The matrix is stored in compressed row format.
     One of the vectors is filled with random data and the other
     is filled with zeros.
*/
  idim = IMAX;
  jdim = JMAX;
  kdim = KMAX;
  ntot = idim*jdim*kdim;
  if (me == 0) {
    printf("\nDimension of matrix: %d\n\n",ntot);
  }
  t_beg = GA_Wtime();
  grid_factor(nprocs,idim,jdim,kdim,&pdi,&pdj,&pdk);
  if (me == 0) {
    printf("\nProcessor grid configuration\n");
    printf("  PDX: %d\n",pdi);
    printf("  PDY: %d\n",pdj);
    printf("  PDZ: %d\n\n",pdk);
    printf(" Number of Loops: %d\n",LOOPNUM);
  }

  create_laplace_mat(idim,jdim,kdim,pdi,pdj,pdk,&g_a_data,&g_a_j,&g_a_i,&mapc);
  t_cnstrct = GA_Wtime() - t_beg;

  g_b = NGA_Create_handle();
  NGA_Set_data(g_b,one,&ntot,MT_DBL);
  NGA_Set_irreg_distr(g_b,mapc,&nprocs);
  status = NGA_Allocate(g_b);
/*
    fill g_b with random values
*/
  NGA_Distribution(g_b,me,blo,bhi);
  NGA_Access(g_b,blo,bhi,&p_b,&ld);
  ld = bhi[0]-blo[0]+1;
  btot = ld;
  vector = (double*)malloc(ld*sizeof(double));
  for (i=0; i<ld; i++) {
    idum  = 0;
    p_b[i] = ran3(&idum);
    vector[i] = p_b[i];
  }
  NGA_Release(g_b,blo,bhi);
  NGA_Sync();

  g_c = NGA_Create_handle();
  NGA_Set_data(g_c,one,&ntot,MT_DBL);
  NGA_Set_irreg_distr(g_c,mapc,&nprocs);
  status = NGA_Allocate(g_c);
  NGA_Zero(g_c);
#if USE_HYPRE
/*
    Assemble HYPRE grid and use that to create matrix. Start by creating
    grid partition
*/
  ndim = 3;
  i = me;
  ip = i%pdi;
  i = (i-ip)/pdi;
  jp = i%pdj;
  kp = (i-jp)/pdj;
  lo[0] = (int)(((double)idim)*((double)ip)/((double)pdi));
  if (ip < pdi-1) {
    hi[0] = (int)(((double)idim)*((double)(ip+1))/((double)pdi)) - 1;
  } else {
    hi[0] = idim - 1;
  }
  lo[1] = (int)(((double)jdim)*((double)jp)/((double)pdj));
  if (jp < pdj-1) {
    hi[1] = (int)(((double)jdim)*((double)(jp+1))/((double)pdj)) - 1;
  } else {
    hi[1] = jdim - 1;
  }
  lo[2] = (int)(((double)kdim)*((double)kp)/((double)pdk));
  if (kp < pdk-1) {
    hi[2] = (int)(((double)kdim)*((double)(kp+1))/((double)pdk)) - 1;
  } else {
    hi[2] = kdim - 1;
  }
/*
   Create grid
*/
  hlo[0] = lo[0];
  hlo[1] = lo[1];
  hlo[2] = lo[2];
  hhi[0] = hi[0];
  hhi[1] = hi[1];
  hhi[2] = hi[2];
  ierr = HYPRE_StructGridCreate(MPI_COMM_WORLD, ndim, &grid);
  ierr = HYPRE_StructGridSetExtents(grid, hlo, hhi);
  ierr = HYPRE_StructGridAssemble(grid);
/*
   Create stencil
*/
  offsets[0][0] = 0;
  offsets[0][1] = 0;
  offsets[0][2] = 0;

  offsets[1][0] = 1;
  offsets[1][1] = 0;
  offsets[1][2] = 0;

  offsets[2][0] = 0;
  offsets[2][1] = 1;
  offsets[2][2] = 0;

  offsets[3][0] = 0;
  offsets[3][1] = 0;
  offsets[3][2] = 1;

  offsets[4][0] = -1;
  offsets[4][1] = 0;
  offsets[4][2] = 0;

  offsets[5][0] = 0;
  offsets[5][1] = -1;
  offsets[5][2] = 0;

  offsets[6][0] = 0;
  offsets[6][1] = 0;
  offsets[6][2] = -1;

  nelems = 7;
  ierr = HYPRE_StructStencilCreate(ndim, nelems, &stencil);
  for (i=0; i<nelems; i++) {
    ierr = HYPRE_StructStencilSetElement(stencil, i, offsets[i]);
  }

  ncells = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
  jcnt = 7*ncells;
  values = (double*)malloc(jcnt*sizeof(double));
  jcnt = 0;
  weights[0] = 6.0;
  weights[1] = -1.0;
  weights[2] = -1.0;
  weights[3] = -1.0;
  weights[4] = -1.0;
  weights[5] = -1.0;
  weights[6] = -1.0;
  for (i=0; i<ncells; i++) {
    for (j=0; j<7; j++) {
      values[jcnt] = weights[j];
      jcnt++;
    }
  }

  ierr = HYPRE_StructMatrixCreate(MPI_COMM_WORLD, grid, stencil, &matrix);
  ierr = HYPRE_StructMatrixInitialize(matrix);
  for (i=0; i<7; i++) {
    stencil_indices[i] = i;
  }
  ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, 7, stencil_indices, values);
  free(values);
/*
   Check all six sides of current box to see if any are boundaries.
   Set values to zero if they are.
*/
  if (hi[0] == idim-1) {
    ncells = (hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
    hlo[0] = idim-1;
    hhi[0] = idim-1;
    hlo[1] = lo[1];
    hhi[1] = hi[1];
    hlo[2] = lo[2];
    hhi[2] = hi[2];
    values = (double*)malloc(ncells*sizeof(double));
    for (i=0; i<ncells; i++) values[i] = 0.0;
    i4 = 1;
    j4 = 1;
    ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
    free(values);
  }
  if (hi[1] == jdim-1) {
    ncells = (hi[0]-lo[0]+1)*(hi[2]-lo[2]+1);
    hlo[0] = lo[0];
    hhi[0] = hi[0];
    hlo[1] = jdim-1;
    hhi[1] = jdim-1;
    hlo[2] = lo[2];
    hhi[2] = hi[2];
    values = (double*)malloc(ncells*sizeof(double));
    for (i=0; i<ncells; i++) values[i] = 0.0;
    i4 = 1;
    j4 = 2;
    ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
    free(values);
  } 
  if (hi[2] == kdim-1) {
    ncells = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1);
    hlo[0] = lo[0];
    hhi[0] = hi[0];
    hlo[1] = lo[1];
    hhi[1] = hi[1];
    hlo[2] = kdim-1;
    hhi[2] = kdim-1;
    values = (double*)malloc(ncells*sizeof(double));
    for (i=0; i<ncells; i++) values[i] = 0.0;
    i4 = 1;
    j4 = 3;
    ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
    free(values);
  }
  if (lo[0] == 0) {
    ncells = (hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
    hlo[0] = 0;
    hhi[0] = 0;
    hlo[1] = lo[1];
    hhi[1] = hi[1];
    hlo[2] = lo[2];
    hhi[2] = hi[2];
    values = (double*)malloc(ncells*sizeof(double));
    for (i=0; i<ncells; i++) values[i] = 0.0;
    i4 = 1;
    j4 = 4;
    ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
    free(values);
  }
  if (lo[1] == 0) {
    ncells = (hi[0]-lo[0]+1)*(hi[2]-lo[2]+1);
    hlo[0] = lo[0];
    hhi[0] = hi[0];
    hlo[1] = 0;
    hhi[1] = 0;
    hlo[2] = lo[2];
    hhi[2] = hi[2];
    values = (double*)malloc(ncells*sizeof(double));
    for (i=0; i<ncells; i++) values[i] = 0.0;
    i4 = 1;
    j4 = 5;
    ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
    free(values);
  }
  if (lo[2] == 1) {
    ncells = (hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);
    hlo[0] = lo[0];
    hhi[0] = hi[0];
    hlo[1] = lo[1];
    hhi[1] = hi[1];
    hlo[2] = 0;
    hhi[2] = 0;
    values = (double*)malloc(ncells*sizeof(double));
    for (i=0; i<ncells; i++) values[i] = 0.0;
    i4 = 1;
    j4 = 6;
    ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values);
    free(values);
  }
  ierr = HYPRE_StructMatrixAssemble(matrix);
/*
    Create vectors for matrix-vector multiply
*/
  ierr = HYPRE_StructVectorCreate(MPI_COMM_WORLD, grid, &vec_x);
  ierr = HYPRE_StructVectorInitialize(vec_x);
  hlo[0] = lo[0];
  hlo[1] = lo[1];
  hlo[2] = lo[2];
  hhi[0] = hi[0];
  hhi[1] = hi[1];
  hhi[2] = hi[2];
  ierr = HYPRE_StructVectorSetBoxValues(vec_x, hlo, hhi, vector);
  ierr = HYPRE_StructVectorAssemble(vec_x);
  NGA_Distribution(g_a_i,me,blo,bhi);

  if (bhi[1] > ntot-1) {
    bhi[1] = ntot-1;
  }

  btot = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1)*(hi[2]-lo[2]+1);

  for (i=0; i<btot; i++) vector[i] = 0.0;
  hlo[0] = lo[0];
  hlo[1] = lo[1];
  hlo[2] = lo[2];
  hhi[0] = hi[0];
  hhi[1] = hi[1];
  hhi[2] = hi[2];
  ierr = HYPRE_StructVectorGetBoxValues(vec_x, hlo, hhi, vector);

  for (i=0; i<btot; i++) vector[i] = 0.0;
  ierr = HYPRE_StructVectorCreate(MPI_COMM_WORLD, grid, &vec_y);
  ierr = HYPRE_StructVectorInitialize(vec_y);
  ierr = HYPRE_StructVectorSetBoxValues(vec_y, hlo, hhi, vector);
  ierr = HYPRE_StructVectorAssemble(vec_y);
#endif
/* Multiply sparse matrix. Start by accessing pointers to local portions of
   g_a_data, g_a_j, g_a_i */

  NGA_Sync();
  for (iloop=0; iloop<LOOPNUM; iloop++) {
    t_beg2 = GA_Wtime();

    NGA_Distribution(g_c,me,blo,bhi);
    NGA_Access(g_c,blo,bhi,&p_c,&ld_c);
    for (i = 0; i<bhi[0]-blo[0]+1; i++) {
      p_c[i] = 0.0;
    }

/* get number of matrix blocks coupled to this process */
    NGA_Get(g_a_i,&me,&me,&lo_bl,&one);
#if 1
    NGA_Get(g_a_i,&me_plus,&me_plus,&hi_bl,&one);
    hi_bl--;
    total_procs = hi_bl - lo_bl + 1;
    blk_ptr = (void**)malloc(sizeof(void*));
/* Loop through matrix blocks */
    ioff = 0;
    for (iblock = 0; iblock<total_procs; iblock++) {
      t_beg = GA_Wtime();
      jdx = lo_bl+iblock;
#if 0
      GP_Access_element(g_a_data, &jdx, &blk_ptr[0], &isize);
#endif
#if 1
      GP_Get_size(g_a_data, &jdx, &jdx, &isize);
#endif
      blk = (void*)malloc(isize);
#if 1
      GP_Get(g_a_data, &jdx, &jdx, blk, blk_ptr, &one, &blk_size, &one, &tsize, 0); 
#endif
      t_gp_get = t_gp_get + GA_Wtime() - t_beg;
      iparams = (int*)blk_ptr[0];
      rval = (double*)(iparams+7);
      imin = iparams[0];
      imax = iparams[1];
      jmin = iparams[2];
      jmax = iparams[3];
      irow = iparams[4];
      icol = iparams[5];
      nnz = iparams[6];
      jval = (int*)(rval+nnz);
      ival = (int*)(jval+nnz);
      nrows = imax - imin + 1;
      bvec = (double*)malloc((jmax-jmin+1)*sizeof(double));
      j = 0;
      t_beg = GA_Wtime();
      NGA_Get(g_b,&jmin,&jmax,bvec,&j);
      t_get = t_get + GA_Wtime() - t_beg;
      t_beg = GA_Wtime();
      for (i=0; i<nrows; i++) {
        kmin = ival[i];
        kmax = ival[i+1]-1;
        tempc = 0.0;
        for (j = kmin; j<=kmax; j++) {
          jj = jval[j];
          tempc = tempc + rval[j]*bvec[jj];
        }
        p_c[i] = p_c[i] + tempc;
      }
      t_mult = t_mult + GA_Wtime() - t_beg;
      free(bvec);
      free(blk);
    }
    NGA_Sync();
    t_ga_tot = t_ga_tot + GA_Wtime() - t_beg2;

    NGA_Distribution(g_c,me,blo,bhi);
    NGA_Release(g_c,blo,bhi);

#if USE_HYPRE
    alpha = 1.0;
    beta = 0.0;
    t_beg = GA_Wtime();
    ierr = HYPRE_StructMatrixMatvec(alpha, matrix, vec_x, beta, vec_y);
    t_hypre_strct = t_hypre_strct + GA_Wtime() - t_beg;
    hlo[0] = lo[0];
    hlo[1] = lo[1];
    hlo[2] = lo[2];
    hhi[0] = hi[0];
    hhi[1] = hi[1];
    hhi[2] = hi[2];
    ierr = HYPRE_StructVectorGetBoxValues(vec_y, hlo, hhi, vector);
    NGA_Distribution(g_c,me,hlo,hhi);
    cbuf = (double*)malloc((hhi[0]-hlo[0]+1)*sizeof(double));
    NGA_Get(g_c,hlo,hhi,cbuf,&one);
    prdot = 0.0;
    dotga = 0.0;
    dothypre = 0.0;
    for (i=0; i<(hhi[0]-hlo[0]+1); i++) {
      dothypre = dothypre + vector[i]*vector[i];
      dotga = dotga + cbuf[i]*cbuf[i];
      prdot = prdot + (vector[i]-cbuf[i])*(vector[i]-cbuf[i]);
    }
    NGA_Dgop(&dotga,1,"+");
    NGA_Dgop(&dothypre,1,"+");
    NGA_Dgop(&prdot,1,"+");
    gatot += sqrt(dotga);
    hypretot += sqrt(dothypre);
    prtot += sqrt(prdot);
    free(cbuf);
#endif

/* Transpose matrix. Start by making local copies of ival and jval arrays for
   the sparse matrix of blocks stored in the GP array */
#if 1
    t_beg2 = GA_Wtime();
    t_beg3 = GA_Wtime();
    iblk = (int*)malloc((nprocs+1)*sizeof(int));
    iblk_t = (int*)malloc((nprocs+1)*sizeof(int));
#if 0
    NGA_Get(g_a_i,&zero,&nprocs,iblk,&one);
#else
    if (me == 0) {
      NGA_Get(g_a_i,&zero,&nprocs,iblk,&one);
    } else {
      for (i=0; i<nprocs+1; i++) {
        iblk[i] = 0;
      }
    }
    GA_Igop(iblk,nprocs+1,"+");
#endif
    jblk = (int*)malloc(iblk[nprocs]*sizeof(int));
    jblk_t = (int*)malloc(iblk[nprocs]*sizeof(int));
    iblock = iblk[nprocs]-1;
#if 0
    NGA_Get(g_a_j,&zero,&iblock,jblk,&one);
#else
    if (me == 0) {
      NGA_Get(g_a_j,&zero,&iblock,jblk,&one);
    } else {
      for (i=0; i<iblock+1; i++) {
        jblk[i] = 0;
      }
    }
    GA_Igop(jblk,iblock+1,"+");
#endif
    iblock++;
    blkidx = (int*)malloc(iblk[nprocs]*sizeof(int));
    blkidx_t = (int*)malloc(iblk[nprocs]*sizeof(int));
    for (i=0; i<iblock; i++) {
      blkidx[i] = i;
    }
    iblock = nprocs;
    t_get_blk_csr = t_get_blk_csr + GA_Wtime() - t_beg3;
    t_beg3 = GA_Wtime();
    stran(iblock, iblock, iblk, jblk, blkidx, iblk_t, jblk_t, blkidx_t);
    t_trans_blk_csr = t_trans_blk_csr + GA_Wtime() - t_beg3;
    t_beg3 = GA_Wtime();
    gt_a_data = GP_Create_handle();
    i = iblk_t[nprocs];
    GP_Set_dimensions(gt_a_data, one, &i);
    GP_Set_irreg_distr(gt_a_data, iblk_t, &nprocs);
    GP_Allocate(gt_a_data);

    gt_a_j = NGA_Create_handle();
    i = iblk_t[nprocs];
    NGA_Set_data(gt_a_j, one, &i, C_INT);
    NGA_Set_irreg_distr(gt_a_j, iblk_t, &nprocs);
    NGA_Allocate(gt_a_j);

    gt_a_i = NGA_Create_handle();
    i = nprocs+1;
    NGA_Set_data(gt_a_i,one,&i,C_INT);
    for (i=0; i<nprocs; i++) mapc[i] = i;
    NGA_Set_irreg_distr(gt_a_i, mapc, &nprocs);
    NGA_Allocate(gt_a_i);

    /* copy i and j arrays of transposed matrix into distributed arrays */
    if (me==0) {
      lo_bl = 0;
      hi_bl = nprocs;
      NGA_Put(gt_a_i,&lo_bl,&hi_bl,iblk_t,&one);
      lo_bl = 0;
      hi_bl = iblk_t[nprocs]-1;
      NGA_Put(gt_a_j,&lo_bl,&hi_bl,jblk_t,&one);
    }
    NGA_Sync();
    lo_bl = iblk[me];
    hi_bl = iblk[me+1];
    total_procs = hi_bl - lo_bl + 1;
    total_procs = hi_bl - lo_bl;
    t_create_csr_ga = t_create_csr_ga + GA_Wtime() - t_beg3;
    for (iblock = lo_bl; iblock < hi_bl; iblock++) {
      t_beg4 = GA_Wtime();
      jdx = blkidx_t[iblock];
      GP_Get_size(g_a_data, &jdx, &jdx, &isize);
      blk = (void*)malloc(isize);
      GP_Get(g_a_data, &jdx, &jdx, blk, blk_ptr, &one, &blk_size, &one, &tsize, 0); 
      /* Parameters for original block */
      iparams = (int*)blk_ptr[0];
      rval = (double*)(iparams+7);
      imin = iparams[0];
      imax = iparams[1];
      jmin = iparams[2];
      jmax = iparams[3];
      irow = iparams[4];
      icol = iparams[5];
      nnz = iparams[6];
      jval = (int*)(rval+nnz);
      ival = (int*)(jval+nnz);

      /* Create transposed block */
      isize = 7*sizeof(int) + nnz*(sizeof(double)+sizeof(int))
            + (jmax-jmin+2)*sizeof(int);
      t_gp_tget = t_gp_tget + GA_Wtime() - t_beg4;
      t_beg4 = GA_Wtime();
      tblk_ptr = (int*)GP_Malloc(isize);
      t_gp_malloc = t_gp_malloc + GA_Wtime() - t_beg4;
      t_beg3 = GA_Wtime();
      iparamst = (int*)tblk_ptr;
      rvalt = (double*)(iparamst+7);
      jvalt = (int*)(rvalt+nnz);
      ivalt = (int*)(jvalt+nnz);
      iparamst[0] = jmin;
      iparamst[1] = jmax;
      iparamst[2] = imin;
      iparamst[3] = imax;
      iparamst[4] = icol;
      iparamst[5] = irow;
      iparamst[6] = nnz;
      i = imax-imin+1;
      j = jmax-jmin+1;
      stranr(i, j, ival, jval, rval, ivalt, jvalt, rvalt);
      t_trans_blk = t_trans_blk + GA_Wtime() - t_beg3;
      t_beg4 = GA_Wtime();
      GP_Assign_local_element(gt_a_data, &iblock, (void*)tblk_ptr, isize);
      t_gp_assign = t_gp_assign + GA_Wtime() - t_beg4;
#if 1
      free(blk);
#endif
    }

    /* Clean up after transpose */
#if 1
    free(iblk);
    free(iblk_t);
    free(jblk);
    free(jblk_t);
    free(blkidx);
    free(blkidx_t);
#endif
    NGA_Sync();
    t_ga_trans = t_ga_trans + GA_Wtime() - t_beg2;
#if USE_HYPRE
    alpha = 1.0;
    beta = 0.0;
    ierr = HYPRE_StructMatrixMatvec(alpha, matrix, vec_x, beta, vec_y);
    hlo[0] = lo[0];
    hlo[1] = lo[1];
    hlo[2] = lo[2];
    hhi[0] = hi[0];
    hhi[1] = hi[1];
    hhi[2] = hi[2];
    ierr = HYPRE_StructVectorGetBoxValues(vec_y, hlo, hhi, vector);
    NGA_Distribution(g_c,me,hlo,hhi);
    cbuf = (double*)malloc((hhi[0]-hlo[0]+1)*sizeof(double));
    NGA_Get(g_c,hlo,hhi,cbuf,&one);
    dothypre = 0.0;
    dotga = 0.0;
    prdot2 = 0.0;
    for (i=0; i<(hhi[0]-hlo[0]+1); i++) {
      dothypre = dothypre + vector[i]*vector[i];
      dotga = dotga + cbuf[i]*cbuf[i];
      if (fabs(vector[i]-cbuf[i]) > 1.0e-10) {
        printf("p[%d] i: %d vector: %f cbuf: %f\n",me,i,vector[i],cbuf[i]);
      }
      prdot2 = prdot2 + (vector[i]-cbuf[i])*(vector[i]-cbuf[i]);
    }
    NGA_Dgop(&dotga,1,"+");
    NGA_Dgop(&dothypre,1,"+");
    NGA_Dgop(&prdot2,1,"+");
    prtot2 += sqrt(prdot2);
    gatot2 += sqrt(dotga);
    hypretot2 += sqrt(dothypre);
    free(cbuf);
    free(blk_ptr);
#endif
    /* Clean up transposed matrix */
    GP_Distribution(gt_a_data,me,blo,bhi);
    for (i=blo[0]; i<bhi[0]; i++) {
      GP_Free(GP_Free_local_element(gt_a_data,&i));
    }
    GP_Destroy(gt_a_data);
    NGA_Destroy(gt_a_i);
    NGA_Destroy(gt_a_j);
#endif
#endif
  }
  free(vector);
#if USE_HYPRE
  if (me == 0) {
    printf("Magnitude of GA solution:                         %e\n",
        gatot/((double)LOOPNUM));
    printf("Magnitude of HYPRE solution:                      %e\n",
        hypretot/((double)LOOPNUM));
    printf("Magnitude of GA solution(2):                      %e\n",
        gatot2/((double)LOOPNUM));
    printf("Magnitude of HYPRE solution(2):                   %e\n",
        hypretot2/((double)LOOPNUM));
    printf("Difference between GA and HYPRE (Struct) results: %e\n",
        prtot/((double)LOOPNUM));
    printf("Difference between transpose and HYPRE results:   %e\n",
        prtot2/((double)LOOPNUM));
  }
#endif

/*
   Clean up arrays
*/
  NGA_Destroy(g_b);
  NGA_Destroy(g_c);
  GP_Distribution(g_a_data,me,blo,bhi);
  for (i=blo[0]; i<bhi[0]; i++) {
    GP_Free(GP_Free_local_element(g_a_data,&i));
  }
  GP_Destroy(g_a_data);
  NGA_Destroy(g_a_i);
  NGA_Destroy(g_a_j);
#if USE_HYPRE
  ierr = HYPRE_StructStencilDestroy(stencil);
  ierr = HYPRE_StructGridDestroy(grid);
  ierr = HYPRE_StructMatrixDestroy(matrix);
  ierr = HYPRE_StructVectorDestroy(vec_x);
  ierr = HYPRE_StructVectorDestroy(vec_y);
#endif

  NGA_Dgop(&t_cnstrct,1,"+");
  NGA_Dgop(&t_get,1,"+");
  NGA_Dgop(&t_gp_get,1,"+");
  NGA_Dgop(&t_mult,1,"+");
  NGA_Dgop(&t_ga_tot,1,"+");
  NGA_Dgop(&t_ga_trans,1,"+");
  NGA_Dgop(&t_get_blk_csr,1,"+");
  NGA_Dgop(&t_trans_blk_csr,1,"+");
  NGA_Dgop(&t_trans_blk,1,"+");
  NGA_Dgop(&t_create_csr_ga,1,"+");
  NGA_Dgop(&t_gp_tget,1,"+");
  NGA_Dgop(&t_gp_malloc,1,"+");
  NGA_Dgop(&t_gp_assign,1,"+");
#if USE_HYPRE
  NGA_Dgop(&t_hypre_strct,1,"+");
#endif
  free(mapc);

  if (me == 0) {
    printf("Time to create sparse matrix:                         %12.4f\n",
      t_cnstrct/((double)(nprocs*LOOPNUM)));
    printf("Time to get right hand side vector:                   %12.4f\n",
      t_get/((double)(nprocs*LOOPNUM)));
    printf("Time to get GP blocks:                                %12.4f\n",
      t_gp_get/((double)(nprocs*LOOPNUM)));
    printf("Time for sparse matrix block multiplication:          %12.4f\n",
      t_mult/((double)(nprocs*LOOPNUM)));
    printf("Time for total sparse matrix multiplication:          %12.4f\n",
      t_ga_tot/((double)(nprocs*LOOPNUM)));
#if USE_HYPRE
    printf("Total time for HYPRE (Struct)  matrix-vector multiply:%12.4f\n",
      t_hypre_strct/((double)(nprocs*LOOPNUM)));
#endif
    printf("Time to get block CSR distribution:                   %12.4f\n",
      t_get_blk_csr/((double)(nprocs*LOOPNUM)));
    printf("Time for transposing block CSR distribution:          %12.4f\n",
      t_trans_blk_csr/((double)(nprocs*LOOPNUM)));
    printf("Time for creating transposed block CSR GA:            %12.4f\n",
      t_create_csr_ga/((double)(nprocs*LOOPNUM)));
    printf("Time for transposing blocks:                          %12.4f\n",
      t_trans_blk/((double)(nprocs*LOOPNUM)));
    printf("Time to get GP blocks for transpose:                  %12.4f\n",
      t_gp_tget/((double)(nprocs*LOOPNUM)));
    printf("Time to malloc GP blocks for transpose:               %12.4f\n",
      t_gp_malloc/((double)(nprocs*LOOPNUM)));
    printf("Time to assign GP blocks for transpose:               %12.4f\n",
      t_gp_assign/((double)(nprocs*LOOPNUM)));
    printf("Time for total sparse matrix transpose:               %12.4f\n",
      t_ga_trans/((double)(nprocs*LOOPNUM)));
  }
  if (me==0) {
    printf("Terminating GA library\n");
  }
  NGA_Terminate();
/*
 ***  Tidy up after message-passing library
 */
  ierr = MPI_Finalize();
}

Пример #20

0

Показать файл

Файл: transp1D.c Проект: MihaiLupoiu/TPP

void TRANSPOSE1D() {
    
    int dims[MAXDIM], chunk[MAXDIM], ld[MAXDIM], lo[MAXDIM], hi[MAXDIM];
    int lo1[MAXDIM], hi1[MAXDIM], lo2[MAXDIM], hi2[MAXDIM];
    int g_a, g_b, a[MAXPROC*TOTALELEMS],b[MAXPROC*TOTALELEMS];
    int nelem, i;    
    int me, nprocs;

    /* Find local processor ID and number of processors */
    /* ### assign the local processor ID to the int variable "me"
     * ### and the total number of processors to the int variable
     * ### "nprocs" */
    
    me     = GA_Nodeid();
    nprocs = GA_Nnodes();

    /* Configure array dimensions. Force an unequal data distribution */
    dims[0]  = nprocs*TOTALELEMS + nprocs/2;
    ld[0]    = dims[0];
    chunk[0] = TOTALELEMS; /* minimum data on each process */
 
    /* create a global array g_a and duplicate it to get g_b */
    /* ### create GA of integers with dimension "NDIM" and size "dims" with
     * ### minimum block size "chunk" and assign the handle to the
     * ### integer variable "g_a". Then create a second global array
     * ### assigned to the integer handle "g_b" by duplicating "g_a".
     * ### Assign the names "Array A" and "Array B" to "g_a" and "g_b". */

    g_a=NGA_Create(C_INT, 1, dims, "array A", chunk);
    g_b=GA_Duplicate(g_a,"array B");

    if (!g_a) GA_Error("create failed: A", NDIM);
    if (me==0) printf("  Created Array A\n");
    
    if (! g_b) GA_Error("duplicate failed",NDIM);
    if (me==0) printf("  Created Array B\n");
 
    /* initialize data in g_a */
    if (me==0) {
       printf("  Initializing matrix A\n");
       for(i=0; i<dims[0]; i++) a[i] = i;
       lo[0]  = 0;
       hi[0] = dims[0]-1;
     /* ### copy the contents of array "a" into the portion of global array
      * ### "g_a" described by "lo" and "hi". Use the array of strides
      * ### "ld" to describe the physical layout of array "a". */

      NGA_Put(g_a,lo,hi,a,ld);

    }

    /* Synchronize all processors to guarantee that everyone has data
       before proceeding to the next step. */

    /* ### synchronize all processors */
    GA_Sync();

    /* Start initial phase of inversion by inverting the data held locally on
       each processor. Start by finding out which data each processor owns. */

    /* ### find out which block of data my node ("me") owns for the global
     * ### array "g_a" and store the contents in the integer arrays "lo1" and
     * ### "hi1". */

    NGA_Distribution(g_a,me,lo1,hi1);

    /* Get locally held data and copy it into local buffer a  */

    /* ### use the arrays "lo1" and "hi1" to copy the locally held block of data
     * ### from the global array "g_a" into the local array "a". Use the array
     * ### of strides "ld" to describe the physical layout of "a". */

    NGA_Get(g_a,lo1,hi1,a,ld);

    /* Invert data locally */
    nelem = hi1[0] - lo1[0] + 1;
    for (i=0; i<nelem; i++) b[i] = a[nelem-1-i];
    
    /* Invert data globally by copying locally inverted blocks into
     * their inverted positions in the GA */
    lo2[0] = dims[0] - hi1[0] -1;
    hi2[0] = dims[0] - lo1[0] -1;

    /* ### copy data from the local array "b" into the block of the global
     * ### array "g_b" described by the integer arrays "lo2" and "hi2". Use
     * ### the array of strides "ld" to describe the physical layout of "b". */

    NGA_Put(g_b,lo2,hi2,b,ld);

    /* Synchronize all processors to make sure inversion is complete */
    /* ### synchronize all processors */
    
    GA_Sync();

    /* Check to see if inversion is correct */
    if(me == 0) verify(g_a, g_b);
    
    /* Deallocate arrays */
    /* ### destroy global arrays "g_a" and "g_b" */
    GA_Destroy(g_a);
    GA_Destroy(g_b);
}

Пример #21

0

Показать файл

Файл: lennard-jones_ga2.c Проект: fuentesdt/tao-1.10.1-p3

int InitializeVariables(GAVec ga_x, AppCtx *user) {
  double *x;
  int lo, hi, n, handle;
  double xx,yy,zz;
  int i,j,k, il, jl, ctr, icrtn, ileft, isqrtn;

  // This method is not parallelized
  if (user->me != 0) {
    GA_Sync();
    return 0;
  }

  n = user->ndim * user->natoms + 1;
  if (MA_push_stack(C_DBL, n, "InitializeVariables buffer", &handle))
    MA_get_pointer(handle, &x);
  else 
    ga_error("ma_alloc_get failed", n);

  lo = 0;
  hi = n-2;

  if (user->ndim == 2) {
    isqrtn = int(sqrt(user->natoms));
    ileft = user->natoms - isqrtn * isqrtn;
    xx = yy = 0.0;
    for (j=0; j <= isqrtn + ileft/isqrtn; j++) {
      for (i=0; i<TaoMin(isqrtn, user->natoms - j * isqrtn); i++) {
	ctr = j * isqrtn + i;
	x[2*ctr] = xx;
	x[2*ctr + 1] = yy;
	xx += 1.0;
      }
      yy += 1.0;
      xx = 0.0;
    }
  } else if (user->ndim == 3) {
    icrtn = (int)pow((int)(user->natoms + 0.5),1.0/3.0);
    ileft = user->natoms - icrtn*icrtn*icrtn;
    xx = yy = zz = 0.0;
    for (k=0; k<=icrtn + ileft/(icrtn*icrtn); k++) {
      jl = TaoMin(icrtn, (user->natoms - k*icrtn*icrtn)/icrtn + 1);
      for (j=0; j<jl; j++) {
	il = TaoMin(icrtn, user->natoms - k*icrtn*icrtn - j*icrtn);
	for (i = 0; i<il; i++) {
	  ctr = k*icrtn*icrtn + j*icrtn + i;
	  x[3*ctr] = xx;
	  x[3*ctr + 1] = yy;
	  x[3*ctr + 2] = zz;
	  xx += 1.0;
	}
	yy += 1.0;
	xx = 0.0;
      }
      zz += 1.0;
      yy = 0.0;
    }
  }

  // Distribute the array
  NGA_Put(ga_x, &lo, &hi, x, &hi);

  if (!MA_pop_stack(handle)) 
    ga_error("InitializeVariables:MA_pop_stack failed",0);

  GA_Sync();

  return 0;
}

Пример #22

0

Показать файл

Файл: gemmtest.c Проект: jeffhammond/ga

void
test(int data_type) {
  int me=GA_Nodeid();
  int nproc = GA_Nnodes();
  int g_a, g_b, g_c;
  int ndim = 2;
  int dims[2]={N,N};
  int lo[2]={0,0};
  int hi[2]={N-1,N-1};
  int block_size[2]={NB,NB-1};
  int proc_grid[2];
  int i,j,l,k,m,n, ld;

  double alpha_dbl = 1.0, beta_dbl = 0.0;
  double dzero = 0.0;
  double ddiff;

  float alpha_flt = 1.0, beta_flt = 0.0;
  float fzero = 0.0;
  float fdiff;
  float ftmp;
  double dtmp;
  SingleComplex ctmp;
  DoubleComplex ztmp;

  DoubleComplex alpha_dcpl = {1.0, 0.0} , beta_dcpl = {0.0, 0.0}; 
  DoubleComplex zzero = {0.0,0.0};
  DoubleComplex zdiff;

  SingleComplex alpha_scpl = {1.0, 0.0} , beta_scpl = {0.0, 0.0}; 
  SingleComplex czero = {0.0,0.0};
  SingleComplex cdiff;

  void *alpha=NULL, *beta=NULL;
  void *abuf=NULL, *bbuf=NULL, *cbuf=NULL, *c_ptr=NULL;

  switch (data_type) {
  case C_FLOAT:
    alpha  = (void *)&alpha_flt;
    beta   = (void *)&beta_flt;
    abuf = (void*)malloc(N*N*sizeof(float));
    bbuf = (void*)malloc(N*N*sizeof(float));
    cbuf = (void*)malloc(N*N*sizeof(float));
    if(me==0) printf("Single Precision: Testing GA_Sgemm,NGA_Matmul_patch for %d-Dimension", ndim);
    break;      
  case C_DBL:
    alpha  = (void *)&alpha_dbl;
    beta   = (void *)&beta_dbl;
    abuf = (void*)malloc(N*N*sizeof(double));
    bbuf = (void*)malloc(N*N*sizeof(double));
    cbuf = (void*)malloc(N*N*sizeof(double));
    if(me==0) printf("Double Precision: Testing GA_Dgemm,NGA_Matmul_patch for %d-Dimension", ndim); 
    break;    
  case C_DCPL:
    alpha  = (void *)&alpha_dcpl;
    beta   = (void *)&beta_dcpl;
    abuf = (void*)malloc(N*N*sizeof(DoubleComplex));
    bbuf = (void*)malloc(N*N*sizeof(DoubleComplex));
    cbuf = (void*)malloc(N*N*sizeof(DoubleComplex));
    if(me==0) printf("Double Complex:   Testing GA_Zgemm,NGA_Matmul_patch for %d-Dimension", ndim);
    break;
  case C_SCPL:
    alpha  = (void *)&alpha_scpl;
    beta   = (void *)&beta_scpl;
    abuf = (void*)malloc(N*N*sizeof(SingleComplex));
    bbuf = (void*)malloc(N*N*sizeof(SingleComplex));
    cbuf = (void*)malloc(N*N*sizeof(SingleComplex));
    if(me==0) printf("Single Complex:   Testing GA_Cgemm,NGA_Matmul_patch for %d-Dimension", ndim);
    break;
  default:
    GA_Error("wrong data type", data_type);
  }

  if (me==0) printf("\nCreate A, B, C\n");
#ifdef USE_REGULAR
  g_a = NGA_Create(data_type, ndim, dims, "array A", NULL);
#endif
#ifdef USE_SIMPLE_CYCLIC
  g_a = NGA_Create_handle();
  NGA_Set_data(g_a,ndim,dims,data_type);
  NGA_Set_array_name(g_a,"array A");
  NGA_Set_block_cyclic(g_a,block_size);
  if (!GA_Allocate(g_a)) {
    GA_Error("Failed: create: g_a",40);
  }
#endif
#ifdef USE_SCALAPACK
  g_a = NGA_Create_handle();
  NGA_Set_data(g_a,ndim,dims,data_type);
  NGA_Set_array_name(g_a,"array A");
  grid_factor(nproc,&i,&j);
  proc_grid[0] = i;
  proc_grid[1] = j;
  NGA_Set_block_cyclic_proc_grid(g_a,block_size,proc_grid);
  if (!GA_Allocate(g_a)) {
    GA_Error("Failed: create: g_a",40);
  }
#endif
#ifdef USE_TILED
  g_a = NGA_Create_handle();
  NGA_Set_data(g_a,ndim,dims,data_type);
  NGA_Set_array_name(g_a,"array A");
  grid_factor(nproc,&i,&j);
  proc_grid[0] = i;
  proc_grid[1] = j;
  NGA_Set_tiled_proc_grid(g_a,block_size,proc_grid);
  if (!GA_Allocate(g_a)) {
    GA_Error("Failed: create: g_a",40);
  }
#endif
  g_b = GA_Duplicate(g_a, "array B");  
  g_c = GA_Duplicate(g_a, "array C");
  if(!g_a || !g_b || !g_c) GA_Error("Create failed: a, b or c",1);

  ld = N;
  if (me==0) printf("\nInitialize A\n");
  /* Set up matrix A */
  if (me == 0) {
    for (i=0; i<N; i++) {
      for (j=0; j<N; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)abuf)[i*N+j] = (float)(i*N+j);
            break;
          case C_DBL:
            ((double*)abuf)[i*N+j] = (double)(i*N+j);
            break;
          case C_DCPL:
            ((DoubleComplex*)abuf)[i*N+j].real = (double)(i*N+j);
            ((DoubleComplex*)abuf)[i*N+j].imag = 1.0;
            break;
          case C_SCPL:
            ((SingleComplex*)abuf)[i*N+j].real = (float)(i*N+j);
            ((SingleComplex*)abuf)[i*N+j].imag = 1.0;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
    NGA_Put(g_a,lo,hi,abuf,&ld);
  }
  GA_Sync();

  if (me==0) printf("\nInitialize B\n");
  /* Set up matrix B */
  if (me == 0) {
    for (i=0; i<N; i++) {
      for (j=0; j<N; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)bbuf)[i*N+j] = (float)(j*N+i);
            break;
          case C_DBL:
            ((double*)bbuf)[i*N+j] = (double)(j*N+i);
            break;
          case C_DCPL:
            ((DoubleComplex*)bbuf)[i*N+j].real = (double)(j*N+i);
            ((DoubleComplex*)bbuf)[i*N+j].imag = 1.0;
            break;
          case C_SCPL:
            ((SingleComplex*)bbuf)[i*N+j].real = (float)(j*N+i);
            ((SingleComplex*)bbuf)[i*N+j].imag = 1.0;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
    NGA_Put(g_b,lo,hi,bbuf,&ld);
  }
  GA_Sync();

  if (me==0) printf("\nPerform matrix multiply\n");
  switch (data_type) {
    case C_FLOAT:
      NGA_Matmul_patch('N','N',&alpha_flt,&beta_flt,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_DBL:
      NGA_Matmul_patch('N','N',&alpha_dbl,&beta_dbl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_SCPL:
      NGA_Matmul_patch('N','N',&alpha_scpl,&beta_scpl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_DCPL:
      NGA_Matmul_patch('N','N',&alpha_dcpl,&beta_dcpl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
  GA_Sync();
#if 0
  if (me==0) printf("\nCheck answer\n");
  /*
  GA_Print(g_a);
  if (me == 0) printf("\n\n\n\n");
  GA_Print(g_b);
  if (me == 0) printf("\n\n\n\n");
  GA_Print(g_c); 
  */

  /* Check answer */
  NGA_Get(g_a,lo,hi,abuf,&ld);
  NGA_Get(g_b,lo,hi,bbuf,&ld);
  for (i=0; i<N; i++) {
    for (j=0; j<N; j++) {
      switch (data_type) {
        case C_FLOAT:
          ((float*)cbuf)[i*N+j] = fzero;
          break;
        case C_DBL:
          ((double*)cbuf)[i*N+j] = dzero;
          break;
        case C_DCPL:
          ((DoubleComplex*)cbuf)[i*N+j] = zzero;
          break;
        case C_SCPL:
          ((SingleComplex*)cbuf)[i*N+j] = czero;
          break;
        default:
          GA_Error("wrong data type", data_type);
      }
      for (k=0; k<N; k++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)cbuf)[i*N+j] += ((float*)abuf)[i*N+k]
              *((float*)bbuf)[k*N+j];
            break;
          case C_DBL:
            ((double*)cbuf)[i*N+j] += ((double*)abuf)[i*N+k]
              *((double*)bbuf)[k*N+j];
            break;
          case C_DCPL:
            ((DoubleComplex*)cbuf)[i*N+j].real +=
              (((DoubleComplex*)abuf)[i*N+k].real
               *((DoubleComplex*)bbuf)[k*N+j].real
               -(((DoubleComplex*)abuf)[i*N+k].imag
                 *((DoubleComplex*)bbuf)[k*N+j].imag));
            ((DoubleComplex*)cbuf)[i*N+j].imag +=
              (((DoubleComplex*)abuf)[i*N+k].real
               *((DoubleComplex*)bbuf)[k*N+j].imag
               +(((DoubleComplex*)abuf)[i*N+k].imag
                 *((DoubleComplex*)bbuf)[k*N+j].real));
            break;
          case C_SCPL:
            ((SingleComplex*)cbuf)[i*N+j].real +=
              (((SingleComplex*)abuf)[i*N+k].real
               *((SingleComplex*)bbuf)[k*N+j].real
               -(((SingleComplex*)abuf)[i*N+k].imag
                 *((SingleComplex*)bbuf)[k*N+j].imag));
            ((SingleComplex*)cbuf)[i*N+j].imag +=
              (((SingleComplex*)abuf)[i*N+k].real
               *((SingleComplex*)bbuf)[k*N+j].imag
               +(((SingleComplex*)abuf)[i*N+k].imag
                 *((SingleComplex*)bbuf)[k*N+j].real));
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
  }
  GA_Sync();
  if (me == 0) {
    NGA_Get(g_c,lo,hi,abuf,&ld);
    for (i=0; i<N; i++) {
      for (j=0; j<N; j++) {
        switch (data_type) {
          case C_FLOAT:
            fdiff = ((float*)abuf)[i*N+j]-((float*)cbuf)[i*N+j];
            if (((float*)abuf)[i*N+j] != 0.0) {
              fdiff /= ((float*)abuf)[i*N+j];
            }
            if (fabs(fdiff) > TOLERANCE) {
              printf("p[%d] [%d,%d] Actual: %f Expected: %f\n",me,i,j,
                  ((float*)abuf)[i*N+j],((float*)cbuf)[i*N+j]);
            }
            break;
          case C_DBL:
            ddiff = ((double*)abuf)[i*N+j]-((double*)cbuf)[i*N+j];
            if (((double*)abuf)[i*N+j] != 0.0) {
              ddiff /= ((double*)abuf)[i*N+j];
            }
            if (fabs(ddiff) > TOLERANCE) {
              printf("p[%d] [%d,%d] Actual: %f Expected: %f\n",me,i,j,
                  ((double*)abuf)[i*N+j],((double*)cbuf)[i*N+j]);
            }
            break;
          case C_DCPL:
            zdiff.real = ((DoubleComplex*)abuf)[i*N+j].real
              -((DoubleComplex*)cbuf)[i*N+j].real;
            zdiff.imag = ((DoubleComplex*)abuf)[i*N+j].imag
              -((DoubleComplex*)cbuf)[i*N+j].imag;
            if (((DoubleComplex*)abuf)[i*N+j].real != 0.0 ||
                ((DoubleComplex*)abuf)[i*N+j].imag != 0.0) {
              ztmp = ((DoubleComplex*)abuf)[i*N+j];
              ddiff = sqrt((zdiff.real*zdiff.real+zdiff.imag*zdiff.imag)
                  /(ztmp.real*ztmp.real+ztmp.imag*ztmp.imag));
            } else {
              ddiff = sqrt(zdiff.real*zdiff.real+zdiff.imag*zdiff.imag);
            }
            if (fabs(ddiff) > TOLERANCE) {
              printf("p[%d] [%d,%d] Actual: (%f,%f) Expected: (%f,%f)\n",me,i,j,
                  ((DoubleComplex*)abuf)[i*N+j].real,
                  ((DoubleComplex*)abuf)[i*N+j].imag,
                  ((DoubleComplex*)cbuf)[i*N+j].real,
                  ((DoubleComplex*)cbuf)[i*N+j].imag);
            }
            break;
          case C_SCPL:
            cdiff.real = ((SingleComplex*)abuf)[i*N+j].real
              -((SingleComplex*)cbuf)[i*N+j].real;
            cdiff.imag = ((SingleComplex*)abuf)[i*N+j].imag
              -((SingleComplex*)cbuf)[i*N+j].imag;
            if (((SingleComplex*)abuf)[i*N+j].real != 0.0 ||
                ((SingleComplex*)abuf)[i*N+j].imag != 0.0) {
              ctmp = ((SingleComplex*)abuf)[i*N+j];
              fdiff = sqrt((cdiff.real*cdiff.real+cdiff.imag*cdiff.imag)
                  /(ctmp.real*ctmp.real+ctmp.imag*ctmp.imag));
            } else {
              fdiff = sqrt(cdiff.real*cdiff.real+cdiff.imag*cdiff.imag);
            }
            if (fabs(fdiff) > TOLERANCE) {
              printf("p[%d] [%d,%d] Actual: (%f,%f) Expected: (%f,%f)\n",me,i,j,
                  ((SingleComplex*)abuf)[i*N+j].real,
                  ((SingleComplex*)abuf)[i*N+j].imag,
                  ((SingleComplex*)cbuf)[i*N+j].real,
                  ((SingleComplex*)cbuf)[i*N+j].imag);
            }
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
  }
  GA_Sync();

  /* copy cbuf back to g_a */
  if (me == 0) {
    NGA_Put(g_a,lo,hi,cbuf,&ld);
  }
  GA_Sync();

  /* Get norm of g_a */
  switch (data_type) {
    case C_FLOAT:
      ftmp = GA_Fdot(g_a,g_a);
      break;
    case C_DBL:
      dtmp = GA_Ddot(g_a,g_a);
      break;
    case C_DCPL:
      ztmp = GA_Zdot(g_a,g_a);
      break;
    case C_SCPL:
      ctmp = GA_Cdot(g_a,g_a);
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
  /* subtract C from A and put the results in B */
  beta_flt = -1.0;
  beta_dbl = -1.0;
  beta_scpl.real = -1.0;
  beta_dcpl.real = -1.0;
  GA_Zero(g_b);
  GA_Add(alpha,g_a,beta,g_c,g_b);
  /* evaluate the norm of the difference between the two matrices */
  switch (data_type) {
    case C_FLOAT:
      fdiff = GA_Fdot(g_b, g_b);
      if (ftmp != 0.0) {
        fdiff /= ftmp;
      }
      if(fabs(fdiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(fdiff), TOLERANCE);
        GA_Error("GA_Sgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Sgemm OK\n\n");
      }
      break;
    case C_DBL:
      ddiff = GA_Ddot(g_b, g_b);
      if (dtmp != 0.0) {
        ddiff /= dtmp;
      }
      if(fabs(ddiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(ddiff), TOLERANCE);
        GA_Error("GA_Dgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Dgemm OK\n\n");
      }
      break;
    case C_DCPL:
      zdiff = GA_Zdot(g_b, g_b);
      if (ztmp.real != 0.0 || ztmp.imag != 0.0) {
        ddiff = sqrt((zdiff.real*zdiff.real+zdiff.imag*zdiff.imag)
            /(ztmp.real*ztmp.real+ztmp.imag*ztmp.imag));
      } else {
        ddiff = sqrt(zdiff.real*zdiff.real+zdiff.imag*zdiff.imag);
      }
      if(fabs(ddiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(zdiff.real), TOLERANCE);
        GA_Error("GA_Zgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Zgemm OK\n\n");
      }
      break;
    case C_SCPL:
      cdiff = GA_Cdot(g_b, g_b);
      if (ctmp.real != 0.0 || ctmp.imag != 0.0) {
        fdiff = sqrt((cdiff.real*cdiff.real+cdiff.imag*cdiff.imag)
            /(ctmp.real*ctmp.real+ctmp.imag*ctmp.imag));
      } else {
        fdiff = sqrt(cdiff.real*cdiff.real+cdiff.imag*cdiff.imag);
      }
      if(fabs(fdiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(cdiff.real), TOLERANCE);
        GA_Error("GA_Cgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Cgemm OK\n\n");
      }
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
#endif

  free(abuf);
  free(bbuf);
  free(cbuf);

  switch (data_type) {
  case C_FLOAT:
    abuf = (void*)malloc(N*N*sizeof(float)/4);
    bbuf = (void*)malloc(N*N*sizeof(float)/4);
    cbuf = (void*)malloc(N*N*sizeof(float)/4);
    break;      
  case C_DBL:
    abuf = (void*)malloc(N*N*sizeof(double)/4);
    bbuf = (void*)malloc(N*N*sizeof(double)/4);
    cbuf = (void*)malloc(N*N*sizeof(double)/4);
    break;    
  case C_DCPL:
    abuf = (void*)malloc(N*N*sizeof(DoubleComplex)/4);
    bbuf = (void*)malloc(N*N*sizeof(DoubleComplex)/4);
    cbuf = (void*)malloc(N*N*sizeof(DoubleComplex)/4);
    break;
  case C_SCPL:
    abuf = (void*)malloc(N*N*sizeof(SingleComplex)/4);
    bbuf = (void*)malloc(N*N*sizeof(SingleComplex)/4);
    cbuf = (void*)malloc(N*N*sizeof(SingleComplex)/4);
    break;
  default:
    GA_Error("wrong data type", data_type);
  }

  /* Test multiply on a fraction of matrix. Start by reinitializing
   * A and B */
  GA_Zero(g_a);
  GA_Zero(g_b);
  GA_Zero(g_c);

  if (me==0) printf("\nTest patch multiply\n");

  lo[0] = N/4;
  lo[1] = N/4;
  hi[0] = 3*N/4-1;
  hi[1] = 3*N/4-1;
  ld = N/2;

  /* Set up matrix A */
  if (me==0) printf("\nInitialize A\n");
  if (me == 0) {
    for (i=N/4; i<3*N/4; i++) {
      for (j=N/4; j<3*N/4; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)abuf)[(i-N/4)*N/2+(j-N/4)] = (float)(i*N+j);
            break;
          case C_DBL:
            ((double*)abuf)[(i-N/4)*N/2+(j-N/4)] = (double)(i*N+j);
            break;
          case C_DCPL:
            ((DoubleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].real = (double)(i*N+j);
            ((DoubleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0;
            break;
          case C_SCPL:
            ((SingleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].real = (float)(i*N+j);
            ((SingleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
    NGA_Put(g_a,lo,hi,abuf,&ld);
  }
  GA_Sync();

  if (me==0) printf("\nInitialize B\n");
  /* Set up matrix B */
  if (me == 0) {
    for (i=N/4; i<3*N/4; i++) {
      for (j=N/4; j<3*N/4; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)bbuf)[(i-N/4)*N/2+(j-N/4)] = (float)(j*N+i);
            break;
          case C_DBL:
            ((double*)bbuf)[(i-N/4)*N/2+(j-N/4)] = (double)(j*N+i);
            break;
          case C_DCPL:
            ((DoubleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].real = (double)(j*N+i);
            ((DoubleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0;
            break;
          case C_SCPL:
            ((SingleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].real = (float)(j*N+i);
            ((SingleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
    NGA_Put(g_b,lo,hi,bbuf,&ld);
  }
  GA_Sync();

  beta_flt = 0.0;
  beta_dbl = 0.0;
  beta_scpl.real = 0.0;
  beta_dcpl.real = 0.0;
  if (me==0) printf("\nPerform matrix multiply on sub-blocks\n");
  switch (data_type) {
    case C_FLOAT:
      NGA_Matmul_patch('N','N',&alpha_flt,&beta_flt,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_DBL:
      NGA_Matmul_patch('N','N',&alpha_dbl,&beta_dbl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_SCPL:
      NGA_Matmul_patch('N','N',&alpha_scpl,&beta_scpl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_DCPL:
      NGA_Matmul_patch('N','N',&alpha_dcpl,&beta_dcpl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
  GA_Sync();
#if 0
  if (0) {
  /*
  if (data_type != C_SCPL && data_type != C_DCPL) {
  */

  if (me==0) printf("\nCheck answer\n");

  /* Multiply buffers by hand */
  if (me == 0) {
    for (i=0; i<N/2; i++) {
      for (j=0; j<N/2; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)cbuf)[i*N/2+j] = fzero;
            break;
          case C_DBL:
            ((double*)cbuf)[i*N/2+j] = dzero;
            break;
          case C_DCPL:
            ((DoubleComplex*)cbuf)[i*N/2+j] = zzero;
            break;
          case C_SCPL:
            ((SingleComplex*)cbuf)[i*N/2+j] = czero;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
        for (k=0; k<N/2; k++) {
          switch (data_type) {
            case C_FLOAT:
              ((float*)cbuf)[i*N/2+j] += ((float*)abuf)[i*N/2+k]
                *((float*)bbuf)[k*N/2+j];
              break;
            case C_DBL:
              ((double*)cbuf)[i*N/2+j] += ((double*)abuf)[i*N/2+k]
                *((double*)bbuf)[k*N/2+j];
              break;
            case C_DCPL:
              ((DoubleComplex*)cbuf)[i*N/2+j].real +=
                (((DoubleComplex*)abuf)[i*N/2+k].real
                 *((DoubleComplex*)bbuf)[k*N/2+j].real
                 -(((DoubleComplex*)abuf)[i*N/2+k].imag
                   *((DoubleComplex*)bbuf)[k*N/2+j].imag));
              ((DoubleComplex*)cbuf)[i*N/2+j].imag +=
                (((DoubleComplex*)abuf)[i*N/2+k].real
                 *((DoubleComplex*)bbuf)[k*N/2+j].imag
                 +(((DoubleComplex*)abuf)[i*N/2+k].imag
                   *((DoubleComplex*)bbuf)[k*N/2+j].real));
              break;
            case C_SCPL:
              ((SingleComplex*)cbuf)[i*N/2+j].real +=
                (((SingleComplex*)abuf)[i*N/2+k].real
                 *((SingleComplex*)bbuf)[k*N/2+j].real
                 -(((SingleComplex*)abuf)[i*N/2+k].imag
                   *((SingleComplex*)bbuf)[k*N/2+j].imag));
              ((SingleComplex*)cbuf)[i*N/2+j].imag +=
                (((SingleComplex*)abuf)[i*N/2+k].real
                 *((SingleComplex*)bbuf)[k*N/2+j].imag
                 +(((SingleComplex*)abuf)[i*N/2+k].imag
                   *((SingleComplex*)bbuf)[k*N/2+j].real));
              break;
            default:
              GA_Error("wrong data type", data_type);
          }
        }
      }
    }
    NGA_Put(g_a,lo,hi,cbuf,&ld);
  }
  if (me == 0) printf("\n\n\n\n");

  /* Get norm of g_a */
  switch (data_type) {
    case C_FLOAT:
      ftmp = NGA_Fdot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi);
      break;
    case C_DBL:
      dtmp = NGA_Ddot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi);
      break;
    case C_DCPL:
      ztmp = NGA_Zdot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi);
      break;
    case C_SCPL:
      ctmp = NGA_Cdot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi);
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
  /* subtract C from A and put the results in B */
  beta_flt = -1.0;
  beta_dbl = -1.0;
  beta_scpl.real = -1.0;
  beta_dcpl.real = -1.0;
  NGA_Zero_patch(g_b,lo,hi);
  NGA_Add_patch(alpha,g_a,lo,hi,beta,g_c,lo,hi,g_b,lo,hi);
  /* evaluate the norm of the difference between the two matrices */
  switch (data_type) {
    case C_FLOAT:
      fdiff = NGA_Fdot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi);
      if (ftmp != 0.0) {
        fdiff /= ftmp;
      }
      if(fabs(fdiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(fdiff), TOLERANCE);
        GA_Error("GA_Sgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Sgemm OK\n\n");
      }
      break;
    case C_DBL:
      ddiff = NGA_Ddot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi);
      if (dtmp != 0.0) {
        ddiff /= dtmp;
      }
      if(fabs(ddiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(ddiff), TOLERANCE);
        GA_Error("GA_Dgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Dgemm OK\n\n");
      }
      break;
    case C_DCPL:
      zdiff = NGA_Zdot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi);
      if (ztmp.real != 0.0 || ztmp.imag != 0.0) {
        ddiff = sqrt((zdiff.real*zdiff.real+zdiff.imag*zdiff.imag)
            /(ztmp.real*ztmp.real+ztmp.imag*ztmp.imag));
      } else {
        ddiff = sqrt(zdiff.real*zdiff.real+zdiff.imag*zdiff.imag);
      }
      if(fabs(ddiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(zdiff.real), TOLERANCE);
        GA_Error("GA_Zgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Zgemm OK\n\n");
      }
      break;
    case C_SCPL:
      cdiff = NGA_Cdot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi);
      if (ctmp.real != 0.0 || ctmp.imag != 0.0) {
        fdiff = sqrt((cdiff.real*cdiff.real+cdiff.imag*cdiff.imag)
            /(ctmp.real*ctmp.real+ctmp.imag*ctmp.imag));
      } else {
        fdiff = sqrt(cdiff.real*cdiff.real+cdiff.imag*cdiff.imag);
      }
      if(fabs(fdiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(cdiff.real), TOLERANCE);
        GA_Error("GA_Cgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Cgemm OK\n\n");
      }
      break;
    default:
      GA_Error("wrong data type", data_type);
  }

  }
#endif
  free(abuf);
  free(bbuf);
  free(cbuf);

  GA_Destroy(g_a);
  GA_Destroy(g_b);
  GA_Destroy(g_c);
}

Пример #23

0

Показать файл

Файл: matrix1.c Проект: MihaiLupoiu/TPP

void matrix_multiply() {
    
    int dims[NDIM], chunk[NDIM], ld[NDIM];
    int lo[NDIM], hi[NDIM], lo1[NDIM], hi1[NDIM];
    int lo2[NDIM], hi2[NDIM], lo3[NDIM], hi3[NDIM];
    int g_a, g_b, g_c, i, j, k, l;
    int me, nprocs;

    /* Find local processor ID and the number of processors */
    /* ### assign processor ID to the int variable "me" and the total number
     * ### of processors to the int variable "nprocs" */
    me     = GA_Nodeid();
    nprocs = GA_Nnodes();
    
    /* Configure array dimensions. Force an unequal data distribution */
    for(i=0; i<NDIM; i++) {
       dims[i]  = TOTALELEMS;
       ld[i]    = dims[i];
       chunk[i] = TOTALELEMS/nprocs-1; /*minimum block size on each process*/
    }
 
    /* create a global array g_a and duplicate it to get g_b and g_c*/
    /* ### create GA of doubles with dimension "NDIM" and size "dims" with
     * ### minimum block size "chunk" and assign the handle to the
     * ### integer variable "g_a". Then create remaining global arrays,
     * ### assigned to the integer handle "g_b" and "g_c" by duplicating
     * ### g_a. Assign the names "Array A", "Array B" and "Array C" to
     * ### "g_a", "g_b", and "g_c". */

    g_a=NGA_Create(C_DBL, NDIM, dims, "array A", chunk);
    

    if (!g_a) GA_Error("create failed: A", NDIM);
    if (me==0) printf("  Created Array A\n");

    g_b=GA_Duplicate(g_a,"array B");
    g_c=GA_Duplicate(g_a,"array C");
    if (!g_b || !g_c) GA_Error("duplicate failed",NDIM);
    if (me==0) printf("  Created Arrays B and C\n");
 
    /* initialize data in matrices a and b */
    if(me==0)printf("  Initializing matrix A and B\n");
    k = 0; l = 7;
    for(i=0; i<dims[0]; i++) {
       for(j=0; j<dims[1]; j++) {
          a[i][j] = (double)(++k%29);
          b[i][j] = (double)(++l%37);
       }
    }

    /*  Copy data to global arrays g_a and g_b */
    lo1[0] = 0;
    lo1[1] = 0;
    hi1[0] = dims[0]-1;
    hi1[1] = dims[1]-1;
    if (me==0) {
      /* ### copy the contents of array "a" into the portion of global array
       * ### "g_a" described by "lo1" and "hi1". Similarly, copy the contents
       * ### of the array "b" into corresponding portion of global array "g_b".
       * ### Use the array of strides "ld" to describe the physical layout of
       * ### arrays "a" and "b". */
      NGA_Put(g_a,lo1,hi1,a,ld);
      NGA_Put(g_b,lo1,hi1,b,ld);

    }
    
    /*  Synchronize all processors to make sure everyone has data */
    /* ### synchronize all processors */

    GA_Sync();


    /* Determine which block of data is locally owned. Note that
       the same block is locally owned for all GAs. */
    /* ### find out which block of data my node ("me") owns for the global
     * ### array "g_c" and store the contents in the integer arrays "lo" and
     * ### "hi". */
    NGA_Distribution(g_c,me,lo,hi);

    
    /* Get the blocks from g_a and g_b needed to compute this block in
       g_c and copy them into the local buffers a and b. */
    lo2[0] = lo[0];
    lo2[1] = 0;
    hi2[0] = hi[0];
    hi2[1] = dims[0]-1;
    /* ### copy the block of data described by the arrays "lo2" and "hi2" from
     * ### the global array "g_a" into the local array "a". Use the array of
     * ### strides "ld" to describe the physical layout of "a". */
    NGA_Get(g_a,lo2,hi2,a,ld);

    lo3[0] = 0;
    lo3[1] = lo[1];
    hi3[0] = dims[1]-1;
    hi3[1] = hi[1];
    /* ### copy the block of data described by the arrays "lo3" and "hi3" from
     * ### the global array "g_b" into the local array "b". Use the array of
     * ### strides "ld" to describe the physical layout of "b". */
     NGA_Get(g_b,lo3,hi3,b,ld);

    /* Do local matrix multiplication and store the result in local
       buffer c. Start by evaluating the transpose of b. */
    for(i=0; i < hi3[0]-lo3[0]+1; i++)
       for(j=0; j < hi3[1]-lo3[1]+1; j++) 
          btrns[j][i] = b[i][j];

    /* Multiply a and b to get c */
    for(i=0; i < hi[0] - lo[0] + 1; i++) {
       for(j=0; j < hi[1] - lo[1] + 1; j++) {
          c[i][j] = 0.0;
          for(k=0; k<dims[0]; k++)
             c[i][j] = c[i][j] + a[i][k]*btrns[j][k];
       }
    }
    
    /* Copy c back to g_c */
    /* ### copy data from the local array "c" into the block of the global
     * ### array "g_c" described by the integer arrays "lo" and "hi". Use
     * ### the array of strides "ld" to describe the physical layout of "c". */
    NGA_Put(g_c,lo,hi,c,ld);

    verify(g_a, g_b, g_c, lo1, hi1, ld);
    
    /* Deallocate arrays */
    /* ### destroy the global arrays "g_a", "g_b", "g_c" */
    GA_Destroy(g_a);
    GA_Destroy(g_b);
    GA_Destroy(g_c);
}

Пример #24

0

Показать файл

Файл: testmatmultc.c Проект: sg0/Elemental

/*
 * test ga_dgemm
 * Note: - change nummax for large arrays
 *       - turn off "dgemm_verify" for large arrays due to memory 
 *         limitations, as dgemm_verify=1 for large arrays produces 
 *         segfault, dumps core,or any crap.
 */
int main(int argc, char **argv)
{
    int num_m;
    int num_n;
    int num_k;
    int i;
    int ii;
    double *h0;
    int g_c;
    int g_b;
    int g_a;
    double a;
    double t1;
    double mf;
    double avg_t[ntrans];
    double avg_mf[ntrans];
    int itime;
    int ntimes;
    int nums_m[/*howmany*/] = {512,1024};
    int nums_n[/*howmany*/] = {512,1024};
    int nums_k[/*howmany*/] = {512,1024};
    char transa[/*ntrans*/] = "ntnt";
    char transb[/*ntrans*/] = "nntt";
    char ta;
    char tb;
    double *tmpa;
    double *tmpb;
    double *tmpc;
    int ndim;
    int dims[2];
#ifdef BLOCK_CYCLIC
    int block_size[2];
#endif

#if defined(USE_ELEMENTAL)
    // initialize Elemental (which will initialize MPI)
    ElInitialize( &argc, &argv );
    ElMPICommRank( MPI_COMM_WORLD, &me );
    ElMPICommSize( MPI_COMM_WORLD, &nproc );
    // instantiate el::global array
    ElGlobalArraysConstruct_d( &eldga );
    // initialize global arrays
    ElGlobalArraysInitialize_d( eldga );
#else
    MP_INIT(argc,argv);
    if (!MA_init(MT_DBL,1,20000000)) {
        GA_Error("failed: ma_init(MT_DBL,1,20000000)",10);
    }
    GA_INIT(argc,argv);
    me = GA_Nodeid();
#endif

    h0 = (double*)malloc(sizeof(double) * nummax*nummax);
    tmpa = (double*)malloc(sizeof(double) * nummax*nummax);
    tmpb = (double*)malloc(sizeof(double) * nummax*nummax);
    tmpc = (double*)malloc(sizeof(double) * nummax*nummax);

    ii = 0;
    for (i=0; i<nummax*nummax; i++) {
        ii = ii + 1;
        if (ii > nummax) {
            ii = 0;
        }
        h0[i] = ii;
    }

    /* Compute times assuming 500 mflops and 5 second target time */
    /* ntimes = max(3.0d0,5.0d0/(4.0d-9*num**3)); */
    ntimes = 5;

    for (ii=0; ii<howmany; ii++) {
        num_m = nums_m[ii];
        num_n = nums_n[ii];
        num_k = nums_k[ii];
        a = 0.5/(num_m*num_n);
        if (num_m > nummax || num_n > nummax || num_k > nummax) {
            GA_Error("Insufficient memory: check nummax", 1);
        }

#ifndef BLOCK_CYCLIC
        ndim = 2;

	/*
        dims[0] = num_m;
        dims[1] = num_n;
	*/
        dims[1] = num_m;
        dims[0] = num_n;

#if defined(USE_ELEMENTAL)
        ElGlobalArraysCreate_d( eldga, ndim, dims, "g_c", NULL, &g_c );
#else
        if (!((g_c = NGA_Create(MT_DBL,ndim,dims,"g_c",NULL)))) {
            GA_Error("failed: create g_c",20);
        }
#endif
	/*
        dims[0] = num_k;
        dims[1] = num_n;
	*/
        dims[1] = num_k;
        dims[0] = num_n;
#if defined(USE_ELEMENTAL)
        ElGlobalArraysCreate_d( eldga, ndim, dims, "g_b", NULL, &g_b );
#else
        if (!((g_b = NGA_Create(MT_DBL,ndim,dims,"g_b",NULL)))) {
            GA_Error("failed: create g_b",30);
        }
#endif
	/*
        dims[0] = num_m;
        dims[1] = num_k;
	*/
        dims[1] = num_m;
        dims[0] = num_k;
#if defined(USE_ELEMENTAL)
        ElGlobalArraysCreate_d( eldga, ndim, dims, "g_a", NULL, &g_a );
#else
        if (!((g_a = NGA_Create(MT_DBL,ndim,dims,"g_a",NULL)))) {
            GA_Error("failed: create g_a",40);
        }
#endif
#else
        ndim = 2;
        block_size[0] = 128;
        block_size[1] = 128;

        dims[0] = num_m;
        dims[1] = num_n;
        g_c = GA_Create_handle();
        GA_Set_data(g_c,ndim,dims,MT_DBL);
        GA_Set_array_name(g_c,"g_c");
        GA_Set_block_cyclic(g_c,block_size);
        if (!GA_Allocate(g_c)) {
            GA_Error("failed: create g_c",40);
        }

        dims[0] = num_k;
        dims[1] = num_n;
        g_b = GA_Create_handle();
        GA_Set_data(g_b,ndim,dims,MT_DBL);
        GA_Set_array_name(g_b,"g_b");
        GA_Set_block_cyclic(g_b,block_size);
        if (!ga_allocate(g_b)) {
            GA_Error("failed: create g_b",40);
        }

        dims[0] = num_m;
        dims[1] = num_k;
        g_a = GA_Create_handle();
        GA_Set_data(g_a,ndim,dims,MT_DBL);
        GA_Set_array_name(g_a,"g_a");
        GA_Set_block_cyclic(g_a,block_size);
        if (!ga_allocate(g_a)) {
            GA_Error('failed: create g_a',40);
        }
#endif         

        /* Initialize matrices A and B */
        if (me == 0) { 
            load_ga(g_a, h0, num_m, num_k);
            load_ga(g_b, h0, num_k, num_n);
        }
#if defined(USE_ELEMENTAL)
        double zero = 0.0;
        ElGlobalArraysFill_d( eldga, g_c, &zero );
	ElGlobalArraysSync_d( eldga );
#else
        GA_Zero(g_c);
        GA_Sync();
#endif
#if defined(USE_ELEMENTAL)
        if (me == 0) {
#else
        if (GA_Nodeid() == 0) {
#endif
            printf("\nMatrix Multiplication on C = A[%ld,%ld]xB[%ld,%ld]\n",
                    (long)num_m, (long)num_k, (long)num_k, (long)num_n);
            fflush(stdout);
        }

        for (i=0; i<ntrans; i++) {
            avg_t[i]  = 0.0;
            avg_mf[i] = 0.0;
        }

        for (itime=0; itime<ntimes; itime++) {
            for (i=0; i<ntrans; i++) {
#if defined(USE_ELEMENTAL)
	        ElGlobalArraysSync_d( eldga );
#else
                GA_Sync();
#endif
                ta = transa[i];
                tb = transb[i];
                t1 = MP_TIMER();
#if defined(USE_ELEMENTAL)
		ElGlobalArraysDgemm_d( eldga, ta, tb, num_m, num_n, num_k, 1.0, g_a, g_b, 0.0, g_c );
#else
                GA_Dgemm(ta,tb,num_m,num_n,num_k,1.0, g_a, g_b, 0.0, g_c);
#endif
                t1 = MP_TIMER() - t1;
#if defined(USE_ELEMENTAL)
                if (me == 0) {
#else
                if (GA_Nodeid() == 0) {
#endif
#if defined(USE_ELEMENTAL)
                    mf = 2e0*num_m*num_n*num_k/t1*1e-6/nproc;
#else
                    mf = 2e0*num_m*num_n*num_k/t1*1e-6/GA_Nnodes();
#endif
                    avg_t[i]  = avg_t[i]+t1;
                    avg_mf[i] = avg_mf[i] + mf;
                    printf("%15s%2d: %12.4f seconds %12.1f mflops/proc  %c %c\n",
                            "Run#", itime, t1, mf, ta, tb);
                    fflush(stdout);
                    if (dgemm_verify && itime == 0) {
                        /* recall the C API swaps the matrix order */
                        /* we swap it here for the Fortran-based verify */
                        verify_ga_dgemm(tb, ta, num_n, num_m, num_k, 1.0,
                                g_b, g_a, 0.0, g_c, tmpb, tmpa, tmpc);
                    }
                }
            }
        }
#if defined(USE_ELEMENTAL)
        if (me == 0) {
#else
        if (GA_Nodeid() == 0) {
#endif
            printf("\n");
            for (i=0; i<ntrans; i++) {
                printf("%17s: %12.4f seconds %12.1f mflops/proc  %c %c\n",
                        "Average", avg_t[i]/ntimes, avg_mf[i]/ntimes,
                        transa[i], transb[i]);
            }
            if(dgemm_verify) {
                printf("All GA_Dgemms are verified...O.K.\n");
            }
            fflush(stdout);
        }

        /*
           GA_Print(g_a);
           GA_Print(g_b);
           GA_Print(g_c);
           */
#if defined(USE_ELEMENTAL)
        ElGlobalArraysDestroy_d( eldga, g_a );
        ElGlobalArraysDestroy_d( eldga, g_b );
        ElGlobalArraysDestroy_d( eldga, g_c );
#else
        GA_Destroy(g_c);
        GA_Destroy(g_b);
        GA_Destroy(g_a);
#endif
    }

    /* ???
       format(a15, i2, ': ', e12.4, ' seconds ',f12.1, 
       .     ' mflops/proc ', 3a2)
       */
#if defined(USE_ELEMENTAL)
    if (me == 0) {
#else
    if (GA_Nodeid() == 0) {
#endif
        printf("All tests successful\n");
    }

    free(h0);
    free(tmpa);
    free(tmpb);
    free(tmpc);
#if defined(USE_ELEMENTAL)
    // call el::global arrays destructor
    ElGlobalArraysTerminate_d( eldga );
    ElGlobalArraysDestruct_d( eldga );
    ElFinalize();
#else
    GA_Terminate();
    MP_FINALIZE();
#endif
    return 0;
}


/*
 * Verify for correctness. Process 0 computes BLAS dgemm 
 * locally. For larger arrays, disbale this test as memory
 * might not be sufficient
 */
void verify_ga_dgemm(char xt1, char xt2, int num_m, int num_n, int num_k,
        double alpha, int g_a, int g_b, double beta, int g_c,
        double *tmpa, double *tmpb, double *tmpc)
{
    int i,j,type,ndim,dims[2],lo[2],hi[2];
    double abs_value;

    for (i=0; i<num_n; i++) {
        for (j=0; j<num_m; j++) {
            tmpc[j+i*num_m] = -1.0;
            tmpa[j+i*num_m] = -2.0;
        }
    }

#if defined(USE_ELEMENTAL)
    ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims );
#else
    NGA_Inquire(g_a, &type, &ndim, dims);
#endif
    lo[0] = 0;
    lo[1] = 0;
    hi[0] = dims[0]-1;
    hi[1] = dims[1]-1;
#if defined(USE_ELEMENTAL)
    ElGlobalArraysGet_d( eldga, g_a, lo, hi, tmpa, &dims[1] );
#else
    NGA_Get(g_a, lo, hi, tmpa, &dims[1]);
#endif

#if defined(USE_ELEMENTAL)
    ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims );
#else
    NGA_Inquire(g_a, &type, &ndim, dims);
#endif
    lo[0] = 0;
    lo[1] = 0;
    hi[0] = dims[0]-1;
    hi[1] = dims[1]-1;
#if defined(USE_ELEMENTAL)
    ElGlobalArraysGet_d( eldga, g_b, lo, hi, tmpb, &dims[1] );
#else
    NGA_Get(g_b, lo, hi, tmpb, &dims[1]);
#endif

    /* compute dgemm sequentially */
#if defined(USE_ELEMENTAL)
    cblas_dgemm ( CblasRowMajor, ( xt1 == 'n'? CblasNoTrans: CblasTrans ), 
	    ( xt2 == 'n'? CblasNoTrans: CblasTrans ), 
	    num_m /* M */, num_n /* N */, num_k /* K */, 
	    alpha, tmpa, num_m, /* lda */ 
	    tmpb, num_k, /* ldb */ beta, 
	    tmpc, num_m /* ldc */);
#else
    xb_dgemm(&xt1, &xt2, &num_m, &num_n, &num_k,
            &alpha, tmpa, &num_m,
            tmpb, &num_k, &beta,
            tmpc, &num_m);
#endif

    /* after computing c locally, verify it with the values in g_c */

#if defined(USE_ELEMENTAL)
    ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims );
#else
    NGA_Inquire(g_a, &type, &ndim, dims);
#endif
    lo[0] = 0;
    lo[1] = 0;
    hi[0] = dims[0]-1;
    hi[1] = dims[1]-1;
#if defined(USE_ELEMENTAL)
    ElGlobalArraysGet_d( eldga, g_c, lo, hi, tmpa, &dims[1] );
#else
    NGA_Get(g_c, lo, hi, tmpa, &dims[1]);
#endif

    for (i=0; i<num_n; i++) {
        for (j=0; j<num_m; j++) {
            abs_value = fabs(tmpc[j+i*num_m]-tmpa[j+i*num_m]);
            if(abs_value > 1.0 || abs_value < -1.0) {
                printf("Values are = %f %f\n",
                        tmpc[j+i*num_m], tmpa[j+i*num_m]);
                printf("Values are = %f %f\n", 
                        fabs(tmpc[j+i*num_m]-tmpa[j*i*num_m]), abs_value);
                fflush(stdout);
                GA_Error("verify ga_dgemm failed", 1);
            }
        }
    }
}

/**
 * called by process '0' (or your master process )
 */
void load_ga(int handle, double *f, int dim1, int dim2)
{
      int lo[2], hi[2];
      
      if (dim1 < 0 || dim2 < 0) {
          return;
      }

      lo[0] = 0;
      lo[1] = 0;
      hi[0] = dim1-1;
      hi[1] = dim2-1;
#if defined(USE_ELEMENTAL)
      ElGlobalArraysPut_d( eldga, handle, lo, hi, f, &dim1 );
#else
      NGA_Put(handle, lo, hi, f, &dim1);
#endif
}

Пример #25

0

Показать файл

Файл: gada.c Проект: adrielb/DCell

PetscErrorCode testCreate3D(  )
{
  int ga;
  DA da;
  DALocalInfo info;
  Vec vec;
  PetscErrorCode ierr;
  
  PetscFunctionBegin;
  int d1 = 229, d2 = 229, d3 = 229;
  int rank;
  MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
  ierr = DACreate3d(PETSC_COMM_WORLD,DA_NONPERIODIC,DA_STENCIL_STAR,
              d1,d2,d3,
              PETSC_DECIDE,PETSC_DECIDE,PETSC_DECIDE,
              1,1,
              0,0,0, &da); CHKERRQ(ierr);
  ierr = DAGetLocalInfo(da,&info); CHKERRQ(ierr);
  ierr = DACreateGlobalArray( da, &ga, &vec); CHKERRQ(ierr);
  
  PetscReal ***v;
  ierr = DAVecGetArray(da,vec,&v); CHKERRQ(ierr);
  int xe = info.xs+info.xm,
      ye = info.ys+info.ym,
      ze = info.zs+info.zm;
  for (int k = info.zs; k < ze; ++k) {
    for (int j = info.ys; j < ye; ++j) {
      for (int i = info.xs; i < xe; ++i) {
        v[k][j][i] = 1.*i + d1*j + d1*d2*k;
      }
    }
  }
  ierr = DAVecRestoreArray(da,vec,&v); CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_WORLD, "Sequential values filled in petsc vec.\n"); CHKERRQ(ierr);
  
  ierr = PetscBarrier(0); CHKERRQ(ierr);
  int lo[3],ld, p = 10;
  int patch[10][10][10];
  double val;
  for (int k = 0; k < d3; k+=p) {
    for (int j = 0; j < d2; j+=p) {
      for (int i = 0; i < d1; i+=p) {
        lo[0] = k;
        lo[1] = j;
        lo[2] = i;
        NGA_Get(ga,lo,lo,&val,&ld);
        if( PetscAbs( i + d1*j + d1*d2*k - val) > .1 )
//          printf(".");
          printf("(%3.0f,%3.0f) ", 1.*i + d1*j + d1*d2*k, val);
      }
    }
  }
  ierr = PetscPrintf(PETSC_COMM_WORLD, "Ended NGA_Get() test.\n"); CHKERRQ(ierr);
  
  ierr = PetscBarrier(0); CHKERRQ(ierr);
  if( rank == 0 )
  {
    for (int k = 0; k < d3; ++k) {
      printf(">%d\n",k);
      for (int j = 0; j < d2; ++j) {
        for (int i = 0; i < d1; ++i) {
          lo[0] = k; lo[1] = j; lo[2] = i;
          val = 1.*i + d1*j + d1*d2*k;
          val *= -1;
          NGA_Put(ga,lo,lo,&val,&ld);
        }
      }
    }
  }
  ierr = PetscPrintf(PETSC_COMM_WORLD, "Ended NGA_Put() negative seq values.\n"); CHKERRQ(ierr);
  
  ierr = PetscBarrier(0); CHKERRQ(ierr);
  ierr = DAVecGetArray(da,vec,&v); CHKERRQ(ierr);
  for (int k = info.zs; k < ze; ++k) {
    for (int j = info.ys; j < ye; ++j) {
      for (int i = info.xs; i < xe; ++i) {
        val = -1 * (1.*i + d1*j + d1*d2*k);
        if( PetscAbs( val - v[k][j][i] ) > .1 )
          printf(".");
      }
    }
  }
  ierr = DAVecRestoreArray(da,vec,&v); CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_WORLD, "Ended petsc vec update test.\n"); CHKERRQ(ierr);
  
  if( rank == 0 )
    GA_Print_stats();
  
  ierr = VecDestroy(vec); CHKERRQ(ierr);
  GA_Destroy(ga);
  PetscFunctionReturn(0);
}

Пример #26

0

Показать файл

Файл: testc.c Проект: jeffhammond/ga

void do_work()
{
int ONE=1 ;   /* useful constants */
int g_a, g_b;
int n=N, type=MT_F_DBL;
int me=GA_Nodeid(), nproc=GA_Nnodes();
int i, row;
int dims[2]={N,N};
int lo[2], hi[2], ld;

/* Note: on all current platforms DoublePrecision == double */
double buf[N], err, alpha, beta;

     if(me==0)printf("Creating matrix A\n");
     g_a = NGA_Create(type, 2, dims, "A", NULL);
     if(!g_a) GA_Error("create failed: A",n); 
     if(me==0)printf("OK\n");

     if(me==0)printf("Creating matrix B\n");
     /* create matrix B  so that it has dims and distribution of A*/
     g_b = GA_Duplicate(g_a, "B");
     if(! g_b) GA_Error("duplicate failed",n); 
     if(me==0)printf("OK\n");

     GA_Zero(g_a);   /* zero the matrix */

     if(me==0)printf("Initializing matrix A\n");
     /* fill in matrix A with random values in range 0.. 1 */ 
     lo[1]=0; hi[1]=n-1;
     for(row=me; row<n; row+= nproc){
         /* each process works on a different row in MIMD style */
         lo[0]=hi[0]=row;   
         for(i=0; i<n; i++) buf[i]=sin((double)i + 0.1*(row+1));
         NGA_Put(g_a, lo, hi, buf, &n);
     }


     if(me==0)printf("Symmetrizing matrix A\n");
     GA_Symmetrize(g_a);   /* symmetrize the matrix A = 0.5*(A+A') */
   

     /* check if A is symmetric */ 
     if(me==0)printf("Checking if matrix A is symmetric\n");
     GA_Transpose(g_a, g_b); /* B=A' */
     alpha=1.; beta=-1.;
     GA_Add(&alpha, g_a, &beta, g_b, g_b);  /* B= A - B */
     err= GA_Ddot(g_b, g_b);
     
     if(me==0)printf("Error=%f\n",(double)err);
     
     if(me==0)printf("\nChecking atomic accumulate \n");

     GA_Zero(g_a);   /* zero the matrix */
     for(i=0; i<n; i++) buf[i]=(double)i;

     /* everybody accumulates to the same location/row */
     alpha = 1.0;
     row = n/2;
     lo[0]=hi[0]=row;
     lo[1]=0; hi[1]=n-1;
     ld = hi[1]-lo[1]+1;
     NGA_Acc(g_a, lo, hi, buf, &ld, &alpha );
     GA_Sync();

     if(me==0){ /* node 0 is checking the result */

        NGA_Get(g_a, lo, hi, buf,&ld);
        for(i=0; i<n; i++) if(buf[i] != (double)nproc*i)
           GA_Error("failed: column=",i);
        printf("OK\n\n");

     }
     
     GA_Destroy(g_a);
     GA_Destroy(g_b);
}

Пример #27

0

Показать файл

Файл: ga_lu.c Проект: dmlb2000/nwchem-cml

/* input is matrix size */
void ga_lu(double *A, int matrix_size) 
{
    int g_a, g_b, dims[2], type=C_DBL;
    int lo[2], hi[2], ld;
    int block_size[2], proc_grid[2];
    double time, gflops;
    
    /* create a 2-d GA (global matrix) */
    dims[0] = matrix_size;
    dims[1] = matrix_size;
    block_size[0] = BLOCK_SIZE;
    block_size[1] = BLOCK_SIZE;
#ifdef USE_SCALAPACK_DISTR
    proc_grid[0] = 2;
    proc_grid[1] = nprocs/2;
    if(nprocs%2) GA_Error("For ScaLAPACK stle distribution, nprocs must be "
                         " divisible by 2", 0);
#endif
    
    
#ifndef BLOCK_CYCLIC
    g_a = NGA_Create(type, 2, dims, "A", NULL);
    g_b = GA_Duplicate(g_a, "transposed array B");
#else
    g_a = GA_Create_handle();
    GA_Set_data(g_a, 2, dims, type);
    GA_Set_array_name(g_a,"A");
#  ifdef USE_SCALAPACK_DISTR
    GA_Set_block_cyclic_proc_grid(g_a, block_size, proc_grid);
#  else
    GA_Set_block_cyclic(g_a, block_size);    
#  endif
    GA_Allocate(g_a);
    
    g_b = GA_Create_handle();
    GA_Set_data(g_b, 2, dims, type);
    GA_Set_array_name(g_b,"B");
#  ifdef USE_SCALAPACK_DISTR
    GA_Set_block_cyclic_proc_grid(g_b, block_size, proc_grid);
#  else
    GA_Set_block_cyclic(g_b, block_size);
#  endif
    GA_Allocate(g_b);
    
#endif
    
    /* copy the local matrix into GA */
    if(me==0) 
    {
       lo[0] = 0;
       hi[0] = matrix_size - 1;
       lo[1] = 0;
       hi[1] = matrix_size - 1;
       ld    = matrix_size;
       
       NGA_Put(g_a, lo, hi, A, &ld);
    }
    GA_Sync();

    GA_Transpose(g_a, g_b);
    time = CLOCK_();
    GA_Lu('n', g_b);
    time = CLOCK_() - time;

    /* 2/3 N^3 - 1/2 N^2 flops for LU and 2*N^2 for solver */
    gflops = ( (((double)matrix_size) * matrix_size)/(time*1.0e+9) *
               (2.0/3.0 * (double)matrix_size - 0.5) );
    if(me==0) printf("\nGA_Lu: N=%d flops=%2.5e Gflops, time=%2.5e secs\n\n",
                     matrix_size, gflops, time);

#if DEBUG
    GA_Print(g_a);
    GA_Print(g_b);
#endif
    /* if(me==0) lu(A, matrix_size);     */

    GA_Destroy(g_a);
    GA_Destroy(g_b);
}

Пример #28

0

Показать файл

Файл: test_sum.c Проект: niuqingpeng/microbenchmark

int main(int argc, char**argv)
{
  int nprocs, me;
  int i,j;
  MPI_Init(&argc,&argv);
  MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
  MPI_Comm_rank(MPI_COMM_WORLD,&me);
  GA_Initialize();
  int flag=1;
  int Nx=108; int Ny=108; int Nz = 108;
  Nx+=3; Ny+=3; Nz+=3;
  int data[Nx*Ny*Nz];
  int num_splines = 32;
  int g_a,dims[4]={Nx,Ny,Nz,num_splines},chunk[4]={-1,-1,-1,num_splines};
  int type=C_INT;
  g_a=NGA_Create(type,4,dims,"Coefs",chunk);
  int lo[4],hi[4],ld[3];
  double value=9.0;
  GA_Fill(g_a,&value);
  GA_Print_distribution(g_a);
  fflush(stdout);
  if(me==0)
  {
      for (i=0; i<num_splines; i++) 
      {
          for (j=0; j<Nx*Ny*Nz; j++) 
              data[j] = rand()%1000;
          lo[0]=lo[1]=lo[2]=0;
          hi[0]=Nx-1;hi[1]=Ny-1;hi[2]=Nz-1;
          lo[3]=hi[3]=i%num_splines;
          ld[0]=Ny;ld[1]=Nz;ld[2]=1;
          NGA_Put(g_a,lo,hi,data,ld);
      }
  }
  printf("done\n"),fflush(stdout);
  GA_Sync();
  ga_coefs_t *ga_coefs = malloc(sizeof(ga_coefs_t));
  ga_coefs->Mx = Nx; 
  ga_coefs->My = Ny;
  ga_coefs->Mz = Nz;
  ga_coefs->nsplines = num_splines;
  ga_coefs->g_a=g_a;
  int *coefs1 = (int*)malloc((size_t)1*sizeof(int)*4*4*4*num_splines);
  int ix,iy,iz;
  Nx-=3; Ny-=3; Nz-=3;
  ga_coefs->sumt=ga_coefs->amount=0;
  NGA_Distribution(g_a,me,lo,hi);
  int low[16][4],high[16][4];
  for(i=0;i<nprocs;i++)
      NGA_Distribution(g_a,i,low[i],high[i]);
  srand ( time(NULL) );
  int k;
  for(k=0;k<nprocs;k++)
  {
      ga_coefs->sumt=ga_coefs->amount=0;
      {
          for(i=0;i<1000;i++)
          {
              ix=rand_index(low[k][0],high[k][0]);
              iy=rand_index(low[k][1],high[k][1]);
              iz=rand_index(low[k][2],high[k][2]);
              coefs_ga_get_3d(ga_coefs,coefs1,ix,iy,iz);
              mini_cube_sum(coefs1, ga_coefs->nsplines);
          }
      }
      printf("<%d,%d>\t %lf \t %d \t %lf\n", me,k, ga_coefs->sumt, ga_coefs->amount, ga_coefs->sumt/ga_coefs->amount),fflush(stdout);
  }
  free(coefs1);
  GA_Terminate();
  MPI_Finalize();
  return 0;
}

Пример #29

0

Показать файл

Файл: adjacency_list.cpp Проект: Anastien/GridPACK

// -------------------------------------------------------------
// AdjacencyList::ready
// -------------------------------------------------------------
void
AdjacencyList::ready(void)
{
#if 1
  int grp = this->communicator().getGroup();
  int me = GA_Pgroup_nodeid(grp);
  int nprocs = GA_Pgroup_nnodes(grp);
  p_adjacency.clear();
  p_adjacency.resize(p_global_nodes.size());

  // Find total number of nodes and edges. Assume no duplicates
  int nedges = p_edges.size();
  int total_edges = nedges;
  char plus[2];
  strcpy(plus,"+");
  GA_Pgroup_igop(grp,&total_edges, 1, plus);
  int nnodes = p_original_nodes.size();
  int total_nodes = nnodes;
  GA_Pgroup_igop(grp,&total_nodes, 1, plus);

  // Create a global array containing original indices of all nodes and indexed
  // by the global index of the node
  int i, p;
  int dist[nprocs];
  for (p=0; p<nprocs; p++) {
    dist[p] = 0;
  }
  dist[me] = nnodes;
  GA_Pgroup_igop(grp,dist,nprocs,plus);
  int *mapc = new int[nprocs+1];
  mapc[0] = 0;
  for (p=1; p<nprocs; p++) {
    mapc[p] = mapc[p-1] + dist[p-1];
  }
  mapc[nprocs] = total_nodes;
  int g_nodes = GA_Create_handle();
  int dims = total_nodes;
  NGA_Set_data(g_nodes,1,&dims,C_INT);
  NGA_Set_pgroup(g_nodes, grp);
  if (!GA_Allocate(g_nodes)) {
    char buf[256];
    sprintf(buf,"AdjacencyList::ready: Unable to allocate distributed array"
        " for bus indices\n");
    printf(buf);
    throw gridpack::Exception(buf);
  }
  int lo, hi;
  lo = mapc[me];
  hi = mapc[me+1]-1;
  int size = hi - lo + 1;
  int o_idx[size], g_idx[size];
  for (i=0; i<size; i++) o_idx[i] = p_original_nodes[i]; 
  for (i=0; i<size; i++) g_idx[i] = p_global_nodes[i]; 
  int **indices= new int*[size];
  int *iptr = g_idx;
  for (i=0; i<size; i++) {
    indices[i] = iptr;
    iptr++;
  }
  if (size > 0) NGA_Scatter(g_nodes,o_idx,indices,size);
  GA_Pgroup_sync(grp);
  delete [] indices;
  delete [] mapc;

  // Cycle through all nodes and match them up with nodes at end of edges.
  for (p=0; p<nprocs; p++) {
    int iproc = (me+p)%nprocs;
    // Get node data from process iproc
    NGA_Distribution(g_nodes,iproc,&lo,&hi);
    size = hi - lo + 1;
    if (size <= 0) continue;
    int *buf = new int[size];
    int ld = 1;
    NGA_Get(g_nodes,&lo,&hi,buf,&ld);
    // Create a map of the nodes from process p
    std::map<int,int> nmap;
    std::map<int,int>::iterator it;
    std::pair<int,int> pr;
    for (i=lo; i<=hi; i++){
      pr = std::pair<int,int>(buf[i-lo],i);
      nmap.insert(pr);
    }
    delete [] buf;
    // scan through the edges looking for matches. If there is a match, set the
    // global index
    int idx;
    for (i=0; i<nedges; i++) {
      idx = static_cast<int>(p_edges[i].original_conn.first);
      it = nmap.find(idx);
      if (it != nmap.end()) {
        p_edges[i].global_conn.first = static_cast<Index>(it->second);
      }
      idx = static_cast<int>(p_edges[i].original_conn.second);
      it = nmap.find(idx);
      if (it != nmap.end()) {
        p_edges[i].global_conn.second = static_cast<Index>(it->second);
      }
    }
  }
  GA_Destroy(g_nodes);

  // All edges now have global indices assigned to them. Begin constructing
  // adjacency list. Start by creating a global array containing all edges
  dist[0] = 0;
  for (p=1; p<nprocs; p++) {
    double max = static_cast<double>(total_edges);
    max = (static_cast<double>(p))*(max/(static_cast<double>(nprocs)));
    dist[p] = 2*(static_cast<int>(max));
  }
  int g_edges = GA_Create_handle();
  dims = 2*total_edges;
  NGA_Set_data(g_edges,1,&dims,C_INT);
  NGA_Set_irreg_distr(g_edges,dist,&nprocs);
  NGA_Set_pgroup(g_edges, grp);
  if (!GA_Allocate(g_edges)) {
    char buf[256];
    sprintf(buf,"AdjacencyList::ready: Unable to allocate distributed array"
        " for branch indices\n");
    printf(buf);
    throw gridpack::Exception(buf);
  }

  // Add edge information to global array. Start by figuring out how much data
  // is associated with each process
  for (p=0; p<nprocs; p++) {
    dist[p] = 0;
  }
  dist[me] = nedges;
  GA_Pgroup_igop(grp,dist, nprocs, plus);
  int offset[nprocs];
  offset[0] = 0;
  for (p=1; p<nprocs; p++) {
    offset[p] = offset[p-1] + 2*dist[p-1];
  }
  // Figure out where local data goes in GA and then copy it to GA
  lo = offset[me];
  hi = lo + 2*nedges - 1;
  int edge_ids[2*nedges];
  for (i=0; i<nedges; i++) {
    edge_ids[2*i] = static_cast<int>(p_edges[i].global_conn.first);
    edge_ids[2*i+1] = static_cast<int>(p_edges[i].global_conn.second);
  }
  if (lo <= hi) {
    int ld = 1;
    NGA_Put(g_edges,&lo,&hi,edge_ids,&ld);
  }
  GA_Pgroup_sync(grp);

  // Cycle through all edges and find out how many are attached to the nodes on
  // your process. Start by creating a map between the global node indices and
  // the local node indices
  std::map<int,int> gmap;
  std::map<int,int>::iterator it;
  std::pair<int,int> pr;
  for (i=0; i<nnodes; i++){
    pr = std::pair<int,int>(static_cast<int>(p_global_nodes[i]),i);
    gmap.insert(pr);
  }
  // Cycle through edge information on each processor
  for (p=0; p<nprocs; p++) {
    int iproc = (me+p)%nprocs;
    NGA_Distribution(g_edges,iproc,&lo,&hi);
    int size = hi - lo + 1;
    int *buf = new int[size];
    int ld = 1;
    NGA_Get(g_edges,&lo,&hi,buf,&ld);
    BOOST_ASSERT(size%2 == 0);
    size = size/2;
    int idx1, idx2;
    Index idx;
    for (i=0; i<size; i++) {
      idx1 = buf[2*i];
      idx2 = buf[2*i+1];
      it = gmap.find(idx1);
      if (it != gmap.end()) {
        idx = static_cast<Index>(idx2);
        p_adjacency[it->second].push_back(idx);
      }
      it = gmap.find(idx2);
      if (it != gmap.end()) {
        idx = static_cast<Index>(idx1);
        p_adjacency[it->second].push_back(idx);
      }
    }
    delete [] buf;
  }
  GA_Destroy(g_edges);
  GA_Pgroup_sync(grp);
#else
  int me(this->processor_rank());
  int nproc(this->processor_size());

  p_adjacency.clear();
  p_adjacency.resize(p_nodes.size());

  IndexVector current_indexes;
  IndexVector connected_indexes;

  for (int p = 0; p < nproc; ++p) {

    // broadcast the node indexes owned by process p to all processes,
    // all processes work on these at once

    current_indexes.clear();
    if (me == p) {
      std::copy(p_nodes.begin(), p_nodes.end(), 
	  std::back_inserter(current_indexes));
      // std::cout << me << ": node indexes: ";
      // std::copy(current_indexes.begin(), current_indexes.end(),
      //           std::ostream_iterator<Index>(std::cout, ","));
      // std::cout << std::endl;
    }
    boost::mpi::broadcast(this->communicator(), current_indexes, p);

    // make a copy of the local edges in a list (so it's easier to
    // remove those completely accounted for)
    std::list<p_Edge> tmpedges;
    std::copy(p_edges.begin(), p_edges.end(), 
	std::back_inserter(tmpedges));

    // loop over the process p's node index set

    int local_index(0);
    for (IndexVector::iterator n = current_indexes.begin(); 
	n != current_indexes.end(); ++n, ++local_index) {

      // determine the local edges that refer to the current node index

      connected_indexes.clear();
      std::list<p_Edge>::iterator e(tmpedges.begin());
      //      std::cout << me << ": current node index: " << *n 
      //                << ", edges: " << tmpedges.size() 
      //                << std::endl;

      while (e != tmpedges.end()) {
	if (*n == e->conn.first && e->conn.second != bogus) {
	  connected_indexes.push_back(e->conn.second);
	  e->found.first = true;
	  // std::cout << me << ": found connection: edge " << e->index
	  //           << " (" << e->conn.first << ", " << e->conn.second << ")"
	  //           << std::endl;
	}
	if (*n == e->conn.second && e->conn.first != bogus) {
	  connected_indexes.push_back(e->conn.first);
	  e->found.second = true;
	  // std::cout << me << ": found connection: edge " << e->index
	  //           << " (" << e->conn.first << ", " << e->conn.second << ")"
	  //           << std::endl;
	}

	if (e->found.first && e->found.second) {
	  e = tmpedges.erase(e);
	} else if (e->conn.first == bogus || 
	    e->conn.second == bogus) {
	  e = tmpedges.erase(e);
	} else {
	  ++e;
	}
      }

      // gather all connections for the current node index to the
      // node's owner process, we have to gather the vectors because
      // processes will have different numbers of connections

      if (me == p) {
	size_t allsize;
        boost::mpi::reduce(this->communicator(), 
                           connected_indexes.size(), allsize, std::plus<size_t>(), p);

	std::vector<IndexVector> all_connected_indexes;
        boost::mpi::gather(this->communicator(), 
                           connected_indexes, all_connected_indexes, p);
	p_adjacency[local_index].clear();
	for (std::vector<IndexVector>::iterator k = all_connected_indexes.begin();
	    k != all_connected_indexes.end(); ++k) {
	  std::copy(k->begin(), k->end(), 
	      std::back_inserter(p_adjacency[local_index]));
	}
      } else {
	boost::mpi::reduce(this->communicator(), 
                           connected_indexes.size(), std::plus<size_t>(), p);
	boost::mpi::gather(this->communicator(), connected_indexes, p);
      }
      this->communicator().barrier();
    }
    this->communicator().barrier();
  }
#endif
}

Пример #30

0

Показать файл

Файл: transpose.c Проект: timasjov/parallel-computing-ga-mpi

void TRANSPOSE1D() {
    
    int ndim, dims[1], chunk[1], ld[1], lo[1], hi[1];
    int lo1[1], hi1[1], lo2[1], hi2[1];
    int g_a, g_b, a[MAXPROC*TOTALELEMS],b[MAXPROC*TOTALELEMS];
    int nelem, i, q;    
    int me, nprocs;
    
    /* Find local processor ID and number of processors */
    me = GA_Nodeid();
    nprocs = GA_Nnodes();
	
	/* Configure array dimensions. Force an unequal data distribution */
	ndim     = 1; /* 1-d transpose */
	dims[0]  = nprocs*TOTALELEMS + nprocs/2;
	ld[0]    = dims[0];
	chunk[0] = TOTALELEMS; /* minimum data on each process */
 
	/* create a global array g_a and duplicate it to get g_b */
	g_a = NGA_Create(C_INT, 1, dims, "array A", chunk);
	if (!g_a) GA_Error("create failed: A", 0);
	
	g_b = GA_Duplicate(g_a, "array B");
	if (! g_b) GA_Error("duplicate failed", 0);
	
	/* initialize data in g_a */
	if (me==0) {
	   for(i=0; i<dims[0]; i++) a[i] = i;
	   lo[0]  = 0;
	   hi[0] = dims[0]-1;
	   NGA_Put(g_a, lo, hi, a, ld);
	}

	/* Synchronize all processors to guarantee that everyone has data
	   before proceeding to the next step. */
	GA_Sync();

	/* Start initial phase of inversion by inverting the data held locally on
	   each processor. Start by finding out which data each processor owns. */
	NGA_Distribution(g_a, me, lo1, hi1);

	/* Get locally held data and copy it into local buffer a  */
	NGA_Get(g_a, lo1, hi1, a, ld);
	
	/* Invert data locally */
	nelem = hi1[0] - lo1[0] + 1;
	for (i=0; i<nelem; i++) b[i] = a[nelem-1-i];
	
	/* Invert data globally by copying locally inverted blocks into
	 * their inverted positions in the GA */
	lo2[0] = dims[0] - hi1[0] -1;
	hi2[0] = dims[0] - lo1[0] -1;
	NGA_Put(g_b,lo2,hi2,b,ld);

	/* Synchronize all processors to make sure inversion is complete */
	GA_Sync();
	
	
	
	/* Deallocate arrays */
	GA_Destroy(g_a);
	GA_Destroy(g_b);

}