Example #1
0
// -------------------------------------------------------------
// CreateMatGA
// -------------------------------------------------------------
static
PetscErrorCode
CreateMatGA(int pgroup, int lrows, int lcols, int grows, int gcols, int *ga)
{
  PetscErrorCode ierr = 0;

  /* Try to honor local ownership request (of rows). */

  int nprocs = GA_Pgroup_nnodes(pgroup);
  int me = GA_Pgroup_nodeid(pgroup);
  int tmapc[nprocs+1];
  int mapc[nprocs+1];
  int i;

  for (i = 0; i < nprocs+1; i++) tmapc[i] = 0;
  tmapc[me] = lrows;
  GA_Pgroup_igop(pgroup, tmapc, nprocs+1, "+");
  mapc[0] = 0;
  for (i = 1; i < nprocs; i++) mapc[i] = mapc[i-1]+tmapc[i-1];
  mapc[nprocs] = 0;

  int dims[2] = {grows, gcols};
  int blocks[2] = { nprocs, 1 };

  *ga = GA_Create_handle();
  GA_Set_data(*ga, 2, dims, MT_PETSC_SCALAR);
  GA_Set_irreg_distr(*ga, mapc, blocks);
  GA_Set_pgroup(*ga, pgroup);
  if (!GA_Allocate(*ga)) {
    ierr = 1;
  }
  PetscScalar z(0.0);
  GA_Fill(*ga, &z);
  
  return ierr;
}
Example #2
0
// -------------------------------------------------------------
// AdjacencyList::ready
// -------------------------------------------------------------
void
AdjacencyList::ready(void)
{
#if 1
  int grp = this->communicator().getGroup();
  int me = GA_Pgroup_nodeid(grp);
  int nprocs = GA_Pgroup_nnodes(grp);
  p_adjacency.clear();
  p_adjacency.resize(p_global_nodes.size());

  // Find total number of nodes and edges. Assume no duplicates
  int nedges = p_edges.size();
  int total_edges = nedges;
  char plus[2];
  strcpy(plus,"+");
  GA_Pgroup_igop(grp,&total_edges, 1, plus);
  int nnodes = p_original_nodes.size();
  int total_nodes = nnodes;
  GA_Pgroup_igop(grp,&total_nodes, 1, plus);

  // Create a global array containing original indices of all nodes and indexed
  // by the global index of the node
  int i, p;
  int dist[nprocs];
  for (p=0; p<nprocs; p++) {
    dist[p] = 0;
  }
  dist[me] = nnodes;
  GA_Pgroup_igop(grp,dist,nprocs,plus);
  int *mapc = new int[nprocs+1];
  mapc[0] = 0;
  for (p=1; p<nprocs; p++) {
    mapc[p] = mapc[p-1] + dist[p-1];
  }
  mapc[nprocs] = total_nodes;
  int g_nodes = GA_Create_handle();
  int dims = total_nodes;
  NGA_Set_data(g_nodes,1,&dims,C_INT);
  NGA_Set_pgroup(g_nodes, grp);
  if (!GA_Allocate(g_nodes)) {
    char buf[256];
    sprintf(buf,"AdjacencyList::ready: Unable to allocate distributed array"
        " for bus indices\n");
    printf(buf);
    throw gridpack::Exception(buf);
  }
  int lo, hi;
  lo = mapc[me];
  hi = mapc[me+1]-1;
  int size = hi - lo + 1;
  int o_idx[size], g_idx[size];
  for (i=0; i<size; i++) o_idx[i] = p_original_nodes[i]; 
  for (i=0; i<size; i++) g_idx[i] = p_global_nodes[i]; 
  int **indices= new int*[size];
  int *iptr = g_idx;
  for (i=0; i<size; i++) {
    indices[i] = iptr;
    iptr++;
  }
  if (size > 0) NGA_Scatter(g_nodes,o_idx,indices,size);
  GA_Pgroup_sync(grp);
  delete [] indices;
  delete [] mapc;

  // Cycle through all nodes and match them up with nodes at end of edges.
  for (p=0; p<nprocs; p++) {
    int iproc = (me+p)%nprocs;
    // Get node data from process iproc
    NGA_Distribution(g_nodes,iproc,&lo,&hi);
    size = hi - lo + 1;
    if (size <= 0) continue;
    int *buf = new int[size];
    int ld = 1;
    NGA_Get(g_nodes,&lo,&hi,buf,&ld);
    // Create a map of the nodes from process p
    std::map<int,int> nmap;
    std::map<int,int>::iterator it;
    std::pair<int,int> pr;
    for (i=lo; i<=hi; i++){
      pr = std::pair<int,int>(buf[i-lo],i);
      nmap.insert(pr);
    }
    delete [] buf;
    // scan through the edges looking for matches. If there is a match, set the
    // global index
    int idx;
    for (i=0; i<nedges; i++) {
      idx = static_cast<int>(p_edges[i].original_conn.first);
      it = nmap.find(idx);
      if (it != nmap.end()) {
        p_edges[i].global_conn.first = static_cast<Index>(it->second);
      }
      idx = static_cast<int>(p_edges[i].original_conn.second);
      it = nmap.find(idx);
      if (it != nmap.end()) {
        p_edges[i].global_conn.second = static_cast<Index>(it->second);
      }
    }
  }
  GA_Destroy(g_nodes);

  // All edges now have global indices assigned to them. Begin constructing
  // adjacency list. Start by creating a global array containing all edges
  dist[0] = 0;
  for (p=1; p<nprocs; p++) {
    double max = static_cast<double>(total_edges);
    max = (static_cast<double>(p))*(max/(static_cast<double>(nprocs)));
    dist[p] = 2*(static_cast<int>(max));
  }
  int g_edges = GA_Create_handle();
  dims = 2*total_edges;
  NGA_Set_data(g_edges,1,&dims,C_INT);
  NGA_Set_irreg_distr(g_edges,dist,&nprocs);
  NGA_Set_pgroup(g_edges, grp);
  if (!GA_Allocate(g_edges)) {
    char buf[256];
    sprintf(buf,"AdjacencyList::ready: Unable to allocate distributed array"
        " for branch indices\n");
    printf(buf);
    throw gridpack::Exception(buf);
  }

  // Add edge information to global array. Start by figuring out how much data
  // is associated with each process
  for (p=0; p<nprocs; p++) {
    dist[p] = 0;
  }
  dist[me] = nedges;
  GA_Pgroup_igop(grp,dist, nprocs, plus);
  int offset[nprocs];
  offset[0] = 0;
  for (p=1; p<nprocs; p++) {
    offset[p] = offset[p-1] + 2*dist[p-1];
  }
  // Figure out where local data goes in GA and then copy it to GA
  lo = offset[me];
  hi = lo + 2*nedges - 1;
  int edge_ids[2*nedges];
  for (i=0; i<nedges; i++) {
    edge_ids[2*i] = static_cast<int>(p_edges[i].global_conn.first);
    edge_ids[2*i+1] = static_cast<int>(p_edges[i].global_conn.second);
  }
  if (lo <= hi) {
    int ld = 1;
    NGA_Put(g_edges,&lo,&hi,edge_ids,&ld);
  }
  GA_Pgroup_sync(grp);

  // Cycle through all edges and find out how many are attached to the nodes on
  // your process. Start by creating a map between the global node indices and
  // the local node indices
  std::map<int,int> gmap;
  std::map<int,int>::iterator it;
  std::pair<int,int> pr;
  for (i=0; i<nnodes; i++){
    pr = std::pair<int,int>(static_cast<int>(p_global_nodes[i]),i);
    gmap.insert(pr);
  }
  // Cycle through edge information on each processor
  for (p=0; p<nprocs; p++) {
    int iproc = (me+p)%nprocs;
    NGA_Distribution(g_edges,iproc,&lo,&hi);
    int size = hi - lo + 1;
    int *buf = new int[size];
    int ld = 1;
    NGA_Get(g_edges,&lo,&hi,buf,&ld);
    BOOST_ASSERT(size%2 == 0);
    size = size/2;
    int idx1, idx2;
    Index idx;
    for (i=0; i<size; i++) {
      idx1 = buf[2*i];
      idx2 = buf[2*i+1];
      it = gmap.find(idx1);
      if (it != gmap.end()) {
        idx = static_cast<Index>(idx2);
        p_adjacency[it->second].push_back(idx);
      }
      it = gmap.find(idx2);
      if (it != gmap.end()) {
        idx = static_cast<Index>(idx1);
        p_adjacency[it->second].push_back(idx);
      }
    }
    delete [] buf;
  }
  GA_Destroy(g_edges);
  GA_Pgroup_sync(grp);
#else
  int me(this->processor_rank());
  int nproc(this->processor_size());

  p_adjacency.clear();
  p_adjacency.resize(p_nodes.size());

  IndexVector current_indexes;
  IndexVector connected_indexes;

  for (int p = 0; p < nproc; ++p) {

    // broadcast the node indexes owned by process p to all processes,
    // all processes work on these at once

    current_indexes.clear();
    if (me == p) {
      std::copy(p_nodes.begin(), p_nodes.end(), 
	  std::back_inserter(current_indexes));
      // std::cout << me << ": node indexes: ";
      // std::copy(current_indexes.begin(), current_indexes.end(),
      //           std::ostream_iterator<Index>(std::cout, ","));
      // std::cout << std::endl;
    }
    boost::mpi::broadcast(this->communicator(), current_indexes, p);

    // make a copy of the local edges in a list (so it's easier to
    // remove those completely accounted for)
    std::list<p_Edge> tmpedges;
    std::copy(p_edges.begin(), p_edges.end(), 
	std::back_inserter(tmpedges));

    // loop over the process p's node index set

    int local_index(0);
    for (IndexVector::iterator n = current_indexes.begin(); 
	n != current_indexes.end(); ++n, ++local_index) {

      // determine the local edges that refer to the current node index

      connected_indexes.clear();
      std::list<p_Edge>::iterator e(tmpedges.begin());
      //      std::cout << me << ": current node index: " << *n 
      //                << ", edges: " << tmpedges.size() 
      //                << std::endl;

      while (e != tmpedges.end()) {
	if (*n == e->conn.first && e->conn.second != bogus) {
	  connected_indexes.push_back(e->conn.second);
	  e->found.first = true;
	  // std::cout << me << ": found connection: edge " << e->index
	  //           << " (" << e->conn.first << ", " << e->conn.second << ")"
	  //           << std::endl;
	}
	if (*n == e->conn.second && e->conn.first != bogus) {
	  connected_indexes.push_back(e->conn.first);
	  e->found.second = true;
	  // std::cout << me << ": found connection: edge " << e->index
	  //           << " (" << e->conn.first << ", " << e->conn.second << ")"
	  //           << std::endl;
	}

	if (e->found.first && e->found.second) {
	  e = tmpedges.erase(e);
	} else if (e->conn.first == bogus || 
	    e->conn.second == bogus) {
	  e = tmpedges.erase(e);
	} else {
	  ++e;
	}
      }

      // gather all connections for the current node index to the
      // node's owner process, we have to gather the vectors because
      // processes will have different numbers of connections

      if (me == p) {
	size_t allsize;
        boost::mpi::reduce(this->communicator(), 
                           connected_indexes.size(), allsize, std::plus<size_t>(), p);

	std::vector<IndexVector> all_connected_indexes;
        boost::mpi::gather(this->communicator(), 
                           connected_indexes, all_connected_indexes, p);
	p_adjacency[local_index].clear();
	for (std::vector<IndexVector>::iterator k = all_connected_indexes.begin();
	    k != all_connected_indexes.end(); ++k) {
	  std::copy(k->begin(), k->end(), 
	      std::back_inserter(p_adjacency[local_index]));
	}
      } else {
	boost::mpi::reduce(this->communicator(), 
                           connected_indexes.size(), std::plus<size_t>(), p);
	boost::mpi::gather(this->communicator(), connected_indexes, p);
      }
      this->communicator().barrier();
    }
    this->communicator().barrier();
  }
#endif
}
Example #3
0
void
test(int data_type) {
  int me=GA_Nodeid();
  int nproc = GA_Nnodes();
  int g_a, g_b, g_c;
  int ndim = 2;
  int dims[2]={N,N};
  int lo[2]={0,0};
  int hi[2]={N-1,N-1};
  int block_size[2]={NB,NB-1};
  int proc_grid[2];
  int i,j,l,k,m,n, ld;

  double alpha_dbl = 1.0, beta_dbl = 0.0;
  double dzero = 0.0;
  double ddiff;

  float alpha_flt = 1.0, beta_flt = 0.0;
  float fzero = 0.0;
  float fdiff;
  float ftmp;
  double dtmp;
  SingleComplex ctmp;
  DoubleComplex ztmp;

  DoubleComplex alpha_dcpl = {1.0, 0.0} , beta_dcpl = {0.0, 0.0}; 
  DoubleComplex zzero = {0.0,0.0};
  DoubleComplex zdiff;

  SingleComplex alpha_scpl = {1.0, 0.0} , beta_scpl = {0.0, 0.0}; 
  SingleComplex czero = {0.0,0.0};
  SingleComplex cdiff;

  void *alpha=NULL, *beta=NULL;
  void *abuf=NULL, *bbuf=NULL, *cbuf=NULL, *c_ptr=NULL;

  switch (data_type) {
  case C_FLOAT:
    alpha  = (void *)&alpha_flt;
    beta   = (void *)&beta_flt;
    abuf = (void*)malloc(N*N*sizeof(float));
    bbuf = (void*)malloc(N*N*sizeof(float));
    cbuf = (void*)malloc(N*N*sizeof(float));
    if(me==0) printf("Single Precision: Testing GA_Sgemm,NGA_Matmul_patch for %d-Dimension", ndim);
    break;      
  case C_DBL:
    alpha  = (void *)&alpha_dbl;
    beta   = (void *)&beta_dbl;
    abuf = (void*)malloc(N*N*sizeof(double));
    bbuf = (void*)malloc(N*N*sizeof(double));
    cbuf = (void*)malloc(N*N*sizeof(double));
    if(me==0) printf("Double Precision: Testing GA_Dgemm,NGA_Matmul_patch for %d-Dimension", ndim); 
    break;    
  case C_DCPL:
    alpha  = (void *)&alpha_dcpl;
    beta   = (void *)&beta_dcpl;
    abuf = (void*)malloc(N*N*sizeof(DoubleComplex));
    bbuf = (void*)malloc(N*N*sizeof(DoubleComplex));
    cbuf = (void*)malloc(N*N*sizeof(DoubleComplex));
    if(me==0) printf("Double Complex:   Testing GA_Zgemm,NGA_Matmul_patch for %d-Dimension", ndim);
    break;
  case C_SCPL:
    alpha  = (void *)&alpha_scpl;
    beta   = (void *)&beta_scpl;
    abuf = (void*)malloc(N*N*sizeof(SingleComplex));
    bbuf = (void*)malloc(N*N*sizeof(SingleComplex));
    cbuf = (void*)malloc(N*N*sizeof(SingleComplex));
    if(me==0) printf("Single Complex:   Testing GA_Cgemm,NGA_Matmul_patch for %d-Dimension", ndim);
    break;
  default:
    GA_Error("wrong data type", data_type);
  }

  if (me==0) printf("\nCreate A, B, C\n");
#ifdef USE_REGULAR
  g_a = NGA_Create(data_type, ndim, dims, "array A", NULL);
#endif
#ifdef USE_SIMPLE_CYCLIC
  g_a = NGA_Create_handle();
  NGA_Set_data(g_a,ndim,dims,data_type);
  NGA_Set_array_name(g_a,"array A");
  NGA_Set_block_cyclic(g_a,block_size);
  if (!GA_Allocate(g_a)) {
    GA_Error("Failed: create: g_a",40);
  }
#endif
#ifdef USE_SCALAPACK
  g_a = NGA_Create_handle();
  NGA_Set_data(g_a,ndim,dims,data_type);
  NGA_Set_array_name(g_a,"array A");
  grid_factor(nproc,&i,&j);
  proc_grid[0] = i;
  proc_grid[1] = j;
  NGA_Set_block_cyclic_proc_grid(g_a,block_size,proc_grid);
  if (!GA_Allocate(g_a)) {
    GA_Error("Failed: create: g_a",40);
  }
#endif
#ifdef USE_TILED
  g_a = NGA_Create_handle();
  NGA_Set_data(g_a,ndim,dims,data_type);
  NGA_Set_array_name(g_a,"array A");
  grid_factor(nproc,&i,&j);
  proc_grid[0] = i;
  proc_grid[1] = j;
  NGA_Set_tiled_proc_grid(g_a,block_size,proc_grid);
  if (!GA_Allocate(g_a)) {
    GA_Error("Failed: create: g_a",40);
  }
#endif
  g_b = GA_Duplicate(g_a, "array B");  
  g_c = GA_Duplicate(g_a, "array C");
  if(!g_a || !g_b || !g_c) GA_Error("Create failed: a, b or c",1);

  ld = N;
  if (me==0) printf("\nInitialize A\n");
  /* Set up matrix A */
  if (me == 0) {
    for (i=0; i<N; i++) {
      for (j=0; j<N; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)abuf)[i*N+j] = (float)(i*N+j);
            break;
          case C_DBL:
            ((double*)abuf)[i*N+j] = (double)(i*N+j);
            break;
          case C_DCPL:
            ((DoubleComplex*)abuf)[i*N+j].real = (double)(i*N+j);
            ((DoubleComplex*)abuf)[i*N+j].imag = 1.0;
            break;
          case C_SCPL:
            ((SingleComplex*)abuf)[i*N+j].real = (float)(i*N+j);
            ((SingleComplex*)abuf)[i*N+j].imag = 1.0;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
    NGA_Put(g_a,lo,hi,abuf,&ld);
  }
  GA_Sync();

  if (me==0) printf("\nInitialize B\n");
  /* Set up matrix B */
  if (me == 0) {
    for (i=0; i<N; i++) {
      for (j=0; j<N; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)bbuf)[i*N+j] = (float)(j*N+i);
            break;
          case C_DBL:
            ((double*)bbuf)[i*N+j] = (double)(j*N+i);
            break;
          case C_DCPL:
            ((DoubleComplex*)bbuf)[i*N+j].real = (double)(j*N+i);
            ((DoubleComplex*)bbuf)[i*N+j].imag = 1.0;
            break;
          case C_SCPL:
            ((SingleComplex*)bbuf)[i*N+j].real = (float)(j*N+i);
            ((SingleComplex*)bbuf)[i*N+j].imag = 1.0;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
    NGA_Put(g_b,lo,hi,bbuf,&ld);
  }
  GA_Sync();

  if (me==0) printf("\nPerform matrix multiply\n");
  switch (data_type) {
    case C_FLOAT:
      NGA_Matmul_patch('N','N',&alpha_flt,&beta_flt,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_DBL:
      NGA_Matmul_patch('N','N',&alpha_dbl,&beta_dbl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_SCPL:
      NGA_Matmul_patch('N','N',&alpha_scpl,&beta_scpl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_DCPL:
      NGA_Matmul_patch('N','N',&alpha_dcpl,&beta_dcpl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
  GA_Sync();
#if 0
  if (me==0) printf("\nCheck answer\n");
  /*
  GA_Print(g_a);
  if (me == 0) printf("\n\n\n\n");
  GA_Print(g_b);
  if (me == 0) printf("\n\n\n\n");
  GA_Print(g_c); 
  */

  /* Check answer */
  NGA_Get(g_a,lo,hi,abuf,&ld);
  NGA_Get(g_b,lo,hi,bbuf,&ld);
  for (i=0; i<N; i++) {
    for (j=0; j<N; j++) {
      switch (data_type) {
        case C_FLOAT:
          ((float*)cbuf)[i*N+j] = fzero;
          break;
        case C_DBL:
          ((double*)cbuf)[i*N+j] = dzero;
          break;
        case C_DCPL:
          ((DoubleComplex*)cbuf)[i*N+j] = zzero;
          break;
        case C_SCPL:
          ((SingleComplex*)cbuf)[i*N+j] = czero;
          break;
        default:
          GA_Error("wrong data type", data_type);
      }
      for (k=0; k<N; k++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)cbuf)[i*N+j] += ((float*)abuf)[i*N+k]
              *((float*)bbuf)[k*N+j];
            break;
          case C_DBL:
            ((double*)cbuf)[i*N+j] += ((double*)abuf)[i*N+k]
              *((double*)bbuf)[k*N+j];
            break;
          case C_DCPL:
            ((DoubleComplex*)cbuf)[i*N+j].real +=
              (((DoubleComplex*)abuf)[i*N+k].real
               *((DoubleComplex*)bbuf)[k*N+j].real
               -(((DoubleComplex*)abuf)[i*N+k].imag
                 *((DoubleComplex*)bbuf)[k*N+j].imag));
            ((DoubleComplex*)cbuf)[i*N+j].imag +=
              (((DoubleComplex*)abuf)[i*N+k].real
               *((DoubleComplex*)bbuf)[k*N+j].imag
               +(((DoubleComplex*)abuf)[i*N+k].imag
                 *((DoubleComplex*)bbuf)[k*N+j].real));
            break;
          case C_SCPL:
            ((SingleComplex*)cbuf)[i*N+j].real +=
              (((SingleComplex*)abuf)[i*N+k].real
               *((SingleComplex*)bbuf)[k*N+j].real
               -(((SingleComplex*)abuf)[i*N+k].imag
                 *((SingleComplex*)bbuf)[k*N+j].imag));
            ((SingleComplex*)cbuf)[i*N+j].imag +=
              (((SingleComplex*)abuf)[i*N+k].real
               *((SingleComplex*)bbuf)[k*N+j].imag
               +(((SingleComplex*)abuf)[i*N+k].imag
                 *((SingleComplex*)bbuf)[k*N+j].real));
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
  }
  GA_Sync();
  if (me == 0) {
    NGA_Get(g_c,lo,hi,abuf,&ld);
    for (i=0; i<N; i++) {
      for (j=0; j<N; j++) {
        switch (data_type) {
          case C_FLOAT:
            fdiff = ((float*)abuf)[i*N+j]-((float*)cbuf)[i*N+j];
            if (((float*)abuf)[i*N+j] != 0.0) {
              fdiff /= ((float*)abuf)[i*N+j];
            }
            if (fabs(fdiff) > TOLERANCE) {
              printf("p[%d] [%d,%d] Actual: %f Expected: %f\n",me,i,j,
                  ((float*)abuf)[i*N+j],((float*)cbuf)[i*N+j]);
            }
            break;
          case C_DBL:
            ddiff = ((double*)abuf)[i*N+j]-((double*)cbuf)[i*N+j];
            if (((double*)abuf)[i*N+j] != 0.0) {
              ddiff /= ((double*)abuf)[i*N+j];
            }
            if (fabs(ddiff) > TOLERANCE) {
              printf("p[%d] [%d,%d] Actual: %f Expected: %f\n",me,i,j,
                  ((double*)abuf)[i*N+j],((double*)cbuf)[i*N+j]);
            }
            break;
          case C_DCPL:
            zdiff.real = ((DoubleComplex*)abuf)[i*N+j].real
              -((DoubleComplex*)cbuf)[i*N+j].real;
            zdiff.imag = ((DoubleComplex*)abuf)[i*N+j].imag
              -((DoubleComplex*)cbuf)[i*N+j].imag;
            if (((DoubleComplex*)abuf)[i*N+j].real != 0.0 ||
                ((DoubleComplex*)abuf)[i*N+j].imag != 0.0) {
              ztmp = ((DoubleComplex*)abuf)[i*N+j];
              ddiff = sqrt((zdiff.real*zdiff.real+zdiff.imag*zdiff.imag)
                  /(ztmp.real*ztmp.real+ztmp.imag*ztmp.imag));
            } else {
              ddiff = sqrt(zdiff.real*zdiff.real+zdiff.imag*zdiff.imag);
            }
            if (fabs(ddiff) > TOLERANCE) {
              printf("p[%d] [%d,%d] Actual: (%f,%f) Expected: (%f,%f)\n",me,i,j,
                  ((DoubleComplex*)abuf)[i*N+j].real,
                  ((DoubleComplex*)abuf)[i*N+j].imag,
                  ((DoubleComplex*)cbuf)[i*N+j].real,
                  ((DoubleComplex*)cbuf)[i*N+j].imag);
            }
            break;
          case C_SCPL:
            cdiff.real = ((SingleComplex*)abuf)[i*N+j].real
              -((SingleComplex*)cbuf)[i*N+j].real;
            cdiff.imag = ((SingleComplex*)abuf)[i*N+j].imag
              -((SingleComplex*)cbuf)[i*N+j].imag;
            if (((SingleComplex*)abuf)[i*N+j].real != 0.0 ||
                ((SingleComplex*)abuf)[i*N+j].imag != 0.0) {
              ctmp = ((SingleComplex*)abuf)[i*N+j];
              fdiff = sqrt((cdiff.real*cdiff.real+cdiff.imag*cdiff.imag)
                  /(ctmp.real*ctmp.real+ctmp.imag*ctmp.imag));
            } else {
              fdiff = sqrt(cdiff.real*cdiff.real+cdiff.imag*cdiff.imag);
            }
            if (fabs(fdiff) > TOLERANCE) {
              printf("p[%d] [%d,%d] Actual: (%f,%f) Expected: (%f,%f)\n",me,i,j,
                  ((SingleComplex*)abuf)[i*N+j].real,
                  ((SingleComplex*)abuf)[i*N+j].imag,
                  ((SingleComplex*)cbuf)[i*N+j].real,
                  ((SingleComplex*)cbuf)[i*N+j].imag);
            }
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
  }
  GA_Sync();

  /* copy cbuf back to g_a */
  if (me == 0) {
    NGA_Put(g_a,lo,hi,cbuf,&ld);
  }
  GA_Sync();

  /* Get norm of g_a */
  switch (data_type) {
    case C_FLOAT:
      ftmp = GA_Fdot(g_a,g_a);
      break;
    case C_DBL:
      dtmp = GA_Ddot(g_a,g_a);
      break;
    case C_DCPL:
      ztmp = GA_Zdot(g_a,g_a);
      break;
    case C_SCPL:
      ctmp = GA_Cdot(g_a,g_a);
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
  /* subtract C from A and put the results in B */
  beta_flt = -1.0;
  beta_dbl = -1.0;
  beta_scpl.real = -1.0;
  beta_dcpl.real = -1.0;
  GA_Zero(g_b);
  GA_Add(alpha,g_a,beta,g_c,g_b);
  /* evaluate the norm of the difference between the two matrices */
  switch (data_type) {
    case C_FLOAT:
      fdiff = GA_Fdot(g_b, g_b);
      if (ftmp != 0.0) {
        fdiff /= ftmp;
      }
      if(fabs(fdiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(fdiff), TOLERANCE);
        GA_Error("GA_Sgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Sgemm OK\n\n");
      }
      break;
    case C_DBL:
      ddiff = GA_Ddot(g_b, g_b);
      if (dtmp != 0.0) {
        ddiff /= dtmp;
      }
      if(fabs(ddiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(ddiff), TOLERANCE);
        GA_Error("GA_Dgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Dgemm OK\n\n");
      }
      break;
    case C_DCPL:
      zdiff = GA_Zdot(g_b, g_b);
      if (ztmp.real != 0.0 || ztmp.imag != 0.0) {
        ddiff = sqrt((zdiff.real*zdiff.real+zdiff.imag*zdiff.imag)
            /(ztmp.real*ztmp.real+ztmp.imag*ztmp.imag));
      } else {
        ddiff = sqrt(zdiff.real*zdiff.real+zdiff.imag*zdiff.imag);
      }
      if(fabs(ddiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(zdiff.real), TOLERANCE);
        GA_Error("GA_Zgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Zgemm OK\n\n");
      }
      break;
    case C_SCPL:
      cdiff = GA_Cdot(g_b, g_b);
      if (ctmp.real != 0.0 || ctmp.imag != 0.0) {
        fdiff = sqrt((cdiff.real*cdiff.real+cdiff.imag*cdiff.imag)
            /(ctmp.real*ctmp.real+ctmp.imag*ctmp.imag));
      } else {
        fdiff = sqrt(cdiff.real*cdiff.real+cdiff.imag*cdiff.imag);
      }
      if(fabs(fdiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(cdiff.real), TOLERANCE);
        GA_Error("GA_Cgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Cgemm OK\n\n");
      }
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
#endif

  free(abuf);
  free(bbuf);
  free(cbuf);

  switch (data_type) {
  case C_FLOAT:
    abuf = (void*)malloc(N*N*sizeof(float)/4);
    bbuf = (void*)malloc(N*N*sizeof(float)/4);
    cbuf = (void*)malloc(N*N*sizeof(float)/4);
    break;      
  case C_DBL:
    abuf = (void*)malloc(N*N*sizeof(double)/4);
    bbuf = (void*)malloc(N*N*sizeof(double)/4);
    cbuf = (void*)malloc(N*N*sizeof(double)/4);
    break;    
  case C_DCPL:
    abuf = (void*)malloc(N*N*sizeof(DoubleComplex)/4);
    bbuf = (void*)malloc(N*N*sizeof(DoubleComplex)/4);
    cbuf = (void*)malloc(N*N*sizeof(DoubleComplex)/4);
    break;
  case C_SCPL:
    abuf = (void*)malloc(N*N*sizeof(SingleComplex)/4);
    bbuf = (void*)malloc(N*N*sizeof(SingleComplex)/4);
    cbuf = (void*)malloc(N*N*sizeof(SingleComplex)/4);
    break;
  default:
    GA_Error("wrong data type", data_type);
  }

  /* Test multiply on a fraction of matrix. Start by reinitializing
   * A and B */
  GA_Zero(g_a);
  GA_Zero(g_b);
  GA_Zero(g_c);

  if (me==0) printf("\nTest patch multiply\n");

  lo[0] = N/4;
  lo[1] = N/4;
  hi[0] = 3*N/4-1;
  hi[1] = 3*N/4-1;
  ld = N/2;

  /* Set up matrix A */
  if (me==0) printf("\nInitialize A\n");
  if (me == 0) {
    for (i=N/4; i<3*N/4; i++) {
      for (j=N/4; j<3*N/4; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)abuf)[(i-N/4)*N/2+(j-N/4)] = (float)(i*N+j);
            break;
          case C_DBL:
            ((double*)abuf)[(i-N/4)*N/2+(j-N/4)] = (double)(i*N+j);
            break;
          case C_DCPL:
            ((DoubleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].real = (double)(i*N+j);
            ((DoubleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0;
            break;
          case C_SCPL:
            ((SingleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].real = (float)(i*N+j);
            ((SingleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
    NGA_Put(g_a,lo,hi,abuf,&ld);
  }
  GA_Sync();

  if (me==0) printf("\nInitialize B\n");
  /* Set up matrix B */
  if (me == 0) {
    for (i=N/4; i<3*N/4; i++) {
      for (j=N/4; j<3*N/4; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)bbuf)[(i-N/4)*N/2+(j-N/4)] = (float)(j*N+i);
            break;
          case C_DBL:
            ((double*)bbuf)[(i-N/4)*N/2+(j-N/4)] = (double)(j*N+i);
            break;
          case C_DCPL:
            ((DoubleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].real = (double)(j*N+i);
            ((DoubleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0;
            break;
          case C_SCPL:
            ((SingleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].real = (float)(j*N+i);
            ((SingleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
    NGA_Put(g_b,lo,hi,bbuf,&ld);
  }
  GA_Sync();

  beta_flt = 0.0;
  beta_dbl = 0.0;
  beta_scpl.real = 0.0;
  beta_dcpl.real = 0.0;
  if (me==0) printf("\nPerform matrix multiply on sub-blocks\n");
  switch (data_type) {
    case C_FLOAT:
      NGA_Matmul_patch('N','N',&alpha_flt,&beta_flt,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_DBL:
      NGA_Matmul_patch('N','N',&alpha_dbl,&beta_dbl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_SCPL:
      NGA_Matmul_patch('N','N',&alpha_scpl,&beta_scpl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_DCPL:
      NGA_Matmul_patch('N','N',&alpha_dcpl,&beta_dcpl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
  GA_Sync();
#if 0
  if (0) {
  /*
  if (data_type != C_SCPL && data_type != C_DCPL) {
  */

  if (me==0) printf("\nCheck answer\n");

  /* Multiply buffers by hand */
  if (me == 0) {
    for (i=0; i<N/2; i++) {
      for (j=0; j<N/2; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)cbuf)[i*N/2+j] = fzero;
            break;
          case C_DBL:
            ((double*)cbuf)[i*N/2+j] = dzero;
            break;
          case C_DCPL:
            ((DoubleComplex*)cbuf)[i*N/2+j] = zzero;
            break;
          case C_SCPL:
            ((SingleComplex*)cbuf)[i*N/2+j] = czero;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
        for (k=0; k<N/2; k++) {
          switch (data_type) {
            case C_FLOAT:
              ((float*)cbuf)[i*N/2+j] += ((float*)abuf)[i*N/2+k]
                *((float*)bbuf)[k*N/2+j];
              break;
            case C_DBL:
              ((double*)cbuf)[i*N/2+j] += ((double*)abuf)[i*N/2+k]
                *((double*)bbuf)[k*N/2+j];
              break;
            case C_DCPL:
              ((DoubleComplex*)cbuf)[i*N/2+j].real +=
                (((DoubleComplex*)abuf)[i*N/2+k].real
                 *((DoubleComplex*)bbuf)[k*N/2+j].real
                 -(((DoubleComplex*)abuf)[i*N/2+k].imag
                   *((DoubleComplex*)bbuf)[k*N/2+j].imag));
              ((DoubleComplex*)cbuf)[i*N/2+j].imag +=
                (((DoubleComplex*)abuf)[i*N/2+k].real
                 *((DoubleComplex*)bbuf)[k*N/2+j].imag
                 +(((DoubleComplex*)abuf)[i*N/2+k].imag
                   *((DoubleComplex*)bbuf)[k*N/2+j].real));
              break;
            case C_SCPL:
              ((SingleComplex*)cbuf)[i*N/2+j].real +=
                (((SingleComplex*)abuf)[i*N/2+k].real
                 *((SingleComplex*)bbuf)[k*N/2+j].real
                 -(((SingleComplex*)abuf)[i*N/2+k].imag
                   *((SingleComplex*)bbuf)[k*N/2+j].imag));
              ((SingleComplex*)cbuf)[i*N/2+j].imag +=
                (((SingleComplex*)abuf)[i*N/2+k].real
                 *((SingleComplex*)bbuf)[k*N/2+j].imag
                 +(((SingleComplex*)abuf)[i*N/2+k].imag
                   *((SingleComplex*)bbuf)[k*N/2+j].real));
              break;
            default:
              GA_Error("wrong data type", data_type);
          }
        }
      }
    }
    NGA_Put(g_a,lo,hi,cbuf,&ld);
  }
  if (me == 0) printf("\n\n\n\n");

  /* Get norm of g_a */
  switch (data_type) {
    case C_FLOAT:
      ftmp = NGA_Fdot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi);
      break;
    case C_DBL:
      dtmp = NGA_Ddot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi);
      break;
    case C_DCPL:
      ztmp = NGA_Zdot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi);
      break;
    case C_SCPL:
      ctmp = NGA_Cdot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi);
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
  /* subtract C from A and put the results in B */
  beta_flt = -1.0;
  beta_dbl = -1.0;
  beta_scpl.real = -1.0;
  beta_dcpl.real = -1.0;
  NGA_Zero_patch(g_b,lo,hi);
  NGA_Add_patch(alpha,g_a,lo,hi,beta,g_c,lo,hi,g_b,lo,hi);
  /* evaluate the norm of the difference between the two matrices */
  switch (data_type) {
    case C_FLOAT:
      fdiff = NGA_Fdot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi);
      if (ftmp != 0.0) {
        fdiff /= ftmp;
      }
      if(fabs(fdiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(fdiff), TOLERANCE);
        GA_Error("GA_Sgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Sgemm OK\n\n");
      }
      break;
    case C_DBL:
      ddiff = NGA_Ddot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi);
      if (dtmp != 0.0) {
        ddiff /= dtmp;
      }
      if(fabs(ddiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(ddiff), TOLERANCE);
        GA_Error("GA_Dgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Dgemm OK\n\n");
      }
      break;
    case C_DCPL:
      zdiff = NGA_Zdot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi);
      if (ztmp.real != 0.0 || ztmp.imag != 0.0) {
        ddiff = sqrt((zdiff.real*zdiff.real+zdiff.imag*zdiff.imag)
            /(ztmp.real*ztmp.real+ztmp.imag*ztmp.imag));
      } else {
        ddiff = sqrt(zdiff.real*zdiff.real+zdiff.imag*zdiff.imag);
      }
      if(fabs(ddiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(zdiff.real), TOLERANCE);
        GA_Error("GA_Zgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Zgemm OK\n\n");
      }
      break;
    case C_SCPL:
      cdiff = NGA_Cdot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi);
      if (ctmp.real != 0.0 || ctmp.imag != 0.0) {
        fdiff = sqrt((cdiff.real*cdiff.real+cdiff.imag*cdiff.imag)
            /(ctmp.real*ctmp.real+ctmp.imag*ctmp.imag));
      } else {
        fdiff = sqrt(cdiff.real*cdiff.real+cdiff.imag*cdiff.imag);
      }
      if(fabs(fdiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(cdiff.real), TOLERANCE);
        GA_Error("GA_Cgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Cgemm OK\n\n");
      }
      break;
    default:
      GA_Error("wrong data type", data_type);
  }

  }
#endif
  free(abuf);
  free(bbuf);
  free(cbuf);

  GA_Destroy(g_a);
  GA_Destroy(g_b);
  GA_Destroy(g_c);
}
Example #4
0
File: gada.c Project: adrielb/DCell
PetscErrorCode vizGA2DA()
{
  PetscErrorCode  ierr;
  int rank;
  MPI_Comm_rank(PETSC_COMM_WORLD,&rank);  
  int d1 = 40, d2 = 50;
  
  DA da;
  Vec vec;
  const PetscInt *lx, *ly, *lz;
  PetscInt m,n,p;
  DALocalInfo info;
  ierr = DACreate2d(PETSC_COMM_WORLD,DA_NONPERIODIC,DA_STENCIL_STAR,
            d1,d2,PETSC_DECIDE,PETSC_DECIDE,1,1,0,0, &da); CHKERRQ(ierr);
  ierr = DACreateGlobalVector(da, &vec); CHKERRQ(ierr);
  ierr = DAGetOwnershipRanges(da, &lx, &ly, &lz); CHKERRQ(ierr);
  ierr = DAGetLocalInfo(da,&info); CHKERRQ(ierr);
  ierr = DAGetInfo(da,0,0,0,0,&m,&n,&p,0,0,0,0); CHKERRQ(ierr);
  /**/
  ierr = DAView(da, PETSC_VIEWER_STDOUT_WORLD); CHKERRQ(ierr);
  for (int i = 0; i < m; ++i) {
    PetscPrintf(PETSC_COMM_WORLD,"%d\tlx: %d\n",i,lx[i]);
  }
  for (int i = 0; i < n; ++i) {
    PetscPrintf(PETSC_COMM_WORLD,"%d\tly: %d\n",i,ly[i]);
  }
  /**/
 
  
  int ga = GA_Create_handle();
  int ndim = 2;
  int dims[2] = {d2,d1};
  GA_Set_data(ga,2,dims,MT_DBL);
  int *map;
  PetscMalloc( sizeof(int)*(m+n), &map);
  map[0] = 0;
  for( int i = 1; i < n; i++ )
  {
    map[i] = ly[i-1] + map[i-1];
  }
  map[n] = 0;
  for( int i = n+1; i < m+n; i++ )
  {
    map[i] = lx[i-n-1] + map[i-1];
  }
  /* correct ordering, but nodeid's dont line up with mpi rank for petsc's da
   * DA: +---+---+   GA: +---+---+   
   *     +-2-+-3-+       +-1-+-3-+
   *     +---+---+       +---+---+
   *     +-0-+-1-+       +-0-+-2-+
   *     +---+---+       +---+---+
  int *map;
  PetscMalloc( sizeof(int)*(m+n), &map);
  map[0] = 0;
  for( int i = 1; i < m; i++ )
  {
    map[i] = lx[i] + map[i-1];
  }
  map[m] = 0;
  for( int i = m+1; i < m+n; i++ )
  {
    map[i] = ly[i-m] + map[i-1];
  }
  */
  int block[2] = {n,m};  
  GA_Set_irreg_distr(ga,map,block);
  ierr = GA_Allocate( ga );
  if( !ierr ) GA_Error("\n\n\nga allocaltion failed\n\n",ierr);
  if( !ga ) GA_Error("\n\n\n ga null \n\n",ierr); 
  if( rank != GA_Nodeid() ) GA_Error("MPI rank does not match GA_Nodeid()",1);
  GA_Print_distribution(ga);  
  
  int lo[2], hi[2];
  NGA_Distribution(ga,rank,lo,hi);
  if( lo[1] != info.xs || hi[1] != info.xs+info.xm-1 ||
      lo[0] != info.ys || hi[0] != info.ys+info.ym-1 )
  {
    PetscSynchronizedPrintf(PETSC_COMM_SELF,"[%d] lo:(%2d,%2d)  hi:(%2d,%2d) \t DA: (%2d,%2d), (%2d, %2d)\n",
        rank, lo[1], lo[0], hi[1], hi[0], info.xs, info.ys, info.xs+info.xm-1, info.ys+info.ym-1);
  }
  PetscBarrier(0);
  PetscSynchronizedFlush(PETSC_COMM_WORLD);

  AO ao;
  DAGetAO(da,&ao);
  if( rank == 0 )
  {
    int *idx, len = d1*d2;
    PetscReal *val;
    PetscMalloc(sizeof(PetscReal)*len, &val);
    PetscMalloc(sizeof(int)*len, &idx);
    for (int j = 0; j < d2; ++j)
    {
      for (int i = 0; i < d1; ++i)
      {
        idx[i + d1*j] = i + d1*j;
        val[i + d1*j] = i + d1*j;
      }
    }
    AOApplicationToPetsc(ao,len,idx);
    VecSetValues(vec,len,idx,val,INSERT_VALUES);

    int a[2], b[2],ld[1]={0};
    double c = 0;
    for (int j = 0; j < d2; ++j)
    {
      for (int i = 0; i < d1; ++i)
      {
        a[0] = j;
        a[1] = i;
//        printf("%5.0f ",c);
        NGA_Put(ga,a,a,&c,ld);
        c++;
      }
    }
  }
//  GA_Print(ga);
  VecAssemblyBegin(vec);
  VecAssemblyEnd(vec);
  
  int ld;
  double *ptr;
  NGA_Access(ga,lo,hi,&ptr,&ld);
  PetscReal **d;
  int c=0;
  ierr = DAVecGetArray(da,vec,&d); CHKERRQ(ierr);
  for (int j = info.ys; j < info.ys+info.ym; ++j)
  {
    for (int i = info.xs; i < info.xs+info.xm; ++i)
    {
      if( d[j][i] != ptr[(i-info.xs)+ld*(j-info.ys)] )
        GA_Error("DA array is not equal to GA array",1);
//      printf("%d (%d,%d):\t%3.0f\t%3.0f\n", c, i, j, d[j][i], ptr[(i-info.xs)+ld*(j-info.ys)]);
      c++;
    }
  }
  ierr = DAVecRestoreArray(da,vec,&d); CHKERRQ(ierr);
  
  c=0;
  PetscReal *v;
  int start, end;
  VecGetOwnershipRange(vec, &start, &end);
  VecGetArray( vec, &v );
  for( int i = start; i < end; i++)
  {
//    printf("%d:\t%3.0f\t%3.0f\t%s\n", start, v[i-start], ptr[i-start], (v[i-start]-ptr[i-start]==0?"":"NO") );
  }
  VecRestoreArray( vec, &v );
  
  NGA_Release_update(ga,lo,hi);

  Vec gada;
  VecCreateMPIWithArray(((PetscObject)da)->comm,da->Nlocal,PETSC_DETERMINE,ptr,&gada);
  VecView(gada,PETSC_VIEWER_STDOUT_SELF);
  
  GA_Destroy(ga);
  
  
  
  ierr = VecDestroy(vec); CHKERRQ(ierr);
  ierr = DADestroy(da); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
Example #5
0
int main(int argc, char **argv)
{
    int me;
    int nproc;
    int status;
    int g_a;
    int dims[NDIM];
    int chunk[NDIM];
    int pg_world;
    size_t num = 10;
    double *p1 = NULL;
    double *p2 = NULL;
    size_t i;
    int num_mutex;
    int lo[1];
    int hi[1];
    int ld[1]={1};
    MPI_Comm comm;

    MP_INIT(argc,argv);
    GA_INIT(argc,argv);

    me = GA_Nodeid();
    nproc = GA_Nnodes();
    comm = GA_MPI_Comm_pgroup_default();

    printf("%d: Hello world!\n",me);

    if (me==0) printf("%d: GA_Initialize\n",me);
    /*if (me==0) printf("%d: ARMCI_Init\n",me);*/
    /*ARMCI_Init();*/
    /*if (me==0) printf("%d: MA_Init\n",me);*/
    /*MA_init(MT_DBL, 8*1024*1024, 2*1024*1024);*/

    if (me==0) printf("%d: GA_Create_handle\n",me);
    g_a = GA_Create_handle();

    if (me==0) printf("%d: GA_Set_array_name\n",me);
    GA_Set_array_name(g_a,"test array A");

    dims[0] = 30;
    if (me==0) printf("%d: GA_Set_data\n",me);
    GA_Set_data(g_a,NDIM,dims,MT_DBL);

    chunk[0] = -1;
    if (me==0) printf("%d: GA_Set_chunk\n",me);
    GA_Set_chunk(g_a,chunk);

    if (me==0) printf("%d: GA_Pgroup_get_world\n",me);
    pg_world = GA_Pgroup_get_world();
    if (me==0) printf("%d: GA_Set_pgroup\n",me);
    GA_Set_pgroup(g_a,pg_world);

    if (me==0) printf("%d: GA_Allocate\n",me);
    status = GA_Allocate(g_a);
    if(0 == status) MPI_Abort(comm,100);

    if (me==0) printf("%d: GA_Zero\n",me);
    GA_Zero(g_a);

    if (me==0) printf("%d: GA_Sync\n",me);
    GA_Sync();

    num = 10;
    p1 = malloc(num*sizeof(double));
    /*double* p1 = ARMCI_Malloc_local(num*sizeof(double));*/
    if (p1==NULL) MPI_Abort(comm,1000);
    p2 = malloc(num*sizeof(double));
    /*double* p2 = ARMCI_Malloc_local(num*sizeof(double));*/
    if (p2==NULL) MPI_Abort(comm,2000);

    for ( i=0 ; i<num ; i++ ) p1[i] = 7.0;
    for ( i=0 ; i<num ; i++ ) p2[i] = 3.0;

    num_mutex = 17;
    status = GA_Create_mutexes(num_mutex);
    if (me==0) printf("%d: GA_Create_mutexes = %d\n",me,status);

/***************************************************************/
    if (me==0) {
        printf("%d: before GA_Lock\n",me);
        GA_Lock(0);
        lo[0] = 0;
        hi[0] = num-1;
        GA_Init_fence();
        NGA_Put(g_a,lo,hi,p1,ld);
        GA_Fence();
        GA_Unlock(0);
        printf("%d: after GA_Unlock\n",me);
    } 
    GA_Print(g_a);
    if (me==1) {
        printf("%d: before GA_Lock\n",me);
        GA_Lock(0);
        lo[0] = 0;
        hi[0] = num-1;
        GA_Init_fence();
        NGA_Get(g_a,lo,hi,p2,ld);
        GA_Fence();
        GA_Unlock(0);
        printf("%d: after GA_Unlock\n",me);
        for ( i=0 ; i<num ; i++ ) printf("p2[%2lu] = %20.10f\n",
                (long unsigned)i,p2[i]);
    }
/***************************************************************/



    status = GA_Destroy_mutexes();
    if (me==0) printf("%d: GA_Destroy_mutexes = %d\n",me,status);

    /*ARMCI_Free(p2);*/
    /*ARMCI_Free(p1);*/
    free(p2);
    free(p1);

    if (me==0) printf("%d: GA_Destroy\n",me);
    GA_Destroy(g_a);

    /*if (me==0) printf("%d: ARMCI_Finalize\n",me);*/
    /*ARMCI_Finalize();*/
    if (me==0) printf("%d: GA_Terminate\n",me);
    GA_Terminate();
    if (me==0) printf("%d: MPI_Finalize\n",me);
    MPI_Finalize();

    return(0);
}
Example #6
0
/* input is matrix size */
void ga_lu(double *A, int matrix_size) 
{
    int g_a, g_b, dims[2], type=C_DBL;
    int lo[2], hi[2], ld;
    int block_size[2], proc_grid[2];
    double time, gflops;
    
    /* create a 2-d GA (global matrix) */
    dims[0] = matrix_size;
    dims[1] = matrix_size;
    block_size[0] = BLOCK_SIZE;
    block_size[1] = BLOCK_SIZE;
#ifdef USE_SCALAPACK_DISTR
    proc_grid[0] = 2;
    proc_grid[1] = nprocs/2;
    if(nprocs%2) GA_Error("For ScaLAPACK stle distribution, nprocs must be "
                         " divisible by 2", 0);
#endif
    
    
#ifndef BLOCK_CYCLIC
    g_a = NGA_Create(type, 2, dims, "A", NULL);
    g_b = GA_Duplicate(g_a, "transposed array B");
#else
    g_a = GA_Create_handle();
    GA_Set_data(g_a, 2, dims, type);
    GA_Set_array_name(g_a,"A");
#  ifdef USE_SCALAPACK_DISTR
    GA_Set_block_cyclic_proc_grid(g_a, block_size, proc_grid);
#  else
    GA_Set_block_cyclic(g_a, block_size);    
#  endif
    GA_Allocate(g_a);
    
    g_b = GA_Create_handle();
    GA_Set_data(g_b, 2, dims, type);
    GA_Set_array_name(g_b,"B");
#  ifdef USE_SCALAPACK_DISTR
    GA_Set_block_cyclic_proc_grid(g_b, block_size, proc_grid);
#  else
    GA_Set_block_cyclic(g_b, block_size);
#  endif
    GA_Allocate(g_b);
    
#endif
    
    /* copy the local matrix into GA */
    if(me==0) 
    {
       lo[0] = 0;
       hi[0] = matrix_size - 1;
       lo[1] = 0;
       hi[1] = matrix_size - 1;
       ld    = matrix_size;
       
       NGA_Put(g_a, lo, hi, A, &ld);
    }
    GA_Sync();

    GA_Transpose(g_a, g_b);
    time = CLOCK_();
    GA_Lu('n', g_b);
    time = CLOCK_() - time;

    /* 2/3 N^3 - 1/2 N^2 flops for LU and 2*N^2 for solver */
    gflops = ( (((double)matrix_size) * matrix_size)/(time*1.0e+9) *
               (2.0/3.0 * (double)matrix_size - 0.5) );
    if(me==0) printf("\nGA_Lu: N=%d flops=%2.5e Gflops, time=%2.5e secs\n\n",
                     matrix_size, gflops, time);

#if DEBUG
    GA_Print(g_a);
    GA_Print(g_b);
#endif
    /* if(me==0) lu(A, matrix_size);     */

    GA_Destroy(g_a);
    GA_Destroy(g_b);
}
Example #7
0
/**
 * Evaluate offsets for each network component
 */
void setOffsets(void)
{
  // Interleave contributions from buses and branches to match matrices
  int i,j,jdx,jdx1,jdx2;
  int *i_bus_offsets = new int[p_nBuses];
  int *i_branch_offsets = new int[p_nBranches];
  for (i=0; i<p_nBuses; i++) {
    i_bus_offsets[i] = 0;
  }
  for (i=0; i<p_nBranches; i++) {
    i_branch_offsets[i] = 0;
  }
  int icnt = 0;
  int nsize;
  // Evaluate offsets for individual network components
  for (i=0; i<p_nBuses; i++) {
    if (p_network->getActiveBus(i)) {
      i_bus_offsets[i] = icnt;
      icnt += p_network->getBus(i)->vectorNumElements();
      std::vector<int> nghbrs = p_network->getConnectedBranches(i);
      nsize = nghbrs.size();
      for (j=0; j<nsize; j++) {
        // Need to avoid double counting of branches when evaluating offsets.
        // If branch is non-local and it is active, then include it in offsets.
        // Otherwise, if branch is local and bus i is equal to the "from" bus,
        // then include it in the offsets.
        jdx = nghbrs[j];
        if (isLocalBranch(jdx)) {
          p_network->getBranchEndpoints(jdx,&jdx1,&jdx2);
          if (jdx1 == i) {
            i_branch_offsets[jdx] = icnt;
            icnt += p_network->getBranch(jdx)->vectorNumElements();
          }
        } else {
          if (p_network->getActiveBranch(jdx)) {
            i_branch_offsets[jdx] = icnt;
            icnt += p_network->getBranch(jdx)->vectorNumElements();
          }
        }
      }
    }
  }
  // Total number of rows and columns from this processor have been evaluated,
  // now create buffers that can scatter individual offsets to global arrays
  int **i_bus_index = new int*[p_nBuses];
  int **i_branch_index = new int*[p_nBranches];
  int *i_bus_index_buf = new int[p_nBuses];
  int *i_branch_index_buf = new int[p_nBranches];
  int *i_bus_value_buf = new int[p_nBuses];
  int *i_branch_value_buf = new int[p_nBranches];
  int i_bus_cnt = 0;
  int i_branch_cnt = 0;
  int row_offset = p_Offsets[p_me];
  int nbus = 0;
  int nbranch = 0;
  for (i=0; i<p_nBuses; i++) {
    if (p_network->getActiveBus(i)) {
      nbus++;
      i_bus_value_buf[i_bus_cnt] = i_bus_offsets[i]+row_offset;
      i_bus_index_buf[i_bus_cnt] = p_network->getGlobalBusIndex(i);
      i_bus_index[i_bus_cnt] = &i_bus_index_buf[i_bus_cnt];
      i_bus_cnt++;
    }
  }
  for (i=0; i<p_nBranches; i++) {
    if (p_network->getActiveBranch(i)) {
      nbranch++;
      i_branch_value_buf[i_branch_cnt] = i_branch_offsets[i]+row_offset;
      i_branch_index_buf[i_branch_cnt] = p_network->getGlobalBranchIndex(i);
      i_branch_index[i_branch_cnt] = &i_branch_index_buf[i_branch_cnt];
      i_branch_cnt++;
    }
  }
  delete [] i_bus_offsets;
  delete [] i_branch_offsets;
  // Create global arrays that hold column and row offsets for all buses and
  // branches in the network. First create map array for global arrays
  int *t_busMap = new int[p_nNodes];
  int *t_branchMap = new int[p_nNodes];
  for (i=0; i<p_nNodes; i++) {
    t_busMap[i] = 0;
    t_branchMap[i] = 0;
  }
  t_busMap[p_me] = nbus;
  t_branchMap[p_me] = nbranch;
  char plus[2];
  strcpy(plus,"+");
  GA_Pgroup_igop(p_GAgrp, t_busMap, p_nNodes, plus);
  GA_Pgroup_igop(p_GAgrp, t_branchMap, p_nNodes, plus);
  int *busMap = new int[p_nNodes];
  int *branchMap = new int[p_nNodes];
  busMap[0] = 0;
  branchMap[0] = 0;
  int total_buses = t_busMap[0];
  int total_branches = t_branchMap[0];
  for (i=1; i<p_nNodes; i++) {
    busMap[i] = busMap[i-1] + t_busMap[i-1];
    total_buses += t_busMap[i];
    branchMap[i] = branchMap[i-1] + t_branchMap[i-1];
    total_branches += t_branchMap[i];
  }
  delete [] t_busMap;
  delete [] t_branchMap;

  int one = 1;
  g_bus_offsets = GA_Create_handle();
  GA_Set_data(g_bus_offsets, one, &total_buses, C_INT);
  GA_Set_irreg_distr(g_bus_offsets, busMap, &p_nNodes);
  GA_Set_pgroup(g_bus_offsets, p_GAgrp);
  if (!GA_Allocate(g_bus_offsets)) {
    char buf[256];
    sprintf(buf,"GenVectorMap::setOffsets: Unable to allocate distributed array for bus offsets\n");
    printf("%s",buf);
    throw gridpack::Exception(buf);
  }
  GA_Zero(g_bus_offsets);

  g_branch_offsets = GA_Create_handle();
  GA_Set_data(g_branch_offsets, one, &total_branches, C_INT);
  GA_Set_irreg_distr(g_branch_offsets, branchMap, &p_nNodes);
  GA_Set_pgroup(g_branch_offsets, p_GAgrp);
  if (!GA_Allocate(g_branch_offsets)) {
    char buf[256];
    sprintf(buf,"GenVectorMap::setOffsets: Unable to allocate distributed array for branch offsets\n");
    printf("%s",buf);
    throw gridpack::Exception(buf);
  }
  GA_Zero(g_branch_offsets);

  delete [] busMap;
  delete [] branchMap;

  // Scatter offsets to global arrays
  NGA_Scatter(g_bus_offsets, i_bus_value_buf, i_bus_index, i_bus_cnt);
  NGA_Scatter(g_branch_offsets, i_branch_value_buf, i_branch_index, i_branch_cnt);
  NGA_Pgroup_sync(p_GAgrp);

  delete [] i_bus_index;
  delete [] i_branch_index;

  delete [] i_bus_index_buf;
  delete [] i_branch_index_buf;
  delete [] i_bus_value_buf;
  delete [] i_branch_value_buf;
}
Example #8
0
/*
 * test ga_dgemm
 * Note: - change nummax for large arrays
 *       - turn off "dgemm_verify" for large arrays due to memory 
 *         limitations, as dgemm_verify=1 for large arrays produces 
 *         segfault, dumps core,or any crap.
 */
int main(int argc, char **argv)
{
    int num_m;
    int num_n;
    int num_k;
    int i;
    int ii;
    double *h0;
    int g_c;
    int g_b;
    int g_a;
    double a;
    double t1;
    double mf;
    double avg_t[ntrans];
    double avg_mf[ntrans];
    int itime;
    int ntimes;
    int nums_m[/*howmany*/] = {512,1024};
    int nums_n[/*howmany*/] = {512,1024};
    int nums_k[/*howmany*/] = {512,1024};
    char transa[/*ntrans*/] = "ntnt";
    char transb[/*ntrans*/] = "nntt";
    char ta;
    char tb;
    double *tmpa;
    double *tmpb;
    double *tmpc;
    int ndim;
    int dims[2];
#ifdef BLOCK_CYCLIC
    int block_size[2];
#endif

#if defined(USE_ELEMENTAL)
    // initialize Elemental (which will initialize MPI)
    ElInitialize( &argc, &argv );
    ElMPICommRank( MPI_COMM_WORLD, &me );
    ElMPICommSize( MPI_COMM_WORLD, &nproc );
    // instantiate el::global array
    ElGlobalArraysConstruct_d( &eldga );
    // initialize global arrays
    ElGlobalArraysInitialize_d( eldga );
#else
    MP_INIT(argc,argv);
    if (!MA_init(MT_DBL,1,20000000)) {
        GA_Error("failed: ma_init(MT_DBL,1,20000000)",10);
    }
    GA_INIT(argc,argv);
    me = GA_Nodeid();
#endif

    h0 = (double*)malloc(sizeof(double) * nummax*nummax);
    tmpa = (double*)malloc(sizeof(double) * nummax*nummax);
    tmpb = (double*)malloc(sizeof(double) * nummax*nummax);
    tmpc = (double*)malloc(sizeof(double) * nummax*nummax);

    ii = 0;
    for (i=0; i<nummax*nummax; i++) {
        ii = ii + 1;
        if (ii > nummax) {
            ii = 0;
        }
        h0[i] = ii;
    }

    /* Compute times assuming 500 mflops and 5 second target time */
    /* ntimes = max(3.0d0,5.0d0/(4.0d-9*num**3)); */
    ntimes = 5;

    for (ii=0; ii<howmany; ii++) {
        num_m = nums_m[ii];
        num_n = nums_n[ii];
        num_k = nums_k[ii];
        a = 0.5/(num_m*num_n);
        if (num_m > nummax || num_n > nummax || num_k > nummax) {
            GA_Error("Insufficient memory: check nummax", 1);
        }

#ifndef BLOCK_CYCLIC
        ndim = 2;

	/*
        dims[0] = num_m;
        dims[1] = num_n;
	*/
        dims[1] = num_m;
        dims[0] = num_n;

#if defined(USE_ELEMENTAL)
        ElGlobalArraysCreate_d( eldga, ndim, dims, "g_c", NULL, &g_c );
#else
        if (!((g_c = NGA_Create(MT_DBL,ndim,dims,"g_c",NULL)))) {
            GA_Error("failed: create g_c",20);
        }
#endif
	/*
        dims[0] = num_k;
        dims[1] = num_n;
	*/
        dims[1] = num_k;
        dims[0] = num_n;
#if defined(USE_ELEMENTAL)
        ElGlobalArraysCreate_d( eldga, ndim, dims, "g_b", NULL, &g_b );
#else
        if (!((g_b = NGA_Create(MT_DBL,ndim,dims,"g_b",NULL)))) {
            GA_Error("failed: create g_b",30);
        }
#endif
	/*
        dims[0] = num_m;
        dims[1] = num_k;
	*/
        dims[1] = num_m;
        dims[0] = num_k;
#if defined(USE_ELEMENTAL)
        ElGlobalArraysCreate_d( eldga, ndim, dims, "g_a", NULL, &g_a );
#else
        if (!((g_a = NGA_Create(MT_DBL,ndim,dims,"g_a",NULL)))) {
            GA_Error("failed: create g_a",40);
        }
#endif
#else
        ndim = 2;
        block_size[0] = 128;
        block_size[1] = 128;

        dims[0] = num_m;
        dims[1] = num_n;
        g_c = GA_Create_handle();
        GA_Set_data(g_c,ndim,dims,MT_DBL);
        GA_Set_array_name(g_c,"g_c");
        GA_Set_block_cyclic(g_c,block_size);
        if (!GA_Allocate(g_c)) {
            GA_Error("failed: create g_c",40);
        }

        dims[0] = num_k;
        dims[1] = num_n;
        g_b = GA_Create_handle();
        GA_Set_data(g_b,ndim,dims,MT_DBL);
        GA_Set_array_name(g_b,"g_b");
        GA_Set_block_cyclic(g_b,block_size);
        if (!ga_allocate(g_b)) {
            GA_Error("failed: create g_b",40);
        }

        dims[0] = num_m;
        dims[1] = num_k;
        g_a = GA_Create_handle();
        GA_Set_data(g_a,ndim,dims,MT_DBL);
        GA_Set_array_name(g_a,"g_a");
        GA_Set_block_cyclic(g_a,block_size);
        if (!ga_allocate(g_a)) {
            GA_Error('failed: create g_a',40);
        }
#endif         

        /* Initialize matrices A and B */
        if (me == 0) { 
            load_ga(g_a, h0, num_m, num_k);
            load_ga(g_b, h0, num_k, num_n);
        }
#if defined(USE_ELEMENTAL)
        double zero = 0.0;
        ElGlobalArraysFill_d( eldga, g_c, &zero );
	ElGlobalArraysSync_d( eldga );
#else
        GA_Zero(g_c);
        GA_Sync();
#endif
#if defined(USE_ELEMENTAL)
        if (me == 0) {
#else
        if (GA_Nodeid() == 0) {
#endif
            printf("\nMatrix Multiplication on C = A[%ld,%ld]xB[%ld,%ld]\n",
                    (long)num_m, (long)num_k, (long)num_k, (long)num_n);
            fflush(stdout);
        }

        for (i=0; i<ntrans; i++) {
            avg_t[i]  = 0.0;
            avg_mf[i] = 0.0;
        }

        for (itime=0; itime<ntimes; itime++) {
            for (i=0; i<ntrans; i++) {
#if defined(USE_ELEMENTAL)
	        ElGlobalArraysSync_d( eldga );
#else
                GA_Sync();
#endif
                ta = transa[i];
                tb = transb[i];
                t1 = MP_TIMER();
#if defined(USE_ELEMENTAL)
		ElGlobalArraysDgemm_d( eldga, ta, tb, num_m, num_n, num_k, 1.0, g_a, g_b, 0.0, g_c );
#else
                GA_Dgemm(ta,tb,num_m,num_n,num_k,1.0, g_a, g_b, 0.0, g_c);
#endif
                t1 = MP_TIMER() - t1;
#if defined(USE_ELEMENTAL)
                if (me == 0) {
#else
                if (GA_Nodeid() == 0) {
#endif
#if defined(USE_ELEMENTAL)
                    mf = 2e0*num_m*num_n*num_k/t1*1e-6/nproc;
#else
                    mf = 2e0*num_m*num_n*num_k/t1*1e-6/GA_Nnodes();
#endif
                    avg_t[i]  = avg_t[i]+t1;
                    avg_mf[i] = avg_mf[i] + mf;
                    printf("%15s%2d: %12.4f seconds %12.1f mflops/proc  %c %c\n",
                            "Run#", itime, t1, mf, ta, tb);
                    fflush(stdout);
                    if (dgemm_verify && itime == 0) {
                        /* recall the C API swaps the matrix order */
                        /* we swap it here for the Fortran-based verify */
                        verify_ga_dgemm(tb, ta, num_n, num_m, num_k, 1.0,
                                g_b, g_a, 0.0, g_c, tmpb, tmpa, tmpc);
                    }
                }
            }
        }
#if defined(USE_ELEMENTAL)
        if (me == 0) {
#else
        if (GA_Nodeid() == 0) {
#endif
            printf("\n");
            for (i=0; i<ntrans; i++) {
                printf("%17s: %12.4f seconds %12.1f mflops/proc  %c %c\n",
                        "Average", avg_t[i]/ntimes, avg_mf[i]/ntimes,
                        transa[i], transb[i]);
            }
            if(dgemm_verify) {
                printf("All GA_Dgemms are verified...O.K.\n");
            }
            fflush(stdout);
        }

        /*
           GA_Print(g_a);
           GA_Print(g_b);
           GA_Print(g_c);
           */
#if defined(USE_ELEMENTAL)
        ElGlobalArraysDestroy_d( eldga, g_a );
        ElGlobalArraysDestroy_d( eldga, g_b );
        ElGlobalArraysDestroy_d( eldga, g_c );
#else
        GA_Destroy(g_c);
        GA_Destroy(g_b);
        GA_Destroy(g_a);
#endif
    }

    /* ???
       format(a15, i2, ': ', e12.4, ' seconds ',f12.1, 
       .     ' mflops/proc ', 3a2)
       */
#if defined(USE_ELEMENTAL)
    if (me == 0) {
#else
    if (GA_Nodeid() == 0) {
#endif
        printf("All tests successful\n");
    }

    free(h0);
    free(tmpa);
    free(tmpb);
    free(tmpc);
#if defined(USE_ELEMENTAL)
    // call el::global arrays destructor
    ElGlobalArraysTerminate_d( eldga );
    ElGlobalArraysDestruct_d( eldga );
    ElFinalize();
#else
    GA_Terminate();
    MP_FINALIZE();
#endif
    return 0;
}


/*
 * Verify for correctness. Process 0 computes BLAS dgemm 
 * locally. For larger arrays, disbale this test as memory
 * might not be sufficient
 */
void verify_ga_dgemm(char xt1, char xt2, int num_m, int num_n, int num_k,
        double alpha, int g_a, int g_b, double beta, int g_c,
        double *tmpa, double *tmpb, double *tmpc)
{
    int i,j,type,ndim,dims[2],lo[2],hi[2];
    double abs_value;

    for (i=0; i<num_n; i++) {
        for (j=0; j<num_m; j++) {
            tmpc[j+i*num_m] = -1.0;
            tmpa[j+i*num_m] = -2.0;
        }
    }

#if defined(USE_ELEMENTAL)
    ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims );
#else
    NGA_Inquire(g_a, &type, &ndim, dims);
#endif
    lo[0] = 0;
    lo[1] = 0;
    hi[0] = dims[0]-1;
    hi[1] = dims[1]-1;
#if defined(USE_ELEMENTAL)
    ElGlobalArraysGet_d( eldga, g_a, lo, hi, tmpa, &dims[1] );
#else
    NGA_Get(g_a, lo, hi, tmpa, &dims[1]);
#endif

#if defined(USE_ELEMENTAL)
    ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims );
#else
    NGA_Inquire(g_a, &type, &ndim, dims);
#endif
    lo[0] = 0;
    lo[1] = 0;
    hi[0] = dims[0]-1;
    hi[1] = dims[1]-1;
#if defined(USE_ELEMENTAL)
    ElGlobalArraysGet_d( eldga, g_b, lo, hi, tmpb, &dims[1] );
#else
    NGA_Get(g_b, lo, hi, tmpb, &dims[1]);
#endif

    /* compute dgemm sequentially */
#if defined(USE_ELEMENTAL)
    cblas_dgemm ( CblasRowMajor, ( xt1 == 'n'? CblasNoTrans: CblasTrans ), 
	    ( xt2 == 'n'? CblasNoTrans: CblasTrans ), 
	    num_m /* M */, num_n /* N */, num_k /* K */, 
	    alpha, tmpa, num_m, /* lda */ 
	    tmpb, num_k, /* ldb */ beta, 
	    tmpc, num_m /* ldc */);
#else
    xb_dgemm(&xt1, &xt2, &num_m, &num_n, &num_k,
            &alpha, tmpa, &num_m,
            tmpb, &num_k, &beta,
            tmpc, &num_m);
#endif

    /* after computing c locally, verify it with the values in g_c */

#if defined(USE_ELEMENTAL)
    ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims );
#else
    NGA_Inquire(g_a, &type, &ndim, dims);
#endif
    lo[0] = 0;
    lo[1] = 0;
    hi[0] = dims[0]-1;
    hi[1] = dims[1]-1;
#if defined(USE_ELEMENTAL)
    ElGlobalArraysGet_d( eldga, g_c, lo, hi, tmpa, &dims[1] );
#else
    NGA_Get(g_c, lo, hi, tmpa, &dims[1]);
#endif

    for (i=0; i<num_n; i++) {
        for (j=0; j<num_m; j++) {
            abs_value = fabs(tmpc[j+i*num_m]-tmpa[j+i*num_m]);
            if(abs_value > 1.0 || abs_value < -1.0) {
                printf("Values are = %f %f\n",
                        tmpc[j+i*num_m], tmpa[j+i*num_m]);
                printf("Values are = %f %f\n", 
                        fabs(tmpc[j+i*num_m]-tmpa[j*i*num_m]), abs_value);
                fflush(stdout);
                GA_Error("verify ga_dgemm failed", 1);
            }
        }
    }
}

/**
 * called by process '0' (or your master process )
 */
void load_ga(int handle, double *f, int dim1, int dim2)
{
      int lo[2], hi[2];
      
      if (dim1 < 0 || dim2 < 0) {
          return;
      }

      lo[0] = 0;
      lo[1] = 0;
      hi[0] = dim1-1;
      hi[1] = dim2-1;
#if defined(USE_ELEMENTAL)
      ElGlobalArraysPut_d( eldga, handle, lo, hi, f, &dim1 );
#else
      NGA_Put(handle, lo, hi, f, &dim1);
#endif
}
Example #9
0
void do_work()
{
int g_a, g_b;
int me=GA_Nodeid(), nproc=GA_Nnodes(), proc, loop;
int dims[NDIM], lo[NDIM], hi[NDIM], block[NDIM], ld[NDIM-1];
int i,d,*proclist, offset;
int adims[NDIM], ndim,type;
typedef struct {
       int lo[NDIM];
       int hi[NDIM];
} patch_t;
patch_t *regions;
int *map;
double *buf;

     /***** create array A with default distribution  *****/
     if(me==0){printf("Creating array A\n"); fflush(stdout);}
     for(i = 0; i<NDIM; i++)dims[i] = N*(i+1);
#ifdef NEW_API
     g_a = GA_Create_handle();
     GA_Set_data(g_a,NDIM,dims,MT_F_DBL);
     GA_Set_array_name(g_a,"array A");
     (void)GA_Allocate(g_a);
#else
     g_a = NGA_Create(MT_F_DBL, NDIM, dims, "array A", NULL);
#endif
     if(!g_a) GA_Error("create failed: A",0); 
     if(me==0)printf("OK\n\n");

     /* print info about array we got */
     NGA_Inquire(g_a, &type, &ndim, adims);
     GA_Print_distribution(g_a);

     GA_Sync();
     /* duplicate array A with ga_create irreg rather than ga_duplicate
      * -- want to show distribution control 
      * -- with ga_duplicate it would be g_b=GA_Duplicate(g_a,name) 
      */
     if(me==0)printf("\nReconstructing distribution description for A\n");

     /* get memory for arrays describing distribution */
     proclist = (int*)malloc(nproc*sizeof(int));
     if(!proclist)GA_Error("malloc failed for proclist",0);
     regions = (patch_t*)malloc(nproc*sizeof(patch_t));
     if(!regions)GA_Error("malloc failed for regions",0);
     map = (int*)malloc((nproc+ndim)*sizeof(int)); /* ubound= nproc+mdim */
     if(!map)GA_Error("malloc failed for map",0);

     /* first find out how array g_a is distributed */
     for(i=0;i<ndim;i++)lo[i]=BASE;
     for(i=0;i<ndim;i++)hi[i]=adims[i] -1 + BASE;
     proc = NGA_Locate_region(g_a, lo, hi, (int*)regions, proclist);
     if(proc<1) GA_Error("error in NGA_Locate_region",proc);

     /* determine blocking for each dimension */
     for(i=0;i<ndim;i++)block[i]=0;
     for(i=0;i<ndim;i++)adims[i]=0;
     
     offset =0;
     for(d=0; d<ndim; d++)
         for(i=0;i<proc;i++)
             if( regions[i].hi[d]>adims[d] ){
                map[offset] = regions[i].lo[d];
                offset++;
                block[d]++;
                adims[d]= regions[i].hi[d];
             }
            
     if(me==0){
        printf("Distribution map contains %d elements\n",offset); 
        print_subscript("number of blocks for each dimension",ndim,block,"\n");
        print_subscript("distribution map",offset,map,"\n\n");
        fflush(stdout);
     }
     
     if(me==0)printf("Creating array B applying distribution of A\n");

#    ifdef USE_DUPLICATE
       g_b = GA_Duplicate(g_a,"array B");
#    else
       g_b = NGA_Create_irreg(MT_F_DBL, NDIM, dims, "array B", block,map);
#    endif
     if(!g_b) GA_Error("create failed: B",0); 
     if(me==0)printf("OK\n\n");
     free(proclist); free(regions); free(map);
     
     GA_Print_distribution(g_b);

     GA_Sync();

     if(me==0){
       printf("\nCompare distributions of A and B\n");
       if(GA_Compare_distr(g_a,g_b))
          printf("Failure: distributions NOT identical\n");
       else 
          printf("Success: distributions identical\n");
       fflush(stdout);
     }
       

     if(me==0){
        printf("\nAccessing local elements of A: set them to the owner process id\n");
        fflush(stdout);
     }
     GA_Sync();

     NGA_Distribution(g_a,me,lo,hi);

     if(hi[0]>=0){/* -1 means no elements stored on this processor */
         double *ptr;
         int locdim[NDIM];
         NGA_Access(g_a, lo,hi, &ptr, ld);
         for(i=0;i<ndim;i++)locdim[i]=hi[i]-lo[i]+1;
         fill_patch(ptr, locdim, ld, ndim,(double)me);
     }

     for(i=0;i<nproc; i++){
       if(me==i && hi[0]>=0){
         char msg[100];
         sprintf(msg,"%d: leading dimensions",me);
         print_subscript(msg,ndim-1,ld,"\n");
         fflush(stdout);
       }
       GA_Sync();
     }
     
     GA_Sync();
     if(me==0)printf("\nRandomly checking the update using ga_get on array sections\n");
     GA_Sync();

     /* show ga_get working and verify array updates 
      * every process does N random gets
      * for simplicity get only a single row at a time
      */
     srand(me); /* different seed for every process */
     hi[ndim-1]=adims[ndim-1] -1 + BASE;
     for(i=1;i<ndim-1; i++)ld[i]=1; ld[ndim-2]=adims[ndim-1] -1 + BASE;

     /* get buffer memory */
     buf = (double*)malloc(adims[ndim-1]*sizeof(double));
     if(!buf)GA_Error("malloc failed for buf",0);

     /* half of the processes check the result */
     if(me<=nproc/2) 
     for(loop = 0; loop< N; loop++){ /* task parallel loop */
         lo[ndim-1]=BASE;
         for (i= 0; i < ndim -1; i ++){
              lo[i] = hi[i] = rand()%adims[i]+BASE; 
         }

         /* print_subscript("getting",ndim,lo,"\n");*/
         NGA_Get(g_a,lo,hi,buf,ld); 
         
         /* check values */
         for(i=0;i<adims[ndim-1]; i++){
             int p = NGA_Locate(g_a, lo);
             if((double)p != buf[i]) {
                char msg[100];
                sprintf(msg,"%d: wrong value: %d != %lf a",me, p, buf[i]);
                print_subscript(msg,ndim,lo,"\n");
                GA_Error("Error - bye",i);  
             }
             lo[ndim-1]++;
          }
     }
             
     free(buf);
     GA_Sync();
           
     if(me==0)printf("OK\n");
     
     GA_Destroy(g_a);
     GA_Destroy(g_b);
}