// ------------------------------------------------------------- // CreateMatGA // ------------------------------------------------------------- static PetscErrorCode CreateMatGA(int pgroup, int lrows, int lcols, int grows, int gcols, int *ga) { PetscErrorCode ierr = 0; /* Try to honor local ownership request (of rows). */ int nprocs = GA_Pgroup_nnodes(pgroup); int me = GA_Pgroup_nodeid(pgroup); int tmapc[nprocs+1]; int mapc[nprocs+1]; int i; for (i = 0; i < nprocs+1; i++) tmapc[i] = 0; tmapc[me] = lrows; GA_Pgroup_igop(pgroup, tmapc, nprocs+1, "+"); mapc[0] = 0; for (i = 1; i < nprocs; i++) mapc[i] = mapc[i-1]+tmapc[i-1]; mapc[nprocs] = 0; int dims[2] = {grows, gcols}; int blocks[2] = { nprocs, 1 }; *ga = GA_Create_handle(); GA_Set_data(*ga, 2, dims, MT_PETSC_SCALAR); GA_Set_irreg_distr(*ga, mapc, blocks); GA_Set_pgroup(*ga, pgroup); if (!GA_Allocate(*ga)) { ierr = 1; } PetscScalar z(0.0); GA_Fill(*ga, &z); return ierr; }
// ------------------------------------------------------------- // AdjacencyList::ready // ------------------------------------------------------------- void AdjacencyList::ready(void) { #if 1 int grp = this->communicator().getGroup(); int me = GA_Pgroup_nodeid(grp); int nprocs = GA_Pgroup_nnodes(grp); p_adjacency.clear(); p_adjacency.resize(p_global_nodes.size()); // Find total number of nodes and edges. Assume no duplicates int nedges = p_edges.size(); int total_edges = nedges; char plus[2]; strcpy(plus,"+"); GA_Pgroup_igop(grp,&total_edges, 1, plus); int nnodes = p_original_nodes.size(); int total_nodes = nnodes; GA_Pgroup_igop(grp,&total_nodes, 1, plus); // Create a global array containing original indices of all nodes and indexed // by the global index of the node int i, p; int dist[nprocs]; for (p=0; p<nprocs; p++) { dist[p] = 0; } dist[me] = nnodes; GA_Pgroup_igop(grp,dist,nprocs,plus); int *mapc = new int[nprocs+1]; mapc[0] = 0; for (p=1; p<nprocs; p++) { mapc[p] = mapc[p-1] + dist[p-1]; } mapc[nprocs] = total_nodes; int g_nodes = GA_Create_handle(); int dims = total_nodes; NGA_Set_data(g_nodes,1,&dims,C_INT); NGA_Set_pgroup(g_nodes, grp); if (!GA_Allocate(g_nodes)) { char buf[256]; sprintf(buf,"AdjacencyList::ready: Unable to allocate distributed array" " for bus indices\n"); printf(buf); throw gridpack::Exception(buf); } int lo, hi; lo = mapc[me]; hi = mapc[me+1]-1; int size = hi - lo + 1; int o_idx[size], g_idx[size]; for (i=0; i<size; i++) o_idx[i] = p_original_nodes[i]; for (i=0; i<size; i++) g_idx[i] = p_global_nodes[i]; int **indices= new int*[size]; int *iptr = g_idx; for (i=0; i<size; i++) { indices[i] = iptr; iptr++; } if (size > 0) NGA_Scatter(g_nodes,o_idx,indices,size); GA_Pgroup_sync(grp); delete [] indices; delete [] mapc; // Cycle through all nodes and match them up with nodes at end of edges. for (p=0; p<nprocs; p++) { int iproc = (me+p)%nprocs; // Get node data from process iproc NGA_Distribution(g_nodes,iproc,&lo,&hi); size = hi - lo + 1; if (size <= 0) continue; int *buf = new int[size]; int ld = 1; NGA_Get(g_nodes,&lo,&hi,buf,&ld); // Create a map of the nodes from process p std::map<int,int> nmap; std::map<int,int>::iterator it; std::pair<int,int> pr; for (i=lo; i<=hi; i++){ pr = std::pair<int,int>(buf[i-lo],i); nmap.insert(pr); } delete [] buf; // scan through the edges looking for matches. If there is a match, set the // global index int idx; for (i=0; i<nedges; i++) { idx = static_cast<int>(p_edges[i].original_conn.first); it = nmap.find(idx); if (it != nmap.end()) { p_edges[i].global_conn.first = static_cast<Index>(it->second); } idx = static_cast<int>(p_edges[i].original_conn.second); it = nmap.find(idx); if (it != nmap.end()) { p_edges[i].global_conn.second = static_cast<Index>(it->second); } } } GA_Destroy(g_nodes); // All edges now have global indices assigned to them. Begin constructing // adjacency list. Start by creating a global array containing all edges dist[0] = 0; for (p=1; p<nprocs; p++) { double max = static_cast<double>(total_edges); max = (static_cast<double>(p))*(max/(static_cast<double>(nprocs))); dist[p] = 2*(static_cast<int>(max)); } int g_edges = GA_Create_handle(); dims = 2*total_edges; NGA_Set_data(g_edges,1,&dims,C_INT); NGA_Set_irreg_distr(g_edges,dist,&nprocs); NGA_Set_pgroup(g_edges, grp); if (!GA_Allocate(g_edges)) { char buf[256]; sprintf(buf,"AdjacencyList::ready: Unable to allocate distributed array" " for branch indices\n"); printf(buf); throw gridpack::Exception(buf); } // Add edge information to global array. Start by figuring out how much data // is associated with each process for (p=0; p<nprocs; p++) { dist[p] = 0; } dist[me] = nedges; GA_Pgroup_igop(grp,dist, nprocs, plus); int offset[nprocs]; offset[0] = 0; for (p=1; p<nprocs; p++) { offset[p] = offset[p-1] + 2*dist[p-1]; } // Figure out where local data goes in GA and then copy it to GA lo = offset[me]; hi = lo + 2*nedges - 1; int edge_ids[2*nedges]; for (i=0; i<nedges; i++) { edge_ids[2*i] = static_cast<int>(p_edges[i].global_conn.first); edge_ids[2*i+1] = static_cast<int>(p_edges[i].global_conn.second); } if (lo <= hi) { int ld = 1; NGA_Put(g_edges,&lo,&hi,edge_ids,&ld); } GA_Pgroup_sync(grp); // Cycle through all edges and find out how many are attached to the nodes on // your process. Start by creating a map between the global node indices and // the local node indices std::map<int,int> gmap; std::map<int,int>::iterator it; std::pair<int,int> pr; for (i=0; i<nnodes; i++){ pr = std::pair<int,int>(static_cast<int>(p_global_nodes[i]),i); gmap.insert(pr); } // Cycle through edge information on each processor for (p=0; p<nprocs; p++) { int iproc = (me+p)%nprocs; NGA_Distribution(g_edges,iproc,&lo,&hi); int size = hi - lo + 1; int *buf = new int[size]; int ld = 1; NGA_Get(g_edges,&lo,&hi,buf,&ld); BOOST_ASSERT(size%2 == 0); size = size/2; int idx1, idx2; Index idx; for (i=0; i<size; i++) { idx1 = buf[2*i]; idx2 = buf[2*i+1]; it = gmap.find(idx1); if (it != gmap.end()) { idx = static_cast<Index>(idx2); p_adjacency[it->second].push_back(idx); } it = gmap.find(idx2); if (it != gmap.end()) { idx = static_cast<Index>(idx1); p_adjacency[it->second].push_back(idx); } } delete [] buf; } GA_Destroy(g_edges); GA_Pgroup_sync(grp); #else int me(this->processor_rank()); int nproc(this->processor_size()); p_adjacency.clear(); p_adjacency.resize(p_nodes.size()); IndexVector current_indexes; IndexVector connected_indexes; for (int p = 0; p < nproc; ++p) { // broadcast the node indexes owned by process p to all processes, // all processes work on these at once current_indexes.clear(); if (me == p) { std::copy(p_nodes.begin(), p_nodes.end(), std::back_inserter(current_indexes)); // std::cout << me << ": node indexes: "; // std::copy(current_indexes.begin(), current_indexes.end(), // std::ostream_iterator<Index>(std::cout, ",")); // std::cout << std::endl; } boost::mpi::broadcast(this->communicator(), current_indexes, p); // make a copy of the local edges in a list (so it's easier to // remove those completely accounted for) std::list<p_Edge> tmpedges; std::copy(p_edges.begin(), p_edges.end(), std::back_inserter(tmpedges)); // loop over the process p's node index set int local_index(0); for (IndexVector::iterator n = current_indexes.begin(); n != current_indexes.end(); ++n, ++local_index) { // determine the local edges that refer to the current node index connected_indexes.clear(); std::list<p_Edge>::iterator e(tmpedges.begin()); // std::cout << me << ": current node index: " << *n // << ", edges: " << tmpedges.size() // << std::endl; while (e != tmpedges.end()) { if (*n == e->conn.first && e->conn.second != bogus) { connected_indexes.push_back(e->conn.second); e->found.first = true; // std::cout << me << ": found connection: edge " << e->index // << " (" << e->conn.first << ", " << e->conn.second << ")" // << std::endl; } if (*n == e->conn.second && e->conn.first != bogus) { connected_indexes.push_back(e->conn.first); e->found.second = true; // std::cout << me << ": found connection: edge " << e->index // << " (" << e->conn.first << ", " << e->conn.second << ")" // << std::endl; } if (e->found.first && e->found.second) { e = tmpedges.erase(e); } else if (e->conn.first == bogus || e->conn.second == bogus) { e = tmpedges.erase(e); } else { ++e; } } // gather all connections for the current node index to the // node's owner process, we have to gather the vectors because // processes will have different numbers of connections if (me == p) { size_t allsize; boost::mpi::reduce(this->communicator(), connected_indexes.size(), allsize, std::plus<size_t>(), p); std::vector<IndexVector> all_connected_indexes; boost::mpi::gather(this->communicator(), connected_indexes, all_connected_indexes, p); p_adjacency[local_index].clear(); for (std::vector<IndexVector>::iterator k = all_connected_indexes.begin(); k != all_connected_indexes.end(); ++k) { std::copy(k->begin(), k->end(), std::back_inserter(p_adjacency[local_index])); } } else { boost::mpi::reduce(this->communicator(), connected_indexes.size(), std::plus<size_t>(), p); boost::mpi::gather(this->communicator(), connected_indexes, p); } this->communicator().barrier(); } this->communicator().barrier(); } #endif }
void test(int data_type) { int me=GA_Nodeid(); int nproc = GA_Nnodes(); int g_a, g_b, g_c; int ndim = 2; int dims[2]={N,N}; int lo[2]={0,0}; int hi[2]={N-1,N-1}; int block_size[2]={NB,NB-1}; int proc_grid[2]; int i,j,l,k,m,n, ld; double alpha_dbl = 1.0, beta_dbl = 0.0; double dzero = 0.0; double ddiff; float alpha_flt = 1.0, beta_flt = 0.0; float fzero = 0.0; float fdiff; float ftmp; double dtmp; SingleComplex ctmp; DoubleComplex ztmp; DoubleComplex alpha_dcpl = {1.0, 0.0} , beta_dcpl = {0.0, 0.0}; DoubleComplex zzero = {0.0,0.0}; DoubleComplex zdiff; SingleComplex alpha_scpl = {1.0, 0.0} , beta_scpl = {0.0, 0.0}; SingleComplex czero = {0.0,0.0}; SingleComplex cdiff; void *alpha=NULL, *beta=NULL; void *abuf=NULL, *bbuf=NULL, *cbuf=NULL, *c_ptr=NULL; switch (data_type) { case C_FLOAT: alpha = (void *)&alpha_flt; beta = (void *)&beta_flt; abuf = (void*)malloc(N*N*sizeof(float)); bbuf = (void*)malloc(N*N*sizeof(float)); cbuf = (void*)malloc(N*N*sizeof(float)); if(me==0) printf("Single Precision: Testing GA_Sgemm,NGA_Matmul_patch for %d-Dimension", ndim); break; case C_DBL: alpha = (void *)&alpha_dbl; beta = (void *)&beta_dbl; abuf = (void*)malloc(N*N*sizeof(double)); bbuf = (void*)malloc(N*N*sizeof(double)); cbuf = (void*)malloc(N*N*sizeof(double)); if(me==0) printf("Double Precision: Testing GA_Dgemm,NGA_Matmul_patch for %d-Dimension", ndim); break; case C_DCPL: alpha = (void *)&alpha_dcpl; beta = (void *)&beta_dcpl; abuf = (void*)malloc(N*N*sizeof(DoubleComplex)); bbuf = (void*)malloc(N*N*sizeof(DoubleComplex)); cbuf = (void*)malloc(N*N*sizeof(DoubleComplex)); if(me==0) printf("Double Complex: Testing GA_Zgemm,NGA_Matmul_patch for %d-Dimension", ndim); break; case C_SCPL: alpha = (void *)&alpha_scpl; beta = (void *)&beta_scpl; abuf = (void*)malloc(N*N*sizeof(SingleComplex)); bbuf = (void*)malloc(N*N*sizeof(SingleComplex)); cbuf = (void*)malloc(N*N*sizeof(SingleComplex)); if(me==0) printf("Single Complex: Testing GA_Cgemm,NGA_Matmul_patch for %d-Dimension", ndim); break; default: GA_Error("wrong data type", data_type); } if (me==0) printf("\nCreate A, B, C\n"); #ifdef USE_REGULAR g_a = NGA_Create(data_type, ndim, dims, "array A", NULL); #endif #ifdef USE_SIMPLE_CYCLIC g_a = NGA_Create_handle(); NGA_Set_data(g_a,ndim,dims,data_type); NGA_Set_array_name(g_a,"array A"); NGA_Set_block_cyclic(g_a,block_size); if (!GA_Allocate(g_a)) { GA_Error("Failed: create: g_a",40); } #endif #ifdef USE_SCALAPACK g_a = NGA_Create_handle(); NGA_Set_data(g_a,ndim,dims,data_type); NGA_Set_array_name(g_a,"array A"); grid_factor(nproc,&i,&j); proc_grid[0] = i; proc_grid[1] = j; NGA_Set_block_cyclic_proc_grid(g_a,block_size,proc_grid); if (!GA_Allocate(g_a)) { GA_Error("Failed: create: g_a",40); } #endif #ifdef USE_TILED g_a = NGA_Create_handle(); NGA_Set_data(g_a,ndim,dims,data_type); NGA_Set_array_name(g_a,"array A"); grid_factor(nproc,&i,&j); proc_grid[0] = i; proc_grid[1] = j; NGA_Set_tiled_proc_grid(g_a,block_size,proc_grid); if (!GA_Allocate(g_a)) { GA_Error("Failed: create: g_a",40); } #endif g_b = GA_Duplicate(g_a, "array B"); g_c = GA_Duplicate(g_a, "array C"); if(!g_a || !g_b || !g_c) GA_Error("Create failed: a, b or c",1); ld = N; if (me==0) printf("\nInitialize A\n"); /* Set up matrix A */ if (me == 0) { for (i=0; i<N; i++) { for (j=0; j<N; j++) { switch (data_type) { case C_FLOAT: ((float*)abuf)[i*N+j] = (float)(i*N+j); break; case C_DBL: ((double*)abuf)[i*N+j] = (double)(i*N+j); break; case C_DCPL: ((DoubleComplex*)abuf)[i*N+j].real = (double)(i*N+j); ((DoubleComplex*)abuf)[i*N+j].imag = 1.0; break; case C_SCPL: ((SingleComplex*)abuf)[i*N+j].real = (float)(i*N+j); ((SingleComplex*)abuf)[i*N+j].imag = 1.0; break; default: GA_Error("wrong data type", data_type); } } } NGA_Put(g_a,lo,hi,abuf,&ld); } GA_Sync(); if (me==0) printf("\nInitialize B\n"); /* Set up matrix B */ if (me == 0) { for (i=0; i<N; i++) { for (j=0; j<N; j++) { switch (data_type) { case C_FLOAT: ((float*)bbuf)[i*N+j] = (float)(j*N+i); break; case C_DBL: ((double*)bbuf)[i*N+j] = (double)(j*N+i); break; case C_DCPL: ((DoubleComplex*)bbuf)[i*N+j].real = (double)(j*N+i); ((DoubleComplex*)bbuf)[i*N+j].imag = 1.0; break; case C_SCPL: ((SingleComplex*)bbuf)[i*N+j].real = (float)(j*N+i); ((SingleComplex*)bbuf)[i*N+j].imag = 1.0; break; default: GA_Error("wrong data type", data_type); } } } NGA_Put(g_b,lo,hi,bbuf,&ld); } GA_Sync(); if (me==0) printf("\nPerform matrix multiply\n"); switch (data_type) { case C_FLOAT: NGA_Matmul_patch('N','N',&alpha_flt,&beta_flt,g_a,lo,hi, g_b,lo,hi,g_c,lo,hi); break; case C_DBL: NGA_Matmul_patch('N','N',&alpha_dbl,&beta_dbl,g_a,lo,hi, g_b,lo,hi,g_c,lo,hi); break; case C_SCPL: NGA_Matmul_patch('N','N',&alpha_scpl,&beta_scpl,g_a,lo,hi, g_b,lo,hi,g_c,lo,hi); break; case C_DCPL: NGA_Matmul_patch('N','N',&alpha_dcpl,&beta_dcpl,g_a,lo,hi, g_b,lo,hi,g_c,lo,hi); break; default: GA_Error("wrong data type", data_type); } GA_Sync(); #if 0 if (me==0) printf("\nCheck answer\n"); /* GA_Print(g_a); if (me == 0) printf("\n\n\n\n"); GA_Print(g_b); if (me == 0) printf("\n\n\n\n"); GA_Print(g_c); */ /* Check answer */ NGA_Get(g_a,lo,hi,abuf,&ld); NGA_Get(g_b,lo,hi,bbuf,&ld); for (i=0; i<N; i++) { for (j=0; j<N; j++) { switch (data_type) { case C_FLOAT: ((float*)cbuf)[i*N+j] = fzero; break; case C_DBL: ((double*)cbuf)[i*N+j] = dzero; break; case C_DCPL: ((DoubleComplex*)cbuf)[i*N+j] = zzero; break; case C_SCPL: ((SingleComplex*)cbuf)[i*N+j] = czero; break; default: GA_Error("wrong data type", data_type); } for (k=0; k<N; k++) { switch (data_type) { case C_FLOAT: ((float*)cbuf)[i*N+j] += ((float*)abuf)[i*N+k] *((float*)bbuf)[k*N+j]; break; case C_DBL: ((double*)cbuf)[i*N+j] += ((double*)abuf)[i*N+k] *((double*)bbuf)[k*N+j]; break; case C_DCPL: ((DoubleComplex*)cbuf)[i*N+j].real += (((DoubleComplex*)abuf)[i*N+k].real *((DoubleComplex*)bbuf)[k*N+j].real -(((DoubleComplex*)abuf)[i*N+k].imag *((DoubleComplex*)bbuf)[k*N+j].imag)); ((DoubleComplex*)cbuf)[i*N+j].imag += (((DoubleComplex*)abuf)[i*N+k].real *((DoubleComplex*)bbuf)[k*N+j].imag +(((DoubleComplex*)abuf)[i*N+k].imag *((DoubleComplex*)bbuf)[k*N+j].real)); break; case C_SCPL: ((SingleComplex*)cbuf)[i*N+j].real += (((SingleComplex*)abuf)[i*N+k].real *((SingleComplex*)bbuf)[k*N+j].real -(((SingleComplex*)abuf)[i*N+k].imag *((SingleComplex*)bbuf)[k*N+j].imag)); ((SingleComplex*)cbuf)[i*N+j].imag += (((SingleComplex*)abuf)[i*N+k].real *((SingleComplex*)bbuf)[k*N+j].imag +(((SingleComplex*)abuf)[i*N+k].imag *((SingleComplex*)bbuf)[k*N+j].real)); break; default: GA_Error("wrong data type", data_type); } } } } GA_Sync(); if (me == 0) { NGA_Get(g_c,lo,hi,abuf,&ld); for (i=0; i<N; i++) { for (j=0; j<N; j++) { switch (data_type) { case C_FLOAT: fdiff = ((float*)abuf)[i*N+j]-((float*)cbuf)[i*N+j]; if (((float*)abuf)[i*N+j] != 0.0) { fdiff /= ((float*)abuf)[i*N+j]; } if (fabs(fdiff) > TOLERANCE) { printf("p[%d] [%d,%d] Actual: %f Expected: %f\n",me,i,j, ((float*)abuf)[i*N+j],((float*)cbuf)[i*N+j]); } break; case C_DBL: ddiff = ((double*)abuf)[i*N+j]-((double*)cbuf)[i*N+j]; if (((double*)abuf)[i*N+j] != 0.0) { ddiff /= ((double*)abuf)[i*N+j]; } if (fabs(ddiff) > TOLERANCE) { printf("p[%d] [%d,%d] Actual: %f Expected: %f\n",me,i,j, ((double*)abuf)[i*N+j],((double*)cbuf)[i*N+j]); } break; case C_DCPL: zdiff.real = ((DoubleComplex*)abuf)[i*N+j].real -((DoubleComplex*)cbuf)[i*N+j].real; zdiff.imag = ((DoubleComplex*)abuf)[i*N+j].imag -((DoubleComplex*)cbuf)[i*N+j].imag; if (((DoubleComplex*)abuf)[i*N+j].real != 0.0 || ((DoubleComplex*)abuf)[i*N+j].imag != 0.0) { ztmp = ((DoubleComplex*)abuf)[i*N+j]; ddiff = sqrt((zdiff.real*zdiff.real+zdiff.imag*zdiff.imag) /(ztmp.real*ztmp.real+ztmp.imag*ztmp.imag)); } else { ddiff = sqrt(zdiff.real*zdiff.real+zdiff.imag*zdiff.imag); } if (fabs(ddiff) > TOLERANCE) { printf("p[%d] [%d,%d] Actual: (%f,%f) Expected: (%f,%f)\n",me,i,j, ((DoubleComplex*)abuf)[i*N+j].real, ((DoubleComplex*)abuf)[i*N+j].imag, ((DoubleComplex*)cbuf)[i*N+j].real, ((DoubleComplex*)cbuf)[i*N+j].imag); } break; case C_SCPL: cdiff.real = ((SingleComplex*)abuf)[i*N+j].real -((SingleComplex*)cbuf)[i*N+j].real; cdiff.imag = ((SingleComplex*)abuf)[i*N+j].imag -((SingleComplex*)cbuf)[i*N+j].imag; if (((SingleComplex*)abuf)[i*N+j].real != 0.0 || ((SingleComplex*)abuf)[i*N+j].imag != 0.0) { ctmp = ((SingleComplex*)abuf)[i*N+j]; fdiff = sqrt((cdiff.real*cdiff.real+cdiff.imag*cdiff.imag) /(ctmp.real*ctmp.real+ctmp.imag*ctmp.imag)); } else { fdiff = sqrt(cdiff.real*cdiff.real+cdiff.imag*cdiff.imag); } if (fabs(fdiff) > TOLERANCE) { printf("p[%d] [%d,%d] Actual: (%f,%f) Expected: (%f,%f)\n",me,i,j, ((SingleComplex*)abuf)[i*N+j].real, ((SingleComplex*)abuf)[i*N+j].imag, ((SingleComplex*)cbuf)[i*N+j].real, ((SingleComplex*)cbuf)[i*N+j].imag); } break; default: GA_Error("wrong data type", data_type); } } } } GA_Sync(); /* copy cbuf back to g_a */ if (me == 0) { NGA_Put(g_a,lo,hi,cbuf,&ld); } GA_Sync(); /* Get norm of g_a */ switch (data_type) { case C_FLOAT: ftmp = GA_Fdot(g_a,g_a); break; case C_DBL: dtmp = GA_Ddot(g_a,g_a); break; case C_DCPL: ztmp = GA_Zdot(g_a,g_a); break; case C_SCPL: ctmp = GA_Cdot(g_a,g_a); break; default: GA_Error("wrong data type", data_type); } /* subtract C from A and put the results in B */ beta_flt = -1.0; beta_dbl = -1.0; beta_scpl.real = -1.0; beta_dcpl.real = -1.0; GA_Zero(g_b); GA_Add(alpha,g_a,beta,g_c,g_b); /* evaluate the norm of the difference between the two matrices */ switch (data_type) { case C_FLOAT: fdiff = GA_Fdot(g_b, g_b); if (ftmp != 0.0) { fdiff /= ftmp; } if(fabs(fdiff) > TOLERANCE) { printf("\nabs(result) = %f > %f\n", fabsf(fdiff), TOLERANCE); GA_Error("GA_Sgemm Failed", 1); } else if (me == 0) { printf("\nGA_Sgemm OK\n\n"); } break; case C_DBL: ddiff = GA_Ddot(g_b, g_b); if (dtmp != 0.0) { ddiff /= dtmp; } if(fabs(ddiff) > TOLERANCE) { printf("\nabs(result) = %f > %f\n", fabsf(ddiff), TOLERANCE); GA_Error("GA_Dgemm Failed", 1); } else if (me == 0) { printf("\nGA_Dgemm OK\n\n"); } break; case C_DCPL: zdiff = GA_Zdot(g_b, g_b); if (ztmp.real != 0.0 || ztmp.imag != 0.0) { ddiff = sqrt((zdiff.real*zdiff.real+zdiff.imag*zdiff.imag) /(ztmp.real*ztmp.real+ztmp.imag*ztmp.imag)); } else { ddiff = sqrt(zdiff.real*zdiff.real+zdiff.imag*zdiff.imag); } if(fabs(ddiff) > TOLERANCE) { printf("\nabs(result) = %f > %f\n", fabsf(zdiff.real), TOLERANCE); GA_Error("GA_Zgemm Failed", 1); } else if (me == 0) { printf("\nGA_Zgemm OK\n\n"); } break; case C_SCPL: cdiff = GA_Cdot(g_b, g_b); if (ctmp.real != 0.0 || ctmp.imag != 0.0) { fdiff = sqrt((cdiff.real*cdiff.real+cdiff.imag*cdiff.imag) /(ctmp.real*ctmp.real+ctmp.imag*ctmp.imag)); } else { fdiff = sqrt(cdiff.real*cdiff.real+cdiff.imag*cdiff.imag); } if(fabs(fdiff) > TOLERANCE) { printf("\nabs(result) = %f > %f\n", fabsf(cdiff.real), TOLERANCE); GA_Error("GA_Cgemm Failed", 1); } else if (me == 0) { printf("\nGA_Cgemm OK\n\n"); } break; default: GA_Error("wrong data type", data_type); } #endif free(abuf); free(bbuf); free(cbuf); switch (data_type) { case C_FLOAT: abuf = (void*)malloc(N*N*sizeof(float)/4); bbuf = (void*)malloc(N*N*sizeof(float)/4); cbuf = (void*)malloc(N*N*sizeof(float)/4); break; case C_DBL: abuf = (void*)malloc(N*N*sizeof(double)/4); bbuf = (void*)malloc(N*N*sizeof(double)/4); cbuf = (void*)malloc(N*N*sizeof(double)/4); break; case C_DCPL: abuf = (void*)malloc(N*N*sizeof(DoubleComplex)/4); bbuf = (void*)malloc(N*N*sizeof(DoubleComplex)/4); cbuf = (void*)malloc(N*N*sizeof(DoubleComplex)/4); break; case C_SCPL: abuf = (void*)malloc(N*N*sizeof(SingleComplex)/4); bbuf = (void*)malloc(N*N*sizeof(SingleComplex)/4); cbuf = (void*)malloc(N*N*sizeof(SingleComplex)/4); break; default: GA_Error("wrong data type", data_type); } /* Test multiply on a fraction of matrix. Start by reinitializing * A and B */ GA_Zero(g_a); GA_Zero(g_b); GA_Zero(g_c); if (me==0) printf("\nTest patch multiply\n"); lo[0] = N/4; lo[1] = N/4; hi[0] = 3*N/4-1; hi[1] = 3*N/4-1; ld = N/2; /* Set up matrix A */ if (me==0) printf("\nInitialize A\n"); if (me == 0) { for (i=N/4; i<3*N/4; i++) { for (j=N/4; j<3*N/4; j++) { switch (data_type) { case C_FLOAT: ((float*)abuf)[(i-N/4)*N/2+(j-N/4)] = (float)(i*N+j); break; case C_DBL: ((double*)abuf)[(i-N/4)*N/2+(j-N/4)] = (double)(i*N+j); break; case C_DCPL: ((DoubleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].real = (double)(i*N+j); ((DoubleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0; break; case C_SCPL: ((SingleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].real = (float)(i*N+j); ((SingleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0; break; default: GA_Error("wrong data type", data_type); } } } NGA_Put(g_a,lo,hi,abuf,&ld); } GA_Sync(); if (me==0) printf("\nInitialize B\n"); /* Set up matrix B */ if (me == 0) { for (i=N/4; i<3*N/4; i++) { for (j=N/4; j<3*N/4; j++) { switch (data_type) { case C_FLOAT: ((float*)bbuf)[(i-N/4)*N/2+(j-N/4)] = (float)(j*N+i); break; case C_DBL: ((double*)bbuf)[(i-N/4)*N/2+(j-N/4)] = (double)(j*N+i); break; case C_DCPL: ((DoubleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].real = (double)(j*N+i); ((DoubleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0; break; case C_SCPL: ((SingleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].real = (float)(j*N+i); ((SingleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0; break; default: GA_Error("wrong data type", data_type); } } } NGA_Put(g_b,lo,hi,bbuf,&ld); } GA_Sync(); beta_flt = 0.0; beta_dbl = 0.0; beta_scpl.real = 0.0; beta_dcpl.real = 0.0; if (me==0) printf("\nPerform matrix multiply on sub-blocks\n"); switch (data_type) { case C_FLOAT: NGA_Matmul_patch('N','N',&alpha_flt,&beta_flt,g_a,lo,hi, g_b,lo,hi,g_c,lo,hi); break; case C_DBL: NGA_Matmul_patch('N','N',&alpha_dbl,&beta_dbl,g_a,lo,hi, g_b,lo,hi,g_c,lo,hi); break; case C_SCPL: NGA_Matmul_patch('N','N',&alpha_scpl,&beta_scpl,g_a,lo,hi, g_b,lo,hi,g_c,lo,hi); break; case C_DCPL: NGA_Matmul_patch('N','N',&alpha_dcpl,&beta_dcpl,g_a,lo,hi, g_b,lo,hi,g_c,lo,hi); break; default: GA_Error("wrong data type", data_type); } GA_Sync(); #if 0 if (0) { /* if (data_type != C_SCPL && data_type != C_DCPL) { */ if (me==0) printf("\nCheck answer\n"); /* Multiply buffers by hand */ if (me == 0) { for (i=0; i<N/2; i++) { for (j=0; j<N/2; j++) { switch (data_type) { case C_FLOAT: ((float*)cbuf)[i*N/2+j] = fzero; break; case C_DBL: ((double*)cbuf)[i*N/2+j] = dzero; break; case C_DCPL: ((DoubleComplex*)cbuf)[i*N/2+j] = zzero; break; case C_SCPL: ((SingleComplex*)cbuf)[i*N/2+j] = czero; break; default: GA_Error("wrong data type", data_type); } for (k=0; k<N/2; k++) { switch (data_type) { case C_FLOAT: ((float*)cbuf)[i*N/2+j] += ((float*)abuf)[i*N/2+k] *((float*)bbuf)[k*N/2+j]; break; case C_DBL: ((double*)cbuf)[i*N/2+j] += ((double*)abuf)[i*N/2+k] *((double*)bbuf)[k*N/2+j]; break; case C_DCPL: ((DoubleComplex*)cbuf)[i*N/2+j].real += (((DoubleComplex*)abuf)[i*N/2+k].real *((DoubleComplex*)bbuf)[k*N/2+j].real -(((DoubleComplex*)abuf)[i*N/2+k].imag *((DoubleComplex*)bbuf)[k*N/2+j].imag)); ((DoubleComplex*)cbuf)[i*N/2+j].imag += (((DoubleComplex*)abuf)[i*N/2+k].real *((DoubleComplex*)bbuf)[k*N/2+j].imag +(((DoubleComplex*)abuf)[i*N/2+k].imag *((DoubleComplex*)bbuf)[k*N/2+j].real)); break; case C_SCPL: ((SingleComplex*)cbuf)[i*N/2+j].real += (((SingleComplex*)abuf)[i*N/2+k].real *((SingleComplex*)bbuf)[k*N/2+j].real -(((SingleComplex*)abuf)[i*N/2+k].imag *((SingleComplex*)bbuf)[k*N/2+j].imag)); ((SingleComplex*)cbuf)[i*N/2+j].imag += (((SingleComplex*)abuf)[i*N/2+k].real *((SingleComplex*)bbuf)[k*N/2+j].imag +(((SingleComplex*)abuf)[i*N/2+k].imag *((SingleComplex*)bbuf)[k*N/2+j].real)); break; default: GA_Error("wrong data type", data_type); } } } } NGA_Put(g_a,lo,hi,cbuf,&ld); } if (me == 0) printf("\n\n\n\n"); /* Get norm of g_a */ switch (data_type) { case C_FLOAT: ftmp = NGA_Fdot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi); break; case C_DBL: dtmp = NGA_Ddot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi); break; case C_DCPL: ztmp = NGA_Zdot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi); break; case C_SCPL: ctmp = NGA_Cdot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi); break; default: GA_Error("wrong data type", data_type); } /* subtract C from A and put the results in B */ beta_flt = -1.0; beta_dbl = -1.0; beta_scpl.real = -1.0; beta_dcpl.real = -1.0; NGA_Zero_patch(g_b,lo,hi); NGA_Add_patch(alpha,g_a,lo,hi,beta,g_c,lo,hi,g_b,lo,hi); /* evaluate the norm of the difference between the two matrices */ switch (data_type) { case C_FLOAT: fdiff = NGA_Fdot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi); if (ftmp != 0.0) { fdiff /= ftmp; } if(fabs(fdiff) > TOLERANCE) { printf("\nabs(result) = %f > %f\n", fabsf(fdiff), TOLERANCE); GA_Error("GA_Sgemm Failed", 1); } else if (me == 0) { printf("\nGA_Sgemm OK\n\n"); } break; case C_DBL: ddiff = NGA_Ddot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi); if (dtmp != 0.0) { ddiff /= dtmp; } if(fabs(ddiff) > TOLERANCE) { printf("\nabs(result) = %f > %f\n", fabsf(ddiff), TOLERANCE); GA_Error("GA_Dgemm Failed", 1); } else if (me == 0) { printf("\nGA_Dgemm OK\n\n"); } break; case C_DCPL: zdiff = NGA_Zdot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi); if (ztmp.real != 0.0 || ztmp.imag != 0.0) { ddiff = sqrt((zdiff.real*zdiff.real+zdiff.imag*zdiff.imag) /(ztmp.real*ztmp.real+ztmp.imag*ztmp.imag)); } else { ddiff = sqrt(zdiff.real*zdiff.real+zdiff.imag*zdiff.imag); } if(fabs(ddiff) > TOLERANCE) { printf("\nabs(result) = %f > %f\n", fabsf(zdiff.real), TOLERANCE); GA_Error("GA_Zgemm Failed", 1); } else if (me == 0) { printf("\nGA_Zgemm OK\n\n"); } break; case C_SCPL: cdiff = NGA_Cdot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi); if (ctmp.real != 0.0 || ctmp.imag != 0.0) { fdiff = sqrt((cdiff.real*cdiff.real+cdiff.imag*cdiff.imag) /(ctmp.real*ctmp.real+ctmp.imag*ctmp.imag)); } else { fdiff = sqrt(cdiff.real*cdiff.real+cdiff.imag*cdiff.imag); } if(fabs(fdiff) > TOLERANCE) { printf("\nabs(result) = %f > %f\n", fabsf(cdiff.real), TOLERANCE); GA_Error("GA_Cgemm Failed", 1); } else if (me == 0) { printf("\nGA_Cgemm OK\n\n"); } break; default: GA_Error("wrong data type", data_type); } } #endif free(abuf); free(bbuf); free(cbuf); GA_Destroy(g_a); GA_Destroy(g_b); GA_Destroy(g_c); }
PetscErrorCode vizGA2DA() { PetscErrorCode ierr; int rank; MPI_Comm_rank(PETSC_COMM_WORLD,&rank); int d1 = 40, d2 = 50; DA da; Vec vec; const PetscInt *lx, *ly, *lz; PetscInt m,n,p; DALocalInfo info; ierr = DACreate2d(PETSC_COMM_WORLD,DA_NONPERIODIC,DA_STENCIL_STAR, d1,d2,PETSC_DECIDE,PETSC_DECIDE,1,1,0,0, &da); CHKERRQ(ierr); ierr = DACreateGlobalVector(da, &vec); CHKERRQ(ierr); ierr = DAGetOwnershipRanges(da, &lx, &ly, &lz); CHKERRQ(ierr); ierr = DAGetLocalInfo(da,&info); CHKERRQ(ierr); ierr = DAGetInfo(da,0,0,0,0,&m,&n,&p,0,0,0,0); CHKERRQ(ierr); /**/ ierr = DAView(da, PETSC_VIEWER_STDOUT_WORLD); CHKERRQ(ierr); for (int i = 0; i < m; ++i) { PetscPrintf(PETSC_COMM_WORLD,"%d\tlx: %d\n",i,lx[i]); } for (int i = 0; i < n; ++i) { PetscPrintf(PETSC_COMM_WORLD,"%d\tly: %d\n",i,ly[i]); } /**/ int ga = GA_Create_handle(); int ndim = 2; int dims[2] = {d2,d1}; GA_Set_data(ga,2,dims,MT_DBL); int *map; PetscMalloc( sizeof(int)*(m+n), &map); map[0] = 0; for( int i = 1; i < n; i++ ) { map[i] = ly[i-1] + map[i-1]; } map[n] = 0; for( int i = n+1; i < m+n; i++ ) { map[i] = lx[i-n-1] + map[i-1]; } /* correct ordering, but nodeid's dont line up with mpi rank for petsc's da * DA: +---+---+ GA: +---+---+ * +-2-+-3-+ +-1-+-3-+ * +---+---+ +---+---+ * +-0-+-1-+ +-0-+-2-+ * +---+---+ +---+---+ int *map; PetscMalloc( sizeof(int)*(m+n), &map); map[0] = 0; for( int i = 1; i < m; i++ ) { map[i] = lx[i] + map[i-1]; } map[m] = 0; for( int i = m+1; i < m+n; i++ ) { map[i] = ly[i-m] + map[i-1]; } */ int block[2] = {n,m}; GA_Set_irreg_distr(ga,map,block); ierr = GA_Allocate( ga ); if( !ierr ) GA_Error("\n\n\nga allocaltion failed\n\n",ierr); if( !ga ) GA_Error("\n\n\n ga null \n\n",ierr); if( rank != GA_Nodeid() ) GA_Error("MPI rank does not match GA_Nodeid()",1); GA_Print_distribution(ga); int lo[2], hi[2]; NGA_Distribution(ga,rank,lo,hi); if( lo[1] != info.xs || hi[1] != info.xs+info.xm-1 || lo[0] != info.ys || hi[0] != info.ys+info.ym-1 ) { PetscSynchronizedPrintf(PETSC_COMM_SELF,"[%d] lo:(%2d,%2d) hi:(%2d,%2d) \t DA: (%2d,%2d), (%2d, %2d)\n", rank, lo[1], lo[0], hi[1], hi[0], info.xs, info.ys, info.xs+info.xm-1, info.ys+info.ym-1); } PetscBarrier(0); PetscSynchronizedFlush(PETSC_COMM_WORLD); AO ao; DAGetAO(da,&ao); if( rank == 0 ) { int *idx, len = d1*d2; PetscReal *val; PetscMalloc(sizeof(PetscReal)*len, &val); PetscMalloc(sizeof(int)*len, &idx); for (int j = 0; j < d2; ++j) { for (int i = 0; i < d1; ++i) { idx[i + d1*j] = i + d1*j; val[i + d1*j] = i + d1*j; } } AOApplicationToPetsc(ao,len,idx); VecSetValues(vec,len,idx,val,INSERT_VALUES); int a[2], b[2],ld[1]={0}; double c = 0; for (int j = 0; j < d2; ++j) { for (int i = 0; i < d1; ++i) { a[0] = j; a[1] = i; // printf("%5.0f ",c); NGA_Put(ga,a,a,&c,ld); c++; } } } // GA_Print(ga); VecAssemblyBegin(vec); VecAssemblyEnd(vec); int ld; double *ptr; NGA_Access(ga,lo,hi,&ptr,&ld); PetscReal **d; int c=0; ierr = DAVecGetArray(da,vec,&d); CHKERRQ(ierr); for (int j = info.ys; j < info.ys+info.ym; ++j) { for (int i = info.xs; i < info.xs+info.xm; ++i) { if( d[j][i] != ptr[(i-info.xs)+ld*(j-info.ys)] ) GA_Error("DA array is not equal to GA array",1); // printf("%d (%d,%d):\t%3.0f\t%3.0f\n", c, i, j, d[j][i], ptr[(i-info.xs)+ld*(j-info.ys)]); c++; } } ierr = DAVecRestoreArray(da,vec,&d); CHKERRQ(ierr); c=0; PetscReal *v; int start, end; VecGetOwnershipRange(vec, &start, &end); VecGetArray( vec, &v ); for( int i = start; i < end; i++) { // printf("%d:\t%3.0f\t%3.0f\t%s\n", start, v[i-start], ptr[i-start], (v[i-start]-ptr[i-start]==0?"":"NO") ); } VecRestoreArray( vec, &v ); NGA_Release_update(ga,lo,hi); Vec gada; VecCreateMPIWithArray(((PetscObject)da)->comm,da->Nlocal,PETSC_DETERMINE,ptr,&gada); VecView(gada,PETSC_VIEWER_STDOUT_SELF); GA_Destroy(ga); ierr = VecDestroy(vec); CHKERRQ(ierr); ierr = DADestroy(da); CHKERRQ(ierr); PetscFunctionReturn(0); }
int main(int argc, char **argv) { int me; int nproc; int status; int g_a; int dims[NDIM]; int chunk[NDIM]; int pg_world; size_t num = 10; double *p1 = NULL; double *p2 = NULL; size_t i; int num_mutex; int lo[1]; int hi[1]; int ld[1]={1}; MPI_Comm comm; MP_INIT(argc,argv); GA_INIT(argc,argv); me = GA_Nodeid(); nproc = GA_Nnodes(); comm = GA_MPI_Comm_pgroup_default(); printf("%d: Hello world!\n",me); if (me==0) printf("%d: GA_Initialize\n",me); /*if (me==0) printf("%d: ARMCI_Init\n",me);*/ /*ARMCI_Init();*/ /*if (me==0) printf("%d: MA_Init\n",me);*/ /*MA_init(MT_DBL, 8*1024*1024, 2*1024*1024);*/ if (me==0) printf("%d: GA_Create_handle\n",me); g_a = GA_Create_handle(); if (me==0) printf("%d: GA_Set_array_name\n",me); GA_Set_array_name(g_a,"test array A"); dims[0] = 30; if (me==0) printf("%d: GA_Set_data\n",me); GA_Set_data(g_a,NDIM,dims,MT_DBL); chunk[0] = -1; if (me==0) printf("%d: GA_Set_chunk\n",me); GA_Set_chunk(g_a,chunk); if (me==0) printf("%d: GA_Pgroup_get_world\n",me); pg_world = GA_Pgroup_get_world(); if (me==0) printf("%d: GA_Set_pgroup\n",me); GA_Set_pgroup(g_a,pg_world); if (me==0) printf("%d: GA_Allocate\n",me); status = GA_Allocate(g_a); if(0 == status) MPI_Abort(comm,100); if (me==0) printf("%d: GA_Zero\n",me); GA_Zero(g_a); if (me==0) printf("%d: GA_Sync\n",me); GA_Sync(); num = 10; p1 = malloc(num*sizeof(double)); /*double* p1 = ARMCI_Malloc_local(num*sizeof(double));*/ if (p1==NULL) MPI_Abort(comm,1000); p2 = malloc(num*sizeof(double)); /*double* p2 = ARMCI_Malloc_local(num*sizeof(double));*/ if (p2==NULL) MPI_Abort(comm,2000); for ( i=0 ; i<num ; i++ ) p1[i] = 7.0; for ( i=0 ; i<num ; i++ ) p2[i] = 3.0; num_mutex = 17; status = GA_Create_mutexes(num_mutex); if (me==0) printf("%d: GA_Create_mutexes = %d\n",me,status); /***************************************************************/ if (me==0) { printf("%d: before GA_Lock\n",me); GA_Lock(0); lo[0] = 0; hi[0] = num-1; GA_Init_fence(); NGA_Put(g_a,lo,hi,p1,ld); GA_Fence(); GA_Unlock(0); printf("%d: after GA_Unlock\n",me); } GA_Print(g_a); if (me==1) { printf("%d: before GA_Lock\n",me); GA_Lock(0); lo[0] = 0; hi[0] = num-1; GA_Init_fence(); NGA_Get(g_a,lo,hi,p2,ld); GA_Fence(); GA_Unlock(0); printf("%d: after GA_Unlock\n",me); for ( i=0 ; i<num ; i++ ) printf("p2[%2lu] = %20.10f\n", (long unsigned)i,p2[i]); } /***************************************************************/ status = GA_Destroy_mutexes(); if (me==0) printf("%d: GA_Destroy_mutexes = %d\n",me,status); /*ARMCI_Free(p2);*/ /*ARMCI_Free(p1);*/ free(p2); free(p1); if (me==0) printf("%d: GA_Destroy\n",me); GA_Destroy(g_a); /*if (me==0) printf("%d: ARMCI_Finalize\n",me);*/ /*ARMCI_Finalize();*/ if (me==0) printf("%d: GA_Terminate\n",me); GA_Terminate(); if (me==0) printf("%d: MPI_Finalize\n",me); MPI_Finalize(); return(0); }
/* input is matrix size */ void ga_lu(double *A, int matrix_size) { int g_a, g_b, dims[2], type=C_DBL; int lo[2], hi[2], ld; int block_size[2], proc_grid[2]; double time, gflops; /* create a 2-d GA (global matrix) */ dims[0] = matrix_size; dims[1] = matrix_size; block_size[0] = BLOCK_SIZE; block_size[1] = BLOCK_SIZE; #ifdef USE_SCALAPACK_DISTR proc_grid[0] = 2; proc_grid[1] = nprocs/2; if(nprocs%2) GA_Error("For ScaLAPACK stle distribution, nprocs must be " " divisible by 2", 0); #endif #ifndef BLOCK_CYCLIC g_a = NGA_Create(type, 2, dims, "A", NULL); g_b = GA_Duplicate(g_a, "transposed array B"); #else g_a = GA_Create_handle(); GA_Set_data(g_a, 2, dims, type); GA_Set_array_name(g_a,"A"); # ifdef USE_SCALAPACK_DISTR GA_Set_block_cyclic_proc_grid(g_a, block_size, proc_grid); # else GA_Set_block_cyclic(g_a, block_size); # endif GA_Allocate(g_a); g_b = GA_Create_handle(); GA_Set_data(g_b, 2, dims, type); GA_Set_array_name(g_b,"B"); # ifdef USE_SCALAPACK_DISTR GA_Set_block_cyclic_proc_grid(g_b, block_size, proc_grid); # else GA_Set_block_cyclic(g_b, block_size); # endif GA_Allocate(g_b); #endif /* copy the local matrix into GA */ if(me==0) { lo[0] = 0; hi[0] = matrix_size - 1; lo[1] = 0; hi[1] = matrix_size - 1; ld = matrix_size; NGA_Put(g_a, lo, hi, A, &ld); } GA_Sync(); GA_Transpose(g_a, g_b); time = CLOCK_(); GA_Lu('n', g_b); time = CLOCK_() - time; /* 2/3 N^3 - 1/2 N^2 flops for LU and 2*N^2 for solver */ gflops = ( (((double)matrix_size) * matrix_size)/(time*1.0e+9) * (2.0/3.0 * (double)matrix_size - 0.5) ); if(me==0) printf("\nGA_Lu: N=%d flops=%2.5e Gflops, time=%2.5e secs\n\n", matrix_size, gflops, time); #if DEBUG GA_Print(g_a); GA_Print(g_b); #endif /* if(me==0) lu(A, matrix_size); */ GA_Destroy(g_a); GA_Destroy(g_b); }
/** * Evaluate offsets for each network component */ void setOffsets(void) { // Interleave contributions from buses and branches to match matrices int i,j,jdx,jdx1,jdx2; int *i_bus_offsets = new int[p_nBuses]; int *i_branch_offsets = new int[p_nBranches]; for (i=0; i<p_nBuses; i++) { i_bus_offsets[i] = 0; } for (i=0; i<p_nBranches; i++) { i_branch_offsets[i] = 0; } int icnt = 0; int nsize; // Evaluate offsets for individual network components for (i=0; i<p_nBuses; i++) { if (p_network->getActiveBus(i)) { i_bus_offsets[i] = icnt; icnt += p_network->getBus(i)->vectorNumElements(); std::vector<int> nghbrs = p_network->getConnectedBranches(i); nsize = nghbrs.size(); for (j=0; j<nsize; j++) { // Need to avoid double counting of branches when evaluating offsets. // If branch is non-local and it is active, then include it in offsets. // Otherwise, if branch is local and bus i is equal to the "from" bus, // then include it in the offsets. jdx = nghbrs[j]; if (isLocalBranch(jdx)) { p_network->getBranchEndpoints(jdx,&jdx1,&jdx2); if (jdx1 == i) { i_branch_offsets[jdx] = icnt; icnt += p_network->getBranch(jdx)->vectorNumElements(); } } else { if (p_network->getActiveBranch(jdx)) { i_branch_offsets[jdx] = icnt; icnt += p_network->getBranch(jdx)->vectorNumElements(); } } } } } // Total number of rows and columns from this processor have been evaluated, // now create buffers that can scatter individual offsets to global arrays int **i_bus_index = new int*[p_nBuses]; int **i_branch_index = new int*[p_nBranches]; int *i_bus_index_buf = new int[p_nBuses]; int *i_branch_index_buf = new int[p_nBranches]; int *i_bus_value_buf = new int[p_nBuses]; int *i_branch_value_buf = new int[p_nBranches]; int i_bus_cnt = 0; int i_branch_cnt = 0; int row_offset = p_Offsets[p_me]; int nbus = 0; int nbranch = 0; for (i=0; i<p_nBuses; i++) { if (p_network->getActiveBus(i)) { nbus++; i_bus_value_buf[i_bus_cnt] = i_bus_offsets[i]+row_offset; i_bus_index_buf[i_bus_cnt] = p_network->getGlobalBusIndex(i); i_bus_index[i_bus_cnt] = &i_bus_index_buf[i_bus_cnt]; i_bus_cnt++; } } for (i=0; i<p_nBranches; i++) { if (p_network->getActiveBranch(i)) { nbranch++; i_branch_value_buf[i_branch_cnt] = i_branch_offsets[i]+row_offset; i_branch_index_buf[i_branch_cnt] = p_network->getGlobalBranchIndex(i); i_branch_index[i_branch_cnt] = &i_branch_index_buf[i_branch_cnt]; i_branch_cnt++; } } delete [] i_bus_offsets; delete [] i_branch_offsets; // Create global arrays that hold column and row offsets for all buses and // branches in the network. First create map array for global arrays int *t_busMap = new int[p_nNodes]; int *t_branchMap = new int[p_nNodes]; for (i=0; i<p_nNodes; i++) { t_busMap[i] = 0; t_branchMap[i] = 0; } t_busMap[p_me] = nbus; t_branchMap[p_me] = nbranch; char plus[2]; strcpy(plus,"+"); GA_Pgroup_igop(p_GAgrp, t_busMap, p_nNodes, plus); GA_Pgroup_igop(p_GAgrp, t_branchMap, p_nNodes, plus); int *busMap = new int[p_nNodes]; int *branchMap = new int[p_nNodes]; busMap[0] = 0; branchMap[0] = 0; int total_buses = t_busMap[0]; int total_branches = t_branchMap[0]; for (i=1; i<p_nNodes; i++) { busMap[i] = busMap[i-1] + t_busMap[i-1]; total_buses += t_busMap[i]; branchMap[i] = branchMap[i-1] + t_branchMap[i-1]; total_branches += t_branchMap[i]; } delete [] t_busMap; delete [] t_branchMap; int one = 1; g_bus_offsets = GA_Create_handle(); GA_Set_data(g_bus_offsets, one, &total_buses, C_INT); GA_Set_irreg_distr(g_bus_offsets, busMap, &p_nNodes); GA_Set_pgroup(g_bus_offsets, p_GAgrp); if (!GA_Allocate(g_bus_offsets)) { char buf[256]; sprintf(buf,"GenVectorMap::setOffsets: Unable to allocate distributed array for bus offsets\n"); printf("%s",buf); throw gridpack::Exception(buf); } GA_Zero(g_bus_offsets); g_branch_offsets = GA_Create_handle(); GA_Set_data(g_branch_offsets, one, &total_branches, C_INT); GA_Set_irreg_distr(g_branch_offsets, branchMap, &p_nNodes); GA_Set_pgroup(g_branch_offsets, p_GAgrp); if (!GA_Allocate(g_branch_offsets)) { char buf[256]; sprintf(buf,"GenVectorMap::setOffsets: Unable to allocate distributed array for branch offsets\n"); printf("%s",buf); throw gridpack::Exception(buf); } GA_Zero(g_branch_offsets); delete [] busMap; delete [] branchMap; // Scatter offsets to global arrays NGA_Scatter(g_bus_offsets, i_bus_value_buf, i_bus_index, i_bus_cnt); NGA_Scatter(g_branch_offsets, i_branch_value_buf, i_branch_index, i_branch_cnt); NGA_Pgroup_sync(p_GAgrp); delete [] i_bus_index; delete [] i_branch_index; delete [] i_bus_index_buf; delete [] i_branch_index_buf; delete [] i_bus_value_buf; delete [] i_branch_value_buf; }
/* * test ga_dgemm * Note: - change nummax for large arrays * - turn off "dgemm_verify" for large arrays due to memory * limitations, as dgemm_verify=1 for large arrays produces * segfault, dumps core,or any crap. */ int main(int argc, char **argv) { int num_m; int num_n; int num_k; int i; int ii; double *h0; int g_c; int g_b; int g_a; double a; double t1; double mf; double avg_t[ntrans]; double avg_mf[ntrans]; int itime; int ntimes; int nums_m[/*howmany*/] = {512,1024}; int nums_n[/*howmany*/] = {512,1024}; int nums_k[/*howmany*/] = {512,1024}; char transa[/*ntrans*/] = "ntnt"; char transb[/*ntrans*/] = "nntt"; char ta; char tb; double *tmpa; double *tmpb; double *tmpc; int ndim; int dims[2]; #ifdef BLOCK_CYCLIC int block_size[2]; #endif #if defined(USE_ELEMENTAL) // initialize Elemental (which will initialize MPI) ElInitialize( &argc, &argv ); ElMPICommRank( MPI_COMM_WORLD, &me ); ElMPICommSize( MPI_COMM_WORLD, &nproc ); // instantiate el::global array ElGlobalArraysConstruct_d( &eldga ); // initialize global arrays ElGlobalArraysInitialize_d( eldga ); #else MP_INIT(argc,argv); if (!MA_init(MT_DBL,1,20000000)) { GA_Error("failed: ma_init(MT_DBL,1,20000000)",10); } GA_INIT(argc,argv); me = GA_Nodeid(); #endif h0 = (double*)malloc(sizeof(double) * nummax*nummax); tmpa = (double*)malloc(sizeof(double) * nummax*nummax); tmpb = (double*)malloc(sizeof(double) * nummax*nummax); tmpc = (double*)malloc(sizeof(double) * nummax*nummax); ii = 0; for (i=0; i<nummax*nummax; i++) { ii = ii + 1; if (ii > nummax) { ii = 0; } h0[i] = ii; } /* Compute times assuming 500 mflops and 5 second target time */ /* ntimes = max(3.0d0,5.0d0/(4.0d-9*num**3)); */ ntimes = 5; for (ii=0; ii<howmany; ii++) { num_m = nums_m[ii]; num_n = nums_n[ii]; num_k = nums_k[ii]; a = 0.5/(num_m*num_n); if (num_m > nummax || num_n > nummax || num_k > nummax) { GA_Error("Insufficient memory: check nummax", 1); } #ifndef BLOCK_CYCLIC ndim = 2; /* dims[0] = num_m; dims[1] = num_n; */ dims[1] = num_m; dims[0] = num_n; #if defined(USE_ELEMENTAL) ElGlobalArraysCreate_d( eldga, ndim, dims, "g_c", NULL, &g_c ); #else if (!((g_c = NGA_Create(MT_DBL,ndim,dims,"g_c",NULL)))) { GA_Error("failed: create g_c",20); } #endif /* dims[0] = num_k; dims[1] = num_n; */ dims[1] = num_k; dims[0] = num_n; #if defined(USE_ELEMENTAL) ElGlobalArraysCreate_d( eldga, ndim, dims, "g_b", NULL, &g_b ); #else if (!((g_b = NGA_Create(MT_DBL,ndim,dims,"g_b",NULL)))) { GA_Error("failed: create g_b",30); } #endif /* dims[0] = num_m; dims[1] = num_k; */ dims[1] = num_m; dims[0] = num_k; #if defined(USE_ELEMENTAL) ElGlobalArraysCreate_d( eldga, ndim, dims, "g_a", NULL, &g_a ); #else if (!((g_a = NGA_Create(MT_DBL,ndim,dims,"g_a",NULL)))) { GA_Error("failed: create g_a",40); } #endif #else ndim = 2; block_size[0] = 128; block_size[1] = 128; dims[0] = num_m; dims[1] = num_n; g_c = GA_Create_handle(); GA_Set_data(g_c,ndim,dims,MT_DBL); GA_Set_array_name(g_c,"g_c"); GA_Set_block_cyclic(g_c,block_size); if (!GA_Allocate(g_c)) { GA_Error("failed: create g_c",40); } dims[0] = num_k; dims[1] = num_n; g_b = GA_Create_handle(); GA_Set_data(g_b,ndim,dims,MT_DBL); GA_Set_array_name(g_b,"g_b"); GA_Set_block_cyclic(g_b,block_size); if (!ga_allocate(g_b)) { GA_Error("failed: create g_b",40); } dims[0] = num_m; dims[1] = num_k; g_a = GA_Create_handle(); GA_Set_data(g_a,ndim,dims,MT_DBL); GA_Set_array_name(g_a,"g_a"); GA_Set_block_cyclic(g_a,block_size); if (!ga_allocate(g_a)) { GA_Error('failed: create g_a',40); } #endif /* Initialize matrices A and B */ if (me == 0) { load_ga(g_a, h0, num_m, num_k); load_ga(g_b, h0, num_k, num_n); } #if defined(USE_ELEMENTAL) double zero = 0.0; ElGlobalArraysFill_d( eldga, g_c, &zero ); ElGlobalArraysSync_d( eldga ); #else GA_Zero(g_c); GA_Sync(); #endif #if defined(USE_ELEMENTAL) if (me == 0) { #else if (GA_Nodeid() == 0) { #endif printf("\nMatrix Multiplication on C = A[%ld,%ld]xB[%ld,%ld]\n", (long)num_m, (long)num_k, (long)num_k, (long)num_n); fflush(stdout); } for (i=0; i<ntrans; i++) { avg_t[i] = 0.0; avg_mf[i] = 0.0; } for (itime=0; itime<ntimes; itime++) { for (i=0; i<ntrans; i++) { #if defined(USE_ELEMENTAL) ElGlobalArraysSync_d( eldga ); #else GA_Sync(); #endif ta = transa[i]; tb = transb[i]; t1 = MP_TIMER(); #if defined(USE_ELEMENTAL) ElGlobalArraysDgemm_d( eldga, ta, tb, num_m, num_n, num_k, 1.0, g_a, g_b, 0.0, g_c ); #else GA_Dgemm(ta,tb,num_m,num_n,num_k,1.0, g_a, g_b, 0.0, g_c); #endif t1 = MP_TIMER() - t1; #if defined(USE_ELEMENTAL) if (me == 0) { #else if (GA_Nodeid() == 0) { #endif #if defined(USE_ELEMENTAL) mf = 2e0*num_m*num_n*num_k/t1*1e-6/nproc; #else mf = 2e0*num_m*num_n*num_k/t1*1e-6/GA_Nnodes(); #endif avg_t[i] = avg_t[i]+t1; avg_mf[i] = avg_mf[i] + mf; printf("%15s%2d: %12.4f seconds %12.1f mflops/proc %c %c\n", "Run#", itime, t1, mf, ta, tb); fflush(stdout); if (dgemm_verify && itime == 0) { /* recall the C API swaps the matrix order */ /* we swap it here for the Fortran-based verify */ verify_ga_dgemm(tb, ta, num_n, num_m, num_k, 1.0, g_b, g_a, 0.0, g_c, tmpb, tmpa, tmpc); } } } } #if defined(USE_ELEMENTAL) if (me == 0) { #else if (GA_Nodeid() == 0) { #endif printf("\n"); for (i=0; i<ntrans; i++) { printf("%17s: %12.4f seconds %12.1f mflops/proc %c %c\n", "Average", avg_t[i]/ntimes, avg_mf[i]/ntimes, transa[i], transb[i]); } if(dgemm_verify) { printf("All GA_Dgemms are verified...O.K.\n"); } fflush(stdout); } /* GA_Print(g_a); GA_Print(g_b); GA_Print(g_c); */ #if defined(USE_ELEMENTAL) ElGlobalArraysDestroy_d( eldga, g_a ); ElGlobalArraysDestroy_d( eldga, g_b ); ElGlobalArraysDestroy_d( eldga, g_c ); #else GA_Destroy(g_c); GA_Destroy(g_b); GA_Destroy(g_a); #endif } /* ??? format(a15, i2, ': ', e12.4, ' seconds ',f12.1, . ' mflops/proc ', 3a2) */ #if defined(USE_ELEMENTAL) if (me == 0) { #else if (GA_Nodeid() == 0) { #endif printf("All tests successful\n"); } free(h0); free(tmpa); free(tmpb); free(tmpc); #if defined(USE_ELEMENTAL) // call el::global arrays destructor ElGlobalArraysTerminate_d( eldga ); ElGlobalArraysDestruct_d( eldga ); ElFinalize(); #else GA_Terminate(); MP_FINALIZE(); #endif return 0; } /* * Verify for correctness. Process 0 computes BLAS dgemm * locally. For larger arrays, disbale this test as memory * might not be sufficient */ void verify_ga_dgemm(char xt1, char xt2, int num_m, int num_n, int num_k, double alpha, int g_a, int g_b, double beta, int g_c, double *tmpa, double *tmpb, double *tmpc) { int i,j,type,ndim,dims[2],lo[2],hi[2]; double abs_value; for (i=0; i<num_n; i++) { for (j=0; j<num_m; j++) { tmpc[j+i*num_m] = -1.0; tmpa[j+i*num_m] = -2.0; } } #if defined(USE_ELEMENTAL) ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims ); #else NGA_Inquire(g_a, &type, &ndim, dims); #endif lo[0] = 0; lo[1] = 0; hi[0] = dims[0]-1; hi[1] = dims[1]-1; #if defined(USE_ELEMENTAL) ElGlobalArraysGet_d( eldga, g_a, lo, hi, tmpa, &dims[1] ); #else NGA_Get(g_a, lo, hi, tmpa, &dims[1]); #endif #if defined(USE_ELEMENTAL) ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims ); #else NGA_Inquire(g_a, &type, &ndim, dims); #endif lo[0] = 0; lo[1] = 0; hi[0] = dims[0]-1; hi[1] = dims[1]-1; #if defined(USE_ELEMENTAL) ElGlobalArraysGet_d( eldga, g_b, lo, hi, tmpb, &dims[1] ); #else NGA_Get(g_b, lo, hi, tmpb, &dims[1]); #endif /* compute dgemm sequentially */ #if defined(USE_ELEMENTAL) cblas_dgemm ( CblasRowMajor, ( xt1 == 'n'? CblasNoTrans: CblasTrans ), ( xt2 == 'n'? CblasNoTrans: CblasTrans ), num_m /* M */, num_n /* N */, num_k /* K */, alpha, tmpa, num_m, /* lda */ tmpb, num_k, /* ldb */ beta, tmpc, num_m /* ldc */); #else xb_dgemm(&xt1, &xt2, &num_m, &num_n, &num_k, &alpha, tmpa, &num_m, tmpb, &num_k, &beta, tmpc, &num_m); #endif /* after computing c locally, verify it with the values in g_c */ #if defined(USE_ELEMENTAL) ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims ); #else NGA_Inquire(g_a, &type, &ndim, dims); #endif lo[0] = 0; lo[1] = 0; hi[0] = dims[0]-1; hi[1] = dims[1]-1; #if defined(USE_ELEMENTAL) ElGlobalArraysGet_d( eldga, g_c, lo, hi, tmpa, &dims[1] ); #else NGA_Get(g_c, lo, hi, tmpa, &dims[1]); #endif for (i=0; i<num_n; i++) { for (j=0; j<num_m; j++) { abs_value = fabs(tmpc[j+i*num_m]-tmpa[j+i*num_m]); if(abs_value > 1.0 || abs_value < -1.0) { printf("Values are = %f %f\n", tmpc[j+i*num_m], tmpa[j+i*num_m]); printf("Values are = %f %f\n", fabs(tmpc[j+i*num_m]-tmpa[j*i*num_m]), abs_value); fflush(stdout); GA_Error("verify ga_dgemm failed", 1); } } } } /** * called by process '0' (or your master process ) */ void load_ga(int handle, double *f, int dim1, int dim2) { int lo[2], hi[2]; if (dim1 < 0 || dim2 < 0) { return; } lo[0] = 0; lo[1] = 0; hi[0] = dim1-1; hi[1] = dim2-1; #if defined(USE_ELEMENTAL) ElGlobalArraysPut_d( eldga, handle, lo, hi, f, &dim1 ); #else NGA_Put(handle, lo, hi, f, &dim1); #endif }
void do_work() { int g_a, g_b; int me=GA_Nodeid(), nproc=GA_Nnodes(), proc, loop; int dims[NDIM], lo[NDIM], hi[NDIM], block[NDIM], ld[NDIM-1]; int i,d,*proclist, offset; int adims[NDIM], ndim,type; typedef struct { int lo[NDIM]; int hi[NDIM]; } patch_t; patch_t *regions; int *map; double *buf; /***** create array A with default distribution *****/ if(me==0){printf("Creating array A\n"); fflush(stdout);} for(i = 0; i<NDIM; i++)dims[i] = N*(i+1); #ifdef NEW_API g_a = GA_Create_handle(); GA_Set_data(g_a,NDIM,dims,MT_F_DBL); GA_Set_array_name(g_a,"array A"); (void)GA_Allocate(g_a); #else g_a = NGA_Create(MT_F_DBL, NDIM, dims, "array A", NULL); #endif if(!g_a) GA_Error("create failed: A",0); if(me==0)printf("OK\n\n"); /* print info about array we got */ NGA_Inquire(g_a, &type, &ndim, adims); GA_Print_distribution(g_a); GA_Sync(); /* duplicate array A with ga_create irreg rather than ga_duplicate * -- want to show distribution control * -- with ga_duplicate it would be g_b=GA_Duplicate(g_a,name) */ if(me==0)printf("\nReconstructing distribution description for A\n"); /* get memory for arrays describing distribution */ proclist = (int*)malloc(nproc*sizeof(int)); if(!proclist)GA_Error("malloc failed for proclist",0); regions = (patch_t*)malloc(nproc*sizeof(patch_t)); if(!regions)GA_Error("malloc failed for regions",0); map = (int*)malloc((nproc+ndim)*sizeof(int)); /* ubound= nproc+mdim */ if(!map)GA_Error("malloc failed for map",0); /* first find out how array g_a is distributed */ for(i=0;i<ndim;i++)lo[i]=BASE; for(i=0;i<ndim;i++)hi[i]=adims[i] -1 + BASE; proc = NGA_Locate_region(g_a, lo, hi, (int*)regions, proclist); if(proc<1) GA_Error("error in NGA_Locate_region",proc); /* determine blocking for each dimension */ for(i=0;i<ndim;i++)block[i]=0; for(i=0;i<ndim;i++)adims[i]=0; offset =0; for(d=0; d<ndim; d++) for(i=0;i<proc;i++) if( regions[i].hi[d]>adims[d] ){ map[offset] = regions[i].lo[d]; offset++; block[d]++; adims[d]= regions[i].hi[d]; } if(me==0){ printf("Distribution map contains %d elements\n",offset); print_subscript("number of blocks for each dimension",ndim,block,"\n"); print_subscript("distribution map",offset,map,"\n\n"); fflush(stdout); } if(me==0)printf("Creating array B applying distribution of A\n"); # ifdef USE_DUPLICATE g_b = GA_Duplicate(g_a,"array B"); # else g_b = NGA_Create_irreg(MT_F_DBL, NDIM, dims, "array B", block,map); # endif if(!g_b) GA_Error("create failed: B",0); if(me==0)printf("OK\n\n"); free(proclist); free(regions); free(map); GA_Print_distribution(g_b); GA_Sync(); if(me==0){ printf("\nCompare distributions of A and B\n"); if(GA_Compare_distr(g_a,g_b)) printf("Failure: distributions NOT identical\n"); else printf("Success: distributions identical\n"); fflush(stdout); } if(me==0){ printf("\nAccessing local elements of A: set them to the owner process id\n"); fflush(stdout); } GA_Sync(); NGA_Distribution(g_a,me,lo,hi); if(hi[0]>=0){/* -1 means no elements stored on this processor */ double *ptr; int locdim[NDIM]; NGA_Access(g_a, lo,hi, &ptr, ld); for(i=0;i<ndim;i++)locdim[i]=hi[i]-lo[i]+1; fill_patch(ptr, locdim, ld, ndim,(double)me); } for(i=0;i<nproc; i++){ if(me==i && hi[0]>=0){ char msg[100]; sprintf(msg,"%d: leading dimensions",me); print_subscript(msg,ndim-1,ld,"\n"); fflush(stdout); } GA_Sync(); } GA_Sync(); if(me==0)printf("\nRandomly checking the update using ga_get on array sections\n"); GA_Sync(); /* show ga_get working and verify array updates * every process does N random gets * for simplicity get only a single row at a time */ srand(me); /* different seed for every process */ hi[ndim-1]=adims[ndim-1] -1 + BASE; for(i=1;i<ndim-1; i++)ld[i]=1; ld[ndim-2]=adims[ndim-1] -1 + BASE; /* get buffer memory */ buf = (double*)malloc(adims[ndim-1]*sizeof(double)); if(!buf)GA_Error("malloc failed for buf",0); /* half of the processes check the result */ if(me<=nproc/2) for(loop = 0; loop< N; loop++){ /* task parallel loop */ lo[ndim-1]=BASE; for (i= 0; i < ndim -1; i ++){ lo[i] = hi[i] = rand()%adims[i]+BASE; } /* print_subscript("getting",ndim,lo,"\n");*/ NGA_Get(g_a,lo,hi,buf,ld); /* check values */ for(i=0;i<adims[ndim-1]; i++){ int p = NGA_Locate(g_a, lo); if((double)p != buf[i]) { char msg[100]; sprintf(msg,"%d: wrong value: %d != %lf a",me, p, buf[i]); print_subscript(msg,ndim,lo,"\n"); GA_Error("Error - bye",i); } lo[ndim-1]++; } } free(buf); GA_Sync(); if(me==0)printf("OK\n"); GA_Destroy(g_a); GA_Destroy(g_b); }