/*
 * Demo of MPI_Exscan: each rank contributes { rank, 1 }; every rank > 0
 * receives the element-wise sum of the contributions of all lower ranks.
 * Rank 0's receive buffer is undefined after MPI_Exscan, so it is not
 * printed there.
 *
 * Fix: the original returned from main() on the size check without
 * calling MPI_Finalize after a successful MPI_Init, which is erroneous
 * in MPI — every successful init needs a matching finalize (or abort).
 */
int main(int argc, char **argv)
{
    if (MPI_Init(&argc, &argv) != MPI_SUCCESS) {
        fprintf(stderr, "MPI initialization failed.\n");
        return 1;
    }

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (size < 2) {
        fprintf(stderr, "cant play this game alone.\n");
        MPI_Finalize();  /* was missing: finalize before the early exit */
        return 1;
    }

    int sendbuf[2] = { rank, 1 };
    fprintf(stderr, "[ %d ] my numbers are: %3d %3d\n", rank, sendbuf[0], sendbuf[1]);

    int recvbuf[2];
    if (MPI_Exscan(sendbuf, recvbuf, 2, MPI_INT, MPI_SUM, MPI_COMM_WORLD)) {
        fprintf(stderr, "MPI_Exscan failed\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    /* recvbuf is undefined on rank 0, hence the guard. */
    if (rank)
        fprintf(stderr, "[ %d ] received sum Exscan %3d %3d\n", rank, recvbuf[0], recvbuf[1]);

    MPI_Finalize();
    return 0;
}
int main(int argc, char *argv[]) { int root = 0; int processCount; int currentRank; MPI_Status status; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD,&processCount); MPI_Comm_rank(MPI_COMM_WORLD,¤tRank); int reduce = currentRank; int reduce2 = currentRank; int reduce3 = 0; MPI_Scan(¤tRank,&reduce,1,MPI_INT,MPI_SUM,MPI_COMM_WORLD); printf("Scan: process %d: reduce = %d\n", currentRank, reduce); MPI_Exscan(¤tRank,&reduce2,1,MPI_INT,MPI_SUM,MPI_COMM_WORLD); printf("Exscan: process %d: reduce = %d\n", currentRank, reduce2); MPI_Reduce(¤tRank,&reduce3,1,MPI_INT,MPI_SUM, 0, MPI_COMM_WORLD); if(currentRank==0) printf("Reduce: process %d: reduce = %d\n", currentRank, reduce3); MPI_Finalize(); return 0; }
/* Fortran-callable wrapper for MPI_Exscan.
 * Arguments are the Fortran-side handles: v1 = sendbuf, v2 = recvbuf,
 * v3 = count, v4 = datatype, v5 = op, v6 = comm; the C return code is
 * stored into *ierr. */
FORT_DLL_SPEC void FORT_CALL mpi_exscan_ ( void*v1, void*v2, MPI_Fint *v3, MPI_Fint *v4, MPI_Fint *v5, MPI_Fint *v6, MPI_Fint *ierr ){
#ifndef HAVE_MPI_F_INIT_WORKS_WITH_C
    /* Lazily run the Fortran-side initialization exactly once when the
     * C-side MPI_Init did not already set up the Fortran interface. */
    if (MPIR_F_NeedInit){ mpirinitf_(); MPIR_F_NeedInit = 0; }
#endif
    /* Translate the Fortran MPI_IN_PLACE sentinel address to the C one. */
    if (v1 == MPIR_F_MPI_IN_PLACE) v1 = MPI_IN_PLACE;
    /* Convert Fortran integer handles to C handles and forward. */
    *ierr = MPI_Exscan( v1, v2, (int)*v3, (MPI_Datatype)(*v4), (MPI_Op)*v5, (MPI_Comm)(*v6) );
}
/* Collectively appends each rank's `data` to file `f`, packed back to
 * back in rank order starting at byte offset `base`.
 * Returns the total number of bytes written across the communicator
 * (so callers can advance their base offset).
 * NOTE(review): `T` is not declared in this view — presumably a template
 * parameter of an enclosing scope; confirm against the full file. */
static MPI_Offset writeToMPI(const std::vector<T>& data, MPI_File f, MPI_Offset base, MPI_Comm comm)
{
    /* Exclusive prefix sum of byte counts gives this rank's offset past
     * `base`. On rank 0 MPI_Exscan leaves `offset` undefined per the
     * standard; this code relies on the pre-initialized 0 — verify the
     * MPI implementation in use does not clobber it. */
    MPI_Offset offset = 0, nbytes = data.size()*sizeof(T);
    MPI_Check( MPI_Exscan(&nbytes, &offset, 1, MPI_OFFSET, MPI_SUM, comm));
    /* Collective write: each rank writes its bytes at base + offset.
     * NOTE(review): `nbytes` (MPI_Offset) is narrowed to the int count
     * parameter — fine while chunks stay below 2 GiB. */
    MPI_Check( MPI_File_write_at_all(f, base + offset, data.data(), nbytes, MPI_CHAR, MPI_STATUS_IGNORE));
    /* Sum of every rank's byte count = total bytes appended. */
    MPI_Offset ntotal = 0;
    MPI_Check( MPI_Allreduce(&nbytes, &ntotal, 1, MPI_OFFSET, MPI_SUM, comm) );
    return ntotal;
}
/* Exclusive prefix reduction of `x` over the ranks of this communicator
 * using operation `op`; rank r receives the reduction of the values held
 * by ranks 0..r-1. Rank 0's result is forced to 0 (MPI leaves it
 * undefined) regardless of `op`. Serial (non-MPI) builds return 0.
 * NOTE(review): `T`, `CALL` and `MpiTraits` are declared outside this
 * view — presumably a template header precedes this definition. */
T Comm::exscan(T x, Omega_h_Op op) const {
#ifdef OMEGA_H_USE_MPI
  /* In-place variant: x is overwritten with the exclusive scan result. */
  CALL(MPI_Exscan(
      MPI_IN_PLACE, &x, 1, MpiTraits<T>::datatype(), mpi_op(op), impl_));
  /* MPI_Exscan leaves the receive value undefined on rank 0; define it. */
  if (rank() == 0) x = 0;
  return x;
#else
  (void)op;
  (void)x;
  return 0;
#endif
}
/* For each digit value (bucket), computes across the communicator:
 *   - the total number of elements with that digit on all processes
 *     (MPI_Allreduce), and
 *   - the number of such elements held by strictly lower-ranked
 *     processes (MPI_Exscan).
 * The local per-digit count is the last prefix-sum entry of each row of
 * `prefix_summed_bucket_table`.
 * Returns {global totals, exclusive-scan totals}.
 *
 * Fixes: the index loop compared a signed `auto i = 0` against the
 * unsigned `size()`; the vector size was silently narrowed to the MPI
 * int count; `&v[0]` is undefined on an empty vector, so `data()` is
 * used instead.
 */
std::tuple<vector<unsigned int>, vector<unsigned int>> compute_global_t_prime_sums_and_exscans_arrays(const vector<vector<unsigned int>> &prefix_summed_bucket_table, MPI_Comm comm)
{
    const size_t nbuckets = prefix_summed_bucket_table.size();
    vector<unsigned int> t_primes(nbuckets), t_primes_summed(nbuckets), t_primes_exscanned(nbuckets);

    // Local per-digit totals: the final prefix sum of each bucket row.
    for (size_t i = 0; i < nbuckets; ++i) {
        t_primes[i] = prefix_summed_bucket_table[i].back();
    }

    // MPI counts are int; cast explicitly rather than narrowing silently.
    const int count = static_cast<int>(nbuckets);

    // Global per-digit totals across all processes.
    MPI_Allreduce(t_primes.data(), t_primes_summed.data(), count, MPI_UNSIGNED, MPI_SUM, comm);

    // Per-digit totals on strictly lower-ranked processes. On rank 0 the
    // MPI_Exscan output is undefined; the zero-initialized vector is
    // returned there, as in the original.
    MPI_Exscan(t_primes.data(), t_primes_exscanned.data(), count, MPI_UNSIGNED, MPI_SUM, comm);

    return make_tuple(t_primes_summed, t_primes_exscanned);
}
/* Makes read ids globally unique: every element's read id on rank r is
 * shifted by the total number of reads held on ranks 0..r-1.
 * NOTE(review): `Q`, `ReadIdType` and `readTuple` come from an enclosing
 * template/namespace outside this view; `mxx` is an external MPI
 * wrapper library. */
void globalUniquenessOfIds(std::vector<Q>& localVector, ReadIdType localReadCount, MPI_Comm comm)
{
    int rank;
    MPI_Comm_rank(comm, &rank);

    ReadIdType previousReadIdSum;

    //Get MPI Datatype using mxx library
    mxx::datatype<ReadIdType> MPI_ReadIDType;
    /* Exclusive prefix sum of per-rank read counts. previousReadIdSum is
     * undefined on rank 0 after MPI_Exscan, but it is only read when
     * rank > 0, so that is safe. */
    MPI_Exscan(&localReadCount, &previousReadIdSum, 1, MPI_ReadIDType.type(), MPI_SUM, comm);

    //Update all elements
    if(rank > 0)
    {
        for ( auto& eachTuple : localVector)
        {
            //Update Pc only (shift the read-id field by the global offset)
            std::get<readTuple::rid>(eachTuple) = std::get<readTuple::rid>(eachTuple) + previousReadIdSum;
        }
    }
}
int main(int argc, char *argv[]) { int errs = 0; int rank, size; int sendbuf[1], recvbuf[1]; MPI_Comm comm; MTest_Init(&argc, &argv); comm = MPI_COMM_WORLD; MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &size); sendbuf[0] = rank; recvbuf[0] = -2; MPI_Exscan(sendbuf, recvbuf, 1, MPI_INT, MPI_SUM, comm); /* Check the results. rank 0 has no data. Input is * 0 1 2 3 4 5 6 7 8 ... * Output is * - 0 1 3 6 10 15 21 28 36 * (scan, not counting the contribution from the calling process) */ if (rank > 0) { int result = (((rank) * (rank - 1)) / 2); /* printf("%d: %d\n", rank, result); */ if (recvbuf[0] != result) { errs++; fprintf(stderr, "Error in recvbuf = %d on %d, expected %d\n", recvbuf[0], rank, result); } } else if (recvbuf[0] != -2) { errs++; fprintf(stderr, "Error in recvbuf on zero, is %d\n", recvbuf[0]); } MTest_Finalize(errs); return MTestReturnValue(errs); }
/* Redistributes the Chebyshev coefficient data of an InvMedTree into an
 * Elemental DistMatrix Y with [VC,STAR] distribution over the grid's
 * r*q processes. Always returns 0.
 * Steps: compute this rank's global element offset (MPI_Exscan), derive
 * round-robin send lengths/displacements, scale and pack the tree's
 * coefficients into complex values, then exchange with an all-to-all.
 * NOTE(review): `T`, `FMM_Mat_t`, `FMMNode_t`, `exscan`, `COORD_DIM` and
 * the pvfmm/Elemental types are declared outside this view. */
int tree2elemental(InvMedTree<FMM_Mat_t> *tree, El::DistMatrix<T,El::VC,El::STAR> &Y){
	int data_dof=2;
	int SCAL_EXP = 1;

	int nlocal,gsize; //local elements, start p_id, global size
	double *pt_array; // will hold local array
	int r,q,rq; //Grid sizes
	int nbigs; //Number of large sends (i.e. send 1 extra data point)
	int pstart; // p_id of nstart
	int rank = El::mpi::WorldRank(); //p_id
	int send_size; // base send size
	bool print = rank == -1;  // debug printing disabled (no rank is -1)

	// Get Grid and associated params
	const El::Grid* g = &(Y.Grid());
	r = g->Height();
	q = g->Width();
	MPI_Comm comm = (g->Comm()).comm;

	std::vector<FMMNode_t*> nlist = tree->GetNGLNodes();
	int cheb_deg = InvMedTree<FMM_Mat_t>::cheb_deg;
	int omp_p=omp_get_max_threads();
	// Number of Chebyshev coefficients per node for degree cheb_deg.
	size_t n_coeff3=(cheb_deg+1)*(cheb_deg+2)*(cheb_deg+3)/6;

	// Get sizes, array in petsc
	//VecGetSize(pt_vec,&gsize);
	gsize = tree->M/data_dof;
	nlocal = tree->m/data_dof;
	//VecGetLocalSize(pt_vec,&nlocal);
	//VecGetArray(pt_vec,&pt_array);
	int nstart = 0;
	/* Exclusive prefix sum of local sizes -> global index of this rank's
	 * first element. On rank 0 MPI_Exscan leaves the output undefined;
	 * nstart keeps its initializer 0 there. */
	MPI_Exscan(&nlocal,&nstart,1,MPI_INT,MPI_SUM,comm);
	//VecGetOwnershipRange(pt_vec,&nstart,NULL);

	//Find processor that nstart belongs to, number of larger sends
	rq = r * q;
	pstart = nstart % rq; //int div
	nbigs = nlocal % rq;
	send_size = nlocal/rq;

	if(print){
		std::cout << "r: " << r << " q: " << q <<std::endl;
		std::cout << "nstart: " << nstart << std::endl;
		std::cout << "ps: " << pstart << std::endl;
		std::cout << "nbigs: " << nbigs << std::endl;
		std::cout << "send_size: " << send_size << std::endl;
	}

	// Make send_lengths: base size everywhere, +1 for the nbigs processes
	// starting (cyclically) at pstart.
	std::vector<int> send_lengths(rq);
	std::fill(send_lengths.begin(),send_lengths.end(),send_size);
	if(nbigs >0){
		for(int j=0;j<nbigs;j++){
			send_lengths[(pstart + j) % rq] += 1;
		}
	}

	// Make send_disps (exclusive prefix sum of send_lengths)
	std::vector<int> send_disps = exscan(send_lengths);

	std::vector<El::Complex<double>> indata(nlocal);
	// copy the data from an ffm tree to into a local vec of complex data for sending
	#pragma omp parallel for
	for(size_t tid=0;tid<omp_p;tid++){
		// Each thread handles a contiguous slice of the node list.
		size_t i_start=(nlist.size()* tid )/omp_p;
		size_t i_end =(nlist.size()*(tid+1))/omp_p;
		for(size_t i=i_start;i<i_end;i++){
			pvfmm::Vector<double>& coeff_vec=nlist[i]->ChebData();
			// Depth-dependent scaling factor applied to each coefficient.
			double s=std::pow(0.5,COORD_DIM*nlist[i]->Depth()*0.5*SCAL_EXP);
			size_t offset=i*n_coeff3;
			for(size_t j=0;j<n_coeff3;j++){
				double real = coeff_vec[j]*s; // local indices as in the pvfmm trees
				double imag = coeff_vec[j+n_coeff3]*s;
				El::Complex<double> coeff;
				El::SetRealPart(coeff,real);
				El::SetImagPart(coeff,imag);
				indata[offset+j] = coeff;
			}
		}
	}

	// Make send_data: reorder indata so elements destined for each grid
	// process (round-robin by global index) are contiguous.
	std::vector<El::Complex<double>> send_data(nlocal);
	for(int proc=0;proc<rq;proc++){
		int offset = send_disps[proc];
		int base_idx = (proc - pstart + rq) % rq;
		for(int j=0; j<send_lengths[proc]; j++){
			int idx = base_idx + (j * rq);
			send_data[offset + j] = indata[idx];
		}
	}

	// Do all2all to get recv_lengths
	std::vector<int> recv_lengths(rq);
	MPI_Alltoall(&send_lengths[0], 1, MPI_INT, &recv_lengths[0], 1, MPI_INT,comm);

	// Scan to get recv_disps
	std::vector<int> recv_disps = exscan(recv_lengths);

	// Do all2allv to get data on correct processor
	El::Complex<double> * recv_data = Y.Buffer();
	//MPI_Alltoallv(&send_data[0],&send_lengths[0],&send_disps[0],MPI_DOUBLE, \
	// &recv_data[0],&recv_lengths[0],&recv_disps[0],MPI_DOUBLE,comm);
	El::mpi::AllToAll(&send_data[0], &send_lengths[0], &send_disps[0], recv_data,&recv_lengths[0],&recv_disps[0],comm);

	if(print){
		std::cout << "Send data: " <<std::endl << send_data <<std::endl;
		std::cout << "Send lengths: " <<std::endl << send_lengths <<std::endl;
		std::cout << "Send disps: " <<std::endl << send_disps <<std::endl;
		std::cout << "Recv data: " <<std::endl << recv_data <<std::endl;
		std::cout << "Recv lengths: " <<std::endl << recv_lengths <<std::endl;
		std::cout << "Recv disps: " <<std::endl << recv_disps <<std::endl;
	}

	return 0;
}
int main( int argc, char *argv[] ) { int errs = 0; int rank, size; int minsize = 2, count; int *sendbuf, *recvbuf, i; MPI_Comm comm; MTest_Init( &argc, &argv ); /* The following illustrates the use of the routines to run through a selection of communicators and datatypes. Use subsets of these for tests that do not involve combinations of communicators, datatypes, and counts of datatypes */ while (MTestGetIntracommGeneral( &comm, minsize, 1 )) { if (comm == MPI_COMM_NULL) continue; MPI_Comm_rank( comm, &rank ); MPI_Comm_size( comm, &size ); for (count = 1; count < 65000; count = count * 2) { sendbuf = (int *)malloc( count * sizeof(int) ); recvbuf = (int *)malloc( count * sizeof(int) ); for (i=0; i<count; i++) { sendbuf[i] = rank + i * size; recvbuf[i] = -1; } MPI_Exscan( sendbuf, recvbuf, count, MPI_INT, MPI_SUM, comm ); /* Check the results. rank 0 has no data */ if (rank > 0) { int result; for (i=0; i<count; i++) { result = rank * i * size + ((rank) * (rank-1))/2; if (recvbuf[i] != result) { errs++; if (errs < 10) { fprintf( stderr, "Error in recvbuf[%d] = %d on %d, expected %d\n", i, recvbuf[i], rank, result ); } } } } #if MTEST_HAVE_MIN_MPI_VERSION(2,2) /* now try the MPI_IN_PLACE flavor */ for (i=0; i<count; i++) { sendbuf[i] = -1; /* unused */ recvbuf[i] = rank + i * size; } MPI_Exscan( MPI_IN_PLACE, recvbuf, count, MPI_INT, MPI_SUM, comm ); /* Check the results. 
rank 0's data must remain unchanged */ for (i=0; i<count; i++) { int result; if (rank == 0) result = rank + i * size; else result = rank * i * size + ((rank) * (rank-1))/2; if (recvbuf[i] != result) { errs++; if (errs < 10) { fprintf( stderr, "Error in recvbuf[%d] = %d on %d, expected %d\n", i, recvbuf[i], rank, result ); } } } MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN); /* Make sure that we check for buffer aliasing properly */ if (MPI_SUCCESS == MPI_Exscan( recvbuf, recvbuf, count, MPI_INT, MPI_SUM, comm )) errs++; #endif free( sendbuf ); free( recvbuf ); } MTestFreeComm( &comm ); } MTest_Finalize( errs ); MPI_Finalize(); return 0; }
/// Write a vtu file static int pvo_vtu_write_data( pvo_file_t self, pvo_xml_file_t f ) { int err = 0; pvo_vtu_file_t fh = (pvo_vtu_file_t )self; int ibuf[3], jbuf[3]; size_t offset = 0; int gnnodes, gncells, gnnz, bnodes, bnnz; pvo_var_t* p; int nbytes; MPI_Datatype type; size_t i; if( NULL == f->island ) { PVO_ERROR( "Invalid input: NULL == fh->base.cki." ); goto fn_fail; } ibuf[0] = fh->nnodes; ibuf[1] = fh->ncells; ibuf[2] = fh->cia[fh->ncells]; if( MPI_Allreduce( MPI_IN_PLACE, ibuf, 3, MPI_INT, MPI_SUM, f->island->comm )) { PVO_WARN( "MPI_Allreduce failed." ); goto fn_fail; } gnnodes = ibuf[0]; gncells = ibuf[1]; gnnz = ibuf[2]; /* Bottom index for the local process. This value can be added to the local * node or cell index to get unique identifiers (within the island */ ibuf[0] = fh->nnodes; ibuf[1] = fh->cia[fh->ncells]; if( MPI_Exscan( ibuf, jbuf, 2, MPI_INT, MPI_SUM, f->island->comm )) { PVO_WARN( "MPI_Exscane failed." ); goto fn_fail; } if( 0 == f->island->rank ) { jbuf[0] = 0; jbuf[1] = 0; } bnodes = jbuf[0]; bnnz = jbuf[1]; pvo_xml_file_new_group( f, "VTKFile type=\"UnstructuredGrid\" " "version=\"0.1\" " "byte_order=\"%s\"", self->bo_str ); pvo_xml_file_new_group( f, "UnstructuredGrid" ); pvo_xml_file_new_group( f, "Piece NumberOfPoints=\"%d\" NumberOfCells=\"%d\"", gnnodes, gncells ); pvo_xml_file_new_group( f, "PointData" ); for( p = self->cki->vlist; p; p = p->next ) { if( PVO_VAR_NODEDATA != p->grp ) continue; p->offset = offset; pvo_xml_file_write_element( f, "DataArray type=\"%s\" Name=\"%s\" NumberOfComponents=\"%d\" format=\"appended\" offset=\"%lu\"", pvo_var_type_names[p->type], p->name, p->ncomps, p->offset ); offset += pvo_var_type_sizeof[p->type]*p->ncomps*gnnodes + 4; } pvo_xml_file_end_group( f, "PointData" ); pvo_xml_file_new_group( f, "CellData" ); for( p = self->cki->vlist; p; p = p->next ) { if( PVO_VAR_CELLDATA != p->grp ) continue; p->offset = offset; pvo_xml_file_write_element( f, "DataArray type=\"%s\" Name=\"%s\" 
NumberOfComponents=\"%d\" format=\"appended\" offset=\"%lu\"", pvo_var_type_names[p->type], p->name, p->ncomps, p->offset ); offset += pvo_var_type_sizeof[p->type]*p->ncomps*gncells + 4; } pvo_xml_file_end_group( f, "CellData" ); pvo_xml_file_new_group( f, "Points" ); pvo_xml_file_write_element( f, "DataArray type=\"Float32\" NumberOfComponents=\"3\" format=\"appended\" offset=\"%lu\"", offset ); offset += pvo_var_type_sizeof[PVO_VAR_FLOAT32]*3*gnnodes + 4; pvo_xml_file_end_group( f, "Points" ); pvo_xml_file_new_group( f, "Cells" ); pvo_xml_file_write_element( f, "DataArray type=\"Int32\" Name=\"connectivity\" format=\"appended\" offset=\"%lu\"", offset ); offset += pvo_var_type_sizeof[PVO_VAR_INT32]*gnnz + 4; pvo_xml_file_write_element( f, "DataArray type=\"Int32\" Name=\"offsets\" format=\"appended\" offset=\"%lu\"", offset ); offset += pvo_var_type_sizeof[PVO_VAR_INT32]*gncells + 4; pvo_xml_file_write_element( f, "DataArray type=\"UInt8\" Name=\"types\" format=\"appended\" offset=\"%lu\"", offset ); pvo_xml_file_end_group( f, "Cells" ); pvo_xml_file_end_group( f, "Piece" ); pvo_xml_file_end_group( f, "UnstructuredGrid" ); pvo_xml_file_new_group( f, "AppendedData encoding=\"raw\"" ); pvo_xml_file_write_single( f, "_", 1, MPI_CHAR ); for( p = self->cki->vlist; p; p = p->next ) { if( PVO_VAR_NODEDATA != p->grp ) continue; nbytes = pvo_var_type_sizeof[p->type]*p->ncomps*gnnodes; pvo_xml_file_write_single ( f, &nbytes, 1, MPI_INT ); pvo_var_type_mpi(p->type, &type); pvo_xml_file_write_ordered( f, (void *)p->ptr, p->ncomps*fh->nnodes, type ); } for( p = self->cki->vlist; p; p = p->next ) { if( PVO_VAR_CELLDATA != p->grp ) continue; nbytes = pvo_var_type_sizeof[p->type]*p->ncomps*gncells; pvo_xml_file_write_single ( f, &nbytes, 1, MPI_INT ); pvo_var_type_mpi(p->type, &type); pvo_xml_file_write_ordered( f, (void* )p->ptr, p->ncomps*fh->ncells, type ); } /* Write point coordinates */ nbytes = pvo_var_type_sizeof[PVO_VAR_FLOAT32]*3*gnnodes; pvo_xml_file_write_single ( f, 
&nbytes, 1, MPI_INT ); pvo_var_type_mpi( PVO_VAR_FLOAT32, &type ); pvo_xml_file_write_ordered( f, fh->pts, 3*fh->nnodes, type ); /* Bottom index for the local process. This value can be added to the local * node or cell index to get unique identifiers (within the island */ /* Write connectivity */ nbytes = pvo_var_type_sizeof[PVO_VAR_INT32]*gnnz; pvo_xml_file_write_single ( f, &nbytes, 1, MPI_INT ); for( i = 0; i < fh->cia[fh->ncells]; ++i ) fh->cja[i] += bnodes; pvo_var_type_mpi( PVO_VAR_INT32, &type ); pvo_xml_file_write_ordered( f, fh->cja, fh->cia[fh->ncells], type ); for( i = 0; i < fh->cia[fh->ncells]; ++i ) fh->cja[i] -= bnodes; /* Write offsets */ nbytes = pvo_var_type_sizeof[PVO_VAR_INT32]*gncells; pvo_xml_file_write_single ( f, &nbytes, 1, MPI_INT ); for( i = 1; i <= fh->ncells; ++i ) fh->cia[i] += bnnz; pvo_var_type_mpi( PVO_VAR_INT32, &type ); pvo_xml_file_write_ordered( f, fh->cia+1, fh->ncells, type ); for( i = 1; i <= fh->ncells; ++i ) fh->cia[i] -= bnnz; /* Write types */ nbytes = pvo_var_type_sizeof[PVO_VAR_UINT8]*gncells; pvo_xml_file_write_single ( f, &nbytes, 1, MPI_INT ); pvo_var_type_mpi( PVO_VAR_UINT8, &type ); pvo_xml_file_write_ordered( f, fh->types, fh->ncells, type ); pvo_xml_file_end_group( f, "AppendedData" ); pvo_xml_file_end_group( f, "VTKFile" ); fn_exit: return err; fn_fail: err = -1; goto fn_exit; }
/* check whether all items in buf are already in sorted order */ int DTCMP_Is_sorted( const void* buf, int count, MPI_Datatype key, MPI_Datatype keysat, DTCMP_Op cmp, DTCMP_Flags hints, MPI_Comm comm, int* flag) { int rc = DTCMP_SUCCESS; /* assume that items are globally sorted, * we'll set this to 0 if we find otherwise */ int sorted = 1; /* get our rank and the number of ranks in the communicator */ int rank, ranks; MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &ranks); /* first, step through and check that all of our local items are in order */ DTCMP_Is_sorted_local(buf, count, key, keysat, cmp, hints, &sorted); /* bail out at this point if ranks == 1 */ if (ranks <= 1) { *flag = sorted; return DTCMP_SUCCESS; } /* get extent of keysat */ MPI_Aint lb, extent; MPI_Type_get_extent(keysat, &lb, &extent); /* get true extent of key */ MPI_Aint key_true_lb, key_true_extent; MPI_Type_get_true_extent(key, &key_true_lb, &key_true_extent); /* TODO: if we know that each proc has an item, * we could just do a single pt2pt send to the rank one higher, * compare, then allreduce, and thereby avoid the type/op creation * and scan that follows */ /* allocate type for scan, one int to say whether key is valid, * and our largest key */ size_t item_size = sizeof(int) + key_true_extent; char* sendbuf = dtcmp_malloc(item_size, 0, __FILE__, __LINE__); char* recvbuf = dtcmp_malloc(item_size, 0, __FILE__, __LINE__); /* copy our largest item into our send buffer, * set valid flag to 1 if we have a value */ int* valid = (int*) sendbuf; void* value = (void*) (sendbuf + sizeof(int)); if (count > 1) { *valid = 1; /* get pointer to largest element in our buffer, * and copy it to our send buffer */ const void* lastitem = (const void*) ((const char*)buf + (count - 1) * extent); DTCMP_Memcpy(value, 1, key, lastitem, 1, key); } else { /* we dont have any items, so set valid flag to 0 */ *valid = 0; } /* create and commit type that consists of leading int followed by key */ MPI_Datatype 
validtype; dtcmp_type_concat2(MPI_INT, key, &validtype); /* create user-defined reduction operation to copy key if its valid */ MPI_Op validop; MPI_Op_create(copy_key_if_valid, 0, &validop); /* execute scan to get key from next process to our left (that has an item) */ MPI_Exscan(sendbuf, recvbuf, 1, validtype, validop, comm); /* free off our user-defined reduction op and datatype */ MPI_Op_free(&validop); MPI_Type_free(&validtype); /* compare our smallest item to the received item */ if (count > 0 && rank > 0) { int recvvalid = *(int*) recvbuf; if (recvvalid) { const void* recvkey = (const void*) (recvbuf + sizeof(int)); if (dtcmp_op_eval(recvkey, buf, cmp) > 0) { sorted = 0; } } } /* allreduce to determine whether all items are in order */ int all_sorted; MPI_Allreduce(&sorted, &all_sorted, 1, MPI_INT, MPI_LAND, comm); /* free the scratch space */ dtcmp_free(&recvbuf); dtcmp_free(&sendbuf); /* set caller's output flag and return */ *flag = all_sorted; return rc; }
slint_t mpi_partition_radix2(elements_t *s, partcond2_t *pc, slint_t rhigh, slint_t rlow, slint_t rwidth, int *scounts, int *sdispls, int size, int rank, MPI_Comm comm) /* sl_proto, sl_func mpi_partition_radix2 */ { slkey_pure_t max_nclasses; slkey_pure_t nclasses, bit_mask; slkey_pure_t k; const slint_t max_nareas = size - 1; slint_t nareas, nareas_new; elements_t areas0[max_nareas], areas1[max_nareas], *areas, *areas_new; double *locals, *globals; double *local_counts, *local_weights, *global_counts, *global_weights; const slint_t max_nparts = size - 1; slint_t parts_low, parts_high, nparts_removed; slint_t parts[max_nparts], part_areas[max_nparts]; double parts_range_[2 * 2 * (1 + max_nparts + 1)]; double *parts_range = parts_range_ + (2 * 2); double parts_minmax_[2 * 4 * (1 + max_nparts + 1)]; double *parts_minmax = parts_minmax_ + (2 * 4); slint_t parts_update_[1 + max_nparts + 1]; slint_t *parts_update = parts_update_ + 1; double parts_minmax_new[2 * 4]; double current_minmax[2 * 2]; double final_locals[2 * max_nparts]; slint_t i, j, jp1, jm1, l, lp1, lm1; slint_t current_width; double minmax[2 * 4 * size]; slint_t last_new_area, last_new_class; #ifdef HAVENT_MPI_IN_PLACE double local_minmax[2 * 4]; #endif slint_t lc, lcs, gc, gcs; double lw, gw, lws, gws; double d, m; elements_t xi, end; slint_t round = 0; slint_t direction = 1; slint_t refine, finalize; #ifdef RCOUNTS_RDISPLS int *rcounts, *rdispls; #endif #ifdef WEIGHT_STATS slint_t total_count = 0, partial_counts[size + 1]; double total_weight = 0.0, partial_weights[size + 1]; double vmin, vmax; # ifdef HAVENT_MPI_IN_PLACE slint_t partial_counts2[size + 1]; double partial_weights2[size + 1]; # endif #endif rti_treset(rti_tid_mpi_partition_radix2_while); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_count); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_allreduce); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_round1); /* sl_tid */ 
rti_treset(rti_tid_mpi_partition_radix2_while_round1_allgather); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_exscan); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_check); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_check_pre); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_check_classes); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_check_final); /* sl_tid */ rti_treset(rti_tid_mpi_partition_radix2_while_check_post); /* sl_tid */ rti_tstart(rti_tid_mpi_partition_radix2_sync); #ifdef SYNC_ON_INIT MPI_Barrier(comm); #endif rti_tstop(rti_tid_mpi_partition_radix2_sync); rti_tstart(rti_tid_mpi_partition_radix2); if (rhigh < 0) rhigh = radix_high; if (rlow < 0) rlow = radix_low; if (rwidth < 0) rwidth = sort_radix_width_default; max_nclasses = powof2_typed(rwidth, slkey_pure_t); locals = sl_alloc(2 * (max_nareas * max_nclasses + max_nareas), sizeof(double)); globals = sl_alloc(2 * (max_nareas * max_nclasses + max_nareas), sizeof(double)); areas = areas0; areas_new = areas1; /* init the first area (all elements) */ nareas = 1; elem_assign(s, &areas[0]); /* init all parts */ parts_low = 0; parts_high = max_nparts - 1; for (i = parts_low; i <= parts_high; ++i) { parts[i] = i; part_areas[i] = 0; } /* init sdispls */ for (i = 0; i < size; ++i) sdispls[i] = 0; rti_tstart(rti_tid_mpi_partition_radix2_while); while (parts_low <= parts_high) { ++round; /* setup bitmask */ current_width = xmin(rwidth, rhigh - rlow + 1); rhigh -= (current_width > 0)?current_width - 1:rhigh; nclasses = (current_width > 0)?powof2_typed(current_width, slkey_pure_t):1; bit_mask = nclasses - 1; SL_TRACE_IF(DEBUG_OR_NOT, "ROUND: %" sl_int_type_fmt ", rhigh: %" sl_int_type_fmt ", current_width: %" sl_int_type_fmt ", nclasses: %" sl_key_pure_type_fmt, round, rhigh, current_width, nclasses); finalize = (current_width <= 0); if (!finalize || round == 1) { /* init counters */ local_counts = locals; global_counts = globals; local_weights = 
locals + (nareas * nclasses) + nareas; global_weights = globals + (nareas * nclasses) + nareas; /* zero all counter */ for (i = 0; i < nareas; ++i) for (k = 0; k < nclasses; ++k) local_counts[i * nclasses + k] = local_weights[i * nclasses + k] = 0.0; rti_tstart(rti_tid_mpi_partition_radix2_while_count); /* for every area */ for (i = 0; i < nareas; ++i) { elem_assign_at(&areas[i], areas[i].size, &end); if (nclasses > 1) { /* counts and weights in every class */ for (elem_assign(&areas[i], &xi); xi.keys < end.keys; elem_inc(&xi)) { k = radix_key2class(key_purify(*xi.keys), rhigh, bit_mask); local_counts[i * nclasses + k] += 1; local_weights[i * nclasses + k] += elem_weight_one(&xi, 0); } } else { /* total counts and weights */ local_counts[i * nclasses + 0] = areas[i].size; for (elem_assign(&areas[i], &xi); xi.keys < end.keys; elem_inc(&xi)) local_weights[i * nclasses + 0] += elem_weight_one(&xi, 0); } /* total counts and weights in this area */ local_counts[nareas * nclasses + i] = areas[i].size; local_weights[nareas * nclasses + i] = 0.0; for (k = 0; k < nclasses; ++k) local_weights[nareas * nclasses + i] += local_weights[i * nclasses + k]; } rti_tstop(rti_tid_mpi_partition_radix2_while_count); --rhigh; rti_tstart(rti_tid_mpi_partition_radix2_while_allreduce); /* create global counts and weights */ #ifdef MPI_PARTITION_RADIX_REDUCEBCAST_THRESHOLD if (size >= MPI_PARTITION_RADIX_REDUCEBCAST_THRESHOLD) { MPI_Reduce(locals, globals, (1 + 1) * (nareas * nclasses + nareas), MPI_DOUBLE, MPI_SUM, REDUCEBCAST_ROOT, comm); MPI_Bcast(globals, (1 + 1) * (nareas * nclasses + nareas), MPI_DOUBLE, REDUCEBCAST_ROOT, comm); } else #endif MPI_Allreduce(locals, globals, (1 + 1) * (nareas * nclasses + nareas), MPI_DOUBLE, MPI_SUM, comm); rti_tstop(rti_tid_mpi_partition_radix2_while_allreduce); } #ifdef TIMING SL_TRACE_IF(DEBUG_OR_NOT, "allreduce: %f, nareas: %" sl_int_type_fmt ", nclasses: %" sl_key_type_fmt ", doubles: %" sl_int_type_fmt, 
rti_tlast(rti_tid_mpi_partition_radix2_while_allreduce), nareas, nclasses, (1 + 1) * (nareas * nclasses + nareas)); #endif /* if (DEBUG_OR_NOT) { printf("%d: locals\n", rank); for (i = 0; i < nareas; ++i) { printf("%d: %" sl_int_type_fmt ":", rank, i); for (k = 0; k < nclasses; ++k) printf(" %f", local_counts[i * nclasses + k]); printf(" = %f\n", local_counts[nareas * nclasses + i]); printf("%d: %" sl_int_type_fmt ":", rank, i); for (k = 0; k < nclasses; ++k) printf(" %f", local_weights[i * nclasses + k]); printf(" = %f\n", local_weights[nareas * nclasses + i]); } printf("%d: globals\n", rank); for (i = 0; i < nareas; ++i) { printf("%d: %" sl_int_type_fmt ":", rank, i); for (k = 0; k < nclasses; ++k) printf(" %f", global_counts[i * nclasses + k]); printf(" = %f\n", global_counts[nareas * nclasses + i]); printf("%d: %" sl_int_type_fmt ":", rank, i); for (k = 0; k < nclasses; ++k) printf(" %f", global_weights[i * nclasses + k]); printf(" = %f\n", global_weights[nareas * nclasses + i]); } }*/ /* do some initializations */ if (round == 1) { rti_tstart(rti_tid_mpi_partition_radix2_while_round1); /* distribute min/max counts and weights */ minmax[rank * 2 * 4 + 0 + 0] = (pc->min_count >= 0)?pc->min_count:(-pc->min_count * global_counts[nareas * nclasses + 0] / size); minmax[rank * 2 * 4 + 0 + 1] = (pc->max_count >= 0)?pc->max_count:(-pc->max_count * global_counts[nareas * nclasses + 0] / size); minmax[rank * 2 * 4 + 0 + 2] = (pc->min_cpart >= 0)?pc->min_cpart:(-pc->min_cpart * global_counts[nareas * nclasses + 0]); minmax[rank * 2 * 4 + 0 + 3] = (pc->max_cpart >= 0)?pc->max_cpart:(-pc->max_cpart * global_counts[nareas * nclasses + 0]); minmax[rank * 2 * 4 + 4 + 0] = (pc->min_weight >= 0)?pc->min_weight:(-pc->min_weight * global_weights[nareas * nclasses + 0] / size); minmax[rank * 2 * 4 + 4 + 1] = (pc->max_weight >= 0)?pc->max_weight:(-pc->max_weight * global_weights[nareas * nclasses + 0] / size); minmax[rank * 2 * 4 + 4 + 2] = (pc->min_wpart >= 
0)?pc->min_wpart:(-pc->min_wpart * global_weights[nareas * nclasses + 0]); minmax[rank * 2 * 4 + 4 + 3] = (pc->max_wpart >= 0)?pc->max_wpart:(-pc->max_wpart * global_weights[nareas * nclasses + 0]); rti_tstart(rti_tid_mpi_partition_radix2_while_round1_allgather); #ifdef HAVENT_MPI_IN_PLACE local_minmax[0 + 0] = minmax[rank * 2 * 4 + 0 + 0]; local_minmax[0 + 1] = minmax[rank * 2 * 4 + 0 + 1]; local_minmax[0 + 2] = minmax[rank * 2 * 4 + 0 + 2]; local_minmax[0 + 3] = minmax[rank * 2 * 4 + 0 + 3]; local_minmax[4 + 0] = minmax[rank * 2 * 4 + 4 + 0]; local_minmax[4 + 1] = minmax[rank * 2 * 4 + 4 + 1]; local_minmax[4 + 2] = minmax[rank * 2 * 4 + 4 + 2]; local_minmax[4 + 3] = minmax[rank * 2 * 4 + 4 + 3]; MPI_Allgather(local_minmax, 2 * 4, MPI_DOUBLE, minmax, 2 * 4, MPI_DOUBLE, comm); /* MPI_Gather(local_minmax_weights, 2 * 4, MPI_DOUBLE, minmax_weights, 2 * 4, MPI_DOUBLE, 0, comm); MPI_Bcast(minmax_weights, 2 * 4 * size, MPI_DOUBLE, 0, comm);*/ #else MPI_Allgather(MPI_IN_PLACE, 2 * 4, MPI_DOUBLE, minmax_weights, 2 * 4, MPI_DOUBLE, comm); #endif rti_tstop(rti_tid_mpi_partition_radix2_while_round1_allgather); #ifdef WEIGHT_STATS total_count = global_counts[nareas * nclasses + 0]; total_weight = global_weights[nareas * nclasses + 0]; #endif parts_minmax[2 * 4 * (parts_low - 1) + 0 + 0] = parts_minmax[2 * 4 * (parts_low - 1) + 0 + 2] = 0; parts_minmax[2 * 4 * (parts_low - 1) + 0 + 1] = parts_minmax[2 * 4 * (parts_low - 1) + 0 + 3] = 0; parts_minmax[2 * 4 * (parts_low - 1) + 4 + 0] = parts_minmax[2 * 4 * (parts_low - 1) + 4 + 2] = 0; parts_minmax[2 * 4 * (parts_low - 1) + 4 + 1] = parts_minmax[2 * 4 * (parts_low - 1) + 4 + 3] = 0; parts_minmax[2 * 4 * (parts_high + 1) + 0 + 0] = parts_minmax[2 * 4 * (parts_high + 1) + 0 + 2] = 0; parts_minmax[2 * 4 * (parts_high + 1) + 0 + 1] = parts_minmax[2 * 4 * (parts_high + 1) + 0 + 3] = global_counts[nareas * nclasses + 0]; parts_minmax[2 * 4 * (parts_high + 1) + 4 + 0] = parts_minmax[2 * 4 * (parts_high + 1) + 4 + 2] = 0; parts_minmax[2 
* 4 * (parts_high + 1) + 4 + 1] = parts_minmax[2 * 4 * (parts_high + 1) + 4 + 3] = global_weights[nareas * nclasses + 0]; parts_range[2 * 2 * (parts_low - 1) + 0 + 0] = parts_range[2 * 2 * (parts_high + 1) + 0 + 0] = 0.0; parts_range[2 * 2 * (parts_low - 1) + 0 + 1] = parts_range[2 * 2 * (parts_high + 1) + 0 + 1] = global_counts[nareas * nclasses + 0]; parts_range[2 * 2 * (parts_low - 1) + 2 + 0] = parts_range[2 * 2 * (parts_high + 1) + 2 + 0] = 0.0; parts_range[2 * 2 * (parts_low - 1) + 2 + 1] = parts_range[2 * 2 * (parts_high + 1) + 2 + 1] = global_weights[nareas * nclasses + 0]; for (i = parts_high; i >= parts_low; --i) { parts_minmax[2 * 4 * parts[i] + 0 + 1] = parts_minmax[2 * 4 * (parts[i] + 1) + 0 + 1] - minmax[2 * 4 * (parts[i] + 1) + 0 + 0]; parts_minmax[2 * 4 * parts[i] + 0 + 3] = parts_minmax[2 * 4 * (parts[i] + 1) + 0 + 3] - minmax[2 * 4 * (parts[i] + 1) + 0 + 1]; parts_minmax[2 * 4 * parts[i] + 4 + 1] = parts_minmax[2 * 4 * (parts[i] + 1) + 4 + 1] - minmax[2 * 4 * (parts[i] + 1) + 4 + 0]; parts_minmax[2 * 4 * parts[i] + 4 + 3] = parts_minmax[2 * 4 * (parts[i] + 1) + 4 + 3] - minmax[2 * 4 * (parts[i] + 1) + 4 + 1]; parts_minmax[2 * 4 * parts[i] + 0 + 0] = parts_minmax[2 * 4 * parts[i] + 0 + 2] = parts_minmax[2 * 4 * parts[i] + 4 + 0] = parts_minmax[2 * 4 * parts[i] + 4 + 2] = -1; parts_range[2 * 2 * parts[i] + 0 + 0] = 0.0; parts_range[2 * 2 * parts[i] + 0 + 1] = global_counts[nareas * nclasses + 0]; parts_range[2 * 2 * parts[i] + 2 + 0] = 0.0; parts_range[2 * 2 * parts[i] + 2 + 1] = global_weights[nareas * nclasses + 0]; /* SL_ASSERT(minmax[2 * 4 * (parts[i] + 1) + 0 + 2] <= minmax[2 * 4 * (parts[i] + 0) + 0 + 3]);*/ /* SL_ASSERT(minmax[2 * 4 * (parts[i] + 1) + 4 + 2] <= minmax[2 * 4 * (parts[i] + 0) + 4 + 3]);*/ parts_update[parts[i]] = 1; if (finalize) { final_locals[2 * i + 0] = local_counts[nareas * nclasses + 0]; final_locals[2 * i + 1] = local_weights[nareas * nclasses + 0]; } } rti_tstop(rti_tid_mpi_partition_radix2_while_round1); } if 
(finalize) { j = parts_high - parts_low + 1; SL_TRACE_IF(DEBUG_OR_NOT, "Exscan: finalizing %" sl_int_type_fmt " parts", j); rti_tstart(rti_tid_mpi_partition_radix2_while_exscan); MPI_Exscan(&final_locals[2 * parts_low], &locals[2 * parts_low], 2 * j, MPI_DOUBLE, MPI_SUM, comm); if (rank == 0) for (i = parts_low; i <= parts_high; ++i) locals[2 * i + 0] = locals[2 * i + 1] = 0; rti_tstop(rti_tid_mpi_partition_radix2_while_exscan); } nareas_new = 0; last_new_area = last_new_class = -1; /* check all remaining parts */ SL_TRACE_IF(DEBUG_OR_NOT, "ROUND: %" sl_int_type_fmt ", %s", round, (direction > 0)?"forward":"backward"); nparts_removed = 0; rti_tstart(rti_tid_mpi_partition_radix2_while_check); i = (direction > 0)?parts_low:parts_high; while ((direction > 0)?(i <= parts_high):(i >= parts_low)) { rti_tstart(rti_tid_mpi_partition_radix2_while_check_pre); SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": PART: %" sl_int_type_fmt ",%" sl_int_type_fmt, round, i, parts[i]); j = 2 * 4 * parts[i]; jp1 = 2 * 4 * (parts[i] + 1); jm1 = 2 * 4 * (parts[i] - 1); l = 2 * 2 * parts[i]; lp1 = 2 * 2 * (parts[i] + 1); lm1 = 2 * 2 * (parts[i] - 1); if (parts_update[parts[i]]) { if (direction > 0) { parts_minmax_new[0 + 0] = parts_minmax[jm1 + 0 + 0] + minmax[j + 0 + 0]; parts_minmax_new[0 + 2] = parts_minmax[jm1 + 0 + 2] + minmax[j + 0 + 1]; parts_minmax_new[4 + 0] = parts_minmax[jm1 + 4 + 0] + minmax[j + 4 + 0]; parts_minmax_new[4 + 2] = parts_minmax[jm1 + 4 + 2] + minmax[j + 4 + 1]; SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": %f + %f, %f + %f / %f + %f, %f + %f", i, parts[i], parts_minmax[jm1 + 0 + 0], minmax[j + 0 + 0], parts_minmax[jm1 + 0 + 2], minmax[j + 0 + 1], parts_minmax[jm1 + 4 + 0], minmax[j + 4 + 0], parts_minmax[jm1 + 4 + 2], minmax[j + 4 + 1]); SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": 0. 
parts_minmax_new: %f %f %f %f / %f %f %f %f", parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]); if (parts_minmax_new[0 + 0] < minmax[jp1 + 0 + 2]) parts_minmax_new[0 + 0] = minmax[jp1 + 0 + 2]; if (parts_minmax_new[0 + 2] > minmax[j + 0 + 3]) parts_minmax_new[0 + 2] = minmax[j + 0 + 3]; if (parts_minmax_new[4 + 0] < minmax[jp1 + 4 + 2]) parts_minmax_new[4 + 0] = minmax[jp1 + 4 + 2]; if (parts_minmax_new[4 + 2] > minmax[j + 4 + 3]) parts_minmax_new[4 + 2] = minmax[j + 4 + 3]; parts_minmax_new[0 + 1] = parts_minmax[j + 0 + 1]; parts_minmax_new[0 + 3] = parts_minmax[j + 0 + 3]; parts_minmax_new[4 + 1] = parts_minmax[j + 4 + 1]; parts_minmax_new[4 + 3] = parts_minmax[j + 4 + 3]; } else { parts_minmax_new[0 + 1] = parts_minmax[jp1 + 0 + 1] - minmax[jp1 + 0 + 0]; parts_minmax_new[0 + 3] = parts_minmax[jp1 + 0 + 3] - minmax[jp1 + 0 + 1]; parts_minmax_new[4 + 1] = parts_minmax[jp1 + 4 + 1] - minmax[jp1 + 4 + 0]; parts_minmax_new[4 + 3] = parts_minmax[jp1 + 4 + 3] - minmax[jp1 + 4 + 1]; SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": %f - %f, %f - %f / %f - %f, %f - %f", i, parts[i], parts_minmax[jp1 + 0 + 1], minmax[jp1 + 0 + 0], parts_minmax[jp1 + 0 + 3], minmax[jp1 + 0 + 1], parts_minmax[jp1 + 4 + 1], minmax[jp1 + 4 + 0], parts_minmax[jp1 + 4 + 3], minmax[jp1 + 4 + 1]); SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": 0. 
parts_minmax_new: %f %f %f %f / %f %f %f %f", parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]); if (parts_minmax_new[0 + 3] < minmax[jp1 + 0 + 2]) parts_minmax_new[0 + 3] = minmax[jp1 + 0 + 2]; if (parts_minmax_new[0 + 1] > minmax[j + 0 + 3]) parts_minmax_new[0 + 1] = minmax[j + 0 + 3]; if (parts_minmax_new[4 + 3] < minmax[jp1 + 4 + 2]) parts_minmax_new[4 + 3] = minmax[jp1 + 4 + 2]; if (parts_minmax_new[4 + 1] > minmax[j + 4 + 3]) parts_minmax_new[4 + 1] = minmax[j + 4 + 3]; parts_minmax_new[0 + 0] = parts_minmax[j + 0 + 0]; parts_minmax_new[0 + 2] = parts_minmax[j + 0 + 2]; parts_minmax_new[4 + 0] = parts_minmax[j + 4 + 0]; parts_minmax_new[4 + 2] = parts_minmax[j + 4 + 2]; } SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": 1. parts_minmax_new: %f %f %f %f / %f %f %f %f", parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]); SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": minmax: %f %f / %f %f", parts[i], minmax[2 * 4 * (parts[i] + 1) + 0 + 2], minmax[2 * 4 * (parts[i] + 0) + 0 + 3], minmax[2 * 4 * (parts[i] + 1) + 4 + 2], minmax[2 * 4 * (parts[i] + 0) + 4 + 3]); if (parts_minmax_new[0 + 0] > parts_minmax_new[0 + 1]) parts_minmax_new[0 + 0] = parts_minmax_new[0 + 1] = (parts_minmax_new[0 + 0] + parts_minmax_new[0 + 1]) / 2; if (parts_minmax_new[0 + 2] < parts_minmax_new[0 + 3]) parts_minmax_new[0 + 2] = parts_minmax_new[0 + 3] = (parts_minmax_new[0 + 2] + parts_minmax_new[0 + 3]) / 2; if (parts_minmax_new[4 + 0] > parts_minmax_new[4 + 1]) parts_minmax_new[4 + 0] = parts_minmax_new[4 + 1] = (parts_minmax_new[4 + 0] + parts_minmax_new[4 + 1]) / 2; if (parts_minmax_new[4 + 2] < parts_minmax_new[4 + 3]) parts_minmax_new[4 + 2] = parts_minmax_new[4 + 3] = 
(parts_minmax_new[4 + 2] + parts_minmax_new[4 + 3]) / 2; } else { parts_minmax_new[0 + 0] = parts_minmax[j + 0 + 0]; parts_minmax_new[0 + 1] = parts_minmax[j + 0 + 1]; parts_minmax_new[0 + 2] = parts_minmax[j + 0 + 2]; parts_minmax_new[0 + 3] = parts_minmax[j + 0 + 3]; parts_minmax_new[4 + 0] = parts_minmax[j + 4 + 0]; parts_minmax_new[4 + 1] = parts_minmax[j + 4 + 1]; parts_minmax_new[4 + 2] = parts_minmax[j + 4 + 2]; parts_minmax_new[4 + 3] = parts_minmax[j + 4 + 3]; } SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": 2. parts_minmax_new: %f %f %f %f / %f %f %f %f", i, parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]); current_minmax[0 + 0] = xmax(parts_minmax_new[0 + 0], parts_minmax_new[0 + 3]) - parts_range[l + 0 + 0]; current_minmax[0 + 1] = xmin(parts_minmax_new[0 + 2], parts_minmax_new[0 + 1]) - parts_range[l + 0 + 0]; current_minmax[2 + 0] = xmax(parts_minmax_new[4 + 0], parts_minmax_new[4 + 3]) - parts_range[l + 2 + 0]; current_minmax[2 + 1] = xmin(parts_minmax_new[4 + 2], parts_minmax_new[4 + 1]) - parts_range[l + 2 + 0]; SL_ASSERT(current_minmax[0 + 0] <= current_minmax[0 + 1]); SL_ASSERT(current_minmax[2 + 0] <= current_minmax[2 + 1]); rti_tstop(rti_tid_mpi_partition_radix2_while_check_pre); SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": current_minmax: %f %f / %f %f", parts[i], current_minmax[0 + 0], current_minmax[0 + 1], current_minmax[2 + 0], current_minmax[2 + 1]); lcs = gcs = 0; lws = gws = 0; /* HIT is the default */ refine = 0; if (!finalize) { rti_tstart(rti_tid_mpi_partition_radix2_while_check_classes); for (k = 0; k < nclasses; ++k) { lc = local_counts[part_areas[i] * nclasses + k]; gc = global_counts[part_areas[i] * nclasses + k]; lw = local_weights[part_areas[i] * nclasses + k]; gw = global_weights[part_areas[i] * nclasses + k]; current_minmax[0 + 0] -= gc; 
current_minmax[0 + 1] -= gc; current_minmax[2 + 0] -= gw; current_minmax[2 + 1] -= gw; SL_TRACE_IF(DEBUG_OR_NOT, "k = %" sl_key_pure_type_fmt ", current_minmax: %f %f / %f %f", k, current_minmax[0], current_minmax[1], current_minmax[2], current_minmax[3]); /* stop and refine if max count is skipped OR min count AND max weight is skipped */ if ((current_minmax[0 + 1] < 0) || (current_minmax[0 + 0] < 0 && current_minmax[2 + 1] < 0)) { refine = 1; break; } lcs += lc; gcs += gc; lws += lw; gws += gw; gc = gw = 0.0; /* if between min/max counts */ if (current_minmax[0 + 0] <= 0 && current_minmax[0 + 1] >= 0) { /* go to next if max count not reached AND min weight not reached */ if (current_minmax[0 + 1] > 0 && current_minmax[2 + 0] > 0) continue; /* look ahead for a better stop */ if (k + 1 < nclasses && current_minmax[0 + 1] - global_counts[part_areas[i] * nclasses + k + 1] >= 0) { /* continue if weights will improve */ if (myabs(current_minmax[2 + 0] + current_minmax[2 + 1]) > myabs(current_minmax[2 + 0] + current_minmax[2 + 1] - 2 * global_weights[part_areas[i] * nclasses + k + 1])) continue; } /* stop */ break; } } SL_ASSERT(k < nclasses); SL_TRACE_IF(DEBUG_OR_NOT, "%s k = %" sl_key_pure_type_fmt, (refine)?"REFINE":"HIT", k); rti_tstop(rti_tid_mpi_partition_radix2_while_check_classes); } else { rti_tstart(rti_tid_mpi_partition_radix2_while_check_final); /* middle of min/max weight */ m = (current_minmax[2 + 0] + current_minmax[2 + 1]) / 2; /* min. part of weight to contribute */ d = xmax(0, m - locals[i * 2 + 1]); /* contribute all? 
*/ if (d >= final_locals[i * 2 + 1]) { lc = final_locals[i * 2 + 0]; lw = final_locals[i * 2 + 1]; } else { /* contribute only a part */ lc = 0; lw = 0; /* not required */ do { d -= elem_weight_one(s, sdispls[1 + parts[i]] + lc); ++lc; } while (d >= 0 && lc < final_locals[i * 2 + 0]); --lc; /* if unweighted, then m = middle of min/max count, d = ..., lc = d */ } /* check mc against min/max count borders */ lc = xminmax(current_minmax[0 + 0] - locals[i * 2 + 0], lc, current_minmax[0 + 1] - locals[i * 2 + 0]); /* check agains 0 (don't step back!) and the local contribution */ lc = xminmax(0, lc, final_locals[i * 2 + 0]); /* the exact global counts/weights are unknown (set gc/gw so that parts_range is not changed) */ gc = 0; gw = 0; lcs += lc; gcs += gc; lws += lw; gws += gw; gc = (parts_range[2 * 2 * parts[i] + 0 + 1] - parts_range[2 * 2 * parts[i] + 0 + 0]); gw = (parts_range[2 * 2 * parts[i] + 2 + 1] - parts_range[2 * 2 * parts[i] + 2 + 0]); rti_tstop(rti_tid_mpi_partition_radix2_while_check_final); } rti_tstart(rti_tid_mpi_partition_radix2_while_check_post); SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": sdispls[%" sl_int_type_fmt " + 1] = %d, lcs = %" sl_int_type_fmt, i, parts[i], parts[i], sdispls[parts[i] + 1], lcs); sdispls[parts[i] + 1] += lcs; if (gcs > 0 || gws > 0) { parts_range[l + 0 + 0] += gcs; parts_range[l + 0 + 1] = parts_range[l + 0 + 0] + gc; parts_range[l + 2 + 0] += gws; parts_range[l + 2 + 1] = parts_range[l + 2 + 0] + gw; SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": 3. 
part_minmax_new: %f %f %f %f / %f %f %f %f", i, parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]); SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": parts_range: %f %f / %f %f", i, parts[i], parts_range[2 * 2 * parts[i] + 0 + 0], parts_range[2 * 2 * parts[i] + 0 + 1], parts_range[2 * 2 * parts[i] + 2 + 0], parts_range[2 * 2 * parts[i] + 2 + 1]); parts_minmax_new[0 + 0] = xminmax(parts_range[l + 0 + 0], parts_minmax_new[0 + 0], parts_range[l + 0 + 1]); parts_minmax_new[0 + 2] = xminmax(parts_range[l + 0 + 0], parts_minmax_new[0 + 2], parts_range[l + 0 + 1]); parts_minmax_new[0 + 1] = xminmax(parts_range[l + 0 + 0], parts_minmax_new[0 + 1], parts_range[l + 0 + 1]); parts_minmax_new[0 + 3] = xminmax(parts_range[l + 0 + 0], parts_minmax_new[0 + 3], parts_range[l + 0 + 1]); parts_minmax_new[4 + 0] = xminmax(parts_range[l + 2 + 0], parts_minmax_new[4 + 0], parts_range[l + 2 + 1]); parts_minmax_new[4 + 2] = xminmax(parts_range[l + 2 + 0], parts_minmax_new[4 + 2], parts_range[l + 2 + 1]); parts_minmax_new[4 + 1] = xminmax(parts_range[l + 2 + 0], parts_minmax_new[4 + 1], parts_range[l + 2 + 1]); parts_minmax_new[4 + 3] = xminmax(parts_range[l + 2 + 0], parts_minmax_new[4 + 3], parts_range[l + 2 + 1]); } SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": 4. 
part_minmax_new: %f %f %f %f / %f %f %f %f", i, parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]); if (parts_minmax_new[0 + 0] != parts_minmax[j + 0 + 0] || parts_minmax_new[0 + 2] != parts_minmax[j + 0 + 2] || parts_minmax_new[4 + 0] != parts_minmax[j + 4 + 0] || parts_minmax_new[4 + 2] != parts_minmax[j + 4 + 2]) { parts_minmax[j + 0 + 0] = parts_minmax_new[0 + 0]; parts_minmax[j + 0 + 2] = parts_minmax_new[0 + 2]; parts_minmax[j + 4 + 0] = parts_minmax_new[4 + 0]; parts_minmax[j + 4 + 2] = parts_minmax_new[4 + 2]; parts_update[parts[i] + 1] = 1; } if (parts_minmax_new[0 + 1] != parts_minmax[j + 0 + 1] || parts_minmax_new[0 + 3] != parts_minmax[j + 0 + 3] || parts_minmax_new[4 + 1] != parts_minmax[j + 4 + 1] || parts_minmax_new[4 + 3] != parts_minmax[j + 4 + 3]) { parts_minmax[j + 0 + 1] = parts_minmax_new[0 + 1]; parts_minmax[j + 0 + 3] = parts_minmax_new[0 + 3]; parts_minmax[j + 4 + 1] = parts_minmax_new[4 + 1]; parts_minmax[j + 4 + 3] = parts_minmax_new[4 + 3]; parts_update[parts[i] - 1] = 1; } parts_update[parts[i]] = 0; /* refine or remove */ if (refine) { /* bits left for partitioning? */ if (rhigh >= rlow) { if (last_new_area == part_areas[i] && last_new_class == k) part_areas[i] = nareas_new - 1; else { /* update last_new_... 
*/ last_new_area = part_areas[i]; last_new_class = k; /* create new area */ elem_assign_at(&areas[part_areas[i]], lcs, &areas_new[nareas_new]); areas_new[nareas_new].size = local_counts[part_areas[i] * nclasses + k]; part_areas[i] = nareas_new; ++nareas_new; } } else { /* save local count/weight for the later prefix calculations */ final_locals[2 * (i - nparts_removed * direction) + 0] = lc; final_locals[2 * (i - nparts_removed * direction) + 1] = lw; } parts[i - nparts_removed * direction] = parts[i]; part_areas[i - nparts_removed * direction] = part_areas[i]; } else ++nparts_removed; rti_tstop(rti_tid_mpi_partition_radix2_while_check_post); i += direction; } if (direction > 0) parts_high -= nparts_removed; else parts_low += nparts_removed; direction *= -1; /* SL_NOTICE_IF(DEBUG_OR_NOT, "nparts = %" sl_int_type_fmt " vs. nareas_new = %" sl_int_type_fmt, nparts, nareas_new);*/ rti_tstop(rti_tid_mpi_partition_radix2_while_check); /* switch areas */ nareas = nareas_new; if (areas == areas0) { areas = areas1; areas_new = areas0; } else { areas = areas0; areas_new = areas1; } } rti_tstop(rti_tid_mpi_partition_radix2_while); /* create scounts */ for (i = 0; i < size - 1; ++i) scounts[i] = sdispls[i + 1] - sdispls[i]; scounts[size - 1] = s->size - sdispls[size - 1]; #ifdef SCOUNTS_SDISPLS printf("%d: scounts", rank); for (i = 0, j = 0; i < size; ++i) { printf(" %d", scounts[i]); j += scounts[i]; } printf(" = %" sl_int_type_fmt "\n", j); printf("%d: sdispls", rank); for (i = 0; i < size; ++i) printf(" %d", sdispls[i]); printf("\n"); #endif #ifdef RCOUNTS_RDISPLS rcounts = sl_alloc(size, sizeof(int)); rdispls = sl_alloc(size, sizeof(int)); MPI_Alltoall(scounts, 1, MPI_INT, rcounts, 1, MPI_INT, comm); rdispls[0] = 0; for (i = 1; i < size; ++i) rdispls[i] = rdispls[i - 1] + rcounts[i - 1]; printf("%d: rcounts", rank); for (i = 0; i < size; ++i) printf(" %d", rcounts[i]); printf("\n"); printf("%d: rdispls", rank); for (i = 0; i < size; ++i) printf(" %d", rdispls[i]); 
printf("\n"); sl_free(rcounts); sl_free(rdispls); #endif sl_free(locals); sl_free(globals); #ifdef WEIGHT_STATS partial_counts[size] = 0; partial_weights[size] = 0.0; for (i = 0; i < size; ++i) { partial_counts[i] = scounts[i]; partial_weights[i] = 0.0; for (j = sdispls[i]; j < sdispls[i] + scounts[i]; ++j) partial_weights[i] += elem_weight_one(s, j); partial_counts[size] += partial_counts[i]; partial_weights[size] += partial_weights[i]; } #ifdef HAVENT_MPI_IN_PLACE MPI_Reduce(partial_counts, partial_counts2, size + 1, int_mpi_datatype, MPI_SUM, 0, comm); MPI_Reduce(partial_weights, partial_weights2, size + 1, MPI_DOUBLE, MPI_SUM, 0, comm); # define partial_counts partial_counts2 # define partial_weights partial_weights2 #else /* recvbuf requires workaround for an in-place/aliased-buffer-check-bug in mpich2 (fixed with rev 5518) */ MPI_Reduce((rank == 0)?MPI_IN_PLACE:partial_counts, (rank == 0)?partial_counts:NULL, size + 1, int_mpi_datatype, MPI_SUM, 0, comm); MPI_Reduce((rank == 0)?MPI_IN_PLACE:partial_weights, (rank == 0)?partial_weights:NULL, size + 1, MPI_DOUBLE, MPI_SUM, 0, comm); #endif if (rank == 0) { printf("%d: total_count: %" sl_int_type_fmt " vs. 
%" sl_int_type_fmt "\n", rank, total_count, partial_counts[size]); d = 0.0; vmin = 1.0; vmax = 0.0; for (i = 0; i < size; ++i) { /* printf("%d: %" sl_int_type_fmt " %" sl_int_type_fmt " / %f - %" sl_int_type_fmt " / %f\n", rank, i, partial_counts[i], (double) partial_counts[i] / partial_counts[size], (partial_counts[size] / size) - partial_counts[i], fabs(1.0 - ((double) partial_counts[i] * size / partial_counts[size])));*/ d += fabs((partial_counts[size] / size) - partial_counts[i]); if (fabs(1.0 - ((double) partial_counts[i] * size / partial_counts[size])) < vmin) vmin = fabs(1.0 - ((double) partial_counts[i] * size / partial_counts[size])); if (fabs(1.0 - ((double) partial_counts[i] * size / partial_counts[size])) > vmax) vmax = fabs(1.0 - ((double) partial_counts[i] * size / partial_counts[size])); } printf("%d: min/max: %f / %f\n", rank, vmin, vmax); printf("%d: average_count: %" sl_int_type_fmt " - %f / %f\n", rank, partial_counts[size] / size, d / size, d / partial_counts[size]); printf("%d: total_weight: %f vs. 
%f\n", rank, total_weight, partial_weights[size]); d = 0.0; vmin = 1.0; vmax = 0.0; for (i = 0; i < size; ++i) { /* printf("%d: %" sl_int_type_fmt " %f / %f - %f / %f\n", rank, i, partial_weights[i], partial_weights[i] / partial_weights[size], (partial_weights[size] / size) - partial_weights[i], fabs(1.0 - (partial_weights[i] * size / partial_weights[size])));*/ d += fabs((partial_weights[size] / size) - partial_weights[i]); if (fabs(1.0 - (partial_weights[i] * size / partial_weights[size])) < vmin) vmin = fabs(1.0 - (partial_weights[i] * size / partial_weights[size])); if (fabs(1.0 - (partial_weights[i] * size / partial_weights[size])) > vmax) vmax = fabs(1.0 - (partial_weights[i] * size / partial_weights[size])); } printf("%d: min/max: %f / %f\n", rank, vmin, vmax); printf("%d: average_weight: %f - %f / %f\n", rank, partial_weights[size] / size, d / size, d / partial_weights[size]); } #endif rti_tstop(rti_tid_mpi_partition_radix2); #if defined(TIMING_STATS) && defined(SL_USE_RTI_TIM) if (rank == 0) { printf("%d: mpi_partition_radix: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2)); printf("%d: mpi_partition_radix: sync: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_sync)); printf("%d: mpi_partition_radix: while: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while)); printf("%d: mpi_partition_radix: count: %f\n", rank, rti_tcumu(rti_tid_mpi_partition_radix2_while_count)); printf("%d: mpi_partition_radix: allreduce: %f\n", rank, rti_tcumu(rti_tid_mpi_partition_radix2_while_allreduce)); printf("%d: mpi_partition_radix: round1: %f\n", rank, rti_tcumu(rti_tid_mpi_partition_radix2_while_round1)); printf("%d: mpi_partition_radix: allgather: %f\n", rank, rti_tcumu(rti_tid_mpi_partition_radix2_while_round1_allgather)); printf("%d: mpi_partition_radix: exscan: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while_exscan)); printf("%d: mpi_partition_radix: check: %f\n", rank, rti_tcumu(rti_tid_mpi_partition_radix2_while_check)); printf("%d: 
mpi_partition_radix: pre: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while_check_pre)); printf("%d: mpi_partition_radix: classes: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while_check_classes)); printf("%d: mpi_partition_radix: final: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while_check_final)); printf("%d: mpi_partition_radix: post: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while_check_post)); } #endif return 0; }
/* Computes global summary statistics for a distributed file list and stores
 * them in the flist struct: total file count across all ranks, this rank's
 * global item offset, global min/max directory depth, and the longest file
 * name. Collective over MPI_COMM_WORLD -- every rank must call it, and the
 * collectives below must stay in this exact order on all ranks. */
static void list_compute_summary(flist_t* flist)
{
    /* initialize summary values (also the result if the list is empty) */
    flist->max_file_name  = 0;
    flist->max_user_name  = 0;
    flist->max_group_name = 0;
    flist->min_depth      = 0;
    flist->max_depth      = 0;
    flist->total_files    = 0;
    flist->offset         = 0;

    /* get our rank and the size of comm_world */
    int rank, ranks;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &ranks);

    /* get total number of files in list */
    uint64_t total;
    uint64_t count = flist->list_count;
    MPI_Allreduce(&count, &total, 1, MPI_UINT64_T, MPI_SUM, MPI_COMM_WORLD);
    flist->total_files = total;

    /* bail out early if no one has anything
     * (total is unsigned, so <= 0 is effectively == 0) */
    if (total <= 0) {
        return;
    }

    /* compute the global offset of our first item: exclusive prefix sum of
     * per-rank counts; MPI_Exscan leaves recvbuf undefined on rank 0, so
     * rank 0 explicitly sets its offset to 0 afterwards */
    uint64_t offset;
    MPI_Exscan(&count, &offset, 1, MPI_UINT64_T, MPI_SUM, MPI_COMM_WORLD);
    if (rank == 0) {
        offset = 0;
    }
    flist->offset = offset;

    /* compute local min/max values; -1 is the "no items seen yet" sentinel
     * for the depths, so a rank with an empty list keeps -1 */
    int min_depth = -1;
    int max_depth = -1;
    uint64_t max_name = 0;
    elem_t* current = flist->list_head;
    while (current != NULL) {
        /* +1 accounts for the terminating NUL byte */
        uint64_t len = (uint64_t)(strlen(current->file) + 1);
        if (len > max_name) {
            max_name = len;
        }
        int depth = current->depth;
        if (depth < min_depth || min_depth == -1) {
            min_depth = depth;
        }
        if (depth > max_depth || max_depth == -1) {
            max_depth = depth;
        }
        /* go to next item */
        current = current->next;
    }

    /* get global maximums (a rank's -1 / 0 sentinels can never win MAX,
     * and total > 0 guarantees at least one rank contributes real values) */
    int global_max_depth;
    MPI_Allreduce(&max_depth, &global_max_depth, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
    uint64_t global_max_name;
    MPI_Allreduce(&max_name, &global_max_name, 1, MPI_UINT64_T, MPI_MAX, MPI_COMM_WORLD);

    /* since at least one rank has an item and max will be -1 on ranks
     * without an item, set our min to global max if we have no items,
     * this will ensure that our contribution is >= true global min */
    int global_min_depth;
    if (count == 0) {
        min_depth = global_max_depth;
    }
    MPI_Allreduce(&min_depth, &global_min_depth, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);

    /* set summary values */
    flist->max_file_name = global_max_name;
    flist->min_depth = global_min_depth;
    flist->max_depth = global_max_depth;

    /* set summary on users and groups
     * (users/groups tables are presumably filled elsewhere when
     * flist->detail is set -- not visible from this function) */
    if (flist->detail) {
        flist->total_users    = flist->users.count;
        flist->total_groups   = flist->groups.count;
        flist->max_user_name  = flist->users.chars;
        flist->max_group_name = flist->groups.chars;
    }
    return;
}
/**
 * @brief Copies a [VC,STAR]-distributed Elemental complex vector Y into a
 *        local std::vector of interleaved (real, imag) doubles.
 *
 * Redistributes the cyclically-distributed entries of Y via an all-to-all so
 * that each rank receives the contiguous block of global entries matching the
 * size of its local `vec`, then unpacks real/imag parts into `vec`.
 * Collective over Y's grid communicator.
 *
 * @param Y    Elemental distributed matrix; must be [VC,STAR] distributed.
 * @param vec  Output buffer; its size (which must be 2 * local element
 *             count, already set by the caller) determines how many global
 *             entries this rank receives. Filled as [re0, im0, re1, im1, ...].
 * @return 0 on success.
 *
 * Fix vs. previous version: MPI_Exscan leaves the receive buffer *undefined*
 * on rank 0 (MPI standard); the old code relied on the pre-initialized
 * nstart = 0 surviving the call. We now explicitly zero nstart on rank 0 of
 * `comm` (note: El::mpi::WorldRank() is the *world* rank, which need not be
 * rank 0 of the grid communicator, so we query comm directly).
 */
int elemental2vec(const El::DistMatrix<El::Complex<double>,El::VC,El::STAR> &Y, std::vector<double> &vec){
	assert((Y.DistData().colDist == El::STAR) and (Y.DistData().rowDist == El::VC));

	int data_dof=2;           // two doubles (re, im) per complex entry
	//double *pt_array,*pt_perm_array;
	int r,q,rq;               // el vec info (grid height, width, size)
	int nbigs;                //Number of large recv (i.e. recv 1 extra data point)
	int pstart;               // p_id of nstart
	int rank = El::mpi::WorldRank(); //p_id
	int recv_size;            // base recv size
	bool print = (rank == -1); // debug printing disabled (never true)

	// Get el vec info
	const El::Grid* g = &(Y.Grid());
	r = g->Height();
	q = g->Width();
	MPI_Comm comm = (g->Comm()).comm;

	// Local element count this rank will end up owning
	int nlocal = (vec.size())/data_dof;
	if(print) std::cout << "m: " << std::endl;

	// Global index of our first element = exclusive prefix sum of nlocal.
	// MPI_Exscan leaves nstart undefined on rank 0, so reset it explicitly.
	int nstart = 0;
	MPI_Exscan(&nlocal,&nstart,1,MPI_INT,MPI_SUM,comm);
	int comm_rank;
	MPI_Comm_rank(comm, &comm_rank);
	if(comm_rank == 0) nstart = 0;

	// Determine who owns the first element we want
	rq = r * q;
	pstart = nstart % rq;     // grid rank owning global element nstart
	nbigs = nlocal % rq;      // first nbigs sources send one extra element
	recv_size = nlocal / rq;

	if(print){
		std::cout << "r: " << r << " q: " << q <<std::endl;
		std::cout << "nstart: " << nstart << std::endl;
		std::cout << "ps: " << pstart << std::endl;
		std::cout << "nbigs: " << nbigs << std::endl;
		std::cout << "recv_size: " << recv_size << std::endl;
	}

	// Make recv sizes: base size everywhere, +1 for the nbigs ranks
	// starting at pstart (cyclic [VC] ownership).
	std::vector<int> recv_lengths(rq);
	std::fill(recv_lengths.begin(),recv_lengths.end(),recv_size);
	if(nbigs >0){
		for(int i=0;i<nbigs;i++){
			recv_lengths[(pstart + i) % rq] += 1;
		}
	}

	// Make recv disps
	std::vector<int> recv_disps = exscan(recv_lengths);

	// All2all to get send sizes
	std::vector<int> send_lengths(rq);
	MPI_Alltoall(&recv_lengths[0], 1, MPI_INT, &send_lengths[0], 1, MPI_INT,comm);

	// Scan to get send_disps
	std::vector<int> send_disps = exscan(send_lengths);

	// Do all2allv to get data on correct processor
	std::vector<El::Complex<double>> recv_data(nlocal);
	std::vector<El::Complex<double>> recv_data_ordered(nlocal);
	//MPI_Alltoallv(el_vec.Buffer(),&send_lengths[0],&send_disps[0],MPI_DOUBLE, \
		&recv_data[0],&recv_lengths[0],&recv_disps[0],MPI_DOUBLE,comm);
	El::mpi::AllToAll(Y.LockedBuffer(), &send_lengths[0], &send_disps[0], &recv_data[0],&recv_lengths[0],&recv_disps[0],comm);

	if(print){
		//std::cout << "Send data: " <<std::endl << *el_vec.Buffer() <<std::endl;
		std::cout << "Send lengths: " <<std::endl << send_lengths <<std::endl;
		std::cout << "Send disps: " <<std::endl << send_disps <<std::endl;
		std::cout << "Recv data: " <<std::endl << recv_data <<std::endl;
		std::cout << "Recv lengths: " <<std::endl << recv_lengths <<std::endl;
		std::cout << "Recv disps: " <<std::endl << recv_disps <<std::endl;
	}

	// Reorder the data so taht it is in the right order for the fmm tree:
	// source p contributed every rq-th global element starting at
	// (p - pstart) mod rq, so de-interleave back into global order.
	for(int p=0;p<rq;p++){
		int base_idx = (p - pstart + rq) % rq;
		int offset = recv_disps[p];
		for(int i=0;i<recv_lengths[p];i++){
			recv_data_ordered[base_idx + rq*i] = recv_data[offset + i];
		}
	}

	// loop through and put the data into the vector, interleaving re/im
	#pragma omp parallel for
	for(int i=0;i<nlocal; i++){
		vec[2*i] = El::RealPart(recv_data_ordered[i]);
		vec[2*i+1] = El::ImagPart(recv_data_ordered[i]);
	}

	if(print){std::cout <<"here?"<<std::endl;}

	return 0;
}
/*
  writes output in pnetcdf format

  nblocks: local number of blocks
  vblocks: pointer to array of vblocks
  out_file: output file name
  comm: MPI communicator

  Collective over comm. No-op unless compiled with USEPNETCDF.

  Fixes vs. previous version:
  - free() the per-block neighbors buffer (was leaked every iteration)
  - pass block_ofsts (decayed MPI_Offset*) to MPI_Exscan instead of
    &block_ofsts (an MPI_Offset(*)[N] -- same address, wrong type)
  - explicitly zero block_ofsts on rank 0: MPI_Exscan leaves the receive
    buffer undefined on rank 0 per the MPI standard
  - dropped the quants[] array, which was initialized but never used
*/
void pnetcdf_write(int nblocks, struct vblock_t *vblocks,
		   char *out_file, MPI_Comm comm) {

#ifdef USEPNETCDF
  int err;
  int ncid, cmode, varids[23], dimids[8], dimids_2D[2];
  MPI_Offset start[2], count[2];

  MPI_Offset proc_quants[NUM_QUANTS];  /* quantities per process */
  MPI_Offset tot_quants[NUM_QUANTS];   /* total quantities all global blocks */
  MPI_Offset block_ofsts[NUM_QUANTS];  /* starting offsets for each block */

  /* init */
  int i;
  for (i = 0; i < NUM_QUANTS; i++) {
    proc_quants[i] = 0;
    tot_quants[i] = 0;
    block_ofsts[i] = 0;
  }

  /* sum quantities over local blocks */
  int b;
  for (b = 0; b < nblocks; b++) {
    proc_quants[NUM_VERTS] += vblocks[b].num_verts;
    proc_quants[NUM_COMP_CELLS] += vblocks[b].num_complete_cells;
    proc_quants[NUM_CELL_FACES] += vblocks[b].tot_num_cell_faces;
    proc_quants[NUM_FACE_VERTS] += vblocks[b].tot_num_face_verts;
    proc_quants[NUM_ORIG_PARTS] += vblocks[b].num_orig_particles;
    proc_quants[NUM_NEIGHBORS] += DIY_Num_neighbors(0, b);
  }
  proc_quants[NUM_BLOCKS] = nblocks;

  /* sum per process values to be global ones */
  MPI_Allreduce(proc_quants, tot_quants, NUM_QUANTS, MPI_OFFSET, MPI_SUM,
		comm);

  /* prefix sum proc offsets; recvbuf is undefined on rank 0 after
     MPI_Exscan, so rank 0 re-zeros its offsets explicitly */
  MPI_Exscan(proc_quants, block_ofsts, NUM_QUANTS, MPI_OFFSET, MPI_SUM, comm);
  int rank;
  MPI_Comm_rank(comm, &rank);
  if (rank == 0) {
    for (i = 0; i < NUM_QUANTS; i++)
      block_ofsts[i] = 0;
  }

  /* create a new file for writing */
  cmode = NC_CLOBBER | NC_64BIT_DATA;
  err = ncmpi_create(comm, out_file, cmode, MPI_INFO_NULL, &ncid); ERR;

  /* define dimensions */
  err = ncmpi_def_dim(ncid, "num_g_blocks", tot_quants[NUM_BLOCKS],
		      &dimids[0]); ERR;
  err = ncmpi_def_dim(ncid, "XYZ", 3, &dimids[1]); ERR;
  err = ncmpi_def_dim(ncid, "num_g_verts", tot_quants[NUM_VERTS],
		      &dimids[2]); ERR;
  err = ncmpi_def_dim(ncid, "num_g_complete_cells", tot_quants[NUM_COMP_CELLS],
		      &dimids[3]); ERR;
  err = ncmpi_def_dim(ncid, "tot_num_g_cell_faces", tot_quants[NUM_CELL_FACES],
		      &dimids[4]); ERR;
  err = ncmpi_def_dim(ncid, "tot_num_g_face_verts", tot_quants[NUM_FACE_VERTS],
		      &dimids[5]); ERR;
  err = ncmpi_def_dim(ncid, "num_g_orig_particles", tot_quants[NUM_ORIG_PARTS],
		      &dimids[6]); ERR;
  err = ncmpi_def_dim(ncid, "num_g_neighbors", tot_quants[NUM_NEIGHBORS],
		      &dimids[7]); ERR;

  /* define variables (varids[10] intentionally unused) */
  err = ncmpi_def_var(ncid, "num_verts", NC_INT, 1, &dimids[0],
		      &varids[0]); ERR;
  err = ncmpi_def_var(ncid, "num_complete_cells", NC_INT, 1, &dimids[0],
		      &varids[1]); ERR;
  err = ncmpi_def_var(ncid, "tot_num_cell_faces", NC_INT, 1, &dimids[0],
		      &varids[2]); ERR;
  err = ncmpi_def_var(ncid, "tot_num_face_verts", NC_INT, 1, &dimids[0],
		      &varids[3]); ERR;
  err = ncmpi_def_var(ncid, "num_orig_particles", NC_INT, 1, &dimids[0],
		      &varids[4]); ERR;

  /* block offsets */
  err = ncmpi_def_var(ncid, "block_off_num_verts", NC_INT64, 1, &dimids[0],
		      &varids[5]); ERR;
  err = ncmpi_def_var(ncid, "block_off_num_complete_cells", NC_INT64, 1,
		      &dimids[0], &varids[6]); ERR;
  err = ncmpi_def_var(ncid, "block_off_tot_num_cell_faces", NC_INT64, 1,
		      &dimids[0], &varids[7]); ERR;
  err = ncmpi_def_var(ncid, "block_off_tot_num_face_verts", NC_INT64, 1,
		      &dimids[0], &varids[8]); ERR;
  err = ncmpi_def_var(ncid, "block_off_num_orig_particles", NC_INT64, 1,
		      &dimids[0], &varids[9]); ERR;

  dimids_2D[0] = dimids[0];
  dimids_2D[1] = dimids[1];
  err = ncmpi_def_var(ncid, "mins", NC_FLOAT, 2, dimids_2D, &varids[11]); ERR;
  err = ncmpi_def_var(ncid, "maxs", NC_FLOAT, 2, dimids_2D, &varids[12]); ERR;

  dimids_2D[0] = dimids[2];
  dimids_2D[1] = dimids[1];
  err = ncmpi_def_var(ncid, "save_verts", NC_FLOAT, 2, dimids_2D,
		      &varids[13]); ERR;

  dimids_2D[0] = dimids[6];
  dimids_2D[1] = dimids[1];
  err = ncmpi_def_var(ncid, "sites", NC_FLOAT, 2, dimids_2D, &varids[14]); ERR;

  err = ncmpi_def_var(ncid, "complete_cells", NC_INT, 1, &dimids[3],
		      &varids[15]); ERR;
  err = ncmpi_def_var(ncid, "areas", NC_FLOAT, 1, &dimids[3],
		      &varids[16]); ERR;
  err = ncmpi_def_var(ncid, "vols", NC_FLOAT, 1, &dimids[3],
		      &varids[17]); ERR;
  err = ncmpi_def_var(ncid, "num_cell_faces", NC_INT, 1, &dimids[3],
		      &varids[18]); ERR;
  err = ncmpi_def_var(ncid, "num_face_verts", NC_INT, 1, &dimids[4],
		      &varids[19]); ERR;
  err = ncmpi_def_var(ncid, "face_verts", NC_INT, 1, &dimids[5],
		      &varids[20]); ERR;
  err = ncmpi_def_var(ncid, "neighbors", NC_INT, 1, &dimids[7],
		      &varids[21]); ERR;
  err = ncmpi_def_var(ncid, "g_block_ids", NC_INT, 1, &dimids[0],
		      &varids[22]); ERR;

  /* exit define mode */
  err = ncmpi_enddef(ncid); ERR;

  /* write all variables.
     to improve: we can try nonblocking I/O to aggregate small requests */
  for (b = 0; b < nblocks; b++) {

    struct vblock_t *v = &vblocks[b];

    /* quantities */
    start[0] = block_ofsts[NUM_BLOCKS];
    count[0] = 1;
    err = ncmpi_put_vara_int_all(ncid, varids[0], start, count,
				 &v->num_verts); ERR;
    err = ncmpi_put_vara_int_all(ncid, varids[1], start, count,
				 &v->num_complete_cells); ERR;
    err = ncmpi_put_vara_int_all(ncid, varids[2], start, count,
				 &v->tot_num_cell_faces); ERR;
    err = ncmpi_put_vara_int_all(ncid, varids[3], start, count,
				 &v->tot_num_face_verts); ERR;
    err = ncmpi_put_vara_int_all(ncid, varids[4], start, count,
				 &v->num_orig_particles); ERR;

    /* block offsets */
    err = ncmpi_put_vara_longlong_all(ncid, varids[5], start, count,
				      &block_ofsts[NUM_VERTS]); ERR;
    err = ncmpi_put_vara_longlong_all(ncid, varids[6], start, count,
				      &block_ofsts[NUM_COMP_CELLS]); ERR;
    err = ncmpi_put_vara_longlong_all(ncid, varids[7], start, count,
				      &block_ofsts[NUM_CELL_FACES]); ERR;
    err = ncmpi_put_vara_longlong_all(ncid, varids[8], start, count,
				      &block_ofsts[NUM_FACE_VERTS]); ERR;
    err = ncmpi_put_vara_longlong_all(ncid, varids[9], start, count,
				      &block_ofsts[NUM_ORIG_PARTS]); ERR;

    /* block bounds */
    start[0] = block_ofsts[NUM_BLOCKS];
    count[0] = 1;
    start[1] = 0;
    count[1] = 3;
    err = ncmpi_put_vara_float_all(ncid, varids[11], start, count,
				   v->mins); ERR;
    err = ncmpi_put_vara_float_all(ncid, varids[12], start, count,
				   v->maxs); ERR;

    /* save_verts */
    start[0] = block_ofsts[NUM_VERTS];
    start[1] = 0;
    count[0] = v->num_verts;
    count[1] = 3;
    err = ncmpi_put_vara_float_all(ncid, varids[13], start, count,
				   v->save_verts); ERR;

    /* sites */
    start[0] = block_ofsts[NUM_ORIG_PARTS];
    start[1] = 0;
    count[0] = v->num_orig_particles;
    count[1] = 3;
    err = ncmpi_put_vara_float_all(ncid, varids[14], start, count,
				   v->sites); ERR;

    /* complete cells */
    start[0] = block_ofsts[NUM_COMP_CELLS];
    count[0] = v->num_complete_cells;
    err = ncmpi_put_vara_int_all(ncid, varids[15], start, count,
				 v->complete_cells); ERR;

    /* areas */
    start[0] = block_ofsts[NUM_COMP_CELLS];
    count[0] = v->num_complete_cells;
    err = ncmpi_put_vara_float_all(ncid, varids[16], start, count,
				   v->areas); ERR;

    /* volumes */
    start[0] = block_ofsts[NUM_COMP_CELLS];
    count[0] = v->num_complete_cells;
    err = ncmpi_put_vara_float_all(ncid, varids[17], start, count,
				   v->vols); ERR;

    /* num_cell_faces */
    start[0] = block_ofsts[NUM_COMP_CELLS];
    count[0] = v->num_complete_cells;
    err = ncmpi_put_vara_int_all(ncid, varids[18], start, count,
				 v->num_cell_faces); ERR;

    /* num_face_verts */
    start[0] = block_ofsts[NUM_CELL_FACES];
    count[0] = v->tot_num_cell_faces;
    err = ncmpi_put_vara_int_all(ncid, varids[19], start, count,
				 v->num_face_verts); ERR;

    /* face verts */
    start[0] = block_ofsts[NUM_FACE_VERTS];
    count[0] = v->tot_num_face_verts;
    err = ncmpi_put_vara_int_all(ncid, varids[20], start, count,
				 v->face_verts); ERR;

    /* neighbors */
    int *neighbors = (int*)malloc(DIY_Num_neighbors(0, b) * sizeof(int));
    int num_neighbors = DIY_Get_neighbors(0, b, neighbors);
    start[0] = block_ofsts[NUM_NEIGHBORS];
    count[0] = num_neighbors;
    err = ncmpi_put_vara_int_all(ncid, varids[21], start, count,
				 neighbors); ERR;
    free(neighbors); /* was leaked in the original */

    /* gids */
    int gid = DIY_Gid(0, b);
    start[0] = block_ofsts[NUM_BLOCKS];
    count[0] = 1;
    err = ncmpi_put_vara_int_all(ncid, varids[22], start, count,
				 &gid); ERR;

    /* update block offsets */
    block_ofsts[NUM_VERTS] += v->num_verts;
    block_ofsts[NUM_COMP_CELLS] += v->num_complete_cells;
    block_ofsts[NUM_CELL_FACES] += v->tot_num_cell_faces;
    block_ofsts[NUM_FACE_VERTS] += v->tot_num_face_verts;
    block_ofsts[NUM_ORIG_PARTS] += v->num_orig_particles;
    block_ofsts[NUM_NEIGHBORS] += num_neighbors;
    block_ofsts[NUM_BLOCKS]++;

    /* debug */
    /* fprintf(stderr, "gid = %d num_verts = %d num_complete_cells = %d " */
    /* 	    "tot_num_cell_faces = %d tot_num_face_verts = %d " */
    /* 	    "num_orig_particles = %d\n", */
    /* 	    gid, v->num_verts, v->num_complete_cells, v->tot_num_cell_faces, */
    /* 	    v->tot_num_face_verts, v->num_orig_particles); */

  }

  err = ncmpi_close(ncid); ERR;
#endif

}
/* Sanity test for erroneous collective calls: every blocking collective
 * below is invoked INCORRECTLY by aliasing the same buffer (sbuf) as both
 * the send and the receive buffer without using MPI_IN_PLACE, which the
 * MPI standard forbids.  With MPI_ERRORS_RETURN installed, each call is
 * expected to return an error code; any call that instead returns
 * MPI_SUCCESS is counted in errs.  NUM_INTS and my_assert are defined
 * elsewhere in this file. */
int main(int argc, char **argv)
{
    int errs = 0;               /* number of collectives that wrongly succeeded */
    int i;
    int rank, size;
    int *sbuf = NULL;           /* send buffer, deliberately reused as recv buffer */
    int *rbuf = NULL;           /* allocated/initialized but unused by the checks below */
    int *scounts = NULL;        /* per-rank send counts for the "v" collectives */
    int *rcounts = NULL;        /* per-rank recv counts */
    int *sdispls = NULL;        /* per-rank send displacements */
    int *rdispls = NULL;        /* per-rank recv displacements */
    MPI_Datatype *types = NULL; /* per-rank datatypes for MPI_Alltoallw */
    MPI_Comm comm;

    /* intentionally not using MTest_Init/MTest_Finalize in order to make it
     * easy to take this test and use it as an NBC sanity test outside of the
     * MPICH test suite */
    MPI_Init(&argc, &argv);
    comm = MPI_COMM_WORLD;
    MPI_Comm_size(comm, &size);
    MPI_Comm_rank(comm, &rank);
    /* return error codes instead of aborting so the erroneous calls below
     * can be observed by this test */
    MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);

    /* enough space for every process to contribute at least NUM_INTS ints to any
     * collective operation */
    sbuf = malloc(NUM_INTS * size * sizeof(int));
    my_assert(sbuf);
    rbuf = malloc(NUM_INTS * size * sizeof(int));
    my_assert(rbuf);
    scounts = malloc(size * sizeof(int));
    my_assert(scounts);
    rcounts = malloc(size * sizeof(int));
    my_assert(rcounts);
    sdispls = malloc(size * sizeof(int));
    my_assert(sdispls);
    rdispls = malloc(size * sizeof(int));
    my_assert(rdispls);
    types = malloc(size * sizeof(MPI_Datatype));
    my_assert(types);

    /* fill buffers with arbitrary deterministic values;
     * NOTE(review): the sbuf/rbuf stores index up to (2*size - 1), which stays
     * within the NUM_INTS*size allocation only when NUM_INTS >= 2 -- confirm
     * against the NUM_INTS definition */
    for (i = 0; i < size; ++i) {
        sbuf[2 * i] = i;
        sbuf[2 * i + 1] = i;
        rbuf[2 * i] = i;
        rbuf[2 * i + 1] = i;
        scounts[i] = NUM_INTS;
        rcounts[i] = NUM_INTS;
        sdispls[i] = i * NUM_INTS;
        rdispls[i] = i * NUM_INTS;
        types[i] = MPI_INT;
    }

    /* --- rooted collectives: send/recv buffers are both significant only at
     * the root (rank 0), so only the root checks for the expected failure --- */
    if (rank == 0 && MPI_SUCCESS == MPI_Gather(sbuf, NUM_INTS, MPI_INT, sbuf, NUM_INTS, MPI_INT, 0, comm))
        errs++;

    if (rank == 0 && MPI_SUCCESS == MPI_Gatherv(sbuf, NUM_INTS, MPI_INT, sbuf, rcounts, rdispls, MPI_INT, 0, comm))
        errs++;

    if (rank == 0 && MPI_SUCCESS == MPI_Scatter(sbuf, NUM_INTS, MPI_INT, sbuf, NUM_INTS, MPI_INT, 0, comm))
        errs++;

    if (rank == 0 && MPI_SUCCESS == MPI_Scatterv(sbuf, scounts, sdispls, MPI_INT, sbuf, NUM_INTS, MPI_INT, 0, comm))
        errs++;

    /* --- non-rooted collectives: aliasing is erroneous on every rank --- */
    if (MPI_SUCCESS == MPI_Allgather(&sbuf[rank], 1, MPI_INT, sbuf, 1, MPI_INT, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Allgatherv(&sbuf[rank * rcounts[rank]], rcounts[rank], MPI_INT, sbuf, rcounts, rdispls, MPI_INT, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Alltoall(sbuf, NUM_INTS, MPI_INT, sbuf, NUM_INTS, MPI_INT, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Alltoallv(sbuf, scounts, sdispls, MPI_INT, sbuf, scounts, sdispls, MPI_INT, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Alltoallw(sbuf, scounts, sdispls, types, sbuf, scounts, sdispls, types, comm))
        errs++;

    /* MPI_Reduce: only the root has a significant recv buffer */
    if (rank == 0 && MPI_SUCCESS == MPI_Reduce(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, 0, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Allreduce(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Reduce_scatter(sbuf, sbuf, rcounts, MPI_INT, MPI_SUM, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Reduce_scatter_block(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Scan(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Exscan(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, comm))
        errs++;

    /* release all scratch buffers */
    if (sbuf)
        free(sbuf);
    if (rbuf)
        free(rbuf);
    if (scounts)
        free(scounts);
    if (rcounts)
        free(rcounts);
    if (sdispls)
        free(sdispls);
    if (rdispls)
        free(rdispls);
    if (types)
        free(types);

    /* rank 0 reports the aggregate result for this process
     * (errs is per-rank, not reduced across ranks) */
    if (rank == 0) {
        if (errs)
            fprintf(stderr, "Found %d errors\n", errs);
        else
            printf(" No errors\n");
    }
    MPI_Finalize();
    return 0;
}
static void writePLY( MPI_Comm comm, std::string fname, int nvertices, int nverticesPerObject, int ntriangles, int ntrianglesPerObject, int nObjects, const std::vector<int3>& mesh, const std::vector<float3>& vertices) { int rank; MPI_Check( MPI_Comm_rank(comm, &rank) ); int totalVerts = 0; MPI_Check( MPI_Reduce(&nvertices, &totalVerts, 1, MPI_INT, MPI_SUM, 0, comm) ); int totalTriangles = 0; MPI_Check( MPI_Reduce(&ntriangles, &totalTriangles, 1, MPI_INT, MPI_SUM, 0, comm) ); MPI_File f; MPI_Check( MPI_File_open(comm, fname.c_str(), MPI_MODE_CREATE|MPI_MODE_DELETE_ON_CLOSE|MPI_MODE_WRONLY, MPI_INFO_NULL, &f) ); MPI_Check( MPI_File_close(&f) ); MPI_Check( MPI_File_open(comm, fname.c_str(), MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &f) ); int headerSize = 0; MPI_Offset fileOffset = 0; if (rank == 0) { std::stringstream ss; ss << "ply\n"; ss << "format binary_little_endian 1.0\n"; ss << "element vertex " << totalVerts << "\n"; ss << "property float x\nproperty float y\nproperty float z\n"; //ss << "property float xnormal\nproperty float ynormal\nproperty float znormal\n"; ss << "element face " << totalTriangles << "\n"; ss << "property list int int vertex_index\n"; ss << "end_header\n"; std::string content = ss.str(); headerSize = content.length(); MPI_Check( MPI_File_write_at(f, fileOffset, content.c_str(), headerSize, MPI_CHAR, MPI_STATUS_IGNORE) ); } MPI_Check( MPI_Bcast(&headerSize, 1, MPI_INT, 0, comm) ); fileOffset += headerSize; fileOffset += writeToMPI(vertices, f, fileOffset, comm); int verticesOffset = 0; MPI_Check( MPI_Exscan(&nvertices, &verticesOffset, 1, MPI_INT, MPI_SUM, comm)); std::vector<int4> connectivity; for(int j = 0; j < nObjects; ++j) for(int i = 0; i < ntrianglesPerObject; ++i) { int3 vertIds = mesh[i] + nverticesPerObject * j + verticesOffset; connectivity.push_back({3, vertIds.x, vertIds.y, vertIds.z}); } fileOffset += writeToMPI(connectivity, f, fileOffset, comm); MPI_Check( MPI_File_close(&f)); }
slint_t mpi_select_exact_radix_fixed(elements_t *s, slint_t nelements, slint_t nparts, partcond_t *pconds, slint_t rhigh, slint_t rlow, slint_t rwidth, int *sdispls, int size, int rank, MPI_Comm comm) /* sl_proto, sl_func mpi_select_exact_radix_fixed */ { slkey_pure_t max_nclasses, nclasses, bit_mask; slkey_pure_t k, l; typedef struct { slint_t count_min, count_max; slint_t count_low, count_hig; #ifdef elem_weight double weight_min, weight_max; double weight_low, weight_hig; #endif } mmlh_t; mmlh_t mmlh[nparts]; const slint_t max_nborders = nparts - 1; slint_t border_lo, border_hi, nborders_removed; slint_t borders[max_nborders], border_areas[max_nborders]; #define MIN_LE 0 #define MIN_RI 1 #define MAX_LE 2 #define MAX_RI 3 struct { slint_t update; slint_t crange[2], cmmlr[4]; #ifdef elem_weight double wrange[2], wmmlr[4]; #endif } border_infos_[1 + max_nborders + 1], *border_infos = border_infos_ + 1, border_info_old; const slint_t max_nareas = max_nborders; slint_t nareas, nareas_new; elements_t areas0[max_nareas * nelements], areas1[max_nareas * nelements], *areas, *areas_new; slint_t *area_counts, *current_counts; double *local_counts, *global_counts; #ifdef elem_weight double *local_weights, *global_weights, *current_weights; #endif slint_t current_cmm[2]; #ifdef elem_weight double current_wmm[2]; #endif slint_t final_areas[max_nborders * nelements]; double final_locals[NCONDS * max_nborders], *final_globals; slint_t current_width; slint_t round, direction, refine, finalize; slint_t last_new_area, last_new_class; slint_t lc, lcs, gc, gcs, lcv[nelements], lcsv[nelements]; #ifdef elem_weight double lw, gw, lws, gws; double mw, dw; double mcw[4]; #else slint_t mc, dc; #endif slint_t i, j; elements_t xi, end; #ifdef VERIFY slint_t v; #endif SL_TRACE_IF(DEBUG_OR_NOT, "starting mpi_select_exact_radix"); /* sl_tid rti_tid_mpi_select_exact_radix rti_tid_mpi_select_exact_radix_sync */ rti_treset(rti_tid_mpi_select_exact_radix_while); /* sl_tid */ 
rti_treset(rti_tid_mpi_select_exact_radix_while_count); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_allreduce); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_round1); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_round1_allgather); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_exscan); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_check); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_check_pre); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_check_classes); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_check_final); /* sl_tid */ rti_treset(rti_tid_mpi_select_exact_radix_while_check_post); /* sl_tid */ rti_tstart(rti_tid_mpi_select_exact_radix_sync); #ifdef SYNC_ON_INIT MPI_Barrier(comm); #endif rti_tstop(rti_tid_mpi_select_exact_radix_sync); #ifdef VERIFY v = elements_validate_order(s, 1); SL_TRACE_IF(DEBUG_OR_NOT, "elements order: %s (%" slint_fmt ")", (v > 0)?"FAILED":"SUCCESS", v); #endif rti_tstart(rti_tid_mpi_select_exact_radix); if (rhigh < 0) rhigh = key_radix_high; if (rlow < 0) rlow = key_radix_low; if (rwidth < 0) rwidth = sort_radix_width_default; max_nclasses = powof2_typed(rwidth, slkey_pure_t); /* SL_TRACE_IF(DEBUG_OR_NOT, "alloc area_counts: %" slint_fmt " * %d", max_nareas * nelements * max_nclasses, sizeof(slint_t)); SL_TRACE_IF(DEBUG_OR_NOT, "alloc local_counts: %" slint_fmt " * %d", NCONDS * (max_nareas * max_nclasses + max_nareas), sizeof(slint_t)); SL_TRACE_IF(DEBUG_OR_NOT, "alloc global_counts: %" slint_fmt " * %d", NCONDS * (max_nareas * max_nclasses + max_nareas), sizeof(slint_t));*/ area_counts = sl_alloc(max_nareas * nelements * max_nclasses, sizeof(slint_t)); local_counts = sl_alloc(NCONDS * (max_nareas * max_nclasses + max_nareas), sizeof(double)); global_counts = sl_alloc(NCONDS * (max_nareas * max_nclasses + max_nareas), sizeof(double)); /* init areas (first area = all elements) */ areas = areas0; areas_new = areas1; 
nareas = 1; for (j = 0; j < nelements; ++j) elem_assign(&s[j], &areas[0 * nelements + j]); /* init parts */ border_lo = 0; border_hi = max_nborders - 1; for (i = border_lo; i <= border_hi; ++i) { borders[i] = i; border_areas[i] = 0; } /* init sdispls */ for (i = 0; i < nparts; ++i) for (j = 0; j < nelements; ++j) sdispls[i * nelements + j] = 0; rti_tstart(rti_tid_mpi_select_exact_radix_while); round = 0; while (border_lo <= border_hi) { ++round; /* setup bitmask */ current_width = xmin(rwidth, rhigh - rlow + 1); rhigh -= (current_width > 0)?current_width - 1:rhigh; nclasses = (current_width > 0)?powof2_typed(current_width, slkey_pure_t):1; bit_mask = nclasses - 1; SL_TRACE_IF(DEBUG_OR_NOT, "ROUND: %" slint_fmt ", rhigh: %" slint_fmt ", current_width: %" slint_fmt ", nclasses: %" sl_key_pure_type_fmt, round, rhigh, current_width, nclasses); finalize = (current_width <= 0); if (!finalize || round == 1) { #ifdef elem_weight /* init weight counters */ local_weights = local_counts + (nareas * nclasses) + nareas; global_weights = global_counts + (nareas * nclasses) + nareas; #endif /* zero all counter */ for (i = 0; i < nareas; ++i) for (k = 0; k < nclasses; ++k) local_counts[i * nclasses + k] = #ifdef elem_weight local_weights[i * nclasses + k] = #endif 0.0; rti_tstart(rti_tid_mpi_select_exact_radix_while_count); /* for every area */ for (i = 0; i < nareas; ++i) { local_counts[nareas * nclasses + i] = 0; #ifdef elem_weight local_weights[nareas * nclasses + i] = 0.0; #endif /* for every list of elements */ for (j = 0; j < nelements; ++j) { SL_TRACE_IF(DEBUG_OR_NOT, "area %" slint_fmt ",%" slint_fmt ": size = %" slint_fmt, i, j, areas[i * nelements + j].size); elem_assign_at(&areas[i * nelements + j], areas[i * nelements + j].size, &end); current_counts = area_counts + ((i * nelements + j) * nclasses); #ifdef elem_weight current_weights = local_weights + (i * nclasses); #endif for (k = 0; k < nclasses; ++k) current_counts[k] = 0; if (nclasses > 1) { /* counts and weights 
in every class */ for (elem_assign(&areas[i * nelements + j], &xi); xi.keys < end.keys; elem_inc(&xi)) { k = key_radix_key2class(key_purify(*xi.keys), rhigh, bit_mask); current_counts[k] += 1; /* SL_TRACE_IF(DEBUG_OR_NOT, "key %" sl_key_pure_type_fmt " goes to bin %" sl_key_pure_type_fmt, key_purify(*xi.keys), k);*/ #ifdef elem_weight current_weights[k] += elem_weight(&xi, 0); #endif } } else { /* total counts and weights */ current_counts[0] = areas[i * nelements + j].size; #ifdef elem_weight for (elem_assign(&areas[i * nelements + j], &xi); xi.keys < end.keys; elem_inc(&xi)) current_weights[0] += elem_weight(&xi, 0); #endif } for (k = 0; k < nclasses; ++k) local_counts[i * nclasses + k] += current_counts[k]; /* total counts and weights in this area */ local_counts[nareas * nclasses + i] += areas[i * nelements + j].size; #ifdef elem_weight for (k = 0; k < nclasses; ++k) local_weights[nareas * nclasses + i] += current_weights[k]; #endif } SL_TRACE_ARRAY_IF(DEBUG_OR_NOT, "%" slint_fmt ": counts =", " %f", k, nclasses, (&local_counts[i * nclasses]), i); } rti_tstop(rti_tid_mpi_select_exact_radix_while_count); --rhigh; SL_TRACE_IF(DEBUG_OR_NOT, "all-reducing %" slint_fmt " doubles", (slint_t) (NCONDS * (nareas * nclasses + nareas))); rti_tstart(rti_tid_mpi_select_exact_radix_while_allreduce); /* create global counts and weights */ #ifdef MPI_SELECT_EXACT_RADIX_REDUCEBCAST_THRESHOLD if (size >= MPI_SELECT_EXACT_RADIX_REDUCEBCAST_THRESHOLD) { MPI_Reduce(local_counts, global_counts, NCONDS * (nareas * nclasses + nareas), MPI_DOUBLE, MPI_SUM, REDUCEBCAST_ROOT, comm); MPI_Bcast(global_counts, NCONDS * (nareas * nclasses + nareas), MPI_DOUBLE, REDUCEBCAST_ROOT, comm); } else #endif MPI_Allreduce(local_counts, global_counts, NCONDS * (nareas * nclasses + nareas), MPI_DOUBLE, MPI_SUM, comm); rti_tstop(rti_tid_mpi_select_exact_radix_while_allreduce); } /* do initializations */ if (round == 1) { rti_tstart(rti_tid_mpi_select_exact_radix_while_round1); for (i = 0; i < nparts; 
++i) { /* truncate counts, set default values and determine local (count/weight) limits */ init_partconds(1, &pconds[i], nparts, global_counts[nareas * nclasses + 0], #ifdef elem_weight global_weights[nareas * nclasses + 0] #else 0 #endif ); mmlh[i].count_min = pconds[i].count_min; mmlh[i].count_max = pconds[i].count_max; mmlh[i].count_low = pconds[i].count_low; mmlh[i].count_hig = pconds[i].count_high; #ifdef elem_weight mmlh[i].weight_min = pconds[i].weight_min; mmlh[i].weight_max = pconds[i].weight_max; mmlh[i].weight_low = pconds[i].weight_low; mmlh[i].weight_hig = pconds[i].weight_high; #endif } /* init lowest and highest part (sentinels) */ border_infos[border_lo - 1].update = 0; border_infos[border_lo - 1].crange[0] = 0; border_infos[border_lo - 1].crange[1] = 0; border_infos[border_lo - 1].cmmlr[MIN_LE] = border_infos[border_lo - 1].cmmlr[MAX_LE] = 0; border_infos[border_lo - 1].cmmlr[MIN_RI] = border_infos[border_lo - 1].cmmlr[MAX_RI] = 0; SL_TRACE_IF(DEBUG_OR_NOT, "lowest: %" slint_fmt ": init count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", border_lo - 1, border_infos[border_lo - 1].cmmlr[MIN_LE], border_infos[border_lo - 1].cmmlr[MAX_LE], border_infos[border_lo - 1].cmmlr[MIN_RI], border_infos[border_lo - 1].cmmlr[MAX_RI]); #ifdef elem_weight border_infos[border_lo - 1].wrange[0] = 0.0; border_infos[border_lo - 1].wrange[1] = 0.0; border_infos[border_lo - 1].wmmlr[MIN_LE] = border_infos[border_lo - 1].wmmlr[MAX_LE] = 0.0; border_infos[border_lo - 1].wmmlr[MIN_RI] = border_infos[border_lo - 1].wmmlr[MAX_RI] = 0.0; SL_TRACE_IF(DEBUG_OR_NOT, "lowest: %" slint_fmt ": init weight[min/max-left/right]: %f / %f - %f / %f", border_lo - 1, border_infos[border_lo - 1].wmmlr[MIN_LE], border_infos[border_lo - 1].wmmlr[MAX_LE], border_infos[border_lo - 1].wmmlr[MIN_RI], border_infos[border_lo - 1].wmmlr[MAX_RI]); #endif /* init highest part (sentinel) */ border_infos[border_hi + 1].update = 0; border_infos[border_hi + 
1].crange[0] = global_counts[nareas * nclasses + 0]; border_infos[border_hi + 1].crange[1] = global_counts[nareas * nclasses + 0]; border_infos[border_hi + 1].cmmlr[MIN_LE] = border_infos[border_hi + 1].cmmlr[MAX_LE] = 0; border_infos[border_hi + 1].cmmlr[MIN_RI] = border_infos[border_hi + 1].cmmlr[MAX_RI] = global_counts[nareas * nclasses + 0]; SL_TRACE_IF(DEBUG_OR_NOT, "highest: %" slint_fmt ": init count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", border_hi + 1, border_infos[border_hi + 1].cmmlr[MIN_LE], border_infos[border_hi + 1].cmmlr[MAX_LE], border_infos[border_hi + 1].cmmlr[MIN_RI], border_infos[border_hi + 1].cmmlr[MAX_RI]); #ifdef elem_weight border_infos[border_hi + 1].wrange[0] = global_weights[nareas * nclasses + 0]; border_infos[border_hi + 1].wrange[1] = global_weights[nareas * nclasses + 0]; border_infos[border_hi + 1].wmmlr[MIN_LE] = border_infos[border_hi + 1].wmmlr[MAX_LE] = 0.0; border_infos[border_hi + 1].wmmlr[MIN_RI] = border_infos[border_hi + 1].wmmlr[MAX_RI] = global_weights[nareas * nclasses + 0]; SL_TRACE_IF(DEBUG_OR_NOT, "highest: %" slint_fmt ": init weight[min/max-left/right]: %f / %f - %f / %f", border_hi + 1, border_infos[border_hi + 1].wmmlr[MIN_LE], border_infos[border_hi + 1].wmmlr[MAX_LE], border_infos[border_hi + 1].wmmlr[MIN_RI], border_infos[border_hi + 1].wmmlr[MAX_RI]); #endif /* init regular parts (backwards) */ for (i = border_hi; i >= border_lo; --i) { border_infos[borders[i]].update = 1; border_infos[borders[i]].crange[0] = 0; border_infos[borders[i]].crange[1] = global_counts[nareas * nclasses + 0]; border_infos[borders[i]].cmmlr[MIN_LE] = -1; border_infos[borders[i]].cmmlr[MIN_RI] = border_infos[borders[i] + 1].cmmlr[MIN_RI] - mmlh[borders[i] + 1].count_min; border_infos[borders[i]].cmmlr[MAX_LE] = -1; border_infos[borders[i]].cmmlr[MAX_RI] = border_infos[borders[i] + 1].cmmlr[MAX_RI] - mmlh[borders[i] + 1].count_max; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt 
": init count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", i, borders[i], border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].cmmlr[MIN_RI], border_infos[borders[i]].cmmlr[MAX_RI]); #ifdef elem_weight border_infos[borders[i]].wrange[0] = 0.0; border_infos[borders[i]].wrange[1] = global_weights[nareas * nclasses + 0]; border_infos[borders[i]].wmmlr[MIN_LE] = -1.0; border_infos[borders[i]].wmmlr[MIN_RI] = border_infos[borders[i] + 1].wmmlr[MIN_RI] - mmlh[borders[i] + 1].weight_min; border_infos[borders[i]].wmmlr[MAX_LE] = -1.0; border_infos[borders[i]].wmmlr[MAX_RI] = border_infos[borders[i] + 1].wmmlr[MAX_RI] - mmlh[borders[i] + 1].weight_max; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": init weight[min/max-left/right]: %f / %f - %f / %f", i, borders[i], border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wmmlr[MIN_RI], border_infos[borders[i]].wmmlr[MAX_RI]); #endif /* prepare for finalization in the 1st round */ if (finalize) { for (j = 0; j < nelements; ++j) final_areas[i * nelements + j] = area_counts[(0 * nelements + j) * nclasses + 0]; final_locals[NCONDS * i + 0] = local_counts[nareas * nclasses + 0]; #ifdef elem_weight final_locals[NCONDS * i + 1] = local_weights[nareas * nclasses + 0]; #endif } } /* first direction: forward */ direction = 1; rti_tstop(rti_tid_mpi_select_exact_radix_while_round1); } /* compute prefixes for finalization */ if (finalize) { /* determine number of parts to finalize */ j = border_hi - border_lo + 1; SL_TRACE_IF(DEBUG_OR_NOT, "Exscan: finalizing %" slint_fmt " parts", j); rti_tstart(rti_tid_mpi_select_exact_radix_while_exscan); /* use local_counts to store the global prefix sums */ final_globals = local_counts; /* create global prefix sums (set rank 0 to zero) */ MPI_Exscan(&final_locals[NCONDS * border_lo], &final_globals[NCONDS * border_lo], NCONDS * j, MPI_DOUBLE, 
MPI_SUM, comm); if (rank == 0) for (i = border_lo; i <= border_hi; ++i) final_globals[NCONDS * i + 0] = #ifdef elem_weight final_globals[NCONDS * i + 1] = #endif 0.0; rti_tstop(rti_tid_mpi_select_exact_radix_while_exscan); } /* check all remaining parts */ SL_TRACE_IF(DEBUG_OR_NOT, "ROUND: %" slint_fmt ", %s", round, (direction > 0)?"forward":"backward"); nareas_new = 0; last_new_area = last_new_class = -1; nborders_removed = 0; rti_tstart(rti_tid_mpi_select_exact_radix_while_check); i = (direction > 0)?border_lo:border_hi; while ((direction > 0)?(i <= border_hi):(i >= border_lo)) { /* check partition borders[i] */ SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ": PART: %" slint_fmt ",%" slint_fmt, round, i, borders[i]); rti_tstart(rti_tid_mpi_select_exact_radix_while_check_pre); /* save to old limits */ border_info_old = border_infos[borders[i]]; /* is an update required? */ if (border_infos[borders[i]].update) { /* forward */ if (direction > 0) { /* init from min/max (always) */ border_infos[borders[i]].cmmlr[MIN_LE] = border_infos[borders[i] - 1].cmmlr[MIN_LE] + mmlh[borders[i]].count_min; border_infos[borders[i]].cmmlr[MAX_LE] = border_infos[borders[i] - 1].cmmlr[MAX_LE] + mmlh[borders[i]].count_max; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": count[min/max-left]: %" slint_fmt " + %" slint_fmt ", %" slint_fmt " + %" slint_fmt "", i, borders[i], border_infos[borders[i] - 1].cmmlr[MIN_LE], mmlh[borders[i]].count_min, border_infos[borders[i] - 1].cmmlr[MAX_LE], mmlh[borders[i]].count_max); /* check against low/high (on demand) */ if (pconds->pcm & SLPC_COUNTS_LH) { if (border_infos[borders[i]].cmmlr[MIN_LE] < mmlh[borders[i] + 1].count_low) border_infos[borders[i]].cmmlr[MIN_LE] = mmlh[borders[i] + 1].count_low; if (border_infos[borders[i]].cmmlr[MAX_LE] > mmlh[borders[i] ].count_hig) border_infos[borders[i]].cmmlr[MAX_LE] = mmlh[borders[i] ].count_hig; } #ifdef elem_weight /* init from min/max (always) */ border_infos[borders[i]].wmmlr[MIN_LE] = 
border_infos[borders[i] - 1].wmmlr[MIN_LE] + mmlh[borders[i]].weight_min; border_infos[borders[i]].wmmlr[MAX_LE] = border_infos[borders[i] - 1].wmmlr[MAX_LE] + mmlh[borders[i]].weight_max; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weight[min/max-left]: %f + %f, %f + %f", i, borders[i], border_infos[borders[i] - 1].wmmlr[MIN_LE], mmlh[borders[i]].weight_min, border_infos[borders[i] - 1].wmmlr[MAX_LE], mmlh[borders[i]].weight_max); /* check against low/high (on demand) */ if (pconds->pcm & SLPC_WEIGHTS_LH) { if (border_infos[borders[i]].wmmlr[MIN_LE] < mmlh[borders[i] + 1].weight_low) border_infos[borders[i]].wmmlr[MIN_LE] = mmlh[borders[i] + 1].weight_low; if (border_infos[borders[i]].wmmlr[MAX_LE] > mmlh[borders[i] ].weight_hig) border_infos[borders[i]].wmmlr[MAX_LE] = mmlh[borders[i] ].weight_hig; } #endif } else /* backward */ { /* init from min/max (always) */ border_infos[borders[i]].cmmlr[MIN_RI] = border_infos[borders[i] + 1].cmmlr[MIN_RI] - mmlh[borders[i] + 1].count_min; border_infos[borders[i]].cmmlr[MAX_RI] = border_infos[borders[i] + 1].cmmlr[MAX_RI] - mmlh[borders[i] + 1].count_max; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": count[min/max-right]: %" slint_fmt " - %" slint_fmt ", %" slint_fmt " - %" slint_fmt "", i, borders[i], border_infos[borders[i] + 1].cmmlr[MIN_RI], mmlh[borders[i] + 1].count_min, border_infos[borders[i] + 1].cmmlr[MAX_RI], mmlh[borders[i] + 1].count_max); /* check against low/high (on demand) */ if (pconds->pcm & SLPC_COUNTS_LH) { if (border_infos[borders[i]].cmmlr[MAX_RI] < mmlh[borders[i] + 1].count_low) border_infos[borders[i]].cmmlr[MAX_RI] = mmlh[borders[i] + 1].count_low; if (border_infos[borders[i]].cmmlr[MIN_RI] > mmlh[borders[i] ].count_hig) border_infos[borders[i]].cmmlr[MIN_RI] = mmlh[borders[i] ].count_hig; } #ifdef elem_weight /* init from min/max (always) */ border_infos[borders[i]].wmmlr[MIN_RI] = border_infos[borders[i] + 1].wmmlr[MIN_RI] - mmlh[borders[i] + 1].weight_min; 
border_infos[borders[i]].wmmlr[MAX_RI] = border_infos[borders[i] + 1].wmmlr[MAX_RI] - mmlh[borders[i] + 1].weight_max; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weight[min/max-right]: %f - %f, %f - %f", i, borders[i], border_infos[borders[i] + 1].wmmlr[MIN_RI], mmlh[borders[i] + 1].weight_min, border_infos[borders[i] + 1].wmmlr[MAX_RI], mmlh[borders[i] + 1].weight_max); /* check against low/high (on demand) */ if (pconds->pcm & SLPC_WEIGHTS_LH) { if (border_infos[borders[i]].wmmlr[MAX_RI] < mmlh[borders[i] + 1].weight_low) border_infos[borders[i]].wmmlr[MAX_RI] = mmlh[borders[i] + 1].weight_low; if (border_infos[borders[i]].wmmlr[MIN_RI] > mmlh[borders[i] ].weight_hig) border_infos[borders[i]].wmmlr[MIN_RI] = mmlh[borders[i] ].weight_hig; } #endif } SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", i, borders[i], border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].cmmlr[MIN_RI], border_infos[borders[i]].cmmlr[MAX_RI]); /* check against inconsistence */ if (border_infos[borders[i]].cmmlr[MIN_LE] > border_infos[borders[i]].cmmlr[MIN_RI]) border_infos[borders[i]].cmmlr[MIN_LE] = border_infos[borders[i]].cmmlr[MIN_RI] = (border_infos[borders[i]].cmmlr[MIN_LE] + border_infos[borders[i]].cmmlr[MIN_RI]) / 2; if (border_infos[borders[i]].cmmlr[MAX_LE] < border_infos[borders[i]].cmmlr[MAX_RI]) border_infos[borders[i]].cmmlr[MAX_LE] = border_infos[borders[i]].cmmlr[MAX_RI] = (border_infos[borders[i]].cmmlr[MAX_LE] + border_infos[borders[i]].cmmlr[MAX_RI]) / 2; #ifdef elem_weight SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weight[min/max-left/right]: %f / %f - %f / %f", i, borders[i], border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wmmlr[MIN_RI], border_infos[borders[i]].wmmlr[MAX_RI]); /* check against inconsistence */ if 
(border_infos[borders[i]].wmmlr[MIN_LE] > border_infos[borders[i]].wmmlr[MIN_RI]) border_infos[borders[i]].wmmlr[MIN_LE] = border_infos[borders[i]].wmmlr[MIN_RI] = (border_infos[borders[i]].wmmlr[MIN_LE] + border_infos[borders[i]].wmmlr[MIN_RI]) / 2; if (border_infos[borders[i]].wmmlr[MAX_LE] < border_infos[borders[i]].wmmlr[MAX_RI]) border_infos[borders[i]].wmmlr[MAX_LE] = border_infos[borders[i]].wmmlr[MAX_RI] = (border_infos[borders[i]].wmmlr[MAX_LE] + border_infos[borders[i]].wmmlr[MAX_RI]) / 2; #endif } SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", i, borders[i], border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].cmmlr[MIN_RI], border_infos[borders[i]].cmmlr[MAX_RI]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": crange: %" slint_fmt " - %" slint_fmt "", i, borders[i], border_infos[borders[i]].crange[0], border_infos[borders[i]].crange[1]); /* select highest min and lowest max */ current_cmm[0] = xmax(border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].cmmlr[MAX_RI]) - border_infos[borders[i]].crange[0]; current_cmm[1] = xmin(border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].cmmlr[MIN_RI]) - border_infos[borders[i]].crange[0]; if (rank == 0) SL_ASSERT(current_cmm[0] <= current_cmm[1]); if (rank == 0) SL_ASSERT(0 <= current_cmm[0]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": current_count: %" slint_fmt " - %" slint_fmt "", i, borders[i], current_cmm[0], current_cmm[1]); #ifdef elem_weight SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weight[min/max-left/right]: %f / %f - %f / %f", i, borders[i], border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wmmlr[MIN_RI], border_infos[borders[i]].wmmlr[MAX_RI]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": wrange: %f - %f", i, 
borders[i], border_infos[borders[i]].wrange[0], border_infos[borders[i]].wrange[1]); /* select highest min and lowest max */ current_wmm[0] = xmax(border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wmmlr[MAX_RI]) - border_infos[borders[i]].wrange[0]; current_wmm[1] = xmin(border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wmmlr[MIN_RI]) - border_infos[borders[i]].wrange[0]; if (rank == 0) SL_ASSERT(current_wmm[0] <= current_wmm[1]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": current_weight: %f - %f", i, borders[i], current_wmm[0], current_wmm[1]); #endif rti_tstop(rti_tid_mpi_select_exact_radix_while_check_pre); /* HIT is the default */ refine = 0; if (!finalize) { rti_tstart(rti_tid_mpi_select_exact_radix_while_check_classes); lcs = gcs = 0; #ifdef elem_weight lws = gws = 0.0; #endif for (k = 0; k < nclasses; ++k) { lc = local_counts[border_areas[i] * nclasses + k]; gc = global_counts[border_areas[i] * nclasses + k]; current_cmm[0] -= gc; current_cmm[1] -= gc; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": k = %" sl_key_pure_type_fmt ", current_count: %" slint_fmt " - %" slint_fmt ", lc = %" slint_fmt ", lcs = %" slint_fmt ", gc = %" slint_fmt ", gcs = %" slint_fmt, i, borders[i], k, current_cmm[0], current_cmm[1], lc, lcs, gc, gcs); #ifdef elem_weight lw = local_weights[border_areas[i] * nclasses + k]; gw = global_weights[border_areas[i] * nclasses + k]; current_wmm[0] -= gw; current_wmm[1] -= gw; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": k = %" sl_key_pure_type_fmt ", current_weight: %e - %e", i, borders[i], k, current_wmm[0], current_wmm[1]); #endif /* stop and refine if max count is skipped OR min count AND max weight is skipped */ if ((current_cmm[1] < 0) #ifdef elem_weight || (current_cmm[0] < 0 && current_wmm[1] < 0.0) #endif ) { refine = 1; break; } lcs += lc; gcs += gc; gc = 0; #ifdef elem_weight lws += lw; gws += gw; gw = 0.0; #endif /* if between min/max counts */ if (current_cmm[0] 
<= 0 && current_cmm[1] >= 0) { #ifdef elem_weight SL_TRACE_IF(DEBUG_OR_NOT, "got to next: %d && %d", (current_cmm[1] > 0), (current_wmm[0] > 0)); /* go to next if max count not reached AND min weight not reached */ if (current_cmm[1] > 0 && current_wmm[0] > 0) continue; #endif /* look ahead for a better stop */ if (k + 1 < nclasses && current_cmm[1] - global_counts[border_areas[i] * nclasses + k + 1] >= 0) { #ifdef elem_weight /* continue if weights will improve */ if (myabs(current_wmm[0] + current_wmm[1]) > myabs(current_wmm[0] + current_wmm[1] - 2 * global_weights[border_areas[i] * nclasses + k + 1])) continue; #else /* continue if counts will improve */ if (myabs(current_cmm[0] + current_cmm[1]) > myabs(current_cmm[0] + current_cmm[1] - 2 * global_counts[border_areas[i] * nclasses + k + 1])) continue; #endif } /* stop */ break; } } SL_ASSERT_IF((rank == 0), k < nclasses); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": %s k = %" sl_key_pure_type_fmt ", lcs = %" slint_fmt, i, borders[i], (refine)?"REFINE":"HIT", k, lcs); /* make sure k is safe (it is used as index later) */ if (k >= nclasses) k = nclasses - 1; /* break the local contribution into contributions for the lists of elements */ for (j = 0; j < nelements; ++j) { lcsv[j] = 0; for (l = 0; l < k; ++l) lcsv[j] += area_counts[((border_areas[i] * nelements + j) * nclasses) + l]; if (refine) lcv[j] = area_counts[((border_areas[i] * nelements + j) * nclasses) + k]; else { lcv[j] = 0; lcsv[j] += area_counts[((border_areas[i] * nelements + j) * nclasses) + k]; } lcs -= lcsv[j]; } rti_tstop(rti_tid_mpi_select_exact_radix_while_check_classes); } else { rti_tstart(rti_tid_mpi_select_exact_radix_while_check_final); k = 0; #ifdef elem_weight /* middle of min/max weight */ mw = (current_wmm[0] + current_wmm[1]) / 2.0; /* min. 
part of weight to contribute */ dw = xmax(0, mw - final_globals[NCONDS * i + 1]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": mw = %e, dw = %e", i, borders[i], mw, dw); #else /* middle of min/max count */ mc = (current_cmm[0] + current_cmm[1]) / 2; /* min. part of count to contribute */ dc = xmax(0, mc - final_globals[NCONDS * i + 0]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": mc = %" slint_fmt ", dc = %" slint_fmt, i, borders[i], mc, dc); #endif /* contribute all? */ if ( #ifdef elem_weight dw >= final_locals[NCONDS * i + 1] #else dc >= final_locals[NCONDS * i + 0] #endif ) { lc = final_locals[NCONDS * i + 0]; #ifdef elem_weight lw = final_locals[NCONDS * i + 1]; #endif } else { /* contribute only a part */ #ifdef elem_weight lc = 0; for (j = 0; j < nelements; ++j) { elem_assign_at(&areas[border_areas[i] * nelements + j], areas[border_areas[i] * nelements + j].size, &end); for (elem_assign(&areas[border_areas[i] * nelements + j], &xi); xi.keys < end.keys; elem_inc(&xi)) { dw -= elem_weight(&xi, 0); ++lc; if (dw < 0.0 || lc >= final_locals[NCONDS * i + 0]) { dw += elem_weight(&xi, 0); --lc; break; } } } lw = dw; #else lc = dc; #endif } /* check mc against min/max count borders */ lc = xminmax(current_cmm[0] - final_globals[NCONDS * i + 0], lc, current_cmm[1] - final_globals[NCONDS * i + 0]); /* check agains 0 (don't step back!) 
and the local contribution */ lc = xminmax(0, lc, final_locals[NCONDS * i + 0]); lcs = lc; #ifdef elem_weight lws = lw; #endif #ifdef elem_weight SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": next border: %" slint_fmt " <= %" slint_fmt " + %" slint_fmt " <= %" slint_fmt, i, borders[i], border_lo, i, direction, border_hi); if (border_lo <= i + direction && i + direction <= border_hi) SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": next border: %" slint_fmt " == %" slint_fmt " + %" slint_fmt, i, borders[i], borders[i + direction], borders[i], direction); /* FIXME: finalize geht auch rückwärts!!! */ /* if the next open border is really the _next_ border */ if (border_lo <= i + direction && i + direction <= border_hi && borders[i + direction] == borders[i] + direction) { /* determine the exact global counts/weights (damn, this is expensive) */ mcw[0] = lcs; mcw[1] = lws; MPI_Allreduce(&mcw[0], &mcw[2], 2, MPI_DOUBLE, MPI_SUM, comm); } else { /* the exact global counts/weights are not required */ mcw[2] = 0.0; mcw[3] = 0.0; } gc = 0; gcs = mcw[2]; gw = 0.0; gws = mcw[3]; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": gcs = %" slint_fmt ", gws = %f", i, borders[i], gcs, gws); #else /* the global count is simply mc */ gc = 0; gcs = mc; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": gcs = %" slint_fmt, i, borders[i], gcs); #endif SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": lcs = %" slint_fmt, i, borders[i], lcs); /* break the local contribution into contributions for the lists of elements */ for (j = 0; j < nelements; ++j) { lcv[j] = 0; lcsv[j] = xmin(lcs, final_areas[i * nelements + j]); lcs -= lcsv[j]; } SL_TRACE_ARRAY_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": lcsv = ", "%" slint_fmt, j, nelements, lcsv, i, borders[i]); rti_tstop(rti_tid_mpi_select_exact_radix_while_check_final); } SL_ASSERT(lcs == 0); /* accept local contributions */ for (j = 0; j < nelements; ++j) sdispls[(borders[i] + 1) * nelements + j] 
+= lcsv[j]; rti_tstart(rti_tid_mpi_select_exact_radix_while_check_post); /* this is wrong, e.g., even if gc == 0 and gcs == 0 then crange[1] is set to crange[0]! */ /* if (gc > 0 || gcs > 0 #ifdef elem_weight || gw != 0.0 || gws != 0.0 #endif )*/ { border_infos[borders[i]].crange[0] += gcs; border_infos[borders[i]].crange[1] = border_infos[borders[i]].crange[0] + gc; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": counts_range: %" slint_fmt " %" slint_fmt "", i, borders[i], border_infos[borders[i]].crange[0], border_infos[borders[i]].crange[1]); border_infos[borders[i]].cmmlr[MIN_LE] = xminmax(border_infos[borders[i]].crange[0], border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].crange[1]); border_infos[borders[i]].cmmlr[MAX_LE] = xminmax(border_infos[borders[i]].crange[0], border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].crange[1]); border_infos[borders[i]].cmmlr[MIN_RI] = xminmax(border_infos[borders[i]].crange[0], border_infos[borders[i]].cmmlr[MIN_RI], border_infos[borders[i]].crange[1]); border_infos[borders[i]].cmmlr[MAX_RI] = xminmax(border_infos[borders[i]].crange[0], border_infos[borders[i]].cmmlr[MAX_RI], border_infos[borders[i]].crange[1]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", i, borders[i], border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].cmmlr[MIN_RI], border_infos[borders[i]].cmmlr[MAX_RI]); #ifdef elem_weight border_infos[borders[i]].wrange[0] += gws; border_infos[borders[i]].wrange[1] = border_infos[borders[i]].wrange[0] + gw; SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weights_range: %f %f", i, borders[i], border_infos[borders[i]].wrange[0], border_infos[borders[i]].wrange[1]); border_infos[borders[i]].wmmlr[MIN_LE] = xminmax(border_infos[borders[i]].wrange[0], border_infos[borders[i]].wmmlr[MIN_LE], 
border_infos[borders[i]].wrange[1]); border_infos[borders[i]].wmmlr[MAX_LE] = xminmax(border_infos[borders[i]].wrange[0], border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wrange[1]); border_infos[borders[i]].wmmlr[MIN_RI] = xminmax(border_infos[borders[i]].wrange[0], border_infos[borders[i]].wmmlr[MIN_RI], border_infos[borders[i]].wrange[1]); border_infos[borders[i]].wmmlr[MAX_RI] = xminmax(border_infos[borders[i]].wrange[0], border_infos[borders[i]].wmmlr[MAX_RI], border_infos[borders[i]].wrange[1]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weight[min/max-left/right]: %f / %f - %f / %f", i, borders[i], border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wmmlr[MIN_RI], border_infos[borders[i]].wmmlr[MAX_RI]); #endif } SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": range diff 0: %" slint_fmt "-%" slint_fmt " | %" slint_fmt "-%" slint_fmt, i, borders[i], border_infos[borders[i]].crange[0] - border_infos[borders[i] - 1].crange[1], border_infos[borders[i]].crange[0] - border_infos[borders[i] - 1].crange[0], border_infos[borders[i] + 1].crange[0] - border_infos[borders[i]].crange[0], border_infos[borders[i] + 1].crange[1] - border_infos[borders[i]].crange[0]); SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": range diff 1: %" slint_fmt "-%" slint_fmt " | %" slint_fmt "-%" slint_fmt, i, borders[i], border_infos[borders[i]].crange[1] - border_infos[borders[i] - 1].crange[1], border_infos[borders[i]].crange[1] - border_infos[borders[i] - 1].crange[0], border_infos[borders[i] + 1].crange[0] - border_infos[borders[i]].crange[1], border_infos[borders[i] + 1].crange[1] - border_infos[borders[i]].crange[1]); if (border_infos[borders[i]].cmmlr[MIN_LE] != border_info_old.cmmlr[MIN_LE] || border_infos[borders[i]].cmmlr[MAX_LE] != border_info_old.cmmlr[MAX_LE] #ifdef elem_weight || border_infos[borders[i]].wmmlr[MIN_LE] != border_info_old.wmmlr[MIN_LE] || 
border_infos[borders[i]].wmmlr[MAX_LE] != border_info_old.wmmlr[MAX_LE] #endif ) border_infos[borders[i] + 1].update = 1; if (border_infos[borders[i]].cmmlr[MIN_RI] != border_info_old.cmmlr[MIN_RI] || border_infos[borders[i]].cmmlr[MAX_RI] != border_info_old.cmmlr[MAX_RI] #ifdef elem_weight || border_infos[borders[i]].wmmlr[MIN_RI] != border_info_old.wmmlr[MIN_RI] || border_infos[borders[i]].wmmlr[MAX_RI] != border_info_old.wmmlr[MAX_RI] #endif ) border_infos[borders[i] - 1].update = 1; border_infos[borders[i]].update = 0; /* refine or remove */ if (refine) { /* bits left for partitioning? */ if (rhigh >= rlow) { if (last_new_area == border_areas[i] && last_new_class == k) border_areas[i] = nareas_new - 1; else { /* update last_new_... */ last_new_area = border_areas[i]; last_new_class = k; /* create new area */ for (j = 0; j < nelements; ++j) { elem_assign_at(&areas[border_areas[i] * nelements + j], lcsv[j], &areas_new[nareas_new * nelements + j]); areas_new[nareas_new * nelements + j].size = lcv[j]; } border_areas[i] = nareas_new; ++nareas_new; } } else { for (j = 0; j < nelements; ++j) final_areas[(i - nborders_removed * direction) * nelements + j] = lcv[j]; /* save local count/weight for the later prefix calculations */ final_locals[NCONDS * (i - nborders_removed * direction) + 0] = lc; #ifdef elem_weight final_locals[NCONDS * (i - nborders_removed * direction) + 1] = lw; #endif } borders[i - nborders_removed * direction] = borders[i]; border_areas[i - nborders_removed * direction] = border_areas[i]; } else ++nborders_removed; rti_tstop(rti_tid_mpi_select_exact_radix_while_check_post); i += direction; } /* restrict the parts */ if (direction > 0) border_hi -= nborders_removed; else border_lo += nborders_removed; /* change direction */ direction *= -1; rti_tstop(rti_tid_mpi_select_exact_radix_while_check); /* switch areas */ nareas = nareas_new; if (areas == areas0) { areas = areas1; areas_new = areas0; } else { areas = areas0; areas_new = areas1; } } 
rti_tstop(rti_tid_mpi_select_exact_radix_while); sl_free(area_counts); sl_free(local_counts); sl_free(global_counts); rti_tstop(rti_tid_mpi_select_exact_radix); #ifdef VERIFY v = mpi_post_check_partconds(s, nelements, nparts, pconds, sdispls, size, rank, comm); SL_ASSERT_IF(rank == 0, v < 0); SL_NOTICE_IF(rank == 0, "post_check_partconds: %s (%" slint_fmt ")", (v >= 0)?"FAILED":"SUCCESS", v); #endif #ifdef PRINT_SDISPLS printf("%d: sdispls:", rank); for (i = 0; i < nparts; ++i) printf(" %d ", sdispls[i]); printf("\n"); #endif #ifdef PRINT_STATS mpi_select_stats(s, nparts, sdispls, size, rank, comm); #endif #if defined(PRINT_TIMINGS) && defined(SL_USE_RTI_TIM) if (rank == PRINT_TIMINGS) { printf("%d: mpi_select_exact_radix: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix)); printf("%d: mpi_select_exact_radix: sync: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_sync)); printf("%d: mpi_select_exact_radix: while: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while)); printf("%d: mpi_select_exact_radix: count: %f\n", rank, rti_tcumu(rti_tid_mpi_select_exact_radix_while_count)); printf("%d: mpi_select_exact_radix: allreduce: %f\n", rank, rti_tcumu(rti_tid_mpi_select_exact_radix_while_allreduce)); printf("%d: mpi_select_exact_radix: round1: %f\n", rank, rti_tcumu(rti_tid_mpi_select_exact_radix_while_round1)); printf("%d: mpi_select_exact_radix: allgather: %f\n", rank, rti_tcumu(rti_tid_mpi_select_exact_radix_while_round1_allgather)); printf("%d: mpi_select_exact_radix: exscan: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while_exscan)); printf("%d: mpi_select_exact_radix: check: %f\n", rank, rti_tcumu(rti_tid_mpi_select_exact_radix_while_check)); printf("%d: mpi_select_exact_radix: pre: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while_check_pre)); printf("%d: mpi_select_exact_radix: classes: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while_check_classes)); printf("%d: mpi_select_exact_radix: final: %f\n", rank, 
rti_tlast(rti_tid_mpi_select_exact_radix_while_check_final)); printf("%d: mpi_select_exact_radix: post: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while_check_post)); printf("%d: mpi_select_exact_radix: rounds: %" slint_fmt "\n", rank, round); } #endif return 0; }
int main(int argc, char *argv[]) { int r = 6; int n = 10; ATYPE a[] = {0,2,1,3,4,2,1,5,4,5}; ATYPE b[n]; int rank, size; MPI_Init(&argc,&argv); // get rank and size from communicator MPI_Comm_size(MPI_COMM_WORLD,&size); MPI_Comm_rank(MPI_COMM_WORLD,&rank); if (rank == root) reference(a, n, r, b); int i; int m= n/size; /* create bucket locally */ ATYPE B[r]; const uint block_size = (rank != size-1) ? n/size : n - (n/size)*rank; printf("blocksize %d :rank %d\n",block_size, rank); ATYPE local_A[block_size]; ATYPE local_B[r]; int j = 0; for(i = rank * block_size; i < (rank + 1) * block_size; i++) { local_A[j] = a[i]; j++; } for(i = 0; i < r; i++) local_B[i] = 0; for(i = 0; i < block_size; i++) local_B[local_A[i]] ++; ATYPE AllB[r]; ATYPE RelB[r]; MPI_Allreduce(&local_B, AllB, r, ATYPE_MPI, MPI_SUM, MPI_COMM_WORLD); MPI_Exscan(&local_B, RelB,r, ATYPE_MPI, MPI_SUM, MPI_COMM_WORLD); // printArray(RelB, r); ATYPE temp[r]; if ( rank > root) { for(int j=1; j< block_size; j++) { if (local_A[j] > 0) { local_A[ local_A[j] + RelB[ local_A[j]] + local_A[j-1] ] = local_A[j]; } } printf("local_A\n"); printArray(local_A, block_size); } MPI_Finalize(); return 0; }
int vec2elemental(const std::vector<double> &vec, El::DistMatrix<El::Complex<double>,El::VC,El::STAR> &Y){ int data_dof=2; int SCAL_EXP = 1; int nlocal,gsize; //local elements, start p_id, global size double *pt_array; // will hold local array int r,q,rq; //Grid sizes int nbigs; //Number of large sends (i.e. send 1 extra data point) int pstart; // p_id of nstart int rank = El::mpi::WorldRank(); //p_id int send_size; // base send size bool print = rank == -1; // Get Grid and associated params const El::Grid* g = &(Y.Grid()); r = g->Height(); q = g->Width(); MPI_Comm comm = (g->Comm()).comm; // Get sizes, array in petsc nlocal = vec.size()/data_dof; int nstart = 0; MPI_Exscan(&nlocal,&nstart,1,MPI_INT,MPI_SUM,comm); //VecGetOwnershipRange(pt_vec,&nstart,NULL); //Find processor that nstart belongs to, number of larger sends rq = r * q; pstart = nstart % rq; //int div nbigs = nlocal % rq; send_size = nlocal/rq; if(print){ std::cout << "r: " << r << " q: " << q <<std::endl; std::cout << "nstart: " << nstart << std::endl; std::cout << "ps: " << pstart << std::endl; std::cout << "nbigs: " << nbigs << std::endl; std::cout << "send_size: " << send_size << std::endl; } // Make send_lengths std::vector<int> send_lengths(rq); std::fill(send_lengths.begin(),send_lengths.end(),send_size); if(nbigs >0){ for(int j=0;j<nbigs;j++){ send_lengths[(pstart + j) % rq] += 1; } } // Make send_disps std::vector<int> send_disps = exscan(send_lengths); std::vector<El::Complex<double>> indata(nlocal); // copy the data from an ffm tree to into a local vec of complex data for sending #pragma omp parallel for El::Complex<double> val; for(int i=0;i<nlocal;i++){ El::SetRealPart(val,vec[2*i+0]); El::SetImagPart(val,vec[2*i+1]); indata[i] = val; } // Make send_dataA, i.e. 
reorder the data std::vector<El::Complex<double>> send_data(nlocal); for(int proc=0;proc<rq;proc++){ int offset = send_disps[proc]; int base_idx = (proc - pstart + rq) % rq; for(int j=0; j<send_lengths[proc]; j++){ int idx = base_idx + (j * rq); send_data[offset + j] = indata[idx]; } } // Do all2all to get recv_lengths std::vector<int> recv_lengths(rq); MPI_Alltoall(&send_lengths[0], 1, MPI_INT, &recv_lengths[0], 1, MPI_INT,comm); // Scan to get recv_disps std::vector<int> recv_disps = exscan(recv_lengths); // Do all2allv to get data on correct processor El::Complex<double> * recv_data = Y.Buffer(); //MPI_Alltoallv(&send_data[0],&send_lengths[0],&send_disps[0],MPI_DOUBLE, \ // &recv_data[0],&recv_lengths[0],&recv_disps[0],MPI_DOUBLE,comm); El::mpi::AllToAll(&send_data[0], &send_lengths[0], &send_disps[0], recv_data,&recv_lengths[0],&recv_disps[0],comm); if(print){ std::cout << "Send data: " <<std::endl << send_data <<std::endl; std::cout << "Send lengths: " <<std::endl << send_lengths <<std::endl; std::cout << "Send disps: " <<std::endl << send_disps <<std::endl; std::cout << "Recv data: " <<std::endl << recv_data <<std::endl; std::cout << "Recv lengths: " <<std::endl << recv_lengths <<std::endl; std::cout << "Recv disps: " <<std::endl << recv_disps <<std::endl; } return 0; }