예제 #1
0
파일: test17.c 프로젝트: caisan/umpi
/*
 * MPI_Exscan demo: every rank contributes the pair { rank, 1 } and receives
 * the element-wise sum of the pairs contributed by all lower ranks.
 *
 * Returns 0 on success and 1 if MPI cannot be initialised or the job was
 * started with fewer than two processes.
 */
int main(int argc, char **argv)
{
	if (MPI_Init(&argc, &argv) != MPI_SUCCESS) {
		fprintf(stderr, "MPI initialization failed.\n");
		return 1;
	}
	int rank, size;
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &size);
	if (size < 2) {
		fprintf(stderr, "cant play this game alone.\n");
		/* MPI was initialised above, so it must be shut down before leaving. */
		MPI_Finalize();
		return 1;
	}
	int sendbuf[2] = { rank, 1 };
	fprintf(stderr, "[ %d ] my numbers are: %3d %3d\n", rank, sendbuf[0], sendbuf[1]);
	int recvbuf[2];
	if (MPI_Exscan(sendbuf, recvbuf, 2, MPI_INT, MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS) {
		fprintf(stderr, "MPI_Exscan failed\n");
		MPI_Abort(MPI_COMM_WORLD, 1);
	}
	/* recvbuf is undefined on rank 0 after an exclusive scan, so only the
	 * other ranks report what they received. */
	if (rank)
		fprintf(stderr, "[ %d ] received sum Exscan %3d %3d\n", rank, recvbuf[0], recvbuf[1]);
	MPI_Finalize();
	return 0;
}
예제 #2
0
/*
 * Compares three sum reductions of the process ranks:
 *   MPI_Scan   - inclusive prefix sum (ranks 0..r),
 *   MPI_Exscan - exclusive prefix sum (ranks 0..r-1),
 *   MPI_Reduce - total sum, delivered to the root only.
 */
int main(int argc, char *argv[])
{
    const int root = 0;
    int processCount;
    int currentRank;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &processCount);
    MPI_Comm_rank(MPI_COMM_WORLD, &currentRank);
    int reduce = currentRank;
    int reduce2 = 0;
    int reduce3 = 0;

    MPI_Scan(&currentRank, &reduce, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    printf("Scan: process %d:   reduce = %d\n", currentRank, reduce);

    MPI_Exscan(&currentRank, &reduce2, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    /* MPI leaves the receive buffer undefined on rank 0; pin it to the
     * identity of the sum so the printed value is well defined. */
    if (currentRank == 0)
        reduce2 = 0;
    printf("Exscan: process %d:   reduce = %d\n", currentRank, reduce2);

    MPI_Reduce(&currentRank, &reduce3, 1, MPI_INT, MPI_SUM, root, MPI_COMM_WORLD);
    /* Only the root holds the reduced value. */
    if (currentRank == root)
        printf("Reduce: process %d:   reduce = %d\n", currentRank, reduce3);

    MPI_Finalize();
    return 0;
}
예제 #3
0
파일: exscanf.c 프로젝트: agrimaldi/pmap
FORT_DLL_SPEC void FORT_CALL mpi_exscan_ ( void*v1, void*v2, MPI_Fint *v3, MPI_Fint *v4, MPI_Fint *v5, MPI_Fint *v6, MPI_Fint *ierr ){
    /* Fortran entry point for MPI_Exscan: converts the by-reference Fortran
     * arguments to their C counterparts, forwards the call and stores the
     * C return code in *ierr. */

#ifndef HAVE_MPI_F_INIT_WORKS_WITH_C
    /* Run the Fortran-side initialisation once if it is still pending. */
    if (MPIR_F_NeedInit) {
        mpirinitf_();
        MPIR_F_NeedInit = 0;
    }
#endif

    /* Map the Fortran in-place marker onto the C one. */
    if (v1 == MPIR_F_MPI_IN_PLACE) {
        v1 = MPI_IN_PLACE;
    }

    const int count = (int)*v3;
    const MPI_Datatype datatype = (MPI_Datatype)(*v4);
    const MPI_Op op = (MPI_Op)*v5;
    const MPI_Comm comm = (MPI_Comm)(*v6);

    *ierr = MPI_Exscan( v1, v2, count, datatype, op, comm );
}
예제 #4
0
/// Collectively writes every rank's `data` to `f`, rank after rank, starting
/// at byte position `base`.
///
/// Each rank's position is `base` plus the number of bytes held by all lower
/// ranks (exclusive prefix sum over `comm`).
///
/// @param data  local elements, written as raw bytes
/// @param f     file handle used for the collective write
/// @param base  byte offset in the file at which rank 0's data starts
/// @param comm  communicator over which the offsets and total are computed
/// @return total number of bytes contributed by all ranks of `comm`
///
/// NOTE(review): the byte count is passed to MPI_File_write_at_all as an
/// `int`, so a single rank holding more than INT_MAX bytes would be truncated.
static MPI_Offset writeToMPI(const std::vector<T>& data, MPI_File f, MPI_Offset base, MPI_Comm comm)
{
    MPI_Offset offset = 0, nbytes = data.size()*sizeof(T);
    MPI_Check( MPI_Exscan(&nbytes, &offset, 1, MPI_OFFSET, MPI_SUM, comm));

    // MPI leaves the Exscan receive buffer undefined on rank 0, so do not
    // rely on the initial value surviving the call: rank 0 always starts at
    // `base`.
    int rank = 0;
    MPI_Check( MPI_Comm_rank(comm, &rank) );
    if (rank == 0)
        offset = 0;

    MPI_Check( MPI_File_write_at_all(f, base + offset, data.data(), nbytes, MPI_CHAR, MPI_STATUS_IGNORE));

    MPI_Offset ntotal = 0;
    MPI_Check( MPI_Allreduce(&nbytes, &ntotal, 1, MPI_OFFSET, MPI_SUM, comm) );

    return ntotal;
}
예제 #5
0
파일: comm.cpp 프로젝트: ibaned/omega_h2
// Exclusive scan of `x` over this communicator using reduction `op`.
// With MPI enabled the scan is done in place and rank 0 returns 0; without
// MPI the result is always 0.
T Comm::exscan(T x, Omega_h_Op op) const {
#ifdef OMEGA_H_USE_MPI
  CALL(MPI_Exscan(
      MPI_IN_PLACE, &x, 1, MpiTraits<T>::datatype(), mpi_op(op), impl_));
  if (rank() != 0) {
    return x;
  }
  // Rank 0 has no predecessors: report zero instead of whatever is in x.
  x = 0;
  return x;
#else
  // Both arguments are unused in the serial build.
  (void)x;
  (void)op;
  return 0;
#endif
}
예제 #6
0
// For every row ("digit") of the prefix-summed bucket table, takes the last
// entry of the row as the local count and combines it across `comm`.
//
// Returns a tuple of two vectors, one entry per row:
//   - the sum of the local counts over all ranks (MPI_Allreduce), and
//   - the sum of the local counts over all lower ranks (MPI_Exscan); all
//     zeros on rank 0.
//
// An empty row contributes 0. Collective: every rank of `comm` must call it
// with a table of the same number of rows.
std::tuple<vector<unsigned int>, vector<unsigned int>> compute_global_t_prime_sums_and_exscans_arrays(const vector<vector<unsigned int>> &prefix_summed_bucket_table,
                                                                                                      MPI_Comm comm) {
    const size_t digit_count = prefix_summed_bucket_table.size();

    vector<unsigned int> t_primes(digit_count),
                         t_primes_summed(digit_count),
                         t_primes_exscanned(digit_count);

    // The last prefix sum of each row is the local total for that digit.
    // back() on an empty row would be undefined, so treat it as zero.
    for (size_t i = 0; i < digit_count; ++i) {
        const vector<unsigned int> &row = prefix_summed_bucket_table[i];
        t_primes[i] = row.empty() ? 0u : row.back();
    }

    // MPI takes the element count as int; data() stays valid for empty vectors
    // where &v[0] would not.
    const int count = static_cast<int>(digit_count);

    // Calculate, for each digit, the total number of elements with the same digit across all processors
    MPI_Allreduce(t_primes.data(), t_primes_summed.data(), count, MPI_UNSIGNED, MPI_SUM, comm);

    // Calculate, for each digit, the total number of elements with the same digit but on processors with smaller rank
    MPI_Exscan(t_primes.data(), t_primes_exscanned.data(), count, MPI_UNSIGNED, MPI_SUM, comm);

    // The Exscan receive buffer is undefined on rank 0; make the "nothing
    // before me" result explicit instead of relying on it being untouched.
    int rank = 0;
    MPI_Comm_rank(comm, &rank);
    if (rank == 0) {
        t_primes_exscanned.assign(digit_count, 0u);
    }

    // Return both
    return make_tuple(t_primes_summed, t_primes_exscanned);
}
  void globalUniquenessOfIds(std::vector<Q>& localVector, ReadIdType localReadCount, MPI_Comm comm)
  {
    int rank;
    MPI_Comm_rank(comm, &rank);

    ReadIdType previousReadIdSum;

    //Get MPI Datatype using mxx library
    mxx::datatype<ReadIdType> MPI_ReadIDType;
    MPI_Exscan(&localReadCount, &previousReadIdSum, 1, MPI_ReadIDType.type(), MPI_SUM, comm);


    //Update all elements
    if(rank > 0)
    {
      for ( auto& eachTuple : localVector) 
      {
        //Update Pc only
        std::get<readTuple::rid>(eachTuple) = std::get<readTuple::rid>(eachTuple) + previousReadIdSum;
      }
    }
  }
예제 #8
0
/*
 * MPI_Exscan self-test on MPI_COMM_WORLD: every rank contributes its own
 * rank number and checks the exclusive prefix sum it gets back.
 */
int main(int argc, char *argv[])
{
    int errs = 0;
    int rank, size;
    int contribution, prefix;
    MPI_Comm comm;

    MTest_Init(&argc, &argv);

    comm = MPI_COMM_WORLD;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    contribution = rank;
    prefix = -2;        /* sentinel, checked on rank 0 after the scan */

    MPI_Exscan(&contribution, &prefix, 1, MPI_INT, MPI_SUM, comm);

    /* Inputs:   0  1  2  3  4  5  6  7  8 ...
     * Outputs:  -  0  1  3  6 10 15 21 28 36
     * i.e. the sum of all lower ranks; rank 0 receives no data. */
    if (rank <= 0) {
        /* The test expects the sentinel to still be in place on rank 0. */
        if (prefix != -2) {
            errs++;
            fprintf(stderr, "Error in recvbuf on zero, is %d\n", prefix);
        }
    } else {
        int expected = (((rank) * (rank - 1)) / 2);
        if (prefix != expected) {
            errs++;
            fprintf(stderr, "Error in recvbuf = %d on %d, expected %d\n", prefix, rank, expected);
        }
    }

    MTest_Finalize(errs);
    return MTestReturnValue(errs);
}
예제 #9
0
// Copies the Chebyshev coefficient data stored in the nodes of `tree` into the
// local buffer of the distributed Elemental matrix `Y`.
//
// Outline:
//   1. compute this rank's first global index (nstart) with MPI_Exscan,
//   2. split the nlocal local entries over the r*q grid processes, starting
//      with process pstart and stepping by r*q,
//   3. pack the entries per destination and exchange them with an all-to-all.
//
// Always returns 0; return codes of the MPI calls are not checked.
// NOTE(review): `exscan(...)` on a std::vector is a helper defined elsewhere;
// presumably it returns the exclusive prefix sum used as displacements - confirm.
int tree2elemental(InvMedTree<FMM_Mat_t> *tree, El::DistMatrix<T,El::VC,El::STAR> &Y){

	int data_dof=2;
	int SCAL_EXP = 1;

	int nlocal,gsize; // number of local elements, global size
	double *pt_array; // unused; leftover from the commented-out PETSc code below
	int r,q,rq; // grid height, grid width and their product
	int nbigs; // number of large sends (i.e. send 1 extra data point)
	int pstart; // grid process that receives the element at index nstart
	int rank = El::mpi::WorldRank(); // rank of this process
	int send_size; // base send size
	// Debug output switch; only true when rank == -1, so the print blocks
	// below are presumably meant to be disabled.
	bool print = rank == -1; 


	// Get Grid and associated params
	const El::Grid* g = &(Y.Grid());
	r = g->Height();
	q = g->Width();
	MPI_Comm comm = (g->Comm()).comm;

	std::vector<FMMNode_t*> nlist = tree->GetNGLNodes();

	int cheb_deg = InvMedTree<FMM_Mat_t>::cheb_deg;
	int omp_p=omp_get_max_threads();
	// number of coefficients per node and per component
	size_t n_coeff3=(cheb_deg+1)*(cheb_deg+2)*(cheb_deg+3)/6;

	// Get sizes (the commented-out calls are the former PETSc equivalents)
	//VecGetSize(pt_vec,&gsize);
	gsize = tree->M/data_dof;
	nlocal = tree->m/data_dof;
	//VecGetLocalSize(pt_vec,&nlocal);
	//VecGetArray(pt_vec,&pt_array);
	// nstart = number of elements on lower ranks. It is pre-set to 0 because
	// MPI_Exscan does not define the receive buffer on rank 0.
	int nstart = 0;
	MPI_Exscan(&nlocal,&nstart,1,MPI_INT,MPI_SUM,comm);
	//VecGetOwnershipRange(pt_vec,&nstart,NULL);

	//Find processor that nstart belongs to, number of larger sends
	rq = r * q;
	pstart = nstart % rq; // remainder of nstart modulo the grid size
	nbigs = nlocal % rq;
	send_size = nlocal/rq;
	
	if(print){
		std::cout << "r: " << r << " q: " << q <<std::endl;
		std::cout << "nstart: " << nstart << std::endl;
		std::cout << "ps: " << pstart << std::endl;
		std::cout << "nbigs: " << nbigs << std::endl;
		std::cout << "send_size: " << send_size << std::endl;
	}

	// Make send_lengths: every process gets send_size elements, and the nbigs
	// processes starting at pstart (wrapping around) get one more.
	std::vector<int> send_lengths(rq);
	std::fill(send_lengths.begin(),send_lengths.end(),send_size);
	if(nbigs >0){
		for(int j=0;j<nbigs;j++){
			send_lengths[(pstart + j) % rq] += 1;
		}
	}

	// Make send_disps
	std::vector<int> send_disps = exscan(send_lengths);

	std::vector<El::Complex<double>> indata(nlocal);
	// Copy the data from the FMM tree into a local vector of complex values for
	// sending. The work is split into omp_p chunks, but no OpenMP pragma is
	// active here, so the chunks run one after another.
	for(size_t tid=0;tid<omp_p;tid++){
		size_t i_start=(nlist.size()* tid   )/omp_p;
		size_t i_end  =(nlist.size()*(tid+1))/omp_p;
		for(size_t i=i_start;i<i_end;i++){
			pvfmm::Vector<double>& coeff_vec=nlist[i]->ChebData();
			// scaling factor that depends on the depth of the node
			double s=std::pow(0.5,COORD_DIM*nlist[i]->Depth()*0.5*SCAL_EXP);

			size_t offset=i*n_coeff3;
			for(size_t j=0;j<n_coeff3;j++){
				// the first n_coeff3 values are used as real parts, the next
				// n_coeff3 values as imaginary parts
				double real = coeff_vec[j]*s; // local indices as in the pvfmm trees
				double imag = coeff_vec[j+n_coeff3]*s;
				El::Complex<double> coeff;
				El::SetRealPart(coeff,real);
				El::SetImagPart(coeff,imag);

				indata[offset+j] = coeff;
			}
		}
	}


	// Make send_data: destination `proc` receives the local elements
	// base_idx, base_idx + rq, base_idx + 2*rq, ...
	std::vector<El::Complex<double>> send_data(nlocal);
	for(int proc=0;proc<rq;proc++){
		int offset = send_disps[proc];
		int base_idx = (proc - pstart + rq) % rq; 
		for(int j=0; j<send_lengths[proc]; j++){
			int idx = base_idx + (j * rq);
			send_data[offset + j] = indata[idx];
		}
	}

	// Do all2all to get recv_lengths
	std::vector<int> recv_lengths(rq);
	MPI_Alltoall(&send_lengths[0], 1, MPI_INT, &recv_lengths[0], 1, MPI_INT,comm);

	// Scan to get recv_disps
	std::vector<int> recv_disps = exscan(recv_lengths);

	// Do all2allv to get data on correct processor; the data is received
	// directly into the local buffer of Y.
	El::Complex<double> * recv_data = Y.Buffer();
	//MPI_Alltoallv(&send_data[0],&send_lengths[0],&send_disps[0],MPI_DOUBLE, \
	//		&recv_data[0],&recv_lengths[0],&recv_disps[0],MPI_DOUBLE,comm);
	El::mpi::AllToAll(&send_data[0], &send_lengths[0], &send_disps[0], recv_data,&recv_lengths[0],&recv_disps[0],comm);

	if(print){
		std::cout << "Send data: " <<std::endl << send_data <<std::endl;
		std::cout << "Send lengths: " <<std::endl << send_lengths <<std::endl;
		std::cout << "Send disps: " <<std::endl << send_disps <<std::endl;
		std::cout << "Recv data: " <<std::endl << recv_data <<std::endl;
		std::cout << "Recv lengths: " <<std::endl << recv_lengths <<std::endl;
		std::cout << "Recv disps: " <<std::endl << recv_disps <<std::endl;
	}

	return 0;
}
예제 #10
0
/*
 * MPI_Exscan self-test: for a range of communicators and element counts,
 * checks the regular call, the MPI_IN_PLACE variant and (for MPI >= 2.2)
 * that passing the same buffer as send and receive buffer is rejected.
 */
int main( int argc, char *argv[] )
{
    int errs = 0;
    int rank, size;
    int minsize = 2, count;
    int *sendbuf, *recvbuf, i;
    MPI_Comm comm;

    MTest_Init( &argc, &argv );

    /* Iterate over the communicators handed out by the test harness; null
       communicators are skipped. */
    while (MTestGetIntracommGeneral( &comm, minsize, 1 )) {
        if (comm == MPI_COMM_NULL)
            continue;

        MPI_Comm_rank( comm, &rank );
        MPI_Comm_size( comm, &size );

        /* Element counts 1, 2, 4, ... below 65000 */
        for (count = 1; count < 65000; count *= 2) {

            sendbuf = (int *)malloc( count * sizeof(int) );
            recvbuf = (int *)malloc( count * sizeof(int) );

            for (i = 0; i < count; i++) {
                sendbuf[i] = rank + i * size;
                recvbuf[i] = -1;
            }

            MPI_Exscan( sendbuf, recvbuf, count, MPI_INT, MPI_SUM, comm );

            /* Rank 0 has no data, so only the other ranks verify the result */
            if (rank > 0) {
                for (i = 0; i < count; i++) {
                    const int expected = rank * i * size + ((rank) * (rank-1))/2;
                    if (recvbuf[i] == expected)
                        continue;
                    errs++;
                    if (errs < 10) {
                        fprintf( stderr, "Error in recvbuf[%d] = %d on %d, expected %d\n",
                                 i, recvbuf[i], rank, expected );
                    }
                }
            }

#if MTEST_HAVE_MIN_MPI_VERSION(2,2)
            /* Same scan again, this time with MPI_IN_PLACE: the input lives
               in recvbuf and sendbuf is not used */
            for (i = 0; i < count; i++) {
                sendbuf[i] = -1;
                recvbuf[i] = rank + i * size;
            }

            MPI_Exscan( MPI_IN_PLACE, recvbuf, count, MPI_INT, MPI_SUM, comm );

            /* Rank 0 must still hold its input; every other rank must hold
               the sum over the lower ranks */
            for (i = 0; i < count; i++) {
                const int expected = (rank == 0)
                    ? rank + i * size
                    : rank * i * size + ((rank) * (rank-1))/2;
                if (recvbuf[i] == expected)
                    continue;
                errs++;
                if (errs < 10) {
                    fprintf( stderr, "Error in recvbuf[%d] = %d on %d, expected %d\n",
                             i, recvbuf[i], rank, expected );
                }
            }

            /* Aliased send/receive buffers must be reported as an error;
               a successful call counts as a test failure */
            MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
            if (MPI_Exscan( recvbuf, recvbuf, count, MPI_INT, MPI_SUM, comm ) == MPI_SUCCESS)
                errs++;
#endif

            free( sendbuf );
            free( recvbuf );
        }
        MTestFreeComm( &comm );
    }

    MTest_Finalize( errs );
    MPI_Finalize();
    return 0;
}
예제 #11
0
파일: pvo_vtu_file.c 프로젝트: kraused/pvo
/// Write a vtu file
///
/// Writes the XML header of a VTK unstructured grid file followed by the raw
/// "AppendedData" section: node variables, cell variables, point coordinates,
/// connectivity, cell offsets and cell types, in that order.
///
/// Global sizes are obtained with MPI_Allreduce and the per-process base
/// indices with MPI_Exscan over f->island->comm. The connectivity (cja) and
/// offset (cia) arrays are shifted to island-wide indices for the write and
/// restored afterwards.
///
/// @param self  file handle; also accessed as pvo_vtu_file_t
/// @param f     XML file to write to; f->island must not be NULL
/// @return 0 on success, -1 if f->island is NULL or an MPI collective fails
///
/// NOTE(review): return values of the pvo_xml_file_* calls are not checked,
/// and the byte counts are computed in `int`, which limits the size of a
/// single array.
static int pvo_vtu_write_data( pvo_file_t self, pvo_xml_file_t f )
{
    int err = 0;
    pvo_vtu_file_t fh = (pvo_vtu_file_t )self;
    int ibuf[3], jbuf[3];
    size_t offset = 0;
    int gnnodes, gncells, gnnz, bnodes, bnnz;
    pvo_var_t* p;
    int nbytes;
    MPI_Datatype type;
    size_t i;

    if( NULL == f->island ) {
        /* The message now names the pointer that is actually tested. */
        PVO_ERROR( "Invalid input: NULL == f->island." );
        goto fn_fail;
    }

    /* Global number of nodes, cells and connectivity entries */
    ibuf[0] = fh->nnodes;
    ibuf[1] = fh->ncells;
    ibuf[2] = fh->cia[fh->ncells];
    if( MPI_Allreduce( MPI_IN_PLACE, ibuf, 3, MPI_INT, MPI_SUM, f->island->comm )) {
        PVO_WARN( "MPI_Allreduce failed." );
        goto fn_fail;
    }

    gnnodes = ibuf[0];
    gncells = ibuf[1];
    gnnz    = ibuf[2];


    /* Bottom index for the local process. This value can be added to the local
     * node or cell index to get unique identifiers (within the island)
     */
    ibuf[0] = fh->nnodes;
    ibuf[1] = fh->cia[fh->ncells];

    if( MPI_Exscan( ibuf, jbuf, 2, MPI_INT, MPI_SUM, f->island->comm )) {
        PVO_WARN( "MPI_Exscan failed." );
        goto fn_fail;
    }

    /* MPI_Exscan leaves the receive buffer undefined on rank 0 */
    if( 0 == f->island->rank ) {
        jbuf[0] = 0;
        jbuf[1] = 0;
    }

    bnodes = jbuf[0];
    bnnz   = jbuf[1];

    pvo_xml_file_new_group( f, "VTKFile type=\"UnstructuredGrid\" "
                               "version=\"0.1\" "
                               "byte_order=\"%s\"", self->bo_str );
    pvo_xml_file_new_group( f, "UnstructuredGrid" );
    pvo_xml_file_new_group( f, "Piece NumberOfPoints=\"%d\" NumberOfCells=\"%d\"",
                               gnnodes, gncells );

    /* Header entries. `offset` tracks the position of each array inside the
     * appended data section; the extra 4 bytes per array match the int byte
     * count that is written in front of every array further below.
     * Offsets are cast to unsigned long to match the "%lu" conversion.
     */
    pvo_xml_file_new_group( f, "PointData" );
    for( p = self->cki->vlist; p; p = p->next ) {
        if( PVO_VAR_NODEDATA != p->grp )
            continue;

        p->offset = offset;
        pvo_xml_file_write_element( f, "DataArray type=\"%s\" Name=\"%s\" NumberOfComponents=\"%d\" format=\"appended\" offset=\"%lu\"", pvo_var_type_names[p->type], p->name, p->ncomps, (unsigned long )p->offset );
        offset += pvo_var_type_sizeof[p->type]*p->ncomps*gnnodes + 4;
    }
    pvo_xml_file_end_group( f, "PointData" );

    pvo_xml_file_new_group( f, "CellData" );
    for( p = self->cki->vlist; p; p = p->next ) {
        if( PVO_VAR_CELLDATA != p->grp )
            continue;

        p->offset = offset;
        pvo_xml_file_write_element( f, "DataArray type=\"%s\" Name=\"%s\" NumberOfComponents=\"%d\" format=\"appended\" offset=\"%lu\"", pvo_var_type_names[p->type], p->name, p->ncomps, (unsigned long )p->offset );
        offset += pvo_var_type_sizeof[p->type]*p->ncomps*gncells + 4;
    }
    pvo_xml_file_end_group( f, "CellData" );

    pvo_xml_file_new_group( f, "Points" );
    pvo_xml_file_write_element( f, "DataArray type=\"Float32\" NumberOfComponents=\"3\" format=\"appended\" offset=\"%lu\"", (unsigned long )offset );
    offset += pvo_var_type_sizeof[PVO_VAR_FLOAT32]*3*gnnodes + 4;
    pvo_xml_file_end_group( f, "Points" );

    pvo_xml_file_new_group( f, "Cells" );
    pvo_xml_file_write_element( f, "DataArray type=\"Int32\" Name=\"connectivity\" format=\"appended\" offset=\"%lu\"", (unsigned long )offset );
    offset += pvo_var_type_sizeof[PVO_VAR_INT32]*gnnz + 4;

    pvo_xml_file_write_element( f, "DataArray type=\"Int32\" Name=\"offsets\" format=\"appended\" offset=\"%lu\"", (unsigned long )offset );
    offset += pvo_var_type_sizeof[PVO_VAR_INT32]*gncells + 4;

    pvo_xml_file_write_element( f, "DataArray type=\"UInt8\" Name=\"types\" format=\"appended\" offset=\"%lu\"", (unsigned long )offset );
    pvo_xml_file_end_group( f, "Cells" );

    pvo_xml_file_end_group( f, "Piece" );
    pvo_xml_file_end_group( f, "UnstructuredGrid" );

    /* Raw data section; the arrays follow in the order announced above */
    pvo_xml_file_new_group( f, "AppendedData encoding=\"raw\"" );
    pvo_xml_file_write_single( f, "_", 1, MPI_CHAR );

    /* Write node variables
     */
    for( p = self->cki->vlist; p; p = p->next ) {
        if( PVO_VAR_NODEDATA != p->grp )
            continue;

        nbytes = pvo_var_type_sizeof[p->type]*p->ncomps*gnnodes;
        pvo_xml_file_write_single ( f, &nbytes, 1, MPI_INT );

        pvo_var_type_mpi(p->type, &type);
        pvo_xml_file_write_ordered( f, (void *)p->ptr, p->ncomps*fh->nnodes, type );
    }

    /* Write cell variables
     */
    for( p = self->cki->vlist; p; p = p->next ) {
        if( PVO_VAR_CELLDATA != p->grp )
            continue;

        nbytes = pvo_var_type_sizeof[p->type]*p->ncomps*gncells;
        pvo_xml_file_write_single ( f, &nbytes, 1, MPI_INT );

        pvo_var_type_mpi(p->type, &type);
        pvo_xml_file_write_ordered( f, (void* )p->ptr, p->ncomps*fh->ncells, type );
    }

    /* Write point coordinates
     */
    nbytes = pvo_var_type_sizeof[PVO_VAR_FLOAT32]*3*gnnodes;
    pvo_xml_file_write_single ( f, &nbytes, 1, MPI_INT );

    pvo_var_type_mpi( PVO_VAR_FLOAT32, &type );
    pvo_xml_file_write_ordered( f, fh->pts, 3*fh->nnodes, type );

    /* Write connectivity: shift the local node indices by the base node
     * index for the write and undo the shift afterwards
     */
    nbytes = pvo_var_type_sizeof[PVO_VAR_INT32]*gnnz;
    pvo_xml_file_write_single ( f, &nbytes, 1, MPI_INT );

    for( i = 0; i < fh->cia[fh->ncells]; ++i )
        fh->cja[i] += bnodes;

    pvo_var_type_mpi( PVO_VAR_INT32, &type );
    pvo_xml_file_write_ordered( f, fh->cja, fh->cia[fh->ncells], type );

    for( i = 0; i < fh->cia[fh->ncells]; ++i )
        fh->cja[i] -= bnodes;

    /* Write offsets: entries 1..ncells of cia, shifted by the base
     * connectivity index and restored afterwards
     */
    nbytes = pvo_var_type_sizeof[PVO_VAR_INT32]*gncells;
    pvo_xml_file_write_single ( f, &nbytes, 1, MPI_INT );

    for( i = 1; i <= fh->ncells; ++i )
        fh->cia[i] += bnnz;

    pvo_var_type_mpi( PVO_VAR_INT32, &type );
    pvo_xml_file_write_ordered( f, fh->cia+1, fh->ncells, type );

    for( i = 1; i <= fh->ncells; ++i )
        fh->cia[i] -= bnnz;

    /* Write types
     */
    nbytes = pvo_var_type_sizeof[PVO_VAR_UINT8]*gncells;
    pvo_xml_file_write_single ( f, &nbytes, 1, MPI_INT );

    pvo_var_type_mpi( PVO_VAR_UINT8, &type );
    pvo_xml_file_write_ordered( f, fh->types, fh->ncells, type );

    pvo_xml_file_end_group( f, "AppendedData" );
    pvo_xml_file_end_group( f, "VTKFile" );

fn_exit:
    return err;
fn_fail:
    err = -1;
    goto fn_exit;
}
예제 #12
0
/* Check whether all items in buf are already in sorted order across all
 * ranks of comm.
 *
 * Each rank first checks its local items. Then every rank that holds at
 * least one item publishes its last (largest) key, and an exclusive scan
 * delivers to each rank the key of the nearest lower rank that has items;
 * that key is compared against the rank's first item.
 *
 *   buf     - local items
 *   count   - number of local items (may be 0)
 *   key     - datatype of the key at the start of each item
 *   keysat  - datatype of a whole item; its extent is the item stride
 *   cmp     - comparison operation
 *   hints   - forwarded to the local check
 *   comm    - communicator (collective call)
 *   flag    - out: 1 if the items are globally sorted, 0 otherwise
 *
 * Returns DTCMP_SUCCESS. */
int DTCMP_Is_sorted(
  const void* buf,
  int count,
  MPI_Datatype key,
  MPI_Datatype keysat,
  DTCMP_Op cmp,
  DTCMP_Flags hints,
  MPI_Comm comm,
  int* flag)
{
  int rc = DTCMP_SUCCESS;

  /* assume that items are globally sorted,
   * we'll set this to 0 if we find otherwise */
  int sorted = 1;

  /* get our rank and the number of ranks in the communicator */
  int rank, ranks;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &ranks);

  /* first, step through and check that all of our local items are in order */
  DTCMP_Is_sorted_local(buf, count, key, keysat, cmp, hints, &sorted);

  /* bail out at this point if ranks == 1 */
  if (ranks <= 1) {
    *flag = sorted;
    return DTCMP_SUCCESS;
  }

  /* get extent of keysat */
  MPI_Aint lb, extent;
  MPI_Type_get_extent(keysat, &lb, &extent);

  /* get true extent of key */
  MPI_Aint key_true_lb, key_true_extent;
  MPI_Type_get_true_extent(key, &key_true_lb, &key_true_extent);

  /* TODO: if we know that each proc has an item,
   * we could just do a single pt2pt send to the rank one higher,
   * compare, then allreduce, and thereby avoid the type/op creation
   * and scan that follows */

  /* allocate type for scan, one int to say whether key is valid,
   * and our largest key */
  size_t item_size = sizeof(int) + key_true_extent;
  char* sendbuf = dtcmp_malloc(item_size, 0, __FILE__, __LINE__);
  char* recvbuf = dtcmp_malloc(item_size, 0, __FILE__, __LINE__);

  /* copy our largest item into our send buffer,
   * set valid flag to 1 if we have a value */
  int*  valid = (int*) sendbuf;
  void* value = (void*) (sendbuf + sizeof(int));
  if (count > 0) {
    /* a single item is a value too: with "count > 1" a rank holding exactly
     * one item never published its key, so an out-of-order boundary with a
     * higher rank went undetected */
    *valid = 1;

    /* get pointer to largest element in our buffer,
     * and copy it to our send buffer */
    const void* lastitem = (const void*) ((const char*)buf + (count - 1) * extent);
    DTCMP_Memcpy(value, 1, key, lastitem, 1, key);
  } else {
    /* we dont have any items, so set valid flag to 0 */
    *valid = 0;
  }

  /* create and commit type that consists of leading int followed by key */
  MPI_Datatype validtype;
  dtcmp_type_concat2(MPI_INT, key, &validtype);

  /* create user-defined reduction operation to copy key if its valid */
  MPI_Op validop;
  MPI_Op_create(copy_key_if_valid, 0, &validop);

  /* execute scan to get key from next process to our left (that has an item) */
  MPI_Exscan(sendbuf, recvbuf, 1, validtype, validop, comm);

  /* free off our user-defined reduction op and datatype */
  MPI_Op_free(&validop);
  MPI_Type_free(&validtype);

  /* compare our smallest item to the received item;
   * recvbuf is undefined on rank 0, so it is only read on higher ranks */
  if (count > 0 && rank > 0) {
    int recvvalid = *(int*) recvbuf;
    if (recvvalid) {
      const void* recvkey = (const void*) (recvbuf + sizeof(int));
      if (dtcmp_op_eval(recvkey, buf, cmp) > 0) {
        sorted = 0;
      }
    }
  }

  /* allreduce to determine whether all items are in order */
  int all_sorted;
  MPI_Allreduce(&sorted, &all_sorted, 1, MPI_INT, MPI_LAND, comm);

  /* free the scratch space */
  dtcmp_free(&recvbuf);
  dtcmp_free(&sendbuf);

  /* set caller's output flag and return */
  *flag = all_sorted;
  return rc;
}
slint_t mpi_partition_radix2(elements_t *s, partcond2_t *pc, slint_t rhigh, slint_t rlow, slint_t rwidth, int *scounts, int *sdispls, int size, int rank, MPI_Comm comm) /* sl_proto, sl_func mpi_partition_radix2 */
{
  slkey_pure_t max_nclasses;
  slkey_pure_t nclasses, bit_mask;
  slkey_pure_t k;

  const slint_t max_nareas = size - 1;
  slint_t nareas, nareas_new;
  elements_t areas0[max_nareas], areas1[max_nareas], *areas, *areas_new;

  double *locals, *globals;
  double *local_counts, *local_weights, *global_counts, *global_weights;

  const slint_t max_nparts = size - 1;
  slint_t parts_low, parts_high, nparts_removed;
  slint_t parts[max_nparts], part_areas[max_nparts];

  double parts_range_[2 * 2 * (1 + max_nparts + 1)];
  double *parts_range = parts_range_ + (2 * 2);
  double parts_minmax_[2 * 4 * (1 + max_nparts + 1)];
  double *parts_minmax = parts_minmax_ + (2 * 4);
  slint_t parts_update_[1 + max_nparts + 1];
  slint_t *parts_update = parts_update_ + 1;

  double parts_minmax_new[2 * 4];
  double current_minmax[2 * 2];
  
  double final_locals[2 * max_nparts];

  slint_t i, j, jp1, jm1, l, lp1, lm1;
  slint_t current_width;

  double minmax[2 * 4 * size];
  
  slint_t last_new_area, last_new_class;

#ifdef HAVENT_MPI_IN_PLACE
  double local_minmax[2 * 4];
#endif

  slint_t lc, lcs, gc, gcs;
  double lw, gw, lws, gws;
  double d, m;

  elements_t xi, end;

  slint_t round = 0;
  slint_t direction = 1;

  slint_t refine, finalize;

#ifdef RCOUNTS_RDISPLS
  int *rcounts, *rdispls;
#endif

#ifdef WEIGHT_STATS
  slint_t total_count = 0, partial_counts[size + 1];
  double total_weight = 0.0, partial_weights[size + 1];
  double vmin, vmax;
# ifdef HAVENT_MPI_IN_PLACE
  slint_t partial_counts2[size + 1];
  double partial_weights2[size + 1];
# endif
#endif

  rti_treset(rti_tid_mpi_partition_radix2_while);                   /* sl_tid */
  rti_treset(rti_tid_mpi_partition_radix2_while_count);             /* sl_tid */
  rti_treset(rti_tid_mpi_partition_radix2_while_allreduce);         /* sl_tid */
  rti_treset(rti_tid_mpi_partition_radix2_while_round1);            /* sl_tid */
  rti_treset(rti_tid_mpi_partition_radix2_while_round1_allgather);  /* sl_tid */
  rti_treset(rti_tid_mpi_partition_radix2_while_exscan);            /* sl_tid */
  rti_treset(rti_tid_mpi_partition_radix2_while_check);             /* sl_tid */
  rti_treset(rti_tid_mpi_partition_radix2_while_check_pre);         /* sl_tid */
  rti_treset(rti_tid_mpi_partition_radix2_while_check_classes);     /* sl_tid */
  rti_treset(rti_tid_mpi_partition_radix2_while_check_final);       /* sl_tid */
  rti_treset(rti_tid_mpi_partition_radix2_while_check_post);        /* sl_tid */

  rti_tstart(rti_tid_mpi_partition_radix2_sync);
#ifdef SYNC_ON_INIT
  MPI_Barrier(comm);
#endif
  rti_tstop(rti_tid_mpi_partition_radix2_sync);

  rti_tstart(rti_tid_mpi_partition_radix2);

  if (rhigh < 0) rhigh = radix_high;
  if (rlow < 0) rlow = radix_low;
  if (rwidth < 0) rwidth = sort_radix_width_default;
  
  max_nclasses = powof2_typed(rwidth, slkey_pure_t);

  locals = sl_alloc(2 * (max_nareas * max_nclasses + max_nareas), sizeof(double));
  globals = sl_alloc(2 * (max_nareas * max_nclasses + max_nareas), sizeof(double));

  areas = areas0;
  areas_new = areas1;

  /* init the first area (all elements) */
  nareas = 1;
  elem_assign(s, &areas[0]);

  /* init all parts */
  parts_low = 0;
  parts_high = max_nparts - 1;
  for (i = parts_low; i <= parts_high; ++i)
  {
    parts[i] = i;
    part_areas[i] = 0;
  }

  /* init sdispls */
  for (i = 0; i < size; ++i) sdispls[i] = 0;

  rti_tstart(rti_tid_mpi_partition_radix2_while);

  while (parts_low <= parts_high)
  {
    ++round;

    /* setup bitmask */
    current_width = xmin(rwidth, rhigh - rlow + 1);
    rhigh -= (current_width > 0)?current_width - 1:rhigh;

    nclasses = (current_width > 0)?powof2_typed(current_width, slkey_pure_t):1;
    bit_mask = nclasses - 1;
    
    SL_TRACE_IF(DEBUG_OR_NOT, "ROUND: %" sl_int_type_fmt ", rhigh: %" sl_int_type_fmt ", current_width: %" sl_int_type_fmt ", nclasses: %" sl_key_pure_type_fmt, round, rhigh, current_width, nclasses);

    finalize = (current_width <= 0);

    if (!finalize || round == 1)
    {
      /* init counters */
      local_counts = locals;
      global_counts = globals;
      local_weights = locals + (nareas * nclasses) + nareas;
      global_weights = globals + (nareas * nclasses) + nareas;

      /* zero all counter */
      for (i = 0; i < nareas; ++i)
      for (k = 0; k < nclasses; ++k) local_counts[i * nclasses + k] = local_weights[i * nclasses + k] = 0.0;

      rti_tstart(rti_tid_mpi_partition_radix2_while_count);

      /* for every area */
      for (i = 0; i < nareas; ++i)
      {
        elem_assign_at(&areas[i], areas[i].size, &end);

        if (nclasses > 1)
        {
          /* counts and weights in every class */
          for (elem_assign(&areas[i], &xi); xi.keys < end.keys; elem_inc(&xi))
          {
            k = radix_key2class(key_purify(*xi.keys), rhigh, bit_mask);
            local_counts[i * nclasses + k] += 1;
            local_weights[i * nclasses + k] += elem_weight_one(&xi, 0);
          }

        } else
        {
          /* total counts and weights */
          local_counts[i * nclasses + 0] = areas[i].size;

          for (elem_assign(&areas[i], &xi); xi.keys < end.keys; elem_inc(&xi)) local_weights[i * nclasses + 0] += elem_weight_one(&xi, 0);
        }

        /* total counts and weights in this area */
        local_counts[nareas * nclasses + i] = areas[i].size;

        local_weights[nareas * nclasses + i] = 0.0;
        for (k = 0; k < nclasses; ++k) local_weights[nareas * nclasses + i] += local_weights[i * nclasses + k];
      }

      rti_tstop(rti_tid_mpi_partition_radix2_while_count);

      --rhigh;

      rti_tstart(rti_tid_mpi_partition_radix2_while_allreduce);

      /* create global counts and weights */
#ifdef MPI_PARTITION_RADIX_REDUCEBCAST_THRESHOLD
      if (size >= MPI_PARTITION_RADIX_REDUCEBCAST_THRESHOLD)
      {
        MPI_Reduce(locals, globals, (1 + 1) * (nareas * nclasses + nareas), MPI_DOUBLE, MPI_SUM, REDUCEBCAST_ROOT, comm);
        MPI_Bcast(globals, (1 + 1) * (nareas * nclasses + nareas), MPI_DOUBLE, REDUCEBCAST_ROOT, comm);

      } else
#endif
        MPI_Allreduce(locals, globals, (1 + 1) * (nareas * nclasses + nareas), MPI_DOUBLE, MPI_SUM, comm);

      rti_tstop(rti_tid_mpi_partition_radix2_while_allreduce);
    }

#ifdef TIMING
    SL_TRACE_IF(DEBUG_OR_NOT, "allreduce: %f, nareas: %" sl_int_type_fmt ", nclasses: %" sl_key_type_fmt ", doubles: %" sl_int_type_fmt, rti_tlast(rti_tid_mpi_partition_radix2_while_allreduce), nareas, nclasses, (1 + 1) * (nareas * nclasses + nareas));
#endif

/*    if (DEBUG_OR_NOT)
    {
      printf("%d: locals\n", rank);
      for (i = 0; i < nareas; ++i)
      {
        printf("%d: %" sl_int_type_fmt ":", rank, i);
        for (k = 0; k < nclasses; ++k) printf("  %f", local_counts[i * nclasses + k]);
        printf(" = %f\n", local_counts[nareas * nclasses + i]);
        printf("%d: %" sl_int_type_fmt ":", rank, i);
        for (k = 0; k < nclasses; ++k) printf("  %f", local_weights[i * nclasses + k]);
        printf(" = %f\n", local_weights[nareas * nclasses + i]);
      }
      printf("%d: globals\n", rank);
      for (i = 0; i < nareas; ++i)
      {
        printf("%d: %" sl_int_type_fmt ":", rank, i);
        for (k = 0; k < nclasses; ++k) printf("  %f", global_counts[i * nclasses + k]);
        printf(" = %f\n", global_counts[nareas * nclasses + i]);
        printf("%d: %" sl_int_type_fmt ":", rank, i);
        for (k = 0; k < nclasses; ++k) printf("  %f", global_weights[i * nclasses + k]);
        printf(" = %f\n", global_weights[nareas * nclasses + i]);
      }
    }*/

    /* do some initializations */
    if (round == 1)
    {
      rti_tstart(rti_tid_mpi_partition_radix2_while_round1);
    
      /* distribute min/max counts and weights */
      minmax[rank * 2 * 4 + 0 + 0] = (pc->min_count >= 0)?pc->min_count:(-pc->min_count * global_counts[nareas * nclasses + 0] / size);
      minmax[rank * 2 * 4 + 0 + 1] = (pc->max_count >= 0)?pc->max_count:(-pc->max_count * global_counts[nareas * nclasses + 0] / size);
      minmax[rank * 2 * 4 + 0 + 2] = (pc->min_cpart >= 0)?pc->min_cpart:(-pc->min_cpart * global_counts[nareas * nclasses + 0]);
      minmax[rank * 2 * 4 + 0 + 3] = (pc->max_cpart >= 0)?pc->max_cpart:(-pc->max_cpart * global_counts[nareas * nclasses + 0]);

      minmax[rank * 2 * 4 + 4 + 0] = (pc->min_weight >= 0)?pc->min_weight:(-pc->min_weight * global_weights[nareas * nclasses + 0] / size);
      minmax[rank * 2 * 4 + 4 + 1] = (pc->max_weight >= 0)?pc->max_weight:(-pc->max_weight * global_weights[nareas * nclasses + 0] / size);
      minmax[rank * 2 * 4 + 4 + 2] = (pc->min_wpart >= 0)?pc->min_wpart:(-pc->min_wpart * global_weights[nareas * nclasses + 0]);
      minmax[rank * 2 * 4 + 4 + 3] = (pc->max_wpart >= 0)?pc->max_wpart:(-pc->max_wpart * global_weights[nareas * nclasses + 0]);

      rti_tstart(rti_tid_mpi_partition_radix2_while_round1_allgather);
#ifdef HAVENT_MPI_IN_PLACE
      local_minmax[0 + 0] = minmax[rank * 2 * 4 + 0 + 0];
      local_minmax[0 + 1] = minmax[rank * 2 * 4 + 0 + 1];
      local_minmax[0 + 2] = minmax[rank * 2 * 4 + 0 + 2];
      local_minmax[0 + 3] = minmax[rank * 2 * 4 + 0 + 3];
      local_minmax[4 + 0] = minmax[rank * 2 * 4 + 4 + 0];
      local_minmax[4 + 1] = minmax[rank * 2 * 4 + 4 + 1];
      local_minmax[4 + 2] = minmax[rank * 2 * 4 + 4 + 2];
      local_minmax[4 + 3] = minmax[rank * 2 * 4 + 4 + 3];
      MPI_Allgather(local_minmax, 2 * 4, MPI_DOUBLE, minmax, 2 * 4, MPI_DOUBLE, comm);
/*      MPI_Gather(local_minmax_weights, 2 * 4, MPI_DOUBLE, minmax_weights, 2 * 4, MPI_DOUBLE, 0, comm);
      MPI_Bcast(minmax_weights, 2 * 4 * size, MPI_DOUBLE, 0, comm);*/
#else
      MPI_Allgather(MPI_IN_PLACE, 2 * 4, MPI_DOUBLE, minmax_weights, 2 * 4, MPI_DOUBLE, comm);
#endif
      rti_tstop(rti_tid_mpi_partition_radix2_while_round1_allgather);

#ifdef WEIGHT_STATS
      total_count = global_counts[nareas * nclasses + 0];
      total_weight = global_weights[nareas * nclasses + 0];
#endif

      parts_minmax[2 * 4 * (parts_low - 1) + 0 + 0] = parts_minmax[2 * 4 * (parts_low - 1) + 0 + 2] = 0;
      parts_minmax[2 * 4 * (parts_low - 1) + 0 + 1] = parts_minmax[2 * 4 * (parts_low - 1) + 0 + 3] = 0;
      parts_minmax[2 * 4 * (parts_low - 1) + 4 + 0] = parts_minmax[2 * 4 * (parts_low - 1) + 4 + 2] = 0;
      parts_minmax[2 * 4 * (parts_low - 1) + 4 + 1] = parts_minmax[2 * 4 * (parts_low - 1) + 4 + 3] = 0;

      parts_minmax[2 * 4 * (parts_high + 1) + 0 + 0] = parts_minmax[2 * 4 * (parts_high + 1) + 0 + 2] = 0;
      parts_minmax[2 * 4 * (parts_high + 1) + 0 + 1] = parts_minmax[2 * 4 * (parts_high + 1) + 0 + 3] = global_counts[nareas * nclasses + 0];
      parts_minmax[2 * 4 * (parts_high + 1) + 4 + 0] = parts_minmax[2 * 4 * (parts_high + 1) + 4 + 2] = 0;
      parts_minmax[2 * 4 * (parts_high + 1) + 4 + 1] = parts_minmax[2 * 4 * (parts_high + 1) + 4 + 3] = global_weights[nareas * nclasses + 0];

      parts_range[2 * 2 * (parts_low - 1) + 0 + 0] = parts_range[2 * 2 * (parts_high + 1) + 0 + 0] = 0.0;
      parts_range[2 * 2 * (parts_low - 1) + 0 + 1] = parts_range[2 * 2 * (parts_high + 1) + 0 + 1] = global_counts[nareas * nclasses + 0];
      parts_range[2 * 2 * (parts_low - 1) + 2 + 0] = parts_range[2 * 2 * (parts_high + 1) + 2 + 0] = 0.0;
      parts_range[2 * 2 * (parts_low - 1) + 2 + 1] = parts_range[2 * 2 * (parts_high + 1) + 2 + 1] = global_weights[nareas * nclasses + 0];

      for (i = parts_high; i >= parts_low; --i)
      {
        parts_minmax[2 * 4 * parts[i] + 0 + 1] = parts_minmax[2 * 4 * (parts[i] + 1) + 0 + 1] - minmax[2 * 4 * (parts[i] + 1) + 0 + 0];
        parts_minmax[2 * 4 * parts[i] + 0 + 3] = parts_minmax[2 * 4 * (parts[i] + 1) + 0 + 3] - minmax[2 * 4 * (parts[i] + 1) + 0 + 1];
        parts_minmax[2 * 4 * parts[i] + 4 + 1] = parts_minmax[2 * 4 * (parts[i] + 1) + 4 + 1] - minmax[2 * 4 * (parts[i] + 1) + 4 + 0];
        parts_minmax[2 * 4 * parts[i] + 4 + 3] = parts_minmax[2 * 4 * (parts[i] + 1) + 4 + 3] - minmax[2 * 4 * (parts[i] + 1) + 4 + 1];
        
        parts_minmax[2 * 4 * parts[i] + 0 + 0] = parts_minmax[2 * 4 * parts[i] + 0 + 2] = parts_minmax[2 * 4 * parts[i] + 4 + 0] = parts_minmax[2 * 4 * parts[i] + 4 + 2] = -1;

        parts_range[2 * 2 * parts[i] + 0 + 0] = 0.0;
        parts_range[2 * 2 * parts[i] + 0 + 1] = global_counts[nareas * nclasses + 0];
        parts_range[2 * 2 * parts[i] + 2 + 0] = 0.0;
        parts_range[2 * 2 * parts[i] + 2 + 1] = global_weights[nareas * nclasses + 0];
/*        SL_ASSERT(minmax[2 * 4 * (parts[i] + 1) + 0 + 2] <= minmax[2 * 4 * (parts[i] + 0) + 0 + 3]);*/
/*        SL_ASSERT(minmax[2 * 4 * (parts[i] + 1) + 4 + 2] <= minmax[2 * 4 * (parts[i] + 0) + 4 + 3]);*/

        parts_update[parts[i]] = 1;

        if (finalize)
        {
          final_locals[2 * i + 0] = local_counts[nareas * nclasses + 0];
          final_locals[2 * i + 1] = local_weights[nareas * nclasses + 0];
        }
      }

      rti_tstop(rti_tid_mpi_partition_radix2_while_round1);
    }

    if (finalize)
    {
      j = parts_high - parts_low + 1;
    
      SL_TRACE_IF(DEBUG_OR_NOT, "Exscan: finalizing %" sl_int_type_fmt " parts", j);

      rti_tstart(rti_tid_mpi_partition_radix2_while_exscan);

      MPI_Exscan(&final_locals[2 * parts_low], &locals[2 * parts_low], 2 * j, MPI_DOUBLE, MPI_SUM, comm);
      if (rank == 0) for (i = parts_low; i <= parts_high; ++i) locals[2 * i + 0] = locals[2 * i + 1] = 0;

      rti_tstop(rti_tid_mpi_partition_radix2_while_exscan);
    }

    nareas_new = 0;
    last_new_area = last_new_class = -1;

    /* check all remaining parts */

    SL_TRACE_IF(DEBUG_OR_NOT, "ROUND: %" sl_int_type_fmt ", %s", round, (direction > 0)?"forward":"backward");

    nparts_removed = 0;

    rti_tstart(rti_tid_mpi_partition_radix2_while_check);

    i = (direction > 0)?parts_low:parts_high;
    while ((direction > 0)?(i <= parts_high):(i >= parts_low))
    {
      rti_tstart(rti_tid_mpi_partition_radix2_while_check_pre);
    
      SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": PART: %" sl_int_type_fmt ",%" sl_int_type_fmt, round, i, parts[i]);

      j = 2 * 4 * parts[i];
      jp1 = 2 * 4 * (parts[i] + 1);
      jm1 = 2 * 4 * (parts[i] - 1);
      l = 2 * 2 * parts[i];
      lp1 = 2 * 2 * (parts[i] + 1);
      lm1 = 2 * 2 * (parts[i] - 1);

      if (parts_update[parts[i]])
      {
        if (direction > 0)
        {
          parts_minmax_new[0 + 0] = parts_minmax[jm1 + 0 + 0] + minmax[j + 0 + 0];
          parts_minmax_new[0 + 2] = parts_minmax[jm1 + 0 + 2] + minmax[j + 0 + 1];
          parts_minmax_new[4 + 0] = parts_minmax[jm1 + 4 + 0] + minmax[j + 4 + 0];
          parts_minmax_new[4 + 2] = parts_minmax[jm1 + 4 + 2] + minmax[j + 4 + 1];

          SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": %f + %f, %f + %f  /  %f + %f, %f + %f", i, parts[i],
            parts_minmax[jm1 + 0 + 0], minmax[j + 0 + 0],
            parts_minmax[jm1 + 0 + 2], minmax[j + 0 + 1],
            parts_minmax[jm1 + 4 + 0], minmax[j + 4 + 0],
            parts_minmax[jm1 + 4 + 2], minmax[j + 4 + 1]);

          SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": 0. parts_minmax_new: %f  %f  %f  %f  /  %f  %f  %f  %f", parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]);

          if (parts_minmax_new[0 + 0] < minmax[jp1 + 0 + 2]) parts_minmax_new[0 + 0] = minmax[jp1 + 0 + 2];
          if (parts_minmax_new[0 + 2] > minmax[j   + 0 + 3]) parts_minmax_new[0 + 2] = minmax[j   + 0 + 3];
          if (parts_minmax_new[4 + 0] < minmax[jp1 + 4 + 2]) parts_minmax_new[4 + 0] = minmax[jp1 + 4 + 2];
          if (parts_minmax_new[4 + 2] > minmax[j   + 4 + 3]) parts_minmax_new[4 + 2] = minmax[j   + 4 + 3];

          parts_minmax_new[0 + 1] = parts_minmax[j + 0 + 1];
          parts_minmax_new[0 + 3] = parts_minmax[j + 0 + 3];
          parts_minmax_new[4 + 1] = parts_minmax[j + 4 + 1];
          parts_minmax_new[4 + 3] = parts_minmax[j + 4 + 3];

        } else
        {
          parts_minmax_new[0 + 1] = parts_minmax[jp1 + 0 + 1] - minmax[jp1 + 0 + 0];
          parts_minmax_new[0 + 3] = parts_minmax[jp1 + 0 + 3] - minmax[jp1 + 0 + 1];
          parts_minmax_new[4 + 1] = parts_minmax[jp1 + 4 + 1] - minmax[jp1 + 4 + 0];
          parts_minmax_new[4 + 3] = parts_minmax[jp1 + 4 + 3] - minmax[jp1 + 4 + 1];

          SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": %f - %f, %f - %f  /  %f - %f, %f - %f", i, parts[i],
            parts_minmax[jp1 + 0 + 1], minmax[jp1 + 0 + 0],
            parts_minmax[jp1 + 0 + 3], minmax[jp1 + 0 + 1],
            parts_minmax[jp1 + 4 + 1], minmax[jp1 + 4 + 0],
            parts_minmax[jp1 + 4 + 3], minmax[jp1 + 4 + 1]);

          SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": 0. parts_minmax_new: %f  %f  %f  %f  /  %f  %f  %f  %f", parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]);

          if (parts_minmax_new[0 + 3] < minmax[jp1 + 0 + 2]) parts_minmax_new[0 + 3] = minmax[jp1 + 0 + 2];
          if (parts_minmax_new[0 + 1] > minmax[j   + 0 + 3]) parts_minmax_new[0 + 1] = minmax[j   + 0 + 3];
          if (parts_minmax_new[4 + 3] < minmax[jp1 + 4 + 2]) parts_minmax_new[4 + 3] = minmax[jp1 + 4 + 2];
          if (parts_minmax_new[4 + 1] > minmax[j   + 4 + 3]) parts_minmax_new[4 + 1] = minmax[j   + 4 + 3];

          parts_minmax_new[0 + 0] = parts_minmax[j + 0 + 0];
          parts_minmax_new[0 + 2] = parts_minmax[j + 0 + 2];
          parts_minmax_new[4 + 0] = parts_minmax[j + 4 + 0];
          parts_minmax_new[4 + 2] = parts_minmax[j + 4 + 2];
        }

        SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": 1. parts_minmax_new: %f  %f  %f  %f  /  %f  %f  %f  %f", parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]);
        SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": minmax: %f  %f  /  %f  %f", parts[i], minmax[2 * 4 * (parts[i] + 1) + 0 + 2], minmax[2 * 4 * (parts[i] + 0) + 0 + 3], minmax[2 * 4 * (parts[i] + 1) + 4 + 2], minmax[2 * 4 * (parts[i] + 0) + 4 + 3]);

        if (parts_minmax_new[0 + 0] > parts_minmax_new[0 + 1]) parts_minmax_new[0 + 0] = parts_minmax_new[0 + 1] = (parts_minmax_new[0 + 0] + parts_minmax_new[0 + 1]) / 2;
        if (parts_minmax_new[0 + 2] < parts_minmax_new[0 + 3]) parts_minmax_new[0 + 2] = parts_minmax_new[0 + 3] = (parts_minmax_new[0 + 2] + parts_minmax_new[0 + 3]) / 2;

        if (parts_minmax_new[4 + 0] > parts_minmax_new[4 + 1]) parts_minmax_new[4 + 0] = parts_minmax_new[4 + 1] = (parts_minmax_new[4 + 0] + parts_minmax_new[4 + 1]) / 2;
        if (parts_minmax_new[4 + 2] < parts_minmax_new[4 + 3]) parts_minmax_new[4 + 2] = parts_minmax_new[4 + 3] = (parts_minmax_new[4 + 2] + parts_minmax_new[4 + 3]) / 2;

      } else
      {
        parts_minmax_new[0 + 0] = parts_minmax[j + 0 + 0];
        parts_minmax_new[0 + 1] = parts_minmax[j + 0 + 1];
        parts_minmax_new[0 + 2] = parts_minmax[j + 0 + 2];
        parts_minmax_new[0 + 3] = parts_minmax[j + 0 + 3];

        parts_minmax_new[4 + 0] = parts_minmax[j + 4 + 0];
        parts_minmax_new[4 + 1] = parts_minmax[j + 4 + 1];
        parts_minmax_new[4 + 2] = parts_minmax[j + 4 + 2];
        parts_minmax_new[4 + 3] = parts_minmax[j + 4 + 3];
      }

      SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": 2. parts_minmax_new: %f  %f  %f  %f  /  %f  %f  %f  %f", i, parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]);

      current_minmax[0 + 0] = xmax(parts_minmax_new[0 + 0], parts_minmax_new[0 + 3]) - parts_range[l + 0 + 0];
      current_minmax[0 + 1] = xmin(parts_minmax_new[0 + 2], parts_minmax_new[0 + 1]) - parts_range[l + 0 + 0];

      current_minmax[2 + 0] = xmax(parts_minmax_new[4 + 0], parts_minmax_new[4 + 3]) - parts_range[l + 2 + 0];
      current_minmax[2 + 1] = xmin(parts_minmax_new[4 + 2], parts_minmax_new[4 + 1]) - parts_range[l + 2 + 0];

      SL_ASSERT(current_minmax[0 + 0] <= current_minmax[0 + 1]);
      SL_ASSERT(current_minmax[2 + 0] <= current_minmax[2 + 1]);

      rti_tstop(rti_tid_mpi_partition_radix2_while_check_pre);

      SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ": current_minmax: %f  %f / %f  %f", parts[i], current_minmax[0 + 0], current_minmax[0 + 1], current_minmax[2 + 0], current_minmax[2 + 1]);

      lcs = gcs = 0;
      lws = gws = 0;

      /* HIT is the default */
      refine = 0;

      if (!finalize)
      {
        rti_tstart(rti_tid_mpi_partition_radix2_while_check_classes);
      
        for (k = 0; k < nclasses; ++k)
        {
          lc = local_counts[part_areas[i] * nclasses + k];
          gc = global_counts[part_areas[i] * nclasses + k];
          lw = local_weights[part_areas[i] * nclasses + k];
          gw = global_weights[part_areas[i] * nclasses + k];

          current_minmax[0 + 0] -= gc;
          current_minmax[0 + 1] -= gc;

          current_minmax[2 + 0] -= gw;
          current_minmax[2 + 1] -= gw;

          SL_TRACE_IF(DEBUG_OR_NOT, "k = %" sl_key_pure_type_fmt ", current_minmax: %f  %f  / %f  %f", k, current_minmax[0], current_minmax[1], current_minmax[2], current_minmax[3]);

          /* stop and refine if max count is skipped OR min count AND max weight is skipped */
          if ((current_minmax[0 + 1] < 0) || (current_minmax[0 + 0] < 0 && current_minmax[2 + 1] < 0))
          {
            refine = 1;
            break;
          }

          lcs += lc;
          gcs += gc;
          lws += lw;
          gws += gw;

          gc = gw = 0.0;

          /* if between min/max counts */
          if (current_minmax[0 + 0] <= 0 && current_minmax[0 + 1] >= 0)
          {
            /* go to next if max count not reached AND min weight not reached */
            if (current_minmax[0 + 1] > 0 && current_minmax[2 + 0] > 0) continue;

            /* look ahead for a better stop */
            if (k + 1 < nclasses && current_minmax[0 + 1] - global_counts[part_areas[i] * nclasses + k + 1] >= 0)
            {
              /* continue if weights will improve */
              if (myabs(current_minmax[2 + 0] + current_minmax[2 + 1]) > myabs(current_minmax[2 + 0] + current_minmax[2 + 1] - 2 * global_weights[part_areas[i] * nclasses + k + 1])) continue;
            }

            /* stop */
            break;
          }
        }

        SL_ASSERT(k < nclasses);

        SL_TRACE_IF(DEBUG_OR_NOT, "%s k = %" sl_key_pure_type_fmt, (refine)?"REFINE":"HIT", k);
      
        rti_tstop(rti_tid_mpi_partition_radix2_while_check_classes);

      } else
      {
        rti_tstart(rti_tid_mpi_partition_radix2_while_check_final);

        /* middle of min/max weight */
        m = (current_minmax[2 + 0] + current_minmax[2 + 1]) / 2;

        /* min. part of weight to contribute */
        d = xmax(0, m - locals[i * 2 + 1]);

        /* contribute all? */
        if (d >= final_locals[i * 2 + 1])
        {
          lc = final_locals[i * 2 + 0];
          lw = final_locals[i * 2 + 1];

        } else
        {
          /* contribute only a part */
          lc = 0;
          lw = 0; /* not required */

          do
          {
            d -= elem_weight_one(s, sdispls[1 + parts[i]] + lc);
            ++lc;

          } while (d >= 0 && lc < final_locals[i * 2 + 0]);

          --lc;
        
          /* if unweighted, then m = middle of min/max count, d = ..., lc = d */
        }

        /* check mc against min/max count borders */
        lc = xminmax(current_minmax[0 + 0] - locals[i * 2 + 0], lc, current_minmax[0 + 1] - locals[i * 2 + 0]);

        /* check agains 0 (don't step back!) and the local contribution */
        lc = xminmax(0, lc, final_locals[i * 2 + 0]);

        /* the exact global counts/weights are unknown (set gc/gw so that parts_range is not changed) */
        gc = 0;
        gw = 0;

        lcs += lc;
        gcs += gc;
        lws += lw;
        gws += gw;
        
        gc = (parts_range[2 * 2 * parts[i] + 0 + 1] - parts_range[2 * 2 * parts[i] + 0 + 0]);
        gw = (parts_range[2 * 2 * parts[i] + 2 + 1] - parts_range[2 * 2 * parts[i] + 2 + 0]);

        rti_tstop(rti_tid_mpi_partition_radix2_while_check_final);
      }      

      rti_tstart(rti_tid_mpi_partition_radix2_while_check_post);
      
      SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": sdispls[%" sl_int_type_fmt " + 1] = %d, lcs = %" sl_int_type_fmt, i, parts[i], parts[i], sdispls[parts[i] + 1], lcs);

      sdispls[parts[i] + 1] += lcs;

      if (gcs > 0 || gws > 0)
      {
        parts_range[l + 0 + 0] += gcs;
        parts_range[l + 0 + 1] = parts_range[l + 0 + 0] + gc;
        parts_range[l + 2 + 0] += gws;
        parts_range[l + 2 + 1] = parts_range[l + 2 + 0] + gw;

        SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": 3. part_minmax_new: %f  %f  %f  %f  /  %f  %f  %f  %f", i, parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]);
        SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": parts_range: %f  %f  /  %f  %f", i, parts[i], parts_range[2 * 2 * parts[i] + 0 + 0], parts_range[2 * 2 * parts[i] + 0 + 1], parts_range[2 * 2 * parts[i] + 2 + 0], parts_range[2 * 2 * parts[i] + 2 + 1]);

        parts_minmax_new[0 + 0] = xminmax(parts_range[l + 0 + 0], parts_minmax_new[0 + 0], parts_range[l + 0 + 1]);
        parts_minmax_new[0 + 2] = xminmax(parts_range[l + 0 + 0], parts_minmax_new[0 + 2], parts_range[l + 0 + 1]);
        parts_minmax_new[0 + 1] = xminmax(parts_range[l + 0 + 0], parts_minmax_new[0 + 1], parts_range[l + 0 + 1]);
        parts_minmax_new[0 + 3] = xminmax(parts_range[l + 0 + 0], parts_minmax_new[0 + 3], parts_range[l + 0 + 1]);
      
        parts_minmax_new[4 + 0] = xminmax(parts_range[l + 2 + 0], parts_minmax_new[4 + 0], parts_range[l + 2 + 1]);
        parts_minmax_new[4 + 2] = xminmax(parts_range[l + 2 + 0], parts_minmax_new[4 + 2], parts_range[l + 2 + 1]);
        parts_minmax_new[4 + 1] = xminmax(parts_range[l + 2 + 0], parts_minmax_new[4 + 1], parts_range[l + 2 + 1]);
        parts_minmax_new[4 + 3] = xminmax(parts_range[l + 2 + 0], parts_minmax_new[4 + 3], parts_range[l + 2 + 1]);
      }

      SL_TRACE_IF(DEBUG_OR_NOT, "%" sl_int_type_fmt ",%" sl_int_type_fmt ": 4. part_minmax_new: %f  %f  %f  %f  /  %f  %f  %f  %f", i, parts[i], parts_minmax_new[0 + 0], parts_minmax_new[0 + 1], parts_minmax_new[0 + 2], parts_minmax_new[0 + 3], parts_minmax_new[4 + 0], parts_minmax_new[4 + 1], parts_minmax_new[4 + 2], parts_minmax_new[4 + 3]);

      if (parts_minmax_new[0 + 0] != parts_minmax[j + 0 + 0] || parts_minmax_new[0 + 2] != parts_minmax[j + 0 + 2] || parts_minmax_new[4 + 0] != parts_minmax[j + 4 + 0] || parts_minmax_new[4 + 2] != parts_minmax[j + 4 + 2])
      {
        parts_minmax[j + 0 + 0] = parts_minmax_new[0 + 0];
        parts_minmax[j + 0 + 2] = parts_minmax_new[0 + 2];
        parts_minmax[j + 4 + 0] = parts_minmax_new[4 + 0];
        parts_minmax[j + 4 + 2] = parts_minmax_new[4 + 2];

        parts_update[parts[i] + 1] = 1;
      }

      if (parts_minmax_new[0 + 1] != parts_minmax[j + 0 + 1] || parts_minmax_new[0 + 3] != parts_minmax[j + 0 + 3] || parts_minmax_new[4 + 1] != parts_minmax[j + 4 + 1] || parts_minmax_new[4 + 3] != parts_minmax[j + 4 + 3])
      {
        parts_minmax[j + 0 + 1] = parts_minmax_new[0 + 1];
        parts_minmax[j + 0 + 3] = parts_minmax_new[0 + 3];
        parts_minmax[j + 4 + 1] = parts_minmax_new[4 + 1];
        parts_minmax[j + 4 + 3] = parts_minmax_new[4 + 3];

        parts_update[parts[i] - 1] = 1;
      }

      parts_update[parts[i]] = 0;

      /* refine or remove */
      if (refine)
      {
        /* bits left for partitioning? */
        if (rhigh >= rlow)
        {
          if (last_new_area == part_areas[i] && last_new_class == k) part_areas[i] = nareas_new - 1;
          else
          {
            /* update last_new_... */
            last_new_area = part_areas[i];
            last_new_class = k;

            /* create new area */
            elem_assign_at(&areas[part_areas[i]], lcs, &areas_new[nareas_new]);
            areas_new[nareas_new].size = local_counts[part_areas[i] * nclasses + k];
            part_areas[i] = nareas_new;
            ++nareas_new;
          }

        } else
        {
          /* save local count/weight for the later prefix calculations */
          final_locals[2 * (i - nparts_removed * direction) + 0] = lc;
          final_locals[2 * (i - nparts_removed * direction) + 1] = lw;
        }

        parts[i - nparts_removed * direction] = parts[i];
        part_areas[i - nparts_removed * direction] = part_areas[i];

      } else ++nparts_removed;

      rti_tstop(rti_tid_mpi_partition_radix2_while_check_post);
      
      i += direction;
    }

    if (direction > 0) parts_high -= nparts_removed;
    else parts_low += nparts_removed;

    direction *= -1;

/*    SL_NOTICE_IF(DEBUG_OR_NOT, "nparts = %" sl_int_type_fmt " vs. nareas_new = %" sl_int_type_fmt, nparts, nareas_new);*/

    rti_tstop(rti_tid_mpi_partition_radix2_while_check);
    
    /* switch areas */
    nareas = nareas_new;
    if (areas == areas0)
    {
      areas = areas1;
      areas_new = areas0;
    } else
    {
      areas = areas0;
      areas_new = areas1;
    }
  }

  rti_tstop(rti_tid_mpi_partition_radix2_while);

  /* create scounts */
  for (i = 0; i < size - 1; ++i) scounts[i] = sdispls[i + 1] - sdispls[i];
  scounts[size - 1] = s->size - sdispls[size - 1];

#ifdef SCOUNTS_SDISPLS
  printf("%d: scounts", rank);
  for (i = 0, j = 0; i < size; ++i) { printf("  %d", scounts[i]); j += scounts[i]; }
  printf(" = %" sl_int_type_fmt "\n", j);
  printf("%d: sdispls", rank);
  for (i = 0; i < size; ++i) printf("  %d", sdispls[i]);
  printf("\n");
#endif

#ifdef RCOUNTS_RDISPLS
  rcounts = sl_alloc(size, sizeof(int));
  rdispls = sl_alloc(size, sizeof(int));

  MPI_Alltoall(scounts, 1, MPI_INT, rcounts, 1, MPI_INT, comm);

  rdispls[0] = 0;
  for (i = 1; i < size; ++i) rdispls[i] = rdispls[i - 1] + rcounts[i - 1];

  printf("%d: rcounts", rank);
  for (i = 0; i < size; ++i) printf("  %d", rcounts[i]);
  printf("\n");
  printf("%d: rdispls", rank);
  for (i = 0; i < size; ++i) printf("  %d", rdispls[i]);
  printf("\n");

  sl_free(rcounts);
  sl_free(rdispls);
#endif

  sl_free(locals);
  sl_free(globals);

#ifdef WEIGHT_STATS
  partial_counts[size] = 0;
  partial_weights[size] = 0.0;
  for (i = 0; i < size; ++i)
  {
    partial_counts[i] = scounts[i];
    partial_weights[i] = 0.0;
    for (j = sdispls[i]; j < sdispls[i] + scounts[i]; ++j) partial_weights[i] += elem_weight_one(s, j);
    
    partial_counts[size] += partial_counts[i];
    partial_weights[size] += partial_weights[i];
  }

#ifdef HAVENT_MPI_IN_PLACE
  MPI_Reduce(partial_counts, partial_counts2, size + 1, int_mpi_datatype, MPI_SUM, 0, comm);
  MPI_Reduce(partial_weights, partial_weights2, size + 1, MPI_DOUBLE, MPI_SUM, 0, comm);
# define partial_counts   partial_counts2
# define partial_weights  partial_weights2
#else
  /* recvbuf requires workaround for an in-place/aliased-buffer-check-bug in mpich2 (fixed with rev 5518) */
  MPI_Reduce((rank == 0)?MPI_IN_PLACE:partial_counts, (rank == 0)?partial_counts:NULL, size + 1, int_mpi_datatype, MPI_SUM, 0, comm);
  MPI_Reduce((rank == 0)?MPI_IN_PLACE:partial_weights, (rank == 0)?partial_weights:NULL, size + 1, MPI_DOUBLE, MPI_SUM, 0, comm);
#endif

  if (rank == 0)
  {
    printf("%d: total_count: %" sl_int_type_fmt " vs. %" sl_int_type_fmt "\n", rank, total_count, partial_counts[size]);
    d = 0.0;
    vmin = 1.0;
    vmax = 0.0;
    for (i = 0; i < size; ++i)
    {
/*      printf("%d: %" sl_int_type_fmt " %" sl_int_type_fmt " / %f - %" sl_int_type_fmt " / %f\n", rank, i, partial_counts[i], (double) partial_counts[i] / partial_counts[size], (partial_counts[size] / size) - partial_counts[i], fabs(1.0 - ((double) partial_counts[i] * size / partial_counts[size])));*/
      d += fabs((partial_counts[size] / size) - partial_counts[i]);
      if (fabs(1.0 - ((double) partial_counts[i] * size / partial_counts[size])) < vmin) vmin = fabs(1.0 - ((double) partial_counts[i] * size / partial_counts[size]));
      if (fabs(1.0 - ((double) partial_counts[i] * size / partial_counts[size])) > vmax) vmax = fabs(1.0 - ((double) partial_counts[i] * size / partial_counts[size]));
    }
    printf("%d: min/max: %f / %f\n", rank, vmin, vmax);
    printf("%d: average_count: %" sl_int_type_fmt " - %f / %f\n", rank, partial_counts[size] / size, d / size, d / partial_counts[size]);

    printf("%d: total_weight: %f vs. %f\n", rank, total_weight, partial_weights[size]);
    d = 0.0;
    vmin = 1.0;
    vmax = 0.0;
    for (i = 0; i < size; ++i)
    {
/*      printf("%d: %" sl_int_type_fmt " %f / %f - %f / %f\n", rank, i, partial_weights[i], partial_weights[i] / partial_weights[size], (partial_weights[size] / size) - partial_weights[i], fabs(1.0 - (partial_weights[i] * size / partial_weights[size])));*/
      d += fabs((partial_weights[size] / size) - partial_weights[i]);
      if (fabs(1.0 - (partial_weights[i] * size / partial_weights[size])) < vmin) vmin = fabs(1.0 - (partial_weights[i] * size / partial_weights[size]));
      if (fabs(1.0 - (partial_weights[i] * size / partial_weights[size])) > vmax) vmax = fabs(1.0 - (partial_weights[i] * size / partial_weights[size]));
    }
    printf("%d: min/max: %f / %f\n", rank, vmin, vmax);
    printf("%d: average_weight: %f - %f / %f\n", rank, partial_weights[size] / size, d / size, d / partial_weights[size]);
  }
#endif

  rti_tstop(rti_tid_mpi_partition_radix2);

#if defined(TIMING_STATS) && defined(SL_USE_RTI_TIM)
  if (rank == 0)
  {
    printf("%d: mpi_partition_radix: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2));
    printf("%d: mpi_partition_radix: sync: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_sync));
    printf("%d: mpi_partition_radix: while: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while));
    printf("%d: mpi_partition_radix:   count: %f\n", rank, rti_tcumu(rti_tid_mpi_partition_radix2_while_count));
    printf("%d: mpi_partition_radix:   allreduce: %f\n", rank, rti_tcumu(rti_tid_mpi_partition_radix2_while_allreduce));
    printf("%d: mpi_partition_radix:   round1: %f\n", rank, rti_tcumu(rti_tid_mpi_partition_radix2_while_round1));
    printf("%d: mpi_partition_radix:     allgather: %f\n", rank, rti_tcumu(rti_tid_mpi_partition_radix2_while_round1_allgather));
    printf("%d: mpi_partition_radix:   exscan: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while_exscan));
    printf("%d: mpi_partition_radix:   check: %f\n", rank, rti_tcumu(rti_tid_mpi_partition_radix2_while_check));
    printf("%d: mpi_partition_radix:     pre: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while_check_pre));
    printf("%d: mpi_partition_radix:     classes: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while_check_classes));
    printf("%d: mpi_partition_radix:     final: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while_check_final));
    printf("%d: mpi_partition_radix:     post: %f\n", rank, rti_tlast(rti_tid_mpi_partition_radix2_while_check_post));
  }
#endif

  return 0;
}
예제 #14
0
/* Recompute the global summary fields of a file list.
 *
 * Collective over MPI_COMM_WORLD: every rank must call this, since it
 * issues MPI_Allreduce / MPI_Exscan in a fixed order.  On return the list
 * holds the global file count, this rank's global offset, the longest
 * file-name length (including the trailing NUL) and the global min/max
 * depth.  If no rank holds any item, the fields are left at zero. */
static void list_compute_summary(flist_t* flist)
{
    /* reset all summary fields so that the early return below leaves zeros */
    flist->max_file_name  = 0;
    flist->max_user_name  = 0;
    flist->max_group_name = 0;
    flist->min_depth      = 0;
    flist->max_depth      = 0;
    flist->total_files    = 0;
    flist->offset         = 0;

    /* look up our position in, and the size of, comm_world */
    int rank, ranks;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &ranks);

    /* add up the item counts of all ranks */
    uint64_t local_items = flist->list_count;
    uint64_t global_items;
    MPI_Allreduce(&local_items, &global_items, 1, MPI_UINT64_T, MPI_SUM, MPI_COMM_WORLD);
    flist->total_files = global_items;

    /* no items anywhere: nothing more to summarize */
    if (global_items == 0) {
        return;
    }

    /* exclusive prefix sum of the counts gives the global index of our
     * first item; the result is not defined on rank 0, so use 0 there */
    uint64_t prefix;
    MPI_Exscan(&local_items, &prefix, 1, MPI_UINT64_T, MPI_SUM, MPI_COMM_WORLD);
    flist->offset = (rank == 0) ? 0 : prefix;

    /* scan the local items; -1 marks "no depth seen yet" */
    int local_min = -1;
    int local_max = -1;
    uint64_t longest = 0;
    for (elem_t* item = flist->list_head; item != NULL; item = item->next) {
        /* name length counts the terminating NUL */
        uint64_t name_len = (uint64_t)(strlen(item->file) + 1);
        if (longest < name_len) {
            longest = name_len;
        }

        int d = item->depth;
        if (local_min == -1 || d < local_min) {
            local_min = d;
        }
        if (local_max == -1 || d > local_max) {
            local_max = d;
        }
    }

    /* reduce the maximums across all ranks */
    int world_max_depth;
    MPI_Allreduce(&local_max, &world_max_depth, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);

    uint64_t world_max_name;
    MPI_Allreduce(&longest, &world_max_name, 1, MPI_UINT64_T, MPI_MAX, MPI_COMM_WORLD);

    /* a rank without items still carries the -1 placeholder, which would
     * win the MIN reduction; substitute the global max so that its
     * contribution can never undercut the true global minimum */
    int world_min_depth;
    if (local_items == 0) {
        local_min = world_max_depth;
    }
    MPI_Allreduce(&local_min, &world_min_depth, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);

    /* publish the reduced values */
    flist->max_file_name = world_max_name;
    flist->min_depth     = world_min_depth;
    flist->max_depth     = world_max_depth;

    /* user/group summaries are only filled in for detailed lists */
    if (flist->detail) {
        flist->total_users    = flist->users.count;
        flist->total_groups   = flist->groups.count;
        flist->max_user_name  = flist->users.chars;
        flist->max_group_name = flist->groups.chars;
    }
}
예제 #15
0
/**
 * Redistribute the entries of an Elemental complex vector into a locally
 * owned array of doubles, stored as interleaved (real, imag) pairs.
 *
 * Each rank requests vec.size()/2 consecutive global entries, starting at
 * the exclusive prefix sum of those counts over the grid communicator.
 * Global entry k is fetched from grid process (k % (r*q)), where r x q is
 * the process grid of Y; the received pieces are then interleaved back into
 * global order before being written to vec.
 *
 * Collective: every process of Y's grid communicator must call this.
 * Presumably the communicator holds exactly r*q processes, since the
 * all-to-all buffers are sized r*q -- confirm against the grid setup.
 *
 * @param Y   distributed source vector; only its local buffer is read.
 * @param vec destination; its size on entry fixes how many entries this rank
 *            receives (size/2 complex values). Overwritten in place.
 * @return always 0.
 */
int elemental2vec(const El::DistMatrix<El::Complex<double>,El::VC,El::STAR> &Y, std::vector<double> &vec){

	// NOTE(review): the parameter type is [VC,STAR], yet this checks
	// colDist == STAR and rowDist == VC. Left untouched; confirm against
	// Elemental's DistData semantics which pairing is intended.
	assert((Y.DistData().colDist == El::STAR) and (Y.DistData().rowDist == El::VC));

	const int data_dof = 2; // doubles stored per complex entry (real, imag)

	const int rank = El::mpi::WorldRank();
	const bool print = (rank == -1); // debug output switch; compares against rank -1

	// Process grid of the Elemental vector and the communicator it lives on
	const El::Grid* g = &(Y.Grid());
	const int r = g->Height();
	const int q = g->Width();
	const int rq = r * q;
	MPI_Comm comm = (g->Comm()).comm;

	// Number of complex entries this rank wants to end up with
	int nlocal = static_cast<int>(vec.size() / data_dof);
	if(print) std::cout << "m: " << std::endl;

	// Global index of our first entry = exclusive prefix sum of nlocal.
	// MPI_Exscan leaves the receive buffer undefined on rank 0 of the
	// communicator, so the result is reset explicitly there instead of
	// relying on the initial value surviving the call.
	int nstart = 0;
	MPI_Exscan(&nlocal,&nstart,1,MPI_INT,MPI_SUM,comm);
	int comm_rank = 0;
	MPI_Comm_rank(comm, &comm_rank);
	if(comm_rank == 0) nstart = 0;

	// Determine who owns the first element we want
	const int pstart = nstart % rq;     // grid process holding entry nstart
	const int nbigs = nlocal % rq;      // processes that send one extra entry
	const int recv_size = nlocal / rq;  // base number of entries per process

	if(print){
		std::cout << "r: " << r << " q: " << q <<std::endl;
		std::cout << "nstart: " << nstart << std::endl;
		std::cout << "ps: " << pstart << std::endl;
		std::cout << "nbigs: " << nbigs << std::endl;
		std::cout << "recv_size: " << recv_size << std::endl;
	}

	// Make recv sizes: everyone sends recv_size entries, and the nbigs
	// processes starting at pstart (cyclically) send one more
	std::vector<int> recv_lengths(rq, recv_size);
	for(int i=0;i<nbigs;i++){
		recv_lengths[(pstart + i) % rq] += 1;
	}

	// Make recv disps
	std::vector<int> recv_disps = exscan(recv_lengths);

	// All2all to get send sizes
	std::vector<int> send_lengths(rq);
	MPI_Alltoall(recv_lengths.data(), 1, MPI_INT, send_lengths.data(), 1, MPI_INT,comm);

	// Scan to get send_disps
	std::vector<int> send_disps = exscan(send_lengths);

	// Do all2allv to get data on correct processor.
	// data() is used instead of &v[0] so that nlocal == 0 does not index
	// into an empty vector.
	std::vector<El::Complex<double>> recv_data(nlocal);
	std::vector<El::Complex<double>> recv_data_ordered(nlocal);
	/* Plain-MPI variant kept for reference:
	 * MPI_Alltoallv(el_vec.Buffer(),&send_lengths[0],&send_disps[0],MPI_DOUBLE,
	 *               &recv_data[0],&recv_lengths[0],&recv_disps[0],MPI_DOUBLE,comm);
	 */
	El::mpi::AllToAll(Y.LockedBuffer(), send_lengths.data(), send_disps.data(), recv_data.data(), recv_lengths.data(), recv_disps.data(), comm);

	if(print){
		//std::cout << "Send data: " <<std::endl << *el_vec.Buffer() <<std::endl;
		std::cout << "Send lengths: " <<std::endl << send_lengths <<std::endl;
		std::cout << "Send disps: " <<std::endl << send_disps <<std::endl;
		std::cout << "Recv data: " <<std::endl << recv_data <<std::endl;
		std::cout << "Recv lengths: " <<std::endl << recv_lengths <<std::endl;
		std::cout << "Recv disps: " <<std::endl << recv_disps <<std::endl;
	}

	// Reorder the data so that it is in the right order for the fmm tree:
	// the i-th entry received from process p belongs at local position
	// ((p - pstart) mod rq) + rq*i.
	for(int p=0;p<rq;p++){
		const int base_idx = (p - pstart + rq) % rq;
		const int offset = recv_disps[p];
		for(int i=0;i<recv_lengths[p];i++){
			recv_data_ordered[base_idx + rq*i] = recv_data[offset + i];
		}
	}

	// loop through and put the data into the vector
	#pragma omp parallel for
	for(int i=0;i<nlocal; i++){
		vec[data_dof*i] = El::RealPart(recv_data_ordered[i]);
		vec[data_dof*i+1] = El::ImagPart(recv_data_ordered[i]);
	}

	if(print){std::cout <<"here?"<<std::endl;}

	return 0;

}
예제 #16
0
파일: io.c 프로젝트: gcasey/cosmotools
/*
  Writes the local blocks to a single output file in pnetcdf format.

  nblocks: local number of blocks
  vblocks: pointer to array of vblocks
  out_file: output file name
  comm: MPI communicator

  The whole body is compiled only when USEPNETCDF is defined; otherwise the
  function does nothing.

  ERR is a macro defined outside this function; presumably it inspects `err`
  after each pnetcdf call -- confirm against its definition.

  NOTE(review): every write below uses the collective `_all` interface and is
  issued once per local block, so this appears to require that all ranks in
  `comm` own the same number of blocks -- confirm against the callers.
*/
void pnetcdf_write(int nblocks, struct vblock_t *vblocks,
       char *out_file, MPI_Comm comm) {

#ifdef USEPNETCDF
  int err;
  int rank;
  /* varids[10] is never defined or written in this function */
  int ncid, cmode, varids[23], dimids[8], dimids_2D[2];
  MPI_Offset start[2], count[2];

  MPI_Offset proc_quants[NUM_QUANTS]; /* quantities per process */
  MPI_Offset tot_quants[NUM_QUANTS]; /* total quantities all global blocks */
  MPI_Offset block_ofsts[NUM_QUANTS]; /* starting offsets for each block */

  /* init */
  int i;
  for (i = 0; i < NUM_QUANTS; i++) {
    proc_quants[i] = 0;
    tot_quants[i] = 0;
    block_ofsts[i] = 0;
  }

  /* sum quantities over local blocks */
  int b;
  for (b = 0; b < nblocks; b++) {
    proc_quants[NUM_VERTS] += vblocks[b].num_verts;
    proc_quants[NUM_COMP_CELLS] += vblocks[b].num_complete_cells;
    proc_quants[NUM_CELL_FACES] += vblocks[b].tot_num_cell_faces;
    proc_quants[NUM_FACE_VERTS] += vblocks[b].tot_num_face_verts;
    proc_quants[NUM_ORIG_PARTS] += vblocks[b].num_orig_particles;
    proc_quants[NUM_NEIGHBORS] += DIY_Num_neighbors(0, b);
  }
  proc_quants[NUM_BLOCKS] = nblocks;

  /* sum per process values to be global ones */
  MPI_Allreduce(proc_quants, tot_quants, NUM_QUANTS, MPI_OFFSET, MPI_SUM, comm);

  /* prefix sum proc offsets */
  MPI_Exscan(proc_quants, block_ofsts, NUM_QUANTS, MPI_OFFSET, MPI_SUM, comm);

  /* MPI leaves the MPI_Exscan receive buffer undefined on rank 0, so make the
     zero starting offsets of the first process explicit */
  MPI_Comm_rank(comm, &rank);
  if (rank == 0) {
    for (i = 0; i < NUM_QUANTS; i++)
      block_ofsts[i] = 0;
  }

  /* create a new file for writing */
  cmode = NC_CLOBBER | NC_64BIT_DATA;
  err = ncmpi_create(comm, out_file, cmode, MPI_INFO_NULL, &ncid); ERR;

  /* define dimensions (one per global total, plus the fixed XYZ = 3) */
  err = ncmpi_def_dim(ncid, "num_g_blocks", tot_quants[NUM_BLOCKS],
          &dimids[0]); ERR;
  err = ncmpi_def_dim(ncid, "XYZ", 3, &dimids[1]); ERR;
  err = ncmpi_def_dim(ncid, "num_g_verts", tot_quants[NUM_VERTS],
          &dimids[2]); ERR;
  err = ncmpi_def_dim(ncid, "num_g_complete_cells", tot_quants[NUM_COMP_CELLS],
          &dimids[3]); ERR;
  err = ncmpi_def_dim(ncid, "tot_num_g_cell_faces", tot_quants[NUM_CELL_FACES],
          &dimids[4]); ERR;
  err = ncmpi_def_dim(ncid, "tot_num_g_face_verts", tot_quants[NUM_FACE_VERTS],
          &dimids[5]); ERR;
  err = ncmpi_def_dim(ncid, "num_g_orig_particles", tot_quants[NUM_ORIG_PARTS],
          &dimids[6]); ERR;
  err = ncmpi_def_dim(ncid, "num_g_neighbors", tot_quants[NUM_NEIGHBORS],
          &dimids[7]); ERR;

  /* define variables: per-block counts */
  err = ncmpi_def_var(ncid, "num_verts", NC_INT, 1, &dimids[0],
          &varids[0]); ERR;
  err = ncmpi_def_var(ncid, "num_complete_cells", NC_INT, 1, &dimids[0],
          &varids[1]); ERR;
  err = ncmpi_def_var(ncid, "tot_num_cell_faces", NC_INT, 1, &dimids[0],
          &varids[2]); ERR;
  err = ncmpi_def_var(ncid, "tot_num_face_verts", NC_INT, 1, &dimids[0],
          &varids[3]); ERR;
  err = ncmpi_def_var(ncid, "num_orig_particles", NC_INT, 1, &dimids[0],
          &varids[4]); ERR;

  /* block offsets */
  err = ncmpi_def_var(ncid, "block_off_num_verts", NC_INT64, 1, &dimids[0],
          &varids[5]); ERR;
  err = ncmpi_def_var(ncid, "block_off_num_complete_cells", NC_INT64, 1,
          &dimids[0], &varids[6]); ERR;
  err = ncmpi_def_var(ncid, "block_off_tot_num_cell_faces", NC_INT64, 1,
          &dimids[0], &varids[7]); ERR;
  err = ncmpi_def_var(ncid, "block_off_tot_num_face_verts", NC_INT64, 1,
          &dimids[0], &varids[8]); ERR;
  err = ncmpi_def_var(ncid, "block_off_num_orig_particles", NC_INT64, 1,
          &dimids[0], &varids[9]); ERR;

  /* per-block bounds: (num_g_blocks x XYZ) */
  dimids_2D[0] = dimids[0];
  dimids_2D[1] = dimids[1];
  err = ncmpi_def_var(ncid, "mins", NC_FLOAT, 2, dimids_2D, &varids[11]); ERR;
  err = ncmpi_def_var(ncid, "maxs", NC_FLOAT, 2, dimids_2D, &varids[12]); ERR;

  /* vertices: (num_g_verts x XYZ) */
  dimids_2D[0] = dimids[2];
  dimids_2D[1] = dimids[1];
  err = ncmpi_def_var(ncid, "save_verts", NC_FLOAT, 2, dimids_2D,
          &varids[13]); ERR;
  /* sites: (num_g_orig_particles x XYZ) */
  dimids_2D[0] = dimids[6];
  dimids_2D[1] = dimids[1];
  err = ncmpi_def_var(ncid, "sites", NC_FLOAT, 2, dimids_2D,
          &varids[14]); ERR;
  err = ncmpi_def_var(ncid, "complete_cells", NC_INT, 1, &dimids[3],
          &varids[15]); ERR;
  err = ncmpi_def_var(ncid, "areas", NC_FLOAT, 1, &dimids[3],
          &varids[16]); ERR;
  err = ncmpi_def_var(ncid, "vols", NC_FLOAT, 1, &dimids[3], &varids[17]); ERR;
  err = ncmpi_def_var(ncid, "num_cell_faces", NC_INT, 1, &dimids[3],
          &varids[18]); ERR;
  err = ncmpi_def_var(ncid, "num_face_verts", NC_INT, 1, &dimids[4],
          &varids[19]); ERR;
  err = ncmpi_def_var(ncid, "face_verts", NC_INT, 1, &dimids[5],
          &varids[20]); ERR;
  err = ncmpi_def_var(ncid, "neighbors", NC_INT, 1, &dimids[7],
          &varids[21]); ERR;
  err = ncmpi_def_var(ncid, "g_block_ids", NC_INT, 1, &dimids[0],
          &varids[22]); ERR;

  /* exit define mode */
  err = ncmpi_enddef(ncid); ERR;

  /* write all variables.
     to improve: we can try nonblocking I/O to aggregate small requests */

  for (b = 0; b < nblocks; b++) {

    struct vblock_t *v = &vblocks[b];

    /* quantities */
    start[0] = block_ofsts[NUM_BLOCKS];
    count[0] = 1;
    err = ncmpi_put_vara_int_all(ncid, varids[0], start, count,
         &v->num_verts); ERR;
    err = ncmpi_put_vara_int_all(ncid, varids[1], start, count,
         &v->num_complete_cells); ERR;
    err = ncmpi_put_vara_int_all(ncid, varids[2], start, count,
         &v->tot_num_cell_faces); ERR;
    err = ncmpi_put_vara_int_all(ncid, varids[3], start, count,
         &v->tot_num_face_verts); ERR;
    err = ncmpi_put_vara_int_all(ncid, varids[4], start, count,
         &v->num_orig_particles); ERR;

    /* block offsets (the running offsets of this block within each global
       array, before they are advanced at the end of the iteration) */
    err = ncmpi_put_vara_longlong_all(ncid, varids[5], start, count,
              &block_ofsts[NUM_VERTS]); ERR;
    err = ncmpi_put_vara_longlong_all(ncid, varids[6], start, count,
              &block_ofsts[NUM_COMP_CELLS]); ERR;
    err = ncmpi_put_vara_longlong_all(ncid, varids[7], start, count,
              &block_ofsts[NUM_CELL_FACES]); ERR;
    err = ncmpi_put_vara_longlong_all(ncid, varids[8], start, count,
              &block_ofsts[NUM_FACE_VERTS]); ERR;
    err = ncmpi_put_vara_longlong_all(ncid, varids[9], start, count,
              &block_ofsts[NUM_ORIG_PARTS]); ERR;

    /* block bounds */
    start[0] = block_ofsts[NUM_BLOCKS];
    count[0] = 1;
    start[1] = 0;
    count[1] = 3;
    err = ncmpi_put_vara_float_all(ncid, varids[11], start, count,
           v->mins); ERR;
    err = ncmpi_put_vara_float_all(ncid, varids[12], start, count,
           v->maxs); ERR;

    /* save_verts */
    start[0] = block_ofsts[NUM_VERTS];
    start[1] = 0;
    count[0] = v->num_verts;
    count[1] = 3;
    err = ncmpi_put_vara_float_all(ncid, varids[13], start, count,
           v->save_verts); ERR;

    /* sites */
    start[0] = block_ofsts[NUM_ORIG_PARTS];
    start[1] = 0;
    count[0] = v->num_orig_particles;
    count[1] = 3;
    err = ncmpi_put_vara_float_all(ncid, varids[14], start, count,
           v->sites); ERR;

    /* complete cells */
    start[0] = block_ofsts[NUM_COMP_CELLS];
    count[0] = v->num_complete_cells;
    err = ncmpi_put_vara_int_all(ncid, varids[15], start, count,
         v->complete_cells); ERR;

    /* areas */
    start[0] = block_ofsts[NUM_COMP_CELLS];
    count[0] = v->num_complete_cells;
    err = ncmpi_put_vara_float_all(ncid, varids[16], start, count,
           v->areas); ERR;

    /* volumes */
    start[0] = block_ofsts[NUM_COMP_CELLS];
    count[0] = v->num_complete_cells;
    err = ncmpi_put_vara_float_all(ncid, varids[17], start, count,
           v->vols); ERR;

    /* num_cell_faces */
    start[0] = block_ofsts[NUM_COMP_CELLS];
    count[0] = v->num_complete_cells;
    err = ncmpi_put_vara_int_all(ncid, varids[18], start, count,
         v->num_cell_faces); ERR;

    /* num_face_verts */
    start[0] = block_ofsts[NUM_CELL_FACES];
    count[0] = v->tot_num_cell_faces;
    err = ncmpi_put_vara_int_all(ncid, varids[19], start, count,
         v->num_face_verts); ERR;

    /* face verts */
    start[0] = block_ofsts[NUM_FACE_VERTS];
    count[0] = v->tot_num_face_verts;
    err = ncmpi_put_vara_int_all(ncid, varids[20], start, count,
         v->face_verts); ERR;

    /* neighbors: fetched into a scratch list sized by DIY_Num_neighbors */
    int *neighbors = (int*)malloc(DIY_Num_neighbors(0, b) * sizeof(int));
    int num_neighbors = DIY_Get_neighbors(0, b, neighbors);
    start[0] = block_ofsts[NUM_NEIGHBORS];
    count[0] = num_neighbors;
    err = ncmpi_put_vara_int_all(ncid, varids[21], start, count, neighbors);
    ERR;
    /* the scratch list is per block and not referenced after the put call;
       release it so it does not leak once per block */
    free(neighbors);

    /* gids */
    int gid = DIY_Gid(0, b);
    start[0] = block_ofsts[NUM_BLOCKS];
    count[0] = 1;
    err = ncmpi_put_vara_int_all(ncid, varids[22], start, count,
         &gid); ERR;

    /* update block offsets */
    block_ofsts[NUM_VERTS] += v->num_verts;
    block_ofsts[NUM_COMP_CELLS] += v->num_complete_cells;
    block_ofsts[NUM_CELL_FACES] += v->tot_num_cell_faces;
    block_ofsts[NUM_FACE_VERTS] += v->tot_num_face_verts;
    block_ofsts[NUM_ORIG_PARTS] += v->num_orig_particles;
    block_ofsts[NUM_NEIGHBORS] += num_neighbors;
    block_ofsts[NUM_BLOCKS]++;

    /* debug */
/*     fprintf(stderr, "gid = %d num_verts = %d num_complete_cells = %d " */
/* 	    "tot_num_cell_faces = %d tot_num_face_verts = %d " */
/* 	    "num_orig_particles = %d\n", */
/* 	    gid, v->num_verts, v->num_complete_cells, v->tot_num_cell_faces, */
/* 	    v->tot_num_face_verts, v->num_orig_particles); */

  }

  err = ncmpi_close(ncid); ERR;
#endif

}
예제 #17
0
/*
 * Calls a series of MPI collectives with the send buffer also passed as the
 * receive buffer and counts every call that returns MPI_SUCCESS as an error.
 * MPI_ERRORS_RETURN is installed on MPI_COMM_WORLD so that a failing call
 * returns an error code instead of aborting.
 *
 * Rank 0 prints " No errors" when its own count is zero, otherwise the number
 * of errors it found; counts from other ranks are not collected.  The process
 * always returns 0.
 */
int main(int argc, char **argv)
{
    int errs = 0;
    int i, j;
    int rank, size;
    int *sbuf = NULL;
    int *rbuf = NULL;
    int *scounts = NULL;
    int *rcounts = NULL;
    int *sdispls = NULL;
    int *rdispls = NULL;
    MPI_Datatype *types = NULL;
    MPI_Comm comm;

    /* intentionally not using MTest_Init/MTest_Finalize in order to make it
     * easy to take this test and use it as an NBC sanity test outside of the
     * MPICH test suite */
    MPI_Init(&argc, &argv);

    comm = MPI_COMM_WORLD;

    MPI_Comm_size(comm, &size);
    MPI_Comm_rank(comm, &rank);

    MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);

    /* enough space for every process to contribute at least NUM_INTS ints to any
     * collective operation */
    sbuf = malloc(NUM_INTS * size * sizeof(int));
    my_assert(sbuf);
    rbuf = malloc(NUM_INTS * size * sizeof(int));
    my_assert(rbuf);
    scounts = malloc(size * sizeof(int));
    my_assert(scounts);
    rcounts = malloc(size * sizeof(int));
    my_assert(rcounts);
    sdispls = malloc(size * sizeof(int));
    my_assert(sdispls);
    rdispls = malloc(size * sizeof(int));
    my_assert(rdispls);
    types = malloc(size * sizeof(MPI_Datatype));
    my_assert(types);

    for (i = 0; i < size; ++i) {
        /* fill all NUM_INTS entries of rank i's slot so the initialisation
         * stays in step with the allocation size, counts and displacements */
        for (j = 0; j < NUM_INTS; ++j) {
            sbuf[i * NUM_INTS + j] = i;
            rbuf[i * NUM_INTS + j] = i;
        }
        scounts[i] = NUM_INTS;
        rcounts[i] = NUM_INTS;
        sdispls[i] = i * NUM_INTS;
        rdispls[i] = i * NUM_INTS;
        types[i] = MPI_INT;
    }

    /* Rooted collectives: because of the short-circuit on `rank == 0`, only
     * rank 0 actually makes these calls; other ranks skip them. */
    if (rank == 0 && MPI_SUCCESS ==
        MPI_Gather(sbuf, NUM_INTS, MPI_INT, sbuf, NUM_INTS, MPI_INT, 0, comm))
        errs++;

    if (rank == 0 && MPI_SUCCESS ==
        MPI_Gatherv(sbuf, NUM_INTS, MPI_INT, sbuf, rcounts, rdispls, MPI_INT, 0, comm))
        errs++;

    if (rank == 0 && MPI_SUCCESS ==
        MPI_Scatter(sbuf, NUM_INTS, MPI_INT, sbuf, NUM_INTS, MPI_INT, 0, comm))
        errs++;

    if (rank == 0 && MPI_SUCCESS ==
        MPI_Scatterv(sbuf, scounts, sdispls, MPI_INT, sbuf, NUM_INTS, MPI_INT, 0, comm))
        errs++;

    /* Non-rooted collectives: every rank makes the aliased call. */
    if (MPI_SUCCESS == MPI_Allgather(&sbuf[rank], 1, MPI_INT, sbuf, 1, MPI_INT, comm))
        errs++;

    if (MPI_SUCCESS ==
        MPI_Allgatherv(&sbuf[rank * rcounts[rank]], rcounts[rank], MPI_INT, sbuf, rcounts, rdispls,
                       MPI_INT, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Alltoall(sbuf, NUM_INTS, MPI_INT, sbuf, NUM_INTS, MPI_INT, comm))
        errs++;

    if (MPI_SUCCESS ==
        MPI_Alltoallv(sbuf, scounts, sdispls, MPI_INT, sbuf, scounts, sdispls, MPI_INT, comm))
        errs++;

    if (MPI_SUCCESS ==
        MPI_Alltoallw(sbuf, scounts, sdispls, types, sbuf, scounts, sdispls, types, comm))
        errs++;

    if (rank == 0 && MPI_SUCCESS == MPI_Reduce(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, 0, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Allreduce(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Reduce_scatter(sbuf, sbuf, rcounts, MPI_INT, MPI_SUM, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Reduce_scatter_block(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Scan(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, comm))
        errs++;

    if (MPI_SUCCESS == MPI_Exscan(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, comm))
        errs++;

    /* free(NULL) is a no-op, so no null checks are needed here */
    free(sbuf);
    free(rbuf);
    free(scounts);
    free(rcounts);
    free(sdispls);
    free(rdispls);
    free(types);

    if (rank == 0) {
        if (errs)
            fprintf(stderr, "Found %d errors\n", errs);
        else
            printf(" No errors\n");
    }
    MPI_Finalize();
    return 0;
}
예제 #18
0
/// Writes a triangle mesh, distributed over the ranks of `comm`, to one
/// binary PLY file using MPI-IO.
///
/// The file consists of an ASCII header written by rank 0, followed by the
/// vertex data and then the face connectivity, both written through
/// writeToMPI (defined elsewhere in this file).
///
/// @param comm                 communicator whose ranks all take part
/// @param fname                output file name
/// @param nvertices            number of vertices held by this rank
/// @param nverticesPerObject   vertex-index stride between consecutive objects
/// @param ntriangles           number of triangles held by this rank (only
///                             used for the global total in the header)
/// @param ntrianglesPerObject  number of entries of `mesh` used per object
/// @param nObjects             number of objects held by this rank
/// @param mesh                 per-object triangle template, indexed by local
///                             vertex id within one object
/// @param vertices             vertex coordinates held by this rank
static void writePLY(
        MPI_Comm comm, std::string fname,
        int nvertices, int nverticesPerObject,
        int ntriangles, int ntrianglesPerObject,
        int nObjects,
        const std::vector<int3>& mesh,
        const std::vector<float3>& vertices)
{
    int rank;
    MPI_Check( MPI_Comm_rank(comm, &rank) );

    // Global totals are reduced to rank 0 only; they are used solely in the
    // header, which rank 0 writes.
    int totalVerts = 0;
    MPI_Check( MPI_Reduce(&nvertices, &totalVerts, 1, MPI_INT, MPI_SUM, 0, comm) );

    int totalTriangles = 0;
    MPI_Check( MPI_Reduce(&ntriangles, &totalTriangles, 1, MPI_INT, MPI_SUM, 0, comm) );

    // Open with MPI_MODE_DELETE_ON_CLOSE and close right away so that any
    // existing file of this name is removed, then open a fresh file for writing.
    MPI_File f;
    MPI_Check( MPI_File_open(comm, fname.c_str(), MPI_MODE_CREATE|MPI_MODE_DELETE_ON_CLOSE|MPI_MODE_WRONLY, MPI_INFO_NULL, &f) );
    MPI_Check( MPI_File_close(&f) );
    MPI_Check( MPI_File_open(comm, fname.c_str(), MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &f) );

    int headerSize = 0;
    MPI_Offset fileOffset = 0;

    if (rank == 0)
    {
        std::stringstream ss;

        ss <<  "ply\n";
        ss <<  "format binary_little_endian 1.0\n";
        ss <<  "element vertex " << totalVerts << "\n";
        ss <<  "property float x\nproperty float y\nproperty float z\n";
        //ss <<  "property float xnormal\nproperty float ynormal\nproperty float znormal\n";
        ss <<  "element face " << totalTriangles << "\n";
        ss <<  "property list int int vertex_index\n";
        ss <<  "end_header\n";

        std::string content = ss.str();
        headerSize = static_cast<int>(content.length());
        MPI_Check( MPI_File_write_at(f, fileOffset, content.c_str(), headerSize, MPI_CHAR, MPI_STATUS_IGNORE) );
    }

    // Every rank needs the header length to know where the binary data starts.
    MPI_Check( MPI_Bcast(&headerSize, 1, MPI_INT, 0, comm) );

    fileOffset += headerSize;

    // NOTE(review): the return value is added to the running file offset, so
    // writeToMPI presumably returns the total number of bytes written by all
    // ranks -- confirm against its definition.
    fileOffset += writeToMPI(vertices, f, fileOffset, comm);

    // Index of this rank's first vertex in the global vertex list.
    int verticesOffset = 0;
    MPI_Check( MPI_Exscan(&nvertices, &verticesOffset, 1, MPI_INT, MPI_SUM, comm));
    // MPI leaves the MPI_Exscan receive buffer undefined on rank 0, so pin
    // the first rank's offset to zero explicitly.
    if (rank == 0)
        verticesOffset = 0;

    std::vector<int4> connectivity;
    if (nObjects > 0 && ntrianglesPerObject > 0)
        connectivity.reserve(static_cast<std::vector<int4>::size_type>(nObjects) * ntrianglesPerObject);

    // Replicate the per-object triangle template for every object, shifting
    // the vertex ids by the object's position and by this rank's global offset.
    // Each face record starts with its vertex count (3).
    for(int j = 0; j < nObjects; ++j)
        for(int i = 0; i < ntrianglesPerObject; ++i)
        {
            int3 vertIds = mesh[i] + nverticesPerObject * j + verticesOffset;
            connectivity.push_back({3, vertIds.x, vertIds.y, vertIds.z});
        }

    fileOffset += writeToMPI(connectivity, f, fileOffset, comm);

    MPI_Check( MPI_File_close(&f));
}
slint_t mpi_select_exact_radix_fixed(elements_t *s, slint_t nelements, slint_t nparts, partcond_t *pconds, slint_t rhigh, slint_t rlow, slint_t rwidth, int *sdispls, int size, int rank, MPI_Comm comm) /* sl_proto, sl_func mpi_select_exact_radix_fixed */
{
  slkey_pure_t max_nclasses, nclasses, bit_mask;
  slkey_pure_t k, l;

  typedef struct {
    slint_t count_min, count_max;
    slint_t count_low, count_hig;
#ifdef elem_weight
    double weight_min, weight_max;
    double weight_low, weight_hig;
#endif
  } mmlh_t;

  mmlh_t mmlh[nparts];

  const slint_t max_nborders = nparts - 1;
  slint_t border_lo, border_hi, nborders_removed;
  slint_t borders[max_nborders], border_areas[max_nborders];

#define MIN_LE  0
#define MIN_RI  1
#define MAX_LE  2
#define MAX_RI  3

  struct {
    slint_t update;
    slint_t crange[2], cmmlr[4];
#ifdef elem_weight
    double wrange[2], wmmlr[4];
#endif
  } border_infos_[1 + max_nborders + 1], *border_infos = border_infos_ + 1, border_info_old;

  const slint_t max_nareas = max_nborders;
  slint_t nareas, nareas_new;
  elements_t areas0[max_nareas * nelements], areas1[max_nareas * nelements], *areas, *areas_new;

  slint_t *area_counts, *current_counts;
  double *local_counts, *global_counts;
#ifdef elem_weight
  double *local_weights, *global_weights, *current_weights;
#endif

  slint_t current_cmm[2];
#ifdef elem_weight
  double current_wmm[2];
#endif

  slint_t final_areas[max_nborders * nelements];
  double final_locals[NCONDS * max_nborders], *final_globals;

  slint_t current_width;
  slint_t round, direction, refine, finalize;
  slint_t last_new_area, last_new_class;

  slint_t lc, lcs, gc, gcs, lcv[nelements], lcsv[nelements];
#ifdef elem_weight
  double lw, gw, lws, gws;
  double mw, dw;
  double mcw[4];
#else
  slint_t mc, dc;
#endif

  slint_t i, j;

  elements_t xi, end;

#ifdef VERIFY
  slint_t v;
#endif


  SL_TRACE_IF(DEBUG_OR_NOT, "starting mpi_select_exact_radix");

  /* sl_tid rti_tid_mpi_select_exact_radix rti_tid_mpi_select_exact_radix_sync */

  rti_treset(rti_tid_mpi_select_exact_radix_while);                   /* sl_tid */
  rti_treset(rti_tid_mpi_select_exact_radix_while_count);             /* sl_tid */
  rti_treset(rti_tid_mpi_select_exact_radix_while_allreduce);         /* sl_tid */
  rti_treset(rti_tid_mpi_select_exact_radix_while_round1);            /* sl_tid */
  rti_treset(rti_tid_mpi_select_exact_radix_while_round1_allgather);  /* sl_tid */
  rti_treset(rti_tid_mpi_select_exact_radix_while_exscan);            /* sl_tid */
  rti_treset(rti_tid_mpi_select_exact_radix_while_check);             /* sl_tid */
  rti_treset(rti_tid_mpi_select_exact_radix_while_check_pre);         /* sl_tid */
  rti_treset(rti_tid_mpi_select_exact_radix_while_check_classes);     /* sl_tid */
  rti_treset(rti_tid_mpi_select_exact_radix_while_check_final);       /* sl_tid */
  rti_treset(rti_tid_mpi_select_exact_radix_while_check_post);        /* sl_tid */

  rti_tstart(rti_tid_mpi_select_exact_radix_sync);
#ifdef SYNC_ON_INIT
  MPI_Barrier(comm);
#endif
  rti_tstop(rti_tid_mpi_select_exact_radix_sync);

#ifdef VERIFY
  v = elements_validate_order(s, 1);
  
  SL_TRACE_IF(DEBUG_OR_NOT, "elements order: %s (%" slint_fmt ")", (v > 0)?"FAILED":"SUCCESS", v);
#endif

  rti_tstart(rti_tid_mpi_select_exact_radix);

  if (rhigh < 0) rhigh = key_radix_high;
  if (rlow < 0) rlow = key_radix_low;
  if (rwidth < 0) rwidth = sort_radix_width_default;
  
  max_nclasses = powof2_typed(rwidth, slkey_pure_t);

/*  SL_TRACE_IF(DEBUG_OR_NOT, "alloc area_counts: %" slint_fmt " * %d", max_nareas * nelements * max_nclasses, sizeof(slint_t));
  SL_TRACE_IF(DEBUG_OR_NOT, "alloc local_counts: %" slint_fmt " * %d", NCONDS * (max_nareas * max_nclasses + max_nareas), sizeof(slint_t));
  SL_TRACE_IF(DEBUG_OR_NOT, "alloc global_counts: %" slint_fmt " * %d", NCONDS * (max_nareas * max_nclasses + max_nareas), sizeof(slint_t));*/

  area_counts = sl_alloc(max_nareas * nelements * max_nclasses, sizeof(slint_t));
  local_counts = sl_alloc(NCONDS * (max_nareas * max_nclasses + max_nareas), sizeof(double));
  global_counts = sl_alloc(NCONDS * (max_nareas * max_nclasses + max_nareas), sizeof(double));

  /* init areas (first area = all elements) */
  areas = areas0;
  areas_new = areas1;

  nareas = 1;
  for (j = 0; j < nelements; ++j) elem_assign(&s[j], &areas[0 * nelements + j]);

  /* init parts */
  border_lo = 0;
  border_hi = max_nborders - 1;
  for (i = border_lo; i <= border_hi; ++i)
  {
    borders[i] = i;
    border_areas[i] = 0;
  }

  /* init sdispls */
  for (i = 0; i < nparts; ++i)
  for (j = 0; j < nelements; ++j) sdispls[i * nelements + j] = 0;

  rti_tstart(rti_tid_mpi_select_exact_radix_while);

  round = 0;
  while (border_lo <= border_hi)
  {
    ++round;

    /* setup bitmask */
    current_width = xmin(rwidth, rhigh - rlow + 1);
    rhigh -= (current_width > 0)?current_width - 1:rhigh;

    nclasses = (current_width > 0)?powof2_typed(current_width, slkey_pure_t):1;
    bit_mask = nclasses - 1;

    SL_TRACE_IF(DEBUG_OR_NOT, "ROUND: %" slint_fmt ", rhigh: %" slint_fmt ", current_width: %" slint_fmt ", nclasses: %" sl_key_pure_type_fmt, round, rhigh, current_width, nclasses);

    finalize = (current_width <= 0);

    if (!finalize || round == 1)
    {
#ifdef elem_weight
      /* init weight counters */
      local_weights = local_counts + (nareas * nclasses) + nareas;
      global_weights = global_counts + (nareas * nclasses) + nareas;
#endif

      /* zero all counter */
      for (i = 0; i < nareas; ++i)
      for (k = 0; k < nclasses; ++k) local_counts[i * nclasses + k] = 
#ifdef elem_weight
        local_weights[i * nclasses + k] = 
#endif
        0.0;

      rti_tstart(rti_tid_mpi_select_exact_radix_while_count);

      /* for every area */
      for (i = 0; i < nareas; ++i)
      {
        local_counts[nareas * nclasses + i] = 0;
#ifdef elem_weight
        local_weights[nareas * nclasses + i] = 0.0;
#endif

        /* for every list of elements */
        for (j = 0; j < nelements; ++j)
        {
          SL_TRACE_IF(DEBUG_OR_NOT, "area %" slint_fmt ",%" slint_fmt ": size = %" slint_fmt, i, j, areas[i * nelements + j].size);

          elem_assign_at(&areas[i * nelements + j], areas[i * nelements + j].size, &end);
          
          current_counts = area_counts + ((i * nelements + j) * nclasses);
#ifdef elem_weight
          current_weights = local_weights + (i * nclasses);
#endif

          for (k = 0; k < nclasses; ++k) current_counts[k] = 0;

          if (nclasses > 1)
          {
            /* counts and weights in every class */
            for (elem_assign(&areas[i * nelements + j], &xi); xi.keys < end.keys; elem_inc(&xi))
            {
              k = key_radix_key2class(key_purify(*xi.keys), rhigh, bit_mask);
              current_counts[k] += 1;
/*              SL_TRACE_IF(DEBUG_OR_NOT, "key %" sl_key_pure_type_fmt " goes to bin %"  sl_key_pure_type_fmt, key_purify(*xi.keys), k);*/
#ifdef elem_weight
              current_weights[k] += elem_weight(&xi, 0);
#endif
            }

          } else
          {
            /* total counts and weights */
            current_counts[0] = areas[i * nelements + j].size;

#ifdef elem_weight
            for (elem_assign(&areas[i * nelements + j], &xi); xi.keys < end.keys; elem_inc(&xi)) current_weights[0] += elem_weight(&xi, 0);
#endif
          }
          
          for (k = 0; k < nclasses; ++k) local_counts[i * nclasses + k] += current_counts[k];

          /* total counts and weights in this area */
          local_counts[nareas * nclasses + i] += areas[i * nelements + j].size;
#ifdef elem_weight
          for (k = 0; k < nclasses; ++k) local_weights[nareas * nclasses + i] += current_weights[k];
#endif
        }

        SL_TRACE_ARRAY_IF(DEBUG_OR_NOT, "%" slint_fmt ": counts =", " %f", k, nclasses, (&local_counts[i * nclasses]), i);
      }

      rti_tstop(rti_tid_mpi_select_exact_radix_while_count);

      --rhigh;

      SL_TRACE_IF(DEBUG_OR_NOT, "all-reducing %" slint_fmt " doubles", (slint_t) (NCONDS * (nareas * nclasses + nareas)));

      rti_tstart(rti_tid_mpi_select_exact_radix_while_allreduce);

      /* create global counts and weights */
#ifdef MPI_SELECT_EXACT_RADIX_REDUCEBCAST_THRESHOLD
      if (size >= MPI_SELECT_EXACT_RADIX_REDUCEBCAST_THRESHOLD)
      {
        MPI_Reduce(local_counts, global_counts, NCONDS * (nareas * nclasses + nareas), MPI_DOUBLE, MPI_SUM, REDUCEBCAST_ROOT, comm);
        MPI_Bcast(global_counts, NCONDS * (nareas * nclasses + nareas), MPI_DOUBLE, REDUCEBCAST_ROOT, comm);

      } else
#endif
        MPI_Allreduce(local_counts, global_counts, NCONDS * (nareas * nclasses + nareas), MPI_DOUBLE, MPI_SUM, comm);

      rti_tstop(rti_tid_mpi_select_exact_radix_while_allreduce);
    }

    /* do initializations */
    if (round == 1)
    {
      rti_tstart(rti_tid_mpi_select_exact_radix_while_round1);

      for (i = 0; i < nparts; ++i)
      {
        /* truncate counts, set default values and determine local (count/weight) limits */
        init_partconds(1, &pconds[i], nparts, global_counts[nareas * nclasses + 0],
#ifdef elem_weight
          global_weights[nareas * nclasses + 0]
#else
          0
#endif
          );

        mmlh[i].count_min = pconds[i].count_min;
        mmlh[i].count_max = pconds[i].count_max;
        mmlh[i].count_low = pconds[i].count_low;
        mmlh[i].count_hig = pconds[i].count_high;

#ifdef elem_weight
        mmlh[i].weight_min = pconds[i].weight_min;
        mmlh[i].weight_max = pconds[i].weight_max;
        mmlh[i].weight_low = pconds[i].weight_low;
        mmlh[i].weight_hig = pconds[i].weight_high;
#endif
      }

      /* init lowest and highest part (sentinels) */
      border_infos[border_lo - 1].update = 0;
      border_infos[border_lo - 1].crange[0] = 0;
      border_infos[border_lo - 1].crange[1] = 0;
      border_infos[border_lo - 1].cmmlr[MIN_LE] = border_infos[border_lo - 1].cmmlr[MAX_LE] = 0;
      border_infos[border_lo - 1].cmmlr[MIN_RI] = border_infos[border_lo - 1].cmmlr[MAX_RI] = 0;

      SL_TRACE_IF(DEBUG_OR_NOT, "lowest: %" slint_fmt ": init count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", border_lo - 1,
        border_infos[border_lo - 1].cmmlr[MIN_LE], border_infos[border_lo - 1].cmmlr[MAX_LE], border_infos[border_lo - 1].cmmlr[MIN_RI], border_infos[border_lo - 1].cmmlr[MAX_RI]);

#ifdef elem_weight
      border_infos[border_lo - 1].wrange[0] = 0.0;
      border_infos[border_lo - 1].wrange[1] = 0.0;
      border_infos[border_lo - 1].wmmlr[MIN_LE] = border_infos[border_lo - 1].wmmlr[MAX_LE] = 0.0;
      border_infos[border_lo - 1].wmmlr[MIN_RI] = border_infos[border_lo - 1].wmmlr[MAX_RI] = 0.0;

      SL_TRACE_IF(DEBUG_OR_NOT, "lowest: %" slint_fmt ": init weight[min/max-left/right]: %f / %f - %f / %f", border_lo - 1,
        border_infos[border_lo - 1].wmmlr[MIN_LE], border_infos[border_lo - 1].wmmlr[MAX_LE], border_infos[border_lo - 1].wmmlr[MIN_RI], border_infos[border_lo - 1].wmmlr[MAX_RI]);
#endif

      /* init highest part (sentinel) */
      border_infos[border_hi + 1].update = 0;
      border_infos[border_hi + 1].crange[0] = global_counts[nareas * nclasses + 0];
      border_infos[border_hi + 1].crange[1] = global_counts[nareas * nclasses + 0];
      border_infos[border_hi + 1].cmmlr[MIN_LE] = border_infos[border_hi + 1].cmmlr[MAX_LE] = 0;
      border_infos[border_hi + 1].cmmlr[MIN_RI] = border_infos[border_hi + 1].cmmlr[MAX_RI] = global_counts[nareas * nclasses + 0];

      SL_TRACE_IF(DEBUG_OR_NOT, "highest: %" slint_fmt ": init count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", border_hi + 1,
        border_infos[border_hi + 1].cmmlr[MIN_LE], border_infos[border_hi + 1].cmmlr[MAX_LE], border_infos[border_hi + 1].cmmlr[MIN_RI], border_infos[border_hi + 1].cmmlr[MAX_RI]);

#ifdef elem_weight
      border_infos[border_hi + 1].wrange[0] = global_weights[nareas * nclasses + 0];
      border_infos[border_hi + 1].wrange[1] = global_weights[nareas * nclasses + 0];
      border_infos[border_hi + 1].wmmlr[MIN_LE] = border_infos[border_hi + 1].wmmlr[MAX_LE] = 0.0;
      border_infos[border_hi + 1].wmmlr[MIN_RI] = border_infos[border_hi + 1].wmmlr[MAX_RI] = global_weights[nareas * nclasses + 0];

      SL_TRACE_IF(DEBUG_OR_NOT, "highest: %" slint_fmt ": init weight[min/max-left/right]: %f / %f - %f / %f", border_hi + 1,
        border_infos[border_hi + 1].wmmlr[MIN_LE], border_infos[border_hi + 1].wmmlr[MAX_LE], border_infos[border_hi + 1].wmmlr[MIN_RI], border_infos[border_hi + 1].wmmlr[MAX_RI]);
#endif

      /* init regular parts (backwards) */
      for (i = border_hi; i >= border_lo; --i)
      {
        border_infos[borders[i]].update = 1;
        border_infos[borders[i]].crange[0] = 0;
        border_infos[borders[i]].crange[1] = global_counts[nareas * nclasses + 0];
        border_infos[borders[i]].cmmlr[MIN_LE] = -1;
        border_infos[borders[i]].cmmlr[MIN_RI] = border_infos[borders[i] + 1].cmmlr[MIN_RI] - mmlh[borders[i] + 1].count_min;
        border_infos[borders[i]].cmmlr[MAX_LE] = -1;
        border_infos[borders[i]].cmmlr[MAX_RI] = border_infos[borders[i] + 1].cmmlr[MAX_RI] - mmlh[borders[i] + 1].count_max;

        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": init count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", i, borders[i],
          border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].cmmlr[MIN_RI], border_infos[borders[i]].cmmlr[MAX_RI]);

#ifdef elem_weight
        border_infos[borders[i]].wrange[0] = 0.0;
        border_infos[borders[i]].wrange[1] = global_weights[nareas * nclasses + 0];
        border_infos[borders[i]].wmmlr[MIN_LE] = -1.0;
        border_infos[borders[i]].wmmlr[MIN_RI] = border_infos[borders[i] + 1].wmmlr[MIN_RI] - mmlh[borders[i] + 1].weight_min;
        border_infos[borders[i]].wmmlr[MAX_LE] = -1.0;
        border_infos[borders[i]].wmmlr[MAX_RI] = border_infos[borders[i] + 1].wmmlr[MAX_RI] - mmlh[borders[i] + 1].weight_max;

        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": init weight[min/max-left/right]: %f / %f - %f / %f", i, borders[i],
          border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wmmlr[MIN_RI], border_infos[borders[i]].wmmlr[MAX_RI]);
#endif

        /* prepare for finalization in the 1st round */
        if (finalize)
        {
          for (j = 0; j < nelements; ++j) final_areas[i * nelements + j] = area_counts[(0 * nelements + j) * nclasses + 0];

          final_locals[NCONDS * i + 0] = local_counts[nareas * nclasses + 0];
#ifdef elem_weight
          final_locals[NCONDS * i + 1] = local_weights[nareas * nclasses + 0];
#endif
        }
      }
      
      /* first direction: forward */
      direction = 1;

      rti_tstop(rti_tid_mpi_select_exact_radix_while_round1);
    }

    /* compute prefixes for finalization */
    if (finalize)
    {
      /* determine number of parts to finalize */
      j = border_hi - border_lo + 1;
    
      SL_TRACE_IF(DEBUG_OR_NOT, "Exscan: finalizing %" slint_fmt " parts", j);

      rti_tstart(rti_tid_mpi_select_exact_radix_while_exscan);

      /* use local_counts to store the global prefix sums */      
      final_globals = local_counts;

      /* create global prefix sums (set rank 0 to zero) */
      MPI_Exscan(&final_locals[NCONDS * border_lo], &final_globals[NCONDS * border_lo], NCONDS * j, MPI_DOUBLE, MPI_SUM, comm);
      if (rank == 0) for (i = border_lo; i <= border_hi; ++i) final_globals[NCONDS * i + 0] = 
#ifdef elem_weight
        final_globals[NCONDS * i + 1] = 
#endif
        0.0;

      rti_tstop(rti_tid_mpi_select_exact_radix_while_exscan);
    }

    /* check all remaining parts */
    SL_TRACE_IF(DEBUG_OR_NOT, "ROUND: %" slint_fmt ", %s", round, (direction > 0)?"forward":"backward");

    nareas_new = 0;
    last_new_area = last_new_class = -1;
    nborders_removed = 0;

    rti_tstart(rti_tid_mpi_select_exact_radix_while_check);

    i = (direction > 0)?border_lo:border_hi;
    while ((direction > 0)?(i <= border_hi):(i >= border_lo))
    {
      /* check partition borders[i] */
      SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ": PART: %" slint_fmt ",%" slint_fmt, round, i, borders[i]);

      rti_tstart(rti_tid_mpi_select_exact_radix_while_check_pre);

      /* save to old limits */
      border_info_old = border_infos[borders[i]];

      /* is an update required? */
      if (border_infos[borders[i]].update)
      {
        /* forward */
        if (direction > 0)
        {
          /* init from min/max (always) */
          border_infos[borders[i]].cmmlr[MIN_LE] = border_infos[borders[i] - 1].cmmlr[MIN_LE] + mmlh[borders[i]].count_min;
          border_infos[borders[i]].cmmlr[MAX_LE] = border_infos[borders[i] - 1].cmmlr[MAX_LE] + mmlh[borders[i]].count_max;

          SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": count[min/max-left]: %" slint_fmt " + %" slint_fmt ", %" slint_fmt " + %" slint_fmt "", i, borders[i],
            border_infos[borders[i] - 1].cmmlr[MIN_LE], mmlh[borders[i]].count_min,
            border_infos[borders[i] - 1].cmmlr[MAX_LE], mmlh[borders[i]].count_max);

          /* check against low/high (on demand) */
          if (pconds->pcm & SLPC_COUNTS_LH)
          {
            if (border_infos[borders[i]].cmmlr[MIN_LE] < mmlh[borders[i] + 1].count_low) border_infos[borders[i]].cmmlr[MIN_LE] = mmlh[borders[i] + 1].count_low;
            if (border_infos[borders[i]].cmmlr[MAX_LE] > mmlh[borders[i]    ].count_hig) border_infos[borders[i]].cmmlr[MAX_LE] = mmlh[borders[i]    ].count_hig;
          }

#ifdef elem_weight
          /* init from min/max (always) */
          border_infos[borders[i]].wmmlr[MIN_LE] = border_infos[borders[i] - 1].wmmlr[MIN_LE] + mmlh[borders[i]].weight_min;
          border_infos[borders[i]].wmmlr[MAX_LE] = border_infos[borders[i] - 1].wmmlr[MAX_LE] + mmlh[borders[i]].weight_max;

          SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weight[min/max-left]: %f + %f, %f + %f", i, borders[i],
            border_infos[borders[i] - 1].wmmlr[MIN_LE], mmlh[borders[i]].weight_min,
            border_infos[borders[i] - 1].wmmlr[MAX_LE], mmlh[borders[i]].weight_max);

          /* check against low/high (on demand) */
          if (pconds->pcm & SLPC_WEIGHTS_LH)
          {
            if (border_infos[borders[i]].wmmlr[MIN_LE] < mmlh[borders[i] + 1].weight_low) border_infos[borders[i]].wmmlr[MIN_LE] = mmlh[borders[i] + 1].weight_low;
            if (border_infos[borders[i]].wmmlr[MAX_LE] > mmlh[borders[i]    ].weight_hig) border_infos[borders[i]].wmmlr[MAX_LE] = mmlh[borders[i]    ].weight_hig;
          }
#endif
        } else /* backward */
        {
          /* init from min/max (always) */
          border_infos[borders[i]].cmmlr[MIN_RI] = border_infos[borders[i] + 1].cmmlr[MIN_RI] - mmlh[borders[i] + 1].count_min;
          border_infos[borders[i]].cmmlr[MAX_RI] = border_infos[borders[i] + 1].cmmlr[MAX_RI] - mmlh[borders[i] + 1].count_max;

          SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": count[min/max-right]: %" slint_fmt " - %" slint_fmt ", %" slint_fmt " - %" slint_fmt "", i, borders[i],
            border_infos[borders[i] + 1].cmmlr[MIN_RI], mmlh[borders[i] + 1].count_min,
            border_infos[borders[i] + 1].cmmlr[MAX_RI], mmlh[borders[i] + 1].count_max);

          /* check against low/high (on demand) */
          if (pconds->pcm & SLPC_COUNTS_LH)
          {
            if (border_infos[borders[i]].cmmlr[MAX_RI] < mmlh[borders[i] + 1].count_low) border_infos[borders[i]].cmmlr[MAX_RI] = mmlh[borders[i] + 1].count_low;
            if (border_infos[borders[i]].cmmlr[MIN_RI] > mmlh[borders[i]    ].count_hig) border_infos[borders[i]].cmmlr[MIN_RI] = mmlh[borders[i]    ].count_hig;
          }

#ifdef elem_weight
          /* init from min/max (always) */
          border_infos[borders[i]].wmmlr[MIN_RI] = border_infos[borders[i] + 1].wmmlr[MIN_RI] - mmlh[borders[i] + 1].weight_min;
          border_infos[borders[i]].wmmlr[MAX_RI] = border_infos[borders[i] + 1].wmmlr[MAX_RI] - mmlh[borders[i] + 1].weight_max;

          SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weight[min/max-right]: %f - %f, %f - %f", i, borders[i],
            border_infos[borders[i] + 1].wmmlr[MIN_RI], mmlh[borders[i] + 1].weight_min,
            border_infos[borders[i] + 1].wmmlr[MAX_RI], mmlh[borders[i] + 1].weight_max);

          /* check against low/high (on demand) */
          if (pconds->pcm & SLPC_WEIGHTS_LH)
          {
            if (border_infos[borders[i]].wmmlr[MAX_RI] < mmlh[borders[i] + 1].weight_low) border_infos[borders[i]].wmmlr[MAX_RI] = mmlh[borders[i] + 1].weight_low;
            if (border_infos[borders[i]].wmmlr[MIN_RI] > mmlh[borders[i]    ].weight_hig) border_infos[borders[i]].wmmlr[MIN_RI] = mmlh[borders[i]    ].weight_hig;
          }
#endif
        }

        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", i, borders[i],
          border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].cmmlr[MIN_RI], border_infos[borders[i]].cmmlr[MAX_RI]);

        /* check against inconsistence */
        if (border_infos[borders[i]].cmmlr[MIN_LE] > border_infos[borders[i]].cmmlr[MIN_RI]) border_infos[borders[i]].cmmlr[MIN_LE] = border_infos[borders[i]].cmmlr[MIN_RI] = (border_infos[borders[i]].cmmlr[MIN_LE] + border_infos[borders[i]].cmmlr[MIN_RI]) / 2;
        if (border_infos[borders[i]].cmmlr[MAX_LE] < border_infos[borders[i]].cmmlr[MAX_RI]) border_infos[borders[i]].cmmlr[MAX_LE] = border_infos[borders[i]].cmmlr[MAX_RI] = (border_infos[borders[i]].cmmlr[MAX_LE] + border_infos[borders[i]].cmmlr[MAX_RI]) / 2;

#ifdef elem_weight
        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weight[min/max-left/right]: %f / %f - %f / %f", i, borders[i],
          border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wmmlr[MIN_RI], border_infos[borders[i]].wmmlr[MAX_RI]);

        /* check against inconsistence */
        if (border_infos[borders[i]].wmmlr[MIN_LE] > border_infos[borders[i]].wmmlr[MIN_RI]) border_infos[borders[i]].wmmlr[MIN_LE] = border_infos[borders[i]].wmmlr[MIN_RI] = (border_infos[borders[i]].wmmlr[MIN_LE] + border_infos[borders[i]].wmmlr[MIN_RI]) / 2;
        if (border_infos[borders[i]].wmmlr[MAX_LE] < border_infos[borders[i]].wmmlr[MAX_RI]) border_infos[borders[i]].wmmlr[MAX_LE] = border_infos[borders[i]].wmmlr[MAX_RI] = (border_infos[borders[i]].wmmlr[MAX_LE] + border_infos[borders[i]].wmmlr[MAX_RI]) / 2;
#endif
      }

      SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", i, borders[i],
        border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].cmmlr[MIN_RI], border_infos[borders[i]].cmmlr[MAX_RI]);

      SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": crange: %" slint_fmt " - %" slint_fmt "", i, borders[i], border_infos[borders[i]].crange[0], border_infos[borders[i]].crange[1]);

      /* select highest min and lowest max */
      current_cmm[0] = xmax(border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].cmmlr[MAX_RI]) - border_infos[borders[i]].crange[0];
      current_cmm[1] = xmin(border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].cmmlr[MIN_RI]) - border_infos[borders[i]].crange[0];

      if (rank == 0) SL_ASSERT(current_cmm[0] <= current_cmm[1]);
      
      if (rank == 0) SL_ASSERT(0 <= current_cmm[0]);

      SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": current_count: %" slint_fmt " - %" slint_fmt "", i, borders[i], current_cmm[0], current_cmm[1]);

#ifdef elem_weight
      SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weight[min/max-left/right]: %f / %f - %f / %f", i, borders[i],
        border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wmmlr[MIN_RI], border_infos[borders[i]].wmmlr[MAX_RI]);

      SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": wrange: %f - %f", i, borders[i], border_infos[borders[i]].wrange[0], border_infos[borders[i]].wrange[1]);

      /* select highest min and lowest max */
      current_wmm[0] = xmax(border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wmmlr[MAX_RI]) - border_infos[borders[i]].wrange[0];
      current_wmm[1] = xmin(border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wmmlr[MIN_RI]) - border_infos[borders[i]].wrange[0];

      if (rank == 0) SL_ASSERT(current_wmm[0] <= current_wmm[1]);

      SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": current_weight: %f - %f", i, borders[i], current_wmm[0], current_wmm[1]);
#endif

      rti_tstop(rti_tid_mpi_select_exact_radix_while_check_pre);

      /* HIT is the default */
      refine = 0;

      if (!finalize)
      {
        rti_tstart(rti_tid_mpi_select_exact_radix_while_check_classes);

        lcs = gcs = 0;
#ifdef elem_weight
        lws = gws = 0.0;
#endif

        for (k = 0; k < nclasses; ++k)
        {
          lc = local_counts[border_areas[i] * nclasses + k];
          gc = global_counts[border_areas[i] * nclasses + k];

          current_cmm[0] -= gc;
          current_cmm[1] -= gc;

          SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": k = %" sl_key_pure_type_fmt ", current_count: %" slint_fmt " - %" slint_fmt ", lc = %" slint_fmt ", lcs = %" slint_fmt ", gc = %" slint_fmt ", gcs = %" slint_fmt,
            i, borders[i], k, current_cmm[0], current_cmm[1], lc, lcs, gc, gcs);

#ifdef elem_weight
          lw = local_weights[border_areas[i] * nclasses + k];
          gw = global_weights[border_areas[i] * nclasses + k];

          current_wmm[0] -= gw;
          current_wmm[1] -= gw;

          SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": k = %" sl_key_pure_type_fmt ", current_weight: %e - %e", i, borders[i], k, current_wmm[0], current_wmm[1]);
#endif

          /* stop and refine if max count is skipped OR min count AND max weight is skipped */
          if ((current_cmm[1] < 0)
#ifdef elem_weight
            || (current_cmm[0] < 0 && current_wmm[1] < 0.0)
#endif
            )
          {
            refine = 1;
            break;
          }

          lcs += lc;
          gcs += gc;
          gc = 0;

#ifdef elem_weight
          lws += lw;
          gws += gw;
          gw = 0.0;
#endif

          /* if between min/max counts */
          if (current_cmm[0] <= 0 && current_cmm[1] >= 0)
          {
#ifdef elem_weight
            SL_TRACE_IF(DEBUG_OR_NOT, "got to next: %d && %d", (current_cmm[1] > 0), (current_wmm[0] > 0));

            /* go to next if max count not reached AND min weight not reached */
            if (current_cmm[1] > 0 && current_wmm[0] > 0) continue;
#endif

            /* look ahead for a better stop */
            if (k + 1 < nclasses && current_cmm[1] - global_counts[border_areas[i] * nclasses + k + 1] >= 0)
            {
#ifdef elem_weight
              /* continue if weights will improve */
              if (myabs(current_wmm[0] + current_wmm[1]) > myabs(current_wmm[0] + current_wmm[1] - 2 * global_weights[border_areas[i] * nclasses + k + 1])) continue;
#else
              /* continue if counts will improve */
              if (myabs(current_cmm[0] + current_cmm[1]) > myabs(current_cmm[0] + current_cmm[1] - 2 * global_counts[border_areas[i] * nclasses + k + 1])) continue;
#endif
            }

            /* stop */
            break;
          }
        }

        SL_ASSERT_IF((rank == 0), k < nclasses);

        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": %s k = %" sl_key_pure_type_fmt ", lcs = %" slint_fmt, i, borders[i], (refine)?"REFINE":"HIT", k, lcs);

        /* make sure k is safe (it is used as index later) */
        if (k >= nclasses) k = nclasses - 1;

        /* break the local contribution into contributions for the lists of elements */
        for (j = 0; j < nelements; ++j)
        {
          lcsv[j] = 0;
          for (l = 0; l < k; ++l) lcsv[j] += area_counts[((border_areas[i] * nelements + j) * nclasses) + l];

          if (refine) lcv[j] = area_counts[((border_areas[i] * nelements + j) * nclasses) + k];
          else
          {
            lcv[j] = 0;
            lcsv[j] += area_counts[((border_areas[i] * nelements + j) * nclasses) + k];
          }

          lcs -= lcsv[j];
        }

        rti_tstop(rti_tid_mpi_select_exact_radix_while_check_classes);

      } else
      {
        rti_tstart(rti_tid_mpi_select_exact_radix_while_check_final);
        
        k = 0;

#ifdef elem_weight
        /* middle of min/max weight */
        mw = (current_wmm[0] + current_wmm[1]) / 2.0;

        /* min. part of weight to contribute */
        dw = xmax(0, mw - final_globals[NCONDS * i + 1]);

        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": mw = %e, dw = %e", i, borders[i], mw, dw);
#else
        /* middle of min/max count */
        mc = (current_cmm[0] + current_cmm[1]) / 2;

        /* min. part of count to contribute */
        dc = xmax(0, mc - final_globals[NCONDS * i + 0]);

        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": mc = %" slint_fmt ", dc = %" slint_fmt, i, borders[i], mc, dc);
#endif

        /* contribute all? */
        if (
#ifdef elem_weight
          dw >= final_locals[NCONDS * i + 1]
#else
          dc >= final_locals[NCONDS * i + 0]
#endif
        )
        {
          lc = final_locals[NCONDS * i + 0];
#ifdef elem_weight
          lw = final_locals[NCONDS * i + 1];
#endif

        } else
        {
          /* contribute only a part */
#ifdef elem_weight
          lc = 0;

          for (j = 0; j < nelements; ++j)
          {
            elem_assign_at(&areas[border_areas[i] * nelements + j], areas[border_areas[i] * nelements + j].size, &end);

            for (elem_assign(&areas[border_areas[i] * nelements + j], &xi); xi.keys < end.keys; elem_inc(&xi))
            {
              dw -= elem_weight(&xi, 0);
              ++lc;

              if (dw < 0.0 || lc >= final_locals[NCONDS * i + 0])
              {
                dw += elem_weight(&xi, 0);
                --lc;
                break;
              }
            }
          }

          lw = dw;
#else
          lc = dc;
#endif
        }

        /* check mc against min/max count borders */
        lc = xminmax(current_cmm[0] - final_globals[NCONDS * i + 0], lc, current_cmm[1] - final_globals[NCONDS * i + 0]);

        /* check agains 0 (don't step back!) and the local contribution */
        lc = xminmax(0, lc, final_locals[NCONDS * i + 0]);

        lcs = lc;
#ifdef elem_weight
        lws = lw;
#endif

#ifdef elem_weight
        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": next border: %" slint_fmt " <= %" slint_fmt " + %" slint_fmt " <= %" slint_fmt,
          i, borders[i], border_lo, i, direction, border_hi);
        if (border_lo <= i + direction && i + direction <= border_hi)
          SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": next border: %" slint_fmt " == %" slint_fmt " + %" slint_fmt,
            i, borders[i], borders[i + direction], borders[i], direction);

        /* FIXME: finalize geht auch rückwärts!!! */

        /* if the next open border is really the _next_ border */
        if (border_lo <= i + direction && i + direction <= border_hi && borders[i + direction] == borders[i] + direction)
        {
          /* determine the exact global counts/weights (damn, this is expensive) */
          mcw[0] = lcs;
          mcw[1] = lws;
          MPI_Allreduce(&mcw[0], &mcw[2], 2, MPI_DOUBLE, MPI_SUM, comm);

        } else
        {
          /* the exact global counts/weights are not required */
          mcw[2] = 0.0;
          mcw[3] = 0.0;
        }

        gc = 0;
        gcs = mcw[2];
        gw = 0.0;
        gws = mcw[3];
        
        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": gcs = %" slint_fmt ", gws = %f", i, borders[i], gcs, gws);
#else
        /* the global count is simply mc */
        gc = 0;
        gcs = mc;

        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": gcs = %" slint_fmt, i, borders[i], gcs);
#endif

        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": lcs = %" slint_fmt, i, borders[i], lcs);

        /* break the local contribution into contributions for the lists of elements */
        for (j = 0; j < nelements; ++j)
        {
          lcv[j] = 0;
          lcsv[j] = xmin(lcs, final_areas[i * nelements + j]);
          
          lcs -= lcsv[j];
        }

        SL_TRACE_ARRAY_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": lcsv = ", "%" slint_fmt, j, nelements, lcsv, i, borders[i]);

        rti_tstop(rti_tid_mpi_select_exact_radix_while_check_final);
      }

      SL_ASSERT(lcs == 0);
      
      /* accept local contributions */
      for (j = 0; j < nelements; ++j) sdispls[(borders[i] + 1) * nelements + j] += lcsv[j];

      rti_tstart(rti_tid_mpi_select_exact_radix_while_check_post);

      /* this is wrong, e.g., even if gc == 0 and gcs == 0 then crange[1] is set to crange[0]! */
/*      if (gc > 0 || gcs > 0
#ifdef elem_weight
       || gw != 0.0 || gws != 0.0
#endif
       )*/
      {
        border_infos[borders[i]].crange[0] += gcs;
        border_infos[borders[i]].crange[1] = border_infos[borders[i]].crange[0] + gc;

        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": counts_range: %" slint_fmt "  %" slint_fmt "", i, borders[i], border_infos[borders[i]].crange[0], border_infos[borders[i]].crange[1]);

        border_infos[borders[i]].cmmlr[MIN_LE] = xminmax(border_infos[borders[i]].crange[0], border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].crange[1]);
        border_infos[borders[i]].cmmlr[MAX_LE] = xminmax(border_infos[borders[i]].crange[0], border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].crange[1]);
        border_infos[borders[i]].cmmlr[MIN_RI] = xminmax(border_infos[borders[i]].crange[0], border_infos[borders[i]].cmmlr[MIN_RI], border_infos[borders[i]].crange[1]);
        border_infos[borders[i]].cmmlr[MAX_RI] = xminmax(border_infos[borders[i]].crange[0], border_infos[borders[i]].cmmlr[MAX_RI], border_infos[borders[i]].crange[1]);

        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": count[min/max-left/right]: %" slint_fmt " / %" slint_fmt " - %" slint_fmt " / %" slint_fmt "", i, borders[i],
          border_infos[borders[i]].cmmlr[MIN_LE], border_infos[borders[i]].cmmlr[MAX_LE], border_infos[borders[i]].cmmlr[MIN_RI], border_infos[borders[i]].cmmlr[MAX_RI]);

#ifdef elem_weight
        border_infos[borders[i]].wrange[0] += gws;
        border_infos[borders[i]].wrange[1] = border_infos[borders[i]].wrange[0] + gw;

        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weights_range: %f  %f", i, borders[i], border_infos[borders[i]].wrange[0], border_infos[borders[i]].wrange[1]);

        border_infos[borders[i]].wmmlr[MIN_LE] = xminmax(border_infos[borders[i]].wrange[0], border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wrange[1]);
        border_infos[borders[i]].wmmlr[MAX_LE] = xminmax(border_infos[borders[i]].wrange[0], border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wrange[1]);
        border_infos[borders[i]].wmmlr[MIN_RI] = xminmax(border_infos[borders[i]].wrange[0], border_infos[borders[i]].wmmlr[MIN_RI], border_infos[borders[i]].wrange[1]);
        border_infos[borders[i]].wmmlr[MAX_RI] = xminmax(border_infos[borders[i]].wrange[0], border_infos[borders[i]].wmmlr[MAX_RI], border_infos[borders[i]].wrange[1]);

        SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": weight[min/max-left/right]: %f / %f - %f / %f", i, borders[i],
          border_infos[borders[i]].wmmlr[MIN_LE], border_infos[borders[i]].wmmlr[MAX_LE], border_infos[borders[i]].wmmlr[MIN_RI], border_infos[borders[i]].wmmlr[MAX_RI]);
#endif
      }
      
      SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": range diff 0: %" slint_fmt "-%" slint_fmt " | %" slint_fmt "-%" slint_fmt, i, borders[i],
        border_infos[borders[i]].crange[0] - border_infos[borders[i] - 1].crange[1], border_infos[borders[i]].crange[0] - border_infos[borders[i] - 1].crange[0],
        border_infos[borders[i] + 1].crange[0] - border_infos[borders[i]].crange[0], border_infos[borders[i] + 1].crange[1] - border_infos[borders[i]].crange[0]);
      SL_TRACE_IF(DEBUG_OR_NOT, "%" slint_fmt ",%" slint_fmt ": range diff 1: %" slint_fmt "-%" slint_fmt " | %" slint_fmt "-%" slint_fmt, i, borders[i],
        border_infos[borders[i]].crange[1] - border_infos[borders[i] - 1].crange[1], border_infos[borders[i]].crange[1] - border_infos[borders[i] - 1].crange[0],
        border_infos[borders[i] + 1].crange[0] - border_infos[borders[i]].crange[1], border_infos[borders[i] + 1].crange[1] - border_infos[borders[i]].crange[1]);

      if (border_infos[borders[i]].cmmlr[MIN_LE] != border_info_old.cmmlr[MIN_LE]
       || border_infos[borders[i]].cmmlr[MAX_LE] != border_info_old.cmmlr[MAX_LE]
#ifdef elem_weight
       || border_infos[borders[i]].wmmlr[MIN_LE] != border_info_old.wmmlr[MIN_LE]
       || border_infos[borders[i]].wmmlr[MAX_LE] != border_info_old.wmmlr[MAX_LE]
#endif
       ) border_infos[borders[i] + 1].update = 1;

      if (border_infos[borders[i]].cmmlr[MIN_RI] != border_info_old.cmmlr[MIN_RI]
       || border_infos[borders[i]].cmmlr[MAX_RI] != border_info_old.cmmlr[MAX_RI]
#ifdef elem_weight
       || border_infos[borders[i]].wmmlr[MIN_RI] != border_info_old.wmmlr[MIN_RI]
       || border_infos[borders[i]].wmmlr[MAX_RI] != border_info_old.wmmlr[MAX_RI]
#endif
       ) border_infos[borders[i] - 1].update = 1;

      border_infos[borders[i]].update = 0;

      /* refine or remove */
      if (refine)
      {
        /* bits left for partitioning? */
        if (rhigh >= rlow)
        {
          if (last_new_area == border_areas[i] && last_new_class == k) border_areas[i] = nareas_new - 1;
          else
          {
            /* update last_new_... */
            last_new_area = border_areas[i];
            last_new_class = k;

            /* create new area */
            for (j = 0; j < nelements; ++j)
            {
              elem_assign_at(&areas[border_areas[i] * nelements + j], lcsv[j], &areas_new[nareas_new * nelements + j]);
              areas_new[nareas_new * nelements + j].size = lcv[j];
            }
            border_areas[i] = nareas_new;
            ++nareas_new;
          }

        } else
        {
          for (j = 0; j < nelements; ++j) final_areas[(i - nborders_removed * direction) * nelements + j] = lcv[j];

          /* save local count/weight for the later prefix calculations */
          final_locals[NCONDS * (i - nborders_removed * direction) + 0] = lc;
#ifdef elem_weight
          final_locals[NCONDS * (i - nborders_removed * direction) + 1] = lw;
#endif
        }

        borders[i - nborders_removed * direction] = borders[i];
        border_areas[i - nborders_removed * direction] = border_areas[i];

      } else ++nborders_removed;

      rti_tstop(rti_tid_mpi_select_exact_radix_while_check_post);

      i += direction;
    }

    /* restrict the parts */
    if (direction > 0) border_hi -= nborders_removed;
    else border_lo += nborders_removed;

    /* change direction */
    direction *= -1;

    rti_tstop(rti_tid_mpi_select_exact_radix_while_check);
    
    /* switch areas */
    nareas = nareas_new;
    if (areas == areas0)
    {
      areas = areas1;
      areas_new = areas0;
    } else
    {
      areas = areas0;
      areas_new = areas1;
    }
  }

  rti_tstop(rti_tid_mpi_select_exact_radix_while);

  sl_free(area_counts);
  sl_free(local_counts);
  sl_free(global_counts);

  rti_tstop(rti_tid_mpi_select_exact_radix);

#ifdef VERIFY
  v = mpi_post_check_partconds(s, nelements, nparts, pconds, sdispls, size, rank, comm);
  
  SL_ASSERT_IF(rank == 0, v < 0);
  
  SL_NOTICE_IF(rank == 0, "post_check_partconds: %s (%" slint_fmt ")", (v >= 0)?"FAILED":"SUCCESS", v);
#endif

#ifdef PRINT_SDISPLS
  printf("%d: sdispls:", rank);
  for (i = 0; i < nparts; ++i) printf(" %d ", sdispls[i]);
  printf("\n");
#endif

#ifdef PRINT_STATS
  mpi_select_stats(s, nparts, sdispls, size, rank, comm);
#endif

#if defined(PRINT_TIMINGS) && defined(SL_USE_RTI_TIM)
  if (rank == PRINT_TIMINGS)
  {
    printf("%d: mpi_select_exact_radix: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix));
    printf("%d: mpi_select_exact_radix: sync: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_sync));
    printf("%d: mpi_select_exact_radix: while: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while));
    printf("%d: mpi_select_exact_radix:  count: %f\n", rank, rti_tcumu(rti_tid_mpi_select_exact_radix_while_count));
    printf("%d: mpi_select_exact_radix:  allreduce: %f\n", rank, rti_tcumu(rti_tid_mpi_select_exact_radix_while_allreduce));
    printf("%d: mpi_select_exact_radix:  round1: %f\n", rank, rti_tcumu(rti_tid_mpi_select_exact_radix_while_round1));
    printf("%d: mpi_select_exact_radix:   allgather: %f\n", rank, rti_tcumu(rti_tid_mpi_select_exact_radix_while_round1_allgather));
    printf("%d: mpi_select_exact_radix:  exscan: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while_exscan));
    printf("%d: mpi_select_exact_radix:  check: %f\n", rank, rti_tcumu(rti_tid_mpi_select_exact_radix_while_check));
    printf("%d: mpi_select_exact_radix:   pre: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while_check_pre));
    printf("%d: mpi_select_exact_radix:   classes: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while_check_classes));
    printf("%d: mpi_select_exact_radix:   final: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while_check_final));
    printf("%d: mpi_select_exact_radix:   post: %f\n", rank, rti_tlast(rti_tid_mpi_select_exact_radix_while_check_post));
    printf("%d: mpi_select_exact_radix: rounds: %" slint_fmt "\n", rank, round);
  }
#endif

  return 0;
}
예제 #20
0
파일: main.c 프로젝트: pxlpnk/ipc_2012
/*
 * Driver for a small distributed counting experiment over MPI_COMM_WORLD.
 *
 * Each rank takes a contiguous slice of the fixed key array a[], builds a
 * local histogram of its keys (local_B), and combines the histograms with
 * MPI_Allreduce (global counts, AllB) and MPI_Exscan (counts held by
 * lower-ranked processes, RelB). Ranks above `root` then rearrange their
 * slice in place and print it.
 *
 * `root`, `ATYPE`, `ATYPE_MPI`, `reference` and `printArray` are defined
 * elsewhere in the file.
 *
 * Returns 0.
 */
int main(int argc, char *argv[]) {
  int r = 6;   /* number of buckets; every key in a[] lies in [0, r) */
  int n = 10;  /* number of keys in a[] */
  ATYPE a[] = {0,2,1,3,4,2,1,5,4,5};
  ATYPE b[n];

  int rank, size;

  MPI_Init(&argc,&argv);
  // get rank and size from communicator
  MPI_Comm_size(MPI_COMM_WORLD,&size);
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);


  /* NOTE(review): presumably computes a sequential reference result into b --
     confirm against the definition of reference() */
  if (rank == root)
    reference(a, n, r, b);


  int i;

  /* base slice length shared by all ranks; the last rank also takes the
     remainder n % size */
  const int m = n/size;

  /* Offset of this rank's slice in a[]. It must be derived from the base
     length m, not from block_size: the last rank's block_size is larger than
     m whenever n is not divisible by size, and rank * block_size would then
     point past its real slice (and past the end of a[]). */
  const int first = rank * m;

  const uint block_size = (rank != size-1) ? m : n - m*rank;

  printf("blocksize %u :rank %d\n",block_size, rank);


  /* create bucket locally */
  ATYPE local_A[block_size];
  ATYPE local_B[r];

  for(i = 0; i < (int)block_size; i++) local_A[i] = a[first + i];

  /* local histogram: local_B[k] = number of local keys equal to k */
  for(i = 0; i < r; i++) local_B[i] = 0;
  for(i = 0; i < (int)block_size; i++) local_B[local_A[i]] ++;


  ATYPE AllB[r];
  ATYPE RelB[r];

  MPI_Allreduce(local_B, AllB, r, ATYPE_MPI, MPI_SUM, MPI_COMM_WORLD);
  /* RelB is undefined on rank 0 after MPI_Exscan */
  MPI_Exscan(local_B, RelB, r, ATYPE_MPI, MPI_SUM, MPI_COMM_WORLD);

  //  printArray(RelB, r);

  if ( rank > root) {
    for(int j=1; j< (int)block_size; j++) {
      if (local_A[j] > 0) {
        /* NOTE(review): this placement formula looks unfinished -- it reads
           and overwrites local_A in the same pass. The arithmetic is kept
           as-is; the guard only stops writes outside local_A, which
           previously corrupted the stack. */
        const long target = (long)local_A[j] + (long)RelB[ local_A[j] ] + (long)local_A[j-1];
        if (target >= 0 && target < (long)block_size) {
          local_A[target] = local_A[j];
        }
      }
    }

    printf("local_A\n");
    printArray(local_A, block_size);
  }





  MPI_Finalize();
  return 0;
}
예제 #21
0
int vec2elemental(const std::vector<double> &vec, El::DistMatrix<El::Complex<double>,El::VC,El::STAR> &Y){

	int data_dof=2;
	int SCAL_EXP = 1;

	int nlocal,gsize; //local elements, start p_id, global size
	double *pt_array; // will hold local array
	int r,q,rq; //Grid sizes
	int nbigs; //Number of large sends (i.e. send 1 extra data point)
	int pstart; // p_id of nstart
	int rank = El::mpi::WorldRank(); //p_id
	int send_size; // base send size
	bool print = rank == -1; 


	// Get Grid and associated params
	const El::Grid* g = &(Y.Grid());
	r = g->Height();
	q = g->Width();
	MPI_Comm comm = (g->Comm()).comm;

	// Get sizes, array in petsc 
	nlocal = vec.size()/data_dof;
	int nstart = 0;
	MPI_Exscan(&nlocal,&nstart,1,MPI_INT,MPI_SUM,comm);
	//VecGetOwnershipRange(pt_vec,&nstart,NULL);

	//Find processor that nstart belongs to, number of larger sends
	rq = r * q;
	pstart = nstart % rq; //int div
	nbigs = nlocal % rq;
	send_size = nlocal/rq;
	
	if(print){
		std::cout << "r: " << r << " q: " << q <<std::endl;
		std::cout << "nstart: " << nstart << std::endl;
		std::cout << "ps: " << pstart << std::endl;
		std::cout << "nbigs: " << nbigs << std::endl;
		std::cout << "send_size: " << send_size << std::endl;
	}

	// Make send_lengths
	std::vector<int> send_lengths(rq);
	std::fill(send_lengths.begin(),send_lengths.end(),send_size);
	if(nbigs >0){
		for(int j=0;j<nbigs;j++){
			send_lengths[(pstart + j) % rq] += 1;
		}
	}

	// Make send_disps
	std::vector<int> send_disps = exscan(send_lengths);

	std::vector<El::Complex<double>> indata(nlocal);
	// copy the data from an ffm tree to into a local vec of complex data for sending #pragma omp parallel for
	El::Complex<double> val;
	for(int i=0;i<nlocal;i++){
		El::SetRealPart(val,vec[2*i+0]);
		El::SetImagPart(val,vec[2*i+1]);
		indata[i] = val;
	}


	// Make send_dataA, i.e. reorder the data
	std::vector<El::Complex<double>> send_data(nlocal);
	for(int proc=0;proc<rq;proc++){
		int offset = send_disps[proc];
		int base_idx = (proc - pstart + rq) % rq; 
		for(int j=0; j<send_lengths[proc]; j++){
			int idx = base_idx + (j * rq);
			send_data[offset + j] = indata[idx];
		}
	}

	// Do all2all to get recv_lengths
	std::vector<int> recv_lengths(rq);
	MPI_Alltoall(&send_lengths[0], 1, MPI_INT, &recv_lengths[0], 1, MPI_INT,comm);

	// Scan to get recv_disps
	std::vector<int> recv_disps = exscan(recv_lengths);

	// Do all2allv to get data on correct processor
	El::Complex<double> * recv_data = Y.Buffer();
	//MPI_Alltoallv(&send_data[0],&send_lengths[0],&send_disps[0],MPI_DOUBLE, \
	//		&recv_data[0],&recv_lengths[0],&recv_disps[0],MPI_DOUBLE,comm);
	El::mpi::AllToAll(&send_data[0], &send_lengths[0], &send_disps[0], recv_data,&recv_lengths[0],&recv_disps[0],comm);

	if(print){
		std::cout << "Send data: " <<std::endl << send_data <<std::endl;
		std::cout << "Send lengths: " <<std::endl << send_lengths <<std::endl;
		std::cout << "Send disps: " <<std::endl << send_disps <<std::endl;
		std::cout << "Recv data: " <<std::endl << recv_data <<std::endl;
		std::cout << "Recv lengths: " <<std::endl << recv_lengths <<std::endl;
		std::cout << "Recv disps: " <<std::endl << recv_disps <<std::endl;
	}

	return 0;
}