 /** \brief
  * In many situations a rank computes a number of local DOFs. Then all
  * ranks want to know the number of global DOFs and the starting
  * displacment number of the DOF numbering in each rank.
  * \param[in]   mpiComm        The MPI communicator.
  * \param[in]   nRankDofs      The number of local DOFs.
  * \param[out]  rStartDofs     Displacment of the DOF numbering. On rank n
  *                             this is the sum of all local DOF numbers in
  *                             ranks 0 to n - 1.
  * \param[out]  nOverallDofs   Global sum of nRankDofs. Is equal on all
  *                             ranks.
 inline void getDofNumbering(MPI::Intracomm& mpiComm,
                             int nRankDofs,
                             int& rStartDofs,
                             int& nOverallDofs)
   rStartDofs = 0;
   nOverallDofs = 0;
   mpiComm.Scan(&nRankDofs, &rStartDofs, 1, MPI_INT, MPI_SUM);
   rStartDofs -= nRankDofs;
   mpiComm.Allreduce(&nRankDofs, &nOverallDofs, 1, MPI_INT, MPI_SUM);
 /** \brief
  * Sums an integer over all ranks of a communicator.
  *
  * \param[in]      mpiComm   The MPI communicator; all of its ranks must call.
  * \param[in,out]  value     On entry the contribution of this rank, on
  *                           return the sum of the contributions of all ranks.
  */
 void globalAdd(MPI::Intracomm& mpiComm, int& value)
 {
   // Send and receive buffers must differ, so the local contribution is
   // reduced from a copy.
   const int localContribution = value;
   mpiComm.Allreduce(&localContribution, &value, 1, MPI_INT, MPI_SUM);
 }
 /** \brief
  * Sums a double over all ranks of a communicator.
  *
  * \param[in]      mpiComm   The MPI communicator; all of its ranks must call.
  * \param[in,out]  value     On entry the contribution of this rank, on
  *                           return the sum of the contributions of all ranks.
  */
 void globalAdd(MPI::Intracomm& mpiComm, double& value)
 {
   // Send and receive buffers must differ, so the local contribution is
   // reduced from a copy.
   const double localContribution = value;
   mpiComm.Allreduce(&localContribution, &value, 1, MPI_DOUBLE, MPI_SUM);
 }
/**
 * Gathers the entries of this distributed sparse vector that are selected by
 * the index vector \p ri and returns them as a dense distributed vector.
 *
 * This is a collective operation: every rank of the communicator obtained
 * from commGrid has to call it. It proceeds in three phases:
 *   1. every rank sends, to the owner of each requested entry, the index it
 *      wants (as reported by Owner());
 *   2. every owner intersects the requests with the indices it stores and
 *      sends back the indices it found together with their values;
 *   3. every rank writes the returned values to the output positions that
 *      issued the corresponding requests.
 *
 * Output positions whose requested index is not stored in the sparse vector
 * are not written and keep the value Indexed was constructed with.
 *
 * NOTE(review): std::set_intersection needs both inputs in ascending order,
 * so this presumably relies on `ind` being sorted and on the requests sent to
 * one owner arriving in ascending order (i.e. on the ordering of \p ri) --
 * confirm against callers. A value that is requested twice from the same
 * owner is only delivered to the first requesting position.
 *
 * \param ri  Indices to fetch; values are 0-based and must lie in [0, glen).
 * \return    Dense vector laid out like \p ri holding the fetched values.
 * \throws outofrangeexception  on all ranks if any rank holds an index
 *                              outside [0, glen).
 */
template <class IT, class NT>
FullyDistVec<IT,NT> FullyDistSpVec<IT,NT>::operator() (const FullyDistVec<IT,IT> & ri) const
{
	MPI::Intracomm World = commGrid->GetWorld();
	// FullyDistVec ( shared_ptr<CommGrid> grid, IT globallen, NT initval, NT id);
	FullyDistVec<IT,NT> Indexed(ri.commGrid, ri.glen, zero, zero);
	int nprocs = World.Get_size();
	IT locnnz = ri.LocArrSize();

	// Input sanity check. The verdict is combined over all ranks so that
	// either every rank throws or none does; otherwise the ranks that
	// continue would block in the collectives below.
	int local = 1;
	int whole = 1;
	for(IT i=0; i < locnnz; ++i)
	{
		if(ri.arr[i] >= glen || ri.arr[i] < 0)
			local = 0;
	}
	World.Allreduce( &local, &whole, 1, MPI::INT, MPI::BAND);
	if(whole == 0)
	{
		throw outofrangeexception();
	}

	// data_req[p] : indices requested from rank p.
	// revr_map[p] : for rank p, maps a requested index back to the output
	//               position that asked for it. One map per owner is needed
	//               because the index reported by Owner() is only meaningful
	//               together with its owner; a single shared map would let
	//               equal indices on different owners collide.
	vector< vector<IT> > data_req(nprocs);
	vector< unordered_map<IT, IT> > revr_map(nprocs);
	for(IT i=0; i < locnnz; ++i)
	{
		IT locind;
		int owner = Owner(ri.arr[i], locind);	// numerical values in ri are 0-based
		data_req[owner].push_back(locind);
		revr_map[owner].insert(typename unordered_map<IT, IT>::value_type(locind, i));
	}

	IT * sendbuf = new IT[locnnz];
	int * sendcnt = new int[nprocs];
	int * sdispls = new int[nprocs];
	for(int i=0; i<nprocs; ++i)
	{
		sendcnt[i] = static_cast<int>(data_req[i].size());
	}

	int * rdispls = new int[nprocs];
	int * recvcnt = new int[nprocs];
	World.Alltoall(sendcnt, 1, MPI::INT, recvcnt, 1, MPI::INT);	// share the request counts

	sdispls[0] = 0;
	rdispls[0] = 0;
	for(int i=0; i<nprocs-1; ++i)
	{
		sdispls[i+1] = sdispls[i] + sendcnt[i];
		rdispls[i+1] = rdispls[i] + recvcnt[i];
	}
	IT totrecv = accumulate(recvcnt,recvcnt+nprocs,0);
	IT * recvbuf = new IT[totrecv];

	for(int i=0; i<nprocs; ++i)
	{
		copy(data_req[i].begin(), data_req[i].end(), sendbuf+sdispls[i]);
	}
	World.Alltoallv(sendbuf, sendcnt, sdispls, MPIType<IT>(), recvbuf, recvcnt, rdispls, MPIType<IT>());  // request data

	// We will return the requested data,
	// our return can be at most as big as the request
	// and smaller if we are missing some elements
	IT * indsback = new IT[totrecv];
	NT * databack = new NT[totrecv];

	for(int i=0; i<nprocs; ++i)
	{
		// this is not the most efficient method because it scans the ind vector once per requesting rank
		IT * reqbegin = recvbuf + rdispls[i];
		IT * ansbegin = indsback + rdispls[i];
		IT * ansend = set_intersection(reqbegin, reqbegin + recvcnt[i], ind.begin(), ind.end(), ansbegin);
		recvcnt[i] = static_cast<int>(ansend - ansbegin);	// update with size of the intersection

		IT vi = 0;
		for(int j = rdispls[i]; j < rdispls[i] + recvcnt[i]; ++j)	// fetch the numerical values
		{
			// indsback is a subset of ind, so this search always stops
			// on a matching entry before running off the end of ind
			while(indsback[j] > ind[vi])
				++vi;
			databack[j] = num[vi++];
		}
	}
	delete [] recvbuf;
	NT * databuf = new NT[locnnz];

	// The answers keep the displacements of the requests; only the counts shrink.
	World.Alltoall(recvcnt, 1, MPI::INT, sendcnt, 1, MPI::INT);	// share the response counts, overriding request counts
	World.Alltoallv(indsback, recvcnt, rdispls, MPIType<IT>(), sendbuf, sendcnt, sdispls, MPIType<IT>());  // send indices
	World.Alltoallv(databack, recvcnt, rdispls, MPIType<NT>(), databuf, sendcnt, sdispls, MPIType<NT>());  // send data
	DeleteAll(rdispls, recvcnt, indsback, databack);

	// Now create the output from databuf (holds numerical values) and sendbuf (holds indices)
	// arr is already resized during its construction
	for(int i=0; i<nprocs; ++i)
	{
		// Section i of the buffers holds the answers of rank i, so the
		// indices are translated with the map that belongs to that rank.
		for(int j=sdispls[i]; j< sdispls[i]+sendcnt[i]; ++j)
		{
			typename unordered_map<IT,IT>::iterator it = revr_map[i].find(sendbuf[j]);
			if(it == revr_map[i].end())
				continue;	// an answer nobody asked for; never dereference end()
			Indexed.arr[it->second] = databuf[j];
		}
	}
	DeleteAll(sdispls, sendcnt, sendbuf, databuf);
	return Indexed;
}