Example #1
0
/*
 * Runs prefixsum() over a host array on the GPU and writes the result back
 * into the same host array, then prints the elapsed time.
 *
 * data   - host array of `length` ints; overwritten with the result that
 *          prefixsum() leaves in its output buffer.
 * length - number of elements in `data`. Nothing is done when it is <= 0
 *          or when `data` is null.
 *
 * The measured interval covers device allocation, both copies, the scan
 * and the frees. On any CUDA failure `data` is not guaranteed to hold a
 * valid result; the error is reported on stderr and no time is printed.
 */
void cudasummer(int data[], int length)
{
	// A zero-sized problem would request an empty launch grid downstream.
	if (!data || length <= 0)
		return;

	const size_t bytes = (size_t) length * sizeof(int);

	int *devIn = NULL;
	int *devOut = NULL;
	cudaEvent_t start = NULL;
	cudaEvent_t stop = NULL;
	float t = 0.0f;	// was previously printed without ever being assigned

	// Each step only runs if everything before it succeeded; cleanup below
	// is unconditional so nothing leaks on an early failure.
	cudaError_t err = cudaEventCreate(&start);
	if (err == cudaSuccess)
		err = cudaEventCreate(&stop);
	if (err == cudaSuccess)
		err = cudaEventRecord(start, 0);

	if (err == cudaSuccess)
		err = cudaMalloc((void**) &devIn, bytes);
	if (err == cudaSuccess)
		err = cudaMalloc((void**) &devOut, bytes);
	if (err == cudaSuccess)
		err = cudaMemcpy(devIn, data, bytes, cudaMemcpyHostToDevice);

	if (err == cudaSuccess) {
		prefixsum(devIn, devOut, length);
		// prefixsum() returns void, so pick up any pending launch error here.
		err = cudaGetLastError();
	}

	// Blocking copy: also surfaces faults raised while the kernels executed.
	if (err == cudaSuccess)
		err = cudaMemcpy(data, devOut, bytes, cudaMemcpyDeviceToHost);

	// cudaFree(NULL) is a no-op, so this is safe on every path.
	cudaFree(devIn);
	cudaFree(devOut);

	if (err == cudaSuccess)
		err = cudaEventRecord(stop, 0);
	if (err == cudaSuccess)
		err = cudaEventSynchronize(stop);
	if (err == cudaSuccess)
		err = cudaEventElapsedTime(&t, start, stop);

	if (err == cudaSuccess)
		printf("Elapsed time %3fms\n", t);
	else
		fprintf(stderr, "cudasummer: CUDA error: %s\n", cudaGetErrorString(err));

	if (start)
		cudaEventDestroy(start);
	if (stop)
		cudaEventDestroy(stop);
}
Example #2
0
/*
 * Host-side driver for a multi-block prefix sum over `length` ints.
 *
 * in     - input buffer; handed to the prefixsumblock and correctsumends
 *          steps (presumably a device pointer, as cudasummer passes
 *          cudaMalloc'ed memory - confirm for other callers).
 * out    - output buffer of the same length; cleared via zarro() first.
 * length - number of elements.
 *
 * Steps, as visible here:
 *   1. zarro(out, length), then prefixsumblock(in, out, length), both under
 *      a grid of ceil(length / BLOCK_SIZE) blocks of BLOCK_SIZE threads.
 *   2. If more than one block was needed: gathersumends(out, devEnds)
 *      fills a per-block array, this function recurses on that array, and
 *      correctsumends(devTmpEnds, in, out) applies the scanned values.
 *
 * The recursion ends once the input fits in a single block (blocks <= 1).
 *
 * NOTE(review): __modify_Grid / __modify_Block / __begin_GPU / __end_GPU are
 * not defined in this snippet; they appear to stand in for the commented-out
 * dim3 declarations and <<<grid, block>>> launches - confirm their semantics.
 */
void prefixsum(int* in, int *out, int length)
{
	// Ceiling division: enough BLOCK_SIZE-wide blocks to cover `length`.
	int blocks = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;


	//dim3 dimGrid(blocks, 1, 1);
	__modify_Grid(blocks, 1);

	//dim3 dimBlock(BLOCK_SIZE, 1, 1);
	__modify_Block(BLOCK_SIZE, 1, 1);

	// Clear the output buffer before the per-block pass.
	__begin_GPU();
	zarro(out, length);
	__end_GPU();

	// Per-block pass from `in` into `out`.
	__begin_GPU();
	prefixsumblock(in, out, length);
	__end_GPU();

	// With more than one block, the per-block results still have to be
	// combined across block boundaries.
	if (blocks > 1) {
		int *devEnds;
		int *devTmpEnds;

		// One int per block for each scratch array.
		// NOTE(review): neither cudaMalloc result is checked; on failure the
		// pointers below are used uninitialised.
		cudaMalloc((void**) &devEnds, blocks * sizeof(int));
		cudaMalloc((void**) &devTmpEnds, blocks * sizeof(int));

		// Ceiling division again, this time over the number of blocks.
		int subblocks = (blocks + BLOCK_SIZE - 1) / BLOCK_SIZE;

		//dim3 subgrid(subblocks, 1, 1);
		__modify_Grid(subblocks, 1);

		//dim3 subblock(BLOCK_SIZE, 1, 1);
		// NOTE(review): called with two arguments here but three above -
		// confirm both arities are accepted.
		__modify_Block(BLOCK_SIZE, 1);

		// Collect one value per block from `out` into devEnds.
		__begin_GPU();
		gathersumends(out, devEnds);
		__end_GPU();

		// Recursively scan the per-block values into devTmpEnds.
		prefixsum(devEnds, devTmpEnds, blocks);

		cudaFree(devEnds);

		// NOTE(review): if __modify_Grid/__modify_Block set state that persists
		// across launches, this launch runs with whatever configuration the
		// recursive call (or the subgrid setup above) left behind rather than
		// the (blocks, BLOCK_SIZE) layout from the top of the function -
		// verify that this is the intended configuration for correctsumends.
		__begin_GPU();		
		correctsumends(devTmpEnds, in, out);
		__end_GPU();

		cudaFree(devTmpEnds);
	}
}
Example #3
0
SDArrayBAux::ValueTp SDArrayBAux::rank2(ValueTp p, ValueTp &select) const {
	// Returns rank(p); additionally stores prefixsum(rank(p)) into `select`
	// so the caller gets both values from a single call.
	//TODO: optimize this
	uint64_t rankOfP = rank(p);
	select = prefixsum(rankOfP);
	return rankOfP;
}
      void gather(const Vertex rootVertex, const Vertex srcVertex, const T_Send sendData, T_Recv& recvData, const bool reorder){
	typedef typename T_Send::value_type SendValueType;
	typedef typename T_Recv::value_type RecvValueType;
	
	static std::vector<SendValueType> gather;
	static T_Recv* rootRecvData     = NULL;
	static bool peerHostsRootVertex = false;
	static unsigned nGatherCalls    = 0;

	nGatherCalls++;

	VAddr rootVAddr  = locateVertex(graph, rootVertex);
	Context context  = getGraphContext(graph);

	// Insert data of srcVertex to the end of the gather vector
	gather.insert(gather.end(), sendData.begin(), sendData.end());

	// Store recv pointer of rootVertex
	if(srcVertex.id == rootVertex.id){
	  rootRecvData = &recvData;
	  peerHostsRootVertex = true;
	}

	if(nGatherCalls == hostedVertices.size()){
	  std::vector<unsigned> recvCount;
	  std::vector<unsigned> prefixsum(context.size(),0);

	  if(peerHostsRootVertex){
	    cal.gatherVar(rootVAddr, context, gather, *rootRecvData, recvCount);

	    // TODO
	    // std::partial_sum might do the job
	    unsigned sum = 0;
	    for(unsigned count_i = 0; count_i < recvCount.size(); ++count_i){
	      prefixsum[count_i] = sum;
	      sum += recvCount[count_i];
	    }
		    
	    // Reordering code
	    if(reorder){
	      std::vector<RecvValueType> recvDataReordered(recvData.size());
	      for(unsigned vAddr = 0; vAddr < context.size(); vAddr++){
		std::vector<Vertex> hostedVertices = getHostedVertices(graph, vAddr);
		unsigned nElementsPerVertex = recvCount.at(vAddr) / hostedVertices.size();

		unsigned hVertex_i=0;
		for(Vertex v: hostedVertices){

		  std::copy(rootRecvData->begin()+(prefixsum[vAddr] + (hVertex_i * nElementsPerVertex)),
			    rootRecvData->begin()+(prefixsum[vAddr] + (hVertex_i * nElementsPerVertex)) + (nElementsPerVertex),
			    recvDataReordered.begin()+(v.id*nElementsPerVertex));
		  hVertex_i++;

		}
			    
	      }
	      std::copy(recvDataReordered.begin(), recvDataReordered.end(), rootRecvData->begin());

	    }
		
	  }
	  else {
	    cal.gatherVar(rootVAddr, context, gather, recvData, recvCount);
	  }
	    
	  gather.clear();
	  nGatherCalls = 0;

	}

      }