/**
 * Runs prefixsum() over a host array on the device and writes the result
 * back into the same host array.
 *
 * The input is copied to a device buffer, prefixsum() is called with a second
 * device buffer as output, and that output is copied back over `data`.
 *
 * @param data   host array holding `length` ints; overwritten with the result
 * @param length number of elements in `data`; nothing is done when <= 0
 *
 * On a CUDA error a message is written to stderr, the device buffers are
 * released, and `data` may be left unmodified.
 */
void cudasummer(int data[], int length) {
    // Nothing to process; also avoids zero-sized allocations and launches.
    if (data == NULL || length <= 0) {
        return;
    }

    const size_t bytes = (size_t) length * sizeof(int);
    int *devIn = NULL;
    int *devOut = NULL;

    // Event-based timing is currently disabled (calls kept below for
    // reference). `t` is therefore initialised so the report at the end never
    // reads an indeterminate value.
    float t = 0.0f;

    //cudaEvent_t start, stop;
    //cudaEventCreate(&start);
    //cudaEventCreate(&stop);
    //cudaEventRecord(start, 0);

    cudaError_t err = cudaMalloc((void**) &devIn, bytes);
    if (err == cudaSuccess) {
        err = cudaMalloc((void**) &devOut, bytes);
    }
    if (err == cudaSuccess) {
        err = cudaMemcpy(devIn, data, bytes, cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        prefixsum(devIn, devOut, length);
        err = cudaMemcpy(data, devOut, bytes, cudaMemcpyDeviceToHost);
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "cudasummer: CUDA error: %s\n", cudaGetErrorString(err));
    }

    // cudaFree accepts a null pointer, so this is safe on every path.
    cudaFree(devIn);
    cudaFree(devOut);

    //cudaEventRecord(stop, 0);
    //cudaEventSynchronize(stop);
    //cudaEventElapsedTime(&t, start, stop);
    printf("Elapsed time %3fms\n", t);
    //cudaEventDestroy(start);
    //cudaEventDestroy(stop);
}
/**
 * Device-side prefix sum driver operating on device buffers.
 *
 * Runs zarro() and prefixsumblock() over `length` elements using
 * ceil(length / BLOCK_SIZE) blocks. When more than one block is needed, it
 * additionally runs gathersumends() into a temporary buffer, calls itself
 * recursively on that buffer (one entry per block), and finally runs
 * correctsumends() with the recursive result.
 *
 * The launch geometry and launch regions are expressed through
 * __modify_Grid / __modify_Block / __begin_GPU / __end_GPU, which are defined
 * outside this block; the commented dim3 lines show the plain CUDA
 * configuration they stand in for.
 *
 * @param in     device pointer to the input values
 * @param out    device pointer receiving the result
 * @param length number of elements; nothing is done when <= 0
 *
 * If a temporary allocation fails, a message is written to stderr and the
 * function returns after the first (per-block) stage only.
 */
void prefixsum(int* in, int *out, int length) {
    // An empty input would otherwise produce a zero-block launch grid.
    if (length <= 0) {
        return;
    }

    int blocks = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;

    //dim3 dimGrid(blocks, 1, 1);
    __modify_Grid(blocks, 1);
    //dim3 dimBlock(BLOCK_SIZE, 1, 1);
    __modify_Block(BLOCK_SIZE, 1, 1);

    __begin_GPU();
    zarro(out, length);
    __end_GPU();

    __begin_GPU();
    prefixsumblock(in, out, length);
    __end_GPU();

    // A single block needs no cross-block correction.
    if (blocks <= 1) {
        return;
    }

    int *devEnds = NULL;
    int *devTmpEnds = NULL;

    cudaError_t err = cudaMalloc((void**) &devEnds, blocks * sizeof(int));
    if (err == cudaSuccess) {
        err = cudaMalloc((void**) &devTmpEnds, blocks * sizeof(int));
    }
    if (err != cudaSuccess) {
        // Do not launch anything on unallocated buffers.
        fprintf(stderr, "prefixsum: cudaMalloc failed: %s\n", cudaGetErrorString(err));
        cudaFree(devEnds);      // null-safe
        cudaFree(devTmpEnds);   // null-safe
        return;
    }

    int subblocks = (blocks + BLOCK_SIZE - 1) / BLOCK_SIZE;

    //dim3 subgrid(subblocks, 1, 1);
    __modify_Grid(subblocks, 1);
    //dim3 subblock(BLOCK_SIZE, 1, 1);
    __modify_Block(BLOCK_SIZE, 1);

    __begin_GPU();
    gathersumends(out, devEnds);
    __end_GPU();

    // One entry per block of this level.
    prefixsum(devEnds, devTmpEnds, blocks);

    cudaFree(devEnds);

    // NOTE(review): if __modify_Grid/__modify_Block set persistent launch
    // state, the launch below runs with the configuration last set for the
    // sub-grid (and again inside the recursive call), not the `blocks`-sized
    // grid from the top of this function. Confirm this is intended.
    __begin_GPU();
    correctsumends(devTmpEnds, in, out);
    __end_GPU();

    cudaFree(devTmpEnds);
}
/**
 * Combined query: returns rank(p) and, through `select`, the value of
 * prefixsum() evaluated at that rank.
 *
 * @param p      position passed to rank()
 * @param select output; receives prefixsum(rank(p))
 * @return       rank(p)
 */
SDArrayBAux::ValueTp SDArrayBAux::rank2(ValueTp p, ValueTp &select) const {
    uint64_t rankOfP = rank(p);

    // TODO: optimize this
    select = prefixsum(rankOfP);

    return rankOfP;
}
void gather(const Vertex rootVertex, const Vertex srcVertex, const T_Send sendData, T_Recv& recvData, const bool reorder){ typedef typename T_Send::value_type SendValueType; typedef typename T_Recv::value_type RecvValueType; static std::vector<SendValueType> gather; static T_Recv* rootRecvData = NULL; static bool peerHostsRootVertex = false; static unsigned nGatherCalls = 0; nGatherCalls++; VAddr rootVAddr = locateVertex(graph, rootVertex); Context context = getGraphContext(graph); // Insert data of srcVertex to the end of the gather vector gather.insert(gather.end(), sendData.begin(), sendData.end()); // Store recv pointer of rootVertex if(srcVertex.id == rootVertex.id){ rootRecvData = &recvData; peerHostsRootVertex = true; } if(nGatherCalls == hostedVertices.size()){ std::vector<unsigned> recvCount; std::vector<unsigned> prefixsum(context.size(),0); if(peerHostsRootVertex){ cal.gatherVar(rootVAddr, context, gather, *rootRecvData, recvCount); // TODO // std::partial_sum might do the job unsigned sum = 0; for(unsigned count_i = 0; count_i < recvCount.size(); ++count_i){ prefixsum[count_i] = sum; sum += recvCount[count_i]; } // Reordering code if(reorder){ std::vector<RecvValueType> recvDataReordered(recvData.size()); for(unsigned vAddr = 0; vAddr < context.size(); vAddr++){ std::vector<Vertex> hostedVertices = getHostedVertices(graph, vAddr); unsigned nElementsPerVertex = recvCount.at(vAddr) / hostedVertices.size(); unsigned hVertex_i=0; for(Vertex v: hostedVertices){ std::copy(rootRecvData->begin()+(prefixsum[vAddr] + (hVertex_i * nElementsPerVertex)), rootRecvData->begin()+(prefixsum[vAddr] + (hVertex_i * nElementsPerVertex)) + (nElementsPerVertex), recvDataReordered.begin()+(v.id*nElementsPerVertex)); hVertex_i++; } } std::copy(recvDataReordered.begin(), recvDataReordered.end(), rootRecvData->begin()); } } else { cal.gatherVar(rootVAddr, context, gather, recvData, recvCount); } gather.clear(); nGatherCalls = 0; } }