void Dispatcher::dispatchRequest( unsigned int uCommand,PushFramework::LogicalConnection* pClient,IncomingPacket& packet,unsigned int serviceBytes ) { //StopWatch dispatchWatch(m_QPFrequency); serviceMapT::iterator it = serviceMap.find(uCommand); if (it == serviceMap.end()) return; // Service* pHandler = it->second->pService; //wcout << L"Locating Service : " << dispatchWatch.GetElapsedTime(false) << std::endl; //Mark dispatched service : setCurrentService(it->second->serviceName); StopWatch watch; pHandler->handle(pClient, &packet); double duration = watch.GetElapsedTime(); /* wcout << L"Service Time : " << watch.GetElapsedTime() << std::endl; */ //StopWatch statsClock(m_QPFrequency); stats.addToDistribution(ServerStats::PerformanceProcessingTimePerService, it->second->serviceName, duration); //wcout << L"Stat 1 : " << statsClock.GetElapsedTime(false) << std::endl; stats.addToDuration(ServerStats::PerformanceProcessingTime, duration); //wcout << L"Stat 2 : " << statsClock.GetElapsedTime(false) << std::endl; UnsetCurrentService(); //Stats. : stats.addToDistribution(ServerStats::BandwidthInboundVolPerRequest, it->second->serviceName, serviceBytes); //wcout << L"Stat 3 : " << statsClock.GetElapsedTime(false) << std::endl; stats.addToDistribution(ServerStats::PerformanceRequestVolPerRequest, it->second->serviceName, 1); //wcout << L"Stat 4 : " << statsClock.GetElapsedTime(false) << std::endl; //wcout << L"Dispatch Time : " << dispatchWatch.GetElapsedTime() << std::endl; }
void clppSort_RadixSort::sort() { // Satish et al. empirically set b = 4. The size of a work-group is in hundreds of // work-items, depending on the concrete device and each work-item processes more than one // stream element, usually 4, in order to hide latencies. StopWatch sw; cl_int clStatus; unsigned int numBlocks = roundUpDiv(_datasetSize, _workgroupSize * 4); unsigned int Ndiv4 = roundUpDiv(_datasetSize, 4); size_t global[1] = {toMultipleOf(Ndiv4, _workgroupSize)}; size_t local[1] = {_workgroupSize}; cl_mem* dataA = &_clBuffer_dataSet; cl_mem* dataB = &_clBuffer_dataSetOut; for(unsigned int bitOffset = 0; bitOffset < _bits; bitOffset += 4) { // 1) Each workgroup sorts its tile by using local memory // 2) Create an histogram of d=2^b digits entries #ifdef BENCHMARK sw.StartTimer(); #endif radixLocal(global, local, dataA, bitOffset); #ifdef BENCHMARK sw.StopTimer(); cout << "Local sort " << sw.GetElapsedTime() << endl; sw.StartTimer(); #endif localHistogram(global, local, dataA, &_clBuffer_radixHist1, &_clBuffer_radixHist2, bitOffset); #ifdef BENCHMARK sw.StopTimer(); cout << "Local histogram " << sw.GetElapsedTime() << endl; //********** //clEnqueueReadBuffer(_context->clQueue, dataA, CL_TRUE, 0, sizeof(int) * _datasetSize, _dataSetOut, 0, NULL, NULL); //********** // 3) Scan the p*2^b = p*(16) entry histogram table. Stored in column-major order, computes global digit offsets. sw.StartTimer(); #endif _scan->pushCLDatas(_clBuffer_radixHist1, 16 * numBlocks); _scan->scan(); #ifdef BENCHMARK _scan->waitCompletion(); sw.StopTimer(); cout << "Global scan " << sw.GetElapsedTime() << endl; // 4) Prefix sum results are used to scatter each work-group's elements to their correct position. sw.StartTimer(); #endif radixPermute(global, local, dataA, dataB, &_clBuffer_radixHist1, &_clBuffer_radixHist2, bitOffset, numBlocks); #ifdef BENCHMARK sw.StopTimer(); cout << "Global reorder " << sw.GetElapsedTime() << endl; #endif std::swap(dataA, dataB); } }