/************************************************************************************ 执行常规的检索 1. 首先重置渲染画布; 2. 遍历图像库中的每个图像,并将其与目标图像进行特征向量的匹配; 3. 在检索结果列表中,根据相似程序进行排序 这其中为了统计效率,在检索开始前放置计时器,并开始计时;当检索完成之后停止计时器,得到相应的耗时 ************************************************************************************/ double CCBIRView::PerformNormRetrieval() { if (mpDstImage) { CPerfCounter timeCounter; timeCounter.Start(); // 将绘制面板重置 mCanvas.Reset(); // 将目标图片与数据库列表中的图片进行一一匹配 for (unsigned int i = 0 ; i < mImageList.GetListSize() ; ++i) { mImageList.GetItemPtr(i)->Match(mpDstImage); } // 清空排序列表 mSortedListNormal.EmptyList(); // 对匹配结果按Distance由小到大进行排序(这里使用比较暴力的N*N方法,可以凸显蜂群算法的速度提升^-^) double minDistance; int minImageIndex; for (unsigned int i = 0 ; i < mImageList.GetListSize() ; ++i) { minDistance = 999999.0f; minImageIndex = 0; // 遍历以搜索当前匹配度最高(距离最小)的图片,并对其进行标记以避免重复搜索 for (unsigned int j = 0 ; j < mImageList.GetListSize() ; ++j) { if(mImageList.GetItemPtr(j)->IsFlagged() == false) { if (mImageList.GetItemPtr(j)->GetDistance() < minDistance) { minImageIndex = j; minDistance = mImageList.GetItemPtr(j)->GetDistance(); } } } // 将当前最相近的图片加入到列表中,并修改其标记值(该值会有每次Match操作时重置) Image* pImage = mImageList.GetItemPtr(minImageIndex); mSortedListNormal.AddItem(pImage); mImageList.GetItemPtr(minImageIndex)->SetFlagged(); TRACE("Distance: %f\n" , minDistance); } InvalidateRect(NULL , FALSE); timeCounter.Stop(); return timeCounter.GetElapsedTime(); } return 0.0; }
/*
 * Time nKLaunches back-to-back launches of |kernel| on |queue| and log the
 * average launch time and achieved bandwidth.
 *
 * Fixes over the original:
 *  - |ev| was overwritten on every launch, leaking (nKLaunches - 1) event
 *    objects, and the final event was never released either;
 *  - buffer kernel arguments are passed with sizeof(cl_mem) rather than
 *    sizeof(void *), per the clSetKernelArg contract.
 */
void timedKernel( cl_command_queue queue, cl_kernel kernel,
                  cl_mem bufSrc, cl_mem bufDst, unsigned char v )
{
    cl_int ret;
    cl_event ev = 0;
    CPerfCounter t;

    size_t global_work_offset[2] = { imageOrigin[0], imageOrigin[1] };
    size_t global_work_size[2]   = { nThreadsX, nThreadsY };
    size_t local_work_size[2]    = { nLocalThreadsX, nLocalThreadsY };

    // Replicate the fill byte into every channel of a cl_uint fill word.
    cl_uint val = 0;
    for(int i = 0; i < nBytesPerChannel; i++)
        val |= v << (i * 8);

    ret  = clSetKernelArg( kernel, 0, sizeof(cl_mem),  (void *) &bufSrc );
    ret |= clSetKernelArg( kernel, 1, sizeof(cl_mem),  (void *) &bufDst );
    ret |= clSetKernelArg( kernel, 2, sizeof(int),     (void *) &nPixelsPerThread );
    ret |= clSetKernelArg( kernel, 3, sizeof(cl_uint), (void *) &val );
    ret |= clSetKernelArg( kernel, 4, sizeof(cl_uint), (void *) &nKLoops );
    ASSERT_CL_RETURN( ret );

    t.Reset();
    t.Start();

    for(int i = 0; i < nKLaunches; i++)
    {
        // Release the event from the previous launch so only the last one
        // stays alive for the completion spin below.
        if( ev != 0 )
        {
            clReleaseEvent( ev );
            ev = 0;
        }
        ret = clEnqueueNDRangeKernel( queue, kernel, 2,
                                      global_work_offset,
                                      global_work_size,
                                      local_work_size,
                                      0, NULL, &ev );
        ASSERT_CL_RETURN( ret );
    }

    clFlush( queue );
    // In-order queue: completion of the last event implies all launches done.
    spinForEventsComplete( 1, &ev );
    t.Stop();
    if( ev != 0 )
        clReleaseEvent( ev );

    tlog->Timer( "%32s %lf s %8.2lf GB/s\n", "clEnqueueNDRangeKernel():",
                 t.GetElapsedTime() / nKLaunches, nBytesRegion, nKLoops );
}
void benchBarrier() { CPerfCounter t; t.Reset(); int nl=100000; t.Start(); for(int n = 0; n < nl; n++) { empty_MT(); } t.Stop(); std::cout << std::setw(21) << std::left << "Barrier speed" << std::setw(7) << t.GetElapsedTime() / nl * 1e9 << " ns\n"; }
// Measure the average cost of launching and shutting down the worker
// thread pool, and report it in nanoseconds.
//
// Defect fixed: the original streamed nWorkers directly after the elapsed
// time with no separator (`<< time << nWorkers << " ns"`), so the two
// numbers were concatenated into one garbled figure. The worker count is
// now printed as a clearly labeled suffix, and the columns are aligned
// the same way as the other bench* reports.
void benchLaunch()
{
    CPerfCounter t;
    t.Reset();

    const int nl = 100;

    t.Start();
    for(int n = 0; n < nl; n++)
    {
        launchThreads();
        shutdownThreads();
    }
    t.Stop();

    std::cout << std::setw(21) << std::left << "Launch speed"
              << std::setw(7) << t.GetElapsedTime() / nl * 1e9
              << " ns (" << nWorkers << " workers)\n";
}
// Touch one word in every |stride|-byte region of |ptr| (typically one word
// per VM page) to force page faults over |nbytes| of memory, and report the
// average fault cost in nanoseconds.
//
// Defect fixed: the `register` storage class is deprecated since C++11 and
// ill-formed in C++17; it was a no-op hint and has been removed.
void stridePagesCPU( void *ptr, size_t stride, size_t nbytes )
{
    unsigned int *p = ( unsigned int * ) ptr;
    size_t i;
    CPerfCounter t;
    double kTime;

    t.Reset();
    t.Start();
    // Write one word per stride; indices are in units of unsigned int.
    for(i = 0; i < nbytes/sizeof(unsigned int); i += stride/sizeof(unsigned int))
        p[i] = 0;
    t.Stop();

    kTime = t.GetElapsedTime();

    std::cout << std::setw(21) << std::left << "Page fault"
              << std::setw(7) << (kTime*1e9) / ((double) nbytes/stride)
              << " ns" << std::endl;
}
/************************************************************************************ 执行蜂群算法的检索 1. 首先重置渲染画布; 2. 使用蜂群检索算法对象(BeeColonyAlgo)来执行检索操作 其中的检索结果排序等相关的操作均由蜂群算法对象来维护。这其中为了统计效率,在检索开始前放置计时器, 并开始计时;当检索完成之后停止计时器,得到相应的耗时 ************************************************************************************/ double CCBIRView::PerformBeeRetrieval() { if (mpDstImage) { CPerfCounter timeCounter; timeCounter.Start(); // 将绘制面板重置 mCanvas.Reset(); mBeeColony.PerformRetrieval(mpDstImage , mImageList , mSortedListBeeColony); InvalidateRect(NULL , FALSE); timeCounter.Stop(); return timeCounter.GetElapsedTime(); } return 0.0; }
int main( int argc, char *argv[]) { cl_int err; int ret = 0; CPerfCounter timer; timer.Start(); #ifdef WIN32 if (SetConsoleCtrlHandler( (PHANDLER_ROUTINE)ConsoleHandler,TRUE)==FALSE) { // unable to install handler... // display message to the user printf("Unable to install handler!\n"); return -1; } #else struct sigaction sigIntHandler; sigIntHandler.sa_handler = my_handler; sigemptyset(&sigIntHandler.sa_mask); sigIntHandler.sa_flags = 0; sigaction(SIGINT, &sigIntHandler, NULL); #endif bool deviceAll = true; std::vector<unsigned int> deviceNum; size_t NBRuns = 5; while (--argc) { ++argv; if (!strncmp(*argv, "-m", 2)) { ++argv; --argc; int mode = strtoul((*argv), NULL, 0); cout << "mode = "<<mode<<endl; if (mode==0) NBRuns = 31; else if (mode == 1) NBRuns = 310; else if (mode == 2) NBRuns = 20000; else Usage(); } else if (!strncmp(*argv, "-iDD", 4)) { ++argv; --argc; char *p = *argv; while (p) { deviceAll = false; int device = 0; sscanf(p, "%d",&device); cout << "device = "<<device<<endl; deviceNum.push_back(device); p = strchr(p, ','); if (p != NULL) ++p; } } else { Usage(); } } int MainCL; float MaxGlobalMemory = 0.0f; InitCL(deviceNum, deviceAll, ctx, MainCL, MaxGlobalMemory); /* Create and build program */ std::string src; //sgemm1 //std::string srcXgemm; // //srcXgemm = get_file_contents("HealthMonitorKernels.cl"); const char * C_KernelString = KernelString.c_str(); std::size_t C_KernelString_size[] = { strlen(C_KernelString) }; program = clCreateProgramWithSource(ctx, 1, &C_KernelString, C_KernelString_size, &err); check_err(err, "clCreateProgramWithSource", &ctx); err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err!=CL_SUCCESS) { Print_BuildLog(program); check_err(err, "clCreateProgramWithSource", &ctx); } int NBElem = 6400000; //MaxGlobalMemory = 15.0f; int NBBuffers = (int)(MaxGlobalMemory / (double)(NBElem*2*sizeof(float)/(1024.0*1024.0*1024.0))); float totalMemory = NBElem * 2 * sizeof(float) / (1024.0f*1024.0f*1024.0f) * NBBuffers; if 
(totalMemory != 0) cout << "we will work with " << NBBuffers << " buffers. This represents " << totalMemory << " GB of data" << endl << endl; else { cout << "OpenCL reports 0 Bytes of memory available for one of the GPU. Please check if one of the GPU is damaged using clinfo and reboot the system" << endl; return -1; } createThread(ctx, program, NBRuns, NBElem, NBBuffers); timer.Stop(); double time = timer.GetElapsedTime(); cout << "OpenCL setup and thread creation took " << time << "s" << endl<<endl; for (unsigned int i=0;i<g_threads.size();i++) { g_threads[i].join(); } cout << endl; cleanup(); return ret; }
// Per-GPU worker thread body: allocates and randomizes the host test data,
// uploads it to the GPU, then repeatedly runs the health-check kernel until
// the requested number of runs completes (or a terminate request arrives).
//
// The return type is switched by the preprocessor: DWORD WINAPI for the
// Win32 thread API, plain void otherwise (the opening #ifdef is above this
// excerpt).
DWORD WINAPI
#else
void
#endif
TestProcedure(thread_data* data)
{
    HealthMonitorData testdata(data->NBBuffers, data->ctx, data->GPUID);

    CPerfCounter timer;
    timer.Start();

    // Host-side source vector filled with random floats in [0, 1].
    vector<float> A;
    A.resize(data->NBElem);

    int error = 0;

#pragma omp parallel for
    for (unsigned int i=0; i<A.size(); ++i)
        A[i] = (float)rand()/RAND_MAX;

    error = testdata.AllocGPUBuffer( data->queue, data->NBElem);
    if (error)
    {
        THREAD_EXIT;
    }

    timer.Stop();
    double time = timer.GetElapsedTime();
    cout << "It took " << time << " s to allocate initial memory on CPU, fill it with random data and to allocate memory on GPU " << data->GPUID<< endl;
    cout << "We will start now to run the test on this GPU" << endl;

    // NBRuns == 0 means "run until told to stop": CurrentRun wraps to
    // SIZE_MAX via the -1 and is then never decremented (see the != -1
    // guard below), so only NeedToTerminateThread ends the loop.
    size_t CurrentRun = (int)data->NBRuns == 0 ? -1 : data->NBRuns;
    int i = 0;

    timer.Reset();
    timer.Start();
    while ((data->NBRuns == 0 || CurrentRun != 0))
    {
        error = testdata.TransferGPUBuffer(A, data->ComputeB);
        if (error)
        {
            THREAD_EXIT;
        }
        // Cooperative shutdown requested by the console/signal handler.
        if (NeedToTerminateThread)
        {
            THREAD_EXIT;
        }
        double time = testdata.run(data->InverseKernel, data->Compare, error);
        if (error)
        {
            THREAD_EXIT;
        }
        // One CSV row per run: run index, achieved GFLOPS.
        data->outputfile << i<< "," << gflops(data->NBElem, time) << std::endl;
        i++;
        if (CurrentRun != -1)
            CurrentRun--;
    }
    timer.Stop();
    cout << endl;
    time = timer.GetElapsedTime();
    cout << "It took " << time << " s to run the GPU test on GPU " << data->GPUID << endl;

    // On a detected mismatch, dump the host reference data for offline analysis.
    if (!testdata.WasRunOK())
    {
        ofstream Adata;
        Adata.open("Adata.bin", ofstream::out | ofstream::binary | ofstream::trunc);
        Adata.write((char*)&A[0], data->NBElem*sizeof(A[0]));
        Adata.close();
    }

    THREAD_EXIT;
}
void assessHostMemPerf( void *ptr, void *ptr2, size_t nbytes ) { CPerfCounter t; std::cout << "Host baseline (naive):\n\n"; double sum = 0.; int ctr = 0; for(int i = 0; i < 1e6; i++) { t.Reset(); t.Start(); t.Stop(); double e = t.GetElapsedTime(); if( e > 0. ) { sum += e; ctr++; } } std::cout << std::setiosflags(std::ios::fixed) << std::setprecision(2); std::cout << std::setw(21) << std::left << "Timer resolution" << std::setw(7) << ( sum / (double) ctr ) * 1e9 << " ns\n"; #ifdef _WIN32 //Sleep( 1000 ); #else usleep( 1000 * 1e3 ); #endif size_t pagesize; #ifdef _WIN32 SYSTEM_INFO system_info; GetSystemInfo (&system_info); pagesize = (size_t) system_info.dwPageSize; #else pagesize = getpagesize(); #endif stridePagesCPU( ptr, pagesize, nbytes ); #ifdef MEM_MULTICORE benchBarrier(); std::cout << "\n"; #endif #if 0 TIMED_LOOP( "SSE read", readVerifyMemSSE( ptr, 0, nbytes ), nbytes ) TIMED_LOOP( "SSE write", writeMemSSE( ptr, 0, nbytes ), nbytes ) TIMED_LOOP( "CPU write", writeMemCPU( ptr, 0, nbytes ), nbytes ) #endif TIMED_LOOP( "CPU read", readVerifyMemCPU_MT( ptr, 0, nbytes ), nbytes ) TIMED_LOOP( "memcpy()", memcpy_MT( ptr, ptr2, nbytes ), nbytes ) TIMED_LOOP( "memset(,1,)", memset_MT( ptr, 1, nbytes ), nbytes ) TIMED_LOOP( "memset(,0,)", memset_MT( ptr, 0, nbytes ), nbytes ) std::cout << "\n"; }
// Times a pipelined Write[/Kernel]/Read streaming pass over the mapped host
// buffer Data_s, spread across 1..MaxQueues command queues, and prints the
// achieved bandwidth in GB/s.
// test_ encodes the configuration: (test_ % MaxQueues) + 1 queues are used,
// and test_ / MaxQueues > 0 inserts a kernel between the write and the read.
void OCLPerfDoubleDMA::run()
{
    if (failed_) {
        return;
    }
    CPerfCounter timer;
    const int numQueues = (test_ % MaxQueues) + 1;
    const bool useKernel = ((test_ / MaxQueues) > 0);
    const int numBufs = numQueues;

    Profile profile(isProfilingEnabled_, numQueues);

    std::vector<cl_command_queue> cmdQueues(numQueues);
    int q;
    cl_command_queue_properties qProp =
        (isProfilingEnabled_) ? CL_QUEUE_PROFILING_ENABLE : 0;
    // One command queue per chunk buffer.
    for (q = 0; q < numQueues; ++q) {
        cl_command_queue cmdQueue = clCreateCommandQueue(
            context_, devices_[deviceId_], qProp, &error_);
        CHECK_RESULT((error_), "clCreateCommandQueue() failed");
        cmdQueues[q] = cmdQueue;
    }

    // Map the big staging buffer (buffers_[numBufs]) into host memory.
    float *Data_s = (float*)clEnqueueMapBuffer(cmdQueues[0], buffers_[numBufs],
        CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, size_S, 0, NULL, NULL, &error_);
    CHECK_RESULT((error_), "clEnqueueMapBuffer failed");
    memset(Data_s, 1, size_S);

    size_t gws[1] = { size_s / (4 * sizeof(float)) };
    size_t lws[1] = { 256 };

    // Warm-up
    for (q = 0; q < numQueues; ++q) {
        error_ |= clEnqueueWriteBuffer(cmdQueues[q], buffers_[q], CL_FALSE,
            0, size_s, (char*)Data_s, 0, NULL, NULL);
        error_ |= clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void*) &buffers_[q]);
        error_ |= clEnqueueNDRangeKernel(cmdQueues[q], kernel_, 1, NULL,
            gws, lws, 0, NULL, NULL);
        error_ |= clEnqueueReadBuffer(cmdQueues[q], buffers_[q], CL_FALSE,
            0, size_s, (char*)Data_s, 0, NULL, NULL);
        error_ |= clFinish(cmdQueues[q]);
    }

    size_t s_done = 0;
    // Per-buffer events for the most recent read (r), write (w), kernel (x).
    cl_event r[MaxQueues] = {0}, w[MaxQueues] = {0}, x[MaxQueues] = {0};

    /*---------- pass2: copy Data_s to and from GPU Buffers ----------*/
    s_done = 0;
    timer.Reset();
    timer.Start();

    int idx = numBufs - 1; // Start from the last so read/write won't go to the same DMA when kernel is executed
    q = numQueues - 1;
    size_t iter = 0;
    while( 1 ) {
        if (0 == r[idx]) {
            // First use of this buffer: no prior read to wait on.
            error_ |= clEnqueueWriteBuffer(cmdQueues[q], buffers_[idx], CL_FALSE,
                0, size_s, (char*)Data_s+s_done, 0, NULL, &w[idx]);
        }
        else {
            // Wait for the previous read of this buffer before overwriting it.
            error_ |= clEnqueueWriteBuffer(cmdQueues[q], buffers_[idx], CL_FALSE,
                0, size_s, (char*)Data_s+s_done, 1, &r[idx], &w[idx]);
            if (!isProfilingEnabled_) {
                error_ |= clReleaseEvent(r[idx]);
            }
        }
        profile.addEvent(q, ProfileQueue::Write, w[idx]);

        if (useKernel) {
            // Change the queue
            ++q %= numQueues;
            // Implicit flush of DMA engine on kernel start, because memory dependency
            error_ |= clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void*) &buffers_[idx]);
            error_ |= clEnqueueNDRangeKernel(cmdQueues[q], kernel_, 1, NULL,
                gws, lws, 1, &w[idx], &x[idx]);
            if (!isProfilingEnabled_) {
                error_ |= clReleaseEvent(w[idx]);
            }
            profile.addEvent(q, ProfileQueue::Execute, x[idx]);
        }

        // Change the queue
        ++q %= numQueues;
        // Read back, gated on the kernel (if any) or otherwise on the write.
        error_ |= clEnqueueReadBuffer(cmdQueues[q], buffers_[idx], CL_FALSE,
            0, size_s, (char*)Data_s+s_done, 1,
            (useKernel) ? &x[idx] : &w[idx], &r[idx]);
        if (!isProfilingEnabled_) {
            error_ |= clReleaseEvent((useKernel) ? x[idx] : w[idx]);
        }
        profile.addEvent(q, ProfileQueue::Read, r[idx]);

        // Advance through Data_s one size_s chunk at a time until exhausted.
        if ((s_done += size_s) >= size_S) {
            if (!isProfilingEnabled_) {
                error_ |= clReleaseEvent(r[idx]);
            }
            break;
        }
        ++iter;
        ++idx %= numBufs;
        ++q %= numQueues;
    }

    for (q = 0; q < numQueues; ++q) {
        error_ |= clFinish(cmdQueues[q]);
    }
    timer.Stop();

    error_ = clEnqueueUnmapMemObject(cmdQueues[0], buffers_[numBufs], Data_s, 0, NULL, NULL);
    error_ |= clFinish(cmdQueues[0]);
    CHECK_RESULT((error_), "Execution failed");

    cl_long gpuTimeFrame = profile.findExecTime();
    cl_long oneIter = gpuTimeFrame / iter;
    // Display 4 iterations in the middle
    cl_long startFrame = oneIter * (iter/2 - 2);
    cl_long finishFrame = oneIter * (iter/2 + 2);
    profile.display(startFrame, finishFrame);

    for (q = 0; q < numQueues; ++q) {
        error_ = clReleaseCommandQueue(cmdQueues[q]);
        CHECK_RESULT((error_), "clReleaseCommandQueue() failed");
    }

    // Total traffic: each byte of Data_s is both written to and read from the GPU.
    double GBytes = (double)(2*size_S)/(double)(1024*1024*1024);
    std::stringstream stream;
    if (useKernel) {
        stream << "Write/Kernel/Read operation ";
    }
    else {
        stream << "Write/Read operation ";
    }
    stream << numQueues << " queue; profiling "
           << ((isProfilingEnabled_) ? "enabled" : "disabled");
    stream << ((useUHP_) ? " using UHP" : " using AHP") << ": ";
    stream.flags(std::ios::right | std::ios::showbase);
    std::cout << stream.str()
              << static_cast<float>(GBytes / timer.GetElapsedTime()) << " GB/s\n";
}
// Reads a single performance-counter value from a perf-data snapshot.
//
// objectName   - perf object to snapshot.
// instanceName - instance to match (case-insensitive); an empty string means
//                "use the first instance that has the counter".
// counterName  - counter to read within the matched instance.
//
// Returns the counter value widened to ULONGLONG, or 0 if the object,
// instance or counter was not found.
//
// NOTE(review): instance/counter objects appear to be heap-allocated by the
// accessors and are deleted manually on every exit path below — confirm the
// CPerfObject* API ownership contract before restructuring.
ULONGLONG GetPerfData(LPCWSTR objectName, LPCWSTR instanceName, LPCWSTR counterName)
{
    BYTE data[256];
    WCHAR name[256];
    ULONGLONG value = 0;

    CPerfSnapshot snapshot(&g_TitleCounter);
    CPerfObjectList objList(&snapshot, &g_TitleCounter);

    if (snapshot.TakeSnapshot(objectName))
    {
        CPerfObject* pPerfObj = objList.GetPerfObject(objectName);
        if (pPerfObj)
        {
            // Walk the object's instances looking for the requested one.
            for (CPerfObjectInstance* pObjInst = pPerfObj->GetFirstObjectInstance();
                 pObjInst != nullptr;
                 pObjInst = pPerfObj->GetNextObjectInstance())
            {
                if (*instanceName)
                {
                    // Skip instances whose name can't be read or doesn't match.
                    if (pObjInst->GetObjectInstanceName(name, 256))
                    {
                        if (_wcsicmp(instanceName, name) != 0)
                        {
                            delete pObjInst;
                            continue;
                        }
                    }
                    else
                    {
                        delete pObjInst;
                        continue;
                    }
                }
                CPerfCounter* pPerfCntr = pObjInst->GetCounterByName(counterName);
                if (pPerfCntr != nullptr)
                {
                    pPerfCntr->GetData(data, 256, nullptr);
                    // Widen the raw counter bytes according to the counter size.
                    if (pPerfCntr->GetSize() == 1)
                    {
                        value = *(BYTE*)data;
                    }
                    else if (pPerfCntr->GetSize() == 2)
                    {
                        value = *(WORD*)data;
                    }
                    else if (pPerfCntr->GetSize() == 4)
                    {
                        value = *(DWORD*)data;
                    }
                    else if (pPerfCntr->GetSize() == 8)
                    {
                        value = *(ULONGLONG*)data;
                    }
                    delete pPerfCntr;
                    delete pObjInst;
                    break; // No need to continue
                }
                delete pObjInst;
            }
            delete pPerfObj;
        }
    }
    return value;
}