/************************************************************************************
执行常规的检索
1. 首先重置渲染画布;
2. 遍历图像库中的每个图像,并将其与目标图像进行特征向量的匹配;
3. 在检索结果列表中,根据相似程序进行排序
这其中为了统计效率,在检索开始前放置计时器,并开始计时;当检索完成之后停止计时器,得到相应的耗时
************************************************************************************/
double CCBIRView::PerformNormRetrieval()
{
	if (mpDstImage)
	{
		CPerfCounter timeCounter;
		timeCounter.Start();

		// 将绘制面板重置
		mCanvas.Reset();

		// 将目标图片与数据库列表中的图片进行一一匹配
		for (unsigned int i = 0 ; i < mImageList.GetListSize() ; ++i)
		{
			mImageList.GetItemPtr(i)->Match(mpDstImage);
		}

		// 清空排序列表
		mSortedListNormal.EmptyList();

		// 对匹配结果按Distance由小到大进行排序(这里使用比较暴力的N*N方法,可以凸显蜂群算法的速度提升^-^)
		double minDistance;
		int minImageIndex;
		for (unsigned int i = 0 ; i < mImageList.GetListSize() ; ++i)
		{
			minDistance = 999999.0f;
			minImageIndex = 0;

			// 遍历以搜索当前匹配度最高(距离最小)的图片,并对其进行标记以避免重复搜索
			for (unsigned int j = 0 ; j < mImageList.GetListSize() ; ++j)
			{
				if(mImageList.GetItemPtr(j)->IsFlagged() == false)
				{
					if (mImageList.GetItemPtr(j)->GetDistance() < minDistance)
					{
						minImageIndex = j;
						minDistance = mImageList.GetItemPtr(j)->GetDistance();
					}
				}
			}

			// 将当前最相近的图片加入到列表中,并修改其标记值(该值会有每次Match操作时重置)
			Image* pImage = mImageList.GetItemPtr(minImageIndex);
			mSortedListNormal.AddItem(pImage);
			mImageList.GetItemPtr(minImageIndex)->SetFlagged();

			TRACE("Distance: %f\n" , minDistance);
		}

		InvalidateRect(NULL , FALSE);

		timeCounter.Stop();

		return timeCounter.GetElapsedTime();
	}

	return 0.0;
}
// Launches `kernel` nKLaunches times on `queue` over a 2-D range and logs the
// average wall-clock time per launch through tlog->Timer().
//
//   queue   command queue the launches are enqueued on
//   kernel  kernel to run; arguments 0..4 are (re)bound here
//   bufSrc  bound as kernel argument 0
//   bufDst  bound as kernel argument 1
//   v       byte value replicated into the low nBytesPerChannel bytes of the
//           cl_uint passed as kernel argument 3
//
// Only the event of the last launch is waited on; `ev` is overwritten by every
// enqueue. NOTE(review): whether spinForEventsComplete() releases the event is
// not visible here - confirm that earlier event handles are not leaked.
void timedKernel( cl_command_queue queue,
                  cl_kernel        kernel,
                  cl_mem           bufSrc,
                  cl_mem           bufDst,
                  unsigned char    v )
{
     cl_int       ret;
     cl_event     ev = 0;
     CPerfCounter t;

     size_t global_work_offset[2] = { imageOrigin[0], imageOrigin[1] };
     size_t global_work_size[2] =   { nThreadsX, nThreadsY };
     size_t local_work_size[2] =    { nLocalThreadsX, nLocalThreadsY };

     // Replicate the byte into each channel byte. The value is widened to
     // cl_uint before shifting: `v` would otherwise be promoted to a signed
     // int, and shifting a value >= 0x80 left by 24 overflows it.
     cl_uint val = 0;

     for(int i=0; i < nBytesPerChannel; i++)
        val |= static_cast<cl_uint>( v ) << (i*8);

     // Errors are OR-ed together and checked once after all arguments are set.
     ret  = clSetKernelArg( kernel, 0, sizeof(cl_mem),  (void *) &bufSrc );
     ret |= clSetKernelArg( kernel, 1, sizeof(cl_mem),  (void *) &bufDst );
     ret |= clSetKernelArg( kernel, 2, sizeof(int),     (void *) &nPixelsPerThread );
     ret |= clSetKernelArg( kernel, 3, sizeof(cl_uint), (void *) &val );
     ret |= clSetKernelArg( kernel, 4, sizeof(cl_uint), (void *) &nKLoops );

     ASSERT_CL_RETURN( ret );

     t.Reset();
     t.Start();

     for(int i=0; i < nKLaunches; i++)
     {
        ret = clEnqueueNDRangeKernel( queue,
                                      kernel,
                                      2,
                                      global_work_offset,
                                      global_work_size,
                                      local_work_size,
                                      0, NULL, &ev );
        ASSERT_CL_RETURN( ret );
     }

     // Submit the queued work, then wait for the last launch to complete.
     clFlush( queue );
     spinForEventsComplete( 1, &ev );

     t.Stop();

     // Timer() receives the per-launch time plus the byte count and loop count;
     // presumably it derives the GB/s figure from them - TODO confirm.
     tlog->Timer( "%32s  %lf s   %8.2lf GB/s\n", "clEnqueueNDRangeKernel():",
                  t.GetElapsedTime() / nKLaunches, nBytesRegion, nKLoops );
}
Exemplo n.º 3
0
void benchBarrier()
{
   CPerfCounter t; 
   t.Reset();

   int nl=100000;

   t.Start();

   for(int n = 0; n < nl; n++)
   {
      empty_MT();
   }

   t.Stop();
   std::cout << std::setw(21) << std::left << "Barrier speed" << std::setw(7) << t.GetElapsedTime() / nl * 1e9 << " ns\n";
}
Exemplo n.º 4
0
void benchLaunch()
{
   CPerfCounter t; t.Reset();

   int nl = 100;

   t.Start();

   for(int n = 0; n < nl; n++)
   {
      launchThreads();
      shutdownThreads();
   }

   t.Stop();

   std::cout << "Launch speed" << "  " << t.GetElapsedTime() / nl * 1e9 << nWorkers << " ns\n";
}
Exemplo n.º 5
0
// Writes one zero word every `stride` bytes across the `nbytes`-byte region at
// `ptr`, times the sweep, and prints the average time per stride in
// nanoseconds under the label "Page fault".
//
//   ptr     start of the region; written through as unsigned int
//   stride  distance in bytes between touched words; values smaller than
//           sizeof(unsigned int) are treated as sizeof(unsigned int)
//   nbytes  size of the region in bytes
void stridePagesCPU( void *ptr, size_t stride, size_t nbytes )
{
    // `register` was dropped: the specifier was removed from the language in C++17.
    unsigned int *p = static_cast<unsigned int *>( ptr );

    // A stride below one word would give a step of zero and the loop would
    // never advance, so clamp it to one word.
    const size_t effectiveStride =
        ( stride < sizeof(unsigned int) ) ? sizeof(unsigned int) : stride;
    const size_t step      = effectiveStride / sizeof(unsigned int);
    const size_t wordCount = nbytes / sizeof(unsigned int);

    CPerfCounter t;

    t.Reset();
    t.Start();

    for( size_t i = 0; i < wordCount; i += step )
        p[i] = 0;

    t.Stop();
    const double kTime = t.GetElapsedTime();

    // Total time divided by the number of strides that fit in the region.
    std::cout << std::setw(21) << std::left << "Page fault" << std::setw(7) << (kTime*1e9) / ((double) nbytes/effectiveStride) << " ns" << std::endl;
}
/************************************************************************************
执行蜂群算法的检索
1. 首先重置渲染画布;
2. 使用蜂群检索算法对象(BeeColonyAlgo)来执行检索操作
其中的检索结果排序等相关的操作均由蜂群算法对象来维护。这其中为了统计效率,在检索开始前放置计时器,
并开始计时;当检索完成之后停止计时器,得到相应的耗时
************************************************************************************/
double CCBIRView::PerformBeeRetrieval()
{
	if (mpDstImage)
	{
		CPerfCounter timeCounter;
		timeCounter.Start();

		// 将绘制面板重置
		mCanvas.Reset();

		mBeeColony.PerformRetrieval(mpDstImage , mImageList , mSortedListBeeColony);

		InvalidateRect(NULL , FALSE);

		timeCounter.Stop();
		return timeCounter.GetElapsedTime();
	}

	return 0.0;
}
Exemplo n.º 7
0
// Program entry point: installs a Ctrl-C handler, parses the command line,
// initialises OpenCL, builds the kernel program, starts the worker threads via
// createThread() and waits for them to finish.
//
// Options (matched by prefix, as before):
//   -m <mode>     0, 1 or 2 selects NBRuns = 31, 310 or 20000; anything else
//                 calls Usage()
//   -iDD <list>   comma-separated device numbers; disables "all devices"
//
// Returns 0 on normal completion, -1 when the handler cannot be installed, when
// an option is missing its value, or when no device memory is reported.
int main( int argc, char *argv[])
{
  cl_int err;
  
  int ret = 0;
  CPerfCounter timer;
  timer.Start();
#ifdef WIN32
  if (SetConsoleCtrlHandler( (PHANDLER_ROUTINE)ConsoleHandler,TRUE)==FALSE)
  {
    // unable to install handler... 
    // display message to the user
    printf("Unable to install handler!\n");
    return -1;
  }
#else
   struct sigaction sigIntHandler;

   sigIntHandler.sa_handler = my_handler;
   sigemptyset(&sigIntHandler.sa_mask);
   sigIntHandler.sa_flags = 0;

   sigaction(SIGINT, &sigIntHandler, NULL);
#endif


  bool deviceAll = true;
  std::vector<unsigned int> deviceNum;

  size_t NBRuns = 5;

  // Walk the arguments by index so that an option given as the last argument
  // cannot read past the end of argv when it looks for its value.
  for (int argIdx = 1; argIdx < argc; ++argIdx)
  {
    const char *option = argv[argIdx];

    if (!strncmp(option, "-m", 2))
    {
      if (argIdx + 1 >= argc)
      {
        // "-m" without a value.
        Usage();
        return -1;
      }
      ++argIdx;
      int mode = strtoul(argv[argIdx], NULL, 0);
      cout << "mode = "<<mode<<endl;
      if (mode==0)
        NBRuns = 31;
      else if (mode == 1)
        NBRuns = 310;
      else if (mode == 2)
        NBRuns = 20000;
      else
        Usage();

    }
    else if (!strncmp(option, "-iDD", 4))
    {
      if (argIdx + 1 >= argc)
      {
        // "-iDD" without a device list.
        Usage();
        return -1;
      }
      ++argIdx;
      char *p = argv[argIdx];
      // Consume one number per iteration, then skip past the next comma.
      while (p) 
      {
        deviceAll = false;
        int device = 0;
        sscanf(p, "%d",&device);
        cout << "device = "<<device<<endl;
        deviceNum.push_back(device);
        p = strchr(p, ',');
        if (p != NULL)
          ++p;
      }
    }
    else
    {
        Usage();
    }

  }

  int MainCL;
  float MaxGlobalMemory = 0.0f;
  InitCL(deviceNum, deviceAll, ctx, MainCL, MaxGlobalMemory);


  /* Create and build program */

  //sgemm1
  //std::string srcXgemm;
  //
  //srcXgemm = get_file_contents("HealthMonitorKernels.cl");

  const char * C_KernelString = KernelString.c_str();
  std::size_t C_KernelString_size[] = { strlen(C_KernelString) };



  program = clCreateProgramWithSource(ctx, 1, &C_KernelString, C_KernelString_size, &err);
  check_err(err, "clCreateProgramWithSource", &ctx);


  err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

  if (err!=CL_SUCCESS)
  {
    Print_BuildLog(program);
    // Report the call that actually failed.
    check_err(err, "clBuildProgram", &ctx);
  }

  // Each buffer pair holds 2 * NBElem floats; NBBuffers is how many such pairs
  // fit in MaxGlobalMemory (both expressed in GB).
  int NBElem = 6400000;
  //MaxGlobalMemory = 15.0f;
  int NBBuffers = (int)(MaxGlobalMemory / (double)(NBElem*2*sizeof(float)/(1024.0*1024.0*1024.0)));
  float totalMemory = NBElem * 2 * sizeof(float) / (1024.0f*1024.0f*1024.0f) * NBBuffers;
  if (totalMemory != 0)
    cout << "we will work with " << NBBuffers << " buffers. This represents " << totalMemory << " GB of data" << endl << endl;
  else
  {
    cout << "OpenCL reports 0 Bytes of memory available for one of the GPU. Please check if one of the GPU is damaged using clinfo and reboot the system" << endl;
    return -1;
  }


  createThread(ctx, program, NBRuns, NBElem, NBBuffers);
  timer.Stop();

  double time = timer.GetElapsedTime();

  cout << "OpenCL setup and thread creation took " << time << "s" << endl<<endl;

  // Wait for every worker thread before cleaning up.
  for (unsigned int i=0;i<g_threads.size();i++)	  
  {
    g_threads[i].join();
  }

  cout << endl;
  cleanup();

  return ret;
}
Exemplo n.º 8
0
// Worker-thread entry point for one GPU.
// Fills a host vector with random floats, allocates the GPU buffers, then
// repeatedly transfers the data and runs the test, writing one
// "<iteration>,<gflops>" line per run to data->outputfile. If
// testdata.WasRunOK() reports a failure, the host vector is dumped to
// "Adata.bin". Every exit path goes through the THREAD_EXIT macro.
// NOTE(review): the #if/#ifdef that opens the conditional closed by the
// #else/#endif below is not visible in this excerpt.
DWORD WINAPI
#else
void
#endif
TestProcedure(thread_data* data)
{
  HealthMonitorData testdata(data->NBBuffers, data->ctx, data->GPUID);

  // First timed phase: host allocation, random fill and GPU allocation.
  CPerfCounter timer;
  timer.Start();
  vector<float> A;

  A.resize(data->NBElem);
  int error = 0;

  // NOTE(review): rand() is called from an OpenMP parallel loop; confirm that
  // the runtime's rand() is safe to call concurrently.
#pragma omp parallel for
  for (unsigned int i=0; i<A.size(); ++i)
    A[i] = (float)rand()/RAND_MAX;
  


  error = testdata.AllocGPUBuffer( data->queue, data->NBElem);
  if (error)
  {
    THREAD_EXIT;
  }

  timer.Stop();
  double time = timer.GetElapsedTime();

  cout << "It took " << time << " s to allocate initial memory on CPU, fill it with random data and to allocate memory on GPU " << data->GPUID<<  endl;
  cout << "We will start now to run the test on this GPU" << endl;


  // NBRuns == 0 means "no run limit": CurrentRun is set to -1 (converted to
  // size_t) and is then never decremented in the loop below.
  size_t CurrentRun = (int)data->NBRuns == 0 ? -1 : data->NBRuns;
  int i = 0;

  // Second timed phase: the test loop itself.
  timer.Reset();
  timer.Start();

  while ((data->NBRuns == 0 || CurrentRun != 0))
  {
    error = testdata.TransferGPUBuffer(A, data->ComputeB);
    if (error)
    {
      THREAD_EXIT;
    }

    // Checked once per iteration so that an external request can stop the loop.
    if (NeedToTerminateThread)
    {
      THREAD_EXIT;
    }
      

    // This inner `time` shadows the outer variable for the rest of the loop body.
    double time = testdata.run(data->InverseKernel, data->Compare, error);
    if (error)
    {
      THREAD_EXIT;
    }
  
    data->outputfile <<  i<< "," << gflops(data->NBElem, time) << std::endl;
    i++;
    if (CurrentRun != -1)
      CurrentRun--;
  }

  timer.Stop();
  cout << endl;
  time = timer.GetElapsedTime();
  cout <<  "It took " << time << " s to run the GPU test on GPU " << data->GPUID << endl;



  // On a failed run, keep the input data for later inspection.
  if (!testdata.WasRunOK())
  {
    ofstream Adata;

    Adata.open("Adata.bin", ofstream::out | ofstream::binary | ofstream::trunc);
    Adata.write((char*)&A[0], data->NBElem*sizeof(A[0]));
    Adata.close();

  }

  THREAD_EXIT;

}
Exemplo n.º 9
0
void assessHostMemPerf( void *ptr, void *ptr2, size_t nbytes )
{
    CPerfCounter t;

    std::cout << "Host baseline (naive):\n\n";

    double sum = 0.;
    int ctr = 0;

    for(int i = 0; i < 1e6; i++)
    {
        t.Reset();
        t.Start();
        t.Stop();

        double e = t.GetElapsedTime();

        if( e > 0. ) {
            sum += e;
             ctr++;
        }
    }

	std::cout << std::setiosflags(std::ios::fixed) << std::setprecision(2);
	std::cout << std::setw(21) << std::left << "Timer resolution" 
              << std::setw(7) << ( sum / (double) ctr ) * 1e9 << " ns\n";

#ifdef _WIN32
    //Sleep( 1000 );
#else
    usleep( 1000 * 1e3 );
#endif
    size_t pagesize;

#ifdef _WIN32
    SYSTEM_INFO system_info;

    GetSystemInfo (&system_info);
    pagesize = (size_t) system_info.dwPageSize;
#else
    pagesize = getpagesize();
#endif

    stridePagesCPU( ptr, pagesize, nbytes );

#ifdef MEM_MULTICORE
    benchBarrier();

    std::cout << "\n";

#endif

#if 0
    TIMED_LOOP( "SSE read", readVerifyMemSSE( ptr, 0, nbytes ), nbytes )
    TIMED_LOOP( "SSE write", writeMemSSE( ptr, 0, nbytes ), nbytes )
    TIMED_LOOP( "CPU write", writeMemCPU( ptr, 0, nbytes ), nbytes )
#endif

    TIMED_LOOP( "CPU read", readVerifyMemCPU_MT( ptr, 0, nbytes ), nbytes )

    TIMED_LOOP( "memcpy()", memcpy_MT( ptr, ptr2, nbytes ), nbytes )

    TIMED_LOOP( "memset(,1,)", memset_MT( ptr, 1, nbytes ), nbytes )
    TIMED_LOOP( "memset(,0,)", memset_MT( ptr, 0, nbytes ), nbytes )

    std::cout << "\n";
}
Exemplo n.º 10
0
// Runs one configuration of the double-DMA benchmark and prints the achieved
// transfer rate in GB/s.
//
// test_ selects the configuration: (test_ % MaxQueues) + 1 command queues are
// used, and a kernel is inserted between write and read when
// test_ / MaxQueues > 0. The mapped staging buffer (size_S bytes) is streamed
// in size_s-byte chunks: each chunk is written to a device buffer, optionally
// processed by kernel_, and read back, with the stages chained through events
// and spread over the queues. Does nothing when failed_ is already set.
void
OCLPerfDoubleDMA::run()
{
    if (failed_) {
        return;
    }
    CPerfCounter timer;
    const int   numQueues = (test_ % MaxQueues) + 1;
    const bool  useKernel = ((test_ / MaxQueues) > 0);
    const int   numBufs = numQueues;
    Profile     profile(isProfilingEnabled_, numQueues);

    std::vector<cl_command_queue> cmdQueues(numQueues);
    int q;
    cl_command_queue_properties qProp = (isProfilingEnabled_) ? CL_QUEUE_PROFILING_ENABLE : 0;
    for (q = 0; q < numQueues; ++q) {
        cl_command_queue cmdQueue = clCreateCommandQueue(
            context_, devices_[deviceId_], qProp, &error_);
        CHECK_RESULT((error_), "clCreateCommandQueue() failed");
        cmdQueues[q] = cmdQueue;
    }

    // buffers_[numBufs] is the staging buffer; it is mapped for the host and
    // filled with a non-zero byte pattern.
    float *Data_s = (float*)clEnqueueMapBuffer(cmdQueues[0],
        buffers_[numBufs], CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, size_S, 0, NULL, NULL, &error_);
    CHECK_RESULT((error_), "clEnqueueMapBuffer failed");
    memset(Data_s, 1, size_S);
    size_t  gws[1] = { size_s / (4 * sizeof(float)) };
    size_t  lws[1] = { 256 };

    // Warm-up
    for (q = 0; q < numQueues; ++q) {
        error_ |= clEnqueueWriteBuffer(cmdQueues[q],
            buffers_[q], CL_FALSE, 0, size_s, (char*)Data_s, 0, NULL, NULL);
        error_ |= clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void*) &buffers_[q]);
        error_ |= clEnqueueNDRangeKernel(cmdQueues[q],
            kernel_, 1, NULL, gws, lws, 0, NULL, NULL);
        error_ |= clEnqueueReadBuffer(cmdQueues[q],
            buffers_[q], CL_FALSE, 0, size_s, (char*)Data_s, 0, NULL, NULL);
        error_ |= clFinish(cmdQueues[q]);
    }

    size_t s_done = 0;
    // Per-buffer events for the read (r), write (w) and execute (x) stages.
    cl_event r[MaxQueues] = {0}, w[MaxQueues] = {0}, x[MaxQueues] = {0};

    /*----------  pass2:  copy Data_s to and from GPU Buffers ----------*/
    s_done = 0;
    timer.Reset();
    timer.Start();
    int idx = numBufs - 1;
    // Start from the last so read/write won't go to the same DMA when kernel is executed
    q = numQueues - 1;
    // Counts the chunks completed before the final one (the loop breaks before
    // the increment on the last chunk).
    size_t iter = 0;
    while( 1 )  {
        // The write must wait for the previous read of the same buffer, if any.
        if (0 == r[idx]) {
            error_ |= clEnqueueWriteBuffer(cmdQueues[q],
                buffers_[idx], CL_FALSE, 0, size_s, (char*)Data_s+s_done, 0, NULL, &w[idx]);
        }
        else {
            error_ |= clEnqueueWriteBuffer(cmdQueues[q],
                buffers_[idx], CL_FALSE, 0, size_s, (char*)Data_s+s_done, 1, &r[idx], &w[idx]);
            if (!isProfilingEnabled_) { 
                error_ |= clReleaseEvent(r[idx]);
            }
        }
        profile.addEvent(q, ProfileQueue::Write, w[idx]);

        if (useKernel) {
            // Change the queue
            ++q %= numQueues;
            // Implicit flush of DMA engine on kernel start, because memory dependency
            error_ |= clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void*) &buffers_[idx]);
            error_ |= clEnqueueNDRangeKernel(cmdQueues[q],
                kernel_, 1, NULL, gws, lws, 1, &w[idx], &x[idx]);
            if (!isProfilingEnabled_) { 
                error_ |= clReleaseEvent(w[idx]);
            }
            profile.addEvent(q, ProfileQueue::Execute, x[idx]);
        }

        // Change the queue
        ++q %= numQueues;
        // The read waits for the kernel when one is used, otherwise for the write.
        error_ |= clEnqueueReadBuffer(cmdQueues[q],
            buffers_[idx], CL_FALSE, 0, size_s, (char*)Data_s+s_done, 1,
            (useKernel) ? &x[idx] : &w[idx], &r[idx]);
        if (!isProfilingEnabled_) { 
            error_ |= clReleaseEvent((useKernel) ? x[idx] : w[idx]);
        }
        profile.addEvent(q, ProfileQueue::Read, r[idx]);

        if ((s_done += size_s) >= size_S) {
            if (!isProfilingEnabled_) { 
                error_ |= clReleaseEvent(r[idx]);
            }
            break;
        }
        ++iter;
        ++idx %= numBufs;
        ++q %= numQueues;
    }

    for (q = 0; q < numQueues; ++q) {
        error_ |= clFinish(cmdQueues[q]);
    }
    timer.Stop();

    // Accumulate rather than assign, so that errors collected during the timed
    // loop are still seen by the check below.
    error_ |= clEnqueueUnmapMemObject(cmdQueues[0],
        buffers_[numBufs], Data_s, 0, NULL, NULL);

    error_ |= clFinish(cmdQueues[0]);
    CHECK_RESULT((error_), "Execution failed");

    cl_long gpuTimeFrame = profile.findExecTime();
    // iter is 0 when a single chunk covers the whole staging buffer; treat the
    // whole time frame as one iteration instead of dividing by zero.
    cl_long oneIter = (iter != 0)
        ? gpuTimeFrame / static_cast<cl_long>(iter)
        : gpuTimeFrame;

    // Display 4 iterations in the middle. The window is computed in signed
    // arithmetic so that fewer than four iterations cannot wrap around.
    const cl_long midIter = static_cast<cl_long>(iter / 2);
    cl_long startFrame = oneIter * (midIter - 2);
    cl_long finishFrame = oneIter * (midIter + 2);
    profile.display(startFrame, finishFrame);

    for (q = 0; q < numQueues; ++q) {
        error_ = clReleaseCommandQueue(cmdQueues[q]);
        CHECK_RESULT((error_), "clReleaseCommandQueue() failed");
    }

    // Every byte of the staging buffer travels to the device and back.
    double GBytes = (double)(2*size_S)/(double)(1024*1024*1024);

    std::stringstream stream;
    if (useKernel) {
        stream << "Write/Kernel/Read operation ";
    }
    else {
        stream << "Write/Read operation ";
    }
    stream << numQueues << " queue; profiling " <<
        ((isProfilingEnabled_) ? "enabled" : "disabled");

    stream << ((useUHP_) ? " using UHP" : " using AHP") << ": "; 
    
    stream.flags(std::ios::right | std::ios::showbase);
    std::cout << stream.str() << static_cast<float>(GBytes / timer.GetElapsedTime()) << " GB/s\n";
}
Exemplo n.º 11
0
// Reads a single performance counter value from a fresh snapshot.
//
//   objectName    performance object to snapshot and look up
//   instanceName  instance to match (case-insensitive); a null or empty string
//                 disables the filter, so the first instance that has the
//                 counter is used
//   counterName   counter to read from the selected instance
//
// Returns the counter value widened to ULONGLONG (counters of 1, 2, 4 or 8
// bytes are handled), or 0 when the snapshot, object, instance or counter is
// not found.
//
// The object, instance and counter pointers handed out by the Get* calls are
// deleted here, as in the original code.
ULONGLONG GetPerfData(LPCWSTR objectName, LPCWSTR instanceName, LPCWSTR counterName)
{
	// Zero-initialised so that a failed GetData() cannot leak stack garbage
	// into the result.
	BYTE data[256] = { 0 };
	WCHAR name[256];
	ULONGLONG value = 0;

	// Guard against a null pointer as well as an empty string.
	const bool filterByInstance = (instanceName != nullptr) && (*instanceName != 0);

	CPerfSnapshot snapshot(&g_TitleCounter);
	CPerfObjectList objList(&snapshot, &g_TitleCounter);

	if (!snapshot.TakeSnapshot(objectName))
	{
		return value;
	}

	CPerfObject* pPerfObj = objList.GetPerfObject(objectName);
	if (!pPerfObj)
	{
		return value;
	}

	for (CPerfObjectInstance* pObjInst = pPerfObj->GetFirstObjectInstance();
		pObjInst != nullptr;
		pObjInst = pPerfObj->GetNextObjectInstance())
	{
		if (filterByInstance)
		{
			// Skip instances whose name cannot be read or does not match.
			const bool nameMatches =
				pObjInst->GetObjectInstanceName(name, 256) &&
				_wcsicmp(instanceName, name) == 0;

			if (!nameMatches)
			{
				delete pObjInst;
				continue;
			}
		}

		CPerfCounter* pPerfCntr = pObjInst->GetCounterByName(counterName);
		if (pPerfCntr != nullptr)
		{
			pPerfCntr->GetData(data, 256, nullptr);

			// Interpret the raw bytes according to the counter's size; any
			// other size leaves the value at 0.
			switch (pPerfCntr->GetSize())
			{
			case 1:
				value = *(BYTE*)data;
				break;
			case 2:
				value = *(WORD*)data;
				break;
			case 4:
				value = *(DWORD*)data;
				break;
			case 8:
				value = *(ULONGLONG*)data;
				break;
			default:
				break;
			}

			delete pPerfCntr;
			delete pObjInst;
			break;	// No need to continue
		}

		// This instance does not expose the counter; try the next one.
		delete pObjInst;
	}

	delete pPerfObj;

	return value;
}