Beispiel #1
0
void TestVectorOfObjects::run(size_t count, size_t updates)
{
	PerfTimer perf;

	perf.start();
	std::vector<Particle> particles(count);
	perf.stop(&_creationTime);

	// randomize: no sense in this case...
	/*for (size_t i = 0; i < count / 2; ++i)
	{
		int a = rand() % count;
		int b = rand() % count;
		std::swap(particles[a], particles[b]);
	}*/

	_memoryKb = (particles.capacity()*sizeof(Particle)) / 1024.0;

	for (auto p = particles.begin(); p != particles.end(); ++p)
		p->generate();

	perf.start();
	for (size_t u = 0; u < updates; ++u)
	{
		for (auto p = particles.begin(); p != particles.end(); ++p)
			p->update(DELTA_TIME);
	}
	perf.stop(&_updatesTime);
}
Beispiel #2
0
void TestVectorOfPointers::run(size_t count, size_t updates)
{
	PerfTimer perf;

	perf.start();
	std::vector<std::shared_ptr<Particle>> particles(count);
	for (auto p = particles.begin(); p != particles.end(); ++p)
	{
		*p = std::make_shared<Particle>();
	}

	perf.stop(&_creationTime);

	// randomize to simulate 
	for (size_t i = 0; i < count / 2; ++i)
	{
		int a = rand() % count;
		int b = rand() % count;
		if (a != b)
			std::swap(particles[a], particles[b]);
	}

	/*for (int i = 0; i < 10; ++i)
	{
		std::cout << (unsigned long)particles[i].get() << std::endl;
	}*/

	_memoryKb = (particles.capacity()*sizeof(Particle)) / 1024.0;

	for (auto p = particles.begin(); p != particles.end(); ++p)
		(*p)->generate();

	perf.start();
	for (size_t u = 0; u < updates; ++u)
	{
		for (auto p = particles.begin(); p != particles.end(); ++p)
			(*p)->update(DELTA_TIME);
	}
	perf.stop(&_updatesTime);
}
/**
 * Builds the OpenCL program for one functor and creates its kernel.
 *
 * Reads "Electrostatics.cl.c" from the current directory, prepends a block of
 * #defines (work-group sizes, step count, scalar/vector type names), compiles
 * the result with -cl-fast-relaxed-math, prints the build log and creates the
 * "CalcField_curvature" kernel, which is stored in m_functors[deviceID].kernel.
 * The compilation time is recorded in the functor's perfData.
 *
 * @param deviceID index into m_functors selecting the functor/device to set up
 * @return CL_SUCCESS on success, -1 if the kernel source cannot be read.
 *         Build and kernel-creation failures are handled by CL_ASSERTE
 *         (defined elsewhere in the project).
 */
CLerror CLElectrosFunctor<T>::LoadKernels ( size_t deviceID )
{
    PerfTimer timer;
    timer.start();
    FunctorData &data = m_functors[deviceID];

    cout<<" Reading kernel source"<<endl;
    using std::ifstream;
    ifstream reader("Electrostatics.cl.c", ifstream::in);
    if (!reader.good())
    {
        cout<<"Cannot open program source"<<endl;
        return -1;
    }
    reader.seekg (0, std::ios::end);
    // tellg() reports failure as -1; check it before using it as a size.
    const std::streamoff fileSize = static_cast<std::streamoff>(reader.tellg());
    reader.seekg (0, std::ios::beg);
    if (fileSize < 0)
    {
        cout<<"Cannot determine program source size"<<endl;
        return -1;
    }
    const size_t length = static_cast<size_t>(fileSize);
    // One extra byte for the terminator: the source is handed to
    // clCreateProgramWithSource() without explicit lengths, so it must be a
    // NUL-terminated string.
    char *source = new char[length + 1];
    reader.read(source, fileSize);
    // Terminate after the bytes actually read, which may be fewer than the
    // file size (short read, text-mode translation).
    source[reader.gcount()] = '\0';
    reader.close();

    /*
     * Different devices require different work group sizes to operate
     * optimally. The amount of __local memory on some kernels depends on these
     * work-group sizes. This causes a problem as explained below:
     * There are two ways to use group-local memory
     * 1) Allocate it as a parameter with clSetKernelArg()
     * 2) Declare it as a constant __local array within the cl kernel
     * Option (1) has the advantage of flexibility, but the extra indexing
     * overhead is a performance killer (20-25% easily lost on nvidia GPUs)
     * Option (2) has the advantage that the compiler knows the arrays are of
     * constant size, and is free to do extreme optimizations.
     * Of course, then both host and kernel have to agree on the size of the
     * work group.
     * We abuse the fact that the source code is compiled at runtime, decide
     * those sizes in the host code, then #define them in the kernel code,
     * before it is compiled.
     */

    // BLOCK size
    data.local = {BLOCK_X, 1, 1};
    size_t local_MT[3] = {BLOCK_X_MT, BLOCK_Y_MT, 1};
    // GRID size: number of lines rounded up to a whole number of blocks
    data.global = {((this->m_nLines + BLOCK_X - 1)/BLOCK_X)
                   * BLOCK_X, 1, 1
                  };
    // Both sizes are scaled down by the vector width used for this functor
    data.global[0] /= data.vecWidth;
    data.local[0] /= data.vecWidth;
    cout<<"Local   : "<<data.local[0]<<" "<<data.local[1]<<" "
        <<data.local[2]<<endl;
    cout<<"Local_MT: "<<local_MT[0]<<" "<<local_MT[1]<<" "<<local_MT[2]<<endl;
    cout<<"Global  : "<<data.global[0]<<" "<<data.global[1]<<" "
        <<data.global[2]<<endl;

    // Preamble that is compiled in front of the kernel source
    char defines[1024];
    const size_t kernelSteps = this->m_pFieldLinesData->GetSize()
                               / this->m_nLines;
    snprintf(defines, sizeof(defines),
             "#define BLOCK_X %u\n"
             "#define BLOCK_X_MT %u\n"
             "#define BLOCK_Y_MT %u\n"
             "#define KERNEL_STEPS %u\n"
             "#define Tprec %s\n"
             "#define Tvec %s\n",
             (unsigned int) data.local[0],
             (unsigned int) local_MT[0], (unsigned int)local_MT[1],
             (unsigned int) kernelSteps,
             FindPrecType(),
             FindVecType(data.vecWidth)
            );

    cout<<" Calc'ed kern steps "<<kernelSteps<<endl;
    char *srcs[2] = {defines, source};
    CLerror err;
    cl_program prog = clCreateProgramWithSource(data.context, 2,
                      (const char**) srcs,
                      NULL, &err);
    if (err)cout<<"clCreateProgramWithSource returns: "<<err<<endl;
    // The program object holds its own copy of the source text, so the host
    // buffer can be released right away (it was leaked before).
    delete[] source;
    source = NULL;

    char options[] = "-cl-fast-relaxed-math";
    err = clBuildProgram(prog, 0, NULL, options, NULL, NULL);
    if (err)cout<<"clBuildProgram returns: "<<err<<endl;

    // Fetch and print the build log whether or not the build succeeded.
    // logSize starts at 0 so a failed size query cannot leave it undefined.
    size_t logSize = 0;
    clGetProgramBuildInfo(prog, data.device->deviceID,
                          CL_PROGRAM_BUILD_LOG,
                          0, NULL, &logSize);
    char * log = (char*)malloc(logSize + 1);
    cout<<"Program Build Log:"<<endl;
    if (log)
    {
        log[0] = '\0';
        clGetProgramBuildInfo(prog, data.device->deviceID,
                              CL_PROGRAM_BUILD_LOG,
                              logSize, log, 0);
        log[logSize] = '\0';    // guarantee termination before printing
        cout<<log;
        // Freed before CL_ASSERTE so the log is not leaked on any path.
        free(log);
        log = NULL;
    }
    cout<<endl;
    CL_ASSERTE(err, "clBuildProgram failed");
    data.perfData.add(TimingInfo("Program compilation", timer.tick()));



    //==========================================================================
    cout<<" Preparing kernel"<<endl;
    // NOTE(review): prog is never released with clReleaseProgram(); confirm
    // whether that is intentional before adding a release here.
    data.kernel = clCreateKernel(prog, "CalcField_curvature", &err);
    CL_ASSERTE(err, "clCreateKernel");
    return CL_SUCCESS;
}
/**
 * Runs the field-line kernel for one functor on one device.
 *
 * Binds the kernel arguments, uploads the functor's slice of the field-line
 * arrays and the point charges, launches the kernel, waits for completion and
 * reads the three coordinate arrays back into the host slice they were
 * uploaded from. Transfer and execution timings are added to the device's
 * perfData; m_pPerfData receives the kernel time and a performance estimate.
 *
 * @param functorIndex functor whose data to process
 * @param deviceIndex  device on which to process data
 * @return CL_SUCCESS on success; failures are reported through CL_ASSERTE
 *         (defined elsewhere in the project).
 */
unsigned long CLElectrosFunctor<T>::MainFunctor (
    size_t functorIndex,    ///< Functor whose data to process
    size_t deviceIndex      ///< Device on which to process data
)
{
    if(functorIndex != deviceIndex)
        cerr<<"WARNING: Different functor and device"<<endl;
    PerfTimer timer;
    FunctorData &funData = m_functors[functorIndex];
    FunctorData &devData = m_functors[deviceIndex];
    perfPacket &profiler = devData.perfData;
    timer.start();
    CLerror err;
    cl_context ctx = devData.context;

    cout<<" Preparing buffers"<<endl;
    Vector3<cl_mem> &arrdata = devData.devFieldMem;
    cl_mem &charges = devData.chargeMem;
    cl_kernel &kernel = devData.kernel;

    err = CL_SUCCESS;
    // __global float *x,
    err |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &arrdata.x);
    // __global float *y,
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &arrdata.y);
    // __global float *z,
    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &arrdata.z);
    // __global pointCharge *Charges,
    err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &charges);
    // const unsigned int linePitch,
    cl_uint param = this->m_nLines;
    err |= clSetKernelArg(kernel, 4, sizeof(param), &param);
    // const unsigned int p,
    param = (cl_uint)this->m_pPointChargeData->GetSize();
    err |= clSetKernelArg(kernel, 5, sizeof(param), &param);
    // const unsigned int fieldIndex,
    param = 1;
    err |= clSetKernelArg(kernel, 6, sizeof(param), &param);

    // const float resolution
    T res = this->m_resolution;
    err |= clSetKernelArg(kernel, 7, sizeof(res), &res);
    if (err)cout<<"clSetKernelArg cummulates: "<<err<<endl;

    //==========================================================================
    cl_command_queue queue = clCreateCommandQueue(ctx,
                             devData.device->deviceID,
                             0, &err);
    if (err)cout<<"clCreateCommandQueue returns: "<<err<<endl;

    // The queue is created on every call, so it must also be released on
    // every exit path (including early exits taken by CL_ASSERTE); previously
    // it was leaked.
    struct QueueGuard
    {
        cl_command_queue handle;
        ~QueueGuard()
        {
            if (handle) clReleaseCommandQueue(handle);
        }
    } queueGuard = { queue };

    timer.tick();
    Vector3<T*> hostArr = this->m_pFieldLinesData->GetDataPointers();
    const size_t start = funData.startIndex;
    const size_t size = funData.elements * sizeof(T) * funData.steps;

    // Non-blocking uploads; completion is enforced by clFinish() below.
    err = CL_SUCCESS;
    err |= clEnqueueWriteBuffer(queue, arrdata.x, CL_FALSE, 0, size,
                                &hostArr.x[start], 0, NULL, NULL);
    if (err)cout<<"Write 1 returns: "<<err<<endl;
    err |= clEnqueueWriteBuffer(queue, arrdata.y, CL_FALSE, 0, size,
                                &hostArr.y[start], 0, NULL, NULL);
    if (err)cout<<"Write 2 returns: "<<err<<endl;
    err |= clEnqueueWriteBuffer(queue, arrdata.z, CL_FALSE, 0, size,
                                &hostArr.z[start], 0, NULL, NULL);
    if (err)cout<<"Write 3 returns: "<<err<<endl;
    const size_t qSize = this->m_pPointChargeData->GetSizeBytes();
    err |= clEnqueueWriteBuffer(queue, charges, CL_FALSE, 0, qSize,
                                this->m_pPointChargeData->GetDataPointer(),
                                0, NULL, NULL);
    if (err)cout<<"Write 4 returns: "<<err<<endl;
    CL_ASSERTE(err, "Sending data to device failed");

    // Finish memory copies before starting the kernel
    CL_ASSERTE(clFinish(queue), "Pre-kernel sync");

    profiler.add(TimingInfo("Host to device transfer", timer.tick(),
                            3*size + qSize ));

    //==========================================================================

    cout<<" Executing kernel"<<endl;

    timer.tick();
    err |= clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
                                  funData.global, funData.local,
                                  0, NULL, NULL);
    if (err)cout<<"clEnqueueNDRangeKernel returns: "<<err<<endl;
    // A failed launch must not be timed and reported as a successful run.
    CL_ASSERTE(err, "Kernel launch failed");
    // Let kernel finish before continuing
    CL_ASSERTE(clFinish(queue), "Post-kernel sync");
    double time = timer.tick();
    this->m_pPerfData->time = time;
    // NOTE(review): the step count 2500 is hard-coded here rather than taken
    // from funData.steps — confirm it matches the actual data set.
    this->m_pPerfData->performance =
        ( this->m_nLines * ( ( 2500-1 ) * ( this->m_pPointChargeData->GetSize()
                                            * ( electroPartFieldFLOP + 3 ) + 13 ) ) / time ) / 1E9;
    profiler.add(TimingInfo("Kernel execution time", time));
    //==========================================================================
    cout<<" Recovering results"<<endl;

    timer.tick();
    err = CL_SUCCESS;
    // Read back into the same host slice the data was uploaded from
    // (offset `start`); the previous code wrote the results to offset 0.
    err |= clEnqueueReadBuffer ( queue, arrdata.x, CL_FALSE, 0, size,
                                 &hostArr.x[start], 0, NULL, NULL );
    if (err)cout<<" Read 1 returns: "<<err<<endl;
    err |= clEnqueueReadBuffer ( queue, arrdata.y, CL_FALSE, 0, size,
                                 &hostArr.y[start], 0, NULL, NULL );
    if (err)cout<<" Read 2 returns: "<<err<<endl;
    err |= clEnqueueReadBuffer ( queue, arrdata.z, CL_FALSE, 0, size,
                                 &hostArr.z[start], 0, NULL, NULL );
    if (err)cout<<" Read 3 returns: "<<err<<endl;
    if (err)cout<<"clEnqueueReadBuffer cummulates: "<<err<<endl;
    // Report read-back failures instead of returning CL_SUCCESS regardless.
    CL_ASSERTE(err, "Reading results from device failed");

    // The reads are non-blocking; wait until the host arrays are filled.
    CL_ASSERTE(clFinish(queue), "Post-read sync");

    profiler.add(TimingInfo("Device to host transfer", timer.tick(),
                            3 * size));
    return CL_SUCCESS;
}
Beispiel #5
0
// Lerc sample / smoke test.
//
// Sample 1 encodes and decodes a 512 x 512 float image with some pixels
// marked invalid and a lossy tolerance of about 0.1.
// Sample 2 encodes and decodes a 257 x 713, 3-band byte image losslessly.
// Both samples print the compression ratio, timings and the largest
// per-pixel difference between the original and the decoded data.
// When TestLegacyData is defined, a fixed list of .lerc1 / .lerc2 files is
// additionally decoded from a hard-coded directory.
//
// Failures are reported on stdout; the program always returns 0 after
// waiting for ENTER.
int _tmain(int argc, _TCHAR* argv[])
{
  // Sample 1: float image, 1 band, with some pixels set to invalid / void, maxZError = 0.1

  int h = 512;
  int w = 512;

  float* zImg = new float[w * h];
  memset(zImg, 0, w * h * sizeof(float));

  LercNS::BitMask bitMask(w, h);
  bitMask.SetAllValid();

  for (int k = 0, i = 0; i < h; i++)
  {
    for (int j = 0; j < w; j++, k++)
    {
      zImg[k] = sqrt((float)(i * i + j * j));    // smooth surface
      zImg[k] += rand() % 20;    // add some small amplitude noise

      if (j % 100 == 0 || i % 100 == 0)    // set some void points
        bitMask.SetInvalid(k);
    }
  }


  // compress into byte arr

  double maxZErrorWanted = 0.1;
  double eps = 0.0001;    // safety margin (optional), to account for finite floating point accuracy
  double maxZError = maxZErrorWanted - eps;

  size_t numBytesNeeded = 0;
  size_t numBytesWritten = 0;
  Lerc lerc;
  PerfTimer pt;

  if (!lerc.ComputeBufferSize((void*)zImg,    // raw image data, row by row, band by band
    Lerc::DT_Float,
    w, h, 1,
    &bitMask,                  // set 0 if all pixels are valid
    maxZError,                 // max coding error per pixel, or precision
    numBytesNeeded))           // size of outgoing Lerc blob
  {
    cout << "ComputeBufferSize failed" << endl;
  }

  size_t numBytesBlob = numBytesNeeded;
  Byte* pLercBlob = new Byte[numBytesBlob];

  pt.start();

  if (!lerc.Encode((void*)zImg,    // raw image data, row by row, band by band
    Lerc::DT_Float,
    w, h, 1,
    &bitMask,           // 0 if all pixels are valid
    maxZError,           // max coding error per pixel, or precision
    pLercBlob,           // buffer to write to, function will fail if buffer too small
    numBytesBlob,        // buffer size
    numBytesWritten))    // num bytes written to buffer
  {
    cout << "Encode failed" << endl;
  }

  pt.stop();

  // uncompressed size counted as one float plus one mask bit (0.125 byte) per pixel
  double ratio = w * h * (0.125 + sizeof(float)) / numBytesBlob;
  cout << "sample 1 compression ratio = " << ratio << ", encode time = " << pt.ms() << " ms" << endl;

  // new data storage
  float* zImg3 = new float[w * h];
  memset(zImg3, 0, w * h * sizeof(float));

  BitMask bitMask3(w, h);
  bitMask3.SetAllValid();


  // decompress

  Lerc::LercInfo lercInfo;
  if (!lerc.GetLercInfo(pLercBlob, numBytesBlob, lercInfo))
    cout << "get header info failed" << endl;

  if (lercInfo.nCols != w || lercInfo.nRows != h || lercInfo.nBands != 1 || lercInfo.dt != Lerc::DT_Float)
    cout << "got wrong lerc info" << endl;

  pt.start();

  if (!lerc.Decode(pLercBlob, numBytesBlob, &bitMask3, w, h, 1, Lerc::DT_Float, (void*)zImg3))
    cout << "decode failed" << endl;

  pt.stop();


  // compare to orig

  double maxDelta = 0;
  for (int k = 0, i = 0; i < h; i++)
  {
    for (int j = 0; j < w; j++, k++)
    {
      if (bitMask3.IsValid(k) != bitMask.IsValid(k))
        cout << "Error in main: decoded bit mask differs from encoded bit mask" << endl;

      if (bitMask3.IsValid(k))
      {
        double delta = fabs(zImg3[k] - zImg[k]);
        if (delta > maxDelta)
          maxDelta = delta;
      }
    }
  }

  cout << "max z error per pixel = " << maxDelta << ", decode time = " << pt.ms() << " ms" << endl;

  delete[] zImg;
  delete[] zImg3;
  delete[] pLercBlob;
  pLercBlob = 0;


  // Sample 2: random byte image, 3 bands, all pixels valid, maxZError = 0 (lossless)

  h = 713;
  w = 257;

  Byte* byteImg = new Byte[w * h * 3];
  memset(byteImg, 0, w * h * 3);

  for (int iBand = 0; iBand < 3; iBand++)
  {
    Byte* arr = byteImg + iBand * w * h;
    for (int k = 0, i = 0; i < h; i++)
      for (int j = 0; j < w; j++, k++)
        arr[k] = rand() % 30;
  }

  // encode 

  if (!lerc.ComputeBufferSize((void*)byteImg, Lerc::DT_Byte, w, h, 3, 0, 0, numBytesNeeded))
    cout << "ComputeBufferSize failed" << endl;

  numBytesBlob = numBytesNeeded;
  pLercBlob = new Byte[numBytesBlob];

  pt.start();

  if (!lerc.Encode((void*)byteImg,    // raw image data, row by row, band by band
    Lerc::DT_Byte,
    w, h, 3,
    0,                   // 0 if all pixels are valid
    0,                   // max coding error per pixel, or precision
    pLercBlob,           // buffer to write to, function will fail if buffer too small
    numBytesBlob,        // buffer size
    numBytesWritten))    // num bytes written to buffer
  {
    cout << "Encode failed" << endl;
  }

  pt.stop();

  ratio = w * h * 3 / (double)numBytesBlob;
  cout << "sample 2 compression ratio = " << ratio << ", encode time = " << pt.ms() << " ms" << endl;

  // new data storage
  Byte* byteImg3 = new Byte[w * h * 3];
  memset(byteImg3, 0, w * h * 3);

  // decompress

  if (!lerc.GetLercInfo(pLercBlob, numBytesBlob, lercInfo))
    cout << "get header info failed" << endl;

  if (lercInfo.nCols != w || lercInfo.nRows != h || lercInfo.nBands != 3 || lercInfo.dt != Lerc::DT_Byte)
    cout << "got wrong lerc info" << endl;

  pt.start();

  if (!lerc.Decode(pLercBlob, numBytesBlob, 0, w, h, 3, Lerc::DT_Byte, (void*)byteImg3))
    cout << "decode failed" << endl;

  pt.stop();

  // compare to orig: all 3 bands are checked (previously only the first
  // w * h values, i.e. band 0, were compared)

  maxDelta = 0;
  const int numValues = w * h * 3;
  for (int k = 0; k < numValues; k++)
  {
    double delta = abs(byteImg3[k] - byteImg[k]);
    if (delta > maxDelta)
      maxDelta = delta;
  }

  cout << "max z error per pixel = " << maxDelta << ", decode time = " << pt.ms() << " ms" << endl;

  delete[] byteImg;
  delete[] byteImg3;
  delete[] pLercBlob;
  pLercBlob = 0;


#ifdef TestLegacyData

  // Fixed-size scratch buffers shared by all test files.
  const size_t bufferCapacity = 4 * 2048 * 2048;
  Byte* pLercBuffer = new Byte[bufferCapacity];
  Byte* pDstArr     = new Byte[bufferCapacity];

  vector<string> fnVec;
  string path = "D:/GitHub/LercOpenSource/testData/";

  fnVec.push_back("amazon3.lerc1");
  fnVec.push_back("tuna.lerc1");
  fnVec.push_back("tuna_0_to_1_w1920_h925.lerc1");

  fnVec.push_back("testbytes.lerc2");
  fnVec.push_back("testHuffman_w30_h20_uchar0.lerc2");
  fnVec.push_back("testHuffman_w30_h20_ucharx.lerc2");
  fnVec.push_back("testHuffman_w1922_h1083_uchar.lerc2");

  fnVec.push_back("testall_w30_h20_char.lerc2");
  fnVec.push_back("testall_w30_h20_byte.lerc2");
  fnVec.push_back("testall_w30_h20_short.lerc2");
  fnVec.push_back("testall_w30_h20_ushort.lerc2");
  fnVec.push_back("testall_w30_h20_long.lerc2");
  fnVec.push_back("testall_w30_h20_ulong.lerc2");
  fnVec.push_back("testall_w30_h20_float.lerc2");

  fnVec.push_back("testall_w1922_h1083_char.lerc2");
  fnVec.push_back("testall_w1922_h1083_byte.lerc2");
  fnVec.push_back("testall_w1922_h1083_short.lerc2");
  fnVec.push_back("testall_w1922_h1083_ushort.lerc2");
  fnVec.push_back("testall_w1922_h1083_long.lerc2");
  fnVec.push_back("testall_w1922_h1083_ulong.lerc2");
  fnVec.push_back("testall_w1922_h1083_float.lerc2");

  fnVec.push_back("testuv_w30_h20_char.lerc2");
  fnVec.push_back("testuv_w30_h20_byte.lerc2");
  fnVec.push_back("testuv_w30_h20_short.lerc2");
  fnVec.push_back("testuv_w30_h20_ushort.lerc2");
  fnVec.push_back("testuv_w30_h20_long.lerc2");
  fnVec.push_back("testuv_w30_h20_ulong.lerc2");
  fnVec.push_back("testuv_w30_h20_float.lerc2");

  fnVec.push_back("testuv_w1922_h1083_char.lerc2");
  fnVec.push_back("testuv_w1922_h1083_byte.lerc2");
  fnVec.push_back("testuv_w1922_h1083_short.lerc2");
  fnVec.push_back("testuv_w1922_h1083_ushort.lerc2");
  fnVec.push_back("testuv_w1922_h1083_long.lerc2");
  fnVec.push_back("testuv_w1922_h1083_ulong.lerc2");
  fnVec.push_back("testuv_w1922_h1083_float.lerc2");

  for (size_t n = 0; n < fnVec.size(); n++)
  {
    string fn = path;
    fn += fnVec[n];

    // A missing file must not crash the run: report it and move on.
    FILE* fp = 0;
    if (fopen_s(&fp, fn.c_str(), "rb") != 0 || !fp)
    {
      cout << "cannot open file " << fn << endl;
      continue;
    }

    // Determine the size, then rewind and read through the same handle.
    fseek(fp, 0, SEEK_END);
    long fileLen = ftell(fp);
    fseek(fp, 0, SEEK_SET);

    // Reject empty / unreadable files and anything that would overflow the buffer.
    if (fileLen <= 0 || (size_t)fileLen > bufferCapacity)
    {
      cout << "unexpected file size for " << fn << endl;
      fclose(fp);
      continue;
    }

    size_t fileSize = (size_t)fileLen;
    size_t numRead = fread(pLercBuffer, 1, fileSize, fp);    // read Lerc blob into buffer
    fclose(fp);
    fp = 0;

    if (numRead != fileSize)
    {
      cout << "cannot read file " << fn << endl;
      continue;
    }

    if (!lerc.GetLercInfo(pLercBuffer, fileSize, lercInfo))
      cout << "get header info failed" << endl;
    else
    {
      // note: these shadow the outer w / h on purpose, they describe the current file
      int w = lercInfo.nCols;
      int h = lercInfo.nRows;
      int nBands = lercInfo.nBands;
      Lerc::DataType dt = lercInfo.dt;

      pt.start();
      
      std::string resultMsg = "ok";
      BitMask bitMask;
      // NOTE(review): the decoded image is assumed to fit into pDstArr
      // (bufferCapacity bytes); this is not verified here.
      if (!lerc.Decode(pLercBuffer, fileSize, &bitMask, w, h, nBands, dt, (void*)pDstArr))
        resultMsg = "FAILED";

      pt.stop();
      // cast so that the argument matches %d whatever type PerfTimer::ms() returns
      printf("w = %4d, h = %4d, nBands = %2d, dt = %2d, time = %4d ms,  %s :  %s\n", w, h, nBands, (int)dt, (int)pt.ms(), resultMsg.c_str(), fnVec[n].c_str());
    }
  }

  // release the scratch buffers (previously leaked)
  delete[] pLercBuffer;
  delete[] pDstArr;

#endif

  printf("\npress ENTER\n");
  getchar();
  
  return 0;
}