void TestVectorOfObjects::run(size_t count, size_t updates) { PerfTimer perf; perf.start(); std::vector<Particle> particles(count); perf.stop(&_creationTime); // randomize: no sense in this case... /*for (size_t i = 0; i < count / 2; ++i) { int a = rand() % count; int b = rand() % count; std::swap(particles[a], particles[b]); }*/ _memoryKb = (particles.capacity()*sizeof(Particle)) / 1024.0; for (auto p = particles.begin(); p != particles.end(); ++p) p->generate(); perf.start(); for (size_t u = 0; u < updates; ++u) { for (auto p = particles.begin(); p != particles.end(); ++p) p->update(DELTA_TIME); } perf.stop(&_updatesTime); }
void TestVectorOfPointers::run(size_t count, size_t updates) { PerfTimer perf; perf.start(); std::vector<std::shared_ptr<Particle>> particles(count); for (auto p = particles.begin(); p != particles.end(); ++p) { *p = std::make_shared<Particle>(); } perf.stop(&_creationTime); // randomize to simulate for (size_t i = 0; i < count / 2; ++i) { int a = rand() % count; int b = rand() % count; if (a != b) std::swap(particles[a], particles[b]); } /*for (int i = 0; i < 10; ++i) { std::cout << (unsigned long)particles[i].get() << std::endl; }*/ _memoryKb = (particles.capacity()*sizeof(Particle)) / 1024.0; for (auto p = particles.begin(); p != particles.end(); ++p) (*p)->generate(); perf.start(); for (size_t u = 0; u < updates; ++u) { for (auto p = particles.begin(); p != particles.end(); ++p) (*p)->update(DELTA_TIME); } perf.stop(&_updatesTime); }
/**
 * Reads the OpenCL source file "Electrostatics.cl.c", prepends a set of
 * #defines computed on the host, builds the program and creates the
 * "CalcField_curvature" kernel for the given functor.
 *
 * The work sizes (data.local / data.global) of the functor are also computed
 * here, since the kernel is compiled against them.
 *
 * @param deviceID index into m_functors of the functor to prepare
 * @return CL_SUCCESS on success, -1 if the source file cannot be read;
 *         OpenCL failures are reported through CL_ASSERTE
 */
CLerror CLElectrosFunctor<T>::LoadKernels ( size_t deviceID )
{
    PerfTimer timer;
    timer.start();
    FunctorData &data = m_functors[deviceID];

    cout<<" Reading kernel source"<<endl;
    using std::ifstream;
    ifstream reader("Electrostatics.cl.c", ifstream::in);
    if (!reader.good())
    {
        cout<<"Cannot open program source"<<endl;
        return -1;
    }
    reader.seekg (0, std::ios::end);
    const std::streamoff fileLength = reader.tellg();
    reader.seekg (0, std::ios::beg);
    // tellg() reports failure as -1; converting that to size_t would request
    // a gigantic allocation.
    if (fileLength < 0)
    {
        cout<<"Cannot determine program source size"<<endl;
        return -1;
    }
    const size_t length = (size_t)fileLength;
    // One extra byte for the terminator: the source is handed to
    // clCreateProgramWithSource() without explicit lengths, so it has to be a
    // NUL-terminated string.
    char *source = new char[length + 1];
    reader.read(source, length);
    // Terminate after the characters actually read, which may be fewer than
    // `length` (short read, text-mode newline translation).
    source[reader.gcount()] = '\0';
    reader.close();

    /*
     * Different devices require different work group sizes to operate
     * optimally. The amount of __local memory on some kernels depends on these
     * work-group sizes. This causes a problem as explained below:
     * There are two ways to use group-local memory
     * 1) Allocate it as a parameter with clSetKernelArg()
     * 2) Declare it as a constant __local array within the cl kernel
     * Option (1) has the advantage of flexibility, but the extra indexing
     * overhead is a performance killer (20-25% easily lost on nvidia GPUs)
     * Option (2) has the advantage that the compiler knows the arrays are of
     * constant size, and is free to do extreme optimizations.
     * Of course, then both host and kernel have to agree on the size of the
     * work group.
     * We abuse the fact that the source code is compiled at runtime, decide
     * those sizes in the host code, then #define them in the kernel code,
     * before it is compiled.
     */
    // BLOCK size
    data.local = {BLOCK_X, 1, 1};
    size_t local_MT[3] = {BLOCK_X_MT, BLOCK_Y_MT, 1};
    // GRID size: number of lines rounded up to a multiple of BLOCK_X
    data.global = {((this->m_nLines + BLOCK_X - 1)/BLOCK_X) * BLOCK_X, 1, 1 };
    data.global[0] /= data.vecWidth;
    data.local[0] /= data.vecWidth;
    cout<<"Local   : "<<data.local[0]<<" "<<data.local[1]<<" "
        <<data.local[2]<<endl;
    cout<<"Local_MT: "<<local_MT[0]<<" "<<local_MT[1]<<" "<<local_MT[2]<<endl;
    cout<<"Global  : "<<data.global[0]<<" "<<data.global[1]<<" "
        <<data.global[2]<<endl;

    char defines[1024];
    const size_t kernelSteps = this->m_pFieldLinesData->GetSize()
                               / this->m_nLines;
    snprintf(defines, sizeof(defines),
             "#define BLOCK_X %u\n"
             "#define BLOCK_X_MT %u\n"
             "#define BLOCK_Y_MT %u\n"
             "#define KERNEL_STEPS %u\n"
             "#define Tprec %s\n"
             "#define Tvec %s\n",
             (unsigned int) data.local[0],
             (unsigned int) local_MT[0], (unsigned int)local_MT[1],
             (unsigned int) kernelSteps,
             FindPrecType(),
             FindVecType(data.vecWidth)
            );
    cout<<" Calc'ed kern steps "<<kernelSteps<<endl;

    // The #defines come first so that they are visible to the kernel source.
    char *srcs[2] = {defines, source};
    CLerror err;
    cl_program prog = clCreateProgramWithSource(data.context, 2,
                      (const char**) srcs, NULL, &err);
    // The source text is loaded into the program object by the call above,
    // so the host copy is no longer needed.
    delete[] source;
    if (err)cout<<"clCreateProgramWithSource returns: "<<err<<endl;

    // Drops this function's reference to the program on every exit path. A
    // kernel created from the program keeps it alive for as long as needed.
    struct ProgramReleaser
    {
        cl_program handle;
        explicit ProgramReleaser(cl_program p) : handle(p) {}
        ~ProgramReleaser()
        {
            if (handle) clReleaseProgram(handle);
        }
    };
    ProgramReleaser progReleaser(prog);

    char options[] = "-cl-fast-relaxed-math";
    err = clBuildProgram(prog, 0, NULL, options, NULL, NULL);
    if (err)cout<<"clBuildProgram returns: "<<err<<endl;

    // Always print the build log, whether or not the build succeeded.
    size_t logSize = 0;
    clGetProgramBuildInfo(prog, data.device->deviceID, CL_PROGRAM_BUILD_LOG,
                          0, NULL, &logSize);
    char * log = (char*)malloc(logSize + 1);
    if (log)
    {
        // Stay a valid (empty) string even if the query below fails.
        log[0] = '\0';
        clGetProgramBuildInfo(prog, data.device->deviceID,
                              CL_PROGRAM_BUILD_LOG, logSize, log, 0);
        log[logSize] = '\0';
        cout<<"Program Build Log:"<<endl<<log<<endl;
        free(log);
    }
    CL_ASSERTE(err, "clBuildProgram failed");

    data.perfData.add(TimingInfo("Program compilation", timer.tick()));
    //==========================================================================
    cout<<" Preparing kernel"<<endl;

    data.kernel = clCreateKernel(prog, "CalcField_curvature", &err);
    CL_ASSERTE(err, "clCreateKernel");

    return CL_SUCCESS;
}
/**
 * Runs the field-line kernel for one functor on one device: sets the kernel
 * arguments, uploads the field-line and point-charge data, executes the
 * kernel, and reads the results back to the host. Timing for each stage is
 * added to the device's perfData.
 *
 * @return CL_SUCCESS; OpenCL failures are reported through CL_ASSERTE
 */
unsigned long CLElectrosFunctor<T>::MainFunctor (
    size_t functorIndex,    ///< Functor whose data to process
    size_t deviceIndex      ///< Device on which to process data
)
{
    if(functorIndex != deviceIndex)
        cerr<<"WARNING: Different functor and device"<<endl;

    PerfTimer timer;
    FunctorData &funData = m_functors[functorIndex];
    FunctorData &devData = m_functors[deviceIndex];
    perfPacket &profiler = devData.perfData;

    timer.start();
    CLerror err;
    cl_context ctx = devData.context;

    cout<<" Preparing buffers"<<endl;
    Vector3<cl_mem> &arrdata = devData.devFieldMem;
    cl_mem &charges = devData.chargeMem;
    cl_kernel &kernel = devData.kernel;

    err = CL_SUCCESS;
    // __global float *x,
    err |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &arrdata.x);
    // __global float *y,
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &arrdata.y);
    // __global float *z,
    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &arrdata.z);
    // __global pointCharge *Charges,
    err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &charges);
    // const unsigned int linePitch,
    cl_uint param = this->m_nLines;
    err |= clSetKernelArg(kernel, 4, sizeof(param), &param);
    // const unsigned int p,
    param = (cl_uint)this->m_pPointChargeData->GetSize();
    err |= clSetKernelArg(kernel, 5, sizeof(param), &param);
    // const unsigned int fieldIndex,
    param = 1;
    err |= clSetKernelArg(kernel, 6, sizeof(param), &param);
    // const float resolution
    T res = this->m_resolution;
    err |= clSetKernelArg(kernel, 7, sizeof(res), &res);
    if (err)cout<<"clSetKernelArg cummulates: "<<err<<endl;
    //==========================================================================
    cl_command_queue queue = clCreateCommandQueue(ctx,
                             devData.device->deviceID, 0, &err);
    if (err)cout<<"clCreateCommandQueue returns: "<<err<<endl;

    // The queue is created on every call, so it has to be released on every
    // exit path, including early exits taken through CL_ASSERTE.
    struct QueueReleaser
    {
        cl_command_queue handle;
        explicit QueueReleaser(cl_command_queue q) : handle(q) {}
        ~QueueReleaser()
        {
            if (handle) clReleaseCommandQueue(handle);
        }
    };
    QueueReleaser queueReleaser(queue);

    timer.tick();

    Vector3<T*> hostArr = this->m_pFieldLinesData->GetDataPointers();
    const size_t start = funData.startIndex;
    const size_t size = funData.elements * sizeof(T) * funData.steps;

    // Non-blocking uploads; they are synchronised by the clFinish() below.
    err = CL_SUCCESS;
    err |= clEnqueueWriteBuffer(queue, arrdata.x, CL_FALSE, 0, size,
                                &hostArr.x[start], 0, NULL, NULL);
    if (err)cout<<"Write 1 returns: "<<err<<endl;
    err |= clEnqueueWriteBuffer(queue, arrdata.y, CL_FALSE, 0, size,
                                &hostArr.y[start], 0, NULL, NULL);
    if (err)cout<<"Write 2 returns: "<<err<<endl;
    err |= clEnqueueWriteBuffer(queue, arrdata.z, CL_FALSE, 0, size,
                                &hostArr.z[start], 0, NULL, NULL);
    if (err)cout<<"Write 3 returns: "<<err<<endl;
    const size_t qSize = this->m_pPointChargeData->GetSizeBytes();
    err |= clEnqueueWriteBuffer(queue, charges, CL_FALSE, 0, qSize,
                                this->m_pPointChargeData->GetDataPointer(),
                                0, NULL, NULL);
    if (err)cout<<"Write 4 returns: "<<err<<endl;
    CL_ASSERTE(err, "Sending data to device failed");

    // Finish memory copies before starting the kernel
    CL_ASSERTE(clFinish(queue), "Pre-kernel sync");

    profiler.add(TimingInfo("Host to device transfer", timer.tick(),
                            3*size + qSize ));
    //==========================================================================
    cout<<" Executing kernel"<<endl;
    timer.tick();
    err |= clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
                                  funData.global, funData.local,
                                  0, NULL, NULL);
    if (err)cout<<"clEnqueueNDRangeKernel returns: "<<err<<endl;

    // Let kernel finish before continuing
    CL_ASSERTE(clFinish(queue), "Post-kernel sync");

    double time = timer.tick();
    this->m_pPerfData->time = time;
    // GFLOP/s estimate.
    // NOTE(review): the step count is hard-coded as 2500 here rather than
    // taken from funData.steps - confirm that this is intended.
    this->m_pPerfData->performance =
        ( this->m_nLines
          * ( ( 2500-1 )
              * ( this->m_pPointChargeData->GetSize()
                  * ( electroPartFieldFLOP + 3 ) + 13 ) )
          / time ) / 1E9;
    profiler.add(TimingInfo("Kernel execution time", time));
    //==========================================================================
    cout<<" Recovering results"<<endl;
    timer.tick();

    // NOTE(review): the uploads above read from &hostArr.*[start], while the
    // results are written back to hostArr.* (offset 0). The two only agree
    // when start == 0 - confirm the intended destination.
    err = CL_SUCCESS;
    err |= clEnqueueReadBuffer ( queue, arrdata.x, CL_FALSE, 0, size,
                                 hostArr.x, 0, NULL, NULL );
    if (err)cout<<" Read 1 returns: "<<err<<endl;
    err |= clEnqueueReadBuffer ( queue, arrdata.y, CL_FALSE, 0, size,
                                 hostArr.y, 0, NULL, NULL );
    if (err)cout<<" Read 2 returns: "<<err<<endl;
    err |= clEnqueueReadBuffer ( queue, arrdata.z, CL_FALSE, 0, size,
                                 hostArr.z, 0, NULL, NULL );
    if (err)cout<<" Read 3 returns: "<<err<<endl;
    if (err)cout<<"clEnqueueReadBuffer cummulates: "<<err<<endl;

    // Wait for the non-blocking reads before the host data is used.
    clFinish(queue);
    profiler.add(TimingInfo("Device to host transfer", timer.tick(), 3 * size));

    return CL_SUCCESS;
}
int _tmain(int argc, _TCHAR* argv[]) { // Sample 1: float image, 1 band, with some pixels set to invalid / void, maxZError = 0.1 int h = 512; int w = 512; float* zImg = new float[w * h]; memset(zImg, 0, w * h * sizeof(float)); LercNS::BitMask bitMask(w, h); bitMask.SetAllValid(); for (int k = 0, i = 0; i < h; i++) { for (int j = 0; j < w; j++, k++) { zImg[k] = sqrt((float)(i * i + j * j)); // smooth surface zImg[k] += rand() % 20; // add some small amplitude noise if (j % 100 == 0 || i % 100 == 0) // set some void points bitMask.SetInvalid(k); } } // compress into byte arr double maxZErrorWanted = 0.1; double eps = 0.0001; // safety margin (optional), to account for finite floating point accuracy double maxZError = maxZErrorWanted - eps; size_t numBytesNeeded = 0; size_t numBytesWritten = 0; Lerc lerc; PerfTimer pt; if (!lerc.ComputeBufferSize((void*)zImg, // raw image data, row by row, band by band Lerc::DT_Float, w, h, 1, &bitMask, // set 0 if all pixels are valid maxZError, // max coding error per pixel, or precision numBytesNeeded)) // size of outgoing Lerc blob { cout << "ComputeBufferSize failed" << endl; } size_t numBytesBlob = numBytesNeeded; Byte* pLercBlob = new Byte[numBytesBlob]; pt.start(); if (!lerc.Encode((void*)zImg, // raw image data, row by row, band by band Lerc::DT_Float, w, h, 1, &bitMask, // 0 if all pixels are valid maxZError, // max coding error per pixel, or precision pLercBlob, // buffer to write to, function will fail if buffer too small numBytesBlob, // buffer size numBytesWritten)) // num bytes written to buffer { cout << "Encode failed" << endl; } pt.stop(); double ratio = w * h * (0.125 + sizeof(float)) / numBytesBlob; cout << "sample 1 compression ratio = " << ratio << ", encode time = " << pt.ms() << " ms" << endl; // new data storage float* zImg3 = new float[w * h]; memset(zImg3, 0, w * h * sizeof(float)); BitMask bitMask3(w, h); bitMask3.SetAllValid(); // decompress Lerc::LercInfo lercInfo; if (!lerc.GetLercInfo(pLercBlob, 
numBytesBlob, lercInfo)) cout << "get header info failed" << endl; if (lercInfo.nCols != w || lercInfo.nRows != h || lercInfo.nBands != 1 || lercInfo.dt != Lerc::DT_Float) cout << "got wrong lerc info" << endl; pt.start(); if (!lerc.Decode(pLercBlob, numBytesBlob, &bitMask3, w, h, 1, Lerc::DT_Float, (void*)zImg3)) cout << "decode failed" << endl; pt.stop(); // compare to orig double maxDelta = 0; for (int k = 0, i = 0; i < h; i++) { for (int j = 0; j < w; j++, k++) { if (bitMask3.IsValid(k) != bitMask.IsValid(k)) cout << "Error in main: decoded bit mask differs from encoded bit mask" << endl; if (bitMask3.IsValid(k)) { double delta = fabs(zImg3[k] - zImg[k]); if (delta > maxDelta) maxDelta = delta; } } } cout << "max z error per pixel = " << maxDelta << ", decode time = " << pt.ms() << " ms" << endl; delete[] zImg; delete[] zImg3; delete[] pLercBlob; pLercBlob = 0; // Sample 2: random byte image, 3 bands, all pixels valid, maxZError = 0 (lossless) h = 713; w = 257; Byte* byteImg = new Byte[w * h * 3]; memset(byteImg, 0, w * h * 3); for (int iBand = 0; iBand < 3; iBand++) { Byte* arr = byteImg + iBand * w * h; for (int k = 0, i = 0; i < h; i++) for (int j = 0; j < w; j++, k++) arr[k] = rand() % 30; } // encode if (!lerc.ComputeBufferSize((void*)byteImg, Lerc::DT_Byte, w, h, 3, 0, 0, numBytesNeeded)) cout << "ComputeBufferSize failed" << endl; numBytesBlob = numBytesNeeded; pLercBlob = new Byte[numBytesBlob]; pt.start(); if (!lerc.Encode((void*)byteImg, // raw image data, row by row, band by band Lerc::DT_Byte, w, h, 3, 0, // 0 if all pixels are valid 0, // max coding error per pixel, or precision pLercBlob, // buffer to write to, function will fail if buffer too small numBytesBlob, // buffer size numBytesWritten)) // num bytes written to buffer { cout << "Encode failed" << endl; } pt.stop(); ratio = w * h * 3 / (double)numBytesBlob; cout << "sample 2 compression ratio = " << ratio << ", encode time = " << pt.ms() << " ms" << endl; // new data storage Byte* byteImg3 
= new Byte[w * h * 3]; memset(byteImg3, 0, w * h * 3); // decompress if (!lerc.GetLercInfo(pLercBlob, numBytesBlob, lercInfo)) cout << "get header info failed" << endl; if (lercInfo.nCols != w || lercInfo.nRows != h || lercInfo.nBands != 3 || lercInfo.dt != Lerc::DT_Byte) cout << "got wrong lerc info" << endl; pt.start(); if (!lerc.Decode(pLercBlob, numBytesBlob, 0, w, h, 3, Lerc::DT_Byte, (void*)byteImg3)) cout << "decode failed" << endl; pt.stop(); // compare to orig maxDelta = 0; for (int k = 0, i = 0; i < h; i++) for (int j = 0; j < w; j++, k++) { double delta = abs(byteImg3[k] - byteImg[k]); if (delta > maxDelta) maxDelta = delta; } cout << "max z error per pixel = " << maxDelta << ", decode time = " << pt.ms() << " ms" << endl; delete[] byteImg; delete[] byteImg3; delete[] pLercBlob; pLercBlob = 0; #ifdef TestLegacyData Byte* pLercBuffer = new Byte[4 * 2048 * 2048]; Byte* pDstArr = new Byte[4 * 2048 * 2048]; vector<string> fnVec; string path = "D:/GitHub/LercOpenSource/testData/"; fnVec.push_back("amazon3.lerc1"); fnVec.push_back("tuna.lerc1"); fnVec.push_back("tuna_0_to_1_w1920_h925.lerc1"); fnVec.push_back("testbytes.lerc2"); fnVec.push_back("testHuffman_w30_h20_uchar0.lerc2"); fnVec.push_back("testHuffman_w30_h20_ucharx.lerc2"); fnVec.push_back("testHuffman_w1922_h1083_uchar.lerc2"); fnVec.push_back("testall_w30_h20_char.lerc2"); fnVec.push_back("testall_w30_h20_byte.lerc2"); fnVec.push_back("testall_w30_h20_short.lerc2"); fnVec.push_back("testall_w30_h20_ushort.lerc2"); fnVec.push_back("testall_w30_h20_long.lerc2"); fnVec.push_back("testall_w30_h20_ulong.lerc2"); fnVec.push_back("testall_w30_h20_float.lerc2"); fnVec.push_back("testall_w1922_h1083_char.lerc2"); fnVec.push_back("testall_w1922_h1083_byte.lerc2"); fnVec.push_back("testall_w1922_h1083_short.lerc2"); fnVec.push_back("testall_w1922_h1083_ushort.lerc2"); fnVec.push_back("testall_w1922_h1083_long.lerc2"); fnVec.push_back("testall_w1922_h1083_ulong.lerc2"); 
fnVec.push_back("testall_w1922_h1083_float.lerc2"); fnVec.push_back("testuv_w30_h20_char.lerc2"); fnVec.push_back("testuv_w30_h20_byte.lerc2"); fnVec.push_back("testuv_w30_h20_short.lerc2"); fnVec.push_back("testuv_w30_h20_ushort.lerc2"); fnVec.push_back("testuv_w30_h20_long.lerc2"); fnVec.push_back("testuv_w30_h20_ulong.lerc2"); fnVec.push_back("testuv_w30_h20_float.lerc2"); fnVec.push_back("testuv_w1922_h1083_char.lerc2"); fnVec.push_back("testuv_w1922_h1083_byte.lerc2"); fnVec.push_back("testuv_w1922_h1083_short.lerc2"); fnVec.push_back("testuv_w1922_h1083_ushort.lerc2"); fnVec.push_back("testuv_w1922_h1083_long.lerc2"); fnVec.push_back("testuv_w1922_h1083_ulong.lerc2"); fnVec.push_back("testuv_w1922_h1083_float.lerc2"); for (size_t n = 0; n < fnVec.size(); n++) { string fn = path; fn += fnVec[n]; FILE* fp = 0; fopen_s(&fp, fn.c_str(), "rb"); fseek(fp, 0, SEEK_END); size_t fileSize = ftell(fp); // get the file size fclose(fp); fp = 0; fopen_s(&fp, fn.c_str(), "rb"); fread(pLercBuffer, 1, fileSize, fp); // read Lerc blob into buffer fclose(fp); fp = 0; if (!lerc.GetLercInfo(pLercBuffer, fileSize, lercInfo)) cout << "get header info failed" << endl; else { int w = lercInfo.nCols; int h = lercInfo.nRows; int nBands = lercInfo.nBands; Lerc::DataType dt = lercInfo.dt; pt.start(); std::string resultMsg = "ok"; BitMask bitMask; if (!lerc.Decode(pLercBuffer, fileSize, &bitMask, w, h, nBands, dt, (void*)pDstArr)) resultMsg = "FAILED"; pt.stop(); printf("w = %4d, h = %4d, nBands = %2d, dt = %2d, time = %4d ms, %s : %s\n", w, h, nBands, (int)dt, pt.ms(), resultMsg.c_str(), fnVec[n].c_str()); } } #endif printf("\npress ENTER\n"); getchar(); return 0; }