/**
 * @brief Reads, specializes, and compiles the OpenCL kernel for one device.
 *
 * Loads "Electrostatics.cl.c", prepends a generated #define header that bakes
 * the host-chosen work-group geometry into the kernel source, builds the
 * program, and creates the "CalcField_curvature" kernel in m_functors[deviceID].
 *
 * @param deviceID index into m_functors selecting the target device/context
 * @return CL_SUCCESS on success, -1 if the source file cannot be opened;
 *         CL_ASSERTE returns the failing CL error code on build failure.
 */
CLerror CLElectrosFunctor<T>::LoadKernels ( size_t deviceID )
{
    PerfTimer timer;
    timer.start();
    FunctorData &data = m_functors[deviceID];

    cout<<" Reading kernel source"<<endl;
    std::ifstream reader("Electrostatics.cl.c", std::ifstream::in);
    if (!reader.good())
    {
        cout<<"Cannot open program source"<<endl;
        return -1;
    }
    // Slurp the whole file into a std::string. This fixes two defects of the
    // old raw new[] buffer: the memory can no longer leak, and the source is
    // no longer handed to OpenCL without a terminator (we also pass explicit
    // lengths below, so embedded NULs could not hurt either).
    std::string source((std::istreambuf_iterator<char>(reader)),
                       std::istreambuf_iterator<char>());
    reader.close();

    /*
     * Different devices require different work group sizes to operate
     * optimally. The amount of __local memory on some kernels depends on these
     * work-group sizes. This causes a problem as explained below:
     * There are two ways to use group-local memory
     * 1) Allocate it as a parameter with clSetKernelArg()
     * 2) Declare it as a constant __local array within the cl kernel
     * Option (1) has the advantage of flexibility, but the extra indexing
     * overhead is a performance killer (20-25% easily lost on nvidia GPUs)
     * Option (2) has the advantage that the compiler knows the arrays are of
     * constant size, and is free to do extreme optimizations.
     * Of course, then both host and kernel have to agree on the size of the
     * work group.
     * We abuse the fact that the source code is compiled at runtime, decide
     * those sizes in the host code, then #define them in the kernel code,
     * before it is compiled.
     */
    // BLOCK size
    data.local = {BLOCK_X, 1, 1};
    size_t local_MT[3] = {BLOCK_X_MT, BLOCK_Y_MT, 1};
    // GRID size: round the line count up to a whole number of blocks
    data.global = {((this->m_nLines + BLOCK_X - 1)/BLOCK_X) * BLOCK_X, 1, 1 };
    // Vectorized kernels process vecWidth lines per work-item
    data.global[0] /= data.vecWidth;
    data.local[0] /= data.vecWidth;

    cout<<"Local  : "<<data.local[0]<<" "<<data.local[1]<<" "
        <<data.local[2]<<endl;
    cout<<"Local_MT: "<<local_MT[0]<<" "<<local_MT[1]<<" "<<local_MT[2]<<endl;
    cout<<"Global : "<<data.global[0]<<" "<<data.global[1]<<" "
        <<data.global[2]<<endl;

    // Generate the #define header that specializes the kernel source
    char defines[1024];
    const size_t kernelSteps = this->m_pFieldLinesData->GetSize() /
                               this->m_nLines;
    snprintf(defines, sizeof(defines),
             "#define BLOCK_X %u\n"
             "#define BLOCK_X_MT %u\n"
             "#define BLOCK_Y_MT %u\n"
             "#define KERNEL_STEPS %u\n"
             "#define Tprec %s\n"
             "#define Tvec %s\n",
             (unsigned int) data.local[0],
             (unsigned int) local_MT[0], (unsigned int) local_MT[1],
             (unsigned int) kernelSteps,
             FindPrecType(), FindVecType(data.vecWidth)
            );
    cout<<" Calc'ed kern steps "<<kernelSteps<<endl;

    // Pass explicit lengths: the file contents are not NUL-terminated, so
    // relying on lengths==NULL (as the old code did) was undefined behavior.
    const char *srcs[2] = {defines, source.c_str()};
    const size_t lengths[2] = {strlen(defines), source.size()};
    CLerror err;
    cl_program prog = clCreateProgramWithSource(data.context, 2, srcs,
                                                lengths, &err);
    if (err)cout<<"clCreateProgramWithSource returns: "<<err<<endl;

    char options[] = "-cl-fast-relaxed-math";
    err = clBuildProgram(prog, 0, NULL, options, NULL, NULL);
    if (err)cout<<"clBuildProgram returns: "<<err<<endl;

    // Always dump the build log (it may carry warnings even on success).
    // std::vector replaces the old malloc'd buffer that was never freed.
    size_t logSize = 0;
    clGetProgramBuildInfo(prog, data.device->deviceID, CL_PROGRAM_BUILD_LOG,
                          0, NULL, &logSize);
    std::vector<char> log(logSize + 1, '\0');
    clGetProgramBuildInfo(prog, data.device->deviceID, CL_PROGRAM_BUILD_LOG,
                          logSize, log.data(), NULL);
    cout<<"Program Build Log:"<<endl<<log.data()<<endl;
    CL_ASSERTE(err, "clBuildProgram failed");

    data.perfData.add(TimingInfo("Program compilation", timer.tick()));

    //==========================================================================
    cout<<" Preparing kernel"<<endl;
    data.kernel = clCreateKernel(prog, "CalcField_curvature", &err);
    CL_ASSERTE(err, "clCreateKernel");
    // The kernel holds its own reference to the program; drop ours so the
    // program object is not leaked (the old code never released it).
    clReleaseProgram(prog);

    return CL_SUCCESS;
}
/**
 * @brief Runs the field-line computation for one functor on one device.
 *
 * Binds kernel arguments, copies the functor's slice of the field-line data
 * and the point charges to the device, launches the kernel, and reads the
 * results back into the same host slice. Timing for each phase is recorded
 * in the device's perfPacket.
 *
 * @param functorIndex functor whose data to process
 * @param deviceIndex  device on which to process data
 * @return CL_SUCCESS (errors inside are reported via CL_ASSERTE / stdout)
 */
unsigned long CLElectrosFunctor<T>::MainFunctor (
    size_t functorIndex,    ///< Functor whose data to process
    size_t deviceIndex      ///< Device on which to process data
)
{
    if (functorIndex != deviceIndex)
        cerr<<"WARNING: Different functor and device"<<endl;

    PerfTimer timer;
    FunctorData &funData = m_functors[functorIndex];
    FunctorData &devData = m_functors[deviceIndex];
    perfPacket &profiler = devData.perfData;
    timer.start();

    CLerror err;
    cl_context ctx = devData.context;

    cout<<" Preparing buffers"<<endl;
    Vector3<cl_mem> &arrdata = devData.devFieldMem;
    cl_mem &charges = devData.chargeMem;
    cl_kernel &kernel = devData.kernel;

    err = CL_SUCCESS;
    // __global float *x,
    err |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &arrdata.x);
    // __global float *y,
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &arrdata.y);
    // __global float *z,
    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &arrdata.z);
    // __global pointCharge *Charges,
    err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &charges);
    // const unsigned int linePitch,
    cl_uint param = this->m_nLines;
    err |= clSetKernelArg(kernel, 4, sizeof(param), &param);
    // const unsigned int p,
    param = (cl_uint)this->m_pPointChargeData->GetSize();
    err |= clSetKernelArg(kernel, 5, sizeof(param), &param);
    // const unsigned int fieldIndex,
    // NOTE(review): hard-coded start index of 1 (slot 0 holds the seed
    // points) -- confirm against the kernel source.
    param = 1;
    err |= clSetKernelArg(kernel, 6, sizeof(param), &param);
    // const float resolution
    T res = this->m_resolution;
    err |= clSetKernelArg(kernel, 7, sizeof(res), &res);
    if (err)cout<<"clSetKernelArg cummulates: "<<err<<endl;

    //==========================================================================
    cl_command_queue queue = clCreateCommandQueue(ctx,
                                                  devData.device->deviceID,
                                                  0, &err);
    if (err)cout<<"clCreateCommandQueue returns: "<<err<<endl;

    timer.tick();
    Vector3<T*> hostArr = this->m_pFieldLinesData->GetDataPointers();
    const size_t start = funData.startIndex;
    const size_t size = funData.elements * sizeof(T) * funData.steps;

    err = CL_SUCCESS;
    err |= clEnqueueWriteBuffer(queue, arrdata.x, CL_FALSE, 0, size,
                                &hostArr.x[start], 0, NULL, NULL);
    if (err)cout<<"Write 1 returns: "<<err<<endl;
    err |= clEnqueueWriteBuffer(queue, arrdata.y, CL_FALSE, 0, size,
                                &hostArr.y[start], 0, NULL, NULL);
    if (err)cout<<"Write 2 returns: "<<err<<endl;
    err |= clEnqueueWriteBuffer(queue, arrdata.z, CL_FALSE, 0, size,
                                &hostArr.z[start], 0, NULL, NULL);
    if (err)cout<<"Write 3 returns: "<<err<<endl;
    const size_t qSize = this->m_pPointChargeData->GetSizeBytes();
    err |= clEnqueueWriteBuffer(queue, charges, CL_FALSE, 0, qSize,
                                this->m_pPointChargeData->GetDataPointer(),
                                0, NULL, NULL);
    if (err)cout<<"Write 4 returns: "<<err<<endl;
    CL_ASSERTE(err, "Sending data to device failed");

    // Finish memory copies before starting the kernel
    CL_ASSERTE(clFinish(queue), "Pre-kernel sync");
    profiler.add(TimingInfo("Host to device transfer", timer.tick(),
                            3*size + qSize ));

    //==========================================================================
    cout<<" Executing kernel"<<endl;
    timer.tick();
    // Fresh error code: the old '|=' here folded in the (already asserted)
    // transfer status for no benefit.
    err = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, funData.global,
                                 funData.local, 0, NULL, NULL);
    if (err)cout<<"clEnqueueNDRangeKernel returns: "<<err<<endl;

    // Let kernel finish before continuing
    CL_ASSERTE(clFinish(queue), "Post-kernel sync");

    double time = timer.tick();
    this->m_pPerfData->time = time;
    // NOTE(review): '2500-1' is a hard-coded step count; presumably it must
    // equal the KERNEL_STEPS the program was compiled with -- confirm.
    this->m_pPerfData->performance = ( this->m_nLines * ( ( 2500-1 ) *
            ( this->m_pPointChargeData->GetSize() *
              ( electroPartFieldFLOP + 3 ) + 13 ) ) / time ) / 1E9;
    profiler.add(TimingInfo("Kernel execution time", time));

    //==========================================================================
    cout<<" Recovering results"<<endl;
    timer.tick();
    // Read back into the same host slice the writes were staged from.
    // The old code read into hostArr.* at offset 0, which corrupted the
    // wrong region whenever startIndex != 0 (multi-device splits).
    err = CL_SUCCESS;
    err |= clEnqueueReadBuffer ( queue, arrdata.x, CL_FALSE, 0, size,
                                 &hostArr.x[start], 0, NULL, NULL );
    if (err)cout<<" Read 1 returns: "<<err<<endl;
    err |= clEnqueueReadBuffer ( queue, arrdata.y, CL_FALSE, 0, size,
                                 &hostArr.y[start], 0, NULL, NULL );
    if (err)cout<<" Read 2 returns: "<<err<<endl;
    err |= clEnqueueReadBuffer ( queue, arrdata.z, CL_FALSE, 0, size,
                                 &hostArr.z[start], 0, NULL, NULL );
    if (err)cout<<" Read 3 returns: "<<err<<endl;
    if (err)cout<<"clEnqueueReadBuffer cummulates: "<<err<<endl;

    clFinish(queue);
    profiler.add(TimingInfo("Device to host transfer", timer.tick(),
                            3 * size));

    // The queue is created per call; release it so repeated invocations do
    // not leak OpenCL objects (the old code never released it).
    // NOTE(review): the CL_ASSERTE early returns above still leak the queue
    // -- an RAII wrapper would cover those paths too.
    clReleaseCommandQueue(queue);

    return CL_SUCCESS;
}