CLerror CLElectrosFunctor<T>::LoadKernels ( size_t deviceID )
{
    PerfTimer timer;
    timer.start();
    FunctorData &data = m_functors[deviceID];

    cout<<" Reading kernel source"<<endl;
    using std::ifstream;
    ifstream reader("Electrostatics.cl.c", ifstream::in);
    if (!reader.good())
    {
        cout<<"Cannot open program source"<<endl;
        return -1;
    }
    reader.seekg (0, std::ios::end);
    size_t length = reader.tellg();
    reader.seekg (0, std::ios::beg);
    char *source = new char[length];
    reader.read(source, length);
    reader.close();

    /*
     * Different devices require different work group sizes to operate
     * optimally. The amount of __local memory on some kernels depends on these
     * work-group sizes. This causes a problem as explained below:
     * There are two ways to use group-local memory
     * 1) Allocate it as a parameter with clSetKernelArg()
     * 2) Declare it as a constant __local array within the cl kernel
     * Option (1) has the advantage of flexibility, but the extra indexing
     * overhead is a performance killer (20-25% easily lost on nvidia GPUs)
     * Option (2) has the advantage that the compiler knows the arrays are of
     * constant size, and is free to do extreme optimizations.
     * Of course, then both host and kernel have to agree on the size of the
     * work group.
     * We abuse the fact that the source code is compiled at runtime, decide
     * those sizes in the host code, then #define them in the kernel code,
     * before it is compiled.
     */

    // BLOCK size
    data.local = {BLOCK_X, 1, 1};
    size_t local_MT[3] = {BLOCK_X_MT, BLOCK_Y_MT, 1};
    // GRID size
    data.global = {((this->m_nLines + BLOCK_X - 1)/BLOCK_X)
                   * BLOCK_X, 1, 1
                  };
    data.global[0] /= data.vecWidth;
    data.local[0] /= data.vecWidth;
    cout<<"Local   : "<<data.local[0]<<" "<<data.local[1]<<" "
        <<data.local[2]<<endl;
    cout<<"Local_MT: "<<local_MT[0]<<" "<<local_MT[1]<<" "<<local_MT[2]<<endl;
    cout<<"Global  : "<<data.global[0]<<" "<<data.global[1]<<" "
        <<data.global[2]<<endl;

    char defines[1024];
    const size_t kernelSteps = this->m_pFieldLinesData->GetSize()
                               / this->m_nLines;
    snprintf(defines, sizeof(defines),
             "#define BLOCK_X %u\n"
             "#define BLOCK_X_MT %u\n"
             "#define BLOCK_Y_MT %u\n"
             "#define KERNEL_STEPS %u\n"
             "#define Tprec %s\n"
             "#define Tvec %s\n",
             (unsigned int) data.local[0],
             (unsigned int) local_MT[0], (unsigned int)local_MT[1],
             (unsigned int) kernelSteps,
             FindPrecType(),
             FindVecType(data.vecWidth)
            );

    cout<<" Calc'ed kern steps "<<kernelSteps<<endl;
    char *srcs[2] = {defines, source};
    CLerror err;
    cl_program prog = clCreateProgramWithSource(data.context, 2,
                      (const char**) srcs,
                      NULL, &err);
    if (err)cout<<"clCreateProgramWithSource returns: "<<err<<endl;

    char options[] = "-cl-fast-relaxed-math";
    err = clBuildProgram(prog, 0, NULL, options, NULL, NULL);
    if (err)cout<<"clBuildProgram returns: "<<err<<endl;

    size_t logSize;
    clGetProgramBuildInfo(prog, data.device->deviceID,
                          CL_PROGRAM_BUILD_LOG,
                          0, NULL, &logSize);
    char * log = (char*)malloc(logSize);
    clGetProgramBuildInfo(prog, data.device->deviceID,
                          CL_PROGRAM_BUILD_LOG,
                          logSize, log, 0);
    cout<<"Program Build Log:"<<endl<<log<<endl;
    CL_ASSERTE(err, "clBuildProgram failed");
    data.perfData.add(TimingInfo("Program compilation", timer.tick()));



    //==========================================================================
    cout<<" Preparing kernel"<<endl;
    data.kernel = clCreateKernel(prog, "CalcField_curvature", &err);
    CL_ASSERTE(err, "clCreateKernel");
    return CL_SUCCESS;
}
unsigned long CLElectrosFunctor<T>::MainFunctor (
    size_t functorIndex,    ///< Functor whose data to process
    size_t deviceIndex      ///< Device on which to process data
)
{
    if(functorIndex != deviceIndex)
        cerr<<"WARNING: Different functor and device"<<endl;
    PerfTimer timer;
    FunctorData &funData = m_functors[functorIndex];
    FunctorData &devData = m_functors[deviceIndex];
    perfPacket &profiler = devData.perfData;
    timer.start();
    CLerror err;
    cl_context ctx = devData.context;

    cout<<" Preparing buffers"<<endl;
    Vector3<cl_mem> &arrdata = devData.devFieldMem;
    cl_mem &charges = devData.chargeMem;
    cl_kernel &kernel = devData.kernel;

    err = CL_SUCCESS;
    // __global float *x,
    err |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &arrdata.x);
    // __global float *y,
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &arrdata.y);
    // __global float *z,
    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &arrdata.z);
    // __global pointCharge *Charges,
    err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &charges);
    // const unsigned int linePitch,
    cl_uint param = this->m_nLines;
    err |= clSetKernelArg(kernel, 4, sizeof(param), &param);
    // const unsigned int p,
    param = (cl_uint)this->m_pPointChargeData->GetSize();
    err |= clSetKernelArg(kernel, 5, sizeof(param), &param);
    // const unsigned int fieldIndex,
    param = 1;
    err |= clSetKernelArg(kernel, 6, sizeof(param), &param);

    // const float resolution
    T res = this->m_resolution;
    err |= clSetKernelArg(kernel, 7, sizeof(res), &res);
    if (err)cout<<"clSetKernelArg cummulates: "<<err<<endl;

    //==========================================================================
    cl_command_queue queue = clCreateCommandQueue(ctx,
                             devData.device->deviceID,
                             0, &err);
    if (err)cout<<"clCreateCommandQueue returns: "<<err<<endl;

    timer.tick();
    Vector3<T*> hostArr = this->m_pFieldLinesData->GetDataPointers();
    const size_t start = funData.startIndex;
    const size_t size = funData.elements * sizeof(T) * funData.steps;

    err = CL_SUCCESS;
    err |= clEnqueueWriteBuffer(queue, arrdata.x, CL_FALSE, 0, size,
                                &hostArr.x[start], 0, NULL, NULL);
    if (err)cout<<"Write 1 returns: "<<err<<endl;
    err |= clEnqueueWriteBuffer(queue, arrdata.y, CL_FALSE, 0, size,
                                &hostArr.y[start], 0, NULL, NULL);
    if (err)cout<<"Write 2 returns: "<<err<<endl;
    err |= clEnqueueWriteBuffer(queue, arrdata.z, CL_FALSE, 0, size,
                                &hostArr.z[start], 0, NULL, NULL);
    if (err)cout<<"Write 3 returns: "<<err<<endl;
    const size_t qSize = this->m_pPointChargeData->GetSizeBytes();
    err |= clEnqueueWriteBuffer(queue, charges, CL_FALSE, 0, qSize,
                                this->m_pPointChargeData->GetDataPointer(),
                                0, NULL, NULL);
    if (err)cout<<"Write 4 returns: "<<err<<endl;
    CL_ASSERTE(err, "Sending data to device failed");

    // Finish memory copies before starting the kernel
    CL_ASSERTE(clFinish(queue), "Pre-kernel sync");

    profiler.add(TimingInfo("Host to device transfer", timer.tick(),
                            3*size + qSize ));

    //==========================================================================

    cout<<" Executing kernel"<<endl;

    timer.tick();
    err |= clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
                                  funData.global, funData.local,
                                  0, NULL, NULL);
    if (err)cout<<"clEnqueueNDRangeKernel returns: "<<err<<endl;
    // Let kernel finish before continuing
    CL_ASSERTE(clFinish(queue), "Post-kernel sync");
    double time = timer.tick();
    this->m_pPerfData->time = time;
    this->m_pPerfData->performance =
        ( this->m_nLines * ( ( 2500-1 ) * ( this->m_pPointChargeData->GetSize()
                                            * ( electroPartFieldFLOP + 3 ) + 13 ) ) / time ) / 1E9;
    profiler.add(TimingInfo("Kernel execution time", time));
    //==========================================================================
    cout<<" Recovering results"<<endl;

    timer.tick();
    err = CL_SUCCESS;
    err |= clEnqueueReadBuffer ( queue, arrdata.x, CL_FALSE, 0, size,
                                 hostArr.x, 0, NULL, NULL );
    if (err)cout<<" Read 1 returns: "<<err<<endl;
    err |= clEnqueueReadBuffer ( queue, arrdata.y, CL_FALSE, 0, size,
                                 hostArr.y, 0, NULL, NULL );
    if (err)cout<<" Read 2 returns: "<<err<<endl;
    err |= clEnqueueReadBuffer ( queue, arrdata.z, CL_FALSE, 0, size,
                                 hostArr.z, 0, NULL, NULL );
    if (err)cout<<" Read 3 returns: "<<err<<endl;
    if (err)cout<<"clEnqueueReadBuffer cummulates: "<<err<<endl;

    clFinish(queue);

    profiler.add(TimingInfo("Device to host transfer", timer.tick(),
                            3 * size));
    return CL_SUCCESS;
}