예제 #1
0
void RunTest(ResultDatabase &resultDB, const int npasses, const int verbose,
        const int noPB, const float repeatF, ProgressBar &pb, 
        const char* precision, const int micdev)
{
    char sizeStr[128];
    static __declspec(target(mic)) T *hostMem;

    int realRepeats = (int)round(repeatF*20);
    if (realRepeats < 2) realRepeats = 2;

    // Allocate host memory
    int halfNumFloats = 1024*1024;
    int numFloats = 2*halfNumFloats;
    hostMem = (T*)_mm_malloc(sizeof(T)*numFloats,64);

    sprintf (sizeStr, "Size:%07d", numFloats);
    float t = 0.0f;
    double TH;
    double flopCount;
    double gflop;

    for (int pass=0 ; pass<npasses ; ++pass)
    {
        ////////// Add1 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            Add1_MIC<T>(numFloats,hostMem, realRepeats, 10.0);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * realRepeats * omp_get_num_threads();
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("Add1")+precision, sizeStr, "GFLOPS", gflop);
        
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// Add2 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            Add2_MIC<T>(numFloats,hostMem, realRepeats, 10.0);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * realRepeats * 120 * 2;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("Add2")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// Add4 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            Add4_MIC<T>(numFloats,hostMem, realRepeats, 10.0);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * realRepeats * 60 * 4;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("Add4")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// Add8 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            Add8_MIC<T>(numFloats,hostMem, realRepeats, 10.0);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * realRepeats * 80 * 3;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("Add8")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// Mul1 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            Mul1_MIC<T>(numFloats,hostMem, realRepeats, 1.01);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * 2 * realRepeats * 200;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("Mul1")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// Mul2 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            Mul2_MIC<T>(numFloats,hostMem, realRepeats, 1.01);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * 2 * realRepeats * 100 * 2;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("Mul2")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// Mul4 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            Mul4_MIC<T>(numFloats,hostMem, realRepeats, 1.01);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * 2 * realRepeats * 50 * 4;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("Mul4")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// Mul8 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            Mul8_MIC<T>(numFloats,hostMem, realRepeats, 1.01);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * 2 * realRepeats * 25 * 8;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("Mul8")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// MAdd1 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            MAdd1_MIC<T>(numFloats,hostMem, realRepeats, 10.0, 0.9899);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * 2 * realRepeats * omp_get_num_threads() * 1;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("MAdd1")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// MAdd2 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            MAdd2_MIC<T>(numFloats,hostMem, realRepeats, 10.0, 0.9899);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * 2 * realRepeats * 120 * 2;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("MAdd2")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// MAdd4 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            MAdd4_MIC<T>(numFloats,hostMem, realRepeats, 10.0, 0.9899);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * 2 * realRepeats * 60 * 4;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("MAdd4")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// MAdd8 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            MAdd8_MIC<T>(numFloats,hostMem, realRepeats, 10.0, 0.9899);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * 2 * realRepeats * 30 * 8;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("MAdd8")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// MulMAdd1 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            MulMAdd1_MIC<T>(numFloats,hostMem, realRepeats, 3.75, 0.355);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * 3 * realRepeats * 160 * 1;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("MulMAdd1")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// MulMAdd2 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            MulMAdd2_MIC<T>(numFloats,hostMem, realRepeats, 3.75, 0.355);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * 3 * realRepeats * 80 * 2;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("MulMAdd2")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// MulMAdd4 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            MulMAdd4_MIC<T>(numFloats,hostMem, realRepeats, 3.75, 0.355);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * 3 * realRepeats * 40 * 4;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("MulMAdd4")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// MulMAdd8 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            MulMAdd8_MIC<T>(numFloats,hostMem, realRepeats, 3.75, 0.355);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * 3 * realRepeats * 20 * 8;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("MulMAdd8")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);
    }
    _mm_free(hostMem);
}
예제 #2
0
template <class T> void
RunTest(cl_device_id id,
        cl_context ctx,
        cl_command_queue queue,
        ResultDatabase &resultDB,
        int npasses,
        int verbose,
        int quiet,
        float repeatF,
        size_t localWorkSize,
        ProgressBar &pb,
        const char* typeName,
        const char* precision,
        const char* pragmaText)
{
    int err;
    cl_mem mem1;
    char sizeStr[128];
    T *hostMem, *hostMem2;
    
    int aIdx = 0;
    while ((aTests!=0) && (aTests[aIdx].name!=0))
    {
       ostringstream oss;
       struct _benchmark_type temp = aTests[aIdx];

       // Calculate adjusted repeat factor
       int tentativeRepeats = (int)round(repeatF*temp.numRepeats);
       if (tentativeRepeats < 2) {
          tentativeRepeats = 2;
          double realRepeatF = ((double)tentativeRepeats) 
            / temp.numRepeats;
          if (realRepeatF>8.0*repeatF)  // do not cut the number of unrolls
                                        // by more than a factor of 8
             realRepeatF = 8.0*repeatF;
          temp.numUnrolls = 
                  (int)round(repeatF*temp.numUnrolls/realRepeatF);
       }
       temp.numRepeats = tentativeRepeats;
       
       // Generate kernel source code
       generateKernel(oss, temp, typeName, pragmaText);
       std::string kernelCode(oss.str());

       // If in verbose mode, print the kernel
       if (verbose)
       {
           cout << "Code for kernel " << temp.name 
                << ":\n" + kernelCode << endl;
       }

       // Alloc host memory
       int halfNumFloatsMax = temp.halfBufSizeMax*1024;
       int numFloatsMax = 2*halfNumFloatsMax;
       hostMem = new T[numFloatsMax];
       hostMem2 = new T[numFloatsMax];
       
       // Allocate device memory
       mem1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
                                 sizeof(T)*numFloatsMax, NULL, &err);
       CL_CHECK_ERROR(err);
       
       // Issue a copy to force device allocation
       err = clEnqueueWriteBuffer(queue, mem1, true, 0,
                               numFloatsMax*sizeof(T), hostMem,
                               0, NULL, NULL);
       CL_CHECK_ERROR(err);

       // Create kernel program object
       const char* progSource[] = {kernelCode.c_str()};
       cl_program prog = clCreateProgramWithSource(ctx, 1, progSource, 
           NULL, &err);
       CL_CHECK_ERROR(err);
       
       // Compile the program
       err = clBuildProgram(prog, 1, &id, opts, NULL, NULL);
       CL_CHECK_ERROR(err);
       
       if (err != 0)
       {
           char log[5000];
           size_t retsize = 0;
           err =  clGetProgramBuildInfo(prog, id, CL_PROGRAM_BUILD_LOG,
                    5000*sizeof(char), log, &retsize);
           CL_CHECK_ERROR(err);
           
           cout << "Build error." << endl;
           cout << "Retsize: " << retsize << endl;
           cout << "Log: " << log << endl;
           return;
       }

       // Check if we have to dump the PTX (NVIDIA only)
       // Disabled by default
       // Set environment variable DUMP_PTX to enable
       char* dumpPtx = getenv("DUMP_PTX");
       if (dumpPtx && !strcmp(dumpPtx, "1")) {  // must dump the PTX
          dumpPTXCode(ctx, prog, temp.name);
       }
       
       // Extract out kernel
       cl_kernel kernel_madd = clCreateKernel(prog, temp.name, &err);
       CL_CHECK_ERROR(err);
       
       err = clSetKernelArg (kernel_madd, 0, sizeof(cl_mem), (void*)&mem1);
       CL_CHECK_ERROR (err);
       err = clSetKernelArg (kernel_madd, 1, sizeof(cl_int), 
           (void*)&temp.numRepeats);
       CL_CHECK_ERROR (err);

       if (verbose)
       {
          cout << "Running kernel " << temp.name << endl;
       }
       
       for (int halfNumFloats=temp.halfBufSizeMin*1024 ; 
            halfNumFloats<=temp.halfBufSizeMax*1024 ;
            halfNumFloats*=temp.halfBufSizeStride)
       {
          // Set up input memory, first half = second half
          int numFloats = 2*halfNumFloats;
          for (int j=0; j<halfNumFloats; ++j)
          {
             hostMem[j] = hostMem[numFloats-j-1] = (T)(drand48()*5.0);
          }

          size_t globalWorkSize = numFloats;

          for (int pas=0 ; pas<npasses ; ++pas) 
          {
             err = clEnqueueWriteBuffer (queue, mem1, true, 0,
                                   numFloats*sizeof(T), hostMem,
                                   0, NULL, NULL);
             CL_CHECK_ERROR(err);

             Event evKernel(temp.name);
             
             err = clEnqueueNDRangeKernel(queue, kernel_madd, 1, NULL,
                                 &globalWorkSize, &localWorkSize,
                                 0, NULL, &evKernel.CLEvent());
             CL_CHECK_ERROR(err);
             
             err = clWaitForEvents(1, &evKernel.CLEvent());
             CL_CHECK_ERROR(err);
             
             evKernel.FillTimingInfo();
             double flopCount = (double)numFloats * 
                                temp.flopCount *
                                temp.numRepeats * 
                                temp.numUnrolls * 
                                temp.numStreams;

             double gflop = flopCount / (double)(evKernel.SubmitEndRuntime());

             sprintf (sizeStr, "Size:%07d", numFloats);
             resultDB.AddResult(string(temp.name)+precision, sizeStr, "GFLOPS", gflop);

             // Zero out the test host memory
             for (int j=0 ; j<numFloats ; ++j)
             {
                 hostMem2[j] = 0.0;
             }

             // Read the result device memory back to the host
             err = clEnqueueReadBuffer(queue, mem1, true, 0,
                                 numFloats*sizeof(T), hostMem2,
                                 0, NULL, NULL);
             CL_CHECK_ERROR(err);
               
             // Check the result -- At a minimum the first half of memory
             // should match the second half exactly
             for (int j=0 ; j<halfNumFloats ; ++j)
             {
                if (hostMem2[j] != hostMem2[numFloats-j-1])
                {
                   cout << "Error; hostMem2[" << j << "]=" << hostMem2[j]
                        << " is different from its twin element hostMem2["
                        << (numFloats-j-1) << "]=" << hostMem2[numFloats-j-1]
                        <<"; stopping check\n";
                   break;
                }
             }
             
             // update progress bar
             pb.addItersDone();
             if (!verbose && !quiet)
                 pb.Show(stdout);
          }
       }

       err = clReleaseKernel (kernel_madd);
       CL_CHECK_ERROR(err);
       err = clReleaseProgram (prog);
       CL_CHECK_ERROR(err);
       err = clReleaseMemObject(mem1);
       CL_CHECK_ERROR(err);

       aIdx += 1;
       
       delete[] hostMem;
       delete[] hostMem2;
    }
 
    // Now, test hand-tuned custom kernels
    
    // 2D - width and height of input
    const int w = 2048, h = 2048;
    const int bytes = w * h * sizeof(T);
    
    // Allocate some device memory
    mem1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bytes, NULL, &err);
    CL_CHECK_ERROR(err);
    
    // Get a couple non-zero random numbers
    float val1 = 0, val2 = 0;
    while (val1==0 || val2==0)
    {
        val1 = drand48();
        val2 = drand48();
    }
    // For each custom kernel
    for (int kCounter = 0; kCounter < 2; kCounter++)
    {
        // Calculate adjusted repeat factor
        int tentativeRepeats = (int)round(repeatF*5);
        int nUnrolls = 100;
        if (tentativeRepeats < 2) {
           tentativeRepeats = 2;
           double realRepeatF = ((double)tentativeRepeats) / 5;
           if (realRepeatF>8.0*repeatF)  // do not cut the number of unrolls
                                         // by more than a factor of 8
              realRepeatF = 8.0*repeatF;
           nUnrolls = (int)round(repeatF*100/realRepeatF);
        }
        
        // Double precision not currently supported
        string kSource = generateUKernel(kCounter, false, tentativeRepeats, nUnrolls, 
                       typeName, pragmaText);
        
        const char* progSource[] = {kSource.c_str()};
        cl_program prog = clCreateProgramWithSource(ctx,
                                 1, progSource, NULL, &err);
        CL_CHECK_ERROR(err);
    
        // Compile kernel
        err = clBuildProgram(prog, 1, &id, opts, NULL, NULL);
        CL_CHECK_ERROR(err);
        
        // Extract out kernel 
        cl_kernel kernel_madd = clCreateKernel(prog, "peak", &err);
        
        // Calculate kernel launch parameters
        //size_t localWorkSize = maxGroupSize<128?maxGroupSize:128;
        size_t globalWorkSize = w * h;
        
        // Set the arguments
        err = clSetKernelArg(kernel_madd, 0, sizeof(cl_mem), (void*)&mem1);
        CL_CHECK_ERROR(err);
        err = clSetKernelArg(kernel_madd, 1, sizeof(T), (void*)&val1);
        CL_CHECK_ERROR(err);
        err = clSetKernelArg(kernel_madd, 2, sizeof(T), (void*)&val2);
        CL_CHECK_ERROR(err);


        // Event object for timing
        Event evKernel_madd("madd");
        for (int passCounter=0; passCounter < npasses; passCounter++)
        {
            err = clEnqueueNDRangeKernel(queue, kernel_madd, 1, NULL,
                      &globalWorkSize, &localWorkSize,
                      0, NULL, &evKernel_madd.CLEvent());
            CL_CHECK_ERROR(err);
    
            // Wait for the kernel to finish
            err = clWaitForEvents(1, &evKernel_madd.CLEvent());
            CL_CHECK_ERROR(err);  
            evKernel_madd.FillTimingInfo();
            
            // Calculate result and add to DB
            char atts[1024];
            double nflopsPerItem = getUFlopCount(kCounter, false, tentativeRepeats, nUnrolls);
            sprintf(atts, "Size:%d", w*h);
            double gflops = (double) (nflopsPerItem*w*h) /
                            (double) evKernel_madd.SubmitEndRuntime();
            
            if (kCounter) {
                resultDB.AddResult(string("MulMAddU")+precision, atts, "GFLOPS", gflops);
            } else {
                resultDB.AddResult(string("MAddU")+precision, atts, "GFLOPS", gflops);
            }
            // update progress bar
            pb.addItersDone();
            if (!verbose && !quiet)
            {
                pb.Show(stdout);
            }
        }
        err = clReleaseKernel(kernel_madd);
        CL_CHECK_ERROR(err);
        err = clReleaseProgram(prog);
        CL_CHECK_ERROR(err);
    }
    err = clReleaseMemObject(mem1);
    CL_CHECK_ERROR(err);
}