コード例 #1
0
ファイル: reduce.c プロジェクト: kuolei/CocoaSampleCode
int main(int argc, char **argv)
{
    uint64_t         t1 = 0;
    uint64_t         t2 = 0;
    int              err;
    cl_device_id     device_id;
    cl_command_queue commands;
    cl_context       context;
    cl_mem			 output_buffer;
    cl_mem           input_buffer;
    cl_mem           partials_buffer;
    size_t           typesize;
    int              pass_count = 0;
    size_t*          group_counts = 0;
    size_t*          work_item_counts = 0;
    int*             operation_counts = 0;
    int*             entry_counts = 0;
    int              use_gpu = 1;

    int i;
    int c;

    // Parse command line options
    //
    for( i = 0; i < argc && argv; i++)
    {
        if(!argv[i])
            continue;

        if(strstr(argv[i], "cpu"))
        {
            use_gpu = 0;
        }
        else if(strstr(argv[i], "gpu"))
        {
            use_gpu = 1;
        }
        else if(strstr(argv[i], "float2"))
        {
            integer = false;
            channels = 2;
        }
        else if(strstr(argv[i], "float4"))
        {
            integer = false;
            channels = 4;
        }
        else if(strstr(argv[i], "float"))
        {
            integer = false;
            channels = 1;
        }
        else if(strstr(argv[i], "int2"))
        {
            integer = true;
            channels = 2;
        }
        else if(strstr(argv[i], "int4"))
        {
            integer = true;
            channels = 4;
        }
        else if(strstr(argv[i], "int"))
        {
            integer = true;
            channels = 1;
        }
    }

    // Create some random input data on the host
    //
    float *float_data = (float*)malloc(count * channels * sizeof(float));
    int *integer_data = (int*)malloc(count * channels * sizeof(int));
    for (i = 0; i < count * channels; i++)
    {
        float_data[i] = ((float) rand() / (float) RAND_MAX);
        integer_data[i] = (int) (255.0f * float_data[i]);
    }

    // Connect to a compute device
    //
    err = clGetDeviceIDs(NULL, use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to locate a compute device!\n");
        return EXIT_FAILURE;
    }

    size_t returned_size = 0;
    size_t max_workgroup_size = 0;
    err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_workgroup_size, &returned_size);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve device info!\n");
        return EXIT_FAILURE;
    }

    cl_char vendor_name[1024] = {0};
    cl_char device_name[1024] = {0};
    err = clGetDeviceInfo(device_id, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &returned_size);
    err|= clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_name), device_name, &returned_size);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve device info!\n");
        return EXIT_FAILURE;
    }

    printf(SEPARATOR);
    printf("Connecting to %s %s...\n", vendor_name, device_name);

    // Load the compute program from disk into a cstring buffer
    //
    typesize = integer ? (sizeof(int)) : (sizeof(float));
    const char* filename = 0;
    switch(channels)
    {
    case 4:
        filename = integer ? "reduce_int4_kernel.cl" : "reduce_float4_kernel.cl";
        break;
    case 2:
        filename = integer ? "reduce_int2_kernel.cl" : "reduce_float2_kernel.cl";
        break;
    case 1:
        filename = integer ? "reduce_int_kernel.cl" : "reduce_float_kernel.cl";
        break;
    default:
        printf("Invalid channel count specified!\n");
        return EXIT_FAILURE;
    };

    printf(SEPARATOR);
    printf("Loading program '%s'...\n", filename);
    printf(SEPARATOR);

    char *source = load_program_source(filename);
    if(!source)
    {
        printf("Error: Failed to load compute program from file!\n");
        return EXIT_FAILURE;
    }

    // Create a compute context
    //
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        return EXIT_FAILURE;
    }

    // Create a command queue
    //
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        return EXIT_FAILURE;
    }

    // Create the input buffer on the device
    //
    size_t buffer_size = typesize * count * channels;
    input_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);
    if (!input_buffer)
    {
        printf("Error: Failed to allocate input buffer on device!\n");
        return EXIT_FAILURE;
    }

    // Fill the input buffer with the host allocated random data
    //
    void *input_data = (integer) ? (void*)integer_data : (void*)float_data;
    err = clEnqueueWriteBuffer(commands, input_buffer, CL_TRUE, 0, buffer_size, input_data, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write to source array!\n");
        return EXIT_FAILURE;
    }

    // Create an intermediate data buffer for intra-level results
    //
    partials_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);
    if (!partials_buffer)
    {
        printf("Error: Failed to allocate partial sum buffer on device!\n");
        return EXIT_FAILURE;
    }

    // Create the output buffer on the device
    //
    output_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);
    if (!output_buffer)
    {
        printf("Error: Failed to allocate result buffer on device!\n");
        return EXIT_FAILURE;
    }

    // Determine the reduction pass configuration for each level in the pyramid
    //
    create_reduction_pass_counts(
        count, max_workgroup_size,
        MAX_GROUPS, MAX_WORK_ITEMS,
        &pass_count, &group_counts,
        &work_item_counts, &operation_counts,
        &entry_counts);

    // Create specialized programs and kernels for each level of the reduction
    //
    cl_program *programs = (cl_program*)malloc(pass_count * sizeof(cl_program));
    memset(programs, 0, pass_count * sizeof(cl_program));

    cl_kernel *kernels = (cl_kernel*)malloc(pass_count * sizeof(cl_kernel));
    memset(kernels, 0, pass_count * sizeof(cl_kernel));

    for(i = 0; i < pass_count; i++)
    {
        char *block_source = malloc(strlen(source) + 1024);
        size_t source_length = strlen(source) + 1024;
        memset(block_source, 0, source_length);

        // Insert macro definitions to specialize the kernel to a particular group size
        //
        const char group_size_macro[] = "#define GROUP_SIZE";
        const char operations_macro[] = "#define OPERATIONS";
        sprintf(block_source, "%s (%d) \n%s (%d)\n\n%s\n",
                group_size_macro, (int)group_counts[i],
                operations_macro, (int)operation_counts[i],
                source);

        // Create the compute program from the source buffer
        //
        programs[i] = clCreateProgramWithSource(context, 1, (const char **) & block_source, NULL, &err);
        if (!programs[i] || err != CL_SUCCESS)
        {
            printf("%s\n", block_source);
            printf("Error: Failed to create compute program!\n");
            return EXIT_FAILURE;
        }

        // Build the program executable
        //
        err = clBuildProgram(programs[i], 0, NULL, NULL, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            size_t length;
            char build_log[2048];
            printf("%s\n", block_source);
            printf("Error: Failed to build program executable!\n");
            clGetProgramBuildInfo(programs[i], device_id, CL_PROGRAM_BUILD_LOG, sizeof(build_log), build_log, &length);
            printf("%s\n", build_log);
            return EXIT_FAILURE;
        }

        // Create the compute kernel from within the program
        //
        kernels[i] = clCreateKernel(programs[i], "reduce", &err);
        if (!kernels[i] || err != CL_SUCCESS)
        {
            printf("Error: Failed to create compute kernel!\n");
            return EXIT_FAILURE;
        }

        free(block_source);
    }

    // Do the reduction for each level
    //
    cl_mem pass_swap;
    cl_mem pass_input = output_buffer;
    cl_mem pass_output = input_buffer;

    for(i = 0; i < pass_count; i++)
    {
        size_t global = group_counts[i] * work_item_counts[i];
        size_t local = work_item_counts[i];
        unsigned int operations = operation_counts[i];
        unsigned int entries = entry_counts[i];
        size_t shared_size = typesize * channels * local * operations;

        printf("Pass[%4d] Global[%4d] Local[%4d] Groups[%4d] WorkItems[%4d] Operations[%d] Entries[%d]\n",  i,
               (int)global, (int)local, (int)group_counts[i], (int)work_item_counts[i], operations, entries);

        // Swap the inputs and outputs for each pass
        //
        pass_swap = pass_input;
        pass_input = pass_output;
        pass_output = pass_swap;

        err = CL_SUCCESS;
        err |= clSetKernelArg(kernels[i],  0, sizeof(cl_mem), &pass_output);
        err |= clSetKernelArg(kernels[i],  1, sizeof(cl_mem), &pass_input);
        err |= clSetKernelArg(kernels[i],  2, shared_size,    NULL);
        err |= clSetKernelArg(kernels[i],  3, sizeof(int),    &entries);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to set kernel arguments!\n");
            return EXIT_FAILURE;
        }

        // After the first pass, use the partial sums for the next input values
        //
        if(pass_input == input_buffer)
            pass_input = partials_buffer;

        err = CL_SUCCESS;
        err |= clEnqueueNDRangeKernel(commands, kernels[i], 1, NULL, &global, &local, 0, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to execute kernel!\n");
            return EXIT_FAILURE;
        }
    }

    err = clFinish(commands);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to wait for command queue to finish! %d\n", err);
        return EXIT_FAILURE;
    }

    // Start the timing loop and execute the kernel over several iterations
    //
    printf(SEPARATOR);
    printf("Timing %d iterations of reduction with %d elements of type %s%s...\n",
           iterations, count, integer ? "int" : "float",
           (channels <= 1) ? (" ") : (channels == 2) ? "2" : "4");
    printf(SEPARATOR);

    int k;
    err = CL_SUCCESS;
    t1 = current_time();
    for (k = 0 ; k < iterations; k++)
    {
        for(i = 0; i < pass_count; i++)
        {
            size_t global = group_counts[i] * work_item_counts[i];
            size_t local = work_item_counts[i];

            err = clEnqueueNDRangeKernel(commands, kernels[i], 1, NULL, &global, &local, 0, NULL, NULL);
            if (err != CL_SUCCESS)
            {
                printf("Error: Failed to execute kernel!\n");
                return EXIT_FAILURE;
            }
        }
    }
    err = clFinish(commands);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to wait for command queue to finish! %d\n", err);
        return EXIT_FAILURE;
    }
    t2 = current_time();

    // Calculate the statistics for execution time and throughput
    //
    double t = subtract_time_in_seconds(t2, t1);
    printf("Exec Time:  %.2f ms\n", 1000.0 * t / (double)(iterations));
    printf("Throughput: %.2f GB/sec\n", 1e-9 * buffer_size * iterations / t);
    printf(SEPARATOR);

    // Read back the results that were computed on the device
    //
    void *computed_result = malloc(typesize * channels);
    memset(computed_result, 0, typesize * channels);
    err = clEnqueueReadBuffer(commands, pass_output, CL_TRUE, 0, typesize * channels, computed_result, 0, NULL, NULL);
    if (err)
    {
        printf("Error: Failed to read back results from the device!\n");
        return EXIT_FAILURE;
    }


    // Verify the results are correct
    //
    if(integer)
    {
        int reference[4] = { 0, 0, 0, 0};
        switch(channels)
        {
        case 4:
            reduce_validate_int4(integer_data, count, reference);
            break;
        case 2:
            reduce_validate_int2(integer_data, count, reference);
            break;
        case 1:
            reduce_validate_int(integer_data, count, reference);
            break;
        default:
            printf("Invalid channel count specified!\n");
            return EXIT_FAILURE;
        }

        int result[4] = { 0.0f, 0.0f, 0.0f, 0.0f};
        for(c = 0; c < channels; c++)
        {
            int v = ((int*) computed_result)[c];
            result[c] += v;
        }

        float error = 0.0f;
        float diff = 0.0f;
        for(c = 0; c < channels; c++)
        {
            diff = fabs(reference[c] - result[c]);
            error = diff > error ? diff : error;
        }

        if (error > MIN_ERROR)
        {
            for(c = 0; c < channels; c++)
                printf("Result[%d] %d != %d\n", c, reference[c], result[c]);

            printf("Error:  Incorrect results obtained! Max error = %f\n", error);
            return EXIT_FAILURE;
        }
        else
        {
            printf("Results Validated!\n");
            printf(SEPARATOR);
        }
    }
    else
    {
        float reference[4] = { 0.0f, 0.0f, 0.0f, 0.0f};
        switch(channels)
        {
        case 4:
            reduce_validate_float4(float_data, count, reference);
            break;
        case 2:
            reduce_validate_float2(float_data, count, reference);
            break;
        case 1:
            reduce_validate_float(float_data, count, reference);
            break;
        default:
            printf("Invalid channel count specified!\n");
            return EXIT_FAILURE;
        }

        float result[4] = { 0.0f, 0.0f, 0.0f, 0.0f};
        for(c = 0; c < channels; c++)
        {
            float v = ((float*) computed_result)[c];
            result[c] += v;
        }

        float error = 0.0f;
        float diff = 0.0f;
        for(c = 0; c < channels; c++)
        {
            diff = fabs(reference[c] - result[c]);
            error = diff > error ? diff : error;
        }

        if (error > MIN_ERROR)
        {
            for(c = 0; c < channels; c++)
                printf("Result[%d] %f != %f\n", c, reference[c], result[c]);

            printf("Error:  Incorrect results obtained! Max error = %f\n", error);
            return EXIT_FAILURE;
        }
        else
        {
            printf("Results Validated!\n");
            printf(SEPARATOR);
        }
    }

    // Shutdown and cleanup
    //
    for(i = 0; i < pass_count; i++)
    {
        clReleaseKernel(kernels[i]);
        clReleaseProgram(programs[i]);
    }

    clReleaseMemObject(input_buffer);
    clReleaseMemObject(output_buffer);
    clReleaseMemObject(partials_buffer);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    free(group_counts);
    free(work_item_counts);
    free(operation_counts);
    free(entry_counts);
    free(computed_result);
    free(kernels);
    free(float_data);
    free(integer_data);


    return 0;
}
コード例 #2
0
ファイル: map-exp.c プロジェクト: esheldon/misc
int main(int argc, char **argv)
{
    int              err;
    cl_device_id     device_id;
    cl_command_queue commands;
    cl_context       context;
    cl_mem			 output_buffer;
    cl_mem           input_buffer;
    cl_mem           partials_buffer;
    size_t           typesize;
    int              pass_count = 0;
    size_t*          group_counts = 0;
    size_t*          work_item_counts = 0;
    int*             operation_counts = 0;
    int*             entry_counts = 0;
    int              use_gpu = 1;
    
    int i;
    int c;
    
    // Parse command line options
    //
    for( i = 0; i < argc && argv; i++)
    {
        if(!argv[i])
            continue;
            
        if(strstr(argv[i], "cpu"))
        {
            use_gpu = 0;        
        }
        else if(strstr(argv[i], "gpu"))
        {
            use_gpu = 1;
        }
    }

    channels=1;
    
    // Create some random input data on the host 
    //
    time_t tstart=0;
    (void) time(&tstart);
    srand48((long) tstart);
    float *float_data = (float*)malloc(count * channels * sizeof(float));
    for (i = 0; i < count * channels; i++)
    {
        float_data[i] = drand48();
    }


    //SETUP PLATFORM
    cl_uint numPlatforms;
    err = clGetPlatformIDs(0, NULL, &numPlatforms);
    if (err != CL_SUCCESS) {
        fprintf(stderr,"could not get platform\n");
        exit(EXIT_FAILURE);
    }

    cl_platform_id platform_id;
    if(numPlatforms > 0)
    {
        //we have at least one
        //cl_platform_id* platforms = new cl_platform_id[numPlatforms];
        cl_platform_id* platforms = calloc(numPlatforms, sizeof(cl_platform_id));
        err = clGetPlatformIDs(numPlatforms, platforms, NULL);
        if (err != CL_SUCCESS) {
            fprintf(stderr,"could not get platform id\n");
            exit(EXIT_FAILURE);
        }

        fprintf(stderr,"Found %d platforms\n", numPlatforms);
        platform_id = platforms[0];
        //delete[] platforms;
        free(platforms);
    }
    else
        exit(0);


    // Connect to a compute device
    //
    err = clGetDeviceIDs(platform_id, use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to locate a compute device!\n");
        return EXIT_FAILURE;
    }

    size_t returned_size = 0;
    size_t max_workgroup_size = 0;
    err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_workgroup_size, &returned_size);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve device info!\n");
        return EXIT_FAILURE;
    }

    cl_char vendor_name[1024] = {0};
    cl_char device_name[1024] = {0};
    err = clGetDeviceInfo(device_id, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &returned_size);
    err|= clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_name), device_name, &returned_size);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve device info!\n");
        return EXIT_FAILURE;
    }

    printf(SEPARATOR);
    printf("Connecting to %s %s...\n", vendor_name, device_name);

    // Load the compute program from disk into a cstring buffer
    //
    typesize = (sizeof(float));    
    const char* filename = 0;
    filename = "apple-reduce-kernel-float.cl";

    printf(SEPARATOR);
    printf("Loading program '%s'...\n", filename);
    printf(SEPARATOR);

    char *source = load_program_source(filename);
    if(!source)
    {
        printf("Error: Failed to load compute program from file!\n");
        return EXIT_FAILURE;    
    }
    
    // Create a compute context 
    //
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        return EXIT_FAILURE;
    }

    // Create a command queue
    //
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        return EXIT_FAILURE;
    }

    // Create the input buffer on the device
    //
    size_t buffer_size = typesize * count * channels;
    input_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);
    if (!input_buffer)
    {
        printf("Error: Failed to allocate input buffer on device!\n");
        return EXIT_FAILURE;
    }

    // Fill the input buffer with the host allocated random data
    //
    void *input_data = (void*)float_data;
    err = clEnqueueWriteBuffer(commands, input_buffer, CL_TRUE, 0, buffer_size, input_data, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write to source array!\n");
        return EXIT_FAILURE;
    }

    // Create an intermediate data buffer for intra-level results
    //
    partials_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);
    if (!partials_buffer)
    {
        printf("Error: Failed to allocate partial sum buffer on device!\n");
        return EXIT_FAILURE;
    }

    // Create the output buffer on the device
    //
    output_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);
    if (!output_buffer)
    {
        printf("Error: Failed to allocate result buffer on device!\n");
        return EXIT_FAILURE;
    }

    // Determine the reduction pass configuration for each level in the pyramid
    //
    create_reduction_pass_counts(
        count, max_workgroup_size, 
        MAX_GROUPS, MAX_WORK_ITEMS, 
        &pass_count, &group_counts, 
        &work_item_counts, &operation_counts,
        &entry_counts);

    // Create specialized programs and kernels for each level of the reduction
    //
    cl_program *programs = (cl_program*)malloc(pass_count * sizeof(cl_program));
    memset(programs, 0, pass_count * sizeof(cl_program));

    cl_kernel *kernels = (cl_kernel*)malloc(pass_count * sizeof(cl_kernel));
    memset(kernels, 0, pass_count * sizeof(cl_kernel));

    for(i = 0; i < pass_count; i++)
    {
        char *block_source = malloc(strlen(source) + 1024);
        size_t source_length = strlen(source) + 1024;
        memset(block_source, 0, source_length);
        
        // Insert macro definitions to specialize the kernel to a particular group size
        //
        const char group_size_macro[] = "#define GROUP_SIZE";
        const char operations_macro[] = "#define OPERATIONS";
        sprintf(block_source, "%s (%d) \n%s (%d)\n\n%s\n", 
            group_size_macro, (int)group_counts[i], 
            operations_macro, (int)operation_counts[i], 
            source);
        
        // Create the compute program from the source buffer
        //
        programs[i] = clCreateProgramWithSource(context, 1, (const char **) & block_source, NULL, &err);
        if (!programs[i] || err != CL_SUCCESS)
        {
            printf("%s\n", block_source);
            printf("Error: Failed to create compute program!\n");
            return EXIT_FAILURE;
        }
    
        // Build the program executable
        //
        err = clBuildProgram(programs[i], 0, NULL, NULL, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            size_t length;
            char build_log[2048];
            printf("%s\n", block_source);
            printf("Error: Failed to build program executable!\n");
            clGetProgramBuildInfo(programs[i], device_id, CL_PROGRAM_BUILD_LOG, sizeof(build_log), build_log, &length);
            printf("%s\n", build_log);
            return EXIT_FAILURE;
        }
    
        // Create the compute kernel from within the program
        //
        kernels[i] = clCreateKernel(programs[i], "reduce", &err);
        if (!kernels[i] || err != CL_SUCCESS)
        {
            printf("Error: Failed to create compute kernel!\n");
            return EXIT_FAILURE;
        }

        free(block_source);
    }
    
    // Do the reduction for each level  
    // this is one pass over it to establish the kernel args and such, so
    // it is negligible time
    //
    cl_mem pass_swap;
    cl_mem pass_input = output_buffer;
    cl_mem pass_output = input_buffer;

    for(i = 0; i < pass_count; i++)
    {
        size_t global = group_counts[i] * work_item_counts[i];        
        size_t local = work_item_counts[i];
        unsigned int operations = operation_counts[i];
        unsigned int entries = entry_counts[i];
        size_t shared_size = typesize * channels * local * operations;

        printf("Pass[%4d] Global[%4d] Local[%4d] Groups[%4d] WorkItems[%4d] Operations[%d] Entries[%d]\n",  i, 
            (int)global, (int)local, (int)group_counts[i], (int)work_item_counts[i], operations, entries);

        // Swap the inputs and outputs for each pass
        //
        pass_swap = pass_input;
        pass_input = pass_output;
        pass_output = pass_swap;
        
        err = CL_SUCCESS;
        err |= clSetKernelArg(kernels[i],  0, sizeof(cl_mem), &pass_output);  
        err |= clSetKernelArg(kernels[i],  1, sizeof(cl_mem), &pass_input);
        err |= clSetKernelArg(kernels[i],  2, shared_size,    NULL);
        err |= clSetKernelArg(kernels[i],  3, sizeof(int),    &entries);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to set kernel arguments!\n");
            return EXIT_FAILURE;
        }
        
        // After the first pass, use the partial sums for the next input values
        //
        if(pass_input == input_buffer)
            pass_input = partials_buffer;
            
        err = CL_SUCCESS;
        err |= clEnqueueNDRangeKernel(commands, kernels[i], 1, NULL, &global, &local, 0, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to execute kernel!\n");
            return EXIT_FAILURE;
        }
    }
    
    err = clFinish(commands);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to wait for command queue to finish! %d\n", err);
        return EXIT_FAILURE;
    }

    // Start the timing loop and execute the kernel over several iterations  
    //
    printf(SEPARATOR);
    printf("Timing %d iterations of reduction with %d elements of type %s%s...\n", 
        iterations, count, "float", 
        (channels <= 1) ? (" ") : (channels == 2) ? "2" : "4");
    printf(SEPARATOR);

    int k;
    err = CL_SUCCESS;
    time_t t1 = clock();
    for (k = 0 ; k < iterations; k++)
    {    
        for(i = 0; i < pass_count; i++)
        {
            size_t global = group_counts[i] * work_item_counts[i];        
            size_t local = work_item_counts[i];

            err = clEnqueueNDRangeKernel(commands, kernels[i], 1, NULL, &global, &local, 0, NULL, NULL);
            if (err != CL_SUCCESS)
            {
                printf("Error: Failed to execute kernel!\n");
                return EXIT_FAILURE;
            }
        }
    }
    err = clFinish(commands);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to wait for command queue to finish! %d\n", err);
        return EXIT_FAILURE;
    }
    time_t t2 = clock();
    
    // Calculate the statistics for execution time and throughput
    //
    double t = (t2-t1)/( (double)CLOCKS_PER_SEC );
    printf("Exec Time:  %.2f ms\n", t);
    printf("Throughput: %.2f GB/sec\n", 1e-9 * buffer_size * iterations / t);
    printf(SEPARATOR);

    // Read back the results that were computed on the device
    //
    void *computed_result = malloc(typesize * channels);
    memset(computed_result, 0, typesize * channels);
    err = clEnqueueReadBuffer(commands, pass_output, CL_TRUE, 0, typesize * channels, computed_result, 0, NULL, NULL);
    if (err)
    {
        printf("Error: Failed to read back results from the device!\n");
        return EXIT_FAILURE;
    }

    // now do the speed test on standard

    float reference=0;
    t1 = clock();
    for (k=0; k<iterations; k++) {
        reference = reduce_validate_float(float_data, count);
    }
    t2 = clock();
    double tcpu = (t2-t1)/( (double)CLOCKS_PER_SEC );
    printf("CPU Exec Time:  %.2f ms\n", tcpu);
    printf("CPU Throughput: %.2f GB/sec\n", 1e-9 * buffer_size * iterations / tcpu);
    printf("GPU is faster by %.16g\n", tcpu/t);
    printf(SEPARATOR);


    float result= ( (float *)computed_result )[0];

    float ferror = fabs(reference - result)/reference;
   
    if (ferror > MIN_ERROR)
    {
        printf("Result %.16g != %.16g\n", reference, result);

        printf("Error:  Incorrect results obtained! Rel error %.16g > Max allowed = %.16g\n", ferror, MIN_ERROR);
        return EXIT_FAILURE;
    }
    else
    {
        printf("Results Validated!\n");
        printf(SEPARATOR);
    }

    // Shutdown and cleanup
    //
    for(i = 0; i < pass_count; i++)
    {
        clReleaseKernel(kernels[i]);
        clReleaseProgram(programs[i]);
    }
    
    clReleaseMemObject(input_buffer);
    clReleaseMemObject(output_buffer);
    clReleaseMemObject(partials_buffer);        
    clReleaseCommandQueue(commands);
    clReleaseContext(context);
    
    free(group_counts);
    free(work_item_counts);
    free(operation_counts);
    free(entry_counts);
    free(computed_result);
    free(kernels);
    free(float_data);
    
        
    return 0;
}