Exemple #1
0
cl_kernel get_kernel(char *kernel_name, cl_context *context, cl_device_id *device)
{
  cl_int status = CL_SUCCESS;

  const char* program_source = load_program_source(PROGRAM_SRC);
  if(program_source == NULL) {
    fprintf(stderr, "Programm can not be created. File was not found.");
    abort();
  }

  cl_program program = clCreateProgramWithSource(*context, 1,
                                                 &program_source, NULL,
                                                 &status);
  CL_CHECK_ERROR(status);

  status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
  /* Print build log */
  char buf[0x10000];
  clGetProgramBuildInfo(program,
                        *device,
                        CL_PROGRAM_BUILD_LOG,
                        0x10000,
                        buf,
                        NULL);

  if(status != CL_SUCCESS) {
    printf("\n-------BUILD LOG:\n %s \n-------\n", buf);
    fprintf(stderr, "Programm can not be build. (%s)", opencl_map_error(status));
    abort();
  }

  return clCreateKernel(program, kernel_name, &status);
}
Exemple #2
0
    void createKernel(const char* kernel, const char* path, int indice) {

        //	TheContext* tc = new TheContext();

        //	cl_context GPUContext_K = tc->getMyContext()->getContextCL();
        //	cl_device_id cdDevice_K = tc->getMyContext()->getDeviceCL();

        // Creates the program
        // Uses NVIDIA helper functions to get the code string and it's size (in bytes)
        //size_t src_size = 0;

        char full_path[256];

#ifdef _VIVID_STATIC_LIB
        sprintf(full_path, "%s", path);
#else
        sprintf(full_path, "%s", path);
#endif

        char *program_source = load_program_source(full_path);
        if (program_source == NULL) {
            printf("Error: Failed to read the OpenCL kernel: %s\n",path);
            exit(-1);
        }
        cl_int err;

        program_list[indice] = clCreateProgramWithSource(GPUContext_K, 1, (const char **) &program_source, NULL, &err);
        if (!program_list[indice]) {
            printf("Error: Failed to create compute program for device %d Kernel: (%s)!\n", indice,kernel);
            printf("************\n%s\n************\n", program_source);
        }

        // Build the program executable
        const char * options = "-cl-fast-relaxed-math";
        err = clBuildProgram(program_list[indice], 0, NULL, options, NULL, NULL);
        if (err != CL_SUCCESS) {
            size_t len;
            char buffer[10000];

            printf("Error: Failed to build program executable for device %d kernel: (%s)!\n",err,kernel);
            cl_int get_err=clGetProgramBuildInfo(program_list[indice], cdDevice_K, CL_PROGRAM_BUILD_LOG, sizeof (buffer), buffer, &len);
            printf("%d %s\n", get_err, buffer);

        }

        kernel_list[indice] = clCreateKernel(program_list[indice], kernel, &err);
        if (!kernel_list[indice] || err != CL_SUCCESS) {
            printf("Error: Failed to create compute kernel for device %d Kernel: (%s)!\n", indice, full_path);
            exit(1);
        }
    }
void	init_cl(int ker_id, char *kernel_path, t_cl *cl)
{
	cl_device_id			device;
	char 				*source;
	cl_program			program;

	clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
	if (!cl->context)	
	{
		cl->context = clCreateContext(0, 1, &device, NULL, NULL, NULL);
		cl->cmd_queue = clCreateCommandQueue(cl->context, device, 0, NULL);
	}
	source = load_program_source(kernel_path);
	program = clCreateProgramWithSource(cl->context, 1, (const char **)&source, NULL, NULL);
	free(source);
	clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	cl->kernel[ker_id] = clCreateKernel(program, "thread", NULL);
	cl->input_cl_mem[ker_id] = clCreateBuffer(cl->context, CL_MEM_READ_ONLY, cl->gws[ker_id] * sizeof(float), NULL, NULL);
	cl->output_cl_mem[ker_id] = clCreateBuffer(cl->context, CL_MEM_READ_WRITE, cl->gws[ker_id] * sizeof(float), NULL, NULL);
}
void opencl_setup(CLEnv& env)
{
     /*****************************************/
     /* Initialize OpenCL */
     /*****************************************/
     clGetPlatformIDs(1, &env.cpPlatform, NULL);
     clGetDeviceIDs(env.cpPlatform, CL_DEVICE_TYPE_GPU, 1, &env.cdDevice, NULL);

     env.context = clCreateContext(0, 1, &env.cdDevice, NULL, NULL, &env.errcode);
     //env.context = clCreateContextFromType(0,  CL_DEVICE_TYPE_GPU,NULL, NULL, &env.errcode);

     opencl_check_error(env.errcode, CL_SUCCESS, __FILE__ , __LINE__ );


     // get the list of GPU devices associated with context
     env.errcode = clGetContextInfo(env.context, CL_CONTEXT_DEVICES, 0, NULL,&env.device_size);
     env.devices = (cl_device_id *) malloc(env.device_size);

     env.errcode |= clGetContextInfo(env.context, CL_CONTEXT_DEVICES, env.device_size, env.devices, NULL);
     opencl_check_error(env.errcode, CL_SUCCESS, __FILE__ , __LINE__ );

     //Create a command-queue
     env.command_queue = clCreateCommandQueue(env.context, env.cdDevice, 0, &env.errcode);
     opencl_check_error(env.errcode, CL_SUCCESS, __FILE__ , __LINE__ );

     // Load and build OpenCL kernel
     const char * filename = "kernel.cl";
     char* kernel_source = load_program_source(filename);
     env.program = clCreateProgramWithSource(env.context, 1, (const char**)&kernel_source, NULL, &env.errcode);
     opencl_check_error(env.errcode, CL_SUCCESS, __FILE__ , __LINE__ );

     env.errcode = clBuildProgram(env.program, 0,  NULL, NULL, NULL, NULL);
     opencl_check_error(env.errcode, CL_SUCCESS, __FILE__ , __LINE__ );

     env.kernel = clCreateKernel(env.program, "matrix_mul", &env.errcode);
     opencl_check_error(env.errcode, CL_SUCCESS, __FILE__ , __LINE__ );

     free(kernel_source);
}
Exemple #5
0
cl_int
load_kernel(cl_context context, cl_device_id *devices, unsigned int devc, cl_program *prog, cl_kernel *kern)
{
    cl_int err;
    char* source = load_program_source("ConwayKernel.cl");
    
    *prog = clCreateProgramWithSource(context, 1, (const char**)&source,
                                      NULL, &err);
    if (*prog == NULL)
    {
        printf("clCreateProgramWithSource failed! (Error: %d)\n", err);
        return err;
    }
    
    err = clBuildProgram(*prog, devc, devices, NULL, NULL, NULL);
	if (err)
	{
		printf("clBuildProgram failed! (Error: %d)\n", err);
		size_t length;
		char buffer[2048];
		clGetProgramBuildInfo(*prog, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &length);
		printf("Build log: %s\n", buffer);
		return err;
	}
    
    *kern = clCreateKernel(*prog, "evaluate_bit", &err);
    if (*kern == NULL)
    {
        printf("clCreateKernel failed! (Error: %d)\n", err);
        return err;
    }
    
    printf("Kernel build completed successfully\n");
    
    return 0;
}
Exemple #6
0
int main(int argc, char *argv[])
{
    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernel;
    cl_mem buff_A, buff_B, buff_C;

    int mult = 1;
    uint32_t uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
    uiWA = WA * mult;
    uiHA = HA * mult;
    uiWB = WB * mult;
    uiHB = HB * mult;
    uiWC = WC * mult;
    uiHC = HC * mult;

    printf("sizes WA %u HA %u WB %u HB %u WC %u HC %u\n",
            uiWA, uiHA, uiWB, uiHB, uiWC, uiHC); 

    uint32_t size_A = uiWA * uiHA;
    uint32_t size_B = uiWB * uiHB;
    uint32_t size_C = uiWC * uiHC;

    size_t mem_size_A = size_A * sizeof(float); 
    size_t mem_size_B = size_B * sizeof(float); 
    size_t mem_size_C = size_C * sizeof(float); 

    float *data_A = (float *)malloc(mem_size_A);
    float *data_B = (float *)malloc(mem_size_B);
    float *data_C = (float *)malloc(mem_size_C);

    srand(2012);
    shrFillArray(data_A, size_A);
    shrFillArray(data_B, size_B);

    size_t global_work_size[2];
    size_t local_work_size[] = { BLOCK_SIZE, BLOCK_SIZE };

    global_work_size[0] = shrRoundUp(BLOCK_SIZE, uiWC);
    global_work_size[1] = shrRoundUp(BLOCK_SIZE, uiHA);

    const char *source = load_program_source("MatrixMul.cl");
    size_t source_len = strlen(source);;
    cl_uint err = 0;

    char *flags = "-cl-fast-relaxed-math";

    clGetPlatformIDs(1, &platform, NULL);
    printf("platform %p err %d\n", platform, err);

    clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, &err);
    printf("device %p err %d\n", device, err);

    context = clCreateContext(0, 1, &device, NULL, NULL, &err);
    printf("context %p err %d\n", context, err);

    queue = clCreateCommandQueue(context, device, 0, &err);
    printf("queue %p err %d\n", queue, err);

    program = clCreateProgramWithSource(context, 1, &source, &source_len, &err);
    printf("program %p err %d\n", program, err);

    err = clBuildProgram(program, 0, NULL, flags, NULL, NULL);
    printf("err %d\n", err);

    kernel = clCreateKernel(program, "matrixMul", &err);
    printf("kernel %p err %d\n", kernel, err);

    buff_A = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
        mem_size_A, data_A, NULL);
    printf("buff_A %p\n", buff_A);

    buff_B = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
        mem_size_B, data_B, NULL);
    printf("buff_B %p\n", buff_B);

    buff_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_C, NULL, NULL);
    printf("buff_C %p\n", buff_C);

    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&buff_C);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&buff_A);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&buff_B);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel, 3, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, NULL);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel, 4, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, NULL);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&uiWA);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&uiWB);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel, 7, sizeof(cl_int), (void*)&uiHA);
    printf("err %d\n", err);

    err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, 
        local_work_size, 0, NULL, NULL);
    printf("err %d\n", err);

    err = clFlush(queue);
    printf("err %d\n", err);

    err = clFinish(queue);
    printf("err %d\n", err);

    err = clEnqueueReadBuffer(queue, buff_C, CL_TRUE, 0, mem_size_C, data_C, 0,
          NULL, NULL);
    printf("err %d\n", err);

    int i;
    for (i = 0; i < size_C; i++) {
        printf("%d %f\n", i, data_C[i]);
    }

    clReleaseMemObject(buff_A);
    clReleaseMemObject(buff_B);
    clReleaseMemObject(buff_C);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);
    clReleaseProgram(program);
}
Exemple #7
0
CLKernel::CLKernel(cl_context context, cl_command_queue commands, cl_device_id device,
                   const char * filename, const char * name, const char * options)
{
   this->device = device;
   this->commands = commands;
   //this->profiling = true;
   //Profiling doesn't work on neuro
   this->profiling = false;
   this->elapsed = 0;

#ifdef PV_USE_OPENCL

   int status = CL_SUCCESS;

   // Create the compute program from the source buffer
   //
   char * source = load_program_source(filename);
   program = clCreateProgramWithSource(context, 1, (const char **) &source, NULL, &status);
   if (!program || status != CL_SUCCESS)
   {
       printf("Error: Failed to create compute program!\n");
       CLDevice::print_error_code(status);
       exit(status);
   }

   // Build the program executable
   //
   // TODO - fix include path
   status = clBuildProgram(program, 0, NULL, options, NULL, NULL);
   if (status != CL_SUCCESS)
   {
       size_t len;
       char buffer[150641]; //[12050]; //[8192];

       printf("Error: Failed to build program executable!\n");
       CLDevice::print_error_code(status);
       status = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
       if (status != CL_SUCCESS) {
          printf("CLKernel: error buffer length may be too small, is %ld, should be %ld\n", sizeof(buffer), len);
          CLDevice::print_error_code(status);
       }
       printf("%s\n", buffer);
       exit(status);
   }

   // Create the compute kernel in the program we wish to run
   //
   kernel = clCreateKernel(program, name, &status);
   if (!kernel || status != CL_SUCCESS)
   {
       fprintf(stderr, "Error: Failed to create compute kernel!\n");
       CLDevice::print_error_code(status);
       exit(status);
   }




#endif // PV_USE_OPENCL

}
Exemple #8
0
int main( int argc, char* argv[] )
{
    // Length of vectors
    unsigned int n = 10;
struct timespec start, finish;

    // Host input vectors
    int *h_a;
    int *h_b;
    // Host output vector
    int *h_c;
 double elapsed;
    // Device input buffers
    cl_mem d_a;
    cl_mem d_b;
    // Device output buffer
    cl_mem d_c;
 
    cl_platform_id cpPlatform;        // OpenCL platform
    cl_device_id device_id;           // device ID
    cl_context context;               // context
    cl_command_queue queue;           // command queue
    cl_program program;               // program
    cl_kernel kernel;                 // kernel
 
    // Size, in bytes, of each vector
    size_t bytes = n*sizeof(int);
 
    // Allocate memory for each vector on host
    h_a = (int*)malloc(bytes);
    h_b = (int*)malloc(bytes);
    h_c = (int*)malloc(bytes);
    // Initialize vectors on host
    int i;
    for( i = 0; i < n; i++ )
    {
        h_a[i] = i;
        h_b[i] = i;
    }
 
    size_t globalSize, localSize;
    cl_int err;
 int workgrp;
int wrkitm;
//wrkitm=atoi(argv[1]);
    // Number of work items in each local work group
//    localSize = wrkitm ;
//workgrp=atoi(argv[2]);
    // Number of total work items - localSize must be devisor
    globalSize = n;//ceil(n/(float)localSize)*localSize;
//cl_uint platformCount;
//cl_platform_id* platforms;
 //clGetPlatformIDs(0, NULL, &platformCount);
//	    platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
//	    clGetPlatformIDs(platformCount, platforms, NULL);
//printf("%d",platformCount);
    // Bind to platform
    err = clGetPlatformIDs(1, &cpPlatform, NULL);
    // Get ID for the device
    err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
    if (err != CL_SUCCESS)

    {

        printf("Error: Failed to create a device group!\n");
}
    // Create a context 
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if (!context)
    
        printf("Error: Failed to create a compute context!\n");
    
// Create a command queue
    queue = clCreateCommandQueue(context, device_id, 0, &err);
//loading external cl file
 const char *file="vectadd.cl";
const char *kernelSource =  load_program_source(file);
    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1,
                            (const char **) & kernelSource, NULL, &err);
    // Build the program executable
    clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
 
    // Create the compute kernel in the program we wish to run
    kernel = clCreateKernel(program, "vecAdd", &err);
 
    // Create the input and output arrays in device memory for our calculation
    d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
    d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
    d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
    // Write our data set into the input array in device memory
    err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,bytes, h_a, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,bytes, h_b, 0, NULL, NULL);
clFinish(queue);
    // Set the arguments to our compute kernel
    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
    err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
    err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
    err = clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
 clock_gettime(CLOCK_MONOTONIC, &start);
    // Execute the kernel over the entire range of the data set 
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
                                                              0, NULL, NULL);

 clock_gettime(CLOCK_MONOTONIC, &finish);
        elapsed = (finish.tv_sec - start.tv_sec);
        elapsed += (finish.tv_nsec - start.tv_nsec)/ 1000000000.0;
 
    // Wait for the command queue to get serviced before reading back results
    clFinish(queue);
    // Read the results from the device
    clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
                                bytes, h_c, 0, NULL, NULL );
 clFinish(queue);

    //Sum up vector c and print result divided by n, this should equal 1 within error
    double sum = 0;
    for(i=0; i<n; i++)
        sum += h_c[i];
printf("Work Item/threads = %d \n",wrkitm);
printf("time taken by GPU = %le\n ",elapsed);
 
    // release OpenCL resources
    clReleaseMemObject(d_a);
    clReleaseMemObject(d_b);
    clReleaseMemObject(d_c);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
 
    //release host memory
    free(h_a);
    free(h_b);
    free(h_c);
 
    return 0;
}
Exemple #9
0
int main(int argc, char **argv)
{
    int              err;
    cl_device_id     device_id;
    cl_command_queue commands;
    cl_context       context;
    cl_mem			 output_buffer;
    cl_mem           input_buffer;
    cl_mem           partials_buffer;
    size_t           typesize;
    int              pass_count = 0;
    size_t*          group_counts = 0;
    size_t*          work_item_counts = 0;
    int*             operation_counts = 0;
    int*             entry_counts = 0;
    int              use_gpu = 1;
    
    int i;
    int c;
    
    // Parse command line options
    //
    for( i = 0; i < argc && argv; i++)
    {
        if(!argv[i])
            continue;
            
        if(strstr(argv[i], "cpu"))
        {
            use_gpu = 0;        
        }
        else if(strstr(argv[i], "gpu"))
        {
            use_gpu = 1;
        }
    }

    channels=1;
    
    // Create some random input data on the host 
    //
    time_t tstart=0;
    (void) time(&tstart);
    srand48((long) tstart);
    float *float_data = (float*)malloc(count * channels * sizeof(float));
    for (i = 0; i < count * channels; i++)
    {
        float_data[i] = drand48();
    }


    //SETUP PLATFORM
    cl_uint numPlatforms;
    err = clGetPlatformIDs(0, NULL, &numPlatforms);
    if (err != CL_SUCCESS) {
        fprintf(stderr,"could not get platform\n");
        exit(EXIT_FAILURE);
    }

    cl_platform_id platform_id;
    if(numPlatforms > 0)
    {
        //we have at least one
        //cl_platform_id* platforms = new cl_platform_id[numPlatforms];
        cl_platform_id* platforms = calloc(numPlatforms, sizeof(cl_platform_id));
        err = clGetPlatformIDs(numPlatforms, platforms, NULL);
        if (err != CL_SUCCESS) {
            fprintf(stderr,"could not get platform id\n");
            exit(EXIT_FAILURE);
        }

        fprintf(stderr,"Found %d platforms\n", numPlatforms);
        platform_id = platforms[0];
        //delete[] platforms;
        free(platforms);
    }
    else
        exit(0);


    // Connect to a compute device
    //
    err = clGetDeviceIDs(platform_id, use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to locate a compute device!\n");
        return EXIT_FAILURE;
    }

    size_t returned_size = 0;
    size_t max_workgroup_size = 0;
    err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_workgroup_size, &returned_size);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve device info!\n");
        return EXIT_FAILURE;
    }

    cl_char vendor_name[1024] = {0};
    cl_char device_name[1024] = {0};
    err = clGetDeviceInfo(device_id, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &returned_size);
    err|= clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_name), device_name, &returned_size);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve device info!\n");
        return EXIT_FAILURE;
    }

    printf(SEPARATOR);
    printf("Connecting to %s %s...\n", vendor_name, device_name);

    // Load the compute program from disk into a cstring buffer
    //
    typesize = (sizeof(float));    
    const char* filename = 0;
    filename = "apple-reduce-kernel-float.cl";

    printf(SEPARATOR);
    printf("Loading program '%s'...\n", filename);
    printf(SEPARATOR);

    char *source = load_program_source(filename);
    if(!source)
    {
        printf("Error: Failed to load compute program from file!\n");
        return EXIT_FAILURE;    
    }
    
    // Create a compute context 
    //
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        return EXIT_FAILURE;
    }

    // Create a command queue
    //
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        return EXIT_FAILURE;
    }

    // Create the input buffer on the device
    //
    size_t buffer_size = typesize * count * channels;
    input_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);
    if (!input_buffer)
    {
        printf("Error: Failed to allocate input buffer on device!\n");
        return EXIT_FAILURE;
    }

    // Fill the input buffer with the host allocated random data
    //
    void *input_data = (void*)float_data;
    err = clEnqueueWriteBuffer(commands, input_buffer, CL_TRUE, 0, buffer_size, input_data, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write to source array!\n");
        return EXIT_FAILURE;
    }

    // Create an intermediate data buffer for intra-level results
    //
    partials_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);
    if (!partials_buffer)
    {
        printf("Error: Failed to allocate partial sum buffer on device!\n");
        return EXIT_FAILURE;
    }

    // Create the output buffer on the device
    //
    output_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);
    if (!output_buffer)
    {
        printf("Error: Failed to allocate result buffer on device!\n");
        return EXIT_FAILURE;
    }

    // Determine the reduction pass configuration for each level in the pyramid
    //
    create_reduction_pass_counts(
        count, max_workgroup_size, 
        MAX_GROUPS, MAX_WORK_ITEMS, 
        &pass_count, &group_counts, 
        &work_item_counts, &operation_counts,
        &entry_counts);

    // Create specialized programs and kernels for each level of the reduction
    //
    cl_program *programs = (cl_program*)malloc(pass_count * sizeof(cl_program));
    memset(programs, 0, pass_count * sizeof(cl_program));

    cl_kernel *kernels = (cl_kernel*)malloc(pass_count * sizeof(cl_kernel));
    memset(kernels, 0, pass_count * sizeof(cl_kernel));

    for(i = 0; i < pass_count; i++)
    {
        char *block_source = malloc(strlen(source) + 1024);
        size_t source_length = strlen(source) + 1024;
        memset(block_source, 0, source_length);
        
        // Insert macro definitions to specialize the kernel to a particular group size
        //
        const char group_size_macro[] = "#define GROUP_SIZE";
        const char operations_macro[] = "#define OPERATIONS";
        sprintf(block_source, "%s (%d) \n%s (%d)\n\n%s\n", 
            group_size_macro, (int)group_counts[i], 
            operations_macro, (int)operation_counts[i], 
            source);
        
        // Create the compute program from the source buffer
        //
        programs[i] = clCreateProgramWithSource(context, 1, (const char **) & block_source, NULL, &err);
        if (!programs[i] || err != CL_SUCCESS)
        {
            printf("%s\n", block_source);
            printf("Error: Failed to create compute program!\n");
            return EXIT_FAILURE;
        }
    
        // Build the program executable
        //
        err = clBuildProgram(programs[i], 0, NULL, NULL, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            size_t length;
            char build_log[2048];
            printf("%s\n", block_source);
            printf("Error: Failed to build program executable!\n");
            clGetProgramBuildInfo(programs[i], device_id, CL_PROGRAM_BUILD_LOG, sizeof(build_log), build_log, &length);
            printf("%s\n", build_log);
            return EXIT_FAILURE;
        }
    
        // Create the compute kernel from within the program
        //
        kernels[i] = clCreateKernel(programs[i], "reduce", &err);
        if (!kernels[i] || err != CL_SUCCESS)
        {
            printf("Error: Failed to create compute kernel!\n");
            return EXIT_FAILURE;
        }

        free(block_source);
    }
    
    // Do the reduction for each level  
    // this is one pass over it to establish the kernel args and such, so
    // it is negligible time
    //
    cl_mem pass_swap;
    cl_mem pass_input = output_buffer;
    cl_mem pass_output = input_buffer;

    for(i = 0; i < pass_count; i++)
    {
        size_t global = group_counts[i] * work_item_counts[i];        
        size_t local = work_item_counts[i];
        unsigned int operations = operation_counts[i];
        unsigned int entries = entry_counts[i];
        size_t shared_size = typesize * channels * local * operations;

        printf("Pass[%4d] Global[%4d] Local[%4d] Groups[%4d] WorkItems[%4d] Operations[%d] Entries[%d]\n",  i, 
            (int)global, (int)local, (int)group_counts[i], (int)work_item_counts[i], operations, entries);

        // Swap the inputs and outputs for each pass
        //
        pass_swap = pass_input;
        pass_input = pass_output;
        pass_output = pass_swap;
        
        err = CL_SUCCESS;
        err |= clSetKernelArg(kernels[i],  0, sizeof(cl_mem), &pass_output);  
        err |= clSetKernelArg(kernels[i],  1, sizeof(cl_mem), &pass_input);
        err |= clSetKernelArg(kernels[i],  2, shared_size,    NULL);
        err |= clSetKernelArg(kernels[i],  3, sizeof(int),    &entries);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to set kernel arguments!\n");
            return EXIT_FAILURE;
        }
        
        // After the first pass, use the partial sums for the next input values
        //
        if(pass_input == input_buffer)
            pass_input = partials_buffer;
            
        err = CL_SUCCESS;
        err |= clEnqueueNDRangeKernel(commands, kernels[i], 1, NULL, &global, &local, 0, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to execute kernel!\n");
            return EXIT_FAILURE;
        }
    }
    
    err = clFinish(commands);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to wait for command queue to finish! %d\n", err);
        return EXIT_FAILURE;
    }

    // Start the timing loop and execute the kernel over several iterations  
    //
    printf(SEPARATOR);
    printf("Timing %d iterations of reduction with %d elements of type %s%s...\n", 
        iterations, count, "float", 
        (channels <= 1) ? (" ") : (channels == 2) ? "2" : "4");
    printf(SEPARATOR);

    int k;
    err = CL_SUCCESS;
    time_t t1 = clock();
    for (k = 0 ; k < iterations; k++)
    {    
        for(i = 0; i < pass_count; i++)
        {
            size_t global = group_counts[i] * work_item_counts[i];        
            size_t local = work_item_counts[i];

            err = clEnqueueNDRangeKernel(commands, kernels[i], 1, NULL, &global, &local, 0, NULL, NULL);
            if (err != CL_SUCCESS)
            {
                printf("Error: Failed to execute kernel!\n");
                return EXIT_FAILURE;
            }
        }
    }
    err = clFinish(commands);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to wait for command queue to finish! %d\n", err);
        return EXIT_FAILURE;
    }
    time_t t2 = clock();
    
    // Calculate the statistics for execution time and throughput
    //
    double t = (t2-t1)/( (double)CLOCKS_PER_SEC );
    printf("Exec Time:  %.2f ms\n", t);
    printf("Throughput: %.2f GB/sec\n", 1e-9 * buffer_size * iterations / t);
    printf(SEPARATOR);

    // Read back the results that were computed on the device
    //
    void *computed_result = malloc(typesize * channels);
    memset(computed_result, 0, typesize * channels);
    err = clEnqueueReadBuffer(commands, pass_output, CL_TRUE, 0, typesize * channels, computed_result, 0, NULL, NULL);
    if (err)
    {
        printf("Error: Failed to read back results from the device!\n");
        return EXIT_FAILURE;
    }

    // now do the speed test on standard

    float reference=0;
    t1 = clock();
    for (k=0; k<iterations; k++) {
        reference = reduce_validate_float(float_data, count);
    }
    t2 = clock();
    double tcpu = (t2-t1)/( (double)CLOCKS_PER_SEC );
    printf("CPU Exec Time:  %.2f ms\n", tcpu);
    printf("CPU Throughput: %.2f GB/sec\n", 1e-9 * buffer_size * iterations / tcpu);
    printf("GPU is faster by %.16g\n", tcpu/t);
    printf(SEPARATOR);


    float result= ( (float *)computed_result )[0];

    float ferror = fabs(reference - result)/reference;
   
    if (ferror > MIN_ERROR)
    {
        printf("Result %.16g != %.16g\n", reference, result);

        printf("Error:  Incorrect results obtained! Rel error %.16g > Max allowed = %.16g\n", ferror, MIN_ERROR);
        return EXIT_FAILURE;
    }
    else
    {
        printf("Results Validated!\n");
        printf(SEPARATOR);
    }

    // Shutdown and cleanup
    //
    for(i = 0; i < pass_count; i++)
    {
        clReleaseKernel(kernels[i]);
        clReleaseProgram(programs[i]);
    }
    
    clReleaseMemObject(input_buffer);
    clReleaseMemObject(output_buffer);
    clReleaseMemObject(partials_buffer);        
    clReleaseCommandQueue(commands);
    clReleaseContext(context);
    
    free(group_counts);
    free(work_item_counts);
    free(operation_counts);
    free(entry_counts);
    free(computed_result);
    free(kernels);
    free(float_data);
    
        
    return 0;
}
Exemple #10
0
int main(int argc, char **argv){
	
	printf("Check OpenCL environtment\n");

	cl_platform_id platid;
	cl_device_id devid;
	cl_int res;
	size_t param;
	
	/* Query OpenCL, get some information about the returned device */
	clGetPlatformIDs(1u, &platid, NULL);
	clGetDeviceIDs(platid, CL_DEVICE_TYPE_ALL, 1, &devid, NULL);

	cl_char vendor_name[1024] = {0};
	cl_char device_name[1024] = {0};
	clGetDeviceInfo(devid, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, NULL);
	clGetDeviceInfo(devid, CL_DEVICE_NAME,   sizeof(device_name), device_name, NULL);
	printf("Connecting to OpenCL device:\t%s %s\n", vendor_name, device_name);
	
	clGetDeviceInfo(devid, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &param, NULL);
	printf("CL_DEVICE_MAX_COMPUTE_UNITS\t%d\n", param);
	
	clGetDeviceInfo(devid, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &param, NULL);
	printf("CL_DEVICE_MAX_WORK_GROUP_SIZE\t%u\n", param);

	clGetDeviceInfo(devid, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &param, NULL);
	printf("CL_DEVICE_LOCAL_MEM_SIZE\t%ub\n", param);

	/* Check if kernel source exists, we compile argv[1] passed kernel */
	if(argv[1] == NULL) { printf("\nUsage: %s kernel_source.cl kernel_function\n", argv[0]); exit(1); }

	char *kernel_source;
	if(load_program_source(argv[1], &kernel_source)) return 1;
	
	printf("Building from OpenCL source: \t%s\n", argv[1]);
	printf("Compile/query OpenCL_program:\t%s\n", argv[2]);
	
	/* Create context and kernel program */
	cl_context context = 	clCreateContext(0, 1, &devid, NULL, NULL, NULL);
	cl_program pro = 	clCreateProgramWithSource(context, 1, (const char **)&kernel_source, NULL, NULL);
	res = 			clBuildProgram(pro, 1, &devid, "-cl-fast-relaxed-math", NULL, NULL);

	if(res != CL_SUCCESS){
		printf("clBuildProgram failed: %d\n", res); char buf[0x10000];
		clGetProgramBuildInfo(pro, devid, CL_PROGRAM_BUILD_LOG, 0x10000, buf, NULL);
		printf("\n%s\n", buf); return(-1); }

	cl_kernel kernelobj = clCreateKernel(pro, argv[2], &res); 	check_return(res);
	
	/* Get the maximum work-group size for executing the kernel on the device */
	size_t global, local;
	res = clGetKernelWorkGroupInfo(kernelobj, devid, CL_KERNEL_WORK_GROUP_SIZE, sizeof(int), &local, NULL);		check_return(res);
	printf("CL_KERNEL_WORK_GROUP_SIZE\t%u\n", local);
	
	res = clGetKernelWorkGroupInfo(kernelobj, devid, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(cl_ulong), &param, NULL);	check_return(res);
	printf("CL_KERNEL_LOCAL_MEM_SIZE\t%ub\n", param);
	
	cl_command_queue cmd_queue = clCreateCommandQueue(context, devid, CL_QUEUE_PROFILING_ENABLE, NULL);
	if(cmd_queue == NULL) { printf("Compute device setup failed\n"); return(-1); }

	local = 4;
	int n = 2 * local;	//num_group * local workgroup size 
	global = n;
	
	int	num_groups=		global / local,
		allocated_local=	sizeof(data) * local + 
					sizeof(debug) * local;

	data *DP __attribute__ ((aligned(16)));
	DP = calloc(n, sizeof(data) *1);

	debug *dbg __attribute__ ((aligned(16)));
	dbg = calloc(n, sizeof(debug));
	
	printf("global:%d, local:%d, (should be):%d groups\n", global, local, num_groups);
	printf("structs size: %db, %db, %db\n", sizeof(data), sizeof(Elliptic_Curve), sizeof(inv256));
	printf("sets:%d, total of %db needed, allocated _local: %db\n", n, n * sizeof(cl_uint4) *5 *4, allocated_local);

	cl_mem	cl_DP, cl_EC, cl_INV, DEBUG;
	cl_DP = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, n * sizeof(data), NULL, &res);					check_return(res);				
	cl_EC = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY,  1 * sizeof(Elliptic_Curve), NULL, &res);	check_return(res);	//_constant address space
	cl_INV= clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY,  1 * sizeof(u8) * 0x80, NULL, &res);		check_return(res);
	DEBUG = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_WRITE_ONLY, n * sizeof(debug), NULL, &res);		check_return(res);
	
	Elliptic_Curve EC;
	/*	
		Curve domain parameters, (test vectors)
		-------------------------------------------------------------------------------------
		p:	c1c627e1638fdc8e24299bb041e4e23af4bb5427		is prime
		a:	c1c627e1638fdc8e24299bb041e4e23af4bb5424		divisor g = 62980
		b:	877a6d84155a1de374b72d9f9d93b36bb563b2ab		divisor g = 227169643
		Gx: 	010aff82b3ac72569ae645af3b527be133442131		divisor g = 32209245
		Gy: 	46b8ec1e6d71e5ecb549614887d57a287df573cc		divisor g = 972	
		precomputed_per_curve_constants:
		U:	c1c627e1638fdc8e24299bb041e4e23af4bb5425
		V:	3e39d81e9c702371dbd6644fbe1b1dc50b44abd9
		
		already prepared mod p to test:
		a:      07189f858e3f723890a66ec1079388ebd2ed509c
		b:      6043379beb0dade6eed1e9d6de64f4a0c50639d4
		gx:     5ef84aacf4f0ea6752f572d0741f40049f354dca
		gy:     418c695435af6b3d4d7cbb72967395016ef67239
		resulting point:
		P.x:    01718f862ebe9423bd661a65355aa1c86ba330f8		program MUST got this point !!
		P.y:    557e8ed53ffbfe2c990a121967b340f62e0e4fe2
		taken mod p:
		P.x:    41da1a8f74ff8d3f1ce20ef3e9d8865c96014fe3		
		P.y:    73ca143c9badedf2d9d3c7573307115ccfe04f13
	*/	
	u8 *t;
	t = _x_to_u8_buffer("c1c627e1638fdc8e24299bb041e4e23af4bb5427");	memcpy(EC.p, t, 20);
	t = _x_to_u8_buffer("07189f858e3f723890a66ec1079388ebd2ed509c");	memcpy(EC.a, t, 20);
	t = _x_to_u8_buffer("6043379beb0dade6eed1e9d6de64f4a0c50639d4");	memcpy(EC.b, t, 20);
	t = _x_to_u8_buffer("5ef84aacf4f0ea6752f572d0741f40049f354dca");	memcpy(EC.Gx, t, 20);
	t = _x_to_u8_buffer("418c695435af6b3d4d7cbb72967395016ef67239");	memcpy(EC.Gy, t, 20);
	
	t = _x_to_u8_buffer("c1c627e1638fdc8e24299bb041e4e23af4bb5425");	memcpy(EC.U, t, 20);
	t = _x_to_u8_buffer("3e39d81e9c702371dbd6644fbe1b1dc50b44abd9");	memcpy(EC.V, t, 20);

	/* we need to map buffer now to load some k into data */
	DP = clEnqueueMapBuffer(cmd_queue, cl_DP, CL_TRUE, CL_MAP_WRITE, 0, n * sizeof(data),  0, NULL, NULL, &res);	check_return(res);

	t = _x_to_u8_buffer("00542d46e7b3daac8aeb81e533873aabd6d74bb710");
	for(u8 i = 0; i < n; i++) memcpy(DP[i].k, t, 21);
	
	free(t);
//d	for(u8 i = 0; i < n; i++) bn_print("", DP[i].k, 21, 1);

	/* we can alter just a byte into a chosen k to verify that we'll get a different point! */
	//DP[2].k[2] = 0x09;
	
//no	res = clEnqueueWriteBuffer(cmd_queue, cl_DP,  CL_TRUE, 0, n * sizeof(data), &DP, 0, NULL, NULL);	check_return(res);

	res = clEnqueueWriteBuffer(cmd_queue, cl_EC,  CL_TRUE, 0, 1 * sizeof(Elliptic_Curve), &EC, 0, NULL, NULL);	check_return(res);
	res = clEnqueueWriteBuffer(cmd_queue, cl_INV, CL_TRUE, 0, 1 * sizeof(u8) * 0x80, &inv256, 0, NULL, NULL);	check_return(res);

	res = clSetKernelArg(kernelobj, 0, sizeof(cl_mem), &cl_DP);		/* i/o buffer */
	res|= clSetKernelArg(kernelobj, 1, sizeof(data) * local *1, NULL);	//allocate space for __local in kernel (just this!) one * localsize
	res|= clSetKernelArg(kernelobj, 2, sizeof(cl_mem), &cl_EC);
	res|= clSetKernelArg(kernelobj, 3, sizeof(cl_mem), &cl_INV);	
	res|= clSetKernelArg(kernelobj, 4, sizeof(debug) * local *1, NULL);	//allocate space for __local in kernel (just this!) one * localsize
	res|= clSetKernelArg(kernelobj, 5, sizeof(cl_mem), &DEBUG);		//this used to debug kernel output
	check_return(res);

//	printf("n:%d, total of %db needed, allocated _local: %db\n", n, n * sizeof(debug), allocated_local);	
	
	cl_event NDRangeEvent;
	cl_ulong start, end;
	
	/* Execute NDrange */	
	res = clEnqueueNDRangeKernel(cmd_queue, kernelobj, 1, NULL, &global, &local, 0, NULL, &NDRangeEvent);		check_return(res);
//	res = clEnqueueNDRangeKernel(cmd_queue, kernelobj, 1, NULL, &global, NULL, 0, NULL, &NDRangeEvent);		check_return(res);
	
	printf("Read back, Mapping buffer:\t%db\n", n * sizeof(data));

	DP = clEnqueueMapBuffer(cmd_queue, cl_DP, CL_TRUE, CL_MAP_READ, 0, n * sizeof(data),  0, NULL, NULL, &res);	check_return(res);
	dbg =clEnqueueMapBuffer(cmd_queue, DEBUG, CL_TRUE, CL_MAP_READ, 0, n * sizeof(debug), 0, NULL, NULL, &res);	check_return(res);
	
	/* using clEnqueueReadBuffer template */
//	res = clEnqueueReadBuffer(cmd_queue, ST, CL_TRUE, 0, sets * sizeof(cl_uint8), dbg, 0, NULL, NULL);			check_return(res);
		
	clFlush(cmd_queue);
	clFinish(cmd_queue);

	/* get NDRange execution time with internal ocl profiler */
	res = clGetEventProfilingInfo(NDRangeEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
	res|= clGetEventProfilingInfo(NDRangeEvent, CL_PROFILING_COMMAND_END,   sizeof(cl_ulong), &end,   NULL);
	check_return(res);
	printf("kernel execution time:\t\t%.2f ms\n", (float) ((end - start) /1000000));			//relative to NDRange call
	printf("number of computes/sec:\t%.2f\n", (float) global *1000000 /((end - start)));

	
	printf("i,\tgid\tlid0\tlsize0\tgid0/lsz0,\tgsz0,\tn_gr0,\tlid5,\toffset\n");
	for(int i = 0; i < n; i++) {		
//		if(i %local == 0) {
			printf("%d \t", i);
			//printf("%u\t%u\t%u\t%u\t| %2u, %2u, %2u, %u\n", *p, *(p +1), *(p +2), *(p +3), *(p +4), *(p +5), *(p +6), *(p +7));
			
			/* silence this doubled debug info
			printf("%u\t%u\t%u\t%u\t| %2u, %2u, %2u, %u\n", 
				dbg[i].data[0], dbg[i].data[1], dbg[i].data[2], dbg[i].data[3],
				dbg[i].data[4], dbg[i].data[5], dbg[i].data[6], dbg[i].data[7]);
			*/	
			//printf("%d %d\n", P[i].dig, P[i].c);
			bn_print("", DP[i].k, 21, 1);
			bn_print("", DP[i].rx, 20, 0); bn_print(" ", DP[i].ry, 20, 1);
			
			printf("%u(/%u) = %u*%u(/%u) +%u, offset:%u, stride:%u\n", 
				DP[i].pad[0], DP[i].pad[1], DP[i].pad[2], DP[i].pad[3],
				DP[i].pad[4], DP[i].pad[5], DP[i].pad[6], DP[i].pad[7]);
//		}
	}
	
	/* Release OpenCL stuff, free the rest */
	clReleaseMemObject(cl_DP);
	clReleaseMemObject(cl_EC);
	clReleaseMemObject(cl_INV);
	clReleaseMemObject(DEBUG);
	clReleaseKernel(kernelobj);
	clReleaseProgram(pro);
	clReleaseCommandQueue(cmd_queue);
	clReleaseContext(context);
	
	free(kernel_source);
	
	puts("Done!");
	return 0;
}
int main(int argc, char *argv[]) {

   /* Variables used to manage the OpenCL environment. */
   cl_int rc;
   size_t return_size[1];
   unsigned int column_span = 0;
   static cl_device_type device_type = CL_DEVICE_TYPE_DEFAULT;
   static cl_uint kernel_type = KERNEL_DEFAULT;
   static int gpu_wgsz = MAX_WGSZ;

   /* The external file containing the matrix data in Matrix Market format */
   static char *file_name;
   
   /* These variables deal with the source file for the kernel, and the names of the kernels contained therein. */
   char kernel_source_file[8] = "spmv.cl";
   char kernel_name_LS[21]   = "tiled_spmv_kernel_LS";
   char kernel_name_AWGC[23] = "tiled_spmv_kernel_AWGC";
   char kernel_name[32];
   
   /* Basic "size of problem" variables. */
   unsigned int nx; /* Number of elements in the X direction (length of the "input" vector. */
   unsigned int ny; /* Number of elements in the Y direction (length of the "answer" vector. */
   unsigned int non_zero; /* Number of non_zero elements in the matrix. */
   unsigned int nx_pad, nyround; /* Rounded versions of nx and ny. */
   
   /* Variables used to hold user-specified overrides and intermediate control values derived from them. */
   unsigned int *slab_startrow = NULL;
   
   unsigned int segcachesize;
   unsigned int max_slabheight; /* Maximum matrix chunksize. */
   unsigned int i, j, pdex = 0, ddex = 0;
   size_t param_value_size_ret;

   /* ================================================================================== */
   /* Read in command line arguments.                                                    */
   /* ================================================================================== */

   int opt;
   int option_index;

   struct option long_options[] = {
      {"help", no_argument, NULL, 'h'},
      {"accel", no_argument, NULL, 'a'},
      {"cpu", no_argument, NULL, 'c'},
      {"gpu", no_argument, NULL, 'g'},
      {"ls", no_argument, NULL, 'L'},   
      {"awgc", no_argument, NULL, 'A'},   
      {"verify", no_argument, NULL, 'v'},
      {"lwgsize", required_argument, NULL, 'l'},
      {"filename", required_argument, NULL, 'f'},
      {NULL, 0, NULL, 0}
   };
   char *name;

   /* ================================================================================== */
   /* Change current working directory to that of the invocation path so that spmv can   */
   /* be run from any current working directory.                                         */
   /* ================================================================================== */

   name = basename(argv[0]);
   (void)chdir(dirname(argv[0]));

   while (1) {
      opt = getopt_long(argc, argv, "hacgLAl:f:", long_options, &option_index);

      if (opt == -1) break;

      switch (opt) {

      /* -h, --help */
      case 'h': usage(); exit(EXIT_SUCCESS);

      /* -a, --accel */
      case 'a': device_type = CL_DEVICE_TYPE_ACCELERATOR; break;

      /* -c, --cpu */
      case 'c': device_type = CL_DEVICE_TYPE_CPU; break;

      /* -g, --gpu */
      case 'g': device_type = CL_DEVICE_TYPE_GPU; break;

      /* -L, --ls */
      case 'L': kernel_type = KERNEL_LS; break;

      /* -A, --awgc */
      case 'A': kernel_type = KERNEL_AWGC; break;

      /* -l, --lwgsize */
      case 'l': gpu_wgsz = atoi(optarg); break;

      /* -f, --filename */
      case 'f':
         posix_memalign((void **) &file_name, 128, 1+strlen(optarg));
         strcpy(file_name, optarg);
         break;

      case '?':
         printf("Try '%s --help' for more information.\n", name);
         exit(EXIT_FAILURE);
      }
   }

   if (optind != argc) {
      printf("%s: unrecognized option '%s'.\n", name, argv[optind]);
      printf("Try '%s --help' for more information.\n", name);
      exit(EXIT_FAILURE);
   }

   /* ================================================================================== */
   /* Start up OpenCL.                                                                   */
   /* ================================================================================== */

   cl_uint preferred_alignment = 16; // used by "MEMORY_ALLOC_CHECK" macro   
   cl_uint num_platforms;
   rc = clGetPlatformIDs(0, (cl_platform_id *) NULL, &num_platforms);
   CHECK_RESULT("clGetPlatformIDs(num_platforms)")

   platform_struct *platform;
   MEMORY_ALLOC_CHECK(platform, num_platforms * sizeof(platform_struct), "platform");

   cl_mem *buffer;
   MEMORY_ALLOC_CHECK(buffer, num_platforms * sizeof(cl_mem), "buffer");

   cl_platform_id *temp_platform_id_array;
   MEMORY_ALLOC_CHECK(temp_platform_id_array, num_platforms * sizeof(cl_platform_id), "temp_platform_id_array");
   rc = clGetPlatformIDs(num_platforms, temp_platform_id_array, (cl_uint *) NULL);
   CHECK_RESULT("clGetPlatform IDs(Platform IDs)")
   for (i=0; i<num_platforms; ++i) {
      platform[i].id = temp_platform_id_array[i];
   }
   free(temp_platform_id_array);

   printf("[START RUN]\n");
   printf("command line: "); 
   for (i=0; i<(unsigned int) argc; ++i) {
      printf("%s ", argv[i]);
   }
   printf("\n");
   //printf("num_platforms = %d\n\n", num_platforms);

   for (i=0; i<num_platforms; ++i) {
      rc = clGetPlatformInfo(platform[i].id, CL_PLATFORM_NAME, (size_t) 0, NULL, (size_t *) &param_value_size_ret);
      CHECK_RESULT("clGetPlatformInfo(size of platform name)")
      MEMORY_ALLOC_CHECK(platform[i].name, param_value_size_ret, "platform name");
      rc = clGetPlatformInfo(platform[i].id, CL_PLATFORM_NAME, param_value_size_ret, platform[i].name, (size_t *) NULL);
      CHECK_RESULT("clGetPlatformInfo(platform name)")

      rc = clGetDeviceIDs(platform[i].id, CL_DEVICE_TYPE_ALL, 0, NULL, (cl_uint *) &(platform[i].num_devices));
      CHECK_RESULT("clGetDeviceIDs(number of devices)")

      MEMORY_ALLOC_CHECK(platform[i].device, platform[i].num_devices * sizeof(device_struct), "device structure");

      cl_device_id *tmpdevices;
      MEMORY_ALLOC_CHECK(tmpdevices, platform[i].num_devices * sizeof(cl_device_id), "tmpdevices");
      rc = clGetDeviceIDs(platform[i].id, CL_DEVICE_TYPE_ALL, platform[i].num_devices, tmpdevices, NULL);
      CHECK_RESULT("clGetDeviceIDs(list of device IDs)")
      for (j=0; j<platform[i].num_devices; ++j) {
         platform[i].device[j].id = tmpdevices[j];
         rc = clGetDeviceInfo(platform[i].device[j].id, CL_DEVICE_TYPE, sizeof(cl_device_type), &platform[i].device[j].type, NULL);
         CHECK_RESULT("clGetDeviceInfo(device type)")
      }
      free(tmpdevices);
   }

   /* ================================================================================== */
   /* Choose the best device to use, if one is not explicitly called for.                */
   /* If a device is specified, ensure that device is present on this hardware.          */
   /* ================================================================================== */

   if (device_type == CL_DEVICE_TYPE_DEFAULT) {
      int accel_found = 0;
      for (i=0; i<num_platforms; ++i) {
         for (j=0; j<platform[i].num_devices; ++j) {
            if (platform[i].device[j].type == CL_DEVICE_TYPE_ACCELERATOR) {
               accel_found = 1;
               pdex = i;
               ddex = j;
            }
         }
      }
      if (!accel_found) {
         int gpu_found = 0; 
         for (i=0; i<num_platforms; ++i) {
            for (j=0; j<platform[i].num_devices; ++j) {
               if ((gpu_found == 0) && (platform[i].device[j].type == CL_DEVICE_TYPE_GPU)) {
                  gpu_found = 1;
                  pdex = i;
                  ddex = j;
               }
            }
         }
         if (!gpu_found) {
            int cpu_found = 0; 
            for (i=0; i<num_platforms; ++i) {
               for (j=0; j<platform[i].num_devices; ++j) {
                  if (platform[i].device[j].type == CL_DEVICE_TYPE_CPU) {
                     cpu_found = 1;
                     pdex = i;
                     ddex = j;
                  }
               }
            }
            if (!cpu_found) {
               fprintf(stderr, "no devices of any kind were found on this system.  Leaving...\n"); 
               fflush(stderr);
               exit(EXIT_FAILURE);
            }
         }
      }
   }
   else {
      int device_found = 0;
      for (i=0; i<num_platforms; ++i) for (j=0; j<platform[i].num_devices; ++j) {
         if (platform[i].device[j].type == device_type) {
            device_found = 1;
            pdex = i;
            ddex = j;
         }
      }
      if (device_found == 0) {
         fprintf(stderr, "no devices of the requested type were found on this system.  Leaving...\n"); 
         fflush(stderr);
         exit(EXIT_FAILURE);
      }
   }

   /* ================================================================================== */
   /* Choose the best kernel to use, if one is not explicitly called for.                */
   /* ================================================================================== */

   if (kernel_type == KERNEL_DEFAULT) {
      kernel_type = (platform[pdex].device[ddex].type == CL_DEVICE_TYPE_ACCELERATOR) ? KERNEL_AWGC : KERNEL_LS;
   }

   /* ================================================================================== */
   /* Create a context.                                                                  */
   /* ================================================================================== */

   cl_context_properties properties[3];
   properties[0] = CL_CONTEXT_PLATFORM;
   properties[1] = (const cl_context_properties) platform[pdex].id;
   properties[2] = 0;
   platform[pdex].context = clCreateContext((const cl_context_properties *) properties, 1, &(platform[pdex].device[ddex].id), NULL, NULL, &rc);
   CHECK_RESULT("clCreateContext")

   /* ================================================================================== */
   /* Build the kernel, create the Command Queue, and print kernel/device info.          */
   /* ================================================================================== */

   switch (kernel_type) {
      case KERNEL_LS:
      strcpy(kernel_name, kernel_name_LS);
      break;
      case KERNEL_AWGC: 
      strcpy(kernel_name, kernel_name_AWGC);
      break;
   }

   char *kernel_source;
   kernel_source = load_program_source(kernel_source_file);
   if (kernel_source == NULL) {
      fprintf(stderr, "Error: Failed to load compute program from file!\n");
      exit(EXIT_FAILURE);
   }

   platform[pdex].program = clCreateProgramWithSource(platform[pdex].context, 1, (const char **) &kernel_source, NULL, &rc);
   CHECK_RESULT("clCreateProgramWithSource")
   free(kernel_source);

   rc = clBuildProgram(platform[pdex].program, 1, &(platform[pdex].device[ddex].id), "", NULL, NULL);
   CHECK_RESULT("clBuildProgram")

   platform[pdex].kernel = clCreateKernel(platform[pdex].program, kernel_name, &rc);
   CHECK_RESULT("clCreateKernel")

   platform[pdex].device[ddex].ComQ = clCreateCommandQueue(platform[pdex].context, platform[pdex].device[ddex].id, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &rc);
   CHECK_RESULT("clCreateCommandQueue")

   rc = clGetDeviceInfo(platform[pdex].device[ddex].id, CL_DEVICE_NAME, (size_t) 0, NULL, (size_t *) &param_value_size_ret);
   CHECK_RESULT("clGetDeviceInfo(size of CL_DEVICE_NAME)")
   MEMORY_ALLOC_CHECK(platform[pdex].device[ddex].name, param_value_size_ret, "device name");
   rc = clGetDeviceInfo(platform[pdex].device[ddex].id, CL_DEVICE_NAME, (size_t) param_value_size_ret, platform[pdex].device[ddex].name, (size_t *) NULL);
   CHECK_RESULT("clGetDeviceInfo(CL_DEVICE_NAME)")

   printf("We'll run kernel %s on device %s\n", ((kernel_type == KERNEL_LS) ? "kernel_ls" : "kernel_awgc"), platform[pdex].device[ddex].name); 

   /* ================================================================================== */
   /* Determine device alignment, and whether "out-of-order" processing is supported.    */
   /* ================================================================================== */

   rc = clGetDeviceInfo(platform[pdex].device[ddex].id, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &preferred_alignment, NULL);
   CHECK_RESULT("clGetDeviceInfo(CL_DEVICE_MEM_BASE_ADDR_ALIGN)")
   if (preferred_alignment > 1024) preferred_alignment = 1024;
   preferred_alignment /= 8;  /* Convert from units of bits to units of bytes. */

   cl_command_queue_properties command_queue_properties;
   clGetDeviceInfo (platform[pdex].device[ddex].id, CL_DEVICE_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &command_queue_properties, NULL); 
   command_queue_properties &= CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;

   /* ================================================================================== */
   /* Determine local memory size and maximum compute units.                             */
   /* ================================================================================== */

   size_t kernel_wg_size;
   rc = clGetKernelWorkGroupInfo (platform[pdex].kernel, platform[pdex].device[ddex].id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &kernel_wg_size, return_size);
   CHECK_RESULT("clGetKernelWorkGroupInfo(CL_KERNEL_WORK_GROUP_SIZE)")

   cl_ulong total_local_mem;
   rc = clGetDeviceInfo (platform[pdex].device[ddex].id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof (cl_ulong), (void *) &total_local_mem, NULL);
   CHECK_RESULT("clGetDeviceInfo(CL_DEVICE_LOCAL_MEM_SIZE)")

   cl_ulong used_local_mem;
   rc = clGetKernelWorkGroupInfo (platform[pdex].kernel, platform[pdex].device[ddex].id, CL_KERNEL_LOCAL_MEM_SIZE, sizeof (cl_ulong), &used_local_mem, NULL);
   CHECK_RESULT("clGetKernelWorkGroupInfo(CL_KERNEL_LOCAL_MEM_SIZE)")

   cl_ulong local_mem_size;
   local_mem_size = total_local_mem - used_local_mem;

   cl_uint max_compute_units;
   clGetDeviceInfo (platform[pdex].device[ddex].id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL); 

   /* ================================================================================== */
   /* Set up parameter structure and call the function that builds the tiled matrix.     */
   /* ================================================================================== */

   matrix_gen_struct mgs;
   unsigned int nslabs_round, memsize;
   packet *seg_workspace;
   slab_header *matrix_header;
   unsigned int num_header_packets;
   unsigned int *row_index_array = NULL;
   unsigned int *x_index_array = NULL;
   float *data_array = NULL;

   mgs.matrix_header = &matrix_header;
   mgs.seg_workspace = &seg_workspace;
   mgs.num_header_packets = &num_header_packets;
   mgs.row_index_array = &row_index_array;
   mgs.x_index_array = &x_index_array;
   mgs.data_array = &data_array;
   mgs.nx_pad = &nx_pad;
   mgs.nyround = &nyround;
   mgs.slab_startrow = &slab_startrow;
   mgs.nx = &nx;
   mgs.ny = &ny;
   mgs.non_zero = &non_zero;
   mgs.file_name = (char *) file_name;
   mgs.preferred_alignment = preferred_alignment;
   mgs.max_compute_units = &max_compute_units;
   mgs.kernel_type = kernel_type;
   mgs.column_span = &column_span;
   mgs.local_mem_size = (unsigned int) local_mem_size;
   mgs.segcachesize = &segcachesize;
   mgs.max_slabheight = &max_slabheight;
   mgs.device_type = platform[pdex].device[ddex].type,
   mgs.gpu_wgsz = &gpu_wgsz,
   mgs.kernel_wg_size = kernel_wg_size;
   mgs.nslabs_round = &nslabs_round;
   mgs.memsize = &memsize;

   rc = matrix_gen(&mgs);

   /* =============================================================================================== */
   /* Compute the local and global work group sizes.                                                  */
   /* =============================================================================================== */

   unsigned int ndims;
   unsigned int team_size;

   size_t global_work_size[3];
   size_t local_work_size[3];
   if (kernel_type == KERNEL_AWGC) {
      ndims = 1;
      global_work_size[0] = nslabs_round;
      local_work_size[0] = 1;
   }
   else {
      ndims = 2;
      team_size = (platform[pdex].device[ddex].type == CL_DEVICE_TYPE_GPU) ? 16 : 1;
      global_work_size[1] = nslabs_round;
      local_work_size[1] = 1;
      global_work_size[0] = local_work_size[0] = (platform[pdex].device[ddex].type == CL_DEVICE_TYPE_GPU) ? gpu_wgsz : CPU_WGSZ;
      int max_aggregate_local_work_group_size = 0;
      int aggregate_local_work_group_size = 1;
      for (i=0; i<ndims; ++i) {
         aggregate_local_work_group_size *= local_work_size[i];
      }
      max_aggregate_local_work_group_size = aggregate_local_work_group_size;
      if (max_aggregate_local_work_group_size > (int) kernel_wg_size) {
         while (max_aggregate_local_work_group_size > (int) kernel_wg_size) {
            local_work_size[0] /= 2;
            gpu_wgsz /= 2;
            max_aggregate_local_work_group_size /= 2;
         }
         printf("coercing work group size to fit within hardware limits.  New size is %d\n", gpu_wgsz);
      }
   }

   /* =============================================================================================== */
   /* Our Tiled format is now complete, but still in "working storage".  We cannot allocate its       */
   /* buffer in OpenCL until we know how big it is, and now, we finally know how big it is.  So, we   */
   /* create the Input and Output arrays, and the final array to hold the Tiled Format of the Matrix. */
   /* =============================================================================================== */

   /* Arrays to hold input and output data, and the finished tiled matrix data. */
   float *input_array, *output_array, *output_array_verify;
   unsigned int *tilebuffer;
   
   MEMORY_ALLOC_CHECK(output_array_verify, (nyround * sizeof(float)), "output_array_verify") 
   if (output_array_verify == NULL) {
      fprintf(stderr, "insufficient memory to perform this workload.\n"); fflush(stderr);
      exit(EXIT_FAILURE);
   }

   cl_mem input_buffer;
   cl_mem matrix_buffer;
   cl_mem output_buffer;
   unsigned int input_buffer_size;
   unsigned int matrix_buffer_size;
   /* Create the input and matrix buffer memory objects. */
   input_buffer_size = (nx_pad * sizeof(float));
   input_buffer = clCreateBuffer(platform[pdex].context, CL_MEM_ALLOC_HOST_PTR, input_buffer_size, NULL, &rc);
   CHECK_RESULT("clCreateBuffer(input_buffer)")

   matrix_buffer_size = memsize;
   matrix_buffer = clCreateBuffer(platform[pdex].context, CL_MEM_ALLOC_HOST_PTR, matrix_buffer_size, NULL, &rc);
   CHECK_RESULT("clCreateBuffer(matrix_buffer)")

   cl_event events[2];

   unsigned int output_buffer_size;
   output_buffer_size = (slab_startrow[nslabs_round] - slab_startrow[0]) * sizeof(float);
   output_buffer = clCreateBuffer(platform[pdex].context, CL_MEM_ALLOC_HOST_PTR, output_buffer_size, NULL, &rc);
   CHECK_RESULT("clCreateBuffer(output_buffer)")

   /* =============================================================================================== */
   /* Map these buffers to allocate pointers into these buffers that we can use to load them.         */
   /* =============================================================================================== */

   input_array =       (float *) clEnqueueMapBuffer(platform[pdex].device[ddex].ComQ, 
                                                       input_buffer, 
                                                       CL_TRUE, 
                                                       CL_MAP_WRITE, 
                                                       0, 
                                                       (size_t) input_buffer_size, 
                                                       0, 
                                                       NULL, 
                                                       NULL, 
                                                       &rc);
   CHECK_RESULT("clEnqueueMapBuffer(input_array)")

   tilebuffer = (unsigned int *) clEnqueueMapBuffer(platform[pdex].device[ddex].ComQ, 
                                                       matrix_buffer, 
                                                       CL_TRUE, 
                                                       CL_MAP_WRITE, 
                                                       0, 
                                                       (size_t) matrix_buffer_size, 
                                                       0, 
                                                       NULL, 
                                                       NULL, 
                                                       &rc);
   CHECK_RESULT("clEnqueueMapBuffer(tilebuffer)")

   output_array =     (float *) clEnqueueMapBuffer(platform[pdex].device[ddex].ComQ, 
                                                      output_buffer, 
                                                      CL_TRUE, 
                                                      CL_MAP_WRITE, 
                                                      0, 
                                                      (size_t) output_buffer_size, 
                                                      0, 
                                                      NULL, 
                                                      NULL, 
                                                      &rc);
   CHECK_RESULT("clEnqueueMapBuffer(output_array)")

   /* =============================================================================================== */
   /* Copy the tiled matrix into the memory buffer, and then unmap it.                                */
   /* =============================================================================================== */

   memcpy(tilebuffer, seg_workspace, sizeof(packet) * (matrix_header[nslabs_round].offset));
   rc = clEnqueueUnmapMemObject(platform[pdex].device[ddex].ComQ, matrix_buffer, tilebuffer, 0, NULL, &events[0]);
   CHECK_RESULT("clEnqueueUnmapMemObject(tilebuffer)")
   clWaitForEvents(1, events);

   /* Load random data into the input array.                                         */
   /* The user can substitute initialization of real data at this point in the code. */
   for (i=0; i<nx; ++i) {
      float rval;
      rval = ((float) (rand() & 0x7fff)) * 0.001f - 15.0f;
      input_array[i] = rval;
   }

   /* Zero out the output array.                                                             */
   /* Note that this is only needed because some matrices are singular and have whole rows   */
   /* that are all zero, which is detected, and no work is done on those rows, so that they  */
   /* will never get written by the kernel, so to be safe, we zero it all out here, as well. */

   memset((void *) output_array, 0, output_buffer_size);

   /* =============================================================================================== */
   /* Unmap the input and output memory buffers, to prepare for kernel execution.                     */
   /* =============================================================================================== */

   rc = clEnqueueUnmapMemObject(platform[pdex].device[ddex].ComQ, input_buffer, input_array,   0, NULL, &events[0]);
   CHECK_RESULT("clEnqueueUnmapMemObject(input_array)")
   rc = clEnqueueUnmapMemObject(platform[pdex].device[ddex].ComQ, output_buffer, output_array, 0, NULL, &events[1]);
   CHECK_RESULT("clEnqueueUnmapMemObject(output_array)")
   clWaitForEvents(2, events);

   /* =============================================================================================== */
   /* Execution: Multiplication of the input array times the Tiled Format of the Matrix.              */
   /* =============================================================================================== */

   /* Run once to verifying correct answer, and computing a baseline number of repetitions for later performance measurements. */

   rc = clSetKernelArg(platform[pdex].kernel, 0, sizeof(cl_mem), (const void *) &input_buffer);
   CHECK_RESULT("clSetKernelArg(0)")
   rc = clSetKernelArg(platform[pdex].kernel, 1, sizeof(cl_mem), (const void *) &output_buffer);
   CHECK_RESULT("clSetKernelArg(1)")
   rc = clSetKernelArg(platform[pdex].kernel, 2, sizeof(cl_mem), (const void *) &matrix_buffer);
   CHECK_RESULT("clSetKernelArg(2)")
   rc = clSetKernelArg(platform[pdex].kernel, 3, sizeof(cl_uint), &column_span);
   CHECK_RESULT("clSetKernelArg(3)")
   rc = clSetKernelArg(platform[pdex].kernel, 4, sizeof(cl_uint), &max_slabheight);
   CHECK_RESULT("clSetKernelArg(4)")

   if (kernel_type == KERNEL_LS) {
      rc = clSetKernelArg(platform[pdex].kernel, 5, sizeof(cl_uint), &team_size);
      CHECK_RESULT("clSetKernelArg(5)")
      rc = clSetKernelArg(platform[pdex].kernel, 6, sizeof(cl_uint), &num_header_packets);
      CHECK_RESULT("clSetKernelArg(6)")
      rc = clSetKernelArg(platform[pdex].kernel, 7, (size_t) (max_slabheight * sizeof(float)), (void *) NULL);
      CHECK_RESULT("clSetKernelArg(7)")
   }
Exemple #12
0
int main(int argc, char * argv[])
{
    init_rpc(argv[1]);

    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernel;
    cl_mem d_in_pos, d_in_vel, d_out_pos, d_out_vel;

    int iterations = 100;
    int num_bodies = 1024;
    float espSqr = 500.0f;
    float delT = 0.005f;
    int exchange = 1;

    size_t buf_size = 4 * num_bodies * sizeof(float);
    float *ref_pos = (float *)malloc(buf_size);
    float *ref_vel = (float *)malloc(buf_size);

    int i, j;
    for (i = 0; i < num_bodies; i++) {
        int index = 4 * i;

        for (j = 0; j < 3; ++j) {
            ref_pos[index + j] = frandom(3, 50);
        }

        ref_pos[index + 3] = frandom(1, 1000);

        for (j = 0; j < 3; ++j) {
            ref_vel[index + j] = 0.0f;
        }
        ref_vel[3] = 0.0f;
    }

    size_t local_work_size[1];
    size_t global_work_size[1];

    local_work_size[0] = 256;
    global_work_size[0] = num_bodies;

    const char *source = load_program_source("NBody.cl");
    size_t source_len = strlen(source);;
    cl_uint err = 0;

    char *flags = "";

    clGetPlatformIDs(1, &platform, NULL);
    printf("platform %p err %d\n", platform, err);

    clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, &err);
    printf("device %p err %d\n", device, err);

    context = clCreateContext(0, 1, &device, NULL, NULL, &err);
    printf("context %p err %d\n", context, err);

    queue = clCreateCommandQueue(context, device, 0, &err);
    printf("queue %p err %d\n", queue, err);

    program = clCreateProgramWithSource(context, 1, &source, &source_len, &err);
    printf("program %p err %d\n", program, err);

    err = clBuildProgram(program, 0, NULL, flags, NULL, NULL);
    printf("err %d\n", err);

    kernel = clCreateKernel(program, "nbody_sim", NULL);
    printf("kernel %p\n", kernel);

    d_in_pos = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
        buf_size, ref_pos, &err);
    printf("d_in_pos %p err %d\n", d_in_pos, err);

    d_in_vel = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
        buf_size, ref_vel, &err);
    printf("d_in_vel %p err %d\n", d_in_vel, err);

    d_out_pos = clCreateBuffer(context, CL_MEM_READ_WRITE,
        buf_size, NULL, &err);
    printf("d_out_pos %p err %d\n", d_out_pos, err);

    d_out_vel = clCreateBuffer(context, CL_MEM_READ_WRITE,
        buf_size, NULL, &err);
    printf("d_out_vel %p err %d\n", d_out_vel, err);

    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&d_in_pos);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&d_in_vel);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel, 2, sizeof(int), (void*)&num_bodies);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel, 3, sizeof(float), (void*)&delT);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel, 4, sizeof(float), (void*)&espSqr);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel, 5, 256 * 4 * sizeof(float), NULL);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&d_out_pos);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&d_out_vel);
    printf("err %d\n", err);

    for (i = 0; i < iterations; i++) {

        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size,
            local_work_size, 0, NULL, NULL);
        printf("err %d\n", err);

        clFinish(queue);

        err = clSetKernelArg(kernel, exchange ? 6 : 0, sizeof(cl_mem), 
            (void*)&d_in_pos);
        printf("err %d\n", err);

        err = clSetKernelArg(kernel, exchange ? 7 : 1, sizeof(cl_mem), 
            (void*)&d_in_vel);
        printf("err %d\n", err);

        err = clSetKernelArg(kernel, exchange ? 0 : 6, sizeof(cl_mem), 
            (void*)&d_out_pos);
        printf("err %d\n", err);

        err = clSetKernelArg(kernel, exchange ? 1 : 7, sizeof(cl_mem), 
            (void*)&d_out_vel);
        printf("err %d\n", err);

        exchange = exchange ? 0 : 1;

    }

    err = clEnqueueReadBuffer(queue, d_out_pos, CL_TRUE, 0, buf_size, ref_pos,
        0, NULL, NULL);
    printf("err %d\n", err);

    err = clEnqueueReadBuffer(queue, d_out_vel, CL_TRUE, 0, buf_size, ref_vel,
        0, NULL, NULL);
    printf("err %d\n", err);

    for (i = 0; i < num_bodies ; i++) {
        printf("%i %f %f\n", i, ref_pos[i], ref_vel[i]);
    }

    clReleaseMemObject(d_in_pos);
    clReleaseMemObject(d_in_vel);
    clReleaseMemObject(d_out_pos);
    clReleaseMemObject(d_out_vel);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);

}
Exemple #13
0
int main(int argc, char *argv[])
{
    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernel_one, kernel_path;
    cl_mem d_mt_state, d_mt_emit, d_max_prob_old;
    cl_mem d_max_prob_new, d_path, v_prob, v_path;

    int wg_size = 256;
    int n_state = 256*16;
    int n_emit = 128;
    int n_obs = 100;

    size_t init_prob_size = sizeof(float) * n_state;
    size_t mt_state_size = sizeof(float) * n_state * n_state;
    size_t mt_emit_size = sizeof(float) * n_emit * n_state;

    float *init_prob = (float *) malloc(init_prob_size);
    float *mt_state = (float *) malloc(mt_state_size);
    float *mt_emit = (float *) malloc(mt_emit_size);
    int *obs = (int *) malloc(sizeof(int) * n_obs);
    int *viterbi_gpu = (int *) malloc(sizeof(int) * n_obs);

    srand(2012);
    initHMM(init_prob, mt_state, mt_emit, n_state, n_emit);

    int i;
    for (i = 0; i < n_obs; i++) {
        obs[i] = i % 15;
    }

    const char *source = load_program_source("Viterbi.cl");
    size_t source_len = strlen(source);;
    cl_uint err = 0;

    char *flags = "-cl-fast-relaxed-math";

    clGetPlatformIDs(1, &platform, NULL);
    printf("platform %p err %d\n", platform, err);

    clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, &err);
    printf("device %p err %d\n", device, err);

    context = clCreateContext(0, 1, &device, NULL, NULL, &err);
    printf("context %p err %d\n", context, err);

    queue = clCreateCommandQueue(context, device, 0, &err);
    printf("queue %p err %d\n", queue, err);

    program = clCreateProgramWithSource(context, 1, &source, &source_len, &err);
    printf("program %p err %d\n", program, err);

    err = clBuildProgram(program, 0, NULL, flags, NULL, NULL);
    printf("err %d\n", err);

    /*
    char tmp[102400];
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(tmp),
        tmp, NULL);

    printf("error %s\n", tmp);
    */

    kernel_one = clCreateKernel(program, "ViterbiOneStep", &err);
    printf("kernel %p err %d\n", kernel_one, err);

    kernel_path = clCreateKernel(program, "ViterbiPath", &err);
    printf("kernel %p err %d\n", kernel_path, err);

    d_mt_state = clCreateBuffer(context, CL_MEM_READ_ONLY, mt_state_size, 
        NULL, &err);
    printf("buffer %p\n", d_mt_state);

    d_mt_emit = clCreateBuffer(context, CL_MEM_READ_ONLY, mt_emit_size, 
        NULL, &err);
    printf("buffer %p\n", d_mt_emit);

    d_max_prob_new = clCreateBuffer(context, CL_MEM_READ_WRITE, 
        init_prob_size, NULL, &err);
    printf("buffer %p\n", d_max_prob_new);

    d_max_prob_old = clCreateBuffer(context, CL_MEM_READ_WRITE, 
        init_prob_size, NULL, &err);
    printf("buffer %p\n", d_max_prob_old);

    d_path = clCreateBuffer(context, CL_MEM_READ_WRITE, 
        sizeof(int)*(n_obs-1)*n_state, NULL, &err);
    printf("buffer %p\n", d_path);

    v_prob = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float),
        NULL, &err);
    printf("buffer %p\n", v_prob);

    v_path = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int)*n_obs,
        NULL, &err);
    printf("buffer %p\n", v_prob);

    err = clEnqueueWriteBuffer(queue, d_mt_state, CL_TRUE, 0, mt_state_size,
        mt_state, 0, NULL, NULL);
    printf("err %d\n", err);

    err = clEnqueueWriteBuffer(queue, d_mt_emit, CL_TRUE, 0, mt_emit_size,
        mt_emit, 0, NULL, NULL);
    printf("err %d\n", err);

    err = clEnqueueWriteBuffer(queue, d_max_prob_old, CL_TRUE, 0, init_prob_size,
        init_prob, 0, NULL, NULL);
    printf("err %d\n", err);

    // max_wg_size is 1024 for Intel Core 2 CPU
    size_t max_wg_size;
    err = clGetKernelWorkGroupInfo(kernel_one, device, 
        CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_wg_size, NULL);
    printf("max_wg_size %d\n", max_wg_size);

    size_t local_work_size[2], global_work_size[2];
    local_work_size[0] = wg_size;
    local_work_size[1] = 1;
    global_work_size[0] = local_work_size[0] * 256;
    global_work_size[1] = n_state/256;

    for (i = 1; i < n_obs; i++) {
        err = clSetKernelArg(kernel_one, 0, sizeof(cl_mem), 
            (void*)&d_max_prob_new);
        printf("err %d\n", err);

        err = clSetKernelArg(kernel_one, 1, sizeof(cl_mem), 
            (void*)&d_path);
        printf("err %d\n", err);

        err = clSetKernelArg(kernel_one, 2, sizeof(cl_mem), 
            (void*)&d_max_prob_old);
        printf("err %d\n", err);

        err = clSetKernelArg(kernel_one, 3, sizeof(cl_mem), 
            (void*)&d_mt_state);
        printf("err %d\n", err);

        err = clSetKernelArg(kernel_one, 4, sizeof(cl_mem),
            (void*)&d_mt_emit);
        printf("err %d\n", err);

        err = clSetKernelArg(kernel_one, 5, sizeof(float)*local_work_size[0],
            NULL);
        printf("err %d\n", err);

        err = clSetKernelArg(kernel_one, 6, sizeof(int)*local_work_size[0],
            NULL);
        printf("err %d\n", err);

        err = clSetKernelArg(kernel_one, 7, sizeof(int),
            (void*)&n_state);
        printf("err %d\n", err);

        err = clSetKernelArg(kernel_one, 8, sizeof(int),
            (void*)&(obs[i]));
        printf("err %d\n", err);

        err = clSetKernelArg(kernel_one, 9, sizeof(int),
            (void*)&i);
        printf("err %d\n", err);


        err = clEnqueueNDRangeKernel(queue, kernel_one, 2, NULL, 
            global_work_size, local_work_size, 0, NULL, NULL);
        printf("err %d\n", err);

        err = clEnqueueCopyBuffer(queue, d_max_prob_new, d_max_prob_old, 0, 0,
            sizeof(float)*n_state, 0, NULL, NULL);
        printf("err %d\n", err);
    }

    local_work_size[0] = 1;
    global_work_size[0] = 1;

    err = clSetKernelArg(kernel_path, 0, sizeof(cl_mem), (void*)&v_prob);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel_path, 1, sizeof(cl_mem), (void*)&v_path);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel_path, 2, sizeof(cl_mem), 
        (void*)&d_max_prob_new);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel_path, 3, sizeof(cl_mem), (void*)&d_path);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel_path, 4, sizeof(int), (void*)&n_state);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel_path, 5, sizeof(int), (void*)&n_obs);
    printf("err %d\n", err);

    err = clEnqueueNDRangeKernel(queue, kernel_path, 1, NULL, 
        global_work_size, local_work_size, 0, NULL, NULL);
    printf("err %d\n", err);

    clFinish(queue);
    printf("finish done\n");

    err = clEnqueueReadBuffer(queue, v_path, CL_TRUE, 0, sizeof(int)*n_obs, 
        viterbi_gpu, 0, NULL, NULL);
    printf("err %d\n", err);

    for (i = 0; i < n_obs; i++) {
        printf("%d %d\n", i, viterbi_gpu[i]);
    }

    clReleaseMemObject(d_mt_state);
    clReleaseMemObject(d_mt_emit);
    clReleaseMemObject(d_max_prob_old);
    clReleaseMemObject(d_max_prob_new);
    clReleaseMemObject(d_path);
    clReleaseMemObject(v_prob);
    clReleaseMemObject(v_path);
    clReleaseProgram(program);
    clReleaseKernel(kernel_one);
    clReleaseKernel(kernel_path);
    clReleaseCommandQueue(queue);
}
Exemple #14
0
int main(int argc, char *argv[])
{
    init_rpc(argv[1]);

    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernel;
    cl_mem buffer;

    size_t i;
    int scale = 8; // scale should be higher than 8
    size_t array_size = powl(2, scale) * 4;
    cl_int *input = (cl_int *) malloc(sizeof(cl_int) * array_size);
    cl_int *output = (cl_int *) malloc(sizeof(cl_int) * array_size);

    cl_int dir = 1;
    cl_int no_stages = 0;
    cl_int temp;

    generateInput(input, array_size);
    //ExecuteSortReference(input, array_size, dir);

    for (temp = array_size; temp > 2; temp >>= 1) {
        no_stages++;
    }

    size_t local_work_size[1];
    size_t global_work_size[1];

    const char *source = load_program_source("BitonicSort.cl");
    size_t source_len = strlen(source);;
    cl_uint err = 0;

    char *flags = "-cl-fast-relaxed-math";

    clGetPlatformIDs(1, &platform, NULL);
    printf("platform %p err %d\n", platform, err);

    clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, &err);
    printf("device %p err %d\n", device, err);

    context = clCreateContext(0, 1, &device, NULL, NULL, &err);
    printf("context %p err %d\n", context, err);

    queue = clCreateCommandQueue(context, device, 0, &err);
    printf("queue %p err %d\n", queue, err);

    program = clCreateProgramWithSource(context, 1, &source, &source_len, &err);
    printf("program %p err %d\n", program, err);

    err = clBuildProgram(program, 0, NULL, flags, NULL, NULL);
    printf("err %d\n", err);

    kernel = clCreateKernel(program, "BitonicSort", NULL);
    printf("kernel %p\n", kernel);

    buffer = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, 
        sizeof(cl_int) * array_size, input, &err);
    printf("buffer %p err %d\n", buffer, err);

    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&buffer);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&dir);
    printf("err %d\n", err);

    cl_int stage, pass_stage;

    for (stage = 0; stage < no_stages; stage++) {
        err = clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&stage);
        printf("err %d\n", err);

        for (pass_stage = stage; pass_stage >= 0; pass_stage--) {
            err = clSetKernelArg(kernel, 2, sizeof(cl_int), 
                (void*)&pass_stage);
            printf("err %d\n", err);

            size_t gsz = array_size/(2*4);
            global_work_size[0] = pass_stage ? gsz : gsz << 1;
            local_work_size[0] = 128;

            err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size,
                local_work_size, 0, NULL, NULL);
            printf("err %d\n", err);
        }
    }

    clFinish(queue);

    err = clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, 
        sizeof(cl_int) * array_size, output, 0, NULL, NULL);
    printf("err %d\n", err);

    for (i = 0; i < array_size; i++) {
        printf("%i %i\n", i, output[i]);
    }

    clReleaseMemObject(buffer);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);
    
}
Exemple #15
0
int main( int argc, char* argv[] )
{
    // Length of vectors
  int m = atoi(argv[4]);
	unsigned int n=(256*m);
//matrix variable
 // OpenCL device memory for matrices
   cl_mem d_A;
   cl_mem d_B;
   cl_mem d_C;

//########################Vector Add Variables
// Host input vectors
    int *h_a;
    int *h_b;
    // Host output vector
    int *h_c;
    // Device input buffers
    cl_mem d_a;
    cl_mem d_b;
    // Device output buffer
    cl_mem d_c;
//	cl_kernel *kernel; 
    cl_platform_id* cpPlatform;        // OpenCL platform
    cl_device_id device_id;           // device ID
    cl_context context;               // context
    //cl_command_queue* queue;           // command queue
    //cl_command_queue queue;           // command queue
//    cl_program *program;               // program
cl_platform_id* platforms;		// platform id,
// differnt for all the device we have in the system
cl_uint platformCount; //keeps the divice count

    // Size, in bytes, of each vector
    size_t bytes = n*sizeof(int);
 
    // Allocate memory for each vector on host
    h_a = (int*)malloc(bytes);
    h_b = (int*)malloc(bytes);
    h_c = (int*)malloc(bytes);
    // Initialize vectors on host
    int i;
    for( i = 0; i < n; i++ )
    {
        h_a[i] = i;
        h_b[i] = i;
//	printf("%d ",h_a[i]);
    }
 
    size_t globalSize, localSize; //similar to cuda
    cl_int err;//for errors
    int workgrp;
    int wrkitm;
    int num_ker;
    num_ker=atoi(argv[2]);
    wrkitm=atoi(argv[3]);// i have tried automating lots of data,
    // Number of work items in each local work group
    localSize = wrkitm ;
    // Number of total work items - localSize must be devisor
    globalSize = n;
//################################# Done vector ###################
//#############Matrix Multiplication Variables ###############
int WA,HA,WB,HB,WC,HC;
WA = n;
HA = WA;
WB = WA;
HB = WB;
WC = WA;
HC = WA;
   // set seed for rand()
   srand(2006);

   // 1. allocate host memory for matrices A and B
        //automate the size of the matrix
   unsigned int size_A = WA * HA;
   unsigned int mem_size_A = sizeof(float) * size_A;
   float* h_A = (float*) malloc(mem_size_A);

   unsigned int size_B = WB * HB;
   unsigned int mem_size_B = sizeof(float) * size_B;
   float* h_B = (float*) malloc(mem_size_B);
// 4. allocate host memory for the result C
   unsigned int size_C = WC * HC;
   unsigned int  mem_size_C = sizeof(float) * size_C;
   float* h_C = (float*) malloc(mem_size_C);
 // 2. initialize host memory
   randomInit(h_A, size_A);
   randomInit(h_B, size_B);
//######################## matrix done #######################
//mallocing for array of queues (break through)
cl_command_queue * queue = (cl_command_queue *)malloc(num_ker * sizeof(cl_command_queue));
cl_kernel *kernel=(cl_kernel *)malloc(num_ker * sizeof(cl_kernel));
cl_program *program=(cl_program *)malloc(num_ker * sizeof(cl_kernel));
//defining platform
 clGetPlatformIDs(0, NULL, &platformCount);
    cpPlatform = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
clGetPlatformIDs(platformCount, cpPlatform, NULL);//what ever is returned from last step will be used here

int choice = atoi(argv[1]);
if(choice ==1)
{
// we can have CL_DEVICE_GPU or ACCELERATOR or ALL as an option here
// we can these multiple times depending on requirements
    err = clGetDeviceIDs(cpPlatform[0],CL_DEVICE_TYPE_CPU , 1, &device_id, NULL);
    if (err != CL_SUCCESS)
    
        printf("Error: Failed to create a device group!\n");
}

else
{
    // Get ID for the device
    err = clGetDeviceIDs(cpPlatform[1], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);

    if (err != CL_SUCCESS)

    {

        printf("Error: Failed to create a device group!\n");
}
}
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
//malloc file and kernel variable
char **file=(char **)malloc(num_ker * sizeof(char *));
char **KernelSource=(char **)malloc(num_ker * sizeof(char *));

	for(i=0;i<num_ker;i++)
	{
    queue[i] = clCreateCommandQueue(context, device_id, 0, &err);
	}
	file[0]="vectadd.cl";
        KernelSource[0] =  load_program_source(file[0]);
        file[1]="matxm.cl";
        KernelSource[1] =  load_program_source(file[1]);
for(i=0;i<num_ker;i++)
{
	// Create the compute program from the source buffer
    program[i] = clCreateProgramWithSource(context, 1,
                            (const char **) & KernelSource[i], NULL, &err);
    // Build the program executable
    clBuildProgram(program[i], 0, NULL, NULL, NULL, NULL);
    // Create the compute kernel in the program we wish to run
    kernel[i] = clCreateKernel(program[i], file[i], &err);
 }
//Vector Start
    // Create the input and output arrays in device memory for our calculation
    d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
    d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
    d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
//vector finsih
//matrix start 
d_C = clCreateBuffer(context, CL_MEM_READ_WRITE,
          mem_size_A, NULL, &err);
   d_A = clCreateBuffer(context,
          CL_MEM_READ_WRITE,
          mem_size_A, h_A, &err);
   d_B = clCreateBuffer(context,
          CL_MEM_READ_WRITE,
          mem_size_B, h_B, &err);
//matrix finish
	// Write our data set into the input array in device memory
	for(i=0;i<num_ker;i++)
{
if(i=0)//for vectorADD
{
    err = clEnqueueWriteBuffer(queue[i], d_a, CL_TRUE, 0,bytes, h_a, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue[i], d_b, CL_TRUE, 0,bytes, h_b, 0, NULL, NULL);
  // Set the arguments to our compute kernel
    err = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), &d_a);
    err = clSetKernelArg(kernel[i], 1, sizeof(cl_mem), &d_b);
    err = clSetKernelArg(kernel[i], 2, sizeof(cl_mem), &d_c);
    err = clSetKernelArg(kernel[i], 3, sizeof(unsigned int), &n);
  // Get the maximum work group size for executing the kernel on the device
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve kernel work group info! %d\n", err);
        exit(1);
    }

}
else if(i=1)
{ err = clEnqueueWriteBuffer(queue[i], d_A, CL_TRUE, 0,mem_size_A, h_A, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue[i], d_B, CL_TRUE, 0,mem_size_B, h_B, 0, NULL, NULL);
 //size_t localWorkSize[2], globalWorkSize[2];

   int wA = WA;
   int wC = WC;
   err = clSetKernelArg(kernel[i], 0,
              sizeof(cl_mem), (void *)&d_C);
   err = clSetKernelArg(kernel[i], 1,
              sizeof(cl_mem), (void *)&d_A);
   err = clSetKernelArg(kernel[i], 2,
              sizeof(cl_mem), (void *)&d_B);
   err = clSetKernelArg(kernel[i], 3,
              sizeof(int), (void *)&wA);
   err = clSetKernelArg(kernel[i], 4,
              sizeof(int), (void *)&wC);

}
}
  /*  // Set the arguments to our compute kernel
    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
    err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
    err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
    err = clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
  // Get the maximum work group size for executing the kernel on the device
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve kernel work group info! %d\n", err);
        exit(1);
    }
    */
//struct timeval tim;
  //double t1,t2;

    //gettimeofday(&tim, NULL);
    //t1=tim.tv_sec+(tim.tv_usec/1000000.0);

//need to work on work size#############################
for(i=0;i<num_ker;i++)
{
err = clEnqueueNDRangeKernel(queue[i], kernel[i], 1, NULL, &globalSize, &localSize,
                                                              0, NULL, NULL);


}

//for(i=0;i<num_ker;i++)
//clFinish(queue[i]);

//gettimeofday(&tim, NULL);
  //  t2=tim.tv_sec+(tim.tv_usec/1000000.0);
//printf("GPU time %.4lf\t",(t2-t1));

for(i=0;i<num_ker;++i)
{
if(i=0)
{
clEnqueueReadBuffer(queue[i], d_c, CL_TRUE, 0,
                                bytes, h_c, 0, NULL, NULL ); 
}
else if(i=1)
{
err = clEnqueueReadBuffer(queue[i],
              d_C, CL_TRUE, 0, mem_size_C,
              h_C, 0, NULL, NULL);
   }
}  
for(i=0;i<num_ker;++i)
{
clFinish(queue[i]);
}
    // release OpenCL resources
    free(h_A);
   free(h_B);
   free(h_C);

   clReleaseMemObject(d_A);
   clReleaseMemObject(d_C);
   clReleaseMemObject(d_B);
   clReleaseMemObject(d_a);
    clReleaseMemObject(d_b);
    clReleaseMemObject(d_c);
 //   clReleaseProgram(program);
   // clReleaseKernel(kernel);
for(i=0;i<num_ker;++i)
{
    clReleaseCommandQueue(queue[i]);
    clReleaseKernel(kernel[i]);
    clReleaseProgram(program[i]);
}
    clReleaseContext(context);
 
    //release host memory
    free(h_a);
    free(h_b);
    free(h_c);
 
    return 0;
}
Exemple #16
0
int main(int argc, char *argv[])
{
    init_rpc(argv[1]);

    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernel;
    cl_mem d_input_r;
    cl_mem d_input_i;

    int length = 1024;
    size_t buf_size = length * sizeof(float);
    float *input_r, *input_i, *output_r, *output_i;

    posix_memalign((void **)&input_r, 16, buf_size);
    posix_memalign((void **)&input_i, 16, buf_size);
    posix_memalign((void **)&output_r, 16, buf_size);
    posix_memalign((void **)&output_i, 16, buf_size);

    fill_rand(input_r, length, 0, 255);
    fill_rand(input_i, length, 0, 0);
    memcpy(output_r, input_r, buf_size);
    memcpy(output_i, input_i, buf_size);

    size_t local_work_size[1];
    size_t global_work_size[1];

    local_work_size[0] = 64;
    global_work_size[0] = 64;

    const char *source = load_program_source("FFT.cl");
    size_t source_len = strlen(source);;
    cl_uint err = 0;

    char *flags = "-x clc++";

    clGetPlatformIDs(1, &platform, NULL);
    printf("platform %p err %d\n", platform, err);

    clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, &err);
    printf("device %p err %d\n", device, err);

    context = clCreateContext(0, 1, &device, NULL, NULL, &err);
    printf("context %p err %d\n", context, err);

    queue = clCreateCommandQueue(context, device, 0, &err);
    printf("queue %p err %d\n", queue, err);

    program = clCreateProgramWithSource(context, 1, &source, &source_len, &err);
    printf("program %p err %d\n", program, err);

    err = clBuildProgram(program, 0, NULL, flags, NULL, NULL);
    printf("err %d\n", err);

    kernel = clCreateKernel(program, "kfft", NULL);
    printf("kernel %p\n", kernel);

    d_input_r = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
        buf_size, input_r, &err);
    printf("d_input_r %p err %d\n", d_input_r, err);

    d_input_i = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
        buf_size, input_i, &err);
    printf("d_input_i %p err %d\n", d_input_i, err);

    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&d_input_r);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&d_input_i);
    printf("err %d\n", err);

    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size,
        local_work_size, 0, NULL, NULL);
    printf("err %d\n", err);

    clFinish(queue);

    err = clEnqueueReadBuffer(queue, d_input_r, CL_TRUE, 0, buf_size, output_r,
        0, NULL, NULL);
    printf("err %d\n", err);

    err = clEnqueueReadBuffer(queue, d_input_i, CL_TRUE, 0, buf_size, output_i,
        0, NULL, NULL);
    printf("err %d\n", err);

    int i;
    for (i = 0; i < length; i++) {
        printf("%i %f %f\n", i, output_r[i], output_i[i]);
    }

    clReleaseMemObject(d_input_r);
    clReleaseMemObject(d_input_i);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);

}
Exemple #17
0
int main(int argc, char** argv)
{
    int err;                            // error code returned from api calls
      
    float data[DATA_SIZE];              // original data set given to device
    float results[DATA_SIZE];           // results returned from device
    unsigned int correct;               // number of correct results returned

    size_t global;                      // global domain size for our calculation
    size_t local;                       // local domain size for our calculation
    
    cl_platform_id platform_id = NULL;  // compute device platform id
    cl_device_id device_id;             // compute device id 
    cl_context context;                 // compute context
    cl_command_queue commands;          // compute command queue
    cl_program program;                 // compute program
    cl_kernel kernel;                   // compute kernel
    
    cl_mem input;                       // device memory used for the input array
    cl_mem output;                      // device memory used for the output array
    cl_event event;
    
    // Fill our data set with random float values
    //
    int i = 0;
    unsigned int count = DATA_SIZE;
    for(i = 0; i < count; i++)
        data[i] = rand() / (float)RAND_MAX;

    //Connect to a platform on device
    err = clGetPlatformIDs(1, &platform_id, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to locate opencl platform!\n");
        return EXIT_FAILURE;
    }
    
    // Connect to a compute device
    //
    int gpu = 0;
    err = clGetDeviceIDs(platform_id, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to create a device group!\n");
        return EXIT_FAILURE;
    }
  
    // Create a compute context 
    //
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        return EXIT_FAILURE;
    }

    // Create a command commands
    //
    commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        return EXIT_FAILURE;
    }

    //Use function and load the kernel source from .cl files in the same folder
    //
    char *KernelSource = load_program_source("hello.cl");

    // Create the compute program from the source buffer
    //
    program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
    if (!program)
    {
        printf("Error: Failed to create compute program!\n");
        return EXIT_FAILURE;
    }

    // Build the program executable
    //
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        exit(1);
    }

    // Create the compute kernel in the program we wish to run
    //
    kernel = clCreateKernel(program, "square", &err);
    if (!kernel || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel! - %d\n",err);
        exit(1);
    }

    // Create the input and output arrays in device memory for our calculation
    //
    input = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * count, NULL, NULL);
    output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
    if (!input || !output)
    {
        printf("Error: Failed to allocate device memory!\n");
        exit(1);
    }    
    
    // Write our data set into the input array in device memory 
    //
    err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write to source array!\n");
        exit(1);
    }

    // Set the arguments to our compute kernel
    //
    err = 0;
    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel arguments! %d\n", err);
        exit(1);
    }

    // Get the maximum work group size for executing the kernel on the device
    //
    err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve kernel work group info! %d\n", err);
        exit(1);
    }

    // Execute the kernel over the entire range of our 1d input data set
    // using the maximum number of work group items for this device
    //
    global = count;
    err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, NULL, 0, NULL, &event);
    if (err)
    {
        printf("Error: Failed to execute kernel!-%d\n",err);
        return EXIT_FAILURE;
    }

    // Wait for the command commands to get serviced before reading back results
    //
    clWaitForEvents(1, &event);
    clFinish(commands);
    cl_ulong time_start, time_end;
    double total_time;
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
    total_time = time_end - time_start;
    printf("cl:main timing:opencl clEnqueueNDRangeKernel %0.3f us\n", total_time / 1000.0);

    // Read back the results from the device to verify the output
    //
    err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );  
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to read output array! %d\n", err);
        exit(1);
    }
    
    // Validate our results
    //
    correct = 0;
    for(i = 0; i < count; i++)
    {
        if(results[i] == data[i] * data[i])
            correct++;
    }
    
    // Print a brief summary detailing the results
    //
    printf("Computed '%d/%d' correct values!\n", correct, count);
    
    // Shutdown and cleanup
    //
    clReleaseMemObject(input);
    clReleaseMemObject(output);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    return 0;
}
Exemple #18
0
int main(int argc, char* argv[])
{
int num_ker=0,num_queue;
num_ker=atoi(argv[2]);
num_queue=atoi(argv[3]);

	//variables
/*#define WA 1024
#define HA 1024
#define WB 1024
#define HB WA
#define WC WB
#define HC HA
*/
struct timeval tim,ftim;
  double t1,t2,tim1,tim2;

//    gettimeofday(&tim, NULL);
  //  t1=tim.tv_sec+(tim.tv_usec/1000000.0);
    gettimeofday(&ftim, NULL);
    tim1=ftim.tv_sec+(ftim.tv_usec/1000000.0);

int m,WA,HA,WB,HB,WC,HC;
m = atoi(argv[5]);
WA=(256*m);
HA = WA;
WB = WA;
HB = WB;
WC = WA;
HC = WA;
   // set seed for rand()
   srand(2006);
 
   // 1. allocate host memory for matrices A and B
	//automate the size of the matrix
   unsigned int size_A = WA * HA;
   unsigned int mem_size_A = sizeof(int) * size_A;
   int* h_A = (int*) malloc(mem_size_A);
 
   unsigned int size_B = WB * HB;
   unsigned int mem_size_B = sizeof(int) * size_B;
   int* h_B = (int*) malloc(mem_size_B);
 
   // 2. initialize host memory
   randomInit(h_A, size_A);
   randomInit(h_B, size_B);
 
/*   // 3. print out A and B
   printf("\n\nMatrix A\n");
   for(i = 0; i < size_A; i++)
   {
      printf("%f ", h_A[i]);
      if(((i + 1) % WA) == 0)
      printf("\n");
   }
 
   printf("\n\nMatrix B\n");
   for(i = 0; i < size_B; i++)
   {
      printf("%f ", h_B[i]);
      if(((i + 1) % WB) == 0)
      printf("\n");
   }
 */
   
// 4. allocate host memory for the result C
   unsigned int size_C = WC * HC;
   unsigned int  mem_size_C = sizeof(int) * size_C;
   int* h_C = (int*) malloc(mem_size_C);
 
   // 5. Initialize OpenCL
   // OpenCL specific variables
   cl_context clGPUContext;
//   cl_command_queue* clCommandQue;
   //cl_program clProgram;
   //cl_kernel clKernel;
cl_platform_id* cpPlatform;        // OpenCL platform
cl_uint platformCount; //keeps the divice count
  
   size_t dataBytes;
   size_t kernelLength;
   cl_int errcode;
 
   // OpenCL device memory for matrices
   cl_mem d_A;
   cl_mem d_B;
   cl_mem d_C;
 
   /*****************************************/
   /* Initialize OpenCL */
   /*****************************************/
//cl_platform_id* cpPlatform;        // OpenCL platform
    //cl_device_id device_id;// = (cl_device_id)malloc(sizeof(cl_device_id)); 
    // Bind to platform
// errcode = clGetPlatformIDs(1, &cpPlatform, NULL);
clGetPlatformIDs(0, NULL, &platformCount);
    cpPlatform = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
clGetPlatformIDs(platformCount, cpPlatform, NULL);//what ever is returned from last step will be used here

cl_device_id device_id;
int choice =atoi(argv[1]);
if(choice ==1)
{
 // Length of vectors
    // n = 64;

    // Connect to a compute device 
// we can have CL_DEVICE_GPU or ACCELERATOR or ALL as an option here
//depending what device are we working on
// we can these multiple times depending on requirements
    errcode = clGetDeviceIDs(cpPlatform[0],CL_DEVICE_TYPE_CPU , 1, &device_id, NULL);
    if (errcode != CL_SUCCESS)

        printf("Error: Failed to create a device group!\n");
}
else
{
 //   errcode = clGetPlatformIDs(1, &cpPlatform, NULL);
    // Get ID for the device
    errcode = clGetDeviceIDs(cpPlatform[1], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
    if (errcode != CL_SUCCESS)

    {

        printf("Error: Failed to create a device group!\n");
}
}
//printf("here");
    // Create a context 
   clGPUContext = clCreateContext(0, 1, &device_id, NULL, NULL, &errcode);
    // Create a command queue
//printf("here");
   /*clGPUContext = clCreateContextFromType(NULL, 
                   CL_DEVICE_TYPE_GPU, 
                   NULL, NULL, &errcode);
   //shrCheckError(errcode, CL_SUCCESS);
 
   // get the list of GPU devices associated 
   // with context
   errcode = clGetContextInfo(clGPUContext, 
              CL_CONTEXT_DEVICES, 0, NULL, 
              &dataBytes);
   cl_device_id *clDevices = (cl_device_id *)
              malloc(dataBytes);
   errcode = clGetContextInfo(clGPUContext, 
              CL_CONTEXT_DEVICES, dataBytes, 
              clDevices, NULL);
   //shrCheckError(errcode, CL_SUCCESS);
 */
//malloc for command queue, kernel and program
cl_kernel *clKernel=(cl_kernel *)malloc(num_ker * sizeof(cl_kernel));
cl_program *clProgram=(cl_program *)malloc(num_ker * sizeof(cl_kernel));

cl_command_queue * clCommandQue = (cl_command_queue *)malloc(num_ker * sizeof(cl_command_queue));
   //Create a command-queue
for(i=0;i<num_queue;i++)
{
   clCommandQue[i] = clCreateCommandQueue(clGPUContext, 
                  device_id, 0, &errcode);
 }  //shrCheckError(errcode, CL_SUCCESS);
  
  /* // Setup device memory
   d_C = clCreateBuffer(clGPUContext, 
          CL_MEM_READ_WRITE, 
          mem_size_A, NULL, &errcode);
   d_A = clCreateBuffer(clGPUContext, 
printf("\nhere"); 
          CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 
          mem_size_A, h_A, &errcode);
   d_B = clCreateBuffer(clGPUContext, 
          CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 
          mem_size_B, h_B, &errcode);
 */
 	char *file="matxm.cl";
	char *KernelSource =  load_program_source(file);
 for(i=0;i<num_ker;i++)
{
   clProgram[i] = clCreateProgramWithSource(clGPUContext, 
                1, (const char **) & KernelSource, 
                NULL, &errcode);
   //shrCheckError(errcode, CL_SUCCESS);
 
   errcode = clBuildProgram(clProgram[i], 0, 
              NULL, NULL, NULL, NULL);
   //shrCheckError(errcode, CL_SUCCESS);
 
   clKernel[i] = clCreateKernel(clProgram[i], 
               "matrixMul", &errcode);
} 
  //shrCheckError(errcode, CL_SUCCESS);
  // Setup device memory
   d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE,
          mem_size_A, NULL, &errcode);
   d_A = clCreateBuffer(clGPUContext,
          CL_MEM_READ_WRITE,
          mem_size_A, h_A, &errcode);
   d_B = clCreateBuffer(clGPUContext,
          CL_MEM_READ_WRITE,
          mem_size_B, h_B, &errcode);

     // Write our data set into the input array in device memory

for(i=0;i<num_queue;i++){
    errcode = clEnqueueWriteBuffer(clCommandQue[i], d_A, CL_TRUE, 0,mem_size_A, h_A, 0, NULL, NULL);
errcode = clEnqueueWriteBuffer(clCommandQue[i], d_B, CL_TRUE, 0,mem_size_B, h_B, 0, NULL, NULL);
}
    
   // 7. Launch OpenCL kernel
   size_t localWorkSize[2], globalWorkSize[2];
 
   int wA = WA;
   int wC = WC;
for(i=0;i<num_ker;i++)
{
   errcode = clSetKernelArg(clKernel[i], 0, 
              sizeof(cl_mem), (void *)&d_C);
   errcode = clSetKernelArg(clKernel[i], 1, 
              sizeof(cl_mem), (void *)&d_A);
   errcode = clSetKernelArg(clKernel[i], 2, 
              sizeof(cl_mem), (void *)&d_B);
   errcode = clSetKernelArg(clKernel[i], 3, 
              sizeof(int), (void *)&wA);
   errcode = clSetKernelArg(clKernel[i], 4, 
              sizeof(int), (void *)&wC);
}
//   shrCheckError(errcode, CL_SUCCESS);
//struct timespec start, finish;
 
 int value;
value =atoi(argv[4]);
   localWorkSize[0] = value ;
   localWorkSize[1] = value ;
   globalWorkSize[0] = HA;
   globalWorkSize[1] = HA;
//clFinish(clCommandQue);

//timer starting
// clock_gettime(CLOCK_MONOTONIC, &start);
//struct timeval tim;
  //double t1,t2;

//    gettimeofday(&tim, NULL);
  //  t1=tim.tv_sec+(tim.tv_usec/1000000.0);
    gettimeofday(&tim, NULL);
    t1=tim.tv_sec+(tim.tv_usec/1000000.0);
//multikernels inside queues
int j=0;
for(j=0;j<num_queue;j++)
{
for(i=0;i<num_ker;i++){
   errcode = clEnqueueNDRangeKernel(clCommandQue[j], 
              clKernel[i], 2, NULL, globalWorkSize, 
              localWorkSize, 0, NULL, NULL);
}
}
for(i=0;i<num_queue;i++)
{
 clFinish(clCommandQue[i]);
}

gettimeofday(&tim, NULL);
    t2=tim.tv_sec+(tim.tv_usec/1000000.0);
printf("%.6lf\t",(t2-t1));
 
 // shrCheckError(errcode, CL_SUCCESS);
/*  clock_gettime(CLOCK_MONOTONIC, &finish);
        elapsed = (finish.tv_sec - start.tv_sec);
        elapsed += (finish.tv_nsec - start.tv_nsec)/ 1000000000.0;

printf("Work Item/threads = %d \n",value);
printf("time taken by GPU = %le\n ",elapsed);
*/
   // 8. Retrieve result from device

for(i=0;i<num_queue;i++)
{
   errcode = clEnqueueReadBuffer(clCommandQue[i], 
              d_C, CL_TRUE, 0, mem_size_C, 
              h_C, 0, NULL, NULL);
   //shrCheckError(errcode, CL_SUCCESS);
}
for(i=0;i<num_queue;i++)
{
 clFinish(clCommandQue[i]);
}
 // shrCheckError(errcode, CL_SUCCESS);
  //clock_gettime(CLOCK_MONOTONIC, &finish);
    //    elapsed = (finish.tv_sec - start.tv_sec);
      //  elapsed += (finish.tv_nsec - start.tv_nsec)/ 1000000000.0;

//printf("Work Item/threads = %d \n",value);
//printf("time taken by GPU = %le\n ",elapsed);

   // 9. print out the results
   /*printf("\n\nMatrix C (Results)\n");
   for(i = 0; i < size_C; i++)
   {
      printf("%f ", h_C[i]);
      if(((i + 1) % WC) == 0)
      printf("\n");
   }
   printf("\n");*/
 
   // 10. clean up memory
   free(h_A);
   free(h_B);
   free(h_C);
 
   clReleaseMemObject(d_A);
   clReleaseMemObject(d_C);
   clReleaseMemObject(d_B);
 
//   free(device_id);
 free(KernelSource);
   clReleaseContext(clGPUContext);
for(i=0;i<num_ker;i++)
{
   clReleaseKernel(clKernel[i]);
   clReleaseProgram(clProgram[i]);
}
for(i=0;i<num_queue;i++){
   clReleaseCommandQueue(clCommandQue[i]);
}	
gettimeofday(&ftim, NULL);
    tim2=ftim.tv_sec+(ftim.tv_usec/1000000.0);
printf("%.6lf\t",(tim2-tim1));
printf("\n");
exit(0);
}
Exemple #19
0
int main(int argc, char* argv[])
{
	int device_gpu = 1;

	const char *source_files[1] = {
		"mtgp32-opencl.cl"};
	const char *buildOptions="-I. -Werror";
	const char *program_source[1];

	cl_int clerr;
	cl_platform_id   platform_ids[32];

	unsigned int num_platforms;

	clerr = clGetPlatformIDs(32, platform_ids, &num_platforms); 
	CLERR;

	for (unsigned int i=0; i < num_platforms; ++i)
	{
		clerr = clGetDeviceIDs (platform_ids[i], device_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
		if (CL_SUCCESS == clerr)
		{
			platform_id = platform_ids[i];
			break;
		}
		else if (CL_DEVICE_NOT_FOUND == clerr)
			continue;
		CLERR;
	}

	{
		char platform_name[1024];
		char platform_vendor[1024];
		char device_name[1024];

		clerr = clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL);
		CLERR;

		clerr = clGetPlatformInfo(platform_id, CL_PLATFORM_VENDOR, sizeof(platform_vendor), platform_vendor, NULL);
		CLERR;

		clerr = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL);
		CLERR;

		printf("Platform name: %s\nPlatform vendor: %s\nDevice name: %s\n", platform_name, platform_vendor, device_name);
	}

	context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &clerr);
	CLERR;

	commands = clCreateCommandQueue(context, device_id, 0, &clerr);
	CLERR;

	program_source[0] = load_program_source(source_files[0]);
	program = clCreateProgramWithSource(context, 1, program_source, NULL, &clerr);
	CLERR;

	clerr = clBuildProgram(program, 0, NULL, buildOptions, NULL, NULL);
	//CLERR;

	size_t log_size;
	clerr = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
	CLERR;

	char* build_log = (char*) malloc(log_size);
	clerr = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
	CLERR;

	build_log[log_size-1] = '\0';
	printf("BUILD LOG %s\n", build_log);
	free(build_log);

	free((void*)program_source[0]);

	mtgp32_uint32_kernel = clCreateKernel(program, "mtgp32_uint32_kernel", &clerr); CLERR;
	mtgp32_single_kernel = clCreateKernel(program, "mtgp32_single_kernel", &clerr); CLERR;

	// LARGE_SIZE is a multiple of 16
	int num_data = 10000000;
	int block_num;
	int num_unit;
	int r;
	cl_mem d_status;
	cl_mem d_params;
	int mb, mp;


	block_num = 96;
/*	if (argc >= 2) {
		errno = 0;
		block_num = strtol(argv[1], NULL, 10);
		if (errno) {
			printf("%s number_of_block number_of_output\n", argv[0]);
			return 1;
		}
		if (block_num < 1 || block_num > BLOCK_NUM_MAX) {
			printf("%s block_num should be between 1 and %d\n",
					argv[0], BLOCK_NUM_MAX);
			return 1;
		}
		errno = 0;
		num_data = strtol(argv[2], NULL, 10);
		if (errno) {
			printf("%s number_of_block number_of_output\n", argv[0]);
			return 1;
		}
		argc -= 2;
		argv += 2;
	} else {
		printf("%s number_of_block number_of_output\n", argv[0]);
		block_num = get_suitable_block_num(device,
				&mb,
				&mp,
				sizeof(uint32_t),
				THREAD_NUM,
				LARGE_SIZE);
		if (block_num <= 0) {
			printf("can't calculate sutable number of blocks.\n");
			return 1;
		}
		printf("the suitable number of blocks for device 0 "
				"will be multiple of %d, or multiple of %d\n", block_num,
				(mb - 1) * mp);
		return 1;
	}
*/
	num_unit = LARGE_SIZE * block_num;
	d_status = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(struct mtgp32_kernel_status_t) * block_num, NULL, &clerr); CLERR;
	d_params = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(struct mtgp32_param_t), NULL, &clerr); CLERR;
//	ccudaMalloc((void**)&d_status, sizeof(mtgp32_kernel_status_t) * block_num);
	r = num_data % num_unit;
	if (r != 0) {
		num_data = num_data + num_unit - r;
	}
	make_constant(commands, d_params, MTGPDC_PARAM_TABLE, block_num);
	make_kernel_data32(commands, d_status, MTGPDC_PARAM_TABLE, block_num);
	make_uint32_random(d_status, d_params, num_data, block_num);
	make_single_random(d_status, d_params, num_data, block_num);

	clReleaseMemObject(d_status);
	clReleaseMemObject(d_params);

	/*Close connection with devices*/
	clReleaseKernel(mtgp32_uint32_kernel);
	clReleaseKernel(mtgp32_single_kernel);
	clReleaseProgram(program);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);
}
Exemple #20
0
int main(int argc, char **argv)
{
    uint64_t         t1 = 0;
    uint64_t         t2 = 0;
    int              err;
    cl_device_id     device_id;
    cl_command_queue commands;
    cl_context       context;
    cl_mem			 output_buffer;
    cl_mem           input_buffer;
    cl_mem           partials_buffer;
    size_t           typesize;
    int              pass_count = 0;
    size_t*          group_counts = 0;
    size_t*          work_item_counts = 0;
    int*             operation_counts = 0;
    int*             entry_counts = 0;
    int              use_gpu = 1;

    int i;
    int c;

    // Parse command line options
    //
    for( i = 0; i < argc && argv; i++)
    {
        if(!argv[i])
            continue;

        if(strstr(argv[i], "cpu"))
        {
            use_gpu = 0;
        }
        else if(strstr(argv[i], "gpu"))
        {
            use_gpu = 1;
        }
        else if(strstr(argv[i], "float2"))
        {
            integer = false;
            channels = 2;
        }
        else if(strstr(argv[i], "float4"))
        {
            integer = false;
            channels = 4;
        }
        else if(strstr(argv[i], "float"))
        {
            integer = false;
            channels = 1;
        }
        else if(strstr(argv[i], "int2"))
        {
            integer = true;
            channels = 2;
        }
        else if(strstr(argv[i], "int4"))
        {
            integer = true;
            channels = 4;
        }
        else if(strstr(argv[i], "int"))
        {
            integer = true;
            channels = 1;
        }
    }

    // Create some random input data on the host
    //
    float *float_data = (float*)malloc(count * channels * sizeof(float));
    int *integer_data = (int*)malloc(count * channels * sizeof(int));
    for (i = 0; i < count * channels; i++)
    {
        float_data[i] = ((float) rand() / (float) RAND_MAX);
        integer_data[i] = (int) (255.0f * float_data[i]);
    }

    // Connect to a compute device
    //
    err = clGetDeviceIDs(NULL, use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to locate a compute device!\n");
        return EXIT_FAILURE;
    }

    size_t returned_size = 0;
    size_t max_workgroup_size = 0;
    err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_workgroup_size, &returned_size);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve device info!\n");
        return EXIT_FAILURE;
    }

    cl_char vendor_name[1024] = {0};
    cl_char device_name[1024] = {0};
    err = clGetDeviceInfo(device_id, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &returned_size);
    err|= clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_name), device_name, &returned_size);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve device info!\n");
        return EXIT_FAILURE;
    }

    printf(SEPARATOR);
    printf("Connecting to %s %s...\n", vendor_name, device_name);

    // Load the compute program from disk into a cstring buffer
    //
    typesize = integer ? (sizeof(int)) : (sizeof(float));
    const char* filename = 0;
    switch(channels)
    {
    case 4:
        filename = integer ? "reduce_int4_kernel.cl" : "reduce_float4_kernel.cl";
        break;
    case 2:
        filename = integer ? "reduce_int2_kernel.cl" : "reduce_float2_kernel.cl";
        break;
    case 1:
        filename = integer ? "reduce_int_kernel.cl" : "reduce_float_kernel.cl";
        break;
    default:
        printf("Invalid channel count specified!\n");
        return EXIT_FAILURE;
    };

    printf(SEPARATOR);
    printf("Loading program '%s'...\n", filename);
    printf(SEPARATOR);

    char *source = load_program_source(filename);
    if(!source)
    {
        printf("Error: Failed to load compute program from file!\n");
        return EXIT_FAILURE;
    }

    // Create a compute context
    //
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        return EXIT_FAILURE;
    }

    // Create a command queue
    //
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        return EXIT_FAILURE;
    }

    // Create the input buffer on the device
    //
    size_t buffer_size = typesize * count * channels;
    input_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);
    if (!input_buffer)
    {
        printf("Error: Failed to allocate input buffer on device!\n");
        return EXIT_FAILURE;
    }

    // Fill the input buffer with the host allocated random data
    //
    void *input_data = (integer) ? (void*)integer_data : (void*)float_data;
    err = clEnqueueWriteBuffer(commands, input_buffer, CL_TRUE, 0, buffer_size, input_data, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write to source array!\n");
        return EXIT_FAILURE;
    }

    // Create an intermediate data buffer for intra-level results
    //
    partials_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);
    if (!partials_buffer)
    {
        printf("Error: Failed to allocate partial sum buffer on device!\n");
        return EXIT_FAILURE;
    }

    // Create the output buffer on the device
    //
    output_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);
    if (!output_buffer)
    {
        printf("Error: Failed to allocate result buffer on device!\n");
        return EXIT_FAILURE;
    }

    // Determine the reduction pass configuration for each level in the pyramid
    //
    create_reduction_pass_counts(
        count, max_workgroup_size,
        MAX_GROUPS, MAX_WORK_ITEMS,
        &pass_count, &group_counts,
        &work_item_counts, &operation_counts,
        &entry_counts);

    // Create specialized programs and kernels for each level of the reduction
    //
    cl_program *programs = (cl_program*)malloc(pass_count * sizeof(cl_program));
    memset(programs, 0, pass_count * sizeof(cl_program));

    cl_kernel *kernels = (cl_kernel*)malloc(pass_count * sizeof(cl_kernel));
    memset(kernels, 0, pass_count * sizeof(cl_kernel));

    for(i = 0; i < pass_count; i++)
    {
        char *block_source = malloc(strlen(source) + 1024);
        size_t source_length = strlen(source) + 1024;
        memset(block_source, 0, source_length);

        // Insert macro definitions to specialize the kernel to a particular group size
        //
        const char group_size_macro[] = "#define GROUP_SIZE";
        const char operations_macro[] = "#define OPERATIONS";
        sprintf(block_source, "%s (%d) \n%s (%d)\n\n%s\n",
                group_size_macro, (int)group_counts[i],
                operations_macro, (int)operation_counts[i],
                source);

        // Create the compute program from the source buffer
        //
        programs[i] = clCreateProgramWithSource(context, 1, (const char **) & block_source, NULL, &err);
        if (!programs[i] || err != CL_SUCCESS)
        {
            printf("%s\n", block_source);
            printf("Error: Failed to create compute program!\n");
            return EXIT_FAILURE;
        }

        // Build the program executable
        //
        err = clBuildProgram(programs[i], 0, NULL, NULL, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            size_t length;
            char build_log[2048];
            printf("%s\n", block_source);
            printf("Error: Failed to build program executable!\n");
            clGetProgramBuildInfo(programs[i], device_id, CL_PROGRAM_BUILD_LOG, sizeof(build_log), build_log, &length);
            printf("%s\n", build_log);
            return EXIT_FAILURE;
        }

        // Create the compute kernel from within the program
        //
        kernels[i] = clCreateKernel(programs[i], "reduce", &err);
        if (!kernels[i] || err != CL_SUCCESS)
        {
            printf("Error: Failed to create compute kernel!\n");
            return EXIT_FAILURE;
        }

        free(block_source);
    }

    // Do the reduction for each level
    //
    cl_mem pass_swap;
    cl_mem pass_input = output_buffer;
    cl_mem pass_output = input_buffer;

    for(i = 0; i < pass_count; i++)
    {
        size_t global = group_counts[i] * work_item_counts[i];
        size_t local = work_item_counts[i];
        unsigned int operations = operation_counts[i];
        unsigned int entries = entry_counts[i];
        size_t shared_size = typesize * channels * local * operations;

        printf("Pass[%4d] Global[%4d] Local[%4d] Groups[%4d] WorkItems[%4d] Operations[%d] Entries[%d]\n",  i,
               (int)global, (int)local, (int)group_counts[i], (int)work_item_counts[i], operations, entries);

        // Swap the inputs and outputs for each pass
        //
        pass_swap = pass_input;
        pass_input = pass_output;
        pass_output = pass_swap;

        err = CL_SUCCESS;
        err |= clSetKernelArg(kernels[i],  0, sizeof(cl_mem), &pass_output);
        err |= clSetKernelArg(kernels[i],  1, sizeof(cl_mem), &pass_input);
        err |= clSetKernelArg(kernels[i],  2, shared_size,    NULL);
        err |= clSetKernelArg(kernels[i],  3, sizeof(int),    &entries);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to set kernel arguments!\n");
            return EXIT_FAILURE;
        }

        // After the first pass, use the partial sums for the next input values
        //
        if(pass_input == input_buffer)
            pass_input = partials_buffer;

        err = CL_SUCCESS;
        err |= clEnqueueNDRangeKernel(commands, kernels[i], 1, NULL, &global, &local, 0, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to execute kernel!\n");
            return EXIT_FAILURE;
        }
    }

    err = clFinish(commands);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to wait for command queue to finish! %d\n", err);
        return EXIT_FAILURE;
    }

    // Start the timing loop and execute the kernel over several iterations
    //
    printf(SEPARATOR);
    printf("Timing %d iterations of reduction with %d elements of type %s%s...\n",
           iterations, count, integer ? "int" : "float",
           (channels <= 1) ? (" ") : (channels == 2) ? "2" : "4");
    printf(SEPARATOR);

    int k;
    err = CL_SUCCESS;
    t1 = current_time();
    for (k = 0 ; k < iterations; k++)
    {
        for(i = 0; i < pass_count; i++)
        {
            size_t global = group_counts[i] * work_item_counts[i];
            size_t local = work_item_counts[i];

            err = clEnqueueNDRangeKernel(commands, kernels[i], 1, NULL, &global, &local, 0, NULL, NULL);
            if (err != CL_SUCCESS)
            {
                printf("Error: Failed to execute kernel!\n");
                return EXIT_FAILURE;
            }
        }
    }
    err = clFinish(commands);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to wait for command queue to finish! %d\n", err);
        return EXIT_FAILURE;
    }
    t2 = current_time();

    // Calculate the statistics for execution time and throughput
    //
    double t = subtract_time_in_seconds(t2, t1);
    printf("Exec Time:  %.2f ms\n", 1000.0 * t / (double)(iterations));
    printf("Throughput: %.2f GB/sec\n", 1e-9 * buffer_size * iterations / t);
    printf(SEPARATOR);

    // Read back the results that were computed on the device
    //
    void *computed_result = malloc(typesize * channels);
    memset(computed_result, 0, typesize * channels);
    err = clEnqueueReadBuffer(commands, pass_output, CL_TRUE, 0, typesize * channels, computed_result, 0, NULL, NULL);
    if (err)
    {
        printf("Error: Failed to read back results from the device!\n");
        return EXIT_FAILURE;
    }


    // Verify the results are correct
    //
    if(integer)
    {
        int reference[4] = { 0, 0, 0, 0};
        switch(channels)
        {
        case 4:
            reduce_validate_int4(integer_data, count, reference);
            break;
        case 2:
            reduce_validate_int2(integer_data, count, reference);
            break;
        case 1:
            reduce_validate_int(integer_data, count, reference);
            break;
        default:
            printf("Invalid channel count specified!\n");
            return EXIT_FAILURE;
        }

        int result[4] = { 0.0f, 0.0f, 0.0f, 0.0f};
        for(c = 0; c < channels; c++)
        {
            int v = ((int*) computed_result)[c];
            result[c] += v;
        }

        float error = 0.0f;
        float diff = 0.0f;
        for(c = 0; c < channels; c++)
        {
            diff = fabs(reference[c] - result[c]);
            error = diff > error ? diff : error;
        }

        if (error > MIN_ERROR)
        {
            for(c = 0; c < channels; c++)
                printf("Result[%d] %d != %d\n", c, reference[c], result[c]);

            printf("Error:  Incorrect results obtained! Max error = %f\n", error);
            return EXIT_FAILURE;
        }
        else
        {
            printf("Results Validated!\n");
            printf(SEPARATOR);
        }
    }
    else
    {
        float reference[4] = { 0.0f, 0.0f, 0.0f, 0.0f};
        switch(channels)
        {
        case 4:
            reduce_validate_float4(float_data, count, reference);
            break;
        case 2:
            reduce_validate_float2(float_data, count, reference);
            break;
        case 1:
            reduce_validate_float(float_data, count, reference);
            break;
        default:
            printf("Invalid channel count specified!\n");
            return EXIT_FAILURE;
        }

        float result[4] = { 0.0f, 0.0f, 0.0f, 0.0f};
        for(c = 0; c < channels; c++)
        {
            float v = ((float*) computed_result)[c];
            result[c] += v;
        }

        float error = 0.0f;
        float diff = 0.0f;
        for(c = 0; c < channels; c++)
        {
            diff = fabs(reference[c] - result[c]);
            error = diff > error ? diff : error;
        }

        if (error > MIN_ERROR)
        {
            for(c = 0; c < channels; c++)
                printf("Result[%d] %f != %f\n", c, reference[c], result[c]);

            printf("Error:  Incorrect results obtained! Max error = %f\n", error);
            return EXIT_FAILURE;
        }
        else
        {
            printf("Results Validated!\n");
            printf(SEPARATOR);
        }
    }

    // Shutdown and cleanup
    //
    for(i = 0; i < pass_count; i++)
    {
        clReleaseKernel(kernels[i]);
        clReleaseProgram(programs[i]);
    }

    clReleaseMemObject(input_buffer);
    clReleaseMemObject(output_buffer);
    clReleaseMemObject(partials_buffer);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    free(group_counts);
    free(work_item_counts);
    free(operation_counts);
    free(entry_counts);
    free(computed_result);
    free(kernels);
    free(float_data);
    free(integer_data);


    return 0;
}
Exemple #21
0
int initGPU(int n)
{
	#pragma mark Device Information
	// Find the CPU CL device, as a fallback
	err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &cpu, NULL);
	assert(err == CL_SUCCESS);

	// Find the GPU CL device, this is what we really want
	// If there is no GPU device is CL capable, fall back to CPU
	err |= clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
	if (err != CL_SUCCESS) device = cpu;
	assert(device);

	// Get some information about the returned device
	cl_char vendor_name[1024] = {0};
	cl_char device_name[1024] = {0};
	err |= clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &returned_size);
	err |= clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, &returned_size);
	assert(err == CL_SUCCESS);
	printf("Connecting to %s %s...", vendor_name, device_name);

	#pragma mark Context and Command Queue
	// Now create a context to perform our calculation with the 
	// specified device 
	context = clCreateContext(0, 1, &device, NULL, NULL, &err);
	assert(err == CL_SUCCESS);

	// And also a command queue for the context
	cmd_queue = clCreateCommandQueue(context, device, 0, NULL);

	#pragma mark Program and Kernel Creation
	// Load the program source from disk
	// The kernel/program is the project directory and in Xcode the executable
	// is set to launch from that directory hence we use a relative path
	const char * filename = "kernel.cl";
	char *program_source = load_program_source(filename);
	program[0] = clCreateProgramWithSource(context, 1, (const char**)&program_source, NULL, &err);
	assert(err == CL_SUCCESS);

	err |= clBuildProgram(program[0], 0, NULL, NULL, NULL, NULL);
	assert(err == CL_SUCCESS);

	// Now create the kernel "objects" that we want to use in the example file 
	kernel[0] = clCreateKernel(program[0], "add", &err);
	assert(err == CL_SUCCESS);

	#pragma mark Memory Allocation
	// Allocate memory on the device to hold our data and store the results into
	buffer_size = sizeof(int) * n;

	mem_c_position = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err);
	mem_c_velocity = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err);
	mem_p_angle = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err);
	mem_p_velocity = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err);
	assert(err == CL_SUCCESS);

	mem_fitness = clCreateBuffer(context, CL_MEM_WRITE_ONLY, buffer_size, NULL, &err);
	assert(err == CL_SUCCESS);

	// Get all of the stuff written and allocated
	clFinish(cmd_queue);

	printf(" done\n");

	return err; // CL_SUCCESS
}