Example #1
2
int main(int argc, char** argv)
{
  int err;                            // error code returned from api calls
  cl_platform_id platform_id;         // platform id
  cl_device_id device_id;             // compute device id 
  cl_context context;                 // compute context
  cl_command_queue commands;          // compute command queue
  cl_program program;                 // compute program
  cl_kernel kernel;                   // compute kernel

  size_t global[2];                   // global domain size for our calculation
  size_t local[2];                    // local domain size for our calculation

  char cl_platform_vendor[1001];
  char cl_platform_name[1001];
   

  cl_mem in_array;                     // device memory used for the input array
  //cl_mem synaptic_weights;             // device memory used for the input array
  cl_mem out_array;                    // device memory used for the output array
   
  if (argc != 2){
    printf("%s <inputfile>\n", argv[0]);
    return -1;
  }

	//float in_array[NO_NODES];
	//float out_array[NO_NODES];
	//float synaptic_weights[NO_NODES*NO_NODES];
	float in_array_tb[NO_NODES];
	float out_array_tb[NO_NODES];
	//float synaptic_weights_tb[NO_NODES*NO_NODES];
	float temp =0;
	int i = 0;
    	int j = 0;
	int index = 0;
	FILE* ifp;
	char* mode = "r";
  //
  // Connect to first platform
  //
  err = clGetPlatformIDs(1,&platform_id,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to find an OpenCL platform!\n");
    printf("Test failed\n");
    return -1;
  }
  err = clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n");
    printf("Test failed\n");
    return -1;
  }
  printf("CL_PLATFORM_VENDOR %s\n",cl_platform_vendor);
  err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n");
    printf("Test failed\n");
    return -1;
  }
  printf("CL_PLATFORM_NAME %s\n",cl_platform_name);
 
  // Connect to a compute device
  //
  int fpga = 0;
#if defined (FPGA_DEVICE)
  fpga = 1;
#endif
  err = clGetDeviceIDs(platform_id, fpga ? CL_DEVICE_TYPE_ACCELERATOR : CL_DEVICE_TYPE_CPU,
                       1, &device_id, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to create a device group!\n");
    printf("Test failed\n");
    return -1;
  }
  
  //
  // Create a compute context 
  //
  context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
  if (!context)
  {
    printf("Error: Failed to create a compute context!\n");
    printf("Test failed\n");
    return -1;
  }

  //relu_1(in_array,synaptic_weights,out_array);


  // Fill our data sets with pattern
  //
  //int i = 0;
  //for(i = 0; i < DATA_SIZE; i++) {
  //  a[i] = (int)i;
  //  b[i] = (int)i;
  //  results[i] = 0;
  //}
  //
  
  
  // Create a command commands
  commands = clCreateCommandQueue(context, device_id, 0, &err);
  if (!commands)
  {
    printf("Error: Failed to create a command commands!\n");
    printf("Error: code %i\n",err);
    printf("Test failed\n");
    return -1;
  }

  int status;

  // Create Program Objects
  //
  
  // Load binary from disk
  unsigned char *kernelbinary;
  char *xclbin=argv[1];
  printf("loading %s\n", xclbin);
  int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary);
  if (n_i < 0) {
    printf("failed to load kernel from xclbin: %s\n", xclbin);
    printf("Test failed\n");
    return -1;
  }
  size_t n = n_i;
  // Create the compute program from offline
  program = clCreateProgramWithBinary(context, 1, &device_id, &n,
                                      (const unsigned char **) &kernelbinary, &status, &err);
  if ((!program) || (err!=CL_SUCCESS)) {
    printf("Error: Failed to create compute program from binary %d!\n", err);
    printf("Test failed\n");
    printf("err : %d %s\n",err,err);
  }

  // Build the program executable
  //
  err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    size_t len;
    char buffer[2048];

    printf("Error: Failed to build program executable!\n");
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
    printf("%s\n", buffer);
    printf("Test failed\n");
    return -1;
  }

  // Create the compute kernel in the program we wish to run
  //
  kernel = clCreateKernel(program, "relu_1", &err);
  if (!kernel || err != CL_SUCCESS)
  {
    printf("Error: Failed to create compute kernel!\n");
    printf("Test failed\n");
    return -1;
  }

  // Create the input and output arrays in device memory for our calculation
  //
  in_array = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * NO_NODES, NULL, NULL);
  //synaptic_weights = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * NO_NODES * NO_NODES, NULL, NULL);
  out_array = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * NO_NODES, NULL, NULL);
  if (!in_array || /*!synaptic_weights ||*/ !out_array)
  {
    printf("Error: Failed to allocate device memory!\n");
    printf("Test failed\n");
    return -1;
  }    
    
	ifp = fopen("/home/agandhi92/sdaccel/relu_1/input.txt",mode);

	if(ifp == NULL)
	{
		printf("Input file not found \n");
  		return -1;
	}
	while (fscanf(ifp, "%f", &temp) != EOF && index < NO_NODES) {

		in_array_tb[index++] = temp;
	}
	index = 0;
	temp = 0;

	//ifp = fopen("/home/agandhi92/sdaccel/relu_1/weight.txt",mode);
	//if(ifp == NULL)
	//{
	//	printf("Weight file not found \n");
  	//	return -1;
	//}
	//while (fscanf(ifp, "%f", &temp) != EOF && index < (NO_NODES*NO_NODES)) {
	//	synaptic_weights_tb[index++] = temp;
	//}
   
  //
  // Write our data set into the input array in device memory 
  //
  err = clEnqueueWriteBuffer(commands, in_array, CL_TRUE, 0, sizeof(float) * NO_NODES, in_array_tb, 0, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to write to source array a!\n");
    printf("Test failed\n");
    return -1;
  }

  // Write our data set into the input array in device memory 
  //
  //err = clEnqueueWriteBuffer(commands, synaptic_weights, CL_TRUE, 0, sizeof(float) *  NO_NODES *  NO_NODES, synaptic_weights_tb, 0, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to write to source array b!\n");
    printf("Test failed\n");
    return -1;
  }
    
  // Set the arguments to our compute kernel
  //
  err = 0;
  err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &in_array);
  //err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &synaptic_weights);
  err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_array);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to set kernel arguments! %d\n", err);
    printf("Test failed\n");
    return -1;
  }

  // Execute the kernel over the entire range of our 1d input data set
  // using the maximum number of work group items for this device
  //

  err = clEnqueueTask(commands, kernel, 0, NULL, NULL);

  if (err)
  {
    printf("Error: Failed to execute kernel! %d\n", err);
    printf("Test failed\n");
    return -1;
  }

  // Read back the results from the device to verify the output
  //
  cl_event readevent;
  err = clEnqueueReadBuffer( commands, out_array, CL_TRUE, 0, sizeof(float) * NO_NODES, out_array_tb, 0, NULL, &readevent );  
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to read output array! %d\n", err);
    printf("Test failed\n");
    return -1;
  }

  clWaitForEvents(1, &readevent);
    
  //printf("A\n");
  //for (i=0;i<DATA_SIZE;i++) {
  //  printf("%x ",a[i]);
  //  if (((i+1) % 16) == 0)
  //    printf("\n");
  //}
  //printf("B\n");
  //for (i=0;i<DATA_SIZE;i++) {
  //  printf("%x ",b[i]);
  //  if (((i+1) % 16) == 0)
  //    printf("\n");
  //}
  //printf("res\n");
  //for (i=0;i<DATA_SIZE;i++) {
  //  printf("%x ",results[i]);
  //  if (((i+1) % 16) == 0)
  //    printf("\n");
  //}
    
  // Validate our results
  //
  //correct = 0;
  //for(i = 0; i < DATA_SIZE; i++)
  //{
  //  int row = i/MATRIX_RANK;
  //  int col = i%MATRIX_RANK;
  //  int running = 0;
  //  int index;
  //  for (index=0;index<MATRIX_RANK;index++) {
  //    int aIndex = row*MATRIX_RANK + index;
  //    int bIndex = col + index*MATRIX_RANK;
  //    running += a[aIndex] * b[bIndex];
  //  }
  //  sw_results[i] = running;
  //}
  //  
  //for (i = 0;i < DATA_SIZE; i++) 
  //  if(results[i] == sw_results[i])
  //    correct++;
  //printf("Software\n");
  //for (i=0;i<DATA_SIZE;i++) {
  //  //printf("%0.2f ",sw_results[i]);
  //  printf("%d ",sw_results[i]);
  //  if (((i+1) % 16) == 0)
  //    printf("\n");
  //}
  //  
  //  
  //// Print a brief summary detailing the results
  ////
  //printf("Computed '%d/%d' correct values!\n", correct, DATA_SIZE);
  //  
        
  // Shutdown and cleanup
	int temp_ = 0;


 for (j = 0; j < NO_NODES; j++)
 {
 	if (out_array_tb[j] >= 0) // || out_array_tb[j]== 0)
 	{
 		//printf("out_array[%d] = %f \n", j, out_array[j]);
 		temp_++;
 	}
 }


  clReleaseMemObject(in_array);
  //clReleaseMemObject(synaptic_weights);
  clReleaseMemObject(out_array);
  clReleaseProgram(program);
  clReleaseKernel(kernel);
  clReleaseCommandQueue(commands);
  clReleaseContext(context);

	if (temp_ == NO_NODES)
	{
		printf("*********************************************************** \n");
		printf("TEST PASSED !!!!!! The output matches the desired output. \n");
		printf("*********************************************************** \n");
		return EXIT_SUCCESS;
	}
	else
	{
		printf("**************************************************************** \n");
		printf("TEST Failed !!!!!! The output does not match the desired output. \n");
		printf("**************************************************************** \n");
		return -1;
	}

  //if(correct == DATA_SIZE){
  //  printf("Test passed!\n");
  //  return EXIT_SUCCESS;
  //}
  //else{
  //  printf("Test failed\n");
  //  return -1;
  //}
}
Example #2
0
int main(int argc, char** argv) {

   /* OpenCL 1.1 data structures */
   cl_platform_id* platforms;
   cl_program program;
   cl_context context;

   /* OpenCL 1.1 scalar data types */
   cl_uint numOfPlatforms;
   cl_int  error;

   /*
    Prepare an array of __cl_float4 via dynamic memory allocation
    This will map to the native vector type which is SSE / SSE2 / AVX on
    Intel-compatible processors.
   */
   cl_float8* ud_in = (cl_float8*) malloc( sizeof(cl_float8) * DATA_SIZE); // input to device
   cl_float8* ud_out = (cl_float8*) malloc( sizeof(cl_float8) * DATA_SIZE); // output from device
   for( int i = 0; i < DATA_SIZE; ++i) {
       ud_in[i] = (cl_float8){(float)i,(float)i,(float)i,(float)i,(float)i,(float)i,(float)i,(float)i};
       ud_out[i] = (cl_float8){(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f};
   }

   /* 
      Get the number of platforms 
      Remember that for each vendor's SDK installed on the computer,
      the number of available platform also increased. 
    */
   error = clGetPlatformIDs(0, NULL, &numOfPlatforms);
   if(error != CL_SUCCESS ) {			
      perror("Unable to find any OpenCL platforms");
      exit(1);
   }

   platforms = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numOfPlatforms);
   printf("Number of OpenCL platforms found: %d\n", numOfPlatforms);

   error = clGetPlatformIDs(numOfPlatforms, platforms, NULL);
   if(error != CL_SUCCESS ) {			
      perror("Unable to find any OpenCL platforms");
      exit(1);
   }
   // Search for a CPU/GPU device through the installed platforms
   // Build a OpenCL program and do not run it.
   for(cl_uint i = 0; i < numOfPlatforms; i++ ) {

        cl_uint numOfDevices = 0;

        /* Determine how many devices are connected to your platform */
        error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &numOfDevices);
        if (error != CL_SUCCESS ) { 
            perror("Unable to obtain any OpenCL compliant device info");
            exit(1);
        }
        cl_device_id* devices = (cl_device_id*) alloca(sizeof(cl_device_id) * numOfDevices);

        /* Load the information about your devices into the variable 'devices' */
        error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numOfDevices, devices, NULL);
        if (error != CL_SUCCESS ) { 
            perror("Unable to obtain any OpenCL compliant device info");
            exit(1);
        }
        printf("Number of detected OpenCL devices: %d\n", numOfDevices);

	    /* Create a context */
        cl_context_properties ctx[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i], 0 };
	    context = clCreateContext(ctx, numOfDevices, devices, NULL, NULL, &error);
	    if(error != CL_SUCCESS) {
	        perror("Can't create a valid OpenCL context");
	        exit(1);
	    }

	    /* For each device, create a buffer and partition that data among the devices for compute! */
	    cl_mem inobj = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
	                                  sizeof(cl_float8) * DATA_SIZE, ud_in, &error);
	    if(error != CL_SUCCESS) {
	        perror("Can't create a buffer");
	        exit(1);
	    }

        int offset = 0; 
        for(int i = 0; i < numOfDevices; ++i, ++offset ) {
	        /* Load the two source files into temporary datastores */
	        const char *file_names[] = {"vectorization.cl"}; 
	        const int NUMBER_OF_FILES = 1;
	        char* buffer[NUMBER_OF_FILES];
	        size_t sizes[NUMBER_OF_FILES];
	        loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes);
	
	        /* Create the OpenCL program object */
	        program = clCreateProgramWithSource(context, NUMBER_OF_FILES, (const char**)buffer, sizes, &error);				
		    if(error != CL_SUCCESS) {
		      perror("Can't create the OpenCL program object");
		      exit(1);   
		    }

	        /* Build OpenCL program object and dump the error message, if any */
	        char *program_log;
	        size_t log_size;
            char* build_options = "-fbin-llvmir -fbin-amdil -fbin-exe";
	        error = clBuildProgram(program, 1, &devices[i], build_options, NULL, NULL);		
		    if(error != CL_SUCCESS) {
		      // If there's an error whilst building the program, dump the log
		      clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
		      program_log = (char*) malloc(log_size+1);
		      program_log[log_size] = '\0';
		      clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, 
		            log_size+1, program_log, NULL);
		      printf("\n=== ERROR ===\n\n%s\n=============\n", program_log);
		      free(program_log);
		      exit(1);
		    }
	  
	        /* Query the program as to how many kernels were detected */
	        cl_uint numOfKernels;
	        error = clCreateKernelsInProgram(program, 0, NULL, &numOfKernels);
	        if (error != CL_SUCCESS) {
	            perror("Unable to retrieve kernel count from program");
	            exit(1);
	        }
	        cl_kernel* kernels = (cl_kernel*) alloca(sizeof(cl_kernel) * numOfKernels);
	        error = clCreateKernelsInProgram(program, numOfKernels, kernels, NULL);

            /* Loop thru each kernel and execute on device */
	        for(cl_uint j = 0; j < numOfKernels; j++) {
	            char kernelName[32];
	            cl_uint argCnt;
	            clGetKernelInfo(kernels[j], CL_KERNEL_FUNCTION_NAME, sizeof(kernelName), kernelName, NULL);
	            clGetKernelInfo(kernels[j], CL_KERNEL_NUM_ARGS, sizeof(argCnt), &argCnt, NULL);
	            printf("Kernel name: %s with arity: %d\n", kernelName, argCnt);
	            printf("About to create command queue and enqueue this kernel...\n");
	
	            /* Create a command queue */
	            cl_command_queue cQ = clCreateCommandQueue(context, devices[i], 0, &error);
	            if (error != CL_SUCCESS) { 
	                perror("Unable to create command-queue");
	                exit(1);
	            }
	
                /* Create a buffer and copy the data from the main buffer */
	            cl_mem outobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 
	                                                sizeof(cl_float8) * DATA_SIZE, 0, &error);
	            if (error != CL_SUCCESS) { 
	                perror("Unable to create sub-buffer object");
	                exit(1);
	            }

	            /* Let OpenCL know that the kernel is suppose to receive an argument */
	            error = clSetKernelArg(kernels[j], 0, sizeof(cl_mem), &inobj);
	            error = clSetKernelArg(kernels[j], 1, sizeof(cl_mem), &outobj);
	            if (error != CL_SUCCESS) { 
	                perror("Unable to set buffer object in kernel");
	                exit(1);
	            }
	
	            /* Enqueue the kernel to the command queue */
	            error = clEnqueueTask(cQ, kernels[j], 0, NULL, NULL);
	            if (error != CL_SUCCESS) { 
	                perror("Unable to enqueue task to command-queue");
	                exit(1);
	            }
	            printf("Task has been enqueued successfully!\n");

	            /* Enqueue the read-back from device to host */
	            error = clEnqueueReadBuffer(cQ, outobj,
	                                         CL_TRUE,               // blocking read
	                                         0,                      // read from the start
	                                         sizeof(cl_float8)*DATA_SIZE,          // how much to copy
	                                         ud_out, 0, NULL, NULL);
                /* Check the returned data */
	            if ( valuesOK(ud_in, ud_out, DATA_SIZE) ) {
	                printf("Check passed!\n");
	            } else printf("Check failed!\n");
	
	            /* Release the command queue */
	            clReleaseCommandQueue(cQ);
	            clReleaseMemObject(outobj);
	        } 

        /* Clean up */
        
        for(cl_uint i = 0; i < numOfKernels; i++) { clReleaseKernel(kernels[i]); }
        for(int i=0; i< NUMBER_OF_FILES; i++) { free(buffer[i]); }
        clReleaseProgram(program);
    }// end of device loop and execution
    
	    clReleaseMemObject(inobj);
        clReleaseContext(context);
   }// end of platform loop

   free(ud_in);
   free(ud_out);
}
Example #3
0
int main( int argc, char **argv )
{	
  struct pb_Parameters *params;
  params = pb_ReadParameters(&argc, argv);
  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL))
    {
      fprintf(stderr, "Expecting one input filename\n");
      exit(-1);
    }

  int err = 0;
  if(argc != 3)
    err |= 1;
  else {
    char* numend;
    N = strtol(argv[1], &numend, 10);
    if(numend == argv[1])
      err |= 2;
    B = strtol(argv[2], &numend, 10);
    if(numend == argv[2])
      err |= 4;
  }

  if(err)
  {
    fprintf(stderr, "Expecting two integers for N and B\n");
    exit(-1);
  }

  //8*1024*1024;
  int n_bytes = N * B* sizeof(float2);
  int nthreads = T;
  
  struct pb_TimerSet timers;
  pb_InitializeTimerSet(&timers);
  pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);
  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);

  float *shared_source =(float *)malloc(n_bytes);  
  float2 *source    = (float2 *)malloc( n_bytes );
  float2 *result    = (float2 *)calloc( N*B, sizeof(float2) );

  inputData(params->inpFiles[0],(float*)source,N*B*2);

  // OpenCL Code 
  cl_int clErrNum;
  pb_Context* pb_context;
  pb_context = pb_InitOpenCLContext(params);
  if (pb_context == NULL) {
    fprintf (stderr, "Error: No OpenCL platform/device can be found."); 
    return -1;
  }

  cl_int clStatus;
  cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
  cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
  cl_context clContext = (cl_context) pb_context->clContext;
  cl_command_queue clCommandQueue;

  cl_program clProgram;

  cl_kernel fft_kernel;

  cl_mem d_source, d_work; //float2 *d_source, *d_work;
  cl_mem d_shared_source; //float *d_shared_source;

  clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clErrNum);
  OCL_ERRCK_VAR(clErrNum);
  
  pb_SetOpenCL(&clContext, &clCommandQueue);
  pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);

  const char *source_path = "src/opencl_nvidia/fft_kernel.cl";
  char *sourceCode;
  sourceCode = readFile(source_path);
  if (sourceCode == NULL) {
    fprintf(stderr, "Could not load program source of '%s'\n", source_path); exit(1);
  }

  clProgram = clCreateProgramWithSource(clContext, 1, (const char **)&sourceCode, NULL, &clErrNum);
  OCL_ERRCK_VAR(clErrNum);

  free(sourceCode);

  /*
    char compileOptions[1024];
  //                -cl-nv-verbose // Provides register info for NVIDIA devices
  // Set all Macros referenced by kernels
  sprintf(compileOptions, "\
                -D PRESCAN_THREADS=%u\
                -D KB=%u -D UNROLL=%u\
                -D BINS_PER_BLOCK=%u -D BLOCK_X=%u",

                prescanThreads,
                lmemKB, UNROLL,
                bins_per_block, blockX
            ); 
  */
  OCL_ERRCK_RETVAL ( clBuildProgram(clProgram, 1, &clDevice, NULL /*compileOptions*/, NULL, NULL) );


  char *build_log;
  size_t ret_val_size;
  OCL_ERRCK_RETVAL ( clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size) );
  build_log = (char *)malloc(ret_val_size+1);
  OCL_ERRCK_RETVAL ( clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL) );

  // to be careful, terminate with \0
  build_log[ret_val_size] = '\0';

  fprintf(stderr, "%s\n", build_log );

  fft_kernel = clCreateKernel(clProgram, "GPU_FftShMem", &clErrNum);
  OCL_ERRCK_VAR(clErrNum);

  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
  // allocate & copy device memory
  d_shared_source = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, n_bytes, shared_source, &clErrNum);  OCL_ERRCK_VAR(clErrNum);

  d_source = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, n_bytes, source, &clErrNum);  OCL_ERRCK_VAR(clErrNum);
  
  //result is initially zero'd out
  d_work = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, n_bytes, result, &clErrNum);  OCL_ERRCK_VAR(clErrNum);

  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
  
  size_t block[1] = { nthreads };
  size_t grid[1] = { B*block[0] };
  
  OCL_ERRCK_RETVAL( clSetKernelArg(fft_kernel, 0, sizeof(cl_mem), (void *)&d_source) );
  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, fft_kernel, 1, 0,
                            grid, block, 0, 0, 0) ); 	
  pb_SwitchToTimer(&timers, pb_TimerID_COPY);

  // copy device memory to host
  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, d_source, CL_TRUE, 
                        0, // Offset in bytes
                        n_bytes, // Size of data to read
                        result, // Host Source
                        0, NULL, NULL) );
                        
  if (params->outFile) {
    /* Write result to file */
    pb_SwitchToTimer(&timers, pb_TimerID_IO);
    outputData(params->outFile, (float*)result, N*B*2);
    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
  }

  OCL_ERRCK_RETVAL ( clReleaseMemObject(d_source) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(d_work) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(d_shared_source) );
  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);

  free(shared_source);  
  free(source);
  free(result);
  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
  pb_PrintTimerSet(&timers);
  
  pb_DestroyTimerSet(&timers);
  
  return 0;
}
Example #4
0
// main() for simple buffer and sub-buffer example
//
int main(int argc, char** argv)
{
    cl_int errNum;
    cl_uint numPlatforms;
    cl_uint numDevices;
    cl_platform_id * platformIDs;
    cl_device_id * deviceIDs;
    cl_context context;
    cl_program program;
    std::vector<cl_kernel> kernels;
    std::vector<cl_command_queue> queues;
    std::vector<cl_mem> buffers;
    int * inputOutput;
    std::cout << "Simple buffer and sub-buffer Example" << std::endl;
    // First, select an OpenCL platform to run on.
    errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
    checkErr(
             (errNum != CL_SUCCESS) ?
             errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS),
             "clGetPlatformIDs");
    platformIDs = (cl_platform_id *)alloca(sizeof(cl_platform_id) * numPlatforms);
    std::cout << "Number of platforms: \t" << numPlatforms << std::endl;
    errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
    checkErr(
             (errNum != CL_SUCCESS) ?
             errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS),
             "clGetPlatformIDs");
    std::ifstream srcFile("simple.cl");
    
    checkErr(srcFile.is_open() ? CL_SUCCESS : -1, "reading simple.cl");
    
    std::string srcProg(
                        std::istreambuf_iterator<char>(srcFile),
                        (std::istreambuf_iterator<char>()));
    const char * src = srcProg.c_str();
    size_t length = srcProg.length();
    deviceIDs = NULL;
    DisplayPlatformInfo(
                        platformIDs[PLATFORM_INDEX],
                        CL_PLATFORM_VENDOR,
                        "CL_PLATFORM_VENDOR");
    errNum = clGetDeviceIDs(
                            platformIDs[PLATFORM_INDEX],
                            CL_DEVICE_TYPE_ALL,
                            0,
                            NULL,
                            &numDevices);
    if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND){
        checkErr(errNum, "clGetDeviceIDs");
    }
    
    deviceIDs = (cl_device_id *)alloca(
                                       sizeof(cl_device_id) * numDevices);
    errNum = clGetDeviceIDs(
                            platformIDs[PLATFORM_INDEX],
                            CL_DEVICE_TYPE_ALL,
                            numDevices,
                            &deviceIDs[0],
                            NULL);
    checkErr(errNum, "clGetDeviceIDs");
    
    cl_context_properties contextProperties[] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platformIDs[PLATFORM_INDEX],
        0
    };
    
    context = clCreateContext(
                              contextProperties,
                              numDevices,
                              deviceIDs,
                              NULL,
                              NULL,
                              &errNum);
    
    checkErr(errNum, "clCreateContext");
    // Create program from source
    program = clCreateProgramWithSource(
                                        context,
                                        1,
                                        &src,
                                        &length,
                                        &errNum);
    checkErr(errNum, "clCreateProgramWithSource");
    
    // Build program
    errNum = clBuildProgram(
                            program,
                            numDevices,
                            deviceIDs,
                            "-I.",
                            NULL,
                            NULL);

    if (errNum != CL_SUCCESS){
        // Determine the reason for the error
        char buildLog[16384];
        clGetProgramBuildInfo(
                              program,
                              deviceIDs[0],
                              CL_PROGRAM_BUILD_LOG,
                              sizeof(buildLog),
                              buildLog,
                              NULL);
        std::cerr << "Error in OpenCL C source: " << std::endl;
        std::cerr << buildLog;
        checkErr(errNum, "clBuildProgram");
    }
        // create buffers and sub-buffers
        inputOutput = new int[NUM_BUFFER_ELEMENTS * numDevices];
        for (unsigned int i = 0; i < NUM_BUFFER_ELEMENTS * numDevices; i++)
        {
            inputOutput[i] = i;
        }
        
        // create a single buffer to cover all the input data
        cl_mem buffer = clCreateBuffer(
                                       context,
                                       CL_MEM_READ_WRITE,
                                       sizeof(int) * NUM_BUFFER_ELEMENTS * numDevices,
                                       NULL,
                                       &errNum);
        checkErr(errNum, "clCreateBuffer");
        buffers.push_back(buffer);
        // now for all devices other than the first create a sub-buffer
        for (unsigned int i = 1; i < numDevices; i++)
        {
            cl_buffer_region region =
            {
                NUM_BUFFER_ELEMENTS * i * sizeof(int),
                NUM_BUFFER_ELEMENTS * sizeof(int)
            };
            buffer = clCreateSubBuffer(
                                       buffers[0],
                                       CL_MEM_READ_WRITE,
                                       CL_BUFFER_CREATE_TYPE_REGION,
                                       &region,
                                       &errNum);
            checkErr(errNum, "clCreateSubBuffer");
            buffers.push_back(buffer);
        }
        // Create command queues
        for (int i = 0; i < numDevices; i++)
        {
            InfoDevice<cl_device_type>::display(deviceIDs[i], CL_DEVICE_TYPE, "CL_DEVICE_TYPE");
            cl_command_queue queue =
            clCreateCommandQueue(
                                 context,
                                 deviceIDs[i],
                                 0,
                                 &errNum);
            checkErr(errNum, "clCreateCommandQueue");
            queues.push_back(queue);
            cl_kernel kernel = clCreateKernel(
                                              program,
                                              "square",
                                              &errNum);
            checkErr(errNum, "clCreateKernel(square)");
            errNum = clSetKernelArg(
                                    kernel,
                                    0,
                                    sizeof(cl_mem), (void *)&buffers[i]);
            checkErr(errNum, "clSetKernelArg(square)");
            kernels.push_back(kernel);
            // Write input data
            clEnqueueWriteBuffer(
                                 queues[0],
                                 buffers[0],
                                 CL_TRUE,
                                 0,
                                 sizeof(int) * NUM_BUFFER_ELEMENTS * numDevices,
                                 (void*)inputOutput,
                                 0,
                                 NULL,
                                 NULL);
            std::vector<cl_event> events;
            // call kernel for each device
            for (int i = 0; i < queues.size(); i++)
            {
                cl_event event;
                size_t gWI = NUM_BUFFER_ELEMENTS;
                errNum = clEnqueueNDRangeKernel(
                                                queues[i],
                                                kernels[i],
                                                1,
                                                NULL,
                                                (const size_t*)&gWI,
                                                (const size_t*)NULL,
                                                0,
                                                0,
                                                &event);
                events.push_back(event);
            }
            // Technically don't need this as we are doing a blocking read
            // with in-order queue.
            clWaitForEvents(events.size(), events.data());
            // Read back computed data
            clEnqueueReadBuffer(
                                queues[0],
                                buffers[0],
                                CL_TRUE,
                                0,
                                sizeof(int) * NUM_BUFFER_ELEMENTS * numDevices,
                                (void*)inputOutput,
                                0,
                                NULL,
                                NULL);
            // Display output in rows
            for (unsigned i = 0; i < numDevices; i++)
            {
                for (unsigned elems = i * NUM_BUFFER_ELEMENTS;
                     elems < ((i+1) * NUM_BUFFER_ELEMENTS);
                     elems++)
                {
                    std::cout << " " << inputOutput[elems];
                }
                std::cout << std::endl;
            }
            std::cout << "Program completed successfully" << std::endl;
            return 0; 
        }
}
Example #5
0
	int main()
	{
	cl_int num_rand = 4096*256; /* The number of random numbers generated using one generator */
	int count_all, i, num_generator = sizeof(mts)/sizeof(mts[0]); /* The number of generators */
	double pi;
	cl_platform_id platform_id = NULL;
	cl_uint ret_num_platforms;
	cl_device_id device_id = NULL;
	cl_uint ret_num_devices;
	cl_context context = NULL;
	cl_command_queue command_queue = NULL;
	cl_program program = NULL;
	cl_kernel kernel_mt = NULL, kernel_pi = NULL;
	size_t kernel_code_size;
	char *kernel_src_str;
	cl_uint *result;
	cl_int ret;
	FILE *fp;
	cl_mem rand, count;
	size_t global_item_size[3], local_item_size[3];
	cl_mem dev_mts;
	cl_event ev_mt_end, ev_pi_end, ev_copy_end;
	cl_ulong prof_start, prof_mt_end, prof_pi_end, prof_copy_end;
 
	clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
	clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id,
	&ret_num_devices);
	context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
	result = (cl_uint*)malloc(sizeof(cl_uint)*num_generator);
 
	command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret);
	fp = fopen("mt.cl", "r");
	kernel_src_str = (char*)malloc(MAX_SOURCE_SIZE);
	kernel_code_size = fread(kernel_src_str, 1, MAX_SOURCE_SIZE, fp);
	fclose(fp);
 
	/* Create output buffer */
	rand = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint)*num_rand*num_generator, NULL, &ret);
	count = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint)*num_generator, NULL, &ret);
 
	/* Build Program*/
	program = clCreateProgramWithSource(context, 1, (const char **)&kernel_src_str,
	(const size_t *)&kernel_code_size, &ret);
	clBuildProgram(program, 1, &device_id, "", NULL, NULL);
	kernel_mt = clCreateKernel(program, "genrand", &ret);
	kernel_pi = clCreateKernel(program, "calc_pi", &ret);
 
	/* Create input parameter */
	dev_mts = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(mts), NULL, &ret);
	clEnqueueWriteBuffer(command_queue, dev_mts, CL_TRUE, 0, sizeof(mts), mts, 0, NULL, NULL);
 
	/* Set Kernel Arguments */
	clSetKernelArg(kernel_mt, 0, sizeof(cl_mem), (void*)&rand); /* Random numbers (output of genrand) */
	clSetKernelArg(kernel_mt, 1, sizeof(cl_mem), (void*)&dev_mts); /* MT parameter (input to genrand) */
	clSetKernelArg(kernel_mt, 2, sizeof(num_rand), &num_rand); /* Number of random numbers to generate */
 
	clSetKernelArg(kernel_pi, 0, sizeof(cl_mem), (void*)&count); /* Counter for points within circle (output of calc_pi) */
	clSetKernelArg(kernel_pi, 1, sizeof(cl_mem), (void*)&rand); /* Random numbers (input to calc_pi) */
	clSetKernelArg(kernel_pi, 2, sizeof(num_rand), &num_rand); /* Number of random numbers used */
 
	global_item_size[0] = num_generator; global_item_size[1] = 1; global_item_size[2] = 1;
	local_item_size[0] = num_generator; local_item_size[1] = 1; local_item_size[2] = 1;
 
	/* Create a random number array */
	clEnqueueNDRangeKernel(command_queue, kernel_mt, 1, NULL, global_item_size, local_item_size, 0, NULL, &ev_mt_end);
 
	/* Compute PI */
	clEnqueueNDRangeKernel(command_queue, kernel_pi, 1, NULL, global_item_size, local_item_size, 0, NULL, &ev_pi_end);
 
	/* Get result */
	clEnqueueReadBuffer(command_queue, count, CL_TRUE, 0, sizeof(cl_uint)*num_generator, result, 0, NULL, &ev_copy_end);
 
	/* Average the values of PI */
	count_all = 0;
	for (i=0; i < num_generator; i++) {
	count_all += result[i];
	}
 
	pi = ((double)count_all)/(num_rand * num_generator) * 4;
	printf("pi = %f\n", pi);
 
	/* Get execution time info */
	clGetEventProfilingInfo(ev_mt_end, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &prof_start, NULL);
	clGetEventProfilingInfo(ev_mt_end, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &prof_mt_end, NULL);
	clGetEventProfilingInfo(ev_pi_end, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &prof_pi_end, NULL);
	clGetEventProfilingInfo(ev_copy_end, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &prof_copy_end, NULL);
 
	printf(" mt: %f[ms]\n"
		" pi: %f[ms]\n"
		" copy: %f[ms]\n",
		(prof_mt_end - prof_start)/(1000000.0),
		(prof_pi_end - prof_mt_end)/(1000000.0),
		(prof_copy_end - prof_pi_end)/(1000000.0));
 
	clReleaseEvent(ev_mt_end);
	clReleaseEvent(ev_pi_end);
	clReleaseEvent(ev_copy_end);
 
	clReleaseMemObject(rand);
	clReleaseMemObject(count);
	clReleaseKernel(kernel_mt);
	clReleaseKernel(kernel_pi);
	clReleaseProgram(program);
	clReleaseCommandQueue(command_queue);
	clReleaseContext(context);
	free(kernel_src_str);
	free(result);
	return 0;
}
Example #6
0
void InitOpenCL()
{
    // 1. Get a platform.
    cl_platform_id platform;
    
    clGetPlatformIDs( 1, &platform, NULL );
    // 2. Find a gpu device.
    cl_device_id device;
    
    clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU,
                   1,
                   &device,
                   NULL);
    // 3. Create a context and command queue on that device.
    cl_context context = clCreateContext( NULL,
                                         1,
                                         &device,
                                         NULL, NULL, NULL);
    queue = clCreateCommandQueue( context,
                                 device,
                                 0, NULL );
    // 4. Perform runtime source compilation, and obtain kernel entry point.
    std::ifstream file("scene.cl");
    std::string source;
    if (file){
    while(!file.eof()){
        char line[256];
        file.getline(line,255);
        source += std::string(line) + "\n";
    }
    }
    if (source.length()==0)
    {
        std::string err = "fail to load shader";
    }
    
    cl_ulong maxSize;
    clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(cl_ulong), &maxSize, 0);
    
    const char* str = source.c_str();
    cl_program program = clCreateProgramWithSource( context,
                                                   1,
                                                   &str,
                                                   NULL, NULL );
    cl_int result = clBuildProgram( program, 1, &device, NULL, NULL, NULL );
    if ( result ){
        char* build_log;
        size_t log_size;
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
        build_log = new char[log_size+1];
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
        build_log[log_size] = '\0';
        if( log_size > 2 ) {
            std::cout << "build log: " << build_log << std::endl;
        }
        delete[] build_log;
        std::cout << "Error during compilation! (" << result << ")" << std::endl;
    }
    kernel = clCreateKernel( program, "tracekernel", NULL );
    // 5. Create a data buffer.
    buffer        = clCreateBuffer( context,
                                   CL_MEM_WRITE_ONLY,
                                   kWidth * kHeight *sizeof(cl_float4),
                                   NULL, 0 );
    viewTransform = clCreateBuffer( context,
                                   CL_MEM_READ_WRITE,
                                   16 *sizeof(cl_float),
                                   NULL, 0 );
    
    worldTransforms = clCreateBuffer( context,
                                     CL_MEM_READ_WRITE,
                                     16 *sizeof(cl_float)*2,
                                     NULL, 0 );
    
    clSetKernelArg(kernel, 0, sizeof(buffer), (void*) &buffer);
    clSetKernelArg(kernel, 1, sizeof(cl_uint), (void*) &kWidth);
    clSetKernelArg(kernel, 2, sizeof(cl_uint), (void*) &kWidth);
    clSetKernelArg(kernel, 3, sizeof(viewTransform), (void*) &viewTransform);
    clSetKernelArg(kernel, 4, sizeof(worldTransforms), (void*) &worldTransforms);
}
Example #7
0
int main(void) {
    const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // Generate the input array on the host.
    float h_a[ARRAY_SIZE];
    float h_b[ARRAY_SIZE];
    for (int i = 0; i < ARRAY_SIZE; i++) {
        h_a[i] = (float)i;
        h_b[i] = (float)(2 * i);
    }

    float h_c[ARRAY_SIZE];

    FILE *fp;
    char *source_str;
    size_t source_size;

    fp = fopen("vectors_cl.cl", "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char *)malloc(MAX_SOURCE_SIZE);
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);

    // Get platform and device information
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1,
                         &device_id, &ret_num_devices);

    // Create an OpenCL context
    cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);

    // Create a command queue
    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

    // Create memory buffers on the device for each vector
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                      ARRAY_BYTES, NULL, &ret);
    cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                      ARRAY_BYTES, NULL, &ret);
    cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                      ARRAY_BYTES, NULL, &ret);

    // Copy h_a and h_b to memory buffer
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
                               ARRAY_BYTES, h_a, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
                               ARRAY_BYTES, h_b, 0, NULL, NULL);

    // Create a program from the kernel source
    cl_program program = clCreateProgramWithSource(context, 1,
        (const char **)&source_str, (const size_t *)&source_size, &ret);
    if (ret != 0) {
        printf("clCreateProgramWithSource returned non-zero status %d\n\n", ret);
        exit(1);
    }

    // Build the program
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    if (ret != 0) {
        printf("clBuildProgram returned non-zero status %d: ", ret);

        if (ret == CL_INVALID_PROGRAM) {
            printf("invalid program\n");
        } else if (ret == CL_INVALID_VALUE) {
            printf("invalid value\n");
        } else if (ret == CL_INVALID_DEVICE) {
            printf("invalid device\n");
        } else if (ret == CL_INVALID_BINARY) {
            printf("invalid binary\n");
        } else if (ret == CL_INVALID_BUILD_OPTIONS) {
            printf("invalid build options\n");
        } else if (ret == CL_INVALID_OPERATION) {
            printf("invalid operation\n");
        } else if (ret == CL_COMPILER_NOT_AVAILABLE) {
            printf("compiler not available\n");
        } else if (ret == CL_BUILD_PROGRAM_FAILURE) {
            printf("build program failure\n");

            // Determine the size of the log
            size_t log_size;
            clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

            // Allocate memory for the log
            char *log = (char *) malloc(log_size);

            // Get the log
            clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);

            // Print the log
            printf("%s\n", log);
        } else if (ret == CL_OUT_OF_HOST_MEMORY) {
            printf("out of host memory\n");
        }
        exit(1);
    }

    // Create the OpenCL kernel
    cl_kernel kernel = clCreateKernel(program, "add", &ret);

    // Set the arguments of the kernel
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
    size_t array_size = ARRAY_SIZE;
    ret = clSetKernelArg(kernel, 3, sizeof(const size_t), (void *)&array_size);

    // Execute the OpenCL kernel on the list
    size_t global_item_size = ARRAY_SIZE; // Process the entire lists
    size_t local_item_size = 1; // Divide work items into groups of 64
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
            &global_item_size, &local_item_size, 0, NULL, NULL);

    // Read the memory buffer C on the device to the local variable C
    ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
                              ARRAY_BYTES, h_c, 0, NULL, NULL);

    // Print out the resulting array.
    for (int i = 0; i < 8; i++) {
        printf("%d + %d = %d", (int)h_a[i], (int)h_b[i], (int)h_c[i]);
        printf(((i % 4) != 3) ? "\t" : "\n");
    }

    printf("...\n");

    for (int i = ARRAY_SIZE - 8; i < ARRAY_SIZE; i++) {
        printf("%d + %d = %d",
               (int)h_a[i], (int)h_b[i], (int)h_c[i]);
        printf(((i % 4) != 3) ? "\t" : "\n");
    }

    // Clean up
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(a_mem_obj);
    ret = clReleaseMemObject(b_mem_obj);
    ret = clReleaseMemObject(c_mem_obj);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);

    return 0;
}
void buildOpenCLKernels_flux_calc_kernely(int xdim0, int ydim0, int xdim1,
                                          int ydim1, int xdim2, int ydim2,
                                          int xdim3, int ydim3) {

  // int ocl_fma = OCL_FMA;
  if (!isbuilt_flux_calc_kernely) {
    buildOpenCLKernels();
    // clSafeCall( clUnloadCompiler() );
    cl_int ret;
    char *source_filename[1] = {(char *)"./OpenCL/flux_calc_kernely.cl"};

    // Load the kernel source code into the array source_str
    FILE *fid;
    char *source_str[1];
    size_t source_size[1];

    for (int i = 0; i < 1; i++) {
      fid = fopen(source_filename[i], "r");
      if (!fid) {
        fprintf(stderr, "Can't open the kernel source file!\n");
        exit(1);
      }

      source_str[i] = (char *)malloc(4 * 0x1000000);
      source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid);
      if (source_size[i] != 4 * 0x1000000) {
        if (ferror(fid)) {
          printf("Error while reading kernel source file %s\n",
                 source_filename[i]);
          exit(-1);
        }
        if (feof(fid))
          printf("Kernel source file %s succesfuly read.\n",
                 source_filename[i]);
        // printf("%s\n",source_str[i]);
      }
      fclose(fid);
    }

    printf("Compiling flux_calc_kernely %d source -- start \n", OCL_FMA);

    // Create a program from the source
    OPS_opencl_core.program = clCreateProgramWithSource(
        OPS_opencl_core.context, 1, (const char **)&source_str,
        (const size_t *)&source_size, &ret);
    clSafeCall(ret);

    // Build the program
    char buildOpts[255 * 4];
    char *pPath = NULL;
    pPath = getenv("OPS_INSTALL_PATH");
    if (pPath != NULL)
      if (OCL_FMA)
        sprintf(buildOpts,
                "-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d  "
                "-Dxdim0_flux_calc_kernely=%d  -Dydim0_flux_calc_kernely=%d  "
                "-Dxdim1_flux_calc_kernely=%d  -Dydim1_flux_calc_kernely=%d  "
                "-Dxdim2_flux_calc_kernely=%d  -Dydim2_flux_calc_kernely=%d  "
                "-Dxdim3_flux_calc_kernely=%d  -Dydim3_flux_calc_kernely=%d ",
                pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3,
                ydim3);
      else
        sprintf(buildOpts,
                "-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d  "
                "-Dxdim0_flux_calc_kernely=%d  -Dydim0_flux_calc_kernely=%d  "
                "-Dxdim1_flux_calc_kernely=%d  -Dydim1_flux_calc_kernely=%d  "
                "-Dxdim2_flux_calc_kernely=%d  -Dydim2_flux_calc_kernely=%d  "
                "-Dxdim3_flux_calc_kernely=%d  -Dydim3_flux_calc_kernely=%d ",
                pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3,
                ydim3);
    else {
      sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath);
      exit(EXIT_FAILURE);
    }

    ret = clBuildProgram(OPS_opencl_core.program, 1, &OPS_opencl_core.device_id,
                         buildOpts, NULL, NULL);

    if (ret != CL_SUCCESS) {
      char *build_log;
      size_t log_size;
      clSafeCall(clGetProgramBuildInfo(
          OPS_opencl_core.program, OPS_opencl_core.device_id,
          CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size));
      build_log = (char *)malloc(log_size + 1);
      clSafeCall(clGetProgramBuildInfo(
          OPS_opencl_core.program, OPS_opencl_core.device_id,
          CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL));
      build_log[log_size] = '\0';
      fprintf(
          stderr,
          "=============== OpenCL Program Build Info ================\n\n%s",
          build_log);
      fprintf(stderr,
              "\n========================================================= \n");
      free(build_log);
      exit(EXIT_FAILURE);
    }
    printf("compiling flux_calc_kernely -- done\n");

    // Create the OpenCL kernel
    OPS_opencl_core.kernel[107] =
        clCreateKernel(OPS_opencl_core.program, "ops_flux_calc_kernely", &ret);
    clSafeCall(ret);

    isbuilt_flux_calc_kernely = true;
  }
}
Example #9
0
int main(int argc, char** argv)
{
    int err;                            // error code returned from api calls
    
    float data[DATA_SIZE];              // original data set given to device
    float results[DATA_SIZE];           // results returned from device
    unsigned int correct;               // number of correct results returned
    
    size_t global;                      // global domain size for our calculation
    size_t local;                       // local domain size for our calculation
    
    cl_device_id device_id;             // compute device id
    cl_context context;                 // compute context
    cl_command_queue commands;          // compute command queue
    cl_program program;                 // compute program
    cl_kernel kernel;                   // compute kernel
    
    cl_mem input;                       // device memory used for the input array
    cl_mem output;                      // device memory used for the output array
    
    // Fill our data set with random float values
    //
    int i = 0;
    unsigned int count = DATA_SIZE;
    for(i = 0; i < count; i++)
        data[i] = rand() / (float)RAND_MAX;
    
    // Connect to a compute device
    //
    int gpu = 1;
    err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to create a device group!\n");
        return EXIT_FAILURE;
    }
    
    // Create a compute context
    //
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        return EXIT_FAILURE;
    }
    
    // Create a command commands
    //
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        return EXIT_FAILURE;
    }
    
    // Create the compute program from the source buffer
    //
    program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
    if (!program)
    {
        printf("Error: Failed to create compute program!\n");
        return EXIT_FAILURE;
    }
    
    // Build the program executable
    //
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];
        
        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        exit(1);
    }
    
    // Create the compute kernel in the program we wish to run
    //
    kernel = clCreateKernel(program, "square", &err);
    if (!kernel || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        exit(1);
    }
    
    // Create the input and output arrays in device memory for our calculation
    //
    input = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * count, NULL, NULL);
    output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
    if (!input || !output)
    {
        printf("Error: Failed to allocate device memory!\n");
        exit(1);
    }
    
    // Write our data set into the input array in device memory
    //
    err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write to source array!\n");
        exit(1);
    }
    
    // Set the arguments to our compute kernel
    //
    err = 0;
    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel arguments! %d\n", err);
        exit(1);
    }
    
    // Get the maximum work group size for executing the kernel on the device
    //
    err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve kernel work group info! %d\n", err);
        exit(1);
    }
    
    // Execute the kernel over the entire range of our 1d input data set
    // using the maximum number of work group items for this device
    //
    global = count;
    err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
    if (err)
    {
        printf("Error: Failed to execute kernel!\n");
        return EXIT_FAILURE;
    }
    
    // Wait for the command commands to get serviced before reading back results
    //
    clFinish(commands);
    
    // Read back the results from the device to verify the output
    //
    err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to read output array! %d\n", err);
        exit(1);
    }
    
    // Validate our results
    //
    correct = 0;
    for(i = 0; i < count; i++)
    {
        if(results[i] == data[i] * data[i])
            correct++;
    }
    
    // Print a brief summary detailing the results
    //
    printf("Computed '%d/%d' correct values!\n", correct, count);
    
    // Shutdown and cleanup
    //
    clReleaseMemObject(input);
    clReleaseMemObject(output);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);
    
    return 0;
}
Example #10
0
/**
 * @brief Creates an array of objects containing the OpenCL variables of each device
 * @param trDataBase The training database which will contain the instances and the features
 * @param selInstances The instances choosen as initial centroids
 * @param transposedTrDataBase The training database already transposed
 * @param conf The structure with all configuration parameters
 * @return A pointer containing the objects
 */
CLDevice *createDevices(const float *const trDataBase, const int *const selInstances, const float *const transposedTrDataBase, Config *const conf) {


	/********** Find the OpenCL devices specified in configuration ***********/

	// OpenCL variables
	cl_uint numPlatformsDevices;
	cl_device_type deviceType;
	cl_program program;
	cl_kernel kernel;
	cl_int status;

	// Others variables
	auto allDevices = getAllDevices();
	CLDevice *devices = new CLDevice[conf -> nDevices + (conf -> ompThreads > 0)];

	for (int dev = 0; dev < conf -> nDevices; ++dev) {

		bool found = false;
		for (int allDev = 0; allDev < allDevices.size() && !found; ++allDev) {

			// Get the specified OpenCL device
			char dbuff[120];
			check(clGetDeviceInfo(allDevices[allDev], CL_DEVICE_NAME, sizeof(dbuff), dbuff, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_NAME);

			// If the device exists...
			if (conf -> devices[dev] == dbuff) {
				devices[dev].device = allDevices[allDev];
				devices[dev].deviceName = dbuff;
				check(clGetDeviceInfo(devices[dev].device, CL_DEVICE_TYPE, sizeof(cl_device_type), &(devices[dev].deviceType), NULL) != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_TYPE);


				/********** Device local memory usage ***********/

				long int usedMemory = conf -> nFeatures * sizeof(cl_uchar); // Chromosome of the individual
				usedMemory += conf -> trNInstances * sizeof(cl_uchar); // Mapping buffer
				usedMemory += conf -> K * conf -> nFeatures * sizeof(cl_float); // Centroids buffer
				usedMemory += conf -> trNInstances * sizeof(cl_float); // DistCentroids buffer
				usedMemory += conf -> K * sizeof(cl_int); // Samples_in_k buffer

				// Get the maximum local memory size
				long int maxMemory;
				check(clGetDeviceInfo(devices[dev].device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(long int), &maxMemory, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_MAXMEM);

				// Avoid exceeding the maximum local memory available. 1024 bytes of margin
				check(usedMemory > maxMemory - 1024, "%s:\n\tMax memory: %ld bytes\n\tAllow memory: %ld bytes\n\tUsed memory: %ld bytes\n", CL_ERROR_DEVICE_LOCALMEM, maxMemory, maxMemory - 1024, usedMemory);


				/********** Create context ***********/

				devices[dev].context = clCreateContext(NULL, 1, &(devices[dev].device), 0, 0, &status);
				check(status != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_CONTEXT);


				/********** Create Command queue ***********/

				devices[dev].commandQueue = clCreateCommandQueue(devices[dev].context, devices[dev].device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE, &status);
				check(status != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_QUEUE);


				/********** Create kernel ***********/

				// Open the file containing the kernels
				std::fstream kernels(conf -> kernelsFileName.c_str(), std::fstream::in);
				check(!kernels.is_open(), "%s\n", CL_ERROR_FILE_OPEN);

				// Obtain the size
				kernels.seekg(0, kernels.end);
				size_t fSize = kernels.tellg();
				kernels.seekg(0, kernels.beg);

				char *kernelSource = new char[fSize];
				kernels.read(kernelSource, fSize);
				kernels.close();

				// Create program
				program = clCreateProgramWithSource(devices[dev].context, 1, (const char **) &kernelSource, &fSize, &status);
				check(status != CL_SUCCESS, "%s\n", CL_ERROR_PROGRAM_BUILD);

				// Build program for the device in the context
				char buildOptions[196];
				sprintf(buildOptions, "-I include -D N_INSTANCES=%d -D N_FEATURES=%d -D N_OBJECTIVES=%d -D K=%d -D MAX_ITER_KMEANS=%d", conf -> trNInstances, conf -> nFeatures, conf -> nObjectives, conf -> K, conf -> maxIterKmeans);
				if (clBuildProgram(program, 1, &(devices[dev].device), buildOptions, 0, 0) != CL_SUCCESS) {
					char buffer[4096];
					fprintf(stderr, "Error: Could not build the program\n");
					check(clGetProgramBuildInfo(program, devices[dev].device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_PROGRAM_ERRORS);
					check(true, "%s\n", buffer);
				}

				// Create kernel
				const char *kernelName = (devices[dev].deviceType == CL_DEVICE_TYPE_GPU) ? "kmeansGPU" : "";
				devices[dev].kernel = clCreateKernel(program, kernelName, &status);
				check(status != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_BUILD);


				/******* Work-items *******/

				devices[dev].computeUnits = atoi(conf -> computeUnits[dev].c_str());
				devices[dev].wiLocal = atoi(conf -> wiLocal[dev].c_str());
				devices[dev].wiGlobal = devices[dev].computeUnits * devices[dev].wiLocal;


				/******* Create and write the databases and centroids buffers. Create the subpopulations buffer. Set kernel arguments *******/

				// Create buffers
				devices[dev].objSubpopulations = clCreateBuffer(devices[dev].context, CL_MEM_READ_WRITE, conf -> familySize * sizeof(Individual), 0, &status);
				check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_SUBPOPS);

				devices[dev].objTrDataBase = clCreateBuffer(devices[dev].context, CL_MEM_READ_ONLY, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), 0, &status);
				check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_TRDB);

				devices[dev].objTransposedTrDataBase = clCreateBuffer(devices[dev].context, CL_MEM_READ_ONLY, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), 0, &status);
				check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_TTRDB);

				devices[dev].objSelInstances = clCreateBuffer(devices[dev].context, CL_MEM_READ_ONLY, conf -> K * sizeof(cl_int), 0, &status);
				check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_CENTROIDS);

				// Sets kernel arguments
				check(clSetKernelArg(devices[dev].kernel, 0, sizeof(cl_mem), (void *)&(devices[dev].objSubpopulations)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT1);

				check(clSetKernelArg(devices[dev].kernel, 1, sizeof(cl_mem), (void *)&(devices[dev].objSelInstances)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT2);

				check(clSetKernelArg(devices[dev].kernel, 2, sizeof(cl_mem), (void *)&(devices[dev].objTrDataBase)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT3);

				check(clSetKernelArg(devices[dev].kernel, 5, sizeof(cl_mem), (void *)&(devices[dev].objTransposedTrDataBase)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT6);

				// Write buffers
				check(clEnqueueWriteBuffer(devices[dev].commandQueue, devices[dev].objTrDataBase, CL_FALSE, 0, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), trDataBase, 0, NULL, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_ENQUEUE_TRDB);
				check(clEnqueueWriteBuffer(devices[dev].commandQueue, devices[dev].objSelInstances, CL_FALSE, 0, conf -> K * sizeof(cl_int), selInstances, 0, NULL, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_ENQUEUE_CENTROIDS);
				check(clEnqueueWriteBuffer(devices[dev].commandQueue, devices[dev].objTransposedTrDataBase, CL_FALSE, 0, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), transposedTrDataBase, 0, NULL, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_ENQUEUE_TTRDB);

				// Resources used are released
				delete[] kernelSource;
				clReleaseProgram(program);

				found = true;
				allDevices.erase(allDevices.begin() + allDev);
			}
		}

		check(!found, "%s\n", CL_ERROR_DEVICE_FOUND);
	}


	/********** Add the CPU if has been enabled in configuration ***********/

	if (conf -> ompThreads > 0) {
		devices[conf -> nDevices].deviceType = CL_DEVICE_TYPE_CPU;
		devices[conf -> nDevices].computeUnits = conf -> ompThreads;
		++(conf -> nDevices);
	}

	return devices;
}
Example #11
0
int main() {
	char buf[]="Hello, World!";
	size_t srcsize, worksize=strlen(buf);
	
	cl_int error;
	cl_platform_id platform;
	cl_device_id device;
	cl_uint platforms, devices;

	// Fetch the Platform and Device IDs; we only want one.
	error=clGetPlatformIDs(1, &platform, &platforms);
	error=clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, &devices);
	cl_context_properties properties[]={
		CL_CONTEXT_PLATFORM, (cl_context_properties)platform,
		0};
	// Note that nVidia's OpenCL requires the platform property
	cl_context context=clCreateContext(properties, 1, &device, NULL, NULL, &error);
	cl_command_queue cq = clCreateCommandQueue(context, device, 0, &error);
	
	rot13(buf);	// scramble using the CPU
	puts(buf);	// Just to demonstrate the plaintext is destroyed

	//char src[8192];
	//FILE *fil=fopen("rot13.cl","r");
	//srcsize=fread(src, sizeof src, 1, fil);
	//fclose(fil);
	
	const char *src=rot13_cl;
	srcsize=strlen(rot13_cl);

	const char *srcptr[]={src};
	// Submit the source code of the rot13 kernel to OpenCL
	cl_program prog=clCreateProgramWithSource(context,
		1, srcptr, &srcsize, &error);
	// and compile it (after this we could extract the compiled version)
	error=clBuildProgram(prog, 0, NULL, "", NULL, NULL);

	// Allocate memory for the kernel to work with
	cl_mem mem1, mem2;
	mem1=clCreateBuffer(context, CL_MEM_READ_ONLY, worksize, NULL, &error);
	mem2=clCreateBuffer(context, CL_MEM_WRITE_ONLY, worksize, NULL, &error);
	
	// get a handle and map parameters for the kernel
	cl_kernel k_rot13=clCreateKernel(prog, "rot13", &error);
	clSetKernelArg(k_rot13, 0, sizeof(mem1), &mem1);
	clSetKernelArg(k_rot13, 1, sizeof(mem2), &mem2);

	// Target buffer just so we show we got the data from OpenCL
	char buf2[sizeof buf];
	buf2[0]='?';
	buf2[worksize]=0;

	// Send input data to OpenCL (async, don't alter the buffer!)
	error=clEnqueueWriteBuffer(cq, mem1, CL_FALSE, 0, worksize, buf, 0, NULL, NULL);
	// Perform the operation
	error=clEnqueueNDRangeKernel(cq, k_rot13, 1, NULL, &worksize, &worksize, 0, NULL, NULL);
	// Read the result back into buf2
	error=clEnqueueReadBuffer(cq, mem2, CL_FALSE, 0, worksize, buf2, 0, NULL, NULL);
	// Await completion of all the above
	error=clFinish(cq);
	
	// Finally, output out happy message.
	puts(buf2);
}
void btParticlesDynamicsWorld::initCLKernels(int argc, char** argv)
{
    cl_int ciErrNum;

	if (!m_cxMainContext)
	{
		
		cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
		m_cxMainContext = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0, 0);
	
		int numDev = btOpenCLUtils::getNumDevices(m_cxMainContext);
		if (!numDev)
		{
			btAssert(0);
			exit(0);//this is just a demo, exit now
		}

		m_cdDevice =  btOpenCLUtils::getDevice(m_cxMainContext,0);
    	oclCHECKERROR(ciErrNum, CL_SUCCESS);

		btOpenCLDeviceInfo clInfo;
		btOpenCLUtils::getDeviceInfo(m_cdDevice,clInfo);
		btOpenCLUtils::printDeviceInfo(m_cdDevice);

		// create a command-queue
		m_cqCommandQue = clCreateCommandQueue(m_cxMainContext, m_cdDevice, 0, &ciErrNum);
		oclCHECKERROR(ciErrNum, CL_SUCCESS);
	}
	// Program Setup
	size_t program_length;


#ifdef LOAD_FROM_MEMORY
	program_length = strlen(source);
	printf("OpenCL compiles ParticlesOCL.cl ... ");
#else

	const char* fileName = "ParticlesOCL.cl";
	FILE * fp = fopen(fileName, "rb");
	char newFileName[512];
	
	if (fp == NULL)
	{
		sprintf(newFileName,"..//%s",fileName);
		fp = fopen(newFileName, "rb");
		if (fp)
			fileName = newFileName;
	}
	
	if (fp == NULL)
	{
		sprintf(newFileName,"Demos//ParticlesOpenCL//%s",fileName);
		fp = fopen(newFileName, "rb");
		if (fp)
			fileName = newFileName;
	}

	if (fp == NULL)
	{
		sprintf(newFileName,"..//..//..//..//..//Demos//ParticlesOpenCL//%s",fileName);
		fp = fopen(newFileName, "rb");
		if (fp)
			fileName = newFileName;
		else
		{
			printf("cannot find %s\n",newFileName);
			exit(0);
		}
	}

//	char *source = oclLoadProgSource(".//Demos//SpheresGrid//SpheresGrid.cl", "", &program_length);
	//char *source = btOclLoadProgSource(".//Demos//SpheresOpenCL//Shared//SpheresGrid.cl", "", &program_length);

	char *source = btOclLoadProgSource(fileName, "", &program_length);
	if(source == NULL)
	{
		printf("ERROR : OpenCL can't load file %s\n", fileName);
	}
//	oclCHECKERROR (source == NULL, oclFALSE);   
	btAssert(source != NULL);

	// create the program
	printf("OpenCL compiles %s ...", fileName);

#endif //LOAD_FROM_MEMORY


	//printf("%s\n", source);

	m_cpProgram = clCreateProgramWithSource(m_cxMainContext, 1, (const char**)&source, &program_length, &ciErrNum);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
#ifndef LOAD_FROM_MEMORY
	free(source);
#endif //LOAD_FROM_MEMORY

	//#define LOCAL_SIZE_LIMIT 1024U
#define LOCAL_SIZE_MAX 1024U

		    // Build the program with 'mad' Optimization option
#ifdef MAC
	const char* flags = "-I. -DLOCAL_SIZE_MAX=1024U -cl-mad-enable -DMAC -DGUID_ARG";
#else
	const char* flags = "-I. -DLOCAL_SIZE_MAX=1024U -DGUID_ARG= ";
#endif
	// build the program
	ciErrNum = clBuildProgram(m_cpProgram, 0, NULL, flags, NULL, NULL);
	if(ciErrNum != CL_SUCCESS)
	{
		// write out standard error
//		oclLog(LOGBOTH | ERRORMSG, (double)ciErrNum, STDERROR);
		// write out the build log and ptx, then exit
		char cBuildLog[10240];
//		char* cPtx;
//		size_t szPtxLength;
		clGetProgramBuildInfo(m_cpProgram, m_cdDevice, CL_PROGRAM_BUILD_LOG, 
							  sizeof(cBuildLog), cBuildLog, NULL );
//		oclGetProgBinary(m_cpProgram, oclGetFirstDev(m_cxMainContext), &cPtx, &szPtxLength);
//		oclLog(LOGBOTH | CLOSELOG, 0.0, "\n\nLog:\n%s\n\n\n\n\nPtx:\n%s\n\n\n", cBuildLog, cPtx);
		printf("\n\n%s\n\n\n", cBuildLog);
		printf("Press ENTER key to terminate the program\n");
		getchar();
		exit(-1); 
	}
	printf("OK\n");

	// create the kernels

	postInitDeviceData();

	initKernel(PARTICLES_KERNEL_COMPUTE_CELL_ID, "kComputeCellId");
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 1, sizeof(cl_mem), (void*) &m_dPos);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 2, sizeof(cl_mem), (void*) &m_dPosHash);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 3, sizeof(cl_mem), (void*) &m_dSimParams);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);

	initKernel(PARTICLES_KERNEL_INTEGRATE_MOTION, "kIntegrateMotion");
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 1, sizeof(cl_mem), (void *) &m_dPos);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 2, sizeof(cl_mem), (void *) &m_dVel);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 3, sizeof(cl_mem), (void *) &m_dSimParams);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);


	initKernel(PARTICLES_KERNEL_CLEAR_CELL_START, "kClearCellStart");
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_CLEAR_CELL_START].m_kernel, 0, sizeof(int),		(void *) &m_numGridCells);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_CLEAR_CELL_START].m_kernel, 1, sizeof(cl_mem),	(void*) &m_dCellStart);

	initKernel(PARTICLES_KERNEL_FIND_CELL_START, "kFindCellStart");
//	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 0, sizeof(int),	(void*) &m_numParticles);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 1, sizeof(cl_mem),	(void*) &m_dPosHash);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 2, sizeof(cl_mem),	(void*) &m_dCellStart);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 3, sizeof(cl_mem),	(void*) &m_dPos);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 4, sizeof(cl_mem),	(void*) &m_dVel);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 5, sizeof(cl_mem),	(void*) &m_dSortedPos);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 6, sizeof(cl_mem),	(void*) &m_dSortedVel);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);

	initKernel(PARTICLES_KERNEL_COLLIDE_PARTICLES, "kCollideParticles");
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 1, sizeof(cl_mem),	(void*) &m_dVel);
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 2, sizeof(cl_mem),	(void*) &m_dSortedPos);
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 3, sizeof(cl_mem),	(void*) &m_dSortedVel);
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 4, sizeof(cl_mem),	(void*) &m_dPosHash);
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 5, sizeof(cl_mem),	(void*) &m_dCellStart);
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 6, sizeof(cl_mem),	(void*) &m_dSimParams);

	initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL, "kBitonicSortCellIdLocal");
	initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL_1, "kBitonicSortCellIdLocal1");
	initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_GLOBAL, "kBitonicSortCellIdMergeGlobal");
	initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_LOCAL, "kBitonicSortCellIdMergeLocal");
}
int main()
{
	srand(unsigned(time(nullptr)));
	int err;                            // error code returned from api calls

	cl_device_id device_id;             // compute device id 
	cl_context context;                 // compute context
	cl_command_queue commands;          // compute command queue
	cl_program program;                 // compute program
	cl_kernel kernel;                   // compute kernel

										// OpenCL device memory for matrices
	cl_mem d_A;
	cl_mem d_B;
	cl_mem d_C;

	// set seed for rand()
	srand(2014);

	//Allocate host memory for matrices A and B
	unsigned int size_A = WA * HA;
	unsigned int mem_size_A = sizeof(float) * size_A;
	float* h_A = (float*)malloc(mem_size_A);

	unsigned int size_B = WB * HB;
	unsigned int mem_size_B = sizeof(float) * size_B;
	float* h_B = (float*)malloc(mem_size_B);

	//Initialize host memory
	randomMemInit(h_A, size_A);
	randomMemInit(h_B, size_B);

	//Allocate host memory for the result C
	unsigned int size_C = WC * HC;
	unsigned int mem_size_C = sizeof(float) * size_C;
	float* h_C = (float*)malloc(mem_size_C);

	printf("Initializing OpenCL device...\n");

	cl_uint dev_cnt = 0;
	clGetPlatformIDs(0, 0, &dev_cnt);

	cl_platform_id platform_ids[100];
	clGetPlatformIDs(dev_cnt, platform_ids, NULL);

	// Connect to a compute device
	int gpu = 1;
	err = clGetDeviceIDs(platform_ids[0], gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
	if (err != CL_SUCCESS){
		printf("Error: Failed to create a device group!\n");
		return EXIT_FAILURE;
	}

	// Create a compute context 
	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	if (!context){
		printf("Error: Failed to create a compute context!\n");
		return EXIT_FAILURE;
	}

	// Create a command commands
	commands = clCreateCommandQueue(context, device_id, 0, &err);
	if (!commands){
		printf("Error: Failed to create a command commands!\n");
		return EXIT_FAILURE;
	}

	// Create the compute program from the source file
	char *KernelSource;
	long lFileSize = LoadOpenCLKernel("matrixmul_kernel.cl", &KernelSource);
	if (lFileSize < 0L){
		perror("File read failed");
		return 1;
	}
	//const char* KernelSource = loadKernelCPP(".\\matrixmul_kernel.cl");

	program = clCreateProgramWithSource(context, 1, (const char **)&KernelSource, NULL, &err);
	if (!program){
		printf("Error: Failed to create compute program!\n");
		return EXIT_FAILURE;
	}

	// Build the program executable
	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (err != CL_SUCCESS){
		size_t len;
		char buffer[2048];
		printf("Error: Failed to build program executable!\n");
		clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
		printf("%s\n", buffer);
		exit(1);
	}

	// Create the compute kernel in the program we wish to run
	kernel = clCreateKernel(program, "matrixMul", &err);
	if (!kernel || err != CL_SUCCESS){
		printf("Error: Failed to create compute kernel!\n");
		exit(1);
	}

	// Create the input and output arrays in device memory for our calculation
	d_C = clCreateBuffer(context, CL_MEM_READ_WRITE, mem_size_A, NULL, &err);
	d_A = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_A, h_A, &err);
	d_B = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_B, h_B, &err);

	if (!d_A || !d_B || !d_C){
		printf("Error: Failed to allocate device memory!\n");
		exit(1);
	}

	printf("Running matrix multiplication for matrices A (%dx%d) and B (%dx%d) ...\n", WA, HA, WB, HB);

	//Launch OpenCL kernel
	size_t localWorkSize[2], globalWorkSize[2];

	int wA = WA;
	int wC = WC;
	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&d_C);
	err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d_A);
	err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&d_B);
	err |= clSetKernelArg(kernel, 3, sizeof(int), (void *)&wA);
	err |= clSetKernelArg(kernel, 4, sizeof(int), (void *)&wC);

	if (err != CL_SUCCESS){
		printf("Error: Failed to set kernel arguments! %d\n", err);
		exit(1);
	}

	localWorkSize[0] = 16;
	localWorkSize[1] = 16;
	globalWorkSize[0] = 1024;
	globalWorkSize[1] = 1024;

	err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);

	if (err != CL_SUCCESS){
		printf("Error: Failed to execute kernel! %d\n", err);
		exit(1);
	}

	//Retrieve result from device
	err = clEnqueueReadBuffer(commands, d_C, CL_TRUE, 0, mem_size_C, h_C, 0, NULL, NULL);

	if (err != CL_SUCCESS){
		printf("Error: Failed to read output array! %d\n", err);
		exit(1);
	}
	//print table A

	printf("\nMatrix A\n");
	for (int i = 0; i < size_A; i++){
		printf("%f\t", h_A[i]);
		if (((i + 1) % WA) == 0)
			printf("\n");
	}

	//print table B

	printf("\nMatrix B\n");
	for (int i = 0; i < size_B; i++){
		printf("%f\t", h_B[i]);
		if (((i + 1) % WB) == 0)
			printf("\n");
	}

	//print out the results

	printf("\nMatrix C (Results)\n");
	for (int i = 0; i < size_C; i++){
		printf("%f\t", h_C[i]);
		if (((i + 1) % WC) == 0)
			printf("\n");
	}
	printf("\n");


	printf("Matrix multiplication completed...\n");

	//Shutdown and cleanup
	free(h_A);
	free(h_B);
	free(h_C);

	clReleaseMemObject(d_A);
	clReleaseMemObject(d_C);
	clReleaseMemObject(d_B);

	clReleaseProgram(program);
	clReleaseKernel(kernel);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);

	std::cin.clear();
	std::cin.sync();
	std::cin.get();
}
Example #14
0
void initopencl(void) {
	
	int i;


	// Get Platform and Device Info
	CL_CHECK(clGetPlatformIDs(1, &platform_id, &num_platforms));
	
	// Currently this program only runs on a SINGLE GPU.
	CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &num_devices));
	printf("=== %d OpenCL platform(s) found: ===\n", num_platforms);
	printf("=== %d OpenCL device(s) found on platform:\n", num_devices);
	
	
	char buffer[10240];
	cl_uint buf_uint;
	cl_ulong buf_ulong;
	printf("  -- %d --\n", i);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(buffer), buffer, NULL));
	printf("  DEVICE_NAME = %s\n", buffer);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL));
	printf("  DEVICE_VENDOR = %s\n", buffer);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL));
	printf("  DEVICE_VERSION = %s\n", buffer);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL));
	printf("  DRIVER_VERSION = %s\n", buffer);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL));
	printf("  DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL));
	printf("  DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL));
	printf("  DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong);

	if (num_devices == 0)
	{	
		fprintf(stderr, "No Devices found that can run OpenCL.");
		exit(0);	
	}
	// Create OpenCL context
	context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
	if (ret != CL_SUCCESS) {
		
		fprintf(stderr, "Error creating context: Function returned %d \n\n", ret);
		exit(1);
	
	}
	// Create Command Queue
	command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
	if (ret != CL_SUCCESS) {
		
		fprintf(stderr, "Error creating command Queue: Function returned %d \n\n", ret);
		exit(1);
	
	}
	
	// Load the kernel source code into the array source_str
	FILE *fp;
	char *source_str;
	size_t source_size;
	
	fp = fopen("integrate.cl", "r");
	if (!fp) {
	    fprintf(stderr, "Failed to load kernel.\n");
	    exit(1);
	}
	
	source_str = (char*)malloc(MAX_SOURCE_SIZE);
	source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
	fclose( fp );	
	
	
	// Create a program from the kernel source
    program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);	
    if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a program for integration3D. %d \n\n", (int)ret);
			exit(1);
	}
    // Build the program
    
    ret = clBuildProgram(program, 1, &device_id, "-DUSE_DOUBLE=1", NULL, NULL); 
    if (ret != CL_SUCCESS)
    {
    
    	size_t length;
    	char buffer[10240];
    	clGetProgramBuildInfo(program, 1, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &length);
    	fprintf(stderr, "Error returned %d. \n\n", (int)ret);
    	printf("Error Log: \n\n %s \n\n", buffer);
    	exit(0);
    }
	
/*    // Create the OpenCL kernel (compute_points_Unstructure3D_1)
    kernel1 = clCreateKernel(program, "compute_points_Unstructure3D_1", &ret);
	if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a kernel for compute_points_Unstructure3D_1. \n\n");
			exit(1);
	}
*/	
	// Create the OpenCL kernel (check_int)
    kernel2 = clCreateKernel(program, "check_int", &ret);
	if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a kernel for check_int. %d \n\n", (int)ret);
			exit(1);
	}
	
    
    // Create the OpenCL kernel (compute_points_Unstructure3D_1)
    kernel1 = clCreateKernel(program, "compute_points_Unstructure3D_1", &ret);
	if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a kernel for compute_points_Unstructure3D_1. \n\n");
			exit(1);
	}
	
	// Create the OpenCL kernel (initialize_timestep3D)
	kernel3 = clCreateKernel(program, "initialize_timestep3D", &ret);
	if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a kernel for initialize_timestep3D. \n\n");
			exit(1);
	}
	
	// Create the OpenCL kernel (initialize_timestep3D)
    kernel4 = clCreateKernel(program, "LocalSearch3D", &ret);
	if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a kernel for LocalSearch3D. \n\n");
			exit(1);
	}
	
	// Create the OpenCL kernel (initialize_timestep3D)
    kernel5 = clCreateKernel(program, "compute_points_Unstructure3D_2", &ret);
	if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a kernel for LocalSearch3D. \n\n");
			exit(1);
	}
	
	
	
	printf("\n\n");
}
Example #15
0
int vadd(void) {
    // Create the two input vectors
    int i;
    const int LIST_SIZE = 1024;
    int *A = (int*)malloc(sizeof(int)*LIST_SIZE);
    int *B = (int*)malloc(sizeof(int)*LIST_SIZE);
    for(i = 0; i < LIST_SIZE; i++) {
        A[i] = i;
        B[i] = LIST_SIZE - i;
    }

    // Load the kernel source code into the array source_str
    FILE *fp;
    char *source_str;
    size_t source_size;

    const char *fname = "/home/ckit/program/workspace_java/OpenCLHookSample/jni/vector_add_kernel.cl";
//    const char *fname = "vadd.ir";
    fp = fopen(fname, "rb");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose( fp );

    // Get platform and device information
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;   
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;

      cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
	printf("clGetPlatformIDs err=%d,num_platforms=%d, platform_id=%x\n", ret, ret_num_platforms, (unsigned int)platform_id );

	//#define XXX CL_DEVICE_TYPE_DEFAULT
//	#define XXX CL_DEVICE_TYPE_ALL
//	#define XXX CL_DEVICE_TYPE_GPU
	#define XXX CL_DEVICE_TYPE_CPU

	cl_uint num_platforms = 2;
	cl_platform_id* platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id)* num_platforms);
	if(NULL == platforms){
		printf("malloc err!\n");
	}
	ret = clGetPlatformIDs(2, platforms, &ret_num_platforms);
	printf("clGetPlatformIDs err=%d,num_platforms=%d, platform_id=%x\n", ret, ret_num_platforms, (unsigned int)platforms[1] );
    ret = clGetDeviceIDs( platforms[0], XXX, 1, &device_id, &ret_num_devices);
    printf("clGetDeviceIDs err=%d,num_platforms=%d, device_id=%x\n", ret, ret_num_platforms, (unsigned int)device_id );
    char name[64];
    ret = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(char)*64, name, NULL);
    printf("device_name : %s\n", name);

    // Create an OpenCL context
//    cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);

    cl_context_properties cps[3] =
    {
    	(cl_context_properties)CL_CONTEXT_PLATFORM,
        (cl_context_properties)platforms[0],
        (cl_context_properties)0
    };
    cl_context context = clCreateContextFromType( cps, XXX, NULL, NULL, &ret);
	printf("clCreateContextFromType err=%d,device_type=%x\n", ret, (unsigned int)XXX);

    // Create a command queue
    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

    // Create memory buffers on the device for each vector 
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int), NULL, &ret);
    cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
            LIST_SIZE * sizeof(int), NULL, &ret);
    cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
            LIST_SIZE * sizeof(int), NULL, &ret);

    // Copy the lists A and B to their respective memory buffers
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
            LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, 
            LIST_SIZE * sizeof(int), B, 0, NULL, NULL);

    // Create a program from the kernel source
    cl_program program = clCreateProgramWithSource(context, 1,
            (const char **)&source_str, (const size_t *)&source_size, &ret);
//    cl_int status;
//    cl_int err;
//    cl_program program = clCreateProgramWithBinary(
//    		context, 1, &device_id, &source_size, (const unsigned char **)&source_str, &status, &err);
    // Build the program
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

    // Create the OpenCL kernel
    cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);

    // Set the arguments of the kernel
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
    
    // Execute the OpenCL kernel on the list
    size_t global_item_size = LIST_SIZE; // Process the entire lists
    size_t local_item_size = 64; // Process in groups of 64
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, 
            &global_item_size, &local_item_size, 0, NULL, NULL);

    // Read the memory buffer C on the device to the local variable C
    int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
    ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, 
            LIST_SIZE * sizeof(int), C, 0, NULL, NULL);

    // Display the result to the screen
    for(i = 0; i < /*LIST_SIZE*/10; i++)
        printf("%d + %d = %d\n", A[i], B[i], C[i]);

    // Clean up
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(a_mem_obj);
    ret = clReleaseMemObject(b_mem_obj);
    ret = clReleaseMemObject(c_mem_obj);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);
    free(A);
    free(B);
    free(C);
    return 0;
}
Example #16
0
int main( int argc, char* argv[] )
{
    // Length of vectors
    unsigned int n = 100000;
 
    // Host input vectors
    double *h_a;
    double *h_b;
    // Host output vector
    double *h_c;
 
    // Device input buffers
    cl_mem d_a;
    cl_mem d_b;
    // Device output buffer
    cl_mem d_c;
 
    cl_platform_id cpPlatform;        // OpenCL platform
    cl_device_id device_id;           // device ID
    cl_context context;               // context
    cl_command_queue queue;           // command queue
    cl_program program;               // program
    cl_kernel kernel;                 // kernel
 
    // Size, in bytes, of each vector
    size_t bytes = n*sizeof(double);
 
    // Allocate memory for each vector on host
    h_a = (double*)malloc(bytes);
    h_b = (double*)malloc(bytes);
    h_c = (double*)malloc(bytes);
 
    // Initialize vectors on host
    int i;
    for( i = 0; i < n; i++ )
    {
        h_a[i] = sinf(i)*sinf(i);
        h_b[i] = cosf(i)*cosf(i);
    }
 
    size_t globalSize, localSize;
    cl_int err;
 
    // Number of work items in each local work group
    localSize = 64;
 
    // Number of total work items - localSize must be devisor
    globalSize = ceil(n/(float)localSize)*localSize;
 
    // Bind to platform
    err = clGetPlatformIDs(1, &cpPlatform, NULL);
 
    // Get ID for the device
    err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
 
    // Create a context 
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
 
    // Create a command queue
    queue = clCreateCommandQueue(context, device_id, 0, &err);
 
    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1,
                            (const char **) & kernelSource, NULL, &err);
 
    // Build the program executable
    clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
 
    // Create the compute kernel in the program we wish to run
    kernel = clCreateKernel(program, "vecAdd", &err);
 
    // Create the input and output arrays in device memory for our calculation
    d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
    d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
    d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
 
    // Write our data set into the input array in device memory
    err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,
                                   bytes, h_a, 0, NULL, NULL);
    err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,
                                   bytes, h_b, 0, NULL, NULL);
 
    // Set the arguments to our compute kernel
    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
    err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
 
    // Execute the kernel over the entire range of the data set 
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
                                                              0, NULL, NULL);
 
    // Wait for the command queue to get serviced before reading back results
    clFinish(queue);
 
    // Read the results from the device
    clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
                                bytes, h_c, 0, NULL, NULL );
 
    //Sum up vector c and print result divided by n, this should equal 1 within error
    double sum = 0;
    for(i=0; i<n; i++)
        sum += h_c[i];
    printf("final result: %f\n", sum/n);
 
    // release OpenCL resources
    clReleaseMemObject(d_a);
    clReleaseMemObject(d_b);
    clReleaseMemObject(d_c);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
 
    //release host memory
    free(h_a);
    free(h_b);
    free(h_c);
 
    return 0;
}
Example #17
0
/** 
 * @brief Create an OpenCL program given a set of source kernel files. 
 * 
 * @param zone OpenCL zone where to create program.
 * @param kernelFiles Array of strings identifying filenames containing kernels.
 * @param numKernelFiles Number of strings identifying filenames containing kernels.
 * @param compilerOpts OpenCL compiler options.
 * @param err Error structure, to be populated if an error occurs.
 * @return @link clu_error_codes::CLU_SUCCESS @endlink operation
 * successfully completed or another value of #clu_error_codes if an 
 * error occurs.
 */
int clu_program_create(CLUZone* zone, char** kernelFiles, cl_uint numKernelFiles, char* compilerOpts, GError **err) {

	/* Helper variables */
	cl_int ocl_status, ocl_build_status;
	int ret_status;
	char * build_log = NULL;
	size_t logsize;
	char** source = NULL;
	
	/* Import kernels */
	source = (char**) malloc(numKernelFiles * sizeof(char*));
	gef_if_error_create_goto(
		*err, 
		CLU_UTILS_ERROR, 
		NULL == source, 
		ret_status = CLU_ERROR_NOALLOC, 
		error_handler, 
		"Unable to allocate memory for kernels source file.");
	for (unsigned int i = 0; i < numKernelFiles; i++) { source[i] = NULL; }
	for (unsigned int i = 0; i < numKernelFiles; i++) {
		source[i] = clu_source_load(kernelFiles[i], err);
		gef_if_error_goto(*err, GEF_USE_GERROR, ret_status, error_handler);
	}
	
	/* Load kernels sources and create program */
	cl_program program = clCreateProgramWithSource(
		zone->context, 
		numKernelFiles, 
		(const char**) source, 
		NULL, 
		&ocl_status);
	gef_if_error_create_goto(
		*err, 
		CLU_UTILS_ERROR, 
		CL_SUCCESS != ocl_status, 
		ret_status = CLU_OCL_ERROR, 
		error_handler, 
		"Create program with source (OpenCL error %d :%s).", 
		ocl_status,
		clerror_get(ocl_status));
	
	/* Perform runtime source compilation of program */
	ocl_build_status = clBuildProgram(
		program, 
		1, 
		&zone->device_info.device_id, 
		compilerOpts, 
		NULL, 
		NULL);
	/* Check for errors. */
	if (ocl_build_status != CL_SUCCESS) {
		/* If where here it's because program failed to build. However, error will only be thrown after getting build information. */
		/* Get build log size. */
		ocl_status = clGetProgramBuildInfo(
			program, 
			zone->device_info.device_id, 
			CL_PROGRAM_BUILD_LOG, 
			0, 
			NULL, 
			&logsize);
		gef_if_error_create_goto(
			*err, 
			CLU_UTILS_ERROR, 
			CL_SUCCESS != ocl_status, 
			ret_status = CLU_OCL_ERROR, 
			error_handler, 
			"Error getting program build info (log size, OpenCL error %d: %s) after program failed to build (OpenCL error %d: %s).", 
			ocl_status,
			clerror_get(ocl_status),
			ocl_build_status,
			clerror_get(ocl_build_status));
		/* Alocate memory for build log. */
		build_log = (char*) malloc(logsize);
		gef_if_error_create_goto(
			*err, 
			CLU_UTILS_ERROR, 
			NULL == build_log, 
			ret_status = CLU_ERROR_NOALLOC, 
			error_handler, 
			"Unable to allocate memory for build log after program failed to build with OpenCL error %d (%s).", 
			ocl_build_status,
			clerror_get(ocl_build_status));
		/* Get build log. */
		ocl_status = clGetProgramBuildInfo(
			program, 
			zone->device_info.device_id, 
			CL_PROGRAM_BUILD_LOG, 
			logsize, 
			build_log, 
			NULL);
		gef_if_error_create_goto(
			*err, 
			CLU_UTILS_ERROR, 
			CL_SUCCESS != ocl_status, 
			ret_status = CLU_OCL_ERROR, 
			error_handler, 
			"Error getting program build info (build log, OpenCL error %d: %s) after program failed to build (OpenCL error %d: %s).", 
			ocl_status,
			clerror_get(ocl_status),
			ocl_build_status,
			clerror_get(ocl_build_status));
		/* Throw error. */
		gef_if_error_create_goto(
			*err, 
			CLU_UTILS_ERROR, 
			1, 
			ret_status = CLU_OCL_ERROR, 
			error_handler, 
			"Failed to build program (OpenCL error %d: %s). \n\n **** Start of build log **** \n\n%s\n **** End of build log **** \n", 
			ocl_build_status, 
			clerror_get(ocl_build_status),
			build_log);
	}
	zone->program = program;

	/* If we got here, everything is OK. */
	g_assert (err == NULL || *err == NULL);
	ret_status = CLU_SUCCESS;
	goto finish;
	
error_handler:
	/* If we got here there was an error, verify that it is so. */
	g_assert (err == NULL || *err != NULL);

finish:	

	/* Free stuff. */
	if (source != NULL) {
		for (unsigned int i = 0; i < numKernelFiles; i++) {
			if (source[i] != NULL) {
				clu_source_free(source[i]);
			}
		}
		free(source);
	}
	if (build_log != NULL) {
		free(build_log);
	}
	
	/* Return. */
	return ret_status;
}
Example #18
0
int main() {

   // Set the image rotation (in degrees)
   float theta = 3.14159/6;
   float cos_theta = cosf(theta);
   float sin_theta = sinf(theta);
   printf("theta = %f (cos theta = %f, sin theta = %f)\n", theta, cos_theta, 
      sin_theta);

   // Rows and columns in the input image
   int imageHeight;
   int imageWidth;

   const char* inputFile = "input.bmp";
   const char* outputFile = "output.bmp";

   // Homegrown function to read a BMP from file
   float* inputImage = readImage(inputFile, &imageWidth,
      &imageHeight);

   // Size of the input and output images on the host
   int dataSize = imageHeight*imageWidth*sizeof(float);

   // Output image on the host
   float* outputImage = NULL;
   outputImage = (float*)malloc(dataSize);

   // Set up the OpenCL environment
   cl_int status;

   // Discovery platform
   cl_platform_id platforms[2];
   cl_platform_id platform;
   status = clGetPlatformIDs(2, platforms, NULL);
   chk(status, "clGetPlatformIDs");
   platform = platforms[PLATFORM_TO_USE];

   // Discover device
   cl_device_id device;
   clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
   chk(status, "clGetDeviceIDs");

   // Create context
   cl_context_properties props[3] = {CL_CONTEXT_PLATFORM,
       (cl_context_properties)(platform), 0};
   cl_context context;
   context = clCreateContext(props, 1, &device, NULL, NULL, &status);
   chk(status, "clCreateContext");

   // Create command queue
   cl_command_queue queue;
   queue = clCreateCommandQueue(context, device, 0, &status);
   chk(status, "clCreateCommandQueue");

   // Create the input and output buffers
   cl_mem d_input;
   d_input = clCreateBuffer(context, CL_MEM_READ_ONLY, dataSize, NULL,
       &status);
   chk(status, "clCreateBuffer");

   cl_mem d_output;
   d_output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, dataSize, NULL,
       &status);
   chk(status, "clCreateBuffer");

   // Copy the input image to the device
   status = clEnqueueWriteBuffer(queue, d_input, CL_TRUE, 0, dataSize, 
         inputImage, 0, NULL, NULL);
   chk(status, "clEnqueueWriteBuffer");

   const char* source = readSource("rotation.cl");

   // Create a program object with source and build it
   cl_program program;
   program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);
   chk(status, "clCreateProgramWithSource");
   status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
   chk(status, "clBuildProgram");
   
   // Create the kernel object
   cl_kernel kernel;
   kernel = clCreateKernel(program, "img_rotate", &status);
   chk(status, "clCreateKernel");

   // Set the kernel arguments
   status  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_output);
   status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_input);
   status |= clSetKernelArg(kernel, 2, sizeof(int), &imageWidth);
   status |= clSetKernelArg(kernel, 3, sizeof(int), &imageHeight);
   status |= clSetKernelArg(kernel, 4, sizeof(float), &sin_theta);
   status |= clSetKernelArg(kernel, 5, sizeof(float), &cos_theta);
   chk(status, "clSetKernelArg");

   // Set the work item dimensions
   size_t globalSize[2] = {imageWidth, imageHeight};
   status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, NULL, 0,
      NULL, NULL);
   chk(status, "clEnqueueNDRange");

   // Read the image back to the host
   status = clEnqueueReadBuffer(queue, d_output, CL_TRUE, 0, dataSize, 
         outputImage, 0, NULL, NULL); 
   chk(status, "clEnqueueReadBuffer");

   // Write the output image to file
   storeImage(outputImage, outputFile, imageHeight, imageWidth, inputFile);

   return 0;
}
Example #19
0
int main()
{
  typedef float       ScalarType;
  
  
  /////////////////////////////////////////////////////////////////////////////////////////////////////////
  //////////////////////// Part 1: Set up a custom context and perform a sample operation. ////////////////
  ////////////////////////         This is rather lengthy due to the OpenCL framework.     ////////////////
  ////////////////////////         The following does essentially the same as the          ////////////////
  ////////////////////////         'custom_kernels'-tutorial!                               ////////////////
  /////////////////////////////////////////////////////////////////////////////////////////////////////////
  
  //manually set up a custom OpenCL context:
  std::vector<cl_device_id> device_id_array;
  
  //get all available devices
  viennacl::ocl::platform pf;
  std::cout << "Platform info: " << pf.info() << std::endl;
  std::vector<viennacl::ocl::device> devices = pf.devices(CL_DEVICE_TYPE_DEFAULT);
  std::cout << devices[0].name() << std::endl;
  std::cout << "Number of devices for custom context: " << devices.size() << std::endl;
  
  //set up context using all found devices:
  for (size_t i=0; i<devices.size(); ++i)
  {
      device_id_array.push_back(devices[i].id());
  }
     
  std::cout << "Creating context..." << std::endl;
  cl_int err;
  cl_context my_context = clCreateContext(0, device_id_array.size(), &(device_id_array[0]), NULL, NULL, &err);
  VIENNACL_ERR_CHECK(err);
   
  
  //create two Vectors:
  unsigned int vector_size = 10;
  std::vector<ScalarType> vec1(vector_size);
  std::vector<ScalarType> vec2(vector_size);
  std::vector<ScalarType> result(vector_size);
  
  //
  // fill the operands vec1 and vec2:
  //
  for (unsigned int i=0; i<vector_size; ++i)
  {
    vec1[i] = static_cast<ScalarType>(i);
    vec2[i] = static_cast<ScalarType>(vector_size-i);
  }
  
  //
  // create memory in OpenCL context:
  //
  cl_mem mem_vec1 = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(vec1[0]), &err);
  VIENNACL_ERR_CHECK(err);
  cl_mem mem_vec2 = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(vec2[0]), &err);
  VIENNACL_ERR_CHECK(err);
  cl_mem mem_result = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(result[0]), &err);
  VIENNACL_ERR_CHECK(err);

  // 
  // create a command queue for each device:
  // 
  
  std::vector<cl_command_queue> queues(devices.size());
  for (size_t i=0; i<devices.size(); ++i)
  {
    queues[i] = clCreateCommandQueue(my_context, devices[i].id(), 0, &err);
    VIENNACL_ERR_CHECK(err);
  }
  
  // 
  // create and build a program in the context:
  // 
  size_t source_len = std::string(my_compute_program).length();
  cl_program my_prog = clCreateProgramWithSource(my_context, 1, &my_compute_program, &source_len, &err);
  err = clBuildProgram(my_prog, 0, NULL, NULL, NULL, NULL);
  
/*            char buffer[1024];
            cl_build_status status;
            clGetProgramBuildInfo(my_prog, devices[1].id(), CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
            clGetProgramBuildInfo(my_prog, devices[1].id(), CL_PROGRAM_BUILD_LOG, sizeof(char)*1024, &buffer, NULL);
            std::cout << "Build Scalar: Err = " << err << " Status = " << status << std::endl;
            std::cout << "Log: " << buffer << std::endl;*/
  
  VIENNACL_ERR_CHECK(err);
  
  // 
  // create a kernel from the program:
  // 
  const char * kernel_name = "elementwise_prod";
  cl_kernel my_kernel = clCreateKernel(my_prog, kernel_name, &err);
  VIENNACL_ERR_CHECK(err);

  
  //
  // Execute elementwise_prod kernel on first queue: result = vec1 .* vec2;
  //
  err = clSetKernelArg(my_kernel, 0, sizeof(cl_mem), (void*)&mem_vec1);
  VIENNACL_ERR_CHECK(err);
  err = clSetKernelArg(my_kernel, 1, sizeof(cl_mem), (void*)&mem_vec2);
  VIENNACL_ERR_CHECK(err);
  err = clSetKernelArg(my_kernel, 2, sizeof(cl_mem), (void*)&mem_result);
  VIENNACL_ERR_CHECK(err);
  err = clSetKernelArg(my_kernel, 3, sizeof(unsigned int), (void*)&vector_size);
  VIENNACL_ERR_CHECK(err);
  size_t global_size = vector_size;
  size_t local_size = vector_size;
  err = clEnqueueNDRangeKernel(queues[0], my_kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
  VIENNACL_ERR_CHECK(err);
  
  
  //
  // Read and output result:
  //
  err = clEnqueueReadBuffer(queues[0], mem_vec1, CL_TRUE, 0, sizeof(ScalarType)*vector_size, &(vec1[0]), 0, NULL, NULL);
  VIENNACL_ERR_CHECK(err);
  err = clEnqueueReadBuffer(queues[0], mem_result, CL_TRUE, 0, sizeof(ScalarType)*vector_size, &(result[0]), 0, NULL, NULL);
  VIENNACL_ERR_CHECK(err);

  std::cout << "vec1  : ";
  for (size_t i=0; i<vec1.size(); ++i)
    std::cout << vec1[i] << " ";
  std::cout << std::endl;

  std::cout << "vec2  : ";
  for (size_t i=0; i<vec2.size(); ++i)
    std::cout << vec2[i] << " ";
  std::cout << std::endl;

  std::cout << "result: ";
  for (size_t i=0; i<result.size(); ++i)
    std::cout << result[i] << " ";
  std::cout << std::endl;
  
  ////////////////////////////////////////////////////////////////////////////////////////////////////////
  /////////////////////// Part 2: Let ViennaCL use the already created context: //////////////////////////
  ////////////////////////////////////////////////////////////////////////////////////////////////////////

  //Tell ViennaCL to use the previously created context.
  //This context is assigned an id '0' when using viennacl::ocl::switch_context().
  viennacl::ocl::setup_context(0, my_context, device_id_array, queues);
  viennacl::ocl::switch_context(0); //activate the new context (only mandatory with context-id not equal to zero)
  
  //
  // Proof that ViennaCL really uses the new context:
  //
  std::cout << "Existing context: " << my_context << std::endl;
  std::cout << "ViennaCL uses context: " << viennacl::ocl::current_context().handle().get() << std::endl;

  //
  // Wrap existing OpenCL objects into ViennaCL:
  //
  viennacl::vector<ScalarType> vcl_vec1(mem_vec1, vector_size);
  viennacl::vector<ScalarType> vcl_vec2(mem_vec2, vector_size);
  viennacl::vector<ScalarType> vcl_result(mem_result, vector_size);
  viennacl::scalar<ScalarType> vcl_s = 2.0;

  std::cout << "Standard vector operations within ViennaCL:" << std::endl;
  vcl_result = vcl_s * vcl_vec1 + vcl_vec2;
  
  std::cout << "vec1  : ";
  std::cout << vcl_vec1 << std::endl;

  std::cout << "vec2  : ";
  std::cout << vcl_vec2 << std::endl;

  std::cout << "result: ";
  std::cout << vcl_result << std::endl;
  
  //
  // We can also reuse the existing elementwise_prod kernel. 
  // Therefore, we first have to make the existing program known to ViennaCL
  // For more details on the three lines, see tutorial 'custom-kernels'
  //
  std::cout << "Using existing kernel within the OpenCL backend of ViennaCL:" << std::endl;
  viennacl::ocl::program & my_vcl_prog = viennacl::ocl::current_context().add_program(my_prog, "my_compute_program");
  viennacl::ocl::kernel & my_vcl_kernel = my_vcl_prog.add_kernel("elementwise_prod");
  viennacl::ocl::enqueue(my_vcl_kernel(vcl_vec1, vcl_vec2, vcl_result, static_cast<cl_uint>(vcl_vec1.size())));  //Note that size_t might differ between host and device. Thus, a cast to cl_uint is necessary here.
  
  std::cout << "vec1  : ";
  std::cout << vcl_vec1 << std::endl;

  std::cout << "vec2  : ";
  std::cout << vcl_vec2 << std::endl;

  std::cout << "result: ";
  std::cout << vcl_result << std::endl;
  
  
  //
  // Since a linear piece of memory can be interpreted in several ways, 
  // we will now create a 3x3 row-major matrix out of the linear memory in mem_vec1/
  // The first three entries in vcl_vec2 and vcl_result are used to carry out matrix-vector products:
  //
  viennacl::matrix<ScalarType> vcl_matrix(mem_vec1, 3, 3);
  
  vcl_vec2.resize(3);   //note that the resize operation leads to new memory, thus vcl_vec2 is now at a different memory location (values are copied)
  vcl_result.resize(3); //note that the resize operation leads to new memory, thus vcl_vec2 is now at a different memory location (values are copied)
  vcl_result = viennacl::linalg::prod(vcl_matrix, vcl_vec2);

  std::cout << "result of matrix-vector product: ";
  std::cout << vcl_result << std::endl;

  //
  //  That's it.
  //
  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
  
  return 0;
}
Example #20
0
int
NBody::setupCL()
{
    cl_int status = CL_SUCCESS;

    cl_uint numPlatforms;
    cl_platform_id platform = NULL;
    assert(clGetPlatformIDs(0, NULL, &numPlatforms) == CL_SUCCESS);
    assert(numPlatforms > 0);
    cl_platform_id* platforms = new cl_platform_id[numPlatforms];
    assert(clGetPlatformIDs(numPlatforms, platforms, NULL) == CL_SUCCESS);
    platform = platforms[0];
    delete[] platforms;

    cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };

    context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &status);
    assert(status == CL_SUCCESS);

    size_t deviceListSize;
    /* First, get the size of device list data */
    assert(clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize) == 0);
    int deviceCount = (int)(deviceListSize / sizeof(cl_device_id));
    assert(deviceCount > 0);

    devices = (cl_device_id*)malloc(deviceListSize);
    assert(clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL) == 0);

    commandQueue = clCreateCommandQueue(context, devices[0], 0, &status);
    assert(status == CL_SUCCESS);

    /*
    * Create and initialize memory objects
    */

    /* Create memory objects for position */
    currPos = clCreateBuffer(context, CL_MEM_READ_WRITE,
                             numBodies * sizeof(cl_float4), 0, &status);
    assert(status == CL_SUCCESS);

    /* Initialize position buffer */
    status = clEnqueueWriteBuffer(commandQueue, currPos, 1, 0,
                                  numBodies * sizeof(cl_float4), pos,
                                  0, 0, 0);
    assert(status == CL_SUCCESS);

    /* Create memory objects for position */
    newPos = clCreateBuffer(context, CL_MEM_READ_WRITE,
        numBodies * sizeof(cl_float4), 0, &status);
    assert(status == CL_SUCCESS);
    /* Create memory objects for velocity */
    currVel = clCreateBuffer(context, CL_MEM_READ_WRITE,
        numBodies * sizeof(cl_float4), 0, &status);
    assert(status == CL_SUCCESS);

    /* Initialize velocity buffer */
    status = clEnqueueWriteBuffer(commandQueue, currVel, 1, 0,
                                  numBodies * sizeof(cl_float4),
                                  vel, 0, 0, 0);
    assert(status == CL_SUCCESS);

    /* Create memory objects for velocity */
    newVel = clCreateBuffer(context, CL_MEM_READ_ONLY,
        numBodies * sizeof(cl_float4), 0, &status);
    assert(status == CL_SUCCESS);

    std::ifstream in("NBody_Kernels.cl", std::ios_base::in);
    if (!in.good()) {
        std::cout << "Failed to load kernel file : " << "NBody_Kernels.cl" << std::endl;
        abort();
    }
    in.seekg(0, std::ios::end);
    size_t length = size_t(in.tellg());
    in.seekg(0, std::ios::beg);
    // Allocate memory for the code
    char *source = new char[length+1];
    // Read data as a block
    in.read(source,length);
    source[length] = '\0';
    in.close();

    const char *constSrc = source;

    program = clCreateProgramWithSource(context, 1, &constSrc, &length, &status);
    assert(status == CL_SUCCESS);

    // Get additional options
    /* create a cl program executable for all the devices specified */
    assert(clBuildProgram(program, 1, devices, NULL, NULL, NULL) ==
           CL_SUCCESS);
    
    /* get a kernel object handle for a kernel with the given name */
    kernel = clCreateKernel(program, "nbody_sim", &status);
    assert(status == CL_SUCCESS);

    delete [] source;

    return 0;
}
Example #21
0
int main(int argc, char **argv) {




	if (find_option(argc, argv, "-h") >= 0)
	{
		printf("Options:\n");
		printf("-h to see this help\n");
		printf("-n <int> to set the number of particles\n");
		printf("-o <filename> to specify the output file name\n");
		printf("-s <filename> to specify the summary output file name\n");
		return 0;
	}


	int n = read_int(argc, argv, "-n", 1000);

	char *savename = read_string(argc, argv, "-o", NULL);
	char *sumname = read_string(argc, argv, "-s", NULL);

	// For return values.
	cl_int ret;

	// OpenCL stuff.
	// Loading kernel files.
	FILE *kernelFile;
	char *kernelSource;
	size_t kernelSize;

	kernelFile = fopen("simulationKernel.cl", "r");

	if (!kernelFile) {
		fprintf(stderr, "No file named simulationKernel.cl was found\n");
		exit(-1);
	}
	kernelSource = (char*)malloc(MAX_SOURCE_SIZE);
	kernelSize = fread(kernelSource, 1, MAX_SOURCE_SIZE, kernelFile);
	fclose(kernelFile);

	// Getting platform and device information
	cl_platform_id platformId = NULL;
	cl_device_id deviceID = NULL;
	cl_uint retNumDevices;
	cl_uint retNumPlatforms;
	ret = clGetPlatformIDs(1, &platformId, &retNumPlatforms);
	// Different types of devices to pick from. At the moment picks the default opencl device.
	//CL_DEVICE_TYPE_GPU
	//CL_DEVICE_TYPE_ACCELERATOR
	//CL_DEVICE_TYPE_DEFAULT
	//CL_DEVICE_TYPE_CPU
	ret = clGetDeviceIDs(platformId, CL_DEVICE_TYPE_ACCELERATOR, 1, &deviceID, &retNumDevices);

	// Max workgroup size
	size_t max_available_local_wg_size;
	ret = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_available_local_wg_size, NULL);
	// Creating context.
	cl_context context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &ret);


	// Creating command queue
        cl_command_queue commandQueue = clCreateCommandQueueWithProperties (context, deviceID, 0, &ret);
	
	// Build program
	cl_program program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, (const size_t *)&kernelSize, &ret);
//	printf("program = ret %i \n", ret);
	ret = clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL);
//	printf("clBuildProgram: ret %i \n", ret);
	
	// Create kernels
	cl_kernel forceKernel = clCreateKernel(program, "compute_forces_gpu", &ret);

	cl_kernel moveKernel = clCreateKernel(program, "move_gpu", &ret);

	cl_kernel binInitKernel = clCreateKernel(program, "bin_init_gpu", &ret);
	cl_kernel binKernel = clCreateKernel(program, "bin_gpu", &ret);

	FILE *fsave = savename ? fopen(savename, "w") : NULL;
	FILE *fsum = sumname ? fopen(sumname, "a") : NULL;
	particle_t *particles = (particle_t*)malloc(n * sizeof(particle_t));

	// GPU particle data structure
	cl_mem d_particles = clCreateBuffer(context, CL_MEM_READ_WRITE, n * sizeof(particle_t), NULL, &ret);

	// Set size
	set_size(n);

	init_particles(n, particles);

	double copy_time = read_timer();

	// Copy particles to device.
	ret = clEnqueueWriteBuffer(commandQueue, d_particles, CL_TRUE, 0, n * sizeof(particle_t), particles, 0, NULL, NULL);
	copy_time = read_timer() - copy_time;
	

	// Calculating thread and thread block counts.
	// sizes
	size_t globalItemSize;
	size_t localItemSize;
	// Global item size
	if (n <= NUM_THREADS) {
		globalItemSize = NUM_THREADS;
		localItemSize = 16;
	}
	else if (n % NUM_THREADS != 0) {
		globalItemSize = (n / NUM_THREADS + 1) * NUM_THREADS;
	}
	else {
		globalItemSize = n;
	}

	// Local item size
	localItemSize = globalItemSize / NUM_THREADS;	

	// Bins and bin sizes.
	// Because of uniform distribution we will know that bins size is amortized. Therefore I picked the value of 10.
	// There will never be 10 particles in one bin.
	int maxParticles = 10;
	
	// Calculating the number of bins.
	int numberOfBins = (int)ceil(size/(2*cutoff)) + 2;
	
	// Bins will only exist on the device.
	particle_t* bins;
	
	// How many particles are there in each bin - also only exists on the device.
	volatile int* binSizes;
	
	// Number of bins to be initialized.
	size_t clearAmt = numberOfBins*numberOfBins;
	
	// Allocate memory for bins on the device.
	cl_mem d_binSizes = clCreateBuffer(context, CL_MEM_READ_WRITE, numberOfBins * numberOfBins * sizeof(volatile int), NULL, &ret);
	cl_mem d_bins = clCreateBuffer(context, CL_MEM_READ_WRITE, numberOfBins * numberOfBins * maxParticles * sizeof(particle_t), NULL, &ret);
	
	// SETTING ARGUMENTS FOR THE KERNELS
	
	// Set arguments for the init / clear kernel
	ret = clSetKernelArg(binInitKernel, 0, sizeof(cl_mem), (void *)&d_binSizes);
	ret = clSetKernelArg(binInitKernel, 1, sizeof(int), &numberOfBins);

	// Set arguments for the binning kernel
	ret = clSetKernelArg(binKernel, 0, sizeof(cl_mem), (void *)&d_particles);
	ret = clSetKernelArg(binKernel, 1, sizeof(int), &n);
	ret = clSetKernelArg(binKernel, 2, sizeof(cl_mem), (void *)&d_bins);
	ret = clSetKernelArg(binKernel, 3, sizeof(cl_mem), (void *)&d_binSizes);
	ret = clSetKernelArg(binKernel, 4, sizeof(int), &numberOfBins);

	// Set arguments for force kernel.
	ret = clSetKernelArg(forceKernel, 0, sizeof(cl_mem), (void *)&d_particles);
	ret = clSetKernelArg(forceKernel, 1, sizeof(int), &n);
	ret = clSetKernelArg(forceKernel, 2, sizeof(cl_mem), (void *)&d_bins);
	ret = clSetKernelArg(forceKernel, 3, sizeof(cl_mem), (void *)&d_binSizes);
	ret = clSetKernelArg(forceKernel, 4, sizeof(int), &numberOfBins);


	// Set arguments for move kernel
	ret = clSetKernelArg(moveKernel, 0, sizeof(cl_mem), (void *)&d_particles);
	ret = clSetKernelArg(moveKernel, 1, sizeof(int), &n);
	ret = clSetKernelArg(moveKernel, 2, sizeof(double), &size);
	
	
	// Variable to check if kernel execution is done.
	cl_event kernelDone;
	
	
	double simulation_time = read_timer();
	int step = 0;
	for (step = 0; step < NSTEPS; step++) {


		// Execute bin initialization (clearing after first iteration)
		ret = clEnqueueNDRangeKernel(commandQueue, binInitKernel, 1, NULL, &clearAmt, NULL, 0, NULL, &kernelDone);
		ret = clWaitForEvents(1, &kernelDone);
		// Execute binning kernel
		ret = clEnqueueNDRangeKernel(commandQueue, binKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone);
//		ret = clEnqueueNDRangeKernel(commandQueue, binKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone);
		ret = clWaitForEvents(1, &kernelDone);	
		// Execute force kernel
		ret = clEnqueueNDRangeKernel(commandQueue, forceKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone);
		ret = clWaitForEvents(1, &kernelDone);
		// Execute move kernel
		ret = clEnqueueNDRangeKernel(commandQueue, moveKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone);
		ret = clWaitForEvents(1, &kernelDone);

		if (fsave && (step%SAVEFREQ) == 0) {
			// Copy the particles back to the CPU
			ret = clEnqueueReadBuffer(commandQueue, d_particles, CL_TRUE, 0, n * sizeof(particle_t), particles, 0, NULL, &kernelDone);
			ret = clWaitForEvents(1, &kernelDone);

			save(fsave, n, particles);
		}

	}
	simulation_time = read_timer() - simulation_time;
	printf("CPU-GPU copy time = %g seconds\n", copy_time);
	printf("n = %d, simulation time = %g seconds\n", n, simulation_time);

	if (fsum)
		fprintf(fsum, "%d %lf \n", n, simulation_time);

	if (fsum)
		fclose(fsum);
	free(particles);
	if (fsave)
		fclose(fsave);


	ret = clFlush(commandQueue);
	ret = clFinish(commandQueue);
	ret = clReleaseCommandQueue(commandQueue);
	ret = clReleaseKernel(forceKernel);
	ret = clReleaseKernel(moveKernel);
	ret = clReleaseProgram(program);
	ret = clReleaseMemObject(d_particles);
	ret = clReleaseContext(context);


	return 0;
}
Example #22
0
int main(int argc, char** argv)
{
	ocd_init(&argc, &argv, NULL);
	ocd_initCL();

	cl_int err;

	size_t global_size;
	size_t local_size;

	cl_program program;
	cl_kernel kernel_compute_flux;
	cl_kernel kernel_compute_flux_contributions;
	cl_kernel kernel_compute_step_factor;
	cl_kernel kernel_time_step;
	cl_kernel kernel_initialize_variables;

	cl_mem ff_variable;
	cl_mem ff_fc_momentum_x;
	cl_mem ff_fc_momentum_y;
	cl_mem ff_fc_momentum_z;
	cl_mem ff_fc_density_energy;

	if (argc < 2)
	{
		printf("Usage ./cfd <data input file>\n");
		return 0;
	}


	const char* data_file_name = argv[1];


	// set far field conditions and load them into constant memory on the gpu
	{
		float h_ff_variable[NVAR];
		const float angle_of_attack = (float)(3.1415926535897931 / 180.0) * (float)(deg_angle_of_attack);

		h_ff_variable[VAR_DENSITY] = (float)(1.4);

		float ff_pressure = (float)(1.0);
		float ff_speed_of_sound = sqrt(GAMMA*ff_pressure / h_ff_variable[VAR_DENSITY]);
		float ff_speed = (float)(ff_mach)*ff_speed_of_sound;

		float3 ff_velocity;
		ff_velocity.x = ff_speed*(float)(cos((float)angle_of_attack));
		ff_velocity.y = ff_speed*(float)(sin((float)angle_of_attack));
		ff_velocity.z = 0.0;

		h_ff_variable[VAR_MOMENTUM+0] = h_ff_variable[VAR_DENSITY] * ff_velocity.x;
		h_ff_variable[VAR_MOMENTUM+1] = h_ff_variable[VAR_DENSITY] * ff_velocity.y;
		h_ff_variable[VAR_MOMENTUM+2] = h_ff_variable[VAR_DENSITY] * ff_velocity.z;

		h_ff_variable[VAR_DENSITY_ENERGY] = h_ff_variable[VAR_DENSITY]*((float)(0.5)*(ff_speed*ff_speed)) + (ff_pressure / (float)(GAMMA-1.0));

		float3 h_ff_momentum;
		h_ff_momentum.x = *(h_ff_variable+VAR_MOMENTUM+0);
		h_ff_momentum.y = *(h_ff_variable+VAR_MOMENTUM+1);
		h_ff_momentum.z = *(h_ff_variable+VAR_MOMENTUM+2);
		float3 h_ff_fc_momentum_x;
		float3 h_ff_fc_momentum_y;
		float3 h_ff_fc_momentum_z;
		float3 h_ff_fc_density_energy;
		compute_flux_contribution(&h_ff_variable[VAR_DENSITY], &h_ff_momentum,
				&h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure, &ff_velocity,
				&h_ff_fc_momentum_x, &h_ff_fc_momentum_y, &h_ff_fc_momentum_z,
				&h_ff_fc_density_energy);

		// copy far field conditions to the gpu
		ff_variable = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * NVAR, h_ff_variable, &err);
		CHKERR(err, "Unable to allocate ff data");
		ff_fc_momentum_x = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_x, &err);
		CHKERR(err, "Unable to allocate ff data");
		ff_fc_momentum_y = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_y, &err);
		CHKERR(err, "Unable to allocate ff data");
		ff_fc_momentum_z = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_z, &err);
		CHKERR(err, "Unable to allocate ff data");
		ff_fc_density_energy = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_density_energy, &err);
		CHKERR(err, "Unable to allocate ff data");
	}
	int nel;
	int nelr;

	// read in domain geometry
	cl_mem areas;
	cl_mem elements_surrounding_elements;
	cl_mem normals;
	{
		std::ifstream file(data_file_name);

		file >> nel;

		nelr = block_length*((nel / block_length )+ std::min(1, nel % block_length));

		float* h_areas = new float[nelr];
		int* h_elements_surrounding_elements = new int[nelr*NNB];
		float* h_normals = new float[nelr*NDIM*NNB];


		// read in data
		for(int i = 0; i < nel; i++)
		{
			file >> h_areas[i];
			for(int j = 0; j < NNB; j++)
			{
				file >> h_elements_surrounding_elements[i + j*nelr];
				if(h_elements_surrounding_elements[i+j*nelr] < 0) h_elements_surrounding_elements[i+j*nelr] = -1;
				h_elements_surrounding_elements[i + j*nelr]--; //it's coming in with Fortran numbering

				for(int k = 0; k < NDIM; k++)
				{
					file >> h_normals[i + (j + k*NNB)*nelr];
					h_normals[i + (j + k*NNB)*nelr] = -h_normals[i + (j + k*NNB)*nelr];
				}
			}
		}

		// fill in remaining data
		int last = nel-1;
		for(int i = nel; i < nelr; i++)
		{
			h_areas[i] = h_areas[last];
			for(int j = 0; j < NNB; j++)
			{
				// duplicate the last element
				h_elements_surrounding_elements[i + j*nelr] = h_elements_surrounding_elements[last + j*nelr];
				for(int k = 0; k < NDIM; k++) h_normals[last + (j + k*NNB)*nelr] = h_normals[last + (j + k*NNB)*nelr];
			}
		}

		areas = alloc<float>(context, nelr);
		upload<float>(commands, areas, h_areas, nelr);

		elements_surrounding_elements = alloc<int>(context, nelr*NNB);
		upload<int>(commands, elements_surrounding_elements, h_elements_surrounding_elements, nelr*NNB);

		normals = alloc<float>(context, nelr*NDIM*NNB);
		upload<float>(commands, normals, h_normals, nelr*NDIM*NNB);

		delete[] h_areas;
		delete[] h_elements_surrounding_elements;
		delete[] h_normals;
	}

	// Get program source.
	long kernelSize = getKernelSize();
	char* kernelSource = new char[kernelSize];
	getKernelSource(kernelSource, kernelSize);

	// Create the compute program from the source buffer
	program = clCreateProgramWithSource(context, 1, (const char **) &kernelSource, NULL, &err);
	CHKERR(err, "Failed to create a compute program!");

	// Build the program executable
	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (err == CL_BUILD_PROGRAM_FAILURE)
	{
		char *log;
		size_t logLen;
		err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLen);
		log = (char *) malloc(sizeof(char)*logLen);
		err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, logLen, (void *) log, NULL);
		fprintf(stderr, "CL Error %d: Failed to build program! Log:\n%s", err, log);
		free(log);
		exit(1);
	}
	CHKERR(err, "Failed to build program!");
	delete[] kernelSource;

	// Create the compute kernel in the program we wish to run
	kernel_compute_flux = clCreateKernel(program, "compute_flux", &err);
	CHKERR(err, "Failed to create a compute kernel!");

	// Create the reduce kernel in the program we wish to run
	kernel_compute_flux_contributions = clCreateKernel(program, "compute_flux_contributions", &err);
	CHKERR(err, "Failed to create a compute_flux_contributions kernel!");
	// Create the reduce kernel in the program we wish to run
	kernel_compute_step_factor = clCreateKernel(program, "compute_step_factor", &err);
	CHKERR(err, "Failed to create a compute_step_factor kernel!");
	// Create the reduce kernel in the program we wish to run
	kernel_time_step = clCreateKernel(program, "time_step", &err);
	CHKERR(err, "Failed to create a time_step kernel!");
	// Create the reduce kernel in the program we wish to run
	kernel_initialize_variables = clCreateKernel(program, "initialize_variables", &err);
	CHKERR(err, "Failed to create a initialize_variables kernel!");

	// Create arrays and set initial conditions
	cl_mem variables = alloc<cl_float>(context, nelr*NVAR);

	err = 0;
	err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr);
	err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&variables);
	err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable);
	CHKERR(err, "Failed to set kernel arguments!");
	// Get the maximum work group size for executing the kernel on the device
	//err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
	CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!");
	local_size = 1;//std::min(local_size, (size_t)nelr);
	global_size = nelr;
	err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
	err = clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 0");


	cl_mem old_variables = alloc<float>(context, nelr*NVAR);
	cl_mem fluxes = alloc<float>(context, nelr*NVAR);
	cl_mem step_factors = alloc<float>(context, nelr);
	clFinish(commands);
	cl_mem fc_momentum_x = alloc<float>(context, nelr*NDIM);
	cl_mem fc_momentum_y = alloc<float>(context, nelr*NDIM);
	cl_mem fc_momentum_z = alloc<float>(context, nelr*NDIM);
	cl_mem fc_density_energy = alloc<float>(context, nelr*NDIM);
	clFinish(commands);

	// make sure all memory is floatly allocated before we start timing
	err = 0;
	err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr);
	err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&old_variables);
	err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable);
	CHKERR(err, "Failed to set kernel arguments!");
	// Get the maximum work group size for executing the kernel on the device
	err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
	CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!");
	err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 1");
	err = 0;
	err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr);
	err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&fluxes);
	err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable);
	CHKERR(err, "Failed to set kernel arguments!");
	// Get the maximum work group size for executing the kernel on the device
	err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
	CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!");

	err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 2");
	std::cout << "About to memcopy" << std::endl;
	err = clReleaseMemObject(step_factors);
	float temp[nelr];
	for(int i = 0; i < nelr; i++)
		temp[i] = 0;
	step_factors = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * nelr, temp, &err);
	CHKERR(err, "Unable to memset step_factors");
	// make sure CUDA isn't still doing something before we start timing

	clFinish(commands);

	// these need to be computed the first time in order to compute time step
	std::cout << "Starting..." << std::endl;


	// Begin iterations
	for(int i = 0; i < iterations; i++)
	{
		copy<float>(commands, old_variables, variables, nelr*NVAR);

		// for the first iteration we compute the time step
		err = 0;
		err = clSetKernelArg(kernel_compute_step_factor, 0, sizeof(int), &nelr);
		err |= clSetKernelArg(kernel_compute_step_factor, 1, sizeof(cl_mem),&variables);
		err |= clSetKernelArg(kernel_compute_step_factor, 2, sizeof(cl_mem), &areas);
		err |= clSetKernelArg(kernel_compute_step_factor, 3, sizeof(cl_mem), &step_factors);
		CHKERR(err, "Failed to set kernel arguments!");
		// Get the maximum work group size for executing the kernel on the device
		err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
		CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!");
		err = clEnqueueNDRangeKernel(commands, kernel_compute_step_factor, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
		clFinish(commands);
		START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Step Factor Kernel", ocdTempTimer)
		END_TIMER(ocdTempTimer)
		CHKERR(err, "Failed to execute kernel[kernel_compute_step_factor]!");
		for(int j = 0; j < RK; j++)
		{
			err = 0;
			err = clSetKernelArg(kernel_compute_flux_contributions, 0, sizeof(int), &nelr);
			err |= clSetKernelArg(kernel_compute_flux_contributions, 1, sizeof(cl_mem),&variables);
			err |= clSetKernelArg(kernel_compute_flux_contributions, 2, sizeof(cl_mem), &fc_momentum_x);
			err |= clSetKernelArg(kernel_compute_flux_contributions, 3, sizeof(cl_mem), &fc_momentum_y);
			err |= clSetKernelArg(kernel_compute_flux_contributions, 4, sizeof(cl_mem), &fc_momentum_z);
			err |= clSetKernelArg(kernel_compute_flux_contributions, 5, sizeof(cl_mem), &fc_density_energy);
			CHKERR(err, "Failed to set kernel arguments!");
			// Get the maximum work group size for executing the kernel on the device
			err = clGetKernelWorkGroupInfo(kernel_compute_flux_contributions, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
			CHKERR(err, "Failed to retrieve kernel_compute_flux_contributions work group info!");
			err = clEnqueueNDRangeKernel(commands, kernel_compute_flux_contributions, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
			clFinish(commands);
			START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Contribution Kernel", ocdTempTimer)
			//compute_flux_contributions(nelr, variables, fc_momentum_x, fc_momentum_y, fc_momentum_z, fc_density_energy);
			END_TIMER(ocdTempTimer)
			CHKERR(err, "Failed to execute kernel [kernel_compute_flux_contributions]!");
			err = 0;
			err = clSetKernelArg(kernel_compute_flux, 0, sizeof(int), &nelr);
			err |= clSetKernelArg(kernel_compute_flux, 1, sizeof(cl_mem), &elements_surrounding_elements);
			err |= clSetKernelArg(kernel_compute_flux, 2, sizeof(cl_mem), &normals);
			err |= clSetKernelArg(kernel_compute_flux, 3, sizeof(cl_mem), &variables);
			err |= clSetKernelArg(kernel_compute_flux, 4, sizeof(cl_mem), &fc_momentum_x);
			err |= clSetKernelArg(kernel_compute_flux, 5, sizeof(cl_mem), &fc_momentum_y);
			err |= clSetKernelArg(kernel_compute_flux, 6, sizeof(cl_mem), &fc_momentum_z);
			err |= clSetKernelArg(kernel_compute_flux, 7, sizeof(cl_mem), &fc_density_energy);
			err |= clSetKernelArg(kernel_compute_flux, 8, sizeof(cl_mem), &fluxes);
			err |= clSetKernelArg(kernel_compute_flux, 9, sizeof(cl_mem), &ff_variable);
			err |= clSetKernelArg(kernel_compute_flux, 10, sizeof(cl_mem), &ff_fc_momentum_x);
			err |= clSetKernelArg(kernel_compute_flux, 11, sizeof(cl_mem), &ff_fc_momentum_y);
			err |= clSetKernelArg(kernel_compute_flux, 12, sizeof(cl_mem), &ff_fc_momentum_z);
			err |= clSetKernelArg(kernel_compute_flux, 13, sizeof(cl_mem), &ff_fc_density_energy);
			CHKERR(err, "Failed to set kernel arguments!");
			// Get the maximum work group size for executing the kernel on the device
			err = clGetKernelWorkGroupInfo(kernel_compute_flux, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
			CHKERR(err, "Failed to retrieve kernel_compute_flux work group info!");
			err = clEnqueueNDRangeKernel(commands, kernel_compute_flux, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
			clFinish(commands);
			START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Kernel", ocdTempTimer)
			END_TIMER(ocdTempTimer)
			CHKERR(err, "Failed to execute kernel [kernel_compute_flux]!");
			err = 0;
			err = clSetKernelArg(kernel_time_step, 0, sizeof(int), &j);
			err |= clSetKernelArg(kernel_time_step, 1, sizeof(int), &nelr);
			err |= clSetKernelArg(kernel_time_step, 2, sizeof(cl_mem), &old_variables);
			err |= clSetKernelArg(kernel_time_step, 3, sizeof(cl_mem), &variables);
			err |= clSetKernelArg(kernel_time_step, 4, sizeof(cl_mem), &step_factors);
			err |= clSetKernelArg(kernel_time_step, 5, sizeof(cl_mem), &fluxes);
			CHKERR(err, "Failed to set kernel arguments!");
			// Get the maximum work group size for executing the kernel on the device
			err = clGetKernelWorkGroupInfo(kernel_time_step, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
			CHKERR(err, "Failed to retrieve kernel_time_step work group info!");
			err = clEnqueueNDRangeKernel(commands, kernel_time_step, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
			clFinish(commands);
			START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Time Step Kernel", ocdTempTimer)
			END_TIMER(ocdTempTimer)
			CHKERR(err, "Failed to execute kernel [kernel_time_step]!");
		}
	}

	clFinish(commands);
	std::cout << "Finished" << std::endl;
	std::cout << "Saving solution..." << std::endl;
	dump(commands, variables, nel, nelr);
	std::cout << "Saved solution..." << std::endl;
	std::cout << "Cleaning up..." << std::endl;

	clReleaseProgram(program);
	clReleaseKernel(kernel_compute_flux);
	clReleaseKernel(kernel_compute_flux_contributions);
	clReleaseKernel(kernel_compute_step_factor);
	clReleaseKernel(kernel_time_step);
	clReleaseKernel(kernel_initialize_variables);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);

	dealloc<float>(areas);
	dealloc<int>(elements_surrounding_elements);
	dealloc<float>(normals);

	dealloc<float>(variables);
	dealloc<float>(old_variables);
	dealloc<float>(fluxes);
	dealloc<float>(step_factors);
	dealloc<float>(fc_momentum_x);
	dealloc<float>(fc_momentum_y);
	dealloc<float>(fc_momentum_z);
	dealloc<float>(fc_density_energy);

	std::cout << "Done..." << std::endl;
	ocd_finalize();
	return 0;
}
Example #23
0
void runProgram(int N, char *fileName)
{
	printf("GPU Symmetrize()..."
		"\nSquareMatrix[%d][%d]\n", N, N);

	int i,j;

	// initialize input array
	float *A;
	A = (float*)malloc(sizeof(float)*N*N);

	for( i = 0; i < N ; ++i )
	{
		for( j = 0; j < N ; ++j )
		{
			A[i*N + j] = j;	
		}
	}

	//  result
	float *Aout;
	Aout = (float*)malloc(sizeof(float)*N*N);


#ifdef DEBUG
	puts("A");
	check_2d_f(A,N,N);
#endif

	int NumK = 1;
	int NumE = 2;

	double gpuTime;
	cl_ulong gstart, gend;

	//------------------------------------------------
	//  OpenCL 
	//------------------------------------------------
	cl_int err;

	cl_platform_id platform;          // OpenCL platform
	cl_device_id device_id;           // device ID
	cl_context context;               // context
	cl_command_queue queue;           // command queue
	cl_program program;               // program

	cl_kernel *kernel = (cl_kernel*)malloc(sizeof(cl_kernel)*NumK);

	cl_event *event = (cl_event*)malloc(sizeof(cl_event)*NumE);    

	// read kernel file
	//char *fileName = "transpose_kernel.cl";
	char *kernelSource;
	size_t size;
	FILE *fh = fopen(fileName, "rb");
	if(!fh) {
		printf("Error: Failed to open kernel file!\n");
		exit(1);
	}
	fseek(fh,0,SEEK_END);
	size=ftell(fh);
	fseek(fh,0,SEEK_SET);
	kernelSource = malloc(size+1);
	size_t result;
	result = fread(kernelSource,1,size,fh);
	if(result != size){ fputs("Reading error", stderr);exit(1);}
	kernelSource[size] = '\0';
	
	// Bind to platform
	err = clGetPlatformIDs(1, &platform, NULL);
	OCL_CHECK(err);

	// Get ID for the device
	err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
	OCL_CHECK(err);

	// Create a context  
	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	OCL_CHECK(err);

	// Create a command queue 
	queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
	OCL_CHECK(err);

	// Create the compute program from the source buffer
	program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, NULL, &err);
	OCL_CHECK(err);

	// turn on optimization for kernel
	char *options="-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros -cl-unsafe-math-optimizations -cl-finite-math-only";

	err = clBuildProgram(program, 1, &device_id, options, NULL, NULL);
	if(err != CL_SUCCESS)
		printCompilerOutput(program, device_id);
	OCL_CHECK(err);



#ifdef SAVEBIN
	// Calculate size of binaries 
	size_t binary_size;
	err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_size, NULL);
	OCL_CHECK(err);

	//printf("binary size = %ld\n", binary_size);

	unsigned char* bin;
	bin = (unsigned char*)malloc(sizeof(unsigned char)*binary_size);

	err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*) , &bin, NULL);
	OCL_CHECK(err);

	//puts("save binaries");

	// Print the binary out to the output file
	fh = fopen("kernel.bin", "wb");
	fwrite(bin, 1, binary_size, fh);
	fclose(fh);

	puts("done save binaries");

#endif


	kernel[0] = clCreateKernel(program, "kernel_a", &err);
	OCL_CHECK(err);

	// memory on device
	cl_mem A_d    	= clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N*N,  NULL, NULL);
	cl_mem Aout_d   = clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N*N,  NULL, NULL);


	// copy data to device
	err = clEnqueueWriteBuffer(queue, A_d, 	CL_TRUE, 0, sizeof(float)*N*N, 	A, 0, NULL , &event[0]); 
	OCL_CHECK(err);

	size_t localsize[2];
	size_t globalsize[2];

	localsize[0] = 16; 
	localsize[1] = 16;

	globalsize[0] = N;
	globalsize[1] = N;

	err  = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &A_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err  = clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &Aout_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}


	err = clEnqueueNDRangeKernel(queue, kernel[0], 2, NULL, globalsize, localsize, 0, NULL, NULL);
	OCL_CHECK(err);

	clFinish(queue);

	// read device data back to host
	clEnqueueReadBuffer(queue, Aout_d, CL_TRUE, 0, sizeof(float)*N*N, Aout, 0, NULL , &event[1]);

	err = clWaitForEvents(1,&event[1]);
	OCL_CHECK(err);

	err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &gstart, NULL);
	OCL_CHECK(err);

	err = clGetEventProfilingInfo (event[1], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &gend, NULL);
	OCL_CHECK(err);

	gpuTime = (double)(gend -gstart)/1000000000.0;



	//check_1d_f(sum, blks+1);

#ifdef DEBUG
	puts("Output");
	check_2d_f(Aout,N,N);
#endif

	printf("oclTime = %lf (s)\n", gpuTime );

	// free
	clReleaseMemObject(A_d);	
	clReleaseMemObject(Aout_d);	


	// // check
	// int flag = 1;
	// for(i=0;i<N;++i){
	// 	for(j=0;j<N;++j){
	// 		if(A[i*N+j] != At[j*N+i])		
	// 		{
	// 			flag  = 0;
	// 			break;
	// 		}
	// 	}
	// }
	// if( flag == 0 )
	// {
	// 	puts("Bugs! Check program.");
	// }else{
	// 	puts("Succeed!");	
	// }



	clReleaseProgram(program);
	clReleaseContext(context);
	clReleaseCommandQueue(queue);
	for(i=0;i<NumK;++i){
		clReleaseKernel(kernel[i]);
	}
	for(i=0;i<NumE;++i){
		clReleaseEvent(event[i]);
	}
	free(kernelSource);


#ifdef SAVEBIN
	free(bin);
#endif



	free(A);
	free(Aout);

	return;
}
void WorkScheduler::initialize(bool use_opencl, int num_cpu_threads)
{
	/* initialize highlighting */
	if (!g_highlightInitialized) {
		if (g_highlightedNodesRead) MEM_freeN(g_highlightedNodesRead);
		if (g_highlightedNodes)     MEM_freeN(g_highlightedNodes);

		g_highlightedNodesRead = NULL;
		g_highlightedNodes = NULL;

		COM_startReadHighlights();

		g_highlightInitialized = true;
	}

#if COM_CURRENT_THREADING_MODEL == COM_TM_QUEUE
	/* deinitialize if number of threads doesn't match */
	if (g_cpudevices.size() != num_cpu_threads) {
		Device *device;

		while (g_cpudevices.size() > 0) {
			device = g_cpudevices.back();
			g_cpudevices.pop_back();
			device->deinitialize();
			delete device;
		}

		g_cpuInitialized = false;
	}

	/* initialize CPU threads */
	if (!g_cpuInitialized) {
		for (int index = 0; index < num_cpu_threads; index++) {
			CPUDevice *device = new CPUDevice();
			device->initialize();
			g_cpudevices.push_back(device);
		}

		g_cpuInitialized = true;
	}

#ifdef COM_OPENCL_ENABLED
	/* deinitialize OpenCL GPU's */
	if (use_opencl && !g_openclInitialized) {
		g_context = NULL;
		g_program = NULL;

		if (clewInit() != CLEW_SUCCESS) /* this will check for errors and skip if already initialized */
			return;

		if (clCreateContextFromType) {
			cl_uint numberOfPlatforms = 0;
			cl_int error;
			error = clGetPlatformIDs(0, 0, &numberOfPlatforms);
			if (error == -1001) { }   /* GPU not supported */
			else if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));  }
			if (G.f & G_DEBUG) printf("%u number of platforms\n", numberOfPlatforms);
			cl_platform_id *platforms = (cl_platform_id *)MEM_mallocN(sizeof(cl_platform_id) * numberOfPlatforms, __func__);
			error = clGetPlatformIDs(numberOfPlatforms, platforms, 0);
			unsigned int indexPlatform;
			for (indexPlatform = 0; indexPlatform < numberOfPlatforms; indexPlatform++) {
				cl_platform_id platform = platforms[indexPlatform];
				cl_uint numberOfDevices = 0;
				clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, 0, &numberOfDevices);
				if (numberOfDevices <= 0)
					continue;

				cl_device_id *cldevices = (cl_device_id *)MEM_mallocN(sizeof(cl_device_id) * numberOfDevices, __func__);
				clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numberOfDevices, cldevices, 0);

				g_context = clCreateContext(NULL, numberOfDevices, cldevices, clContextError, NULL, &error);
				if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));  }
				const char *cl_str[2] = {datatoc_COM_OpenCLKernels_cl, NULL};
				g_program = clCreateProgramWithSource(g_context, 1, cl_str, 0, &error);
				error = clBuildProgram(g_program, numberOfDevices, cldevices, 0, 0, 0);
				if (error != CL_SUCCESS) {
					cl_int error2;
					size_t ret_val_size = 0;
					printf("CLERROR[%d]: %s\n", error, clewErrorString(error));
					error2 = clGetProgramBuildInfo(g_program, cldevices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
					if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); }
					char *build_log = (char *)MEM_mallocN(sizeof(char) * ret_val_size + 1, __func__);
					error2 = clGetProgramBuildInfo(g_program, cldevices[0], CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
					if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); }
					build_log[ret_val_size] = '\0';
					printf("%s", build_log);
					MEM_freeN(build_log);
				}
				else {
					unsigned int indexDevices;
					for (indexDevices = 0; indexDevices < numberOfDevices; indexDevices++) {
						cl_device_id device = cldevices[indexDevices];
						cl_int vendorID = 0;
						cl_int error2 = clGetDeviceInfo(device, CL_DEVICE_VENDOR_ID, sizeof(cl_int), &vendorID, NULL);
						if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error2, clewErrorString(error2)); }
						OpenCLDevice *clDevice = new OpenCLDevice(g_context, device, g_program, vendorID);
						clDevice->initialize();
						g_gpudevices.push_back(clDevice);
					}
				}
				MEM_freeN(cldevices);
			}
			MEM_freeN(platforms);
		}

		g_openclInitialized = true;
	}
#endif
#endif
}
Example #25
0
void sum_gpu(long long *in, long long *out, unsigned int n)
{
	size_t global_size;
	size_t local_size;

	char *kernel_src;

	cl_int err;
	cl_platform_id platform_id;
	cl_device_id device_id;
	cl_uint max_compute_units;
	size_t max_workgroup_size;

	cl_context context;
	cl_command_queue commands;
	cl_program program;
	cl_kernel kernel;
	cl_mem d_array;

	cl_event event;
	cl_ulong start, end;

	/* start OpenCL */
	err = clGetPlatformIDs(1, &platform_id,NULL);
	clErrorHandling("clGetPlatformIDs");

	err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
	clErrorHandling("clGetDeviceIDs");

	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	clErrorHandling("clCreateContext");

	commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
	clErrorHandling("clCreateCommandQueue");

	/* create kernel */
	kernel_src = file_to_string(KERNEL_SRC);
	program = clCreateProgramWithSource(context, 1, (const char**) &kernel_src, NULL, &err);
	free(kernel_src);
	clErrorHandling("clCreateProgramWithSource");

	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	clErrorHandling("clBuildProgram");

	kernel = clCreateKernel(program, "matrix_mult", &err);
	clErrorHandling("clCreateKernel");

	/* allocate memory and send to gpu */
	d_array = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long long) * n, NULL, &err);
	clErrorHandling("clCreateBuffer");

	err = clEnqueueWriteBuffer(commands, d_array, CL_TRUE, 0, sizeof(long long) * n, in, 0, NULL, NULL);
	clErrorHandling("clEnqueueWriteBuffer");

	err  = clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL);
	err |= clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_workgroup_size, NULL);
	clErrorHandling("clGetDeviceInfo");

	/* prepare kernel args */
	err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_array);
	err |= clSetKernelArg(kernel, 1, sizeof(unsigned int), &n);

	/* execute */
	local_size = n / max_compute_units / 8;
	if (local_size > max_workgroup_size)
		local_size = max_workgroup_size;

	/*
	 *	Usually it would be
	 *	global_size = local_size * max_compute_units;
	 *	but that would only be valid if local_size = n / max_compute_units;
	 *	local_size is n / max_compute_units / 8 because it obtains its hightest performance.
	 */
	for (global_size = local_size; global_size < n; global_size += local_size);

	err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global_size, &local_size, 0, NULL, &event);
	clErrorHandling("clEnqueueNDRangeKernel");

	clWaitForEvents(1, &event);
	clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
	clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
	fprintf(stderr, "Time for event (ms): %10.5f \n", (end - start) / 1000000.0);

	err = clFinish(commands);
	clErrorHandling("clFinish");

	/* transfer back */
	err = clEnqueueReadBuffer(commands, d_array, CL_TRUE, 0, sizeof(long long), out, 0, NULL, NULL); // a single long long
	clErrorHandling("clEnqueueReadBuffer");

	/* cleanup*/
	clReleaseMemObject(d_array);
	clReleaseProgram(program);
	clReleaseKernel(kernel);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);
	clReleaseEvent(event);
}
/**
 * The implementation of the particle filter using OpenMP for many frames
 * @see http://openmp.org/wp/
 * @note This function is designed to work with a video of several frames. In addition, it references a provided MATLAB function which takes the video, the objxy matrix and the x and y arrays as arguments and returns the likelihoods
 * @param I The video to be run
 * @param IszX The x dimension of the video
 * @param IszY The y dimension of the video
 * @param Nfr The number of frames
 * @param seed The seed array used for random number generation
 * @param Nparticles The number of particles to be used
 */
int particleFilter(unsigned char * I, int IszX, int IszY, int Nfr, int * seed, int Nparticles) {
    int max_size = IszX * IszY*Nfr;
    //original particle centroid
    double xe = roundDouble(IszY / 2.0);
    double ye = roundDouble(IszX / 2.0);

    //expected object locations, compared to center
    int radius = 5;
    int diameter = radius * 2 - 1;
    int * disk = (int*) calloc(diameter * diameter, sizeof (int));
    strelDisk(disk, radius);
    int countOnes = 0;
    int x, y;
    for (x = 0; x < diameter; x++) {
        for (y = 0; y < diameter; y++) {
            if (disk[x * diameter + y] == 1)
                countOnes++;
        }
    }
    int * objxy = (int *) calloc(countOnes * 2, sizeof(int));
    getneighbors(disk, countOnes, objxy, radius);
    //initial weights are all equal (1/Nparticles)
    double * weights = (double *) calloc(Nparticles, sizeof(double));
    for (x = 0; x < Nparticles; x++) {
        weights[x] = 1 / ((double) (Nparticles));
    }
    /****************************************************************
     **************   B E G I N   A L L O C A T E *******************
     ****************************************************************/

    /***** kernel variables ******/
    cl_kernel kernel_likelihood;
    cl_kernel kernel_sum;
    cl_kernel kernel_normalize_weights;
    cl_kernel kernel_find_index;

    int sourcesize = 2048 * 2048;
    char * source = (char *) calloc(sourcesize, sizeof (char));
    if (!source) {
        printf("ERROR: calloc(%d) failed\n", sourcesize);
        return -1;
    }

    // read the kernel core source
    char * tempchar = "./particle_double.cl";
    FILE * fp = fopen(tempchar, "rb");
    if (!fp) {
        printf("ERROR: unable to open '%s'\n", tempchar);
        return -1;
    }
    fread(source + strlen(source), sourcesize, 1, fp);
    fclose(fp);

    // OpenCL initialization
    int use_gpu = 1;
    if (initialize(use_gpu)) return -1;

    // compile kernel
    cl_int err = 0;
    const char * slist[2] = {source, 0};
    cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateProgramWithSource() => %d\n", err);
        return -1;
    }

    err = clBuildProgram(prog, 1, device_list, "-cl-fast-relaxed-math", NULL, NULL);

    if (err != CL_SUCCESS) {
        if (err == CL_INVALID_PROGRAM)
            printf("CL_INVALID_PROGRAM\n");
        else if (err == CL_INVALID_VALUE)
            printf("CL_INVALID_VALUE\n");
        else if (err == CL_INVALID_DEVICE)
            printf("CL_INVALID_DEVICE\n");
        else if (err == CL_INVALID_BINARY)
            printf("CL_INVALID_BINARY\n");
        else if (err == CL_INVALID_BUILD_OPTIONS)
            printf("CL_INVALID_BUILD_OPTIONS\n");
        else if (err == CL_INVALID_OPERATION)
            printf("CL_INVALID_OPERATION\n");
        else if (err == CL_COMPILER_NOT_AVAILABLE)
            printf("CL_COMPILER_NOT_AVAILABLE\n");
        else if (err == CL_BUILD_PROGRAM_FAILURE)
            printf("CL_BUILD_PROGRAM_FAILURE\n");
        else if (err == CL_INVALID_OPERATION)
            printf("CL_INVALID_OPERATION\n");
        else if (err == CL_OUT_OF_RESOURCES)
            printf("CL_OUT_OF_RESOURCES\n");
        else if (err == CL_OUT_OF_HOST_MEMORY)
            printf("CL_OUT_OF_HOST_MEMORY\n");

        printf("ERROR: clBuildProgram() => %d\n", err);

        static char log[65536];
        memset(log, 0, sizeof (log));

        err = clGetProgramBuildInfo(prog, device_list[0], CL_PROGRAM_BUILD_LOG, sizeof (log) - 1, log, NULL);
        if (err != CL_SUCCESS) {
            printf("ERROR: clGetProgramBuildInfo() => %d\n", err);
        }
        if (strstr(log, "warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);


    }
    // { // show warnings/errors
    //     static char log[65536];
    //     memset(log, 0, sizeof (log));
    //     cl_device_id device_id[2] = {0};
    //     err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof (device_id), device_id, NULL);
    //     if (err != CL_SUCCESS) {
    //         if (err == CL_INVALID_CONTEXT)
    //             printf("ERROR: clGetContextInfo() => CL_INVALID_CONTEXT\n");
    //         if (err == CL_INVALID_VALUE)
    //             printf("ERROR: clGetContextInfo() => CL_INVALID_VALUE\n");
    //     }
    // }//*/

    char * s_likelihood_kernel = "likelihood_kernel";
    char * s_sum_kernel = "sum_kernel";
    char * s_normalize_weights_kernel = "normalize_weights_kernel";
    char * s_find_index_kernel = "find_index_kernel";

    kernel_likelihood = clCreateKernel(prog, s_likelihood_kernel, &err);
    if (err != CL_SUCCESS) {
        if (err == CL_INVALID_PROGRAM)
            printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID PROGRAM %d\n", err);
        if (err == CL_INVALID_PROGRAM_EXECUTABLE)
            printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID PROGRAM EXECUTABLE %d\n", err);
        if (err == CL_INVALID_KERNEL_NAME)
            printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID KERNEL NAME %d\n", err);
        if (err == CL_INVALID_KERNEL_DEFINITION)
            printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID KERNEL DEFINITION %d\n", err);
        if (err == CL_INVALID_VALUE)
            printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID CL_INVALID_VALUE %d\n", err);
        printf("ERROR: clCreateKernel(likelihood_kernel) failed.\n");
        return -1;
    }
    kernel_sum = clCreateKernel(prog, s_sum_kernel, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateKernel(sum_kernel) 0 => %d\n", err);
        return -1;
    }
    kernel_normalize_weights = clCreateKernel(prog, s_normalize_weights_kernel, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateKernel(normalize_weights_kernel) 0 => %d\n", err);
        return -1;
    }
    kernel_find_index = clCreateKernel(prog, s_find_index_kernel, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateKernel(find_index_kernel) 0 => %d\n", err);
        return -1;
    }


    //initial likelihood to 0.0
    double * likelihood = (double *) calloc(Nparticles + 1, sizeof (double));
    double * arrayX = (double *) calloc(Nparticles, sizeof (double));
    double * arrayY = (double *) calloc(Nparticles, sizeof (double));
    double * xj = (double *) calloc(Nparticles, sizeof (double));
    double * yj = (double *) calloc(Nparticles, sizeof (double));
    double * CDF = (double *) calloc(Nparticles, sizeof(double));

    //GPU copies of arrays
    cl_mem arrayX_GPU;
    cl_mem arrayY_GPU;
    cl_mem xj_GPU;
    cl_mem yj_GPU;
    cl_mem CDF_GPU;
    cl_mem likelihood_GPU;
    cl_mem I_GPU;
    cl_mem weights_GPU;
    cl_mem objxy_GPU;

    int * ind = (int*) calloc(countOnes, sizeof(int));
    cl_mem ind_GPU;
    double * u = (double *) calloc(Nparticles, sizeof(double));
    cl_mem u_GPU;
    cl_mem seed_GPU;
    cl_mem partial_sums;


    //OpenCL memory allocation

    arrayX_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer arrayX_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    arrayY_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer arrayY_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    xj_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer xj_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    yj_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer yj_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    CDF_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) * Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer CDF_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    u_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer u_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    likelihood_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer likelihood_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    weights_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer weights_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    I_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (unsigned char) *IszX * IszY * Nfr, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer I_GPU (size:%d) => %d\n", IszX * IszY * Nfr, err);
        return -1;
    }
    objxy_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, 2*sizeof (int) *countOnes, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer objxy_GPU (size:%d) => %d\n", countOnes, err);
        return -1;
    }
    ind_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (int) *countOnes * Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer ind_GPU (size:%d) => %d\n", countOnes * Nparticles, err);
        return -1;
    }
    seed_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (int) *Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer seed_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    partial_sums = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof (double) * Nparticles + 1, likelihood, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer partial_sums (size:%d) => %d\n", Nparticles, err);
        return -1;
    }

	//Donnie - this loop is different because in this kernel, arrayX and arrayY
    //  are set equal to xj before every iteration, so effectively, arrayX and
    //  arrayY will be set to xe and ye before the first iteration.
    for (x = 0; x < Nparticles; x++) {

        xj[x] = xe;
        yj[x] = ye;
    }

    int k;
    //double * Ik = (double *)calloc(IszX*IszY, sizeof(double));
    int indX, indY;
    //start send
    long long send_start = get_time();

    //OpenCL memory copy
    err = clEnqueueWriteBuffer(cmd_queue, I_GPU, 1, 0, sizeof (unsigned char) *IszX * IszY*Nfr, I, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: clEnqueueWriteBuffer I_GPU (size:%d) => %d\n", IszX * IszY*Nfr, err);
        return -1;
    }
    err = clEnqueueWriteBuffer(cmd_queue, objxy_GPU, 1, 0, 2*sizeof (int) *countOnes, objxy, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: clEnqueueWriteBuffer objxy_GPU (size:%d) => %d\n", countOnes, err);
        return -1; }
    err = clEnqueueWriteBuffer(cmd_queue, weights_GPU, 1, 0, sizeof (double) *Nparticles, weights, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: clEnqueueWriteBuffer weights_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    err = clEnqueueWriteBuffer(cmd_queue, xj_GPU, 1, 0, sizeof (double) *Nparticles, xj, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: clEnqueueWriteBuffer arrayX_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    err = clEnqueueWriteBuffer(cmd_queue, yj_GPU, 1, 0, sizeof (double) *Nparticles, yj, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: clEnqueueWriteBuffer arrayY_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    err = clEnqueueWriteBuffer(cmd_queue, seed_GPU, 1, 0, sizeof (int) *Nparticles, seed, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: clEnqueueWriteBuffer seed_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    /**********************************************************************
     *********** E N D    A L L O C A T E ********************************
     *********************************************************************/

    long long send_end = get_time();
    printf("TIME TO SEND TO GPU: %f\n", elapsed_time(send_start, send_end));
    int num_blocks = ceil((double) Nparticles / (double) threads_per_block);
    printf("threads_per_block=%d \n",threads_per_block);
    size_t local_work[3] = {threads_per_block, 1, 1};
    size_t global_work[3] = {num_blocks*threads_per_block, 1, 1};

    for (k = 1; k < Nfr; k++) {
        /****************** L I K E L I H O O D ************************************/
        clSetKernelArg(kernel_likelihood, 0, sizeof (void *), (void*) &arrayX_GPU);
        clSetKernelArg(kernel_likelihood, 1, sizeof (void *), (void*) &arrayY_GPU);
        clSetKernelArg(kernel_likelihood, 2, sizeof (void *), (void*) &xj_GPU);
        clSetKernelArg(kernel_likelihood, 3, sizeof (void *), (void*) &yj_GPU);
        clSetKernelArg(kernel_likelihood, 4, sizeof (void *), (void*) &CDF_GPU);
        clSetKernelArg(kernel_likelihood, 5, sizeof (void *), (void*) &ind_GPU);
        clSetKernelArg(kernel_likelihood, 6, sizeof (void *), (void*) &objxy_GPU);
        clSetKernelArg(kernel_likelihood, 7, sizeof (void *), (void*) &likelihood_GPU);
        clSetKernelArg(kernel_likelihood, 8, sizeof (void *), (void*) &I_GPU);
        clSetKernelArg(kernel_likelihood, 9, sizeof (void *), (void*) &u_GPU);
        clSetKernelArg(kernel_likelihood, 10, sizeof (void *), (void*) &weights_GPU);
        clSetKernelArg(kernel_likelihood, 11, sizeof (cl_int), (void*) &Nparticles);
        clSetKernelArg(kernel_likelihood, 12, sizeof (cl_int), (void*) &countOnes);
        clSetKernelArg(kernel_likelihood, 13, sizeof (cl_int), (void*) &max_size);
        clSetKernelArg(kernel_likelihood, 14, sizeof (cl_int), (void*) &k);
        clSetKernelArg(kernel_likelihood, 15, sizeof (cl_int), (void*) &IszY);
        clSetKernelArg(kernel_likelihood, 16, sizeof (cl_int), (void*) &Nfr);
        clSetKernelArg(kernel_likelihood, 17, sizeof (void *), (void*) &seed_GPU);
        clSetKernelArg(kernel_likelihood, 18, sizeof (void *), (void*) &partial_sums);
        clSetKernelArg(kernel_likelihood, 19, threads_per_block * sizeof (double), NULL);

        //KERNEL FUNCTION CALL
        err = clEnqueueNDRangeKernel(cmd_queue, kernel_likelihood, 1, NULL, global_work, local_work, 0, 0, 0);
        clFinish(cmd_queue);
        if (err != CL_SUCCESS) {
            printf("ERROR: clEnqueueNDRangeKernel(kernel_likelihood)=>%d failed\n", err);
	    //check_error(err, __FILE__, __LINE__);
            return -1;
        }
        /****************** E N D    L I K E L I H O O D **********************/
        /*************************** S U M ************************************/
        clSetKernelArg(kernel_sum, 0, sizeof (void *), (void*) &partial_sums);
        clSetKernelArg(kernel_sum, 1, sizeof (cl_int), (void*) &Nparticles);

        //KERNEL FUNCTION CALL
        err = clEnqueueNDRangeKernel(cmd_queue, kernel_sum, 1, NULL, global_work, local_work, 0, 0, 0);
        clFinish(cmd_queue);
        if (err != CL_SUCCESS) {
            printf("ERROR: clEnqueueNDRangeKernel(kernel_sum)=>%d failed\n", err);
	    //check_error(err, __FILE__, __LINE__);
            return -1;
        }/*************************** E N D   S U M ****************************/



        /**************** N O R M A L I Z E     W E I G H T S *****************/
        clSetKernelArg(kernel_normalize_weights, 0, sizeof (void *), (void*) &weights_GPU);
        clSetKernelArg(kernel_normalize_weights, 1, sizeof (cl_int), (void*) &Nparticles);
        clSetKernelArg(kernel_normalize_weights, 2, sizeof (void *), (void*) &partial_sums); //*/
        clSetKernelArg(kernel_normalize_weights, 3, sizeof (void *), (void*) &CDF_GPU);
        clSetKernelArg(kernel_normalize_weights, 4, sizeof (void *), (void*) &u_GPU);
        clSetKernelArg(kernel_normalize_weights, 5, sizeof (void *), (void*) &seed_GPU);

        //KERNEL FUNCTION CALL
        err = clEnqueueNDRangeKernel(cmd_queue, kernel_normalize_weights, 1, NULL, global_work, local_work, 0, 0, 0);
        clFinish(cmd_queue);
        if (err != CL_SUCCESS) {
            printf("ERROR: clEnqueueNDRangeKernel(normalize_weights)=>%d failed\n", err);
	    //check_error(err, __FILE__, __LINE__);
            return -1;
        }
        /************* E N D    N O R M A L I Z E     W E I G H T S ***********/

	  //	ocl_print_double_array(cmd_queue, partial_sums, 40);
        // /********* I N T E R M E D I A T E     R E S U L T S ***************/
        // //OpenCL memory copying back from GPU to CPU memory
         err = clEnqueueReadBuffer(cmd_queue, arrayX_GPU, 1, 0, sizeof (double) *Nparticles, arrayX, 0, 0, 0);
         err = clEnqueueReadBuffer(cmd_queue, arrayY_GPU, 1, 0, sizeof (double) *Nparticles, arrayY, 0, 0, 0);
         err = clEnqueueReadBuffer(cmd_queue, weights_GPU, 1, 0, sizeof (double) *Nparticles, weights, 0, 0, 0);

         xe = 0;
         ye = 0;
         double total=0.0;
         // estimate the object location by expected values
	 for (x = 0; x < Nparticles; x++) {
            // if( 0.0000000 < arrayX[x]*weights[x]) printf("arrayX[%d]:%f, arrayY[%d]:%f, weights[%d]:%0.10f\n",x,arrayX[x], x, arrayY[x], x, weights[x]);
	//	printf("arrayX[%d]:%f | arrayY[%d]:%f | weights[%d]:%f\n",
 	//		x, arrayX[x], x, arrayY[x], x, weights[x]); 
             xe += arrayX[x] * weights[x];
             ye += arrayY[x] * weights[x];
             total+= weights[x];
         }
         printf("total weight: %lf\n", total);
         printf("XE: %lf\n", xe);
         printf("YE: %lf\n", ye);
         double distance = sqrt(pow((double) (xe - (int) roundDouble(IszY / 2.0)), 2) + pow((double) (ye - (int) roundDouble(IszX / 2.0)), 2));
         printf("%lf\n", distance);
        // /********* E N D    I N T E R M E D I A T E     R E S U L T S ***************/

        /******************** F I N D    I N D E X ****************************/
        //Set number of threads

        clSetKernelArg(kernel_find_index, 0, sizeof (void *), (void*) &arrayX_GPU);
        clSetKernelArg(kernel_find_index, 1, sizeof (void *), (void*) &arrayY_GPU);
        clSetKernelArg(kernel_find_index, 2, sizeof (void *), (void*) &CDF_GPU);
        clSetKernelArg(kernel_find_index, 3, sizeof (void *), (void*) &u_GPU);
        clSetKernelArg(kernel_find_index, 4, sizeof (void *), (void*) &xj_GPU);
        clSetKernelArg(kernel_find_index, 5, sizeof (void *), (void*) &yj_GPU);
        clSetKernelArg(kernel_find_index, 6, sizeof (void *), (void*) &weights_GPU);
        clSetKernelArg(kernel_find_index, 7, sizeof (cl_int), (void*) &Nparticles);
        //KERNEL FUNCTION CALL
        err = clEnqueueNDRangeKernel(cmd_queue, kernel_find_index, 1, NULL, global_work, local_work, 0, 0, 0);
        clFinish(cmd_queue);
        if (err != CL_SUCCESS) {
            printf("ERROR: clEnqueueNDRangeKernel(find_index)=>%d failed\n", err);
	    //check_error(err, __FILE__, __LINE__);
            return -1;
        }
        /******************* E N D    F I N D    I N D E X ********************/

    }//end loop

    //block till kernels are finished
    //clFinish(cmd_queue);
    long long back_time = get_time();

    //OpenCL freeing of memory
    clReleaseProgram(prog);
    clReleaseMemObject(u_GPU);
    clReleaseMemObject(CDF_GPU);
    clReleaseMemObject(yj_GPU);
    clReleaseMemObject(xj_GPU);
    clReleaseMemObject(likelihood_GPU);
    clReleaseMemObject(I_GPU);
    clReleaseMemObject(objxy_GPU);
    clReleaseMemObject(ind_GPU);
    clReleaseMemObject(seed_GPU);
    clReleaseMemObject(partial_sums);

    long long free_time = get_time();

    //OpenCL memory copying back from GPU to CPU memory
    err = clEnqueueReadBuffer(cmd_queue, arrayX_GPU, 1, 0, sizeof (double) *Nparticles, arrayX, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: Memcopy Out\n");
        return -1;
    }
    long long arrayX_time = get_time();
    err = clEnqueueReadBuffer(cmd_queue, arrayY_GPU, 1, 0, sizeof (double) *Nparticles, arrayY, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: Memcopy Out\n");
        return -1;
    }
    long long arrayY_time = get_time();
    err = clEnqueueReadBuffer(cmd_queue, weights_GPU, 1, 0, sizeof (double) *Nparticles, weights, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: Memcopy Out\n");
        return -1;
    }
    long long back_end_time = get_time();

    printf("GPU Execution: %lf\n", elapsed_time(send_end, back_time));
    printf("FREE TIME: %lf\n", elapsed_time(back_time, free_time));
    printf("SEND TO SEND BACK: %lf\n", elapsed_time(back_time, back_end_time));
    printf("SEND ARRAY X BACK: %lf\n", elapsed_time(free_time, arrayX_time));
    printf("SEND ARRAY Y BACK: %lf\n", elapsed_time(arrayX_time, arrayY_time));
    printf("SEND WEIGHTS BACK: %lf\n", elapsed_time(arrayY_time, back_end_time));

    xe = 0;
    ye = 0;
    // estimate the object location by expected values
    for (x = 0; x < Nparticles; x++) {
        xe += arrayX[x] * weights[x];
        ye += arrayY[x] * weights[x];
    }
    double distance = sqrt(pow((double) (xe - (int) roundDouble(IszY / 2.0)), 2) + pow((double) (ye - (int) roundDouble(IszX / 2.0)), 2));

    //Output results
    FILE *fid;
    fid=fopen("output.txt", "w+");
    if( fid == NULL ){
      printf( "The file was not opened for writing\n" );
      return -1;
    }
    fprintf(fid, "XE: %lf\n", xe);
    fprintf(fid, "YE: %lf\n", ye);
    fprintf(fid, "distance: %lf\n", distance);
    fclose(fid);


    //OpenCL freeing of memory
    clReleaseMemObject(weights_GPU);
    clReleaseMemObject(arrayY_GPU);
    clReleaseMemObject(arrayX_GPU);

    //free regular memory
    free(likelihood);
    free(arrayX);
    free(arrayY);
    free(xj);
    free(yj);
    free(CDF);
    free(ind);
    free(u);
}
int main(int argc, char const *argv[])
{
        /* Get platform */
        cl_platform_id platform;
        cl_uint num_platforms;
        cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformIDs' failed\n");
                exit(1);
        }
        
        printf("Number of platforms: %d\n", num_platforms);
        printf("platform=%p\n", platform);
        
        /* Get platform name */
        char platform_name[100];
        ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformInfo' failed\n");
                exit(1);
        }
        
        printf("platform.name='%s'\n\n", platform_name);
        
        /* Get device */
        cl_device_id device;
        cl_uint num_devices;
        ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceIDs' failed\n");
                exit(1);
        }
        
        printf("Number of devices: %d\n", num_devices);
        printf("device=%p\n", device);
        
        /* Get device name */
        char device_name[100];
        ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name),
        device_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceInfo' failed\n");
                exit(1);
        }
        
        printf("device.name='%s'\n", device_name);
        printf("\n");
        
        /* Create a Context Object */
        cl_context context;
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateContext' failed\n");
                exit(1);
        }
        
        printf("context=%p\n", context);
        
        /* Create a Command Queue Object*/
        cl_command_queue command_queue;
        command_queue = clCreateCommandQueue(context, device, 0, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateCommandQueue' failed\n");
                exit(1);
        }
        
        printf("command_queue=%p\n", command_queue);
        printf("\n");

        /* Program source */
        unsigned char *source_code;
        size_t source_length;

        /* Read program from 'sub_sat_short16short16.cl' */
        source_code = read_buffer("sub_sat_short16short16.cl", &source_length);

        /* Create a program */
        cl_program program;
        program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_length, &ret);

        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateProgramWithSource' failed\n");
                exit(1);
        }
        printf("program=%p\n", program);

        /* Build program */
        ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
        if (ret != CL_SUCCESS )
        {
                size_t size;
                char *log;

                /* Get log size */
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size);

                /* Allocate log and print */
                log = malloc(size);
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL);
                printf("error: call to 'clBuildProgram' failed:\n%s\n", log);
                
                /* Free log and exit */
                free(log);
                exit(1);
        }

        printf("program built\n");
        printf("\n");
        
        /* Create a Kernel Object */
        cl_kernel kernel;
        kernel = clCreateKernel(program, "sub_sat_short16short16", &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateKernel' failed\n");
                exit(1);
        }
        
        /* Create and allocate host buffers */
        size_t num_elem = 10;
        
        /* Create and init host side src buffer 0 */
        cl_short16 *src_0_host_buffer;
        src_0_host_buffer = malloc(num_elem * sizeof(cl_short16));
        for (int i = 0; i < num_elem; i++)
                src_0_host_buffer[i] = (cl_short16){{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}};
        
        /* Create and init device side src buffer 0 */
        cl_mem src_0_device_buffer;
        src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_short16), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create source buffer\n");
                exit(1);
        }        
        ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_short16), src_0_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueWriteBuffer' failed\n");
                exit(1);
        }

        /* Create and init host side src buffer 1 */
        cl_short16 *src_1_host_buffer;
        src_1_host_buffer = malloc(num_elem * sizeof(cl_short16));
        for (int i = 0; i < num_elem; i++)
                src_1_host_buffer[i] = (cl_short16){{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}};
        
        /* Create and init device side src buffer 1 */
        cl_mem src_1_device_buffer;
        src_1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_short16), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create source buffer\n");
                exit(1);
        }        
        ret = clEnqueueWriteBuffer(command_queue, src_1_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_short16), src_1_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueWriteBuffer' failed\n");
                exit(1);
        }

        /* Create host dst buffer */
        cl_short16 *dst_host_buffer;
        dst_host_buffer = malloc(num_elem * sizeof(cl_short16));
        memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_short16));

        /* Create device dst buffer */
        cl_mem dst_device_buffer;
        dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_short16), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create dst buffer\n");
                exit(1);
        }
        
        /* Set kernel arguments */
        ret = CL_SUCCESS;
        ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer);
        ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src_1_device_buffer);
        ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clSetKernelArg' failed\n");
                exit(1);
        }

        /* Launch the kernel */
        size_t global_work_size = num_elem;
        size_t local_work_size = num_elem;
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueNDRangeKernel' failed\n");
                exit(1);
        }

        /* Wait for it to finish */
        clFinish(command_queue);

        /* Read results from GPU */
        ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_short16), dst_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueReadBuffer' failed\n");
                exit(1);
        }

        /* Dump dst buffer to file */
        char dump_file[100];
        sprintf((char *)&dump_file, "%s.result", argv[0]);
        write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_short16));
        printf("Result dumped to %s\n", dump_file);
        /* Free host dst buffer */
        free(dst_host_buffer);

        /* Free device dst buffer */
        ret = clReleaseMemObject(dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }
        
        /* Free host side src buffer 0 */
        free(src_0_host_buffer);

        /* Free device side src buffer 0 */
        ret = clReleaseMemObject(src_0_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }

        /* Free host side src buffer 1 */
        free(src_1_host_buffer);

        /* Free device side src buffer 1 */
        ret = clReleaseMemObject(src_1_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }

        /* Release kernel */
        ret = clReleaseKernel(kernel);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseKernel' failed\n");
                exit(1);
        }

        /* Release program */
        ret = clReleaseProgram(program);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseProgram' failed\n");
                exit(1);
        }
        
        /* Release command queue */
        ret = clReleaseCommandQueue(command_queue);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseCommandQueue' failed\n");
                exit(1);
        }
        
        /* Release context */
        ret = clReleaseContext(context);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseContext' failed\n");
                exit(1);
        }
                
        return 0;
}
Example #28
0
int 
exec_trig_kernel(const char *program_source, 
                 int n, void *srcA, void *dst) 
{ 
  cl_context  context; 
  cl_command_queue cmd_queue; 
  cl_device_id  *devices; 
  cl_program  program; 
  cl_kernel  kernel; 
  cl_mem       memobjs[2]; 
  size_t       global_work_size[1]; 
  size_t       local_work_size[1]; 
  size_t       cb; 
  cl_int       err; 

  float c = 7.3f; // a scalar number to test non-pointer args
 
  // create the OpenCL context on a GPU device 
  context = poclu_create_any_context();
  if (context == (cl_context)0) 
    return -1; 
 
  // get the list of GPU devices associated with context 
  clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb); 
  devices = (cl_device_id *) malloc(cb); 
  clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, devices, NULL); 
 
  // create a command-queue 
  cmd_queue = clCreateCommandQueue(context, devices[0], 0, NULL); 
  if (cmd_queue == (cl_command_queue)0) 
    { 
      clReleaseContext(context); 
      free(devices); 
      return -1; 
    } 
  free(devices); 
 
  // allocate the buffer memory objects 
  memobjs[0] = clCreateBuffer(context, 
                              CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
                              sizeof(cl_float4) * n, srcA, NULL); 
  if (memobjs[0] == (cl_mem)0) 
    { 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  memobjs[1] = clCreateBuffer(context, 
			      CL_MEM_READ_WRITE, 
			      sizeof(cl_float4) * n, NULL, NULL); 
  if (memobjs[1] == (cl_mem)0) 
    { 
      delete_memobjs(memobjs, 1); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // create the program 
  program = clCreateProgramWithSource(context, 
				      1, (const char**)&program_source, NULL, NULL); 
  if (program == (cl_program)0) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // build the program 
  err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); 
  if (err != CL_SUCCESS) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // create the kernel 
  kernel = clCreateKernel(program, "trig", NULL); 
  if (kernel == (cl_kernel)0) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // set the args values 
  err = clSetKernelArg(kernel,  0,  
		       sizeof(cl_mem), (void *) &memobjs[0]); 
  err |= clSetKernelArg(kernel, 1,
			sizeof(cl_mem), (void *) &memobjs[1]); 
  err |= clSetKernelArg(kernel, 2,
			sizeof(float), (void *) &c); 
 
  if (err != CL_SUCCESS) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseKernel(kernel); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // set work-item dimensions 
  global_work_size[0] = n; 
  local_work_size[0]= 2; 
 
  // execute kernel 
  err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, 
			       global_work_size, local_work_size,  
			       0, NULL, NULL); 
  if (err != CL_SUCCESS) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseKernel(kernel); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // read output image 
  err = clEnqueueReadBuffer(cmd_queue, memobjs[1], CL_TRUE, 
			    0, n * sizeof(cl_float4), dst, 
			    0, NULL, NULL); 
  if (err != CL_SUCCESS) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseKernel(kernel); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // release kernel, program, and memory objects 
  delete_memobjs(memobjs, 2); 
  clReleaseKernel(kernel); 
  clReleaseProgram(program); 
  clReleaseCommandQueue(cmd_queue); 
  clReleaseContext(context); 
  return 0; // success... 
}
Example #29
0
b2CLNarrowPhase::b2CLNarrowPhase()
{
#if defined(NARROWPHASE_OPENCL)
	if (b2clGlobal_OpenCLSupported)
	{
		printf("Initializing b2CLNarrowPhase...\n");
    
		int err;
    
		//load opencl programs from files
		char* narrowPhaseKernelSource = 0;
		size_t narrowPhaseKernelSourceLen = 0;

		shrLog("...loading b2CLNarrowPhase.cl\n");
	#if defined(SCAN_OPENCL)
    #ifdef linux
    	narrowPhaseKernelSource = b2clLoadProgSource(shrFindFilePath("/opt/usr/apps/com.samsung.browser/include/Box2D/Common/OpenCL/b2CLNarrowPhase.cl", NULL), "// My comment\n", &narrowPhaseKernelSourceLen);
	#elif defined (_WIN32)   
		narrowPhaseKernelSource = b2clLoadProgSource(shrFindFilePath("../../Box2D/Common/OpenCL/b2CLNarrowPhase.cl", NULL), "// My comment\n", &narrowPhaseKernelSourceLen);
	#else
		narrowPhaseKernelSource = b2clLoadProgSource(shrFindFilePath("/usr/local/include/Box2D/Common/OpenCL/b2CLNarrowPhase.cl", NULL), "// My comment\n", &narrowPhaseKernelSourceLen);
	#endif
	#else
    #ifdef linux
    	narrowPhaseKernelSource = b2clLoadProgSource(shrFindFilePath("/opt/usr/apps/com.samsung.browser/include/Box2D/Common/OpenCL/b2CLNarrowPhase_Alone.cl", NULL), "// My comment\n", &narrowPhaseKernelSourceLen);
	#elif defined (_WIN32)
		narrowPhaseKernelSource = b2clLoadProgSource(shrFindFilePath("../../Box2D/Common/OpenCL/b2CLNarrowPhase_Alone.cl", NULL), "// My comment\n", &narrowPhaseKernelSourceLen);
	#else
		narrowPhaseKernelSource = b2clLoadProgSource(shrFindFilePath("/usr/local/include/Box2D/Common/OpenCL/b2CLNarrowPhase_Alone.cl", NULL), "// My comment\n", &narrowPhaseKernelSourceLen);
	#endif
	#endif
		if(narrowPhaseKernelSource == NULL)
		{
			b2Log("Could not load program source, is path 'b2CLNarrowPhase.cl' correct?");
		}

		//create the compute program from source kernel code
		narrowPhaseProgram = clCreateProgramWithSource(b2CLDevice::instance().GetContext(), 1, (const char**)&narrowPhaseKernelSource, NULL, &err);
		if (!narrowPhaseProgram)
		{
			printf("Error: Failed to create compute program!\n");
			exit(1);
		}
    
		//build the program
		err = clBuildProgram(narrowPhaseProgram, 0, NULL, OPENCL_BUILD_PATH, NULL, NULL);
		if (err != CL_SUCCESS)
		{
			size_t len;
			char buffer[204800];
        
			printf("Error: Failed to build program executable!\n");
			clGetProgramBuildInfo(narrowPhaseProgram, b2CLDevice::instance().GetCurrentDevice(), CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
			printf("%s\n", buffer);
			exit(1);
		}
    
		//create the compute kernel
		collidePolygonsKernel = clCreateKernel(narrowPhaseProgram, "b2clCollidePolygons", &err);
		if (!collidePolygonsKernel || err != CL_SUCCESS)
		{
			printf("Error: Failed to create compute kernel!\n");
			exit(1);
		}
		collideCirclesKernel = clCreateKernel(narrowPhaseProgram, "b2clCollideCircles", &err);
		if (!collideCirclesKernel || err != CL_SUCCESS)
		{
			printf("Error: Failed to create compute kernel!\n");
			exit(1);
		}
		collidePolygonAndCircleKernel = clCreateKernel(narrowPhaseProgram, "b2clCollidePolygonAndCircle", &err);
		if (!collidePolygonAndCircleKernel || err != CL_SUCCESS)
		{
			printf("Error: Failed to create compute kernel!\n");
			exit(1);
		}
		collideEdgeAndCircleKernel = clCreateKernel(narrowPhaseProgram, "b2clCollideEdgeAndCircle", &err);
		if (!collideEdgeAndCircleKernel || err != CL_SUCCESS)
		{
			printf("Error: Failed to create compute kernel!\n");
			exit(1);
		}
		collideEdgeAndPolygonKernel = clCreateKernel(narrowPhaseProgram, "b2clCollideEdgeAndPolygon", &err);
		if (!collideEdgeAndPolygonKernel || err != CL_SUCCESS)
		{
			printf("Error: Failed to create compute kernel!\n");
			exit(1);
		}

		b2CLDevice::instance().getMaximumKernelWorkGroupSize(collidePolygonsKernel, kernel_work_group_size);
		//clGetKernelWorkGroupInfo(collidePolygonsKernel, b2CLDevice::instance().GetCurrentDevice(), CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &kernel_preferred_work_group_size_multiple, NULL);

		//create the compute kernel
		compactForOneContactKernel = clCreateKernel(narrowPhaseProgram, "b2clCompactForOneContact", &err);
		if (!compactForOneContactKernel || err != CL_SUCCESS)
		{
			printf("Error: Failed to create compute kernel!\n");
			exit(1);
		}

		/*old_xf_num = */old_contact_num = 0;
		xfListData = NULL;
		b2CLCommonData::instance().manifoldListData = NULL;
		globalIndicesData = pairIndicesData = NULL;
	#if !defined(BROADPHASE_OPENCL)
		b2CLCommonData::instance().manifoldListBuffers[0] = 
			b2CLCommonData::instance().manifoldListBuffers[1] =
			b2CLCommonData::instance().pairIndicesBuffer = 
	#endif
	#if defined(SCAN_OPENCL)
			b2CLCommonData::instance().manifoldBinaryBitListBuffer = 
			b2CLCommonData::instance().validContactIndicesBuffer = 
	#endif
			NULL;
		
		b2CLScan::instance(); // initializing b2CLScan
	}
#endif
	b2CLCommonData::instance().currentManifoldBuffer = 0;
	manifold_nums[0] = 0;
	manifold_nums[1] = 0;
}
int main(int argc, char** argv)
{
    int          rank, size;         // MPI rank & size
    int          err;                // error code returned from OpenCL calls
    float        h_a[LENGTH];        // a vector
    float        h_b[LENGTH];        // b vector
    float        h_c[LENGTH];        // c vector (a+b) returned from the compute device (local per task)
    float        _h_c[LENGTH];       // c vector (a+b) returned from the compute device (global for master)
    unsigned int correct;            // number of correct results

    size_t global;                   // global domain size
    size_t local;                    // local  domain size

    cl_device_id     device_id;      // compute device id
    cl_context       context;        // compute context
    cl_command_queue commands;       // compute command queue
    cl_program       program;        // compute program
    cl_kernel        ko_vadd;        // compute kernel

    cl_mem d_a;                      // device memory used for the input  a vector
    cl_mem d_b;                      // device memory used for the input  b vector
    cl_mem d_c;                      // device memory used for the output c vector

    int mycount, i;

    err = MPI_Init (&argc, &argv);

    if (err != MPI_SUCCESS)
    {
        printf ("MPI_Init failed!\n");
        exit (-1);
    }

    err = MPI_Comm_rank (MPI_COMM_WORLD, &rank);
    if (err != MPI_SUCCESS)
    {
        printf ("MPI_Comm_rank failed!\n");
        exit (-1);
    }

    err = MPI_Comm_size (MPI_COMM_WORLD, &size);
    if (err != MPI_SUCCESS)
    {
        printf ("MPI_Comm_size failed\n");
        exit (-1);
    }

    if (LENGTH % size != 0)
    {
        printf ("Number of MPI processes must divide LENGTH (%d)\n", LENGTH);
        exit (-1);
    }

    mycount = LENGTH / size;

    if (rank == 0)
    {
        for (i = 0; i < LENGTH; i++)
        {
            h_a[i] = rand() / (float)RAND_MAX;
            h_b[i] = rand() / (float)RAND_MAX;
            h_a[i] = i;
            h_b[i] = i*2;
        }
        err = MPI_Bcast (h_a, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD);
        if (err != MPI_SUCCESS)
        {
            printf ("MPI_Bcast failed transferring h_a\n");
            exit (-1);
        }
        err = MPI_Bcast (h_b, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD);
        if (err != MPI_SUCCESS)
        {
            printf ("MPI_Bcast failed transferring h_b\n");
            exit (-1);
        }
    }
    else
    {
        err = MPI_Bcast (h_a, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD);
        if (err != MPI_SUCCESS)
        {
            printf ("MPI_Bcast failed receiving h_a\n");
            exit (-1);
        }
        err = MPI_Bcast (h_b, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD);
        if (err != MPI_SUCCESS)
        {
            printf ("MPI_Bcast failed receiving h_b\n");
            exit (-1);
        }
    }

    // Set up platform
    cl_uint numPlatforms;

    // Find number of platforms
    err = clGetPlatformIDs(0, NULL, &numPlatforms);
    if (err != CL_SUCCESS || numPlatforms <= 0)
    {
        printf("Error: Failed to find a platform!\n");
        return EXIT_FAILURE;
    }

    // Get all platforms
    cl_platform_id Platform[numPlatforms];
    err = clGetPlatformIDs(numPlatforms, Platform, NULL);
    if (err != CL_SUCCESS || numPlatforms <= 0)
    {
        printf("Error: Failed to get the platform!\n");
        return EXIT_FAILURE;
    }

    // Secure a GPU
    for (i = 0; i < numPlatforms; i++)
    {
        err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL);
        if (err == CL_SUCCESS)
            break;
    }

    if (device_id == NULL)
    {
        printf("Error: Failed to create a device group!\n");
        return EXIT_FAILURE;
    }
    else
    {
        if (output_device_info (rank, device_id) != CL_SUCCESS)
            return EXIT_FAILURE;
    }

    // Create a compute context
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        return EXIT_FAILURE;
    }

    // Create a command queue
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        return EXIT_FAILURE;
    }

    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
    if (!program)
    {
        printf("Error: Failed to create compute program!\n");
        return EXIT_FAILURE;
    }

    // Build the program
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        exit(1);
    }

    // Create the compute kernel from the program
    ko_vadd = clCreateKernel(program, "vadd", &err);
    if (!ko_vadd || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        exit(1);
    }

    // Create the input (a, b) and output (c) arrays in device memory
    d_a = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * mycount, NULL, NULL);
    d_b = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * mycount, NULL, NULL);
    d_c = clCreateBuffer(context,  CL_MEM_WRITE_ONLY, sizeof(float) * mycount, NULL, NULL);
    if (!d_a || !d_b || !d_c)
    {
        printf("Error: Failed to allocate device memory!\n");
        exit(1);
    }

    // Write a and b vectors into compute device memory
    err = clEnqueueWriteBuffer(commands, d_a, CL_TRUE, 0, sizeof(float) * mycount, &h_a[rank*mycount], 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write h_a to source array!\n");
        exit(1);
    }

    err = clEnqueueWriteBuffer(commands, d_b, CL_TRUE, 0, sizeof(float) * mycount, &h_b[rank*mycount], 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write h_b to source array!\n");
        exit(1);
    }

    // Set the arguments to our compute kernel
    err  = clSetKernelArg(ko_vadd, 0, sizeof(cl_mem), &d_a);
    err |= clSetKernelArg(ko_vadd, 1, sizeof(cl_mem), &d_b);
    err |= clSetKernelArg(ko_vadd, 2, sizeof(cl_mem), &d_c);
    err |= clSetKernelArg(ko_vadd, 3, sizeof(unsigned int), &mycount);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel arguments! %d\n", err);
        exit(1);
    }

    // Get the maximum work group size for executing the kernel on the device
    err = clGetKernelWorkGroupInfo(ko_vadd, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve kernel work group info! %d\n", err);
        exit(1);
    }

    // Execute the kernel over the entire range of our 1d input data set
    // using the maximum number of work group items for this device
    global = LENGTH;
    err = clEnqueueNDRangeKernel(commands, ko_vadd, 1, NULL, &global, &local, 0, NULL, NULL);
    if (err)
    {
        printf("Error: Failed to execute kernel!\n");
        return EXIT_FAILURE;
    }

    // Wait for the commands to complete before reading back results
    clFinish(commands);

    // Read back the results from the compute device
    err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * mycount, &h_c, 0, NULL, NULL );
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to read output array! %d\n", err);
        exit(1);
    }

    err = MPI_Gather (h_c, mycount, MPI_FLOAT, _h_c, mycount, MPI_FLOAT, 0, MPI_COMM_WORLD);
    if (err != MPI_SUCCESS)
    {
        printf ("MPI_Gather failed receiving h_c\n");
        exit (-1);
    }

    if (rank == 0)
    {
        // Test the results
        correct = 0;
        float tmp;

        for(i = 0; i < LENGTH; i++)
        {
            tmp = h_a[i] + h_b[i];     // assign element i of a+b to tmp
            tmp -= _h_c[i];             // compute deviation of expected and output result
            if(tmp*tmp < TOL*TOL)      // correct if square deviation is less than tolerance squared
                correct++;
            else
                printf(" tmp %f h_a %f h_b %f h_c %f \n",tmp, h_a[i], h_b[i], _h_c[i]);
        }

        // summarize results
        printf("C = A+B:  %d out of %d results were correct.\n", correct, LENGTH);
    }

    // cleanup then shutdown
    clReleaseMemObject(d_a);
    clReleaseMemObject(d_b);
    clReleaseMemObject(d_c);
    clReleaseProgram(program);
    clReleaseKernel(ko_vadd);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    err = MPI_Finalize ();
    if (err != MPI_SUCCESS)
    {
        printf ("MPI_Finalize failed!\n");
        exit (-1);
    }

    return 0;
}