Example #1
2
int main(int argc, char** argv)
{
  int err;                            // error code returned from api calls
  cl_platform_id platform_id;         // platform id
  cl_device_id device_id;             // compute device id 
  cl_context context;                 // compute context
  cl_command_queue commands;          // compute command queue
  cl_program program;                 // compute program
  cl_kernel kernel;                   // compute kernel

  size_t global[2];                   // global domain size for our calculation
  size_t local[2];                    // local domain size for our calculation

  char cl_platform_vendor[1001];
  char cl_platform_name[1001];
   

  cl_mem in_array;                     // device memory used for the input array
  //cl_mem synaptic_weights;             // device memory used for the input array
  cl_mem out_array;                    // device memory used for the output array
   
  if (argc != 2){
    printf("%s <inputfile>\n", argv[0]);
    return -1;
  }

	//float in_array[NO_NODES];
	//float out_array[NO_NODES];
	//float synaptic_weights[NO_NODES*NO_NODES];
	float in_array_tb[NO_NODES];
	float out_array_tb[NO_NODES];
	//float synaptic_weights_tb[NO_NODES*NO_NODES];
	float temp =0;
	int i = 0;
    	int j = 0;
	int index = 0;
	FILE* ifp;
	char* mode = "r";
  //
  // Connect to first platform
  //
  err = clGetPlatformIDs(1,&platform_id,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to find an OpenCL platform!\n");
    printf("Test failed\n");
    return -1;
  }
  err = clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n");
    printf("Test failed\n");
    return -1;
  }
  printf("CL_PLATFORM_VENDOR %s\n",cl_platform_vendor);
  err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n");
    printf("Test failed\n");
    return -1;
  }
  printf("CL_PLATFORM_NAME %s\n",cl_platform_name);
 
  // Connect to a compute device
  //
  int fpga = 0;
#if defined (FPGA_DEVICE)
  fpga = 1;
#endif
  err = clGetDeviceIDs(platform_id, fpga ? CL_DEVICE_TYPE_ACCELERATOR : CL_DEVICE_TYPE_CPU,
                       1, &device_id, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to create a device group!\n");
    printf("Test failed\n");
    return -1;
  }
  
  //
  // Create a compute context 
  //
  context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
  if (!context)
  {
    printf("Error: Failed to create a compute context!\n");
    printf("Test failed\n");
    return -1;
  }

  //relu_1(in_array,synaptic_weights,out_array);


  // Fill our data sets with pattern
  //
  //int i = 0;
  //for(i = 0; i < DATA_SIZE; i++) {
  //  a[i] = (int)i;
  //  b[i] = (int)i;
  //  results[i] = 0;
  //}
  //
  
  
  // Create a command commands
  commands = clCreateCommandQueue(context, device_id, 0, &err);
  if (!commands)
  {
    printf("Error: Failed to create a command commands!\n");
    printf("Error: code %i\n",err);
    printf("Test failed\n");
    return -1;
  }

  int status;

  // Create Program Objects
  //
  
  // Load binary from disk
  unsigned char *kernelbinary;
  char *xclbin=argv[1];
  printf("loading %s\n", xclbin);
  int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary);
  if (n_i < 0) {
    printf("failed to load kernel from xclbin: %s\n", xclbin);
    printf("Test failed\n");
    return -1;
  }
  size_t n = n_i;
  // Create the compute program from offline
  program = clCreateProgramWithBinary(context, 1, &device_id, &n,
                                      (const unsigned char **) &kernelbinary, &status, &err);
  if ((!program) || (err!=CL_SUCCESS)) {
    printf("Error: Failed to create compute program from binary %d!\n", err);
    printf("Test failed\n");
    printf("err : %d %s\n",err,err);
  }

  // Build the program executable
  //
  err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    size_t len;
    char buffer[2048];

    printf("Error: Failed to build program executable!\n");
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
    printf("%s\n", buffer);
    printf("Test failed\n");
    return -1;
  }

  // Create the compute kernel in the program we wish to run
  //
  kernel = clCreateKernel(program, "relu_1", &err);
  if (!kernel || err != CL_SUCCESS)
  {
    printf("Error: Failed to create compute kernel!\n");
    printf("Test failed\n");
    return -1;
  }

  // Create the input and output arrays in device memory for our calculation
  //
  in_array = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * NO_NODES, NULL, NULL);
  //synaptic_weights = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * NO_NODES * NO_NODES, NULL, NULL);
  out_array = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * NO_NODES, NULL, NULL);
  if (!in_array || /*!synaptic_weights ||*/ !out_array)
  {
    printf("Error: Failed to allocate device memory!\n");
    printf("Test failed\n");
    return -1;
  }    
    
	ifp = fopen("/home/agandhi92/sdaccel/relu_1/input.txt",mode);

	if(ifp == NULL)
	{
		printf("Input file not found \n");
  		return -1;
	}
	while (fscanf(ifp, "%f", &temp) != EOF && index < NO_NODES) {

		in_array_tb[index++] = temp;
	}
	index = 0;
	temp = 0;

	//ifp = fopen("/home/agandhi92/sdaccel/relu_1/weight.txt",mode);
	//if(ifp == NULL)
	//{
	//	printf("Weight file not found \n");
  	//	return -1;
	//}
	//while (fscanf(ifp, "%f", &temp) != EOF && index < (NO_NODES*NO_NODES)) {
	//	synaptic_weights_tb[index++] = temp;
	//}
   
  //
  // Write our data set into the input array in device memory 
  //
  err = clEnqueueWriteBuffer(commands, in_array, CL_TRUE, 0, sizeof(float) * NO_NODES, in_array_tb, 0, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to write to source array a!\n");
    printf("Test failed\n");
    return -1;
  }

  // Write our data set into the input array in device memory 
  //
  //err = clEnqueueWriteBuffer(commands, synaptic_weights, CL_TRUE, 0, sizeof(float) *  NO_NODES *  NO_NODES, synaptic_weights_tb, 0, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to write to source array b!\n");
    printf("Test failed\n");
    return -1;
  }
    
  // Set the arguments to our compute kernel
  //
  err = 0;
  err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &in_array);
  //err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &synaptic_weights);
  err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_array);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to set kernel arguments! %d\n", err);
    printf("Test failed\n");
    return -1;
  }

  // Execute the kernel over the entire range of our 1d input data set
  // using the maximum number of work group items for this device
  //

  err = clEnqueueTask(commands, kernel, 0, NULL, NULL);

  if (err)
  {
    printf("Error: Failed to execute kernel! %d\n", err);
    printf("Test failed\n");
    return -1;
  }

  // Read back the results from the device to verify the output
  //
  cl_event readevent;
  err = clEnqueueReadBuffer( commands, out_array, CL_TRUE, 0, sizeof(float) * NO_NODES, out_array_tb, 0, NULL, &readevent );  
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to read output array! %d\n", err);
    printf("Test failed\n");
    return -1;
  }

  clWaitForEvents(1, &readevent);
    
  //printf("A\n");
  //for (i=0;i<DATA_SIZE;i++) {
  //  printf("%x ",a[i]);
  //  if (((i+1) % 16) == 0)
  //    printf("\n");
  //}
  //printf("B\n");
  //for (i=0;i<DATA_SIZE;i++) {
  //  printf("%x ",b[i]);
  //  if (((i+1) % 16) == 0)
  //    printf("\n");
  //}
  //printf("res\n");
  //for (i=0;i<DATA_SIZE;i++) {
  //  printf("%x ",results[i]);
  //  if (((i+1) % 16) == 0)
  //    printf("\n");
  //}
    
  // Validate our results
  //
  //correct = 0;
  //for(i = 0; i < DATA_SIZE; i++)
  //{
  //  int row = i/MATRIX_RANK;
  //  int col = i%MATRIX_RANK;
  //  int running = 0;
  //  int index;
  //  for (index=0;index<MATRIX_RANK;index++) {
  //    int aIndex = row*MATRIX_RANK + index;
  //    int bIndex = col + index*MATRIX_RANK;
  //    running += a[aIndex] * b[bIndex];
  //  }
  //  sw_results[i] = running;
  //}
  //  
  //for (i = 0;i < DATA_SIZE; i++) 
  //  if(results[i] == sw_results[i])
  //    correct++;
  //printf("Software\n");
  //for (i=0;i<DATA_SIZE;i++) {
  //  //printf("%0.2f ",sw_results[i]);
  //  printf("%d ",sw_results[i]);
  //  if (((i+1) % 16) == 0)
  //    printf("\n");
  //}
  //  
  //  
  //// Print a brief summary detailing the results
  ////
  //printf("Computed '%d/%d' correct values!\n", correct, DATA_SIZE);
  //  
        
  // Shutdown and cleanup
	int temp_ = 0;


 for (j = 0; j < NO_NODES; j++)
 {
 	if (out_array_tb[j] >= 0) // || out_array_tb[j]== 0)
 	{
 		//printf("out_array[%d] = %f \n", j, out_array[j]);
 		temp_++;
 	}
 }


  clReleaseMemObject(in_array);
  //clReleaseMemObject(synaptic_weights);
  clReleaseMemObject(out_array);
  clReleaseProgram(program);
  clReleaseKernel(kernel);
  clReleaseCommandQueue(commands);
  clReleaseContext(context);

	if (temp_ == NO_NODES)
	{
		printf("*********************************************************** \n");
		printf("TEST PASSED !!!!!! The output matches the desired output. \n");
		printf("*********************************************************** \n");
		return EXIT_SUCCESS;
	}
	else
	{
		printf("**************************************************************** \n");
		printf("TEST Failed !!!!!! The output does not match the desired output. \n");
		printf("**************************************************************** \n");
		return -1;
	}

  //if(correct == DATA_SIZE){
  //  printf("Test passed!\n");
  //  return EXIT_SUCCESS;
  //}
  //else{
  //  printf("Test failed\n");
  //  return -1;
  //}
}
Example #2
0
cl_uint SetKernelArguments()
{
    cl_int err = CL_SUCCESS;

    err = clSetKernelArg(ocl.kernel, 0, sizeof(cl_mem), (void *)&ocl.Lights);
    if (CL_SUCCESS != err)
    {
        printf("error: Failed to set argument Lights, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    err = clSetKernelArg(ocl.kernel, 1, sizeof(cl_uint), (void *)&ocl.LightCount);
    if (CL_SUCCESS != err)
    {
        printf("Error: Failed to set argument LightCount, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    err = clSetKernelArg(ocl.kernel, 2, sizeof(cl_mem), (void *)&ocl.Shapes);
    if (CL_SUCCESS != err)
    {
        printf("error: Failed to set argument Shapes, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    err = clSetKernelArg(ocl.kernel, 3, sizeof(cl_uint), (void *)&ocl.ShapeCount);
    if (CL_SUCCESS != err)
    {
        printf("Error: Failed to set argument ShapeCount, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    err = clSetKernelArg(ocl.kernel, 4, sizeof(cl_uint), (void *)&ocl.sampleCount);
    if (CL_SUCCESS != err)
    {
        printf("Error: Failed to set argument ShapeCount, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    err = clSetKernelArg(ocl.kernel, 5, sizeof(cl_uint), (void *)&ocl.width);
    if (CL_SUCCESS != err)
    {
        printf("Error: Failed to set argument ShapeCount, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    err = clSetKernelArg(ocl.kernel, 6, sizeof(cl_uint), (void *)&ocl.height);
    if (CL_SUCCESS != err)
    {
        printf("Error: Failed to set argument ShapeCount, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    err = clSetKernelArg(ocl.kernel, 7, sizeof(cl_mem), (void *)&ocl.cam);
    if (CL_SUCCESS != err)
    {
        printf("Error: Failed to set argument ShapeCount, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    return err;
}
Example #3
0
int lcs::GPUExclusiveScanForInt(int workGroupSize, int numOfBanks,
				cl_kernel scanKernel, cl_kernel reverseUpdateKernel, cl_mem d_arr, cl_int length,
				cl_command_queue commandQueue) {
	cl_int err;

	// Get the work group size
	size_t localWorkSize = workGroupSize;

	// Up-sweep and down-sweep	
	clSetKernelArg(scanKernel, 0, sizeof(cl_mem), &d_arr);
	clSetKernelArg(scanKernel, 1, sizeof(cl_int), &length);
	clSetKernelArg(scanKernel, 3, sizeof(cl_int) * (workGroupSize * 2 + workGroupSize * 2 / numOfBanks + 1), NULL);

	static int records[10];
	
	int problemSize = length;
	int numOfRecords = 0;

	cl_int d_step = 1;

	/// DEBUG ///
	printf("length = %d\n", length);

	for (; problemSize > 1; problemSize = (problemSize - 1) / (localWorkSize * 2) + 1) {
		if (numOfRecords) d_step *= localWorkSize * 2;
		records[numOfRecords++] = problemSize;
		clSetKernelArg(scanKernel, 2, sizeof(cl_int), &d_step);

		size_t globalWorkSize = ((problemSize - 1) / (localWorkSize * 2) + 1) * localWorkSize;

		err = clEnqueueNDRangeKernel(commandQueue, scanKernel, 1, NULL, &globalWorkSize, &localWorkSize,
					     0, NULL, NULL);
		if (err) lcs::Error("Fail to enqueue scan");

		/// DEBUG ///
		err = clFinish(commandQueue);

		printf("err = %d\n", err);

		if (err) lcs::Error("Non-zero err in pre-scan");
	}

	int zero = 0, sum;
	err = clEnqueueReadBuffer(commandQueue, d_arr, CL_TRUE, 0, sizeof(int), &sum, 0, NULL, NULL);
	if (err) lcs::Error("Fail to read d_arr[0]");

	err = clEnqueueWriteBuffer(commandQueue, d_arr, CL_TRUE, 0, sizeof(int), &zero, 0, NULL, NULL);
	if (err) lcs::Error("Fail to clean d_arr[0]");

	// Reverse updates
	clSetKernelArg(reverseUpdateKernel, 0, sizeof(cl_mem), &d_arr);
	clSetKernelArg(reverseUpdateKernel, 1, sizeof(cl_int), &length);

	size_t globalWorkSize;

	for (int i = numOfRecords - 1; i >= 0; i--, d_step /= localWorkSize * 2) {
		clSetKernelArg(reverseUpdateKernel, 2, sizeof(cl_int), &d_step);
		globalWorkSize = ((records[i] - 1) / (localWorkSize * 2) + 1) * localWorkSize;
		err = clEnqueueNDRangeKernel(commandQueue, reverseUpdateKernel, 1, NULL, &globalWorkSize, &localWorkSize,
					     0, NULL, NULL);
		if (err) lcs::Error("Fail to enqueue scan");
		clFinish(commandQueue);
	}

	return sum;
}
int main(int argc, char const *argv[])
{
        /* Get platform */
        cl_platform_id platform;
        cl_uint num_platforms;
        cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformIDs' failed\n");
                exit(1);
        }
        
        printf("Number of platforms: %d\n", num_platforms);
        printf("platform=%p\n", platform);
        
        /* Get platform name */
        char platform_name[100];
        ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformInfo' failed\n");
                exit(1);
        }
        
        printf("platform.name='%s'\n\n", platform_name);
        
        /* Get device */
        cl_device_id device;
        cl_uint num_devices;
        ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceIDs' failed\n");
                exit(1);
        }
        
        printf("Number of devices: %d\n", num_devices);
        printf("device=%p\n", device);
        
        /* Get device name */
        char device_name[100];
        ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name),
        device_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceInfo' failed\n");
                exit(1);
        }
        
        printf("device.name='%s'\n", device_name);
        printf("\n");
        
        /* Create a Context Object */
        cl_context context;
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateContext' failed\n");
                exit(1);
        }
        
        printf("context=%p\n", context);
        
        /* Create a Command Queue Object*/
        cl_command_queue command_queue;
        command_queue = clCreateCommandQueue(context, device, 0, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateCommandQueue' failed\n");
                exit(1);
        }
        
        printf("command_queue=%p\n", command_queue);
        printf("\n");

        /* Program source */
        unsigned char *source_code;
        size_t source_length;

        /* Read program from 'sub_sat_short16short16.cl' */
        source_code = read_buffer("sub_sat_short16short16.cl", &source_length);

        /* Create a program */
        cl_program program;
        program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_length, &ret);

        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateProgramWithSource' failed\n");
                exit(1);
        }
        printf("program=%p\n", program);

        /* Build program */
        ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
        if (ret != CL_SUCCESS )
        {
                size_t size;
                char *log;

                /* Get log size */
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size);

                /* Allocate log and print */
                log = malloc(size);
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL);
                printf("error: call to 'clBuildProgram' failed:\n%s\n", log);
                
                /* Free log and exit */
                free(log);
                exit(1);
        }

        printf("program built\n");
        printf("\n");
        
        /* Create a Kernel Object */
        cl_kernel kernel;
        kernel = clCreateKernel(program, "sub_sat_short16short16", &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateKernel' failed\n");
                exit(1);
        }
        
        /* Create and allocate host buffers */
        size_t num_elem = 10;
        
        /* Create and init host side src buffer 0 */
        cl_short16 *src_0_host_buffer;
        src_0_host_buffer = malloc(num_elem * sizeof(cl_short16));
        for (int i = 0; i < num_elem; i++)
                src_0_host_buffer[i] = (cl_short16){{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}};
        
        /* Create and init device side src buffer 0 */
        cl_mem src_0_device_buffer;
        src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_short16), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create source buffer\n");
                exit(1);
        }        
        ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_short16), src_0_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueWriteBuffer' failed\n");
                exit(1);
        }

        /* Create and init host side src buffer 1 */
        cl_short16 *src_1_host_buffer;
        src_1_host_buffer = malloc(num_elem * sizeof(cl_short16));
        for (int i = 0; i < num_elem; i++)
                src_1_host_buffer[i] = (cl_short16){{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}};
        
        /* Create and init device side src buffer 1 */
        cl_mem src_1_device_buffer;
        src_1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_short16), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create source buffer\n");
                exit(1);
        }        
        ret = clEnqueueWriteBuffer(command_queue, src_1_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_short16), src_1_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueWriteBuffer' failed\n");
                exit(1);
        }

        /* Create host dst buffer */
        cl_short16 *dst_host_buffer;
        dst_host_buffer = malloc(num_elem * sizeof(cl_short16));
        memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_short16));

        /* Create device dst buffer */
        cl_mem dst_device_buffer;
        dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_short16), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create dst buffer\n");
                exit(1);
        }
        
        /* Set kernel arguments */
        ret = CL_SUCCESS;
        ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer);
        ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src_1_device_buffer);
        ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clSetKernelArg' failed\n");
                exit(1);
        }

        /* Launch the kernel */
        size_t global_work_size = num_elem;
        size_t local_work_size = num_elem;
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueNDRangeKernel' failed\n");
                exit(1);
        }

        /* Wait for it to finish */
        clFinish(command_queue);

        /* Read results from GPU */
        ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_short16), dst_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueReadBuffer' failed\n");
                exit(1);
        }

        /* Dump dst buffer to file */
        char dump_file[100];
        sprintf((char *)&dump_file, "%s.result", argv[0]);
        write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_short16));
        printf("Result dumped to %s\n", dump_file);
        /* Free host dst buffer */
        free(dst_host_buffer);

        /* Free device dst buffer */
        ret = clReleaseMemObject(dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }
        
        /* Free host side src buffer 0 */
        free(src_0_host_buffer);

        /* Free device side src buffer 0 */
        ret = clReleaseMemObject(src_0_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }

        /* Free host side src buffer 1 */
        free(src_1_host_buffer);

        /* Free device side src buffer 1 */
        ret = clReleaseMemObject(src_1_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }

        /* Release kernel */
        ret = clReleaseKernel(kernel);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseKernel' failed\n");
                exit(1);
        }

        /* Release program */
        ret = clReleaseProgram(program);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseProgram' failed\n");
                exit(1);
        }
        
        /* Release command queue */
        ret = clReleaseCommandQueue(command_queue);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseCommandQueue' failed\n");
                exit(1);
        }
        
        /* Release context */
        ret = clReleaseContext(context);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseContext' failed\n");
                exit(1);
        }
                
        return 0;
}
Example #5
0
void sum_gpu(long long *in, long long *out, unsigned int n)
{
	size_t global_size;
	size_t local_size;

	char *kernel_src;

	cl_int err;
	cl_platform_id platform_id;
	cl_device_id device_id;
	cl_uint max_compute_units;
	size_t max_workgroup_size;

	cl_context context;
	cl_command_queue commands;
	cl_program program;
	cl_kernel kernel;
	cl_mem d_array;

	cl_event event;
	cl_ulong start, end;

	/* start OpenCL */
	err = clGetPlatformIDs(1, &platform_id,NULL);
	clErrorHandling("clGetPlatformIDs");

	err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
	clErrorHandling("clGetDeviceIDs");

	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	clErrorHandling("clCreateContext");

	commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
	clErrorHandling("clCreateCommandQueue");

	/* create kernel */
	kernel_src = file_to_string(KERNEL_SRC);
	program = clCreateProgramWithSource(context, 1, (const char**) &kernel_src, NULL, &err);
	free(kernel_src);
	clErrorHandling("clCreateProgramWithSource");

	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	clErrorHandling("clBuildProgram");

	kernel = clCreateKernel(program, "matrix_mult", &err);
	clErrorHandling("clCreateKernel");

	/* allocate memory and send to gpu */
	d_array = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long long) * n, NULL, &err);
	clErrorHandling("clCreateBuffer");

	err = clEnqueueWriteBuffer(commands, d_array, CL_TRUE, 0, sizeof(long long) * n, in, 0, NULL, NULL);
	clErrorHandling("clEnqueueWriteBuffer");

	err  = clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL);
	err |= clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_workgroup_size, NULL);
	clErrorHandling("clGetDeviceInfo");

	/* prepare kernel args */
	err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_array);
	err |= clSetKernelArg(kernel, 1, sizeof(unsigned int), &n);

	/* execute */
	local_size = n / max_compute_units / 8;
	if (local_size > max_workgroup_size)
		local_size = max_workgroup_size;

	/*
	 *	Usually it would be
	 *	global_size = local_size * max_compute_units;
	 *	but that would only be valid if local_size = n / max_compute_units;
	 *	local_size is n / max_compute_units / 8 because it obtains its hightest performance.
	 */
	for (global_size = local_size; global_size < n; global_size += local_size);

	err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global_size, &local_size, 0, NULL, &event);
	clErrorHandling("clEnqueueNDRangeKernel");

	clWaitForEvents(1, &event);
	clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
	clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
	fprintf(stderr, "Time for event (ms): %10.5f \n", (end - start) / 1000000.0);

	err = clFinish(commands);
	clErrorHandling("clFinish");

	/* transfer back */
	err = clEnqueueReadBuffer(commands, d_array, CL_TRUE, 0, sizeof(long long), out, 0, NULL, NULL); // a single long long
	clErrorHandling("clEnqueueReadBuffer");

	/* cleanup*/
	clReleaseMemObject(d_array);
	clReleaseProgram(program);
	clReleaseKernel(kernel);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);
	clReleaseEvent(event);
}
Example #6
0
int main() {
    // Get platform information
    err = clGetPlatformIDs(0, NULL, &numOfPlatforms);
    if (err) Error("Fail to get the number of platforms");
    printf("The machine has %d platform(s) for OpenCL.\n", numOfPlatforms);

    platformIDs = new cl_platform_id [numOfPlatforms];
    err = clGetPlatformIDs(numOfPlatforms, platformIDs, NULL);
    if (err) Error("Fail to get the platform list");

    int cudaPlatformID = -1;

    for (int i = 0; i < numOfPlatforms; i++) {
        char platformName[50];
        err = clGetPlatformInfo(platformIDs[i], CL_PLATFORM_NAME, 50, platformName, NULL);
        if (err) Error("Fail to get the platform name");
        printf("Platform %d is %s\n", i + 1, platformName);
        if (!strcmp(platformName, "NVIDIA CUDA")) cudaPlatformID = i;
    }
    printf("\n");

    if (cudaPlatformID == -1) Error("Fail to find an NVIDIA CUDA platform");

    printf("Platform %d (NVIDIA CUDA) is chosen for use.\n", cudaPlatformID + 1);
    printf("\n");

    // Get device information
    err = clGetDeviceIDs(platformIDs[cudaPlatformID], CL_DEVICE_TYPE_GPU, 0, NULL, &numOfDevices);
    if (err) Error("Fail to get the number of devices");
    printf("CUDA platform has %d device(s).\n", numOfDevices);

    deviceIDs = new cl_device_id [numOfDevices];
    err = clGetDeviceIDs(platformIDs[cudaPlatformID], CL_DEVICE_TYPE_GPU, numOfDevices, deviceIDs, NULL);
    if (err) Error("Fail to get the device list");
    for (int i = 0; i < numOfDevices; i++) {
        char deviceName[50];
        err = clGetDeviceInfo(deviceIDs[i], CL_DEVICE_NAME, 50, deviceName, NULL);
        if (err) Error("Fail to get the device name");
        printf("Device %d is %s\n", i + 1, deviceName);
    }
    printf("\n");

    // Create a context
    context = clCreateContext(NULL, numOfDevices, deviceIDs, NULL, NULL, &err);
    if (err) Error("Fail to create a context");

    printf("Device 1 is chosen for use.\n");
    printf("\n");

    // Create a command queue for the first device
    commandQueue = clCreateCommandQueue(context, deviceIDs[0],
                                        CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE, &err);
    if (err) Error("Fail to create a command queue");

    // create the program
    cl_program program = CreateProgram(exclusiveScanKernels, "exclusive scan");

    // create two kernels
    cl_kernel scanKernel = clCreateKernel(program, "Scan", &err);
    if (err) Error("Fail to create the kernel for scan");

    cl_kernel reverseUpdateKernel = clCreateKernel(program, "ReverseUpdate", &err);
    if (err) Error("Fail to create the kernel for reverse update");

    // Get the work group size
    size_t maxWorkGroupSize;
    err = clGetKernelWorkGroupInfo(scanKernel, deviceIDs[0], CL_KERNEL_WORK_GROUP_SIZE,
                                   sizeof(size_t), &maxWorkGroupSize, NULL);
    printf("maxWorkGroupSize = %d\n", maxWorkGroupSize);

    err = clGetKernelWorkGroupInfo(reverseUpdateKernel, deviceIDs[0], CL_KERNEL_WORK_GROUP_SIZE,
                                   sizeof(size_t), &maxWorkGroupSize, NULL);
    printf("maxWorkGroupSize = %d\n", maxWorkGroupSize);

    // Set work group size to 64

    int workGroupSize = 512;

    int length = 2048000;
    int *arr = new int [length];
    for (int i = 0; i < length; i++)
        arr[i] = rand() % 100;

    int *prefixSum = new int [length];
    prefixSum[0] = 0;

    int t0 = clock();

    for (int i = 1; i < length; i++)
        prefixSum[i] = prefixSum[i - 1] + arr[i - 1];

    int t1 = clock();

    printf("time1: %lf\n", (t1 - t0) * 1.0 / CLOCKS_PER_SEC);

    cl_mem d_arr = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int) * length, NULL, &err);
    if (err) Error("Fail to create d_arr");

    err = clEnqueueWriteBuffer(commandQueue, d_arr, CL_TRUE, 0, sizeof(int) * length, arr, 0, NULL, NULL);
    if (err) Error("Fail to write d_arr");

    clSetKernelArg(scanKernel, 0, sizeof(cl_mem), &d_arr);
    cl_int d_length = length;
    clSetKernelArg(scanKernel, 1, sizeof(cl_int), &d_length);
    cl_int d_step = 1;
    clSetKernelArg(scanKernel, 2, sizeof(cl_int), &d_step);
    clSetKernelArg(scanKernel, 3, sizeof(cl_int) * (workGroupSize * 2 + workGroupSize * 2 / 16 + 1), NULL);

    int problemSize = length;
    int records[10];
    int num = 0;

    int t2 = clock();

    for (; problemSize > 1; problemSize = (problemSize - 1) / (workGroupSize * 2) + 1) {

        if (num) d_step *= workGroupSize * 2;

        printf("d_step = %d\n", d_step);

        records[num++] = problemSize;

        printf("problemSize = %d\n", problemSize);

        clSetKernelArg(scanKernel, 2, sizeof(cl_int), &d_step);

        size_t globalWorkSize = ((problemSize - 1) / (workGroupSize * 2) + 1) * workGroupSize;
        size_t localWorkSize = workGroupSize;

        err = clEnqueueNDRangeKernel(commandQueue, scanKernel, 1, NULL, &globalWorkSize, &localWorkSize,
                                     0, NULL, NULL);
        if (err) Error("Fail to enqueue scan");
        clFinish(commandQueue);
    }

    //CheckValues(length, d_arr);

    int zero = 0;
    clEnqueueWriteBuffer(commandQueue, d_arr, CL_TRUE, 0, sizeof(int), &zero, 0, NULL, NULL);

    printf("d_step = %d\n", d_step);

    //scanf("%*c");

    clSetKernelArg(reverseUpdateKernel, 0, sizeof(cl_mem), &d_arr);
    clSetKernelArg(reverseUpdateKernel, 1, sizeof(cl_int), &d_length);

    for (int i = num - 1; i >= 0; i--, d_step /= workGroupSize * 2) {
        printf("d_step = %d\n", d_step);

        clSetKernelArg(reverseUpdateKernel, 2, sizeof(cl_int), &d_step);
        size_t globalWorkSize = ((records[i] - 1) / (workGroupSize * 2) + 1) * workGroupSize;
        size_t localWorkSize = workGroupSize;

        printf("globalWorkSize = %d, localWorkSize = %d\n", globalWorkSize, localWorkSize);

        err = clEnqueueNDRangeKernel(commandQueue, reverseUpdateKernel, 1, NULL, &globalWorkSize, &localWorkSize,
                                     0, NULL, NULL);
        if (err) Error("Fail to enqueue scan");
        clFinish(commandQueue);
    }

    int t3 = clock();

    printf("time: %lf\n", (t3 - t2) * 1.0 / CLOCKS_PER_SEC);

    int *GPUResult = new int [length];
    memset(GPUResult, 0, sizeof(int) * length);
    err = clEnqueueReadBuffer(commandQueue, d_arr, CL_TRUE, 0, sizeof(int) * length, GPUResult, 0, NULL, NULL);
    printf("err = %d\n", err);
    if (err) Error("Fail to read d_arr");

    for (int i = 0; i < length; i++)
        if (GPUResult[i] != prefixSum[i]) printf("at i = %d, GPUResult[%d] = %d, prefixSum[%d] = %d\n", i, i, GPUResult[i], i, prefixSum[i]);

    system("pause");
    return 0;
}
Example #7
0
int simpleExample()
{
    
    /* Create device and determine local size */
    device = create_device();
    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_size), &local_size, NULL);	
    if(err < 0) {
        perror("Couldn't obtain device information");
        exit(1);   
    }

    /* Create a context */
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if(err < 0) {
        perror("Couldn't create a context");
        exit(1);
    }
    
    /* Build program */
    program = build_program(context, device, PROGRAM_FILE);
    
    
    /* Create data buffer */
    data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, ARRAY_SIZE * sizeof(float), data, &err);
    sum_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float), NULL, &err);
    if(err < 0) {
        perror("Couldn't create a buffer");
        exit(1);   
    };

    /* Create a command queue */
    queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
    if(err < 0) {
        perror("Couldn't create a command queue");
        exit(1);   
    };
    
    /* Create kernels */
    vector_kernel = clCreateKernel(program, KERNEL_1, &err);
    complete_kernel = clCreateKernel(program, KERNEL_2, &err);
    if(err < 0) {
        perror("Couldn't create a kernel");
        exit(1);
    };

    /* Set arguments for vector kernel */
    err = clSetKernelArg(vector_kernel, 0, sizeof(cl_mem), &data_buffer);
    err |= clSetKernelArg(vector_kernel, 1, local_size * 4 * sizeof(float), NULL);

    /* Set arguments for complete kernel */
    err = clSetKernelArg(complete_kernel, 0, sizeof(cl_mem), &data_buffer);
    err |= clSetKernelArg(complete_kernel, 1, local_size * 4 * sizeof(float), NULL);
    err |= clSetKernelArg(complete_kernel, 2, sizeof(cl_mem), &sum_buffer);
    if(err < 0) {
        perror("Couldn't create a kernel argument");
        exit(1);   
    }
    
    
    /* Enqueue kernels */
    global_size = ARRAY_SIZE/4;
    err = clEnqueueNDRangeKernel(queue, vector_kernel, 1, NULL, &global_size, &local_size, 0, NULL, &start_event);
    if(err < 0) {
        perror("Couldn't enqueue the kernel");
        exit(1);   
    }
    printf("Global size = %lu\n", global_size);

    /* Perform successive stages of the reduction */
    while(global_size/local_size > local_size) {
        global_size = global_size/local_size;
        err = clEnqueueNDRangeKernel(queue, vector_kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
        printf("Global size = %lu\n", global_size);
        if(err < 0) {
            perror("Couldn't enqueue the kernel");
            exit(1);   
        }
    }
    global_size = global_size/local_size;
    err = clEnqueueNDRangeKernel(queue, complete_kernel, 1, NULL, &global_size, NULL, 0, NULL, &end_event);
    printf("Global size = %lu\n", global_size);
    
    
    /* Finish processing the queue and get profiling information */
    clFinish(queue);
    clGetEventProfilingInfo(start_event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
    clGetEventProfilingInfo(end_event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
    total_time = time_end - time_start;
 
    /* Read the result */
    err = clEnqueueReadBuffer(queue, sum_buffer, CL_TRUE, 0, sizeof(float), &sum, 0, NULL, NULL);
    if(err < 0) {
        perror("Couldn't read the buffer");
        exit(1);   
    }    
    
    
    /* Check result */
    actual_sum = 1.0f * (ARRAY_SIZE/2)*(ARRAY_SIZE-1);
    if(fabs(sum - actual_sum) > 0.01*fabs(sum))
        printf("Check failed.\n");
    else
        printf("Check passed.\n");
    printf("Total time = %lu\n", total_time);
 
    /* Deallocate resources */
    clReleaseEvent(start_event);
    clReleaseEvent(end_event);
    clReleaseMemObject(sum_buffer);
    clReleaseMemObject(data_buffer);
    clReleaseKernel(vector_kernel);
    clReleaseKernel(complete_kernel);
    clReleaseCommandQueue(queue);
    clReleaseProgram(program);
    clReleaseContext(context);
    return 0;
   
}
Example #8
0
void InitOpenCL()
{
    // 1. Get a platform.
    cl_platform_id platform;
    
    clGetPlatformIDs( 1, &platform, NULL );
    // 2. Find a gpu device.
    cl_device_id device;
    
    clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU,
                   1,
                   &device,
                   NULL);
    // 3. Create a context and command queue on that device.
    cl_context context = clCreateContext( NULL,
                                         1,
                                         &device,
                                         NULL, NULL, NULL);
    queue = clCreateCommandQueue( context,
                                 device,
                                 0, NULL );
    // 4. Perform runtime source compilation, and obtain kernel entry point.
    std::ifstream file("scene.cl");
    std::string source;
    if (file){
    while(!file.eof()){
        char line[256];
        file.getline(line,255);
        source += std::string(line) + "\n";
    }
    }
    if (source.length()==0)
    {
        std::string err = "fail to load shader";
    }
    
    cl_ulong maxSize;
    clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(cl_ulong), &maxSize, 0);
    
    const char* str = source.c_str();
    cl_program program = clCreateProgramWithSource( context,
                                                   1,
                                                   &str,
                                                   NULL, NULL );
    cl_int result = clBuildProgram( program, 1, &device, NULL, NULL, NULL );
    if ( result ){
        char* build_log;
        size_t log_size;
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
        build_log = new char[log_size+1];
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
        build_log[log_size] = '\0';
        if( log_size > 2 ) {
            std::cout << "build log: " << build_log << std::endl;
        }
        delete[] build_log;
        std::cout << "Error during compilation! (" << result << ")" << std::endl;
    }
    kernel = clCreateKernel( program, "tracekernel", NULL );
    // 5. Create a data buffer.
    buffer        = clCreateBuffer( context,
                                   CL_MEM_WRITE_ONLY,
                                   kWidth * kHeight *sizeof(cl_float4),
                                   NULL, 0 );
    viewTransform = clCreateBuffer( context,
                                   CL_MEM_READ_WRITE,
                                   16 *sizeof(cl_float),
                                   NULL, 0 );
    
    worldTransforms = clCreateBuffer( context,
                                     CL_MEM_READ_WRITE,
                                     16 *sizeof(cl_float)*2,
                                     NULL, 0 );
    
    clSetKernelArg(kernel, 0, sizeof(buffer), (void*) &buffer);
    clSetKernelArg(kernel, 1, sizeof(cl_uint), (void*) &kWidth);
    clSetKernelArg(kernel, 2, sizeof(cl_uint), (void*) &kWidth);
    clSetKernelArg(kernel, 3, sizeof(viewTransform), (void*) &viewTransform);
    clSetKernelArg(kernel, 4, sizeof(worldTransforms), (void*) &worldTransforms);
}
int main(int argc, char* argv[])
{



		const size_t SIZE_execution_bit = (input_length - 3*filter_length +1);
		const size_t SIZE_input_bit = sizeof(gint32)*(input_length+1);
		const size_t SIZE_settings_bit = sizeof(gint32)*4;

		size_t output_bit_on_counts;
		size_t* SIZE_execution_pointer = &SIZE_execution_bit;

		gint32* filtersettings = (gint32*) malloc(SIZE_settings_bit);
		gint32* input_vector = (gint32*) malloc(SIZE_input_bit);
		gint32* positions = (gint32*) malloc(SIZE_input_bit);

		filtersettings[0] = filter_length;
		filtersettings[1] = threshhold;
		filtersettings[2] = input_length;
		filtersettings[3] = 0;



		//GPU-Init
		ocl = ocl_new(CL_DEVICE_TYPE_GPU,1);
		context = ocl_get_context(ocl);
		queue = ocl_get_cmd_queues (ocl)[0];
		clFinish(queue);

		program = ocl_create_program_from_file(ocl, "edel_kernel_secondder.cl", NULL, &errcode);
		OCL_CHECK_ERROR(errcode);

		filter1 = clCreateKernel(program, "second_filter", &errcode);
		OCL_CHECK_ERROR(errcode);

		//GPU-Buffer which can be done before the Computation
		settings = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, SIZE_settings_bit, filtersettings, &errcode);
		OCL_CHECK_ERROR(errcode);

		input = clCreateBuffer(context, CL_MEM_READ_ONLY, SIZE_input_bit, NULL, &errcode);
		OCL_CHECK_ERROR(errcode);


		if(debugmode != 0)
		{
			srand((unsigned) time( NULL ));
			counter = rand_rects(expected,1,input_length,3*filter_length,3*filter_length,3*filter_length,peak_length,base+peak, input_vector, noise, base, 0,positions);
			if(harddebug != 0)
			{
				for(i = 0; i < input_length;i++)
				{
					if(input_length < 10000)
					{
						printf("input_vector[%i] = %d\n",i,input_vector[i]);
					}
					else
					{
						printf("input_vector[%i] = %d\t",i,input_vector[i]);
					}
				}
			}

			printf("\n counts = %d\n", counter);
			printf("%lu Bits needed for Output-Vector \n", output_bit_on_counts);

		}

		output_bit_on_counts = sizeof(gint32) * safetyfactor * 2*((counter + 2));

		clEnqueueWriteBuffer(queue, input, CL_TRUE, 0, SIZE_input_bit, input_vector, 0, NULL, NULL);

		gint32* energy_time = (gint32*)malloc(output_bit_on_counts);


		for(i = 0; i < safetyfactor * (2*counter+2); i++)
		{
			energy_time[i] = -9999;
		}


		output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, output_bit_on_counts, NULL , &errcode);
		OCL_CHECK_ERROR(errcode);


		OCL_CHECK_ERROR(clSetKernelArg(filter1, 0, sizeof(cl_mem), &input));
		OCL_CHECK_ERROR(clSetKernelArg(filter1, 1, sizeof(cl_mem), &output));
		OCL_CHECK_ERROR(clSetKernelArg(filter1, 2, sizeof(cl_mem), &settings));

		size_t local_item_size;
		size_t global_item_size = (size_t) (input_length - 3*filter_length +1);

		local_item_size = ocl_get_local_size(global_item_size, 2,1);

		             
                if(debugmode != 0)
                {
                        printf("local item size = %lu \n %lu", &local_item_size, local_item_size);
                        if(local_item_size != 0)
                        {
                              printf("This works because you divide %lu / %lu \n and this is %lu", global_item_size,local_item_size, global_item_size/local_item_size);
                        }
                        else
                        {
                              	FILE* attention;
				attention = fopen("filterlengthbad", "a+");
				if(attention == NULL)
				{
					printf("error in opening debug file \n");
					exit(1);
				}
				fprintf(attention, "The filterlength %d is not good for this filter, choose another filterlength ! \n", filter_length);
				fclose(attention);
				printf("There is no way to fit it evenly divided to workgroups, just let OpenCL do it \n");
                        }
                        if(harddebug != 0)
                        {
                                getchar();
                        }

                }


		if(local_item_size == 0)
		{
			OCL_CHECK_ERROR(clEnqueueNDRangeKernel(queue, filter1, 1, NULL, &global_item_size, NULL, 0, NULL, NULL));	
		}
		else
		{
			OCL_CHECK_ERROR(clEnqueueNDRangeKernel(queue, filter1, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL));
		}


		
		
		//local_item_size = NULL;	
		clEnqueueReadBuffer(queue, output, CL_TRUE, 0, output_bit_on_counts, energy_time, 0, NULL, NULL);
		clEnqueueReadBuffer(queue, settings, CL_TRUE, 0, SIZE_settings_bit, filtersettings, 0, NULL, NULL);


		//Writing back the data
		for(i = 0; i < filtersettings[3]; i++)
		{
			writing_back(filemode, filename, filename_e,filename_t, energy_time,i);
		}

		if(debugmode != 0)
		{
			printf("The Positions are:\n");
			for(i=0; i < counter; i++)
			{
				printf("%d\t", positions[i]);
				printf("note that this postion is the middle of the rect \n");
			}
		}
		//Safetychanges
		if(filtersettings[3] > counter)
		{
			safetyfactor = safetyfactor + 5*(filtersettings[3] - counter);
			if(safetyfactor <= 0)
			{
				safetyfactor = 10;
			}

			notexpect = filtersettings[3] - expected;
			if(safemode != 0 && notexpect >= notexpect_max)
			{
				printf("The Filter found to many peaks it. It expected %d. It found %d times more than expected.\n", expected, notexpect);
				printf("Safemode is on. Exit program \n");
				OCL_CHECK_ERROR(clReleaseMemObject(input));
				OCL_CHECK_ERROR(clReleaseMemObject(output));
				OCL_CHECK_ERROR(clReleaseMemObject(settings));
				OCL_CHECK_ERROR(clReleaseKernel(filter1));
				OCL_CHECK_ERROR(clReleaseProgram(program));

				ocl_free(ocl);

				free(input_vector);
				free(energy_time);
				free(positions);
				free(filtersettings);

			}
			else
			{
				printf("The Filter found to many peaks it. It expected %d. It found %d times more than expected \n", expected, notexpect);
			}
		}






		OCL_CHECK_ERROR(clReleaseMemObject(input));
		OCL_CHECK_ERROR(clReleaseMemObject(output));
		OCL_CHECK_ERROR(clReleaseMemObject(settings));
		OCL_CHECK_ERROR(clReleaseKernel(filter1));
		OCL_CHECK_ERROR(clReleaseProgram(program));

		ocl_free(ocl);
		free(input_vector);
		free(energy_time);
		free(positions);
		free(filtersettings);




}
// host stub function
void ops_par_loop_PdV_kernel_nopredict(
    char const *name, ops_block block, int dim, int *range, ops_arg arg0,
    ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5,
    ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10,
    ops_arg arg11, ops_arg arg12, ops_arg arg13, ops_arg arg14, ops_arg arg15,
    ops_arg arg16) {

  // Timing
  double t1, t2, c1, c2;

  ops_arg args[17] = {arg0,  arg1,  arg2,  arg3,  arg4,  arg5,
                      arg6,  arg7,  arg8,  arg9,  arg10, arg11,
                      arg12, arg13, arg14, arg15, arg16};

#ifdef CHECKPOINTING
  if (!ops_checkpointing_before(args, 17, range, 103))
    return;
#endif

  if (OPS_diags > 1) {
    ops_timing_realloc(103, "PdV_kernel_nopredict");
    OPS_kernels[103].count++;
    ops_timers_core(&c1, &t1);
  }

  // compute locally allocated range for the sub-block
  int start[3];
  int end[3];
#ifdef OPS_MPI
  sub_block_list sb = OPS_sub_block_list[block->index];
  if (!sb->owned)
    return;
  for (int n = 0; n < 3; n++) {
    start[n] = sb->decomp_disp[n];
    end[n] = sb->decomp_disp[n] + sb->decomp_size[n];
    if (start[n] >= range[2 * n]) {
      start[n] = 0;
    } else {
      start[n] = range[2 * n] - start[n];
    }
    if (sb->id_m[n] == MPI_PROC_NULL && range[2 * n] < 0)
      start[n] = range[2 * n];
    if (end[n] >= range[2 * n + 1]) {
      end[n] = range[2 * n + 1] - sb->decomp_disp[n];
    } else {
      end[n] = sb->decomp_size[n];
    }
    if (sb->id_p[n] == MPI_PROC_NULL &&
        (range[2 * n + 1] > sb->decomp_disp[n] + sb->decomp_size[n]))
      end[n] += (range[2 * n + 1] - sb->decomp_disp[n] - sb->decomp_size[n]);
  }
#else
  for (int n = 0; n < 3; n++) {
    start[n] = range[2 * n];
    end[n] = range[2 * n + 1];
  }
#endif

  int x_size = MAX(0, end[0] - start[0]);
  int y_size = MAX(0, end[1] - start[1]);
  int z_size = MAX(0, end[2] - start[2]);

  int xdim0 = args[0].dat->size[0];
  int ydim0 = args[0].dat->size[1];
  int xdim1 = args[1].dat->size[0];
  int ydim1 = args[1].dat->size[1];
  int xdim2 = args[2].dat->size[0];
  int ydim2 = args[2].dat->size[1];
  int xdim3 = args[3].dat->size[0];
  int ydim3 = args[3].dat->size[1];
  int xdim4 = args[4].dat->size[0];
  int ydim4 = args[4].dat->size[1];
  int xdim5 = args[5].dat->size[0];
  int ydim5 = args[5].dat->size[1];
  int xdim6 = args[6].dat->size[0];
  int ydim6 = args[6].dat->size[1];
  int xdim7 = args[7].dat->size[0];
  int ydim7 = args[7].dat->size[1];
  int xdim8 = args[8].dat->size[0];
  int ydim8 = args[8].dat->size[1];
  int xdim9 = args[9].dat->size[0];
  int ydim9 = args[9].dat->size[1];
  int xdim10 = args[10].dat->size[0];
  int ydim10 = args[10].dat->size[1];
  int xdim11 = args[11].dat->size[0];
  int ydim11 = args[11].dat->size[1];
  int xdim12 = args[12].dat->size[0];
  int ydim12 = args[12].dat->size[1];
  int xdim13 = args[13].dat->size[0];
  int ydim13 = args[13].dat->size[1];
  int xdim14 = args[14].dat->size[0];
  int ydim14 = args[14].dat->size[1];
  int xdim15 = args[15].dat->size[0];
  int ydim15 = args[15].dat->size[1];
  int xdim16 = args[16].dat->size[0];
  int ydim16 = args[16].dat->size[1];

  // build opencl kernel if not already built

  buildOpenCLKernels_PdV_kernel_nopredict(
      xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, xdim4, ydim4,
      xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, xdim8, ydim8, xdim9, ydim9,
      xdim10, ydim10, xdim11, ydim11, xdim12, ydim12, xdim13, ydim13, xdim14,
      ydim14, xdim15, ydim15, xdim16, ydim16);

  // set up OpenCL thread blocks
  size_t globalWorkSize[3] = {
      ((x_size - 1) / OPS_block_size_x + 1) * OPS_block_size_x,
      ((y_size - 1) / OPS_block_size_y + 1) * OPS_block_size_y,
      ((z_size - 1) / OPS_block_size_z + 1) * OPS_block_size_z};
  size_t localWorkSize[3] = {OPS_block_size_x, OPS_block_size_y,
                             OPS_block_size_z};

  // set up initial pointers
  int d_m[OPS_MAX_DIM];
#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[0].dat->d_m[d];
#endif
  int base0 = 1 * 1 * (start[0] * args[0].stencil->stride[0] -
                       args[0].dat->base[0] - d_m[0]);
  base0 = base0 +
          args[0].dat->size[0] * 1 * (start[1] * args[0].stencil->stride[1] -
                                      args[0].dat->base[1] - d_m[1]);
  base0 = base0 +
          args[0].dat->size[0] * 1 * args[0].dat->size[1] * 1 *
              (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] -
               d_m[2]);

#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[1].dat->d_m[d];
#endif
  int base1 = 1 * 1 * (start[0] * args[1].stencil->stride[0] -
                       args[1].dat->base[0] - d_m[0]);
  base1 = base1 +
          args[1].dat->size[0] * 1 * (start[1] * args[1].stencil->stride[1] -
                                      args[1].dat->base[1] - d_m[1]);
  base1 = base1 +
          args[1].dat->size[0] * 1 * args[1].dat->size[1] * 1 *
              (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] -
               d_m[2]);

#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[2].dat->d_m[d];
#endif
  int base2 = 1 * 1 * (start[0] * args[2].stencil->stride[0] -
                       args[2].dat->base[0] - d_m[0]);
  base2 = base2 +
          args[2].dat->size[0] * 1 * (start[1] * args[2].stencil->stride[1] -
                                      args[2].dat->base[1] - d_m[1]);
  base2 = base2 +
          args[2].dat->size[0] * 1 * args[2].dat->size[1] * 1 *
              (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] -
               d_m[2]);

#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[3].dat->d_m[d];
#endif
  int base3 = 1 * 1 * (start[0] * args[3].stencil->stride[0] -
                       args[3].dat->base[0] - d_m[0]);
  base3 = base3 +
          args[3].dat->size[0] * 1 * (start[1] * args[3].stencil->stride[1] -
                                      args[3].dat->base[1] - d_m[1]);
  base3 = base3 +
          args[3].dat->size[0] * 1 * args[3].dat->size[1] * 1 *
              (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] -
               d_m[2]);

#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[4].dat->d_m[d];
#endif
  int base4 = 1 * 1 * (start[0] * args[4].stencil->stride[0] -
                       args[4].dat->base[0] - d_m[0]);
  base4 = base4 +
          args[4].dat->size[0] * 1 * (start[1] * args[4].stencil->stride[1] -
                                      args[4].dat->base[1] - d_m[1]);
  base4 = base4 +
          args[4].dat->size[0] * 1 * args[4].dat->size[1] * 1 *
              (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] -
               d_m[2]);

#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[5].dat->d_m[d];
#endif
  int base5 = 1 * 1 * (start[0] * args[5].stencil->stride[0] -
                       args[5].dat->base[0] - d_m[0]);
  base5 = base5 +
          args[5].dat->size[0] * 1 * (start[1] * args[5].stencil->stride[1] -
                                      args[5].dat->base[1] - d_m[1]);
  base5 = base5 +
          args[5].dat->size[0] * 1 * args[5].dat->size[1] * 1 *
              (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] -
               d_m[2]);

#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[6].dat->d_m[d];
#endif
  int base6 = 1 * 1 * (start[0] * args[6].stencil->stride[0] -
                       args[6].dat->base[0] - d_m[0]);
  base6 = base6 +
          args[6].dat->size[0] * 1 * (start[1] * args[6].stencil->stride[1] -
                                      args[6].dat->base[1] - d_m[1]);
  base6 = base6 +
          args[6].dat->size[0] * 1 * args[6].dat->size[1] * 1 *
              (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] -
               d_m[2]);

#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[7].dat->d_m[d];
#endif
  int base7 = 1 * 1 * (start[0] * args[7].stencil->stride[0] -
                       args[7].dat->base[0] - d_m[0]);
  base7 = base7 +
          args[7].dat->size[0] * 1 * (start[1] * args[7].stencil->stride[1] -
                                      args[7].dat->base[1] - d_m[1]);
  base7 = base7 +
          args[7].dat->size[0] * 1 * args[7].dat->size[1] * 1 *
              (start[2] * args[7].stencil->stride[2] - args[7].dat->base[2] -
               d_m[2]);

#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[8].dat->d_m[d];
#endif
  int base8 = 1 * 1 * (start[0] * args[8].stencil->stride[0] -
                       args[8].dat->base[0] - d_m[0]);
  base8 = base8 +
          args[8].dat->size[0] * 1 * (start[1] * args[8].stencil->stride[1] -
                                      args[8].dat->base[1] - d_m[1]);
  base8 = base8 +
          args[8].dat->size[0] * 1 * args[8].dat->size[1] * 1 *
              (start[2] * args[8].stencil->stride[2] - args[8].dat->base[2] -
               d_m[2]);

#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[9].dat->d_m[d] + OPS_sub_dat_list[args[9].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[9].dat->d_m[d];
#endif
  int base9 = 1 * 1 * (start[0] * args[9].stencil->stride[0] -
                       args[9].dat->base[0] - d_m[0]);
  base9 = base9 +
          args[9].dat->size[0] * 1 * (start[1] * args[9].stencil->stride[1] -
                                      args[9].dat->base[1] - d_m[1]);
  base9 = base9 +
          args[9].dat->size[0] * 1 * args[9].dat->size[1] * 1 *
              (start[2] * args[9].stencil->stride[2] - args[9].dat->base[2] -
               d_m[2]);

#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[10].dat->d_m[d] + OPS_sub_dat_list[args[10].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[10].dat->d_m[d];
#endif
  int base10 = 1 * 1 * (start[0] * args[10].stencil->stride[0] -
                        args[10].dat->base[0] - d_m[0]);
  base10 = base10 +
           args[10].dat->size[0] * 1 * (start[1] * args[10].stencil->stride[1] -
                                        args[10].dat->base[1] - d_m[1]);
  base10 = base10 +
           args[10].dat->size[0] * 1 * args[10].dat->size[1] * 1 *
               (start[2] * args[10].stencil->stride[2] - args[10].dat->base[2] -
                d_m[2]);

#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[11].dat->d_m[d] + OPS_sub_dat_list[args[11].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[11].dat->d_m[d];
#endif
  int base11 = 1 * 1 * (start[0] * args[11].stencil->stride[0] -
                        args[11].dat->base[0] - d_m[0]);
  base11 = base11 +
           args[11].dat->size[0] * 1 * (start[1] * args[11].stencil->stride[1] -
                                        args[11].dat->base[1] - d_m[1]);
  base11 = base11 +
           args[11].dat->size[0] * 1 * args[11].dat->size[1] * 1 *
               (start[2] * args[11].stencil->stride[2] - args[11].dat->base[2] -
                d_m[2]);

#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[12].dat->d_m[d] + OPS_sub_dat_list[args[12].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[12].dat->d_m[d];
#endif
  int base12 = 1 * 1 * (start[0] * args[12].stencil->stride[0] -
                        args[12].dat->base[0] - d_m[0]);
  base12 = base12 +
           args[12].dat->size[0] * 1 * (start[1] * args[12].stencil->stride[1] -
                                        args[12].dat->base[1] - d_m[1]);
  base12 = base12 +
           args[12].dat->size[0] * 1 * args[12].dat->size[1] * 1 *
               (start[2] * args[12].stencil->stride[2] - args[12].dat->base[2] -
                d_m[2]);

#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[13].dat->d_m[d] + OPS_sub_dat_list[args[13].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[13].dat->d_m[d];
#endif
  int base13 = 1 * 1 * (start[0] * args[13].stencil->stride[0] -
                        args[13].dat->base[0] - d_m[0]);
  base13 = base13 +
           args[13].dat->size[0] * 1 * (start[1] * args[13].stencil->stride[1] -
                                        args[13].dat->base[1] - d_m[1]);
  base13 = base13 +
           args[13].dat->size[0] * 1 * args[13].dat->size[1] * 1 *
               (start[2] * args[13].stencil->stride[2] - args[13].dat->base[2] -
                d_m[2]);

#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[14].dat->d_m[d] + OPS_sub_dat_list[args[14].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[14].dat->d_m[d];
#endif
  int base14 = 1 * 1 * (start[0] * args[14].stencil->stride[0] -
                        args[14].dat->base[0] - d_m[0]);
  base14 = base14 +
           args[14].dat->size[0] * 1 * (start[1] * args[14].stencil->stride[1] -
                                        args[14].dat->base[1] - d_m[1]);
  base14 = base14 +
           args[14].dat->size[0] * 1 * args[14].dat->size[1] * 1 *
               (start[2] * args[14].stencil->stride[2] - args[14].dat->base[2] -
                d_m[2]);

#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[15].dat->d_m[d] + OPS_sub_dat_list[args[15].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[15].dat->d_m[d];
#endif
  int base15 = 1 * 1 * (start[0] * args[15].stencil->stride[0] -
                        args[15].dat->base[0] - d_m[0]);
  base15 = base15 +
           args[15].dat->size[0] * 1 * (start[1] * args[15].stencil->stride[1] -
                                        args[15].dat->base[1] - d_m[1]);
  base15 = base15 +
           args[15].dat->size[0] * 1 * args[15].dat->size[1] * 1 *
               (start[2] * args[15].stencil->stride[2] - args[15].dat->base[2] -
                d_m[2]);

#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[16].dat->d_m[d] + OPS_sub_dat_list[args[16].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[16].dat->d_m[d];
#endif
  int base16 = 1 * 1 * (start[0] * args[16].stencil->stride[0] -
                        args[16].dat->base[0] - d_m[0]);
  base16 = base16 +
           args[16].dat->size[0] * 1 * (start[1] * args[16].stencil->stride[1] -
                                        args[16].dat->base[1] - d_m[1]);
  base16 = base16 +
           args[16].dat->size[0] * 1 * args[16].dat->size[1] * 1 *
               (start[2] * args[16].stencil->stride[2] - args[16].dat->base[2] -
                d_m[2]);

  ops_H_D_exchanges_device(args, 17);
  ops_halo_exchanges(args, 17, range);
  ops_H_D_exchanges_device(args, 17);

  if (OPS_diags > 1) {
    ops_timers_core(&c2, &t2);
    OPS_kernels[103].mpi_time += t2 - t1;
  }

  if (globalWorkSize[0] > 0 && globalWorkSize[1] > 0 && globalWorkSize[2] > 0) {

    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 0, sizeof(cl_mem),
                              (void *)&arg0.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 1, sizeof(cl_mem),
                              (void *)&arg1.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 2, sizeof(cl_mem),
                              (void *)&arg2.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 3, sizeof(cl_mem),
                              (void *)&arg3.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 4, sizeof(cl_mem),
                              (void *)&arg4.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 5, sizeof(cl_mem),
                              (void *)&arg5.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 6, sizeof(cl_mem),
                              (void *)&arg6.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 7, sizeof(cl_mem),
                              (void *)&arg7.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 8, sizeof(cl_mem),
                              (void *)&arg8.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 9, sizeof(cl_mem),
                              (void *)&arg9.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 10, sizeof(cl_mem),
                              (void *)&arg10.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 11, sizeof(cl_mem),
                              (void *)&arg11.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 12, sizeof(cl_mem),
                              (void *)&arg12.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 13, sizeof(cl_mem),
                              (void *)&arg13.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 14, sizeof(cl_mem),
                              (void *)&arg14.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 15, sizeof(cl_mem),
                              (void *)&arg15.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 16, sizeof(cl_mem),
                              (void *)&arg16.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 17,
                              sizeof(cl_double), (void *)&dt));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 18, sizeof(cl_int),
                              (void *)&base0));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 19, sizeof(cl_int),
                              (void *)&base1));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 20, sizeof(cl_int),
                              (void *)&base2));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 21, sizeof(cl_int),
                              (void *)&base3));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 22, sizeof(cl_int),
                              (void *)&base4));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 23, sizeof(cl_int),
                              (void *)&base5));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 24, sizeof(cl_int),
                              (void *)&base6));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 25, sizeof(cl_int),
                              (void *)&base7));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 26, sizeof(cl_int),
                              (void *)&base8));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 27, sizeof(cl_int),
                              (void *)&base9));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 28, sizeof(cl_int),
                              (void *)&base10));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 29, sizeof(cl_int),
                              (void *)&base11));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 30, sizeof(cl_int),
                              (void *)&base12));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 31, sizeof(cl_int),
                              (void *)&base13));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 32, sizeof(cl_int),
                              (void *)&base14));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 33, sizeof(cl_int),
                              (void *)&base15));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 34, sizeof(cl_int),
                              (void *)&base16));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 35, sizeof(cl_int),
                              (void *)&x_size));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 36, sizeof(cl_int),
                              (void *)&y_size));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 37, sizeof(cl_int),
                              (void *)&z_size));

    // call/enque opencl kernel wrapper function
    clSafeCall(clEnqueueNDRangeKernel(
        OPS_opencl_core.command_queue, OPS_opencl_core.kernel[103], 3, NULL,
        globalWorkSize, localWorkSize, 0, NULL, NULL));
  }
  if (OPS_diags > 1) {
    clSafeCall(clFinish(OPS_opencl_core.command_queue));
  }

  if (OPS_diags > 1) {
    ops_timers_core(&c1, &t1);
    OPS_kernels[103].time += t1 - t2;
  }

  ops_set_dirtybit_device(args, 17);
  ops_set_halo_dirtybit3(&args[6], range);
  ops_set_halo_dirtybit3(&args[10], range);
  ops_set_halo_dirtybit3(&args[13], range);

  if (OPS_diags > 1) {
    // Update kernel record
    ops_timers_core(&c2, &t2);
    OPS_kernels[103].mpi_time += t2 - t1;
    OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg0);
    OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg1);
    OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg2);
    OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg3);
    OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg4);
    OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg5);
    OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg6);
    OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg7);
    OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg8);
    OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg9);
    OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg10);
    OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg11);
    OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg12);
    OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg13);
    OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg14);
    OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg15);
    OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg16);
  }
}
Example #11
0
int main(int argc, char *argv[])
{
  struct ocl_ds     *o_ds;
  cl_int err;
  cl_event evt;
  cl_mem            o_in;
  cl_int4           *o_out;
  struct ocl_kernel *o_k;
  int len = LEN;
  int i;
  
  size_t workGroupSize[2], localz[2];

  localz[1] = 16;
  localz[0] = 16;
  workGroupSize[0] = 1024*1024;
  workGroupSize[1] = 1;

  o_ds = create_ocl_ds(KERNELDIR KERNELS_FILE);
  if (o_ds == NULL){
    return 1;
  }

  o_in = create_ocl_mem(o_ds, sizeof(cl_int4)*len);
  if (o_in == NULL){
    goto free_mem_in;
  }
  
  o_out = malloc(sizeof(cl_int4) * len);
  if (o_out == NULL){
    goto free_mem_out;
  }

  //bzero(o_out, len*sizeof(cl_int4));

  o_k = create_ocl_kernel(o_ds, "ocl_layout");
  if (o_k == NULL){
    goto free_kernel;
  }

#if 0
  err  = clSetKernelArg(o_k->k_kernel, 0, sizeof(cl_mem), (void *) &o_in);
  err |= clSetKernelArg(o_k->k_kernel, 1, sizeof(int), (void *) &len);
  if (err != CL_SUCCESS){
#ifdef DEBUG
    fprintf(stderr, "clSetKernelArg return %s\n", oclErrorString(err));
#endif
    goto clean_up;
  }

  err = clEnqueueNDRangeKernel(o_ds->d_cq, o_k->k_kernel, 2, NULL, workGroupSize, localz, 0, NULL, &evt);
  //err = clEnqueueNDRangeKernel(o_ds->d_cq, o_k->k_kernel, 3, NULL, workGroupSize, NULL, 0, NULL, &evt);
  if (err != CL_SUCCESS){
#ifdef DEBUG
    fprintf(stderr, "clEnqueueNDRangeKernel: %s\n", oclErrorString(err));
#endif
    goto clean_up;
  }

  clReleaseEvent(evt);
  clFinish(o_ds->d_cq);
#endif

#if 0
  fprintf(stderr, "%s: pointers s:[%ld] p:<%p>\n", __func__, sizeof(cl_mem), &o_in);
  fprintf(stderr, "%s: pointers s:[%ld] p:<%p>\n", __func__, sizeof(int), &len);
#endif

#if 0
  //if (run_1d_ocl_kernel(o_ds, o_k, workGroupSize, ((void*)(&(o_in))), (sizeof(o_in)), ((void*)(&(len))), (sizeof(len)), NULL) < 0){
  if (run_1d_ocl_kernel(o_ds, o_k, workGroupSize, OCL_PARAM(o_in), OCL_PARAM(len), NULL) < 0){
#ifdef DEBUG
    fprintf(stderr, "%s: error in run kernel\n", __func__);
#endif
    goto clean_up;
  }
#endif

  //for (i=1024; i<len; i+=1024){
    
    //if (xfer_from_ocl_mem(o_ds, o_in, sizeof(cl_int4) * i, o_out) < 0){
    if (xfer_from_ocl_mem(o_ds, o_in, sizeof(cl_int4) * len, o_out) < 0){
#ifdef DEBUG
      fprintf(stderr, "%s: xfer from ocl error\n", __func__);
#endif
      goto clean_up;
    }

 // }
  
#if 0
#if 1
#ifdef DEBUG
  for (i=0; i<len; i++){
    fprintf(stderr, "%d %d %d %d\n", o_out[i].w, o_out[i].x, o_out[i].y, o_out[i].z);
  }
#endif
#else
    fwrite(o_out, len, sizeof(cl_int4), stdout);
#endif
#endif


clean_up:

  destroy_ocl_kernel(o_k);
free_kernel:
  if (o_out)
    free(o_out);
free_mem_out:
  destroy_ocl_mem(o_in);
free_mem_in:
  destroy_ocl_ds(o_ds);

  return 0;
}
Example #12
0
int 
exec_trig_kernel(const char *program_source, 
                 int n, void *srcA, void *dst) 
{ 
  cl_context  context; 
  cl_command_queue cmd_queue; 
  cl_device_id  *devices; 
  cl_program  program; 
  cl_kernel  kernel; 
  cl_mem       memobjs[2]; 
  size_t       global_work_size[1]; 
  size_t       local_work_size[1]; 
  size_t       cb; 
  cl_int       err; 

  float c = 7.3f; // a scalar number to test non-pointer args
 
  // create the OpenCL context on a GPU device 
  context = poclu_create_any_context();
  if (context == (cl_context)0) 
    return -1; 
 
  // get the list of GPU devices associated with context 
  clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb); 
  devices = (cl_device_id *) malloc(cb); 
  clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, devices, NULL); 
 
  // create a command-queue 
  cmd_queue = clCreateCommandQueue(context, devices[0], 0, NULL); 
  if (cmd_queue == (cl_command_queue)0) 
    { 
      clReleaseContext(context); 
      free(devices); 
      return -1; 
    } 
  free(devices); 
 
  // allocate the buffer memory objects 
  memobjs[0] = clCreateBuffer(context, 
                              CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
                              sizeof(cl_float4) * n, srcA, NULL); 
  if (memobjs[0] == (cl_mem)0) 
    { 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  memobjs[1] = clCreateBuffer(context, 
			      CL_MEM_READ_WRITE, 
			      sizeof(cl_float4) * n, NULL, NULL); 
  if (memobjs[1] == (cl_mem)0) 
    { 
      delete_memobjs(memobjs, 1); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // create the program 
  program = clCreateProgramWithSource(context, 
				      1, (const char**)&program_source, NULL, NULL); 
  if (program == (cl_program)0) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // build the program 
  err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); 
  if (err != CL_SUCCESS) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // create the kernel 
  kernel = clCreateKernel(program, "trig", NULL); 
  if (kernel == (cl_kernel)0) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // set the args values 
  err = clSetKernelArg(kernel,  0,  
		       sizeof(cl_mem), (void *) &memobjs[0]); 
  err |= clSetKernelArg(kernel, 1,
			sizeof(cl_mem), (void *) &memobjs[1]); 
  err |= clSetKernelArg(kernel, 2,
			sizeof(float), (void *) &c); 
 
  if (err != CL_SUCCESS) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseKernel(kernel); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // set work-item dimensions 
  global_work_size[0] = n; 
  local_work_size[0]= 2; 
 
  // execute kernel 
  err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, 
			       global_work_size, local_work_size,  
			       0, NULL, NULL); 
  if (err != CL_SUCCESS) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseKernel(kernel); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // read output image 
  err = clEnqueueReadBuffer(cmd_queue, memobjs[1], CL_TRUE, 
			    0, n * sizeof(cl_float4), dst, 
			    0, NULL, NULL); 
  if (err != CL_SUCCESS) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseKernel(kernel); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // release kernel, program, and memory objects 
  delete_memobjs(memobjs, 2); 
  clReleaseKernel(kernel); 
  clReleaseProgram(program); 
  clReleaseCommandQueue(cmd_queue); 
  clReleaseContext(context); 
  return 0; // success... 
}
extern "C" void 
magmablas_zlacpy( magma_uplo_t uplo, magma_int_t m, magma_int_t n,
                  magmaDoubleComplex_ptr dA, size_t dA_offset, magma_int_t lda,
                  magmaDoubleComplex_ptr dB, size_t dB_offset, magma_int_t ldb,
				  magma_queue_t queue)
{
/*
    Note
  ========
  - UPLO Parameter is disabled
  - Do we want to provide a generic function to the user with all the options?

  Purpose
  =======

  ZLACPY copies all or part of a two-dimensional matrix A to another
  matrix B.

  Arguments
  =========

  UPLO    (input) INTEGER
          Specifies the part of the matrix A to be copied to B.
          = 'U':      Upper triangular part
          = 'L':      Lower triangular part
          Otherwise:  All of the matrix A

  M       (input) INTEGER
          The number of rows of the matrix A.  M >= 0.

  N       (input) INTEGER
          The number of columns of the matrix A.  N >= 0.

  A       (input) COMPLEX DOUBLE PRECISION array, dimension (LDA,N)
          The m by n matrix A.  If UPLO = 'U', only the upper triangle
          or trapezoid is accessed; if UPLO = 'L', only the lower
          triangle or trapezoid is accessed.

  LDA     (input) INTEGER
          The leading dimension of the array A.  LDA >= max(1,M).

  B       (output) COMPLEX DOUBLE PRECISION array, dimension (LDB,N)
          On exit, B = A in the locations specified by UPLO.

  LDB     (input) INTEGER
          The leading dimension of the array B.  LDB >= max(1,M).

  =====================================================================   */

	size_t LocalWorkSize[1] = {64};
	size_t GlobalWorkSize[1] = {(m/64+(m%64 != 0))*64};
    
    if ( m == 0 || n == 0 )
        return;
    
    if ( uplo == MagmaUpper ) {
        fprintf(stderr, "lacpy upper is not implemented\n");
    }
    else if ( uplo == MagmaLower ) {
        fprintf(stderr, "lacpy lower is not implemented\n");
    }
    else {
		cl_int ciErrNum;
		cl_kernel ckKernel = NULL;
		ckKernel = rt->KernelPool["zlacpy_kernel"]; 
		if(!ckKernel){
			printf ("Error: cannot locate kernel in line %d, file %s\n", __LINE__, __FILE__);
			return;
		}
		
		int offset_A = (int)dA_offset;
		int offset_B = (int)dB_offset;
		int nn = 0;
		ciErrNum  = clSetKernelArg( ckKernel, nn++, sizeof(cl_int),           (void*)&m);
		ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(cl_int),           (void*)&n );
		ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(cl_mem),           (void*)&dA);
		ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(cl_int),		      (void*)&offset_A );
		ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(cl_int),		      (void*)&lda );
		ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(cl_mem),           (void*)&dB);
		ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(cl_int),		      (void*)&offset_B );
		ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(cl_int),		      (void*)&ldb );
		if (ciErrNum != CL_SUCCESS){
			printf("Error: clSetKernelArg at %d in file %s, %s\n", __LINE__, __FILE__, rt->GetErrorCode(ciErrNum));
			return;
		}

		// launch kernel
		ciErrNum = clEnqueueNDRangeKernel(
			queue, ckKernel, 1, NULL, GlobalWorkSize, LocalWorkSize, 0, NULL, NULL);
		if (ciErrNum != CL_SUCCESS)
		{
			printf("Error: clEnqueueNDRangeKernel at %d in file %s \"%s\"\n",
				__LINE__, __FILE__, rt->GetErrorCode(ciErrNum));
			return;
		}
	}
}
Example #14
0
/**
    Purpose
    -------
    SLACPY_Q copies all or part of a two-dimensional matrix dA to another
    matrix dB.
    
    This is the same as SLACPY, but adds queue argument.
    
    Arguments
    ---------
    
    @param[in]
    uplo    magma_uplo_t
            Specifies the part of the matrix dA to be copied to dB.
      -     = MagmaUpper:      Upper triangular part
      -     = MagmaLower:      Lower triangular part
            Otherwise:  All of the matrix dA
    
    @param[in]
    m       INTEGER
            The number of rows of the matrix dA.  M >= 0.
    
    @param[in]
    n       INTEGER
            The number of columns of the matrix dA.  N >= 0.
    
    @param[in]
    dA      REAL array, dimension (LDDA,N)
            The m by n matrix dA.
            If UPLO = MagmaUpper, only the upper triangle or trapezoid is accessed;
            if UPLO = MagmaLower, only the lower triangle or trapezoid is accessed.
    
    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).
    
    @param[out]
    dB      REAL array, dimension (LDDB,N)
            The m by n matrix dB.
            On exit, dB = dA in the locations specified by UPLO.
    
    @param[in]
    lddb    INTEGER
            The leading dimension of the array dB.  LDDB >= max(1,M).
    
    @param[in]
    queue   magma_queue_t
            Queue to execute in.

    @ingroup magma_saux2
    ********************************************************************/
extern "C" void
magmablas_slacpy(
    magma_uplo_t uplo, magma_int_t m, magma_int_t n,
    magmaFloat_const_ptr dA, size_t dA_offset, magma_int_t ldda,
    magmaFloat_ptr       dB, size_t dB_offset, magma_int_t lddb,
    magma_queue_t queue )
{
    cl_kernel kernel;
    cl_int err;
    int i;

    magma_int_t info = 0;
    if ( m < 0 )
        info = -2;
    else if ( n < 0 )
        info = -3;
    else if ( ldda < max(1,m))
        info = -5;
    else if ( lddb < max(1,m))
        info = -7;
    
    if ( info != 0 ) {
        magma_xerbla( __func__, -(info) );
        return;
    }
    
    if ( m == 0 || n == 0 )
        return;
    
    size_t threads[2] = { BLK_X, 1 };
    size_t grid[2] = { (m + BLK_X - 1)/BLK_X, (n + BLK_Y - 1)/BLK_Y };
    grid[0] *= threads[0];
    grid[1] *= threads[1];
    
    if ( uplo == MagmaLower ) {
        kernel = g_runtime.get_kernel( "slacpy_kernel_lower" );
        if ( kernel != NULL ) {
            err = 0;
            i   = 0;
            err |= clSetKernelArg( kernel, i++, sizeof(m        ), &m         );
            err |= clSetKernelArg( kernel, i++, sizeof(n        ), &n         );
            err |= clSetKernelArg( kernel, i++, sizeof(dA       ), &dA        );
            err |= clSetKernelArg( kernel, i++, sizeof(dA_offset), &dA_offset );
            err |= clSetKernelArg( kernel, i++, sizeof(ldda     ), &ldda      );
            err |= clSetKernelArg( kernel, i++, sizeof(dB       ), &dB        );
            err |= clSetKernelArg( kernel, i++, sizeof(dB_offset), &dB_offset );
            err |= clSetKernelArg( kernel, i++, sizeof(lddb     ), &lddb      );
            check_error( err );

            err = clEnqueueNDRangeKernel( queue, kernel, 2, NULL, grid, threads, 0, NULL, NULL );
            check_error( err );
        }
    }
    else if ( uplo == MagmaUpper ) {
        kernel = g_runtime.get_kernel( "slacpy_kernel_upper" );
        if ( kernel != NULL ) {
            err = 0;
            i   = 0;
            err |= clSetKernelArg( kernel, i++, sizeof(m        ), &m         );
            err |= clSetKernelArg( kernel, i++, sizeof(n        ), &n         );
            err |= clSetKernelArg( kernel, i++, sizeof(dA       ), &dA        );
            err |= clSetKernelArg( kernel, i++, sizeof(dA_offset), &dA_offset );
            err |= clSetKernelArg( kernel, i++, sizeof(ldda     ), &ldda      );
            err |= clSetKernelArg( kernel, i++, sizeof(dB       ), &dB        );
            err |= clSetKernelArg( kernel, i++, sizeof(dB_offset), &dB_offset );
            err |= clSetKernelArg( kernel, i++, sizeof(lddb     ), &lddb      );
            check_error( err );

            err = clEnqueueNDRangeKernel( queue, kernel, 2, NULL, grid, threads, 0, NULL, NULL );
            check_error( err );
        }
    }
    else {
        kernel = g_runtime.get_kernel( "slacpy_kernel_full" );
        if ( kernel != NULL ) {
            err = 0;
            i   = 0;
            err |= clSetKernelArg( kernel, i++, sizeof(m        ), &m         );
            err |= clSetKernelArg( kernel, i++, sizeof(n        ), &n         );
            err |= clSetKernelArg( kernel, i++, sizeof(dA       ), &dA        );
            err |= clSetKernelArg( kernel, i++, sizeof(dA_offset), &dA_offset );
            err |= clSetKernelArg( kernel, i++, sizeof(ldda     ), &ldda      );
            err |= clSetKernelArg( kernel, i++, sizeof(dB       ), &dB        );
            err |= clSetKernelArg( kernel, i++, sizeof(dB_offset), &dB_offset );
            err |= clSetKernelArg( kernel, i++, sizeof(lddb     ), &lddb      );
            check_error( err );

            err = clEnqueueNDRangeKernel( queue, kernel, 2, NULL, grid, threads, 0, NULL, NULL );
            check_error( err );
        }
    }
}
Example #15
0
int main(void) {
//time meassuring
  	struct timeval tvs;

//variables
	int 	Nx=1024;
	int		Ny=1024;
	int 	plotnum=0;
	int	  	Tmax=2;
	int 	plottime=0;
	int	  	plotgap=1;
	double	Lx=1.0;
	double 	Ly=1.0;
	double	dt=0.0;	
	double	A=0.0;
	double	B=0.0;
	double	Du=0.0;
	double	Dv=0.0;
//splitting coefficients
	double	a=0.5;	
	double 	b=0.5;
	double 	c=1.0;
//loop counters	
	int i=0;
	int j=0;
	int n=0;

	double*umax=NULL;
	double*vmax=NULL;
	parainit(&Nx,&Ny,&Tmax,&plotgap,&Lx,&Ly,&dt,&Du,&Dv,&A,&B);
	plottime=plotgap;
	vmax=(double*)malloc((Tmax/plotgap+1)*sizeof(double));
	umax=(double*)malloc((Tmax/plotgap+1)*sizeof(double));
//openCL variables
    cl_platform_id *platform_id = NULL;
    cl_kernel frequencies = NULL, initialdata = NULL, linearpart=NULL;
	cl_kernel nonlinearpart_a=NULL, nonlinearpart_b=NULL;
    cl_int ret;
    cl_uint num_platforms;
// Detect how many platforms there are.
	ret = clGetPlatformIDs(0, NULL, &num_platforms);
// Allocate enough space for the number of platforms.
	platform_id = (cl_platform_id*) malloc(num_platforms*sizeof(cl_platform_id));
// Store the platforms
	ret = clGetPlatformIDs(num_platforms, platform_id, NULL);
	printf("Found %d platform(s)!\n",num_platforms);
    cl_uint *num_devices;
	num_devices=(cl_uint*) malloc(num_platforms*sizeof(cl_uint));
    cl_device_id **device_id = NULL;
	device_id =(cl_device_id**) malloc(num_platforms*sizeof(cl_device_id*));
// Detect number of devices in the platforms
	for(i=0;i<num_platforms;i++){
		char buf[65536];
		size_t size;
		ret = clGetPlatformInfo(platform_id[i],CL_PLATFORM_VERSION,sizeof(buf),buf,&size);
		printf("%s\n",buf);
		ret = clGetDeviceIDs(platform_id[i],CL_DEVICE_TYPE_ALL,0,NULL,num_devices);
		printf("Found %d device(s) on platform %d!\n", num_devices[i],i);
		ret = clGetPlatformInfo(platform_id[i],CL_PLATFORM_NAME,sizeof(buf),buf,&size);
		printf("%s ",buf);
// Store numDevices from platform
		device_id[i]=(cl_device_id*) malloc(num_devices[i]*sizeof(device_id));
		ret = clGetDeviceIDs(platform_id[i],CL_DEVICE_TYPE_ALL,num_devices[i],device_id[i],NULL);
		for(j=0;j<num_devices[i];j++){
			ret = clGetDeviceInfo(device_id[i][j],CL_DEVICE_NAME,sizeof(buf),buf,&size);
			printf("%s (%d,%d)\n",buf,i,j);
		}
	}
//create context and command_queue
    cl_context context = NULL;
   	cl_command_queue command_queue = NULL;
//Which platform and device do i choose?
	int	chooseplatform=0;
	int	choosedevice=0;	  
	printf("Choose platform %d and device %d!\n",chooseplatform,choosedevice);
	context = clCreateContext( NULL, num_devices[chooseplatform], device_id[chooseplatform], NULL, NULL, &ret);
	if(ret!=CL_SUCCESS){printf("createContext ret:%d\n",ret); exit(1); }
	command_queue = clCreateCommandQueue(context, device_id[chooseplatform][choosedevice], 0, &ret);
	if(ret!=CL_SUCCESS){printf("createCommandQueue ret:%d\n",ret); exit(1); }

//OpenCL arrays
    cl_mem cl_u = NULL,cl_v = NULL;
   	cl_mem cl_uhat = NULL, cl_vhat = NULL;
    cl_mem cl_kx = NULL, cl_ky = NULL;

//FFT
	clfftPlanHandle planHandle;
    cl_mem tmpBuffer = NULL;
	fftinit(&planHandle,&context, &command_queue, &tmpBuffer, Nx, Ny);

//allocate gpu memory/
	cl_u=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx* Ny* sizeof(double), NULL, &ret);
	cl_v=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx* Ny* sizeof(double), NULL, &ret);
	cl_uhat=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx * Ny* sizeof(double), NULL, &ret);
	cl_vhat=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx * Ny* sizeof(double), NULL, &ret);
	cl_kx = clCreateBuffer(context, CL_MEM_READ_WRITE, Nx * sizeof(double), NULL, &ret);
	cl_ky = clCreateBuffer(context, CL_MEM_READ_WRITE, Ny * sizeof(double), NULL, &ret);

	printf("allocated space\n");
//load the kernels
	loadKernel(&frequencies,&context,&device_id[chooseplatform][choosedevice],"frequencies");
	loadKernel(&initialdata,&context,&device_id[chooseplatform][choosedevice],"initialdata"); 
	loadKernel(&linearpart,&context,&device_id[chooseplatform][choosedevice],"linearpart"); 
	loadKernel(&nonlinearpart_a,&context,&device_id[chooseplatform][choosedevice],"nonlinearpart_a"); 
	loadKernel(&nonlinearpart_b,&context,&device_id[chooseplatform][choosedevice],"nonlinearpart_b"); 

	size_t global_work_size[1] = {Nx*Ny};
	size_t global_work_size_X[1] = {Nx};
	size_t global_work_size_Y[1] = {Ny};
//frequencies
    ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem),(void *)&cl_kx);
	ret = clSetKernelArg(frequencies, 1, sizeof(double),(void* )&Lx);
	ret = clSetKernelArg(frequencies, 2, sizeof(int),(void* )&Nx);
    ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_X, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
    ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem),(void *)&cl_ky);
	ret = clSetKernelArg(frequencies, 1, sizeof(double),(void* )&Ly);
	ret = clSetKernelArg(frequencies, 2, sizeof(int),(void* )&Ny);
    ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_Y, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
//printCL(&cl_kx,&command_queue,Nx,1);
//printCL(&cl_ky,&command_queue,1,Ny);
//inintial data
    ret = clSetKernelArg(initialdata, 0, sizeof(cl_mem),(void *)&cl_u);
	ret = clSetKernelArg(initialdata, 1, sizeof(cl_mem),(void* )&cl_v);
	ret = clSetKernelArg(initialdata, 2, sizeof(int),(void* )&Nx);
	ret = clSetKernelArg(initialdata, 3, sizeof(int),(void* )&Ny);
	ret = clSetKernelArg(initialdata, 4, sizeof(double),(void* )&Lx);
	ret = clSetKernelArg(initialdata, 5, sizeof(double),(void* )&Ly);
    ret = clEnqueueNDRangeKernel(command_queue, initialdata, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
//make output
    writedata_C(&cl_u, &command_queue,Nx,Ny,plotnum,"u");
    writedata_C(&cl_v, &command_queue,Nx,Ny,plotnum,"v");
    umax[plotnum]=writeimage(&cl_u, &command_queue,Nx,Ny,plotnum,"u");
    vmax[plotnum]=writeimage(&cl_v, &command_queue,Nx,Ny,plotnum,"v");
	printf("Got initial data, starting timestepping\n");
	mtime_s(&tvs);

	for(n=0;n<=Tmax;n++){
//nonlinearpart_a
    ret = clSetKernelArg(nonlinearpart_a, 0, sizeof(cl_mem),(void *)&cl_u);
	ret = clSetKernelArg(nonlinearpart_a, 1, sizeof(cl_mem),(void* )&cl_v);
	ret = clSetKernelArg(nonlinearpart_a, 2, sizeof(double),(void* )&A);
	ret = clSetKernelArg(nonlinearpart_a, 3, sizeof(double),(void* )&dt);
	ret = clSetKernelArg(nonlinearpart_a, 4, sizeof(double),(void* )&a);
    ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_a, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);	

//nonlinearpart_b
    ret = clSetKernelArg(nonlinearpart_b, 0, sizeof(cl_mem),(void *)&cl_u);
	ret = clSetKernelArg(nonlinearpart_b, 1, sizeof(cl_mem),(void* )&cl_v);
	ret = clSetKernelArg(nonlinearpart_b, 2, sizeof(double),(void* )&A);
	ret = clSetKernelArg(nonlinearpart_b, 3, sizeof(double),(void* )&dt);
	ret = clSetKernelArg(nonlinearpart_b, 4, sizeof(double),(void* )&b);
    ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_b, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);

//linear
	fft2dfor(&cl_u, &cl_uhat,&planHandle,&command_queue,&tmpBuffer);
	fft2dfor(&cl_v, &cl_vhat,&planHandle,&command_queue,&tmpBuffer);
//printf("A%f,B%f\n",A,B);
    ret = clSetKernelArg(linearpart, 0, sizeof(cl_mem),(void *)&cl_uhat);
    ret = clSetKernelArg(linearpart, 1, sizeof(cl_mem),(void *)&cl_vhat);
	ret = clSetKernelArg(linearpart, 2, sizeof(cl_mem),(void* )&cl_kx);
	ret = clSetKernelArg(linearpart, 3, sizeof(cl_mem),(void* )&cl_ky);
	ret = clSetKernelArg(linearpart, 4, sizeof(double),(void* )&Du);
	ret = clSetKernelArg(linearpart, 5, sizeof(double),(void* )&Dv);
	ret = clSetKernelArg(linearpart, 6, sizeof(double),(void* )&A);
	ret = clSetKernelArg(linearpart, 7, sizeof(double),(void* )&B);
	ret = clSetKernelArg(linearpart, 8, sizeof(double),(void* )&dt);
	ret = clSetKernelArg(linearpart, 9, sizeof(double),(void* )&c);
	ret = clSetKernelArg(linearpart, 10, sizeof(int),(void* )&Nx);
	ret = clSetKernelArg(linearpart, 11, sizeof(int),(void* )&Ny);
    ret = clEnqueueNDRangeKernel(command_queue, linearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);

	fft2dback(&cl_u, &cl_uhat,&planHandle,&command_queue,&tmpBuffer);
  	fft2dback(&cl_v, &cl_vhat,&planHandle,&command_queue,&tmpBuffer);

//nonlinearpart_b
    ret = clSetKernelArg(nonlinearpart_b, 0, sizeof(cl_mem),(void *)&cl_u);
	ret = clSetKernelArg(nonlinearpart_b, 1, sizeof(cl_mem),(void* )&cl_v);
	ret = clSetKernelArg(nonlinearpart_b, 2, sizeof(double),(void* )&A);
	ret = clSetKernelArg(nonlinearpart_b, 3, sizeof(double),(void* )&dt);
	ret = clSetKernelArg(nonlinearpart_b, 4, sizeof(double),(void* )&b);
    ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_b, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);		
//nonlinearpart_a
    ret = clSetKernelArg(nonlinearpart_a, 0, sizeof(cl_mem),(void *)&cl_u);
	ret = clSetKernelArg(nonlinearpart_a, 1, sizeof(cl_mem),(void* )&cl_v);
	ret = clSetKernelArg(nonlinearpart_a, 2, sizeof(double),(void* )&A);
	ret = clSetKernelArg(nonlinearpart_a, 3, sizeof(double),(void* )&dt);
	ret = clSetKernelArg(nonlinearpart_a, 4, sizeof(double),(void* )&a);
    ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_a, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);	
// done
	if(n==plottime){
		printf("time:%f, step:%d,%d,umax:%f,vmax:%f\n",n*dt,n,plotnum,umax[plotnum],vmax[plotnum]);
		plottime=plottime+plotgap;
		plotnum=plotnum+1;
   	 	writedata_C(&cl_u, &command_queue,Nx,Ny,plotnum,"u");
    	writedata_C(&cl_v, &command_queue,Nx,Ny,plotnum,"v");
        umax[plotnum]=writeimage(&cl_u, &command_queue,Nx,Ny,plotnum,"u");
        vmax[plotnum]=writeimage(&cl_v, &command_queue,Nx,Ny,plotnum,"v");
	}
}//end timestepping

	printf("Finished time stepping\n");
	mtime_e(&tvs,"Programm took:");
	writearray(umax,(Tmax/plotgap)+1,"u");
	writearray(vmax,(Tmax/plotgap)+1,"v");
	free(umax);
	free(vmax);	

	clReleaseMemObject(cl_u);
	clReleaseMemObject(cl_v);
	clReleaseMemObject(cl_uhat);
	clReleaseMemObject(cl_vhat);
	clReleaseMemObject(cl_kx);
	clReleaseMemObject(cl_ky);

    ret = clReleaseKernel(initialdata); 
    ret = clReleaseKernel(frequencies); 
    ret = clReleaseKernel(linearpart); 
    ret = clReleaseKernel(nonlinearpart_a);
    ret = clReleaseKernel(nonlinearpart_b);

	fftdestroy(&planHandle, &tmpBuffer);

	ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);

	for(i=0;i<num_platforms;i++){free(device_id[i]);}
	free(device_id);
	free(platform_id);
	free(num_devices);
	printf("Program execution complete\n");

	return 0;
}
Example #16
0
//---------------------------------------------------------------------
// this function computes the norm of the difference between the
// computed solution and the exact solution
//---------------------------------------------------------------------
void error_norm(double rms[5])
{
  int i, m, d;

  cl_kernel k_error_norm;
  cl_mem m_rms;
  double (*g_rms)[5];
  size_t local_ws, global_ws, temp, wg_num, buf_size;
  cl_int ecode;

  int d0 = grid_points[0];
  int d1 = grid_points[1];
  int d2 = grid_points[2];

  for (m = 0; m < 5; m++) {
    rms[m] = 0.0;
  }

  temp = d2 / max_compute_units;
  local_ws  = temp == 0 ? 1 : temp;
  global_ws = clu_RoundWorkSize((size_t)d2, local_ws);
  wg_num = global_ws / local_ws;

  buf_size = sizeof(double) * 5 * wg_num;
  m_rms = clCreateBuffer(context,
                         CL_MEM_READ_WRITE,
                         buf_size, 
                         NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer()");

  k_error_norm = clCreateKernel(p_error, "error_norm", &ecode);
  clu_CheckError(ecode, "clCreateKernel()");

  ecode  = clSetKernelArg(k_error_norm, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_error_norm, 1, sizeof(cl_mem), &m_ce);
  ecode |= clSetKernelArg(k_error_norm, 2, sizeof(cl_mem), &m_rms);
  ecode |= clSetKernelArg(k_error_norm, 3, sizeof(double)*5*local_ws, NULL);
  ecode |= clSetKernelArg(k_error_norm, 4, sizeof(int), &d0);
  ecode |= clSetKernelArg(k_error_norm, 5, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_error_norm, 6, sizeof(int), &d2);
  clu_CheckError(ecode, "clSetKernelArg()");
  
  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_error_norm,
                                 1, NULL,
                                 &global_ws,
                                 &local_ws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");

  g_rms = (double (*)[5])malloc(buf_size);

  ecode = clEnqueueReadBuffer(cmd_queue,
                              m_rms,
                              CL_TRUE,
                              0, buf_size,
                              g_rms,
                              0, NULL, NULL);
  clu_CheckError(ecode, "clReadBuffer()");

  // reduction
  for (i = 0; i < wg_num; i++) {
    for (m = 0; m < 5; m++) {
      rms[m] += g_rms[i][m];
    }
  }
  
  for (m = 0; m < 5; m++) {
    for (d = 0; d < 3; d++) {
      rms[m] = rms[m] / (double)(grid_points[d]-2);
    }
    rms[m] = sqrt(rms[m]);
  }

  free(g_rms);
  clReleaseMemObject(m_rms);
  clReleaseKernel(k_error_norm);
}
Example #17
0
int main(int argc, char **argv) {




	if (find_option(argc, argv, "-h") >= 0)
	{
		printf("Options:\n");
		printf("-h to see this help\n");
		printf("-n <int> to set the number of particles\n");
		printf("-o <filename> to specify the output file name\n");
		printf("-s <filename> to specify the summary output file name\n");
		return 0;
	}


	int n = read_int(argc, argv, "-n", 1000);

	char *savename = read_string(argc, argv, "-o", NULL);
	char *sumname = read_string(argc, argv, "-s", NULL);

	// For return values.
	cl_int ret;

	// OpenCL stuff.
	// Loading kernel files.
	FILE *kernelFile;
	char *kernelSource;
	size_t kernelSize;

	kernelFile = fopen("simulationKernel.cl", "r");

	if (!kernelFile) {
		fprintf(stderr, "No file named simulationKernel.cl was found\n");
		exit(-1);
	}
	kernelSource = (char*)malloc(MAX_SOURCE_SIZE);
	kernelSize = fread(kernelSource, 1, MAX_SOURCE_SIZE, kernelFile);
	fclose(kernelFile);

	// Getting platform and device information
	cl_platform_id platformId = NULL;
	cl_device_id deviceID = NULL;
	cl_uint retNumDevices;
	cl_uint retNumPlatforms;
	ret = clGetPlatformIDs(1, &platformId, &retNumPlatforms);
	// Different types of devices to pick from. At the moment picks the default opencl device.
	//CL_DEVICE_TYPE_GPU
	//CL_DEVICE_TYPE_ACCELERATOR
	//CL_DEVICE_TYPE_DEFAULT
	//CL_DEVICE_TYPE_CPU
	ret = clGetDeviceIDs(platformId, CL_DEVICE_TYPE_ACCELERATOR, 1, &deviceID, &retNumDevices);

	// Max workgroup size
	size_t max_available_local_wg_size;
	ret = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_available_local_wg_size, NULL);
	// Creating context.
	cl_context context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &ret);


	// Creating command queue
        cl_command_queue commandQueue = clCreateCommandQueueWithProperties (context, deviceID, 0, &ret);
	
	// Build program
	cl_program program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, (const size_t *)&kernelSize, &ret);
//	printf("program = ret %i \n", ret);
	ret = clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL);
//	printf("clBuildProgram: ret %i \n", ret);
	
	// Create kernels
	cl_kernel forceKernel = clCreateKernel(program, "compute_forces_gpu", &ret);

	cl_kernel moveKernel = clCreateKernel(program, "move_gpu", &ret);

	cl_kernel binInitKernel = clCreateKernel(program, "bin_init_gpu", &ret);
	cl_kernel binKernel = clCreateKernel(program, "bin_gpu", &ret);

	FILE *fsave = savename ? fopen(savename, "w") : NULL;
	FILE *fsum = sumname ? fopen(sumname, "a") : NULL;
	particle_t *particles = (particle_t*)malloc(n * sizeof(particle_t));

	// GPU particle data structure
	cl_mem d_particles = clCreateBuffer(context, CL_MEM_READ_WRITE, n * sizeof(particle_t), NULL, &ret);

	// Set size
	set_size(n);

	init_particles(n, particles);

	double copy_time = read_timer();

	// Copy particles to device.
	ret = clEnqueueWriteBuffer(commandQueue, d_particles, CL_TRUE, 0, n * sizeof(particle_t), particles, 0, NULL, NULL);
	copy_time = read_timer() - copy_time;
	

	// Calculating thread and thread block counts.
	// sizes
	size_t globalItemSize;
	size_t localItemSize;
	// Global item size
	if (n <= NUM_THREADS) {
		globalItemSize = NUM_THREADS;
		localItemSize = 16;
	}
	else if (n % NUM_THREADS != 0) {
		globalItemSize = (n / NUM_THREADS + 1) * NUM_THREADS;
	}
	else {
		globalItemSize = n;
	}

	// Local item size
	localItemSize = globalItemSize / NUM_THREADS;	

	// Bins and bin sizes.
	// Because of uniform distribution we will know that bins size is amortized. Therefore I picked the value of 10.
	// There will never be 10 particles in one bin.
	int maxParticles = 10;
	
	// Calculating the number of bins.
	int numberOfBins = (int)ceil(size/(2*cutoff)) + 2;
	
	// Bins will only exist on the device.
	particle_t* bins;
	
	// How many particles are there in each bin - also only exists on the device.
	volatile int* binSizes;
	
	// Number of bins to be initialized.
	size_t clearAmt = numberOfBins*numberOfBins;
	
	// Allocate memory for bins on the device.
	cl_mem d_binSizes = clCreateBuffer(context, CL_MEM_READ_WRITE, numberOfBins * numberOfBins * sizeof(volatile int), NULL, &ret);
	cl_mem d_bins = clCreateBuffer(context, CL_MEM_READ_WRITE, numberOfBins * numberOfBins * maxParticles * sizeof(particle_t), NULL, &ret);
	
	// SETTING ARGUMENTS FOR THE KERNELS
	
	// Set arguments for the init / clear kernel
	ret = clSetKernelArg(binInitKernel, 0, sizeof(cl_mem), (void *)&d_binSizes);
	ret = clSetKernelArg(binInitKernel, 1, sizeof(int), &numberOfBins);

	// Set arguments for the binning kernel
	ret = clSetKernelArg(binKernel, 0, sizeof(cl_mem), (void *)&d_particles);
	ret = clSetKernelArg(binKernel, 1, sizeof(int), &n);
	ret = clSetKernelArg(binKernel, 2, sizeof(cl_mem), (void *)&d_bins);
	ret = clSetKernelArg(binKernel, 3, sizeof(cl_mem), (void *)&d_binSizes);
	ret = clSetKernelArg(binKernel, 4, sizeof(int), &numberOfBins);

	// Set arguments for force kernel.
	ret = clSetKernelArg(forceKernel, 0, sizeof(cl_mem), (void *)&d_particles);
	ret = clSetKernelArg(forceKernel, 1, sizeof(int), &n);
	ret = clSetKernelArg(forceKernel, 2, sizeof(cl_mem), (void *)&d_bins);
	ret = clSetKernelArg(forceKernel, 3, sizeof(cl_mem), (void *)&d_binSizes);
	ret = clSetKernelArg(forceKernel, 4, sizeof(int), &numberOfBins);


	// Set arguments for move kernel
	ret = clSetKernelArg(moveKernel, 0, sizeof(cl_mem), (void *)&d_particles);
	ret = clSetKernelArg(moveKernel, 1, sizeof(int), &n);
	ret = clSetKernelArg(moveKernel, 2, sizeof(double), &size);
	
	
	// Variable to check if kernel execution is done.
	cl_event kernelDone;
	
	
	double simulation_time = read_timer();
	int step = 0;
	for (step = 0; step < NSTEPS; step++) {


		// Execute bin initialization (clearing after first iteration)
		ret = clEnqueueNDRangeKernel(commandQueue, binInitKernel, 1, NULL, &clearAmt, NULL, 0, NULL, &kernelDone);
		ret = clWaitForEvents(1, &kernelDone);
		// Execute binning kernel
		ret = clEnqueueNDRangeKernel(commandQueue, binKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone);
//		ret = clEnqueueNDRangeKernel(commandQueue, binKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone);
		ret = clWaitForEvents(1, &kernelDone);	
		// Execute force kernel
		ret = clEnqueueNDRangeKernel(commandQueue, forceKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone);
		ret = clWaitForEvents(1, &kernelDone);
		// Execute move kernel
		ret = clEnqueueNDRangeKernel(commandQueue, moveKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone);
		ret = clWaitForEvents(1, &kernelDone);

		if (fsave && (step%SAVEFREQ) == 0) {
			// Copy the particles back to the CPU
			ret = clEnqueueReadBuffer(commandQueue, d_particles, CL_TRUE, 0, n * sizeof(particle_t), particles, 0, NULL, &kernelDone);
			ret = clWaitForEvents(1, &kernelDone);

			save(fsave, n, particles);
		}

	}
	simulation_time = read_timer() - simulation_time;
	printf("CPU-GPU copy time = %g seconds\n", copy_time);
	printf("n = %d, simulation time = %g seconds\n", n, simulation_time);

	if (fsum)
		fprintf(fsum, "%d %lf \n", n, simulation_time);

	if (fsum)
		fclose(fsum);
	free(particles);
	if (fsave)
		fclose(fsave);


	ret = clFlush(commandQueue);
	ret = clFinish(commandQueue);
	ret = clReleaseCommandQueue(commandQueue);
	ret = clReleaseKernel(forceKernel);
	ret = clReleaseKernel(moveKernel);
	ret = clReleaseProgram(program);
	ret = clReleaseMemObject(d_particles);
	ret = clReleaseContext(context);


	return 0;
}
Example #18
0
int main( int argc, char* argv[] )
{
    // Length of vectors
    unsigned int n = 100000;
 
    // Host input vectors
    double *h_a;
    double *h_b;
    // Host output vector
    double *h_c;
 
    // Device input buffers
    cl_mem d_a;
    cl_mem d_b;
    // Device output buffer
    cl_mem d_c;
 
    cl_platform_id cpPlatform;        // OpenCL platform
    cl_device_id device_id;           // device ID
    cl_context context;               // context
    cl_command_queue queue;           // command queue
    cl_program program;               // program
    cl_kernel kernel;                 // kernel
 
    // Size, in bytes, of each vector
    size_t bytes = n*sizeof(double);
 
    // Allocate memory for each vector on host
    h_a = (double*)malloc(bytes);
    h_b = (double*)malloc(bytes);
    h_c = (double*)malloc(bytes);
 
    // Initialize vectors on host
    int i;
    for( i = 0; i < n; i++ )
    {
        h_a[i] = sinf(i)*sinf(i);
        h_b[i] = cosf(i)*cosf(i);
    }
 
    size_t globalSize, localSize;
    cl_int err;
 
    // Number of work items in each local work group
    localSize = 64;
 
    // Number of total work items - localSize must be devisor
    globalSize = ceil(n/(float)localSize)*localSize;
 
    // Bind to platform
    err = clGetPlatformIDs(1, &cpPlatform, NULL);
 
    // Get ID for the device
    err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
 
    // Create a context 
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
 
    // Create a command queue
    queue = clCreateCommandQueue(context, device_id, 0, &err);
 
    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1,
                            (const char **) & kernelSource, NULL, &err);
 
    // Build the program executable
    clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
 
    // Create the compute kernel in the program we wish to run
    kernel = clCreateKernel(program, "vecAdd", &err);
 
    // Create the input and output arrays in device memory for our calculation
    d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
    d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
    d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
 
    // Write our data set into the input array in device memory
    err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,
                                   bytes, h_a, 0, NULL, NULL);
    err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,
                                   bytes, h_b, 0, NULL, NULL);
 
    // Set the arguments to our compute kernel
    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
    err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
 
    // Execute the kernel over the entire range of the data set 
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
                                                              0, NULL, NULL);
 
    // Wait for the command queue to get serviced before reading back results
    clFinish(queue);
 
    // Read the results from the device
    clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
                                bytes, h_c, 0, NULL, NULL );
 
    //Sum up vector c and print result divided by n, this should equal 1 within error
    double sum = 0;
    for(i=0; i<n; i++)
        sum += h_c[i];
    printf("final result: %f\n", sum/n);
 
    // release OpenCL resources
    clReleaseMemObject(d_a);
    clReleaseMemObject(d_b);
    clReleaseMemObject(d_c);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
 
    //release host memory
    free(h_a);
    free(h_b);
    free(h_c);
 
    return 0;
}
Example #19
0
int btocl_invdetcholeskyDMATRIX1(cl_mem buffer, DMATRIX* dm, double* det) {
	// need extra buffer to store new diagonal
	int i,j, k,diagidx, subdiagidx, idx, idx1, idx2,n;
	double diag, *m;
	double* diagArray;
	//OCL variables
	cl_kernel kernel1, kernel2;
	cl_int err;
	cl_context context;
	cl_command_queue queue;
	cl_mem buffer_diag;
	size_t global_work_size;
	
	*det = 0;
	n = dm->nrows;
	m = dm->m;
	diagArray = (double*)malloc(sizeof(double)*n);
	
	context = btocl_getContext();
	queue =  btocl_getCommandQueue();	
	kernel1 = btocl_getKernel(BTOCL_CHOLUPDCOL);
	
	if (kernel1 == NULL) {
		printf("Error: Couldn't load kernel %s\n",btocl_getKernelName(BTOCL_CHOLUPDCOL));
		exit(1);
	} else {
		printf("loaded %s\n",btocl_getKernelName(BTOCL_CHOLUPDCOL));
	}
	kernel2 = btocl_getKernel(BTOCL_CHOLUPDMAT);
	if (kernel1 == NULL) {
		printf("Error: Couldn't load kernel %s\n",btocl_getKernelName(BTOCL_CHOLUPDMAT));
		exit(1);
	} else {
		printf("loaded %s\n",btocl_getKernelName(BTOCL_CHOLUPDMAT));
	}
	
	// buffer to store diagonal
	buffer_diag = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(double), NULL, &err);
	if (err < 0) {
		printf("Couldn't create diagonal buffer\n");
		exit(1);
	}
	
	//btdebug_enter("btoclcopy1");
	// copy matrix to buffer - may not need this - probably copied during creation
	clEnqueueWriteBuffer(queue,buffer,CL_TRUE,0,n*n*sizeof(double),&m[0],0,0,NULL);
	//btdebug_exit("btoclcopy1");
	
	diagidx = 0;
	diag = m[diagidx];	// first diagonal
	global_work_size = n; // size of column to be updated
	for(i=0; i < n; i++) {
		//printf("Column %d\n",i);
		//printf("new diag %lf\n",diag);
		// diag = m[diagidx]; -- computed in previous iteration
		if (diag < 0) {
			printf("Error index %d negative diagonal %lf\n",diagidx,diag);		
			return 1;
		}
		global_work_size--;
		diag  = sqrt(diag);
		//printf("sq root diag %lf\n",diag);
		diagArray[i] = diag; // square root of diagonal
		if (diag < 0)
			*det += log(-diag);
		else
			*det += log(diag);
			
		// call update column kernel
		//perhaps it could be extended to have three phases: 
		// update, copy to local and do multiplication
		
		//idx = diagidx;
		//for(j=i+1; j < n; j++) {
		//	idx++;			
		//	m[idx] = m[idx]/diag;
		//}
		
		//printf("before update - diag %lf\n",diag);
		//btlapack_printDMATRIX(dm);
		
		// copy parameters
		// Pass Arguments
		if (global_work_size > 0) {
			if ((err = clSetKernelArg(kernel1,0,sizeof(cl_mem), &buffer)) < 0) {
				printf("Couldnt set first argument\n");
				exit(1);
			}
			if ((err = clSetKernelArg(kernel1,1,sizeof(diagidx), &diagidx)) < 0) {
				printf("Couldnt set second argument\n");
				exit(1);
			}
			if ((err = clSetKernelArg(kernel1,2,sizeof(diag), &diag)) < 0) {
				printf("Couldnt set third argument\n");
				exit(1);
			}
			// copy colum to buffer
			//btdebug_enter("btoclkernel1");
			// schedule kernel
			clEnqueueNDRangeKernel(queue,kernel1,1,NULL,&global_work_size,NULL,0,NULL,NULL);
			// copy back
			//btdebug_exit("btoclkernel1");
			
			if ((err = clSetKernelArg(kernel2,0,sizeof(cl_mem), &buffer)) < 0) {
				printf("Couldnt set first argument\n");
				exit(1);
			}
			if ((err = clSetKernelArg(kernel2,1,sizeof(cl_mem), &buffer_diag)) < 0) {
				printf("Couldnt set second argument\n");
				exit(1);
			}
			if ((err = clSetKernelArg(kernel2,2,sizeof(diagidx), &diagidx)) < 0) {
				printf("Couldnt set third argument\n");
				exit(1);
			}
			if ((err = clSetKernelArg(kernel2,3,sizeof(n), &n)) < 0) {  // lda
				printf("Couldnt set fifth argument\n");
				exit(1);
			}
			if ((err = clSetKernelArg(kernel2,4,sizeof(global_work_size), &global_work_size)) < 0) {
				printf("Couldnt set forth argument\n");
				exit(1);
			}
			//btdebug_enter("btoclkernel2");
			clEnqueueNDRangeKernel(queue,kernel2,1,NULL,&global_work_size,NULL,0,NULL,NULL);
			//btdebug_exit("btoclkernel2");
			// copy diagonal_buffer to diag
			//btdebug_enter("btocldiag");
			clEnqueueReadBuffer(queue,buffer_diag,CL_TRUE,0,sizeof(double),&diag,0,0,NULL);
			//btdebug_exit("btocldiag");
		} 
		//printf("after update, global_work size %d\n",global_work_size);
		//btlapack_printDMATRIX(dm);
				
		diagidx += n+1; // update diagonal index
		
		
	}
	// copy matrix from device to host
	clEnqueueReadBuffer(queue,buffer,CL_TRUE,0,n*n*sizeof(double),&m[0],0,0,NULL);
	
	// copy diagonals
	diagidx=0;
	for(j = 0; j < n; j++) {
		m[diagidx]=diagArray[j];
		diagidx += (n+1);
	}
	
	*det *= 2.0;
	return 0;
}
void OpenCLDevice::COM_clAttachOutputMemoryBufferToKernelParameter(cl_kernel kernel, int parameterIndex, cl_mem clOutputMemoryBuffer)
{
	cl_int error;
	error = clSetKernelArg(kernel, parameterIndex, sizeof(cl_mem), &clOutputMemoryBuffer);
	if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); }
}
// host stub function
void ops_par_loop_flux_calc_kernely(char const *name, ops_block block, int dim,
                                    int *range, ops_arg arg0, ops_arg arg1,
                                    ops_arg arg2, ops_arg arg3) {

  // Timing
  double t1, t2, c1, c2;

  ops_arg args[4] = {arg0, arg1, arg2, arg3};

#ifdef CHECKPOINTING
  if (!ops_checkpointing_before(args, 4, range, 107))
    return;
#endif

  if (OPS_diags > 1) {
    ops_timing_realloc(107, "flux_calc_kernely");
    OPS_kernels[107].count++;
    ops_timers_core(&c1, &t1);
  }

  // compute locally allocated range for the sub-block
  int start[3];
  int end[3];
#ifdef OPS_MPI
  sub_block_list sb = OPS_sub_block_list[block->index];
  if (!sb->owned)
    return;
  for (int n = 0; n < 3; n++) {
    start[n] = sb->decomp_disp[n];
    end[n] = sb->decomp_disp[n] + sb->decomp_size[n];
    if (start[n] >= range[2 * n]) {
      start[n] = 0;
    } else {
      start[n] = range[2 * n] - start[n];
    }
    if (sb->id_m[n] == MPI_PROC_NULL && range[2 * n] < 0)
      start[n] = range[2 * n];
    if (end[n] >= range[2 * n + 1]) {
      end[n] = range[2 * n + 1] - sb->decomp_disp[n];
    } else {
      end[n] = sb->decomp_size[n];
    }
    if (sb->id_p[n] == MPI_PROC_NULL &&
        (range[2 * n + 1] > sb->decomp_disp[n] + sb->decomp_size[n]))
      end[n] += (range[2 * n + 1] - sb->decomp_disp[n] - sb->decomp_size[n]);
  }
#else
  for (int n = 0; n < 3; n++) {
    start[n] = range[2 * n];
    end[n] = range[2 * n + 1];
  }
#endif

  int x_size = MAX(0, end[0] - start[0]);
  int y_size = MAX(0, end[1] - start[1]);
  int z_size = MAX(0, end[2] - start[2]);

  int xdim0 = args[0].dat->size[0];
  int ydim0 = args[0].dat->size[1];
  int xdim1 = args[1].dat->size[0];
  int ydim1 = args[1].dat->size[1];
  int xdim2 = args[2].dat->size[0];
  int ydim2 = args[2].dat->size[1];
  int xdim3 = args[3].dat->size[0];
  int ydim3 = args[3].dat->size[1];

  // build opencl kernel if not already built

  buildOpenCLKernels_flux_calc_kernely(xdim0, ydim0, xdim1, ydim1, xdim2, ydim2,
                                       xdim3, ydim3);

  // set up OpenCL thread blocks
  size_t globalWorkSize[3] = {
      ((x_size - 1) / OPS_block_size_x + 1) * OPS_block_size_x,
      ((y_size - 1) / OPS_block_size_y + 1) * OPS_block_size_y,
      ((z_size - 1) / OPS_block_size_z + 1) * OPS_block_size_z};
  size_t localWorkSize[3] = {OPS_block_size_x, OPS_block_size_y,
                             OPS_block_size_z};

  // set up initial pointers
  int d_m[OPS_MAX_DIM];
#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[0].dat->d_m[d];
#endif
  int base0 = 1 * 1 * (start[0] * args[0].stencil->stride[0] -
                       args[0].dat->base[0] - d_m[0]);
  base0 = base0 +
          args[0].dat->size[0] * 1 * (start[1] * args[0].stencil->stride[1] -
                                      args[0].dat->base[1] - d_m[1]);
  base0 = base0 +
          args[0].dat->size[0] * 1 * args[0].dat->size[1] * 1 *
              (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] -
               d_m[2]);

#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[1].dat->d_m[d];
#endif
  int base1 = 1 * 1 * (start[0] * args[1].stencil->stride[0] -
                       args[1].dat->base[0] - d_m[0]);
  base1 = base1 +
          args[1].dat->size[0] * 1 * (start[1] * args[1].stencil->stride[1] -
                                      args[1].dat->base[1] - d_m[1]);
  base1 = base1 +
          args[1].dat->size[0] * 1 * args[1].dat->size[1] * 1 *
              (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] -
               d_m[2]);

#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[2].dat->d_m[d];
#endif
  int base2 = 1 * 1 * (start[0] * args[2].stencil->stride[0] -
                       args[2].dat->base[0] - d_m[0]);
  base2 = base2 +
          args[2].dat->size[0] * 1 * (start[1] * args[2].stencil->stride[1] -
                                      args[2].dat->base[1] - d_m[1]);
  base2 = base2 +
          args[2].dat->size[0] * 1 * args[2].dat->size[1] * 1 *
              (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] -
               d_m[2]);

#ifdef OPS_MPI
  for (int d = 0; d < dim; d++)
    d_m[d] =
        args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d];
#else
  for (int d = 0; d < dim; d++)
    d_m[d] = args[3].dat->d_m[d];
#endif
  int base3 = 1 * 1 * (start[0] * args[3].stencil->stride[0] -
                       args[3].dat->base[0] - d_m[0]);
  base3 = base3 +
          args[3].dat->size[0] * 1 * (start[1] * args[3].stencil->stride[1] -
                                      args[3].dat->base[1] - d_m[1]);
  base3 = base3 +
          args[3].dat->size[0] * 1 * args[3].dat->size[1] * 1 *
              (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] -
               d_m[2]);

  ops_H_D_exchanges_device(args, 4);
  ops_halo_exchanges(args, 4, range);
  ops_H_D_exchanges_device(args, 4);

  if (OPS_diags > 1) {
    ops_timers_core(&c2, &t2);
    OPS_kernels[107].mpi_time += t2 - t1;
  }

  if (globalWorkSize[0] > 0 && globalWorkSize[1] > 0 && globalWorkSize[2] > 0) {

    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[107], 0, sizeof(cl_mem),
                              (void *)&arg0.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[107], 1, sizeof(cl_mem),
                              (void *)&arg1.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[107], 2, sizeof(cl_mem),
                              (void *)&arg2.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[107], 3, sizeof(cl_mem),
                              (void *)&arg3.data_d));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[107], 4, sizeof(cl_double),
                              (void *)&dt));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[107], 5, sizeof(cl_int),
                              (void *)&base0));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[107], 6, sizeof(cl_int),
                              (void *)&base1));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[107], 7, sizeof(cl_int),
                              (void *)&base2));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[107], 8, sizeof(cl_int),
                              (void *)&base3));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[107], 9, sizeof(cl_int),
                              (void *)&x_size));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[107], 10, sizeof(cl_int),
                              (void *)&y_size));
    clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[107], 11, sizeof(cl_int),
                              (void *)&z_size));

    // call/enque opencl kernel wrapper function
    clSafeCall(clEnqueueNDRangeKernel(
        OPS_opencl_core.command_queue, OPS_opencl_core.kernel[107], 3, NULL,
        globalWorkSize, localWorkSize, 0, NULL, NULL));
  }
  if (OPS_diags > 1) {
    clSafeCall(clFinish(OPS_opencl_core.command_queue));
  }

  if (OPS_diags > 1) {
    ops_timers_core(&c1, &t1);
    OPS_kernels[107].time += t1 - t2;
  }

  ops_set_dirtybit_device(args, 4);
  ops_set_halo_dirtybit3(&args[0], range);

  if (OPS_diags > 1) {
    // Update kernel record
    ops_timers_core(&c2, &t2);
    OPS_kernels[107].mpi_time += t2 - t1;
    OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg0);
    OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg1);
    OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg2);
    OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg3);
  }
}
Example #22
0
int main()
{
  typedef float       ScalarType;
  
  
  /////////////////////////////////////////////////////////////////////////////////////////////////////////
  //////////////////////// Part 1: Set up a custom context and perform a sample operation. ////////////////
  ////////////////////////         This is rather lengthy due to the OpenCL framework.     ////////////////
  ////////////////////////         The following does essentially the same as the          ////////////////
  ////////////////////////         'custom_kernels'-tutorial!                               ////////////////
  /////////////////////////////////////////////////////////////////////////////////////////////////////////
  
  //manually set up a custom OpenCL context:
  std::vector<cl_device_id> device_id_array;
  
  //get all available devices
  viennacl::ocl::platform pf;
  std::cout << "Platform info: " << pf.info() << std::endl;
  std::vector<viennacl::ocl::device> devices = pf.devices(CL_DEVICE_TYPE_DEFAULT);
  std::cout << devices[0].name() << std::endl;
  std::cout << "Number of devices for custom context: " << devices.size() << std::endl;
  
  //set up context using all found devices:
  for (size_t i=0; i<devices.size(); ++i)
  {
      device_id_array.push_back(devices[i].id());
  }
     
  std::cout << "Creating context..." << std::endl;
  cl_int err;
  cl_context my_context = clCreateContext(0, device_id_array.size(), &(device_id_array[0]), NULL, NULL, &err);
  VIENNACL_ERR_CHECK(err);
   
  
  //create two Vectors:
  unsigned int vector_size = 10;
  std::vector<ScalarType> vec1(vector_size);
  std::vector<ScalarType> vec2(vector_size);
  std::vector<ScalarType> result(vector_size);
  
  //
  // fill the operands vec1 and vec2:
  //
  for (unsigned int i=0; i<vector_size; ++i)
  {
    vec1[i] = static_cast<ScalarType>(i);
    vec2[i] = static_cast<ScalarType>(vector_size-i);
  }
  
  //
  // create memory in OpenCL context:
  //
  cl_mem mem_vec1 = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(vec1[0]), &err);
  VIENNACL_ERR_CHECK(err);
  cl_mem mem_vec2 = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(vec2[0]), &err);
  VIENNACL_ERR_CHECK(err);
  cl_mem mem_result = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(result[0]), &err);
  VIENNACL_ERR_CHECK(err);

  // 
  // create a command queue for each device:
  // 
  
  std::vector<cl_command_queue> queues(devices.size());
  for (size_t i=0; i<devices.size(); ++i)
  {
    queues[i] = clCreateCommandQueue(my_context, devices[i].id(), 0, &err);
    VIENNACL_ERR_CHECK(err);
  }
  
  // 
  // create and build a program in the context:
  // 
  size_t source_len = std::string(my_compute_program).length();
  cl_program my_prog = clCreateProgramWithSource(my_context, 1, &my_compute_program, &source_len, &err);
  err = clBuildProgram(my_prog, 0, NULL, NULL, NULL, NULL);
  
/*            char buffer[1024];
            cl_build_status status;
            clGetProgramBuildInfo(my_prog, devices[1].id(), CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
            clGetProgramBuildInfo(my_prog, devices[1].id(), CL_PROGRAM_BUILD_LOG, sizeof(char)*1024, &buffer, NULL);
            std::cout << "Build Scalar: Err = " << err << " Status = " << status << std::endl;
            std::cout << "Log: " << buffer << std::endl;*/
  
  VIENNACL_ERR_CHECK(err);
  
  // 
  // create a kernel from the program:
  // 
  const char * kernel_name = "elementwise_prod";
  cl_kernel my_kernel = clCreateKernel(my_prog, kernel_name, &err);
  VIENNACL_ERR_CHECK(err);

  
  //
  // Execute elementwise_prod kernel on first queue: result = vec1 .* vec2;
  //
  err = clSetKernelArg(my_kernel, 0, sizeof(cl_mem), (void*)&mem_vec1);
  VIENNACL_ERR_CHECK(err);
  err = clSetKernelArg(my_kernel, 1, sizeof(cl_mem), (void*)&mem_vec2);
  VIENNACL_ERR_CHECK(err);
  err = clSetKernelArg(my_kernel, 2, sizeof(cl_mem), (void*)&mem_result);
  VIENNACL_ERR_CHECK(err);
  err = clSetKernelArg(my_kernel, 3, sizeof(unsigned int), (void*)&vector_size);
  VIENNACL_ERR_CHECK(err);
  size_t global_size = vector_size;
  size_t local_size = vector_size;
  err = clEnqueueNDRangeKernel(queues[0], my_kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
  VIENNACL_ERR_CHECK(err);
  
  
  //
  // Read and output result:
  //
  err = clEnqueueReadBuffer(queues[0], mem_vec1, CL_TRUE, 0, sizeof(ScalarType)*vector_size, &(vec1[0]), 0, NULL, NULL);
  VIENNACL_ERR_CHECK(err);
  err = clEnqueueReadBuffer(queues[0], mem_result, CL_TRUE, 0, sizeof(ScalarType)*vector_size, &(result[0]), 0, NULL, NULL);
  VIENNACL_ERR_CHECK(err);

  std::cout << "vec1  : ";
  for (size_t i=0; i<vec1.size(); ++i)
    std::cout << vec1[i] << " ";
  std::cout << std::endl;

  std::cout << "vec2  : ";
  for (size_t i=0; i<vec2.size(); ++i)
    std::cout << vec2[i] << " ";
  std::cout << std::endl;

  std::cout << "result: ";
  for (size_t i=0; i<result.size(); ++i)
    std::cout << result[i] << " ";
  std::cout << std::endl;
  
  ////////////////////////////////////////////////////////////////////////////////////////////////////////
  /////////////////////// Part 2: Let ViennaCL use the already created context: //////////////////////////
  ////////////////////////////////////////////////////////////////////////////////////////////////////////

  //Tell ViennaCL to use the previously created context.
  //This context is assigned an id '0' when using viennacl::ocl::switch_context().
  viennacl::ocl::setup_context(0, my_context, device_id_array, queues);
  viennacl::ocl::switch_context(0); //activate the new context (only mandatory with context-id not equal to zero)
  
  //
  // Proof that ViennaCL really uses the new context:
  //
  std::cout << "Existing context: " << my_context << std::endl;
  std::cout << "ViennaCL uses context: " << viennacl::ocl::current_context().handle().get() << std::endl;

  //
  // Wrap existing OpenCL objects into ViennaCL:
  //
  viennacl::vector<ScalarType> vcl_vec1(mem_vec1, vector_size);
  viennacl::vector<ScalarType> vcl_vec2(mem_vec2, vector_size);
  viennacl::vector<ScalarType> vcl_result(mem_result, vector_size);
  viennacl::scalar<ScalarType> vcl_s = 2.0;

  std::cout << "Standard vector operations within ViennaCL:" << std::endl;
  vcl_result = vcl_s * vcl_vec1 + vcl_vec2;
  
  std::cout << "vec1  : ";
  std::cout << vcl_vec1 << std::endl;

  std::cout << "vec2  : ";
  std::cout << vcl_vec2 << std::endl;

  std::cout << "result: ";
  std::cout << vcl_result << std::endl;
  
  //
  // We can also reuse the existing elementwise_prod kernel. 
  // Therefore, we first have to make the existing program known to ViennaCL
  // For more details on the three lines, see tutorial 'custom-kernels'
  //
  std::cout << "Using existing kernel within the OpenCL backend of ViennaCL:" << std::endl;
  viennacl::ocl::program & my_vcl_prog = viennacl::ocl::current_context().add_program(my_prog, "my_compute_program");
  viennacl::ocl::kernel & my_vcl_kernel = my_vcl_prog.add_kernel("elementwise_prod");
  viennacl::ocl::enqueue(my_vcl_kernel(vcl_vec1, vcl_vec2, vcl_result, static_cast<cl_uint>(vcl_vec1.size())));  //Note that size_t might differ between host and device. Thus, a cast to cl_uint is necessary here.
  
  std::cout << "vec1  : ";
  std::cout << vcl_vec1 << std::endl;

  std::cout << "vec2  : ";
  std::cout << vcl_vec2 << std::endl;

  std::cout << "result: ";
  std::cout << vcl_result << std::endl;
  
  
  //
  // Since a linear piece of memory can be interpreted in several ways, 
  // we will now create a 3x3 row-major matrix out of the linear memory in mem_vec1/
  // The first three entries in vcl_vec2 and vcl_result are used to carry out matrix-vector products:
  //
  viennacl::matrix<ScalarType> vcl_matrix(mem_vec1, 3, 3);
  
  vcl_vec2.resize(3);   //note that the resize operation leads to new memory, thus vcl_vec2 is now at a different memory location (values are copied)
  vcl_result.resize(3); //note that the resize operation leads to new memory, thus vcl_vec2 is now at a different memory location (values are copied)
  vcl_result = viennacl::linalg::prod(vcl_matrix, vcl_vec2);

  std::cout << "result of matrix-vector product: ";
  std::cout << vcl_result << std::endl;

  //
  //  That's it.
  //
  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
  
  return 0;
}
Example #23
0
	int main()
	{
	cl_int num_rand = 4096*256; /* The number of random numbers generated using one generator */
	int count_all, i, num_generator = sizeof(mts)/sizeof(mts[0]); /* The number of generators */
	double pi;
	cl_platform_id platform_id = NULL;
	cl_uint ret_num_platforms;
	cl_device_id device_id = NULL;
	cl_uint ret_num_devices;
	cl_context context = NULL;
	cl_command_queue command_queue = NULL;
	cl_program program = NULL;
	cl_kernel kernel_mt = NULL, kernel_pi = NULL;
	size_t kernel_code_size;
	char *kernel_src_str;
	cl_uint *result;
	cl_int ret;
	FILE *fp;
	cl_mem rand, count;
	size_t global_item_size[3], local_item_size[3];
	cl_mem dev_mts;
	cl_event ev_mt_end, ev_pi_end, ev_copy_end;
	cl_ulong prof_start, prof_mt_end, prof_pi_end, prof_copy_end;
 
	clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
	clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id,
	&ret_num_devices);
	context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
	result = (cl_uint*)malloc(sizeof(cl_uint)*num_generator);
 
	command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret);
	fp = fopen("mt.cl", "r");
	kernel_src_str = (char*)malloc(MAX_SOURCE_SIZE);
	kernel_code_size = fread(kernel_src_str, 1, MAX_SOURCE_SIZE, fp);
	fclose(fp);
 
	/* Create output buffer */
	rand = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint)*num_rand*num_generator, NULL, &ret);
	count = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint)*num_generator, NULL, &ret);
 
	/* Build Program*/
	program = clCreateProgramWithSource(context, 1, (const char **)&kernel_src_str,
	(const size_t *)&kernel_code_size, &ret);
	clBuildProgram(program, 1, &device_id, "", NULL, NULL);
	kernel_mt = clCreateKernel(program, "genrand", &ret);
	kernel_pi = clCreateKernel(program, "calc_pi", &ret);
 
	/* Create input parameter */
	dev_mts = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(mts), NULL, &ret);
	clEnqueueWriteBuffer(command_queue, dev_mts, CL_TRUE, 0, sizeof(mts), mts, 0, NULL, NULL);
 
	/* Set Kernel Arguments */
	clSetKernelArg(kernel_mt, 0, sizeof(cl_mem), (void*)&rand); /* Random numbers (output of genrand) */
	clSetKernelArg(kernel_mt, 1, sizeof(cl_mem), (void*)&dev_mts); /* MT parameter (input to genrand) */
	clSetKernelArg(kernel_mt, 2, sizeof(num_rand), &num_rand); /* Number of random numbers to generate */
 
	clSetKernelArg(kernel_pi, 0, sizeof(cl_mem), (void*)&count); /* Counter for points within circle (output of calc_pi) */
	clSetKernelArg(kernel_pi, 1, sizeof(cl_mem), (void*)&rand); /* Random numbers (input to calc_pi) */
	clSetKernelArg(kernel_pi, 2, sizeof(num_rand), &num_rand); /* Number of random numbers used */
 
	global_item_size[0] = num_generator; global_item_size[1] = 1; global_item_size[2] = 1;
	local_item_size[0] = num_generator; local_item_size[1] = 1; local_item_size[2] = 1;
 
	/* Create a random number array */
	clEnqueueNDRangeKernel(command_queue, kernel_mt, 1, NULL, global_item_size, local_item_size, 0, NULL, &ev_mt_end);
 
	/* Compute PI */
	clEnqueueNDRangeKernel(command_queue, kernel_pi, 1, NULL, global_item_size, local_item_size, 0, NULL, &ev_pi_end);
 
	/* Get result */
	clEnqueueReadBuffer(command_queue, count, CL_TRUE, 0, sizeof(cl_uint)*num_generator, result, 0, NULL, &ev_copy_end);
 
	/* Average the values of PI */
	count_all = 0;
	for (i=0; i < num_generator; i++) {
	count_all += result[i];
	}
 
	pi = ((double)count_all)/(num_rand * num_generator) * 4;
	printf("pi = %f\n", pi);
 
	/* Get execution time info */
	clGetEventProfilingInfo(ev_mt_end, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &prof_start, NULL);
	clGetEventProfilingInfo(ev_mt_end, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &prof_mt_end, NULL);
	clGetEventProfilingInfo(ev_pi_end, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &prof_pi_end, NULL);
	clGetEventProfilingInfo(ev_copy_end, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &prof_copy_end, NULL);
 
	printf(" mt: %f[ms]\n"
		" pi: %f[ms]\n"
		" copy: %f[ms]\n",
		(prof_mt_end - prof_start)/(1000000.0),
		(prof_pi_end - prof_mt_end)/(1000000.0),
		(prof_copy_end - prof_pi_end)/(1000000.0));
 
	clReleaseEvent(ev_mt_end);
	clReleaseEvent(ev_pi_end);
	clReleaseEvent(ev_copy_end);
 
	clReleaseMemObject(rand);
	clReleaseMemObject(count);
	clReleaseKernel(kernel_mt);
	clReleaseKernel(kernel_pi);
	clReleaseProgram(program);
	clReleaseCommandQueue(command_queue);
	clReleaseContext(context);
	free(kernel_src_str);
	free(result);
	return 0;
}
Example #24
0
int main(void) {
    const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // Generate the input array on the host.
    float h_a[ARRAY_SIZE];
    float h_b[ARRAY_SIZE];
    for (int i = 0; i < ARRAY_SIZE; i++) {
        h_a[i] = (float)i;
        h_b[i] = (float)(2 * i);
    }

    float h_c[ARRAY_SIZE];

    FILE *fp;
    char *source_str;
    size_t source_size;

    fp = fopen("vectors_cl.cl", "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char *)malloc(MAX_SOURCE_SIZE);
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);

    // Get platform and device information
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1,
                         &device_id, &ret_num_devices);

    // Create an OpenCL context
    cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);

    // Create a command queue
    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

    // Create memory buffers on the device for each vector
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                      ARRAY_BYTES, NULL, &ret);
    cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                      ARRAY_BYTES, NULL, &ret);
    cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                      ARRAY_BYTES, NULL, &ret);

    // Copy h_a and h_b to memory buffer
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
                               ARRAY_BYTES, h_a, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
                               ARRAY_BYTES, h_b, 0, NULL, NULL);

    // Create a program from the kernel source
    cl_program program = clCreateProgramWithSource(context, 1,
        (const char **)&source_str, (const size_t *)&source_size, &ret);
    if (ret != 0) {
        printf("clCreateProgramWithSource returned non-zero status %d\n\n", ret);
        exit(1);
    }

    // Build the program
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    if (ret != 0) {
        printf("clBuildProgram returned non-zero status %d: ", ret);

        if (ret == CL_INVALID_PROGRAM) {
            printf("invalid program\n");
        } else if (ret == CL_INVALID_VALUE) {
            printf("invalid value\n");
        } else if (ret == CL_INVALID_DEVICE) {
            printf("invalid device\n");
        } else if (ret == CL_INVALID_BINARY) {
            printf("invalid binary\n");
        } else if (ret == CL_INVALID_BUILD_OPTIONS) {
            printf("invalid build options\n");
        } else if (ret == CL_INVALID_OPERATION) {
            printf("invalid operation\n");
        } else if (ret == CL_COMPILER_NOT_AVAILABLE) {
            printf("compiler not available\n");
        } else if (ret == CL_BUILD_PROGRAM_FAILURE) {
            printf("build program failure\n");

            // Determine the size of the log
            size_t log_size;
            clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

            // Allocate memory for the log
            char *log = (char *) malloc(log_size);

            // Get the log
            clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);

            // Print the log
            printf("%s\n", log);
        } else if (ret == CL_OUT_OF_HOST_MEMORY) {
            printf("out of host memory\n");
        }
        exit(1);
    }

    // Create the OpenCL kernel
    cl_kernel kernel = clCreateKernel(program, "add", &ret);

    // Set the arguments of the kernel
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
    size_t array_size = ARRAY_SIZE;
    ret = clSetKernelArg(kernel, 3, sizeof(const size_t), (void *)&array_size);

    // Execute the OpenCL kernel on the list
    size_t global_item_size = ARRAY_SIZE; // Process the entire lists
    size_t local_item_size = 1; // Divide work items into groups of 64
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
            &global_item_size, &local_item_size, 0, NULL, NULL);

    // Read the memory buffer C on the device to the local variable C
    ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
                              ARRAY_BYTES, h_c, 0, NULL, NULL);

    // Print out the resulting array.
    for (int i = 0; i < 8; i++) {
        printf("%d + %d = %d", (int)h_a[i], (int)h_b[i], (int)h_c[i]);
        printf(((i % 4) != 3) ? "\t" : "\n");
    }

    printf("...\n");

    for (int i = ARRAY_SIZE - 8; i < ARRAY_SIZE; i++) {
        printf("%d + %d = %d",
               (int)h_a[i], (int)h_b[i], (int)h_c[i]);
        printf(((i % 4) != 3) ? "\t" : "\n");
    }

    // Clean up
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(a_mem_obj);
    ret = clReleaseMemObject(b_mem_obj);
    ret = clReleaseMemObject(c_mem_obj);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);

    return 0;
}
Example #25
0
// main() for simple buffer and sub-buffer example
//
int main(int argc, char** argv)
{
    cl_int errNum;
    cl_uint numPlatforms;
    cl_uint numDevices;
    cl_platform_id * platformIDs;
    cl_device_id * deviceIDs;
    cl_context context;
    cl_program program;
    std::vector<cl_kernel> kernels;
    std::vector<cl_command_queue> queues;
    std::vector<cl_mem> buffers;
    int * inputOutput;
    std::cout << "Simple buffer and sub-buffer Example" << std::endl;
    // First, select an OpenCL platform to run on.
    errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
    checkErr(
             (errNum != CL_SUCCESS) ?
             errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS),
             "clGetPlatformIDs");
    platformIDs = (cl_platform_id *)alloca(sizeof(cl_platform_id) * numPlatforms);
    std::cout << "Number of platforms: \t" << numPlatforms << std::endl;
    errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
    checkErr(
             (errNum != CL_SUCCESS) ?
             errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS),
             "clGetPlatformIDs");
    std::ifstream srcFile("simple.cl");
    
    checkErr(srcFile.is_open() ? CL_SUCCESS : -1, "reading simple.cl");
    
    std::string srcProg(
                        std::istreambuf_iterator<char>(srcFile),
                        (std::istreambuf_iterator<char>()));
    const char * src = srcProg.c_str();
    size_t length = srcProg.length();
    deviceIDs = NULL;
    DisplayPlatformInfo(
                        platformIDs[PLATFORM_INDEX],
                        CL_PLATFORM_VENDOR,
                        "CL_PLATFORM_VENDOR");
    errNum = clGetDeviceIDs(
                            platformIDs[PLATFORM_INDEX],
                            CL_DEVICE_TYPE_ALL,
                            0,
                            NULL,
                            &numDevices);
    if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND){
        checkErr(errNum, "clGetDeviceIDs");
    }
    
    deviceIDs = (cl_device_id *)alloca(
                                       sizeof(cl_device_id) * numDevices);
    errNum = clGetDeviceIDs(
                            platformIDs[PLATFORM_INDEX],
                            CL_DEVICE_TYPE_ALL,
                            numDevices,
                            &deviceIDs[0],
                            NULL);
    checkErr(errNum, "clGetDeviceIDs");
    
    cl_context_properties contextProperties[] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platformIDs[PLATFORM_INDEX],
        0
    };
    
    context = clCreateContext(
                              contextProperties,
                              numDevices,
                              deviceIDs,
                              NULL,
                              NULL,
                              &errNum);
    
    checkErr(errNum, "clCreateContext");
    // Create program from source
    program = clCreateProgramWithSource(
                                        context,
                                        1,
                                        &src,
                                        &length,
                                        &errNum);
    checkErr(errNum, "clCreateProgramWithSource");
    
    // Build program
    errNum = clBuildProgram(
                            program,
                            numDevices,
                            deviceIDs,
                            "-I.",
                            NULL,
                            NULL);

    if (errNum != CL_SUCCESS){
        // Determine the reason for the error
        char buildLog[16384];
        clGetProgramBuildInfo(
                              program,
                              deviceIDs[0],
                              CL_PROGRAM_BUILD_LOG,
                              sizeof(buildLog),
                              buildLog,
                              NULL);
        std::cerr << "Error in OpenCL C source: " << std::endl;
        std::cerr << buildLog;
        checkErr(errNum, "clBuildProgram");
    }
        // create buffers and sub-buffers
        inputOutput = new int[NUM_BUFFER_ELEMENTS * numDevices];
        for (unsigned int i = 0; i < NUM_BUFFER_ELEMENTS * numDevices; i++)
        {
            inputOutput[i] = i;
        }
        
        // create a single buffer to cover all the input data
        cl_mem buffer = clCreateBuffer(
                                       context,
                                       CL_MEM_READ_WRITE,
                                       sizeof(int) * NUM_BUFFER_ELEMENTS * numDevices,
                                       NULL,
                                       &errNum);
        checkErr(errNum, "clCreateBuffer");
        buffers.push_back(buffer);
        // now for all devices other than the first create a sub-buffer
        for (unsigned int i = 1; i < numDevices; i++)
        {
            cl_buffer_region region =
            {
                NUM_BUFFER_ELEMENTS * i * sizeof(int),
                NUM_BUFFER_ELEMENTS * sizeof(int)
            };
            buffer = clCreateSubBuffer(
                                       buffers[0],
                                       CL_MEM_READ_WRITE,
                                       CL_BUFFER_CREATE_TYPE_REGION,
                                       &region,
                                       &errNum);
            checkErr(errNum, "clCreateSubBuffer");
            buffers.push_back(buffer);
        }
        // Create command queues
        for (int i = 0; i < numDevices; i++)
        {
            InfoDevice<cl_device_type>::display(deviceIDs[i], CL_DEVICE_TYPE, "CL_DEVICE_TYPE");
            cl_command_queue queue =
            clCreateCommandQueue(
                                 context,
                                 deviceIDs[i],
                                 0,
                                 &errNum);
            checkErr(errNum, "clCreateCommandQueue");
            queues.push_back(queue);
            cl_kernel kernel = clCreateKernel(
                                              program,
                                              "square",
                                              &errNum);
            checkErr(errNum, "clCreateKernel(square)");
            errNum = clSetKernelArg(
                                    kernel,
                                    0,
                                    sizeof(cl_mem), (void *)&buffers[i]);
            checkErr(errNum, "clSetKernelArg(square)");
            kernels.push_back(kernel);
            // Write input data
            clEnqueueWriteBuffer(
                                 queues[0],
                                 buffers[0],
                                 CL_TRUE,
                                 0,
                                 sizeof(int) * NUM_BUFFER_ELEMENTS * numDevices,
                                 (void*)inputOutput,
                                 0,
                                 NULL,
                                 NULL);
            std::vector<cl_event> events;
            // call kernel for each device
            for (int i = 0; i < queues.size(); i++)
            {
                cl_event event;
                size_t gWI = NUM_BUFFER_ELEMENTS;
                errNum = clEnqueueNDRangeKernel(
                                                queues[i],
                                                kernels[i],
                                                1,
                                                NULL,
                                                (const size_t*)&gWI,
                                                (const size_t*)NULL,
                                                0,
                                                0,
                                                &event);
                events.push_back(event);
            }
            // Technically don't need this as we are doing a blocking read
            // with in-order queue.
            clWaitForEvents(events.size(), events.data());
            // Read back computed data
            clEnqueueReadBuffer(
                                queues[0],
                                buffers[0],
                                CL_TRUE,
                                0,
                                sizeof(int) * NUM_BUFFER_ELEMENTS * numDevices,
                                (void*)inputOutput,
                                0,
                                NULL,
                                NULL);
            // Display output in rows
            for (unsigned i = 0; i < numDevices; i++)
            {
                for (unsigned elems = i * NUM_BUFFER_ELEMENTS;
                     elems < ((i+1) * NUM_BUFFER_ELEMENTS);
                     elems++)
                {
                    std::cout << " " << inputOutput[elems];
                }
                std::cout << std::endl;
            }
            std::cout << "Program completed successfully" << std::endl;
            return 0; 
        }
}
Example #26
0
int main() {

   /* OpenCL data structures */
   cl_device_id device;
   cl_context context;
   cl_command_queue queue;
   cl_program program;
   cl_kernel kernel;
   cl_int err;

   /* Data and events */
   char *kernel_msg;
   float data[4096];
   cl_mem data_buffer;
   cl_event kernel_event, read_event;   
   
   /* Create a device and context */
   device = create_device();
   context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
   if(err < 0) {
      perror("Couldn't create a context");
      exit(1);   
   }     

   /* Build the program and create a kernel */
   program = build_program(context, device, PROGRAM_FILE);
   kernel = clCreateKernel(program, KERNEL_FUNC, &err);
   if(err < 0) {
      perror("Couldn't create a kernel");
      exit(1);   
   };

   /* Create a write-only buffer to hold the output data */
   data_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
         sizeof(data), NULL, &err);
   if(err < 0) {
      perror("Couldn't create a buffer");
      exit(1);   
   };         

   /* Create kernel argument */
   err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_buffer);
   if(err < 0) {
      perror("Couldn't set a kernel argument");
      exit(1);   
   };

   /* Create a command queue */
   queue = clCreateCommandQueue(context, device, 0, &err);
   if(err < 0) {
      perror("Couldn't create a command queue");
      exit(1);   
   };

   /* Enqueue kernel */
   err = clEnqueueTask(queue, kernel, 0, NULL, &kernel_event);
   if(err < 0) {
      perror("Couldn't enqueue the kernel");
      exit(1);   
   }

   /* Read the buffer */
   err = clEnqueueReadBuffer(queue, data_buffer, CL_FALSE, 0, 
      sizeof(data), &data, 0, NULL, &read_event);
   if(err < 0) {
      perror("Couldn't read the buffer");
      exit(1);   
   }
 
   /* Set event handling routines */
   kernel_msg = "The kernel finished successfully.\n\0";
   err = clSetEventCallback(kernel_event, CL_COMPLETE, 
         &kernel_complete, kernel_msg);
   if(err < 0) {
      perror("Couldn't set callback for event");
      exit(1);   
   }
   clSetEventCallback(read_event, CL_COMPLETE, &read_complete, data);

   /* Deallocate resources */
   clReleaseMemObject(data_buffer);
   clReleaseKernel(kernel);
   clReleaseCommandQueue(queue);
   clReleaseProgram(program);
   clReleaseContext(context);
   return 0;
}
Example #27
0
int main( int argc, char **argv )
{	
  struct pb_Parameters *params;
  params = pb_ReadParameters(&argc, argv);
  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL))
    {
      fprintf(stderr, "Expecting one input filename\n");
      exit(-1);
    }

  int err = 0;
  if(argc != 3)
    err |= 1;
  else {
    char* numend;
    N = strtol(argv[1], &numend, 10);
    if(numend == argv[1])
      err |= 2;
    B = strtol(argv[2], &numend, 10);
    if(numend == argv[2])
      err |= 4;
  }

  if(err)
  {
    fprintf(stderr, "Expecting two integers for N and B\n");
    exit(-1);
  }

  //8*1024*1024;
  int n_bytes = N * B* sizeof(float2);
  int nthreads = T;
  
  struct pb_TimerSet timers;
  pb_InitializeTimerSet(&timers);
  pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);
  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);

  float *shared_source =(float *)malloc(n_bytes);  
  float2 *source    = (float2 *)malloc( n_bytes );
  float2 *result    = (float2 *)calloc( N*B, sizeof(float2) );

  inputData(params->inpFiles[0],(float*)source,N*B*2);

  // OpenCL Code 
  cl_int clErrNum;
  pb_Context* pb_context;
  pb_context = pb_InitOpenCLContext(params);
  if (pb_context == NULL) {
    fprintf (stderr, "Error: No OpenCL platform/device can be found."); 
    return -1;
  }

  cl_int clStatus;
  cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
  cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
  cl_context clContext = (cl_context) pb_context->clContext;
  cl_command_queue clCommandQueue;

  cl_program clProgram;

  cl_kernel fft_kernel;

  cl_mem d_source, d_work; //float2 *d_source, *d_work;
  cl_mem d_shared_source; //float *d_shared_source;

  clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clErrNum);
  OCL_ERRCK_VAR(clErrNum);
  
  pb_SetOpenCL(&clContext, &clCommandQueue);
  pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);

  const char *source_path = "src/opencl_nvidia/fft_kernel.cl";
  char *sourceCode;
  sourceCode = readFile(source_path);
  if (sourceCode == NULL) {
    fprintf(stderr, "Could not load program source of '%s'\n", source_path); exit(1);
  }

  clProgram = clCreateProgramWithSource(clContext, 1, (const char **)&sourceCode, NULL, &clErrNum);
  OCL_ERRCK_VAR(clErrNum);

  free(sourceCode);

  /*
    char compileOptions[1024];
  //                -cl-nv-verbose // Provides register info for NVIDIA devices
  // Set all Macros referenced by kernels
  sprintf(compileOptions, "\
                -D PRESCAN_THREADS=%u\
                -D KB=%u -D UNROLL=%u\
                -D BINS_PER_BLOCK=%u -D BLOCK_X=%u",

                prescanThreads,
                lmemKB, UNROLL,
                bins_per_block, blockX
            ); 
  */
  OCL_ERRCK_RETVAL ( clBuildProgram(clProgram, 1, &clDevice, NULL /*compileOptions*/, NULL, NULL) );


  char *build_log;
  size_t ret_val_size;
  OCL_ERRCK_RETVAL ( clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size) );
  build_log = (char *)malloc(ret_val_size+1);
  OCL_ERRCK_RETVAL ( clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL) );

  // to be careful, terminate with \0
  build_log[ret_val_size] = '\0';

  fprintf(stderr, "%s\n", build_log );

  fft_kernel = clCreateKernel(clProgram, "GPU_FftShMem", &clErrNum);
  OCL_ERRCK_VAR(clErrNum);

  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
  // allocate & copy device memory
  d_shared_source = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, n_bytes, shared_source, &clErrNum);  OCL_ERRCK_VAR(clErrNum);

  d_source = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, n_bytes, source, &clErrNum);  OCL_ERRCK_VAR(clErrNum);
  
  //result is initially zero'd out
  d_work = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, n_bytes, result, &clErrNum);  OCL_ERRCK_VAR(clErrNum);

  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
  
  size_t block[1] = { nthreads };
  size_t grid[1] = { B*block[0] };
  
  OCL_ERRCK_RETVAL( clSetKernelArg(fft_kernel, 0, sizeof(cl_mem), (void *)&d_source) );
  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, fft_kernel, 1, 0,
                            grid, block, 0, 0, 0) ); 	
  pb_SwitchToTimer(&timers, pb_TimerID_COPY);

  // copy device memory to host
  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, d_source, CL_TRUE, 
                        0, // Offset in bytes
                        n_bytes, // Size of data to read
                        result, // Host Source
                        0, NULL, NULL) );
                        
  if (params->outFile) {
    /* Write result to file */
    pb_SwitchToTimer(&timers, pb_TimerID_IO);
    outputData(params->outFile, (float*)result, N*B*2);
    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
  }

  OCL_ERRCK_RETVAL ( clReleaseMemObject(d_source) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(d_work) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(d_shared_source) );
  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);

  free(shared_source);  
  free(source);
  free(result);
  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
  pb_PrintTimerSet(&timers);
  
  pb_DestroyTimerSet(&timers);
  
  return 0;
}
Example #28
0
int main()
{
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_context context = NULL;
    cl_command_queue command_queue = NULL;
    cl_mem memobj = NULL;
    cl_program program = NULL;
    cl_kernel kernel = NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret;

    float mem[MEM_SIZE];

    FILE *fp;
    char fileName[] = "./kernel.clbin";
    size_t binary_size;
    char *binary_buf;
    cl_int binary_status;
    cl_int i;

    /* カーネルを含むオブジェクトファイルをロード */
    fp = fopen(fileName, "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    binary_buf = (char *)malloc(MAX_BINARY_SIZE);
    binary_size = fread( binary_buf, 1, MAX_BINARY_SIZE, fp );
    fclose( fp );

    /* データを初期化 */
    for( i = 0; i < MEM_SIZE; i++ ) {
        mem[i] = i;
    }

    /* プラットフォーム・デバイスの情報の取得 */
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);

    /* OpenCLコンテキストの作成 */
    context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
  
    /* コマンドキューの作成 */
    command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
  
    /* メモリバッファの作成 */
    memobj = clCreateBuffer(context, CL_MEM_READ_WRITE, MEM_SIZE * sizeof(float), NULL, &ret);

    /* メモリバッファにデータを転送 */
    ret = clEnqueueWriteBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(float), mem, 0, NULL, NULL);

    /* 読み込んだバイナリからカーネルプログラムを作成 */
    program = clCreateProgramWithBinary(context, 1, &device_id, (const size_t *)&binary_size, 
                                        (const unsigned char **)&binary_buf, &binary_status, &ret);
    
    /* OpenCLカーネルの作成 */
    kernel = clCreateKernel(program, "vecAdd", &ret);
    printf("err:%d\n", ret);

    /* OpenCLカーネル引数の設定 */
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj);

    size_t global_work_size[3] = {MEM_SIZE, 0, 0};
    size_t local_work_size[3]  = {MEM_SIZE, 0, 0};

    /* OpenCLカーネルを実行 */
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);

    /* メモリバッファから結果を取得 */
    ret = clEnqueueReadBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(float), mem, 0, NULL, NULL);

    /* 結果の表示 */
    for(i=0; i<MEM_SIZE; i++) {
        printf("mem[%d] : %f\n", i, mem[i]);
    }
  
    /* 終了処理 */
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(memobj);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);

    free(binary_buf);

    return 0;
}
Example #29
0
void b2CLNarrowPhase::UpdateContactPairs(int contactNum, int *pContactNums, int maxContactNum/*, b2ContactListener* listener*/)
{
	//// for debug
	//int *fill_data = new int[contactNum];
	//for (int i=0; i<contactNum; i++)
	//	fill_data[i] = 231;
	//b2CLDevice::instance().copyArrayToDevice(b2CLCommonData::instance().manifoldBinaryBitListBuffer, fill_data, 0, sizeof(int) * contactNum);
	//delete [] fill_data;

	//// for debug
	//int* globalIndices = new int[50*4];
	//b2CLDevice::instance().copyArrayFromDevice(globalIndices, b2CLCommonData::instance().globalIndicesBuffer, 0, sizeof(int)*4*50, true);
	//int* indices = new int[200];
	//b2CLDevice::instance().copyArrayFromDevice(indices, b2CLCommonData::instance().pairIndicesBuffer, 0, sizeof(int)*200, true);
	//int *test = new int[contactNum];
 //	memset(test, 0, sizeof(int)*contactNum);
	//for (int i=0; i<contactNum; i++)
	//	test[indices[i]] = 1;
	//for (int i=0; i<contactNum; i++)
	//	if (test[i]==0)
	//		int a = 1;
	//delete [] test;
	//delete [] indices;
	//delete [] globalIndices;

  	for (int contactType=0; contactType<b2Shape::contact_type_num; contactType++)
	{
		if (pContactNums[0]!=0)
			int a = 0;
		if (pContactNums[contactType]>0)
		{
			unsigned int a = 0;

			cl_kernel collideKernel;

			switch (contactType)
			{
			case 0: // circle-circle
				collideKernel = collideCirclesKernel;
				break;
			case 1: // circle-polygon
				collideKernel = collidePolygonAndCircleKernel;
				break;
			case 2: // polygon-polygon
				collideKernel = collidePolygonsKernel;
				break;
			case 3: // edge-circle
				collideKernel = collideEdgeAndCircleKernel;
				break;
			case 4: // edge-polygon
				collideKernel = collideEdgeAndPolygonKernel;
				break;
			default:
				printf("Error! Unsupported contact type: %d!\n", contactType);
				exit(0);
			}
        
			int err = CL_SUCCESS;
			err |= clSetKernelArg(collideKernel,  a++, sizeof(cl_mem), &(b2CLCommonData::instance().manifoldListBuffers[b2CLCommonData::instance().currentManifoldBuffer]));
		#if defined(SCAN_OPENCL)
			err |= clSetKernelArg(collideKernel,  a++, sizeof(cl_mem), &(b2CLCommonData::instance().manifoldBinaryBitListBuffer));
		#endif
			err |= clSetKernelArg(collideKernel,  a++, sizeof(cl_mem), &(b2CLCommonData::instance().shapeListBuffer));
			err |= clSetKernelArg(collideKernel,  a++, sizeof(cl_mem), &(b2CLCommonData::instance().xfListBuffer));
			err |= clSetKernelArg(collideKernel,  a++, sizeof(cl_mem), &(b2CLCommonData::instance().globalIndicesBuffer));
			err |= clSetKernelArg(collideKernel,  a++, sizeof(cl_mem), &(b2CLCommonData::instance().pairIndicesBuffer));
			err |= clSetKernelArg(collideKernel,  a++, sizeof(int), &maxContactNum);
			err |= clSetKernelArg(collideKernel,  a++, sizeof(int), pContactNums+contactType);
			if (err != CL_SUCCESS)
			{
				printf("Error: %s: Failed to set kernel arguments!\n", (char *) collidePolygonsKernel);
				return;
			}

			int group_num = (pContactNums[contactType] + kernel_work_group_size-1)/kernel_work_group_size;
        
			size_t global = group_num * kernel_work_group_size;
			//cout << contactNum << ", " << group_num << endl;
			err = CL_SUCCESS;
			err |= clEnqueueNDRangeKernel(b2CLDevice::instance().GetCommandQueue(), collideKernel, 1, NULL, &global, &kernel_work_group_size, 0, NULL, NULL);
			if (err != CL_SUCCESS)
			{
				printf("Error: Collide Kernel: Failed to execute kernel!\n");
				return;
			}

		#ifdef _DEBUG
			//// for debug
			//b2clManifold *testManifold = new b2clManifold[contactNum];
			//b2CLDevice::instance().copyArrayFromDevice(testManifold, b2CLCommonData::instance().manifoldListBuffers[b2CLCommonData::instance().currentManifoldBuffer], 0, sizeof(b2clManifold)*contactNum,true);

			//int* input = new int[contactNum];
			//b2CLDevice::instance().copyArrayFromDevice(input, b2CLCommonData::instance().manifoldBinaryBitListBuffer, 0, sizeof(int)*contactNum, true);
			//for (int i=0; i<contactNum; i++)
			//{
			//	if (input[i]!=0 && input[i]!=1)
			//		int a = 0;
			//}

			//delete [] testManifold;
			//delete [] input;
		#endif
		}
	}
}
Example #30
0
void* RenderDisplay(void* arguments)
{
    int err = 0;

    struct PASSING_OCL *ocl_info = (struct PASSING_OCL *)arguments;
    cl_command_queue commandQueue;
    cl_uint RandomSeeds[WorkAmount * 2];
    cl_mem Pixels;
    cl_mem Seeds;
    unsigned int PixelData[WorkAmount];

    for(int i = 0; i< WorkAmount*2; i++)
    {
        RandomSeeds[i] = rand();
        if (RandomSeeds[i] < 2)
            RandomSeeds[i] = 2;
    }
    commandQueue = clCreateCommandQueue(ocl_info->ocl.context, ocl_info->ocl.device[ocl_info->device_index], CL_QUEUE_PROFILING_ENABLE, &err);
    if(err != CL_SUCCESS)
    {
        printf("Error: Failed clCreateCommandQueue\n");
        return NULL;
    }

    Pixels = clCreateBuffer(ocl_info->ocl.context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_uint) * WorkAmount, NULL, &err);
    if(err != CL_SUCCESS)
    {
        printf("Error: Failed pixels create buffer\n");
        return NULL;
    }

    Seeds = clCreateBuffer(ocl_info->ocl.context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_uint) * WorkAmount*2, RandomSeeds, &err);
    if(err != CL_SUCCESS)
    {
        printf("Error: Failed Seeds create buffer\n");
        return NULL;
    }

    for(int i = ocl_info->workNDRangeStart; i<ocl_info->workNDRangeEnd; i++)
    {
        size_t globalWorkSize[1] = {WorkAmount};
        size_t localWorkSize[1] = {ocl_info->localSize};

        err = clSetKernelArg(ocl_info->ocl.kernel, 8, sizeof(cl_uint), (void*)&Seeds);
        if(CL_SUCCESS != err)
        {
            printf("Error: Failed to set argument Seeds\n");
            return NULL;
        }

        err = clSetKernelArg(ocl_info->ocl.kernel, 9, sizeof(cl_mem), (void*)&Pixels);
        if(CL_SUCCESS != err)
        {
            printf("Error: Failed to set argument Pixels\n");
            return NULL;
        }

        err = clSetKernelArg(ocl_info->ocl.kernel, 10, sizeof(cl_uint), (void*)&i);
        if(CL_SUCCESS != err)
        {
            printf("Error: Failed to set argument i\n");
            return NULL;
        }

        err = clEnqueueNDRangeKernel(commandQueue, ocl_info->ocl.kernel, 1, NULL, globalWorkSize, localWorkSize, NULL, 0, NULL);
        if(CL_SUCCESS != err)
        {
            printf("Error: Failed to run kernel to run\n");
            return NULL;
        }
        clEnqueueReadBuffer(commandQueue, Pixels, CL_TRUE, 0, WorkAmount * sizeof(cl_uint), PixelData, 0, NULL, NULL);
        for(int j=0; j<WorkAmount; j++)
        {
            ImagePixel[j + i*WorkAmount] = PixelData[j];
        }

    }

    if(clFinish(commandQueue) != CL_SUCCESS)
    {
        printf("Error: Failed to Finish");
        return NULL;
    }
    return NULL;
}