Example #1
0
static int init_cladsyn(CSOUND *csound, CLADSYN *p){

  int asize, ipsize, fpsize, err;
  cl_device_id device_ids[32], device_id;             
  cl_context context;                
  cl_command_queue commands;          
  cl_program program;                
  cl_kernel kernel1, kernel2;                 
  cl_uint num = 0, nump =  0;
  cl_platform_id platforms[16];
    uint i;

  if(p->fsig->overlap > 1024)
     return csound->InitError(csound, "overlap is too large\n");



  err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 32, device_ids, &num);
  if (err != CL_SUCCESS){
    clGetPlatformIDs(16, platforms, &nump);
    int devs = 0;
    for(i=0; i < nump && devs < 32; i++){
     char name[128];
     clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 128, name, NULL);
     csound->Message(csound, "available platform[%d] %s\n",i, name);
     err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 32-devs, &device_ids[devs], &num);
    if (err != CL_SUCCESS)
     csound->InitError(csound, "failed to find an OpenCL device! %s \n", cl_error_string(err));
    }
    devs += num;
  }

  
  for(i=0; i < num; i++){
  char name[128];
  cl_device_type type;
  clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, 128, name, NULL);
  clGetDeviceInfo(device_ids[i], CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL);
  if(type & CL_DEVICE_TYPE_CPU)
  csound->Message(csound, "available CPU[device %d] %s\n",i, name);
  else  if(type & CL_DEVICE_TYPE_GPU)
  csound->Message(csound, "available GPU[device %d] %s\n",i, name);
  else  if(type & CL_DEVICE_TYPE_ACCELERATOR)
  csound->Message(csound, "available ACCELLERATOR[device %d] %s\n",i, name);
  else 
  csound->Message(csound, "available generic [device %d] %s\n",i, name);;
  }

  // SELECT THE GPU HERE
  if(*p->idev < num)
   device_id = device_ids[(int)*p->idev];
  else
   device_id = device_ids[num-1];

   context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
   if (!context)
     return csound->InitError(csound, "Failed to create a compute context! %s\n", 
                             cl_error_string(err));
  
    // Create a command commands
    //
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
       return csound->InitError(csound, "Failed to create a command commands! %s\n", 
                             cl_error_string(err));
    // Create the compute program from the source buffer
    //
    program = clCreateProgramWithSource(context, 1, (const char **) &code, NULL, &err);
    if (!program)
       return csound->InitError(csound, "Failed to create compute program! %s\n", 
                             cl_error_string(err));
  
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];
        csound->Message(csound, "Failed to build program executable! %s\n", 
                             cl_error_string(err));
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        return csound->InitError(csound, "%s\n", buffer);
    }

    kernel1 = clCreateKernel(program, "sample", &err);
    if (!kernel1 || err != CL_SUCCESS)
      return csound->InitError(csound, "Failed to create sample compute kernel! %s\n", 
                             cl_error_string(err));

   kernel2 = clCreateKernel(program, "update", &err);
    if (!kernel2 || err != CL_SUCCESS)
      return csound->InitError(csound,"Failed to create update compute kernel! %s\n", 
                             cl_error_string(err));
 
  char name[128];
  clGetDeviceInfo(device_id, CL_DEVICE_NAME, 128, name, NULL);
  csound->Message(csound, "using device: %s\n",name);

  p->bins = (p->fsig->N)/2;

  if(*p->inum > 0 && *p->inum < p->bins) p->bins = *p->inum;

  p->vsamps = p->fsig->overlap;
  p->threads = p->bins*p->vsamps;
  p->mthreads = (p->bins > p->vsamps ? p->bins : p->vsamps);

  asize =  p->vsamps*sizeof(cl_float);
  ipsize = (p->bins > p->vsamps ? p->bins : p->vsamps)*sizeof(cl_long);
  fpsize = p->fsig->N*sizeof(cl_float);

  p->out = clCreateBuffer(context,0, asize, NULL, NULL);
  p->frame =   clCreateBuffer(context, CL_MEM_READ_ONLY, fpsize, NULL, NULL);
  p->ph =  clCreateBuffer(context,0, ipsize, NULL, NULL);
  p->amps =  clCreateBuffer(context,0,(p->bins > p->vsamps ? p->bins : p->vsamps)*sizeof(cl_float), NULL, NULL);
 
  // memset needed?

  asize = p->vsamps*sizeof(float);
  if(p->out_.auxp == NULL ||
      p->out_.size < (unsigned long) asize)
    csound->AuxAlloc(csound, asize , &p->out_);

  csound->RegisterDeinitCallback(csound, p, destroy_cladsyn);
  p->count = 0;
  p->context = context;
  p->program = program;
  p->commands = commands;
  p->kernel1 = kernel1;
  p->kernel2 = kernel2;
 
  clGetKernelWorkGroupInfo(p->kernel1, 
       device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(p->wgs1), &p->wgs1, NULL);
  clGetKernelWorkGroupInfo(p->kernel2, 
       device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(p->wgs1), &p->wgs2, NULL);
 
  p->sr = csound->GetSr(csound); 
  clSetKernelArg(p->kernel1, 0, sizeof(cl_mem), &p->out);
  clSetKernelArg(p->kernel1, 1, sizeof(cl_mem), &p->frame);
  clSetKernelArg(p->kernel1, 2, sizeof(cl_mem), &p->ph);
  clSetKernelArg(p->kernel1, 3, sizeof(cl_mem), &p->amps);
  clSetKernelArg(p->kernel1, 5, sizeof(cl_int), &p->bins);
  clSetKernelArg(p->kernel1, 6, sizeof(cl_int), &p->vsamps);
  clSetKernelArg(p->kernel1, 7, sizeof(cl_float), &p->sr);

  clSetKernelArg(p->kernel2, 0, sizeof(cl_mem), &p->out);
  clSetKernelArg(p->kernel2, 1, sizeof(cl_mem), &p->frame);
  clSetKernelArg(p->kernel2, 2, sizeof(cl_mem), &p->ph);
  clSetKernelArg(p->kernel2, 3, sizeof(cl_mem), &p->amps);
  clSetKernelArg(p->kernel2, 5, sizeof(cl_int), &p->bins);
  clSetKernelArg(p->kernel2, 6, sizeof(cl_int), &p->vsamps);
  clSetKernelArg(p->kernel2, 7, sizeof(cl_float),  &p->sr); 
  return OK;
}
Example #2
0
int main(int argc, char **argv){
	
	printf("Check OpenCL environtment\n");

	cl_platform_id platid;
	cl_device_id devid;
	cl_int res;
	size_t param;
	
	/* Query OpenCL, get some information about the returned device */
	clGetPlatformIDs(1u, &platid, NULL);
	clGetDeviceIDs(platid, CL_DEVICE_TYPE_ALL, 1, &devid, NULL);

	cl_char vendor_name[1024] = {0};
	cl_char device_name[1024] = {0};
	clGetDeviceInfo(devid, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, NULL);
	clGetDeviceInfo(devid, CL_DEVICE_NAME,   sizeof(device_name), device_name, NULL);
	printf("Connecting to OpenCL device:\t%s %s\n", vendor_name, device_name);
	
	clGetDeviceInfo(devid, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &param, NULL);
	printf("CL_DEVICE_MAX_COMPUTE_UNITS\t%d\n", param);
	
	clGetDeviceInfo(devid, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &param, NULL);
	printf("CL_DEVICE_MAX_WORK_GROUP_SIZE\t%u\n", param);

	clGetDeviceInfo(devid, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &param, NULL);
	printf("CL_DEVICE_LOCAL_MEM_SIZE\t%ub\n", param);

	/* Check if kernel source exists, we compile argv[1] passed kernel */
	if(argv[1] == NULL) { printf("\nUsage: %s kernel_source.cl kernel_function\n", argv[0]); exit(1); }

	char *kernel_source;
	if(load_program_source(argv[1], &kernel_source)) return 1;
	
	printf("Building from OpenCL source: \t%s\n", argv[1]);
	printf("Compile/query OpenCL_program:\t%s\n", argv[2]);
	
	/* Create context and kernel program */
	cl_context context = 	clCreateContext(0, 1, &devid, NULL, NULL, NULL);
	cl_program pro = 	clCreateProgramWithSource(context, 1, (const char **)&kernel_source, NULL, NULL);
	res = 			clBuildProgram(pro, 1, &devid, "-cl-fast-relaxed-math", NULL, NULL);

	if(res != CL_SUCCESS){
		printf("clBuildProgram failed: %d\n", res); char buf[0x10000];
		clGetProgramBuildInfo(pro, devid, CL_PROGRAM_BUILD_LOG, 0x10000, buf, NULL);
		printf("\n%s\n", buf); return(-1); }

	cl_kernel kernelobj = clCreateKernel(pro, argv[2], &res); 	check_return(res);
	
	/* Get the maximum work-group size for executing the kernel on the device */
	size_t global, local;
	res = clGetKernelWorkGroupInfo(kernelobj, devid, CL_KERNEL_WORK_GROUP_SIZE, sizeof(int), &local, NULL);		check_return(res);
	printf("CL_KERNEL_WORK_GROUP_SIZE\t%u\n", local);
	
	res = clGetKernelWorkGroupInfo(kernelobj, devid, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(cl_ulong), &param, NULL);	check_return(res);
	printf("CL_KERNEL_LOCAL_MEM_SIZE\t%ub\n", param);
	
	cl_command_queue cmd_queue = clCreateCommandQueue(context, devid, CL_QUEUE_PROFILING_ENABLE, NULL);
	if(cmd_queue == NULL) { printf("Compute device setup failed\n"); return(-1); }

	local = 4;
	int n = 2 * local;	//num_group * local workgroup size 
	global = n;
	
	int	num_groups=		global / local,
		allocated_local=	sizeof(data) * local + 
					sizeof(debug) * local;

	data *DP __attribute__ ((aligned(16)));
	DP = calloc(n, sizeof(data) *1);

	debug *dbg __attribute__ ((aligned(16)));
	dbg = calloc(n, sizeof(debug));
	
	printf("global:%d, local:%d, (should be):%d groups\n", global, local, num_groups);
	printf("structs size: %db, %db, %db\n", sizeof(data), sizeof(Elliptic_Curve), sizeof(inv256));
	printf("sets:%d, total of %db needed, allocated _local: %db\n", n, n * sizeof(cl_uint4) *5 *4, allocated_local);

	cl_mem	cl_DP, cl_EC, cl_INV, DEBUG;
	cl_DP = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, n * sizeof(data), NULL, &res);					check_return(res);				
	cl_EC = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY,  1 * sizeof(Elliptic_Curve), NULL, &res);	check_return(res);	//_constant address space
	cl_INV= clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY,  1 * sizeof(u8) * 0x80, NULL, &res);		check_return(res);
	DEBUG = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_WRITE_ONLY, n * sizeof(debug), NULL, &res);		check_return(res);
	
	Elliptic_Curve EC;
	/*	
		Curve domain parameters, (test vectors)
		-------------------------------------------------------------------------------------
		p:	c1c627e1638fdc8e24299bb041e4e23af4bb5427		is prime
		a:	c1c627e1638fdc8e24299bb041e4e23af4bb5424		divisor g = 62980
		b:	877a6d84155a1de374b72d9f9d93b36bb563b2ab		divisor g = 227169643
		Gx: 	010aff82b3ac72569ae645af3b527be133442131		divisor g = 32209245
		Gy: 	46b8ec1e6d71e5ecb549614887d57a287df573cc		divisor g = 972	
		precomputed_per_curve_constants:
		U:	c1c627e1638fdc8e24299bb041e4e23af4bb5425
		V:	3e39d81e9c702371dbd6644fbe1b1dc50b44abd9
		
		already prepared mod p to test:
		a:      07189f858e3f723890a66ec1079388ebd2ed509c
		b:      6043379beb0dade6eed1e9d6de64f4a0c50639d4
		gx:     5ef84aacf4f0ea6752f572d0741f40049f354dca
		gy:     418c695435af6b3d4d7cbb72967395016ef67239
		resulting point:
		P.x:    01718f862ebe9423bd661a65355aa1c86ba330f8		program MUST got this point !!
		P.y:    557e8ed53ffbfe2c990a121967b340f62e0e4fe2
		taken mod p:
		P.x:    41da1a8f74ff8d3f1ce20ef3e9d8865c96014fe3		
		P.y:    73ca143c9badedf2d9d3c7573307115ccfe04f13
	*/	
	u8 *t;
	t = _x_to_u8_buffer("c1c627e1638fdc8e24299bb041e4e23af4bb5427");	memcpy(EC.p, t, 20);
	t = _x_to_u8_buffer("07189f858e3f723890a66ec1079388ebd2ed509c");	memcpy(EC.a, t, 20);
	t = _x_to_u8_buffer("6043379beb0dade6eed1e9d6de64f4a0c50639d4");	memcpy(EC.b, t, 20);
	t = _x_to_u8_buffer("5ef84aacf4f0ea6752f572d0741f40049f354dca");	memcpy(EC.Gx, t, 20);
	t = _x_to_u8_buffer("418c695435af6b3d4d7cbb72967395016ef67239");	memcpy(EC.Gy, t, 20);
	
	t = _x_to_u8_buffer("c1c627e1638fdc8e24299bb041e4e23af4bb5425");	memcpy(EC.U, t, 20);
	t = _x_to_u8_buffer("3e39d81e9c702371dbd6644fbe1b1dc50b44abd9");	memcpy(EC.V, t, 20);

	/* we need to map buffer now to load some k into data */
	DP = clEnqueueMapBuffer(cmd_queue, cl_DP, CL_TRUE, CL_MAP_WRITE, 0, n * sizeof(data),  0, NULL, NULL, &res);	check_return(res);

	t = _x_to_u8_buffer("00542d46e7b3daac8aeb81e533873aabd6d74bb710");
	for(u8 i = 0; i < n; i++) memcpy(DP[i].k, t, 21);
	
	free(t);
//d	for(u8 i = 0; i < n; i++) bn_print("", DP[i].k, 21, 1);

	/* we can alter just a byte into a chosen k to verify that we'll get a different point! */
	//DP[2].k[2] = 0x09;
	
//no	res = clEnqueueWriteBuffer(cmd_queue, cl_DP,  CL_TRUE, 0, n * sizeof(data), &DP, 0, NULL, NULL);	check_return(res);

	res = clEnqueueWriteBuffer(cmd_queue, cl_EC,  CL_TRUE, 0, 1 * sizeof(Elliptic_Curve), &EC, 0, NULL, NULL);	check_return(res);
	res = clEnqueueWriteBuffer(cmd_queue, cl_INV, CL_TRUE, 0, 1 * sizeof(u8) * 0x80, &inv256, 0, NULL, NULL);	check_return(res);

	res = clSetKernelArg(kernelobj, 0, sizeof(cl_mem), &cl_DP);		/* i/o buffer */
	res|= clSetKernelArg(kernelobj, 1, sizeof(data) * local *1, NULL);	//allocate space for __local in kernel (just this!) one * localsize
	res|= clSetKernelArg(kernelobj, 2, sizeof(cl_mem), &cl_EC);
	res|= clSetKernelArg(kernelobj, 3, sizeof(cl_mem), &cl_INV);	
	res|= clSetKernelArg(kernelobj, 4, sizeof(debug) * local *1, NULL);	//allocate space for __local in kernel (just this!) one * localsize
	res|= clSetKernelArg(kernelobj, 5, sizeof(cl_mem), &DEBUG);		//this used to debug kernel output
	check_return(res);

//	printf("n:%d, total of %db needed, allocated _local: %db\n", n, n * sizeof(debug), allocated_local);	
	
	cl_event NDRangeEvent;
	cl_ulong start, end;
	
	/* Execute NDrange */	
	res = clEnqueueNDRangeKernel(cmd_queue, kernelobj, 1, NULL, &global, &local, 0, NULL, &NDRangeEvent);		check_return(res);
//	res = clEnqueueNDRangeKernel(cmd_queue, kernelobj, 1, NULL, &global, NULL, 0, NULL, &NDRangeEvent);		check_return(res);
	
	printf("Read back, Mapping buffer:\t%db\n", n * sizeof(data));

	DP = clEnqueueMapBuffer(cmd_queue, cl_DP, CL_TRUE, CL_MAP_READ, 0, n * sizeof(data),  0, NULL, NULL, &res);	check_return(res);
	dbg =clEnqueueMapBuffer(cmd_queue, DEBUG, CL_TRUE, CL_MAP_READ, 0, n * sizeof(debug), 0, NULL, NULL, &res);	check_return(res);
	
	/* using clEnqueueReadBuffer template */
//	res = clEnqueueReadBuffer(cmd_queue, ST, CL_TRUE, 0, sets * sizeof(cl_uint8), dbg, 0, NULL, NULL);			check_return(res);
		
	clFlush(cmd_queue);
	clFinish(cmd_queue);

	/* get NDRange execution time with internal ocl profiler */
	res = clGetEventProfilingInfo(NDRangeEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
	res|= clGetEventProfilingInfo(NDRangeEvent, CL_PROFILING_COMMAND_END,   sizeof(cl_ulong), &end,   NULL);
	check_return(res);
	printf("kernel execution time:\t\t%.2f ms\n", (float) ((end - start) /1000000));			//relative to NDRange call
	printf("number of computes/sec:\t%.2f\n", (float) global *1000000 /((end - start)));

	
	printf("i,\tgid\tlid0\tlsize0\tgid0/lsz0,\tgsz0,\tn_gr0,\tlid5,\toffset\n");
	for(int i = 0; i < n; i++) {		
//		if(i %local == 0) {
			printf("%d \t", i);
			//printf("%u\t%u\t%u\t%u\t| %2u, %2u, %2u, %u\n", *p, *(p +1), *(p +2), *(p +3), *(p +4), *(p +5), *(p +6), *(p +7));
			
			/* silence this doubled debug info
			printf("%u\t%u\t%u\t%u\t| %2u, %2u, %2u, %u\n", 
				dbg[i].data[0], dbg[i].data[1], dbg[i].data[2], dbg[i].data[3],
				dbg[i].data[4], dbg[i].data[5], dbg[i].data[6], dbg[i].data[7]);
			*/	
			//printf("%d %d\n", P[i].dig, P[i].c);
			bn_print("", DP[i].k, 21, 1);
			bn_print("", DP[i].rx, 20, 0); bn_print(" ", DP[i].ry, 20, 1);
			
			printf("%u(/%u) = %u*%u(/%u) +%u, offset:%u, stride:%u\n", 
				DP[i].pad[0], DP[i].pad[1], DP[i].pad[2], DP[i].pad[3],
				DP[i].pad[4], DP[i].pad[5], DP[i].pad[6], DP[i].pad[7]);
//		}
	}
	
	/* Release OpenCL stuff, free the rest */
	clReleaseMemObject(cl_DP);
	clReleaseMemObject(cl_EC);
	clReleaseMemObject(cl_INV);
	clReleaseMemObject(DEBUG);
	clReleaseKernel(kernelobj);
	clReleaseProgram(pro);
	clReleaseCommandQueue(cmd_queue);
	clReleaseContext(context);
	
	free(kernel_source);
	
	puts("Done!");
	return 0;
}
void multiformat_scal_opencl_func(void *buffers[], void *args)
{
	(void) args;
	int id, devid;
        cl_int err;
	cl_kernel kernel;
	cl_command_queue queue;
	cl_event event;

	unsigned n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
	cl_mem val = (cl_mem)STARPU_MULTIFORMAT_GET_OPENCL_PTR(buffers[0]);

	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	err = starpu_opencl_load_kernel(&kernel,
					&queue,
					&opencl_program,
					"multiformat_opencl",
					devid);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err  = clSetKernelArg(kernel, 0, sizeof(val), &val);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 1, sizeof(n), &n);
	if (err)
		STARPU_OPENCL_REPORT_ERROR(err);

	{
		size_t global=n;
		size_t local;
                size_t s;
                cl_device_id device;

                starpu_opencl_get_device(devid, &device);

                err = clGetKernelWorkGroupInfo (kernel,
						device,
						CL_KERNEL_WORK_GROUP_SIZE,
						sizeof(local),
						&local,
						&s);
                if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);

                if (local > global)
			local = global;

		err = clEnqueueNDRangeKernel(queue,
					kernel,
					1,
					NULL,
					&global,
					&local,
					0,
					NULL,
					&event);

		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
	}

	clFinish(queue);
	starpu_opencl_collect_stats(event);
	clReleaseEvent(event);

	starpu_opencl_release_kernel(kernel);
}
Example #4
0
int main(int argc, char** argv)
{
    int err;                            // error code returned from api calls
      
    float data[DATA_SIZE];              // original data set given to device
    float results[DATA_SIZE];           // results returned from device
    unsigned int correct;               // number of correct results returned

    size_t global;                      // global domain size for our calculation
    size_t local;                       // local domain size for our calculation

    cl_device_id device_id;             // compute device id 
    cl_context context;                 // compute context
    cl_command_queue commands;          // compute command queue
    cl_program program;                 // compute program
    cl_kernel kernel;                   // compute kernel
    
    cl_mem input;                       // device memory used for the input array
    cl_mem output;                      // device memory used for the output array
    
    // Fill our data set with random float values
    //
    int i = 0;
    unsigned int count = DATA_SIZE;
    for(i = 0; i < count; i++)
        data[i] = rand() / (float)RAND_MAX;

    // Determine the platform ID: NULL platform IDs lead to 
    // "platform specific" behavior!
    cl_platform_id platforms[8];
    uint32_t num_platforms;
    err = clGetPlatformIDs(8, platforms, &num_platforms);
    if(err != CL_SUCCESS) {
      printf("Error: failed to get platform ids!\n");
      return EXIT_FAILURE;
    }
    printf("%u platform ids found\n", num_platforms);
    
    // Connect to a compute device
    //
    int gpu = 1;
    err = clGetDeviceIDs(platforms[0], gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to create a device group!\n");
        return EXIT_FAILURE;
    }
  
    // Create a compute context 
    //
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        return EXIT_FAILURE;
    }

    // Create a command commands
    //
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        return EXIT_FAILURE;
    }

    // Create the compute program from the source buffer
    //
    program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
    if (!program)
    {
        printf("Error: Failed to create compute program!\n");
        return EXIT_FAILURE;
    }

    // Build the program executable
    //
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        exit(1);
    }

    // Create the compute kernel in the program we wish to run
    //
    kernel = clCreateKernel(program, "square", &err);
    if (!kernel || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        exit(1);
    }

    // Create the input and output arrays in device memory for our calculation
    //
    input = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * count, NULL, NULL);
    output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
    if (!input || !output)
    {
        printf("Error: Failed to allocate device memory!\n");
        exit(1);
    }    
    
    // Write our data set into the input array in device memory 
    //
    err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write to source array!\n");
        exit(1);
    }

    // Set the arguments to our compute kernel
    //
    err = 0;
    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel arguments! %d\n", err);
        exit(1);
    }

    // Get the maximum work group size for executing the kernel on the device
    //
    err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve kernel work group info! %d\n", err);
        exit(1);
    }

    // Execute the kernel over the entire range of our 1d input data set
    // using the maximum number of work group items for this device
    //
    global = count;
    err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
    if (err)
    {
        printf("Error: Failed to execute kernel!\n");
        return EXIT_FAILURE;
    }

    // Wait for the command commands to get serviced before reading back results
    //
    clFinish(commands);

    // Read back the results from the device to verify the output
    //
    err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );  
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to read output array! %d\n", err);
        exit(1);
    }
    
    // Validate our results
    //
    correct = 0;
    for(i = 0; i < count; i++)
    {
        if(results[i] == data[i] * data[i])
            correct++;
    }
    
    // Print a brief summary detailing the results
    //
    printf("Computed '%d/%d' correct values!\n", correct, count);
    
    // Shutdown and cleanup
    //
    clReleaseMemObject(input);
    clReleaseMemObject(output);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    return 0;
}
size_t 	select_device(int jtrUniqDevNo, struct fmt_main *fmt) {
	cl_int 		err;
	const char  	*errMsg;
	size_t	 	memAllocSz;

	active_dev_ctr++;

	opencl_init("$JOHN/kernels/pbkdf2_kernel.cl", jtrUniqDevNo, NULL);

	globalObj[jtrUniqDevNo].krnl[0] = clCreateKernel(program[jtrUniqDevNo], "pbkdf2_preprocess_short", &err);
	if (err) {
		fprintf(stderr, "Create Kernel pbkdf2_preprocess_short FAILED\n");
		return 0;
	}
	globalObj[jtrUniqDevNo].krnl[1] = clCreateKernel(program[jtrUniqDevNo], "pbkdf2_preprocess_long", &err);
	if (err) {
		fprintf(stderr, "Create Kernel pbkdf2_preprocess_long FAILED\n");
		return 0;
	}
	globalObj[jtrUniqDevNo].krnl[2] = clCreateKernel(program[jtrUniqDevNo], "pbkdf2_iter", &err);
	if (err) {
		fprintf(stderr, "Create Kernel pbkdf2_iter FAILED\n");
		return 0;
	}
	globalObj[jtrUniqDevNo].krnl[3] = clCreateKernel(program[jtrUniqDevNo], "pbkdf2_postprocess", &err);
	if (err) {
		fprintf(stderr, "Create Kernel pbkdf2_postprocess FAILED\n");
		return 0;
	}

	errMsg = "Create Buffer FAILED";

	memAllocSz = 4 * MAX_KEYS_PER_CRYPT * sizeof(cl_uint);
	memAllocSz = memAllocSz < get_max_mem_alloc_size(jtrUniqDevNo) ? memAllocSz : get_max_mem_alloc_size(jtrUniqDevNo) / 4 * 4;
	globalObj[jtrUniqDevNo].gpu_buffer.pass_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_READ_ONLY, memAllocSz, NULL, &err);
	if (globalObj[jtrUniqDevNo].gpu_buffer.pass_gpu == (cl_mem)0)
		HANDLE_CLERROR(err,errMsg );
	globalObj[jtrUniqDevNo].gpu_buffer.salt_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_READ_ONLY, (MAX_SALT_LENGTH / 2 + 1) * sizeof(cl_uint), NULL, &err);
	if (globalObj[jtrUniqDevNo].gpu_buffer.salt_gpu == (cl_mem)0)
		HANDLE_CLERROR(err, errMsg);
	globalObj[jtrUniqDevNo].gpu_buffer.hash_out_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_WRITE_ONLY, memAllocSz, NULL, &err);
	if (globalObj[jtrUniqDevNo].gpu_buffer.hash_out_gpu == (cl_mem)0)
		HANDLE_CLERROR(err, errMsg);
	memAllocSz = MAX_KEYS_PER_CRYPT * sizeof(temp_buf);
	memAllocSz = memAllocSz < get_max_mem_alloc_size(jtrUniqDevNo) ? memAllocSz : get_max_mem_alloc_size(jtrUniqDevNo) / 4 * 4;
	globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_READ_WRITE, memAllocSz, NULL, &err);
	if (globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu == (cl_mem)0)
		HANDLE_CLERROR(err, errMsg);
	memAllocSz = 5 * MAX_KEYS_PER_CRYPT * sizeof(cl_uint);
	memAllocSz = memAllocSz < get_max_mem_alloc_size(jtrUniqDevNo) ? memAllocSz : get_max_mem_alloc_size(jtrUniqDevNo) / 4 * 4;
	globalObj[jtrUniqDevNo].gpu_buffer.hmac_sha1_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_READ_WRITE, memAllocSz, NULL, &err);
	if (globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu == (cl_mem)0)
		HANDLE_CLERROR(err, errMsg);


	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[0], 0, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.pass_gpu), "Set Kernel 0 Arg 0 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[0], 1, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.salt_gpu), "Set Kernel 0 Arg 1 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[0], 3, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu), "Set Kernel 0 Arg 3 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[1], 0, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.pass_gpu), "Set Kernel 1 Arg 0 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[1], 1, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu), "Set Kernel 1 Arg 1 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[1], 2, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.hmac_sha1_gpu), "Set Kernel 1 Arg 2 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[2], 0, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu), "Set Kernel 2 Arg 0 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[3], 0, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu), "Set Kernel 3 Arg 0 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[3], 1, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.hash_out_gpu), "Set Kernel 3 Arg 1 :FAILED");

	if (!local_work_size)
		find_best_workgroup(jtrUniqDevNo, quick_bechmark(jtrUniqDevNo));

	else {
		size_t 		maxsize, maxsize2;

		globalObj[jtrUniqDevNo].lws = local_work_size;

		// Obey limits
		HANDLE_CLERROR(clGetKernelWorkGroupInfo(globalObj[jtrUniqDevNo].krnl[0], devices[jtrUniqDevNo], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize), &maxsize, NULL), "Error querying max LWS");
		HANDLE_CLERROR(clGetKernelWorkGroupInfo(globalObj[jtrUniqDevNo].krnl[1], devices[jtrUniqDevNo], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize2), &maxsize2, NULL), "Error querying max LWS");
		if (maxsize2 > maxsize)
			maxsize = maxsize2;
		HANDLE_CLERROR(clGetKernelWorkGroupInfo(globalObj[jtrUniqDevNo].krnl[2], devices[jtrUniqDevNo], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize2), &maxsize2, NULL), "Error querying max LWS");
		if (maxsize2 > maxsize)
			maxsize = maxsize2;
		HANDLE_CLERROR(clGetKernelWorkGroupInfo(globalObj[jtrUniqDevNo].krnl[3], devices[jtrUniqDevNo], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize2), &maxsize2, NULL), "Error querying max LWS");
		if (maxsize2 > maxsize)
			maxsize = maxsize2;

		while (globalObj[jtrUniqDevNo].lws > maxsize)
			globalObj[jtrUniqDevNo].lws /= 2;

		if (options.verbosity > 3)
			fprintf(stderr, "Local worksize (LWS) forced to "Zu"\n", globalObj[jtrUniqDevNo].lws);

		globalObj[jtrUniqDevNo].exec_time_inv = 1;
	}

	if (!global_work_size)
		find_best_gws(jtrUniqDevNo, fmt);

	else {
		if (options.verbosity > 3)
			fprintf(stderr, "Global worksize (GWS) forced to "Zu"\n", global_work_size);

		fmt -> params.max_keys_per_crypt = global_work_size;
		fmt -> params.min_keys_per_crypt = max_lws();
	}

	return globalObj[jtrUniqDevNo].lws;
}
int
MemoryOptimizations::copy(cl_kernel& kernel, int vectorSize)
{
    cl_int status;
    cl_event events[2];

    /* Check group size against kernelWorkGroupSize */
    status = clGetKernelWorkGroupInfo(kernel,
                                      devices[deviceId],
                                      CL_KERNEL_WORK_GROUP_SIZE,
                                      sizeof(size_t),
                                      &kernelWorkGroupSize,
                                      0);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS, 
                               "clGetKernelWorkGroupInfo failed."))
    {
        return SDK_FAILURE;
    }

    if(localThreads[0] * localThreads[1] > kernelWorkGroupSize)
    {
        std::cout << "\nDevice doesn't support required work-group size!\n";
        return SDK_SUCCESS;
    }

    /*** Set appropriate arguments to the kernel ***/
    
    status = clSetKernelArg(kernel,
                            0,
                            sizeof(cl_mem), 
                            (void *)&inputBuffer);

    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clSetKernelArg failed.(inputBuffer)"))
        return SDK_FAILURE;

    status = clSetKernelArg(kernel,
                            1,
                            sizeof(cl_mem), 
                            (void *)&outputBuffer);

    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clSetKernelArg failed.(outputBuffer)"))
        return SDK_FAILURE;

    double nsec = 0;

    // Reduce the iterations if verification is enabled.
    if(verify)
        Iterations = 1;

    /* Run the kernel for a number of iterations */
    for(int i = 0; i < Iterations; i++)
    {

        /*Enqueue a kernel run call */
        status = clEnqueueNDRangeKernel(commandQueue,
                                        kernel,
                                        2,
                                        NULL,
                                        globalThreads,
                                        localThreads,
                                        0,
                                        NULL,
                                        &events[0]);
        
        if(!sampleCommon->checkVal(status,
                                   CL_SUCCESS,
                                   "clEnqueueNDRangeKernel failed."))
            return SDK_FAILURE;


        /* wait for the kernel call to finish execution */
        status = clWaitForEvents(1, &events[0]);
        if(!sampleCommon->checkVal(status,
                                   CL_SUCCESS,
                                   "clWaitForEvents failed."))
            return SDK_FAILURE;

        /* Calculate performance */
        cl_ulong startTime;
        cl_ulong endTime;
        
        /* Get kernel profiling info */
        status = clGetEventProfilingInfo(events[0],
                                         CL_PROFILING_COMMAND_START,
                                         sizeof(cl_ulong),
                                         &startTime,
                                         0);
        if(!sampleCommon->checkVal(status,
                                   CL_SUCCESS,
                                   "clGetEventProfilingInfo failed.(startTime)"))
            return SDK_FAILURE;


        status = clGetEventProfilingInfo(events[0],
                                         CL_PROFILING_COMMAND_END,
                                         sizeof(cl_ulong),
                                         &endTime,
                                         0);

        if(!sampleCommon->checkVal(status,
                                   CL_SUCCESS,
                                   "clGetEventProfilingInfo failed.(endTime)"))
            return SDK_FAILURE;

        /* Cumulate time for each iteration */
        nsec += endTime - startTime;
    }

    /* Copy bytes */
    int numThreads = (int)(globalThreads[0] * globalThreads[1]);
    double bytes = (double)(Iterations * 2 * vectorSize * sizeof(cl_float));
    double perf = (bytes / nsec) * numThreads;
    
    std::cout << ": " << perf << " GB/s" << std::endl;

    if(verify)
    {
        /* Enqueue readBuffer*/
        status = clEnqueueReadBuffer(commandQueue,
                                     outputBuffer,
                                     CL_TRUE,
                                     0,
                                     length * sizeof(cl_float4),
                                     output,
                                     0,
                                     NULL,
                                     0);

        if(!sampleCommon->checkVal(status,
                                   CL_SUCCESS,
                                   "clEnqueueReadBuffer failed."))
            return SDK_FAILURE;
        
        /* Verify data */
        if(!memcmp(input, output, vectorSize * sizeof(cl_float) * length))
        {
            std::cout << "Passed!\n";
            return SDK_SUCCESS;
        }
        else
        {
            std::cout << "Failed!\n";
            return SDK_FAILURE;
        }
    }

    return SDK_SUCCESS;
}
void test_variable_opencl_func(void *buffers[], void *args)
{
	STARPU_SKIP_IF_VALGRIND;

	int id, devid, ret;
	int factor = *(int *) args;

        cl_int             err;
	cl_kernel          kernel;
	cl_command_queue   queue;
	cl_event           event;

	ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION, &opencl_program, NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");

	cl_mem val = (cl_mem)STARPU_VARIABLE_GET_PTR(buffers[0]);

	cl_context context;
	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);
	starpu_opencl_get_context(devid, &context);

	cl_mem fail = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
		sizeof(int), &variable_config.copy_failed, &err);

	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);


	err = starpu_opencl_load_kernel(&kernel,
					&queue,
					&opencl_program,
					"variable_opencl",
					devid);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err  = clSetKernelArg(kernel, 0, sizeof(val), &val);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 1, sizeof(fail), &fail);
	if (err)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 2, sizeof(factor), &factor);
	if (err)
		STARPU_OPENCL_REPORT_ERROR(err);

	{
		size_t global = 1;
		size_t local;
                size_t s;
                cl_device_id device;

                starpu_opencl_get_device(devid, &device);

                err = clGetKernelWorkGroupInfo (kernel,
						device,
						CL_KERNEL_WORK_GROUP_SIZE,
						sizeof(local),
						&local,
						&s);
                if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);

                if (local > global)
			local = global;

		err = clEnqueueNDRangeKernel(queue,
					kernel,
					1,
					NULL,
					&global,
					&local,
					0,
					NULL,
					&event);

		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
	}

	err = clEnqueueReadBuffer(queue,
				  fail,
				  CL_TRUE,
				  0, 
				  sizeof(int),
				  &variable_config.copy_failed,
				  0,
				  NULL,
				  NULL);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	clFinish(queue);
	starpu_opencl_collect_stats(event);
	clReleaseEvent(event);

	starpu_opencl_release_kernel(kernel);
        ret = starpu_opencl_unload_opencl(&opencl_program);
        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
	return;
}
// Helper function to create and build program and kernel
// *********************************************************************
cl_kernel getReductionKernel(ReduceType datatype, int whichKernel, int blockSize, int isPowOf2)
{
    // compile cl program
    size_t program_length;
    char *source; 

    std::ostringstream preamble;   

    // create the program
    // with type specification depending on datatype argument
    switch (datatype)
    {
    default:
    case REDUCE_INT:
        preamble << "#define T int" << std::endl;
        break;
    case REDUCE_FLOAT:
        preamble << "#define T float" << std::endl;
        break;
    }
    
    // set blockSize at compile time
    preamble << "#define blockSize " << blockSize << std::endl;
    
    // set isPow2 at compile time
    preamble << "#define nIsPow2 " << isPowOf2 << std::endl;
    
    // Load the source code and prepend the preamble
    source = oclLoadProgSource(source_path, preamble.str().c_str(), &program_length);
    oclCheckError(source != NULL, shrTRUE);
    
    cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **) &source, 
                                                     &program_length, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    free(source);

    // build the program
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclReduction.ptx");
        oclCheckError(ciErrNum, CL_SUCCESS); 
    }
    
    // create Kernel    
    std::ostringstream kernelName;
    kernelName << "reduce" << whichKernel;    
    cl_kernel ckKernel = clCreateKernel(cpProgram, kernelName.str().c_str(), &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    size_t wgSize;
    ciErrNum = clGetKernelWorkGroupInfo(ckKernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
    if (wgSize == 64) 
      smallBlock = true;
    else smallBlock = false;

    // NOTE: the program will get deleted when the kernel is also released
    clReleaseProgram(cpProgram);
    
    return ckKernel;
}
Example #9
0
OPENCL_EXPERIMENTS_EXPORT
cl_int opencl_plugin_voxelize_meshes(opencl_plugin plugin,
                                     float inv_element_size,
                                     float corner_x,
                                     float corner_y,
                                     float corner_z,
                                     cl_int x_cell_length,
                                     cl_int y_cell_length,
                                     cl_int z_cell_length,
                                     cl_int mesh_data_count,
                                     mesh_data *mesh_data_list,
                                     cl_uchar *voxel_grid_out)
{
    cl_int err = CL_SUCCESS;
    cl_int i;
    cl_int next_row_offset, next_slice_offset;
    size_t local_work_size;
    cl_int num_voxels;

    clock_t t1;
    clock_t t2;
    clock_t t3;

    assert(plugin != NULL);
    assert(inv_element_size >= 0);
    assert(x_cell_length >= 0);
    assert(y_cell_length >= 0);
    assert(z_cell_length >= 0);
    assert(mesh_data_count >= 0);
    assert(mesh_data_list != NULL);

    t1 = clock();

    /* (Re-)allocate buffer for voxel grid */
    num_voxels = x_cell_length * y_cell_length * z_cell_length;
    if (opencl_plugin_init_voxel_buffer(plugin, num_voxels))
        goto error;

    /* (Re-)allocate buffers for mesh data */
    if (opencl_plugin_init_mesh_buffers(plugin, mesh_data_count, mesh_data_list))
        goto error;

    err = clGetKernelWorkGroupInfo(
        plugin->voxelize_kernel, plugin->selected_device,
        CL_KERNEL_WORK_GROUP_SIZE, sizeof(local_work_size), &local_work_size,
        NULL);
    CHECK_CL_ERROR(err);

    if (enqueue_zero_buffer(plugin->queue, plugin->voxel_grid_buffer,
                            plugin->voxel_grid_buffer_capacity, 0, NULL, NULL,
                            &err))
        goto error;

    err = clFinish(plugin->queue);
    CHECK_CL_ERROR(err);

    t1 = clock() - t1;
    t2 = clock();

    next_row_offset = x_cell_length;
    next_slice_offset = x_cell_length * y_cell_length;

    err |= clSetKernelArg(plugin->voxelize_kernel, 0, sizeof(cl_mem), &plugin->voxel_grid_buffer);
    err |= clSetKernelArg(plugin->voxelize_kernel, 1, sizeof(float),  &inv_element_size);
    err |= clSetKernelArg(plugin->voxelize_kernel, 2, sizeof(float),  &corner_x);
    err |= clSetKernelArg(plugin->voxelize_kernel, 3, sizeof(float),  &corner_y);
    err |= clSetKernelArg(plugin->voxelize_kernel, 4, sizeof(float),  &corner_z);
    err |= clSetKernelArg(plugin->voxelize_kernel, 5, sizeof(cl_int), &next_row_offset);
    err |= clSetKernelArg(plugin->voxelize_kernel, 6, sizeof(cl_int), &next_slice_offset);
    err |= clSetKernelArg(plugin->voxelize_kernel, 7, sizeof(cl_int), &x_cell_length);
    err |= clSetKernelArg(plugin->voxelize_kernel, 8, sizeof(cl_int), &y_cell_length);
    err |= clSetKernelArg(plugin->voxelize_kernel, 9, sizeof(cl_int), &z_cell_length);
    CHECK_CL_ERROR(err);

    for (i = 0; i < mesh_data_count; i++) {
        size_t global_work_size;
        cl_uint vertex_buffer_base_idx = mesh_data_list[i].vertex_buffer_base_idx;
        cl_uint triangle_buffer_base_idx = mesh_data_list[i].triangle_buffer_base_idx;
        err |= clSetKernelArg(plugin->voxelize_kernel, 10, sizeof(cl_mem), &plugin->vertex_buffer);
        err |= clSetKernelArg(plugin->voxelize_kernel, 11, sizeof(cl_mem), &plugin->triangle_buffer);
        err |= clSetKernelArg(plugin->voxelize_kernel, 12, sizeof(cl_int), &mesh_data_list[i].num_triangles);
        err |= clSetKernelArg(plugin->voxelize_kernel, 13, sizeof(cl_uint), &vertex_buffer_base_idx);
        err |= clSetKernelArg(plugin->voxelize_kernel, 14, sizeof(cl_uint), &triangle_buffer_base_idx);
        CHECK_CL_ERROR(err);

        /* As per the OpenCL spec, global_work_size must divide evenly by
         * local_work_size */
        global_work_size = mesh_data_list[i].num_triangles / local_work_size;
        global_work_size *= local_work_size;
        if (global_work_size < (size_t)mesh_data_list[i].num_triangles)
            global_work_size += local_work_size;

        err = clEnqueueNDRangeKernel(
            plugin->queues[i % plugin->num_queues], plugin->voxelize_kernel, 1, NULL, &global_work_size,
            &local_work_size, 0, NULL, NULL);
        CHECK_CL_ERROR_MSG(err, "clEnqueueNDRangeKernel failed on mesh %d/%d",
                           i + 1, mesh_data_count);

        err = clFinish(plugin->queue);
        CHECK_CL_ERROR_MSG(err, "clFinish failed on mesh %d/%d",
                           i + 1, mesh_data_count);
    }

    err = clFinish(plugin->queue);
    CHECK_CL_ERROR(err);

    for (i = 0; i < plugin->num_queues; i++) {
        err = clFinish(plugin->queues[i]);
        CHECK_CL_ERROR(err);
    }

    t2 = clock() - t2;
    t3 = clock();

    err = clEnqueueReadBuffer(
        plugin->queue, plugin->voxel_grid_buffer, CL_TRUE, 0,
        num_voxels, voxel_grid_out, 0, NULL, NULL);
    CHECK_CL_ERROR(err);

    t3 = clock() - t3;

    TRACE("Clock T1: %f", ((float)t1 * 1000.0f) / CLOCKS_PER_SEC);
    TRACE("Clock T2: %f", ((float)t2 * 1000.0f) / CLOCKS_PER_SEC);
    TRACE("Clock T3: %f", ((float)t3 * 1000.0f) / CLOCKS_PER_SEC);
    return 0;
error:
    return -1;
}