Ejemplo n.º 1
0
int main() {

   /* Host/device data structures */
   cl_platform_id platform;
   cl_device_id device;
   cl_context context;
   cl_command_queue queue;
   cl_int err;

   /* Program/kernel data structures */
   cl_program program;
   FILE *program_handle;
   char *program_buffer, *program_log;
   size_t program_size, log_size;
   cl_kernel kernel;
   size_t offset = 0;
   size_t global_size, local_size;

   /* Data and buffers */
   char pattern[16] = "thatwithhavefrom";
   FILE *text_handle;
   char *text;
   size_t text_size;
   int chars_per_item;
   int result[4] = {0, 0, 0, 0};
   cl_mem text_buffer, result_buffer;
   
   /* Identify a platform */
   err = clGetPlatformIDs(1, &platform, NULL);
   if(err < 0) {
      perror("Couldn't identify a platform");
      exit(1);
   } 

   /* Access a device */
   err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
   if(err < 0) {
      perror("Couldn't access any devices");
      exit(1);   
   }

   /* Determine global size and local size */
   clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, 		
      sizeof(global_size), &global_size, NULL);	
   clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, 		
      sizeof(local_size), &local_size, NULL);
   global_size *= local_size;

   /* Create a context */
   context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
   if(err < 0) {
      perror("Couldn't create a context");
      exit(1);   
   }

   /* Read program file and place content into buffer */
   program_handle = fopen(PROGRAM_FILE, "r");
   if(program_handle == NULL) {
      perror("Couldn't find the program file");
      exit(1);
   }
   fseek(program_handle, 0, SEEK_END);
   program_size = ftell(program_handle);
   rewind(program_handle);
   program_buffer = (char*)calloc(program_size+1, sizeof(char));
   fread(program_buffer, sizeof(char), program_size, program_handle);
   fclose(program_handle);

   /* Read text file and place content into buffer */
   text_handle = fopen(TEXT_FILE, "r");
   if(text_handle == NULL) {
      perror("Couldn't find the text file");
      exit(1);
   }
   fseek(text_handle, 0, SEEK_END);
   text_size = ftell(text_handle)-1;
   rewind(text_handle);
   text = (char*)calloc(text_size, sizeof(char));
   fread(text, sizeof(char), text_size, text_handle);
   fclose(text_handle);
   chars_per_item = text_size / global_size + 1;

   /* Create program from file */
   program = clCreateProgramWithSource(context, 1, 
      (const char**)&program_buffer, &program_size, &err);
   if(err < 0) {
      perror("Couldn't create the program");
      exit(1);
   }
   free(program_buffer);

   /* Build program */
   err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
   if(err < 0) {
            
      /* Find size of log and print to std output */
      clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 
            0, NULL, &log_size);
      program_log = (char*) calloc(log_size+1, sizeof(char));
      clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 
            log_size+1, program_log, NULL);
      printf("%s\n", program_log);
      free(program_log);
      exit(1);
   }

   /* Create a kernel */
   kernel = clCreateKernel(program, KERNEL_FUNC, &err);
   if(err < 0) {
      perror("Couldn't create a kernel");
      exit(1);
   };

   /* Create buffers to hold the text characters and count */
   text_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
         CL_MEM_COPY_HOST_PTR, text_size, text, &err);
   if(err < 0) {
      perror("Couldn't create a buffer");
      exit(1);   
   };
   result_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
         CL_MEM_COPY_HOST_PTR, sizeof(result), result, NULL);

   /* Create kernel argument */
   err = clSetKernelArg(kernel, 0, sizeof(pattern), pattern);
   err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &text_buffer);
   err |= clSetKernelArg(kernel, 2, sizeof(chars_per_item), &chars_per_item);
   err |= clSetKernelArg(kernel, 3, 4 * sizeof(int), NULL);
   err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &result_buffer);
   if(err < 0) {
      printf("Couldn't set a kernel argument");
      exit(1);   
   };

   /* Create a command queue */
   queue = clCreateCommandQueue(context, device, 0, &err);
   if(err < 0) {
      perror("Couldn't create a command queue");
      exit(1);   
   };

   /* Enqueue kernel */
   err = clEnqueueNDRangeKernel(queue, kernel, 1, &offset, &global_size, 
         &local_size, 0, NULL, NULL); 
   if(err < 0) {
      perror("Couldn't enqueue the kernel");
      printf("Error code: %d\n", err);
      exit(1);   
   }

   /* Read and print the result */
   err = clEnqueueReadBuffer(queue, result_buffer, CL_TRUE, 0, 
      sizeof(result), &result, 0, NULL, NULL);
   if(err < 0) {
      perror("Couldn't read the buffer");
      exit(1);   
   }

   printf("\nResults: \n");
   printf("Number of occurrences of 'that': %d\n", result[0]);
   printf("Number of occurrences of 'with': %d\n", result[1]);
   printf("Number of occurrences of 'have': %d\n", result[2]);
   printf("Number of occurrences of 'from': %d\n", result[3]);

   /* Deallocate resources */
   clReleaseMemObject(result_buffer);
   clReleaseMemObject(text_buffer);
   clReleaseKernel(kernel);
   clReleaseCommandQueue(queue);
   clReleaseProgram(program);
   clReleaseContext(context);
   return 0;
}
Ejemplo n.º 2
0
int main() {

   /* OpenCL data structures */
   cl_device_id device;
   cl_context context;
   cl_command_queue queue;
   cl_program program;
   cl_kernel kernel;
   cl_int err;

   /* Data and events */
   char *kernel_msg;
   float data[4096];
   cl_mem data_buffer;
   cl_event kernel_event, read_event;   
   
   /* Create a device and context */
   device = create_device();
   context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
   if(err < 0) {
      perror("Couldn't create a context");
      exit(1);   
   }     

   /* Build the program and create a kernel */
   program = build_program(context, device, PROGRAM_FILE);
   kernel = clCreateKernel(program, KERNEL_FUNC, &err);
   if(err < 0) {
      perror("Couldn't create a kernel");
      exit(1);   
   };

   /* Create a write-only buffer to hold the output data */
   data_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
         sizeof(data), NULL, &err);
   if(err < 0) {
      perror("Couldn't create a buffer");
      exit(1);   
   };         

   /* Create kernel argument */
   err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_buffer);
   if(err < 0) {
      perror("Couldn't set a kernel argument");
      exit(1);   
   };

   /* Create a command queue */
   queue = clCreateCommandQueue(context, device, 0, &err);
   if(err < 0) {
      perror("Couldn't create a command queue");
      exit(1);   
   };

   /* Enqueue kernel */
   err = clEnqueueTask(queue, kernel, 0, NULL, &kernel_event);
   if(err < 0) {
      perror("Couldn't enqueue the kernel");
      exit(1);   
   }

   /* Read the buffer */
   err = clEnqueueReadBuffer(queue, data_buffer, CL_FALSE, 0, 
      sizeof(data), &data, 0, NULL, &read_event);
   if(err < 0) {
      perror("Couldn't read the buffer");
      exit(1);   
   }
 
   /* Set event handling routines */
   kernel_msg = "The kernel finished successfully.\n\0";
   err = clSetEventCallback(kernel_event, CL_COMPLETE, 
         &kernel_complete, kernel_msg);
   if(err < 0) {
      perror("Couldn't set callback for event");
      exit(1);   
   }
   clSetEventCallback(read_event, CL_COMPLETE, &read_complete, data);

   /* Deallocate resources */
   clReleaseMemObject(data_buffer);
   clReleaseKernel(kernel);
   clReleaseCommandQueue(queue);
   clReleaseProgram(program);
   clReleaseContext(context);
   return 0;
}
Ejemplo n.º 3
0
int main(int argc, char **argv) {




	if (find_option(argc, argv, "-h") >= 0)
	{
		printf("Options:\n");
		printf("-h to see this help\n");
		printf("-n <int> to set the number of particles\n");
		printf("-o <filename> to specify the output file name\n");
		printf("-s <filename> to specify the summary output file name\n");
		return 0;
	}


	int n = read_int(argc, argv, "-n", 1000);

	char *savename = read_string(argc, argv, "-o", NULL);
	char *sumname = read_string(argc, argv, "-s", NULL);

	// For return values.
	cl_int ret;

	// OpenCL stuff.
	// Loading kernel files.
	FILE *kernelFile;
	char *kernelSource;
	size_t kernelSize;

	kernelFile = fopen("simulationKernel.cl", "r");

	if (!kernelFile) {
		fprintf(stderr, "No file named simulationKernel.cl was found\n");
		exit(-1);
	}
	kernelSource = (char*)malloc(MAX_SOURCE_SIZE);
	kernelSize = fread(kernelSource, 1, MAX_SOURCE_SIZE, kernelFile);
	fclose(kernelFile);

	// Getting platform and device information
	cl_platform_id platformId = NULL;
	cl_device_id deviceID = NULL;
	cl_uint retNumDevices;
	cl_uint retNumPlatforms;
	ret = clGetPlatformIDs(1, &platformId, &retNumPlatforms);
	// Different types of devices to pick from. At the moment picks the default opencl device.
	//CL_DEVICE_TYPE_GPU
	//CL_DEVICE_TYPE_ACCELERATOR
	//CL_DEVICE_TYPE_DEFAULT
	//CL_DEVICE_TYPE_CPU
	ret = clGetDeviceIDs(platformId, CL_DEVICE_TYPE_ACCELERATOR, 1, &deviceID, &retNumDevices);

	// Max workgroup size
	size_t max_available_local_wg_size;
	ret = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_available_local_wg_size, NULL);
	// Creating context.
	cl_context context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &ret);


	// Creating command queue
        cl_command_queue commandQueue = clCreateCommandQueueWithProperties (context, deviceID, 0, &ret);
	
	// Build program
	cl_program program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, (const size_t *)&kernelSize, &ret);
//	printf("program = ret %i \n", ret);
	ret = clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL);
//	printf("clBuildProgram: ret %i \n", ret);
	
	// Create kernels
	cl_kernel forceKernel = clCreateKernel(program, "compute_forces_gpu", &ret);

	cl_kernel moveKernel = clCreateKernel(program, "move_gpu", &ret);

	cl_kernel binInitKernel = clCreateKernel(program, "bin_init_gpu", &ret);
	cl_kernel binKernel = clCreateKernel(program, "bin_gpu", &ret);

	FILE *fsave = savename ? fopen(savename, "w") : NULL;
	FILE *fsum = sumname ? fopen(sumname, "a") : NULL;
	particle_t *particles = (particle_t*)malloc(n * sizeof(particle_t));

	// GPU particle data structure
	cl_mem d_particles = clCreateBuffer(context, CL_MEM_READ_WRITE, n * sizeof(particle_t), NULL, &ret);

	// Set size
	set_size(n);

	init_particles(n, particles);

	double copy_time = read_timer();

	// Copy particles to device.
	ret = clEnqueueWriteBuffer(commandQueue, d_particles, CL_TRUE, 0, n * sizeof(particle_t), particles, 0, NULL, NULL);
	copy_time = read_timer() - copy_time;
	

	// Calculating thread and thread block counts.
	// sizes
	size_t globalItemSize;
	size_t localItemSize;
	// Global item size
	if (n <= NUM_THREADS) {
		globalItemSize = NUM_THREADS;
		localItemSize = 16;
	}
	else if (n % NUM_THREADS != 0) {
		globalItemSize = (n / NUM_THREADS + 1) * NUM_THREADS;
	}
	else {
		globalItemSize = n;
	}

	// Local item size
	localItemSize = globalItemSize / NUM_THREADS;	

	// Bins and bin sizes.
	// Because of uniform distribution we will know that bins size is amortized. Therefore I picked the value of 10.
	// There will never be 10 particles in one bin.
	int maxParticles = 10;
	
	// Calculating the number of bins.
	int numberOfBins = (int)ceil(size/(2*cutoff)) + 2;
	
	// Bins will only exist on the device.
	particle_t* bins;
	
	// How many particles are there in each bin - also only exists on the device.
	volatile int* binSizes;
	
	// Number of bins to be initialized.
	size_t clearAmt = numberOfBins*numberOfBins;
	
	// Allocate memory for bins on the device.
	cl_mem d_binSizes = clCreateBuffer(context, CL_MEM_READ_WRITE, numberOfBins * numberOfBins * sizeof(volatile int), NULL, &ret);
	cl_mem d_bins = clCreateBuffer(context, CL_MEM_READ_WRITE, numberOfBins * numberOfBins * maxParticles * sizeof(particle_t), NULL, &ret);
	
	// SETTING ARGUMENTS FOR THE KERNELS
	
	// Set arguments for the init / clear kernel
	ret = clSetKernelArg(binInitKernel, 0, sizeof(cl_mem), (void *)&d_binSizes);
	ret = clSetKernelArg(binInitKernel, 1, sizeof(int), &numberOfBins);

	// Set arguments for the binning kernel
	ret = clSetKernelArg(binKernel, 0, sizeof(cl_mem), (void *)&d_particles);
	ret = clSetKernelArg(binKernel, 1, sizeof(int), &n);
	ret = clSetKernelArg(binKernel, 2, sizeof(cl_mem), (void *)&d_bins);
	ret = clSetKernelArg(binKernel, 3, sizeof(cl_mem), (void *)&d_binSizes);
	ret = clSetKernelArg(binKernel, 4, sizeof(int), &numberOfBins);

	// Set arguments for force kernel.
	ret = clSetKernelArg(forceKernel, 0, sizeof(cl_mem), (void *)&d_particles);
	ret = clSetKernelArg(forceKernel, 1, sizeof(int), &n);
	ret = clSetKernelArg(forceKernel, 2, sizeof(cl_mem), (void *)&d_bins);
	ret = clSetKernelArg(forceKernel, 3, sizeof(cl_mem), (void *)&d_binSizes);
	ret = clSetKernelArg(forceKernel, 4, sizeof(int), &numberOfBins);


	// Set arguments for move kernel
	ret = clSetKernelArg(moveKernel, 0, sizeof(cl_mem), (void *)&d_particles);
	ret = clSetKernelArg(moveKernel, 1, sizeof(int), &n);
	ret = clSetKernelArg(moveKernel, 2, sizeof(double), &size);
	
	
	// Variable to check if kernel execution is done.
	cl_event kernelDone;
	
	
	double simulation_time = read_timer();
	int step = 0;
	for (step = 0; step < NSTEPS; step++) {


		// Execute bin initialization (clearing after first iteration)
		ret = clEnqueueNDRangeKernel(commandQueue, binInitKernel, 1, NULL, &clearAmt, NULL, 0, NULL, &kernelDone);
		ret = clWaitForEvents(1, &kernelDone);
		// Execute binning kernel
		ret = clEnqueueNDRangeKernel(commandQueue, binKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone);
//		ret = clEnqueueNDRangeKernel(commandQueue, binKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone);
		ret = clWaitForEvents(1, &kernelDone);	
		// Execute force kernel
		ret = clEnqueueNDRangeKernel(commandQueue, forceKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone);
		ret = clWaitForEvents(1, &kernelDone);
		// Execute move kernel
		ret = clEnqueueNDRangeKernel(commandQueue, moveKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone);
		ret = clWaitForEvents(1, &kernelDone);

		if (fsave && (step%SAVEFREQ) == 0) {
			// Copy the particles back to the CPU
			ret = clEnqueueReadBuffer(commandQueue, d_particles, CL_TRUE, 0, n * sizeof(particle_t), particles, 0, NULL, &kernelDone);
			ret = clWaitForEvents(1, &kernelDone);

			save(fsave, n, particles);
		}

	}
	simulation_time = read_timer() - simulation_time;
	printf("CPU-GPU copy time = %g seconds\n", copy_time);
	printf("n = %d, simulation time = %g seconds\n", n, simulation_time);

	if (fsum)
		fprintf(fsum, "%d %lf \n", n, simulation_time);

	if (fsum)
		fclose(fsum);
	free(particles);
	if (fsave)
		fclose(fsave);


	ret = clFlush(commandQueue);
	ret = clFinish(commandQueue);
	ret = clReleaseCommandQueue(commandQueue);
	ret = clReleaseKernel(forceKernel);
	ret = clReleaseKernel(moveKernel);
	ret = clReleaseProgram(program);
	ret = clReleaseMemObject(d_particles);
	ret = clReleaseContext(context);


	return 0;
}
Ejemplo n.º 4
0
void clean_all(void) {

		printf("Cleaning Variables ... \n\n");
		
		// Opencl environment variables
		clReleaseCommandQueue(command_queue);
		clReleaseContext(context);
		
		
		// Release all memory allocated

		
		if (Data_MeshType == UNSTRUCTURED) {
	
			// Mesh Variables
			free(MeshElementArray.Node1);
			free(MeshElementArray.Node2);
			free(MeshElementArray.Node3);
			free(MeshElementArray.Node4);
			
			free(MeshNodeArray_double.x);
			free(MeshNodeArray_double.y);
			free(MeshNodeArray_double.z);


			free(MeshElementArray.Neighborindex1);
			free(MeshElementArray.Neighborindex2);
			free(MeshElementArray.Neighborindex3);
			free(MeshElementArray.Neighborindex4);
		
			clReleaseMemObject(Mesh_Node_x);
			clReleaseMemObject(Mesh_Node_y);
			clReleaseMemObject(Mesh_Node_z);
			
			clReleaseMemObject(Mesh_Element_Node1);
			clReleaseMemObject(Mesh_Element_Node2);
			clReleaseMemObject(Mesh_Element_Node3);
			clReleaseMemObject(Mesh_Element_Node4);
			
			clReleaseMemObject(Mesh_Element_Neighborindex1);
			clReleaseMemObject(Mesh_Element_Neighborindex2);
			clReleaseMemObject(Mesh_Element_Neighborindex3);
			clReleaseMemObject(Mesh_Element_Neighborindex4);
			
			clReleaseMemObject(r);
			clReleaseMemObject(s);
			clReleaseMemObject(t);
			clReleaseMemObject(eid);
			
		}

		// Cleaning Velocity variables
		
			free(velocity.u0);
			free(velocity.v0);
			free(velocity.w0);
			free(velocity.u1);
			free(velocity.v1);
			free(velocity.w1);
			free(velocity.time0);
			free(velocity.time1);
			
			free(Tracer.x);
			Tracer.x = NULL;
			free(Tracer.y);
			Tracer.y = NULL;
			
		
			free(Tracer.z);
			Tracer.z = NULL;
		
			free(Tracer.ElementIndex);
			Tracer.ElementIndex = NULL;
			free(Tracer.Start_time);
			Tracer.Start_time = NULL;
			free(Tracer.Stop_time);
			Tracer.Stop_time = NULL;
			free(Tracer.LeftDomain);
			Tracer.LeftDomain = NULL;
			
			if (Trace_ReleaseStrategy == 1) {
				free(Tracer1.x);
				Tracer1.x = NULL;
				free(Tracer1.y);
				Tracer1.y = NULL;
			
		
				free(Tracer1.z);
				Tracer1.z = NULL;
		
				free(Tracer1.ElementIndex);
				Tracer1.ElementIndex = NULL;
				free(Tracer1.Start_time);
				Tracer1.Start_time = NULL;
				free(Tracer1.Stop_time);
				Tracer1.Stop_time = NULL;
				free(Tracer1.LeftDomain);
				Tracer1.LeftDomain = NULL;
			
				free(index1);
				index1 = NULL;
				
				free(Tracer.Status);
				Tracer.Status = NULL;
			}
			free(DataTime1);
			free(Output_time);
			free(Launch_time);
			
			
		clReleaseMemObject(Vel_U0);
		clReleaseMemObject(Vel_U1);
		clReleaseMemObject(Vel_V0);
		clReleaseMemObject(Vel_V1);
		clReleaseMemObject(Vel_W0);
		clReleaseMemObject(Vel_W1);
		
		clReleaseMemObject(x_dev);
		clReleaseMemObject(y_dev);
		
		clReleaseMemObject(posx);
		clReleaseMemObject(posy);
		clReleaseMemObject(xn0);
		clReleaseMemObject(xn1);
		clReleaseMemObject(integrate);
		
		if (Dimensions == 3) {
	
			clReleaseMemObject(z_dev);
			clReleaseMemObject(posz);
			clReleaseMemObject(xn2);
		}
	
		clReleaseMemObject(Start_time_dev);
		clReleaseMemObject(Stop_time_dev);
		
		clReleaseMemObject(ElementIndex_dev);
		clReleaseMemObject(LeftDomain_dev);
		
		// Remove Temp file containing tracer release information
		if (!Keep_Tempfile) {
			char BinFile[LONGSTRING];
			sprintf(BinFile, "%s%s.bin", Path_Output, Temp_OutFilePrefix);
			if(remove(BinFile))
					fprintf(stderr, "Warning: Could not delete file %s\n", BinFile);
		}
		
		CL_CHECK(clReleaseKernel(kernel1));
		CL_CHECK(clReleaseKernel(kernel2));
		CL_CHECK(clReleaseKernel(kernel3));
		CL_CHECK(clReleaseKernel(kernel4));
		CL_CHECK(clReleaseKernel(kernel5));
	
   		CL_CHECK(clReleaseProgram(program));
		printf("Cleaning Successfull \n\n");

}
Ejemplo n.º 5
0
void OpenCLDevice::deinitialize()
{
	if (this->m_queue) {
		clReleaseCommandQueue(this->m_queue);
	}
}
Ejemplo n.º 6
0
int main(void) {
//time meassuring
  	struct timeval tvs;

//variables
	int 	Nx=1024;
	int		Ny=1024;
	int 	plotnum=0;
	int	  	Tmax=2;
	int 	plottime=0;
	int	  	plotgap=1;
	double	Lx=1.0;
	double 	Ly=1.0;
	double	dt=0.0;	
	double	A=0.0;
	double	B=0.0;
	double	Du=0.0;
	double	Dv=0.0;
//splitting coefficients
	double	a=0.5;	
	double 	b=0.5;
	double 	c=1.0;
//loop counters	
	int i=0;
	int j=0;
	int n=0;

	double*umax=NULL;
	double*vmax=NULL;
	parainit(&Nx,&Ny,&Tmax,&plotgap,&Lx,&Ly,&dt,&Du,&Dv,&A,&B);
	plottime=plotgap;
	vmax=(double*)malloc((Tmax/plotgap+1)*sizeof(double));
	umax=(double*)malloc((Tmax/plotgap+1)*sizeof(double));
//openCL variables
    cl_platform_id *platform_id = NULL;
    cl_kernel frequencies = NULL, initialdata = NULL, linearpart=NULL;
	cl_kernel nonlinearpart_a=NULL, nonlinearpart_b=NULL;
    cl_int ret;
    cl_uint num_platforms;
// Detect how many platforms there are.
	ret = clGetPlatformIDs(0, NULL, &num_platforms);
// Allocate enough space for the number of platforms.
	platform_id = (cl_platform_id*) malloc(num_platforms*sizeof(cl_platform_id));
// Store the platforms
	ret = clGetPlatformIDs(num_platforms, platform_id, NULL);
	printf("Found %d platform(s)!\n",num_platforms);
    cl_uint *num_devices;
	num_devices=(cl_uint*) malloc(num_platforms*sizeof(cl_uint));
    cl_device_id **device_id = NULL;
	device_id =(cl_device_id**) malloc(num_platforms*sizeof(cl_device_id*));
// Detect number of devices in the platforms
	for(i=0;i<num_platforms;i++){
		char buf[65536];
		size_t size;
		ret = clGetPlatformInfo(platform_id[i],CL_PLATFORM_VERSION,sizeof(buf),buf,&size);
		printf("%s\n",buf);
		ret = clGetDeviceIDs(platform_id[i],CL_DEVICE_TYPE_ALL,0,NULL,num_devices);
		printf("Found %d device(s) on platform %d!\n", num_devices[i],i);
		ret = clGetPlatformInfo(platform_id[i],CL_PLATFORM_NAME,sizeof(buf),buf,&size);
		printf("%s ",buf);
// Store numDevices from platform
		device_id[i]=(cl_device_id*) malloc(num_devices[i]*sizeof(device_id));
		ret = clGetDeviceIDs(platform_id[i],CL_DEVICE_TYPE_ALL,num_devices[i],device_id[i],NULL);
		for(j=0;j<num_devices[i];j++){
			ret = clGetDeviceInfo(device_id[i][j],CL_DEVICE_NAME,sizeof(buf),buf,&size);
			printf("%s (%d,%d)\n",buf,i,j);
		}
	}
//create context and command_queue
    cl_context context = NULL;
   	cl_command_queue command_queue = NULL;
//Which platform and device do i choose?
	int	chooseplatform=0;
	int	choosedevice=0;	  
	printf("Choose platform %d and device %d!\n",chooseplatform,choosedevice);
	context = clCreateContext( NULL, num_devices[chooseplatform], device_id[chooseplatform], NULL, NULL, &ret);
	if(ret!=CL_SUCCESS){printf("createContext ret:%d\n",ret); exit(1); }
	command_queue = clCreateCommandQueue(context, device_id[chooseplatform][choosedevice], 0, &ret);
	if(ret!=CL_SUCCESS){printf("createCommandQueue ret:%d\n",ret); exit(1); }

//OpenCL arrays
    cl_mem cl_u = NULL,cl_v = NULL;
   	cl_mem cl_uhat = NULL, cl_vhat = NULL;
    cl_mem cl_kx = NULL, cl_ky = NULL;

//FFT
	clfftPlanHandle planHandle;
    cl_mem tmpBuffer = NULL;
	fftinit(&planHandle,&context, &command_queue, &tmpBuffer, Nx, Ny);

//allocate gpu memory/
	cl_u=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx* Ny* sizeof(double), NULL, &ret);
	cl_v=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx* Ny* sizeof(double), NULL, &ret);
	cl_uhat=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx * Ny* sizeof(double), NULL, &ret);
	cl_vhat=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx * Ny* sizeof(double), NULL, &ret);
	cl_kx = clCreateBuffer(context, CL_MEM_READ_WRITE, Nx * sizeof(double), NULL, &ret);
	cl_ky = clCreateBuffer(context, CL_MEM_READ_WRITE, Ny * sizeof(double), NULL, &ret);

	printf("allocated space\n");
//load the kernels
	loadKernel(&frequencies,&context,&device_id[chooseplatform][choosedevice],"frequencies");
	loadKernel(&initialdata,&context,&device_id[chooseplatform][choosedevice],"initialdata"); 
	loadKernel(&linearpart,&context,&device_id[chooseplatform][choosedevice],"linearpart"); 
	loadKernel(&nonlinearpart_a,&context,&device_id[chooseplatform][choosedevice],"nonlinearpart_a"); 
	loadKernel(&nonlinearpart_b,&context,&device_id[chooseplatform][choosedevice],"nonlinearpart_b"); 

	size_t global_work_size[1] = {Nx*Ny};
	size_t global_work_size_X[1] = {Nx};
	size_t global_work_size_Y[1] = {Ny};
//frequencies
    ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem),(void *)&cl_kx);
	ret = clSetKernelArg(frequencies, 1, sizeof(double),(void* )&Lx);
	ret = clSetKernelArg(frequencies, 2, sizeof(int),(void* )&Nx);
    ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_X, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
    ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem),(void *)&cl_ky);
	ret = clSetKernelArg(frequencies, 1, sizeof(double),(void* )&Ly);
	ret = clSetKernelArg(frequencies, 2, sizeof(int),(void* )&Ny);
    ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_Y, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
//printCL(&cl_kx,&command_queue,Nx,1);
//printCL(&cl_ky,&command_queue,1,Ny);
//inintial data
    ret = clSetKernelArg(initialdata, 0, sizeof(cl_mem),(void *)&cl_u);
	ret = clSetKernelArg(initialdata, 1, sizeof(cl_mem),(void* )&cl_v);
	ret = clSetKernelArg(initialdata, 2, sizeof(int),(void* )&Nx);
	ret = clSetKernelArg(initialdata, 3, sizeof(int),(void* )&Ny);
	ret = clSetKernelArg(initialdata, 4, sizeof(double),(void* )&Lx);
	ret = clSetKernelArg(initialdata, 5, sizeof(double),(void* )&Ly);
    ret = clEnqueueNDRangeKernel(command_queue, initialdata, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
//make output
    writedata_C(&cl_u, &command_queue,Nx,Ny,plotnum,"u");
    writedata_C(&cl_v, &command_queue,Nx,Ny,plotnum,"v");
    umax[plotnum]=writeimage(&cl_u, &command_queue,Nx,Ny,plotnum,"u");
    vmax[plotnum]=writeimage(&cl_v, &command_queue,Nx,Ny,plotnum,"v");
	printf("Got initial data, starting timestepping\n");
	mtime_s(&tvs);

	for(n=0;n<=Tmax;n++){
//nonlinearpart_a
    ret = clSetKernelArg(nonlinearpart_a, 0, sizeof(cl_mem),(void *)&cl_u);
	ret = clSetKernelArg(nonlinearpart_a, 1, sizeof(cl_mem),(void* )&cl_v);
	ret = clSetKernelArg(nonlinearpart_a, 2, sizeof(double),(void* )&A);
	ret = clSetKernelArg(nonlinearpart_a, 3, sizeof(double),(void* )&dt);
	ret = clSetKernelArg(nonlinearpart_a, 4, sizeof(double),(void* )&a);
    ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_a, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);	

//nonlinearpart_b
    ret = clSetKernelArg(nonlinearpart_b, 0, sizeof(cl_mem),(void *)&cl_u);
	ret = clSetKernelArg(nonlinearpart_b, 1, sizeof(cl_mem),(void* )&cl_v);
	ret = clSetKernelArg(nonlinearpart_b, 2, sizeof(double),(void* )&A);
	ret = clSetKernelArg(nonlinearpart_b, 3, sizeof(double),(void* )&dt);
	ret = clSetKernelArg(nonlinearpart_b, 4, sizeof(double),(void* )&b);
    ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_b, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);

//linear
	fft2dfor(&cl_u, &cl_uhat,&planHandle,&command_queue,&tmpBuffer);
	fft2dfor(&cl_v, &cl_vhat,&planHandle,&command_queue,&tmpBuffer);
//printf("A%f,B%f\n",A,B);
    ret = clSetKernelArg(linearpart, 0, sizeof(cl_mem),(void *)&cl_uhat);
    ret = clSetKernelArg(linearpart, 1, sizeof(cl_mem),(void *)&cl_vhat);
	ret = clSetKernelArg(linearpart, 2, sizeof(cl_mem),(void* )&cl_kx);
	ret = clSetKernelArg(linearpart, 3, sizeof(cl_mem),(void* )&cl_ky);
	ret = clSetKernelArg(linearpart, 4, sizeof(double),(void* )&Du);
	ret = clSetKernelArg(linearpart, 5, sizeof(double),(void* )&Dv);
	ret = clSetKernelArg(linearpart, 6, sizeof(double),(void* )&A);
	ret = clSetKernelArg(linearpart, 7, sizeof(double),(void* )&B);
	ret = clSetKernelArg(linearpart, 8, sizeof(double),(void* )&dt);
	ret = clSetKernelArg(linearpart, 9, sizeof(double),(void* )&c);
	ret = clSetKernelArg(linearpart, 10, sizeof(int),(void* )&Nx);
	ret = clSetKernelArg(linearpart, 11, sizeof(int),(void* )&Ny);
    ret = clEnqueueNDRangeKernel(command_queue, linearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);

	fft2dback(&cl_u, &cl_uhat,&planHandle,&command_queue,&tmpBuffer);
  	fft2dback(&cl_v, &cl_vhat,&planHandle,&command_queue,&tmpBuffer);

//nonlinearpart_b
    ret = clSetKernelArg(nonlinearpart_b, 0, sizeof(cl_mem),(void *)&cl_u);
	ret = clSetKernelArg(nonlinearpart_b, 1, sizeof(cl_mem),(void* )&cl_v);
	ret = clSetKernelArg(nonlinearpart_b, 2, sizeof(double),(void* )&A);
	ret = clSetKernelArg(nonlinearpart_b, 3, sizeof(double),(void* )&dt);
	ret = clSetKernelArg(nonlinearpart_b, 4, sizeof(double),(void* )&b);
    ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_b, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);		
//nonlinearpart_a
    ret = clSetKernelArg(nonlinearpart_a, 0, sizeof(cl_mem),(void *)&cl_u);
	ret = clSetKernelArg(nonlinearpart_a, 1, sizeof(cl_mem),(void* )&cl_v);
	ret = clSetKernelArg(nonlinearpart_a, 2, sizeof(double),(void* )&A);
	ret = clSetKernelArg(nonlinearpart_a, 3, sizeof(double),(void* )&dt);
	ret = clSetKernelArg(nonlinearpart_a, 4, sizeof(double),(void* )&a);
    ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_a, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);	
// done
	if(n==plottime){
		printf("time:%f, step:%d,%d,umax:%f,vmax:%f\n",n*dt,n,plotnum,umax[plotnum],vmax[plotnum]);
		plottime=plottime+plotgap;
		plotnum=plotnum+1;
   	 	writedata_C(&cl_u, &command_queue,Nx,Ny,plotnum,"u");
    	writedata_C(&cl_v, &command_queue,Nx,Ny,plotnum,"v");
        umax[plotnum]=writeimage(&cl_u, &command_queue,Nx,Ny,plotnum,"u");
        vmax[plotnum]=writeimage(&cl_v, &command_queue,Nx,Ny,plotnum,"v");
	}
}//end timestepping

	printf("Finished time stepping\n");
	mtime_e(&tvs,"Programm took:");
	writearray(umax,(Tmax/plotgap)+1,"u");
	writearray(vmax,(Tmax/plotgap)+1,"v");
	free(umax);
	free(vmax);	

	clReleaseMemObject(cl_u);
	clReleaseMemObject(cl_v);
	clReleaseMemObject(cl_uhat);
	clReleaseMemObject(cl_vhat);
	clReleaseMemObject(cl_kx);
	clReleaseMemObject(cl_ky);

    ret = clReleaseKernel(initialdata); 
    ret = clReleaseKernel(frequencies); 
    ret = clReleaseKernel(linearpart); 
    ret = clReleaseKernel(nonlinearpart_a);
    ret = clReleaseKernel(nonlinearpart_b);

	fftdestroy(&planHandle, &tmpBuffer);

	ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);

	for(i=0;i<num_platforms;i++){free(device_id[i]);}
	free(device_id);
	free(platform_id);
	free(num_devices);
	printf("Program execution complete\n");

	return 0;
}
Ejemplo n.º 7
0
void xcl_release_world(xcl_world world) {
	clReleaseCommandQueue(world.command_queue);
	clReleaseContext(world.context);
}
Ejemplo n.º 8
0
ofxClScheduler::~ofxClScheduler() {
	if(globalQ)	clReleaseCommandQueue(globalQ);
	if(context)	clReleaseContext(context);
}
Ejemplo n.º 9
0
int main(int argc, char** argv) {

   /* OpenCL 1.1 data structures */
   cl_platform_id* platforms;
   cl_program program;
   cl_context context;

   /* OpenCL 1.1 scalar data types */
   cl_uint numOfPlatforms;
   cl_int  error;

   /*
    Prepare an array of __cl_float4 via dynamic memory allocation
    This will map to the native vector type which is SSE / SSE2 / AVX on
    Intel-compatible processors.
   */
   cl_float8* ud_in = (cl_float8*) malloc( sizeof(cl_float8) * DATA_SIZE); // input to device
   cl_float8* ud_out = (cl_float8*) malloc( sizeof(cl_float8) * DATA_SIZE); // output from device
   for( int i = 0; i < DATA_SIZE; ++i) {
       ud_in[i] = (cl_float8){(float)i,(float)i,(float)i,(float)i,(float)i,(float)i,(float)i,(float)i};
       ud_out[i] = (cl_float8){(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f};
   }

   /* 
      Get the number of platforms 
      Remember that for each vendor's SDK installed on the computer,
      the number of available platform also increased. 
    */
   error = clGetPlatformIDs(0, NULL, &numOfPlatforms);
   if(error != CL_SUCCESS ) {			
      perror("Unable to find any OpenCL platforms");
      exit(1);
   }

   platforms = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numOfPlatforms);
   printf("Number of OpenCL platforms found: %d\n", numOfPlatforms);

   error = clGetPlatformIDs(numOfPlatforms, platforms, NULL);
   if(error != CL_SUCCESS ) {			
      perror("Unable to find any OpenCL platforms");
      exit(1);
   }
   // Search for a CPU/GPU device through the installed platforms
   // Build a OpenCL program and do not run it.
   for(cl_uint i = 0; i < numOfPlatforms; i++ ) {

        cl_uint numOfDevices = 0;

        /* Determine how many devices are connected to your platform */
        error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &numOfDevices);
        if (error != CL_SUCCESS ) { 
            perror("Unable to obtain any OpenCL compliant device info");
            exit(1);
        }
        cl_device_id* devices = (cl_device_id*) alloca(sizeof(cl_device_id) * numOfDevices);

        /* Load the information about your devices into the variable 'devices' */
        error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numOfDevices, devices, NULL);
        if (error != CL_SUCCESS ) { 
            perror("Unable to obtain any OpenCL compliant device info");
            exit(1);
        }
        printf("Number of detected OpenCL devices: %d\n", numOfDevices);

	    /* Create a context */
        cl_context_properties ctx[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i], 0 };
	    context = clCreateContext(ctx, numOfDevices, devices, NULL, NULL, &error);
	    if(error != CL_SUCCESS) {
	        perror("Can't create a valid OpenCL context");
	        exit(1);
	    }

	    /* For each device, create a buffer and partition that data among the devices for compute! */
	    cl_mem inobj = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
	                                  sizeof(cl_float8) * DATA_SIZE, ud_in, &error);
	    if(error != CL_SUCCESS) {
	        perror("Can't create a buffer");
	        exit(1);
	    }

        int offset = 0; 
        for(int i = 0; i < numOfDevices; ++i, ++offset ) {
	        /* Load the two source files into temporary datastores */
	        const char *file_names[] = {"vectorization.cl"}; 
	        const int NUMBER_OF_FILES = 1;
	        char* buffer[NUMBER_OF_FILES];
	        size_t sizes[NUMBER_OF_FILES];
	        loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes);
	
	        /* Create the OpenCL program object */
	        program = clCreateProgramWithSource(context, NUMBER_OF_FILES, (const char**)buffer, sizes, &error);				
		    if(error != CL_SUCCESS) {
		      perror("Can't create the OpenCL program object");
		      exit(1);   
		    }

	        /* Build OpenCL program object and dump the error message, if any */
	        char *program_log;
	        size_t log_size;
            char* build_options = "-fbin-llvmir -fbin-amdil -fbin-exe";
	        error = clBuildProgram(program, 1, &devices[i], build_options, NULL, NULL);		
		    if(error != CL_SUCCESS) {
		      // If there's an error whilst building the program, dump the log
		      clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
		      program_log = (char*) malloc(log_size+1);
		      program_log[log_size] = '\0';
		      clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, 
		            log_size+1, program_log, NULL);
		      printf("\n=== ERROR ===\n\n%s\n=============\n", program_log);
		      free(program_log);
		      exit(1);
		    }
	  
	        /* Query the program as to how many kernels were detected */
	        cl_uint numOfKernels;
	        error = clCreateKernelsInProgram(program, 0, NULL, &numOfKernels);
	        if (error != CL_SUCCESS) {
	            perror("Unable to retrieve kernel count from program");
	            exit(1);
	        }
	        cl_kernel* kernels = (cl_kernel*) alloca(sizeof(cl_kernel) * numOfKernels);
	        error = clCreateKernelsInProgram(program, numOfKernels, kernels, NULL);

            /* Loop thru each kernel and execute on device */
	        for(cl_uint j = 0; j < numOfKernels; j++) {
	            char kernelName[32];
	            cl_uint argCnt;
	            clGetKernelInfo(kernels[j], CL_KERNEL_FUNCTION_NAME, sizeof(kernelName), kernelName, NULL);
	            clGetKernelInfo(kernels[j], CL_KERNEL_NUM_ARGS, sizeof(argCnt), &argCnt, NULL);
	            printf("Kernel name: %s with arity: %d\n", kernelName, argCnt);
	            printf("About to create command queue and enqueue this kernel...\n");
	
	            /* Create a command queue */
	            cl_command_queue cQ = clCreateCommandQueue(context, devices[i], 0, &error);
	            if (error != CL_SUCCESS) { 
	                perror("Unable to create command-queue");
	                exit(1);
	            }
	
                /* Create a buffer and copy the data from the main buffer */
	            cl_mem outobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 
	                                                sizeof(cl_float8) * DATA_SIZE, 0, &error);
	            if (error != CL_SUCCESS) { 
	                perror("Unable to create sub-buffer object");
	                exit(1);
	            }

	            /* Let OpenCL know that the kernel is suppose to receive an argument */
	            error = clSetKernelArg(kernels[j], 0, sizeof(cl_mem), &inobj);
	            error = clSetKernelArg(kernels[j], 1, sizeof(cl_mem), &outobj);
	            if (error != CL_SUCCESS) { 
	                perror("Unable to set buffer object in kernel");
	                exit(1);
	            }
	
	            /* Enqueue the kernel to the command queue */
	            error = clEnqueueTask(cQ, kernels[j], 0, NULL, NULL);
	            if (error != CL_SUCCESS) { 
	                perror("Unable to enqueue task to command-queue");
	                exit(1);
	            }
	            printf("Task has been enqueued successfully!\n");

	            /* Enqueue the read-back from device to host */
	            error = clEnqueueReadBuffer(cQ, outobj,
	                                         CL_TRUE,               // blocking read
	                                         0,                      // read from the start
	                                         sizeof(cl_float8)*DATA_SIZE,          // how much to copy
	                                         ud_out, 0, NULL, NULL);
                /* Check the returned data */
	            if ( valuesOK(ud_in, ud_out, DATA_SIZE) ) {
	                printf("Check passed!\n");
	            } else printf("Check failed!\n");
	
	            /* Release the command queue */
	            clReleaseCommandQueue(cQ);
	            clReleaseMemObject(outobj);
	        } 

        /* Clean up */
        
        for(cl_uint i = 0; i < numOfKernels; i++) { clReleaseKernel(kernels[i]); }
        for(int i=0; i< NUMBER_OF_FILES; i++) { free(buffer[i]); }
        clReleaseProgram(program);
    }// end of device loop and execution
    
	    clReleaseMemObject(inobj);
        clReleaseContext(context);
   }// end of platform loop

   free(ud_in);
   free(ud_out);
}
Ejemplo n.º 10
0
int main(int argc, char** argv)
{
    int          rank, size;         // MPI rank & size
    int          err;                // error code returned from OpenCL calls
    float        h_a[LENGTH];        // a vector
    float        h_b[LENGTH];        // b vector
    float        h_c[LENGTH];        // c vector (a+b) returned from the compute device (local per task)
    float        _h_c[LENGTH];       // c vector (a+b) returned from the compute device (global for master)
    unsigned int correct;            // number of correct results

    size_t global;                   // global domain size
    size_t local;                    // local  domain size

    cl_device_id     device_id;      // compute device id
    cl_context       context;        // compute context
    cl_command_queue commands;       // compute command queue
    cl_program       program;        // compute program
    cl_kernel        ko_vadd;        // compute kernel

    cl_mem d_a;                      // device memory used for the input  a vector
    cl_mem d_b;                      // device memory used for the input  b vector
    cl_mem d_c;                      // device memory used for the output c vector

    int mycount, i;

    err = MPI_Init (&argc, &argv);

    if (err != MPI_SUCCESS)
    {
        printf ("MPI_Init failed!\n");
        exit (-1);
    }

    err = MPI_Comm_rank (MPI_COMM_WORLD, &rank);
    if (err != MPI_SUCCESS)
    {
        printf ("MPI_Comm_rank failed!\n");
        exit (-1);
    }

    err = MPI_Comm_size (MPI_COMM_WORLD, &size);
    if (err != MPI_SUCCESS)
    {
        printf ("MPI_Comm_size failed\n");
        exit (-1);
    }

    if (LENGTH % size != 0)
    {
        printf ("Number of MPI processes must divide LENGTH (%d)\n", LENGTH);
        exit (-1);
    }

    mycount = LENGTH / size;

    if (rank == 0)
    {
        for (i = 0; i < LENGTH; i++)
        {
            h_a[i] = rand() / (float)RAND_MAX;
            h_b[i] = rand() / (float)RAND_MAX;
            h_a[i] = i;
            h_b[i] = i*2;
        }
        err = MPI_Bcast (h_a, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD);
        if (err != MPI_SUCCESS)
        {
            printf ("MPI_Bcast failed transferring h_a\n");
            exit (-1);
        }
        err = MPI_Bcast (h_b, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD);
        if (err != MPI_SUCCESS)
        {
            printf ("MPI_Bcast failed transferring h_b\n");
            exit (-1);
        }
    }
    else
    {
        err = MPI_Bcast (h_a, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD);
        if (err != MPI_SUCCESS)
        {
            printf ("MPI_Bcast failed receiving h_a\n");
            exit (-1);
        }
        err = MPI_Bcast (h_b, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD);
        if (err != MPI_SUCCESS)
        {
            printf ("MPI_Bcast failed receiving h_b\n");
            exit (-1);
        }
    }

    // Set up platform
    cl_uint numPlatforms;

    // Find number of platforms
    err = clGetPlatformIDs(0, NULL, &numPlatforms);
    if (err != CL_SUCCESS || numPlatforms <= 0)
    {
        printf("Error: Failed to find a platform!\n");
        return EXIT_FAILURE;
    }

    // Get all platforms
    cl_platform_id Platform[numPlatforms];
    err = clGetPlatformIDs(numPlatforms, Platform, NULL);
    if (err != CL_SUCCESS || numPlatforms <= 0)
    {
        printf("Error: Failed to get the platform!\n");
        return EXIT_FAILURE;
    }

    // Secure a GPU
    for (i = 0; i < numPlatforms; i++)
    {
        err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL);
        if (err == CL_SUCCESS)
            break;
    }

    if (device_id == NULL)
    {
        printf("Error: Failed to create a device group!\n");
        return EXIT_FAILURE;
    }
    else
    {
        if (output_device_info (rank, device_id) != CL_SUCCESS)
            return EXIT_FAILURE;
    }

    // Create a compute context
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        return EXIT_FAILURE;
    }

    // Create a command queue
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        return EXIT_FAILURE;
    }

    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
    if (!program)
    {
        printf("Error: Failed to create compute program!\n");
        return EXIT_FAILURE;
    }

    // Build the program
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        exit(1);
    }

    // Create the compute kernel from the program
    ko_vadd = clCreateKernel(program, "vadd", &err);
    if (!ko_vadd || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        exit(1);
    }

    // Create the input (a, b) and output (c) arrays in device memory
    d_a = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * mycount, NULL, NULL);
    d_b = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * mycount, NULL, NULL);
    d_c = clCreateBuffer(context,  CL_MEM_WRITE_ONLY, sizeof(float) * mycount, NULL, NULL);
    if (!d_a || !d_b || !d_c)
    {
        printf("Error: Failed to allocate device memory!\n");
        exit(1);
    }

    // Write a and b vectors into compute device memory
    err = clEnqueueWriteBuffer(commands, d_a, CL_TRUE, 0, sizeof(float) * mycount, &h_a[rank*mycount], 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write h_a to source array!\n");
        exit(1);
    }

    err = clEnqueueWriteBuffer(commands, d_b, CL_TRUE, 0, sizeof(float) * mycount, &h_b[rank*mycount], 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write h_b to source array!\n");
        exit(1);
    }

    // Set the arguments to our compute kernel
    err  = clSetKernelArg(ko_vadd, 0, sizeof(cl_mem), &d_a);
    err |= clSetKernelArg(ko_vadd, 1, sizeof(cl_mem), &d_b);
    err |= clSetKernelArg(ko_vadd, 2, sizeof(cl_mem), &d_c);
    err |= clSetKernelArg(ko_vadd, 3, sizeof(unsigned int), &mycount);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel arguments! %d\n", err);
        exit(1);
    }

    // Get the maximum work group size for executing the kernel on the device
    err = clGetKernelWorkGroupInfo(ko_vadd, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve kernel work group info! %d\n", err);
        exit(1);
    }

    // Execute the kernel over the entire range of our 1d input data set
    // using the maximum number of work group items for this device
    global = LENGTH;
    err = clEnqueueNDRangeKernel(commands, ko_vadd, 1, NULL, &global, &local, 0, NULL, NULL);
    if (err)
    {
        printf("Error: Failed to execute kernel!\n");
        return EXIT_FAILURE;
    }

    // Wait for the commands to complete before reading back results
    clFinish(commands);

    // Read back the results from the compute device
    err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * mycount, &h_c, 0, NULL, NULL );
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to read output array! %d\n", err);
        exit(1);
    }

    err = MPI_Gather (h_c, mycount, MPI_FLOAT, _h_c, mycount, MPI_FLOAT, 0, MPI_COMM_WORLD);
    if (err != MPI_SUCCESS)
    {
        printf ("MPI_Gather failed receiving h_c\n");
        exit (-1);
    }

    if (rank == 0)
    {
        // Test the results
        correct = 0;
        float tmp;

        for(i = 0; i < LENGTH; i++)
        {
            tmp = h_a[i] + h_b[i];     // assign element i of a+b to tmp
            tmp -= _h_c[i];             // compute deviation of expected and output result
            if(tmp*tmp < TOL*TOL)      // correct if square deviation is less than tolerance squared
                correct++;
            else
                printf(" tmp %f h_a %f h_b %f h_c %f \n",tmp, h_a[i], h_b[i], _h_c[i]);
        }

        // summarize results
        printf("C = A+B:  %d out of %d results were correct.\n", correct, LENGTH);
    }

    // cleanup then shutdown
    clReleaseMemObject(d_a);
    clReleaseMemObject(d_b);
    clReleaseMemObject(d_c);
    clReleaseProgram(program);
    clReleaseKernel(ko_vadd);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    err = MPI_Finalize ();
    if (err != MPI_SUCCESS)
    {
        printf ("MPI_Finalize failed!\n");
        exit (-1);
    }

    return 0;
}
Ejemplo n.º 11
0
int main()
{
	struct ecl_context ctx;
	cl_program program;
	cl_kernel kernel;
	cl_int err;
	cl_mem in, out;
	size_t globWorkSize;
	int n = 100000;
	cl_event event;
	cl_ulong start, end;

	err = eclGetSomeContext(&ctx);
	assert(err == CL_SUCCESS);

	err = eclGetProgramFromSource(ctx.context, ctx.device, src, &program);
	assert(err == CL_SUCCESS);

	kernel = clCreateKernel(program, "stream", &err);
	assert(err == CL_SUCCESS);

	in = clCreateBuffer(ctx.context, CL_MEM_READ_ONLY, n * sizeof(float),
			0, &err);
	assert(err == CL_SUCCESS);
	out = clCreateBuffer(ctx.context, CL_MEM_READ_ONLY, n * sizeof(float),
			0, &err);
	assert(err == CL_SUCCESS);

	err = clSetKernelArg(kernel, 0, sizeof(in), &in);
	assert(err == CL_SUCCESS);
	err = clSetKernelArg(kernel, 1, sizeof(out), &out);
	assert(err == CL_SUCCESS);
	err = clSetKernelArg(kernel, 2, sizeof(n), &n);
	assert(err == CL_SUCCESS);

	globWorkSize = n;
	err = clEnqueueNDRangeKernel(ctx.queue, kernel, 1, 0, &globWorkSize, 0,
			0, 0, &event);
	assert(err == CL_SUCCESS);
	clWaitForEvents(1, &event);
	err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
			sizeof(start), &start, 0);
	if (err == CL_PROFILING_INFO_NOT_AVAILABLE) {
		printf("Profling info not available.\n");
		return err;
	} else if (err) {
		printf("An error occurred getting profiling info.\n");
		return err;
	}
	err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
			sizeof(end), &end, 0);
	if (err == CL_PROFILING_INFO_NOT_AVAILABLE) {
		printf("Profling info not available.\n");
		return err;
	} else if (err) {
		printf("An error occurred getting profiling info.\n");
		return err;
	}
	printf("T/ms: %lf\n", (double)(end - start) / 1.0e6);
	printf("MB:   %lf\n", (double)n * sizeof(float) / 1.0e6);
	printf("GB/s: %lf\n", (double)n * sizeof(float) / (end - start));

	clReleaseCommandQueue(ctx.queue);
	clReleaseContext(ctx.context);
	clReleaseProgram(program);
	clReleaseKernel(kernel);
	clReleaseMemObject(in);
	clReleaseMemObject(out);
	clReleaseEvent(event);

	return err;
}
Ejemplo n.º 12
0
void OpenCLFree()
{
	clReleaseCommandQueue(opencl_command_queue);
	clReleaseContext(opencl_context);
}
void call_kernel(float *data,unsigned int count,char * cl_name,float *results) {

    FILE* programHandle;
    size_t programSize, KernelSourceSize;
    char *programBuffer, *KernelSource;

    size_t global;                      // global domain size for our calculation
    size_t local;                       // local domain size for our calculation

    cl_device_id device_id;             // compute device id
    cl_context context;                 // compute context
    cl_command_queue commands;          // compute command queue
    cl_program program;                 // compute program
    cl_kernel kernel;                   // compute kernel

    cl_mem input;                       // device memory used for the input array
    cl_mem output;                      // device memory used for the output array

	int err;
    int gpu = 1;
    err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    commands = clCreateCommandQueue(context, device_id, 0, &err);

	//----------------------------------------------------------------------------
    // get size of kernel source
    programHandle = fopen(cl_name, "r");
    fseek(programHandle, 0, SEEK_END);
    programSize = ftell(programHandle);
    rewind(programHandle);

    programBuffer = (char*) malloc(programSize + 1);
    programBuffer[programSize] = '\0';
    fread(programBuffer, sizeof(char), programSize, programHandle);
    fclose(programHandle);

    // create program from buffer
    program = clCreateProgramWithSource(context,1,(const char**) &programBuffer,&programSize, NULL);
    free(programBuffer);

    // read kernel source back in from program to check
    clGetProgramInfo(program, CL_PROGRAM_SOURCE, 0, NULL, &KernelSourceSize);
    KernelSource = (char*) malloc(KernelSourceSize);
    clGetProgramInfo(program, CL_PROGRAM_SOURCE, KernelSourceSize, KernelSource, NULL);

    program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    kernel = clCreateKernel(program, "square", &err);
	//----------------------------------------------------------------------------

    input = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * count, NULL, NULL);
    output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);

    err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);

    clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);

    err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    global = count;
    err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
    clFinish(commands);
    err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );

    clReleaseMemObject(input);
    clReleaseMemObject(output);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    printf("nKernel source:\n\n %s \n", KernelSource);
    free(KernelSource);
}
Ejemplo n.º 14
0
int main()
{
	cl_device_id device = new_device();
	cl_context context;
	cl_command_queue queue;
	cl_program program;
	cl_kernel kernel;
	cl_int i, j, err;

	float matrix_1[80], matrix_2[80], matrix_3[80];
	const size_t buffer_origin[3] = { 5 * sizeof(float), 3, 0 };
	const size_t host_origin[3] = { 1 * sizeof(float), 1, 0 };
	const size_t region[3] = { 4 * sizeof(float), 4, 1 };
	cl_mem matrix_buffer_1, matrix_buffer_2, matrix_buffer_3;

	for (i = 0; i < 80; i++)
	{
		matrix_1[i] = i * 1.0f;
		matrix_2[i] = 3.0;
		matrix_3[i] = 0;
	}

	context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
	if (err < 0)
	{
		perror("Couldn't create a context\n");
		exit(1);
	}
	program = build_program(context, device, FILE_NAME);
	kernel = clCreateKernel(program, "add", &err);
	if (err < 0) {
		perror("Couldn't create a kernel\n");
		exit(1);
	}
	
	matrix_buffer_1 = clCreateBuffer(context, CL_MEM_READ_WRITE |
		CL_MEM_COPY_HOST_PTR, sizeof(matrix_1), matrix_1, &err);
	if (err < 0) {
		perror("Couldn't create a buffer\n");
		exit(1);
	}
	matrix_buffer_2 = clCreateBuffer(context, CL_MEM_READ_WRITE |
		CL_MEM_COPY_HOST_PTR, sizeof(matrix_2), matrix_2, &err);
	if (err < 0) {
		perror("Couldn't create a buffer\n");
		exit(1);
	}
	matrix_buffer_3 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(matrix_3), matrix_3, &err);
	if (err < 0) {
		perror("Couldn't create a buffer\n");
		exit(1);
	}
	int row = 8;
	int col = 10;
	err = clSetKernelArg(kernel, 0, sizeof(int), &row);
	err = clSetKernelArg(kernel, 1, sizeof(int), &col);
	err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &matrix_buffer_1);
	err = clSetKernelArg(kernel, 3, sizeof(cl_mem), &matrix_buffer_2);
	err = clSetKernelArg(kernel, 4, sizeof(cl_mem), &matrix_buffer_3);

	queue = clCreateCommandQueue(context, device, 0, &err);
	if (err < 0) {
		perror("Couldn't create a command queue\n");
		exit(1);
	}
	err = clEnqueueTask(queue, kernel, 0, NULL, NULL);
	if (err < 0) {
		perror("Couldn't enque task\n");
		exit(1);
	}
	err = clEnqueueReadBuffer(queue, matrix_buffer_3, CL_TRUE, 0,
		sizeof(matrix_3), &matrix_3, 0, NULL, NULL);
	for (i = 0; i < 8; i++) {
		for (j = 0; j < 10; j++) {
			printf("%6.1f ", matrix_3[j + i * 10]);
		}
		printf("\n");
	}
	clReleaseMemObject(matrix_buffer_1);
	clReleaseMemObject(matrix_buffer_2);
	clReleaseMemObject(matrix_buffer_3);
	clReleaseKernel(kernel);
	clReleaseCommandQueue(queue);
	clReleaseContext(context);
	return 0;
}
Ejemplo n.º 15
0
int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufX, bufY;
    cl_event event = NULL;
    int ret = 0;
	int lenX = 1 + (N-1)*abs(incx);
	int lenY = 1 + (N-1)*abs(incy);

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenX*sizeof(cl_float)), NULL, &err);
    bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err);

    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL);

	printResult();

    /* Call clblas function. */
    err = clblasSrot(N, bufX, 0, incx, bufY, 0, incy, C, S, 1, &queue, 0, NULL, &event);
//	printf("here\n");
    if (err != CL_SUCCESS) {
        printf("clblasSrot() failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);

        /* Fetch results of calculations from GPU memory. */
        err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)),
                                    Y, 0, NULL, NULL);
        err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)),
                                    X, 0, NULL, NULL);

        /* At this point you will get the result of SROT placed in vector Y. */
        printResult();
    }

    /* Release OpenCL events. */
    clReleaseEvent(event);

    /* Release OpenCL memory objects. */
    clReleaseMemObject(bufY);
    clReleaseMemObject(bufX);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);

    return ret;
}
Ejemplo n.º 16
0
int vadd(void) {
    // Create the two input vectors
    int i;
    const int LIST_SIZE = 1024;
    int *A = (int*)malloc(sizeof(int)*LIST_SIZE);
    int *B = (int*)malloc(sizeof(int)*LIST_SIZE);
    for(i = 0; i < LIST_SIZE; i++) {
        A[i] = i;
        B[i] = LIST_SIZE - i;
    }

    // Load the kernel source code into the array source_str
    FILE *fp;
    char *source_str;
    size_t source_size;

    const char *fname = "/home/ckit/program/workspace_java/OpenCLHookSample/jni/vector_add_kernel.cl";
//    const char *fname = "vadd.ir";
    fp = fopen(fname, "rb");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose( fp );

    // Get platform and device information
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;   
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;

      cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
	printf("clGetPlatformIDs err=%d,num_platforms=%d, platform_id=%x\n", ret, ret_num_platforms, (unsigned int)platform_id );

	//#define XXX CL_DEVICE_TYPE_DEFAULT
//	#define XXX CL_DEVICE_TYPE_ALL
//	#define XXX CL_DEVICE_TYPE_GPU
	#define XXX CL_DEVICE_TYPE_CPU

	cl_uint num_platforms = 2;
	cl_platform_id* platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id)* num_platforms);
	if(NULL == platforms){
		printf("malloc err!\n");
	}
	ret = clGetPlatformIDs(2, platforms, &ret_num_platforms);
	printf("clGetPlatformIDs err=%d,num_platforms=%d, platform_id=%x\n", ret, ret_num_platforms, (unsigned int)platforms[1] );
    ret = clGetDeviceIDs( platforms[0], XXX, 1, &device_id, &ret_num_devices);
    printf("clGetDeviceIDs err=%d,num_platforms=%d, device_id=%x\n", ret, ret_num_platforms, (unsigned int)device_id );
    char name[64];
    ret = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(char)*64, name, NULL);
    printf("device_name : %s\n", name);

    // Create an OpenCL context
//    cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);

    cl_context_properties cps[3] =
    {
    	(cl_context_properties)CL_CONTEXT_PLATFORM,
        (cl_context_properties)platforms[0],
        (cl_context_properties)0
    };
    cl_context context = clCreateContextFromType( cps, XXX, NULL, NULL, &ret);
	printf("clCreateContextFromType err=%d,device_type=%x\n", ret, (unsigned int)XXX);

    // Create a command queue
    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

    // Create memory buffers on the device for each vector 
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int), NULL, &ret);
    cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
            LIST_SIZE * sizeof(int), NULL, &ret);
    cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
            LIST_SIZE * sizeof(int), NULL, &ret);

    // Copy the lists A and B to their respective memory buffers
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
            LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, 
            LIST_SIZE * sizeof(int), B, 0, NULL, NULL);

    // Create a program from the kernel source
    cl_program program = clCreateProgramWithSource(context, 1,
            (const char **)&source_str, (const size_t *)&source_size, &ret);
//    cl_int status;
//    cl_int err;
//    cl_program program = clCreateProgramWithBinary(
//    		context, 1, &device_id, &source_size, (const unsigned char **)&source_str, &status, &err);
    // Build the program
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

    // Create the OpenCL kernel
    cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);

    // Set the arguments of the kernel
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
    
    // Execute the OpenCL kernel on the list
    size_t global_item_size = LIST_SIZE; // Process the entire lists
    size_t local_item_size = 64; // Process in groups of 64
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, 
            &global_item_size, &local_item_size, 0, NULL, NULL);

    // Read the memory buffer C on the device to the local variable C
    int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
    ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, 
            LIST_SIZE * sizeof(int), C, 0, NULL, NULL);

    // Display the result to the screen
    for(i = 0; i < /*LIST_SIZE*/10; i++)
        printf("%d + %d = %d\n", A[i], B[i], C[i]);

    // Clean up
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(a_mem_obj);
    ret = clReleaseMemObject(b_mem_obj);
    ret = clReleaseMemObject(c_mem_obj);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);
    free(A);
    free(B);
    free(C);
    return 0;
}
Ejemplo n.º 17
0
void runProgram(int N, char *fileName)
{
	printf("GPU Symmetrize()..."
		"\nSquareMatrix[%d][%d]\n", N, N);

	int i,j;

	// initialize input array
	float *A;
	A = (float*)malloc(sizeof(float)*N*N);

	for( i = 0; i < N ; ++i )
	{
		for( j = 0; j < N ; ++j )
		{
			A[i*N + j] = j;	
		}
	}

	//  result
	float *Aout;
	Aout = (float*)malloc(sizeof(float)*N*N);


#ifdef DEBUG
	puts("A");
	check_2d_f(A,N,N);
#endif

	int NumK = 1;
	int NumE = 2;

	double gpuTime;
	cl_ulong gstart, gend;

	//------------------------------------------------
	//  OpenCL 
	//------------------------------------------------
	cl_int err;

	cl_platform_id platform;          // OpenCL platform
	cl_device_id device_id;           // device ID
	cl_context context;               // context
	cl_command_queue queue;           // command queue
	cl_program program;               // program

	cl_kernel *kernel = (cl_kernel*)malloc(sizeof(cl_kernel)*NumK);

	cl_event *event = (cl_event*)malloc(sizeof(cl_event)*NumE);    

	// read kernel file
	//char *fileName = "transpose_kernel.cl";
	char *kernelSource;
	size_t size;
	FILE *fh = fopen(fileName, "rb");
	if(!fh) {
		printf("Error: Failed to open kernel file!\n");
		exit(1);
	}
	fseek(fh,0,SEEK_END);
	size=ftell(fh);
	fseek(fh,0,SEEK_SET);
	kernelSource = malloc(size+1);
	size_t result;
	result = fread(kernelSource,1,size,fh);
	if(result != size){ fputs("Reading error", stderr);exit(1);}
	kernelSource[size] = '\0';
	
	// Bind to platform
	err = clGetPlatformIDs(1, &platform, NULL);
	OCL_CHECK(err);

	// Get ID for the device
	err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
	OCL_CHECK(err);

	// Create a context  
	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	OCL_CHECK(err);

	// Create a command queue 
	queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
	OCL_CHECK(err);

	// Create the compute program from the source buffer
	program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, NULL, &err);
	OCL_CHECK(err);

	// turn on optimization for kernel
	char *options="-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros -cl-unsafe-math-optimizations -cl-finite-math-only";

	err = clBuildProgram(program, 1, &device_id, options, NULL, NULL);
	if(err != CL_SUCCESS)
		printCompilerOutput(program, device_id);
	OCL_CHECK(err);



#ifdef SAVEBIN
	// Calculate size of binaries 
	size_t binary_size;
	err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_size, NULL);
	OCL_CHECK(err);

	//printf("binary size = %ld\n", binary_size);

	unsigned char* bin;
	bin = (unsigned char*)malloc(sizeof(unsigned char)*binary_size);

	err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*) , &bin, NULL);
	OCL_CHECK(err);

	//puts("save binaries");

	// Print the binary out to the output file
	fh = fopen("kernel.bin", "wb");
	fwrite(bin, 1, binary_size, fh);
	fclose(fh);

	puts("done save binaries");

#endif


	kernel[0] = clCreateKernel(program, "kernel_a", &err);
	OCL_CHECK(err);

	// memory on device
	cl_mem A_d    	= clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N*N,  NULL, NULL);
	cl_mem Aout_d   = clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N*N,  NULL, NULL);


	// copy data to device
	err = clEnqueueWriteBuffer(queue, A_d, 	CL_TRUE, 0, sizeof(float)*N*N, 	A, 0, NULL , &event[0]); 
	OCL_CHECK(err);

	size_t localsize[2];
	size_t globalsize[2];

	localsize[0] = 16; 
	localsize[1] = 16;

	globalsize[0] = N;
	globalsize[1] = N;

	err  = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &A_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err  = clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &Aout_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}


	err = clEnqueueNDRangeKernel(queue, kernel[0], 2, NULL, globalsize, localsize, 0, NULL, NULL);
	OCL_CHECK(err);

	clFinish(queue);

	// read device data back to host
	clEnqueueReadBuffer(queue, Aout_d, CL_TRUE, 0, sizeof(float)*N*N, Aout, 0, NULL , &event[1]);

	err = clWaitForEvents(1,&event[1]);
	OCL_CHECK(err);

	err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &gstart, NULL);
	OCL_CHECK(err);

	err = clGetEventProfilingInfo (event[1], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &gend, NULL);
	OCL_CHECK(err);

	gpuTime = (double)(gend -gstart)/1000000000.0;



	//check_1d_f(sum, blks+1);

#ifdef DEBUG
	puts("Output");
	check_2d_f(Aout,N,N);
#endif

	printf("oclTime = %lf (s)\n", gpuTime );

	// free
	clReleaseMemObject(A_d);	
	clReleaseMemObject(Aout_d);	


	// // check
	// int flag = 1;
	// for(i=0;i<N;++i){
	// 	for(j=0;j<N;++j){
	// 		if(A[i*N+j] != At[j*N+i])		
	// 		{
	// 			flag  = 0;
	// 			break;
	// 		}
	// 	}
	// }
	// if( flag == 0 )
	// {
	// 	puts("Bugs! Check program.");
	// }else{
	// 	puts("Succeed!");	
	// }



	clReleaseProgram(program);
	clReleaseContext(context);
	clReleaseCommandQueue(queue);
	for(i=0;i<NumK;++i){
		clReleaseKernel(kernel[i]);
	}
	for(i=0;i<NumE;++i){
		clReleaseEvent(event[i]);
	}
	free(kernelSource);


#ifdef SAVEBIN
	free(bin);
#endif



	free(A);
	free(Aout);

	return;
}
Ejemplo n.º 18
0
int main(int argc, char *argv[])
{
  int error, xsize, ysize, rgb_max;
  int *r, *b, *g;

  float *gray, *congray, *congray2, *congray_cl;

  // identity kernel
  // float filter[] = {
  //   0,0,0,0,0,0,0,
  //   0,0,0,0,0,0,0,
  //   0,0,0,0,0,0,0,
  //   0,0,0,1,0,0,0,
  //   0,0,0,0,0,0,0,
  //   0,0,0,0,0,0,0,
  //   0,0,0,0,0,0,0,
  // };

  // 45 degree motion blur
  float filter[] =
    {0,      0,      0,      0,      0, 0.0145,      0,
     0,      0,      0,      0, 0.0376, 0.1283, 0.0145,
     0,      0,      0, 0.0376, 0.1283, 0.0376,      0,
     0,      0, 0.0376, 0.1283, 0.0376,      0,      0,
     0, 0.0376, 0.1283, 0.0376,      0,      0,      0,
0.0145, 0.1283, 0.0376,      0,      0,      0,      0,
     0, 0.0145,      0,      0,      0,      0,      0};

  // mexican hat kernel
  // float filter[] = {
  //   0, 0,-1,-1,-1, 0, 0,
  //   0,-1,-3,-3,-3,-1, 0,
  //  -1,-3, 0, 7, 0,-3,-1,
  //  -1,-3, 7,24, 7,-3,-1,
  //  -1,-3, 0, 7, 0,-3,-1,
  //   0,-1,-3,-3,-3,-1, 0,
  //   0, 0,-1,-1,-1, 0, 0
  // };


  if(argc != 3)
  {
    fprintf(stderr, "Usage: %s image.ppm num_loops\n", argv[0]);
    abort();
  }

  const char* filename = argv[1];
  const int num_loops = atoi(argv[2]);


  // --------------------------------------------------------------------------
  // load image
  // --------------------------------------------------------------------------
  printf("Reading ``%s''\n", filename);
  ppma_read(filename, &xsize, &ysize, &rgb_max, &r, &g, &b);
  printf("Done reading ``%s'' of size %dx%d\n", filename, xsize, ysize);

  // --------------------------------------------------------------------------
  // allocate CPU buffers
  // --------------------------------------------------------------------------
  posix_memalign((void**)&gray, 32, 4*xsize*ysize*sizeof(float));
  if(!gray) { fprintf(stderr, "alloc gray"); abort(); }
    
  posix_memalign((void**)&congray, 32, 4*xsize*ysize*sizeof(float));
  if(!congray) { fprintf(stderr, "alloc congray"); abort(); }
    
  posix_memalign((void**)&congray2, 32, 4*xsize*ysize*sizeof(float));
  if(!congray2) { fprintf(stderr, "alloc congray2"); abort(); }
    
  posix_memalign((void**)&congray_cl, 32, 4*xsize*ysize*sizeof(float));
  if(!congray_cl) { fprintf(stderr, "alloc congray_cl"); abort(); }

  // --------------------------------------------------------------------------
  // convert image to grayscale
  // --------------------------------------------------------------------------
  for(int n = 0; n < xsize*ysize; ++n) {
    gray[4*n] = r[n];
    gray[4*n+1] = g[n];
    gray[4*n+2] = b[n];
    gray[4*n+3] = (0.21f*r[n])/rgb_max + (0.72f*g[n])/rgb_max + (0.07f*b[n])/rgb_max;
  }

    for(int n = 0; n < 4*xsize*ysize; ++n) {
        congray[n]=gray[n];
    }
    
  // --------------------------------------------------------------------------
  // execute filter on cpu
  // --------------------------------------------------------------------------
  

    
  for(int s=0;s<num_loops;s++){
    for(int i = HALF_FILTER_WIDTH; i < ysize - HALF_FILTER_WIDTH; ++i)
  {
    for(int j = HALF_FILTER_WIDTH; j < xsize - HALF_FILTER_WIDTH; ++j)
    {
      float sumR = 0;
      float sumG = 0;
      float sumB = 0;
      float sum = 0;
      
      for(int k = -HALF_FILTER_WIDTH; k <= HALF_FILTER_WIDTH; ++k)
      {
        for(int l = -HALF_FILTER_WIDTH; l <= HALF_FILTER_WIDTH; ++l){
      sumR += congray[4*((i+k)*xsize + (j+l))] *
            filter[(k+HALF_FILTER_WIDTH)*FILTER_WIDTH + (l+HALF_FILTER_WIDTH)];
	  sumG += congray[4*((i+k)*xsize + (j+l))+1] *
            filter[(k+HALF_FILTER_WIDTH)*FILTER_WIDTH + (l+HALF_FILTER_WIDTH)];
	  sumB += congray[4*((i+k)*xsize + (j+l))+2] *
            filter[(k+HALF_FILTER_WIDTH)*FILTER_WIDTH + (l+HALF_FILTER_WIDTH)];
	  //sum += congray[4*((i+k)*xsize + (j+l))+3] * filter[(k+HALF_FILTER_WIDTH)*FILTER_WIDTH + (l+HALF_FILTER_WIDTH)];
        }
      }
      congray2[4*(i*xsize + j)] = sumR;
      congray2[4*(i*xsize + j)+1] = sumG;
      congray2[4*(i*xsize + j)+2] = sumB;
      congray2[4*(i*xsize + j)+3] = sum;
    }
  }
      for(int i = HALF_FILTER_WIDTH; i < ysize - HALF_FILTER_WIDTH; ++i)
      {
          for(int j = HALF_FILTER_WIDTH; j < xsize - HALF_FILTER_WIDTH; ++j)
          {
              congray[4*(i*xsize + j)] = congray2[4*(i*xsize + j)];
              congray[4*(i*xsize + j)+1] = congray2[4*(i*xsize + j)+1];
              congray[4*(i*xsize + j)+2] = congray2[4*(i*xsize + j)+2];
              congray[4*(i*xsize + j)+3] = congray2[4*(i*xsize + j)+3];
          }
      }
      
  }
    
    
  // --------------------------------------------------------------------------
  // output cpu filtered image
  // --------------------------------------------------------------------------
  printf("Writing cpu filtered image\n");
  for(int n = 0; n < xsize*ysize; ++n) {
    r[n] = (int)(congray[4*n] );
    g[n] = (int)(congray[4*n+1] );
    b[n] = (int)(congray[4*n+2]);
  }
  error = ppma_write("output_cpu.ppm", xsize, ysize, r, g, b);
  if(error) { fprintf(stderr, "error writing image"); abort(); }

    
  // --------------------------------------------------------------------------
  // get an OpenCL context and queue
  // --------------------------------------------------------------------------
  cl_context ctx;
  cl_command_queue queue;
  create_context_on(CHOOSE_INTERACTIVELY, CHOOSE_INTERACTIVELY, 0, &ctx, &queue, 0);
  print_device_info_from_queue(queue);

  // --------------------------------------------------------------------------
  // load kernels
  // --------------------------------------------------------------------------
  char *knl_text = read_file("convolution_color.cl");
  cl_kernel knl = kernel_from_string(ctx, knl_text, "convolution_color", NULL);
  free(knl_text);

#ifdef NON_OPTIMIZED
  int deviceWidth = xsize;
#else
  int deviceWidth = ((xsize + WGX - 1)/WGX)* WGX;
#endif
  int deviceHeight = ysize;
  size_t deviceDataSize = 4 * deviceHeight*deviceWidth*sizeof(float);

  // --------------------------------------------------------------------------
  // allocate device memory
  // --------------------------------------------------------------------------
  cl_int status;
  cl_mem buf_gray = clCreateBuffer(ctx, CL_MEM_READ_ONLY,
     deviceDataSize, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  cl_mem buf_congray = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY,
      deviceDataSize, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  cl_mem buf_filter = clCreateBuffer(ctx, CL_MEM_READ_ONLY,
     FILTER_WIDTH*FILTER_WIDTH*sizeof(float), 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  // --------------------------------------------------------------------------
  // transfer to device
  // --------------------------------------------------------------------------
#ifdef NON_OPTIMIZED
  CALL_CL_SAFE(clEnqueueWriteBuffer(
        queue, buf_gray, /*blocking*/ CL_TRUE, /*offset*/ 0,
        deviceDataSize, gray, 0, NULL, NULL));
#else
  size_t buffer_origin[3] = {0,0,0};
  size_t host_origin[3] = {0,0,0};
  size_t region[3] = {deviceWidth*sizeof(float), ysize, 1};
  clEnqueueWriteBufferRect(queue, buf_gray, CL_TRUE,
                           buffer_origin, host_origin, region,
                           deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0,
                           gray, 0, NULL, NULL);
#endif

  CALL_CL_SAFE(clEnqueueWriteBuffer(
        queue, buf_filter, /*blocking*/ CL_TRUE, /*offset*/ 0,
        FILTER_WIDTH*FILTER_WIDTH*sizeof(float), filter, 0, NULL, NULL));

  // --------------------------------------------------------------------------
  // run code on device
  // --------------------------------------------------------------------------

  cl_int rows = ysize;
  cl_int cols = xsize;
  cl_int filterWidth = FILTER_WIDTH;
  cl_int paddingPixels = 2*HALF_FILTER_WIDTH;

  size_t local_size[] = { WGX, WGY };
  size_t global_size[] = { ((xsize-paddingPixels + local_size[0] - 1)/local_size[0])* local_size[0],  ((ysize-paddingPixels + local_size[1] - 1)/local_size[1])* local_size[1],  };
    
  cl_int localWidth = local_size[0] + paddingPixels;
  cl_int localHeight = local_size[1] + paddingPixels;
  size_t localMemSize = 4 * localWidth * localHeight * sizeof(float);

  CALL_CL_SAFE(clSetKernelArg(knl, 0, sizeof(buf_gray), &buf_gray));
  CALL_CL_SAFE(clSetKernelArg(knl, 1, sizeof(buf_congray), &buf_congray));
  CALL_CL_SAFE(clSetKernelArg(knl, 2, sizeof(buf_filter), &buf_filter));
  CALL_CL_SAFE(clSetKernelArg(knl, 3, sizeof(rows), &rows));
  CALL_CL_SAFE(clSetKernelArg(knl, 4, sizeof(cols), &cols));
  CALL_CL_SAFE(clSetKernelArg(knl, 5, sizeof(filterWidth), &filterWidth));
  CALL_CL_SAFE(clSetKernelArg(knl, 6, localMemSize, NULL));
  CALL_CL_SAFE(clSetKernelArg(knl, 7, sizeof(localHeight), &localHeight));
  CALL_CL_SAFE(clSetKernelArg(knl, 8, sizeof(localWidth), &localWidth));

  // --------------------------------------------------------------------------
  // print kernel info
  // --------------------------------------------------------------------------
  print_kernel_info(queue, knl);

  CALL_CL_SAFE(clFinish(queue));
  timestamp_type tic, toc;
  get_timestamp(&tic);
  for(int loop = 0; loop < num_loops; ++loop)
  {
      if (loop%2==0){
      CALL_CL_SAFE(clSetKernelArg(knl, 0, sizeof(buf_gray), &buf_gray));
      CALL_CL_SAFE(clSetKernelArg(knl, 1, sizeof(buf_congray), &buf_congray));
      }
      else{
          CALL_CL_SAFE(clSetKernelArg(knl, 0, sizeof(buf_congray), &buf_congray));
          CALL_CL_SAFE(clSetKernelArg(knl, 1, sizeof(buf_gray), &buf_gray));
      }
      
    CALL_CL_SAFE(clEnqueueNDRangeKernel(queue, knl, 2, NULL,
          global_size, local_size, 0, NULL, NULL));
  }
  CALL_CL_SAFE(clFinish(queue));
  get_timestamp(&toc);

  double elapsed = timestamp_diff_in_seconds(tic,toc)/num_loops;
  printf("%f s\n", elapsed);
  printf("%f MPixels/s\n", xsize*ysize/1e6/elapsed);
  printf("%f GBit/s\n", 4*2*xsize*ysize*sizeof(float)/1e9/elapsed);
  printf("%f GFlop/s\n",4*(xsize-HALF_FILTER_WIDTH)*(ysize-HALF_FILTER_WIDTH)
	 *FILTER_WIDTH*FILTER_WIDTH/1e9/elapsed);

  // --------------------------------------------------------------------------
  // transfer back & check
  // --------------------------------------------------------------------------
#ifdef NON_OPTIMIZED
  if (num_loops%2==0)
      CALL_CL_SAFE(clEnqueueReadBuffer(queue, buf_gray, /*blocking*/ CL_TRUE, /*offset*/ 0,deviceDataSize,congray_cl,0, NULL, NULL));
  else
    CALL_CL_SAFE(clEnqueueReadBuffer(queue, buf_congray, /*blocking*/ CL_TRUE, /*offset*/ 0,deviceDataSize, congray_cl, 0, NULL, NULL));
#else
  buffer_origin[0] = 3*sizeof(float);
  buffer_origin[1] = 3;
  buffer_origin[2] = 0;

  host_origin[0] = 3*sizeof(float);
  host_origin[1] = 3;
  host_origin[2] = 0;

  region[0] = (xsize-paddingPixels)*sizeof(float);
  region[1] = (ysize-paddingPixels);
  region[2] = 1;

  if (num_loops%2==0)
      clEnqueueReadBufferRect(queue, buf_gray, CL_TRUE,buffer_origin, host_origin, region,deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0,congray_cl, 0, NULL, NULL);
  else
      clEnqueueReadBufferRect(queue, buf_congray, CL_TRUE,buffer_origin, host_origin, region,deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0,congray_cl, 0, NULL, NULL);
#endif

  // --------------------------------------------------------------------------
  // output OpenCL filtered image
  // --------------------------------------------------------------------------
  printf("Writing OpenCL filtered image\n");
  for(int n = 0; n < xsize*ysize; ++n) {
    //r[n] = (int)(congray_cl[4*n] * rgb_max);
    //g[n] = (int)(congray_cl[4*n+1] * rgb_max);
    //b[n] = (int)(congray_cl[4*n+2] * rgb_max);
    r[n] = (int)(congray_cl[4*n]);
    g[n] = (int)(congray_cl[4*n+1]);
    b[n] = (int)(congray_cl[4*n+2]);
  }
  error = ppma_write("output_cl.ppm", xsize, ysize, r, g, b);
  if(error) { fprintf(stderr, "error writing image"); abort(); }

  // --------------------------------------------------------------------------
  // clean up
  // --------------------------------------------------------------------------
  CALL_CL_SAFE(clReleaseMemObject(buf_congray));
  CALL_CL_SAFE(clReleaseMemObject(buf_gray));
  CALL_CL_SAFE(clReleaseMemObject(buf_filter));
  CALL_CL_SAFE(clReleaseKernel(knl));
  CALL_CL_SAFE(clReleaseCommandQueue(queue));
  CALL_CL_SAFE(clReleaseContext(ctx));
  free(gray);
  free(congray);
  free(congray_cl);
  free(r);
  free(b);
  free(g);
}
Ejemplo n.º 19
0
int 
exec_trig_kernel(const char *program_source, 
                 int n, void *srcA, void *dst) 
{ 
  cl_context  context; 
  cl_command_queue cmd_queue; 
  cl_device_id  *devices; 
  cl_program  program; 
  cl_kernel  kernel; 
  cl_mem       memobjs[2]; 
  size_t       global_work_size[1]; 
  size_t       local_work_size[1]; 
  size_t       cb; 
  cl_int       err; 

  float c = 7.3f; // a scalar number to test non-pointer args
 
  // create the OpenCL context on a GPU device 
  context = poclu_create_any_context();
  if (context == (cl_context)0) 
    return -1; 
 
  // get the list of GPU devices associated with context 
  clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb); 
  devices = (cl_device_id *) malloc(cb); 
  clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, devices, NULL); 
 
  // create a command-queue 
  cmd_queue = clCreateCommandQueue(context, devices[0], 0, NULL); 
  if (cmd_queue == (cl_command_queue)0) 
    { 
      clReleaseContext(context); 
      free(devices); 
      return -1; 
    } 
  free(devices); 
 
  // allocate the buffer memory objects 
  memobjs[0] = clCreateBuffer(context, 
                              CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
                              sizeof(cl_float4) * n, srcA, NULL); 
  if (memobjs[0] == (cl_mem)0) 
    { 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  memobjs[1] = clCreateBuffer(context, 
			      CL_MEM_READ_WRITE, 
			      sizeof(cl_float4) * n, NULL, NULL); 
  if (memobjs[1] == (cl_mem)0) 
    { 
      delete_memobjs(memobjs, 1); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // create the program 
  program = clCreateProgramWithSource(context, 
				      1, (const char**)&program_source, NULL, NULL); 
  if (program == (cl_program)0) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // build the program 
  err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); 
  if (err != CL_SUCCESS) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // create the kernel 
  kernel = clCreateKernel(program, "trig", NULL); 
  if (kernel == (cl_kernel)0) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // set the args values 
  err = clSetKernelArg(kernel,  0,  
		       sizeof(cl_mem), (void *) &memobjs[0]); 
  err |= clSetKernelArg(kernel, 1,
			sizeof(cl_mem), (void *) &memobjs[1]); 
  err |= clSetKernelArg(kernel, 2,
			sizeof(float), (void *) &c); 
 
  if (err != CL_SUCCESS) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseKernel(kernel); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // set work-item dimensions 
  global_work_size[0] = n; 
  local_work_size[0]= 2; 
 
  // execute kernel 
  err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, 
			       global_work_size, local_work_size,  
			       0, NULL, NULL); 
  if (err != CL_SUCCESS) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseKernel(kernel); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // read output image 
  err = clEnqueueReadBuffer(cmd_queue, memobjs[1], CL_TRUE, 
			    0, n * sizeof(cl_float4), dst, 
			    0, NULL, NULL); 
  if (err != CL_SUCCESS) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseKernel(kernel); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // release kernel, program, and memory objects 
  delete_memobjs(memobjs, 2); 
  clReleaseKernel(kernel); 
  clReleaseProgram(program); 
  clReleaseCommandQueue(cmd_queue); 
  clReleaseContext(context); 
  return 0; // success... 
}
Ejemplo n.º 20
0
int main()
{
	srand(unsigned(time(nullptr)));
	int err;                            // error code returned from api calls

	cl_device_id device_id;             // compute device id 
	cl_context context;                 // compute context
	cl_command_queue commands;          // compute command queue
	cl_program program;                 // compute program
	cl_kernel kernel;                   // compute kernel

										// OpenCL device memory for matrices
	cl_mem d_A;
	cl_mem d_B;
	cl_mem d_C;

	// set seed for rand()
	srand(2014);

	//Allocate host memory for matrices A and B
	unsigned int size_A = WA * HA;
	unsigned int mem_size_A = sizeof(float) * size_A;
	float* h_A = (float*)malloc(mem_size_A);

	unsigned int size_B = WB * HB;
	unsigned int mem_size_B = sizeof(float) * size_B;
	float* h_B = (float*)malloc(mem_size_B);

	//Initialize host memory
	randomMemInit(h_A, size_A);
	randomMemInit(h_B, size_B);

	//Allocate host memory for the result C
	unsigned int size_C = WC * HC;
	unsigned int mem_size_C = sizeof(float) * size_C;
	float* h_C = (float*)malloc(mem_size_C);

	printf("Initializing OpenCL device...\n");

	cl_uint dev_cnt = 0;
	clGetPlatformIDs(0, 0, &dev_cnt);

	cl_platform_id platform_ids[100];
	clGetPlatformIDs(dev_cnt, platform_ids, NULL);

	// Connect to a compute device
	int gpu = 1;
	err = clGetDeviceIDs(platform_ids[0], gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
	if (err != CL_SUCCESS){
		printf("Error: Failed to create a device group!\n");
		return EXIT_FAILURE;
	}

	// Create a compute context 
	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	if (!context){
		printf("Error: Failed to create a compute context!\n");
		return EXIT_FAILURE;
	}

	// Create a command commands
	commands = clCreateCommandQueue(context, device_id, 0, &err);
	if (!commands){
		printf("Error: Failed to create a command commands!\n");
		return EXIT_FAILURE;
	}

	// Create the compute program from the source file
	char *KernelSource;
	long lFileSize = LoadOpenCLKernel("matrixmul_kernel.cl", &KernelSource);
	if (lFileSize < 0L){
		perror("File read failed");
		return 1;
	}
	//const char* KernelSource = loadKernelCPP(".\\matrixmul_kernel.cl");

	program = clCreateProgramWithSource(context, 1, (const char **)&KernelSource, NULL, &err);
	if (!program){
		printf("Error: Failed to create compute program!\n");
		return EXIT_FAILURE;
	}

	// Build the program executable
	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (err != CL_SUCCESS){
		size_t len;
		char buffer[2048];
		printf("Error: Failed to build program executable!\n");
		clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
		printf("%s\n", buffer);
		exit(1);
	}

	// Create the compute kernel in the program we wish to run
	kernel = clCreateKernel(program, "matrixMul", &err);
	if (!kernel || err != CL_SUCCESS){
		printf("Error: Failed to create compute kernel!\n");
		exit(1);
	}

	// Create the input and output arrays in device memory for our calculation
	d_C = clCreateBuffer(context, CL_MEM_READ_WRITE, mem_size_A, NULL, &err);
	d_A = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_A, h_A, &err);
	d_B = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_B, h_B, &err);

	if (!d_A || !d_B || !d_C){
		printf("Error: Failed to allocate device memory!\n");
		exit(1);
	}

	printf("Running matrix multiplication for matrices A (%dx%d) and B (%dx%d) ...\n", WA, HA, WB, HB);

	//Launch OpenCL kernel
	size_t localWorkSize[2], globalWorkSize[2];

	int wA = WA;
	int wC = WC;
	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&d_C);
	err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d_A);
	err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&d_B);
	err |= clSetKernelArg(kernel, 3, sizeof(int), (void *)&wA);
	err |= clSetKernelArg(kernel, 4, sizeof(int), (void *)&wC);

	if (err != CL_SUCCESS){
		printf("Error: Failed to set kernel arguments! %d\n", err);
		exit(1);
	}

	localWorkSize[0] = 16;
	localWorkSize[1] = 16;
	globalWorkSize[0] = 1024;
	globalWorkSize[1] = 1024;

	err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);

	if (err != CL_SUCCESS){
		printf("Error: Failed to execute kernel! %d\n", err);
		exit(1);
	}

	//Retrieve result from device
	err = clEnqueueReadBuffer(commands, d_C, CL_TRUE, 0, mem_size_C, h_C, 0, NULL, NULL);

	if (err != CL_SUCCESS){
		printf("Error: Failed to read output array! %d\n", err);
		exit(1);
	}
	//print table A

	printf("\nMatrix A\n");
	for (int i = 0; i < size_A; i++){
		printf("%f\t", h_A[i]);
		if (((i + 1) % WA) == 0)
			printf("\n");
	}

	//print table B

	printf("\nMatrix B\n");
	for (int i = 0; i < size_B; i++){
		printf("%f\t", h_B[i]);
		if (((i + 1) % WB) == 0)
			printf("\n");
	}

	//print out the results

	printf("\nMatrix C (Results)\n");
	for (int i = 0; i < size_C; i++){
		printf("%f\t", h_C[i]);
		if (((i + 1) % WC) == 0)
			printf("\n");
	}
	printf("\n");


	printf("Matrix multiplication completed...\n");

	//Shutdown and cleanup
	free(h_A);
	free(h_B);
	free(h_C);

	clReleaseMemObject(d_A);
	clReleaseMemObject(d_C);
	clReleaseMemObject(d_B);

	clReleaseProgram(program);
	clReleaseKernel(kernel);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);

	std::cin.clear();
	std::cin.sync();
	std::cin.get();
}
Ejemplo n.º 21
0
int main( int argc, char* argv[] )
{
    // Length of vectors
    unsigned int n = 100000;
 
    // Host input vectors
    double *h_a;
    double *h_b;
    // Host output vector
    double *h_c;
 
    // Device input buffers
    cl_mem d_a;
    cl_mem d_b;
    // Device output buffer
    cl_mem d_c;
 
    cl_platform_id cpPlatform;        // OpenCL platform
    cl_device_id device_id;           // device ID
    cl_context context;               // context
    cl_command_queue queue;           // command queue
    cl_program program;               // program
    cl_kernel kernel;                 // kernel
 
    // Size, in bytes, of each vector
    size_t bytes = n*sizeof(double);
 
    // Allocate memory for each vector on host
    h_a = (double*)malloc(bytes);
    h_b = (double*)malloc(bytes);
    h_c = (double*)malloc(bytes);
 
    // Initialize vectors on host
    int i;
    for( i = 0; i < n; i++ )
    {
        h_a[i] = sinf(i)*sinf(i);
        h_b[i] = cosf(i)*cosf(i);
    }
 
    size_t globalSize, localSize;
    cl_int err;
 
    // Number of work items in each local work group
    localSize = 64;
 
    // Number of total work items - localSize must be devisor
    globalSize = ceil(n/(float)localSize)*localSize;
 
    // Bind to platform
    err = clGetPlatformIDs(1, &cpPlatform, NULL);
 
    // Get ID for the device
    err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
 
    // Create a context 
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
 
    // Create a command queue
    queue = clCreateCommandQueue(context, device_id, 0, &err);
 
    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1,
                            (const char **) & kernelSource, NULL, &err);
 
    // Build the program executable
    clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
 
    // Create the compute kernel in the program we wish to run
    kernel = clCreateKernel(program, "vecAdd", &err);
 
    // Create the input and output arrays in device memory for our calculation
    d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
    d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
    d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
 
    // Write our data set into the input array in device memory
    err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,
                                   bytes, h_a, 0, NULL, NULL);
    err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,
                                   bytes, h_b, 0, NULL, NULL);
 
    // Set the arguments to our compute kernel
    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
    err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
 
    // Execute the kernel over the entire range of the data set 
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
                                                              0, NULL, NULL);
 
    // Wait for the command queue to get serviced before reading back results
    clFinish(queue);
 
    // Read the results from the device
    clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
                                bytes, h_c, 0, NULL, NULL );
 
    //Sum up vector c and print result divided by n, this should equal 1 within error
    double sum = 0;
    for(i=0; i<n; i++)
        sum += h_c[i];
    printf("final result: %f\n", sum/n);
 
    // release OpenCL resources
    clReleaseMemObject(d_a);
    clReleaseMemObject(d_b);
    clReleaseMemObject(d_c);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
 
    //release host memory
    free(h_a);
    free(h_b);
    free(h_c);
 
    return 0;
}
Ejemplo n.º 22
0
void	release_cl(t_cl *cl)
{
	clReleaseCommandQueue(cl->cmd_queue);
	clReleaseContext(cl->context);
}
Ejemplo n.º 23
0
ErrorStatus gemm_clblas(cl_device_id device, const void *inMatrixA, int nrowA, int ncolA, bool transposeA,
                        const void *inMatrixB, int nrowB, int ncolB, bool transposeB,
                        double alpha, double beta, void *outMatrix, bool use_float)
{
    std::stringstream result;
    
    float *input_matrixA_f = (float *)inMatrixA;
    float *input_matrixB_f = (float *)inMatrixB;
    
    float *output_matrix_f = (float *)outMatrix;
    
    double *input_matrixA_d = (double *)inMatrixA;
    double *input_matrixB_d = (double *)inMatrixB;
    
    double *output_matrix_d = (double *)outMatrix;
    
    if (debug) {
        result << "gemm_clblas( " << (use_float ? "FLOAT" : "DOUBLE") <<
        ")" << std::endl << std::endl;
    }
    
    cl_int err = CL_SUCCESS;
    
    clblasStatus status = clblasSetup();
    if (status != CL_SUCCESS) {
        if (debug) {
            result << "clblasSetup: " << clblasErrorToString(status) << std::endl;
        }
        
        err = CL_INVALID_OPERATION;
    }
    
    // get first platform
    cl_platform_id platform = NULL;
    if (err == CL_SUCCESS) {
        err = clGetPlatformIDs(1, &platform, NULL);
    }
    
    if (debug && err == CL_SUCCESS) {
        result << "Platform: " << getPlatformInfoString(platform, CL_PLATFORM_NAME) << std::endl;
        result << "Device: " << getDeviceInfoString(device, CL_DEVICE_NAME) << std::endl;
    }
    
    // context
    cl_context context = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateContext:" << std::endl;
        }
        
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    }
    
    // queue
    cl_command_queue queue = NULL;
    if (err == CL_SUCCESS) {
#ifdef CL_VERSION_2_0
        if (debug) {
            result << "clCreateCommandQueueWithProperties:" << std::endl;
        }
        
        queue = clCreateCommandQueueWithProperties(context, device, NULL, &err);
        
#else
        if (debug) {
            result << "clCreateCommandQueue:" << std::endl;
        }
        
        queue = clCreateCommandQueue(context, device, 0, &err);
#endif
    }
    
    // buffers
    cl_mem cl_input_matrixA = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_input_matrixA:" << std::endl;
        }
        
        if (use_float) {
            cl_input_matrixA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                              nrowA * ncolA * sizeof(float), input_matrixA_f, &err);
            
        } else {
            cl_input_matrixA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                              nrowA * ncolA * sizeof(double), input_matrixA_d, &err);
        }
    }
    
    cl_mem cl_input_matrixB = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_input_matrixB:" << std::endl;
        }
        
        if (use_float) {
            cl_input_matrixB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                              nrowB * ncolB * sizeof(float), input_matrixB_f, &err);
            
        } else {
            cl_input_matrixB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                              nrowB * ncolB * sizeof(double), input_matrixB_d, &err);
        }
    }
    
    int nrowC = transposeA ? ncolA : nrowA;
    int ncolC = transposeB ? nrowB : ncolB;
    cl_mem cl_output_matrix = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_output_vector:" << std::endl;
        }
        
        if (use_float) {
            cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                              nrowC * ncolC * sizeof(float), output_matrix_f, &err);
            
        } else {
            cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                              nrowC * ncolC * sizeof(double), output_matrix_d, &err);
        }
        
    }
    
    // ++++++++++++
    const int lda = nrowA;  // first dimension of A (rows), before any transpose
    const int ldb = nrowB;  // first dimension of B (rows), before any transpose
    const int ldc = nrowC;      // first dimension of C (rows)
    
    const int M = transposeA ? ncolA : nrowA;    // rows in A (after transpose, if any) and C
    const int N = transposeB ? nrowB : ncolB;    // cols in B (after transpose, if any) and C
    const int K = transposeA ? nrowA : ncolA;    // cols in A and rows in B (after transposes, if any)
    
    const clblasOrder order = clblasColumnMajor;
    const clblasTranspose transA = transposeA ? clblasTrans : clblasNoTrans;
    const clblasTranspose transB = transposeB ? clblasTrans : clblasNoTrans;
    
    cl_event event = NULL;
    
    if (err == CL_SUCCESS) {
        if (use_float) {
            if (debug) {
                result << "clblasSgemm:" << std::endl;
            }
            
            status = clblasSgemm(order, transA, transB, M, N, K,
                              alpha, cl_input_matrixA, 0, lda,
                              cl_input_matrixB, 0, ldb, beta,
                              cl_output_matrix, 0, ldc,
                              1, &queue, 0, NULL, &event);
            
            if (status != CL_SUCCESS && debug) {
                result << "clblasSgemm error:" << clblasErrorToString(status) << std::endl;
            }
            
        } else {
            if (debug) {
                result << "clblasDgemm:" << std::endl;
            }
            
            status = clblasDgemm(order, transA, transB, M, N, K,
                                 alpha, cl_input_matrixA, 0, lda,
                                 cl_input_matrixB, 0, ldb, beta,
                                 cl_output_matrix, 0, ldc,
                                 1, &queue, 0, NULL, &event);
            
            if (status != CL_SUCCESS) {
                if (debug) {
                    result << "clblasDgemm error:" << clblasErrorToString(status) << std::endl;
                }
                
                err = status;
            }
        }
    }
    
    if (err == CL_SUCCESS) {
        /* Wait for calculations to be finished. */
        if (debug) {
            result << "clWaitForEvents:" << std::endl;
        }
        err = clWaitForEvents(1, &event);
    }
    
    // retrieve result
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "Retrieve result:" << std::endl;
        }
        
        if (use_float) {
            clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, nrowC * ncolC * sizeof(float), output_matrix_f, 0, NULL, NULL);
            
        } else {
            clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, nrowC * ncolC * sizeof(double), output_matrix_d, 0, NULL, NULL);
        }
    }
    
    std::string err_str = clErrorToString(err);
    result << std::endl << err_str << std::endl;
    
    // cleanup
    clReleaseMemObject(cl_output_matrix);
    cl_output_matrix = NULL;
    
    clReleaseMemObject(cl_input_matrixA);
    cl_input_matrixA = NULL;
    
    clReleaseMemObject(cl_input_matrixB);
    cl_input_matrixB = NULL;
    
    clReleaseCommandQueue(queue);
    queue = NULL;
    
    clReleaseContext(context);
    context = NULL;
    
    if (debug) {
        CERR << result.str();
    }
    
    ErrorStatus errorStatus = { err, status };
    
    return errorStatus;
}
Ejemplo n.º 24
0
int main(int argc, char** argv)
{
    int err;                            // error code returned from api calls
    
    float data[DATA_SIZE];              // original data set given to device
    float results[DATA_SIZE];           // results returned from device
    unsigned int correct;               // number of correct results returned
    
    size_t global;                      // global domain size for our calculation
    size_t local;                       // local domain size for our calculation
    
    cl_device_id device_id;             // compute device id
    cl_context context;                 // compute context
    cl_command_queue commands;          // compute command queue
    cl_program program;                 // compute program
    cl_kernel kernel;                   // compute kernel
    
    cl_mem input;                       // device memory used for the input array
    cl_mem output;                      // device memory used for the output array
    
    // Fill our data set with random float values
    //
    int i = 0;
    unsigned int count = DATA_SIZE;
    for(i = 0; i < count; i++)
        data[i] = rand() / (float)RAND_MAX;
    
    // Connect to a compute device
    //
    int gpu = 1;
    err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to create a device group!\n");
        return EXIT_FAILURE;
    }
    
    // Create a compute context
    //
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        return EXIT_FAILURE;
    }
    
    // Create a command commands
    //
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        return EXIT_FAILURE;
    }
    
    // Create the compute program from the source buffer
    //
    program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
    if (!program)
    {
        printf("Error: Failed to create compute program!\n");
        return EXIT_FAILURE;
    }
    
    // Build the program executable
    //
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];
        
        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        exit(1);
    }
    
    // Create the compute kernel in the program we wish to run
    //
    kernel = clCreateKernel(program, "square", &err);
    if (!kernel || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        exit(1);
    }
    
    // Create the input and output arrays in device memory for our calculation
    //
    input = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * count, NULL, NULL);
    output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
    if (!input || !output)
    {
        printf("Error: Failed to allocate device memory!\n");
        exit(1);
    }
    
    // Write our data set into the input array in device memory
    //
    err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write to source array!\n");
        exit(1);
    }
    
    // Set the arguments to our compute kernel
    //
    err = 0;
    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel arguments! %d\n", err);
        exit(1);
    }
    
    // Get the maximum work group size for executing the kernel on the device
    //
    err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve kernel work group info! %d\n", err);
        exit(1);
    }
    
    // Execute the kernel over the entire range of our 1d input data set
    // using the maximum number of work group items for this device
    //
    global = count;
    err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
    if (err)
    {
        printf("Error: Failed to execute kernel!\n");
        return EXIT_FAILURE;
    }
    
    // Wait for the command commands to get serviced before reading back results
    //
    clFinish(commands);
    
    // Read back the results from the device to verify the output
    //
    err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to read output array! %d\n", err);
        exit(1);
    }
    
    // Validate our results
    //
    correct = 0;
    for(i = 0; i < count; i++)
    {
        if(results[i] == data[i] * data[i])
            correct++;
    }
    
    // Print a brief summary detailing the results
    //
    printf("Computed '%d/%d' correct values!\n", correct, count);
    
    // Shutdown and cleanup
    //
    clReleaseMemObject(input);
    clReleaseMemObject(output);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);
    
    return 0;
}
Ejemplo n.º 25
0
int main(void) {
    const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // Generate the input array on the host.
    float h_a[ARRAY_SIZE];
    float h_b[ARRAY_SIZE];
    for (int i = 0; i < ARRAY_SIZE; i++) {
        h_a[i] = (float)i;
        h_b[i] = (float)(2 * i);
    }

    float h_c[ARRAY_SIZE];

    FILE *fp;
    char *source_str;
    size_t source_size;

    fp = fopen("vectors_cl.cl", "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char *)malloc(MAX_SOURCE_SIZE);
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);

    // Get platform and device information
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1,
                         &device_id, &ret_num_devices);

    // Create an OpenCL context
    cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);

    // Create a command queue
    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

    // Create memory buffers on the device for each vector
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                      ARRAY_BYTES, NULL, &ret);
    cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                      ARRAY_BYTES, NULL, &ret);
    cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                      ARRAY_BYTES, NULL, &ret);

    // Copy h_a and h_b to memory buffer
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
                               ARRAY_BYTES, h_a, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
                               ARRAY_BYTES, h_b, 0, NULL, NULL);

    // Create a program from the kernel source
    cl_program program = clCreateProgramWithSource(context, 1,
        (const char **)&source_str, (const size_t *)&source_size, &ret);
    if (ret != 0) {
        printf("clCreateProgramWithSource returned non-zero status %d\n\n", ret);
        exit(1);
    }

    // Build the program
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    if (ret != 0) {
        printf("clBuildProgram returned non-zero status %d: ", ret);

        if (ret == CL_INVALID_PROGRAM) {
            printf("invalid program\n");
        } else if (ret == CL_INVALID_VALUE) {
            printf("invalid value\n");
        } else if (ret == CL_INVALID_DEVICE) {
            printf("invalid device\n");
        } else if (ret == CL_INVALID_BINARY) {
            printf("invalid binary\n");
        } else if (ret == CL_INVALID_BUILD_OPTIONS) {
            printf("invalid build options\n");
        } else if (ret == CL_INVALID_OPERATION) {
            printf("invalid operation\n");
        } else if (ret == CL_COMPILER_NOT_AVAILABLE) {
            printf("compiler not available\n");
        } else if (ret == CL_BUILD_PROGRAM_FAILURE) {
            printf("build program failure\n");

            // Determine the size of the log
            size_t log_size;
            clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

            // Allocate memory for the log
            char *log = (char *) malloc(log_size);

            // Get the log
            clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);

            // Print the log
            printf("%s\n", log);
        } else if (ret == CL_OUT_OF_HOST_MEMORY) {
            printf("out of host memory\n");
        }
        exit(1);
    }

    // Create the OpenCL kernel
    cl_kernel kernel = clCreateKernel(program, "add", &ret);

    // Set the arguments of the kernel
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
    size_t array_size = ARRAY_SIZE;
    ret = clSetKernelArg(kernel, 3, sizeof(const size_t), (void *)&array_size);

    // Execute the OpenCL kernel on the list
    size_t global_item_size = ARRAY_SIZE; // Process the entire lists
    size_t local_item_size = 1; // Divide work items into groups of 64
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
            &global_item_size, &local_item_size, 0, NULL, NULL);

    // Read the memory buffer C on the device to the local variable C
    ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
                              ARRAY_BYTES, h_c, 0, NULL, NULL);

    // Print out the resulting array.
    for (int i = 0; i < 8; i++) {
        printf("%d + %d = %d", (int)h_a[i], (int)h_b[i], (int)h_c[i]);
        printf(((i % 4) != 3) ? "\t" : "\n");
    }

    printf("...\n");

    for (int i = ARRAY_SIZE - 8; i < ARRAY_SIZE; i++) {
        printf("%d + %d = %d",
               (int)h_a[i], (int)h_b[i], (int)h_c[i]);
        printf(((i % 4) != 3) ? "\t" : "\n");
    }

    // Clean up
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(a_mem_obj);
    ret = clReleaseMemObject(b_mem_obj);
    ret = clReleaseMemObject(c_mem_obj);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);

    return 0;
}
Ejemplo n.º 26
0
int main(int argc, char** argv)
{
	ocd_init(&argc, &argv, NULL);
	ocd_initCL();

	cl_int err;

	size_t global_size;
	size_t local_size;

	cl_program program;
	cl_kernel kernel_compute_flux;
	cl_kernel kernel_compute_flux_contributions;
	cl_kernel kernel_compute_step_factor;
	cl_kernel kernel_time_step;
	cl_kernel kernel_initialize_variables;

	cl_mem ff_variable;
	cl_mem ff_fc_momentum_x;
	cl_mem ff_fc_momentum_y;
	cl_mem ff_fc_momentum_z;
	cl_mem ff_fc_density_energy;

	if (argc < 2)
	{
		printf("Usage ./cfd <data input file>\n");
		return 0;
	}


	const char* data_file_name = argv[1];


	// set far field conditions and load them into constant memory on the gpu
	{
		float h_ff_variable[NVAR];
		const float angle_of_attack = (float)(3.1415926535897931 / 180.0) * (float)(deg_angle_of_attack);

		h_ff_variable[VAR_DENSITY] = (float)(1.4);

		float ff_pressure = (float)(1.0);
		float ff_speed_of_sound = sqrt(GAMMA*ff_pressure / h_ff_variable[VAR_DENSITY]);
		float ff_speed = (float)(ff_mach)*ff_speed_of_sound;

		float3 ff_velocity;
		ff_velocity.x = ff_speed*(float)(cos((float)angle_of_attack));
		ff_velocity.y = ff_speed*(float)(sin((float)angle_of_attack));
		ff_velocity.z = 0.0;

		h_ff_variable[VAR_MOMENTUM+0] = h_ff_variable[VAR_DENSITY] * ff_velocity.x;
		h_ff_variable[VAR_MOMENTUM+1] = h_ff_variable[VAR_DENSITY] * ff_velocity.y;
		h_ff_variable[VAR_MOMENTUM+2] = h_ff_variable[VAR_DENSITY] * ff_velocity.z;

		h_ff_variable[VAR_DENSITY_ENERGY] = h_ff_variable[VAR_DENSITY]*((float)(0.5)*(ff_speed*ff_speed)) + (ff_pressure / (float)(GAMMA-1.0));

		float3 h_ff_momentum;
		h_ff_momentum.x = *(h_ff_variable+VAR_MOMENTUM+0);
		h_ff_momentum.y = *(h_ff_variable+VAR_MOMENTUM+1);
		h_ff_momentum.z = *(h_ff_variable+VAR_MOMENTUM+2);
		float3 h_ff_fc_momentum_x;
		float3 h_ff_fc_momentum_y;
		float3 h_ff_fc_momentum_z;
		float3 h_ff_fc_density_energy;
		compute_flux_contribution(&h_ff_variable[VAR_DENSITY], &h_ff_momentum,
				&h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure, &ff_velocity,
				&h_ff_fc_momentum_x, &h_ff_fc_momentum_y, &h_ff_fc_momentum_z,
				&h_ff_fc_density_energy);

		// copy far field conditions to the gpu
		ff_variable = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * NVAR, h_ff_variable, &err);
		CHKERR(err, "Unable to allocate ff data");
		ff_fc_momentum_x = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_x, &err);
		CHKERR(err, "Unable to allocate ff data");
		ff_fc_momentum_y = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_y, &err);
		CHKERR(err, "Unable to allocate ff data");
		ff_fc_momentum_z = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_z, &err);
		CHKERR(err, "Unable to allocate ff data");
		ff_fc_density_energy = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_density_energy, &err);
		CHKERR(err, "Unable to allocate ff data");
	}
	int nel;
	int nelr;

	// read in domain geometry
	cl_mem areas;
	cl_mem elements_surrounding_elements;
	cl_mem normals;
	{
		std::ifstream file(data_file_name);

		file >> nel;

		nelr = block_length*((nel / block_length )+ std::min(1, nel % block_length));

		float* h_areas = new float[nelr];
		int* h_elements_surrounding_elements = new int[nelr*NNB];
		float* h_normals = new float[nelr*NDIM*NNB];


		// read in data
		for(int i = 0; i < nel; i++)
		{
			file >> h_areas[i];
			for(int j = 0; j < NNB; j++)
			{
				file >> h_elements_surrounding_elements[i + j*nelr];
				if(h_elements_surrounding_elements[i+j*nelr] < 0) h_elements_surrounding_elements[i+j*nelr] = -1;
				h_elements_surrounding_elements[i + j*nelr]--; //it's coming in with Fortran numbering

				for(int k = 0; k < NDIM; k++)
				{
					file >> h_normals[i + (j + k*NNB)*nelr];
					h_normals[i + (j + k*NNB)*nelr] = -h_normals[i + (j + k*NNB)*nelr];
				}
			}
		}

		// fill in remaining data
		int last = nel-1;
		for(int i = nel; i < nelr; i++)
		{
			h_areas[i] = h_areas[last];
			for(int j = 0; j < NNB; j++)
			{
				// duplicate the last element
				h_elements_surrounding_elements[i + j*nelr] = h_elements_surrounding_elements[last + j*nelr];
				for(int k = 0; k < NDIM; k++) h_normals[last + (j + k*NNB)*nelr] = h_normals[last + (j + k*NNB)*nelr];
			}
		}

		areas = alloc<float>(context, nelr);
		upload<float>(commands, areas, h_areas, nelr);

		elements_surrounding_elements = alloc<int>(context, nelr*NNB);
		upload<int>(commands, elements_surrounding_elements, h_elements_surrounding_elements, nelr*NNB);

		normals = alloc<float>(context, nelr*NDIM*NNB);
		upload<float>(commands, normals, h_normals, nelr*NDIM*NNB);

		delete[] h_areas;
		delete[] h_elements_surrounding_elements;
		delete[] h_normals;
	}

	// Get program source.
	long kernelSize = getKernelSize();
	char* kernelSource = new char[kernelSize];
	getKernelSource(kernelSource, kernelSize);

	// Create the compute program from the source buffer
	program = clCreateProgramWithSource(context, 1, (const char **) &kernelSource, NULL, &err);
	CHKERR(err, "Failed to create a compute program!");

	// Build the program executable
	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (err == CL_BUILD_PROGRAM_FAILURE)
	{
		char *log;
		size_t logLen;
		err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLen);
		log = (char *) malloc(sizeof(char)*logLen);
		err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, logLen, (void *) log, NULL);
		fprintf(stderr, "CL Error %d: Failed to build program! Log:\n%s", err, log);
		free(log);
		exit(1);
	}
	CHKERR(err, "Failed to build program!");
	delete[] kernelSource;

	// Create the compute kernel in the program we wish to run
	kernel_compute_flux = clCreateKernel(program, "compute_flux", &err);
	CHKERR(err, "Failed to create a compute kernel!");

	// Create the reduce kernel in the program we wish to run
	kernel_compute_flux_contributions = clCreateKernel(program, "compute_flux_contributions", &err);
	CHKERR(err, "Failed to create a compute_flux_contributions kernel!");
	// Create the reduce kernel in the program we wish to run
	kernel_compute_step_factor = clCreateKernel(program, "compute_step_factor", &err);
	CHKERR(err, "Failed to create a compute_step_factor kernel!");
	// Create the reduce kernel in the program we wish to run
	kernel_time_step = clCreateKernel(program, "time_step", &err);
	CHKERR(err, "Failed to create a time_step kernel!");
	// Create the reduce kernel in the program we wish to run
	kernel_initialize_variables = clCreateKernel(program, "initialize_variables", &err);
	CHKERR(err, "Failed to create a initialize_variables kernel!");

	// Create arrays and set initial conditions
	cl_mem variables = alloc<cl_float>(context, nelr*NVAR);

	err = 0;
	err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr);
	err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&variables);
	err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable);
	CHKERR(err, "Failed to set kernel arguments!");
	// Get the maximum work group size for executing the kernel on the device
	//err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
	CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!");
	local_size = 1;//std::min(local_size, (size_t)nelr);
	global_size = nelr;
	err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
	err = clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 0");


	cl_mem old_variables = alloc<float>(context, nelr*NVAR);
	cl_mem fluxes = alloc<float>(context, nelr*NVAR);
	cl_mem step_factors = alloc<float>(context, nelr);
	clFinish(commands);
	cl_mem fc_momentum_x = alloc<float>(context, nelr*NDIM);
	cl_mem fc_momentum_y = alloc<float>(context, nelr*NDIM);
	cl_mem fc_momentum_z = alloc<float>(context, nelr*NDIM);
	cl_mem fc_density_energy = alloc<float>(context, nelr*NDIM);
	clFinish(commands);

	// make sure all memory is floatly allocated before we start timing
	err = 0;
	err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr);
	err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&old_variables);
	err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable);
	CHKERR(err, "Failed to set kernel arguments!");
	// Get the maximum work group size for executing the kernel on the device
	err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
	CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!");
	err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 1");
	err = 0;
	err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr);
	err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&fluxes);
	err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable);
	CHKERR(err, "Failed to set kernel arguments!");
	// Get the maximum work group size for executing the kernel on the device
	err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
	CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!");

	err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 2");
	std::cout << "About to memcopy" << std::endl;
	err = clReleaseMemObject(step_factors);
	float temp[nelr];
	for(int i = 0; i < nelr; i++)
		temp[i] = 0;
	step_factors = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * nelr, temp, &err);
	CHKERR(err, "Unable to memset step_factors");
	// make sure CUDA isn't still doing something before we start timing

	clFinish(commands);

	// these need to be computed the first time in order to compute time step
	std::cout << "Starting..." << std::endl;


	// Begin iterations
	for(int i = 0; i < iterations; i++)
	{
		copy<float>(commands, old_variables, variables, nelr*NVAR);

		// for the first iteration we compute the time step
		err = 0;
		err = clSetKernelArg(kernel_compute_step_factor, 0, sizeof(int), &nelr);
		err |= clSetKernelArg(kernel_compute_step_factor, 1, sizeof(cl_mem),&variables);
		err |= clSetKernelArg(kernel_compute_step_factor, 2, sizeof(cl_mem), &areas);
		err |= clSetKernelArg(kernel_compute_step_factor, 3, sizeof(cl_mem), &step_factors);
		CHKERR(err, "Failed to set kernel arguments!");
		// Get the maximum work group size for executing the kernel on the device
		err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
		CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!");
		err = clEnqueueNDRangeKernel(commands, kernel_compute_step_factor, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
		clFinish(commands);
		START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Step Factor Kernel", ocdTempTimer)
		END_TIMER(ocdTempTimer)
		CHKERR(err, "Failed to execute kernel[kernel_compute_step_factor]!");
		for(int j = 0; j < RK; j++)
		{
			err = 0;
			err = clSetKernelArg(kernel_compute_flux_contributions, 0, sizeof(int), &nelr);
			err |= clSetKernelArg(kernel_compute_flux_contributions, 1, sizeof(cl_mem),&variables);
			err |= clSetKernelArg(kernel_compute_flux_contributions, 2, sizeof(cl_mem), &fc_momentum_x);
			err |= clSetKernelArg(kernel_compute_flux_contributions, 3, sizeof(cl_mem), &fc_momentum_y);
			err |= clSetKernelArg(kernel_compute_flux_contributions, 4, sizeof(cl_mem), &fc_momentum_z);
			err |= clSetKernelArg(kernel_compute_flux_contributions, 5, sizeof(cl_mem), &fc_density_energy);
			CHKERR(err, "Failed to set kernel arguments!");
			// Get the maximum work group size for executing the kernel on the device
			err = clGetKernelWorkGroupInfo(kernel_compute_flux_contributions, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
			CHKERR(err, "Failed to retrieve kernel_compute_flux_contributions work group info!");
			err = clEnqueueNDRangeKernel(commands, kernel_compute_flux_contributions, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
			clFinish(commands);
			START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Contribution Kernel", ocdTempTimer)
			//compute_flux_contributions(nelr, variables, fc_momentum_x, fc_momentum_y, fc_momentum_z, fc_density_energy);
			END_TIMER(ocdTempTimer)
			CHKERR(err, "Failed to execute kernel [kernel_compute_flux_contributions]!");
			err = 0;
			err = clSetKernelArg(kernel_compute_flux, 0, sizeof(int), &nelr);
			err |= clSetKernelArg(kernel_compute_flux, 1, sizeof(cl_mem), &elements_surrounding_elements);
			err |= clSetKernelArg(kernel_compute_flux, 2, sizeof(cl_mem), &normals);
			err |= clSetKernelArg(kernel_compute_flux, 3, sizeof(cl_mem), &variables);
			err |= clSetKernelArg(kernel_compute_flux, 4, sizeof(cl_mem), &fc_momentum_x);
			err |= clSetKernelArg(kernel_compute_flux, 5, sizeof(cl_mem), &fc_momentum_y);
			err |= clSetKernelArg(kernel_compute_flux, 6, sizeof(cl_mem), &fc_momentum_z);
			err |= clSetKernelArg(kernel_compute_flux, 7, sizeof(cl_mem), &fc_density_energy);
			err |= clSetKernelArg(kernel_compute_flux, 8, sizeof(cl_mem), &fluxes);
			err |= clSetKernelArg(kernel_compute_flux, 9, sizeof(cl_mem), &ff_variable);
			err |= clSetKernelArg(kernel_compute_flux, 10, sizeof(cl_mem), &ff_fc_momentum_x);
			err |= clSetKernelArg(kernel_compute_flux, 11, sizeof(cl_mem), &ff_fc_momentum_y);
			err |= clSetKernelArg(kernel_compute_flux, 12, sizeof(cl_mem), &ff_fc_momentum_z);
			err |= clSetKernelArg(kernel_compute_flux, 13, sizeof(cl_mem), &ff_fc_density_energy);
			CHKERR(err, "Failed to set kernel arguments!");
			// Get the maximum work group size for executing the kernel on the device
			err = clGetKernelWorkGroupInfo(kernel_compute_flux, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
			CHKERR(err, "Failed to retrieve kernel_compute_flux work group info!");
			err = clEnqueueNDRangeKernel(commands, kernel_compute_flux, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
			clFinish(commands);
			START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Kernel", ocdTempTimer)
			END_TIMER(ocdTempTimer)
			CHKERR(err, "Failed to execute kernel [kernel_compute_flux]!");
			err = 0;
			err = clSetKernelArg(kernel_time_step, 0, sizeof(int), &j);
			err |= clSetKernelArg(kernel_time_step, 1, sizeof(int), &nelr);
			err |= clSetKernelArg(kernel_time_step, 2, sizeof(cl_mem), &old_variables);
			err |= clSetKernelArg(kernel_time_step, 3, sizeof(cl_mem), &variables);
			err |= clSetKernelArg(kernel_time_step, 4, sizeof(cl_mem), &step_factors);
			err |= clSetKernelArg(kernel_time_step, 5, sizeof(cl_mem), &fluxes);
			CHKERR(err, "Failed to set kernel arguments!");
			// Get the maximum work group size for executing the kernel on the device
			err = clGetKernelWorkGroupInfo(kernel_time_step, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
			CHKERR(err, "Failed to retrieve kernel_time_step work group info!");
			err = clEnqueueNDRangeKernel(commands, kernel_time_step, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
			clFinish(commands);
			START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Time Step Kernel", ocdTempTimer)
			END_TIMER(ocdTempTimer)
			CHKERR(err, "Failed to execute kernel [kernel_time_step]!");
		}
	}

	clFinish(commands);
	std::cout << "Finished" << std::endl;
	std::cout << "Saving solution..." << std::endl;
	dump(commands, variables, nel, nelr);
	std::cout << "Saved solution..." << std::endl;
	std::cout << "Cleaning up..." << std::endl;

	clReleaseProgram(program);
	clReleaseKernel(kernel_compute_flux);
	clReleaseKernel(kernel_compute_flux_contributions);
	clReleaseKernel(kernel_compute_step_factor);
	clReleaseKernel(kernel_time_step);
	clReleaseKernel(kernel_initialize_variables);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);

	dealloc<float>(areas);
	dealloc<int>(elements_surrounding_elements);
	dealloc<float>(normals);

	dealloc<float>(variables);
	dealloc<float>(old_variables);
	dealloc<float>(fluxes);
	dealloc<float>(step_factors);
	dealloc<float>(fc_momentum_x);
	dealloc<float>(fc_momentum_y);
	dealloc<float>(fc_momentum_z);
	dealloc<float>(fc_density_energy);

	std::cout << "Done..." << std::endl;
	ocd_finalize();
	return 0;
}
Ejemplo n.º 27
0
int main()
{
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_context context = NULL;
    cl_command_queue command_queue = NULL;
    cl_mem memobj = NULL;
    cl_program program = NULL;
    cl_kernel kernel = NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret;

    float mem[MEM_SIZE];

    FILE *fp;
    char fileName[] = "./kernel.clbin";
    size_t binary_size;
    char *binary_buf;
    cl_int binary_status;
    cl_int i;

    /* カーネルを含むオブジェクトファイルをロード */
    fp = fopen(fileName, "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    binary_buf = (char *)malloc(MAX_BINARY_SIZE);
    binary_size = fread( binary_buf, 1, MAX_BINARY_SIZE, fp );
    fclose( fp );

    /* データを初期化 */
    for( i = 0; i < MEM_SIZE; i++ ) {
        mem[i] = i;
    }

    /* プラットフォーム・デバイスの情報の取得 */
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);

    /* OpenCLコンテキストの作成 */
    context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
  
    /* コマンドキューの作成 */
    command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
  
    /* メモリバッファの作成 */
    memobj = clCreateBuffer(context, CL_MEM_READ_WRITE, MEM_SIZE * sizeof(float), NULL, &ret);

    /* メモリバッファにデータを転送 */
    ret = clEnqueueWriteBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(float), mem, 0, NULL, NULL);

    /* 読み込んだバイナリからカーネルプログラムを作成 */
    program = clCreateProgramWithBinary(context, 1, &device_id, (const size_t *)&binary_size, 
                                        (const unsigned char **)&binary_buf, &binary_status, &ret);
    
    /* OpenCLカーネルの作成 */
    kernel = clCreateKernel(program, "vecAdd", &ret);
    printf("err:%d\n", ret);

    /* OpenCLカーネル引数の設定 */
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj);

    size_t global_work_size[3] = {MEM_SIZE, 0, 0};
    size_t local_work_size[3]  = {MEM_SIZE, 0, 0};

    /* OpenCLカーネルを実行 */
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);

    /* メモリバッファから結果を取得 */
    ret = clEnqueueReadBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(float), mem, 0, NULL, NULL);

    /* 結果の表示 */
    for(i=0; i<MEM_SIZE; i++) {
        printf("mem[%d] : %f\n", i, mem[i]);
    }
  
    /* 終了処理 */
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(memobj);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);

    free(binary_buf);

    return 0;
}
Ejemplo n.º 28
0
void DeleteCL()
{
	clReleaseContext(g_cxMainContext);
	clReleaseCommandQueue(g_cqCommandQue);
}
int main(int argc, char const *argv[])
{
        /* Get platform */
        cl_platform_id platform;
        cl_uint num_platforms;
        cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformIDs' failed\n");
                exit(1);
        }
        
        printf("Number of platforms: %d\n", num_platforms);
        printf("platform=%p\n", platform);
        
        /* Get platform name */
        char platform_name[100];
        ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformInfo' failed\n");
                exit(1);
        }
        
        printf("platform.name='%s'\n\n", platform_name);
        
        /* Get device */
        cl_device_id device;
        cl_uint num_devices;
        ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceIDs' failed\n");
                exit(1);
        }
        
        printf("Number of devices: %d\n", num_devices);
        printf("device=%p\n", device);
        
        /* Get device name */
        char device_name[100];
        ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name),
        device_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceInfo' failed\n");
                exit(1);
        }
        
        printf("device.name='%s'\n", device_name);
        printf("\n");
        
        /* Create a Context Object */
        cl_context context;
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateContext' failed\n");
                exit(1);
        }
        
        printf("context=%p\n", context);
        
        /* Create a Command Queue Object*/
        cl_command_queue command_queue;
        command_queue = clCreateCommandQueue(context, device, 0, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateCommandQueue' failed\n");
                exit(1);
        }
        
        printf("command_queue=%p\n", command_queue);
        printf("\n");

        /* Program source */
        unsigned char *source_code;
        size_t source_length;

        /* Read program from 'post_increment_short4.cl' */
        source_code = read_buffer("post_increment_short4.cl", &source_length);

        /* Create a program */
        cl_program program;
        program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_length, &ret);

        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateProgramWithSource' failed\n");
                exit(1);
        }
        printf("program=%p\n", program);

        /* Build program */
        ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
        if (ret != CL_SUCCESS )
        {
                size_t size;
                char *log;

                /* Get log size */
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size);

                /* Allocate log and print */
                log = malloc(size);
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL);
                printf("error: call to 'clBuildProgram' failed:\n%s\n", log);
                
                /* Free log and exit */
                free(log);
                exit(1);
        }

        printf("program built\n");
        printf("\n");
        
        /* Create a Kernel Object */
        cl_kernel kernel;
        kernel = clCreateKernel(program, "post_increment_short4", &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateKernel' failed\n");
                exit(1);
        }
        
        /* Create and allocate host buffers */
        size_t num_elem = 10;
        
        /* Create and init host side src buffer 0 */
        cl_short4 *src_0_host_buffer;
        src_0_host_buffer = malloc(num_elem * sizeof(cl_short4));
        for (int i = 0; i < num_elem; i++)
                src_0_host_buffer[i] = (cl_short4){{2, 2, 2, 2}};
        
        /* Create and init device side src buffer 0 */
        cl_mem src_0_device_buffer;
        src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_short4), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create source buffer\n");
                exit(1);
        }        
        ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_short4), src_0_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueWriteBuffer' failed\n");
                exit(1);
        }

        /* Create host dst buffer */
        cl_short4 *dst_host_buffer;
        dst_host_buffer = malloc(num_elem * sizeof(cl_short4));
        memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_short4));

        /* Create device dst buffer */
        cl_mem dst_device_buffer;
        dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_short4), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create dst buffer\n");
                exit(1);
        }
        
        /* Set kernel arguments */
        ret = CL_SUCCESS;
        ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer);
        ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clSetKernelArg' failed\n");
                exit(1);
        }

        /* Launch the kernel */
        size_t global_work_size = num_elem;
        size_t local_work_size = num_elem;
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueNDRangeKernel' failed\n");
                exit(1);
        }

        /* Wait for it to finish */
        clFinish(command_queue);

        /* Read results from GPU */
        ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_short4), dst_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueReadBuffer' failed\n");
                exit(1);
        }

        /* Dump dst buffer to file */
        char dump_file[100];
        sprintf((char *)&dump_file, "%s.result", argv[0]);
        write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_short4));
        printf("Result dumped to %s\n", dump_file);
        /* Free host dst buffer */
        free(dst_host_buffer);

        /* Free device dst buffer */
        ret = clReleaseMemObject(dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }
        
        /* Free host side src buffer 0 */
        free(src_0_host_buffer);

        /* Free device side src buffer 0 */
        ret = clReleaseMemObject(src_0_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }

        /* Release kernel */
        ret = clReleaseKernel(kernel);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseKernel' failed\n");
                exit(1);
        }

        /* Release program */
        ret = clReleaseProgram(program);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseProgram' failed\n");
                exit(1);
        }
        
        /* Release command queue */
        ret = clReleaseCommandQueue(command_queue);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseCommandQueue' failed\n");
                exit(1);
        }
        
        /* Release context */
        ret = clReleaseContext(context);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseContext' failed\n");
                exit(1);
        }
                
        return 0;
}
Ejemplo n.º 30
0
//Do the proper test using different sizes.
static cl_ulong gws_test(size_t num, struct fmt_main * self) {

    cl_event myEvent;
    cl_int ret_code;
    cl_uint *tmpbuffer;
    cl_ulong startTime, endTime, runtime;
    int i, loops;

    //Prepare buffers.
    create_clobj(num, self);

    tmpbuffer = mem_alloc(sizeof(sha512_hash) * num);

    if (tmpbuffer == NULL) {
        fprintf(stderr, "Malloc failure in find_best_gws\n");
        exit(EXIT_FAILURE);
    }

    queue_prof = clCreateCommandQueue(context[ocl_gpu_id], devices[ocl_gpu_id],
            CL_QUEUE_PROFILING_ENABLE, &ret_code);
    HANDLE_CLERROR(ret_code, "Failed in clCreateCommandQueue");

    // Set salt.
    set_salt(get_salt("$6$saltstring$"));
    salt->initial = salt->rounds - get_multiple(salt->rounds, HASH_LOOPS);

    // Set keys
    for (i = 0; i < num; i++) {
        set_key("aaabaabaaa", i);
    }
    //** Get execution time **//
    HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, salt_buffer, CL_FALSE, 0,
            sizeof(sha512_salt), salt, 0, NULL, &myEvent),
            "Failed in clEnqueueWriteBuffer");

    HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish");
    HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT,
            sizeof(cl_ulong), &startTime, NULL),
            "Failed in clGetEventProfilingInfo I");
    HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END,
            sizeof(cl_ulong), &endTime, NULL),
            "Failed in clGetEventProfilingInfo II");
    HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent");
    runtime = endTime - startTime;

    //** Get execution time **//
    HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, pass_buffer, CL_FALSE, 0,
            sizeof(sha512_password) * num, plaintext, 0, NULL, &myEvent),
            "Failed in clEnqueueWriteBuffer");

    HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish");
    HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT,
            sizeof(cl_ulong), &startTime, NULL),
            "Failed in clGetEventProfilingInfo I");
    HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END,
            sizeof(cl_ulong), &endTime, NULL),
            "Failed in clGetEventProfilingInfo II");
    HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent");
    runtime += endTime - startTime;

    //** Get execution time **//
    if (gpu(source_in_use) || use_local(source_in_use)) {
        ret_code = clEnqueueNDRangeKernel(queue_prof, prepare_kernel,
            1, NULL, &num, &local_work_size, 0, NULL, &myEvent);

        HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish");
        HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT,
            sizeof(cl_ulong), &startTime, NULL),
            "Failed in clGetEventProfilingInfo I");
        HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END,
            sizeof(cl_ulong), &endTime, NULL),
            "Failed in clGetEventProfilingInfo II");
        HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent");
        runtime += endTime - startTime;
    }

    loops = gpu(source_in_use) || use_local(source_in_use) ? (salt->rounds / HASH_LOOPS) : 1;

    //** Get execution time **//
    for (i = 0; i < loops; i++)
    {
        ret_code = clEnqueueNDRangeKernel(queue_prof, crypt_kernel,
               1, NULL, &num, &local_work_size, 0, NULL, &myEvent);

        HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish");
        HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT,
            sizeof(cl_ulong), &startTime, NULL),
            "Failed in clGetEventProfilingInfo I");
        HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END,
            sizeof(cl_ulong), &endTime, NULL),
            "Failed in clGetEventProfilingInfo II");
        HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent");
        runtime += endTime - startTime;
    }

    //** Get execution time **//
    HANDLE_CLERROR(clEnqueueReadBuffer(queue_prof, hash_buffer, CL_FALSE, 0,
            sizeof(sha512_hash) * num, tmpbuffer, 0, NULL, &myEvent),
            "Failed in clEnqueueReadBuffer");

    HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish");
    HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT,
            sizeof(cl_ulong), &startTime, NULL),
            "Failed in clGetEventProfilingInfo I");
    HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END,
            sizeof(cl_ulong), &endTime, NULL),
            "Failed in clGetEventProfilingInfo II");
    HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent");
    runtime += endTime - startTime;

    MEM_FREE(tmpbuffer);
    HANDLE_CLERROR(clReleaseCommandQueue(queue_prof),
            "Failed in clReleaseCommandQueue");
    release_clobj();

     if (ret_code != CL_SUCCESS) {

        if (ret_code != CL_INVALID_WORK_GROUP_SIZE)
            fprintf(stderr, "Error %d\n", ret_code);
        return 0;
    }
    return runtime;
}