Пример #1
0
// Host function that launches an OpenCL kernel to compute the MGVF matrices for the specified cells
void IMGVF_OpenCL(MAT **I, MAT **IMGVF, double vx, double vy, double e, int max_iterations, double cutoff, int num_cells) {

    cl_int error;

    // Initialize the data on the GPU
    IMGVF_OpenCL_init(I, num_cells);

    // Load the kernel source from the file
    const char *source = load_kernel_source("track_ellipse_kernel.cl");
    size_t sourceSize = strlen(source);

    // Compile the kernel
    cl_program program = clCreateProgramWithSource(context, 1, &source, &sourceSize, &error);
    check_error(error, __FILE__, __LINE__);
    error = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    // Show compiler warnings/errors
    static char log[65536];
    memset(log, 0, sizeof(log));
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL);
    if (strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
    check_error(error, __FILE__, __LINE__);

    // Create the IMGVF kernels
    cl_kernel IMGVF_kernel = clCreateKernel(program, "IMGVF_kernel", &error);
    check_error(error, __FILE__, __LINE__);

    // Setup execution parameters
    size_t num_work_groups = num_cells;
    size_t global_work_size = num_work_groups * local_work_size;

    // Convert double-precision parameters to single-precision
    float vx_float = (float) vx;
    float vy_float = (float) vy;
    float e_float = (float) e;
    float cutoff_float = (float) cutoff;

    // Set the kernel arguments
    clSetKernelArg(IMGVF_kernel, 0, sizeof(cl_mem), (void *) &device_IMGVF_all);
    clSetKernelArg(IMGVF_kernel, 1, sizeof(cl_mem), (void *) &device_I_all);
    clSetKernelArg(IMGVF_kernel, 2, sizeof(cl_mem), (void *) &device_I_offsets);
    clSetKernelArg(IMGVF_kernel, 3, sizeof(cl_mem), (void *) &device_m_array);
    clSetKernelArg(IMGVF_kernel, 4, sizeof(cl_mem), (void *) &device_n_array);
    clSetKernelArg(IMGVF_kernel, 5, sizeof(cl_float), (void *) &vx_float);
    clSetKernelArg(IMGVF_kernel, 6, sizeof(cl_float), (void *) &vy_float);
    clSetKernelArg(IMGVF_kernel, 7, sizeof(cl_float), (void *) &e_float);
    clSetKernelArg(IMGVF_kernel, 8, sizeof(cl_int), (void *) &max_iterations);
    clSetKernelArg(IMGVF_kernel, 9, sizeof(cl_float), (void *) &cutoff_float);

    // Compute the MGVF on the GPU
    error = clEnqueueNDRangeKernel(command_queue, IMGVF_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
    check_error(error, __FILE__, __LINE__);

    // Check for kernel errors
    error = clFinish(command_queue);
    check_error(error, __FILE__, __LINE__);

    // Copy back the final results from the GPU
    IMGVF_OpenCL_cleanup(IMGVF, num_cells);
}
void 
kernel_gpu_opencl_wrapper(	fp* image,											// input image
							int Nr,												// IMAGE nbr of rows
							int Nc,												// IMAGE nbr of cols
							long Ne,											// IMAGE nbr of elem
							int niter,											// nbr of iterations
							fp lambda,											// update step size
							long NeROI,											// ROI nbr of elements
							int* iN,
							int* iS,
							int* jE,
							int* jW,
							int iter,											// primary loop
							int mem_size_i,
							int mem_size_j)
{

	//======================================================================================================================================================150
	//	GPU SETUP
	//======================================================================================================================================================150

	//====================================================================================================100
	//	COMMON VARIABLES
	//====================================================================================================100

	// common variables
	cl_int error;

	//====================================================================================================100
	//	GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE
	//====================================================================================================100

	// Get the number of available platforms
	cl_uint num_platforms;
	error = clGetPlatformIDs(	0, 
								NULL, 
								&num_platforms);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Get the list of available platforms
	cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
	error = clGetPlatformIDs(	num_platforms, 
								platforms, 
								NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Select the 1st platform
	cl_platform_id platform = platforms[0];

	// Get the name of the selected platform and print it (if there are multiple platforms, choose the first one)
	char pbuf[100];
	error = clGetPlatformInfo(	platform, 
								CL_PLATFORM_VENDOR, 
								sizeof(pbuf), 
								pbuf, 
								NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	printf("Platform: %s\n", pbuf);

	//====================================================================================================100
	//	CREATE CONTEXT FOR THE PLATFORM
	//====================================================================================================100

	// Create context properties for selected platform
	cl_context_properties context_properties[3] = {	CL_CONTEXT_PLATFORM, 
													(cl_context_properties) platform, 
													0};

	// Create context for selected platform being GPU
	cl_context context;
	context = clCreateContextFromType(	context_properties, 
										CL_DEVICE_TYPE_ALL, 
										NULL, 
										NULL, 
										&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	GET DEVICES AVAILABLE FOR THE CONTEXT, SELECT ONE
	//====================================================================================================100

	// Get the number of devices (previousely selected for the context)
	size_t devices_size;
	error = clGetContextInfo(	context, 
								CL_CONTEXT_DEVICES, 
								0, 
								NULL, 
								&devices_size);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Get the list of devices (previousely selected for the context)
	cl_device_id *devices = (cl_device_id *) malloc(devices_size);
	error = clGetContextInfo(	context, 
								CL_CONTEXT_DEVICES, 
								devices_size, 
								devices, 
								NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Select the first device (previousely selected for the context) (if there are multiple devices, choose the first one)
	cl_device_id device;
	device = devices[0];

	// Get the name of the selected device (previousely selected for the context) and print it
	error = clGetDeviceInfo(device, 
							CL_DEVICE_NAME, 
							sizeof(pbuf), 
							pbuf, 
							NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	printf("Device: %s\n", pbuf);

	//====================================================================================================100
	//	CREATE COMMAND QUEUE FOR THE DEVICE
	//====================================================================================================100

	// Create a command queue
	cl_command_queue command_queue;
	command_queue = clCreateCommandQueue(	context, 
											device, 
											0, 
											&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	CREATE PROGRAM, COMPILE IT
	//====================================================================================================100

	// Load kernel source code from file
	const char *source = load_kernel_source("./kernel/kernel_gpu_opencl.cl");
	size_t sourceSize = strlen(source);

	// Create the program
	cl_program program = clCreateProgramWithSource(	context, 
													1, 
													&source, 
													&sourceSize, 
													&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

  char clOptions[150];
//  sprintf(clOptions,"-I../../src");                                                    
  sprintf(clOptions,"-I.");
#ifdef RD_WG_SIZE
  sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE=%d", RD_WG_SIZE);
#endif
#ifdef RD_WG_SIZE_0
  sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE_0=%d", RD_WG_SIZE_0);
#endif
#ifdef RD_WG_SIZE_0_0
  sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE_0_0=%d", RD_WG_SIZE_0_0);
#endif

	// Compile the program
	error = clBuildProgram(	program, 
							1, 
							&device, 
							clOptions, 
							NULL, 
							NULL);
	// Print warnings and errors from compilation
	static char log[65536]; 
	memset(log, 0, sizeof(log));
	clGetProgramBuildInfo(	program, 
							device, 
							CL_PROGRAM_BUILD_LOG, 
							sizeof(log)-1, 
							log, 
							NULL);
	printf("-----OpenCL Compiler Output-----\n");
	if (strstr(log,"warning:") || strstr(log, "error:")) 
		printf("<<<<\n%s\n>>>>\n", log);
	printf("--------------------------------\n");
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	CREATE Kernels
	//====================================================================================================100

	// Extract kernel
	cl_kernel extract_kernel;
	extract_kernel = clCreateKernel(program, 
									"extract_kernel", 
									&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Prepare kernel
	cl_kernel prepare_kernel;
	prepare_kernel = clCreateKernel(program, 
									"prepare_kernel", 
									&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Reduce kernel
	cl_kernel reduce_kernel;
	reduce_kernel = clCreateKernel(	program, 
									"reduce_kernel", 
									&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// SRAD kernel
	cl_kernel srad_kernel;
	srad_kernel = clCreateKernel(	program, 
									"srad_kernel", 
									&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// SRAD2 kernel
	cl_kernel srad2_kernel;
	srad2_kernel = clCreateKernel(	program, 
									"srad2_kernel", 
									&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Compress kernel
	cl_kernel compress_kernel;
	compress_kernel = clCreateKernel(	program, 
										"compress_kernel", 
										&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	TRIGGERING INITIAL DRIVER OVERHEAD
	//====================================================================================================100

	// cudaThreadSynchronize();		// the above does it

	//======================================================================================================================================================150
	// 	GPU VARIABLES
	//======================================================================================================================================================150

	// CUDA kernel execution parameters
	int blocks_x;

	//======================================================================================================================================================150
	// 	ALLOCATE MEMORY IN GPU
	//======================================================================================================================================================150

	//====================================================================================================100
	// common memory size
	//====================================================================================================100

	int mem_size;															// matrix memory size
	mem_size = sizeof(fp) * Ne;												// get the size of float representation of input IMAGE

	//====================================================================================================100
	// allocate memory for entire IMAGE on DEVICE
	//====================================================================================================100

	cl_mem d_I;
	d_I = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							mem_size,
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	// allocate memory for coordinates on DEVICE
	//====================================================================================================100

	cl_mem d_iN;
	d_iN = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							mem_size_i,
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	cl_mem d_iS;
	d_iS = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							mem_size_i,
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	cl_mem d_jE;
	d_jE = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							mem_size_j,
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	cl_mem d_jW;
	d_jW = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							mem_size_j,
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	// allocate memory for derivatives
	//====================================================================================================100

	cl_mem d_dN;
	d_dN = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							mem_size,
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	cl_mem d_dS;
	d_dS = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							mem_size,
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	cl_mem d_dW;
	d_dW = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							mem_size,
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	cl_mem d_dE;
	d_dE = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							mem_size,
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	// allocate memory for coefficient on DEVICE
	//====================================================================================================100

	cl_mem d_c;
	d_c = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							mem_size,
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	// allocate memory for partial sums on DEVICE
	//====================================================================================================100

	cl_mem d_sums;
	d_sums = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							mem_size,
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	cl_mem d_sums2;
	d_sums2 = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							mem_size,
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	// End
	//====================================================================================================100

	//======================================================================================================================================================150
	// 	COPY INPUT TO CPU
	//======================================================================================================================================================150

	//====================================================================================================100
	// Image
	//====================================================================================================100

	error = clEnqueueWriteBuffer(	command_queue, 
									d_I, 
									1, 
									0, 
									mem_size, 
									image, 
									0, 
									0, 
									0);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	// coordinates
	//====================================================================================================100

	error = clEnqueueWriteBuffer(	command_queue, 
									d_iN, 
									1, 
									0, 
									mem_size_i, 
									iN, 
									0, 
									0, 
									0);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	error = clEnqueueWriteBuffer(	command_queue, 
									d_iS, 
									1, 
									0, 
									mem_size_i, 
									iS, 
									0, 
									0, 
									0);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	error = clEnqueueWriteBuffer(	command_queue, 
									d_jE, 
									1, 
									0, 
									mem_size_j, 
									jE, 
									0, 
									0, 
									0);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	error = clEnqueueWriteBuffer(	command_queue, 
									d_jW, 
									1, 
									0, 
									mem_size_j, 
									jW, 
									0, 
									0, 
									0);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	// End
	//====================================================================================================100

	//======================================================================================================================================================150
	// 	KERNEL EXECUTION PARAMETERS
	//======================================================================================================================================================150

	// threads
	size_t local_work_size[1];
	local_work_size[0] = NUMBER_THREADS;

	// workgroups
	int blocks_work_size;
	size_t global_work_size[1];
	blocks_x = Ne/(int)local_work_size[0];
	if (Ne % (int)local_work_size[0] != 0){												// compensate for division remainder above by adding one grid
		blocks_x = blocks_x + 1;																	
	}
	blocks_work_size = blocks_x;
	global_work_size[0] = blocks_work_size * local_work_size[0];						// define the number of blocks in the grid

	printf("max # of workgroups = %d, # of threads/workgroup = %d (ensure that device can handle)\n", (int)(global_work_size[0]/local_work_size[0]), (int)local_work_size[0]);

	//======================================================================================================================================================150
	// 	Extract Kernel - SCALE IMAGE DOWN FROM 0-255 TO 0-1 AND EXTRACT
	//======================================================================================================================================================150

	//====================================================================================================100
	//	set arguments
	//====================================================================================================100

	error = clSetKernelArg(	extract_kernel, 
							0, 
							sizeof(long), 
							(void *) &Ne);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	extract_kernel, 
							1, 
							sizeof(cl_mem), 
							(void *) &d_I);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	launch kernel
	//====================================================================================================100

	error = clEnqueueNDRangeKernel(	command_queue, 
									extract_kernel, 
									1, 
									NULL, 
									global_work_size, 
									local_work_size, 
									0, 
									NULL, 
									NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	Synchronization - wait for all operations in the command queue so far to finish
	//====================================================================================================100

	// error = clFinish(command_queue);
	// if (error != CL_SUCCESS) 
		// fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	End
	//====================================================================================================100

	//======================================================================================================================================================150
	// 	WHAT IS CONSTANT IN COMPUTATION LOOP
	//======================================================================================================================================================150

	//====================================================================================================100
	//	Prepare Kernel
	//====================================================================================================100

	error = clSetKernelArg(	prepare_kernel, 
							0, 
							sizeof(long), 
							(void *) &Ne);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	prepare_kernel, 
							1, 
							sizeof(cl_mem), 
							(void *) &d_I);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	prepare_kernel, 
							2, 
							sizeof(cl_mem), 
							(void *) &d_sums);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	prepare_kernel, 
							3, 
							sizeof(cl_mem), 
							(void *) &d_sums2);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	Reduce Kernel
	//====================================================================================================100

	int blocks2_x;
	int blocks2_work_size;
	size_t global_work_size2[1];
	long no;
	int mul;
	int mem_size_single = sizeof(fp) * 1;
	fp total;
	fp total2;
	fp meanROI;
	fp meanROI2;
	fp varROI;
	fp q0sqr;

	error = clSetKernelArg(	reduce_kernel, 
							0, 
							sizeof(long), 
							(void *) &Ne);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	reduce_kernel, 
							3, 
							sizeof(cl_mem), 
							(void *) &d_sums);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	reduce_kernel, 
							4, 
							sizeof(cl_mem), 
							(void *) &d_sums2);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	SRAD Kernel
	//====================================================================================================100

	error = clSetKernelArg(	srad_kernel, 
							0, 
							sizeof(fp), 
							(void *) &lambda);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad_kernel, 
							1, 
							sizeof(int), 
							(void *) &Nr);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad_kernel, 
							2, 
							sizeof(int), 
							(void *) &Nc);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad_kernel, 
							3, 
							sizeof(long), 
							(void *) &Ne);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad_kernel, 
							4, 
							sizeof(cl_mem), 
							(void *) &d_iN);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad_kernel, 
							5, 
							sizeof(cl_mem), 
							(void *) &d_iS);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad_kernel, 
							6, 
							sizeof(cl_mem), 
							(void *) &d_jE);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad_kernel, 
							7, 
							sizeof(cl_mem), 
							(void *) &d_jW);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad_kernel, 
							8, 
							sizeof(cl_mem), 
							(void *) &d_dN);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad_kernel, 
							9, 
							sizeof(cl_mem), 
							(void *) &d_dS);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad_kernel, 
							10, 
							sizeof(cl_mem), 
							(void *) &d_dW);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad_kernel, 
							11, 
							sizeof(cl_mem), 
							(void *) &d_dE);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad_kernel, 
							13, 
							sizeof(cl_mem), 
							(void *) &d_c);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad_kernel, 
							14, 
							sizeof(cl_mem), 
							(void *) &d_I);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	SRAD2 Kernel
	//====================================================================================================100

	error = clSetKernelArg(	srad2_kernel, 
							0, 
							sizeof(fp), 
							(void *) &lambda);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad2_kernel, 
							1, 
							sizeof(int), 
							(void *) &Nr);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad2_kernel, 
							2, 
							sizeof(int), 
							(void *) &Nc);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad2_kernel, 
							3, 
							sizeof(long), 
							(void *) &Ne);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad2_kernel, 
							4, 
							sizeof(cl_mem), 
							(void *) &d_iN);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad2_kernel, 
							5, 
							sizeof(cl_mem), 
							(void *) &d_iS);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad2_kernel, 
							6, 
							sizeof(cl_mem), 
							(void *) &d_jE);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad2_kernel, 
							7, 
							sizeof(cl_mem), 
							(void *) &d_jW);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad2_kernel, 
							8, 
							sizeof(cl_mem), 
							(void *) &d_dN);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad2_kernel, 
							9, 
							sizeof(cl_mem), 
							(void *) &d_dS);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad2_kernel, 
							10, 
							sizeof(cl_mem), 
							(void *) &d_dW);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad2_kernel, 
							11, 
							sizeof(cl_mem), 
							(void *) &d_dE);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad2_kernel, 
							12, 
							sizeof(cl_mem), 
							(void *) &d_c);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad2_kernel, 
							13, 
							sizeof(cl_mem), 
							(void *) &d_I);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	End
	//====================================================================================================100

	//======================================================================================================================================================150
	// 	COMPUTATION
	//======================================================================================================================================================150

	printf("Iterations Progress: ");

	// execute main loop
	for (iter=0; iter<niter; iter++){										// do for the number of iterations input parameter

		printf("%d ", iter);
		fflush(NULL);

		//====================================================================================================100
		// Prepare kernel
		//====================================================================================================100

		// launch kernel
		error = clEnqueueNDRangeKernel(	command_queue, 
										prepare_kernel, 
										1, 
										NULL, 
										global_work_size, 
										local_work_size, 
										0, 
										NULL, 
										NULL);
		if (error != CL_SUCCESS) 
			fatal_CL(error, __LINE__);

		// synchronize
		// error = clFinish(command_queue);
		// if (error != CL_SUCCESS) 
			// fatal_CL(error, __LINE__);

		//====================================================================================================100
		//	Reduce Kernel - performs subsequent reductions of sums
		//====================================================================================================100

		// initial values
		blocks2_work_size = blocks_work_size;							// original number of blocks
		global_work_size2[0] = global_work_size[0];
		no = Ne;														// original number of sum elements
		mul = 1;														// original multiplier

		// loop
		while(blocks2_work_size != 0){

			// set arguments that were uptaded in this loop
			error = clSetKernelArg(	reduce_kernel, 
									1, 
									sizeof(long), 
									(void *) &no);
			if (error != CL_SUCCESS) 
				fatal_CL(error, __LINE__);
			error = clSetKernelArg(	reduce_kernel, 
									2, 
									sizeof(int), 
									(void *) &mul);
			if (error != CL_SUCCESS) 
				fatal_CL(error, __LINE__);

			error = clSetKernelArg(	reduce_kernel, 
									5, 
									sizeof(int), 
									(void *) &blocks2_work_size);
			if (error != CL_SUCCESS) 
				fatal_CL(error, __LINE__);

			// launch kernel
			error = clEnqueueNDRangeKernel(	command_queue, 
											reduce_kernel, 
											1, 
											NULL, 
											global_work_size2, 
											local_work_size, 
											0, 
											NULL, 
											NULL);
			if (error != CL_SUCCESS) 
				fatal_CL(error, __LINE__);

			// synchronize
			// error = clFinish(command_queue);
			// if (error != CL_SUCCESS) 
				// fatal_CL(error, __LINE__);

			// update execution parameters
			no = blocks2_work_size;												// get current number of elements
			if(blocks2_work_size == 1){
				blocks2_work_size = 0;
			}
			else{
				mul = mul * NUMBER_THREADS;										// update the increment
				blocks_x = blocks2_work_size/(int)local_work_size[0];			// number of blocks
				if (blocks2_work_size % (int)local_work_size[0] != 0){			// compensate for division remainder above by adding one grid
					blocks_x = blocks_x + 1;
				}
				blocks2_work_size = blocks_x;
				global_work_size2[0] = blocks2_work_size * (int)local_work_size[0];
			}

		}

		// copy total sums to device
		error = clEnqueueReadBuffer(command_queue,
									d_sums,
									CL_TRUE,
									0,
									mem_size_single,
									&total,
									0,
									NULL,
									NULL);
		if (error != CL_SUCCESS) 
			fatal_CL(error, __LINE__);

		error = clEnqueueReadBuffer(command_queue,
									d_sums2,
									CL_TRUE,
									0,
									mem_size_single,
									&total2,
									0,
									NULL,
									NULL);
		if (error != CL_SUCCESS) 
			fatal_CL(error, __LINE__);

		//====================================================================================================100
		// calculate statistics
		//====================================================================================================100
		
		meanROI	= total / (fp)(NeROI);										// gets mean (average) value of element in ROI
		meanROI2 = meanROI * meanROI;										//
		varROI = (total2 / (fp)(NeROI)) - meanROI2;							// gets variance of ROI								
		q0sqr = varROI / meanROI2;											// gets standard deviation of ROI

		//====================================================================================================100
		// execute srad kernel
		//====================================================================================================100

		// set arguments that were uptaded in this loop
		error = clSetKernelArg(	srad_kernel, 
							12, 
							sizeof(fp), 
							(void *) &q0sqr);
		if (error != CL_SUCCESS) 
			fatal_CL(error, __LINE__);

		// launch kernel
		error = clEnqueueNDRangeKernel(	command_queue, 
										srad_kernel, 
										1, 
										NULL, 
										global_work_size, 
										local_work_size, 
										0, 
										NULL, 
										NULL);
		if (error != CL_SUCCESS) 
			fatal_CL(error, __LINE__);

		// synchronize
		// error = clFinish(command_queue);
		// if (error != CL_SUCCESS) 
			// fatal_CL(error, __LINE__);

		//====================================================================================================100
		// execute srad2 kernel
		//====================================================================================================100

		// launch kernel
		error = clEnqueueNDRangeKernel(	command_queue, 
										srad2_kernel, 
										1, 
										NULL, 
										global_work_size, 
										local_work_size, 
										0, 
										NULL, 
										NULL);
		if (error != CL_SUCCESS) 
			fatal_CL(error, __LINE__);

		// synchronize
		// error = clFinish(command_queue);
		// if (error != CL_SUCCESS) 
			// fatal_CL(error, __LINE__);

		//====================================================================================================100
		// End
		//====================================================================================================100

	}

	printf("\n");

	//======================================================================================================================================================150
	// 	Compress Kernel - SCALE IMAGE UP FROM 0-1 TO 0-255 AND COMPRESS
	//======================================================================================================================================================150

	//====================================================================================================100
	// set parameters
	//====================================================================================================100

	error = clSetKernelArg(	compress_kernel, 
							0, 
							sizeof(long), 
							(void *) &Ne);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	compress_kernel, 
							1, 
							sizeof(cl_mem), 
							(void *) &d_I);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	// launch kernel
	//====================================================================================================100

	error = clEnqueueNDRangeKernel(	command_queue, 
									compress_kernel, 
									1, 
									NULL, 
									global_work_size, 
									local_work_size, 
									0, 
									NULL, 
									NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	// synchronize
	//====================================================================================================100

	error = clFinish(command_queue);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	End
	//====================================================================================================100

	//======================================================================================================================================================150
	// 	COPY RESULTS BACK TO CPU
	//======================================================================================================================================================150

	error = clEnqueueReadBuffer(command_queue,
								d_I,
								CL_TRUE,
								0,
								mem_size,
								image,
								0,
								NULL,
								NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// int i;
	// for(i=0; i<100; i++){
		// printf("%f ", image[i]);
	// }

	//======================================================================================================================================================150
	// 	FREE MEMORY
	//======================================================================================================================================================150

	// OpenCL structures
	error = clReleaseKernel(extract_kernel);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseKernel(prepare_kernel);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseKernel(reduce_kernel);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseKernel(srad_kernel);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseKernel(srad2_kernel);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseKernel(compress_kernel);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseProgram(program);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// common_change
	error = clReleaseMemObject(d_I);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseMemObject(d_c);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	error = clReleaseMemObject(d_iN);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseMemObject(d_iS);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseMemObject(d_jE);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseMemObject(d_jW);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	error = clReleaseMemObject(d_dN);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseMemObject(d_dS);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseMemObject(d_dE);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseMemObject(d_dW);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	error = clReleaseMemObject(d_sums);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseMemObject(d_sums2);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// OpenCL structures
	error = clFlush(command_queue);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseCommandQueue(command_queue);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseContext(context);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//======================================================================================================================================================150
	// 	End
	//======================================================================================================================================================150

}
Пример #3
0
static CUmodule
build_kernel_source(const char *source_file, long target_capability)
{
	char		   *source;
	int				link_dev_runtime;
	nvrtcProgram	program;
	nvrtcResult		rc;
	char			arch_buf[128];
	const char	   *options[10];
	int				opt_index = 0;
	int				build_failure = 0;
	char		   *build_log;
	size_t			build_log_len;
	char		   *ptx_image;
	size_t			ptx_image_len;
	void		   *bin_image;
	size_t			bin_image_len;
	CUmodule		cuda_module;
	CUresult		cuda_rc;

	source = load_kernel_source(source_file, &link_dev_runtime);
	rc = nvrtcCreateProgram(&program,
							source,
							NULL,
							0,
							NULL,
							NULL);
	if (rc != NVRTC_SUCCESS)
		nvrtc_error(rc, "nvrtcCreateProgram");

	/*
	 * Put command line options as cuda_program.c doing
	 */
	options[opt_index++] = "-I " CUDA_INCLUDE_PATH;
	snprintf(arch_buf, sizeof(arch_buf),
			 "--gpu-architecture=compute_%ld", target_capability);
	options[opt_index++] = arch_buf;
#ifdef PGSTROM_DEBUG
	options[opt_index++] = "--device-debug";
	options[opt_index++] = "--generate-line-info";
#endif
	options[opt_index++] = "--use_fast_math";
	if (link_dev_runtime)
		options[opt_index++] = "--relocatable-device-code=true";

	/*
	 * Kick runtime compiler
	 */
	rc = nvrtcCompileProgram(program, opt_index, options);
	if (rc != NVRTC_SUCCESS)
	{
		if (rc == NVRTC_ERROR_COMPILATION)
			build_failure = 1;
		else
			nvrtc_error(rc, "nvrtcCompileProgram");
	}

	/*
	 * Print build log
	 */
	rc = nvrtcGetProgramLogSize(program, &build_log_len);
	if (rc != NVRTC_SUCCESS)
		nvrtc_error(rc, "nvrtcGetProgramLogSize");
	build_log = malloc(build_log_len + 1);
	if (!build_log)
	{
		fputs("out of memory", stderr);
		exit(1);
	}
	rc = nvrtcGetProgramLog(program, build_log);
	if (rc != NVRTC_SUCCESS)
		nvrtc_error(rc, "nvrtcGetProgramLog");

	if (build_log_len > 1)
		printf("build log:\n%s\n", build_log);
	if (build_failure)
		exit(1);

	/*
	 * Get PTX Image
	 */
	rc = nvrtcGetPTXSize(program, &ptx_image_len);
	if (rc != NVRTC_SUCCESS)
		nvrtc_error(rc, "nvrtcGetPTXSize");
	ptx_image = malloc(ptx_image_len + 1);
	if (!ptx_image)
	{
		fputs("out of memory", stderr);
		exit(1);
	}
	rc = nvrtcGetPTX(program, ptx_image);
	if (rc != NVRTC_SUCCESS)
		nvrtc_error(rc, "nvrtcGetPTX");
	ptx_image[ptx_image_len] = '\0';

	/*
	 * Link device runtime if needed
	 */
	if (link_dev_runtime)
	{
		link_device_libraries(ptx_image, ptx_image_len,
							  &bin_image, &bin_image_len,
							  target_capability);
	}
	else
	{
		bin_image = ptx_image;
		bin_image_len = ptx_image_len;
	}

	cuda_rc = cuModuleLoadData(&cuda_module, bin_image);
	if (cuda_rc != CUDA_SUCCESS)
		cuda_error(rc, "cuModuleLoadData");
	return cuda_module;
}
Пример #4
0
int main(int argc, char** argv) {


	cl_int error;
	cl_uint num_platforms;
	
	// Get the number of platforms
	error = clGetPlatformIDs(0, NULL, &num_platforms);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	// Get the list of platforms
	cl_platform_id* platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * num_platforms);
	error = clGetPlatformIDs(num_platforms, platforms, NULL);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	// Print the chosen platform (if there are multiple platforms, choose the first one)
	cl_platform_id platform = platforms[0];
	char pbuf[100];
	error = clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	printf("Platform: %s\n", pbuf);
	
	// Create a GPU context
	cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0};
    context = clCreateContextFromType(context_properties, CL_DEVICE_TYPE_GPU, NULL, NULL, &error);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	// Get and print the chosen device (if there are multiple devices, choose the first one)
	size_t devices_size;
	error = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &devices_size);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	cl_device_id *devices = (cl_device_id *) malloc(devices_size);
	error = clGetContextInfo(context, CL_CONTEXT_DEVICES, devices_size, devices, NULL);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	device = devices[0];
	error = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(pbuf), pbuf, NULL);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	printf("Device: %s\n", pbuf);
	
	// Create a command queue
	command_queue = clCreateCommandQueue(context, device, 0, &error);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	

    int size;
    int grid_rows,grid_cols = 0;
    float *FilesavingTemp,*FilesavingPower; //,*MatrixOut; 
    char *tfile, *pfile, *ofile;
    
    int total_iterations = 60;
    int pyramid_height = 1; // number of iterations
	
	if (argc < 7)
		usage(argc, argv);
	if((grid_rows = atoi(argv[1]))<=0||
	   (grid_cols = atoi(argv[1]))<=0||
       (pyramid_height = atoi(argv[2]))<=0||
       (total_iterations = atoi(argv[3]))<=0)
		usage(argc, argv);
		
	tfile=argv[4];
    pfile=argv[5];
    ofile=argv[6];
	
    size=grid_rows*grid_cols;

    // --------------- pyramid parameters --------------- 
    int borderCols = (pyramid_height)*EXPAND_RATE/2;
    int borderRows = (pyramid_height)*EXPAND_RATE/2;
    int smallBlockCol = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE;
    int smallBlockRow = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE;
    int blockCols = grid_cols/smallBlockCol+((grid_cols%smallBlockCol==0)?0:1);
    int blockRows = grid_rows/smallBlockRow+((grid_rows%smallBlockRow==0)?0:1);

    FilesavingTemp = (float *) malloc(size*sizeof(float));
    FilesavingPower = (float *) malloc(size*sizeof(float));
    // MatrixOut = (float *) calloc (size, sizeof(float));

    if( !FilesavingPower || !FilesavingTemp) // || !MatrixOut)
        fatal("unable to allocate memory");
	
	// Read input data from disk
    readinput(FilesavingTemp, grid_rows, grid_cols, tfile);
    readinput(FilesavingPower, grid_rows, grid_cols, pfile);
	
	// Load kernel source from file
	const char *source = load_kernel_source("hotspot_kernel.cl");
	size_t sourceSize = strlen(source);
	
	// Compile the kernel
    cl_program program = clCreateProgramWithSource(context, 1, &source, &sourceSize, &error);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
    // Create an executable from the kernel
	error = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
	// Show compiler warnings/errors
	static char log[65536]; memset(log, 0, sizeof(log));
	clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL);
	if (strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
    kernel = clCreateKernel(program, "hotspot", &error);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
		
	long long start_time = get_time();
	
	// Create two temperature matrices and copy the temperature input data
	cl_mem MatrixTemp[2];
	// Create input memory buffers on device
	MatrixTemp[0] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(float) * size, FilesavingTemp, &error);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	MatrixTemp[1] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(float) * size, NULL, &error);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	// Copy the power input data
	cl_mem MatrixPower = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(float) * size, FilesavingPower, &error);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	// Perform the computation
	int ret = compute_tran_temp(MatrixPower, MatrixTemp, grid_cols, grid_rows, total_iterations, pyramid_height,
								blockCols, blockRows, borderCols, borderRows, FilesavingTemp, FilesavingPower);
	
	// Copy final temperature data back
	cl_float *MatrixOut = (cl_float *) clEnqueueMapBuffer(command_queue, MatrixTemp[ret], CL_TRUE, CL_MAP_READ, 0, sizeof(float) * size, 0, NULL, NULL, &error);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	long long end_time = get_time();	
	printf("Total time: %.3f seconds\n", ((float) (end_time - start_time)) / (1000*1000));
	
	// Write final output to output file
    writeoutput(MatrixOut, grid_rows, grid_cols, ofile);
    
	error = clEnqueueUnmapMemObject(command_queue, MatrixTemp[ret], (void *) MatrixOut, 0, NULL, NULL);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	clReleaseMemObject(MatrixTemp[0]);
	clReleaseMemObject(MatrixTemp[1]);
	clReleaseMemObject(MatrixPower);
	
	return 0;
}
Пример #5
0
int encrypt_cl() {
#ifdef DEBUG 
	printf("start of encrypt_cl\n");
#endif
	

	int err;                            // error code returned from api calls
	unsigned int correct;               // number of correct results returned
	size_t global;                      // global domain size for our calculation
	size_t local;                       // local domain size for our calculation

	cl_device_id *device_id;             // compute device id 
	cl_context context;                 // compute context
	cl_command_queue commands;          // compute command queue
	cl_program program;                 // compute program
	cl_program decrypt_program;                 // compute program
	cl_kernel encrypt_kernel;                   // compute kernel
	//cl_kernel decrypt_kernel;                   // compute kernel
	cl_event event;
	
	static cl_mem buffer_state;
	static cl_mem buffer_roundkeys;

#ifdef DEBUG 
	printf("data, keydata, results\n");
#endif
	float results[DATA_SIZE];           // results returned from device
	
	unsigned char in[DATA_SIZE];              //plain text
	unsigned char out[DATA_SIZE];              // encryped text


#ifdef DEBUG 
	printf("initFns\n");
#endif

	initFns();
	cl_platform_id platform = NULL;//the chosen platform
	err = clGetPlatformIDs(1, &platform, NULL);
    CHECK_CL_SUCCESS("clGetPlatformIDs", err);

	// Connect to a compute device
#ifdef DEBUG 
	printf("Connect to a compute device\n");
#endif
	//
	cl_uint numDevices = 0;
	//int gpu = 1;
	//err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
	device_id = (cl_device_id*)malloc(2 * sizeof(cl_device_id));
	err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 2, device_id, &numDevices);
	if (err != CL_SUCCESS)
	{
		printf("Error: Failed to create a device group!\n");
		return EXIT_FAILURE;
	}

#ifdef DEBUG 
	printf("has %d devices\n", numDevices);
#endif

		//IAH();
		//err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);

	//cl_device_info device_info;
	char buffer[1024];
	clGetDeviceInfo(device_id[0], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL);

    /* James: Check if the device is available
     */
    cl_bool device_available = CL_FALSE;
	clGetDeviceInfo(device_id[0], CL_DEVICE_AVAILABLE, sizeof(cl_bool), &device_available, NULL);
    if (device_available != CL_TRUE) 
    {
		printf("Error: Device %i is not available\n", 0);
		return EXIT_FAILURE;
    }

#ifdef DEBUG 
	printf("Device name is %s\n", buffer);
#endif
	clGetDeviceInfo(device_id[1], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL);
#ifdef DEBUG 
	printf("Device name is %s\n", buffer);
#endif
	
	// Create a compute context 
#ifdef DEBUG 
	printf("Create a compute context\n");
#endif
	//
	context = clCreateContext(0, 1, &device_id[DEVICE], NULL, NULL, &err);
	//context = clCreateContext(0, 1, device_id, NULL, NULL, &err);
	if (!context)
	{
		printf("Error: Failed to create a compute context!\n");
		return EXIT_FAILURE;
	}

	if (err != CL_SUCCESS)
	{
		printf("Error: Failed to create a compute context: errcode_ret=%i\n", err);
		return EXIT_FAILURE;
	}

	// Create a command commands
#ifdef DEBUG 
	printf("Create a command commands\n");
#endif
	//
	commands = clCreateCommandQueue(context, device_id[DEVICE], CL_QUEUE_PROFILING_ENABLE, &err);
    CHECK_CL_SUCCESS("clCreateCommandQueue", err);
	if (!commands)
	{
		printf("Error: Failed to create a command commands!\n");
		return EXIT_FAILURE;
	}

	// Create the compute program from the source buffer
#ifdef DEBUG 
	printf("Create the compute program from the source buffer\n");
#endif
	const char *kernel_source = load_kernel_source(AES_KERNEL);
	//printf("kernel source is:\n %s", kernel_source);
	program = clCreateProgramWithSource(context, 1, &kernel_source, NULL, &err);
	if (!program || err != CL_SUCCESS) {
		printf("Error: Failed to create compute program!\n");
		return EXIT_FAILURE;
	}

	// Build the program executable
#ifdef DEBUG 
	printf("Build the program executable\n");
#endif
	//
	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (err != CL_SUCCESS)
	{
		size_t len;
		char b[2048];

		printf("Error: Failed to build program executable!\n");
		err = clGetProgramBuildInfo(program, device_id[DEVICE], CL_PROGRAM_BUILD_LOG, sizeof(b), b, &len);
        CHECK_CL_SUCCESS("clGetProgramBuildInfo", err);
		printf("%s\n", b);
		exit(1);
	}

	// Create the compute kernel in the program we wish to run
#ifdef DEBUG 
	printf("Create the compute kernel in the program we wish to run\n");
#endif
	encrypt_kernel = clCreateKernel(program, "AES_encrypt", &err);
	if (!encrypt_kernel || err != CL_SUCCESS) {
		printf("Error: Failed to create compute kernel! err = %d\n", err);
		size_t len;
		char b[2048];
		err = clGetProgramBuildInfo(program, device_id[DEVICE], CL_PROGRAM_BUILD_LOG, sizeof(b), b, &len);
        CHECK_CL_SUCCESS("clGetProgramBuildInfo", err);
		printf("%s\n", b);
		exit(1);
	}

	// Create the input and output arrays in device memory for our calculation
#ifdef DEBUG 
	printf("Create the input and output arrays in device memory for our calculation\n");
#endif
	int max_buffer_size = MAX_BUFFER_SIZE;
	// dynamic buffer size please
	buffer_state = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, max_buffer_size, NULL, &err);
    CHECK_CL_SUCCESS("clCreateBuffer", err);
	buffer_roundkeys = clCreateBuffer(context, CL_MEM_READ_ONLY, 16 * 15, NULL, &err);
    CHECK_CL_SUCCESS("clCreateBuffer", err);
	if (!buffer_state || !buffer_roundkeys)
	{
		printf("Error: Failed to allocate device memory!\n");
		exit(1);
	}    

	// Get the maximum work group size for executing the kernel on the device
#ifdef DEBUG 
	printf("Get the maximum work group size for executing the kernel on the device\n");
#endif
	//
	err = clGetKernelWorkGroupInfo(encrypt_kernel, device_id[DEVICE], CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
	if (err != CL_SUCCESS)
	{
		printf("Error: Failed to retrieve kernel work group info! %d\n", err);
		exit(1);
	}
	printf("local is %d\n", local);


	unsigned int i = 0;
	unsigned int count = DATA_SIZE;

	clock_t tStartF = clock();
	// Fill our data set with random float values
	i = 0;
	printf("encrypt_cl: count = %d\n", count);
	for(i = 0; i < count; i++) {
		in[i] = 0;
		//in[i] = rand();
	}
	//tFill += (double)(clock() - tStartF)/CLOCKS_PER_SEC;

	clock_t tStart = clock();
	unsigned int k = 0;
	double tFill = 0;
	double tMemory = 0;
	double tArgument = 0;
	double tExecute = 0;
	double tRead = 0;

	int ret;
	AES_KEY ks;
	ret = AES_set_encrypt_key(key, 128, &ks);
	//for (k = 0; k<LOOP; k++) {
		//printf("encrypt_cl: i = %d\n", i);

		// Write our data set into the input array in device memory 
		//printf("Write our data set into the input array in device memory\n");
		//

		clock_t tStartM = clock();
		err = clEnqueueWriteBuffer(commands, buffer_state, CL_TRUE, 0, DATA_SIZE, in, 0, NULL, NULL);
        CHECK_CL_SUCCESS("clEnqueueWriteBuffer", err);
		err = clEnqueueWriteBuffer(commands, buffer_roundkeys, CL_TRUE, 0, 16 * 15, &ks.rd_key, 0, NULL, NULL);
        CHECK_CL_SUCCESS("clEnqueueWriteBuffer", err);
		printf("rd_key %s", ks.rd_key);
		//err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
		//err = clEnqueueWriteBuffer(commands, key, CL_TRUE, 0, sizeof(float) * count, keyData, 0, NULL, NULL);
		if (err != CL_SUCCESS)
		{
			printf("Error: Failed to write to source array!\n");
			exit(1);
		}
		err = clFinish(commands);
        CHECK_CL_SUCCESS("clFinish", err);
		tMemory += (double)(clock() - tStartM)/CLOCKS_PER_SEC;

		// Set the arguments to our compute kernel
		//printf("Set the arguments to our compute kernel\n");
		//
		clock_t tStartA = clock();

		err = 0;
		err  = clSetKernelArg(encrypt_kernel, 0, sizeof(cl_mem), &buffer_state);
        CHECK_CL_SUCCESS("clSetKernelArg", err);
		err |= clSetKernelArg(encrypt_kernel, 1, sizeof(cl_mem), &buffer_roundkeys);
        CHECK_CL_SUCCESS("clSetKernelArg", err);
		err |= clSetKernelArg(encrypt_kernel, 2, sizeof(ks.rounds), &ks.rounds);
        CHECK_CL_SUCCESS("clSetKernelArg", err);
		if (err != CL_SUCCESS)
		{
			printf("Error: Failed to set kernel arguments! %d\n", err);
			exit(1);
		}
		tArgument += (double)(clock() - tStartA)/CLOCKS_PER_SEC;


		// Execute the kernel over the entire range of our 1d input data set
		// using the maximum number of work group items for this device
		//
		global = count;
#ifdef DEBUG 
		printf("global is %d\n", global);
#endif

		clock_t tStartE = clock();
		cl_float t = 0.;
		cl_ulong start = 0, end = 0;
		//for (i = 0; i<LOOP; i++) {
			err = clEnqueueNDRangeKernel(commands, encrypt_kernel, 1, NULL, &global, &local, 0, NULL, &event);
            CHECK_CL_SUCCESS("clEnqueueNDRangeKernel", err);
			//err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
			if (err)
			{
				printf("Error: Failed to execute kernel!\n");
				return EXIT_FAILURE;
			}
			err = clWaitForEvents(1, &event);
            CHECK_CL_SUCCESS("clWaitForEvents", err);
			err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
            CHECK_CL_SUCCESS("clGetEventProfilingInfo", err);
			err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
            CHECK_CL_SUCCESS("clGetEventProfilingInfo", err);
			//END-START gives you hints on kind of “pure HW execution time”
			//the resolution of the events is 1e-09 sec
			t += (cl_float)(end - start)*(cl_float)(1e-06);
		//}
		printf("profile time: %f ms",t);
		err = clFinish(commands);
        CHECK_CL_SUCCESS("clFinish", err);
		// Wait for the command commands to get serviced before reading back results
		//
		tExecute += (double)(clock() - tStartE)/CLOCKS_PER_SEC;

		// Read back the results from the device to verify the output
		//
		clock_t tStartR = clock();
		//err = clEnqueueReadBuffer( commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL );  
		err = clEnqueueReadBuffer(commands, buffer_state, CL_FALSE, 0, DATA_SIZE, out, 0, NULL, NULL);
		if (err != CL_SUCCESS)
		{
			printf("Error: Failed to read output array! %d\n", err);
			exit(1);
		}
		printf("input data is\n");
		for (i=0; i<DATA_SIZE; i++) {
			printf("%X ", in[i]);
		}
		printf("encrypted data is\n");
		for (i=0; i<DATA_SIZE; i++) {
			printf("%X ", out[i]);
		}
		tRead += (double)(clock() - tStartR)/CLOCKS_PER_SEC;
	//}




	printf("-----------------------------------------------");
	printf("encrypt_cl Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
	printf("cl Fill data Time taken: %.2fs\n", tFill); 
	printf("cl memory copy Time taken: %.2fs\n", tMemory); 
	printf("cl set Argument Time taken: %.2fs\n", tArgument); 
	printf("cl Execute kernel time taken: %.2fs\n", tExecute); 
	printf("cl read memory taken: %.2fs\n", tRead); 

	// Validate our results
	//
	correct = 0;
	for(i = 0; i < count; i++)
	{
		//if( data[i] - sqrt(keyData[i]) < 0.001)
			correct++;
	}

	// Print a brief summary detailing the results
#ifdef DEBUG 
	printf("Computed '%d/%d' correct values!\n", correct, count);
#endif

	// Shutdown and cleanup
	//
	clReleaseMemObject(buffer_state);
    CHECK_CL_SUCCESS("clReleaseMemObject", err);
	clReleaseMemObject(buffer_roundkeys);
    CHECK_CL_SUCCESS("clReleaseMemObject", err);
	clReleaseProgram(program);
    CHECK_CL_SUCCESS("clReleaseProgram", err);
	clReleaseKernel(encrypt_kernel);
    CHECK_CL_SUCCESS("clReleaseKernel", err);
	clReleaseCommandQueue(commands);
    CHECK_CL_SUCCESS("clReleaseCommandQueue", err);
	clReleaseContext(context);
    CHECK_CL_SUCCESS("clReleaseContext", err);
}
void 
kernel_gpu_opencl_wrapper(	record *records,
							long records_mem,
							knode *knodes,
							long knodes_elem,
							long knodes_mem,

							int order,
							long maxheight,
							int count,

							long *currKnode,
							long *offset,
							int *keys,
							record *ans)
{

	//======================================================================================================================================================150
	//	CPU VARIABLES
	//======================================================================================================================================================150

	// timer
	long long time0;
	long long time1;
	long long time2;
	long long time3;
	long long time4;
	long long time5;
	long long time6;

	time0 = get_time();

	//======================================================================================================================================================150
	//	GPU SETUP
	//======================================================================================================================================================150

	//====================================================================================================100
	//	INITIAL DRIVER OVERHEAD
	//====================================================================================================100

	// cudaThreadSynchronize();

	//====================================================================================================100
	//	COMMON VARIABLES
	//====================================================================================================100

	// common variables
	cl_int error;

	//====================================================================================================100
	//	GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE
	//====================================================================================================100

	// Get the number of available platforms
	cl_uint num_platforms;
	error = clGetPlatformIDs(	0, 
								NULL, 
								&num_platforms);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Get the list of available platforms
	cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
	error = clGetPlatformIDs(	num_platforms, 
								platforms, 
								NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Select the 1st platform
	cl_platform_id platform = platforms[0];

	// Get the name of the selected platform and print it (if there are multiple platforms, choose the first one)
	char pbuf[100];
	error = clGetPlatformInfo(	platform, 
								CL_PLATFORM_VENDOR, 
								sizeof(pbuf), 
								pbuf, 
								NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	printf("Platform: %s\n", pbuf);

	//====================================================================================================100
	//	CREATE CONTEXT FOR THE PLATFORM
	//====================================================================================================100

	// Create context properties for selected platform
	cl_context_properties context_properties[3] = {	CL_CONTEXT_PLATFORM, 
													(cl_context_properties) platform, 
													0};

	// Create context for selected platform being GPU
	cl_context context;
	context = clCreateContextFromType(	context_properties, 
										CL_DEVICE_TYPE_GPU, 
										NULL, 
										NULL, 
										&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	GET DEVICES AVAILABLE FOR THE CONTEXT, SELECT ONE
	//====================================================================================================100

	// Get the number of devices (previousely selected for the context)
	size_t devices_size;
	error = clGetContextInfo(	context, 
								CL_CONTEXT_DEVICES, 
								0, 
								NULL, 
								&devices_size);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Get the list of devices (previousely selected for the context)
	cl_device_id *devices = (cl_device_id *) malloc(devices_size);
	error = clGetContextInfo(	context, 
								CL_CONTEXT_DEVICES, 
								devices_size, 
								devices, 
								NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Select the first device (previousely selected for the context) (if there are multiple devices, choose the first one)
	cl_device_id device;
	device = devices[0];

	// Get the name of the selected device (previousely selected for the context) and print it
	error = clGetDeviceInfo(device, 
							CL_DEVICE_NAME, 
							sizeof(pbuf), 
							pbuf, 
							NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	printf("Device: %s\n", pbuf);

	//====================================================================================================100
	//	CREATE COMMAND QUEUE FOR THE DEVICE
	//====================================================================================================100

	// Create a command queue
	cl_command_queue command_queue;
	command_queue = clCreateCommandQueue(	context, 
											device, 
											0, 
											&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	CREATE PROGRAM, COMPILE IT
	//====================================================================================================100

	// Load kernel source code from file
	const char *source = load_kernel_source("./kernel/kernel_gpu_opencl.cl");
	size_t sourceSize = strlen(source);

	// Create the program
	cl_program program = clCreateProgramWithSource(	context, 
													1, 
													&source, 
													&sourceSize, 
													&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	char clOptions[110];
	//  sprintf(clOptions,"-I../../src");                                                                                 
	sprintf(clOptions,"-I./../");

#ifdef DEFAULT_ORDER
	sprintf(clOptions + strlen(clOptions), " -DDEFAULT_ORDER=%d", DEFAULT_ORDER);
#endif


	// Compile the program
	error = clBuildProgram(	program, 
							1, 
							&device, 
							clOptions, 
							NULL, 
							NULL);
	// Print warnings and errors from compilation
	static char log[65536]; 
	memset(log, 0, sizeof(log));
	clGetProgramBuildInfo(	program, 
							device, 
							CL_PROGRAM_BUILD_LOG, 
							sizeof(log)-1, 
							log, 
							NULL);
	printf("-----OpenCL Compiler Output-----\n");
	if (strstr(log,"warning:") || strstr(log, "error:")) 
		printf("<<<<\n%s\n>>>>\n", log);
	printf("--------------------------------\n");
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Create kernel
	cl_kernel kernel;
	kernel = clCreateKernel(program, 
							"findK", 
							&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	time1 = get_time();

	//====================================================================================================100
	//	END
	//====================================================================================================100

	//======================================================================================================================================================150
	//	GPU MEMORY				(MALLOC)
	//======================================================================================================================================================150

	//====================================================================================================100
	//	DEVICE IN
	//====================================================================================================100

	//==================================================50
	//	recordsD
	//==================================================50

	cl_mem recordsD;
	recordsD = clCreateBuffer(	context, 
								CL_MEM_READ_WRITE, 
								records_mem, 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	knodesD
	//==================================================50

	cl_mem knodesD;
	knodesD = clCreateBuffer(	context, 
								CL_MEM_READ_WRITE, 
								knodes_mem, 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	currKnodeD
	//==================================================50

	cl_mem currKnodeD;
	currKnodeD = clCreateBuffer(	context, 
								CL_MEM_READ_WRITE, 
								count*sizeof(long), 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	offsetD
	//==================================================50

	cl_mem offsetD;
	offsetD = clCreateBuffer(	context, 
								CL_MEM_READ_WRITE, 
								count*sizeof(long), 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	keysD
	//==================================================50

	cl_mem keysD;
	keysD = clCreateBuffer(	context, 
								CL_MEM_READ_WRITE, 
								count*sizeof(long), 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	END
	//==================================================50

	//====================================================================================================100
	//	DEVICE IN/OUT
	//====================================================================================================100

	//==================================================50
	//	ansD
	//==================================================50

	cl_mem ansD;
	ansD = clCreateBuffer(	context, 
								CL_MEM_READ_WRITE, 
								count*sizeof(record), 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	time2 = get_time();

	//==================================================50
	//	END
	//==================================================50

	//====================================================================================================100
	//	END
	//====================================================================================================100

	//======================================================================================================================================================150
	//	GPU MEMORY			COPY
	//======================================================================================================================================================150

	//====================================================================================================100
	//	GPU MEMORY				(MALLOC) COPY IN
	//====================================================================================================100

	//==================================================50
	//	recordsD
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									recordsD,				// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									records_mem,			// size to be copied
									records,				// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	knodesD
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									knodesD,				// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									knodes_mem,				// size to be copied
									knodes,					// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	currKnodeD
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									currKnodeD,				// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									count*sizeof(long),		// size to be copied
									currKnode,				// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	offsetD
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									offsetD,				// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									count*sizeof(long),		// size to be copied
									offset,					// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	keysD
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									keysD,					// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									count*sizeof(int),		// size to be copied
									keys,					// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	END
	//==================================================50

	//====================================================================================================100
	//	DEVICE IN/OUT
	//====================================================================================================100

	//==================================================50
	//	ansD
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									ansD,					// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									count*sizeof(record),	// size to be copied
									ans,					// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	time3 = get_time();

	//==================================================50
	//	END
	//==================================================50

	//====================================================================================================100
	//	END
	//====================================================================================================100

	//======================================================================================================================================================150
	// findK kernel
	//======================================================================================================================================================150

	//====================================================================================================100
	//	Execution Parameters
	//====================================================================================================100

	size_t local_work_size[1];
	local_work_size[0] = order < 1024 ? order : 1024;
	size_t global_work_size[1];
	global_work_size[0] = count * local_work_size[0];

	printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", (int)(global_work_size[0]/local_work_size[0]), (int)local_work_size[0]);

	//====================================================================================================100
	//	Kernel Arguments
	//====================================================================================================100

	clSetKernelArg(	kernel, 
					0, 
					sizeof(long), 
					(void *) &maxheight);
	clSetKernelArg(	kernel, 
					1, 
					sizeof(cl_mem), 
					(void *) &knodesD);
	clSetKernelArg(	kernel, 
					2, 
					sizeof(long), 
					(void *) &knodes_elem);
	clSetKernelArg(	kernel, 
					3, 
					sizeof(cl_mem), 
					(void *) &recordsD);

	clSetKernelArg(	kernel, 
					4, 
					sizeof(cl_mem), 
					(void *) &currKnodeD);
	clSetKernelArg(	kernel, 
					5, 
					sizeof(cl_mem), 
					(void *) &offsetD);
	clSetKernelArg(	kernel, 
					6, 
					sizeof(cl_mem), 
					(void *) &keysD);
	clSetKernelArg(	kernel, 
					7, 
					sizeof(cl_mem), 
					(void *) &ansD);

	//====================================================================================================100
	//	Kernel
	//====================================================================================================100

	error = clEnqueueNDRangeKernel(	command_queue, 
									kernel, 
									1, 
									NULL, 
									global_work_size, 
									local_work_size, 
									0, 
									NULL, 
									NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Wait for all operations to finish NOT SURE WHERE THIS SHOULD GO
	error = clFinish(command_queue);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	time4 = get_time();

	//====================================================================================================100
	//	END
	//====================================================================================================100

	//======================================================================================================================================================150
	//	GPU MEMORY			COPY (CONTD.)
	//======================================================================================================================================================150

	//====================================================================================================100
	//	DEVICE IN/OUT
	//====================================================================================================100

	//==================================================50
	//	ansD
	//==================================================50

	error = clEnqueueReadBuffer(command_queue,				// The command queue.
								ansD,						// The image on the device.
								CL_TRUE,					// Blocking? (ie. Wait at this line until read has finished?)
								0,							// Offset. None in this case.
								count*sizeof(record),		// Size to copy.
								ans,						// The pointer to the image on the host.
								0,							// Number of events in wait list. Not used.
								NULL,						// Event wait list. Not used.
								NULL);						// Event object for determining status. Not used.
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	time5 = get_time();

	//==================================================50
	//	END
	//==================================================50

	//====================================================================================================100
	//	END
	//====================================================================================================100

	//======================================================================================================================================================150
	//	GPU MEMORY DEALLOCATION
	//======================================================================================================================================================150

	// Release kernels...
	clReleaseKernel(kernel);

	// Now the program...
	clReleaseProgram(program);

	// Clean up the device memory...
	clReleaseMemObject(recordsD);
	clReleaseMemObject(knodesD);

	clReleaseMemObject(currKnodeD);
	clReleaseMemObject(offsetD);
	clReleaseMemObject(keysD);
	clReleaseMemObject(ansD);

	// Flush the queue
	error = clFlush(command_queue);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// ...and finally, the queue and context.
	clReleaseCommandQueue(command_queue);

	// ???
	clReleaseContext(context);

	time6 = get_time();

	//======================================================================================================================================================150
	//	DISPLAY TIMING
	//======================================================================================================================================================150

	printf("Time spent in different stages of GPU_CUDA KERNEL:\n");

	printf("%15.12f s, %15.12f % : GPU: SET DEVICE / DRIVER INIT\n",	(float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time6-time0) * 100);
	printf("%15.12f s, %15.12f % : GPU MEM: ALO\n", 					(float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time6-time0) * 100);
	printf("%15.12f s, %15.12f % : GPU MEM: COPY IN\n",					(float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time6-time0) * 100);

	printf("%15.12f s, %15.12f % : GPU: KERNEL\n",						(float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time6-time0) * 100);

	printf("%15.12f s, %15.12f % : GPU MEM: COPY OUT\n",				(float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time6-time0) * 100);
	printf("%15.12f s, %15.12f % : GPU MEM: FRE\n", 					(float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time6-time0) * 100);

	printf("Total time:\n");
	printf("%.12f s\n", 												(float) (time6-time0) / 1000000);

	//======================================================================================================================================================150
	//	END
	//======================================================================================================================================================150

}
Пример #7
0
int 
kernel_gpu_opencl_wrapper(	int xmax,
							int workload,

							fp ***y,
							fp **x,
							fp **params,
							fp *com)
{

	//======================================================================================================================================================150
	//	VARIABLES
	//======================================================================================================================================================150

	long long time0;
	long long time1;
	long long time2;
	long long time3;
	long long time4;
	long long time5;
	long long timecopyin = 0;
	long long timekernel = 0;
	long long timecopyout = 0;
	long long timeother;
	//stage1_start
	time0 = get_time();

	int i;

	//======================================================================================================================================================150
	//	GPU SETUP
	//======================================================================================================================================================150

	//====================================================================================================100
	//	COMMON VARIABLES
	//====================================================================================================100

	// common variables
	cl_int error;

	//====================================================================================================100
	//	GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE
	//====================================================================================================100

	// Get the number of available platforms
	cl_uint num_platforms;
	error = clGetPlatformIDs(	0, 
								NULL, 
								&num_platforms);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Get the list of available platforms
	cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
	error = clGetPlatformIDs(	num_platforms, 
								platforms, 
								NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Select the 1st platform
	cl_platform_id platform = platforms[0];

	// Get the name of the selected platform and print it (if there are multiple platforms, choose the first one)
	char pbuf[100];
	error = clGetPlatformInfo(	platform, 
								CL_PLATFORM_VENDOR, 
								sizeof(pbuf), 
								pbuf, 
								NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	printf("Platform: %s\n", pbuf);

	//====================================================================================================100
	//	CREATE CONTEXT FOR THE PLATFORM
	//====================================================================================================100

	// Create context properties for selected platform
	cl_context_properties context_properties[3] = {	CL_CONTEXT_PLATFORM, 
													(cl_context_properties) platform, 
													0};

	// Create context for selected platform being GPU
	cl_context context;
	context = clCreateContextFromType(	context_properties, 
										CL_DEVICE_TYPE_GPU, 
										NULL, 
										NULL, 
										&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	GET DEVICES AVAILABLE FOR THE CONTEXT, SELECT ONE
	//====================================================================================================100

	// Get the number of devices (previousely selected for the context)
	size_t devices_size;
	error = clGetContextInfo(	context, 
								CL_CONTEXT_DEVICES, 
								0, 
								NULL, 
								&devices_size);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Get the list of devices (previousely selected for the context)
	cl_device_id *devices = (cl_device_id *) malloc(devices_size);
	error = clGetContextInfo(	context, 
								CL_CONTEXT_DEVICES, 
								devices_size, 
								devices, 
								NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Select the first device (previousely selected for the context) (if there are multiple devices, choose the first one)
	cl_device_id device;
	device = devices[0];

	// Get the name of the selected device (previousely selected for the context) and print it
	error = clGetDeviceInfo(device, 
							CL_DEVICE_NAME, 
							sizeof(pbuf), 
							pbuf, 
							NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	printf("Device: %s\n", pbuf);

	//====================================================================================================100
	//	CREATE COMMAND QUEUE FOR THE DEVICE
	//====================================================================================================100

	// Create a command queue
	cl_command_queue command_queue;
	command_queue = clCreateCommandQueue(	context, 
											device, 
											0, 
											&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	CRATE PROGRAM, COMPILE IT
	//====================================================================================================100

	// Load kernel source code from file
	const char *source = load_kernel_source("./kernel/kernel_gpu_opencl.cl");
	size_t sourceSize = strlen(source);

	// Create the program
	cl_program program = clCreateProgramWithSource(	context, 
													1, 
													&source, 
													&sourceSize, 
													&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Compile the program
	error = clBuildProgram(	program, 
							1, 
							&device, 
							"-I./../", 
							NULL, 
							NULL);
	// Print warnings and errors from compilation
	static char log[65536]; 
	memset(log, 0, sizeof(log));
	clGetProgramBuildInfo(	program, 
							device, 
							CL_PROGRAM_BUILD_LOG, 
							sizeof(log)-1, 
							log, 
							NULL);
	printf("-----OpenCL Compiler Output-----\n");
	if (strstr(log,"warning:") || strstr(log, "error:")) 
		printf("<<<<\n%s\n>>>>\n", log);
	printf("--------------------------------\n");
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Create kernel
	cl_kernel kernel;
	kernel = clCreateKernel(program, 
							"kernel_gpu_opencl", 
							&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	INITIAL DRIVER OVERHEAD
	//====================================================================================================100

	// cudaThreadSynchronize();

	time1 = get_time();
//	double start_timer = omp_get_wtime();
	//======================================================================================================================================================150
	//	ALLOCATE MEMORY
	//======================================================================================================================================================150

	//====================================================================================================100
	//	d_initvalu_mem
	//====================================================================================================100

	int d_initvalu_mem;
	d_initvalu_mem = EQUATIONS * sizeof(fp);
	cl_mem d_initvalu;
	d_initvalu = clCreateBuffer(context,					// context
								CL_MEM_READ_WRITE,			// flags
								d_initvalu_mem,				// size of buffer
								NULL,						// host pointer (optional)
								&error );					// returned error
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	d_finavalu_mem
	//====================================================================================================100

	int d_finavalu_mem;
	d_finavalu_mem = EQUATIONS * sizeof(fp);
	cl_mem d_finavalu;
	d_finavalu = clCreateBuffer(context, 
								CL_MEM_READ_WRITE, 
								d_finavalu_mem, 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	d_params_mem
	//====================================================================================================100

	int d_params_mem;
	d_params_mem = PARAMETERS * sizeof(fp);
	cl_mem d_params;
	d_params = clCreateBuffer(	context, 
								CL_MEM_READ_WRITE, 
								d_params_mem, 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	d_com_mem
	//====================================================================================================100

	int d_com_mem;
	d_com_mem = 3 * sizeof(fp);
	cl_mem d_com;
	d_com = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							d_com_mem, 
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	
	time2 = get_time();

	//======================================================================================================================================================150
	//	EXECUTION
	//======================================================================================================================================================150

	int status;

	for(i=0; i<workload; i++){

		status = solver(	y[i],
							x[i],
							xmax,
							params[i],
							com,

							d_initvalu,
							d_finavalu,
							d_params,
							d_com,

							command_queue,
							kernel,

							&timecopyin,
							&timekernel,
							&timecopyout);

		if(status !=0){
			printf("STATUS: %d\n", status);
		}

	}

	// // // print results
	// // int k;
	// // for(i=0; i<workload; i++){
		// // printf("WORKLOAD %d:\n", i);
		// // for(j=0; j<(xmax+1); j++){
			// // printf("\tTIME %d:\n", j);
			// // for(k=0; k<EQUATIONS; k++){
				// // printf("\t\ty[%d][%d][%d]=%13.10f\n", i, j, k, y[i][j][k]);
			// // }
		// // }
	// // }
//	double end_timer = omp_get_wtime();
//	printf("Time3-Time1 : %.8f\n",(end_timer - start_timer));
	time3 = get_time();

	//======================================================================================================================================================150
	//	FREE GPU MEMORY
	//======================================================================================================================================================150

	// Release kernels...
	clReleaseKernel(kernel);

	// Now the program...
	clReleaseProgram(program);

	// Clean up the device memory...
	clReleaseMemObject(d_initvalu);
	clReleaseMemObject(d_finavalu);
	clReleaseMemObject(d_params);
	clReleaseMemObject(d_com);

	// Flush the queue
	error = clFlush(command_queue);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// ...and finally, the queue and context.
	clReleaseCommandQueue(command_queue);

	// ???
	clReleaseContext(context);

	time4= get_time();

	//======================================================================================================================================================150
	//	DISPLAY TIMING
	//======================================================================================================================================================150

	printf("Time spent in different stages of the application:\n");
	printf("%15.12f s, %15.12f % : CPU: GPU SETUP\n", 								(float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time4-time0) * 100);
	printf("%15.12f s, %15.12f % : CPU: ALLOCATE GPU MEMORY\n", 					(float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time4-time0) * 100);
	printf("%15.12f s, %15.12f % : GPU: COMPUTATION\n", 							(float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time4-time0) * 100);

	printf("\tGPU: COMPUTATION Components:\n");
	printf("\t%15.12f s, %15.12f % : GPU: COPY DATA IN\n", 							(float) (timecopyin) / 1000000, (float) (timecopyin) / (float) (time4-time0) * 100);
	printf("\t%15.12f s, %15.12f % : GPU: KERNEL\n", 								(float) (timekernel) / 1000000, (float) (timekernel) / (float) (time4-time0) * 100);
	printf("\t%15.12f s, %15.12f % : GPU: COPY DATA OUT\n", 						(float) (timecopyout) / 1000000, (float) (timecopyout) / (float) (time4-time0) * 100);
	timeother = time3-time2-timecopyin-timekernel-timecopyout;
	printf("\t%15.12f s, %15.12f % : GPU: OTHER\n", 								(float) (timeother) / 1000000, (float) (timeother) / (float) (time4-time0) * 100);

	printf("%15.12f s, %15.12f % : CPU: FREE GPU MEMORY\n", 						(float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time4-time0) * 100);
	printf("Total time 1:\n");
	printf("%.12f s\n", 															(float) (time4-time0) / 1000000);

	//======================================================================================================================================================150
	//	RETURN
	//======================================================================================================================================================150

	return 0;

	//======================================================================================================================================================150
	//	END
	//======================================================================================================================================================150

}