Esempio n. 1
0
int compute_tran_temp(cl_mem MatrixPower, cl_mem MatrixTemp[2], int col, int row, \
		int total_iterations, int num_iterations, int blockCols, int blockRows, int borderCols, int borderRows,
		float *TempCPU, float *PowerCPU) 
{ 
	
	float grid_height = chip_height / row;
	float grid_width = chip_width / col;

	float Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * grid_width * grid_height;
	float Rx = grid_width / (2.0 * K_SI * t_chip * grid_height);
	float Ry = grid_height / (2.0 * K_SI * t_chip * grid_width);
	float Rz = t_chip / (K_SI * grid_height * grid_width);

	float max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI);
	float step = PRECISION / max_slope;
	int t;

	int src = 0, dst = 1;
	
	cl_int error;
	
	// Determine GPU work group grid
	size_t global_work_size[2];
	global_work_size[0] = BLOCK_SIZE * blockCols;
	global_work_size[1] = BLOCK_SIZE * blockRows;
	size_t local_work_size[2];
	local_work_size[0] = BLOCK_SIZE;
	local_work_size[1] = BLOCK_SIZE;
	
	
	long long start_time = get_time();	
	
	for (t = 0; t < total_iterations; t += num_iterations) {
		
		// Specify kernel arguments
		int iter = MIN(num_iterations, total_iterations - t);
		clSetKernelArg(kernel, 0, sizeof(int), (void *) &iter);
		clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &MatrixPower);
		clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &MatrixTemp[src]);
		clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &MatrixTemp[dst]);
		clSetKernelArg(kernel, 4, sizeof(int), (void *) &col);
		clSetKernelArg(kernel, 5, sizeof(int), (void *) &row);
		clSetKernelArg(kernel, 6, sizeof(int), (void *) &borderCols);
		clSetKernelArg(kernel, 7, sizeof(int), (void *) &borderRows);
		clSetKernelArg(kernel, 8, sizeof(float), (void *) &Cap);
		clSetKernelArg(kernel, 9, sizeof(float), (void *) &Rx);
		clSetKernelArg(kernel, 10, sizeof(float), (void *) &Ry);
		clSetKernelArg(kernel, 11, sizeof(float), (void *) &Rz);
		clSetKernelArg(kernel, 12, sizeof(float), (void *) &step);
		
		// Launch kernel
		error = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL);
		if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
		
		// Flush the queue
		error = clFlush(command_queue);
		if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
		
		// Swap input and output GPU matrices
		src = 1 - src;
		dst = 1 - dst;
	}
	
	// Wait for all operations to finish
	error = clFinish(command_queue);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	long long end_time = get_time();
	long long total_time = (end_time - start_time);	
	printf("\nKernel time: %.3f seconds\n", ((float) total_time) / (1000*1000));
	
	return src;
}
void 
kernel_gpu_opencl_wrapper(	fp* image,											// input image
							int Nr,												// IMAGE nbr of rows
							int Nc,												// IMAGE nbr of cols
							long Ne,											// IMAGE nbr of elem
							int niter,											// nbr of iterations
							fp lambda,											// update step size
							long NeROI,											// ROI nbr of elements
							int* iN,
							int* iS,
							int* jE,
							int* jW,
							int iter,											// primary loop
							int mem_size_i,
							int mem_size_j)
{

	//======================================================================================================================================================150
	//	GPU SETUP
	//======================================================================================================================================================150

	//====================================================================================================100
	//	COMMON VARIABLES
	//====================================================================================================100

	// common variables
	cl_int error;

	//====================================================================================================100
	//	GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE
	//====================================================================================================100

	// Get the number of available platforms
	cl_uint num_platforms;
	error = clGetPlatformIDs(	0, 
								NULL, 
								&num_platforms);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Get the list of available platforms
	cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
	error = clGetPlatformIDs(	num_platforms, 
								platforms, 
								NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Select the 1st platform
	cl_platform_id platform = platforms[0];

	// Get the name of the selected platform and print it (if there are multiple platforms, choose the first one)
	char pbuf[100];
	error = clGetPlatformInfo(	platform, 
								CL_PLATFORM_VENDOR, 
								sizeof(pbuf), 
								pbuf, 
								NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	printf("Platform: %s\n", pbuf);

	//====================================================================================================100
	//	CREATE CONTEXT FOR THE PLATFORM
	//====================================================================================================100

	// Create context properties for selected platform
	cl_context_properties context_properties[3] = {	CL_CONTEXT_PLATFORM, 
													(cl_context_properties) platform, 
													0};

	// Create context for selected platform being GPU
	cl_context context;
	context = clCreateContextFromType(	context_properties, 
										CL_DEVICE_TYPE_ALL, 
										NULL, 
										NULL, 
										&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	GET DEVICES AVAILABLE FOR THE CONTEXT, SELECT ONE
	//====================================================================================================100

	// Get the number of devices (previousely selected for the context)
	size_t devices_size;
	error = clGetContextInfo(	context, 
								CL_CONTEXT_DEVICES, 
								0, 
								NULL, 
								&devices_size);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Get the list of devices (previousely selected for the context)
	cl_device_id *devices = (cl_device_id *) malloc(devices_size);
	error = clGetContextInfo(	context, 
								CL_CONTEXT_DEVICES, 
								devices_size, 
								devices, 
								NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Select the first device (previousely selected for the context) (if there are multiple devices, choose the first one)
	cl_device_id device;
	device = devices[0];

	// Get the name of the selected device (previousely selected for the context) and print it
	error = clGetDeviceInfo(device, 
							CL_DEVICE_NAME, 
							sizeof(pbuf), 
							pbuf, 
							NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	printf("Device: %s\n", pbuf);

	//====================================================================================================100
	//	CREATE COMMAND QUEUE FOR THE DEVICE
	//====================================================================================================100

	// Create a command queue
	cl_command_queue command_queue;
	command_queue = clCreateCommandQueue(	context, 
											device, 
											0, 
											&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	CREATE PROGRAM, COMPILE IT
	//====================================================================================================100

	// Load kernel source code from file
	const char *source = load_kernel_source("./kernel/kernel_gpu_opencl.cl");
	size_t sourceSize = strlen(source);

	// Create the program
	cl_program program = clCreateProgramWithSource(	context, 
													1, 
													&source, 
													&sourceSize, 
													&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

  char clOptions[150];
//  sprintf(clOptions,"-I../../src");                                                    
  sprintf(clOptions,"-I.");
#ifdef RD_WG_SIZE
  sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE=%d", RD_WG_SIZE);
#endif
#ifdef RD_WG_SIZE_0
  sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE_0=%d", RD_WG_SIZE_0);
#endif
#ifdef RD_WG_SIZE_0_0
  sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE_0_0=%d", RD_WG_SIZE_0_0);
#endif

	// Compile the program
	error = clBuildProgram(	program, 
							1, 
							&device, 
							clOptions, 
							NULL, 
							NULL);
	// Print warnings and errors from compilation
	static char log[65536]; 
	memset(log, 0, sizeof(log));
	clGetProgramBuildInfo(	program, 
							device, 
							CL_PROGRAM_BUILD_LOG, 
							sizeof(log)-1, 
							log, 
							NULL);
	printf("-----OpenCL Compiler Output-----\n");
	if (strstr(log,"warning:") || strstr(log, "error:")) 
		printf("<<<<\n%s\n>>>>\n", log);
	printf("--------------------------------\n");
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	CREATE Kernels
	//====================================================================================================100

	// Extract kernel
	cl_kernel extract_kernel;
	extract_kernel = clCreateKernel(program, 
									"extract_kernel", 
									&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Prepare kernel
	cl_kernel prepare_kernel;
	prepare_kernel = clCreateKernel(program, 
									"prepare_kernel", 
									&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Reduce kernel
	cl_kernel reduce_kernel;
	reduce_kernel = clCreateKernel(	program, 
									"reduce_kernel", 
									&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// SRAD kernel
	cl_kernel srad_kernel;
	srad_kernel = clCreateKernel(	program, 
									"srad_kernel", 
									&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// SRAD2 kernel
	cl_kernel srad2_kernel;
	srad2_kernel = clCreateKernel(	program, 
									"srad2_kernel", 
									&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Compress kernel
	cl_kernel compress_kernel;
	compress_kernel = clCreateKernel(	program, 
										"compress_kernel", 
										&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	TRIGGERING INITIAL DRIVER OVERHEAD
	//====================================================================================================100

	// cudaThreadSynchronize();		// the above does it

	//======================================================================================================================================================150
	// 	GPU VARIABLES
	//======================================================================================================================================================150

	// CUDA kernel execution parameters
	int blocks_x;

	//======================================================================================================================================================150
	// 	ALLOCATE MEMORY IN GPU
	//======================================================================================================================================================150

	//====================================================================================================100
	// common memory size
	//====================================================================================================100

	int mem_size;															// matrix memory size
	mem_size = sizeof(fp) * Ne;												// get the size of float representation of input IMAGE

	//====================================================================================================100
	// allocate memory for entire IMAGE on DEVICE
	//====================================================================================================100

	cl_mem d_I;
	d_I = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							mem_size,
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	// allocate memory for coordinates on DEVICE
	//====================================================================================================100

	cl_mem d_iN;
	d_iN = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							mem_size_i,
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	cl_mem d_iS;
	d_iS = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							mem_size_i,
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	cl_mem d_jE;
	d_jE = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							mem_size_j,
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	cl_mem d_jW;
	d_jW = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							mem_size_j,
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	// allocate memory for derivatives
	//====================================================================================================100

	cl_mem d_dN;
	d_dN = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							mem_size,
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	cl_mem d_dS;
	d_dS = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							mem_size,
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	cl_mem d_dW;
	d_dW = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							mem_size,
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	cl_mem d_dE;
	d_dE = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							mem_size,
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	// allocate memory for coefficient on DEVICE
	//====================================================================================================100

	cl_mem d_c;
	d_c = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							mem_size,
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	// allocate memory for partial sums on DEVICE
	//====================================================================================================100

	cl_mem d_sums;
	d_sums = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							mem_size,
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	cl_mem d_sums2;
	d_sums2 = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							mem_size,
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	// End
	//====================================================================================================100

	//======================================================================================================================================================150
	// 	COPY INPUT TO CPU
	//======================================================================================================================================================150

	//====================================================================================================100
	// Image
	//====================================================================================================100

	error = clEnqueueWriteBuffer(	command_queue, 
									d_I, 
									1, 
									0, 
									mem_size, 
									image, 
									0, 
									0, 
									0);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	// coordinates
	//====================================================================================================100

	error = clEnqueueWriteBuffer(	command_queue, 
									d_iN, 
									1, 
									0, 
									mem_size_i, 
									iN, 
									0, 
									0, 
									0);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	error = clEnqueueWriteBuffer(	command_queue, 
									d_iS, 
									1, 
									0, 
									mem_size_i, 
									iS, 
									0, 
									0, 
									0);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	error = clEnqueueWriteBuffer(	command_queue, 
									d_jE, 
									1, 
									0, 
									mem_size_j, 
									jE, 
									0, 
									0, 
									0);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	error = clEnqueueWriteBuffer(	command_queue, 
									d_jW, 
									1, 
									0, 
									mem_size_j, 
									jW, 
									0, 
									0, 
									0);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	// End
	//====================================================================================================100

	//======================================================================================================================================================150
	// 	KERNEL EXECUTION PARAMETERS
	//======================================================================================================================================================150

	// threads
	size_t local_work_size[1];
	local_work_size[0] = NUMBER_THREADS;

	// workgroups
	int blocks_work_size;
	size_t global_work_size[1];
	blocks_x = Ne/(int)local_work_size[0];
	if (Ne % (int)local_work_size[0] != 0){												// compensate for division remainder above by adding one grid
		blocks_x = blocks_x + 1;																	
	}
	blocks_work_size = blocks_x;
	global_work_size[0] = blocks_work_size * local_work_size[0];						// define the number of blocks in the grid

	printf("max # of workgroups = %d, # of threads/workgroup = %d (ensure that device can handle)\n", (int)(global_work_size[0]/local_work_size[0]), (int)local_work_size[0]);

	//======================================================================================================================================================150
	// 	Extract Kernel - SCALE IMAGE DOWN FROM 0-255 TO 0-1 AND EXTRACT
	//======================================================================================================================================================150

	//====================================================================================================100
	//	set arguments
	//====================================================================================================100

	error = clSetKernelArg(	extract_kernel, 
							0, 
							sizeof(long), 
							(void *) &Ne);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	extract_kernel, 
							1, 
							sizeof(cl_mem), 
							(void *) &d_I);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	launch kernel
	//====================================================================================================100

	error = clEnqueueNDRangeKernel(	command_queue, 
									extract_kernel, 
									1, 
									NULL, 
									global_work_size, 
									local_work_size, 
									0, 
									NULL, 
									NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	Synchronization - wait for all operations in the command queue so far to finish
	//====================================================================================================100

	// error = clFinish(command_queue);
	// if (error != CL_SUCCESS) 
		// fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	End
	//====================================================================================================100

	//======================================================================================================================================================150
	// 	WHAT IS CONSTANT IN COMPUTATION LOOP
	//======================================================================================================================================================150

	//====================================================================================================100
	//	Prepare Kernel
	//====================================================================================================100

	error = clSetKernelArg(	prepare_kernel, 
							0, 
							sizeof(long), 
							(void *) &Ne);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	prepare_kernel, 
							1, 
							sizeof(cl_mem), 
							(void *) &d_I);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	prepare_kernel, 
							2, 
							sizeof(cl_mem), 
							(void *) &d_sums);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	prepare_kernel, 
							3, 
							sizeof(cl_mem), 
							(void *) &d_sums2);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	Reduce Kernel
	//====================================================================================================100

	int blocks2_x;
	int blocks2_work_size;
	size_t global_work_size2[1];
	long no;
	int mul;
	int mem_size_single = sizeof(fp) * 1;
	fp total;
	fp total2;
	fp meanROI;
	fp meanROI2;
	fp varROI;
	fp q0sqr;

	error = clSetKernelArg(	reduce_kernel, 
							0, 
							sizeof(long), 
							(void *) &Ne);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	reduce_kernel, 
							3, 
							sizeof(cl_mem), 
							(void *) &d_sums);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	reduce_kernel, 
							4, 
							sizeof(cl_mem), 
							(void *) &d_sums2);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	SRAD Kernel
	//====================================================================================================100

	error = clSetKernelArg(	srad_kernel, 
							0, 
							sizeof(fp), 
							(void *) &lambda);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad_kernel, 
							1, 
							sizeof(int), 
							(void *) &Nr);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad_kernel, 
							2, 
							sizeof(int), 
							(void *) &Nc);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad_kernel, 
							3, 
							sizeof(long), 
							(void *) &Ne);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad_kernel, 
							4, 
							sizeof(cl_mem), 
							(void *) &d_iN);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad_kernel, 
							5, 
							sizeof(cl_mem), 
							(void *) &d_iS);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad_kernel, 
							6, 
							sizeof(cl_mem), 
							(void *) &d_jE);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad_kernel, 
							7, 
							sizeof(cl_mem), 
							(void *) &d_jW);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad_kernel, 
							8, 
							sizeof(cl_mem), 
							(void *) &d_dN);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad_kernel, 
							9, 
							sizeof(cl_mem), 
							(void *) &d_dS);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad_kernel, 
							10, 
							sizeof(cl_mem), 
							(void *) &d_dW);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad_kernel, 
							11, 
							sizeof(cl_mem), 
							(void *) &d_dE);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad_kernel, 
							13, 
							sizeof(cl_mem), 
							(void *) &d_c);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad_kernel, 
							14, 
							sizeof(cl_mem), 
							(void *) &d_I);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	SRAD2 Kernel
	//====================================================================================================100

	error = clSetKernelArg(	srad2_kernel, 
							0, 
							sizeof(fp), 
							(void *) &lambda);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad2_kernel, 
							1, 
							sizeof(int), 
							(void *) &Nr);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad2_kernel, 
							2, 
							sizeof(int), 
							(void *) &Nc);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad2_kernel, 
							3, 
							sizeof(long), 
							(void *) &Ne);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad2_kernel, 
							4, 
							sizeof(cl_mem), 
							(void *) &d_iN);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad2_kernel, 
							5, 
							sizeof(cl_mem), 
							(void *) &d_iS);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad2_kernel, 
							6, 
							sizeof(cl_mem), 
							(void *) &d_jE);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad2_kernel, 
							7, 
							sizeof(cl_mem), 
							(void *) &d_jW);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad2_kernel, 
							8, 
							sizeof(cl_mem), 
							(void *) &d_dN);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad2_kernel, 
							9, 
							sizeof(cl_mem), 
							(void *) &d_dS);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad2_kernel, 
							10, 
							sizeof(cl_mem), 
							(void *) &d_dW);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad2_kernel, 
							11, 
							sizeof(cl_mem), 
							(void *) &d_dE);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad2_kernel, 
							12, 
							sizeof(cl_mem), 
							(void *) &d_c);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	srad2_kernel, 
							13, 
							sizeof(cl_mem), 
							(void *) &d_I);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	End
	//====================================================================================================100

	//======================================================================================================================================================150
	// 	COMPUTATION
	//======================================================================================================================================================150

	printf("Iterations Progress: ");

	// execute main loop
	for (iter=0; iter<niter; iter++){										// do for the number of iterations input parameter

		printf("%d ", iter);
		fflush(NULL);

		//====================================================================================================100
		// Prepare kernel
		//====================================================================================================100

		// launch kernel
		error = clEnqueueNDRangeKernel(	command_queue, 
										prepare_kernel, 
										1, 
										NULL, 
										global_work_size, 
										local_work_size, 
										0, 
										NULL, 
										NULL);
		if (error != CL_SUCCESS) 
			fatal_CL(error, __LINE__);

		// synchronize
		// error = clFinish(command_queue);
		// if (error != CL_SUCCESS) 
			// fatal_CL(error, __LINE__);

		//====================================================================================================100
		//	Reduce Kernel - performs subsequent reductions of sums
		//====================================================================================================100

		// initial values
		blocks2_work_size = blocks_work_size;							// original number of blocks
		global_work_size2[0] = global_work_size[0];
		no = Ne;														// original number of sum elements
		mul = 1;														// original multiplier

		// loop
		while(blocks2_work_size != 0){

			// set arguments that were uptaded in this loop
			error = clSetKernelArg(	reduce_kernel, 
									1, 
									sizeof(long), 
									(void *) &no);
			if (error != CL_SUCCESS) 
				fatal_CL(error, __LINE__);
			error = clSetKernelArg(	reduce_kernel, 
									2, 
									sizeof(int), 
									(void *) &mul);
			if (error != CL_SUCCESS) 
				fatal_CL(error, __LINE__);

			error = clSetKernelArg(	reduce_kernel, 
									5, 
									sizeof(int), 
									(void *) &blocks2_work_size);
			if (error != CL_SUCCESS) 
				fatal_CL(error, __LINE__);

			// launch kernel
			error = clEnqueueNDRangeKernel(	command_queue, 
											reduce_kernel, 
											1, 
											NULL, 
											global_work_size2, 
											local_work_size, 
											0, 
											NULL, 
											NULL);
			if (error != CL_SUCCESS) 
				fatal_CL(error, __LINE__);

			// synchronize
			// error = clFinish(command_queue);
			// if (error != CL_SUCCESS) 
				// fatal_CL(error, __LINE__);

			// update execution parameters
			no = blocks2_work_size;												// get current number of elements
			if(blocks2_work_size == 1){
				blocks2_work_size = 0;
			}
			else{
				mul = mul * NUMBER_THREADS;										// update the increment
				blocks_x = blocks2_work_size/(int)local_work_size[0];			// number of blocks
				if (blocks2_work_size % (int)local_work_size[0] != 0){			// compensate for division remainder above by adding one grid
					blocks_x = blocks_x + 1;
				}
				blocks2_work_size = blocks_x;
				global_work_size2[0] = blocks2_work_size * (int)local_work_size[0];
			}

		}

		// copy total sums to device
		error = clEnqueueReadBuffer(command_queue,
									d_sums,
									CL_TRUE,
									0,
									mem_size_single,
									&total,
									0,
									NULL,
									NULL);
		if (error != CL_SUCCESS) 
			fatal_CL(error, __LINE__);

		error = clEnqueueReadBuffer(command_queue,
									d_sums2,
									CL_TRUE,
									0,
									mem_size_single,
									&total2,
									0,
									NULL,
									NULL);
		if (error != CL_SUCCESS) 
			fatal_CL(error, __LINE__);

		//====================================================================================================100
		// calculate statistics
		//====================================================================================================100
		
		meanROI	= total / (fp)(NeROI);										// gets mean (average) value of element in ROI
		meanROI2 = meanROI * meanROI;										//
		varROI = (total2 / (fp)(NeROI)) - meanROI2;							// gets variance of ROI								
		q0sqr = varROI / meanROI2;											// gets standard deviation of ROI

		//====================================================================================================100
		// execute srad kernel
		//====================================================================================================100

		// set arguments that were uptaded in this loop
		error = clSetKernelArg(	srad_kernel, 
							12, 
							sizeof(fp), 
							(void *) &q0sqr);
		if (error != CL_SUCCESS) 
			fatal_CL(error, __LINE__);

		// launch kernel
		error = clEnqueueNDRangeKernel(	command_queue, 
										srad_kernel, 
										1, 
										NULL, 
										global_work_size, 
										local_work_size, 
										0, 
										NULL, 
										NULL);
		if (error != CL_SUCCESS) 
			fatal_CL(error, __LINE__);

		// synchronize
		// error = clFinish(command_queue);
		// if (error != CL_SUCCESS) 
			// fatal_CL(error, __LINE__);

		//====================================================================================================100
		// execute srad2 kernel
		//====================================================================================================100

		// launch kernel
		error = clEnqueueNDRangeKernel(	command_queue, 
										srad2_kernel, 
										1, 
										NULL, 
										global_work_size, 
										local_work_size, 
										0, 
										NULL, 
										NULL);
		if (error != CL_SUCCESS) 
			fatal_CL(error, __LINE__);

		// synchronize
		// error = clFinish(command_queue);
		// if (error != CL_SUCCESS) 
			// fatal_CL(error, __LINE__);

		//====================================================================================================100
		// End
		//====================================================================================================100

	}

	printf("\n");

	//======================================================================================================================================================150
	// 	Compress Kernel - SCALE IMAGE UP FROM 0-1 TO 0-255 AND COMPRESS
	//======================================================================================================================================================150

	//====================================================================================================100
	// set parameters
	//====================================================================================================100

	error = clSetKernelArg(	compress_kernel, 
							0, 
							sizeof(long), 
							(void *) &Ne);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clSetKernelArg(	compress_kernel, 
							1, 
							sizeof(cl_mem), 
							(void *) &d_I);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	// launch kernel
	//====================================================================================================100

	error = clEnqueueNDRangeKernel(	command_queue, 
									compress_kernel, 
									1, 
									NULL, 
									global_work_size, 
									local_work_size, 
									0, 
									NULL, 
									NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	// synchronize
	//====================================================================================================100

	error = clFinish(command_queue);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	End
	//====================================================================================================100

	//======================================================================================================================================================150
	// 	COPY RESULTS BACK TO CPU
	//======================================================================================================================================================150

	error = clEnqueueReadBuffer(command_queue,
								d_I,
								CL_TRUE,
								0,
								mem_size,
								image,
								0,
								NULL,
								NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// int i;
	// for(i=0; i<100; i++){
		// printf("%f ", image[i]);
	// }

	//======================================================================================================================================================150
	// 	FREE MEMORY
	//======================================================================================================================================================150

	// OpenCL structures
	error = clReleaseKernel(extract_kernel);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseKernel(prepare_kernel);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseKernel(reduce_kernel);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseKernel(srad_kernel);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseKernel(srad2_kernel);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseKernel(compress_kernel);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseProgram(program);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// common_change
	error = clReleaseMemObject(d_I);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseMemObject(d_c);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	error = clReleaseMemObject(d_iN);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseMemObject(d_iS);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseMemObject(d_jE);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseMemObject(d_jW);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	error = clReleaseMemObject(d_dN);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseMemObject(d_dS);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseMemObject(d_dE);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseMemObject(d_dW);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	error = clReleaseMemObject(d_sums);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseMemObject(d_sums2);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// OpenCL structures
	error = clFlush(command_queue);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseCommandQueue(command_queue);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	error = clReleaseContext(context);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//======================================================================================================================================================150
	// 	End
	//======================================================================================================================================================150

}
Esempio n. 3
0
int main(int argc, char** argv) {


	cl_int error;
	cl_uint num_platforms;
	
	// Get the number of platforms
	error = clGetPlatformIDs(0, NULL, &num_platforms);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	// Get the list of platforms
	cl_platform_id* platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * num_platforms);
	error = clGetPlatformIDs(num_platforms, platforms, NULL);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	// Print the chosen platform (if there are multiple platforms, choose the first one)
	cl_platform_id platform = platforms[0];
	char pbuf[100];
	error = clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	printf("Platform: %s\n", pbuf);
	
	// Create a GPU context
	cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0};
    context = clCreateContextFromType(context_properties, CL_DEVICE_TYPE_GPU, NULL, NULL, &error);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	// Get and print the chosen device (if there are multiple devices, choose the first one)
	size_t devices_size;
	error = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &devices_size);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	cl_device_id *devices = (cl_device_id *) malloc(devices_size);
	error = clGetContextInfo(context, CL_CONTEXT_DEVICES, devices_size, devices, NULL);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	device = devices[0];
	error = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(pbuf), pbuf, NULL);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	printf("Device: %s\n", pbuf);
	
	// Create a command queue
	command_queue = clCreateCommandQueue(context, device, 0, &error);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	

    int size;
    int grid_rows,grid_cols = 0;
    float *FilesavingTemp,*FilesavingPower; //,*MatrixOut; 
    char *tfile, *pfile, *ofile;
    
    int total_iterations = 60;
    int pyramid_height = 1; // number of iterations
	
	if (argc < 7)
		usage(argc, argv);
	if((grid_rows = atoi(argv[1]))<=0||
	   (grid_cols = atoi(argv[1]))<=0||
       (pyramid_height = atoi(argv[2]))<=0||
       (total_iterations = atoi(argv[3]))<=0)
		usage(argc, argv);
		
	tfile=argv[4];
    pfile=argv[5];
    ofile=argv[6];
	
    size=grid_rows*grid_cols;

    // --------------- pyramid parameters --------------- 
    int borderCols = (pyramid_height)*EXPAND_RATE/2;
    int borderRows = (pyramid_height)*EXPAND_RATE/2;
    int smallBlockCol = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE;
    int smallBlockRow = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE;
    int blockCols = grid_cols/smallBlockCol+((grid_cols%smallBlockCol==0)?0:1);
    int blockRows = grid_rows/smallBlockRow+((grid_rows%smallBlockRow==0)?0:1);

    FilesavingTemp = (float *) malloc(size*sizeof(float));
    FilesavingPower = (float *) malloc(size*sizeof(float));
    // MatrixOut = (float *) calloc (size, sizeof(float));

    if( !FilesavingPower || !FilesavingTemp) // || !MatrixOut)
        fatal("unable to allocate memory");
	
	// Read input data from disk
    readinput(FilesavingTemp, grid_rows, grid_cols, tfile);
    readinput(FilesavingPower, grid_rows, grid_cols, pfile);
	
	// Load kernel source from file
	const char *source = load_kernel_source("hotspot_kernel.cl");
	size_t sourceSize = strlen(source);
	
	// Compile the kernel
    cl_program program = clCreateProgramWithSource(context, 1, &source, &sourceSize, &error);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
    // Create an executable from the kernel
	error = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
	// Show compiler warnings/errors
	static char log[65536]; memset(log, 0, sizeof(log));
	clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL);
	if (strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
    kernel = clCreateKernel(program, "hotspot", &error);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
		
	long long start_time = get_time();
	
	// Create two temperature matrices and copy the temperature input data
	cl_mem MatrixTemp[2];
	// Create input memory buffers on device
	MatrixTemp[0] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(float) * size, FilesavingTemp, &error);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	MatrixTemp[1] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(float) * size, NULL, &error);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	// Copy the power input data
	cl_mem MatrixPower = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(float) * size, FilesavingPower, &error);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	// Perform the computation
	int ret = compute_tran_temp(MatrixPower, MatrixTemp, grid_cols, grid_rows, total_iterations, pyramid_height,
								blockCols, blockRows, borderCols, borderRows, FilesavingTemp, FilesavingPower);
	
	// Copy final temperature data back
	cl_float *MatrixOut = (cl_float *) clEnqueueMapBuffer(command_queue, MatrixTemp[ret], CL_TRUE, CL_MAP_READ, 0, sizeof(float) * size, 0, NULL, NULL, &error);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	long long end_time = get_time();	
	printf("Total time: %.3f seconds\n", ((float) (end_time - start_time)) / (1000*1000));
	
	// Write final output to output file
    writeoutput(MatrixOut, grid_rows, grid_cols, ofile);
    
	error = clEnqueueUnmapMemObject(command_queue, MatrixTemp[ret], (void *) MatrixOut, 0, NULL, NULL);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	clReleaseMemObject(MatrixTemp[0]);
	clReleaseMemObject(MatrixTemp[1]);
	clReleaseMemObject(MatrixPower);
	
	return 0;
}
void 
kernel_gpu_opencl_wrapper(	record *records,
							long records_mem,
							knode *knodes,
							long knodes_elem,
							long knodes_mem,

							int order,
							long maxheight,
							int count,

							long *currKnode,
							long *offset,
							int *keys,
							record *ans)
{

	//======================================================================================================================================================150
	//	CPU VARIABLES
	//======================================================================================================================================================150

	// timer
	long long time0;
	long long time1;
	long long time2;
	long long time3;
	long long time4;
	long long time5;
	long long time6;

	time0 = get_time();

	//======================================================================================================================================================150
	//	GPU SETUP
	//======================================================================================================================================================150

	//====================================================================================================100
	//	INITIAL DRIVER OVERHEAD
	//====================================================================================================100

	// cudaThreadSynchronize();

	//====================================================================================================100
	//	COMMON VARIABLES
	//====================================================================================================100

	// common variables
	cl_int error;

	//====================================================================================================100
	//	GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE
	//====================================================================================================100

	// Get the number of available platforms
	cl_uint num_platforms;
	error = clGetPlatformIDs(	0, 
								NULL, 
								&num_platforms);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Get the list of available platforms
	cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
	error = clGetPlatformIDs(	num_platforms, 
								platforms, 
								NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Select the 1st platform
	cl_platform_id platform = platforms[0];

	// Get the name of the selected platform and print it (if there are multiple platforms, choose the first one)
	char pbuf[100];
	error = clGetPlatformInfo(	platform, 
								CL_PLATFORM_VENDOR, 
								sizeof(pbuf), 
								pbuf, 
								NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	printf("Platform: %s\n", pbuf);

	//====================================================================================================100
	//	CREATE CONTEXT FOR THE PLATFORM
	//====================================================================================================100

	// Create context properties for selected platform
	cl_context_properties context_properties[3] = {	CL_CONTEXT_PLATFORM, 
													(cl_context_properties) platform, 
													0};

	// Create context for selected platform being GPU
	cl_context context;
	context = clCreateContextFromType(	context_properties, 
										CL_DEVICE_TYPE_GPU, 
										NULL, 
										NULL, 
										&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	GET DEVICES AVAILABLE FOR THE CONTEXT, SELECT ONE
	//====================================================================================================100

	// Get the number of devices (previousely selected for the context)
	size_t devices_size;
	error = clGetContextInfo(	context, 
								CL_CONTEXT_DEVICES, 
								0, 
								NULL, 
								&devices_size);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Get the list of devices (previousely selected for the context)
	cl_device_id *devices = (cl_device_id *) malloc(devices_size);
	error = clGetContextInfo(	context, 
								CL_CONTEXT_DEVICES, 
								devices_size, 
								devices, 
								NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Select the first device (previousely selected for the context) (if there are multiple devices, choose the first one)
	cl_device_id device;
	device = devices[0];

	// Get the name of the selected device (previousely selected for the context) and print it
	error = clGetDeviceInfo(device, 
							CL_DEVICE_NAME, 
							sizeof(pbuf), 
							pbuf, 
							NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	printf("Device: %s\n", pbuf);

	//====================================================================================================100
	//	CREATE COMMAND QUEUE FOR THE DEVICE
	//====================================================================================================100

	// Create a command queue
	cl_command_queue command_queue;
	command_queue = clCreateCommandQueue(	context, 
											device, 
											0, 
											&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	CREATE PROGRAM, COMPILE IT
	//====================================================================================================100

	// Load kernel source code from file
	const char *source = load_kernel_source("./kernel/kernel_gpu_opencl.cl");
	size_t sourceSize = strlen(source);

	// Create the program
	cl_program program = clCreateProgramWithSource(	context, 
													1, 
													&source, 
													&sourceSize, 
													&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	char clOptions[110];
	//  sprintf(clOptions,"-I../../src");                                                                                 
	sprintf(clOptions,"-I./../");

#ifdef DEFAULT_ORDER
	sprintf(clOptions + strlen(clOptions), " -DDEFAULT_ORDER=%d", DEFAULT_ORDER);
#endif


	// Compile the program
	error = clBuildProgram(	program, 
							1, 
							&device, 
							clOptions, 
							NULL, 
							NULL);
	// Print warnings and errors from compilation
	static char log[65536]; 
	memset(log, 0, sizeof(log));
	clGetProgramBuildInfo(	program, 
							device, 
							CL_PROGRAM_BUILD_LOG, 
							sizeof(log)-1, 
							log, 
							NULL);
	printf("-----OpenCL Compiler Output-----\n");
	if (strstr(log,"warning:") || strstr(log, "error:")) 
		printf("<<<<\n%s\n>>>>\n", log);
	printf("--------------------------------\n");
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Create kernel
	cl_kernel kernel;
	kernel = clCreateKernel(program, 
							"findK", 
							&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	time1 = get_time();

	//====================================================================================================100
	//	END
	//====================================================================================================100

	//======================================================================================================================================================150
	//	GPU MEMORY				(MALLOC)
	//======================================================================================================================================================150

	//====================================================================================================100
	//	DEVICE IN
	//====================================================================================================100

	//==================================================50
	//	recordsD
	//==================================================50

	cl_mem recordsD;
	recordsD = clCreateBuffer(	context, 
								CL_MEM_READ_WRITE, 
								records_mem, 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	knodesD
	//==================================================50

	cl_mem knodesD;
	knodesD = clCreateBuffer(	context, 
								CL_MEM_READ_WRITE, 
								knodes_mem, 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	currKnodeD
	//==================================================50

	cl_mem currKnodeD;
	currKnodeD = clCreateBuffer(	context, 
								CL_MEM_READ_WRITE, 
								count*sizeof(long), 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	offsetD
	//==================================================50

	cl_mem offsetD;
	offsetD = clCreateBuffer(	context, 
								CL_MEM_READ_WRITE, 
								count*sizeof(long), 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	keysD
	//==================================================50

	cl_mem keysD;
	keysD = clCreateBuffer(	context, 
								CL_MEM_READ_WRITE, 
								count*sizeof(long), 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	END
	//==================================================50

	//====================================================================================================100
	//	DEVICE IN/OUT
	//====================================================================================================100

	//==================================================50
	//	ansD
	//==================================================50

	cl_mem ansD;
	ansD = clCreateBuffer(	context, 
								CL_MEM_READ_WRITE, 
								count*sizeof(record), 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	time2 = get_time();

	//==================================================50
	//	END
	//==================================================50

	//====================================================================================================100
	//	END
	//====================================================================================================100

	//======================================================================================================================================================150
	//	GPU MEMORY			COPY
	//======================================================================================================================================================150

	//====================================================================================================100
	//	GPU MEMORY				(MALLOC) COPY IN
	//====================================================================================================100

	//==================================================50
	//	recordsD
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									recordsD,				// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									records_mem,			// size to be copied
									records,				// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	knodesD
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									knodesD,				// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									knodes_mem,				// size to be copied
									knodes,					// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	currKnodeD
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									currKnodeD,				// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									count*sizeof(long),		// size to be copied
									currKnode,				// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	offsetD
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									offsetD,				// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									count*sizeof(long),		// size to be copied
									offset,					// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	keysD
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									keysD,					// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									count*sizeof(int),		// size to be copied
									keys,					// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	END
	//==================================================50

	//====================================================================================================100
	//	DEVICE IN/OUT
	//====================================================================================================100

	//==================================================50
	//	ansD
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									ansD,					// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									count*sizeof(record),	// size to be copied
									ans,					// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	time3 = get_time();

	//==================================================50
	//	END
	//==================================================50

	//====================================================================================================100
	//	END
	//====================================================================================================100

	//======================================================================================================================================================150
	// findK kernel
	//======================================================================================================================================================150

	//====================================================================================================100
	//	Execution Parameters
	//====================================================================================================100

	size_t local_work_size[1];
	local_work_size[0] = order < 1024 ? order : 1024;
	size_t global_work_size[1];
	global_work_size[0] = count * local_work_size[0];

	printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", (int)(global_work_size[0]/local_work_size[0]), (int)local_work_size[0]);

	//====================================================================================================100
	//	Kernel Arguments
	//====================================================================================================100

	clSetKernelArg(	kernel, 
					0, 
					sizeof(long), 
					(void *) &maxheight);
	clSetKernelArg(	kernel, 
					1, 
					sizeof(cl_mem), 
					(void *) &knodesD);
	clSetKernelArg(	kernel, 
					2, 
					sizeof(long), 
					(void *) &knodes_elem);
	clSetKernelArg(	kernel, 
					3, 
					sizeof(cl_mem), 
					(void *) &recordsD);

	clSetKernelArg(	kernel, 
					4, 
					sizeof(cl_mem), 
					(void *) &currKnodeD);
	clSetKernelArg(	kernel, 
					5, 
					sizeof(cl_mem), 
					(void *) &offsetD);
	clSetKernelArg(	kernel, 
					6, 
					sizeof(cl_mem), 
					(void *) &keysD);
	clSetKernelArg(	kernel, 
					7, 
					sizeof(cl_mem), 
					(void *) &ansD);

	//====================================================================================================100
	//	Kernel
	//====================================================================================================100

	error = clEnqueueNDRangeKernel(	command_queue, 
									kernel, 
									1, 
									NULL, 
									global_work_size, 
									local_work_size, 
									0, 
									NULL, 
									NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Wait for all operations to finish NOT SURE WHERE THIS SHOULD GO
	error = clFinish(command_queue);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	time4 = get_time();

	//====================================================================================================100
	//	END
	//====================================================================================================100

	//======================================================================================================================================================150
	//	GPU MEMORY			COPY (CONTD.)
	//======================================================================================================================================================150

	//====================================================================================================100
	//	DEVICE IN/OUT
	//====================================================================================================100

	//==================================================50
	//	ansD
	//==================================================50

	error = clEnqueueReadBuffer(command_queue,				// The command queue.
								ansD,						// The image on the device.
								CL_TRUE,					// Blocking? (ie. Wait at this line until read has finished?)
								0,							// Offset. None in this case.
								count*sizeof(record),		// Size to copy.
								ans,						// The pointer to the image on the host.
								0,							// Number of events in wait list. Not used.
								NULL,						// Event wait list. Not used.
								NULL);						// Event object for determining status. Not used.
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	time5 = get_time();

	//==================================================50
	//	END
	//==================================================50

	//====================================================================================================100
	//	END
	//====================================================================================================100

	//======================================================================================================================================================150
	//	GPU MEMORY DEALLOCATION
	//======================================================================================================================================================150

	// Release kernels...
	clReleaseKernel(kernel);

	// Now the program...
	clReleaseProgram(program);

	// Clean up the device memory...
	clReleaseMemObject(recordsD);
	clReleaseMemObject(knodesD);

	clReleaseMemObject(currKnodeD);
	clReleaseMemObject(offsetD);
	clReleaseMemObject(keysD);
	clReleaseMemObject(ansD);

	// Flush the queue
	error = clFlush(command_queue);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// ...and finally, the queue and context.
	clReleaseCommandQueue(command_queue);

	// ???
	clReleaseContext(context);

	time6 = get_time();

	//======================================================================================================================================================150
	//	DISPLAY TIMING
	//======================================================================================================================================================150

	printf("Time spent in different stages of GPU_CUDA KERNEL:\n");

	printf("%15.12f s, %15.12f % : GPU: SET DEVICE / DRIVER INIT\n",	(float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time6-time0) * 100);
	printf("%15.12f s, %15.12f % : GPU MEM: ALO\n", 					(float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time6-time0) * 100);
	printf("%15.12f s, %15.12f % : GPU MEM: COPY IN\n",					(float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time6-time0) * 100);

	printf("%15.12f s, %15.12f % : GPU: KERNEL\n",						(float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time6-time0) * 100);

	printf("%15.12f s, %15.12f % : GPU MEM: COPY OUT\n",				(float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time6-time0) * 100);
	printf("%15.12f s, %15.12f % : GPU MEM: FRE\n", 					(float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time6-time0) * 100);

	printf("Total time:\n");
	printf("%.12f s\n", 												(float) (time6-time0) / 1000000);

	//======================================================================================================================================================150
	//	END
	//======================================================================================================================================================150

}
int 
kernel_gpu_opencl_wrapper(	int xmax,
							int workload,

							fp ***y,
							fp **x,
							fp **params,
							fp *com)
{

	//======================================================================================================================================================150
	//	VARIABLES
	//======================================================================================================================================================150

	long long time0;
	long long time1;
	long long time2;
	long long time3;
	long long time4;
	long long time5;
	long long timecopyin = 0;
	long long timekernel = 0;
	long long timecopyout = 0;
	long long timeother;
	//stage1_start
	time0 = get_time();

	int i;

	//======================================================================================================================================================150
	//	GPU SETUP
	//======================================================================================================================================================150

	//====================================================================================================100
	//	COMMON VARIABLES
	//====================================================================================================100

	// common variables
	cl_int error;

	//====================================================================================================100
	//	GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE
	//====================================================================================================100

	// Get the number of available platforms
	cl_uint num_platforms;
	error = clGetPlatformIDs(	0, 
								NULL, 
								&num_platforms);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Get the list of available platforms
	cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
	error = clGetPlatformIDs(	num_platforms, 
								platforms, 
								NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Select the 1st platform
	cl_platform_id platform = platforms[0];

	// Get the name of the selected platform and print it (if there are multiple platforms, choose the first one)
	char pbuf[100];
	error = clGetPlatformInfo(	platform, 
								CL_PLATFORM_VENDOR, 
								sizeof(pbuf), 
								pbuf, 
								NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	printf("Platform: %s\n", pbuf);

	//====================================================================================================100
	//	CREATE CONTEXT FOR THE PLATFORM
	//====================================================================================================100

	// Create context properties for selected platform
	cl_context_properties context_properties[3] = {	CL_CONTEXT_PLATFORM, 
													(cl_context_properties) platform, 
													0};

	// Create context for selected platform being GPU
	cl_context context;
	context = clCreateContextFromType(	context_properties, 
										CL_DEVICE_TYPE_GPU, 
										NULL, 
										NULL, 
										&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	GET DEVICES AVAILABLE FOR THE CONTEXT, SELECT ONE
	//====================================================================================================100

	// Get the number of devices (previousely selected for the context)
	size_t devices_size;
	error = clGetContextInfo(	context, 
								CL_CONTEXT_DEVICES, 
								0, 
								NULL, 
								&devices_size);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Get the list of devices (previousely selected for the context)
	cl_device_id *devices = (cl_device_id *) malloc(devices_size);
	error = clGetContextInfo(	context, 
								CL_CONTEXT_DEVICES, 
								devices_size, 
								devices, 
								NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Select the first device (previousely selected for the context) (if there are multiple devices, choose the first one)
	cl_device_id device;
	device = devices[0];

	// Get the name of the selected device (previousely selected for the context) and print it
	error = clGetDeviceInfo(device, 
							CL_DEVICE_NAME, 
							sizeof(pbuf), 
							pbuf, 
							NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	printf("Device: %s\n", pbuf);

	//====================================================================================================100
	//	CREATE COMMAND QUEUE FOR THE DEVICE
	//====================================================================================================100

	// Create a command queue
	cl_command_queue command_queue;
	command_queue = clCreateCommandQueue(	context, 
											device, 
											0, 
											&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	CRATE PROGRAM, COMPILE IT
	//====================================================================================================100

	// Load kernel source code from file
	const char *source = load_kernel_source("./kernel/kernel_gpu_opencl.cl");
	size_t sourceSize = strlen(source);

	// Create the program
	cl_program program = clCreateProgramWithSource(	context, 
													1, 
													&source, 
													&sourceSize, 
													&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Compile the program
	error = clBuildProgram(	program, 
							1, 
							&device, 
							"-I./../", 
							NULL, 
							NULL);
	// Print warnings and errors from compilation
	static char log[65536]; 
	memset(log, 0, sizeof(log));
	clGetProgramBuildInfo(	program, 
							device, 
							CL_PROGRAM_BUILD_LOG, 
							sizeof(log)-1, 
							log, 
							NULL);
	printf("-----OpenCL Compiler Output-----\n");
	if (strstr(log,"warning:") || strstr(log, "error:")) 
		printf("<<<<\n%s\n>>>>\n", log);
	printf("--------------------------------\n");
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Create kernel
	cl_kernel kernel;
	kernel = clCreateKernel(program, 
							"kernel_gpu_opencl", 
							&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	INITIAL DRIVER OVERHEAD
	//====================================================================================================100

	// cudaThreadSynchronize();

	time1 = get_time();
//	double start_timer = omp_get_wtime();
	//======================================================================================================================================================150
	//	ALLOCATE MEMORY
	//======================================================================================================================================================150

	//====================================================================================================100
	//	d_initvalu_mem
	//====================================================================================================100

	int d_initvalu_mem;
	d_initvalu_mem = EQUATIONS * sizeof(fp);
	cl_mem d_initvalu;
	d_initvalu = clCreateBuffer(context,					// context
								CL_MEM_READ_WRITE,			// flags
								d_initvalu_mem,				// size of buffer
								NULL,						// host pointer (optional)
								&error );					// returned error
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	d_finavalu_mem
	//====================================================================================================100

	int d_finavalu_mem;
	d_finavalu_mem = EQUATIONS * sizeof(fp);
	cl_mem d_finavalu;
	d_finavalu = clCreateBuffer(context, 
								CL_MEM_READ_WRITE, 
								d_finavalu_mem, 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	d_params_mem
	//====================================================================================================100

	int d_params_mem;
	d_params_mem = PARAMETERS * sizeof(fp);
	cl_mem d_params;
	d_params = clCreateBuffer(	context, 
								CL_MEM_READ_WRITE, 
								d_params_mem, 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	d_com_mem
	//====================================================================================================100

	int d_com_mem;
	d_com_mem = 3 * sizeof(fp);
	cl_mem d_com;
	d_com = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							d_com_mem, 
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	
	time2 = get_time();

	//======================================================================================================================================================150
	//	EXECUTION
	//======================================================================================================================================================150

	int status;

	for(i=0; i<workload; i++){

		status = solver(	y[i],
							x[i],
							xmax,
							params[i],
							com,

							d_initvalu,
							d_finavalu,
							d_params,
							d_com,

							command_queue,
							kernel,

							&timecopyin,
							&timekernel,
							&timecopyout);

		if(status !=0){
			printf("STATUS: %d\n", status);
		}

	}

	// // // print results
	// // int k;
	// // for(i=0; i<workload; i++){
		// // printf("WORKLOAD %d:\n", i);
		// // for(j=0; j<(xmax+1); j++){
			// // printf("\tTIME %d:\n", j);
			// // for(k=0; k<EQUATIONS; k++){
				// // printf("\t\ty[%d][%d][%d]=%13.10f\n", i, j, k, y[i][j][k]);
			// // }
		// // }
	// // }
//	double end_timer = omp_get_wtime();
//	printf("Time3-Time1 : %.8f\n",(end_timer - start_timer));
	time3 = get_time();

	//======================================================================================================================================================150
	//	FREE GPU MEMORY
	//======================================================================================================================================================150

	// Release kernels...
	clReleaseKernel(kernel);

	// Now the program...
	clReleaseProgram(program);

	// Clean up the device memory...
	clReleaseMemObject(d_initvalu);
	clReleaseMemObject(d_finavalu);
	clReleaseMemObject(d_params);
	clReleaseMemObject(d_com);

	// Flush the queue
	error = clFlush(command_queue);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// ...and finally, the queue and context.
	clReleaseCommandQueue(command_queue);

	// ???
	clReleaseContext(context);

	time4= get_time();

	//======================================================================================================================================================150
	//	DISPLAY TIMING
	//======================================================================================================================================================150

	printf("Time spent in different stages of the application:\n");
	printf("%15.12f s, %15.12f % : CPU: GPU SETUP\n", 								(float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time4-time0) * 100);
	printf("%15.12f s, %15.12f % : CPU: ALLOCATE GPU MEMORY\n", 					(float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time4-time0) * 100);
	printf("%15.12f s, %15.12f % : GPU: COMPUTATION\n", 							(float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time4-time0) * 100);

	printf("\tGPU: COMPUTATION Components:\n");
	printf("\t%15.12f s, %15.12f % : GPU: COPY DATA IN\n", 							(float) (timecopyin) / 1000000, (float) (timecopyin) / (float) (time4-time0) * 100);
	printf("\t%15.12f s, %15.12f % : GPU: KERNEL\n", 								(float) (timekernel) / 1000000, (float) (timekernel) / (float) (time4-time0) * 100);
	printf("\t%15.12f s, %15.12f % : GPU: COPY DATA OUT\n", 						(float) (timecopyout) / 1000000, (float) (timecopyout) / (float) (time4-time0) * 100);
	timeother = time3-time2-timecopyin-timekernel-timecopyout;
	printf("\t%15.12f s, %15.12f % : GPU: OTHER\n", 								(float) (timeother) / 1000000, (float) (timeother) / (float) (time4-time0) * 100);

	printf("%15.12f s, %15.12f % : CPU: FREE GPU MEMORY\n", 						(float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time4-time0) * 100);
	printf("Total time 1:\n");
	printf("%.12f s\n", 															(float) (time4-time0) / 1000000);

	//======================================================================================================================================================150
	//	RETURN
	//======================================================================================================================================================150

	return 0;

	//======================================================================================================================================================150
	//	END
	//======================================================================================================================================================150

}
Esempio n. 6
0
cl_int clEnqueueNDRangeKernel_tony(cl_command_queue * cmdqueue,
                  cl_kernel kernel,cl_uint work_dim,
				  const size_t *global_work_size,const size_t *local_work_size)
  {
	 cl_event eventList[2];
     cl_int error;
     int CPU_RUN=0;
     int GPU_RUN=0;
	if(cpu_offset==0){
     		if(tony_device!=1){
			GPU_RUN=1;
		}	
	}
	else if(cpu_offset==100){
		if(tony_device!=2){
			CPU_RUN=1;
		}
	}
	else{
		if(tony_device==0){
			CPU_RUN=1;	
			GPU_RUN=1;	
		}else if (tony_device==1){
			CPU_RUN=1;
		}else{
			GPU_RUN=1;
		}
	}
     size_t remain_global_work_size[2];
     size_t global_offset[2];
     //NOTES(tony): if dim!=0, offset means offset of first dimensional.
     //we only care for first dimensional.Keep rest remain
         int d;
         for(d=1;d<work_dim;d++){
                 remain_global_work_size[d]=global_work_size[d];
                 global_offset[d]=global_work_size[d];
         }

         global_offset[0]=((double)cpu_offset/100)*(global_work_size[0]);
         global_offset[0]=Round(global_offset[0],(local_work_size[0]));
         remain_global_work_size[0]=global_work_size[0]-global_offset[0];
	 if(remain_global_work_size[0]==0){
	GPU_RUN=0;
	}
         //const size_t *const_remain = remain_global_work_size;
         //printf("global_workSize[0] %d , local_work_size[0] %d, global_offset is %d\n",global_work_size[0],local_work_size    [0],global_offset[0]);

         int numOfArgs;
         clGetKernelInfo(kernel,CL_KERNEL_NUM_ARGS,sizeof(int),&numOfArgs,NULL);
         //printf("Num of args is %d\n", numOfArgs);
         int groupOffset =0;
         if(CPU_RUN){

			 //printf("Launch into cpu,globaloffset is %d\n",global_offset[0]);			  
			 clSetKernelArg(kernel, numOfArgs-1, sizeof(int),(void*)&groupOffset);
			 error=clEnqueueNDRangeKernel(cmdqueue[0],kernel,work_dim,NULL,global_offset,local_work_size,0,NULL,&(eventList[0]    ));
			if (error != CL_SUCCESS)
                fatal_CL(error, __LINE__);
         }
    
         if(GPU_RUN){		 
			 groupOffset =global_offset[0] / (local_work_size[0]);
			 //printf("Launch into gpu,remain_global_work_sizeis %d\n",remain_global_work_size[0]);	
			 clSetKernelArg(kernel, numOfArgs-1, sizeof(int),(void*)&groupOffset);
			 error =clEnqueueNDRangeKernel(cmdqueue[1],kernel,work_dim,global_offset,remain_global_work_size,local_work_size,0    ,NULL,&(eventList[1]));
			if (error != CL_SUCCESS)
                fatal_CL(error, __LINE__);
         }
   
     if(CPU_RUN)   error|=clFlush(cmdqueue[0]);
     if(GPU_RUN)       error|=clFlush(cmdqueue[1]);
     //  printf("Try to flush\n");
     if(CPU_RUN)       error|=clWaitForEvents(1,&eventList[0]);
     if(GPU_RUN)       error|=clWaitForEvents(1,&eventList[1]);
     //clWaitForEvents(2,eventList);
     //  printf("kernel finished\n");
     return error;
 }
Esempio n. 7
0
int compute_tran_temp(cl_mem MatrixPower, cl_mem MatrixTemp[2], int col, int row, \
		int total_iterations, int num_iterations, int blockCols, int blockRows, int borderCols, int borderRows,
		float *TempCPU, float *PowerCPU) 
{ 

	float grid_height = chip_height / row;
	float grid_width = chip_width / col;

	float Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * grid_width * grid_height;
	float Rx = grid_width / (2.0 * K_SI * t_chip * grid_height);
	float Ry = grid_height / (2.0 * K_SI * t_chip * grid_width);
	float Rz = t_chip / (K_SI * grid_height * grid_width);

	float max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI);
	float step = PRECISION / max_slope;
	int t;

	int src = 0, dst = 1;

	cl_int error;

	// Determine GPU work group grid
	size_t global_work_size[2];
	global_work_size[0] = BLOCK_SIZE * blockCols;
	global_work_size[1] = BLOCK_SIZE * blockRows;
	size_t local_work_size[2];
	local_work_size[0] = BLOCK_SIZE;
	local_work_size[1] = BLOCK_SIZE;


	long long start_time = get_time();	

	for (t = 0; t < total_iterations; t += num_iterations) {

		// Specify kernel arguments
		int iter = MIN(num_iterations, total_iterations - t);
		clSetKernelArg(kernel, 0, sizeof(int), (void *) &iter);
		clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &MatrixPower);
		clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &MatrixTemp[src]);
		clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &MatrixTemp[dst]);
		clSetKernelArg(kernel, 4, sizeof(int), (void *) &col);
		clSetKernelArg(kernel, 5, sizeof(int), (void *) &row);
		clSetKernelArg(kernel, 6, sizeof(int), (void *) &borderCols);
		clSetKernelArg(kernel, 7, sizeof(int), (void *) &borderRows);
		clSetKernelArg(kernel, 8, sizeof(float), (void *) &Cap);
		clSetKernelArg(kernel, 9, sizeof(float), (void *) &Rx);
		clSetKernelArg(kernel, 10, sizeof(float), (void *) &Ry);
		clSetKernelArg(kernel, 11, sizeof(float), (void *) &Rz);
		clSetKernelArg(kernel, 12, sizeof(float), (void *) &step);

		fprintf(stderr, "global_work_size[0]=%lu, global_work_size[1]=%lu\n", global_work_size[0], global_work_size[1]);
		// Launch kernel
#pragma dividend local_work_group_size local_work_size dim 2 dim1(1:32:2:32) dim2(1:32:2:32)
	//This lws will be used to profile the OpenCL kernel with id 1
			size_t _dividend_lws_local_work_size_k1[3];
		{
		_dividend_lws_local_work_size_k1[0] = getLWSValue("DIVIDEND_LWS1_D0",DIVIDEND_LWS1_D0_DEFAULT_VAL);
		_dividend_lws_local_work_size_k1[1] = getLWSValue("DIVIDEND_LWS1_D1",DIVIDEND_LWS1_D1_DEFAULT_VAL);
		//Dividend extension: store the kernel id as the last element
		_dividend_lws_local_work_size_k1[2] = 1;
		}
				error = DIVIDEND_CL_WRAP(clEnqueueNDRangeKernel)(command_queue, kernel, 2, NULL, global_work_size, _dividend_lws_local_work_size_k1, 0, NULL, NULL);
		if (error != CL_SUCCESS) fatal_CL(error, __LINE__);

		// Flush the queue
		error = clFlush(command_queue);
		if (error != CL_SUCCESS) fatal_CL(error, __LINE__);

		// Swap input and output GPU matrices
		src = 1 - src;
		dst = 1 - dst;
	}

	// Wait for all operations to finish
	error = DIVIDEND_CL_WRAP(clFinish)(command_queue);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);

	long long end_time = get_time();
	long long total_time = (end_time - start_time);	
	printf("\nKernel time: %.3f seconds\n", ((float) total_time) / (1000*1000));

	return src;
}