Example #1
0
int main(int argc, char** argv) {

   // Set up the data on the host	
   clock_t start, start0;
   start0 = clock();
   start = clock();
   // Rows and columns in the input image
   int imageHeight;
   int imageWidth;

   const char* inputFile = "input.bmp";
   const char* outputFile = "output.bmp";



   // Homegrown function to read a BMP from file
   float* inputImage = readImage(inputFile, &imageWidth, 
      &imageHeight);

   // Size of the input and output images on the host
   int dataSize = imageHeight*imageWidth*sizeof(float);

   // Pad the number of columns 
#ifdef NON_OPTIMIZED
   int deviceWidth = imageWidth;
#else  // READ_ALIGNED || READ4
   int deviceWidth = roundUp(imageWidth, WGX);
#endif
   int deviceHeight = imageHeight;
   // Size of the input and output images on the device
   int deviceDataSize = imageHeight*deviceWidth*sizeof(float);

   // Output image on the host
   float* outputImage = NULL;
   outputImage = (float*)malloc(dataSize);
   int i, j;
   for(i = 0; i < imageHeight; i++) {
       for(j = 0; j < imageWidth; j++) {
           outputImage[i*imageWidth+j] = 0;
       }
   }

   // 45 degree motion blur
   float filter[49] = 
      {0,      0,      0,      0,      0, 0.0145,      0,
       0,      0,      0,      0, 0.0376, 0.1283, 0.0145,
       0,      0,      0, 0.0376, 0.1283, 0.0376,      0,
       0,      0, 0.0376, 0.1283, 0.0376,      0,      0,
       0, 0.0376, 0.1283, 0.0376,      0,      0,      0,
  0.0145, 0.1283, 0.0376,      0,      0,      0,      0,
       0, 0.0145,      0,      0,      0,      0,      0};
 
   int filterWidth = 7;
   int paddingPixels = (int)(filterWidth/2) * 2; 
   stoptime(start, "set up input, output.");
   start = clock();
   // Set up the OpenCL environment

   // Discovery platform
   cl_platform_id platform;
   clGetPlatformIDs(1, &platform, NULL);

   // Discover device
   cl_device_id device;
   clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device,
      NULL);

    size_t time_res;
    clGetDeviceInfo(device, CL_DEVICE_PROFILING_TIMER_RESOLUTION,
            sizeof(time_res), &time_res, NULL);
    printf("Device profiling timer resolution: %zu ns.\n", time_res);

   // Create context
   cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, 
       (cl_context_properties)(platform), 0};
   cl_context context; 
   context = clCreateContext(props, 1, &device, NULL, NULL, 
      NULL);

   // Create command queue
   cl_ulong time_start, time_end, exec_time;
   cl_event timing_event;
   cl_command_queue queue;
   queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, NULL);

   // Create memory buffers
   cl_mem d_inputImage;
   cl_mem d_outputImage;
   cl_mem d_filter;
   d_inputImage = clCreateBuffer(context, CL_MEM_READ_ONLY, 
       deviceDataSize, NULL, NULL);
   d_outputImage = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
       deviceDataSize, NULL, NULL);
   d_filter = clCreateBuffer(context, CL_MEM_READ_ONLY, 
       49*sizeof(float),NULL, NULL);
   
   // Write input data to the device
#ifdef NON_OPTIMIZED
   clEnqueueWriteBuffer(queue, d_inputImage, CL_TRUE, 0, deviceDataSize,
       inputImage, 0, NULL, NULL);
#else // READ_ALIGNED || READ4
   size_t buffer_origin[3] = {0,0,0};
   size_t host_origin[3] = {0,0,0};
   size_t region[3] = {deviceWidth*sizeof(float), 
      imageHeight, 1};
   clEnqueueWriteBufferRect(queue, d_inputImage, CL_TRUE, 
      buffer_origin, host_origin, region, 
      deviceWidth*sizeof(float), 0, imageWidth*sizeof(float), 0,
      inputImage, 0, NULL, NULL);
#endif
	
   // Write the filter to the device
   clEnqueueWriteBuffer(queue, d_filter, CL_TRUE, 0, 
      49*sizeof(float), filter, 0, NULL, NULL);
	
   // Read in the program from file
   char* source = readSource("convolution.cl");

   // Create the program
   cl_program program;
	
   // Create and compile the program
   program = clCreateProgramWithSource(context, 1, 
       (const char**)&source, NULL, NULL);
   cl_int build_status;
   build_status = clBuildProgram(program, 1, &device, NULL, NULL,
      NULL);
      
   // Create the kernel
   cl_kernel kernel;
#if defined NON_OPTIMIZED || defined READ_ALIGNED
   // Only the host-side code differs for the aligned reads
   kernel = clCreateKernel(program, "convolution", NULL);
#else // READ4
   kernel = clCreateKernel(program, "convolution_read4", NULL);
#endif
	
   // Selected work group size is 16x16
   int wgWidth = WGX;
   int wgHeight = WGY;

   // When computing the total number of work items, the 
   // padding work items do not need to be considered
   int totalWorkItemsX = roundUp(imageWidth-paddingPixels, 
      wgWidth);
   int totalWorkItemsY = roundUp(imageHeight-paddingPixels, 
      wgHeight);

   // Size of a work group
   size_t localSize[2] = {wgWidth, wgHeight};
   // Size of the NDRange
   size_t globalSize[2] = {totalWorkItemsX, totalWorkItemsY};

   // The amount of local data that is cached is the size of the
   // work groups plus the padding pixels
#if defined NON_OPTIMIZED || defined READ_ALIGNED
   int localWidth = localSize[0] + paddingPixels;
#else // READ4
   // Round the local width up to 4 for the read4 kernel
   int localWidth = roundUp(localSize[0]+paddingPixels, 4);
#endif
   int localHeight = localSize[1] + paddingPixels;

   // Compute the size of local memory (needed for dynamic 
   // allocation)
   size_t localMemSize = (localWidth * localHeight * 
      sizeof(float));

   // Set the kernel arguments
   clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage);
   clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage);
   clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_filter);
   clSetKernelArg(kernel, 3, sizeof(int), &deviceHeight);
   clSetKernelArg(kernel, 4, sizeof(int), &deviceWidth); 
   clSetKernelArg(kernel, 5, sizeof(int), &filterWidth);
   clSetKernelArg(kernel, 6, localMemSize, NULL);
   clSetKernelArg(kernel, 7, sizeof(int), &localHeight); 
   clSetKernelArg(kernel, 8, sizeof(int), &localWidth);

   stoptime(start, "set up kernel");
   start = clock();
   // Execute the kernel
   clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, 
      localSize, 0, NULL, &timing_event);

   // Wait for kernel to complete
   clFinish(queue);
   stoptime(start, "run kernel");
   clGetEventProfilingInfo(timing_event, CL_PROFILING_COMMAND_START,
           sizeof(time_start), &time_start, NULL);
   clGetEventProfilingInfo(timing_event, CL_PROFILING_COMMAND_END,
           sizeof(time_end), &time_end, NULL);
   exec_time = time_end-time_start;
   printf("Profile execution time = %.3lf sec.\n", (double) exec_time/1000000000);

   // Read back the output image
#ifdef NON_OPTIMIZED
   clEnqueueReadBuffer(queue, d_outputImage, CL_TRUE, 0, 
      deviceDataSize, outputImage, 0, NULL, NULL);
#else // READ_ALIGNED || READ4
   // Begin reading output from (3,3) on the device 
   // (for 7x7 filter with radius 3)
   buffer_origin[0] = 3*sizeof(float);
   buffer_origin[1] = 3;
   buffer_origin[2] = 0;

   // Read data into (3,3) on the host
   host_origin[0] = 3*sizeof(float);
   host_origin[1] = 3;
   host_origin[2] = 0;
	
   // Region is image size minus padding pixels
   region[0] = (imageWidth-paddingPixels)*sizeof(float);
   region[1] = (imageHeight-paddingPixels);
   region[2] = 1;
	
	// Perform the read
   clEnqueueReadBufferRect(queue, d_outputImage, CL_TRUE, 
      buffer_origin, host_origin, region, 
      deviceWidth*sizeof(float), 0, imageWidth*sizeof(float), 0, 
      outputImage, 0, NULL, NULL);
#endif
  
   // Homegrown function to write the image to file
   storeImage(outputImage, outputFile, imageHeight, 
      imageWidth, inputFile);
   
   // Free OpenCL objects
   clReleaseMemObject(d_inputImage);
   clReleaseMemObject(d_outputImage);
   clReleaseMemObject(d_filter);
   clReleaseKernel(kernel);
   clReleaseProgram(program);
   clReleaseCommandQueue(queue);
   clReleaseContext(context);

   return 0;
}
int MathBenchmark::runCLKernels() {
	cl_int status = 0;
	size_t WorkDim = 1;
	size_t szGlobalWorkSize = globalThreads;
	size_t szLocalWorkSize = localThreads;
	size_t szForNum = repeat;

	//create event to record time
	cl_event event_sinpi_withDD, event_sinpi_withoutDD;
	cl_event event_cospi_withDD, event_cospi_withoutDD;
	cl_event event_tanpi_withDD, event_tanpi_withoutDD;
	cl_event event_sincos_withDD, event_sincos_withoutDD;

	//Create Variable for story result reading from device to host
	void *sinpi_withDD_num, *sinpi_withoutDD_num;
	void *cospi_withDD_num, *cospi_withoutDD_num;
	void *tanpi_withDD_num, *tanpi_withoutDD_num;
	void *sincos_withDD_num, *sincos_withoutDD_num;

	sinpi_withDD_num = (void *) malloc(sizeof(cl_float));
	sinpi_withoutDD_num = (void *) malloc(sizeof(cl_float));
	cospi_withDD_num = (void *) malloc(sizeof(cl_float));
	cospi_withoutDD_num = (void *) malloc(sizeof(cl_float));
	tanpi_withDD_num = (void *) malloc(sizeof(cl_float));
	tanpi_withoutDD_num = (void *) malloc(sizeof(cl_float));
	sincos_withDD_num = (void *) malloc(sizeof(cl_float));
	sincos_withoutDD_num = (void *) malloc(sizeof(cl_float));

	float sinpi_withDD_maxGflops = 0.0;
	float sinpi_withoutDD_maxGflops = 0.0;
	float cospi_withDD_maxGflops = 0.0;
	float cospi_withoutDD_maxGflops = 0.0;
	float tanpi_withDD_maxGflops = 0.0;
	float tanpi_withoutDD_maxGflops = 0.0;
	float sincos_withDD_maxGflops = 0.0;
	float sincos_withoutDD_maxGflops = 0.0;

	//create buffer
	cl_mem result_sinpi_withDD = clCreateBuffer(context, CL_MEM_READ_WRITE,
			sizeof(float), NULL, NULL);
	cl_mem result_sinpi_withoutDD = clCreateBuffer(context, CL_MEM_READ_WRITE,
			sizeof(float), NULL, NULL);
	cl_mem result_cospi_withDD = clCreateBuffer(context, CL_MEM_READ_WRITE,
			sizeof(float), NULL, NULL);
	cl_mem result_cospi_withoutDD = clCreateBuffer(context, CL_MEM_READ_WRITE,
			sizeof(float), NULL, NULL);
	cl_mem result_tanpi_withDD = clCreateBuffer(context, CL_MEM_READ_WRITE,
			sizeof(float), NULL, NULL);
	cl_mem result_tanpi_withoutDD = clCreateBuffer(context, CL_MEM_READ_WRITE,
			sizeof(float), NULL, NULL);
	cl_mem result_sincos_withDD = clCreateBuffer(context, CL_MEM_READ_WRITE,
			sizeof(float), NULL, NULL);
	cl_mem result_sincos_withoutDD = clCreateBuffer(context, CL_MEM_READ_WRITE,
			sizeof(float), NULL, NULL);


	//set kernel_sinpi_withDD Argument
	status |= clSetKernelArg(kernel[0] , 0, sizeof(cl_mem),
			(void*) &result_sinpi_withDD);
	status |= clSetKernelArg(kernel[0] , 1, sizeof(size_t), (void*) &szForNum);
	CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (kernel_sinpi_withDD)");
	
	//set kernel_sinpi_withoutDD Argument
	status |= clSetKernelArg(kernel[1] , 0, sizeof(cl_mem),
			(void*) &result_sinpi_withoutDD);
	status |= clSetKernelArg(kernel[1] , 1, sizeof(size_t), (void*) &szForNum);
	CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (kernel_sinpi_withoutDD)");

	//set kernel_cospi_withDD Argument
	status |= clSetKernelArg(kernel[2] , 0, sizeof(cl_mem),
			(void*) &result_cospi_withDD);
	status |= clSetKernelArg(kernel[2] , 1, sizeof(size_t), (void*) &szForNum);
	CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (kernel_cospi_withDD)");

	//set kernel_cospi_withoutDD Argument
	status |= clSetKernelArg(kernel[3] , 0, sizeof(cl_mem),
			(void*) &result_cospi_withoutDD);
	status |= clSetKernelArg(kernel[3] , 1, sizeof(size_t), (void*) &szForNum);
	CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (kernel_cospi_withoutDD)");

	//set kernel_tanpi_withDD Argument
	status |= clSetKernelArg(kernel[4] , 0, sizeof(cl_mem),
			(void*) &result_tanpi_withDD);
	status |= clSetKernelArg(kernel[4] , 1, sizeof(size_t), (void*) &szForNum);
	CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (kernel_tanpi_withDD)");

	//set kernel_tanpi_withoutDD Argument
	status |= clSetKernelArg(kernel[5] , 0, sizeof(cl_mem),
			(void*) &result_tanpi_withoutDD);
	status |= clSetKernelArg(kernel[5] , 1, sizeof(size_t), (void*) &szForNum);
	CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (kernel_tanpi_withoutDD)");


	//set kernel_sincos_withDD Argument
	status |= clSetKernelArg(kernel[6] , 0, sizeof(cl_mem),
			(void*) &result_sincos_withDD);
	status |= clSetKernelArg(kernel[6] , 1, sizeof(size_t), (void*) &szForNum);
	CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (kernel_sincos_withDD)");

	//set kernel_sincos_withoutDD Argument
	status |= clSetKernelArg(kernel[7] , 0, sizeof(cl_mem),
			(void*) &result_sincos_withoutDD);
	status |= clSetKernelArg(kernel[7] , 1, sizeof(size_t), (void*) &szForNum);
	CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (kernel_sincos_withoutDD)");



	int i = 0;
	int gws, lws;
	for (gws = 1024; gws <= 65536 * 8; gws *= 2)
		for (lws = 64; lws <= 256; lws *= 2) {
			
			float executionTime_sinpi_withDD_max = 0.0;
			float executionTime_sinpi_withDD_avg = 0.0;
			float executionTime_sinpi_withDD_min = 999999999.0;

			float executionTime_sinpi_withoutDD_max = 0.0;
			float executionTime_sinpi_withoutDD_avg = 0.0;
			float executionTime_sinpi_withoutDD_min = 999999999.0;

			float executionTime_cospi_withDD_max = 0.0;
			float executionTime_cospi_withDD_avg = 0.0;
			float executionTime_cospi_withDD_min = 999999999.0;

			float executionTime_cospi_withoutDD_max = 0.0;
			float executionTime_cospi_withoutDD_avg = 0.0;
			float executionTime_cospi_withoutDD_min = 999999999.0;

			float executionTime_tanpi_withDD_max = 0.0;
			float executionTime_tanpi_withDD_avg = 0.0;
			float executionTime_tanpi_withDD_min = 999999999.0;

			float executionTime_tanpi_withoutDD_max = 0.0;
			float executionTime_tanpi_withoutDD_avg = 0.0;
			float executionTime_tanpi_withoutDD_min = 999999999.0;

			float executionTime_sincos_withDD_max = 0.0;
			float executionTime_sincos_withDD_avg = 0.0;
			float executionTime_sincos_withDD_min = 999999999.0;

			float executionTime_sincos_withoutDD_max = 0.0;
			float executionTime_sincos_withoutDD_avg = 0.0;
			float executionTime_sincos_withoutDD_min = 999999999.0;

			szGlobalWorkSize = gws;
			szLocalWorkSize = lws;
			printf("-----------------------------------------------------\n");
			printf("Set gws = %d , lws = %d\n", gws, lws);

			//launch kernel_sinpi_withDD
			if (!strcmp(kernelname.c_str(), "sinpi_withDD")
					|| !strcmp(kernelname.c_str(), "all_kernels")) {
				std::cout << "KERNEL NAME:" <<kernelname.c_str()<< vectorSize <<std::endl;
				printf("Begin to launch kernel_sinpi_withDD\n");
				for (i = 0; i < iterations; i++) {
					status = clEnqueueNDRangeKernel(commandQueue, kernel[0] ,
							WorkDim, NULL, &szGlobalWorkSize, &szLocalWorkSize,
							0, NULL, &event_sinpi_withDD);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");

					//record time kernel_sinpi
					status = clWaitForEvents(1, &event_sinpi_withDD);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");
					cl_ulong start_sinpi_withDD, end_sinpi_withDD;
					status = clGetEventProfilingInfo(event_sinpi_withDD,
							CL_PROFILING_COMMAND_START, sizeof(cl_ulong),
							&start_sinpi_withDD, NULL);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");
					status = clGetEventProfilingInfo(event_sinpi_withDD,
							CL_PROFILING_COMMAND_END, sizeof(cl_ulong),
							&end_sinpi_withDD, NULL);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");

					float executionTime_sinpi_withDD = (end_sinpi_withDD
							- start_sinpi_withDD);
					if (executionTime_sinpi_withDD_max
							< executionTime_sinpi_withDD) {
						executionTime_sinpi_withDD_max = executionTime_sinpi_withDD;
					}
					if (executionTime_sinpi_withDD_min
							> executionTime_sinpi_withDD) {
						executionTime_sinpi_withDD_min = executionTime_sinpi_withDD;
					}
					executionTime_sinpi_withDD_avg += executionTime_sinpi_withDD;

				}
				executionTime_sinpi_withDD_avg = (executionTime_sinpi_withDD_avg
						- executionTime_sinpi_withDD_max
						- executionTime_sinpi_withDD_min) / (iterations - 2);
				size_t time_sinpi_withDD=0;
					time_sinpi_withDD= 64*szForNum * vectorSize;
				float Gflops_sinpi_withDD =
						(time_sinpi_withDD * szGlobalWorkSize)
								/ executionTime_sinpi_withDD_avg;
				status = clEnqueueReadBuffer(commandQueue, result_sinpi_withDD,
						CL_TRUE, 0, sizeof(cl_mem), sinpi_withDD_num, NULL, NULL,
						NULL);
				CHECK_OPENCL_ERROR(status,
						"clEnqueueNDRangeKernel failed. (kernel_none)");
				printf("GFLOPs-sinpi_withDD : %f\n", Gflops_sinpi_withDD);
				printf("Result-sinpi_withDD   :   %f\n\n", *((float*)sinpi_withDD_num));
				printf("-----------------------------------------------------\n");
				if (Gflops_sinpi_withDD > sinpi_withDD_maxGflops){
						sinpi_withDD_maxGflops = Gflops_sinpi_withDD;

				}
			}

			//launch kernel_sinpi_withoutDD
			if (!strcmp(kernelname.c_str(), "sinpi_withoutDD")
					|| !strcmp(kernelname.c_str(), "all_kernels")) {
				std::cout << "KERNEL NAME:" <<kernelname.c_str()<< vectorSize <<std::endl;
				printf("Begin to launch kernel_sinpi_withoutDD\n");
				for (i = 0; i < iterations; i++) {
					status = clEnqueueNDRangeKernel(commandQueue, kernel[1] ,
							WorkDim, NULL, &szGlobalWorkSize, &szLocalWorkSize,
							0, NULL, &event_sinpi_withoutDD);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");

					//record time kernel_sinpi
					status = clWaitForEvents(1, &event_sinpi_withoutDD);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");
					cl_ulong start_sinpi_withoutDD, end_sinpi_withoutDD;
					status = clGetEventProfilingInfo(event_sinpi_withoutDD,
							CL_PROFILING_COMMAND_START, sizeof(cl_ulong),
							&start_sinpi_withoutDD, NULL);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");
					status = clGetEventProfilingInfo(event_sinpi_withoutDD,
							CL_PROFILING_COMMAND_END, sizeof(cl_ulong),
							&end_sinpi_withoutDD, NULL);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");

					float executionTime_sinpi_withoutDD = (end_sinpi_withoutDD
							- start_sinpi_withoutDD);
					if (executionTime_sinpi_withoutDD_max
							< executionTime_sinpi_withoutDD) {
						executionTime_sinpi_withoutDD_max = executionTime_sinpi_withoutDD;
					}
					if (executionTime_sinpi_withoutDD_min
							> executionTime_sinpi_withoutDD) {
						executionTime_sinpi_withoutDD_min = executionTime_sinpi_withoutDD;
					}
					executionTime_sinpi_withoutDD_avg += executionTime_sinpi_withoutDD;

				}
				executionTime_sinpi_withoutDD_avg = (executionTime_sinpi_withoutDD_avg
						- executionTime_sinpi_withoutDD_max
						- executionTime_sinpi_withoutDD_min) / (iterations - 2);
				size_t time_sinpi_withoutDD=0;
				//if(vectorSize==1||vectorSize==2||vectorSize==4||vectorSize==8)
					time_sinpi_withoutDD=  10 * 10  *szForNum*vectorSize;
				/*if(vectorSize==8)
					time_sinpi_withoutDD=  640 * 10 * szForNum * vectorSize;*/
				//if(vectorSize==16)
				//	time_sinpi_withoutDD=  10 * 10 * szForNum * vectorSize;
				float Gflops_sinpi_withoutDD =
						(time_sinpi_withoutDD * szGlobalWorkSize)
								/ executionTime_sinpi_withoutDD_avg;
				status = clEnqueueReadBuffer(commandQueue, result_sinpi_withoutDD,
						CL_TRUE, 0, sizeof(cl_mem), sinpi_withoutDD_num, NULL, NULL,
						NULL);
				CHECK_OPENCL_ERROR(status,
						"clEnqueueNDRangeKernel failed. (kernel_none)");
				printf("GFLOPs-sinpi_withoutDD : %f\n", Gflops_sinpi_withoutDD);
				printf("Result-sinpi_withoutDD   :   %f\n\n", *((float*)sinpi_withoutDD_num));
				printf("-----------------------------------------------------\n");
				if (Gflops_sinpi_withoutDD > sinpi_withoutDD_maxGflops){
						sinpi_withoutDD_maxGflops = Gflops_sinpi_withoutDD;

				}

			}

			//launch kernel_cospi_withDD
			if (!strcmp(kernelname.c_str(), "cospi_withDD")
					|| !strcmp(kernelname.c_str(), "all_kernels")) {
				std::cout << "KERNEL NAME:" <<kernelname.c_str()<< vectorSize <<std::endl;
				printf("Begin to launch kernel_cospi_withDD\n");
				for (i = 0; i < iterations; i++) {
					status = clEnqueueNDRangeKernel(commandQueue, kernel[2] ,
							WorkDim, NULL, &szGlobalWorkSize, &szLocalWorkSize,
							0, NULL, &event_cospi_withDD);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");

					//record time kernel_cospi
					status = clWaitForEvents(1, &event_cospi_withDD);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");
					cl_ulong start_cospi_withDD, end_cospi_withDD;
					status = clGetEventProfilingInfo(event_cospi_withDD,
							CL_PROFILING_COMMAND_START, sizeof(cl_ulong),
							&start_cospi_withDD, NULL);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");
					status = clGetEventProfilingInfo(event_cospi_withDD,
							CL_PROFILING_COMMAND_END, sizeof(cl_ulong),
							&end_cospi_withDD, NULL);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");

					float executionTime_cospi_withDD = (end_cospi_withDD
							- start_cospi_withDD);
					if (executionTime_cospi_withDD_max
							< executionTime_cospi_withDD) {
						executionTime_cospi_withDD_max = executionTime_cospi_withDD;
					}
					if (executionTime_cospi_withDD_min
							> executionTime_cospi_withDD) {
						executionTime_cospi_withDD_min = executionTime_cospi_withDD;
					}
					executionTime_cospi_withDD_avg += executionTime_cospi_withDD;

				}
				executionTime_cospi_withDD_avg = (executionTime_cospi_withDD_avg
						- executionTime_cospi_withDD_max
						- executionTime_cospi_withDD_min) / (iterations - 2);
				size_t time_cospi_withDD=0;
				/*if(vectorSize==1||vectorSize==2)
					time_cospi_withDD= 64 * szForNum * vectorSize;
				if(vectorSize==4)
					time_cospi_withDD= 32 * szForNum * vectorSize;
				if(vectorSize==8||vectorSize==16)*/
					time_cospi_withDD= 64 * szForNum * vectorSize;
				float Gflops_cospi_withDD =
						(time_cospi_withDD * szGlobalWorkSize)
								/ executionTime_cospi_withDD_avg;
				status = clEnqueueReadBuffer(commandQueue, result_cospi_withDD,
						CL_TRUE, 0, sizeof(cl_mem), cospi_withDD_num, NULL, NULL,
						NULL);
				CHECK_OPENCL_ERROR(status,
						"clEnqueueNDRangeKernel failed. (kernel_none)");
				printf("GFLOPs-cospi_withDD : %f\n", Gflops_cospi_withDD);
				printf("Result-cospi_withDD   :   %f\n\n", *((float*)cospi_withDD_num));
				printf("-----------------------------------------------------\n");
				if (Gflops_cospi_withDD > cospi_withDD_maxGflops){
						cospi_withDD_maxGflops = Gflops_cospi_withDD;

				}
			}

			//launch kernel_cospi_withoutDD
			if (!strcmp(kernelname.c_str(), "cospi_withoutDD")
					|| !strcmp(kernelname.c_str(), "all_kernels")) {
				std::cout << "KERNEL NAME:" <<kernelname.c_str()<< vectorSize <<std::endl;
				printf("Begin to launch kernel_cospi_withoutDD\n");
				for (i = 0; i < iterations; i++) {
					status = clEnqueueNDRangeKernel(commandQueue, kernel[3] ,
							WorkDim, NULL, &szGlobalWorkSize, &szLocalWorkSize,
							0, NULL, &event_cospi_withoutDD);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");

					//record time kernel_cospi
					status = clWaitForEvents(1, &event_cospi_withoutDD);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");
					cl_ulong start_cospi_withoutDD, end_cospi_withoutDD;
					status = clGetEventProfilingInfo(event_cospi_withoutDD,
							CL_PROFILING_COMMAND_START, sizeof(cl_ulong),
							&start_cospi_withoutDD, NULL);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");
					status = clGetEventProfilingInfo(event_cospi_withoutDD,
							CL_PROFILING_COMMAND_END, sizeof(cl_ulong),
							&end_cospi_withoutDD, NULL);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");

					float executionTime_cospi_withoutDD = (end_cospi_withoutDD
							- start_cospi_withoutDD);
					if (executionTime_cospi_withoutDD_max
							< executionTime_cospi_withoutDD) {
						executionTime_cospi_withoutDD_max = executionTime_cospi_withoutDD;
					}
					if (executionTime_cospi_withoutDD_min
							> executionTime_cospi_withoutDD) {
						executionTime_cospi_withoutDD_min = executionTime_cospi_withoutDD;
					}
					executionTime_cospi_withoutDD_avg += executionTime_cospi_withoutDD;

				}
				executionTime_cospi_withoutDD_avg = (executionTime_cospi_withoutDD_avg
						- executionTime_cospi_withoutDD_max
						- executionTime_cospi_withoutDD_min) / (iterations - 2);
				size_t time_cospi_withoutDD=0;
				//if(vectorSize==1)
					time_cospi_withoutDD= 10 * 10 * szForNum * vectorSize;
				/*if(vectorSize==2||vectorSize==4)
					time_cospi_withoutDD= 128 * 12 * szForNum * vectorSize;
				if(vectorSize==8)
					time_cospi_withoutDD= 16 * 12 * szForNum * vectorSize;
				if(vectorSize==16)
					time_cospi_withoutDD= 8 * 12  * szForNum * vectorSize;*/
				float Gflops_cospi_withoutDD =
						(time_cospi_withoutDD * szGlobalWorkSize)
								/ executionTime_cospi_withoutDD_avg;
				status = clEnqueueReadBuffer(commandQueue, result_cospi_withoutDD,
						CL_TRUE, 0, sizeof(cl_mem), cospi_withoutDD_num, NULL, NULL,
						NULL);
				CHECK_OPENCL_ERROR(status,
						"clEnqueueNDRangeKernel failed. (kernel_none)");
				printf("GFLOPs-cospi_withoutDD : %f\n", Gflops_cospi_withoutDD);
				printf("Result-cospi_withoutDD   :   %f\n\n", *((float*)cospi_withoutDD_num));
				printf("-----------------------------------------------------\n");
				if (Gflops_cospi_withoutDD > cospi_withoutDD_maxGflops){
						cospi_withoutDD_maxGflops = Gflops_cospi_withoutDD;

				}
			}

			//launch kernel_tanpi_withDD
			if (!strcmp(kernelname.c_str(), "tanpi_withDD")
					|| !strcmp(kernelname.c_str(), "all_kernels")) {
				std::cout << "KERNEL NAME:" <<kernelname.c_str()<< vectorSize <<std::endl;
				printf("Begin to launch kernel_tanpi_withDD\n");
				for (i = 0; i < iterations; i++) {
					status = clEnqueueNDRangeKernel(commandQueue, kernel[4] ,
							WorkDim, NULL, &szGlobalWorkSize, &szLocalWorkSize,
							0, NULL, &event_tanpi_withDD);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");

					//record time kernel_tanpi
					status = clWaitForEvents(1, &event_tanpi_withDD);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");
					cl_ulong start_tanpi_withDD, end_tanpi_withDD;
					status = clGetEventProfilingInfo(event_tanpi_withDD,
							CL_PROFILING_COMMAND_START, sizeof(cl_ulong),
							&start_tanpi_withDD, NULL);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");
					status = clGetEventProfilingInfo(event_tanpi_withDD,
							CL_PROFILING_COMMAND_END, sizeof(cl_ulong),
							&end_tanpi_withDD, NULL);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");

					float executionTime_tanpi_withDD = (end_tanpi_withDD
							- start_tanpi_withDD);
					if (executionTime_tanpi_withDD_max
							< executionTime_tanpi_withDD) {
						executionTime_tanpi_withDD_max = executionTime_tanpi_withDD;
					}
					if (executionTime_tanpi_withDD_min
							> executionTime_tanpi_withDD) {
						executionTime_tanpi_withDD_min = executionTime_tanpi_withDD;
					}
					executionTime_tanpi_withDD_avg += executionTime_tanpi_withDD;

				}
				executionTime_tanpi_withDD_avg = (executionTime_tanpi_withDD_avg
						- executionTime_tanpi_withDD_max
						- executionTime_tanpi_withDD_min) / (iterations - 2);
				size_t time_tanpi_withDD=0;
					time_tanpi_withDD= 64 * szForNum * vectorSize;
				float Gflops_tanpi_withDD =
						(time_tanpi_withDD * szGlobalWorkSize)
								/ executionTime_tanpi_withDD_avg;
				status = clEnqueueReadBuffer(commandQueue, result_tanpi_withDD,
						CL_TRUE, 0, sizeof(cl_mem), tanpi_withDD_num, NULL, NULL,
						NULL);
				CHECK_OPENCL_ERROR(status,
						"clEnqueueNDRangeKernel failed. (kernel_none)");
				printf("GFLOPs-tanpi_withDD : %f\n", Gflops_tanpi_withDD);
				printf("Result-tanpi_withDD   :   %f\n\n", *((float*)tanpi_withDD_num));
				printf("-----------------------------------------------------\n");
				if (Gflops_tanpi_withDD > tanpi_withDD_maxGflops){
						tanpi_withDD_maxGflops = Gflops_tanpi_withDD;

				}
			}

			//launch kernel_tanpi_withoutDD
			if (!strcmp(kernelname.c_str(), "tanpi_withoutDD")
					|| !strcmp(kernelname.c_str(), "all_kernels")) {
				std::cout << "KERNEL NAME:" <<kernelname.c_str()<< vectorSize <<std::endl;
				printf("Begin to launch kernel_tanpi_withoutDD\n");
				for (i = 0; i < iterations; i++) {
					status = clEnqueueNDRangeKernel(commandQueue, kernel[5] ,
							WorkDim, NULL, &szGlobalWorkSize, &szLocalWorkSize,
							0, NULL, &event_tanpi_withoutDD);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");

					//record time kernel_tanpi
					status = clWaitForEvents(1, &event_tanpi_withoutDD);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");
					cl_ulong start_tanpi_withoutDD, end_tanpi_withoutDD;
					status = clGetEventProfilingInfo(event_tanpi_withoutDD,
							CL_PROFILING_COMMAND_START, sizeof(cl_ulong),
							&start_tanpi_withoutDD, NULL);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");
					status = clGetEventProfilingInfo(event_tanpi_withoutDD,
							CL_PROFILING_COMMAND_END, sizeof(cl_ulong),
							&end_tanpi_withoutDD, NULL);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");

					float executionTime_tanpi_withoutDD = (end_tanpi_withoutDD
							- start_tanpi_withoutDD);
					if (executionTime_tanpi_withoutDD_max
							< executionTime_tanpi_withoutDD) {
						executionTime_tanpi_withoutDD_max = executionTime_tanpi_withoutDD;
					}
					if (executionTime_tanpi_withoutDD_min
							> executionTime_tanpi_withoutDD) {
						executionTime_tanpi_withoutDD_min = executionTime_tanpi_withoutDD;
					}
					executionTime_tanpi_withoutDD_avg += executionTime_tanpi_withoutDD;

				}
				executionTime_tanpi_withoutDD_avg = (executionTime_tanpi_withoutDD_avg
						- executionTime_tanpi_withoutDD_max
						- executionTime_tanpi_withoutDD_min) / (iterations - 2);
				size_t time_tanpi_withoutDD=0;
				time_tanpi_withoutDD= 10 * 10* szForNum * vectorSize;
				float Gflops_tanpi_withoutDD =
						(time_tanpi_withoutDD * szGlobalWorkSize)
								/ executionTime_tanpi_withoutDD_avg;
				status = clEnqueueReadBuffer(commandQueue, result_tanpi_withoutDD,
						CL_TRUE, 0, sizeof(cl_mem), tanpi_withoutDD_num, NULL, NULL,
						NULL);
				CHECK_OPENCL_ERROR(status,
						"clEnqueueNDRangeKernel failed. (kernel_none)");
				printf("GFLOPs-tanpi_withoutDD : %f\n", Gflops_tanpi_withoutDD);
				printf("Result-tanpi_withoutDD   :   %f\n\n", *((float*)tanpi_withoutDD_num));
				printf("-----------------------------------------------------\n");
				if (Gflops_tanpi_withoutDD > tanpi_withoutDD_maxGflops){
						tanpi_withoutDD_maxGflops = Gflops_tanpi_withoutDD;

				}

			}

			//launch kernel_sincos_withDD
			if (!strcmp(kernelname.c_str(), "sincos_withDD")
					|| !strcmp(kernelname.c_str(), "all_kernels")) {
				std::cout << "KERNEL NAME:" <<kernelname.c_str()<< vectorSize <<std::endl;
				printf("Begin to launch kernel_sincos_withDD\n");
				for (i = 0; i < iterations; i++) {
					status = clEnqueueNDRangeKernel(commandQueue, kernel[6] ,
							WorkDim, NULL, &szGlobalWorkSize, &szLocalWorkSize,
							0, NULL, &event_sincos_withDD);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");

					//record time kernel_sincos
					status = clWaitForEvents(1, &event_sincos_withDD);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");
					cl_ulong start_sincos_withDD, end_sincos_withDD;
					status = clGetEventProfilingInfo(event_sincos_withDD,
							CL_PROFILING_COMMAND_START, sizeof(cl_ulong),
							&start_sincos_withDD, NULL);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");
					status = clGetEventProfilingInfo(event_sincos_withDD,
							CL_PROFILING_COMMAND_END, sizeof(cl_ulong),
							&end_sincos_withDD, NULL);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");

					float executionTime_sincos_withDD = (end_sincos_withDD
							- start_sincos_withDD);
					if (executionTime_sincos_withDD_max
							< executionTime_sincos_withDD) {
						executionTime_sincos_withDD_max = executionTime_sincos_withDD;
					}
					if (executionTime_sincos_withDD_min
							> executionTime_sincos_withDD) {
						executionTime_sincos_withDD_min = executionTime_sincos_withDD;
					}
					executionTime_sincos_withDD_avg += executionTime_sincos_withDD;

				}
				executionTime_sincos_withDD_avg = (executionTime_sincos_withDD_avg
						- executionTime_sincos_withDD_max
						- executionTime_sincos_withDD_min) / (iterations - 2);
				size_t time_sincos_withDD=0;
					time_sincos_withDD= 64*szForNum * vectorSize;
				float Gflops_sincos_withDD =
						(time_sincos_withDD * szGlobalWorkSize)
								/ executionTime_sincos_withDD_avg;
				status = clEnqueueReadBuffer(commandQueue, result_sincos_withDD,
						CL_TRUE, 0, sizeof(cl_mem), sincos_withDD_num, NULL, NULL,
						NULL);
				CHECK_OPENCL_ERROR(status,
						"clEnqueueNDRangeKernel failed. (kernel_none)");
				printf("GFLOPs-sincos_withDD : %f\n", Gflops_sincos_withDD);
				printf("Result-sincos_withDD   :   %f\n\n", *((float*)sincos_withDD_num));
				printf("-----------------------------------------------------\n");
				if (Gflops_sincos_withDD > sincos_withDD_maxGflops){
						sincos_withDD_maxGflops = Gflops_sincos_withDD;

				}
			}

			//launch kernel_sincos_withoutDD
			if (!strcmp(kernelname.c_str(), "sincos_withoutDD")
					|| !strcmp(kernelname.c_str(), "all_kernels")) {
				std::cout << "KERNEL NAME:" <<kernelname.c_str()<< vectorSize <<std::endl;
				printf("Begin to launch kernel_sincos_withoutDD\n");
				for (i = 0; i < iterations; i++) {
					status = clEnqueueNDRangeKernel(commandQueue, kernel[7] ,
							WorkDim, NULL, &szGlobalWorkSize, &szLocalWorkSize,
							0, NULL, &event_sincos_withoutDD);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");

					//record time kernel_sincos
					status = clWaitForEvents(1, &event_sincos_withoutDD);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");
					cl_ulong start_sincos_withoutDD, end_sincos_withoutDD;
					status = clGetEventProfilingInfo(event_sincos_withoutDD,
							CL_PROFILING_COMMAND_START, sizeof(cl_ulong),
							&start_sincos_withoutDD, NULL);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");
					status = clGetEventProfilingInfo(event_sincos_withoutDD,
							CL_PROFILING_COMMAND_END, sizeof(cl_ulong),
							&end_sincos_withoutDD, NULL);
					CHECK_OPENCL_ERROR(status,
							"clEnqueueNDRangeKernel failed. (kernel_none)");

					float executionTime_sincos_withoutDD = (end_sincos_withoutDD
							- start_sincos_withoutDD);
					if (executionTime_sincos_withoutDD_max
							< executionTime_sincos_withoutDD) {
						executionTime_sincos_withoutDD_max = executionTime_sincos_withoutDD;
					}
					if (executionTime_sincos_withoutDD_min
							> executionTime_sincos_withoutDD) {
						executionTime_sincos_withoutDD_min = executionTime_sincos_withoutDD;
					}
					executionTime_sincos_withoutDD_avg += executionTime_sincos_withoutDD;

				}
				executionTime_sincos_withoutDD_avg = (executionTime_sincos_withoutDD_avg
						- executionTime_sincos_withoutDD_max
						- executionTime_sincos_withoutDD_min) / (iterations - 2);
				size_t time_sincos_withoutDD=0;
				//if(vectorSize==1||vectorSize==2||vectorSize==4||vectorSize==8)
					time_sincos_withoutDD= 10 * 10 * szForNum * vectorSize;
				/*if(vectorSize==8)
					time_sincos_withoutDD=  640 * 10 * szForNum * vectorSize;*/
				//if(vectorSize==16)
				//	time_sincos_withoutDD=  10 * 10 * szForNum * vectorSize;
				float Gflops_sincos_withoutDD =
						(time_sincos_withoutDD * szGlobalWorkSize)
								/ executionTime_sincos_withoutDD_avg;
				status = clEnqueueReadBuffer(commandQueue, result_sincos_withoutDD,
						CL_TRUE, 0, sizeof(cl_mem), sincos_withoutDD_num, NULL, NULL,
						NULL);
				CHECK_OPENCL_ERROR(status,
						"clEnqueueNDRangeKernel failed. (kernel_none)");
				printf("GFLOPs-sincos_withoutDD : %f\n", Gflops_sincos_withoutDD);
				printf("Result-sincos_withoutDD   :   %f\n\n", *((float*)sincos_withoutDD_num));
				printf("-----------------------------------------------------\n");
				if (Gflops_sincos_withoutDD > sincos_withoutDD_maxGflops){
						sincos_withoutDD_maxGflops = Gflops_sincos_withoutDD;

				}

			}

	}	
	printf("sinpi_withDD_maxGflops = %f\n",sinpi_withDD_maxGflops);
	printf("sinpi_withoutDD_maxGflops = %f\n",sinpi_withoutDD_maxGflops);
	printf("cospi_withDD_maxGflops = %f\n",cospi_withDD_maxGflops);
	printf("cospi_withoutDD_maxGflops = %f\n",cospi_withoutDD_maxGflops);
	printf("tanpi_withDD_maxGflops = %f\n",tanpi_withDD_maxGflops);
	printf("tanpi_withoutDD_maxGflops = %f\n",tanpi_withoutDD_maxGflops);
	printf("sincos_withDD_maxGflops = %f\n",sincos_withDD_maxGflops);
	printf("sincos_withoutDD_maxGflops = %f\n",sincos_withoutDD_maxGflops);

	return SDK_SUCCESS;
}
Example #3
0
/*!
 * @function clut_blurImage_local_unlimited
 * Blurs the image at [filename] with a filter of size [filter_size], and saves the result
 * to the file "output_unlimited.png". This function should be optimized to run on
 * local memory.
 * @param filename
 * The name of the file.
 * @param filter_size
 * The size of the blur filter.
 * @return
 * 0 on success, non-0 on failure.
 */
int clut_blurImage_local_unlimited(const cl_device_id device, const char * const filename, const unsigned int filter_size)
{
	const char * const fname = "clut_blurImage_local";
	int return_value = 1;
	cl_int ret;

	if (NULL == filename) {
		Debug_out(DEBUG_HOMEWORK, "%s: NULL pointer argument.\n", fname);
		goto error1;
	}

	/* compute work group size */
	size_t local_width, local_height;
	if (0 != clut_getMaxWGSize(device, &local_width, &local_height)) {
		Debug_out(DEBUG_HOMEWORK, "%s: Unable to get work group sizes.\n", fname);
		goto error1;
	}
	Debug_out(DEBUG_HOMEWORK, "%s: Max work group size is [%zu]x[%zu].\n", fname, local_width, local_height);

	/* openCL wants to know the size of __local statically allocated arrays at compile time,
	 * so the local size must be set with a #define */
	char *flags = calloc(128, sizeof(char));
	if (NULL == flags) {
		Debug_out(DEBUG_HOMEWORK, "%s: A calloc failed.\n", fname);
		goto error1;
	}
	sprintf(flags, "-D LOCAL_WIDTH=%zu -D LOCAL_HEIGHT=%zu -D FILTER_SIZE=%d", local_width, local_height, filter_size);
	Debug_out(DEBUG_HOMEWORK, "%s: Local flags are: '%s'.\n", fname, flags);

	/* Create context */
	cl_context context = clCreateContext(NULL, 1, &device, clut_contextCallback, "clut_blurImage_local_unlimited", &ret);
	CLUT_CHECK_ERROR(ret, "Unable to create context", error2);
	Debug_out(DEBUG_HOMEWORK, "%s: Created context successfully.\n", fname);

	/* Create program */
	cl_program program = clut_createProgramFromFile(context, "homework_unlimited.cl", flags);
	if (NULL == program) {
		Debug_out(DEBUG_HOMEWORK, "%s: Unable to create program.\n", fname);
		goto error3;
	}
	Debug_out(DEBUG_HOMEWORK, "%s: Program created.\n", fname);

	/* Create kernel */
	cl_kernel kernel = clCreateKernel(program, "blurImage_local_unlimited", &ret);
	CLUT_CHECK_ERROR(ret, "Unable to create kernel", error4);
	Debug_out(DEBUG_HOMEWORK, "%s: Kernel created.\n", fname);

	/* Create command_queue */
	cl_command_queue command_queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &ret);
	CLUT_CHECK_ERROR(ret, "Unable to create command queue", error5);
	Debug_out(DEBUG_HOMEWORK, "%s: Command queue created.\n", fname);

	/* open source image */
	int width, height;
	cl_mem source_image = clut_loadImageFromFile(context, filename, &width, &height);
	if (NULL == source_image) {
		Debug_out(DEBUG_HOMEWORK, "%s: Unable to read source image.\n", fname);
		goto error6;
	}

	if ((filter_size > (unsigned int) width) || (filter_size > (unsigned int) height)) {
		Debug_out(DEBUG_HOMEWORK, "%s: Filter does not fit in image.\n", fname);
		goto error7;
	}

	/* crate destination image */
	cl_image_format image_format = {0, 0};

	cl_image_desc image_desc = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
	image_desc.image_type = CL_MEM_OBJECT_IMAGE2D;
//	image_desc.image_width		= 0;
//	image_desc.image_height		= 0;
//	image_desc.image_depth		= 0; /* only for 3D images */
//	image_desc.image_array_size	= 0; /* only for image arrays */
//	image_desc.image_row_pitch	= 0;
//	image_desc.image_slice_pitch	= 0; /* only for 3D images */
//	image_desc.num_mip_levels	= 0; /* mandatory */
//	image_desc.num_samples		= 0; /* mandatory */
//	image_desc.buffer		= NULL; /* only for 1D image buffers */

	ret = clGetImageInfo(source_image, CL_IMAGE_FORMAT, sizeof(image_format), &image_format, NULL);
	CLUT_CHECK_ERROR(ret, "Unable to get source image format information", error7);

	int components = clut_getImageFormatComponents(image_format);
	if (0 > components) {
		Debug_out(DEBUG_HOMEWORK, "%s: Unknown components for source image.\n", fname);
		goto error7;
	}
	Debug_out(DEBUG_HOMEWORK, "%s: Source image has %d components.\n", fname, components);

	image_desc.image_width = width - filter_size + 1;
	image_desc.image_height = height - filter_size + 1;
	image_desc.image_row_pitch = image_desc.image_width * components;

	cl_mem result_image = clCreateImage(context, CL_MEM_WRITE_ONLY, &image_format, &image_desc, NULL, &ret);
	CLUT_CHECK_ERROR(ret, "Unable to create second image", error7);

	/* fill result image with black */
	const unsigned int fill_color[4] = { 0, 0, 0, 255 };
	const size_t fill_origin[3] = { 0, 0, 0 };
	const size_t fill_region[3] = { width - filter_size + 1, height - filter_size + 1, 1 };
	ret = clEnqueueFillImage(command_queue, result_image, fill_color, fill_origin, fill_region, 0, NULL, NULL);
	CLUT_CHECK_ERROR(ret, "Unable to fill result image", error8);

	Debug_out(DEBUG_HOMEWORK, "%s: Images created.\n", fname);

	/* create filter matrix */
	unsigned char *filter_matrix = createFilterMatrix(filter_size);
	if (NULL == filter_matrix) {
		Debug_out(DEBUG_HOMEWORK, "%s: Unable to create filter matrix.\n", fname);
		goto error8;
	}
	Debug_out(DEBUG_HOMEWORK, "%s: Filter matrix created.\n", fname);
//	printFilterMatrix(filter_matrix, filter_size);

	/* copy filter matrix to device */
	cl_mem filter_matrix_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, filter_size * filter_size, filter_matrix, &ret);
	CLUT_CHECK_ERROR(ret, "Unable to create filter matrix buffer on device", error9);

	/* set kernel arguments */
	ret = clSetKernelArg(kernel, 0, sizeof(source_image), (void *) &source_image);
	CLUT_CHECK_ERROR(ret, "Unable to set source image argument", error10);
	Debug_out(DEBUG_HOMEWORK, "%s: Source image argument set.\n", fname);
	ret = clSetKernelArg(kernel, 1, sizeof(result_image), (void *) &result_image);
	CLUT_CHECK_ERROR(ret, "Unable to set result image argument", error10);
	Debug_out(DEBUG_HOMEWORK, "%s: Result image argument set.\n", fname);
	ret = clSetKernelArg(kernel, 2, sizeof(filter_matrix_buffer), (void *) &filter_matrix_buffer);
	CLUT_CHECK_ERROR(ret, "Unable to set filter matrix argument", error10);
	Debug_out(DEBUG_HOMEWORK, "%s: Filter matrix argument set.\n", fname);

	Debug_out(DEBUG_HOMEWORK, "%s: All kernel arguments set.\n", fname);

	const size_t work_size[2] = {
		COMPUTE_GLOBAL_SIZE(height - filter_size + 1, local_height),
		COMPUTE_GLOBAL_SIZE(width - filter_size + 1, local_width) };
	const size_t wg_size[2] = { local_height, local_width };
	Debug_out(DEBUG_HOMEWORK, "%s: work size is [%zu]x[%zu].\n", fname, work_size[0], work_size[1]);

	/* run kernel */
	cl_event kernel_event;
	ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, work_size, wg_size, 0, NULL, &kernel_event);
	CLUT_CHECK_ERROR(ret, "Unable to enqueue kernel", error10);

	ret = clFinish(command_queue);
	CLUT_CHECK_ERROR(ret, "Unable to finish commands in queue", error10);
	Debug_out(DEBUG_HOMEWORK, "%s: Kernel executed.\n", fname);
	ret = clWaitForEvents(1, &kernel_event);
	CLUT_CHECK_ERROR(ret, "Unable to wait for kernel event", error10);

	/* check that kernel executed correctly */
	cl_int kernel_ret;
	ret = clGetEventInfo(kernel_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(kernel_ret), &kernel_ret, NULL);
	CLUT_CHECK_ERROR(ret, "Unable to get kernel status", error10);
	Debug_out(DEBUG_HOMEWORK, "%s: Kernel status is %d.\n", fname, kernel_ret);
	if (CL_COMPLETE != kernel_ret) {
		Debug_out(DEBUG_HOMEWORK, "%s: kernel execution failed: %s.\n", fname, clut_getErrorDescription(kernel_ret));
		goto error10;
	}

	cl_ulong end_time;
	ret = clGetEventProfilingInfo(kernel_event, CL_PROFILING_COMMAND_END, sizeof(end_time), &end_time, NULL);
	CLUT_CHECK_ERROR(ret, "Unable to get kernel event end time", error10);
	if (0 == end_time) {
		Debug_out(DEBUG_HOMEWORK, "%s: kernel execution took 0 seconds.\n", fname);
		goto error10;
	}

	cl_double time_double = clut_getEventDuration(kernel_event);
	cl_ulong time_ulong = clut_getEventDuration_ns(kernel_event);
	Debug_out(DEBUG_HOMEWORK, "%s: Blurring took %f seconds (%lld nanoseconds).\n", fname, time_double, time_ulong);

	/* save image back to file */
	clut_saveImageToFile("output_unlimited.png", command_queue, result_image);

	/* output filter size, local width, local height, and duration in nanoseconds for profiling */
	printf("%d,%zu,%zu,%lld\n", filter_size, local_width, local_height, clut_getEventDuration_ns(kernel_event));

	return_value = 0;

error10:
	clReleaseMemObject(filter_matrix_buffer);
error9:
	free(filter_matrix);
error8:
	clReleaseMemObject(result_image);
error7:
	clReleaseMemObject(source_image);
error6:
	clReleaseCommandQueue(command_queue);
error5:
	clReleaseKernel(kernel);
error4:
	clReleaseProgram(program);
error3:
	clReleaseContext(context);
error2:
	free(flags);
error1:
	return return_value;
}
Example #4
0
int main(int argc, char** argv)
{
	char x_data[] = {70, 100, 100,   8, 100,  2, 80, 101, 100, 100, 50, 30, 0};
	char y_data[] = {2,    1,   8, 100,  11, 30,  7,  10,  14,   8, 50,  3, 0};
	size_t const count = sizeof(x_data);

	char *const a_data = malloc(count);
	char *const b_data = malloc(count);
	char *const c_data = malloc(count);

	memcpy(a_data, x_data, count);
	memcpy(b_data, y_data, count);
	memset(c_data,   0x00, count);

	cl_platform_id platforms[32];
	cl_uint num_platforms;
	char vendor[1024];
	cl_device_id devices[32];
	cl_uint num_devices;
	char deviceName[1024];
	cl_int err;
	
        err = clGetPlatformIDs(32, platforms, &num_platforms);
	cl_assert(err, "There was a problem getting the platforms");
	for(size_t p = 0; p < num_platforms; ++p) {
		cl_platform_id platform = platforms[p];
		clGetPlatformInfo (platform, CL_PLATFORM_VENDOR, sizeof(vendor), vendor, NULL);
		printf("Platform Vendor:\t%s\n", vendor);

		err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 32, devices, &num_devices);
		cl_assert(err, "There was a problem getting the device list");

		cl_context context = clCreateContext(NULL, num_devices, devices, NULL, NULL, &err);
		cl_assert(err, "There was a problem creating the context.");

		cl_program program = clCreateProgramWithSource(context, 1, &addKernel, NULL, &err);
		cl_assert(err, "There was a problem creating the program.");

		err = clBuildProgram(program, num_devices, devices, NULL, NULL, NULL);
		for(size_t d = 0; d < num_devices; ++d) {
			cl_device_id device = devices[d];
			char buffer[2048];
			size_t length = 0;
			clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 2048, buffer, &length);
			if (length > 1)
				printf("%s\n", buffer);
			cl_assert(err, "There was a problem building the program.");
		}


		cl_kernel kernel = clCreateKernel(program, "add", &err);
		cl_assert(err, "There was a problem getting the kernel.");

		cl_mem a_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,  
				sizeof(char) * count, a_data, &err);
		cl_assert(err, "There was a problem creating the a_buffer.");
		cl_mem b_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,  
				sizeof(char) * count, b_data, &err);
		cl_assert(err, "There was a problem creating the b_buffer.");
		cl_mem c_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, 
				sizeof(char) * count, c_data, &err);
		cl_assert(err, "There was a problem creating the c_buffer");


		err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &a_buffer);
		err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_buffer);
		err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &c_buffer);
		err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &count);
		cl_assert(err, "There was a problem setting the arguments.");

		for(size_t d = 0; d < num_devices; ++d) {
			cl_device_id device = devices[d];

			clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL);
			printf("  Device Name:\t%s\n", deviceName);

			cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE;
			cl_command_queue commands = clCreateCommandQueue(context, device, properties, &err);
			cl_assert(err, "There was a problem creating the command queue");

			size_t local[]  = { count };
			size_t global[] = { count };
			cl_event event;
			err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, global, local, 0, NULL, &event);
			cl_assert(err, "There was a problem queueing the kernel.");

			err = clEnqueueReadBuffer(commands, c_buffer, CL_TRUE, 0, sizeof(char) * count, 
					c_data, 0, NULL, NULL);  
			cl_assert(err, "There was a problem reading the output buffer.");

			clFinish(commands);
			cl_ulong start, stop;
			err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);
			err |= clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(stop), &stop, NULL);
			cl_assert(err, "There was a problem getting profiling information.");
			printf("  Time: \t%lu ns.\n", stop - start);
			printf("  Output: \t%s\n", c_data);
			printf("\n");
			clReleaseCommandQueue(commands);

		}

		clReleaseMemObject(a_buffer);
		clReleaseMemObject(b_buffer);
		clReleaseMemObject(c_buffer);

		clReleaseProgram(program);
		clReleaseKernel(kernel);
		clReleaseContext(context);
	}

        free(a_data);
	free(b_data);
	free(c_data);

	return 0;
}
/**************************************************************
function to calculate Matrix Infinity norm
*************************************************************/
void    matInfinityNormGMSP(cl_uint numDevices,cl_device_id *devices, cl_program program,cl_context context,float * h_Mat, int *h_Rowcol, float *h_InfiNorm,int height,int width)
{
        cl_command_queue        cmdQueue;     // Command Queue  object
        cl_mem                  d_Mat;     //  device input buffer
        cl_mem                  d_Rowcol;     //  device input buffer
        cl_mem                  d_InfiNorm;      // device output buffer
        cl_kernel               kernel;        //  kernel object
        cl_int                  err;            // Holds the error 
        cl_event                events;        // event object
        double                  totalTime=0.0; //holds total time taken for execution
        size_t                  globalWorkSize[1];    // holds global_work size
        size_t                  localWorkSize[1];    // holds local work size
        int                      count;
        char                    dbuff[100];          
        double                  gflops=0.0;             //holds total achieved gflops
	cl_ulong startTime, endTime,elapsedTime; //holds time
	float executionTimeInSeconds;        //holds total execution time
	cl_event gpuExec[1];         // event object

        /* Get device Name */
       err = clGetDeviceInfo(devices[0], CL_DEVICE_NAME, sizeof(dbuff), &dbuff, NULL);
       OPENCL_CHECK_STATUS("Failed  to Get device Name",err);

	/** Create the command queue **/
        cmdQueue = clCreateCommandQueue( context, devices[0], CL_QUEUE_PROFILING_ENABLE, &err);
        if( err != CL_SUCCESS || cmdQueue == 0)
        {
               printf("\n\t Failed to create command queue  \n" );
               exit (-1);
        }
        
        /* create buffers*/
        d_Mat =clCreateBuffer(context,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,(height*width)*sizeof(float),h_Mat,&err);
        OPENCL_CHECK_STATUS("Failed to create device input buffer A  ",err);

        
        d_Rowcol =clCreateBuffer(context,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,2*sizeof(cl_int),h_Rowcol,&err);
        OPENCL_CHECK_STATUS("Failed to create device input buffer d_rowcol  ",err);

   	 d_InfiNorm = clCreateBuffer ( context, CL_MEM_WRITE_ONLY , sizeof(float),NULL, &err);
        OPENCL_CHECK_STATUS( "Failed to create device output  buffer   ",err);

         // Create the kernel
         kernel = clCreateKernel ( program, "infinityNorm_kernel", &err);
         OPENCL_CHECK_STATUS(" Create kernel failed ",err);

          //  Set the arguments
     	err = clSetKernelArg( kernel, 0, sizeof(cl_mem), (void *) &d_Mat);
     	OPENCL_CHECK_STATUS( "Set  kernel argument 0 failed ",err);
           	
	err = clSetKernelArg( kernel, 1, sizeof(cl_mem), (void *) &d_Rowcol);
        OPENCL_CHECK_STATUS( "Set  kernel argument 1 failed ",err);
        
   	err = clSetKernelArg( kernel, 2, sizeof(cl_mem), (void *) &d_InfiNorm);
        OPENCL_CHECK_STATUS( "Set  kernel argument 2 failed ",err);
           
         //set Global work size and local work size
        globalWorkSize [0]= height   ; // ND Range Size for each kernel launch 

         //launch the kernel
         err=clEnqueueNDRangeKernel(cmdQueue,kernel,1,NULL,globalWorkSize,NULL,0,NULL,&gpuExec[0]);
         OPENCL_CHECK_STATUS( " Kernel launch failed ",err);
	
	//completion of all commands to command queue
	err = clFinish(cmdQueue);
        OPENCL_CHECK_STATUS("clFinish",err);

	//calculate start time and end time
	clGetEventProfilingInfo(gpuExec[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL);
        clGetEventProfilingInfo(gpuExec[0], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);

	/*calculate total elapsed time*/
 	elapsedTime = endTime-startTime;
	
	/* total execuition time in seconds*/
 	executionTimeInSeconds = (float)(1.0e-9 * elapsedTime);

         //read the result
         err =clEnqueueReadBuffer(cmdQueue,d_InfiNorm,CL_TRUE,0,sizeof(cl_float),h_InfiNorm,0,0,&events);
         OPENCL_CHECK_STATUS(" Read output failed ",err);

	/* calculate gflops*/
        gflops= (1.0e-9 * ((1.0 *height*height) / executionTimeInSeconds));


        // Print the gflops on the screen
	 print_on_screen("Matrix Infinity Norm",executionTimeInSeconds,height,gflops,1);

        //free opencl objects
        if ( kernel )   clReleaseKernel(kernel);
        if ( cmdQueue) clReleaseCommandQueue(cmdQueue);
        if ( events )   clReleaseEvent(events);
	clReleaseMemObject(d_Mat);
        clReleaseMemObject(d_Rowcol);
        clReleaseMemObject(d_InfiNorm);
}
Example #6
0
//--------------------------------------------------------------------------------------
// Name: Compute()
// Desc:
//--------------------------------------------------------------------------------------
BOOL CSample::Compute()
{
    m_Timer.Reset();
    m_Timer.Start();

    char str[256];

    // Set the kernel arguments
    cl_int errNum = 0;
    errNum |= clSetKernelArg( m_kernel, 0, sizeof(cl_mem), &m_srcA );
    errNum |= clSetKernelArg( m_kernel, 1, sizeof(cl_mem), &m_srcB );
    errNum |= clSetKernelArg( m_kernel, 2, sizeof(cl_mem), &m_result );
    if( errNum != CL_SUCCESS )
    {
        FrmLogMessage( "Error setting kernel arguments" );
        return FALSE;
    }

    size_t globalWorkSize[1] = { m_nNumVectors };
    size_t localWorkSize[1] = { 1 };

    cl_event kernel_event;
    cl_ulong t_queued=0, t_submit=0, t_start=0, t_end=0;

    // Queue the kernel for execution
    errNum = clEnqueueNDRangeKernel( m_commandQueue, m_kernel, 1, NULL,
                                    globalWorkSize, localWorkSize, 0, NULL, &kernel_event );
    if( errNum != CL_SUCCESS )
    {
        FrmLogMessage( "Error queueing kernel for execution." );
        return FALSE;
    }

    clWaitForEvents(1 , &kernel_event);

    // Query timestamp for kernel profiling
    //   Queued time is when the command is queued to host.
    //   Submit time is when the command is submitted from host to device.
    //   Start time is when the command starts the execution.
    //   End time is when the command finishes the execution.
    // The delta between start and end, marks the total elapsed time to execute a kernel in device.
    errNum = clGetEventProfilingInfo(kernel_event, CL_PROFILING_COMMAND_QUEUED,
                sizeof(cl_ulong), &t_queued, NULL);
    if( errNum != CL_SUCCESS ) FrmLogMessage( "Error getting queued timestamp." );
    errNum = clGetEventProfilingInfo(kernel_event, CL_PROFILING_COMMAND_SUBMIT,
                sizeof(cl_ulong), &t_submit, NULL);
    if( errNum != CL_SUCCESS ) FrmLogMessage( "Error getting submit timestamp." );
    errNum = clGetEventProfilingInfo(kernel_event, CL_PROFILING_COMMAND_START,
                sizeof(cl_ulong), &t_start,    NULL);
    if( errNum != CL_SUCCESS ) FrmLogMessage( "Error getting start timestamp." );
    errNum = clGetEventProfilingInfo(kernel_event, CL_PROFILING_COMMAND_END,
                sizeof(cl_ulong), &t_end, NULL);
    if( errNum != CL_SUCCESS ) FrmLogMessage( "Error getting end timestamp." );

    FrmLogMessage("Kernel event profiling....(nano sec)\n");
    FrmSprintf(str, sizeof(str), "  -> Queued time: %lu\n", t_queued);
    FrmLogMessage( str );
    FrmSprintf(str, sizeof(str), "  -> Submit time: %lu\n", t_submit);
    FrmLogMessage( str );
    FrmSprintf(str, sizeof(str), "  -> Start time:  %lu\n", t_start);
    FrmLogMessage( str );
    FrmSprintf(str, sizeof(str), "  -> End time:    %lu\n", t_end);
    FrmLogMessage( str );

    clReleaseEvent(kernel_event);

    // Read the result back to host memory
    FRMVECTOR4* pResult;
    pResult = (FRMVECTOR4*) clEnqueueMapBuffer( m_commandQueue, m_result, CL_TRUE, CL_MAP_READ, 0, sizeof(FRMVECTOR4) * m_nNumVectors,
                                                0, NULL, NULL, &errNum );
    if( errNum != CL_SUCCESS )
    {
        FrmLogMessage( "Error enqueuing buffer map." );
        return FALSE;
    }
    m_Timer.Stop();

    FrmSprintf( str, sizeof(str), "Results: '%d' vector additions in '%.6f' seconds.\n", m_nNumVectors, m_Timer.GetTime() );
    FrmLogMessage( str );

    // Test results again CPU reference
    BOOL result = TRUE;
    if ( RunTests() )
    {
        const FLOAT32 epsilon = 0.000001f;

        for( size_t i = 0; i < m_nNumVectors; i++ )
        {
            for ( size_t j = 0; j < 4; j++ )
            {
                FLOAT32 refVal = m_pRefResults[ i ].v[ j ];
                FLOAT32 val = pResult[ i ].v[ j ];

                if( FrmAbs( refVal - val ) > epsilon )
                {
                    FrmSprintf( str, sizeof(str), "Reference test failure, ref = (%f), result = (%f) Diff = (%f).\n", refVal, val, FrmAbs(refVal - val));
                    FrmLogMessage( str );
                    result = FALSE;
                }
            }
        }

    }

    // Unmap buffer
    errNum = clEnqueueUnmapMemObject( m_commandQueue, m_result, pResult, 0, NULL, NULL );
    if( errNum != CL_SUCCESS )
    {
        FrmLogMessage( "ERROR: Unmapping result buffer." );
        return FALSE;
    }
    return result;
}
Example #7
0
int main(int argc, char *argv[])
{
    /* Variaveis obrigatorias do openCL pdccpk*/
    cl_platform_id          platform_ids[2];
    cl_device_id            device_id;
    cl_context              context;
    cl_command_queue        commands;
    cl_program              program;
    cl_kernel               kernel_sobel;

    cl_int                  ret_code;
    cl_uint                 ret_num_devices;
    cl_uint                 ret_num_platforms;
    //
    cl_event                kernel_event;
    cl_ulong                kernel_start_time   = (cl_ulong) 0;
    cl_ulong                kernel_end_time     = (cl_ulong) 0;
    cl_ulong                kernel_run_time     = (cl_ulong) 0;

    cl_event                write_host_dev_event;
    cl_ulong                write_host_dev_start_time   = (cl_ulong) 0;
    cl_ulong                write_host_dev_end_time     = (cl_ulong) 0;
    cl_ulong                write_host_dev_run_time     = (cl_ulong) 0;

    cl_event                read_dev_host_event;
    cl_ulong                read_dev_host_start_time    = (cl_ulong) 0;
    cl_ulong                read_dev_host_end_time      = (cl_ulong) 0;
    cl_ulong                read_dev_host_run_time      = (cl_ulong) 0;

    unsigned __int64        image_tam;
    const unsigned __int64  MEGA_BYTES   =  1048576; // 1024*1024
    double                  image_tam_MB;
    double                  tempo_total;

    /* objetos que serao armazenados na memoria da GPU */
    cl_mem                  image_in_mem, image_out_mem;
    /* objetos que serao armazenados na memoria local (host) */
    unsigned char           *image_in_host, *image_out_host;
    unsigned  int           image_width, image_height;
    size_t                  image_size;
    /*IMPORTANTE: dimensionamento dos compute units para exec do kernel*/
    size_t                  work_global[C_NUM_DIMENSOES];
    size_t                  work_local[C_NUM_DIMENSOES];
    /*Setup dos nomes de arquivos*/
    const char              *kernel_filename = C_NOME_ARQ_KERNEL;
    pgm_t                   ipgm, opgm;
    /* Codigo fonte do kernel dever ser aberto como uma cadeia de caracteres*/
    image_file_t* image_filename;
    char* output_filename;
    FILE                    *fp;
    size_t                  source_size;
    char                    *source_str;

    /* Timer count start */
    timer_reset();
    timer_start();

    if (argc < 2) {
        printf("**Erro: A imagem de entrada Ă© necessaria.\n");
        exit(EXIT_FAILURE);
    }

    //===================================================================================================
    image_filename = (image_file_t *) malloc(sizeof(image_file_t));
    split_image_filename(image_filename, argv[1]);
    output_filename = (char *) malloc(40*sizeof(char));
    sprintf(output_filename, "%d.%d.%s.%s.%s", image_filename->res, image_filename->num, ENV_TYPE, APP_TYPE, EXTENSAO);
    //===================================================================================================

    fp = fopen(kernel_filename, "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }

    source_str  = (char *) malloc(MAX_SOURCE_SIZE);
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);

    //===================================================================================================
    // Abrindo imagem do arquivo para objeto de memoria local
    if( ler_pgm(&ipgm, argv[1]) == -1)
        exit(EXIT_FAILURE);

    image_in_host = ipgm.buf;
    image_width  = ipgm.width;
    image_height = ipgm.height;

    image_size =  (int) (image_width * image_height) * sizeof(unsigned char);

    image_tam = image_size;

    /* Alocando memoria para a imagem de saida apos o processamento*/
    image_out_host = (unsigned char *) malloc(image_size);
    //===================================================================================================

    /* Recebe um vetor de platform_id e retorna sucesso
     * se encontrar plataformas OpenCL no sistema, inseridos
     * essas plataformas no vetor com no maximo MAX_PLATFORM_ID
     * entradas, caso contrario retorna codigo de erro.
     * CL_CHECK Ă© um macro para retornar o titulo do erro
     * a partir de uma funcao que retorne um codigo de erro
     ***************************************************/

    CL_CHECK(clGetPlatformIDs(MAX_PLATFORM_ID, platform_ids, &ret_num_platforms));

    if (ret_num_platforms == 0) {
        fprintf(stderr, "[Erro] NĂ£o existem plataformas OpenCL\n");
        exit(2);
    }
    //===================================================================================================

    /* Recebe uma platform_id e retorna sucesso
     * se obter um device do tipo GPU dessa plataforma OpenCL
     * caso contrario retorna codigo de erro.
     ***************************************************/

    CL_CHECK(clGetDeviceIDs(platform_ids[1], CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices));
    //print_platform_info(&platform_ids[0]);
    //system("pause");
    //exit(0);
    //===================================================================================================

    /* Retorna sucesso se consegui criar um contexto para
     * o device id escolhido, caso contrario retorna codigo de erro.
     ***************************************************/
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret_code);
    //===================================================================================================

    commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret_code);
    //===================================================================================================

    program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, NULL);
    //===================================================================================================

    ret_code = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

    if (ret_code != CL_SUCCESS) {
        char build_str[4096];
        fprintf(stderr, "[ERRO] clBuildProgram '%s' (code: %d)\n",
                error_cl_str(ret_code), ret_code );
        clGetProgramBuildInfo( program, device_id,
                               CL_PROGRAM_BUILD_LOG, sizeof(build_str), build_str, NULL);
        fprintf(stderr, "[ERRO] log: '%s'\n", build_str);
        system("pause");
        exit(4);
    }
    //===================================================================================================

    kernel_sobel = clCreateKernel(program, "sobel_kernel", NULL);
    image_in_mem  = clCreateBuffer(context, CL_MEM_READ_ONLY, image_size, NULL, NULL);
    image_out_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, image_size , NULL, NULL);
    //===================================================================================================

    CL_CHECK(clEnqueueWriteBuffer(commands, image_in_mem, CL_TRUE, 0, image_size, image_in_host, 0, NULL, &write_host_dev_event));
    CL_CHECK(clSetKernelArg(kernel_sobel, 0, sizeof(cl_mem), &image_in_mem));
    CL_CHECK(clSetKernelArg(kernel_sobel, 1, sizeof(cl_mem), &image_out_mem));
    //===================================================================================================

    work_global[0] = image_width;
    work_global[1] = image_height;
    work_local[0] = MAX_WORK_GROUP_ITEM_SIZE_DIM_1;
    work_local[1] = MAX_WORK_GROUP_ITEM_SIZE_DIM_2;
    //===================================================================================================

    CL_CHECK(clEnqueueNDRangeKernel(commands, kernel_sobel, 2, NULL, work_global, work_local, 0, NULL,  &kernel_event) );
    // CL_CHECK(clFinish(commands));
    // CL_CHECK( clWaitForEvents(1 , &kernel_event) );
    //===================================================================================================

    CL_CHECK(clEnqueueReadBuffer(commands, image_out_mem, CL_TRUE, 0, image_size, image_out_host, 0, NULL, &read_dev_host_event));

    //== Total time elapsed =============================================================================
    timer_stop();
    tempo_total = get_elapsed_time();
    //===================================================================================================
    //====== Get time of Profile Info ===================================================================
    // kernel sobel time
    CL_CHECK(clGetEventProfilingInfo(kernel_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL));
    CL_CHECK(clGetEventProfilingInfo(kernel_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL));
    // Write data time
    CL_CHECK(clGetEventProfilingInfo(write_host_dev_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &write_host_dev_start_time, NULL));
    CL_CHECK(clGetEventProfilingInfo(write_host_dev_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &write_host_dev_end_time, NULL));
    // Read data time
    CL_CHECK(clGetEventProfilingInfo(read_dev_host_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &read_dev_host_start_time, NULL));
    CL_CHECK(clGetEventProfilingInfo(read_dev_host_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &read_dev_host_end_time, NULL));
    //===================================================================================================

    write_host_dev_run_time = write_host_dev_end_time - write_host_dev_start_time;
    read_dev_host_run_time =  read_dev_host_end_time -  read_dev_host_start_time;
    kernel_run_time = kernel_end_time - kernel_start_time;

    image_tam_MB = (double) (((double) image_tam)/(double) MEGA_BYTES);

    //===================================================================================================
    save_log_gpu(image_filename, kernel_run_time, (double) (image_tam_MB/( (double) read_dev_host_run_time/(double) NANOSECONDS)),
        (double) (image_tam_MB/ ((double) write_host_dev_run_time/ (double) NANOSECONDS)), tempo_total, LOG_NAME);
    //===================================================================================================

    opgm.width  = image_width;
    opgm.height = image_height;
    opgm.buf    = image_out_host;

    escrever_pgm(&opgm, output_filename);

    //===================================================================================================
    CL_CHECK(clReleaseMemObject(image_in_mem));
	CL_CHECK(clReleaseEvent(kernel_event));
    CL_CHECK(clReleaseEvent(read_dev_host_event));
    CL_CHECK(clReleaseEvent(write_host_dev_event));
    CL_CHECK(clReleaseMemObject(image_out_mem));
    CL_CHECK(clReleaseProgram(program));
    CL_CHECK(clReleaseKernel(kernel_sobel));
    CL_CHECK(clReleaseCommandQueue(commands));
    CL_CHECK(clReleaseContext(context));
    destruir_pgm(&ipgm);
    destruir_pgm(&opgm);
    free(source_str);
    free(image_filename);
    free(output_filename);

    //_CrtDumpMemoryLeaks();

    return 0;
}
int main(int argc, char *argv[]) {

	const char *imageOriginalPath;
	const char *imageCryptedPath;

	if (argc == 2 || argc > 3) {
		printf("Need original image path and crypted image path as program arguments, or nothing to use default values.\n");
		system("pause");
		return EXIT_FAILURE;
	}
	else if (argc == 3) {
		imageOriginalPath = argv[1];
		imageCryptedPath = argv[2];
	}
	else {
		imageOriginalPath = "image/lena.bmp";
		imageCryptedPath = "image/output.bmp";
	}

	int imageWidth, imageHeight;
	float *imageOriginal = NULL;
	imageOriginal = readImage(imageOriginalPath, &imageWidth, &imageHeight);
	float *imageCrypted = NULL;
	imageCrypted = readImage(imageCryptedPath, &imageWidth, &imageHeight);
	int imgLength = imageWidth * imageHeight;

	cl_int status;
	cl_command_queue cmdQueue;
	cl_context context;
	cl_program program;
	cl_device_id *devices;
	cl_uint numDevices;
	cl_event event;
	cl_kernel kernel;

	cl_ulong time_start, time_end;
	double total_time;

	// OpenCL initializations

	devices = getDevices(&numDevices, &status);
	checkError(status, 50);

	context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
	checkError(status, 53);

	cmdQueue = clCreateCommandQueue(context, devices[0], CL_QUEUE_PROFILING_ENABLE, &status);
	checkError(status, 56);
	clFinish(cmdQueue);

	program = createKernelProgramFromFile(&context, clFile_decoding, &status);
	checkError(status, 60);

	status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
	checkError(status, 63);

	kernel = clCreateKernel(program, kernel_decoding, &status);
	checkError(status, 66);

	// Input
	size_t imageSize = sizeof(float) * imgLength;
	cl_mem bufferImageOriginal = clCreateBuffer(context, CL_MEM_READ_ONLY, imageSize, NULL, &status);
	cl_mem bufferImageCrypted = clCreateBuffer(context, CL_MEM_READ_ONLY, imageSize, NULL, &status);
	size_t messageSize = MSG_LENGTH;
	cl_mem bufferOutput = clCreateBuffer(context, CL_MEM_WRITE_ONLY, messageSize, NULL, &status);
	status = clEnqueueWriteBuffer(cmdQueue, bufferImageOriginal, CL_FALSE, 0, imageSize, imageOriginal, 0, NULL, NULL);
	status = clEnqueueWriteBuffer(cmdQueue, bufferImageCrypted, CL_FALSE, 0, imageSize, imageCrypted, 0, NULL, NULL);

	// Mapping
	status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufferImageOriginal);
	status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufferImageCrypted);
	status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufferOutput);
	const unsigned int blockSize = imgLength / messageSize;
	status = clSetKernelArg(kernel, 3, sizeof(cl_uint), &blockSize);

	// Worksize
	size_t workgroup_size;
    status = clGetKernelWorkGroupInfo(kernel, devices[0], CL_KERNEL_WORK_GROUP_SIZE,sizeof(size_t), &workgroup_size, NULL);
	size_t globalWorkSize[1];
	globalWorkSize[0] = imgLength;
	size_t localWorkSize = 1024;
	if(localWorkSize>workgroup_size)
		localWorkSize=workgroup_size;

	// Kernel execution
	status = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, globalWorkSize, &localWorkSize, 0, NULL, &event);
	clWaitForEvents(1, &event);
	clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
	clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
	total_time = (double)(time_end - time_start);
	
	size_t countWG = MSG_LENGTH;
	bool* outputMsg = NULL;
	outputMsg = (bool*)malloc(sizeof(bool) * countWG);
	clEnqueueReadBuffer(cmdQueue, bufferOutput, CL_TRUE, 0, countWG, outputMsg, 0, NULL, NULL); // values

	// Display
	printf("\n");
	printf("-------------------------------------------\n");
	printf("Binary message length: %d bits\n", MSG_LENGTH);
	printf("Binary message content:\n");
	for (int i = 0; i < MSG_LENGTH; i++) {
		printf("%d", outputMsg[i]);
	}
	printf("\n\n");
	printf("Quantity of image pixels: %d \n", (int)imgLength);
	printf("Local worksize: %d \n", (int)localWorkSize);
	printf("Kernel(s) execution time : %0.3f ms \n", (total_time / 1000000.0));
	printf("-------------------------------------------\n");
	printf("\n");

	// Free ressources
	clReleaseKernel(kernel);
	clReleaseProgram(program);
	clReleaseCommandQueue(cmdQueue);
	clReleaseMemObject(bufferImageOriginal);
	clReleaseMemObject(bufferImageCrypted);
	clReleaseMemObject(bufferOutput);
	clReleaseContext(context);
	free(imageOriginal);
	free(imageCrypted);
	free(outputMsg);
	free(devices);

	//system("pause");
	return EXIT_SUCCESS;
}
int main() {

   /* OpenCL data structures */
   cl_device_id device;
   cl_context context;
   cl_command_queue queue;
   cl_program program;
   cl_kernel kernel;
   cl_int i, err, num_vectors;

   /* Data and events */
   char data[NUM_BYTES];
   cl_mem data_buffer;
   cl_event prof_event;
   cl_ulong time_start, time_end, total_time;
   void* mapped_memory;

   /* Create a device and context */
   device = create_device();
   context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
   if(err < 0) {
      perror("Couldn't create a context");
      exit(1);   
   }     

   /* Build the program and create a kernel */
   program = build_program(context, device, PROGRAM_FILE);
   kernel = clCreateKernel(program, KERNEL_FUNC, &err);
   if(err < 0) {
      perror("Couldn't create a kernel");
      exit(1);   
   };

   /* Create a buffer to hold data */
   data_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
         sizeof(data), NULL, &err);
   if(err < 0) {
      perror("Couldn't create a buffer");
      exit(1);   
   };         

   /* Create kernel argument */
   err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_buffer);
   if(err < 0) {
      perror("Couldn't set a kernel argument");
      exit(1);   
   };

   /* Tell kernel number of char16 vectors */
   num_vectors = NUM_BYTES/16;
   clSetKernelArg(kernel, 1, sizeof(num_vectors), &num_vectors);

   /* Create a command queue */
   queue = clCreateCommandQueue(context, device, 
         CL_QUEUE_PROFILING_ENABLE, &err);
   if(err < 0) {
      perror("Couldn't create a command queue");
      exit(1);   
   };

   total_time = 0.0f;
   for(i=0; i<NUM_ITERATIONS; i++) {

      /* Enqueue kernel */
      err = clEnqueueTask(queue, kernel, 0, NULL, NULL);
      if(err < 0) {
         perror("Couldn't enqueue the kernel");
         exit(1);   
      }

#ifdef PROFILE_READ

      /* Read the buffer */
      err = clEnqueueReadBuffer(queue, data_buffer, CL_TRUE, 0, 
            sizeof(data), data, 0, NULL, &prof_event);
      if(err < 0) {
         perror("Couldn't read the buffer");
         exit(1);
      }

#else

      /* Create memory map */
      mapped_memory = clEnqueueMapBuffer(queue, data_buffer, CL_TRUE,
            CL_MAP_READ, 0, sizeof(data), 0, NULL, &prof_event, &err);
      if(err < 0) {
         perror("Couldn't map the buffer to host memory");
         exit(1);   
      }

#endif

      /* Get profiling information */
      clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_START,
            sizeof(time_start), &time_start, NULL);
      clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_END,
            sizeof(time_end), &time_end, NULL);
      total_time += time_end - time_start;

#ifndef PROFILE_READ

      /* Unmap the buffer */
      err = clEnqueueUnmapMemObject(queue, data_buffer, mapped_memory,
            0, NULL, NULL);
      if(err < 0) {
         perror("Couldn't unmap the buffer");
         exit(1);   
      }

#endif
   }

#ifdef PROFILE_READ
   printf("Average read time: %lu\n", total_time/NUM_ITERATIONS);
#else
   printf("Average map time: %lu\n", total_time/NUM_ITERATIONS);
#endif

   /* Deallocate resources */
   clReleaseEvent(prof_event);
   clReleaseMemObject(data_buffer);
   clReleaseKernel(kernel);
   clReleaseCommandQueue(queue);
   clReleaseProgram(program);
   clReleaseContext(context);
   return 0;
}
Example #10
0
cl_int
run (
        const char *ker,
        cl_uint M,
        cl_uint N,
        cl_uint K,
        FType alpha,
        BlasGenSettings *gset,
        TileMulFlags flags,
        cl_device_type deviceType,
        bool verbose,
        unsigned int iterNum)
{
    cl_int err;
    cl_platform_id platform;
    cl_context ctx;
    cl_device_id device;
    cl_command_queue queue;
    cl_event evt;
    DataType dtype = gset->kextra->dtype;

    cl_mem bufA, bufB, bufC;
    FPtr A, B, C, C_naive;
    bool isComplex = isComplexType(dtype);
    bool isDouble = isDoubleBasedType(dtype);
    cl_uint nwords = (isComplex) ? 2 : 1;
    unsigned int tsize = dtypeSize(dtype);
    cl_kernel kernel;
    size_t i, j, k;
    size_t globalWorkSize[2] = {ITEM_WORK_M, ITEM_WORK_N};
    size_t localWorkSize[2] = {ITEM_WORK_M, ITEM_WORK_N};
    char log[100000];
    size_t logSize;
    cl_long sTime, fTime;
    cl_program program = NULL;

    clGetPlatformIDs(1, &platform, NULL);

    clGetDeviceIDs(platform, deviceType, 1, &device, NULL);

    ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        return err;
    }

    queue = clCreateCommandQueue(ctx, device, CL_QUEUE_PROFILING_ENABLE, &err);
    if (err != CL_SUCCESS) {
        return err;
    }

    /* Prepare OpenCL kernel and its arguments */

    program = clCreateProgramWithSource(ctx, 1, &ker, NULL, NULL);

    err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    clGetProgramBuildInfo (program,
            device,
            CL_PROGRAM_BUILD_LOG,
            sizeof(log),
            log,
            &logSize);
    printf("%s", log);
    if (err != CL_SUCCESS){
        clReleaseProgram(program);
        return err;
    }

    kernel = clCreateKernel(program, kernelName, &err);
    if (err != CL_SUCCESS){
        clReleaseProgram(program);
        return err;
    }
    /* Memory allocation */

    A.v = malloc(M * K * tsize);
    B.v = malloc(K * N * tsize);
    C.v = malloc(M * N * tsize);
    C_naive.v = malloc(M * N * tsize);

#if JUST_MULTIPLICATION
    srand(0);
    if (isDouble) {
        for(i = 0; i < M * K * nwords; i++){
            A.d[i] = i;
        }
        for(i = 0; i < N * K * nwords; i++){
            B.d[i] = i + 7;
        }
        for(i = 0; i < M * N * nwords; i++){
            C.d[i] = 0.0;
            C_naive.d[i] = 0.0;
        }
    }
    else {
        for(i = 0; i < M * K * nwords; i++){
            A.f[i] = i;
        }
        for(i = 0; i < N * K * nwords; i++){
            B.f[i] = i + 7;
        }
        for(i = 0; i < M * N * nwords; i++){
            C.f[i] = 0.0;
            C_naive.f[i] = 0.0;
        }
    }

#else
    srand(0);
    if (isDouble) {
        for(i = 0; i < M * K * nwords; i++){
            A.d[i] = (double)(rand() % RAND_BOUND);
        }
        for(i = 0; i < N * K * nwords; i++){
            B.d[i] = (double)(rand() % RAND_BOUND);
        }
        for(i = 0; i < M * N * nwords; i++){
            C.d[i] = 0.0;
            C_naive.d[i] = 0.0;
        }
    }
    else {
        for(i = 0; i < M * K * nwords; i++){
            A.f[i] = (float)(rand() % RAND_BOUND);
        }
        for(i = 0; i < N * K * nwords; i++){
            B.f[i] = (float)(rand() % RAND_BOUND);
        }
        for(i = 0; i < M * N * nwords; i++){
            C.f[i] = 0.0;
            C_naive.f[i] = 0.0;
        }
    }
#endif

    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
            K * M * tsize, A.v, &err);
    if (err != CL_SUCCESS) {
        clReleaseKernel(kernel);
        return err;
    }

    bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
            K * N * tsize, B.v, &err);

    if (err != CL_SUCCESS) {
        clReleaseMemObject(bufA);
        clReleaseKernel(kernel);
        return err;
    }

    bufC = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
        M * N * tsize, C.v, &err);

    if (err != CL_SUCCESS) {
        clReleaseMemObject(bufB);
        clReleaseMemObject(bufA);
        clReleaseKernel(kernel);
        return err;
    }

    /* Argument setting and kernel execution */
    err = clSetKernelArg(kernel, 0, tsize, alpha.u);
    err |= clSetKernelArg(kernel, 1, sizeof(bufA), &bufA);
    err |= clSetKernelArg(kernel, 2, sizeof(bufB), &bufB);
    err |= clSetKernelArg(kernel, 3, sizeof(M), &M);
    err |= clSetKernelArg(kernel, 4, sizeof(N), &N);
    err |= clSetKernelArg(kernel, 5, sizeof(K), &K);
    err |= clSetKernelArg(kernel, 6, sizeof(bufC), &bufC);
    err |= clSetKernelArg(kernel, 7, sizeof(iterNum), &iterNum);

    if (err != CL_SUCCESS) {
        clReleaseMemObject(bufC);
        clReleaseMemObject(bufB);
        clReleaseMemObject(bufA);
        clReleaseKernel(kernel);
        return err;
    }

    err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL,
        globalWorkSize, localWorkSize, 0,
        NULL, &evt);

    if (err != CL_SUCCESS) {
        clReleaseMemObject(bufC);
        clReleaseMemObject(bufB);
        clReleaseMemObject(bufA);
        clReleaseKernel(kernel);
        return err;
    }

    err = clFinish(queue);
    err = clEnqueueReadBuffer (queue,
        bufC,
        CL_TRUE,
        0,
        M * N * tsize,
        C.v,
        0,
        NULL,
        NULL);

    /* Naive CPU multiplication */
    if (isDouble) {
        for (i = 0; i < M; i++) {
            for (j = 0; j < N; j++) {
                if (isComplex) {
                    cl_double2 val;
                    for (k = 0; k < K; k++) {
                        cl_double2 bkj = flags & TILEMUL_TRB ?
                                B.d2[j * K + k] : B.d2[k * N + j];
                        cl_double2 aik = flags & TILEMUL_TRA ?
                                A.d2[k * M + i] : A.d2[i * K + k];
                        val.s[0] = aik.s[0] * bkj.s[0] - aik.s[1] * bkj.s[1];
                        val.s[1] = aik.s[0] * bkj.s[1] + aik.s[1] * bkj.s[0];
                        C_naive.d2[i * N + j].s[0] += val.s[0];
                        C_naive.d2[i * N + j].s[1] += val.s[1];
                    }
                    val.s[0] = C_naive.d2[i * N + j].s[0] * alpha.d2.s[0] -
                            C_naive.d2[i * N + j].s[1] * alpha.d2.s[1];
                    val.s[1] = C_naive.d2[i * N + j].s[0] * alpha.d2.s[1] +
                            C_naive.d2[i * N + j].s[1] * alpha.d2.s[0];
                    C_naive.d2[i * N + j] = val;
                }
                else {
                    for (k = 0; k < K; k++) {
                        double bkj = flags & TILEMUL_TRB ?
                                B.d[j * K + k] : B.d[k * N + j];
                        double aik = flags & TILEMUL_TRA ?
                                A.d[k * M + i] : A.d[i * K + k];
                        C_naive.d[i * N + j] += aik * bkj;
                    }
                    C_naive.d[i * N + j] *= alpha.d;
                }
            }
        }

        for (i = 0; i < M * N; i++) {
            if (C.d[i] != C_naive.d[i]) {
                printf("Differ at (%lu, %lu): %lf != %lf\n", i / N, i % N,
                        C.d[i], C_naive.d[i]);
                break;
            }
        }
        if (i == M * N) {
            printf("Match\n");
        }
    }
    else {
        for (i = 0; i < M; i++) {
            for (j = 0; j < N; j++) {
                if (isComplex) {
                    cl_float2 val;
                    for (k = 0; k < K; k++) {
                        cl_float2 bkj = flags & TILEMUL_TRB ?
                                B.f2[j * K + k] : B.f2[k * N + j];
                        cl_float2 aik = flags & TILEMUL_TRA ?
                                A.f2[k * M + i] : A.f2[i * K + k];
                        val.s[0] = aik.s[0] * bkj.s[0] - aik.s[1] * bkj.s[1];
                        val.s[1] = aik.s[0] * bkj.s[1] + aik.s[1] * bkj.s[0];
                        C_naive.f2[i * N + j].s[0] += val.s[0];
                        C_naive.f2[i * N + j].s[1] += val.s[1];
                    }
                    val.s[0] = C_naive.f2[i * N + j].s[0] * alpha.f2.s[0] -
                            C_naive.f2[i * N + j].s[1] * alpha.f2.s[1];
                    val.s[1] = C_naive.f2[i * N + j].s[0] * alpha.f2.s[1] +
                            C_naive.f2[i * N + j].s[1] * alpha.f2.s[0];
                    C_naive.f2[i * N + j] = val;
                }
                else {
                    for (k = 0; k < K; k++) {
                        float bkj = flags & TILEMUL_TRB ?
                                B.f[j * K + k] : B.f[k * N + j];
                        float aik = flags & TILEMUL_TRA ?
                                A.f[k * M + i] : A.f[i * K + k];
                        C_naive.f[i * N + j] += aik * bkj;
                    }
                    C_naive.f[i * N + j] *= alpha.f;
                }
            }
        }

        for (i = 0; i < M * N; i++) {
            if (C.f[i] != C_naive.f[i]) {
                printf("Differ at (%lu, %lu): %lf != %lf\n",
                        i / N, i % N, C.f[i], C_naive.f[i]);
                break;
            }
        }
        if (i == M * N) {
            printf("Match\n");
        }
    }

    /* End of naive CPU multiplication */
    if (verbose) {
        if (!isDouble) {
            printf("Matrix A:\n");
            for (i = 0; i < M; i++) {
                for (k = 0; k < K; k++) {
                    if (isComplex) {
                        cl_float2 aik = flags & TILEMUL_TRA ?
                                A.f2[k * M + i] : A.f2[i * K + k];
                        printf("(%4.1f, %4.1f) ", aik.s[0], aik.s[1]);
                    }
                    else {
                        float aik = flags & TILEMUL_TRA ?
                                A.f[k * M + i] : A.f[i * K + k];
                        printf("%4.1f ", aik);
                    }
                }
                printf("\n");
            }

            printf("Matrix B:\n");
            for (k = 0; k < K; k++) {
                for (j = 0; j < N; j++) {
                    if (isComplex) {
                        cl_float2 bkj = flags & TILEMUL_TRB ?
                                B.f2[j * K + k] : B.f2[k * N + j];
                        printf("(%4.1f, %4.1f) ", bkj.s[0], bkj.s[1]);
                    }
                    else {
                        float bkj = flags & TILEMUL_TRB ?
                                B.f[j * K + k] : B.f[k * N + j];
                        printf("%4.1f ", bkj);
                    }
                }
                printf("\n");
            }

            printf("CPU calculated matrix:\n");
            for (i = 0; i < M; i++) {
                for (j = 0; j < N; j++) {
                    if (isComplex) {
                        printf("(%4.1f, %4.1f) ",
                                C_naive.f2[i * N + j].s[0],
                                C_naive.f2[i * N + j].s[1]);
                    }
                    else {
                        printf("%4.1f ", C_naive.f[i * N + j]);
                    }
                }
                printf("\n");
            }

            printf("GPU calculated matrix:\n");
            for (i = 0; i < M; i++) {
                for (j = 0; j < N; j++) {
                    if (isComplex) {
                        printf("(%4.1f, %4.1f) ",
                                C.f2[i * N + j].s[0], C.f2[i * N + j].s[1]);
                    }
                    else {
                        printf("%4.1f ", C.f[i * N + j]);
                    }
                }
                printf("\n");
            }
        }
    }

    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong),
            &sTime, NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong),
            &fTime, NULL);

    printf("Total multiplication time: %d ms\nTime per iteration: %d ns\n",
            (int)((fTime-sTime)/1000000), (int)((fTime-sTime)/iterNum));

    clReleaseMemObject(bufC);
    clReleaseMemObject(bufB);
    clReleaseMemObject(bufA);
    clReleaseKernel(kernel);
    return CL_SUCCESS;
}
Example #11
0
static cl_ulong gws_test(int gws, int do_benchmark, struct fmt_main *self)
{
    cl_ulong startTime, endTime;
    cl_command_queue queue_prof;
    cl_event Event[6];
    cl_int ret_code;
    int i;
    size_t scalar_gws = VF * gws;

    create_clobj(gws, self);
    queue_prof = clCreateCommandQueue(context[ocl_gpu_id], devices[ocl_gpu_id], CL_QUEUE_PROFILING_ENABLE, &ret_code);
    for (i = 0; i < scalar_gws; i++)
        set_key(tests[0].plaintext, i);
    set_salt(get_salt(tests[0].ciphertext));

    HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, cl_saved_key, CL_TRUE, 0, UNICODE_LENGTH * scalar_gws, saved_key, 0, NULL, &Event[0]), "Failed transferring keys");
    HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, cl_saved_len, CL_TRUE, 0, sizeof(int) * scalar_gws, saved_len, 0, NULL, &Event[1]), "Failed transferring lengths");

    HANDLE_CLERROR(clEnqueueNDRangeKernel(queue_prof, GenerateSHA1pwhash, 1, NULL, &scalar_gws, &local_work_size, 0, NULL, &Event[2]), "running kernel");

    for (i = 0; i < 50000 / HASH_LOOPS - 1; i++)
        HANDLE_CLERROR(clEnqueueNDRangeKernel(queue_prof, crypt_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL), "running kernel");

    HANDLE_CLERROR(clEnqueueNDRangeKernel(queue_prof, crypt_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &Event[3]), "running kernel");

    HANDLE_CLERROR(clEnqueueNDRangeKernel(queue_prof, Generate2007key, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &Event[4]), "running kernel");

    HANDLE_CLERROR(clEnqueueReadBuffer(queue_prof, cl_key, CL_TRUE, 0, 16 * scalar_gws, key, 0, NULL, &Event[5]), "failed in reading key back");

#if 0
    HANDLE_CLERROR(clGetEventProfilingInfo(Event[2],
                                           CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime,
                                           NULL), "Failed to get profiling info");
    HANDLE_CLERROR(clGetEventProfilingInfo(Event[2],
                                           CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime,
                                           NULL), "Failed to get profiling info");
    fprintf(stderr, "GenerateSHA1pwhash kernel duration: %llu us, ", (endTime-startTime)/1000ULL);
#endif

    HANDLE_CLERROR(clGetEventProfilingInfo(Event[3],
                                           CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime,
                                           NULL), "Failed to get profiling info");
    HANDLE_CLERROR(clGetEventProfilingInfo(Event[3],
                                           CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime,
                                           NULL), "Failed to get profiling info");
    if (do_benchmark)
        fprintf(stderr, "%.2f ms x %u = %.2f s\t", (float)((endTime - startTime)/1000000.), 50000/HASH_LOOPS, (float)(50000/HASH_LOOPS) * (endTime - startTime) / 1000000000.);

    /* 200 ms duration limit for GCN to avoid ASIC hangs */
    if (amd_gcn(device_info[ocl_gpu_id]) && endTime - startTime > 200000000) {
        if (do_benchmark)
            fprintf(stderr, "- exceeds 200 ms\n");
        clReleaseCommandQueue(queue_prof);
        release_clobj();
        return 0;
    }

#if 0
    HANDLE_CLERROR(clGetEventProfilingInfo(Event[4],
                                           CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime,
                                           NULL), "Failed to get profiling info");
    HANDLE_CLERROR(clGetEventProfilingInfo(Event[4],
                                           CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime,
                                           NULL), "Failed to get profiling info");
    fprintf(stderr, "Generate2007key kernel duration: %llu us\n", (endTime-startTime)/1000ULL);
#endif

    HANDLE_CLERROR(clGetEventProfilingInfo(Event[0],
                                           CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime,
                                           NULL), "Failed to get profiling info");
    HANDLE_CLERROR(clGetEventProfilingInfo(Event[5],
                                           CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime,
                                           NULL), "Failed to get profiling info");
    clReleaseCommandQueue(queue_prof);
    release_clobj();

    return (endTime - startTime);
}
Example #12
0
struct tableNode * groupBy(struct groupByNode * gb, struct clContext * context, struct statistic * pp){

    struct timespec start,end;
    clock_gettime(CLOCK_REALTIME,&start);

    cl_event ndrEvt;
    cl_ulong startTime,endTime;

    struct tableNode * res = NULL;
    long gpuTupleNum;
    int gpuGbColNum;
    cl_mem gpuGbIndex;
    cl_mem gpuGbType, gpuGbSize;

    cl_mem gpuGbKey;
    cl_mem gpuContent;

    int gbCount;                // the number of groups
    int gbConstant = 0;         // whether group by constant

    cl_int error = 0;

    res = (struct tableNode *) malloc(sizeof(struct tableNode));
    CHECK_POINTER(res);
    res->tupleSize = gb->tupleSize;
    res->totalAttr = gb->outputAttrNum;
    res->attrType = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(res->attrType);
    res->attrSize = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(res->attrSize);
    res->attrTotalSize = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(res->attrTotalSize);
    res->dataPos = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(res->dataPos);
    res->dataFormat = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(res->dataFormat);
    res->content = (char **) malloc(sizeof(char **) * res->totalAttr);
    CHECK_POINTER(res->content);

    for(int i=0;i<res->totalAttr;i++){
        res->attrType[i] = gb->attrType[i];
        res->attrSize[i] = gb->attrSize[i];
        res->dataFormat[i] = UNCOMPRESSED;
    }
    
    gpuTupleNum = gb->table->tupleNum;
    gpuGbColNum = gb->groupByColNum;

    if(gpuGbColNum == 1 && gb->groupByIndex[0] == -1){

        gbConstant = 1;
    }

    size_t localSize = 128;
    size_t globalSize = 1024*128;

    int blockNum = gb->table->tupleNum / localSize + 1; 

    if(blockNum < 1024)
        globalSize = blockNum * 128;


    cl_mem gpu_hashNum;
    cl_mem gpu_groupNum;
    cl_mem gpu_psum;
    cl_mem gpuGbCount;

    long * cpuOffset = (long *)malloc(sizeof(long) * gb->table->totalAttr);
    CHECK_POINTER(cpuOffset);
    long offset = 0;
    long totalSize = 0;

    for(int i=0;i<gb->table->totalAttr;i++){

        int attrSize = gb->table->attrSize[i];
        int size = attrSize * gb->table->tupleNum;

        cpuOffset[i] = offset;

        /*align each column*/

        if(size % 4 !=0){
            size += 4 - (size%4);
        }

        offset += size;
        totalSize += size;
    }

    gpuContent = clCreateBuffer(context->context,CL_MEM_READ_ONLY, totalSize,NULL,&error);

    for(int i=0;i<gb->table->totalAttr;i++){

        int attrSize = gb->table->attrSize[i];
        int size = attrSize * gb->table->tupleNum;

        if(gb->table->dataPos[i]==MEM){
            error = clEnqueueWriteBuffer(context->queue, gpuContent, CL_TRUE, cpuOffset[i], size, gb->table->content[i],0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
            clWaitForEvents(1, &ndrEvt);
            clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
            clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
            pp->pcie += 1e-6 * (endTime - startTime);
#endif
        }else
            error = clEnqueueCopyBuffer(context->queue,(cl_mem)gb->table->content[i],gpuContent,0, cpuOffset[i],size,0,0,0);

    }

    cl_mem gpuOffset = clCreateBuffer(context->context,CL_MEM_READ_ONLY, sizeof(long)*gb->table->totalAttr,NULL,&error);
    clEnqueueWriteBuffer(context->queue,gpuOffset,CL_TRUE,0,sizeof(long)*gb->table->totalAttr,cpuOffset,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
    clWaitForEvents(1, &ndrEvt);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
    pp->pcie += 1e-6 * (endTime - startTime);
#endif

    if(gbConstant != 1){

        gpuGbType = clCreateBuffer(context->context,CL_MEM_READ_ONLY,sizeof(int)*gb->groupByColNum,NULL,&error);
        clEnqueueWriteBuffer(context->queue,gpuGbType,CL_TRUE,0,sizeof(int)*gb->groupByColNum,gb->groupByType,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        gpuGbSize = clCreateBuffer(context->context,CL_MEM_READ_ONLY,sizeof(int)*gb->groupByColNum,NULL,&error);
        clEnqueueWriteBuffer(context->queue,gpuGbSize,CL_TRUE,0,sizeof(int)*gb->groupByColNum,gb->groupBySize,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        gpuGbKey = clCreateBuffer(context->context,CL_MEM_READ_WRITE,sizeof(int)*gb->table->tupleNum,NULL,&error);

        gpuGbIndex = clCreateBuffer(context->context,CL_MEM_READ_ONLY, sizeof(int)*gb->groupByColNum,NULL,&error);
        clEnqueueWriteBuffer(context->queue,gpuGbIndex,CL_TRUE,0,sizeof(int)*gb->groupByColNum,gb->groupByIndex,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        gpu_hashNum = clCreateBuffer(context->context,CL_MEM_READ_WRITE, sizeof(int)*HSIZE,NULL,&error);

        context->kernel = clCreateKernel(context->program,"cl_memset_int",0);

        int tmp = HSIZE;
        clSetKernelArg(context->kernel,0,sizeof(cl_mem), (void*)&gpu_hashNum);
        clSetKernelArg(context->kernel,1,sizeof(int), (void*)&tmp);

        error = clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize,&localSize,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->kernel += 1e-6 * (endTime - startTime);
#endif


        gpu_groupNum = clCreateBuffer(context->context,CL_MEM_READ_WRITE, sizeof(int)*HSIZE,NULL,&error);

        context->kernel = clCreateKernel(context->program,"cl_memset_int",0);

        int tmp = HSIZE;
        clSetKernelArg(context->kernel,0,sizeof(cl_mem), (void*)&gpu_groupNum);
        clSetKernelArg(context->kernel,1,sizeof(int), (void*)&tmp);

        error = clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize,&localSize,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->kernel += 1e-6 * (endTime - startTime);
#endif

        context->kernel = clCreateKernel(context->program, "build_groupby_key",0);
        clSetKernelArg(context->kernel,0,sizeof(cl_mem),(void *)&gpuContent);
        clSetKernelArg(context->kernel,1,sizeof(cl_mem),(void *)&gpuOffset);
        clSetKernelArg(context->kernel,2,sizeof(int),(void *)&gpuGbColNum);
        clSetKernelArg(context->kernel,3,sizeof(cl_mem),(void *)&gpuGbIndex);
        clSetKernelArg(context->kernel,4,sizeof(cl_mem),(void *)&gpuGbType);
        clSetKernelArg(context->kernel,5,sizeof(cl_mem),(void *)&gpuGbSize);
        clSetKernelArg(context->kernel,6,sizeof(long),(void *)&gpuTupleNum);
        clSetKernelArg(context->kernel,7,sizeof(cl_mem),(void *)&gpuGbKey);
        clSetKernelArg(context->kernel,8,sizeof(cl_mem),(void *)&gpu_hashNum);
        clSetKernelArg(context->kernel,9,sizeof(cl_mem),(void *)&gpu_groupNum);

        error = clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize,&localSize,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->kernel += 1e-6 * (endTime - startTime);
#endif

        clReleaseMemObject(gpuGbType);
        clReleaseMemObject(gpuGbSize);
        clReleaseMemObject(gpuGbIndex);

        gbCount = 1;

        tmp = 0;
        gpuGbCount = clCreateBuffer(context->context,CL_MEM_READ_WRITE, sizeof(int),NULL,&error);
        clEnqueueWriteBuffer(context->queue,gpuGbCount,CL_TRUE,0,sizeof(int),&tmp,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        int hsize = HSIZE;
        context->kernel = clCreateKernel(context->program, "count_group_num",0);
        clSetKernelArg(context->kernel,0,sizeof(cl_mem),(void *)&gpu_hashNum);
        clSetKernelArg(context->kernel,1,sizeof(int),(void *)&hsize);
        clSetKernelArg(context->kernel,2,sizeof(cl_mem),(void *)&gpuGbCount);
        error = clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize,&localSize,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->kernel += 1e-6 * (endTime - startTime);
#endif

        clEnqueueReadBuffer(context->queue, gpuGbCount, CL_TRUE, 0, sizeof(int), &gbCount,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        gpu_psum = clCreateBuffer(context->context,CL_MEM_READ_WRITE, sizeof(int)*HSIZE,NULL,&error);

        scanImpl(gpu_hashNum,HSIZE,gpu_psum,context,pp);

        clReleaseMemObject(gpuGbCount);
        clReleaseMemObject(gpu_hashNum);
    }

    if(gbConstant == 1)
        res->tupleNum = 1;
    else
        res->tupleNum = gbCount;

    printf("groupBy num %ld\n",res->tupleNum);

    gpuGbType = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(int)*res->totalAttr, NULL, &error);
    clEnqueueWriteBuffer(context->queue,gpuGbType,CL_TRUE,0,sizeof(int)*res->totalAttr,res->attrType,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
    clWaitForEvents(1, &ndrEvt);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
    pp->pcie += 1e-6 * (endTime - startTime);
#endif

    gpuGbSize = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(int)*res->totalAttr, NULL, &error);
    clEnqueueWriteBuffer(context->queue,gpuGbSize,CL_TRUE,0,sizeof(int)*res->totalAttr,res->attrSize,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
    clWaitForEvents(1, &ndrEvt);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
    pp->pcie += 1e-6 * (endTime - startTime);
#endif

    /*
     * @gpuGbExp is the mathExp in each groupBy expression
     * @mathexp stores the math exp for for the group expression that has two operands
     * The reason that we need two variables instead of one is that OpenCL doesn't support pointer to pointer
     * */

    cl_mem gpuGbExp = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(struct mathExp)*res->totalAttr, NULL, &error);
    cl_mem mathexp = clCreateBuffer(context->context, CL_MEM_READ_ONLY, 2*sizeof(struct mathExp)*res->totalAttr, NULL, &error);

    struct mathExp tmpExp[2];
    int * cpuFunc = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(cpuFunc);

    offset = 0;
    for(int i=0;i<res->totalAttr;i++){

        error = clEnqueueWriteBuffer(context->queue, gpuGbExp, CL_TRUE, offset, sizeof(struct mathExp), &(gb->gbExp[i].exp),0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        offset += sizeof(struct mathExp);

        cpuFunc[i] = gb->gbExp[i].func;

        if(gb->gbExp[i].exp.opNum == 2){
            struct mathExp * tmpMath = (struct mathExp *) (gb->gbExp[i].exp.exp);
            tmpExp[0].op = tmpMath[0].op;
            tmpExp[0].opNum = tmpMath[0].opNum;
            tmpExp[0].opType = tmpMath[0].opType;
            tmpExp[0].opValue = tmpMath[0].opValue;

            tmpExp[1].op = tmpMath[1].op;
            tmpExp[1].opNum = tmpMath[1].opNum;
            tmpExp[1].opType = tmpMath[1].opType;
            tmpExp[1].opValue = tmpMath[1].opValue;
            clEnqueueWriteBuffer(context->queue, mathexp, CL_TRUE, 2*i*sizeof(struct mathExp),2*sizeof(struct mathExp),tmpExp,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
            clWaitForEvents(1, &ndrEvt);
            clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
            clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
            pp->pcie += 1e-6 * (endTime - startTime);
#endif
        }
    }

    cl_mem gpuFunc = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(int)*res->totalAttr, NULL, &error);
    clEnqueueWriteBuffer(context->queue,gpuFunc,CL_TRUE,0,sizeof(int)*res->totalAttr,cpuFunc,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
    clWaitForEvents(1, &ndrEvt);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
    pp->pcie += 1e-6 * (endTime - startTime);
#endif

    long *resOffset = (long *)malloc(sizeof(long)*res->totalAttr);
    CHECK_POINTER(resOffset);
    
    offset = 0;
    totalSize = 0;
    for(int i=0;i<res->totalAttr;i++){
        
        /*
         * align the output of each column on the boundary of 4
         */

        int size = res->attrSize[i] * res->tupleNum;
        if(size %4 != 0){
            size += 4- (size %4);
        }

        resOffset[i] = offset;
        offset += size; 
        totalSize += size;
    }

    cl_mem gpuResult = clCreateBuffer(context->context,CL_MEM_READ_WRITE, totalSize, NULL, &error);
    cl_mem gpuResOffset = clCreateBuffer(context->context, CL_MEM_READ_ONLY,sizeof(long)*res->totalAttr, NULL,&error);
    clEnqueueWriteBuffer(context->queue,gpuResOffset,CL_TRUE,0,sizeof(long)*res->totalAttr,resOffset,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
    clWaitForEvents(1, &ndrEvt);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
    pp->pcie += 1e-6 * (endTime - startTime);
#endif

    gpuGbColNum = res->totalAttr;

    if(gbConstant !=1){
        context->kernel = clCreateKernel(context->program,"agg_cal",0);
        clSetKernelArg(context->kernel,0,sizeof(cl_mem), (void*)&gpuContent);
        clSetKernelArg(context->kernel,1,sizeof(cl_mem), (void*)&gpuOffset);
        clSetKernelArg(context->kernel,2,sizeof(int), (void*)&gpuGbColNum);
        clSetKernelArg(context->kernel,3,sizeof(cl_mem), (void*)&gpuGbExp);
        clSetKernelArg(context->kernel,4,sizeof(cl_mem), (void*)&mathexp);
        clSetKernelArg(context->kernel,5,sizeof(cl_mem), (void*)&gpuGbType);
        clSetKernelArg(context->kernel,6,sizeof(cl_mem), (void*)&gpuGbSize);
        clSetKernelArg(context->kernel,7,sizeof(long), (void*)&gpuTupleNum);
        clSetKernelArg(context->kernel,8,sizeof(cl_mem), (void*)&gpuGbKey);
        clSetKernelArg(context->kernel,9,sizeof(cl_mem), (void*)&gpu_psum);
        clSetKernelArg(context->kernel,10,sizeof(cl_mem), (void*)&gpuResult);
        clSetKernelArg(context->kernel,11,sizeof(cl_mem), (void*)&gpuResOffset);
        clSetKernelArg(context->kernel,12,sizeof(cl_mem), (void*)&gpuFunc);

        error = clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize,&localSize,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->kernel += 1e-6 * (endTime - startTime);
#endif
        
        clReleaseMemObject(gpuGbKey);
        clReleaseMemObject(gpu_psum);
    }else{
        context->kernel = clCreateKernel(context->program,"agg_cal_cons",0);
        clSetKernelArg(context->kernel,0,sizeof(cl_mem), (void*)&gpuContent);
        clSetKernelArg(context->kernel,1,sizeof(cl_mem), (void*)&gpuOffset);
        clSetKernelArg(context->kernel,2,sizeof(int), (void*)&gpuGbColNum);
        clSetKernelArg(context->kernel,3,sizeof(cl_mem), (void*)&gpuGbExp);
        clSetKernelArg(context->kernel,4,sizeof(cl_mem), (void*)&mathexp);
        clSetKernelArg(context->kernel,5,sizeof(cl_mem), (void*)&gpuGbType);
        clSetKernelArg(context->kernel,6,sizeof(cl_mem), (void*)&gpuGbSize);
        clSetKernelArg(context->kernel,7,sizeof(long), (void*)&gpuTupleNum);
        clSetKernelArg(context->kernel,8,sizeof(cl_mem), (void*)&gpuResult);
        clSetKernelArg(context->kernel,9,sizeof(cl_mem), (void*)&gpuResOffset);
        clSetKernelArg(context->kernel,10,sizeof(cl_mem), (void*)&gpuFunc);

        globalSize = localSize * 4;
        error = clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize,&localSize,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->kernel += 1e-6 * (endTime - startTime);
#endif
    }

    for(int i=0; i<res->totalAttr;i++){
        res->content[i] = (char *)clCreateBuffer(context->context,CL_MEM_READ_WRITE, res->attrSize[i]*res->tupleNum, NULL, &error); 
        res->dataPos[i] = GPU;
        res->attrTotalSize[i] = res->tupleNum * res->attrSize[i];
        clEnqueueCopyBuffer(context->queue, gpuResult, (cl_mem)res->content[i], resOffset[i],0, res->attrSize[i] * res->tupleNum, 0,0,0);
    }

    free(resOffset);
    free(cpuOffset);

    clFinish(context->queue);
    clReleaseMemObject(gpuContent);
    clReleaseMemObject(gpuGbType);
    clReleaseMemObject(gpuGbSize);
    clReleaseMemObject(gpuResult);
    clReleaseMemObject(gpuOffset);
    clReleaseMemObject(gpuResOffset);
    clReleaseMemObject(gpuGbExp);
    clReleaseMemObject(gpuFunc);

    clock_gettime(CLOCK_REALTIME,&end);
    double timeE = (end.tv_sec -  start.tv_sec)* BILLION + end.tv_nsec - start.tv_nsec;
    printf("GroupBy Time: %lf\n", timeE/(1000*1000));

    return res;
}
//#define AUTO_BLOCK_SIZE
void op_par_loop_save_soln(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1 ){
  
  cl_int ciErrNum;
  cl_event ceEvent;



  if (OP_diags>2) {
    printf(" kernel routine w/o indirection:  save_soln \n");
  }

  // initialise timers

  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers(&cpu_t1, &wall_t1);

  // set CUDA execution parameters

#ifdef AUTO_BLOCK_SIZE
    const size_t nthread = 1024;
#else
  #ifdef OP_BLOCK_SIZE_0
    const size_t nthread = OP_BLOCK_SIZE_0;
  #else
    // int nthread = OP_block_size;
    const size_t nthread = 128;
  #endif
#endif

  const size_t nblocks = 200;
  const size_t n_tot_thread = nblocks * nthread;

  // work out shared memory requirements per element

  int nshared = 0;
  nshared = MAX(nshared,sizeof(float)*4);
  nshared = MAX(nshared,sizeof(float)*4);

  // execute plan

  int offset_s = nshared*OP_WARPSIZE;

  nshared = nshared*nthread;


  cl_kernel hKernel = getKernel( "op_cuda_save_soln" );

  //nshared *= 4;
  //offset_s *= 4;

  int i = 0;
  ciErrNum = clSetKernelArg( hKernel, i++, sizeof(cl_mem), &(arg0.data_d) );
  ciErrNum |= clSetKernelArg( hKernel, i++, sizeof(cl_mem), &(arg1.data_d) );
  ciErrNum |= clSetKernelArg( hKernel, i++, sizeof(int), &offset_s );
  ciErrNum |= clSetKernelArg( hKernel, i++, sizeof(int), &set->size );
  ciErrNum |= clSetKernelArg( hKernel, i++, nshared, NULL );
  assert_m( ciErrNum == CL_SUCCESS, "error setting kernel arguments" );

#ifdef AUTO_BLOCK_SIZE
  ciErrNum = clEnqueueNDRangeKernel( cqCommandQueue, hKernel, 1, NULL, &n_tot_thread, NULL, 0, NULL, &ceEvent );
#else
  ciErrNum = clEnqueueNDRangeKernel( cqCommandQueue, hKernel, 1, NULL, &n_tot_thread, &nthread, 0, NULL, &ceEvent );
#endif
  assert_m( ciErrNum == CL_SUCCESS, "error executing kernel" );

#ifndef ASYNC
  ciErrNum = clFinish( cqCommandQueue );
  assert_m( ciErrNum == CL_SUCCESS, "error completing device commands" );

#ifdef PROFILE
  unsigned long tqueue, tsubmit, tstart, tend, telapsed;
  ciErrNum  = clGetEventProfilingInfo( ceEvent, CL_PROFILING_COMMAND_QUEUED, sizeof(tqueue), &tqueue, NULL );
  ciErrNum |= clGetEventProfilingInfo( ceEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(tsubmit), &tsubmit, NULL );
  ciErrNum |= clGetEventProfilingInfo( ceEvent, CL_PROFILING_COMMAND_START, sizeof(tstart), &tstart, NULL );
  ciErrNum |= clGetEventProfilingInfo( ceEvent, CL_PROFILING_COMMAND_END, sizeof(tend), &tend, NULL );
  assert_m( ciErrNum == CL_SUCCESS, "error getting profiling info" );
  OP_kernels[0].queue_time      += (tsubmit - tqueue);
  OP_kernels[0].wait_time       += (tstart - tsubmit);
  OP_kernels[0].execution_time  += (tend - tstart);
  //printf("%20lu\n%20lu\n%20lu\n%20lu\n\n", tqueue, tsubmit, tstart, tend);
  //printf("queue: %8.4f\nwait:%8.4f\nexec: %8.4f\n\n", OP_kernels[0].queue_time * 1.0e-9, OP_kernels[0].wait_time * 1.0e-9, OP_kernels[0].execution_time * 1.0e-9 );
#endif

  // update kernel record

  op_timers(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;
  OP_kernels[0].time     += wall_t2 - wall_t1;
  OP_kernels[0].transfer += (float)set->size * arg0.size;
  OP_kernels[0].transfer += (float)set->size * arg1.size;
#endif
}
Example #14
0
int main()
{
	
	cl_float *inputMatrix1;
	cl_float *inputMatrix2;
	cl_float *results;
	cl_uint width = COLS;
	cl_uint height = ROWS;
	
	// OpenCL host variables
	cl_uint num_devs_returned;
	cl_context_properties properties[3];
	cl_device_id device_id;
	cl_int err;
	cl_platform_id platform_id;
	cl_uint num_platforms_returned;
	cl_context context;
	cl_command_queue command_queue;
	cl_program program;
	cl_kernel kernel;
	cl_mem input_buffer1,input_buffer2, output_buffer;
	size_t global[2];
	size_t local[2];

  cl_event profEvent;
	
	// variables used to read kernel source file
	FILE *fp;
	long filelen;
	long readlen;
	char *kernel_src;  // char string to hold kernel source


	// initialize inputMatrix with some data and print it
	int x,y;
	int data=0;

	inputMatrix1 = malloc(sizeof(cl_float)*width*height);
	inputMatrix2 = malloc(sizeof(cl_float)*width*height);
	results = malloc(sizeof(cl_float)*width*height);

	for(y=0;y<height;y++)
	{
		for(x=0;x<width;x++)
		{
		   inputMatrix1[y*height+x]= data;
		   inputMatrix2[y*height+x]= data;
		   results[y*height+x]=0;
                   data++;

		}
	}

	// read the kernel
	fp = fopen("multMatrix_kernel.cl","r");
	fseek(fp,0L, SEEK_END);
	filelen = ftell(fp);
	rewind(fp);

	kernel_src = malloc(sizeof(char)*(filelen+1));
	readlen = fread(kernel_src,1,filelen,fp);
	if(readlen!= filelen)
	{
	   printf("error reading file\n");
	   exit(1);
	}
	
	// ensure the string is NULL terminated
	kernel_src[filelen+1]='\0';


// OpenCL host source starts here ----

	// get a platform id
	err = clGetPlatformIDs(1,&platform_id,&num_platforms_returned); 

	if (err != CL_SUCCESS)
	{	
		printf("Unable to get Platform ID. Error Code=%d\n",err);
		exit(1);
	}
	
	err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &num_devs_returned);
	if (err != CL_SUCCESS)
	{	
		printf("Unable to get Device ID. Error Code=%d\n",err);
		exit(1);
	}
	
	// context properties list - must be terminated with 0
	properties[0]= CL_CONTEXT_PLATFORM;
	properties[1]= (cl_context_properties) platform_id;
	properties[2]= 0;
	
	// create context
	context = clCreateContext(properties, 1, &device_id, NULL, NULL, &err);
	if (err != CL_SUCCESS)
	{	
		printf("Unable to create context. Error Code=%d\n",err);
		exit(1);
	}

	// create command queue 
	command_queue = clCreateCommandQueue(context,device_id, CL_QUEUE_PROFILING_ENABLE, &err);
	if (err != CL_SUCCESS)
	{	
		printf("Unable to create command queue. Error Code=%d\n",err);
		exit(1);
	}
	 
	// create program object from source. kernel_src contains 
	// source read from file earlier
	program = clCreateProgramWithSource(context, 1 ,(const char **) &kernel_src, NULL, &err);
	if (err != CL_SUCCESS)
	{	
		printf("Unable to create program object. Error Code=%d\n",err);
		exit(1);
	}       
	
	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (err != CL_SUCCESS)
	{
        printf("Build failed. Error Code=%d\n", err);

		size_t len;
		char buffer[4096];
		// get the build log
		clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
		printf("--- Build Log -- \n %s\n",buffer);
		exit(1);
	}

	//kernel = clCreateKernel(program, "multMatrix", &err);
	kernel = clCreateKernel(program, "multMatrixSimple", &err);
	if (err != CL_SUCCESS)
	{	
		printf("Unable to create kernel object. Error Code=%d\n",err);
		exit(1);
	}

	// create buffer objects to input and output args of kernel function
	input_buffer1 = clCreateBuffer(context,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * ROWS*COLS, inputMatrix1, NULL);
	input_buffer2 = clCreateBuffer(context,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * ROWS*COLS, inputMatrix2, NULL);
	output_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * ROWS*COLS, NULL ,NULL);
	// set the kernel arguments
	if ( clSetKernelArg(kernel, 0, sizeof(cl_mem), &output_buffer) ||
         clSetKernelArg(kernel, 1, sizeof(cl_mem), &input_buffer1) ||
         clSetKernelArg(kernel, 2, sizeof(cl_mem), &input_buffer2) ||
         clSetKernelArg(kernel, 3, sizeof(cl_uint), &width) ||
         clSetKernelArg(kernel, 4, sizeof(cl_uint), &width) != CL_SUCCESS) 
	{
		printf("Unable to set kernel arguments. Error Code=%d\n",err);
		exit(1);
	}

	// set the global & local work size
	global[0]= width;
	global[1]= height;

	local[0]=BLOCK_SIZE;
	local[1]=BLOCK_SIZE;

	// Enqueue the kernel object with 
	// Dimension size = 2, 
	// global worksize = global, 
	// local worksize = local
	// No event wait list
	err = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global,local, 0, NULL, &profEvent);
	if (err != CL_SUCCESS)
	{	
		printf("Unable to enqueue kernel command. Error Code=%d\n",err);
		exit(1);
	}

	// wait for the command to finish
	clFinish(command_queue);

  cl_ulong startTime, endTime;
  clGetEventProfilingInfo(profEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL);
  clGetEventProfilingInfo(profEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);

  cl_ulong elapsedTime = endTime - startTime;

  printf("Elapsed time: %lu ns (%.3f ms)\n", elapsedTime, elapsedTime/10e6);

	// read the output back to host memory
	err = clEnqueueReadBuffer(command_queue, output_buffer, CL_TRUE, 0, sizeof(cl_float)*width*height, results, 0, NULL, NULL);
	if (err != CL_SUCCESS)
	{	
		printf("Error enqueuing read buffer command. Error Code=%d\n",err);
		exit(1);
	}

	// clean up
	clReleaseMemObject(input_buffer1);
	clReleaseMemObject(input_buffer2);
	clReleaseMemObject(output_buffer);
	clReleaseProgram(program);
	clReleaseKernel(kernel);
	clReleaseCommandQueue(command_queue);
	clReleaseContext(context);

// ---- End of OpenCL host portion 

	
//  uncoment this block to print out matrix results
/*
	printf("\nMatrix A\n");
	for(y=0;y<height;y++)
	{
	   for(x=0;x<width;x++)
	   {
	      printf("%.2f , ",inputMatrix1[y*height+x]);
	   }
	   printf("\n");
	}
	  
	printf("\nMatrix B\n");
	for(y=0;y<height;y++)
	{
	    for(x=0;x<width;x++)
	   {
	      printf("%.2f , ",inputMatrix2[y*height+x]);
	   }
	   printf("\n");
	}
	// print out the transposed matrix 
	printf("\n Matrix A + Matrix B \n");
	for(y=0;y<height;y++)
	{
	   for(x=0;x<width;x++)
	   {
	      printf("%.2f , ",results[y*height+x]);
	   }
	   printf("\n");
	}
*/	  
	free(kernel_src);
	free(inputMatrix1);
	free(inputMatrix2);
	free(results);
	return 0;
}
int main(int argc, char** argv) {
    /* OpenCL 1.1 data structures */
    cl_platform_id* platforms;
    cl_program program;
    cl_device_id device;
    cl_context context;
    cl_command_queue queue;
    cl_uint numOfPlatforms;
    cl_int  error;

    cl_mem matrixAMemObj; // input matrix A mem buffer
    cl_mem matrixBMemObj; // input matrix B mem buffer
    cl_mem matrixCMemObj; // input matrix C mem buffer
    cl_int* matrixA;      // input matrix A
    cl_int* matrixB;      // input matrix B
    cl_int* matrixC;      // input matrix C
    cl_uint widthA = WIDTH_G;
    cl_uint heightA = HEIGHT_G;
    cl_uint widthB = WIDTH_G;
    cl_uint heightB = HEIGHT_G;

	{
	    // allocate memory for input and output matrices 
        // based on whatever matrix theory i know.
	    matrixA = (cl_int*)malloc(widthA * heightA * sizeof(cl_int));
	    matrixB = (cl_int*)malloc(widthB * heightB * sizeof(cl_int));
	    matrixC = (cl_int*)malloc(widthB * heightA * sizeof(cl_int));

	    memset(matrixA, 0, widthA * heightA * sizeof(cl_int));
	    memset(matrixB, 0, widthB * heightB * sizeof(cl_int));
	    memset(matrixC, 0, widthB * heightA * sizeof(cl_int));
        
        fillRandom(matrixA, widthA, heightA, 643);
        fillRandom(matrixB, widthB, heightB, 991);
    }

    /*
     Get the number of platforms
     Remember that for each vendor's SDK installed on the computer,
     the number of available platform also increased.
     */
    error = clGetPlatformIDs(0, NULL, &numOfPlatforms);
    if(error != CL_SUCCESS) {
        perror("Unable to find any OpenCL platforms");
        exit(1);
    }
    
    platforms = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numOfPlatforms);
    printf("Number of OpenCL platforms found: %d\n", numOfPlatforms);
    
    error = clGetPlatformIDs(numOfPlatforms, platforms, NULL);
    if(error != CL_SUCCESS) {
        perror("Unable to find any OpenCL platforms");
        exit(1);
    }
    // Search for a GPU device through the installed platforms
    // Build a OpenCL program and do not run it.
    for(cl_int i = 0; i < numOfPlatforms; i++ ) {
        // Get the GPU device
        error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 1, &device, NULL);

        if(error != CL_SUCCESS) {
            perror("Can't locate a OpenCL compliant device i.e. GPU");
            exit(1);
        }
        /* Create a context */
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &error);
        if(error != CL_SUCCESS) {
            perror("Can't create a valid OpenCL context");
            exit(1);
        }
        
        /* Load the two source files into temporary datastores */
        const char *file_names[] = {"simple_mm_mult.cl"};
        const int NUMBER_OF_FILES = 1;
        char* buffer[NUMBER_OF_FILES];
        size_t sizes[NUMBER_OF_FILES];
        loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes);
        
        /* Create the OpenCL program object */
        program = clCreateProgramWithSource(context, NUMBER_OF_FILES, (const char**)buffer, sizes, &error);
	    if(error != CL_SUCCESS) {
            perror("Can't create the OpenCL program object");
            exit(1);
	    }
        /* Build OpenCL program object and dump the error message, if any */
        char *program_log;
        const char options[] = "";
        size_t log_size;

        error = clBuildProgram(program, 1, &device, options, NULL, NULL);
	    if(error != CL_SUCCESS) {
            // If there's an error whilst building the program, dump the log
            clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
            program_log = (char*) malloc(log_size+1);
            program_log[log_size] = '\0';
            clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                                  log_size+1, program_log, NULL);
            printf("\n=== ERROR ===\n\n%s\n=============\n", program_log);
            free(program_log);
            exit(1);
	    }
       
        // Queue is created with profiling enabled 
        cl_command_queue_properties props;
        props |= CL_QUEUE_PROFILING_ENABLE;

        queue = clCreateCommandQueue(context, device, props, &error);

        cl_kernel kernel = clCreateKernel(program, "mmmult", &error);

        matrixAMemObj = clCreateBuffer(context,
                                       CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,
                                       widthA * heightA * sizeof(cl_int),
                                       matrixA,
                                       &error);

        matrixBMemObj = clCreateBuffer(context,
                                       CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,
                                       widthB * heightB * sizeof(cl_int),
                                       matrixB,
                                       &error);

        matrixCMemObj = clCreateBuffer(context,
                                       CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR,
                                       widthB * heightA * sizeof(cl_int),
                                       0,
                                       &error);

        clSetKernelArg(kernel, 0, sizeof(cl_int),(void*)&widthB);
        clSetKernelArg(kernel, 1, sizeof(cl_int),(void*)&heightA);
        clSetKernelArg(kernel, 2, sizeof(cl_mem),(void*)&matrixAMemObj);
        clSetKernelArg(kernel, 3, sizeof(cl_mem),(void*)&matrixBMemObj);
        clSetKernelArg(kernel, 4, sizeof(cl_mem),(void*)&matrixCMemObj);
         
        size_t globalThreads[] = {widthB, heightA};

		cl_event exeEvt; 
        cl_ulong executionStart, executionEnd;
		error = clEnqueueNDRangeKernel(queue,
		                               kernel,
		                               2,
		                               NULL,
                                       globalThreads,
                                       NULL, 
                                       0,
                                       NULL,
                                       &exeEvt);
		clWaitForEvents(1, &exeEvt);
		if(error != CL_SUCCESS) {
			printf("Kernel execution failure!\n");
			exit(-22);
		}	

        // let's understand how long it took?
        clGetEventProfilingInfo(exeEvt, CL_PROFILING_COMMAND_START, sizeof(executionStart), &executionStart, NULL);
        clGetEventProfilingInfo(exeEvt, CL_PROFILING_COMMAND_END, sizeof(executionEnd), &executionEnd, NULL);
        clReleaseEvent(exeEvt);

        printf("Execution the matrix-matrix multiplication took %lu.%lu s\n", (executionEnd - executionStart)/1000000000, (executionEnd - executionStart)%1000000000);
 
        clEnqueueReadBuffer(queue,
                            matrixCMemObj,
                            CL_TRUE,
                            0,
                            widthB * heightA * sizeof(cl_int),
                            matrixC,
                            0,
                            NULL,
                            NULL);
       
        if (compare(matrixC, matrixA, matrixB, heightA, widthA, widthB))
            printf("Passed!\n");
        else 
            printf("Failed!\n");
 
        /* Clean up */
        for(i=0; i< NUMBER_OF_FILES; i++) { free(buffer[i]); }
        clReleaseProgram(program);
        clReleaseContext(context);
        clReleaseMemObject(matrixAMemObj);
        clReleaseMemObject(matrixBMemObj);
        clReleaseMemObject(matrixCMemObj);
    }
    
    free(matrixA);
    free(matrixB);
    free(matrixC);
}
Example #16
0
// Main function 
// *********************************************************************
int main(int argc, char **argv)
{
    shrQAStart(argc, argv);
	int NUM_BLOCKS = 10;
	shrSetLogFileName ("Barrier_Centralized.txt");
	while(NUM_BLOCKS<=120)
	{
	
	
	int iNumElements = NUM_BLOCKS* NUM_THREADS;	// total num of threads
	// BARRIER GOAL
	int goal_val = NUM_BLOCKS;
	// get command line arg for quick test, if provided
    bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");
    
    // start logs 
	cExecutableName = argv[0];
    shrSetLogFileName ("Barrier.txt");
    shrLog("%s Starting...\n\n# of THREADS \t= %i\n", argv[0], iNumElements); 

    // set and log Global and Local work size dimensions
    szLocalWorkSize = NUM_THREADS ;
    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements);  // rounded up to the nearest multiple of the LocalWorkSize
    shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n", 
           szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); 

    

    //Get an OpenCL platform
    ciErr1 = clGetPlatformIDs(1, &cpPlatform, NULL);

    shrLog("clGetPlatformID...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clGetPlatformID, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    //Get the devices
    ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
    shrLog("clGetDeviceIDs...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clGetDeviceIDs, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    //Create the context
    cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr1);
    shrLog("clCreateContext...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clCreateContext, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // Create a command-queue
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErr1);
    shrLog("clCreateCommandQueue...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clCreateCommandQueue, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }


	

    // Read the OpenCL kernel in from source file
    shrLog("oclLoadProgSource (%s)...\n", cSourceFile); 
    cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
    cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);

    // Create the program
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1);
    shrLog("clCreateProgramWithSource...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // Build the program with 'mad' Optimization option
    #ifdef MAC
        char* flags = "-cl-fast-relaxed-math -DMAC";
    #else
        char* flags = "-cl-fast-relaxed-math";
    #endif
    ciErr1 = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL);
    shrLog("clBuildProgram...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // Create the kernel
    ckKernel = clCreateKernel(cpProgram, "Barrier", &ciErr1);
    shrLog("clCreateKernel (Barrier)...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }



	 // Allocate and initialize host arrays 
    shrLog( "Allocate and Init Host Mem...\n"); 
    input = (int *)malloc(sizeof(int) * NUM_BLOCKS);

	for(int i =0; i<=NUM_BLOCKS; i++)
	{
		input[i]=0;

	}

	// Allocate the OpenCL buffer memory objects for source and result on the device GMEM
    array_in = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)* NUM_BLOCKS, NULL, &ciErr1);
    array_out = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)* NUM_BLOCKS, NULL, &ciErr1);
	
	if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }


    // Set the Argument values
    
    ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_int), (void*)&goal_val);
	ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&array_in);
	ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&array_out);

   // ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_int), (void*)&iNumElements);
    shrLog("clSetKernelArg 0 - 2...\n\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clSetKernelArg, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }






    // --------------------------------------------------------
    // Start Core sequence... copy input data to GPU, compute, copy results back



	ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, array_in, CL_FALSE, 0, sizeof(int) * NUM_BLOCKS,(void*) input, 0, NULL, NULL);
    
    shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }


    // Launch kernel
    ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, &ceEvent);
    shrLog("clEnqueueNDRangeKernel (Barrier)...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

   /*ciErr1 = clEnqueueReadBuffer(cqCommandQueue, global_mutex, CL_TRUE, 0, sizeof(cl_int), &original_goal, 0, NULL, NULL);
    shrLog("clEnqueueReadBuffer (Dst)...%d \n\n", original_goal); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }*/


	//GPU_PROFILING
    ciErr1=clWaitForEvents(1, &ceEvent);
	if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error 1 !\n\n");
        Cleanup(argc, argv, EXIT_FAILURE);
    }
       
        cl_ulong start, end;
     ciErr1 =   clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
      ciErr1 |= clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
        //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

	if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error 2 !\n\n");
        Cleanup(argc, argv, EXIT_FAILURE);
    }
        double dSeconds = 1.0e-9 * (double)(end - start);
		shrLog("Done! time taken %ul \n",end - start );
      // shrLog("Done! Kernel execution time: %.5f s\n\n", dSeconds);


		// Release event
       clReleaseEvent(ceEvent);
       ceEvent = 0;

    
		    Cleanup (argc, argv,  EXIT_SUCCESS);

			NUM_BLOCKS = NUM_BLOCKS+10;
		}

		shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
}
Example #17
0
    void display(cl_long start, cl_long finish)
    {
        std::string graph;
        graph.resize(WindowWidth + 1);
        graph[WindowWidth] = '\x0';
        cl_long timeFrame = finish - start;
        cl_long interval = timeFrame / WindowWidth;

        // Find time min/max ranges for the frame scaling
        for (size_t op = 0; (op < Total); ++op) {
            if (events_[op].size() == 0) continue;
            cl_long timeStart, timeEnd;
            int begin = 0, end = 0;
            for (size_t idx = 0; idx < events_[op].size(); ++idx) {
                bool cutStart = false;
                clGetEventProfilingInfo(events_[op][idx], CL_PROFILING_COMMAND_START,
                    sizeof(cl_long), &timeStart, NULL);
                clGetEventProfilingInfo(events_[op][idx], CL_PROFILING_COMMAND_END,
                    sizeof(cl_long), &timeEnd, NULL);

                // Continue if out of the frame scope
                if (timeStart >= finish) continue;
                if (timeEnd <= start) continue;

                if (timeStart <= start) {
                    timeStart = start;
                    cutStart = true;
                }

                if (timeEnd >= finish) {
                    timeEnd = finish;
                }

                // Readjust time to the frame
                timeStart -= start;
                timeEnd -= start;
                timeStart = static_cast<cl_long>(
                    floor(static_cast<float>(timeStart) / interval + 0.5f));
                timeEnd = static_cast<cl_long>(
                    floor(static_cast<float>(timeEnd) / interval + 0.5f));
                begin = static_cast<int>(timeStart);
                // Idle from end to begin
                for (int c = end; c < begin; ++c) {
                    graph[c] = '-';
                }
                end = static_cast<int>(timeEnd);
                for (int c = begin; c < end; ++c) {
                    if ((c == begin) && !cutStart) {
                        graph[c] = StartCommand[op];
                    }
                    else {
                        graph[c] = ExecCommand[op];
                    }
                }
                if ((begin == end) && (end < WindowWidth)) {
                    graph[begin] = '+';
                }
            }
            if (end < WindowWidth) {
                for (int c = end; c < WindowWidth; ++c) {
                    graph[c] = '-';
                }
            }
            printf("%s\n", graph.c_str());
        }
    }
Example #18
0
void run1(int N, char *fileName)
{
	puts("Matrix Vector Multiplication Naive\n");

	int i,j;

	float *A;
	A = (float*)malloc(sizeof(float)*N*N);

	for( i = 0; i < N ; ++i )
	{
		for( j = 0; j < N ; ++j )
		{
			A[i*N + j] = 1.f;	
		}
	}

	float *B;
	B = (float*)malloc(sizeof(float)*N);
	for( i = 0; i < N ; ++i )
	{
		B[i] = 1.f;	
	}
	
	float *C;
	C = (float*)malloc(sizeof(float)*N);


#ifdef DEBUG
	puts("A");
	check_2d_f(A,N,N);

	puts("B");
	check_1d_f(B,N);
#endif

	int NumK = 1;
	int NumE = 1;

	double gpuTime;
	cl_ulong gstart, gend;

	//------------------------------------------------
	//  OpenCL 
	//------------------------------------------------
	cl_int err;

	cl_platform_id platform;          // OpenCL platform
	cl_device_id device_id;           // device ID
	cl_context context;               // context
	cl_command_queue queue;           // command queue
	cl_program program;               // program

	cl_kernel *kernel = (cl_kernel*)malloc(sizeof(cl_kernel)*NumK);

	cl_event *event = (cl_event*)malloc(sizeof(cl_event)*NumE);    

	// read kernel file
	//char *fileName = "transpose_kernel.cl";
	char *kernelSource;
	size_t size;
	FILE *fh = fopen(fileName, "rb");
	if(!fh) {
		printf("Error: Failed to open kernel file!\n");
		exit(1);
	}
	fseek(fh,0,SEEK_END);
	size=ftell(fh);
	fseek(fh,0,SEEK_SET);
	kernelSource = malloc(size+1);
	size_t result;
	result = fread(kernelSource,1,size,fh);
	if(result != size){ fputs("Reading error", stderr);exit(1);}
	kernelSource[size] = '\0';
	
	// Bind to platform
	err = clGetPlatformIDs(1, &platform, NULL);
	OCL_CHECK(err);

	// Get ID for the device
	err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
	OCL_CHECK(err);

	// Create a context  
	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	OCL_CHECK(err);

	// Create a command queue 
	queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
	OCL_CHECK(err);

	// Create the compute program from the source buffer
	program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, NULL, &err);
	OCL_CHECK(err);

	// turn on optimization for kernel
	char *options="-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros -cl-unsafe-math-optimizations -cl-finite-math-only";

	err = clBuildProgram(program, 1, &device_id, options, NULL, NULL);
	if(err != CL_SUCCESS)
		printCompilerOutput(program, device_id);
	OCL_CHECK(err);

#ifdef SAVEBIN
	// Calculate size of binaries 
	size_t binary_size;
	err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_size, NULL);
	OCL_CHECK(err);

	unsigned char* bin;
	bin = (unsigned char*)malloc(sizeof(unsigned char)*binary_size);

	err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &bin, NULL);
	OCL_CHECK(err);

	// Print the binary out to the output file
	fh = fopen("kernel_mv_1.bin", "wb");
	fwrite(bin, 1, binary_size, fh);
	fclose(fh);

#endif

	kernel[0] = clCreateKernel(program, "mv_1", &err);
	OCL_CHECK(err);


	// memory on device
	cl_mem A_d    = clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N*N,  NULL, NULL);
	cl_mem B_d    = clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N,  NULL, NULL);
	cl_mem C_d    = clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N,  NULL, NULL);

	// Initialize device memory
	err = clEnqueueWriteBuffer(queue, A_d, 	CL_TRUE, 0, sizeof(float)*N*N, 	A, 0, NULL , NULL); 
	OCL_CHECK(err);
	err = clEnqueueWriteBuffer(queue, B_d, 	CL_TRUE, 0, sizeof(float)*N, 	B, 0, NULL , NULL); 
	OCL_CHECK(err);

	size_t localsize =  64;
	size_t globalsize = N;


	err  = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &A_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err  = clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &B_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err  = clSetKernelArg(kernel[0], 2, sizeof(cl_mem), &C_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err  = clSetKernelArg(kernel[0], 3, sizeof(int), &N);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}


	err = clEnqueueNDRangeKernel(queue, kernel[0], 1, NULL, &globalsize, &localsize, 0, NULL, &event[0]);
	OCL_CHECK(err);

	clFinish(queue);

	clEnqueueReadBuffer(queue, C_d, CL_TRUE, 0, sizeof(float)*N, C , 0, NULL , NULL );

	err = clWaitForEvents(1,&event[0]);
	OCL_CHECK(err);

	err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &gstart, NULL);
	OCL_CHECK(err);

	err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &gend, NULL);
	OCL_CHECK(err);

	gpuTime = (double)(gend -gstart)/1000000000.0;



	//check_1d_f(sum, blks+1);

#ifdef DEBUG
	puts("C = A * B");
	check_1d_f(C,N);

#endif

	printf("oclTime = %lf (s)\n", gpuTime );

	// free
	clReleaseMemObject(A_d);	
	clReleaseMemObject(B_d);	
	clReleaseMemObject(C_d);	



	clReleaseProgram(program);
	clReleaseContext(context);
	clReleaseCommandQueue(queue);
	for(i=0;i<NumK;++i){
		clReleaseKernel(kernel[i]);
	}
	for(i=0;i<NumE;++i){
		clReleaseEvent(event[i]);
	}
	free(kernelSource);


#ifdef SAVEBIN
	free(bin);
#endif

	free(A);
	free(B);
	free(C);

	return;
}
Example #19
0
int main(int argc, char **argv)
{
    printf("enter demo main\n");
    fflush(stdout);
    putenv("POCL_VERBOSE=1");
    putenv("POCL_DEVICES=basic");
    putenv("POCL_LEAVE_TEMP_DIRS=1");
    putenv("POCL_LEAVE_KERNEL_COMPILER_TEMP_FILES=1");
    putenv("POCL_TEMP_DIR=pocl");
    putenv("POCL_CACHE_DIR=pocl");
    putenv("POCL_WORK_GROUP_METHOD=spmd");
    if(argc >= 2) {
        printf("argv[1]:%s:\n",argv[1]);
        if(!strcmp(argv[1], "h"))
            putenv("POCL_WORK_GROUP_METHOD=spmd");
        if(!strcmp(argv[1], "c"))
            putenv("POCL_CROSS_COMPILE=1");
    }
    if(argc >= 3) {
        printf("argv[2]:%s:\n",argv[2]);
        if(!strcmp(argv[2], "h"))
            putenv("POCL_WORK_GROUP_METHOD=spmd");
        if(!strcmp(argv[2], "c"))
            putenv("POCL_CROSS_COMPILE=1");
    }

    //putenv("LD_LIBRARY_PATH=/scratch/colins/build/linux/fs/lib");
    //putenv("LTDL_LIBRARY_PATH=/scratch/colins/build/linux/fs/lib");
    //lt_dlsetsearchpath("/scratch/colins/build/linux/fs/lib");
    //printf("SEARCH_PATH:%s\n",lt_dlgetsearchpath());
    cl_platform_id platforms[100];
    cl_uint platforms_n = 0;
    CL_CHECK(clGetPlatformIDs(100, platforms, &platforms_n));

    printf("=== %d OpenCL platform(s) found: ===\n", platforms_n);
    for (int i=0; i<platforms_n; i++)
    {
        char buffer[10240];
        printf("  -- %d --\n", i);
        CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL));
        printf("  PROFILE = %s\n", buffer);
        CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL));
        printf("  VERSION = %s\n", buffer);
        CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL));
        printf("  NAME = %s\n", buffer);
        CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL));
        printf("  VENDOR = %s\n", buffer);
        CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL));
        printf("  EXTENSIONS = %s\n", buffer);
    }

    if (platforms_n == 0)
        return 1;

    cl_device_id devices[100];
    cl_uint devices_n = 0;
    // CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 100, devices, &devices_n));
    CL_CHECK(clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 100, devices, &devices_n));

    printf("=== %d OpenCL device(s) found on platform:\n", platforms_n);
    for (int i=0; i<devices_n; i++)
    {
        char buffer[10240];
        cl_uint buf_uint;
        cl_ulong buf_ulong;
        size_t wi_size[3];
        printf("  -- %d --\n", i);
        CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL));
        printf("  DEVICE_NAME = %s\n", buffer);
        CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL));
        printf("  DEVICE_VENDOR = %s\n", buffer);
        CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL));
        printf("  DEVICE_VERSION = %s\n", buffer);
        CL_CHECK(clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL));
        printf("  DRIVER_VERSION = %s\n", buffer);
        CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL));
        printf("  DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint);
        CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL));
        printf("  DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint);
        CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL));
        printf("  DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong);
        CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(wi_size), &wi_size, NULL));
        printf("  DEVICE_MAX_WG_SIZE X=%ld,Y=%ld,Z=%ld\n", wi_size[0], wi_size[1], wi_size[2]);
    }

    if (devices_n == 0)
        return 1;

    cl_context context;
    context = CL_CHECK_ERR(clCreateContext(NULL, 1, devices+1, &pfn_notify, NULL, &_err));

    cl_command_queue queue;
    queue = CL_CHECK_ERR(clCreateCommandQueue(context, devices[1], CL_QUEUE_PROFILING_ENABLE, &_err));

    cl_kernel kernel = 0;
    cl_mem memObjects[2] = {0,0};


    // Create OpenCL program - first attempt to load cached binary.
    //  If that is not available, then create the program from source
    //  and store the binary for future use.
    std::cout << "Attempting to create program from binary..." << std::endl;
    cl_program program = CreateProgramFromBinary(context, devices[1], "kernel.cl.bin");
    if (program == NULL)
    {
        std::cout << "Binary not loaded, create from source..." << std::endl;
        program = CreateProgram(context, devices[1], "kernel.cl");
        if (program == NULL)
        {
            Cleanup(context, queue, program, kernel, memObjects);
            return 1;
        }

        std::cout << "Save program binary for future run..." << std::endl;
        if (SaveProgramBinary(program, devices[1], "kernel.cl.bin") == false)
        {
            std::cerr << "Failed to write program binary" << std::endl;
            Cleanup(context, queue, program, kernel, memObjects);
            return 1;
        }
    }
    else
    {
        std::cout << "Read program from binary." << std::endl;
    }

    printf("attempting to create input buffer\n");
    fflush(stdout);
    cl_mem input_buffer;
    input_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*NUM_DATA, NULL, &_err));

    printf("attempting to create output buffer\n");
    fflush(stdout);
    cl_mem output_buffer;
    output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float)*NUM_DATA, NULL, &_err));

    memObjects[0] = input_buffer;
    memObjects[1] = output_buffer;

    float factor = ((float)rand()/(float)(RAND_MAX)) * 100.0;

    printf("attempting to create kernel\n");
    fflush(stdout);
    kernel = CL_CHECK_ERR(clCreateKernel(program, "saxpy", &_err));
    printf("setting up kernel args cl_mem:%lx \n",input_buffer);
    fflush(stdout);
    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_buffer), &input_buffer));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(factor), &factor));

    printf("attempting to enqueue write buffer\n");
    fflush(stdout);
    for (int i=0; i<NUM_DATA; i++) {
        float in = ((float)rand()/(float)(RAND_MAX)) * 100.0;
        CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, i*sizeof(float), 4, &in, 0, NULL, NULL));
    }

    cl_event kernel_completion;
    size_t global_work_size[1] = { NUM_DATA };
    printf("attempting to enqueue kernel\n");
    fflush(stdout);
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &kernel_completion));
    printf("Enqueue'd kerenel\n");
    fflush(stdout);
    cl_ulong time_start, time_end;
    CL_CHECK(clWaitForEvents(1, &kernel_completion));
    CL_CHECK(clGetEventProfilingInfo(kernel_completion, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL));
    CL_CHECK(clGetEventProfilingInfo(kernel_completion, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL));
    double elapsed = time_end - time_start;
    printf("time(ns):%lg\n",elapsed);
    CL_CHECK(clReleaseEvent(kernel_completion));

    printf("Result:");
    for (int i=0; i<NUM_DATA; i++) {
        float data;
        CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, i*sizeof(float), 4, &data, 0, NULL, NULL));
        //printf(" %f", data);
    }
    printf("\n");

    CL_CHECK(clReleaseMemObject(memObjects[0]));
    CL_CHECK(clReleaseMemObject(memObjects[1]));

    CL_CHECK(clReleaseKernel(kernel));
    CL_CHECK(clReleaseProgram(program));
    CL_CHECK(clReleaseContext(context));

    return 0;
}
void    matrixTransposeGMSP (cl_uint numDevices,cl_device_id *devices, cl_program program,cl_context context,float * h_Mat, float *h_Output,int height,int width)
{
        cl_int err;
        cl_command_queue cmdQueue;   //holds command queue object
        cl_kernel kernel;               //holds kernel object
        cl_mem d_Mat,d_rows,d_Output;            //holds device input output buffer
	int workgroup=height;
	size_t globalWorkSize[2]={workgroup,workgroup}; //holds global group size
        double                  gflops=0.0;             //holds total achieved gflops
        cl_ulong startTime, endTime,elapsedTime;       //holds time
        float executionTimeInSeconds;                 //holds total execution time
	cl_event                events; 
         cl_event                gpuExec[1];        // events

	//create command queue
        cmdQueue = clCreateCommandQueue(context, devices[0], CL_QUEUE_PROFILING_ENABLE, &err);
        if( err != CL_SUCCESS ||  cmdQueue == 0)
         {
               printf("\n\t Failed to create command queue  \n" );
               exit (-1);
         }

        /*create kernel object*/
        kernel = clCreateKernel(program,"transMatrix",&err);
        OPENCL_CHECK_STATUS("error while creating kernel",err);

        /*create buffer*/

        d_Mat=clCreateBuffer(context,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(float)*(height*width),h_Mat,&err);
        OPENCL_CHECK_STATUS("error while creating buffer for input",err);
      
	d_rows=clCreateBuffer(context,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(float),(void *)&height,&err);
        OPENCL_CHECK_STATUS("error while creating buffer for input",err);
        
	d_Output=clCreateBuffer(context,CL_MEM_WRITE_ONLY,sizeof(float)*(height*width),NULL,&err);
        OPENCL_CHECK_STATUS("error while creating buffer for output",err);


        /*set kernel arg*/
        err=clSetKernelArg(kernel,0,sizeof(cl_mem),(void *)&d_Mat);
        OPENCL_CHECK_STATUS("error while setting arg 1",err);

        err=clSetKernelArg(kernel,1,sizeof(cl_mem),(void *)&d_Output);
        OPENCL_CHECK_STATUS("error while setting arg 1",err);

        err=clSetKernelArg(kernel,2,sizeof(cl_mem),(void *)&d_rows);
        OPENCL_CHECK_STATUS("error while setting arg 2",err);

        /*load kernel*/
        err = clEnqueueNDRangeKernel(cmdQueue,kernel,2,NULL,globalWorkSize,NULL,0,NULL,&gpuExec[0]);
        OPENCL_CHECK_STATUS("error while creating ND range",err);

        //completion of all commands to command queue
        err = clFinish(cmdQueue);
        OPENCL_CHECK_STATUS("clFinish",err);

	/* calculate start time and end time*/
        clGetEventProfilingInfo(gpuExec[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL);
        clGetEventProfilingInfo(gpuExec[0], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);

	/* total alapsed time*/
        elapsedTime = endTime-startTime;
	/* total execution time*/
        executionTimeInSeconds = (float)(1.0e-9 * elapsedTime);

        /* reading buffer object*/
        err = clEnqueueReadBuffer(cmdQueue,d_Output,CL_TRUE,0,sizeof(cl_float)*height*width,h_Output,0,0,&events);
        OPENCL_CHECK_STATUS("error while reading buffer",err);

        // Print the gflops on the screen
         print_on_screen("Matrix Tranpose using global memory",executionTimeInSeconds,height,gflops,0);

	//release opencl objects
        clReleaseMemObject(d_Mat);
        clReleaseMemObject(d_rows);
        clReleaseMemObject(d_Output);
        clReleaseProgram(program);
        clReleaseKernel(kernel);
        clReleaseCommandQueue(cmdQueue);
        clReleaseContext(context);
}
int main() {

   /* OpenCL structures */
   cl_device_id device;
   cl_context context;
   cl_program program;
   cl_kernel vector_kernel, complete_kernel;
   cl_command_queue queue;
   cl_event start_event, end_event;
   cl_int i, err;
   size_t local_size, global_size;

   /* Data and buffers */
   float data[ARRAY_SIZE];
   float sum, actual_sum;
   cl_mem data_buffer, sum_buffer;
   cl_ulong time_start, time_end, total_time;

   /* Initialize data */
   for(i=0; i<ARRAY_SIZE; i++) {
      data[i] = 1.0f*i;
   }

   /* Create device and determine local size */
   device = create_device();
   err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, 	
         sizeof(local_size), &local_size, NULL);	
   if(err < 0) {
      perror("Couldn't obtain device information");
      exit(1);   
   }

   /* Create a context */
   context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
   if(err < 0) {
      perror("Couldn't create a context");
      exit(1);   
   }

   /* Build program */
   program = build_program(context, device, PROGRAM_FILE);

   /* Create data buffer */
   data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
         CL_MEM_USE_HOST_PTR, ARRAY_SIZE * sizeof(float), data, &err);
   sum_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
         sizeof(float), NULL, &err);
   if(err < 0) {
      perror("Couldn't create a buffer");
      exit(1);   
   };

   /* Create a command queue */
   queue = clCreateCommandQueue(context, device, 
         CL_QUEUE_PROFILING_ENABLE, &err);
   if(err < 0) {
      perror("Couldn't create a command queue");
      exit(1);   
   };

   /* Create kernels */
   vector_kernel = clCreateKernel(program, KERNEL_1, &err);
   complete_kernel = clCreateKernel(program, KERNEL_2, &err);
   if(err < 0) {
      perror("Couldn't create a kernel");
      exit(1);
   };

   /* Set arguments for vector kernel */
   err = clSetKernelArg(vector_kernel, 0, sizeof(cl_mem), &data_buffer);
   err |= clSetKernelArg(vector_kernel, 1, local_size * 4 * sizeof(float), NULL);

   /* Set arguments for complete kernel */
   err = clSetKernelArg(complete_kernel, 0, sizeof(cl_mem), &data_buffer);
   err |= clSetKernelArg(complete_kernel, 1, local_size * 4 * sizeof(float), NULL);
   err |= clSetKernelArg(complete_kernel, 2, sizeof(cl_mem), &sum_buffer);
   if(err < 0) {
      perror("Couldn't create a kernel argument");
      exit(1);   
   }

   /* Enqueue kernels */
   global_size = ARRAY_SIZE/4;
   err = clEnqueueNDRangeKernel(queue, vector_kernel, 1, NULL, &global_size, 
         &local_size, 0, NULL, &start_event);
   if(err < 0) {
      perror("Couldn't enqueue the kernel");
      exit(1);   
   }
   printf("Global size = %lu\n", global_size);

   /* Perform successive stages of the reduction */
   while(global_size/local_size > local_size) {
      global_size = global_size/local_size;
      err = clEnqueueNDRangeKernel(queue, vector_kernel, 1, NULL, &global_size, 
            &local_size, 0, NULL, NULL);
      printf("Global size = %lu\n", global_size);
      if(err < 0) {
         perror("Couldn't enqueue the kernel");
         exit(1);   
      }
   }
   global_size = global_size/local_size;
   err = clEnqueueNDRangeKernel(queue, complete_kernel, 1, NULL, &global_size, 
         NULL, 0, NULL, &end_event);
   printf("Global size = %lu\n", global_size);

   /* Finish processing the queue and get profiling information */
   clFinish(queue);
   clGetEventProfilingInfo(start_event, CL_PROFILING_COMMAND_START,
         sizeof(time_start), &time_start, NULL);
   clGetEventProfilingInfo(end_event, CL_PROFILING_COMMAND_END,
         sizeof(time_end), &time_end, NULL);
   total_time = time_end - time_start;

   /* Read the result */
   err = clEnqueueReadBuffer(queue, sum_buffer, CL_TRUE, 0, 
      sizeof(float), &sum, 0, NULL, NULL);
   if(err < 0) {
      perror("Couldn't read the buffer");
      exit(1);   
   }

   /* Check result */
   actual_sum = 1.0f * (ARRAY_SIZE/2)*(ARRAY_SIZE-1);
   if(fabs(sum - actual_sum) > 0.01*fabs(sum))
      printf("Check failed.\n");
   else
      printf("Check passed.\n");
   printf("Total time = %lu\n", total_time);

   /* Deallocate resources */
   clReleaseEvent(start_event);
   clReleaseEvent(end_event);
   clReleaseMemObject(sum_buffer);
   clReleaseMemObject(data_buffer);
   clReleaseKernel(vector_kernel);
   clReleaseKernel(complete_kernel);
   clReleaseCommandQueue(queue);
   clReleaseProgram(program);
   clReleaseContext(context);
   return 0;
}
Example #22
0
int main(int argc, char** argv)
{
	cl_event event;
	int err, i = 0;                            // error code returned from api calls
	cl_ulong time_start, time_end;
	double total_time = 0;

	pgm_t input_pgm, output_pgm;


	cl_device_id device_id;             // compute device id 
	cl_context context;                 // compute context
	cl_command_queue commands;          // compute command queue
	cl_program program;                 // compute program
	cl_kernel kernel;                // compute kernel

	// OpenCL device memory for matrices
	cl_mem d_image, d_filter, d_output;

	// Simple laplacian kernel
	DTYPE lap_filter[FILTER_SIZE*FILTER_SIZE] = {-1.0, -1.0, -1.0, -1.0, 8.0, -1.0, -1.0, -1.0, -1.0};
	DTYPE bias = 0.01;

	if (argc != 2) {
		printf("Usage: %s <image_name.pgm>\n", argv[0]);
		exit(1);
	}

	// Read the input image
	readPGM(&input_pgm, argv[1]);

	printf("Host: Input image resolution:%dx%d\n", input_pgm.width, input_pgm.height);

	DTYPE  *h_image, *h_image_padded;
	DTYPE  *h_filter, *h_output, *ref_output;

	// Allocate host memory for images and outputs
	h_image    = (DTYPE*)malloc(sizeof(DTYPE)*input_pgm.width*input_pgm.height);
	ref_output = (DTYPE*)malloc(sizeof(DTYPE)*input_pgm.width*input_pgm.height);
	
	//setup padded input image
	const int PADDED_SIZE = sizeof(DTYPE)*(input_pgm.width+FILTER_SIZE-1)*(input_pgm.height+FILTER_SIZE-1);
	h_image_padded = (DTYPE*)malloc(PADDED_SIZE);
	memset((void*)h_image_padded, 0, PADDED_SIZE); //init padded image to 0s
	int offset = 0; //Used for padded image

	// Convert range from [0, 255] to [0.0, 1.0]
	for(i = 0; i < input_pgm.width * input_pgm.height; i++)
	{
		if(i%input_pgm.width == 0 && i>0){ //if end of image row
			offset += FILTER_SIZE-1; //bump padded image to next row
		}
		h_image[i] = (DTYPE) input_pgm.buf[i]/255.0;
		h_image_padded[i+offset] = h_image[i];
	}

	h_filter = (DTYPE*) lap_filter;
	h_output = (DTYPE*) malloc(sizeof(DTYPE)*input_pgm.width*input_pgm.height);


	// Platform and device query
	cl_uint dev_cnt = 0;
	clGetPlatformIDs(0, 0, &dev_cnt);

	cl_platform_id platform_ids[5];
	clGetPlatformIDs(dev_cnt, platform_ids, NULL);

	for(i = 0;i < dev_cnt; i++)
	{
		err = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
		if(err == CL_SUCCESS)
			break;
	}
	if (err != CL_SUCCESS)
	{
		printf("Error: Failed to create a device group!\n");
		return EXIT_FAILURE;
	}

	// Create a compute context 
	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	if (!context)
	{
		printf("Error: Failed to create a compute context!\n");
		return EXIT_FAILURE;
	}

	// Create a command commands
	commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
	if (!commands)
	{
		printf("Error: Failed to create a command commands!\n");
		return EXIT_FAILURE;
	}

	// Create the compute program from the source file
	char *KernelSource;
	long lFileSize;
	lFileSize = LoadOpenCLKernel("conv_kernel.cl", &KernelSource);
	if( lFileSize < 0L ) {
		perror("File read failed");
		return 1;
	}

	program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
	if (!program)
	{
		printf("Error: Failed to create compute program!\n");
		return EXIT_FAILURE;
	}

	// Build the program executable
	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (err != CL_SUCCESS)
	{
		size_t len;
		char buffer[2048];
		printf("Error: Failed to build program executable!\n");
		clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
		printf("%s\n", buffer);
		exit(1);
	}

	kernel = clCreateKernel(program, "conv_2d", &err);
	if (!kernel || err != CL_SUCCESS)
	{
		printf("Error: Failed to create compute kernel!\n");
		exit(1);
	}

	// Allocate the device buffer for input image, kernel and transfer the data
	d_image  = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, PADDED_SIZE, h_image_padded, &err);

	// Create the input and output arrays in device memory for our calculation
	d_filter = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(DTYPE)*FILTER_SIZE*FILTER_SIZE, h_filter, &err);
	d_output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(DTYPE)*input_pgm.width*input_pgm.height, NULL, &err);

	if (!d_image || !d_filter || !d_output)
	{
		printf("Error: Failed to allocate device memory!\n");
		exit(1);
	}    
		
	size_t localWorkSize[2], globalWorkSize[2];
	int filter_size  = FILTER_SIZE;

	// Setup the kernel arguments
	err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_image);
	err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_filter);
	err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_output);
	err |= clSetKernelArg(kernel, 3, sizeof(int), &filter_size);
	err |= clSetKernelArg(kernel, 4, sizeof(DTYPE), &bias);

	if (err != CL_SUCCESS) {
		printf("Error: Failed to set kernel arguments! %d\n", err);	
		exit(1);
	}

	globalWorkSize[0] = input_pgm.width;
	globalWorkSize[1] = input_pgm.height;

	localWorkSize[0] = 1;
	localWorkSize[1] = 1;

	uint trials = 1;
	printf("Launching the kernel...\n");
	for(uint j=0; j<trials;j++){
		/*Enqueue task for parallel execution*/
		err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &event);
		if (err != CL_SUCCESS)
		{
			printf("Error: Failed to execute kernel! %d\n", err);
			exit(1);
		}

		// Wait for the commands to finish
		clWaitForEvents(1, &event);

		// Get the profiling info
		clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
		clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
		total_time  += (double)(time_end - time_start);

	}
	total_time /= trials;

	// Retrieve result from device
	printf("Reading output buffer into host memory...\n");
	err = clEnqueueReadBuffer(commands, d_output, CL_TRUE, 0, sizeof(DTYPE)*input_pgm.width*input_pgm.height, h_output, 0, NULL, NULL);
	if (err != CL_SUCCESS)
	{
		printf("Error: Failed to read output array! %d\n", err);
		exit(1);
	}

	//-------------------------------------------------------------
	// Compare between host and device output
    // Generate reference output
    int kr, kc, row, col;
    DTYPE sum = 0;
    for(row = 0; row < input_pgm.height; row++) {
        for(col = 0; col < input_pgm.width; col++) {
            sum = 0;
            for(kr = 0; kr < FILTER_SIZE; kr++) {
                for(kc = 0; kc < FILTER_SIZE; kc++ ) {
                    sum += (lap_filter[kr*FILTER_SIZE + kc] * h_image_padded[(row+kr)*(input_pgm.width+FILTER_SIZE-1) + col + kc]);
                }
            }
            ref_output[row*input_pgm.width + col] = sum + bias;
        }
    }
    // Check Results
	int test_fail = 0;
    for(row = 0; row < input_pgm.height; row++) {
        for(col = 0; col < input_pgm.width; col++) {
             if(ref_output[row*input_pgm.width+col] != h_output[row*input_pgm.width+col]){
                 printf("Mismatch at : row = %d, col = %d, expected = %f, got = %f\n",
                     row, col, ref_output[row*input_pgm.width+col], h_output[row*input_pgm.width+col]);
                 test_fail = 1;
             }
        }
    }


	output_pgm.width = input_pgm.width;
	output_pgm.height = input_pgm.height;

	// Remove garbage pixels in the border. If not, this will effect the subsequent normalization.!
	for(row = 0; row < output_pgm.height; row++) {
		for(col = 0; col < output_pgm.width; col++) {
			if(row > output_pgm.height- FILTER_SIZE || col > output_pgm.width-FILTER_SIZE)
				h_output[row * output_pgm.width + col] = 0.0;
		}
	}

	normalizeF2PGM(&output_pgm, h_output);
	/* Output image */
	writePGM(&output_pgm, "ocl_output.pgm");

	if (test_fail) {
		printf("INFO: TEST FAILED !!!!\n");
	} else {
		printf("INFO: ****TEST PASSED****\n");
	}
	printf("Kernel runtime = %0.3f us\n", total_time / 1000.0);

	destroyPGM(&input_pgm);
	destroyPGM(&output_pgm);

	free(h_image);
	free(h_image_padded);
	free(h_output);
	free(ref_output);
	clReleaseMemObject(d_image);
	clReleaseMemObject(d_filter);
	clReleaseMemObject(d_output);

	clReleaseProgram(program);
	clReleaseKernel(kernel);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);

	return 0;
}
Example #23
0
int main(int argc, char **argv) {
  cl_int status;

  if (argc != 2) {
    fprintf(stderr, "Usage: %s <scale>\n", argv[0]);
    teardown(-1);
  }

  cl_float scale = strtof(argv[1],NULL);
  printf("scale: %f\n", scale);

  const char *platform_name = "NVIDIA";

  if (!find_platform(platform_name, &platform)) {
    fprintf(stderr,"Error: Platform \"%s\" not found\n", platform_name);
    print_platforms();
    teardown(-1);
  }

  status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
  checkError (status, "Error: could not query devices");

  context = clCreateContext(NULL, 1, &device, NULL, NULL, &status);
  checkError(status, "could not create context");

  const char name[] = KERNELDIR "/interpolation.cl";

  unsigned char *source;
  size_t size;
  if (!load_file(name, &source, &size)) {
    teardown(-1);
  }

  program = clCreateProgramWithSource(context, 1, (const char **) &source, &size, &status);
  checkError(status, "Error: failed to create program %s: ", name);

  status = clBuildProgram(program, 1, &device, "-I.", NULL, NULL);
  if (status != CL_SUCCESS) {
    print_build_log(program, device);
    checkError(status, "Error: failed to create build %s: ", name);
  }

  free(source);

  print_device_info(device, 0);

  queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
  checkError(status, "could not create command queue");

  cl_ulong start, end;
  cl_event event;

  unsigned char *data;
  size_t datasize;

  if (!load_file("lena.dat", &data, &datasize)) {
    teardown(-1);
  }

  size_t width  = 512;
  size_t height = 512;

  size_t new_width = (size_t) ((int) width*scale);
  size_t new_height = (size_t) ((int) height*scale);
  printf("new size: %d %d\n", (int) new_width, (int) new_height);

  size_t buf_size = new_width*new_height*sizeof(cl_float);

  float *data_out = malloc(buf_size);
  if (!data_out) {
    fprintf(stderr,"\nError: malloc failed\n");
    teardown(-1);
  }

  kernel = clCreateKernel(program, "interpolation", &status);
  checkError(status, "could not create kernel");

  cl_image_format format = { CL_R, CL_UNORM_INT8};
  buffer_in = clCreateImage2D (context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &format,
    width, height, 0,
    data,
    &status);
  checkError(status, "Error: could not create image");

  cl_image_format format2 = { CL_R, CL_FLOAT};
  buffer_out = clCreateImage2D (context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, &format2,
    new_width, new_height, 0,
    NULL,
    &status);
  checkError(status, "Error: could not create image");

  // execute kernel
  int arg = 0;
  status  = clSetKernelArg(kernel, arg++, sizeof(cl_mem), &buffer_in);
  status  = clSetKernelArg(kernel, arg++, sizeof(cl_mem), &buffer_out);
  checkError(status, "Error: could not set args");

  size_t work_size[] = {new_width, new_height};
  size_t local_size[] = {1, 1};

  status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, work_size, local_size, 0, NULL, &event);
  checkError(status, "Error: could not enqueue kernel");

  status = clWaitForEvents(1, &event);
  checkError(status, "Error: could not wait for event");

  status  = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
  checkError(status, "Error: could not get start profile information");

  status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
  checkError(status, "Error: could not get end profile information");

  status = clReleaseEvent(event);
  checkError(status, "Error: could not release event");

  // read results back
  size_t origin[] = {0,0,0};
  size_t region[] = {new_width, new_height, 1};
  status = clEnqueueReadImage(queue, buffer_out, CL_FALSE, origin, region, new_width*sizeof(cl_float), 0, data_out, 0, NULL, NULL);
  checkError(status, "Error: could not copy data into device");

  status  = clFinish(queue);
  checkError(status, "Error: could not finish successfully");

  double elapsed = (end - start) * 1e-9f;
  printf("time: %f\n", elapsed);

  write_bmp("scale.bmp", data_out, new_width, new_height, NORMAL);

  free(data);
  free(data_out);
  teardown(0);
}
Example #24
0
static void*
piglit_cl_get_info(void* fn_ptr, void* obj, cl_uint param)
{
	cl_int errNo;
	size_t param_size;
	void* param_ptr = NULL;

	/* get param size */
	if(fn_ptr == clGetPlatformInfo) {
		errNo = clGetPlatformInfo(*(cl_platform_id*)obj, param, 0, NULL,
		                          &param_size);
	} else if(fn_ptr == clGetDeviceInfo) {
		errNo = clGetDeviceInfo(*(cl_device_id*)obj, param, 0, NULL,
		                        &param_size);
	} else if(fn_ptr == clGetContextInfo) {
		errNo = clGetContextInfo(*(cl_context*)obj, param, 0, NULL,
		                         &param_size);
	} else if(fn_ptr == clGetCommandQueueInfo) {
		errNo = clGetCommandQueueInfo(*(cl_command_queue*)obj, param, 0, NULL,
		                              &param_size);
	} else if(fn_ptr == clGetMemObjectInfo) {
		errNo = clGetMemObjectInfo(*(cl_mem*)obj, param, 0, NULL,
		                           &param_size);
	} else if(fn_ptr == clGetImageInfo) {
		errNo = clGetImageInfo(*(cl_mem*)obj, param, 0, NULL,
		                       &param_size);
	} else if(fn_ptr == clGetSamplerInfo) {
		errNo = clGetSamplerInfo(*(cl_sampler*)obj, param, 0, NULL,
		                         &param_size);
	} else if(fn_ptr == clGetProgramInfo) {
		errNo = clGetProgramInfo(*(cl_program*)obj, param, 0, NULL,
		                         &param_size);
	} else if(fn_ptr == clGetProgramBuildInfo) {
		errNo = clGetProgramBuildInfo(((struct _program_build_info_args*)obj)->program,
		                              ((struct _program_build_info_args*)obj)->device,
		                              param, 0, NULL, &param_size);
	} else if(fn_ptr == clGetKernelInfo) {
		errNo = clGetKernelInfo(*(cl_kernel*)obj, param, 0, NULL,
		                        &param_size);
	} else if(fn_ptr == clGetKernelWorkGroupInfo) {
		errNo = clGetKernelWorkGroupInfo(((struct _kernel_work_group_info_args*)obj)->kernel,
		                                 ((struct _kernel_work_group_info_args*)obj)->device,
		                                 param, 0, NULL, &param_size);
	} else if(fn_ptr == clGetEventInfo) {
		errNo = clGetEventInfo(*(cl_event*)obj, param, 0, NULL,
		                       &param_size);
	} else if(fn_ptr == clGetEventProfilingInfo) {
		errNo = clGetEventProfilingInfo(*(cl_event*)obj, param, 0, NULL,
		                                &param_size);
	} else {
		fprintf(stderr,
		        "Trying to get %s information from undefined function.\n",
		        piglit_cl_get_enum_name(param));
		piglit_report_result(PIGLIT_FAIL);
	}

	if(errNo == CL_SUCCESS) {
		param_ptr = calloc(param_size, sizeof(char));
		
		/* retrieve param */
		if(fn_ptr == clGetPlatformInfo) {
			errNo = clGetPlatformInfo(*(cl_platform_id*)obj, param,
			                          param_size, param_ptr, NULL);
		} else if(fn_ptr == clGetDeviceInfo) {
			errNo = clGetDeviceInfo(*(cl_device_id*)obj, param,
			                        param_size, param_ptr, NULL);
		} else if(fn_ptr == clGetContextInfo) {
			errNo = clGetContextInfo(*(cl_context*)obj, param,
			                         param_size, param_ptr, NULL);
		} else if(fn_ptr == clGetCommandQueueInfo) {
			errNo = clGetCommandQueueInfo(*(cl_command_queue*)obj,
			                              param, param_size, param_ptr, NULL);
		} else if(fn_ptr == clGetMemObjectInfo) {
			errNo = clGetMemObjectInfo(*(cl_mem*)obj, param,
			                           param_size, param_ptr, NULL);
		} else if(fn_ptr == clGetImageInfo) {
			errNo = clGetImageInfo(*(cl_mem*)obj, param,
			                       param_size, param_ptr, NULL);
		} else if(fn_ptr == clGetSamplerInfo) {
			errNo = clGetSamplerInfo(*(cl_sampler*)obj, param,
			                         param_size, param_ptr, NULL);
		} else if(fn_ptr == clGetProgramInfo) {
			errNo = clGetProgramInfo(*(cl_program*)obj, param,
			                         param_size, param_ptr, NULL);
		} else if(fn_ptr == clGetProgramBuildInfo) {
			errNo = clGetProgramBuildInfo(((struct _program_build_info_args*)obj)->program,
			                              ((struct _program_build_info_args*)obj)->device,
			                              param, param_size, param_ptr, NULL);
		} else if(fn_ptr == clGetKernelInfo) {
			errNo = clGetKernelInfo(*(cl_kernel*)obj, param,
			                        param_size, param_ptr, NULL);
		} else if(fn_ptr == clGetKernelWorkGroupInfo) {
			errNo = clGetKernelWorkGroupInfo(((struct _kernel_work_group_info_args*)obj)->kernel,
			                                 ((struct _kernel_work_group_info_args*)obj)->device,
			                                 param, param_size, param_ptr, NULL);
		} else if(fn_ptr == clGetEventInfo) {
			errNo = clGetEventInfo(*(cl_event*)obj, param,
			                       param_size, param_ptr, NULL);
		} else if(fn_ptr == clGetEventProfilingInfo) {
			errNo = clGetEventProfilingInfo(*(cl_event*)obj, param,
			                                param_size, param_ptr, NULL);
		}

		if(errNo != CL_SUCCESS) {
			free(param_ptr);
			param_ptr = NULL;
		}
	}

	if(param_ptr == NULL) {
		fprintf(stderr,
		        "Unable to get %s information (Error: %s)\n",
		        piglit_cl_get_enum_name(param),
		        piglit_cl_get_error_name(errNo));
		piglit_report_result(PIGLIT_FAIL);
	}

	return param_ptr;
}
static gpu_mem_buffer exec_pbkdf2(cl_uint *pass_api,cl_uint *salt_api,cl_uint saltlen_api,cl_uint *hash_out_api,cl_uint num, int jtrUniqDevNo,cl_command_queue cmdq )
{
	cl_event 	evnt;
	size_t 		N = num, M = globalObj[jtrUniqDevNo].lws;
	cl_int 		err;
	unsigned int 	i, itrCntKrnl = ITERATION_COUNT_PER_CALL;
	cl_ulong 	_kernelExecTimeNs = 0;

	HANDLE_CLERROR(clEnqueueWriteBuffer(cmdq, globalObj[jtrUniqDevNo].gpu_buffer.pass_gpu, CL_TRUE, 0, 4 * num * sizeof(cl_uint), pass_api, 0, NULL, NULL ), "Copy data to gpu");
	HANDLE_CLERROR(clEnqueueWriteBuffer(cmdq, globalObj[jtrUniqDevNo].gpu_buffer.salt_gpu, CL_TRUE, 0, (MAX_SALT_LENGTH / 2 + 1) * sizeof(cl_uint), salt_api, 0, NULL, NULL ), "Copy data to gpu");

	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[0], 2, sizeof(cl_uint), &saltlen_api), "Set Kernel 0 Arg 2 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[0], 3, sizeof(cl_uint), &num), "Set Kernel 0 Arg 3 :FAILED");

	err = clEnqueueNDRangeKernel(cmdq, globalObj[jtrUniqDevNo].krnl[0], 1, NULL, &N, &M, 0, NULL, &evnt);
	if (err) {
		if (PROFILE)
			globalObj[jtrUniqDevNo].lws = globalObj[jtrUniqDevNo].lws / 2;
	  	else
			HANDLE_CLERROR(err, "Enque Kernel Failed");

		return globalObj[jtrUniqDevNo].gpu_buffer;
	}

	if (PROFILE) {

		cl_ulong 	startTime, endTime;

		HANDLE_CLERROR(clWaitForEvents(1, &evnt), "Sync :FAILED");
		HANDLE_CLERROR(clFinish(cmdq), "clFinish error");

		clGetEventProfilingInfo(evnt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
		clGetEventProfilingInfo(evnt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
		_kernelExecTimeNs = endTime - startTime;
	}

	for (i=0; i< (10240 - 1); i = i+ itrCntKrnl ) {
		if (i == (10240 - itrCntKrnl))
			--itrCntKrnl;

		HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[1], 1, sizeof(cl_uint), &itrCntKrnl), "Set Kernel 1 Arg 1 :FAILED");

		err = clEnqueueNDRangeKernel(cmdq, globalObj[jtrUniqDevNo].krnl[1], 1, NULL, &N, &M, 0, NULL, &evnt);
		if (err) {
			if (PROFILE)
				globalObj[jtrUniqDevNo].lws = globalObj[jtrUniqDevNo].lws / 2;
			else
				HANDLE_CLERROR(err, "Enque Kernel Failed");

			return globalObj[jtrUniqDevNo].gpu_buffer;
		}

		opencl_process_event();

		if (PROFILE) {
			cl_ulong 	startTime, endTime;

			HANDLE_CLERROR(clWaitForEvents(1, &evnt), "Sync FAILED");
			HANDLE_CLERROR(clFinish(cmdq), "clFinish error");

			clGetEventProfilingInfo(evnt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
			clGetEventProfilingInfo(evnt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);

			_kernelExecTimeNs += endTime - startTime;
		}

		else if (active_dev_ctr == 1)
			HANDLE_CLERROR(clFinish(cmdq), "clFinish error");

	}

	err = clEnqueueNDRangeKernel(cmdq, globalObj[jtrUniqDevNo].krnl[2], 1, NULL, &N, &M, 0, NULL, &evnt);
	if (err) {
		if (PROFILE)
			globalObj[jtrUniqDevNo].lws = globalObj[jtrUniqDevNo].lws / 2;
	  	else
			HANDLE_CLERROR(err, "Enque Kernel Failed");

		return globalObj[jtrUniqDevNo].gpu_buffer;
	}

	if (PROFILE) {
			cl_ulong 	startTime, endTime;
			HANDLE_CLERROR(clWaitForEvents(1, &evnt), "Sync :FAILED");
			HANDLE_CLERROR(clFinish(cmdq), "clFinish error");

			clGetEventProfilingInfo(evnt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
			clGetEventProfilingInfo(evnt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);

			_kernelExecTimeNs += endTime - startTime;

			if (_kernelExecTimeNs < kernelExecTimeNs) {
				kernelExecTimeNs = _kernelExecTimeNs;

				//printf("%d\n",(int)kernelExecTimeNs);

				globalObj[jtrUniqDevNo].lws  = globalObj[jtrUniqDevNo].lws * 2;
				globalObj[jtrUniqDevNo].exec_time_inv =  (long double)pow(10, 9) / (long double)kernelExecTimeNs;
			}

         }
         else
		HANDLE_CLERROR(clEnqueueReadBuffer(cmdq, globalObj[jtrUniqDevNo].gpu_buffer.hash_out_gpu, CL_FALSE, 0, 4*num*sizeof(cl_uint), hash_out_api, 1, &evnt, &events[event_ctr++]), "Write :FAILED");


	 return globalObj[jtrUniqDevNo].gpu_buffer;
}
Example #26
0
cl_mem parallelRemap1( cl_mem a_buffer, cl_mem v_buffer, cl_mem b_buffer, uint asize, uint bsize, real max_a, real min_val, real min_diff, double *time ) {
    
    cl_int error = 0;
    
    uint temp_size = (uint)((max_a - min_val)/min_diff);
    
    cl_mem temp_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, temp_size*sizeof(int), NULL, &error);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    
    size_t global_work_size[1];
    size_t local_work_size[1];
    
    local_work_size[0] = TILE_SIZE;
    global_work_size[0] = ((asize+local_work_size[0]-1)/local_work_size[0])*local_work_size[0];
    
    /******************
     * Hash Kernel
     ******************/
    
    error = clSetKernelArg(cHash_kernel, 0, sizeof(real), &min_val);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    error = clSetKernelArg(cHash_kernel, 1, sizeof(real), &min_diff);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    error = clSetKernelArg(cHash_kernel, 2, sizeof(cl_uint), &asize);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    error = clSetKernelArg(cHash_kernel, 3, sizeof(cl_mem), (void*)&a_buffer);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    error = clSetKernelArg(cHash_kernel, 4, sizeof(cl_mem), (void*)&temp_buffer);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    
    global_work_size[0] = ((asize+local_work_size[0]-1)/local_work_size[0])*local_work_size[0];
    
    cl_event hash_kernel_event;
    
    error = clEnqueueNDRangeKernel(queue, cHash_kernel, 1, 0, global_work_size, local_work_size, 0, NULL, &hash_kernel_event);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    
    /*****************
     * Remap Kernel
     *****************/
    
    cl_mem remap_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, bsize*sizeof(real), NULL, &error);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    
    error = clSetKernelArg(remap1_kernel, 0, sizeof(real), &min_val);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    error = clSetKernelArg(remap1_kernel, 1, sizeof(real), &min_diff);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    error = clSetKernelArg(remap1_kernel, 2, sizeof(cl_uint), &temp_size);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    error = clSetKernelArg(remap1_kernel, 3, sizeof(cl_uint), &bsize);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    error = clSetKernelArg(remap1_kernel, 4, sizeof(cl_mem), (void*)&a_buffer);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    error = clSetKernelArg(remap1_kernel, 5, sizeof(cl_mem), (void*)&v_buffer);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    error = clSetKernelArg(remap1_kernel, 6, sizeof(cl_mem), (void*)&b_buffer);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    error = clSetKernelArg(remap1_kernel, 7, sizeof(cl_mem), (void*)&temp_buffer);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    error = clSetKernelArg(remap1_kernel, 8, sizeof(cl_mem), (void*)&remap_buffer);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
    
    global_work_size[0] = ((bsize+local_work_size[0]-1)/local_work_size[0])*local_work_size[0];

    cl_event remap_event;
    
    error = clEnqueueNDRangeKernel(queue, remap1_kernel, 1, 0, global_work_size, local_work_size, 0, NULL, &remap_event);
    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
        
    long gpu_time_start, gpu_time_end, gpu_time=0;
    
    clWaitForEvents(1, &remap_event);
    
    clGetEventProfilingInfo(hash_kernel_event, CL_PROFILING_COMMAND_START, sizeof(gpu_time_start), &gpu_time_start, NULL);
    clGetEventProfilingInfo(hash_kernel_event, CL_PROFILING_COMMAND_END, sizeof(gpu_time_end), &gpu_time_end, NULL);
    gpu_time += gpu_time_end - gpu_time_start;
    clReleaseEvent(hash_kernel_event);
        
    clGetEventProfilingInfo(remap_event, CL_PROFILING_COMMAND_START, sizeof(gpu_time_start), &gpu_time_start, NULL);
    clGetEventProfilingInfo(remap_event, CL_PROFILING_COMMAND_END, sizeof(gpu_time_end), &gpu_time_end, NULL);
    gpu_time += gpu_time_end - gpu_time_start;
    clReleaseEvent(remap_event);
    
    clReleaseMemObject(temp_buffer);

    *time = gpu_time*1.0e-9;
    
    return remap_buffer;

}
Example #27
0
/*!
 * @function clut_blurImage
 * Blurs the image at [filename] with a filter of size [filter_size], and saves the result
 * to the file "output.png".
 * @param filename
 * The name of the file.
 * @param filter_size
 * The size of the blur filter.
 * @return
 * 0 on success, non-0 on failure.
 */
int clut_blurImage(const cl_device_id device, const char * const filename, const unsigned int filter_size)
{
	const char * const fname = "clut_blurImage";
	int return_value = 1;
	cl_int ret;

	if (NULL == filename) {
		Debug_out(DEBUG_HOMEWORK, "%s: NULL pointer argument.\n", fname);
		goto error1;
	}

	/* Create context */
	cl_context context = clCreateContext(NULL, 1, &device, clut_contextCallback, "clut_blurImage", &ret);
	CLUT_CHECK_ERROR(ret, "Unable to create context", error1);
	Debug_out(DEBUG_HOMEWORK, "%s: Created context successfully.\n", fname);

	/* Create program */
	cl_program program = clut_createProgramFromFile(context, "homework_global.cl", NULL);
	if (NULL == program) {
		Debug_out(DEBUG_HOMEWORK, "%s: Unable to create program.\n", fname);
		goto error3;
	}
	Debug_out(DEBUG_HOMEWORK, "%s: Program created.\n", fname);

	/* Create kernel */
	cl_kernel kernel = clCreateKernel(program, "blurImage", &ret);
	CLUT_CHECK_ERROR(ret, "Unable to create kernel", error3);
	Debug_out(DEBUG_HOMEWORK, "%s: Kernel created.\n", fname);

	/* Create command_queue */
	cl_command_queue command_queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &ret);
	CLUT_CHECK_ERROR(ret, "Unable to create command queue", error4);
	Debug_out(DEBUG_HOMEWORK, "%s: Command queue created.\n", fname);

	/* load source image */
	int width, height;
	cl_mem source_image = clut_loadImageFromFile(context, filename, &width, &height);
	if (NULL == source_image) {
		Debug_out(DEBUG_HOMEWORK, "%s: Unable to read source image.\n", fname);
		goto error5;
	}

	if ((filter_size > (unsigned int) width) || (filter_size > (unsigned int) height)) {
		Debug_out(DEBUG_HOMEWORK, "%s: Filter does not fit in image.\n", fname);
		goto error6;
	}

	/* create destination image */
	cl_image_format image_format = {0, 0};

	cl_image_desc image_desc = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
	image_desc.image_type = CL_MEM_OBJECT_IMAGE2D;
//	image_desc.image_width		= 0;
//	image_desc.image_height		= 0;
//	image_desc.image_depth		= 0; /* only for 3D images */
//	image_desc.image_array_size	= 0; /* only for image arrays */
//	image_desc.image_row_pitch	= 0;
//	image_desc.image_slice_pitch	= 0; /* only for 3D images */
//	image_desc.num_mip_levels	= 0; /* mandatory */
//	image_desc.num_samples		= 0; /* mandatory */
//	image_desc.buffer		= NULL; /* only for 1D image buffers */

	image_desc.image_width = width - filter_size + 1;
	image_desc.image_height = height - filter_size + 1;

	ret = clGetImageInfo(source_image, CL_IMAGE_FORMAT, sizeof(image_format), &image_format, NULL);
	CLUT_CHECK_ERROR(ret, "Unable to get source image format information", error6);

	cl_mem result_image = clCreateImage(context, CL_MEM_WRITE_ONLY, &image_format, &image_desc, NULL, &ret);
	CLUT_CHECK_ERROR(ret, "Unable to create second image", error6);

	Debug_out(DEBUG_HOMEWORK, "%s: Images created.\n", fname);

	/* create filter matrix */
	unsigned char *filter_matrix = createFilterMatrix(filter_size);
	if (NULL == filter_matrix) {
		Debug_out(DEBUG_HOMEWORK, "%s: Unable to create filter matrix.\n", fname);
		goto error7;
	}
	Debug_out(DEBUG_HOMEWORK, "%s: Filter matrix created.\n", fname);
//	printFilterMatrix(filter_matrix, filter_size);

	/* copy filter matrix to device */
	cl_mem filter_matrix_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, filter_size * filter_size, filter_matrix, &ret);
	CLUT_CHECK_ERROR(ret, "Unable to create filter matrix buffer on device", error8);

	/* set kernel arguments */
	ret = clSetKernelArg(kernel, 0, sizeof(source_image), (void *) &source_image);
	CLUT_CHECK_ERROR(ret, "Unable to set source image argument", error9);
	Debug_out(DEBUG_HOMEWORK, "%s: Source image argument set.\n", fname);
	ret = clSetKernelArg(kernel, 1, sizeof(result_image), (void *) &result_image);
	CLUT_CHECK_ERROR(ret, "Unable to set result image argument", error9);
	Debug_out(DEBUG_HOMEWORK, "%s: Result image argument set.\n", fname);
	ret = clSetKernelArg(kernel, 2, sizeof(filter_size), (void *) &filter_size);
	CLUT_CHECK_ERROR(ret, "Unable to set filter size argument", error9);
	Debug_out(DEBUG_HOMEWORK, "%s: Filter size argument set.\n", fname);
	ret = clSetKernelArg(kernel, 3, sizeof(filter_matrix_buffer), (void *) &filter_matrix_buffer);
	CLUT_CHECK_ERROR(ret, "Unable to set filter matrix argument", error9);
	Debug_out(DEBUG_HOMEWORK, "%s: Filter matrix argument set.\n", fname);

	Debug_out(DEBUG_HOMEWORK, "%s: All kernel arguments set.\n", fname);

	/* run kernel */
	cl_event kernel_event;
	const size_t work_size[2] = { height - filter_size + 1, width - filter_size + 1};
	ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, work_size, NULL, 0, NULL, &kernel_event);
	CLUT_CHECK_ERROR(ret, "Unable to enqueue kernel", error9);

	ret = clFinish(command_queue);
	CLUT_CHECK_ERROR(ret, "Unable to finish commands in queue", error9);
	Debug_out(DEBUG_HOMEWORK, "%s: Kernel executed.\n", fname);
	ret = clWaitForEvents(1, &kernel_event);
	CLUT_CHECK_ERROR(ret, "Unable to wait for kernel event", error9);

	/* check that kernel executed correctly */
	cl_int kernel_ret;
	ret = clGetEventInfo(kernel_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(kernel_ret), &kernel_ret, NULL);
	CLUT_CHECK_ERROR(ret, "Unable to get kernel status", error9);
	Debug_out(DEBUG_HOMEWORK, "%s: Kernel status is %d.\n", fname, kernel_ret);
	if (CL_COMPLETE != kernel_ret) {
		Debug_out(DEBUG_HOMEWORK, "%s: kernel execution failed: %s.\n", fname, clut_getErrorDescription(kernel_ret));
		goto error9;
	}

	cl_ulong end_time;
	ret = clGetEventProfilingInfo(kernel_event, CL_PROFILING_COMMAND_END, sizeof(end_time), &end_time, NULL);
	CLUT_CHECK_ERROR(ret, "Unable to get kernel event end time", error9);
	if (0 == end_time) {
		Debug_out(DEBUG_HOMEWORK, "%s: kernel execution took 0 seconds.\n", fname);
		goto error9;
	}

	cl_double time_double = clut_getEventDuration(kernel_event);
	cl_ulong time_ulong = clut_getEventDuration_ns(kernel_event);
	Debug_out(DEBUG_HOMEWORK, "%s: Blurring took %f seconds (%lld nanoseconds).\n", fname, time_double, time_ulong);

	/* save image */
	clut_saveImageToFile("output.png", command_queue, result_image);

	/* print filter size and duration in nanoseconds for profiling */
	printf("%d,%llu\n", filter_size, clut_getEventDuration_ns(kernel_event));

	return_value = 0;

error9:
	clReleaseMemObject(filter_matrix_buffer);
error8:
	free(filter_matrix);
error7:
	clReleaseMemObject(result_image);
error6:
	clReleaseMemObject(source_image);
error5:
	clReleaseCommandQueue(command_queue);
error4:
	clReleaseKernel(kernel);
error3:
	clReleaseProgram(program);
error2:
	clReleaseContext(context);
error1:
	return return_value;

}
int
LDSBandwidth::bandwidth(cl_kernel &kernel)
{
    cl_int status;

    // Check group size against kernelWorkGroupSize
    status = clGetKernelWorkGroupInfo(kernel,
                                      devices[sampleArgs->deviceId],
                                      CL_KERNEL_WORK_GROUP_SIZE,
                                      sizeof(size_t),
                                      &kernelWorkGroupSize,
                                      0);
    CHECK_OPENCL_ERROR(status, "clGetKernelWorkGroupInfo failed.");

    if(localThreads > kernelWorkGroupSize)
    {
        localThreads = kernelWorkGroupSize;
    }

    // Set appropriate arguments to the kernel

    size_t size = (NUM_READS + localThreads) * vectorSize * sizeof(cl_float);
    // Local memory
    status = clSetKernelArg(kernel,
                            0,
                            size,
                            0);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed.(local memory)");

    // Output buffer
    status = clSetKernelArg(kernel,
                            1,
                            sizeof(cl_mem),
                            (void *)&outputBuffer);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed.(outputBuffer)");

    // Get used local memory
    status =  clGetKernelWorkGroupInfo(kernel,
                                       devices[sampleArgs->deviceId],
                                       CL_KERNEL_LOCAL_MEM_SIZE,
                                       sizeof(cl_ulong),
                                       &usedLocalMemory,
                                       NULL);
    CHECK_OPENCL_ERROR(status,
                       "clGetKernelWorkGroupInfo CL_KERNEL_LOCAL_MEM_SIZE failed.");

    if(usedLocalMemory > deviceInfo.localMemSize)
    {
        std::cout << "Unsupported: Insufficient local memory on device." << std::endl;
        return SDK_FAILURE;
    }

    double sec = 0;

    if(sampleArgs->deviceType.compare("cpu") == 0)
    {
        iterations = 10;
    }

    // Run the kernel for a number of iterations
    for(int i = 0; i < iterations; i++)
    {
        // Enqueue a kernel run call
        cl_event ndrEvt;
        status = clEnqueueNDRangeKernel(commandQueue,
                                        kernel,
                                        1,
                                        NULL,
                                        &globalThreads,
                                        &localThreads,
                                        0,
                                        NULL,
                                        &ndrEvt);
        CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.");

        // wait for the kernel call to finish execution
        status = clWaitForEvents(1, &ndrEvt);
        CHECK_OPENCL_ERROR(status, "clWaitForEvents failed.");

        // Calculate performance
        cl_ulong startTime;
        cl_ulong endTime;

        // Get kernel profiling info
        status = clGetEventProfilingInfo(ndrEvt,
                                         CL_PROFILING_COMMAND_START,
                                         sizeof(cl_ulong),
                                         &startTime,
                                         0);
        CHECK_OPENCL_ERROR(status, "clGetEventProfilingInfo failed.(startTime)");

        status = clGetEventProfilingInfo(ndrEvt,
                                         CL_PROFILING_COMMAND_END,
                                         sizeof(cl_ulong),
                                         &endTime,
                                         0);
        CHECK_OPENCL_ERROR(status, "clGetEventProfilingInfo failed.(endTime)");

        // Cumulate time for each iteration
        sec += 1e-9 * (endTime - startTime);

        status = clReleaseEvent(ndrEvt);
        CHECK_OPENCL_ERROR(status, "clReleaseEvent failed.(endTime)");
    }

    // Copy bytes
    int bytesPerThread = 0;
    if(vec3 == true)
    {
        bytesPerThread = NUM_READS * 3 * sizeof(cl_float);
    }
    else
    {
        bytesPerThread = NUM_READS * vectorSize * sizeof(cl_float);
    }
    double bytes = (double)(iterations * bytesPerThread);
    double perf = (bytes / sec) * 1e-9;
    perf *= globalThreads;

    std::cout << ": " << perf << " GB/s" << std::endl;

    return SDK_SUCCESS;
}
/** Run the OpenCL kernel */
short runOpenCL(struct benchmark *bench)
{

  short j;
  

#ifdef DEBUG
  printf("TEST OPENCL\n");
#endif
   /**********************

     Initializations

   **********************/
   error=clGetPlatformIDs(1,&platform,NULL);
   if (error != CL_SUCCESS)
   {
        fprintf(stderr,"Error to get platform ID : %d\n",error);
        goto error;
   }



   error=clGetDeviceIDs(platform,CL_DEVICE_TYPE_GPU,1,&device,NULL);
   if (error != CL_SUCCESS)
   {
        fprintf(stderr,"Error to get device ID : %d\n",error);
        goto error;
   } 


   if (error != CL_SUCCESS)
   {
        fprintf(stderr,"Can't get device info : %d\n",error);
        goto error;
   } 


   context = clCreateContext(0,1,&device,NULL,NULL,&error);
   if (error != CL_SUCCESS)
   {
        fprintf(stderr,"Error to create context : %d\n",error);
        goto error;
   } 

   size_t maxWorkItemDim;
   size_t maxWorkGroupSize;
   size_t workItemSize[10];
  

   error = clGetDeviceInfo(device,CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,sizeof(size_t),&maxWorkItemDim,NULL);
   
   if (error != CL_SUCCESS)
   {
        fprintf(stderr,"Can't get max work item dimensions : %d\n",error);
        goto errorContext;
   }

   error = clGetDeviceInfo(device,CL_DEVICE_MAX_WORK_ITEM_SIZES,maxWorkItemDim*sizeof(size_t),workItemSize,NULL);
   
   if (error != CL_SUCCESS)
   {
        fprintf(stderr,"Can't get mwork item sizes : %d\n",error);
        goto errorContext;
   }

   error = clGetDeviceInfo(device,CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(size_t),&maxWorkGroupSize,NULL);
   
   if (error != CL_SUCCESS)
   {
        fprintf(stderr,"Can't get max work item dimensions : %d\n",error);
        goto errorContext;
   }


   queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE ,&error);
   if (error != CL_SUCCESS)
   {
        fprintf(stderr,"Error to create command queue : %d\n",error);
        goto errorContext;
   } 





   /**********************

     Memory allocations

   **********************/
   long i;
#ifdef DEBUG
   printf("Create buffers\n");
#endif
   double createBufTime;
   createBufTime = createBuffers(context,bench);


   /**********************

     OpenCL kernel

   **********************/

   cl_program program;
   char *fileContent;
   FILE *f;
   struct stat fState;
   char path[256];

   strcpy(path,"Kernels/");
   strcat(path,bench->kernel);

   stat(path,&fState);

   f=fopen(path,"r");
   fileContent=malloc(fState.st_size*sizeof(char));
   fread(fileContent,sizeof(char),fState.st_size,f);
   fclose(f);


   program=clCreateProgramWithSource(context,1,(const char**)&fileContent,&fState.st_size,&error);
   free(fileContent);
   if (error != CL_SUCCESS)
   {
        fprintf(stderr,"Can't create program : %d\n",error);
        goto errorBuffer;
   }    
  

   /*error=clBuildProgram(program,1,&device,"-cl-fast-relaxed-math",NULL,NULL);*/
   error=clBuildProgram(program,1,&device,"",NULL,NULL);
   if (error != CL_SUCCESS)
   {
        fprintf(stderr,"Can't build program : %d\n",error);
        goto errorProgram;
   } 

   cl_kernel kernel=clCreateKernel(program,"mainKernel",&error);
   if (error != CL_SUCCESS)
   {
        fprintf(stderr,"Can't create kernel : %d\n",error);
        goto errorProgram;
   } 

   /**********************

     Launching the kernel

   **********************/  



#ifdef DEBUG
   printf("Set args\n");
#endif
   setArgs(kernel,bench);



   cl_ulong lStart;
   cl_ulong lEnd;
   double fTimeInSeconds;
   double fFLOPS;

#ifdef DEBUG
   printf("Compute\n");
   printf("%d\n",bench->worksizeDim);
   for(i=0;i<bench->worksizeDim;i++)
   {
      printf(" GLOBAL -> %d\n",bench->global_ws[i]);
      printf(" LOCAL -> %d\n",bench->local_ws[i]);
   }
#endif

   double writeBufTime;
   writeBufTime=writeInputs(queue,bench);



   double computeTime=getCurrentTime();
   error=clEnqueueNDRangeKernel(queue, kernel,bench->worksizeDim, NULL,bench->global_ws, bench->local_ws,0,NULL,&event);
   if (error != CL_SUCCESS)
   {
        fprintf(stderr,"Can't enqueue kernel : %d\n",error);
        goto errorKernel;
   } 

   clFinish(queue);
   computeTime= getCurrentTime() - computeTime;

#ifdef DEBUG
   printf("Read results\n");
#endif
   double readBufTime;
   readBufTime=readResults(queue,bench);

  

   error = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&lStart,NULL);
   error |= clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_END  ,sizeof(cl_ulong),&lEnd,NULL);
   if (error != CL_SUCCESS)
   {
        fprintf(stderr,"Can't get profiling info : %d\n",error);
        goto errorEvent;
   } 

   fTimeInSeconds = ((double)(lEnd-lStart)) / 1000000000.0;
   

   /* Send timing */
   write(newsockfd,"t\n",2*sizeof(char));

   sprintf(result,"%f\n",createBufTime);
   write(newsockfd,result,strlen(result));

   sprintf(result,"%f\n",writeBufTime);
   write(newsockfd,result,strlen(result));

   sprintf(result,"%f\n",fTimeInSeconds);
   write(newsockfd,result,strlen(result));

   sprintf(result,"%f\n",readBufTime);
   write(newsockfd,result,strlen(result));

   /*sprintf(result,"%f\n",computeTime);
   write(newsockfd,result,strlen(result));
   */


   /* Send results */
   sendResults(bench);


   
   
errorEvent:  
   clReleaseEvent(event); 
errorKernel:
   clReleaseKernel(kernel);

   /**********************

     Cleanup

   **********************/


errorProgram:
   clReleaseProgram(program);
errorBuffer:
   clReleaseCommandQueue(queue);
   releaseBuffers(bench);
errorContext:
   clReleaseContext(context);

   /**

   HORRIBLE error processing. It may create memory leaks. It will have to be improved.

   */

error: 
 return (error != CL_SUCCESS);

}
Example #30
0
// reorder the data from the scanned histogram
void cl_radix_reorder(uint pass){


	cl_int err;

	size_t nblocitems=_ITEMS;
	size_t nbitems=_GROUPS*_ITEMS;


	clFinish(command_que);

	err  = clSetKernelArg(ckReorder, 0, sizeof(cl_mem), &d_inKeys);
	assert(err == CL_SUCCESS);

	err  = clSetKernelArg(ckReorder, 1, sizeof(cl_mem), &d_outKeys);
	assert(err == CL_SUCCESS);

	err = clSetKernelArg(ckReorder, 3, sizeof(uint), &pass);
	assert(err == CL_SUCCESS);

	err  = clSetKernelArg(ckReorder, 4, sizeof(cl_mem), &d_inPermut);
	assert(err == CL_SUCCESS);

	err  = clSetKernelArg(ckReorder, 5, sizeof(cl_mem), &d_outPermut);
	assert(err == CL_SUCCESS);

	err  = clSetKernelArg(ckReorder, 6,
			sizeof(uint)* _RADIX * _ITEMS ,
			NULL); // mem cache
	assert(err == CL_SUCCESS);

	assert( nkeys_rounded%(_GROUPS * _ITEMS) == 0);

	err = clSetKernelArg(ckReorder, 7, sizeof(uint), &nkeys_rounded);
	assert(err == CL_SUCCESS);


	assert(_RADIX == pow(2,_BITS));

	cl_event eve;

	err = clEnqueueNDRangeKernel(command_que,
			ckReorder,
			1, NULL,
			&nbitems,
			&nblocitems,
			0, NULL, &eve);

	//cout << err<<" , "<<CL_MEM_OBJECT_ALLOCATION_FAILURE<<endl;

	assert(err== CL_SUCCESS);
	clFinish(command_que);

	cl_ulong debut,fin;

	err=clGetEventProfilingInfo (eve,
			CL_PROFILING_COMMAND_QUEUED,
			sizeof(cl_ulong),
			(void*) &debut,
			NULL);
	//cout << err<<" , "<<CL_PROFILING_INFO_NOT_AVAILABLE<<endl;
	assert(err== CL_SUCCESS);

	err=clGetEventProfilingInfo (eve,
			CL_PROFILING_COMMAND_END,
			sizeof(cl_ulong),
			(void*) &fin,
			NULL);
	assert(err== CL_SUCCESS);

	//cout <<"durée="<<(float) (fin-debut)/1e9<<" s"<<endl;
	reorder_time += (float) (fin-debut)/1e9;



	// swap the old and new vectors of keys
	cl_mem d_temp;
	d_temp=d_inKeys;
	d_inKeys=d_outKeys;
	d_outKeys=d_temp;

	// swap the old and new permutations
	d_temp=d_inPermut;
	d_inPermut=d_outPermut;
	d_outPermut=d_temp;

}