Esempio n. 1
0
int main(int argc, char *argv[])
{
	//fprintf(stderr, "[%s:%d:%s()] FFT!\n", __FILE__, __LINE__, __func__);
	LOG("FFT Start\n");
	cl_mem xmobj = NULL;
	cl_mem rmobj = NULL;
	cl_mem wmobj = NULL;
	cl_kernel sfac = NULL;
	cl_kernel trns = NULL;
	cl_kernel hpfl = NULL;

	cl_uint ret_num_platforms;
	cl_uint ret_num_devices;

	cl_int ret;

	cl_float2 *xm;
	cl_float2 *rm;
	cl_float2 *wm;

	pgm_t ipgm;
	pgm_t opgm;

	FILE *fp;
	const char fileName[] = "./fft.cl";
	size_t source_size;
	char *source_str;
	cl_int i, j;
	cl_int n;
	cl_int m;

	size_t gws[2];
	size_t lws[2];

	fp = fopen(fileName, "r");
	if(!fp)
	{
		fprintf(stderr, "[%s:%d:%s()] ERROR, Failed to load kernel source.\n", __FILE__, __LINE__, __func__);
		return 1;
	}

	source_str = (char *)malloc(MAX_SOURCE_SIZE);
	source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
	fclose(fp);

	readPGM(&ipgm, "./lena.pgm");

	n = ipgm.width;
	m = (cl_int)(log((double)n)/log(2.0));

	LOG("n = %d, m = %d.\n", m, n);

	xm = (cl_float2*)malloc(n*n*sizeof(cl_float2));
	rm = (cl_float2*)malloc(n*n*sizeof(cl_float2));
	wm = (cl_float2*)malloc(n/2 *sizeof(cl_float2));

	for( i = 0; i < n; i++)
	{
		for(j = 0; j < n; j++)
		{
			((float*)xm)[2*(n*j + i) + 0] = (float)ipgm.buf[n*j + i];
			((float*)xm)[2*(n*j + i) + 1] = (float)0;
		}
	}

	CL_CHECK(ret = clGetPlatformIDs(MAX_PLATFORM_IDS, platform_ids, &ret_num_platforms));
	platform_id = platform_ids[0];
	CL_CHECK(ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices));

	LOG("platform_id = %p, device_id = %p\n", platform_id, device_id);


	context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
	CL_CHECK(ret);

	queue = clCreateCommandQueue(context, device_id, 0, &ret);

	xmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret);
	CL_CHECK(ret);
	rmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret);
	CL_CHECK(ret);
	wmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret);
	CL_CHECK(ret);

	CL_CHECK(ret = clEnqueueWriteBuffer(queue, xmobj, CL_TRUE, 0, n*n*sizeof(cl_float2), xm, 0, NULL, NULL));

	program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
	CL_CHECK(ret);

	CL_CHECK(ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));

	sfac = clCreateKernel(program, "spinFact", &ret);
	CL_CHECK(ret);
	trns = clCreateKernel(program, "transpose", &ret);
	CL_CHECK(ret);
	hpfl = clCreateKernel(program, "highPassFilter", &ret);
	CL_CHECK(ret);

	CL_CHECK(ret = clSetKernelArg(sfac, 0, sizeof(cl_mem), (void *)&wmobj));
	CL_CHECK(ret = clSetKernelArg(sfac, 1, sizeof(cl_int), (void *)&n));
	setWorkSize(gws, lws, n/2, 1);
	CL_CHECK(ret = clEnqueueNDRangeKernel(queue, sfac, 1, NULL, gws, lws, 0, NULL, NULL));

	fftCore(rmobj, xmobj, wmobj, m, forward);

	CL_CHECK(ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&xmobj));
	CL_CHECK(ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&rmobj));
	CL_CHECK(ret = clSetKernelArg(trns, 2, sizeof(cl_int), (void *)&n));
	setWorkSize(gws, lws, n, n);
	CL_CHECK(ret = clEnqueueNDRangeKernel(queue, trns, 2, NULL, gws, lws, 0, NULL, NULL));

	fftCore(rmobj, xmobj, wmobj, m, forward);

#if 1 //FILTER
	cl_int radius = n>>4;
	CL_CHECK(ret = clSetKernelArg(hpfl, 0, sizeof(cl_mem), (void *)&rmobj));
	CL_CHECK(ret = clSetKernelArg(hpfl, 1, sizeof(cl_int), (void *)&n));
	CL_CHECK(ret = clSetKernelArg(hpfl, 2, sizeof(cl_int), (void *)&radius));
	setWorkSize(gws, lws, n, n);
	CL_CHECK(ret = clEnqueueNDRangeKernel(queue, hpfl, 2, NULL, gws, lws, 0, NULL, NULL));
#endif

#if 1 /* Inverse FFT */
	fftCore(xmobj, rmobj, wmobj, m, inverse);

	CL_CHECK(ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&rmobj));
	CL_CHECK(ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&xmobj));
	CL_CHECK(ret = clSetKernelArg(trns, 2, sizeof(cl_int), (void *)&n));
	setWorkSize(gws, lws, n, n);
	CL_CHECK(ret = clEnqueueNDRangeKernel(queue, trns, 2, NULL, gws, lws, 0, NULL, NULL));

	fftCore(xmobj, rmobj, wmobj, m, inverse);
#endif

	CL_CHECK(ret = clEnqueueReadBuffer(queue, xmobj, CL_TRUE, 0, n*n*sizeof(cl_float2), xm, 0, NULL, NULL));

	float *ampd;
	ampd = (float*)malloc(n*n*sizeof(float));
	for(i = 0; i < n; i++)
	{
		for(j = 0; j < n; j++)
		{
			ampd[n*i + j] = AMP( ((float*)xm)[2*(n*i + j)], ((float*)xm)[2*(n*i + j) + 1] );
//			fprintf(stderr, "%d ", (int)ampd[n*i + j]);
		}
//		fprintf(stderr, "\n");
	}

	opgm.width = n;
	opgm.height = n;
	normalizeF2PGM(&opgm, ampd);
	free(ampd);

	writePGM(&opgm, "output.pgm");

	/* Termination */
	CL_CHECK(ret = clFlush(queue));
	CL_CHECK(ret = clFinish(queue));
	CL_CHECK(ret = clReleaseKernel(hpfl));
	CL_CHECK(ret = clReleaseKernel(trns));
	CL_CHECK(ret = clReleaseKernel(sfac));
	CL_CHECK(ret = clReleaseProgram(program));
	CL_CHECK(ret = clReleaseMemObject(xmobj));
	CL_CHECK(ret = clReleaseMemObject(rmobj));
	CL_CHECK(ret = clReleaseMemObject(wmobj));
	CL_CHECK(ret = clReleaseCommandQueue(queue));
	CL_CHECK(ret = clReleaseContext(context));

	destroyPGM(&ipgm);
	destroyPGM(&opgm);

	free(source_str);
	free(wm);
	free(rm);
	free(xm);

	return 0;
}
Esempio n. 2
0
int main(int argc, char** argv)
{
	cl_event event,event1,event2;
	int j =0,stride=2;
	int err, i =0, index =0;                            // error code returned from api calls
	pgm_t input_pgm,output_pgm;

	int ipgm_img_width,opgm_img_width;
	int ipgm_img_height,opgm_img_height;

	cl_device_id device_id;             // compute device id 
	cl_context context;                 // compute context
	cl_command_queue commands;          // compute command queue
	cl_program program;                 // compute program
	cl_kernel kernel[3];                // compute kernel

	// OpenCL device memory for matrices
	cl_mem d_image, d_filter, d_output, d_bias;

	if (argc != 2) {
		printf("Expecting 2 arguments.\n");
		exit(1);
	}

	readPGM(&input_pgm,argv[1]);
	ipgm_img_width  = input_pgm.width;
	ipgm_img_height = input_pgm.height;
	opgm_img_width  = input_pgm.width;//-CONV1_FILTER_WIDTH+1;
	opgm_img_height = input_pgm.height;//-CONV1_FILTER_HEIGHT+1;

	printf("cl:main input image resolution:%dx%d\n", ipgm_img_width,ipgm_img_height);
	printf("cl:main output image resolution:%dx%d\n", opgm_img_width,opgm_img_height);

	DTYPE  *h_image;
	DTYPE  *h_filter, *h_bias, *h_output;

	// Allocate host memory for matrices
	unsigned int size_image = ipgm_img_width*ipgm_img_height;
	unsigned int mem_size_image = sizeof(DTYPE) * size_image;
	h_image    = (DTYPE*)malloc(mem_size_image);
	for(i=0;i<size_image;i++)
	{
		h_image[i] = (DTYPE) input_pgm.buf[i]/255;
	}

	unsigned int size_filter = CONV1_FILTER_WIDTH*CONV1_FILTER_HEIGHT;
	unsigned int mem_size_filter = sizeof(DTYPE) * size_filter;
	h_filter = (DTYPE*) conv1_weights;

	unsigned int size_output = opgm_img_width * opgm_img_height;
	unsigned int mem_size_output = sizeof(DTYPE) * size_output;
	h_output = (DTYPE*) malloc(mem_size_output);

	unsigned int size_bias = 1; //1 bias value for 1 output map 
	unsigned int mem_size_bias = sizeof(DTYPE) * size_bias;
	h_bias = (DTYPE*) conv1_bias;

	cl_uint dev_cnt = 0;
	clGetPlatformIDs(0, 0, &dev_cnt);

	cl_platform_id platform_ids[5];

	clGetPlatformIDs(dev_cnt, platform_ids, NULL);
	for(i=0;i<dev_cnt;i++)
	{
#ifdef DEVICE_GPU
		err = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
#else
		err = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
#endif
		if(err == CL_SUCCESS)
			break;
	}
	if (err != CL_SUCCESS)
	{
		if(err == CL_INVALID_PLATFORM)
			printf("CL_INVALID_PLATFORM\n");
		if(err == CL_INVALID_DEVICE_TYPE)
			printf("CL_INVALID_DEVICE_TYPE\n");
		if(err == CL_INVALID_VALUE)
			printf("CL_INVALID_VALUE\n");
		if(err == CL_DEVICE_NOT_FOUND)
			printf("CL_DEVICE_NOT_FOUND\n");
		printf("Error: Failed to create a device group!\n");
		return EXIT_FAILURE;
	}

	// Create a compute context 
	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	if (!context)
	{
		printf("Error: Failed to create a compute context!\n");
		return EXIT_FAILURE;
	}

	// Create a command commands
	commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
	if (!commands)
	{
		printf("Error: Failed to create a command commands!\n");
		return EXIT_FAILURE;
	}

	// Create the compute program from the source file
	char *KernelSource;
	long lFileSize;
	lFileSize = LoadOpenCLKernel("kernels.cl", &KernelSource);
	if( lFileSize < 0L ) {
		perror("File read failed");
		return 1;
	}

	program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
	if (!program)
	{
		printf("Error: Failed to create compute program!\n");
		return EXIT_FAILURE;
	}

	// Build the program executable
	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (err != CL_SUCCESS)
	{
		size_t len;
		char buffer[2048];
		printf("Error: Failed to build program executable!\n");
		clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
		printf("%s\n", buffer);
		exit(1);
	}

	kernel[0] = clCreateKernel(program, "conv_2d", &err);
	if (!kernel[0] || err != CL_SUCCESS)
	{
		printf("Error: Failed to create compute kernel!\n");
		exit(1);
	}

	d_image  = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR /*| CL_MEM_USE_MSMC_TI*/, mem_size_image, h_image, &err);

	cl_ulong time_start, time_end;
	double total_time;

	// Create the input and output arrays in device memory for our calculation
	d_filter = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR /*| CL_MEM_USE_MSMC_TI*/, mem_size_filter, h_filter, &err);
	d_output = clCreateBuffer(context, CL_MEM_WRITE_ONLY /*| CL_MEM_USE_MSMC_TI*/, mem_size_output, NULL, &err);
	d_bias   = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR , mem_size_bias, h_bias, &err);

	if (!d_image || !d_filter || !d_output || !d_bias)
	{
		printf("Error: Failed to allocate device memory!\n");
		exit(1);
	}    
		
	// Launch OpenCL kernel
	size_t localWorkSize[2], globalWorkSize[2];
	int filter_width  = CONV1_FILTER_WIDTH;
	int filter_height = CONV1_FILTER_HEIGHT;

	localWorkSize[0] = opgm_img_width;
	localWorkSize[1] = opgm_img_height/NUM_WORK_GROUPS;

	globalWorkSize[0] = opgm_img_width;
	globalWorkSize[1] = opgm_img_height;

	err  = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), (void *)&d_image);
	err |= clSetKernelArg(kernel[0], 1, sizeof(cl_mem), (void *)&d_filter);
	err |= clSetKernelArg(kernel[0], 2, sizeof(cl_mem), (void *)&d_output);
	err |= clSetKernelArg(kernel[0], 3, sizeof(int), (void *)&filter_width);
	err |= clSetKernelArg(kernel[0], 4, sizeof(int), (void *)&filter_height);
	err |= clSetKernelArg(kernel[0], 5, sizeof(cl_mem), (void*)&d_bias);
	err |= clSetKernelArg(kernel[0], 6, sizeof(float)*localWorkSize[0]*(localWorkSize[1]+filter_height-1), (void*)NULL);

	if (err != CL_SUCCESS) {
		printf("Error: Failed to set kernel arguments! %d\n", err);	
		exit(1);
	}

	/*Enqueue task for parallel execution*/
	err = clEnqueueNDRangeKernel(commands, kernel[0], 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &event);
	if (err != CL_SUCCESS)
	{
		if(err == CL_INVALID_WORK_ITEM_SIZE)
			printf("CL_INVALID_WORK_ITEM_SIZE \n");
		if(err == CL_INVALID_WORK_GROUP_SIZE)
			printf("CL_INVALID_WORK_GROUP_SIZE \n");
		printf("Error: Failed to execute kernel! %d\n", err);
		exit(1);
	}
	clWaitForEvents(1,&event);

	clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
	clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
	total_time  = (double)(time_end - time_start);

	// Retrieve result from device
	err = clEnqueueReadBuffer(commands, d_output, CL_TRUE, 0, mem_size_output, h_output, 0, NULL, NULL);
	if (err != CL_SUCCESS)
	{
		printf("Error: Failed to read output array! %d\n", err);
		exit(1);
	}

	clReleaseMemObject(d_filter);
	clReleaseMemObject(d_output);
	clReleaseMemObject(d_bias);

	char fileoutputname[15];
	output_pgm.width = opgm_img_width;
	output_pgm.height = opgm_img_height;
	normalizeF2PGM(&output_pgm, h_output);
	sprintf(fileoutputname, "output2d.pgm");	
	/* Output image */
	writePGM(&output_pgm,fileoutputname);

	printf("cl:main timing %0.3f us\n", total_time / 1000.0);

	destroyPGM(&input_pgm);
	destroyPGM(&output_pgm);

	free(h_image);
	free(h_output);
	clReleaseMemObject(d_image);

	clReleaseProgram(program);
	clReleaseKernel(kernel[0]);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);

	return 0;
}
int main()
{
	long long timer1 = 0;
	long long timer2 = 0;

	register int i,j;
	float *in_image;
	float *out_image;
	int width, height;
	
	pgm_t ipgm;
    	pgm_t opgm;


   	/* Image file input */
	readPGM(&ipgm, "lena.pgm");
	printf("c:main program:log read_done\n");

    	width = ipgm.width; 
	height = ipgm.height;
	printf("c:main program:log img_width %d\n",width);
	printf("c:main program:log img_height %d\n", height);

	in_image = (float *)malloc(width * height * sizeof(float));
	out_image = (float *)malloc(width * height * sizeof(float));

    	for( i = 0; i < width; i++ ) {
        	for( j = 0; j < height; j++ ) {

			((float*)in_image)[(width*j) + i] = (float)ipgm.buf[width*j + i];
        	}
    	}
	
	timer1 = PAPI_get_virt_usec();
	
 	for( i = 0; i < width; i++ ) {
                for( j = 0; j < height; j++ ) {
			((float*)out_image)[(height*i) + j] = ((float*)in_image)[(width*j) + i];
        	}	
    	 }

    	timer2 = PAPI_get_virt_usec();
    	printf("c:main timing:PAPI logic %llu us\n",(timer2-timer1));
	
    	printf("c:main program:log compute_done\n");

    	opgm.width = height ;
    	opgm.height = width ;
    	normalizeF2PGM(&opgm, out_image);

	/* Image file output */
	writePGM(&opgm, "output.pgm");

	printf("c:main program:log output_done\n");


    	destroyPGM(&ipgm);
    	destroyPGM(&opgm);

    	free(in_image);
    	free(out_image);

    	return 0;
}
Esempio n. 4
0
int main(int argc, char** argv)
{
    int err;                            // error code returned from api calls
    int test_fail = 0;
    pgm_t input_img, output_img;

    IMG_DTYPE filter[FILTER_SIZE*FILTER_SIZE] = {-1, -1, -1, -1, 8, -1, -1, -1, -1};
    IMG_DTYPE *h_input;      // input image buffer
    IMG_DTYPE *hw_output;    // host buffer for device output
    IMG_DTYPE *sw_output;    // host buffer for reference output

    size_t global[2];                   // global domain size for our calculation
    size_t local[2];                    // local domain size for our calculation

    cl_platform_id platform_id;         // platform id
    cl_device_id device_id;             // compute device id
    cl_context context;                 // compute context
    cl_command_queue commands;          // compute command queue
    cl_program program;                 // compute program
    cl_kernel kernel;                   // compute kernel

    char cl_platform_vendor[1001];
    char cl_platform_name[1001];

    cl_mem d_in_image;                  // device buffer for input image
    cl_mem d_in_filter;                 // device buffer for filter kernel
    cl_mem d_out_image;                 // device buffer for filtered image

    printf("Application start\n");
    if (argc != 3) {
        printf("Usage: %s conv_2d.xclbin image_path/image_name.pgm\n", argv[0]);
        return EXIT_FAILURE;
    }

    int row, col, pix;
    // read the image and initialize the host buffer with that
    err = readPGM(&input_img, argv[2]);
    if(err < 0) {
        printf("Cound not read the image\n");
        return EXIT_FAILURE;
    }
    printf("Input image resolution = %xx%d\n", input_img.width, input_img.height);
    h_input = (IMG_DTYPE*)malloc(sizeof(IMG_DTYPE)*input_img.height*input_img.width); 
    hw_output = (IMG_DTYPE*)malloc(sizeof(IMG_DTYPE)*input_img.height*input_img.width); 
    sw_output = (IMG_DTYPE*)malloc(sizeof(IMG_DTYPE)*input_img.height*input_img.width); 
    for(pix = 0; pix < input_img.height*input_img.width; pix++) {
        h_input[pix] = input_img.buf[pix];
    }

    // Connect to first platform
    //
    err = clGetPlatformIDs(1,&platform_id,NULL);
    if (err != CL_SUCCESS) {
        printf("Error: Failed to find an OpenCL platform!\n");
        printf("Test failed\n");
        return EXIT_FAILURE;
    }
    err = clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL);
    if (err != CL_SUCCESS) {
        printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n");
        printf("Test failed\n");
        return EXIT_FAILURE;
    }
    printf("INFO: CL_PLATFORM_VENDOR %s\n",cl_platform_vendor);
    err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL);
    if (err != CL_SUCCESS) {
        printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n");
        printf("Test failed\n");
        return EXIT_FAILURE;
    }
    printf("INFO: CL_PLATFORM_NAME %s\n",cl_platform_name);

    // Connect to a compute device
    //
    err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ACCELERATOR,
                         1, &device_id, NULL);
    if (err != CL_SUCCESS) {
            printf("Error: Failed to create a device group!\n");
            printf("Test failed\n");
            return EXIT_FAILURE;
        }

    // Create a compute context
    //
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context) {
        printf("Error: Failed to create a compute context!\n");
        printf("Test failed\n");
        return EXIT_FAILURE;
    }

    // Create a command commands
    //
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands) {
        printf("Error: Failed to create a command commands!\n");
        printf("Error: code %i\n",err);
        printf("Test failed\n");
        return EXIT_FAILURE;
    }

    int status;

    // Create Program Objects
    //

    // Load binary from disk
    unsigned char *kernelbinary;
    char *xclbin = argv[1];

    printf("INFO: loading xclbin %s\n", xclbin);
    int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary);
    if (n_i < 0) {
        printf("failed to load kernel from xclbin0: %s\n", xclbin);
        printf("Test failed\n");
        return EXIT_FAILURE;
    }

    size_t n = n_i;

    // Create the compute program from offline
    program = clCreateProgramWithBinary(context, 1, &device_id, &n,
                                        (const unsigned char **) &kernelbinary, &status, &err);

    if ((!program) || (err!=CL_SUCCESS)) {
        printf("Error: Failed to create compute program0 from binary %d!\n", err);
        printf("Test failed\n");
        return EXIT_FAILURE;
    }

    // Build the program executable
    //
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS) {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        printf("Test failed\n");
        return EXIT_FAILURE;
    }

    // Create the compute kernel in the program we wish to run
    //
    kernel = clCreateKernel(program, "conv_2d", &err);
    if (!kernel || err != CL_SUCCESS) {
        printf("Error: Failed to create compute kernel!\n");
        printf("Test failed\n");
        return EXIT_FAILURE;
    }

    // Create the input and output arrays in device memory for our calculation
    //
    d_in_image = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(IMG_DTYPE) * input_img.height*input_img.width, NULL, NULL);
    d_in_filter = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(IMG_DTYPE) * FILTER_SIZE * FILTER_SIZE, NULL, NULL);
    d_out_image = clCreateBuffer(context, CL_MEM_WRITE_ONLY,  sizeof(IMG_DTYPE) * input_img.height*input_img.width, NULL, NULL);
    if (!d_in_image || !d_in_filter || !d_out_image) {
        printf("Error: Failed to allocate device memory!\n");
        printf("Test failed\n");
        return EXIT_FAILURE;
    }

    // Write the image from host buffer to device memory
    //
    err = clEnqueueWriteBuffer(commands, d_in_image, CL_TRUE, 0, sizeof(IMG_DTYPE) * input_img.height*input_img.width, h_input, 0, NULL, NULL);
    if (err != CL_SUCCESS) {
        printf("Error: Failed to write to image to device memory!\n");
        printf("Test failed\n");
        return EXIT_FAILURE;
    }
    // Write filter kernel into device buffer
    //
    err = clEnqueueWriteBuffer(commands, d_in_filter, CL_TRUE, 0, sizeof(IMG_DTYPE) * FILTER_SIZE * FILTER_SIZE, filter, 0, NULL, NULL);
    if (err != CL_SUCCESS) {
        printf("Error: Failed to write to filter coeff into device memory!\n");
        printf("Test failed\n");
        return EXIT_FAILURE;
    }

    // Set the arguments to our compute kernel
    //
    int filter_size = FILTER_SIZE;
    IMG_DTYPE bias = 1;
    err = 0;
    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_in_image);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_in_filter);
    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_out_image);
    //err |= clSetKernelArg(kernel, 3, sizeof(int),    &filter_size);
    err |= clSetKernelArg(kernel, 3, sizeof(IMG_DTYPE),    &bias);
    if (err != CL_SUCCESS) {
        printf("Error: Failed to set kernel arguments! %d\n", err);
        printf("Test failed\n");
        return EXIT_FAILURE;
    }

    // Launch computation kernel
    global[0] = input_img.width * WORKGROUP_SIZE_0;
    global[1] = input_img.height * WORKGROUP_SIZE_1;
    local[0] = WORKGROUP_SIZE_0;
    local[1] = WORKGROUP_SIZE_1;

    err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL,
                                 (size_t*)&global, (size_t*)&local, 0, NULL, NULL);
    if (err) {
            printf("Error: Failed to execute kernel! %d\n", err);
            printf("Test failed\n");
            return EXIT_FAILURE;
        }

    // Read back the results from the device to verify the output
    //
    cl_event readevent;
    err = clEnqueueReadBuffer( commands, d_out_image, CL_TRUE, 0, sizeof(IMG_DTYPE) * input_img.width*input_img.height, hw_output, 0, NULL, &readevent
);
    if (err != CL_SUCCESS) {
            printf("Error: Failed to read output array! %d\n", err);
            printf("Test failed\n");
            return EXIT_FAILURE;
        }

    clWaitForEvents(1, &readevent);

    // Generate reference output
    int kr, kc;
    IMG_DTYPE sum = 0;
    for(row = 0; row < input_img.height-FILTER_SIZE+1; row++) {
        for(col = 0; col < input_img.width-FILTER_SIZE+1; col++) {
            sum = 0;
            for(kr = 0; kr < FILTER_SIZE; kr++) {
                for(kc = 0; kc < FILTER_SIZE; kc++ ) {
                    sum += (filter[kr*FILTER_SIZE + kc] * h_input[(row+kr)*input_img.width + col + kc]);
                }
            }
            sw_output[row*input_img.width + col] = sum + bias;
        }
    }
    // Check Results
    for(row = 0; row < input_img.height-FILTER_SIZE+1; row++) {
        for(col = 0; col < input_img.width-FILTER_SIZE+1; col++) {
             if(sw_output[row*input_img.width+col] != hw_output[row*input_img.width+col]){
                 printf("Mismatch at : row = %d, col = %d, expected = %f, got = %f\n",
                     row, col, sw_output[row*input_img.width+col], hw_output[row*input_img.width+col]);
                 test_fail = 1;
             }
        }
    }
    printf("---------Input image-----------\n");
    //print_matrix(h_input, input_img.height, input_img.width);
    printf("---------Reference output------\n");
    //print_matrix(sw_output, input_img.height, input_img.width);
    printf("---------OCL Kernel output-----\n");
    //print_matrix(hw_output, input_img.height, input_img.width);

    // store the output image
    output_img.width = input_img.width;
    output_img.height = input_img.height;
    normalizeF2PGM(&output_img, hw_output);
    writePGM(&output_img, "../../../../fpga_output.pgm");
    //--------------------------------------------------------------------------
    // Shutdown and cleanup
    //--------------------------------------------------------------------------
    clReleaseMemObject(d_in_image);
    clReleaseMemObject(d_in_filter);
    clReleaseMemObject(d_out_image);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    destroyPGM(&input_img);
    if (test_fail) {
        printf("INFO: Test failed\n");
        return EXIT_FAILURE;
    } else {
        printf("INFO: Test passed\n");
    }
}
Esempio n. 5
0
int main(int argc, char** argv)
{
	cl_event event;
	int err, i = 0;                            // error code returned from api calls
	cl_ulong time_start, time_end;
	double total_time = 0;

	pgm_t input_pgm, output_pgm;


	cl_device_id device_id;             // compute device id 
	cl_context context;                 // compute context
	cl_command_queue commands;          // compute command queue
	cl_program program;                 // compute program
	cl_kernel kernel;                // compute kernel

	// OpenCL device memory for matrices
	cl_mem d_image, d_filter, d_output;

	// Simple laplacian kernel
	DTYPE lap_filter[FILTER_SIZE*FILTER_SIZE] = {-1.0, -1.0, -1.0, -1.0, 8.0, -1.0, -1.0, -1.0, -1.0};
	DTYPE bias = 0.01;

	if (argc != 2) {
		printf("Usage: %s <image_name.pgm>\n", argv[0]);
		exit(1);
	}

	// Read the input image
	readPGM(&input_pgm, argv[1]);

	printf("Host: Input image resolution:%dx%d\n", input_pgm.width, input_pgm.height);

	DTYPE  *h_image, *h_image_padded;
	DTYPE  *h_filter, *h_output, *ref_output;

	// Allocate host memory for images and outputs
	h_image    = (DTYPE*)malloc(sizeof(DTYPE)*input_pgm.width*input_pgm.height);
	ref_output = (DTYPE*)malloc(sizeof(DTYPE)*input_pgm.width*input_pgm.height);
	
	//setup padded input image
	const int PADDED_SIZE = sizeof(DTYPE)*(input_pgm.width+FILTER_SIZE-1)*(input_pgm.height+FILTER_SIZE-1);
	h_image_padded = (DTYPE*)malloc(PADDED_SIZE);
	memset((void*)h_image_padded, 0, PADDED_SIZE); //init padded image to 0s
	int offset = 0; //Used for padded image

	// Convert range from [0, 255] to [0.0, 1.0]
	for(i = 0; i < input_pgm.width * input_pgm.height; i++)
	{
		if(i%input_pgm.width == 0 && i>0){ //if end of image row
			offset += FILTER_SIZE-1; //bump padded image to next row
		}
		h_image[i] = (DTYPE) input_pgm.buf[i]/255.0;
		h_image_padded[i+offset] = h_image[i];
	}

	h_filter = (DTYPE*) lap_filter;
	h_output = (DTYPE*) malloc(sizeof(DTYPE)*input_pgm.width*input_pgm.height);


	// Platform and device query
	cl_uint dev_cnt = 0;
	clGetPlatformIDs(0, 0, &dev_cnt);

	cl_platform_id platform_ids[5];
	clGetPlatformIDs(dev_cnt, platform_ids, NULL);

	for(i = 0;i < dev_cnt; i++)
	{
		err = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
		if(err == CL_SUCCESS)
			break;
	}
	if (err != CL_SUCCESS)
	{
		printf("Error: Failed to create a device group!\n");
		return EXIT_FAILURE;
	}

	// Create a compute context 
	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	if (!context)
	{
		printf("Error: Failed to create a compute context!\n");
		return EXIT_FAILURE;
	}

	// Create a command commands
	commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
	if (!commands)
	{
		printf("Error: Failed to create a command commands!\n");
		return EXIT_FAILURE;
	}

	// Create the compute program from the source file
	char *KernelSource;
	long lFileSize;
	lFileSize = LoadOpenCLKernel("conv_kernel.cl", &KernelSource);
	if( lFileSize < 0L ) {
		perror("File read failed");
		return 1;
	}

	program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
	if (!program)
	{
		printf("Error: Failed to create compute program!\n");
		return EXIT_FAILURE;
	}

	// Build the program executable
	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (err != CL_SUCCESS)
	{
		size_t len;
		char buffer[2048];
		printf("Error: Failed to build program executable!\n");
		clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
		printf("%s\n", buffer);
		exit(1);
	}

	kernel = clCreateKernel(program, "conv_2d", &err);
	if (!kernel || err != CL_SUCCESS)
	{
		printf("Error: Failed to create compute kernel!\n");
		exit(1);
	}

	// Allocate the device buffer for input image, kernel and transfer the data
	d_image  = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, PADDED_SIZE, h_image_padded, &err);

	// Create the input and output arrays in device memory for our calculation
	d_filter = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(DTYPE)*FILTER_SIZE*FILTER_SIZE, h_filter, &err);
	d_output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(DTYPE)*input_pgm.width*input_pgm.height, NULL, &err);

	if (!d_image || !d_filter || !d_output)
	{
		printf("Error: Failed to allocate device memory!\n");
		exit(1);
	}    
		
	size_t localWorkSize[2], globalWorkSize[2];
	int filter_size  = FILTER_SIZE;

	// Setup the kernel arguments
	err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_image);
	err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_filter);
	err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_output);
	err |= clSetKernelArg(kernel, 3, sizeof(int), &filter_size);
	err |= clSetKernelArg(kernel, 4, sizeof(DTYPE), &bias);

	if (err != CL_SUCCESS) {
		printf("Error: Failed to set kernel arguments! %d\n", err);	
		exit(1);
	}

	globalWorkSize[0] = input_pgm.width;
	globalWorkSize[1] = input_pgm.height;

	localWorkSize[0] = 1;
	localWorkSize[1] = 1;

	uint trials = 1;
	printf("Launching the kernel...\n");
	for(uint j=0; j<trials;j++){
		/*Enqueue task for parallel execution*/
		err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &event);
		if (err != CL_SUCCESS)
		{
			printf("Error: Failed to execute kernel! %d\n", err);
			exit(1);
		}

		// Wait for the commands to finish
		clWaitForEvents(1, &event);

		// Get the profiling info
		clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
		clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
		total_time  += (double)(time_end - time_start);

	}
	total_time /= trials;

	// Retrieve result from device
	printf("Reading output buffer into host memory...\n");
	err = clEnqueueReadBuffer(commands, d_output, CL_TRUE, 0, sizeof(DTYPE)*input_pgm.width*input_pgm.height, h_output, 0, NULL, NULL);
	if (err != CL_SUCCESS)
	{
		printf("Error: Failed to read output array! %d\n", err);
		exit(1);
	}

	//-------------------------------------------------------------
	// Compare between host and device output
    // Generate reference output
    int kr, kc, row, col;
    DTYPE sum = 0;
    for(row = 0; row < input_pgm.height; row++) {
        for(col = 0; col < input_pgm.width; col++) {
            sum = 0;
            for(kr = 0; kr < FILTER_SIZE; kr++) {
                for(kc = 0; kc < FILTER_SIZE; kc++ ) {
                    sum += (lap_filter[kr*FILTER_SIZE + kc] * h_image_padded[(row+kr)*(input_pgm.width+FILTER_SIZE-1) + col + kc]);
                }
            }
            ref_output[row*input_pgm.width + col] = sum + bias;
        }
    }
    // Check Results
	int test_fail = 0;
    for(row = 0; row < input_pgm.height; row++) {
        for(col = 0; col < input_pgm.width; col++) {
             if(ref_output[row*input_pgm.width+col] != h_output[row*input_pgm.width+col]){
                 printf("Mismatch at : row = %d, col = %d, expected = %f, got = %f\n",
                     row, col, ref_output[row*input_pgm.width+col], h_output[row*input_pgm.width+col]);
                 test_fail = 1;
             }
        }
    }


	output_pgm.width = input_pgm.width;
	output_pgm.height = input_pgm.height;

	// Remove garbage pixels in the border. If not, this will effect the subsequent normalization.!
	for(row = 0; row < output_pgm.height; row++) {
		for(col = 0; col < output_pgm.width; col++) {
			if(row > output_pgm.height- FILTER_SIZE || col > output_pgm.width-FILTER_SIZE)
				h_output[row * output_pgm.width + col] = 0.0;
		}
	}

	normalizeF2PGM(&output_pgm, h_output);
	/* Output image */
	writePGM(&output_pgm, "ocl_output.pgm");

	if (test_fail) {
		printf("INFO: TEST FAILED !!!!\n");
	} else {
		printf("INFO: ****TEST PASSED****\n");
	}
	printf("Kernel runtime = %0.3f us\n", total_time / 1000.0);

	destroyPGM(&input_pgm);
	destroyPGM(&output_pgm);

	free(h_image);
	free(h_image_padded);
	free(h_output);
	free(ref_output);
	clReleaseMemObject(d_image);
	clReleaseMemObject(d_filter);
	clReleaseMemObject(d_output);

	clReleaseProgram(program);
	clReleaseKernel(kernel);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);

	return 0;
}