int main(int argc, char *argv[]) { //fprintf(stderr, "[%s:%d:%s()] FFT!\n", __FILE__, __LINE__, __func__); LOG("FFT Start\n"); cl_mem xmobj = NULL; cl_mem rmobj = NULL; cl_mem wmobj = NULL; cl_kernel sfac = NULL; cl_kernel trns = NULL; cl_kernel hpfl = NULL; cl_uint ret_num_platforms; cl_uint ret_num_devices; cl_int ret; cl_float2 *xm; cl_float2 *rm; cl_float2 *wm; pgm_t ipgm; pgm_t opgm; FILE *fp; const char fileName[] = "./fft.cl"; size_t source_size; char *source_str; cl_int i, j; cl_int n; cl_int m; size_t gws[2]; size_t lws[2]; fp = fopen(fileName, "r"); if(!fp) { fprintf(stderr, "[%s:%d:%s()] ERROR, Failed to load kernel source.\n", __FILE__, __LINE__, __func__); return 1; } source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); readPGM(&ipgm, "./lena.pgm"); n = ipgm.width; m = (cl_int)(log((double)n)/log(2.0)); LOG("n = %d, m = %d.\n", m, n); xm = (cl_float2*)malloc(n*n*sizeof(cl_float2)); rm = (cl_float2*)malloc(n*n*sizeof(cl_float2)); wm = (cl_float2*)malloc(n/2 *sizeof(cl_float2)); for( i = 0; i < n; i++) { for(j = 0; j < n; j++) { ((float*)xm)[2*(n*j + i) + 0] = (float)ipgm.buf[n*j + i]; ((float*)xm)[2*(n*j + i) + 1] = (float)0; } } CL_CHECK(ret = clGetPlatformIDs(MAX_PLATFORM_IDS, platform_ids, &ret_num_platforms)); platform_id = platform_ids[0]; CL_CHECK(ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices)); LOG("platform_id = %p, device_id = %p\n", platform_id, device_id); context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); CL_CHECK(ret); queue = clCreateCommandQueue(context, device_id, 0, &ret); xmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret); CL_CHECK(ret); rmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret); CL_CHECK(ret); wmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret); CL_CHECK(ret); CL_CHECK(ret = clEnqueueWriteBuffer(queue, xmobj, CL_TRUE, 0, n*n*sizeof(cl_float2), xm, 0, NULL, NULL)); program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); CL_CHECK(ret); CL_CHECK(ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL)); sfac = clCreateKernel(program, "spinFact", &ret); CL_CHECK(ret); trns = clCreateKernel(program, "transpose", &ret); CL_CHECK(ret); hpfl = clCreateKernel(program, "highPassFilter", &ret); CL_CHECK(ret); CL_CHECK(ret = clSetKernelArg(sfac, 0, sizeof(cl_mem), (void *)&wmobj)); CL_CHECK(ret = clSetKernelArg(sfac, 1, sizeof(cl_int), (void *)&n)); setWorkSize(gws, lws, n/2, 1); CL_CHECK(ret = clEnqueueNDRangeKernel(queue, sfac, 1, NULL, gws, lws, 0, NULL, NULL)); fftCore(rmobj, xmobj, wmobj, m, forward); CL_CHECK(ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&xmobj)); CL_CHECK(ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&rmobj)); CL_CHECK(ret = clSetKernelArg(trns, 2, sizeof(cl_int), (void *)&n)); setWorkSize(gws, lws, n, n); CL_CHECK(ret = clEnqueueNDRangeKernel(queue, trns, 2, NULL, gws, lws, 0, NULL, NULL)); fftCore(rmobj, xmobj, wmobj, m, forward); #if 1 //FILTER cl_int radius = n>>4; CL_CHECK(ret = clSetKernelArg(hpfl, 0, sizeof(cl_mem), (void *)&rmobj)); CL_CHECK(ret = clSetKernelArg(hpfl, 1, sizeof(cl_int), (void *)&n)); CL_CHECK(ret = clSetKernelArg(hpfl, 2, sizeof(cl_int), (void *)&radius)); setWorkSize(gws, lws, n, n); CL_CHECK(ret = clEnqueueNDRangeKernel(queue, hpfl, 2, NULL, gws, lws, 0, NULL, NULL)); #endif #if 1 /* Inverse FFT */ fftCore(xmobj, rmobj, wmobj, m, inverse); CL_CHECK(ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&rmobj)); CL_CHECK(ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&xmobj)); CL_CHECK(ret = clSetKernelArg(trns, 2, sizeof(cl_int), (void *)&n)); setWorkSize(gws, lws, n, n); CL_CHECK(ret = clEnqueueNDRangeKernel(queue, trns, 2, NULL, gws, lws, 0, NULL, NULL)); fftCore(xmobj, rmobj, wmobj, m, inverse); #endif CL_CHECK(ret = clEnqueueReadBuffer(queue, xmobj, CL_TRUE, 0, n*n*sizeof(cl_float2), xm, 0, NULL, NULL)); float *ampd; ampd = (float*)malloc(n*n*sizeof(float)); for(i = 0; i < n; i++) { for(j = 0; j < n; j++) { ampd[n*i + j] = AMP( ((float*)xm)[2*(n*i + j)], ((float*)xm)[2*(n*i + j) + 1] ); // fprintf(stderr, "%d ", (int)ampd[n*i + j]); } // fprintf(stderr, "\n"); } opgm.width = n; opgm.height = n; normalizeF2PGM(&opgm, ampd); free(ampd); writePGM(&opgm, "output.pgm"); /* Termination */ CL_CHECK(ret = clFlush(queue)); CL_CHECK(ret = clFinish(queue)); CL_CHECK(ret = clReleaseKernel(hpfl)); CL_CHECK(ret = clReleaseKernel(trns)); CL_CHECK(ret = clReleaseKernel(sfac)); CL_CHECK(ret = clReleaseProgram(program)); CL_CHECK(ret = clReleaseMemObject(xmobj)); CL_CHECK(ret = clReleaseMemObject(rmobj)); CL_CHECK(ret = clReleaseMemObject(wmobj)); CL_CHECK(ret = clReleaseCommandQueue(queue)); CL_CHECK(ret = clReleaseContext(context)); destroyPGM(&ipgm); destroyPGM(&opgm); free(source_str); free(wm); free(rm); free(xm); return 0; }
int main(int argc, char** argv) { cl_event event,event1,event2; int j =0,stride=2; int err, i =0, index =0; // error code returned from api calls pgm_t input_pgm,output_pgm; int ipgm_img_width,opgm_img_width; int ipgm_img_height,opgm_img_height; cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel[3]; // compute kernel // OpenCL device memory for matrices cl_mem d_image, d_filter, d_output, d_bias; if (argc != 2) { printf("Expecting 2 arguments.\n"); exit(1); } readPGM(&input_pgm,argv[1]); ipgm_img_width = input_pgm.width; ipgm_img_height = input_pgm.height; opgm_img_width = input_pgm.width;//-CONV1_FILTER_WIDTH+1; opgm_img_height = input_pgm.height;//-CONV1_FILTER_HEIGHT+1; printf("cl:main input image resolution:%dx%d\n", ipgm_img_width,ipgm_img_height); printf("cl:main output image resolution:%dx%d\n", opgm_img_width,opgm_img_height); DTYPE *h_image; DTYPE *h_filter, *h_bias, *h_output; // Allocate host memory for matrices unsigned int size_image = ipgm_img_width*ipgm_img_height; unsigned int mem_size_image = sizeof(DTYPE) * size_image; h_image = (DTYPE*)malloc(mem_size_image); for(i=0;i<size_image;i++) { h_image[i] = (DTYPE) input_pgm.buf[i]/255; } unsigned int size_filter = CONV1_FILTER_WIDTH*CONV1_FILTER_HEIGHT; unsigned int mem_size_filter = sizeof(DTYPE) * size_filter; h_filter = (DTYPE*) conv1_weights; unsigned int size_output = opgm_img_width * opgm_img_height; unsigned int mem_size_output = sizeof(DTYPE) * size_output; h_output = (DTYPE*) malloc(mem_size_output); unsigned int size_bias = 1; //1 bias value for 1 output map unsigned int mem_size_bias = sizeof(DTYPE) * size_bias; h_bias = (DTYPE*) conv1_bias; cl_uint dev_cnt = 0; clGetPlatformIDs(0, 0, &dev_cnt); cl_platform_id platform_ids[5]; clGetPlatformIDs(dev_cnt, platform_ids, NULL); for(i=0;i<dev_cnt;i++) { #ifdef DEVICE_GPU err = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); #else err = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); #endif if(err == CL_SUCCESS) break; } if (err != CL_SUCCESS) { if(err == CL_INVALID_PLATFORM) printf("CL_INVALID_PLATFORM\n"); if(err == CL_INVALID_DEVICE_TYPE) printf("CL_INVALID_DEVICE_TYPE\n"); if(err == CL_INVALID_VALUE) printf("CL_INVALID_VALUE\n"); if(err == CL_DEVICE_NOT_FOUND) printf("CL_DEVICE_NOT_FOUND\n"); printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command commands commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source file char *KernelSource; long lFileSize; lFileSize = LoadOpenCLKernel("kernels.cl", &KernelSource); if( lFileSize < 0L ) { perror("File read failed"); return 1; } program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } kernel[0] = clCreateKernel(program, "conv_2d", &err); if (!kernel[0] || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } d_image = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR /*| CL_MEM_USE_MSMC_TI*/, mem_size_image, h_image, &err); cl_ulong time_start, time_end; double total_time; // Create the input and output arrays in device memory for our calculation d_filter = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR /*| CL_MEM_USE_MSMC_TI*/, mem_size_filter, h_filter, &err); d_output = clCreateBuffer(context, CL_MEM_WRITE_ONLY /*| CL_MEM_USE_MSMC_TI*/, mem_size_output, NULL, &err); d_bias = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR , mem_size_bias, h_bias, &err); if (!d_image || !d_filter || !d_output || !d_bias) { printf("Error: Failed to allocate device memory!\n"); exit(1); } // Launch OpenCL kernel size_t localWorkSize[2], globalWorkSize[2]; int filter_width = CONV1_FILTER_WIDTH; int filter_height = CONV1_FILTER_HEIGHT; localWorkSize[0] = opgm_img_width; localWorkSize[1] = opgm_img_height/NUM_WORK_GROUPS; globalWorkSize[0] = opgm_img_width; globalWorkSize[1] = opgm_img_height; err = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), (void *)&d_image); err |= clSetKernelArg(kernel[0], 1, sizeof(cl_mem), (void *)&d_filter); err |= clSetKernelArg(kernel[0], 2, sizeof(cl_mem), (void *)&d_output); err |= clSetKernelArg(kernel[0], 3, sizeof(int), (void *)&filter_width); err |= clSetKernelArg(kernel[0], 4, sizeof(int), (void *)&filter_height); err |= clSetKernelArg(kernel[0], 5, sizeof(cl_mem), (void*)&d_bias); err |= clSetKernelArg(kernel[0], 6, sizeof(float)*localWorkSize[0]*(localWorkSize[1]+filter_height-1), (void*)NULL); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } /*Enqueue task for parallel execution*/ err = clEnqueueNDRangeKernel(commands, kernel[0], 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &event); if (err != CL_SUCCESS) { if(err == CL_INVALID_WORK_ITEM_SIZE) printf("CL_INVALID_WORK_ITEM_SIZE \n"); if(err == CL_INVALID_WORK_GROUP_SIZE) printf("CL_INVALID_WORK_GROUP_SIZE \n"); printf("Error: Failed to execute kernel! %d\n", err); exit(1); } clWaitForEvents(1,&event); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); total_time = (double)(time_end - time_start); // Retrieve result from device err = clEnqueueReadBuffer(commands, d_output, CL_TRUE, 0, mem_size_output, h_output, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } clReleaseMemObject(d_filter); clReleaseMemObject(d_output); clReleaseMemObject(d_bias); char fileoutputname[15]; output_pgm.width = opgm_img_width; output_pgm.height = opgm_img_height; normalizeF2PGM(&output_pgm, h_output); sprintf(fileoutputname, "output2d.pgm"); /* Output image */ writePGM(&output_pgm,fileoutputname); printf("cl:main timing %0.3f us\n", total_time / 1000.0); destroyPGM(&input_pgm); destroyPGM(&output_pgm); free(h_image); free(h_output); clReleaseMemObject(d_image); clReleaseProgram(program); clReleaseKernel(kernel[0]); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; }
int main() { long long timer1 = 0; long long timer2 = 0; register int i,j; float *in_image; float *out_image; int width, height; pgm_t ipgm; pgm_t opgm; /* Image file input */ readPGM(&ipgm, "lena.pgm"); printf("c:main program:log read_done\n"); width = ipgm.width; height = ipgm.height; printf("c:main program:log img_width %d\n",width); printf("c:main program:log img_height %d\n", height); in_image = (float *)malloc(width * height * sizeof(float)); out_image = (float *)malloc(width * height * sizeof(float)); for( i = 0; i < width; i++ ) { for( j = 0; j < height; j++ ) { ((float*)in_image)[(width*j) + i] = (float)ipgm.buf[width*j + i]; } } timer1 = PAPI_get_virt_usec(); for( i = 0; i < width; i++ ) { for( j = 0; j < height; j++ ) { ((float*)out_image)[(height*i) + j] = ((float*)in_image)[(width*j) + i]; } } timer2 = PAPI_get_virt_usec(); printf("c:main timing:PAPI logic %llu us\n",(timer2-timer1)); printf("c:main program:log compute_done\n"); opgm.width = height ; opgm.height = width ; normalizeF2PGM(&opgm, out_image); /* Image file output */ writePGM(&opgm, "output.pgm"); printf("c:main program:log output_done\n"); destroyPGM(&ipgm); destroyPGM(&opgm); free(in_image); free(out_image); return 0; }
int main(int argc, char** argv) { int err; // error code returned from api calls int test_fail = 0; pgm_t input_img, output_img; IMG_DTYPE filter[FILTER_SIZE*FILTER_SIZE] = {-1, -1, -1, -1, 8, -1, -1, -1, -1}; IMG_DTYPE *h_input; // input image buffer IMG_DTYPE *hw_output; // host buffer for device output IMG_DTYPE *sw_output; // host buffer for reference output size_t global[2]; // global domain size for our calculation size_t local[2]; // local domain size for our calculation cl_platform_id platform_id; // platform id cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel char cl_platform_vendor[1001]; char cl_platform_name[1001]; cl_mem d_in_image; // device buffer for input image cl_mem d_in_filter; // device buffer for filter kernel cl_mem d_out_image; // device buffer for filtered image printf("Application start\n"); if (argc != 3) { printf("Usage: %s conv_2d.xclbin image_path/image_name.pgm\n", argv[0]); return EXIT_FAILURE; } int row, col, pix; // read the image and initialize the host buffer with that err = readPGM(&input_img, argv[2]); if(err < 0) { printf("Cound not read the image\n"); return EXIT_FAILURE; } printf("Input image resolution = %xx%d\n", input_img.width, input_img.height); h_input = (IMG_DTYPE*)malloc(sizeof(IMG_DTYPE)*input_img.height*input_img.width); hw_output = (IMG_DTYPE*)malloc(sizeof(IMG_DTYPE)*input_img.height*input_img.width); sw_output = (IMG_DTYPE*)malloc(sizeof(IMG_DTYPE)*input_img.height*input_img.width); for(pix = 0; pix < input_img.height*input_img.width; pix++) { h_input[pix] = input_img.buf[pix]; } // Connect to first platform // err = clGetPlatformIDs(1,&platform_id,NULL); if (err != CL_SUCCESS) { printf("Error: Failed to find an OpenCL platform!\n"); printf("Test failed\n"); return EXIT_FAILURE; } err = clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n"); printf("Test failed\n"); return EXIT_FAILURE; } printf("INFO: CL_PLATFORM_VENDOR %s\n",cl_platform_vendor); err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n"); printf("Test failed\n"); return EXIT_FAILURE; } printf("INFO: CL_PLATFORM_NAME %s\n",cl_platform_name); // Connect to a compute device // err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ACCELERATOR, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Create a command commands // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); printf("Error: code %i\n",err); printf("Test failed\n"); return EXIT_FAILURE; } int status; // Create Program Objects // // Load binary from disk unsigned char *kernelbinary; char *xclbin = argv[1]; printf("INFO: loading xclbin %s\n", xclbin); int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary); if (n_i < 0) { printf("failed to load kernel from xclbin0: %s\n", xclbin); printf("Test failed\n"); return EXIT_FAILURE; } size_t n = n_i; // Create the compute program from offline program = clCreateProgramWithBinary(context, 1, &device_id, &n, (const unsigned char **) &kernelbinary, &status, &err); if ((!program) || (err!=CL_SUCCESS)) { printf("Error: Failed to create compute program0 from binary %d!\n", err); printf("Test failed\n"); return EXIT_FAILURE; } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); printf("Test failed\n"); return EXIT_FAILURE; } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "conv_2d", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Create the input and output arrays in device memory for our calculation // d_in_image = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(IMG_DTYPE) * input_img.height*input_img.width, NULL, NULL); d_in_filter = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(IMG_DTYPE) * FILTER_SIZE * FILTER_SIZE, NULL, NULL); d_out_image = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(IMG_DTYPE) * input_img.height*input_img.width, NULL, NULL); if (!d_in_image || !d_in_filter || !d_out_image) { printf("Error: Failed to allocate device memory!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Write the image from host buffer to device memory // err = clEnqueueWriteBuffer(commands, d_in_image, CL_TRUE, 0, sizeof(IMG_DTYPE) * input_img.height*input_img.width, h_input, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to image to device memory!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Write filter kernel into device buffer // err = clEnqueueWriteBuffer(commands, d_in_filter, CL_TRUE, 0, sizeof(IMG_DTYPE) * FILTER_SIZE * FILTER_SIZE, filter, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to filter coeff into device memory!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Set the arguments to our compute kernel // int filter_size = FILTER_SIZE; IMG_DTYPE bias = 1; err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_in_image); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_in_filter); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_out_image); //err |= clSetKernelArg(kernel, 3, sizeof(int), &filter_size); err |= clSetKernelArg(kernel, 3, sizeof(IMG_DTYPE), &bias); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); printf("Test failed\n"); return EXIT_FAILURE; } // Launch computation kernel global[0] = input_img.width * WORKGROUP_SIZE_0; global[1] = input_img.height * WORKGROUP_SIZE_1; local[0] = WORKGROUP_SIZE_0; local[1] = WORKGROUP_SIZE_1; err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, (size_t*)&global, (size_t*)&local, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel! %d\n", err); printf("Test failed\n"); return EXIT_FAILURE; } // Read back the results from the device to verify the output // cl_event readevent; err = clEnqueueReadBuffer( commands, d_out_image, CL_TRUE, 0, sizeof(IMG_DTYPE) * input_img.width*input_img.height, hw_output, 0, NULL, &readevent ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); printf("Test failed\n"); return EXIT_FAILURE; } clWaitForEvents(1, &readevent); // Generate reference output int kr, kc; IMG_DTYPE sum = 0; for(row = 0; row < input_img.height-FILTER_SIZE+1; row++) { for(col = 0; col < input_img.width-FILTER_SIZE+1; col++) { sum = 0; for(kr = 0; kr < FILTER_SIZE; kr++) { for(kc = 0; kc < FILTER_SIZE; kc++ ) { sum += (filter[kr*FILTER_SIZE + kc] * h_input[(row+kr)*input_img.width + col + kc]); } } sw_output[row*input_img.width + col] = sum + bias; } } // Check Results for(row = 0; row < input_img.height-FILTER_SIZE+1; row++) { for(col = 0; col < input_img.width-FILTER_SIZE+1; col++) { if(sw_output[row*input_img.width+col] != hw_output[row*input_img.width+col]){ printf("Mismatch at : row = %d, col = %d, expected = %f, got = %f\n", row, col, sw_output[row*input_img.width+col], hw_output[row*input_img.width+col]); test_fail = 1; } } } printf("---------Input image-----------\n"); //print_matrix(h_input, input_img.height, input_img.width); printf("---------Reference output------\n"); //print_matrix(sw_output, input_img.height, input_img.width); printf("---------OCL Kernel output-----\n"); //print_matrix(hw_output, input_img.height, input_img.width); // store the output image output_img.width = input_img.width; output_img.height = input_img.height; normalizeF2PGM(&output_img, hw_output); writePGM(&output_img, "../../../../fpga_output.pgm"); //-------------------------------------------------------------------------- // Shutdown and cleanup //-------------------------------------------------------------------------- clReleaseMemObject(d_in_image); clReleaseMemObject(d_in_filter); clReleaseMemObject(d_out_image); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); destroyPGM(&input_img); if (test_fail) { printf("INFO: Test failed\n"); return EXIT_FAILURE; } else { printf("INFO: Test passed\n"); } }
int main(int argc, char** argv) { cl_event event; int err, i = 0; // error code returned from api calls cl_ulong time_start, time_end; double total_time = 0; pgm_t input_pgm, output_pgm; cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel // OpenCL device memory for matrices cl_mem d_image, d_filter, d_output; // Simple laplacian kernel DTYPE lap_filter[FILTER_SIZE*FILTER_SIZE] = {-1.0, -1.0, -1.0, -1.0, 8.0, -1.0, -1.0, -1.0, -1.0}; DTYPE bias = 0.01; if (argc != 2) { printf("Usage: %s <image_name.pgm>\n", argv[0]); exit(1); } // Read the input image readPGM(&input_pgm, argv[1]); printf("Host: Input image resolution:%dx%d\n", input_pgm.width, input_pgm.height); DTYPE *h_image, *h_image_padded; DTYPE *h_filter, *h_output, *ref_output; // Allocate host memory for images and outputs h_image = (DTYPE*)malloc(sizeof(DTYPE)*input_pgm.width*input_pgm.height); ref_output = (DTYPE*)malloc(sizeof(DTYPE)*input_pgm.width*input_pgm.height); //setup padded input image const int PADDED_SIZE = sizeof(DTYPE)*(input_pgm.width+FILTER_SIZE-1)*(input_pgm.height+FILTER_SIZE-1); h_image_padded = (DTYPE*)malloc(PADDED_SIZE); memset((void*)h_image_padded, 0, PADDED_SIZE); //init padded image to 0s int offset = 0; //Used for padded image // Convert range from [0, 255] to [0.0, 1.0] for(i = 0; i < input_pgm.width * input_pgm.height; i++) { if(i%input_pgm.width == 0 && i>0){ //if end of image row offset += FILTER_SIZE-1; //bump padded image to next row } h_image[i] = (DTYPE) input_pgm.buf[i]/255.0; h_image_padded[i+offset] = h_image[i]; } h_filter = (DTYPE*) lap_filter; h_output = (DTYPE*) malloc(sizeof(DTYPE)*input_pgm.width*input_pgm.height); // Platform and device query cl_uint dev_cnt = 0; clGetPlatformIDs(0, 0, &dev_cnt); cl_platform_id platform_ids[5]; clGetPlatformIDs(dev_cnt, platform_ids, NULL); for(i = 0;i < dev_cnt; i++) { err = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if(err == CL_SUCCESS) break; } if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command commands commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source file char *KernelSource; long lFileSize; lFileSize = LoadOpenCLKernel("conv_kernel.cl", &KernelSource); if( lFileSize < 0L ) { perror("File read failed"); return 1; } program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } kernel = clCreateKernel(program, "conv_2d", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } // Allocate the device buffer for input image, kernel and transfer the data d_image = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, PADDED_SIZE, h_image_padded, &err); // Create the input and output arrays in device memory for our calculation d_filter = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(DTYPE)*FILTER_SIZE*FILTER_SIZE, h_filter, &err); d_output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(DTYPE)*input_pgm.width*input_pgm.height, NULL, &err); if (!d_image || !d_filter || !d_output) { printf("Error: Failed to allocate device memory!\n"); exit(1); } size_t localWorkSize[2], globalWorkSize[2]; int filter_size = FILTER_SIZE; // Setup the kernel arguments err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_image); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_filter); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_output); err |= clSetKernelArg(kernel, 3, sizeof(int), &filter_size); err |= clSetKernelArg(kernel, 4, sizeof(DTYPE), &bias); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } globalWorkSize[0] = input_pgm.width; globalWorkSize[1] = input_pgm.height; localWorkSize[0] = 1; localWorkSize[1] = 1; uint trials = 1; printf("Launching the kernel...\n"); for(uint j=0; j<trials;j++){ /*Enqueue task for parallel execution*/ err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &event); if (err != CL_SUCCESS) { printf("Error: Failed to execute kernel! %d\n", err); exit(1); } // Wait for the commands to finish clWaitForEvents(1, &event); // Get the profiling info clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); total_time += (double)(time_end - time_start); } total_time /= trials; // Retrieve result from device printf("Reading output buffer into host memory...\n"); err = clEnqueueReadBuffer(commands, d_output, CL_TRUE, 0, sizeof(DTYPE)*input_pgm.width*input_pgm.height, h_output, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } //------------------------------------------------------------- // Compare between host and device output // Generate reference output int kr, kc, row, col; DTYPE sum = 0; for(row = 0; row < input_pgm.height; row++) { for(col = 0; col < input_pgm.width; col++) { sum = 0; for(kr = 0; kr < FILTER_SIZE; kr++) { for(kc = 0; kc < FILTER_SIZE; kc++ ) { sum += (lap_filter[kr*FILTER_SIZE + kc] * h_image_padded[(row+kr)*(input_pgm.width+FILTER_SIZE-1) + col + kc]); } } ref_output[row*input_pgm.width + col] = sum + bias; } } // Check Results int test_fail = 0; for(row = 0; row < input_pgm.height; row++) { for(col = 0; col < input_pgm.width; col++) { if(ref_output[row*input_pgm.width+col] != h_output[row*input_pgm.width+col]){ printf("Mismatch at : row = %d, col = %d, expected = %f, got = %f\n", row, col, ref_output[row*input_pgm.width+col], h_output[row*input_pgm.width+col]); test_fail = 1; } } } output_pgm.width = input_pgm.width; output_pgm.height = input_pgm.height; // Remove garbage pixels in the border. If not, this will effect the subsequent normalization.! for(row = 0; row < output_pgm.height; row++) { for(col = 0; col < output_pgm.width; col++) { if(row > output_pgm.height- FILTER_SIZE || col > output_pgm.width-FILTER_SIZE) h_output[row * output_pgm.width + col] = 0.0; } } normalizeF2PGM(&output_pgm, h_output); /* Output image */ writePGM(&output_pgm, "ocl_output.pgm"); if (test_fail) { printf("INFO: TEST FAILED !!!!\n"); } else { printf("INFO: ****TEST PASSED****\n"); } printf("Kernel runtime = %0.3f us\n", total_time / 1000.0); destroyPGM(&input_pgm); destroyPGM(&output_pgm); free(h_image); free(h_image_padded); free(h_output); free(ref_output); clReleaseMemObject(d_image); clReleaseMemObject(d_filter); clReleaseMemObject(d_output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; }