/*
 * Create and build the OpenCL program from its source code.
 *
 * Reads "Template.cl" through ReadSourceFromFile, creates ocl->program from that
 * text in ocl->context and builds it for ocl->device. When the build fails with
 * CL_BUILD_PROGRAM_FAILURE the compiler's build log is written through LogError.
 *
 * Returns CL_SUCCESS, or the error code of the first step that failed.
 * ocl->program is left as assigned by clCreateProgramWithSource even when the
 * build fails; this function does not release it.
 */
int CreateAndBuildProgram(ocl_args_d_t *ocl)
{
    cl_int err = CL_SUCCESS;

    // Upload the OpenCL C source code from the input file to source.
    // The size of the program text is returned in src_size.
    char* source = NULL;
    size_t src_size = 0;
    err = ReadSourceFromFile("Template.cl", &source, &src_size);
    if (CL_SUCCESS != err)
    {
        LogError("Error: ReadSourceFromFile returned %s.\n", TranslateOpenCLError(err));
        goto Finish;
    }

    // With the source available as a plain character buffer, create the program object.
    ocl->program = clCreateProgramWithSource(ocl->context, 1, (const char**)&source, &src_size, &err);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clCreateProgramWithSource returned %s.\n", TranslateOpenCLError(err));
        goto Finish;
    }

    // Build the program.
    // Creating a program does not build it; the build has to be requested explicitly.
    // A plain create-build sequence is used here. When a program consists of several
    // parts, some of which are libraries, clCompileProgram and clLinkProgram are
    // possible alternatives.
    err = clBuildProgram(ocl->program, 1, &ocl->device, "", NULL, NULL);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));

        // In case of a compile error print the build log:
        // first query the size of the log, then allocate memory and fetch it.
        if (err == CL_BUILD_PROGRAM_FAILURE)
        {
            size_t log_size = 0;
            std::vector<char> build_log;

            cl_int log_err = clGetProgramBuildInfo(ocl->program, ocl->device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
            if (CL_SUCCESS == log_err && log_size > 0)
            {
                // One extra zero byte keeps the "%s" below safe even if the
                // returned text is not terminated.
                build_log.assign(log_size + 1, '\0');
                log_err = clGetProgramBuildInfo(ocl->program, ocl->device, CL_PROGRAM_BUILD_LOG, log_size, &build_log[0], NULL);
            }

            if (CL_SUCCESS != log_err)
            {
                // Report the query failure, but keep returning the build error in err.
                LogError("Error: clGetProgramBuildInfo returned %s.\n", TranslateOpenCLError(log_err));
            }
            else if (!build_log.empty())
            {
                LogError("Error happened during the build of OpenCL program.\nBuild log:%s", &build_log[0]);
            }
        }
    }

Finish:
    // source is released with delete[] on every path (success and failure).
    if (source)
    {
        delete[] source;
        source = NULL;
    }

    return err;
}
/*
 * Create and build the OpenCL program from "ray_algorithm.cl" using the
 * file-level `ocl` object (ocl.context, ocl.device, ocl.program).
 *
 * The program is built for the first two entries of ocl.device. When the build
 * fails with CL_BUILD_PROGRAM_FAILURE, the build log of each of those devices
 * is printed to standard output.
 *
 * Returns the cl_int status converted to cl_uint: CL_SUCCESS (0) on success, a
 * non-zero value otherwise. Negative OpenCL error codes wrap around in the
 * unsigned return type, so callers should only compare against CL_SUCCESS.
 */
cl_uint CreateAndBuildProgram()
{
    // Number of entries of ocl.device handed to clBuildProgram.
    const cl_uint kDeviceCount = 2;

    cl_int err = CL_SUCCESS;

    // Upload the OpenCL C source code from the input file to source.
    // The size of the program text is returned in src_size.
    char* source = NULL;
    size_t src_size = 0;
    err = ReadSourceFromFile("ray_algorithm.cl", &source, &src_size);
    if (CL_SUCCESS != err)
    {
        printf("Error: ReadSourceFromFile returned %s.\n", TranslateOpenCLError(err));
        goto Finish;
    }

    // With the source available as a plain character buffer, create the program object.
    ocl.program = clCreateProgramWithSource(ocl.context, 1, (const char**)&source, &src_size, &err);
    if (CL_SUCCESS != err)
    {
        printf("Error: clCreateProgramWithSource returned %s.\n", TranslateOpenCLError(err));
        goto Finish;
    }

    // Build the program.
    // Creating a program does not build it; the build has to be requested explicitly.
    // A plain create-build sequence is used here. When a program consists of several
    // parts, some of which are libraries, clCompileProgram and clLinkProgram are
    // possible alternatives.
    err = clBuildProgram(ocl.program, kDeviceCount, ocl.device, "", NULL, NULL);
    if (CL_SUCCESS != err)
    {
        printf("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));

        // A compile failure is only actionable with the compiler output, so print
        // the build log of every device the program was built for.
        if (CL_BUILD_PROGRAM_FAILURE == err)
        {
            for (cl_uint d = 0; d < kDeviceCount; ++d)
            {
                size_t log_size = 0;
                cl_int log_err = clGetProgramBuildInfo(ocl.program, ocl.device[d], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
                if (CL_SUCCESS != log_err || 0 == log_size)
                {
                    continue;   // no log available for this device
                }

                // One extra zero byte keeps the "%s" below safe even if the
                // returned text is not terminated.
                std::vector<char> build_log(log_size + 1, '\0');
                log_err = clGetProgramBuildInfo(ocl.program, ocl.device[d], CL_PROGRAM_BUILD_LOG, log_size, &build_log[0], NULL);
                if (CL_SUCCESS == log_err)
                {
                    printf("Build log for device %u:\n%s\n", d, &build_log[0]);
                }
            }
        }
    }

Finish:
    // source is released with delete[] on every path (success and failure).
    if (source)
    {
        delete[] source;
        source = NULL;
    }

    return err;
}
void imgdiff(size_t N, size_t width, size_t height, double* diff_matrix, unsigned char* images) { //// we need to fill in //// cl_platform_id *platform; cl_device_type dev_type = CL_DEVICE_TYPE_GPU; cl_device_id *devs; cl_context context; cl_command_queue *cmd_queues; cl_program program; cl_kernel *kernels; cl_uint num_platforms; cl_uint num_devs; cl_mem* m_image1; cl_mem* m_image2; cl_mem* m_result; cl_event* ev_kernels; int err = CL_SUCCESS; int i, j, k; // modify version err = clGetPlatformIDs(0, NULL, &num_platforms); if(err != CL_SUCCESS) { printf("Error: platform error\n"); return 0; } if(num_platforms == 0) { printf("Error: platform no count\n"); return 0; } platform = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platforms); err = clGetPlatformIDs(num_platforms, platform, NULL); if(err != CL_SUCCESS) { printf("Error: clGetPlatformIDs error\n"); return 0; } for(i = 0; i<num_platforms; i++) { err = clGetDeviceIDs(platform[i], dev_type, 0, NULL, &num_devs); if(err != CL_SUCCESS) { printf("Error: clGetDevice\n"); return 0; } if(num_devs >= 1) { devs = (cl_device_id*)malloc(sizeof(cl_device_id) * num_devs); clGetDeviceIDs(platform[i], dev_type, num_devs, devs, NULL); break; } } context = clCreateContext(NULL, num_devs, devs, NULL, NULL, &err); if(err != CL_SUCCESS) { printf("Error: clCreateContext error\n"); return 0; } char* source = NULL; size_t src_size = 0; err = ReadSourceFromFile("./imgdiff_cal.cl", &source, &src_size); if (CL_SUCCESS != err) { printf("Error: ReadSourceFromFile returned %s.\n", err); free(source); return 0; } program = clCreateProgramWithSource(context, 1, (const char**)&source, &src_size, &err); if(err != CL_SUCCESS) { printf("Error: clCreateProgram error\n"); return 0; } free(source); printf("Create Program Success\n"); #if DBG // Measure clBuildProgram -@henry added gettimeofday(&start_m, NULL ); #endif err = clBuildProgram(program, num_devs, devs, "", NULL, NULL); #if DBG gettimeofday(&end_m, NULL ); double time = (end_m.tv_usec - 
start_m.tv_usec)*1e-6 + (end_m.tv_sec - start_m.tv_sec); printf("[Debug] Elapsed Time of clBuildProgram() : %lf s\n",time); #endif if(err != CL_SUCCESS) { printf("Error: clBuildProgram\n"); return 0; } printf("Build Program Success\n"); kernels = (cl_kernel*)malloc(sizeof(cl_kernel)*num_devs); for(i = 0; i<num_devs; i++) { kernels[i] = clCreateKernel(program, "imgdiff_cal", NULL); } printf("Create Kernel Success\n"); cmd_queues = (cl_command_queue*)malloc(sizeof(cl_command_queue)*num_devs); for(i=0; i<num_devs; i++) { cmd_queues[i] = clCreateCommandQueue(context, devs[i], 0, &err); if(err != CL_SUCCESS) { printf("Error: clCreateCommandQueue error\n"); return 0; } } printf("Create commandQueue Success\n"); int LOCAL_WIDTH = 16; int LOCAL_HEIGHT = 16; int WORK_WIDTH = ceil((double)width / LOCAL_WIDTH)*LOCAL_WIDTH; int WORK_HEIGHT = ceil((double)height/LOCAL_HEIGHT) * LOCAL_HEIGHT; int WORK_AMOUNT = width * height; int WORK_GROUP_COUNT = ceil(((double)WORK_WIDTH * WORK_HEIGHT) / (LOCAL_WIDTH * LOCAL_HEIGHT)); int WORK_GROUP_WIDTH = width; int WORK_GROUP_HEIGHT = height; int SAMPLE_COUNT = 16; int WORK_COUNT[num_devs]; double tmp_result_data[WORK_GROUP_COUNT*SAMPLE_COUNT]; printf("WORK_WIDTH %d\tWORK_HEIGHT %d\t WORK_AMOUNT %d\t WORK_GROUP_COUNT %d\n", WORK_WIDTH, WORK_HEIGHT, WORK_AMOUNT, WORK_GROUP_COUNT); m_image1 = (cl_mem*)malloc(sizeof(cl_mem)* num_devs); m_image2 = (cl_mem*)malloc(sizeof(cl_mem)* num_devs); m_result = (cl_mem*)malloc(sizeof(cl_mem)* num_devs); for(i=0; i<num_devs; i++) { m_image1[i] = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned char) * WORK_AMOUNT * 3, NULL, NULL); m_image2[i] = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned char) * WORK_AMOUNT*SAMPLE_COUNT * 3, NULL, NULL); m_result[i] = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(double) * WORK_GROUP_COUNT * SAMPLE_COUNT, NULL, NULL); clSetKernelArg(kernels[i], 0, sizeof(cl_mem), (void*)&m_image1[i]); clSetKernelArg(kernels[i], 1, sizeof(cl_mem), 
(void*)&m_image2[i]); clSetKernelArg(kernels[i], 2, sizeof(cl_mem), (void*)&m_result[i]); clSetKernelArg(kernels[i], 3, sizeof(cl_int), &WORK_GROUP_WIDTH); clSetKernelArg(kernels[i], 4, sizeof(cl_int), &WORK_GROUP_HEIGHT); } ev_kernels = (cl_event*)malloc(sizeof(cl_event)*num_devs); int row, col; row = 0; col = 1; for(row = 0; row < N; row++) { if( (N-row-1) < (SAMPLE_COUNT*4) && SAMPLE_COUNT > 1) SAMPLE_COUNT = SAMPLE_COUNT / 2; int remain_count = N - (row + 1); for(i=0; i<num_devs; i++) { clEnqueueWriteBuffer(cmd_queues[i], m_image1[i], CL_FALSE, 0, sizeof(unsigned char) * WORK_AMOUNT * 3, (void*)(images + (row * width*height)*3), 0, NULL, NULL); } diff_matrix[row*N + row] = 0; col = row + 1; while( col< N) { size_t lws[2] = { LOCAL_WIDTH, LOCAL_HEIGHT }; size_t gws[2] = { WORK_WIDTH, WORK_HEIGHT}; for(i=0; i<num_devs; i++) { if((remain_count - SAMPLE_COUNT) < 0) { WORK_COUNT[i] = remain_count; remain_count = 0; } else { WORK_COUNT[i] = SAMPLE_COUNT; remain_count = remain_count - SAMPLE_COUNT; } if(WORK_COUNT[i] != 0) { clSetKernelArg(kernels[i], 5, sizeof(cl_int), &WORK_COUNT[i]); int offset = 0; for(j=0; j<i; j++) offset += WORK_COUNT[j]; err = clEnqueueWriteBuffer(cmd_queues[i], m_image2[i], CL_FALSE, 0, sizeof(unsigned char)*WORK_AMOUNT*WORK_COUNT[i]*3, (void*)(images +((col * width*height) + (WORK_AMOUNT * offset))*3), 0, NULL, NULL); } } for( i=0; i < num_devs; i++ ) { if(WORK_COUNT[i] != 0) { err = clEnqueueNDRangeKernel(cmd_queues[i], kernels[i], 2, NULL, gws, lws, 0, NULL, NULL); if(err != CL_SUCCESS) { printf("Error: clEnqueueNDRangeKernel %d error\n", i); printf("%s\n", TranslateOpenCLError(err)); return 0; } } } double tmp_sum = 0; i = 0; for( i = num_devs -1; i >= 0; i-- ) { if(WORK_COUNT[i] != 0) { err = clEnqueueReadBuffer( cmd_queues[i], m_result[i], CL_TRUE, 0, sizeof(double) * WORK_GROUP_COUNT * WORK_COUNT[i], tmp_result_data, 0, NULL, NULL); if(err != CL_SUCCESS) { printf("Error: clEnqueueReadBuffer%d error\n", i); return 0; } 
//printf("receive......"); for(j = 0; j<WORK_COUNT[i]; j++) { tmp_sum = 0; for(k = 0; k<WORK_GROUP_COUNT; k++) { tmp_sum += tmp_result_data[k + j*WORK_GROUP_COUNT]; //printf("%lf\t", tmp_result_data[k+j*WORK_GROUP_COUNT]); } //printf("%lf %lf\n", tmp_sum, tmp_result_data[j*WORK_GROUP_COUNT]); int offset = 0; for(k=0; k<i; k++) offset += WORK_COUNT[k]; diff_matrix[row*N+col+j+offset] = diff_matrix[(col+j+offset)*N+row] = tmp_sum; } } } for( i = 0; i < num_devs; i++ ) { col += WORK_COUNT[i]; } } } }