void OsdClKernelDispatcher::ApplyCatmarkEdgeVerticesKernel(FarMesh<OsdVertex> * mesh, int offset, int level, int start, int end, void * data) const { cl_int ciErrNum; size_t globalWorkSize[1] = { end-start }; cl_kernel kernel = _clKernel->GetCatmarkEdgeKernel(); clSetKernelArg(kernel, 0, sizeof(cl_mem), GetVertexBuffer()); clSetKernelArg(kernel, 1, sizeof(cl_mem), GetVaryingBuffer()); clSetKernelArg(kernel, 2, sizeof(cl_mem), &_tables[E_IT].devicePtr); clSetKernelArg(kernel, 3, sizeof(cl_mem), &_tables[E_W].devicePtr); clSetKernelArg(kernel, 4, sizeof(int), &_tableOffsets[E_IT][level-1]); clSetKernelArg(kernel, 5, sizeof(int), &_tableOffsets[E_W][level-1]); clSetKernelArg(kernel, 6, sizeof(int), &offset); clSetKernelArg(kernel, 7, sizeof(int), &start); clSetKernelArg(kernel, 8, sizeof(int), &end); ciErrNum = clEnqueueNDRangeKernel(_clQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); CL_CHECK_ERROR(ciErrNum, "edge kernel %d\n", ciErrNum); }
bool OsdClKernelDispatcher::ClKernel::Compile(cl_context clContext, int numVertexElements, int numVaryingElements) { cl_int ciErrNum; _numVertexElements = numVertexElements; _numVaryingElements = numVaryingElements; char constantDefine[256]; snprintf(constantDefine, 256, "#define NUM_VERTEX_ELEMENTS %d\n" "#define NUM_VARYING_ELEMENTS %d\n", numVertexElements, numVaryingElements); const char *sources[] = { constantDefine, clSource }; _clProgram = clCreateProgramWithSource(clContext, 2, sources, 0, &ciErrNum); CL_CHECK_ERROR(ciErrNum, "clCreateProgramWithSource\n"); ciErrNum = clBuildProgram(_clProgram, 0, NULL, NULL, NULL, NULL); if (ciErrNum != CL_SUCCESS) { OSD_ERROR("ERROR in clBuildProgram %d\n", ciErrNum); char cBuildLog[10240]; clGetProgramBuildInfo(_clProgram, _clDevice, CL_PROGRAM_BUILD_LOG, sizeof(cBuildLog), cBuildLog, NULL); OSD_ERROR(cBuildLog); return false; } // ------- _clBilinearEdge = buildKernel(_clProgram, "computeBilinearEdge"); _clBilinearVertex = buildKernel(_clProgram, "computeBilinearVertex"); _clCatmarkFace = buildKernel(_clProgram, "computeFace"); _clCatmarkEdge = buildKernel(_clProgram, "computeEdge"); _clCatmarkVertexA = buildKernel(_clProgram, "computeVertexA"); _clCatmarkVertexB = buildKernel(_clProgram, "computeVertexB"); _clLoopEdge = buildKernel(_clProgram, "computeEdge"); _clLoopVertexA = buildKernel(_clProgram, "computeVertexA"); _clLoopVertexB = buildKernel(_clProgram, "computeLoopVertexB"); return true; }
void OsdClKernelDispatcher::ApplyLoopVertexVerticesKernelA(FarMesh<OsdVertex> * mesh, int offset, bool pass, int level, int start, int end, void * data) const { cl_int ciErrNum; size_t globalWorkSize[1] = { end-start }; int ipass = pass; cl_kernel kernel = _clKernel->GetLoopVertexKernelA(); clSetKernelArg(kernel, 0, sizeof(cl_mem), GetVertexBuffer()); clSetKernelArg(kernel, 1, sizeof(cl_mem), GetVaryingBuffer()); clSetKernelArg(kernel, 2, sizeof(cl_mem), &_tables[V_ITa].devicePtr); clSetKernelArg(kernel, 3, sizeof(cl_mem), &_tables[V_W].devicePtr); clSetKernelArg(kernel, 4, sizeof(int), &_tableOffsets[V_ITa][level-1]); clSetKernelArg(kernel, 5, sizeof(int), &_tableOffsets[V_W][level-1]); clSetKernelArg(kernel, 6, sizeof(int), (void*)&offset); clSetKernelArg(kernel, 7, sizeof(int), (void*)&start); clSetKernelArg(kernel, 8, sizeof(int), (void*)&end); clSetKernelArg(kernel, 9, sizeof(int), (void*)&ipass); ciErrNum = clEnqueueNDRangeKernel(_clQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); CL_CHECK_ERROR(ciErrNum, "vertex kernel 2 %d\n", ciErrNum); }
void dfi_process_sinogram(const char* tiff_input, const char* tiff_output, int center_rotation) { cl_event events[11]; if(!tiff_input) { printf("The filename of input is not valid. (pointer tiff_input = %p)", tiff_input); return; } if(!tiff_output) { printf("The filename of output is not valid. (pointer tiff_output = %p)", tiff_output); return; } ///////////////////// /* Input Data Part */ ///////////////////// /* Input a slice properties */ int bits_per_sample; int samples_per_pixel; int theta_size; int slice_size; /* Read the slice */ clFFT_Complex *data_tiff = tiff_read_complex(tiff_input, center_rotation, &bits_per_sample, &samples_per_pixel, &slice_size, &theta_size); //tiff_write_complex("resources/initial-sino.tif", data_tiff, slice_size, theta_size); /* * OpenCL */ printf("Hey!1\n"); cl_int status = CL_SUCCESS; cl_platform_id platform; printf("Hey!1.2\n"); CL_CHECK_ERROR(clGetPlatformIDs(1, &platform, NULL)); printf("Hey!2\n"); cl_device_id devices[10]; // Compute device cl_context context; // Compute context cl_uint n_devices = 0; printf("@Hey!3\n"); #if GPU printf("@Hey!GPU Choosed\n"); CL_CHECK_ERROR(clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 3, devices, &n_devices)); #else printf("@Hey!CPU Choosed\n"); CL_CHECK_ERROR(clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 3, devices, &n_devices)); #endif //cl_device_id current_device = devices[0]; #define current_device devices[0] printf("Hey!3.1 n_devices %d\n", n_devices); context = clCreateContext(NULL, 1, devices, NULL, NULL, &status); printf("Hey!3.2\n"); CL_CHECK_ERROR(status); /* * Device */ printf("Hey!3.3\n"); cl_int device_max_cu = 0; CL_CHECK_ERROR(clGetDeviceInfo(current_device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &device_max_cu, NULL)); size_t wg_count = device_max_cu * wavefronts_per_SIMD; size_t global_work_size = wg_count * local_work_size; printf("Hey!3.4\n"); /* * Queues, Kernels */ cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE; cl_command_queue command_queue = clCreateCommandQueue(context, current_device, properties, &status); CL_CHECK_ERROR(status); printf("Hey!3.5\n"); cl_kernel kernel_linear_interp = get_kernel("linear_interp", &context, ¤t_device); cl_kernel kernel_zero_ifftshift = get_kernel("zero_ifftshift", &context, ¤t_device); cl_kernel kernel_fftshift = get_kernel("fftshift", &context, ¤t_device); cl_kernel kernel_2dshift = get_kernel("shift2d", &context, ¤t_device); cl_kernel kernel_crop_data = get_kernel("crop_data", &context, ¤t_device); printf("@Hey!3.6\n\n"); //////////////////////// /* OpenCL - DFI Part */ //////////////////////// /* Reconstruction properties */ int oversampling_ratio = 2; int dx = 1; /* zoom times */ //int size_s = slice_size * oversampling_ratio; int min_theta = 0; int max_theta = theta_size - 1; int size_zeropad_s = pow(2, ceil(log2((float)slice_size))); /* get length of FFT operations */ int size_s = size_zeropad_s; float d_omega_s = 2 * M_PI / (size_zeropad_s * dx); //normalized ratio [0; 2PI] /* Start timer */ timeval global_tim; gettimeofday(&global_tim, NULL); double t1_global = global_tim.tv_sec + (global_tim.tv_usec/1000000.0), t2_global = 0.0; ///////////////////////////////////// /* Sinogram shifting + Zeropadding */ ///////////////////////////////////// long data_size = slice_size * theta_size * sizeof(clFFT_Complex); printf("6 "); long zeropad_data_size = theta_size * size_zeropad_s * sizeof(clFFT_Complex); /* Buffers */ cl_mem original_data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, data_size, NULL, &status); CL_CHECK_ERROR(status); CL_CHECK_ERROR(clEnqueueWriteBuffer(command_queue, original_data_buffer, CL_FALSE, 0, data_size, data_tiff, 0, NULL, &events[0])); cl_mem zeropad_ifftshift_data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, zeropad_data_size, NULL, &status); CL_CHECK_ERROR(status); float *zero_out = (float *)g_malloc0(zeropad_data_size); CL_CHECK_ERROR(clEnqueueWriteBuffer(command_queue, zeropad_ifftshift_data_buffer, CL_FALSE, 0, zeropad_data_size, zero_out, 0, NULL, &events[1])); /* Set arguments */ CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 0, sizeof(void *), (void *)&original_data_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 1, sizeof(theta_size), &theta_size)); CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 2, sizeof(slice_size), &slice_size)); CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 3, sizeof(void *), (void *)&zeropad_ifftshift_data_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 4, sizeof(theta_size), &theta_size)); CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 5, sizeof(size_zeropad_s), &size_zeropad_s)); /* Run kernel */ status = clEnqueueNDRangeKernel(command_queue, kernel_zero_ifftshift, 1, // work dimensional 1D, 2D, 3D NULL, // offset &global_work_size, // total number of WI &local_work_size, // number of WI in WG 2, // number events in wait list events, // event wait list &events[2]); // event CL_CHECK_ERROR(status); // Copy result from device to host /* clFFT_Complex *fur_kernel_sino = (clFFT_Complex *)clEnqueueMapBuffer(command_queue, zeropad_ifftshift_data_buffer, CL_TRUE, CL_MAP_READ, 0, zeropad_data_size, 0, NULL, NULL, NULL ); clFinish(command_queue); tiff_write_complex("resources/zeropad-sino.tif", fur_kernel_sino, size_zeropad_s, theta_size); */ //////////////////////////////////////////////////////////////////////// /* Applying 1-D FFT to the each strip of the sinogram and shifting it */ //////////////////////////////////////////////////////////////////////// cl_mem zeropadded_1dfft_data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, zeropad_data_size, NULL, &status); CL_CHECK_ERROR(status); /* Setup clAmdFft */ clFFT_Dim3 sino_fft; sino_fft.x = size_zeropad_s; sino_fft.y = 1; sino_fft.z = 1; /* Create FFT plan */ clFFT_Plan plan_1dfft_sinogram = clFFT_CreatePlan(context, sino_fft, clFFT_1D, clFFT_InterleavedComplexFormat, &status); CL_CHECK_ERROR(status); /* Execute FFT */ status = clFFT_ExecuteInterleaved(command_queue, plan_1dfft_sinogram, theta_size, clFFT_Forward, zeropad_ifftshift_data_buffer, zeropadded_1dfft_data_buffer, 0, NULL, NULL); CL_CHECK_ERROR(status); // Free FFT plan //clFFT_DestroyPlan(plan_1dfft_sinogram); // Copy result from device to host /* clFFT_Complex *fourier_kernel_sinogram = (clFFT_Complex *)malloc(zeropad_data_size); clEnqueueReadBuffer(command_queue, zeropadded_1dfft_data_buffer, CL_TRUE, 0, zeropad_data_size, fourier_kernel_sinogram, 0, NULL, NULL); clFinish(command_queue); tiff_write_complex("resources/1dfft-sino.tif", fourier_kernel_sinogram, size_zeropad_s, theta_size); */ /////////////////// /* Make fftshift */ /////////////////// /* Buffers */ cl_mem zeropad_fftshift_data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, zeropad_data_size, NULL, &status); CL_CHECK_ERROR(status); /* Set arguments */ CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 0, sizeof(void *), (void *)&zeropadded_1dfft_data_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 1, sizeof(theta_size), &theta_size)); CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 2, sizeof(size_zeropad_s), &size_zeropad_s)); CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 3, sizeof(void *), (void *)&zeropad_fftshift_data_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 4, sizeof(theta_size), &theta_size)); CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 5, sizeof(size_zeropad_s), &size_zeropad_s)); /* Run kernel */ status = clEnqueueNDRangeKernel(command_queue, kernel_fftshift, 1, // work dimensional 1D, 2D, 3D NULL, // offset &global_work_size, // total number of WI &local_work_size, // number of WI in WG 0, // number events in wait list NULL, // event wait list &events[3]); // event CL_CHECK_ERROR(status); /* Copy result from device to host */ /* clFFT_Complex *fur_kernel_fftshift_sino = (clFFT_Complex *)clEnqueueMapBuffer(command_queue, zeropad_fftshift_data_buffer, CL_TRUE, CL_MAP_READ,0,zeropad_data_size, 0, NULL, NULL, NULL ); clFinish(command_queue); tiff_write_complex("resources/fftshift-sino.tif", fur_kernel_fftshift_sino, size_zeropad_s, theta_size); */ //////////////////////// /* Data Interpolation */ //////////////////////// /* Performing Interpolation */ cl_long data_length = size_s * size_s; cl_int in_rows = theta_size; cl_int in_cols = size_zeropad_s; cl_float norm_ratio = d_omega_s/dx; cl_float in_rows_first_val = min_theta; cl_float in_rows_last_val = max_theta; cl_float in_cols_first_val = (-in_cols/2)*norm_ratio; cl_float in_cols_last_val = (in_cols/2-1)*norm_ratio; cl_int interp_rows = size_s; cl_int interp_cols = interp_rows; cl_int iparams[5]; iparams[0] = in_rows; iparams[1] = in_cols; iparams[2] = dx; iparams[3] = interp_rows; iparams[4] = interp_cols; cl_float fparams[5]; fparams[0] = in_rows_first_val; fparams[1] = in_rows_last_val; fparams[2] = in_cols_first_val; fparams[3] = in_cols_last_val; fparams[4] = norm_ratio; /* Buffers */ cl_mem i_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_int) * 5, NULL, &status); CL_CHECK_ERROR(status); CL_CHECK_ERROR(clEnqueueWriteBuffer(command_queue, i_buffer, CL_FALSE, 0, sizeof(cl_int) * 5, iparams, 0, NULL, &events[4])); cl_mem f_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float) * 5, NULL, &status); CL_CHECK_ERROR(status); CL_CHECK_ERROR(clEnqueueWriteBuffer(command_queue, f_buffer, CL_FALSE, 0, sizeof(cl_float) * 5, fparams, 0, NULL, &events[5])); cl_mem output_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, data_length * sizeof(clFFT_Complex), NULL, &status); CL_CHECK_ERROR(status); /* Set arguments */ CL_CHECK_ERROR(clSetKernelArg(kernel_linear_interp, 0, sizeof(void *), (void *)&i_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_linear_interp, 1, sizeof(void *), (void *)&f_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_linear_interp, 2, sizeof(void *), (void *)&zeropad_fftshift_data_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_linear_interp, 3, sizeof(void *), (void *)&output_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_linear_interp, 4, sizeof(data_length), &data_length)); /* Run kernel */ status = clEnqueueNDRangeKernel(command_queue, kernel_linear_interp, 1, // work dimensional 1D, 2D, 3D NULL, // offset &global_work_size, // total number of WI &local_work_size, // nomber of WI in WG 3, // num events in wait list events + 3, // event wait list &events[6]); // event CL_CHECK_ERROR(status); //clFinish(command_queue); // Copy result from device to host /* clFFT_Complex *interpolated_spectrum = (clFFT_Complex *)clEnqueueMapBuffer(command_queue, output_buffer, CL_TRUE, CL_MAP_READ, 0, data_length * sizeof(clFFT_Complex), 0, NULL, NULL, NULL ); clFinish(command_queue); tiff_write_complex("resources/interpolated-sino.tif", interpolated_spectrum, size_s, size_s); */ /////////////////////////////////////////////////// /* Applying 2-D FFT to the interpolated spectrum */ /////////////////////////////////////////////////// /* Setup 2D IFFT */ clFFT_Dim3 sino_2dfft; sino_2dfft.x = size_s; sino_2dfft.y = size_s; sino_2dfft.z = 1; /* Create 2D IFFT plan */ clFFT_Plan plan_2difft = clFFT_CreatePlan(context, sino_2dfft, clFFT_2D, clFFT_InterleavedComplexFormat, &status); CL_CHECK_ERROR(status); /* Execute 2D IFFT */ cl_mem reconstructed_image_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, data_length * sizeof(clFFT_Complex), NULL, &status); CL_CHECK_ERROR(status); status = clFFT_ExecuteInterleaved(command_queue, plan_2difft, 1, clFFT_Inverse, output_buffer, reconstructed_image_buffer, 0, NULL, NULL); CL_CHECK_ERROR(status); // Copy result from device to host /* clFFT_Complex *ifft2d_interpolated_spectrum = (clFFT_Complex *)malloc(data_length * sizeof(clFFT_Complex)); clEnqueueReadBuffer(command_queue, reconstructed_image_buffer, CL_TRUE, 0, data_length * sizeof(clFFT_Complex), ifft2d_interpolated_spectrum, 0, NULL, NULL); tiff_write_complex("resources/ifft2d_interpolated_spectrum.tif", ifft2d_interpolated_spectrum, size_s, size_s); clFinish(command_queue); */ ///////////////////////////////////////////////// /* Applying 2-D fftshidt to the restored image */ ///////////////////////////////////////////////// /* Buffers */ cl_mem two_dim_fftshifted_data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, data_length * sizeof(clFFT_Complex), NULL, &status); CL_CHECK_ERROR(status); /* Set arguments */ cl_int inverse_flag = 0; CL_CHECK_ERROR(clSetKernelArg(kernel_2dshift, 0, sizeof(void *), (void *)&reconstructed_image_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_2dshift, 1, sizeof(void *), (void *)&two_dim_fftshifted_data_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_2dshift, 2, sizeof(interp_rows), &interp_rows)); CL_CHECK_ERROR(clSetKernelArg(kernel_2dshift, 3, sizeof(interp_cols), &interp_cols)); CL_CHECK_ERROR(clSetKernelArg(kernel_2dshift, 4, sizeof(inverse_flag), &inverse_flag)); /* Run kernel */ status = clEnqueueNDRangeKernel(command_queue, kernel_2dshift, 1, // work dimensional 1D, 2D, 3D NULL, // offset &global_work_size, // total number of WI &local_work_size, // number of WI in WG 1, // number events in wait list &events[6], // event wait list &events[7]); // event CL_CHECK_ERROR(status); /* Copy result from device to host */ /* clFFT_Complex *two_dim_fftshifted_data = (clFFT_Complex *)clEnqueueMapBuffer(command_queue, two_dim_fftshifted_data_buffer, CL_TRUE, CL_MAP_READ, 0, data_length * sizeof(clFFT_Complex), 0, NULL, NULL, NULL ); clFinish(command_queue); */ //////////////// /* Crop data */ /////////////// float lt_offset = 0, rb_offset = 0; int dif_sides = interp_cols - slice_size; if (dif_sides%2) { lt_offset = floor(dif_sides / 2.0); rb_offset = ceil(dif_sides / 2.0); } else { lt_offset = rb_offset = dif_sides / 2.0; } /* Buffers */ long cropped_data_length = slice_size * slice_size * sizeof(clFFT_Complex); cl_mem cropped_restored_image_data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, cropped_data_length, NULL, &status); CL_CHECK_ERROR(status); /* Set arguments */ CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 0, sizeof(void *), (void *)&two_dim_fftshifted_data_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 1, sizeof(void *), (void *)&cropped_restored_image_data_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 2, sizeof(slice_size), &slice_size)); CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 3, sizeof(interp_cols), &interp_cols)); CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 4, sizeof(lt_offset), <_offset)); CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 5, sizeof(rb_offset), &rb_offset)); /* Run kernel */ status = clEnqueueNDRangeKernel(command_queue, kernel_crop_data, 1, // work dimensional 1D, 2D, 3D NULL, // offset &global_work_size, // total number of WI &local_work_size, // number of WI in WG 1, // number events in wait list &events[7], // event wait list &events[8]); // event CL_CHECK_ERROR(status); CL_CHECK_ERROR(clFinish(command_queue)); clFFT_DestroyPlan(plan_2difft); clFFT_DestroyPlan(plan_1dfft_sinogram); //timing float ms = 0.0, total_ms = 0.0, global_ms = 0.0, deg = 1.0e-6f; /* Stop timer */ gettimeofday(&global_tim, NULL); t2_global = global_tim.tv_sec+(global_tim.tv_usec/1000000.0); printf("\n(Total time - timeofday) %f seconds elapsed\n", (t2_global-t1_global)*1000.0); cl_ulong start, end; CL_CHECK_ERROR(clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Sinogram shifting + Zeropadding write_op1):%f", ms); CL_CHECK_ERROR(clGetEventProfilingInfo(events[1], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[1], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Sinogram shifting + Zeropadding write_op2):%f", ms); CL_CHECK_ERROR(clGetEventProfilingInfo(events[2], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[2], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Sinogram shifting + Zeropadding):%f", ms); printf("\nTOTAL(Sinogram shifting + Zeropadding):%f\n", total_ms); global_ms += total_ms; total_ms = 0.0; CL_CHECK_ERROR(clGetEventProfilingInfo(events[2], CL_PROFILING_COMMAND_END,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[3], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Applying 1-D FFT):%f\n", total_ms); global_ms += total_ms; total_ms = 0.0; CL_CHECK_ERROR(clGetEventProfilingInfo(events[3], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[3], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Shift 1-D FFT data):%f\n", total_ms); global_ms += total_ms; total_ms = 0.0; CL_CHECK_ERROR(clGetEventProfilingInfo(events[4], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[4], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Data Interpolation write_op1):%f", ms); CL_CHECK_ERROR(clGetEventProfilingInfo(events[5], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[5], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Data Interpolation write_op2):%f", ms); CL_CHECK_ERROR(clGetEventProfilingInfo(events[6], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[6], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Data Interpolation):%f", ms); printf("\nTOTAL(Data Interpolation):%f\n", total_ms); global_ms += total_ms; total_ms = 0.0; CL_CHECK_ERROR(clGetEventProfilingInfo(events[6], CL_PROFILING_COMMAND_END,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[7], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Applying 2-D IFFT):%f\n", total_ms); global_ms += total_ms; total_ms = 0.0; CL_CHECK_ERROR(clGetEventProfilingInfo(events[7], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[7], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Applying 2-D Shift):%f\n", total_ms); global_ms += total_ms; total_ms = 0.0; CL_CHECK_ERROR(clGetEventProfilingInfo(events[8], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[8], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Cropping data):%f\n", total_ms); global_ms += total_ms; total_ms = 0.0; printf("\nTOTAL TIME:%f\n", global_ms); // Copy result from device to host clFFT_Complex *cropped_restored_image = (clFFT_Complex *)clEnqueueMapBuffer(command_queue, cropped_restored_image_data_buffer, CL_TRUE, CL_MAP_READ, 0, cropped_data_length, 0, NULL, NULL, NULL ); /* Write the restored slice */ tiff_write_complex(tiff_output, cropped_restored_image, slice_size, slice_size); }
void init(OptionParser& op, bool _do_dp) { cl_int err; do_dp = _do_dp; if (!fftCtx) { // first get the device int device, platform = op.getOptionInt("platform"); if (op.getOptionVecInt("device").size() > 0) { device = op.getOptionVecInt("device")[0]; } else { device = 0; } fftDev = ListDevicesAndGetDevice(platform, device); // now get the context fftCtx = clCreateContext(NULL, 1, &fftDev, NULL, NULL, &err); CL_CHECK_ERROR(err); } if (!fftQueue) { // get a queue fftQueue = clCreateCommandQueue(fftCtx, fftDev, CL_QUEUE_PROFILING_ENABLE, &err); CL_CHECK_ERROR(err); } // create the program... fftProg = clCreateProgramWithSource(fftCtx, 1, &cl_source_fft, NULL, &err); CL_CHECK_ERROR(err); // ...and build it string args = " -cl-mad-enable "; if (op.getOptionBool("use-native")) { args += " -cl-fast-relaxed-math "; } if (!do_dp) { args += " -DSINGLE_PRECISION "; } else if (checkExtension(fftDev, "cl_khr_fp64")) { args += " -DK_DOUBLE_PRECISION "; } else if (checkExtension(fftDev, "cl_amd_fp64")) { args += " -DAMD_DOUBLE_PRECISION "; } err = clBuildProgram(fftProg, 0, NULL, args.c_str(), NULL, NULL); { char* log = NULL; size_t bytesRequired = 0; err = clGetProgramBuildInfo(fftProg, fftDev, CL_PROGRAM_BUILD_LOG, 0, NULL, &bytesRequired ); log = (char*)malloc( bytesRequired + 1 ); err = clGetProgramBuildInfo(fftProg, fftDev, CL_PROGRAM_BUILD_LOG, bytesRequired, log, NULL ); std::cout << log << std::endl; free( log ); } if (err != CL_SUCCESS) { char log[50000]; size_t retsize = 0; err = clGetProgramBuildInfo(fftProg, fftDev, CL_PROGRAM_BUILD_LOG, 50000*sizeof(char), log, &retsize); CL_CHECK_ERROR(err); cout << "Retsize: " << retsize << endl; cout << "Log: " << log << endl; dumpPTXCode(fftCtx, fftProg, "oclFFT"); exit(-1); } else { // dumpPTXCode(fftCtx, fftProg, "oclFFT"); } // Create kernel for forward FFT fftKrnl = clCreateKernel(fftProg, "fft1D_512", &err); CL_CHECK_ERROR(err); // Create kernel for inverse FFT ifftKrnl = clCreateKernel(fftProg, "ifft1D_512", &err); CL_CHECK_ERROR(err); // Create kernel for check chkKrnl = clCreateKernel(fftProg, "chk1D_512", &err); CL_CHECK_ERROR(err); }