void FftClFft:: compute( Tfr::ChunkData::Ptr input, Tfr::ChunkData::Ptr output, FftDirection direction ) { TIME_STFT TaskTimer tt("Fft ClFft"); unsigned n = input->getNumberOfElements().width; unsigned N = output->getNumberOfElements().width; if (-1 != direction) EXCEPTION_ASSERT( n == N ); { TIME_STFT TaskTimer tt("Computing fft(N=%u, n=%u, direction=%d)", N, n, direction); OpenCLContext *opencl = &OpenCLContext::Singleton(); cl_int fft_error; clFFT_Plan plan = CLFFTKernelBuffer::Singleton().getPlan(opencl->getContext(), n, fft_error); if (fft_error != CL_SUCCESS) throw std::runtime_error("Could not create clFFT compute plan."); // Run the fft in OpenCL :) // fft kernel needs to have read/write access to output data fft_error |= clFFT_ExecuteInterleaved( opencl->getCommandQueue(), plan, 1, (clFFT_Direction)direction, OpenClMemoryStorage::ReadOnly<1>( input ).ptr(), OpenClMemoryStorage::ReadWrite<1>( output ).ptr(), 0, NULL, NULL ); if (fft_error != CL_SUCCESS) throw std::runtime_error("Bad stuff happened during FFT computation."); } }
void FftClFft:: compute( Tfr::ChunkData::Ptr input, Tfr::ChunkData::Ptr output, DataStorageSize n, FftDirection direction ) { TaskTimer tt("Stft::computeWithClFft( matrix[%d, %d], %s )", input->size().width, input->size().height, direction==FftDirection_Forward?"forward":"backward"); EXCEPTION_ASSERT( output->numberOfBytes() == input->numberOfBytes() ); const int batchSize = n.height; OpenCLContext *opencl = &OpenCLContext::Singleton(); cl_int fft_error; clFFT_Plan plan = CLFFTKernelBuffer::Singleton().getPlan(opencl->getContext(), n.width, fft_error); if(fft_error != CL_SUCCESS) throw std::runtime_error("Could not create clFFT compute plan."); { TaskTimer tt("Calculating batches"); // Run the fft in OpenCL :) fft_error |= clFFT_ExecuteInterleaved( opencl->getCommandQueue(), plan, batchSize, direction==FftDirection_Forward?clFFT_Forward:clFFT_Inverse, OpenClMemoryStorage::ReadOnly<1>( input ).ptr(), OpenClMemoryStorage::ReadWrite<1>( output ).ptr(), 0, NULL, NULL ); if(fft_error != CL_SUCCESS) throw std::runtime_error("Bad stuff happened during FFT computation."); } }
void dfi_process_sinogram(const char* tiff_input, const char* tiff_output, int center_rotation) { cl_event events[11]; if(!tiff_input) { printf("The filename of input is not valid. (pointer tiff_input = %p)", tiff_input); return; } if(!tiff_output) { printf("The filename of output is not valid. (pointer tiff_output = %p)", tiff_output); return; } ///////////////////// /* Input Data Part */ ///////////////////// /* Input a slice properties */ int bits_per_sample; int samples_per_pixel; int theta_size; int slice_size; /* Read the slice */ clFFT_Complex *data_tiff = tiff_read_complex(tiff_input, center_rotation, &bits_per_sample, &samples_per_pixel, &slice_size, &theta_size); //tiff_write_complex("resources/initial-sino.tif", data_tiff, slice_size, theta_size); /* * OpenCL */ printf("Hey!1\n"); cl_int status = CL_SUCCESS; cl_platform_id platform; printf("Hey!1.2\n"); CL_CHECK_ERROR(clGetPlatformIDs(1, &platform, NULL)); printf("Hey!2\n"); cl_device_id devices[10]; // Compute device cl_context context; // Compute context cl_uint n_devices = 0; printf("@Hey!3\n"); #if GPU printf("@Hey!GPU Choosed\n"); CL_CHECK_ERROR(clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 3, devices, &n_devices)); #else printf("@Hey!CPU Choosed\n"); CL_CHECK_ERROR(clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 3, devices, &n_devices)); #endif //cl_device_id current_device = devices[0]; #define current_device devices[0] printf("Hey!3.1 n_devices %d\n", n_devices); context = clCreateContext(NULL, 1, devices, NULL, NULL, &status); printf("Hey!3.2\n"); CL_CHECK_ERROR(status); /* * Device */ printf("Hey!3.3\n"); cl_int device_max_cu = 0; CL_CHECK_ERROR(clGetDeviceInfo(current_device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &device_max_cu, NULL)); size_t wg_count = device_max_cu * wavefronts_per_SIMD; size_t global_work_size = wg_count * local_work_size; printf("Hey!3.4\n"); /* * Queues, Kernels */ cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE; cl_command_queue command_queue = clCreateCommandQueue(context, current_device, properties, &status); CL_CHECK_ERROR(status); printf("Hey!3.5\n"); cl_kernel kernel_linear_interp = get_kernel("linear_interp", &context, ¤t_device); cl_kernel kernel_zero_ifftshift = get_kernel("zero_ifftshift", &context, ¤t_device); cl_kernel kernel_fftshift = get_kernel("fftshift", &context, ¤t_device); cl_kernel kernel_2dshift = get_kernel("shift2d", &context, ¤t_device); cl_kernel kernel_crop_data = get_kernel("crop_data", &context, ¤t_device); printf("@Hey!3.6\n\n"); //////////////////////// /* OpenCL - DFI Part */ //////////////////////// /* Reconstruction properties */ int oversampling_ratio = 2; int dx = 1; /* zoom times */ //int size_s = slice_size * oversampling_ratio; int min_theta = 0; int max_theta = theta_size - 1; int size_zeropad_s = pow(2, ceil(log2((float)slice_size))); /* get length of FFT operations */ int size_s = size_zeropad_s; float d_omega_s = 2 * M_PI / (size_zeropad_s * dx); //normalized ratio [0; 2PI] /* Start timer */ timeval global_tim; gettimeofday(&global_tim, NULL); double t1_global = global_tim.tv_sec + (global_tim.tv_usec/1000000.0), t2_global = 0.0; ///////////////////////////////////// /* Sinogram shifting + Zeropadding */ ///////////////////////////////////// long data_size = slice_size * theta_size * sizeof(clFFT_Complex); printf("6 "); long zeropad_data_size = theta_size * size_zeropad_s * sizeof(clFFT_Complex); /* Buffers */ cl_mem original_data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, data_size, NULL, &status); CL_CHECK_ERROR(status); CL_CHECK_ERROR(clEnqueueWriteBuffer(command_queue, original_data_buffer, CL_FALSE, 0, data_size, data_tiff, 0, NULL, &events[0])); cl_mem zeropad_ifftshift_data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, zeropad_data_size, NULL, &status); CL_CHECK_ERROR(status); float *zero_out = (float *)g_malloc0(zeropad_data_size); CL_CHECK_ERROR(clEnqueueWriteBuffer(command_queue, zeropad_ifftshift_data_buffer, CL_FALSE, 0, zeropad_data_size, zero_out, 0, NULL, &events[1])); /* Set arguments */ CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 0, sizeof(void *), (void *)&original_data_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 1, sizeof(theta_size), &theta_size)); CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 2, sizeof(slice_size), &slice_size)); CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 3, sizeof(void *), (void *)&zeropad_ifftshift_data_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 4, sizeof(theta_size), &theta_size)); CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 5, sizeof(size_zeropad_s), &size_zeropad_s)); /* Run kernel */ status = clEnqueueNDRangeKernel(command_queue, kernel_zero_ifftshift, 1, // work dimensional 1D, 2D, 3D NULL, // offset &global_work_size, // total number of WI &local_work_size, // number of WI in WG 2, // number events in wait list events, // event wait list &events[2]); // event CL_CHECK_ERROR(status); // Copy result from device to host /* clFFT_Complex *fur_kernel_sino = (clFFT_Complex *)clEnqueueMapBuffer(command_queue, zeropad_ifftshift_data_buffer, CL_TRUE, CL_MAP_READ, 0, zeropad_data_size, 0, NULL, NULL, NULL ); clFinish(command_queue); tiff_write_complex("resources/zeropad-sino.tif", fur_kernel_sino, size_zeropad_s, theta_size); */ //////////////////////////////////////////////////////////////////////// /* Applying 1-D FFT to the each strip of the sinogram and shifting it */ //////////////////////////////////////////////////////////////////////// cl_mem zeropadded_1dfft_data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, zeropad_data_size, NULL, &status); CL_CHECK_ERROR(status); /* Setup clAmdFft */ clFFT_Dim3 sino_fft; sino_fft.x = size_zeropad_s; sino_fft.y = 1; sino_fft.z = 1; /* Create FFT plan */ clFFT_Plan plan_1dfft_sinogram = clFFT_CreatePlan(context, sino_fft, clFFT_1D, clFFT_InterleavedComplexFormat, &status); CL_CHECK_ERROR(status); /* Execute FFT */ status = clFFT_ExecuteInterleaved(command_queue, plan_1dfft_sinogram, theta_size, clFFT_Forward, zeropad_ifftshift_data_buffer, zeropadded_1dfft_data_buffer, 0, NULL, NULL); CL_CHECK_ERROR(status); // Free FFT plan //clFFT_DestroyPlan(plan_1dfft_sinogram); // Copy result from device to host /* clFFT_Complex *fourier_kernel_sinogram = (clFFT_Complex *)malloc(zeropad_data_size); clEnqueueReadBuffer(command_queue, zeropadded_1dfft_data_buffer, CL_TRUE, 0, zeropad_data_size, fourier_kernel_sinogram, 0, NULL, NULL); clFinish(command_queue); tiff_write_complex("resources/1dfft-sino.tif", fourier_kernel_sinogram, size_zeropad_s, theta_size); */ /////////////////// /* Make fftshift */ /////////////////// /* Buffers */ cl_mem zeropad_fftshift_data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, zeropad_data_size, NULL, &status); CL_CHECK_ERROR(status); /* Set arguments */ CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 0, sizeof(void *), (void *)&zeropadded_1dfft_data_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 1, sizeof(theta_size), &theta_size)); CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 2, sizeof(size_zeropad_s), &size_zeropad_s)); CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 3, sizeof(void *), (void *)&zeropad_fftshift_data_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 4, sizeof(theta_size), &theta_size)); CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 5, sizeof(size_zeropad_s), &size_zeropad_s)); /* Run kernel */ status = clEnqueueNDRangeKernel(command_queue, kernel_fftshift, 1, // work dimensional 1D, 2D, 3D NULL, // offset &global_work_size, // total number of WI &local_work_size, // number of WI in WG 0, // number events in wait list NULL, // event wait list &events[3]); // event CL_CHECK_ERROR(status); /* Copy result from device to host */ /* clFFT_Complex *fur_kernel_fftshift_sino = (clFFT_Complex *)clEnqueueMapBuffer(command_queue, zeropad_fftshift_data_buffer, CL_TRUE, CL_MAP_READ,0,zeropad_data_size, 0, NULL, NULL, NULL ); clFinish(command_queue); tiff_write_complex("resources/fftshift-sino.tif", fur_kernel_fftshift_sino, size_zeropad_s, theta_size); */ //////////////////////// /* Data Interpolation */ //////////////////////// /* Performing Interpolation */ cl_long data_length = size_s * size_s; cl_int in_rows = theta_size; cl_int in_cols = size_zeropad_s; cl_float norm_ratio = d_omega_s/dx; cl_float in_rows_first_val = min_theta; cl_float in_rows_last_val = max_theta; cl_float in_cols_first_val = (-in_cols/2)*norm_ratio; cl_float in_cols_last_val = (in_cols/2-1)*norm_ratio; cl_int interp_rows = size_s; cl_int interp_cols = interp_rows; cl_int iparams[5]; iparams[0] = in_rows; iparams[1] = in_cols; iparams[2] = dx; iparams[3] = interp_rows; iparams[4] = interp_cols; cl_float fparams[5]; fparams[0] = in_rows_first_val; fparams[1] = in_rows_last_val; fparams[2] = in_cols_first_val; fparams[3] = in_cols_last_val; fparams[4] = norm_ratio; /* Buffers */ cl_mem i_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_int) * 5, NULL, &status); CL_CHECK_ERROR(status); CL_CHECK_ERROR(clEnqueueWriteBuffer(command_queue, i_buffer, CL_FALSE, 0, sizeof(cl_int) * 5, iparams, 0, NULL, &events[4])); cl_mem f_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float) * 5, NULL, &status); CL_CHECK_ERROR(status); CL_CHECK_ERROR(clEnqueueWriteBuffer(command_queue, f_buffer, CL_FALSE, 0, sizeof(cl_float) * 5, fparams, 0, NULL, &events[5])); cl_mem output_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, data_length * sizeof(clFFT_Complex), NULL, &status); CL_CHECK_ERROR(status); /* Set arguments */ CL_CHECK_ERROR(clSetKernelArg(kernel_linear_interp, 0, sizeof(void *), (void *)&i_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_linear_interp, 1, sizeof(void *), (void *)&f_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_linear_interp, 2, sizeof(void *), (void *)&zeropad_fftshift_data_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_linear_interp, 3, sizeof(void *), (void *)&output_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_linear_interp, 4, sizeof(data_length), &data_length)); /* Run kernel */ status = clEnqueueNDRangeKernel(command_queue, kernel_linear_interp, 1, // work dimensional 1D, 2D, 3D NULL, // offset &global_work_size, // total number of WI &local_work_size, // nomber of WI in WG 3, // num events in wait list events + 3, // event wait list &events[6]); // event CL_CHECK_ERROR(status); //clFinish(command_queue); // Copy result from device to host /* clFFT_Complex *interpolated_spectrum = (clFFT_Complex *)clEnqueueMapBuffer(command_queue, output_buffer, CL_TRUE, CL_MAP_READ, 0, data_length * sizeof(clFFT_Complex), 0, NULL, NULL, NULL ); clFinish(command_queue); tiff_write_complex("resources/interpolated-sino.tif", interpolated_spectrum, size_s, size_s); */ /////////////////////////////////////////////////// /* Applying 2-D FFT to the interpolated spectrum */ /////////////////////////////////////////////////// /* Setup 2D IFFT */ clFFT_Dim3 sino_2dfft; sino_2dfft.x = size_s; sino_2dfft.y = size_s; sino_2dfft.z = 1; /* Create 2D IFFT plan */ clFFT_Plan plan_2difft = clFFT_CreatePlan(context, sino_2dfft, clFFT_2D, clFFT_InterleavedComplexFormat, &status); CL_CHECK_ERROR(status); /* Execute 2D IFFT */ cl_mem reconstructed_image_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, data_length * sizeof(clFFT_Complex), NULL, &status); CL_CHECK_ERROR(status); status = clFFT_ExecuteInterleaved(command_queue, plan_2difft, 1, clFFT_Inverse, output_buffer, reconstructed_image_buffer, 0, NULL, NULL); CL_CHECK_ERROR(status); // Copy result from device to host /* clFFT_Complex *ifft2d_interpolated_spectrum = (clFFT_Complex *)malloc(data_length * sizeof(clFFT_Complex)); clEnqueueReadBuffer(command_queue, reconstructed_image_buffer, CL_TRUE, 0, data_length * sizeof(clFFT_Complex), ifft2d_interpolated_spectrum, 0, NULL, NULL); tiff_write_complex("resources/ifft2d_interpolated_spectrum.tif", ifft2d_interpolated_spectrum, size_s, size_s); clFinish(command_queue); */ ///////////////////////////////////////////////// /* Applying 2-D fftshidt to the restored image */ ///////////////////////////////////////////////// /* Buffers */ cl_mem two_dim_fftshifted_data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, data_length * sizeof(clFFT_Complex), NULL, &status); CL_CHECK_ERROR(status); /* Set arguments */ cl_int inverse_flag = 0; CL_CHECK_ERROR(clSetKernelArg(kernel_2dshift, 0, sizeof(void *), (void *)&reconstructed_image_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_2dshift, 1, sizeof(void *), (void *)&two_dim_fftshifted_data_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_2dshift, 2, sizeof(interp_rows), &interp_rows)); CL_CHECK_ERROR(clSetKernelArg(kernel_2dshift, 3, sizeof(interp_cols), &interp_cols)); CL_CHECK_ERROR(clSetKernelArg(kernel_2dshift, 4, sizeof(inverse_flag), &inverse_flag)); /* Run kernel */ status = clEnqueueNDRangeKernel(command_queue, kernel_2dshift, 1, // work dimensional 1D, 2D, 3D NULL, // offset &global_work_size, // total number of WI &local_work_size, // number of WI in WG 1, // number events in wait list &events[6], // event wait list &events[7]); // event CL_CHECK_ERROR(status); /* Copy result from device to host */ /* clFFT_Complex *two_dim_fftshifted_data = (clFFT_Complex *)clEnqueueMapBuffer(command_queue, two_dim_fftshifted_data_buffer, CL_TRUE, CL_MAP_READ, 0, data_length * sizeof(clFFT_Complex), 0, NULL, NULL, NULL ); clFinish(command_queue); */ //////////////// /* Crop data */ /////////////// float lt_offset = 0, rb_offset = 0; int dif_sides = interp_cols - slice_size; if (dif_sides%2) { lt_offset = floor(dif_sides / 2.0); rb_offset = ceil(dif_sides / 2.0); } else { lt_offset = rb_offset = dif_sides / 2.0; } /* Buffers */ long cropped_data_length = slice_size * slice_size * sizeof(clFFT_Complex); cl_mem cropped_restored_image_data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, cropped_data_length, NULL, &status); CL_CHECK_ERROR(status); /* Set arguments */ CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 0, sizeof(void *), (void *)&two_dim_fftshifted_data_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 1, sizeof(void *), (void *)&cropped_restored_image_data_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 2, sizeof(slice_size), &slice_size)); CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 3, sizeof(interp_cols), &interp_cols)); CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 4, sizeof(lt_offset), <_offset)); CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 5, sizeof(rb_offset), &rb_offset)); /* Run kernel */ status = clEnqueueNDRangeKernel(command_queue, kernel_crop_data, 1, // work dimensional 1D, 2D, 3D NULL, // offset &global_work_size, // total number of WI &local_work_size, // number of WI in WG 1, // number events in wait list &events[7], // event wait list &events[8]); // event CL_CHECK_ERROR(status); CL_CHECK_ERROR(clFinish(command_queue)); clFFT_DestroyPlan(plan_2difft); clFFT_DestroyPlan(plan_1dfft_sinogram); //timing float ms = 0.0, total_ms = 0.0, global_ms = 0.0, deg = 1.0e-6f; /* Stop timer */ gettimeofday(&global_tim, NULL); t2_global = global_tim.tv_sec+(global_tim.tv_usec/1000000.0); printf("\n(Total time - timeofday) %f seconds elapsed\n", (t2_global-t1_global)*1000.0); cl_ulong start, end; CL_CHECK_ERROR(clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Sinogram shifting + Zeropadding write_op1):%f", ms); CL_CHECK_ERROR(clGetEventProfilingInfo(events[1], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[1], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Sinogram shifting + Zeropadding write_op2):%f", ms); CL_CHECK_ERROR(clGetEventProfilingInfo(events[2], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[2], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Sinogram shifting + Zeropadding):%f", ms); printf("\nTOTAL(Sinogram shifting + Zeropadding):%f\n", total_ms); global_ms += total_ms; total_ms = 0.0; CL_CHECK_ERROR(clGetEventProfilingInfo(events[2], CL_PROFILING_COMMAND_END,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[3], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Applying 1-D FFT):%f\n", total_ms); global_ms += total_ms; total_ms = 0.0; CL_CHECK_ERROR(clGetEventProfilingInfo(events[3], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[3], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Shift 1-D FFT data):%f\n", total_ms); global_ms += total_ms; total_ms = 0.0; CL_CHECK_ERROR(clGetEventProfilingInfo(events[4], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[4], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Data Interpolation write_op1):%f", ms); CL_CHECK_ERROR(clGetEventProfilingInfo(events[5], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[5], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Data Interpolation write_op2):%f", ms); CL_CHECK_ERROR(clGetEventProfilingInfo(events[6], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[6], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Data Interpolation):%f", ms); printf("\nTOTAL(Data Interpolation):%f\n", total_ms); global_ms += total_ms; total_ms = 0.0; CL_CHECK_ERROR(clGetEventProfilingInfo(events[6], CL_PROFILING_COMMAND_END,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[7], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Applying 2-D IFFT):%f\n", total_ms); global_ms += total_ms; total_ms = 0.0; CL_CHECK_ERROR(clGetEventProfilingInfo(events[7], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[7], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Applying 2-D Shift):%f\n", total_ms); global_ms += total_ms; total_ms = 0.0; CL_CHECK_ERROR(clGetEventProfilingInfo(events[8], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[8], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Cropping data):%f\n", total_ms); global_ms += total_ms; total_ms = 0.0; printf("\nTOTAL TIME:%f\n", global_ms); // Copy result from device to host clFFT_Complex *cropped_restored_image = (clFFT_Complex *)clEnqueueMapBuffer(command_queue, cropped_restored_image_data_buffer, CL_TRUE, CL_MAP_READ, 0, cropped_data_length, 0, NULL, NULL, NULL ); /* Write the restored slice */ tiff_write_complex(tiff_output, cropped_restored_image, slice_size, slice_size); }
int runTest(clFFT_Dim3 n, int batchSize, clFFT_Direction dir, clFFT_Dimension dim, clFFT_DataFormat dataFormat, int numIter, clFFT_TestType testType) { cl_int err = CL_SUCCESS; int iter; double t; uint64_t t0, t1; int mx = log2(n.x); int my = log2(n.y); int mz = log2(n.z); int length = n.x * n.y * n.z * batchSize; double gflops = 5e-9 * ((double)mx + (double)my + (double)mz) * (double)n.x * (double)n.y * (double)n.z * (double)batchSize * (double)numIter; clFFT_SplitComplex data_i_split = (clFFT_SplitComplex) { NULL, NULL }; clFFT_SplitComplex data_cl_split = (clFFT_SplitComplex) { NULL, NULL }; clFFT_Complex *data_i = NULL; clFFT_Complex *data_cl = NULL; clFFT_SplitComplexDouble data_iref = (clFFT_SplitComplexDouble) { NULL, NULL }; clFFT_SplitComplexDouble data_oref = (clFFT_SplitComplexDouble) { NULL, NULL }; clFFT_Plan plan = NULL; cl_mem data_in = NULL; cl_mem data_out = NULL; cl_mem data_in_real = NULL; cl_mem data_in_imag = NULL; cl_mem data_out_real = NULL; cl_mem data_out_imag = NULL; if(dataFormat == clFFT_SplitComplexFormat) { data_i_split.real = (float *) malloc(sizeof(float) * length); data_i_split.imag = (float *) malloc(sizeof(float) * length); data_cl_split.real = (float *) malloc(sizeof(float) * length); data_cl_split.imag = (float *) malloc(sizeof(float) * length); if(!data_i_split.real || !data_i_split.imag || !data_cl_split.real || !data_cl_split.imag) { err = -1; log_error("Out-of-Resources\n"); goto cleanup; } } else { data_i = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length); data_cl = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length); if(!data_i || !data_cl) { err = -2; log_error("Out-of-Resouces\n"); goto cleanup; } } data_iref.real = (double *) malloc(sizeof(double) * length); data_iref.imag = (double *) malloc(sizeof(double) * length); data_oref.real = (double *) malloc(sizeof(double) * length); data_oref.imag = (double *) malloc(sizeof(double) * length); if(!data_iref.real || !data_iref.imag || !data_oref.real || !data_oref.imag) { err = -3; log_error("Out-of-Resources\n"); goto cleanup; } int i; if(dataFormat == clFFT_SplitComplexFormat) { for(i = 0; i < length; i++) { data_i_split.real[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; data_i_split.imag[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; data_cl_split.real[i] = 0.0f; data_cl_split.imag[i] = 0.0f; data_iref.real[i] = data_i_split.real[i]; data_iref.imag[i] = data_i_split.imag[i]; data_oref.real[i] = data_iref.real[i]; data_oref.imag[i] = data_iref.imag[i]; } } else { for(i = 0; i < length; i++) { data_i[i].real = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; data_i[i].imag = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; data_cl[i].real = 0.0f; data_cl[i].imag = 0.0f; data_iref.real[i] = data_i[i].real; data_iref.imag[i] = data_i[i].imag; data_oref.real[i] = data_iref.real[i]; data_oref.imag[i] = data_iref.imag[i]; } } plan = clFFT_CreatePlan( context, n, dim, dataFormat, &err ); if(!plan || err) { log_error("clFFT_CreatePlan failed\n"); goto cleanup; } //clFFT_DumpPlan(plan, stdout); if(dataFormat == clFFT_SplitComplexFormat) { data_in_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.real, &err); if(!data_in_real || err) { log_error("clCreateBuffer failed\n"); goto cleanup; } data_in_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.imag, &err); if(!data_in_imag || err) { log_error("clCreateBuffer failed\n"); goto cleanup; } if(testType == clFFT_OUT_OF_PLACE) { data_out_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.real, &err); if(!data_out_real || err) { log_error("clCreateBuffer failed\n"); goto cleanup; } data_out_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.imag, &err); if(!data_out_imag || err) { log_error("clCreateBuffer failed\n"); goto cleanup; } } else { data_out_real = data_in_real; data_out_imag = data_in_imag; } } else { data_in = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_i, &err); if(!data_in) { log_error("clCreateBuffer failed\n"); goto cleanup; } if(testType == clFFT_OUT_OF_PLACE) { data_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_cl, &err); if(!data_out) { log_error("clCreateBuffer failed\n"); goto cleanup; } } else data_out = data_in; } err = CL_SUCCESS; t0 = mach_absolute_time(); if(dataFormat == clFFT_SplitComplexFormat) { for(iter = 0; iter < numIter; iter++) err |= clFFT_ExecutePlannar(queue, plan, batchSize, dir, data_in_real, data_in_imag, data_out_real, data_out_imag, 0, NULL, NULL); } else { for(iter = 0; iter < numIter; iter++) err |= clFFT_ExecuteInterleaved(queue, plan, batchSize, dir, data_in, data_out, 0, NULL, NULL); } err |= clFinish(queue); if(err) { log_error("clFFT_Execute\n"); goto cleanup; } t1 = mach_absolute_time(); t = subtractTimes(t1, t0); char temp[100]; sprintf(temp, "GFlops achieved for n = (%d, %d, %d), batchsize = %d", n.x, n.y, n.z, batchSize); log_perf(gflops / (float) t, 1, "GFlops/s", "%s", temp); if(dataFormat == clFFT_SplitComplexFormat) { err |= clEnqueueReadBuffer(queue, data_out_real, CL_TRUE, 0, length*sizeof(float), data_cl_split.real, 0, NULL, NULL); err |= clEnqueueReadBuffer(queue, data_out_imag, CL_TRUE, 0, length*sizeof(float), data_cl_split.imag, 0, NULL, NULL); } else { err |= clEnqueueReadBuffer(queue, data_out, CL_TRUE, 0, length*sizeof(float)*2, data_cl, 0, NULL, NULL); } if(err) { log_error("clEnqueueReadBuffer failed\n"); goto cleanup; } computeReferenceD(&data_oref, n, batchSize, dim, dir); double diff_avg, diff_max, diff_min; if(dataFormat == clFFT_SplitComplexFormat) { diff_avg = computeL2Error(&data_cl_split, &data_oref, n.x*n.y*n.z, batchSize, &diff_max, &diff_min); if(diff_avg > eps_avg) log_error("Test failed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); else log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); } else { clFFT_SplitComplex result_split; result_split.real = (float *) malloc(length*sizeof(float)); result_split.imag = (float *) malloc(length*sizeof(float)); convertInterleavedToSplit(&result_split, data_cl, length); diff_avg = computeL2Error(&result_split, &data_oref, n.x*n.y*n.z, batchSize, &diff_max, &diff_min); if(diff_avg > eps_avg) log_error("Test failed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); else log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); free(result_split.real); free(result_split.imag); } cleanup: clFFT_DestroyPlan(plan); if(dataFormat == clFFT_SplitComplexFormat) { if(data_i_split.real) free(data_i_split.real); if(data_i_split.imag) free(data_i_split.imag); if(data_cl_split.real) free(data_cl_split.real); if(data_cl_split.imag) free(data_cl_split.imag); if(data_in_real) clReleaseMemObject(data_in_real); if(data_in_imag) clReleaseMemObject(data_in_imag); if(data_out_real && testType == clFFT_OUT_OF_PLACE) clReleaseMemObject(data_out_real); if(data_out_imag && clFFT_OUT_OF_PLACE) clReleaseMemObject(data_out_imag); } else { if(data_i) free(data_i); if(data_cl) free(data_cl); if(data_in) clReleaseMemObject(data_in); if(data_out && testType == clFFT_OUT_OF_PLACE) clReleaseMemObject(data_out); } if(data_iref.real) free(data_iref.real); if(data_iref.imag) free(data_iref.imag); if(data_oref.real) free(data_oref.real); if(data_oref.imag) free(data_oref.imag); return err; }