void ClFFT::init( Ptr<cl::Context> context, Ptr<cl::CommandQueue> queue, clFFT_Dimension& dim, clFFT_Dim3& dim3 ) { m_context = context; m_queue = queue; m_size = uint3(dim3.x, dim3.y, dim3.z); cl_int err = CL_SUCCESS; *m_fftPlan = clFFT_CreatePlan((*m_context)(), clFFT_Dim3(m_size.x, m_size.y, 1), clFFT_2D, clFFT_SplitComplexFormat, &err); if(!*m_fftPlan || err) throw cl::Error(ERR_OPENCL, "clFFT_CreatePlan"); }
cl_int ufo_fft_update (UfoFft *fft, cl_context context, cl_command_queue queue, UfoFftParameter *param) { gboolean changed; cl_int error; error = CL_SUCCESS; changed = param->size[0] != fft->seen.size[0] || param->size[1] != fft->seen.size[1]; if (changed) memcpy (&fft->seen, param, sizeof (UfoFftParameter)); #ifdef HAVE_AMD if (fft->amd_plan == 0 || changed) { /* we use param->dimension to index into this array! */ clfftDim dimension[4] = { 0, CLFFT_1D, CLFFT_2D, CLFFT_3D }; if (fft->amd_plan != 0) { clfftDestroyPlan (&fft->amd_plan); fft->amd_plan = 0; } UFO_RESOURCES_CHECK_CLERR (clfftCreateDefaultPlan (&fft->amd_plan, context, dimension[param->dimensions], param->size)); UFO_RESOURCES_CHECK_CLERR (clfftSetPlanBatchSize (fft->amd_plan, param->batch)); UFO_RESOURCES_CHECK_CLERR (clfftSetPlanPrecision (fft->amd_plan, CLFFT_SINGLE)); UFO_RESOURCES_CHECK_CLERR (clfftSetLayout (fft->amd_plan, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED)); UFO_RESOURCES_CHECK_CLERR (clfftSetResultLocation (fft->amd_plan, param->zeropad ? CLFFT_INPLACE : CLFFT_OUTOFPLACE)); UFO_RESOURCES_CHECK_CLERR (clfftBakePlan (fft->amd_plan, 1, &queue, NULL, NULL)); } #else if (fft->apple_plan == NULL || changed) { clFFT_Dim3 size; /* we use param->dimension to index into this array! */ clFFT_Dimension dimension[4] = { 0, clFFT_1D, clFFT_2D, clFFT_3D }; size.x = param->size[0]; size.y = param->size[1]; size.z = param->size[2]; if (fft->apple_plan != NULL) { clFFT_DestroyPlan (fft->apple_plan); fft->apple_plan = NULL; } fft->apple_plan = clFFT_CreatePlan (context, size, dimension[param->dimensions], clFFT_InterleavedComplexFormat, &error); } #endif return error; }
clFFT_Plan CLFFTKernelBuffer::getPlan(cl_context c, unsigned int n, cl_int& error) { if (kernels.find(n) != kernels.end()) { error = CL_SUCCESS; return kernels[n]; } TaskTimer tt("Creating an OpenCL FFT compute plan for n=%u", n); clFFT_Dim3 ndim = { n, 1, 1 }; clFFT_Plan plan = clFFT_CreatePlan(c, ndim, clFFT_1D, clFFT_InterleavedComplexFormat, &error); if (error == CL_SUCCESS) kernels[n] = plan; return plan; }
void dfi_process_sinogram(const char* tiff_input, const char* tiff_output, int center_rotation) { cl_event events[11]; if(!tiff_input) { printf("The filename of input is not valid. (pointer tiff_input = %p)", tiff_input); return; } if(!tiff_output) { printf("The filename of output is not valid. (pointer tiff_output = %p)", tiff_output); return; } ///////////////////// /* Input Data Part */ ///////////////////// /* Input a slice properties */ int bits_per_sample; int samples_per_pixel; int theta_size; int slice_size; /* Read the slice */ clFFT_Complex *data_tiff = tiff_read_complex(tiff_input, center_rotation, &bits_per_sample, &samples_per_pixel, &slice_size, &theta_size); //tiff_write_complex("resources/initial-sino.tif", data_tiff, slice_size, theta_size); /* * OpenCL */ printf("Hey!1\n"); cl_int status = CL_SUCCESS; cl_platform_id platform; printf("Hey!1.2\n"); CL_CHECK_ERROR(clGetPlatformIDs(1, &platform, NULL)); printf("Hey!2\n"); cl_device_id devices[10]; // Compute device cl_context context; // Compute context cl_uint n_devices = 0; printf("@Hey!3\n"); #if GPU printf("@Hey!GPU Choosed\n"); CL_CHECK_ERROR(clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 3, devices, &n_devices)); #else printf("@Hey!CPU Choosed\n"); CL_CHECK_ERROR(clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 3, devices, &n_devices)); #endif //cl_device_id current_device = devices[0]; #define current_device devices[0] printf("Hey!3.1 n_devices %d\n", n_devices); context = clCreateContext(NULL, 1, devices, NULL, NULL, &status); printf("Hey!3.2\n"); CL_CHECK_ERROR(status); /* * Device */ printf("Hey!3.3\n"); cl_int device_max_cu = 0; CL_CHECK_ERROR(clGetDeviceInfo(current_device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &device_max_cu, NULL)); size_t wg_count = device_max_cu * wavefronts_per_SIMD; size_t global_work_size = wg_count * local_work_size; printf("Hey!3.4\n"); /* * Queues, Kernels */ cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE; cl_command_queue command_queue = clCreateCommandQueue(context, current_device, properties, &status); CL_CHECK_ERROR(status); printf("Hey!3.5\n"); cl_kernel kernel_linear_interp = get_kernel("linear_interp", &context, ¤t_device); cl_kernel kernel_zero_ifftshift = get_kernel("zero_ifftshift", &context, ¤t_device); cl_kernel kernel_fftshift = get_kernel("fftshift", &context, ¤t_device); cl_kernel kernel_2dshift = get_kernel("shift2d", &context, ¤t_device); cl_kernel kernel_crop_data = get_kernel("crop_data", &context, ¤t_device); printf("@Hey!3.6\n\n"); //////////////////////// /* OpenCL - DFI Part */ //////////////////////// /* Reconstruction properties */ int oversampling_ratio = 2; int dx = 1; /* zoom times */ //int size_s = slice_size * oversampling_ratio; int min_theta = 0; int max_theta = theta_size - 1; int size_zeropad_s = pow(2, ceil(log2((float)slice_size))); /* get length of FFT operations */ int size_s = size_zeropad_s; float d_omega_s = 2 * M_PI / (size_zeropad_s * dx); //normalized ratio [0; 2PI] /* Start timer */ timeval global_tim; gettimeofday(&global_tim, NULL); double t1_global = global_tim.tv_sec + (global_tim.tv_usec/1000000.0), t2_global = 0.0; ///////////////////////////////////// /* Sinogram shifting + Zeropadding */ ///////////////////////////////////// long data_size = slice_size * theta_size * sizeof(clFFT_Complex); printf("6 "); long zeropad_data_size = theta_size * size_zeropad_s * sizeof(clFFT_Complex); /* Buffers */ cl_mem original_data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, data_size, NULL, &status); CL_CHECK_ERROR(status); CL_CHECK_ERROR(clEnqueueWriteBuffer(command_queue, original_data_buffer, CL_FALSE, 0, data_size, data_tiff, 0, NULL, &events[0])); cl_mem zeropad_ifftshift_data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, zeropad_data_size, NULL, &status); CL_CHECK_ERROR(status); float *zero_out = (float *)g_malloc0(zeropad_data_size); CL_CHECK_ERROR(clEnqueueWriteBuffer(command_queue, zeropad_ifftshift_data_buffer, CL_FALSE, 0, zeropad_data_size, zero_out, 0, NULL, &events[1])); /* Set arguments */ CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 0, sizeof(void *), (void *)&original_data_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 1, sizeof(theta_size), &theta_size)); CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 2, sizeof(slice_size), &slice_size)); CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 3, sizeof(void *), (void *)&zeropad_ifftshift_data_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 4, sizeof(theta_size), &theta_size)); CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 5, sizeof(size_zeropad_s), &size_zeropad_s)); /* Run kernel */ status = clEnqueueNDRangeKernel(command_queue, kernel_zero_ifftshift, 1, // work dimensional 1D, 2D, 3D NULL, // offset &global_work_size, // total number of WI &local_work_size, // number of WI in WG 2, // number events in wait list events, // event wait list &events[2]); // event CL_CHECK_ERROR(status); // Copy result from device to host /* clFFT_Complex *fur_kernel_sino = (clFFT_Complex *)clEnqueueMapBuffer(command_queue, zeropad_ifftshift_data_buffer, CL_TRUE, CL_MAP_READ, 0, zeropad_data_size, 0, NULL, NULL, NULL ); clFinish(command_queue); tiff_write_complex("resources/zeropad-sino.tif", fur_kernel_sino, size_zeropad_s, theta_size); */ //////////////////////////////////////////////////////////////////////// /* Applying 1-D FFT to the each strip of the sinogram and shifting it */ //////////////////////////////////////////////////////////////////////// cl_mem zeropadded_1dfft_data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, zeropad_data_size, NULL, &status); CL_CHECK_ERROR(status); /* Setup clAmdFft */ clFFT_Dim3 sino_fft; sino_fft.x = size_zeropad_s; sino_fft.y = 1; sino_fft.z = 1; /* Create FFT plan */ clFFT_Plan plan_1dfft_sinogram = clFFT_CreatePlan(context, sino_fft, clFFT_1D, clFFT_InterleavedComplexFormat, &status); CL_CHECK_ERROR(status); /* Execute FFT */ status = clFFT_ExecuteInterleaved(command_queue, plan_1dfft_sinogram, theta_size, clFFT_Forward, zeropad_ifftshift_data_buffer, zeropadded_1dfft_data_buffer, 0, NULL, NULL); CL_CHECK_ERROR(status); // Free FFT plan //clFFT_DestroyPlan(plan_1dfft_sinogram); // Copy result from device to host /* clFFT_Complex *fourier_kernel_sinogram = (clFFT_Complex *)malloc(zeropad_data_size); clEnqueueReadBuffer(command_queue, zeropadded_1dfft_data_buffer, CL_TRUE, 0, zeropad_data_size, fourier_kernel_sinogram, 0, NULL, NULL); clFinish(command_queue); tiff_write_complex("resources/1dfft-sino.tif", fourier_kernel_sinogram, size_zeropad_s, theta_size); */ /////////////////// /* Make fftshift */ /////////////////// /* Buffers */ cl_mem zeropad_fftshift_data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, zeropad_data_size, NULL, &status); CL_CHECK_ERROR(status); /* Set arguments */ CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 0, sizeof(void *), (void *)&zeropadded_1dfft_data_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 1, sizeof(theta_size), &theta_size)); CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 2, sizeof(size_zeropad_s), &size_zeropad_s)); CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 3, sizeof(void *), (void *)&zeropad_fftshift_data_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 4, sizeof(theta_size), &theta_size)); CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 5, sizeof(size_zeropad_s), &size_zeropad_s)); /* Run kernel */ status = clEnqueueNDRangeKernel(command_queue, kernel_fftshift, 1, // work dimensional 1D, 2D, 3D NULL, // offset &global_work_size, // total number of WI &local_work_size, // number of WI in WG 0, // number events in wait list NULL, // event wait list &events[3]); // event CL_CHECK_ERROR(status); /* Copy result from device to host */ /* clFFT_Complex *fur_kernel_fftshift_sino = (clFFT_Complex *)clEnqueueMapBuffer(command_queue, zeropad_fftshift_data_buffer, CL_TRUE, CL_MAP_READ,0,zeropad_data_size, 0, NULL, NULL, NULL ); clFinish(command_queue); tiff_write_complex("resources/fftshift-sino.tif", fur_kernel_fftshift_sino, size_zeropad_s, theta_size); */ //////////////////////// /* Data Interpolation */ //////////////////////// /* Performing Interpolation */ cl_long data_length = size_s * size_s; cl_int in_rows = theta_size; cl_int in_cols = size_zeropad_s; cl_float norm_ratio = d_omega_s/dx; cl_float in_rows_first_val = min_theta; cl_float in_rows_last_val = max_theta; cl_float in_cols_first_val = (-in_cols/2)*norm_ratio; cl_float in_cols_last_val = (in_cols/2-1)*norm_ratio; cl_int interp_rows = size_s; cl_int interp_cols = interp_rows; cl_int iparams[5]; iparams[0] = in_rows; iparams[1] = in_cols; iparams[2] = dx; iparams[3] = interp_rows; iparams[4] = interp_cols; cl_float fparams[5]; fparams[0] = in_rows_first_val; fparams[1] = in_rows_last_val; fparams[2] = in_cols_first_val; fparams[3] = in_cols_last_val; fparams[4] = norm_ratio; /* Buffers */ cl_mem i_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_int) * 5, NULL, &status); CL_CHECK_ERROR(status); CL_CHECK_ERROR(clEnqueueWriteBuffer(command_queue, i_buffer, CL_FALSE, 0, sizeof(cl_int) * 5, iparams, 0, NULL, &events[4])); cl_mem f_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float) * 5, NULL, &status); CL_CHECK_ERROR(status); CL_CHECK_ERROR(clEnqueueWriteBuffer(command_queue, f_buffer, CL_FALSE, 0, sizeof(cl_float) * 5, fparams, 0, NULL, &events[5])); cl_mem output_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, data_length * sizeof(clFFT_Complex), NULL, &status); CL_CHECK_ERROR(status); /* Set arguments */ CL_CHECK_ERROR(clSetKernelArg(kernel_linear_interp, 0, sizeof(void *), (void *)&i_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_linear_interp, 1, sizeof(void *), (void *)&f_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_linear_interp, 2, sizeof(void *), (void *)&zeropad_fftshift_data_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_linear_interp, 3, sizeof(void *), (void *)&output_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_linear_interp, 4, sizeof(data_length), &data_length)); /* Run kernel */ status = clEnqueueNDRangeKernel(command_queue, kernel_linear_interp, 1, // work dimensional 1D, 2D, 3D NULL, // offset &global_work_size, // total number of WI &local_work_size, // nomber of WI in WG 3, // num events in wait list events + 3, // event wait list &events[6]); // event CL_CHECK_ERROR(status); //clFinish(command_queue); // Copy result from device to host /* clFFT_Complex *interpolated_spectrum = (clFFT_Complex *)clEnqueueMapBuffer(command_queue, output_buffer, CL_TRUE, CL_MAP_READ, 0, data_length * sizeof(clFFT_Complex), 0, NULL, NULL, NULL ); clFinish(command_queue); tiff_write_complex("resources/interpolated-sino.tif", interpolated_spectrum, size_s, size_s); */ /////////////////////////////////////////////////// /* Applying 2-D FFT to the interpolated spectrum */ /////////////////////////////////////////////////// /* Setup 2D IFFT */ clFFT_Dim3 sino_2dfft; sino_2dfft.x = size_s; sino_2dfft.y = size_s; sino_2dfft.z = 1; /* Create 2D IFFT plan */ clFFT_Plan plan_2difft = clFFT_CreatePlan(context, sino_2dfft, clFFT_2D, clFFT_InterleavedComplexFormat, &status); CL_CHECK_ERROR(status); /* Execute 2D IFFT */ cl_mem reconstructed_image_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, data_length * sizeof(clFFT_Complex), NULL, &status); CL_CHECK_ERROR(status); status = clFFT_ExecuteInterleaved(command_queue, plan_2difft, 1, clFFT_Inverse, output_buffer, reconstructed_image_buffer, 0, NULL, NULL); CL_CHECK_ERROR(status); // Copy result from device to host /* clFFT_Complex *ifft2d_interpolated_spectrum = (clFFT_Complex *)malloc(data_length * sizeof(clFFT_Complex)); clEnqueueReadBuffer(command_queue, reconstructed_image_buffer, CL_TRUE, 0, data_length * sizeof(clFFT_Complex), ifft2d_interpolated_spectrum, 0, NULL, NULL); tiff_write_complex("resources/ifft2d_interpolated_spectrum.tif", ifft2d_interpolated_spectrum, size_s, size_s); clFinish(command_queue); */ ///////////////////////////////////////////////// /* Applying 2-D fftshidt to the restored image */ ///////////////////////////////////////////////// /* Buffers */ cl_mem two_dim_fftshifted_data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, data_length * sizeof(clFFT_Complex), NULL, &status); CL_CHECK_ERROR(status); /* Set arguments */ cl_int inverse_flag = 0; CL_CHECK_ERROR(clSetKernelArg(kernel_2dshift, 0, sizeof(void *), (void *)&reconstructed_image_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_2dshift, 1, sizeof(void *), (void *)&two_dim_fftshifted_data_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_2dshift, 2, sizeof(interp_rows), &interp_rows)); CL_CHECK_ERROR(clSetKernelArg(kernel_2dshift, 3, sizeof(interp_cols), &interp_cols)); CL_CHECK_ERROR(clSetKernelArg(kernel_2dshift, 4, sizeof(inverse_flag), &inverse_flag)); /* Run kernel */ status = clEnqueueNDRangeKernel(command_queue, kernel_2dshift, 1, // work dimensional 1D, 2D, 3D NULL, // offset &global_work_size, // total number of WI &local_work_size, // number of WI in WG 1, // number events in wait list &events[6], // event wait list &events[7]); // event CL_CHECK_ERROR(status); /* Copy result from device to host */ /* clFFT_Complex *two_dim_fftshifted_data = (clFFT_Complex *)clEnqueueMapBuffer(command_queue, two_dim_fftshifted_data_buffer, CL_TRUE, CL_MAP_READ, 0, data_length * sizeof(clFFT_Complex), 0, NULL, NULL, NULL ); clFinish(command_queue); */ //////////////// /* Crop data */ /////////////// float lt_offset = 0, rb_offset = 0; int dif_sides = interp_cols - slice_size; if (dif_sides%2) { lt_offset = floor(dif_sides / 2.0); rb_offset = ceil(dif_sides / 2.0); } else { lt_offset = rb_offset = dif_sides / 2.0; } /* Buffers */ long cropped_data_length = slice_size * slice_size * sizeof(clFFT_Complex); cl_mem cropped_restored_image_data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, cropped_data_length, NULL, &status); CL_CHECK_ERROR(status); /* Set arguments */ CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 0, sizeof(void *), (void *)&two_dim_fftshifted_data_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 1, sizeof(void *), (void *)&cropped_restored_image_data_buffer)); CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 2, sizeof(slice_size), &slice_size)); CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 3, sizeof(interp_cols), &interp_cols)); CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 4, sizeof(lt_offset), <_offset)); CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 5, sizeof(rb_offset), &rb_offset)); /* Run kernel */ status = clEnqueueNDRangeKernel(command_queue, kernel_crop_data, 1, // work dimensional 1D, 2D, 3D NULL, // offset &global_work_size, // total number of WI &local_work_size, // number of WI in WG 1, // number events in wait list &events[7], // event wait list &events[8]); // event CL_CHECK_ERROR(status); CL_CHECK_ERROR(clFinish(command_queue)); clFFT_DestroyPlan(plan_2difft); clFFT_DestroyPlan(plan_1dfft_sinogram); //timing float ms = 0.0, total_ms = 0.0, global_ms = 0.0, deg = 1.0e-6f; /* Stop timer */ gettimeofday(&global_tim, NULL); t2_global = global_tim.tv_sec+(global_tim.tv_usec/1000000.0); printf("\n(Total time - timeofday) %f seconds elapsed\n", (t2_global-t1_global)*1000.0); cl_ulong start, end; CL_CHECK_ERROR(clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Sinogram shifting + Zeropadding write_op1):%f", ms); CL_CHECK_ERROR(clGetEventProfilingInfo(events[1], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[1], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Sinogram shifting + Zeropadding write_op2):%f", ms); CL_CHECK_ERROR(clGetEventProfilingInfo(events[2], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[2], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Sinogram shifting + Zeropadding):%f", ms); printf("\nTOTAL(Sinogram shifting + Zeropadding):%f\n", total_ms); global_ms += total_ms; total_ms = 0.0; CL_CHECK_ERROR(clGetEventProfilingInfo(events[2], CL_PROFILING_COMMAND_END,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[3], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Applying 1-D FFT):%f\n", total_ms); global_ms += total_ms; total_ms = 0.0; CL_CHECK_ERROR(clGetEventProfilingInfo(events[3], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[3], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Shift 1-D FFT data):%f\n", total_ms); global_ms += total_ms; total_ms = 0.0; CL_CHECK_ERROR(clGetEventProfilingInfo(events[4], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[4], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Data Interpolation write_op1):%f", ms); CL_CHECK_ERROR(clGetEventProfilingInfo(events[5], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[5], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Data Interpolation write_op2):%f", ms); CL_CHECK_ERROR(clGetEventProfilingInfo(events[6], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[6], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Data Interpolation):%f", ms); printf("\nTOTAL(Data Interpolation):%f\n", total_ms); global_ms += total_ms; total_ms = 0.0; CL_CHECK_ERROR(clGetEventProfilingInfo(events[6], CL_PROFILING_COMMAND_END,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[7], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Applying 2-D IFFT):%f\n", total_ms); global_ms += total_ms; total_ms = 0.0; CL_CHECK_ERROR(clGetEventProfilingInfo(events[7], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[7], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Applying 2-D Shift):%f\n", total_ms); global_ms += total_ms; total_ms = 0.0; CL_CHECK_ERROR(clGetEventProfilingInfo(events[8], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL)); CL_CHECK_ERROR(clGetEventProfilingInfo(events[8], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); ms = (end - start) * deg; total_ms += ms; printf("\n(Cropping data):%f\n", total_ms); global_ms += total_ms; total_ms = 0.0; printf("\nTOTAL TIME:%f\n", global_ms); // Copy result from device to host clFFT_Complex *cropped_restored_image = (clFFT_Complex *)clEnqueueMapBuffer(command_queue, cropped_restored_image_data_buffer, CL_TRUE, CL_MAP_READ, 0, cropped_data_length, 0, NULL, NULL, NULL ); /* Write the restored slice */ tiff_write_complex(tiff_output, cropped_restored_image, slice_size, slice_size); }
void CLSimulator::initializeClFFT() { /* x x x x * x x x x * x x x x * x x x x */ for (size_t x_idx = 0, x_val = _nX - 1; x_idx < _nX; ++x_idx, --x_val) { for (size_t y_idx = 0, y_val = _nY - 1; y_idx < _nY; ++y_idx, --y_val) { float distance = sqrt(pow(float(x_val), 2.0f) + pow(float(y_val), 2.0f)); _distances_real[x_idx + y_idx * _nFFTx] = _f_w_EE((float(distance))); } } /* v v x x * v v x x * x x x x * x x x x */ for (size_t x_idx = 0, x_val = _nX - 1; x_idx < _nX; ++x_idx, --x_val) { for (size_t y_idx = _nY, y_val = 1; y_idx < _nFFTy - 1; ++y_idx, ++y_val) { float distance = sqrt(pow(float(x_val), 2.0f) + pow(float(y_val), 2.0f)); _distances_real[x_idx + y_idx * _nFFTx] = _f_w_EE((float(distance))); } } /* v v v x * v v v x * x x x x * x x x x */ if (_nY > 1) { for (size_t x_idx = 0; x_idx < _nFFTx; ++x_idx) { _distances_real[x_idx + (_nFFTy - 1) * _nFFTx] = 0; } } /* v v v 0 * v v v 0 * x x x 0 * x x x 0 */ for (size_t x_idx = _nX, x_val = 1; x_idx < _nFFTx - 1; ++x_idx, ++x_val) { for (size_t y_idx = 0, y_val = _nY - 1; y_idx < _nY; ++y_idx, --y_val) { float distance = sqrt(pow(float(x_val), 2.0f) + pow(float(y_val), 2.0f)); _distances_real[x_idx + y_idx * _nFFTx] = _f_w_EE((float(distance))); } } /* v v v 0 * v v v 0 * v v x 0 * x x x 0 */ for (size_t y_idx = 0; y_idx < _nFFTy; ++y_idx) { _distances_real[(_nFFTx - 1) + y_idx * _nFFTx] = 0; } /* v v v 0 * v v v 0 * v v x 0 * 0 0 0 0 */ for (size_t x_idx = _nX, x_val = 1; x_idx < _nFFTx - 1; ++x_idx, ++x_val) { for (size_t y_idx = _nY, y_val = 1; y_idx < _nFFTy - 1; ++y_idx, ++y_val) { float distance = sqrt(pow(float(x_val), 2.0f) + pow(float(y_val), 2.0f)); _distances_real[x_idx + y_idx * _nFFTx] = _f_w_EE((float(distance))); } } /* v v v 0 * v v v 0 * v v v 0 * 0 0 0 0 */ assert(isPowerOfTwo(_nFFT)); assert(_nX >= 1 && _nY >= 1 && _nZ >= 1); assert((_nX >= _nY) && (_nY >= _nZ)); clFFT_Dim3 n = { static_cast<unsigned int>(_nFFTx), static_cast<unsigned int>(_nFFTy), static_cast<unsigned int>(_nFFTz) }; clFFT_DataFormat dataFormat = clFFT_SplitComplexFormat; clFFT_Dimension dim; if (_nY == 1) { dim = clFFT_1D; } else if (_nZ == 1) { dim = clFFT_2D; } else { dim = clFFT_3D; } _p_cl = clFFT_CreatePlan(_wrapper.getContextC(), n, dim, dataFormat, &_err); handleClError(_err); _distances_real_cl = cl::Buffer(_wrapper.getContext(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, _nFFT * sizeof(float), _distances_real.get(), &_err); _distances_imag_cl = cl::Buffer(_wrapper.getContext(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, _nFFT * sizeof(float), _zeros.get(), &_err); _sVals_real_cl = cl::Buffer(_wrapper.getContext(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, _nFFT * sizeof(float), _zeros.get(), &_err); _sVals_imag_cl = cl::Buffer(_wrapper.getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, _nFFT * sizeof(float), _zeros.get(), &_err); _convolution_real_cl = cl::Buffer(_wrapper.getContext(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, _nFFT * sizeof(float), _zeros.get(), &_err); _convolution_imag_cl = cl::Buffer(_wrapper.getContext(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, _nFFT * sizeof(float), _zeros.get(), &_err); _distances_f_real_cl = cl::Buffer(_wrapper.getContext(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, _nFFT * sizeof(float), _zeros.get(), &_err); _distances_f_imag_cl = cl::Buffer(_wrapper.getContext(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, _nFFT * sizeof(float), _zeros.get(), &_err); _sVals_f_real_cl = cl::Buffer(_wrapper.getContext(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, _nFFT * sizeof(float), _zeros.get(), &_err); _sVals_f_imag_cl = cl::Buffer(_wrapper.getContext(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, _nFFT * sizeof(float), _zeros.get(), &_err); _convolution_f_real_cl = cl::Buffer(_wrapper.getContext(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, _nFFT * sizeof(float), _zeros.get(), &_err); _convolution_f_imag_cl = cl::Buffer(_wrapper.getContext(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, _nFFT * sizeof(float), _zeros.get(), &_err); _kernel_convolution = cl::Kernel(_program, "convolution", &_err); handleClError(_kernel_convolution.setArg(0, _convolution_f_real_cl)); handleClError(_kernel_convolution.setArg(1, _convolution_f_imag_cl)); handleClError(_kernel_convolution.setArg(2, _distances_f_real_cl)); handleClError(_kernel_convolution.setArg(3, _distances_f_imag_cl)); handleClError(_kernel_convolution.setArg(4, _sVals_f_real_cl)); handleClError(_kernel_convolution.setArg(5, _sVals_f_imag_cl)); handleClError(_kernel_convolution.setArg(6, _scaleFFT)); handleClError(clFFT_ExecutePlannar(_wrapper.getQueueC(), _p_cl, 1, clFFT_Forward, _distances_real_cl(), _distances_imag_cl(), _distances_f_real_cl(), _distances_f_imag_cl(), 0, NULL, NULL)); _wrapper.getQueue().finish(); }
int runTest(clFFT_Dim3 n, int batchSize, clFFT_Direction dir, clFFT_Dimension dim, clFFT_DataFormat dataFormat, int numIter, clFFT_TestType testType) { cl_int err = CL_SUCCESS; int iter; double t; uint64_t t0, t1; int mx = log2(n.x); int my = log2(n.y); int mz = log2(n.z); int length = n.x * n.y * n.z * batchSize; double gflops = 5e-9 * ((double)mx + (double)my + (double)mz) * (double)n.x * (double)n.y * (double)n.z * (double)batchSize * (double)numIter; clFFT_SplitComplex data_i_split = (clFFT_SplitComplex) { NULL, NULL }; clFFT_SplitComplex data_cl_split = (clFFT_SplitComplex) { NULL, NULL }; clFFT_Complex *data_i = NULL; clFFT_Complex *data_cl = NULL; clFFT_SplitComplexDouble data_iref = (clFFT_SplitComplexDouble) { NULL, NULL }; clFFT_SplitComplexDouble data_oref = (clFFT_SplitComplexDouble) { NULL, NULL }; clFFT_Plan plan = NULL; cl_mem data_in = NULL; cl_mem data_out = NULL; cl_mem data_in_real = NULL; cl_mem data_in_imag = NULL; cl_mem data_out_real = NULL; cl_mem data_out_imag = NULL; if(dataFormat == clFFT_SplitComplexFormat) { data_i_split.real = (float *) malloc(sizeof(float) * length); data_i_split.imag = (float *) malloc(sizeof(float) * length); data_cl_split.real = (float *) malloc(sizeof(float) * length); data_cl_split.imag = (float *) malloc(sizeof(float) * length); if(!data_i_split.real || !data_i_split.imag || !data_cl_split.real || !data_cl_split.imag) { err = -1; log_error("Out-of-Resources\n"); goto cleanup; } } else { data_i = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length); data_cl = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length); if(!data_i || !data_cl) { err = -2; log_error("Out-of-Resouces\n"); goto cleanup; } } data_iref.real = (double *) malloc(sizeof(double) * length); data_iref.imag = (double *) malloc(sizeof(double) * length); data_oref.real = (double *) malloc(sizeof(double) * length); data_oref.imag = (double *) malloc(sizeof(double) * length); if(!data_iref.real || !data_iref.imag || !data_oref.real || !data_oref.imag) { err = -3; log_error("Out-of-Resources\n"); goto cleanup; } int i; if(dataFormat == clFFT_SplitComplexFormat) { for(i = 0; i < length; i++) { data_i_split.real[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; data_i_split.imag[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; data_cl_split.real[i] = 0.0f; data_cl_split.imag[i] = 0.0f; data_iref.real[i] = data_i_split.real[i]; data_iref.imag[i] = data_i_split.imag[i]; data_oref.real[i] = data_iref.real[i]; data_oref.imag[i] = data_iref.imag[i]; } } else { for(i = 0; i < length; i++) { data_i[i].real = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; data_i[i].imag = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; data_cl[i].real = 0.0f; data_cl[i].imag = 0.0f; data_iref.real[i] = data_i[i].real; data_iref.imag[i] = data_i[i].imag; data_oref.real[i] = data_iref.real[i]; data_oref.imag[i] = data_iref.imag[i]; } } plan = clFFT_CreatePlan( context, n, dim, dataFormat, &err ); if(!plan || err) { log_error("clFFT_CreatePlan failed\n"); goto cleanup; } //clFFT_DumpPlan(plan, stdout); if(dataFormat == clFFT_SplitComplexFormat) { data_in_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.real, &err); if(!data_in_real || err) { log_error("clCreateBuffer failed\n"); goto cleanup; } data_in_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.imag, &err); if(!data_in_imag || err) { log_error("clCreateBuffer failed\n"); goto cleanup; } if(testType == clFFT_OUT_OF_PLACE) { data_out_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.real, &err); if(!data_out_real || err) { log_error("clCreateBuffer failed\n"); goto cleanup; } data_out_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.imag, &err); if(!data_out_imag || err) { log_error("clCreateBuffer failed\n"); goto cleanup; } } else { data_out_real = data_in_real; data_out_imag = data_in_imag; } } else { data_in = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_i, &err); if(!data_in) { log_error("clCreateBuffer failed\n"); goto cleanup; } if(testType == clFFT_OUT_OF_PLACE) { data_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_cl, &err); if(!data_out) { log_error("clCreateBuffer failed\n"); goto cleanup; } } else data_out = data_in; } err = CL_SUCCESS; t0 = mach_absolute_time(); if(dataFormat == clFFT_SplitComplexFormat) { for(iter = 0; iter < numIter; iter++) err |= clFFT_ExecutePlannar(queue, plan, batchSize, dir, data_in_real, data_in_imag, data_out_real, data_out_imag, 0, NULL, NULL); } else { for(iter = 0; iter < numIter; iter++) err |= clFFT_ExecuteInterleaved(queue, plan, batchSize, dir, data_in, data_out, 0, NULL, NULL); } err |= clFinish(queue); if(err) { log_error("clFFT_Execute\n"); goto cleanup; } t1 = mach_absolute_time(); t = subtractTimes(t1, t0); char temp[100]; sprintf(temp, "GFlops achieved for n = (%d, %d, %d), batchsize = %d", n.x, n.y, n.z, batchSize); log_perf(gflops / (float) t, 1, "GFlops/s", "%s", temp); if(dataFormat == clFFT_SplitComplexFormat) { err |= clEnqueueReadBuffer(queue, data_out_real, CL_TRUE, 0, length*sizeof(float), data_cl_split.real, 0, NULL, NULL); err |= clEnqueueReadBuffer(queue, data_out_imag, CL_TRUE, 0, length*sizeof(float), data_cl_split.imag, 0, NULL, NULL); } else { err |= clEnqueueReadBuffer(queue, data_out, CL_TRUE, 0, length*sizeof(float)*2, data_cl, 0, NULL, NULL); } if(err) { log_error("clEnqueueReadBuffer failed\n"); goto cleanup; } computeReferenceD(&data_oref, n, batchSize, dim, dir); double diff_avg, diff_max, diff_min; if(dataFormat == clFFT_SplitComplexFormat) { diff_avg = computeL2Error(&data_cl_split, &data_oref, n.x*n.y*n.z, batchSize, &diff_max, &diff_min); if(diff_avg > eps_avg) log_error("Test failed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); else log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); } else { clFFT_SplitComplex result_split; result_split.real = (float *) malloc(length*sizeof(float)); result_split.imag = (float *) malloc(length*sizeof(float)); convertInterleavedToSplit(&result_split, data_cl, length); diff_avg = computeL2Error(&result_split, &data_oref, n.x*n.y*n.z, batchSize, &diff_max, &diff_min); if(diff_avg > eps_avg) log_error("Test failed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); else log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); free(result_split.real); free(result_split.imag); } cleanup: clFFT_DestroyPlan(plan); if(dataFormat == clFFT_SplitComplexFormat) { if(data_i_split.real) free(data_i_split.real); if(data_i_split.imag) free(data_i_split.imag); if(data_cl_split.real) free(data_cl_split.real); if(data_cl_split.imag) free(data_cl_split.imag); if(data_in_real) clReleaseMemObject(data_in_real); if(data_in_imag) clReleaseMemObject(data_in_imag); if(data_out_real && testType == clFFT_OUT_OF_PLACE) clReleaseMemObject(data_out_real); if(data_out_imag && clFFT_OUT_OF_PLACE) clReleaseMemObject(data_out_imag); } else { if(data_i) free(data_i); if(data_cl) free(data_cl); if(data_in) clReleaseMemObject(data_in); if(data_out && testType == clFFT_OUT_OF_PLACE) clReleaseMemObject(data_out); } if(data_iref.real) free(data_iref.real); if(data_iref.imag) free(data_iref.imag); if(data_oref.real) free(data_oref.real); if(data_oref.imag) free(data_oref.imag); return err; }
void Convolutioner_FrequencyDomain_OpenCL::process(AudioInOutBuffers<float_type>& audio ) { // unsigned int _2B = audio.channelLength_ * 2; unsigned int _B = audio.channelLength_; unsigned int _C = audio.numOfChannels_; //numOfChannels unsigned int _P = partitionedIR_.get_numOfPartsPerChannel(); //numOfIRPartsPerChannel //. //_ if >>>latency<<< or >>>number of channels<<< changed: // set partitionedIR // recreate buffers // recreate fft plans if ( window_.get_inputBlockSize() != audio.channelLength_ || window_.get_numOfChannels() != audio.numOfChannels_) { //Setting partitionedIR if (window_.get_inputBlockSize() != audio.channelLength_) { partitionedIR_.setNewIRF( irf_, audio.channelLength_ ); _P = partitionedIR_.get_numOfPartsPerChannel(); //Recreate, initialize buffers, and set as kernel arguments: PIR //recreate bufferPIR_R_.recreate(CL_MEM_READ_ONLY, _2B * _C * _P); bufferPIR_I_.recreate(CL_MEM_READ_ONLY, _2B * _C * _P); //. //initialize bufferPIR_R_.set(partitionedIR_.real_ ); bufferPIR_I_.set(partitionedIR_.imaginary_); //. //set as kernel argument bufferPIR_R_.setAsKernelArgument(0, complexMultiplyAdd_kernel_); bufferPIR_I_.setAsKernelArgument(1, complexMultiplyAdd_kernel_); //. //.(Recreate...) } //. //Recreate initialize buffers, and set as kernel arguments: transform, FDL, accumulator //recreate /****/bufferTransform_R_.recreate(CL_MEM_READ_WRITE, _2B * _C ); /****/bufferTransform_I_.recreate(CL_MEM_READ_WRITE, _2B * _C ); /**********/bufferFDL_R_.recreate(CL_MEM_READ_WRITE, _2B * _C * _P ); /**********/bufferFDL_I_.recreate(CL_MEM_READ_WRITE, _2B * _C * _P ); /**/bufferAccumulator_R_.recreate(CL_MEM_READ_WRITE, _2B * _C ); /**/bufferAccumulator_I_.recreate(CL_MEM_READ_WRITE, _2B * _C ); cpu_bufferAccumulator_R_ = new float_type[_2B * _C ]; cpu_bufferAccumulator_I_ = new float_type[_2B * _C ]; //. //initialize FDL with 0 bufferFDL_R_.fillWithZero(); bufferFDL_I_.fillWithZero(); lastInsertedDelayLineIdx = 0; //. //set as kernel argument /**********/bufferFDL_R_.setAsKernelArgument(2, complexMultiplyAdd_kernel_); /**********/bufferFDL_I_.setAsKernelArgument(3, complexMultiplyAdd_kernel_); /**/bufferAccumulator_R_.setAsKernelArgument(4, complexMultiplyAdd_kernel_); /**/bufferAccumulator_I_.setAsKernelArgument(5, complexMultiplyAdd_kernel_); //. //.(Recreate...) //Recreate plans clFFT_Dim3 dim; dim.x = _2B; dim.y = 1; dim.z = 1; fftPlan_ = clFFT_CreatePlan(context_, dim, clFFT_1D, clFFT_SplitComplexFormat, &lastCommandStatus_); //. } //update each time bufferGlobalParameters because of incrementing of lastInsertedDelayLineIdx /*(_2B, _C, _P, pir_C, FDL_LINE)*/ cpuData_bufferGlobalParameters_[0] = _2B; cpuData_bufferGlobalParameters_[1] = _C; cpuData_bufferGlobalParameters_[2] = _P; cpuData_bufferGlobalParameters_[3] = irf_->numOfChannels_; cpuData_bufferGlobalParameters_[4] = lastInsertedDelayLineIdx; bufferGlobalParameters_.set(cpuData_bufferGlobalParameters_); //. //Update channelsWindow window_.update( audio, /*history size*/ _B ); //. //Init >>bufferTransform<< bufferTransform_R_.set(window_.buffer_.data_); for(unsigned int i = 0; i < _2B * _C; ++i) cpu_bufferAccumulator_I_[i]=0; bufferTransform_I_.set(cpu_bufferAccumulator_I_); //. //Make fft of bufferTransform lastCommandStatus_ = clFFT_ExecutePlannar( cmdQueue_, fftPlan_, _C, clFFT_Forward, bufferTransform_R_, bufferTransform_I_, bufferTransform_R_, bufferTransform_I_, 0, NULL, NULL ); //. //Copy bufferTransform into bufferFDL (inserting new delay line) (real and imaginary part) clEnqueueCopyBuffer( cmdQueue_, bufferTransform_R_, bufferFDL_R_, 0, lastInsertedDelayLineIdx * (_2B * _C ) * sizeof(float_type), (_2B * _C ) * sizeof(float_type), 0, NULL, NULL); clEnqueueCopyBuffer( cmdQueue_, bufferTransform_I_, bufferFDL_I_, 0, lastInsertedDelayLineIdx * (_2B * _C ) * sizeof(float_type), (_2B * _C ) * sizeof(float_type), 0, NULL, NULL); //. //Increment host lastInsertedDelayLine lastInsertedDelayLineIdx = (lastInsertedDelayLineIdx + 1 ) % _P; //. //Execute kernel size_t globalWorkSize[1]; globalWorkSize[0] = _2B * _C /* == window_.get_allLength() */; lastCommandStatus_ = clEnqueueNDRangeKernel(cmdQueue_, complexMultiplyAdd_kernel_, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); if(lastCommandStatus_ == -4) { std::cout << "Too much amount of memory must be allocated on the GPU due to lenght of impulse response and number of channels."; throw int(); } else if(lastCommandStatus_ != 0) { std::cout << "Error while sending clEnqueueNDRangeKernel."; throw int(); } //. //ifft of bufferAccumulator lastCommandStatus_ = clFFT_ExecutePlannar( cmdQueue_, fftPlan_, _C, clFFT_Inverse, bufferAccumulator_R_, bufferAccumulator_I_, bufferAccumulator_R_, bufferAccumulator_I_, 0, NULL, NULL ); //. //Copy from bufferAccumulator to cpu bufferAccumulator_R_.get(cpu_bufferAccumulator_R_); //. //Flushing and finishing clFlush(cmdQueue_); clFinish(cmdQueue_); //. //Write fftw vector form to audio.outputChannel[number of Channel] for (unsigned int channNum = 0; channNum < _C; ++channNum) for (unsigned sampleNum = 0; sampleNum < _B; ++sampleNum) audio.out_[channNum][sampleNum] = (cpu_bufferAccumulator_R_[channNum*_2B + _B + sampleNum])/_2B; //. }