Пример #1
0
void ClFFT::init( Ptr<cl::Context> context, Ptr<cl::CommandQueue> queue, clFFT_Dimension& dim, clFFT_Dim3& dim3 )
{
	m_context = context;
	m_queue = queue;
	m_size = uint3(dim3.x, dim3.y, dim3.z);
	cl_int err = CL_SUCCESS;
	*m_fftPlan = clFFT_CreatePlan((*m_context)(), clFFT_Dim3(m_size.x, m_size.y, 1), clFFT_2D, clFFT_SplitComplexFormat, &err);
	if(!*m_fftPlan || err)
		throw cl::Error(ERR_OPENCL, "clFFT_CreatePlan");
}
Пример #2
0
cl_int
ufo_fft_update (UfoFft *fft, cl_context context, cl_command_queue queue, UfoFftParameter *param)
{
    gboolean changed;
    cl_int error;

    error = CL_SUCCESS;
    changed = param->size[0] != fft->seen.size[0] || param->size[1] != fft->seen.size[1];

    if (changed)
        memcpy (&fft->seen, param, sizeof (UfoFftParameter));

#ifdef HAVE_AMD
    if (fft->amd_plan == 0 || changed) {
        /* we use param->dimension to index into this array! */
        clfftDim dimension[4] = { 0, CLFFT_1D, CLFFT_2D, CLFFT_3D };

        if (fft->amd_plan != 0) {
            clfftDestroyPlan (&fft->amd_plan);
            fft->amd_plan = 0;
        }

        UFO_RESOURCES_CHECK_CLERR (clfftCreateDefaultPlan (&fft->amd_plan, context, dimension[param->dimensions], param->size));
        UFO_RESOURCES_CHECK_CLERR (clfftSetPlanBatchSize (fft->amd_plan, param->batch));
        UFO_RESOURCES_CHECK_CLERR (clfftSetPlanPrecision (fft->amd_plan, CLFFT_SINGLE));
        UFO_RESOURCES_CHECK_CLERR (clfftSetLayout (fft->amd_plan, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED));
        UFO_RESOURCES_CHECK_CLERR (clfftSetResultLocation (fft->amd_plan, param->zeropad ? CLFFT_INPLACE : CLFFT_OUTOFPLACE));
        UFO_RESOURCES_CHECK_CLERR (clfftBakePlan (fft->amd_plan, 1, &queue, NULL, NULL));
    }
#else
    if (fft->apple_plan == NULL || changed) {
        clFFT_Dim3 size;

        /* we use param->dimension to index into this array! */
        clFFT_Dimension dimension[4] = { 0, clFFT_1D, clFFT_2D, clFFT_3D };

        size.x = param->size[0];
        size.y = param->size[1];
        size.z = param->size[2];

        if (fft->apple_plan != NULL) {
            clFFT_DestroyPlan (fft->apple_plan);
            fft->apple_plan = NULL;
        }

        fft->apple_plan = clFFT_CreatePlan (context, size, dimension[param->dimensions], clFFT_InterleavedComplexFormat, &error);
    }
#endif

    return error;
}
Пример #3
0
clFFT_Plan CLFFTKernelBuffer::getPlan(cl_context c, unsigned int n, cl_int& error)
{
    if (kernels.find(n) != kernels.end())
    {
        error = CL_SUCCESS;
        return kernels[n];
	}

    TaskTimer tt("Creating an OpenCL FFT compute plan for n=%u", n);

    clFFT_Dim3 ndim = { n, 1, 1 };
    clFFT_Plan plan = clFFT_CreatePlan(c, ndim, clFFT_1D, clFFT_InterleavedComplexFormat, &error);

    if (error == CL_SUCCESS)
        kernels[n] = plan;

	return plan;
}
Пример #4
0
void dfi_process_sinogram(const char* tiff_input, const char* tiff_output, int center_rotation)
{
  cl_event events[11];

  if(!tiff_input) {
    printf("The filename of input is not valid. (pointer tiff_input = %p)", tiff_input);
    return;
  }

  if(!tiff_output) {
    printf("The filename of output is not valid. (pointer tiff_output = %p)", tiff_output);
    return;
  }

  /////////////////////
  /* Input Data Part */
  /////////////////////

  /* Input a slice properties */
  int bits_per_sample;
  int samples_per_pixel;
  int theta_size;
  int slice_size;

  /* Read the slice */
  clFFT_Complex *data_tiff = tiff_read_complex(tiff_input,
						center_rotation,
						&bits_per_sample,
						&samples_per_pixel,
						&slice_size,
						&theta_size);
  
  //tiff_write_complex("resources/initial-sino.tif", data_tiff, slice_size, theta_size);

  /*
   * OpenCL
   */
  printf("Hey!1\n");
  cl_int status = CL_SUCCESS;
  cl_platform_id platform;

  printf("Hey!1.2\n");
  CL_CHECK_ERROR(clGetPlatformIDs(1, &platform, NULL));

  printf("Hey!2\n");
  cl_device_id devices[10];    // Compute device
  cl_context context;     // Compute context
  cl_uint n_devices = 0;

  printf("@Hey!3\n");
#if GPU
  printf("@Hey!GPU Choosed\n");
  CL_CHECK_ERROR(clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 3, devices, &n_devices));
#else
  printf("@Hey!CPU Choosed\n");
  CL_CHECK_ERROR(clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 3, devices, &n_devices));
#endif

  //cl_device_id current_device = devices[0];
#define current_device devices[0]
  printf("Hey!3.1  n_devices %d\n", n_devices);
  context = clCreateContext(NULL, 1, devices, NULL, NULL, &status);
  printf("Hey!3.2\n");
  CL_CHECK_ERROR(status);

	/*
	 * Device
	 */
	printf("Hey!3.3\n");

	cl_int device_max_cu = 0;
	CL_CHECK_ERROR(clGetDeviceInfo(current_device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &device_max_cu, NULL));
	size_t wg_count = device_max_cu * wavefronts_per_SIMD;
    size_t global_work_size = wg_count * local_work_size;
    printf("Hey!3.4\n");

    /*
     * Queues, Kernels
     */
    cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE;
    cl_command_queue command_queue = clCreateCommandQueue(context, current_device, properties, &status);
    CL_CHECK_ERROR(status);

    printf("Hey!3.5\n");
    cl_kernel kernel_linear_interp = get_kernel("linear_interp", &context, &current_device);
    cl_kernel kernel_zero_ifftshift = get_kernel("zero_ifftshift", &context, &current_device);
    cl_kernel kernel_fftshift = get_kernel("fftshift", &context, &current_device);
    cl_kernel kernel_2dshift = get_kernel("shift2d", &context, &current_device);
    cl_kernel kernel_crop_data = get_kernel("crop_data", &context, &current_device);

    printf("@Hey!3.6\n\n");

	////////////////////////
	/* OpenCL - DFI Part */
	////////////////////////

	/* Reconstruction properties */
	int oversampling_ratio = 2;
	int dx = 1; /* zoom times */
	//int size_s = slice_size * oversampling_ratio;

	int min_theta = 0;
	int max_theta = theta_size - 1;

	int size_zeropad_s = pow(2, ceil(log2((float)slice_size))); /* get length of FFT operations */
	int size_s = size_zeropad_s;

	float d_omega_s = 2 * M_PI / (size_zeropad_s * dx); //normalized ratio [0; 2PI]

  /* Start timer */
  timeval global_tim;
  gettimeofday(&global_tim, NULL);
  double t1_global = global_tim.tv_sec + (global_tim.tv_usec/1000000.0), t2_global = 0.0;

	/////////////////////////////////////
	/* Sinogram shifting + Zeropadding */
	/////////////////////////////////////
	long data_size = slice_size * theta_size * sizeof(clFFT_Complex);
	printf("6 ");
	long zeropad_data_size = theta_size * size_zeropad_s * sizeof(clFFT_Complex);

	/* Buffers */
	cl_mem original_data_buffer = clCreateBuffer(context,
						     CL_MEM_READ_WRITE,
						     data_size,
						     NULL,
						     &status);
	CL_CHECK_ERROR(status);

	CL_CHECK_ERROR(clEnqueueWriteBuffer(command_queue,
					    original_data_buffer,
					    CL_FALSE,
					    0,
					    data_size,
					    data_tiff,
					    0,
					    NULL,
					    &events[0]));

    	cl_mem zeropad_ifftshift_data_buffer = clCreateBuffer(context,
							  CL_MEM_READ_WRITE,
							  zeropad_data_size,
							  NULL,
							  &status);
    CL_CHECK_ERROR(status);

    float *zero_out = (float *)g_malloc0(zeropad_data_size);
    CL_CHECK_ERROR(clEnqueueWriteBuffer(command_queue,
					zeropad_ifftshift_data_buffer,
					CL_FALSE,
					0,
					zeropad_data_size,
					zero_out,
					0,
					NULL,
					&events[1]));

    /* Set arguments */
    CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 0, sizeof(void *), (void *)&original_data_buffer));
    CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 1, sizeof(theta_size), &theta_size));
    CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 2, sizeof(slice_size), &slice_size));

    CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 3, sizeof(void *), (void *)&zeropad_ifftshift_data_buffer));
    CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 4, sizeof(theta_size), &theta_size));
    CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 5, sizeof(size_zeropad_s), &size_zeropad_s));

    /* Run kernel */
    status = clEnqueueNDRangeKernel(command_queue,
				    kernel_zero_ifftshift,
				    1, // work dimensional 1D, 2D, 3D
				    NULL, // offset
				    &global_work_size, // total number of WI
				    &local_work_size, // number of WI in WG
				    2, // number events in wait list
				    events,  // event wait list
				    &events[2]); // event
    CL_CHECK_ERROR(status);

    // Copy result from device to host
    /*
    clFFT_Complex *fur_kernel_sino = (clFFT_Complex *)clEnqueueMapBuffer(command_queue,
    									     	 	 	 	 	 	 	 zeropad_ifftshift_data_buffer,
    									     	 	 	 	 	 	 	 CL_TRUE,
    									     	 	 	 	 	 	 	 CL_MAP_READ,
    									     	 	 	 	 	 	 	 0,
    									     	 	 	 	 	 	 	 zeropad_data_size,
                                                                 	     0, NULL, NULL, NULL );

    clFinish(command_queue);
    tiff_write_complex("resources/zeropad-sino.tif", fur_kernel_sino, size_zeropad_s, theta_size);
    */

    ////////////////////////////////////////////////////////////////////////
    /* Applying 1-D FFT to the each strip of the sinogram and shifting it */
    ////////////////////////////////////////////////////////////////////////

    cl_mem zeropadded_1dfft_data_buffer = clCreateBuffer(context,
							CL_MEM_READ_WRITE,
							zeropad_data_size,
							NULL,
							&status);
    CL_CHECK_ERROR(status);

    /* Setup clAmdFft */
    clFFT_Dim3 sino_fft;
    sino_fft.x = size_zeropad_s;
    sino_fft.y = 1;
    sino_fft.z = 1;

    /* Create FFT plan */
    clFFT_Plan plan_1dfft_sinogram = clFFT_CreatePlan(context,
						      sino_fft,
						      clFFT_1D,
						      clFFT_InterleavedComplexFormat,
						      &status);
    CL_CHECK_ERROR(status);

    /* Execute FFT */
    status = clFFT_ExecuteInterleaved(command_queue,
				      plan_1dfft_sinogram,
				      theta_size,
				      clFFT_Forward,
				      zeropad_ifftshift_data_buffer,
				      zeropadded_1dfft_data_buffer,
				      0,
				      NULL,
				      NULL);
    CL_CHECK_ERROR(status);
    // Free FFT plan
    //clFFT_DestroyPlan(plan_1dfft_sinogram);

    // Copy result from device to host
    /*
    clFFT_Complex *fourier_kernel_sinogram = (clFFT_Complex *)malloc(zeropad_data_size);
    clEnqueueReadBuffer(command_queue, zeropadded_1dfft_data_buffer, CL_TRUE, 0, zeropad_data_size, fourier_kernel_sinogram, 0, NULL, NULL);
    clFinish(command_queue);

	tiff_write_complex("resources/1dfft-sino.tif", fourier_kernel_sinogram, size_zeropad_s, theta_size);
    */

    ///////////////////
    /* Make fftshift */
    ///////////////////

    /* Buffers */
    cl_mem zeropad_fftshift_data_buffer = clCreateBuffer(context,
							CL_MEM_READ_WRITE,
							zeropad_data_size,
							NULL,
							&status);
    CL_CHECK_ERROR(status);

    /* Set arguments */
    CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 0, sizeof(void *), (void *)&zeropadded_1dfft_data_buffer));
    CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 1, sizeof(theta_size), &theta_size));
    CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 2, sizeof(size_zeropad_s), &size_zeropad_s));

    CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 3, sizeof(void *), (void *)&zeropad_fftshift_data_buffer));
    CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 4, sizeof(theta_size), &theta_size));
    CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 5, sizeof(size_zeropad_s), &size_zeropad_s));

    /* Run kernel */
    status = clEnqueueNDRangeKernel(command_queue,
				    kernel_fftshift,
				    1, // work dimensional 1D, 2D, 3D
				    NULL, // offset
				    &global_work_size, // total number of WI
				    &local_work_size, // number of WI in WG
				    0, // number events in wait list
				    NULL,  // event wait list
				    &events[3]); // event
    CL_CHECK_ERROR(status);

    /* Copy result from device to host */
    /*
    clFFT_Complex *fur_kernel_fftshift_sino = (clFFT_Complex *)clEnqueueMapBuffer(command_queue,
										      	  zeropad_fftshift_data_buffer,
										      	  CL_TRUE,
      									   	      CL_MAP_READ,0,zeropad_data_size,
      									   	      0, NULL, NULL, NULL );

    clFinish(command_queue);
    tiff_write_complex("resources/fftshift-sino.tif", fur_kernel_fftshift_sino, size_zeropad_s, theta_size);
    */

     ////////////////////////
     /* Data Interpolation */
     ////////////////////////

    /* Performing Interpolation */
    cl_long data_length = size_s * size_s;
    cl_int in_rows = theta_size;
    cl_int in_cols = size_zeropad_s;

    cl_float norm_ratio = d_omega_s/dx;

    cl_float in_rows_first_val = min_theta;
    cl_float in_rows_last_val = max_theta;

    cl_float in_cols_first_val = (-in_cols/2)*norm_ratio;
    cl_float in_cols_last_val = (in_cols/2-1)*norm_ratio;

    cl_int interp_rows = size_s;
    cl_int interp_cols = interp_rows;

    cl_int iparams[5];
    iparams[0] = in_rows;
    iparams[1] = in_cols;
    iparams[2] = dx;
    iparams[3] = interp_rows;
    iparams[4] = interp_cols;

    cl_float fparams[5];
    fparams[0] = in_rows_first_val;
    fparams[1] = in_rows_last_val;
    fparams[2] = in_cols_first_val;
    fparams[3] = in_cols_last_val;
    fparams[4] = norm_ratio;

    /* Buffers */
    cl_mem i_buffer = clCreateBuffer(context,
                                     CL_MEM_READ_ONLY,
                                     sizeof(cl_int) * 5,
                                     NULL,
                                     &status);
    CL_CHECK_ERROR(status);

    CL_CHECK_ERROR(clEnqueueWriteBuffer(command_queue,
					i_buffer,
					CL_FALSE,
					0,
					sizeof(cl_int) * 5,
					iparams,
					0,
					NULL,
					&events[4]));

    cl_mem f_buffer = clCreateBuffer(context,
                                     CL_MEM_READ_ONLY,
                                     sizeof(cl_float) * 5,
                                     NULL,
                                     &status);
    CL_CHECK_ERROR(status);

    CL_CHECK_ERROR(clEnqueueWriteBuffer(command_queue,
					f_buffer,
					CL_FALSE,
					0,
					sizeof(cl_float) * 5,
					fparams,
					0,
					NULL,
					&events[5]));

    cl_mem output_buffer = clCreateBuffer(context,
					  CL_MEM_READ_WRITE,
					  data_length * sizeof(clFFT_Complex),
					  NULL,
					  &status);
    CL_CHECK_ERROR(status);

    /* Set arguments */
    CL_CHECK_ERROR(clSetKernelArg(kernel_linear_interp, 0, sizeof(void *), (void *)&i_buffer));
    CL_CHECK_ERROR(clSetKernelArg(kernel_linear_interp, 1, sizeof(void *), (void *)&f_buffer));
    CL_CHECK_ERROR(clSetKernelArg(kernel_linear_interp, 2, sizeof(void *), (void *)&zeropad_fftshift_data_buffer));
    CL_CHECK_ERROR(clSetKernelArg(kernel_linear_interp, 3, sizeof(void *), (void *)&output_buffer));
    CL_CHECK_ERROR(clSetKernelArg(kernel_linear_interp, 4, sizeof(data_length), &data_length));

    /* Run kernel */
    status = clEnqueueNDRangeKernel(command_queue,
				    kernel_linear_interp,
				    1, // work dimensional 1D, 2D, 3D
				    NULL, // offset
				    &global_work_size, // total number of WI
				    &local_work_size, // nomber of WI in WG
				    3, // num events in wait list
				    events + 3,  // event wait list
				    &events[6]); // event
    CL_CHECK_ERROR(status);
    //clFinish(command_queue);

    // Copy result from device to host
    /*
    clFFT_Complex *interpolated_spectrum = (clFFT_Complex *)clEnqueueMapBuffer(command_queue,
                                                                  	  	   output_buffer,
                                                                  	  	   CL_TRUE,
                                                                  	  	   CL_MAP_READ,
                                                                 	  	   0,
                                                                  	  	   data_length * sizeof(clFFT_Complex),
                                                                  	  	   0, NULL, NULL, NULL );

    clFinish(command_queue);
    tiff_write_complex("resources/interpolated-sino.tif", interpolated_spectrum, size_s, size_s);
    */

    ///////////////////////////////////////////////////
    /* Applying 2-D FFT to the interpolated spectrum */
    ///////////////////////////////////////////////////

    /* Setup 2D IFFT */
    clFFT_Dim3 sino_2dfft;
    sino_2dfft.x = size_s;
    sino_2dfft.y = size_s;
    sino_2dfft.z = 1;

    /* Create 2D IFFT plan */
    clFFT_Plan plan_2difft = clFFT_CreatePlan(context,
					      sino_2dfft,
					      clFFT_2D,
					      clFFT_InterleavedComplexFormat,
					      &status);
    CL_CHECK_ERROR(status);

    /* Execute 2D IFFT */
    cl_mem reconstructed_image_buffer = clCreateBuffer(context,
						       CL_MEM_READ_WRITE,
						       data_length * sizeof(clFFT_Complex),
						       NULL,
						       &status);
    CL_CHECK_ERROR(status);

    status = clFFT_ExecuteInterleaved(command_queue,
				      plan_2difft,
				      1,
				      clFFT_Inverse,
				      output_buffer,
				      reconstructed_image_buffer,
				      0,
				      NULL,
				      NULL);
    CL_CHECK_ERROR(status);


    // Copy result from device to host
    /*
    clFFT_Complex *ifft2d_interpolated_spectrum = (clFFT_Complex *)malloc(data_length * sizeof(clFFT_Complex));
    clEnqueueReadBuffer(command_queue,
    					reconstructed_image_buffer,
    					CL_TRUE,
    					0,
    					data_length * sizeof(clFFT_Complex),
    					ifft2d_interpolated_spectrum,
    					0,
    					NULL,
    					NULL);
    tiff_write_complex("resources/ifft2d_interpolated_spectrum.tif", ifft2d_interpolated_spectrum, size_s, size_s);
    clFinish(command_queue);
    */

    /////////////////////////////////////////////////
    /* Applying 2-D fftshidt to the restored image */
    /////////////////////////////////////////////////

    /* Buffers */
    cl_mem two_dim_fftshifted_data_buffer = clCreateBuffer(context,
							   CL_MEM_READ_WRITE,
							   data_length * sizeof(clFFT_Complex),
							   NULL,
							   &status);
    CL_CHECK_ERROR(status);

    /* Set arguments */
    cl_int inverse_flag = 0;

    CL_CHECK_ERROR(clSetKernelArg(kernel_2dshift, 0, sizeof(void *), (void *)&reconstructed_image_buffer));
    CL_CHECK_ERROR(clSetKernelArg(kernel_2dshift, 1, sizeof(void *), (void *)&two_dim_fftshifted_data_buffer));
    CL_CHECK_ERROR(clSetKernelArg(kernel_2dshift, 2, sizeof(interp_rows), &interp_rows));
    CL_CHECK_ERROR(clSetKernelArg(kernel_2dshift, 3, sizeof(interp_cols), &interp_cols));
    CL_CHECK_ERROR(clSetKernelArg(kernel_2dshift, 4, sizeof(inverse_flag), &inverse_flag));

    /* Run kernel */
    status = clEnqueueNDRangeKernel(command_queue,
				    kernel_2dshift,
				    1, // work dimensional 1D, 2D, 3D
				    NULL, // offset
				    &global_work_size, // total number of WI
				    &local_work_size, // number of WI in WG
				    1, // number events in wait list
				    &events[6],  // event wait list
				    &events[7]); // event
    CL_CHECK_ERROR(status);

    /* Copy result from device to host */
    /*
    clFFT_Complex *two_dim_fftshifted_data = (clFFT_Complex *)clEnqueueMapBuffer(command_queue,
										     	 two_dim_fftshifted_data_buffer,
										     	 CL_TRUE,
										     	 CL_MAP_READ,
										     	 0,
										     	 data_length * sizeof(clFFT_Complex),
										     	 0, NULL, NULL, NULL );


    clFinish(command_queue);
    */

    ////////////////
    /* Crop data  */
    ///////////////
    float lt_offset = 0, rb_offset = 0;

    int dif_sides = interp_cols - slice_size;
    if (dif_sides%2) {
       lt_offset = floor(dif_sides / 2.0);
       rb_offset = ceil(dif_sides / 2.0);
    }
    else {
       lt_offset = rb_offset = dif_sides / 2.0;
    }

    /* Buffers */
    long cropped_data_length = slice_size * slice_size * sizeof(clFFT_Complex);
    cl_mem cropped_restored_image_data_buffer = 
	      clCreateBuffer(context,
			     CL_MEM_READ_WRITE,
			     cropped_data_length,
			     NULL,
			     &status);
    CL_CHECK_ERROR(status);

    /* Set arguments */
    CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 0, sizeof(void *), (void *)&two_dim_fftshifted_data_buffer));
    CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 1, sizeof(void *), (void *)&cropped_restored_image_data_buffer));
    CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 2, sizeof(slice_size), &slice_size));
    CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 3, sizeof(interp_cols), &interp_cols));
    CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 4, sizeof(lt_offset), &lt_offset));
    CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 5, sizeof(rb_offset), &rb_offset));

    /* Run kernel */
    status = clEnqueueNDRangeKernel(command_queue,
				    kernel_crop_data,
				    1, // work dimensional 1D, 2D, 3D
				    NULL, // offset
				    &global_work_size, // total number of WI
				    &local_work_size, // number of WI in WG
				    1, // number events in wait list
				    &events[7],  // event wait list
				    &events[8]); // event
    CL_CHECK_ERROR(status);

    CL_CHECK_ERROR(clFinish(command_queue));
    clFFT_DestroyPlan(plan_2difft);
    clFFT_DestroyPlan(plan_1dfft_sinogram);


    //timing
    float ms = 0.0, total_ms = 0.0, global_ms = 0.0, deg = 1.0e-6f;
 
      /* Stop timer */
  gettimeofday(&global_tim, NULL);
  t2_global = global_tim.tv_sec+(global_tim.tv_usec/1000000.0);
  printf("\n(Total time - timeofday) %f seconds elapsed\n", (t2_global-t1_global)*1000.0);


    cl_ulong start, end;

    CL_CHECK_ERROR(clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL));
    CL_CHECK_ERROR(clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));
    ms = (end - start) * deg;
    total_ms += ms;
    printf("\n(Sinogram shifting + Zeropadding  write_op1):%f", ms);

    CL_CHECK_ERROR(clGetEventProfilingInfo(events[1], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL));
    CL_CHECK_ERROR(clGetEventProfilingInfo(events[1], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));
    ms = (end - start) * deg;
    total_ms += ms;
    printf("\n(Sinogram shifting + Zeropadding  write_op2):%f", ms);

    CL_CHECK_ERROR(clGetEventProfilingInfo(events[2], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL));
    CL_CHECK_ERROR(clGetEventProfilingInfo(events[2], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));
    ms = (end - start) * deg;
    total_ms += ms;
    printf("\n(Sinogram shifting + Zeropadding):%f", ms);
    printf("\nTOTAL(Sinogram shifting + Zeropadding):%f\n", total_ms);
    global_ms += total_ms;
    total_ms = 0.0;

    CL_CHECK_ERROR(clGetEventProfilingInfo(events[2], CL_PROFILING_COMMAND_END,sizeof(cl_ulong), &start, NULL));
    CL_CHECK_ERROR(clGetEventProfilingInfo(events[3], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &end, NULL));
    ms = (end - start) * deg;
    total_ms += ms;
    printf("\n(Applying 1-D FFT):%f\n", total_ms);
    global_ms += total_ms;
    total_ms = 0.0;

    CL_CHECK_ERROR(clGetEventProfilingInfo(events[3], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL));
    CL_CHECK_ERROR(clGetEventProfilingInfo(events[3], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));
    ms = (end - start) * deg;
    total_ms += ms;
    printf("\n(Shift 1-D FFT data):%f\n", total_ms);
    global_ms += total_ms;
    total_ms = 0.0;

    CL_CHECK_ERROR(clGetEventProfilingInfo(events[4], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL));
    CL_CHECK_ERROR(clGetEventProfilingInfo(events[4], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));
    ms = (end - start) * deg;
    total_ms += ms;
    printf("\n(Data Interpolation write_op1):%f", ms);

    CL_CHECK_ERROR(clGetEventProfilingInfo(events[5], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL));
    CL_CHECK_ERROR(clGetEventProfilingInfo(events[5], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));
    ms = (end - start) * deg;
    total_ms += ms;
    printf("\n(Data Interpolation write_op2):%f", ms);

    CL_CHECK_ERROR(clGetEventProfilingInfo(events[6], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL));
    CL_CHECK_ERROR(clGetEventProfilingInfo(events[6], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));
    ms = (end - start) * deg;
    total_ms += ms;
    printf("\n(Data Interpolation):%f", ms);
    printf("\nTOTAL(Data Interpolation):%f\n", total_ms);
    global_ms += total_ms;
    total_ms = 0.0;

    CL_CHECK_ERROR(clGetEventProfilingInfo(events[6], CL_PROFILING_COMMAND_END,sizeof(cl_ulong), &start, NULL));
    CL_CHECK_ERROR(clGetEventProfilingInfo(events[7], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &end, NULL));
    ms = (end - start) * deg;
    total_ms += ms;
    printf("\n(Applying 2-D IFFT):%f\n", total_ms);
    global_ms += total_ms;
    total_ms = 0.0;

    CL_CHECK_ERROR(clGetEventProfilingInfo(events[7], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL));
    CL_CHECK_ERROR(clGetEventProfilingInfo(events[7], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));
    ms = (end - start) * deg;
    total_ms += ms;
    printf("\n(Applying 2-D Shift):%f\n", total_ms);
    global_ms += total_ms;
    total_ms = 0.0;

    CL_CHECK_ERROR(clGetEventProfilingInfo(events[8], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL));
    CL_CHECK_ERROR(clGetEventProfilingInfo(events[8], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));
    ms = (end - start) * deg;
    total_ms += ms;
    printf("\n(Cropping data):%f\n", total_ms);
    global_ms += total_ms;
    total_ms = 0.0;

    printf("\nTOTAL TIME:%f\n", global_ms);

    // Copy result from device to host
    clFFT_Complex *cropped_restored_image = 
	(clFFT_Complex *)clEnqueueMapBuffer(command_queue,
					    cropped_restored_image_data_buffer,
					    CL_TRUE,
					    CL_MAP_READ,
					    0,
					    cropped_data_length,
					    0, NULL, NULL, NULL );




    /* Write the restored slice */
    tiff_write_complex(tiff_output, cropped_restored_image, slice_size, slice_size);
}
Пример #5
0
void CLSimulator::initializeClFFT()
{
    /* x x x x
     * x x x x
     * x x x x
     * x x x x
     */

    for (size_t x_idx = 0, x_val = _nX - 1; x_idx < _nX; ++x_idx, --x_val) {
        for (size_t y_idx = 0, y_val = _nY - 1; y_idx < _nY; ++y_idx, --y_val) {
            float distance = sqrt(pow(float(x_val), 2.0f) + pow(float(y_val), 2.0f));
            _distances_real[x_idx + y_idx * _nFFTx] = _f_w_EE((float(distance)));
        }
    }

    /* v v x x
     * v v x x
     * x x x x
     * x x x x
     */

    for (size_t x_idx = 0, x_val = _nX - 1; x_idx < _nX; ++x_idx, --x_val) {
        for (size_t y_idx = _nY, y_val = 1; y_idx < _nFFTy - 1; ++y_idx, ++y_val) {
            float distance = sqrt(pow(float(x_val), 2.0f) + pow(float(y_val), 2.0f));
            _distances_real[x_idx + y_idx * _nFFTx] = _f_w_EE((float(distance)));
        }
    }

    /* v v v x
     * v v v x
     * x x x x
     * x x x x
     */

    if (_nY > 1)
    {
        for (size_t x_idx = 0; x_idx < _nFFTx; ++x_idx) {
            _distances_real[x_idx + (_nFFTy - 1) * _nFFTx] = 0;
        }
    }

    /* v v v 0
     * v v v 0
     * x x x 0
     * x x x 0
     */

    for (size_t x_idx = _nX, x_val = 1; x_idx < _nFFTx - 1; ++x_idx, ++x_val) {
        for (size_t y_idx = 0, y_val = _nY - 1; y_idx < _nY; ++y_idx, --y_val) {
            float distance = sqrt(pow(float(x_val), 2.0f) + pow(float(y_val), 2.0f));
            _distances_real[x_idx + y_idx * _nFFTx] = _f_w_EE((float(distance)));
        }
    }

    /* v v v 0
     * v v v 0
     * v v x 0
     * x x x 0
     */

    for (size_t y_idx = 0; y_idx < _nFFTy; ++y_idx) {
        _distances_real[(_nFFTx - 1) + y_idx * _nFFTx] = 0;
    }

    /* v v v 0
     * v v v 0
     * v v x 0
     * 0 0 0 0
     */

    for (size_t x_idx = _nX, x_val = 1; x_idx < _nFFTx - 1; ++x_idx, ++x_val) {
        for (size_t y_idx = _nY, y_val = 1; y_idx < _nFFTy - 1; ++y_idx, ++y_val) {
            float distance = sqrt(pow(float(x_val), 2.0f) + pow(float(y_val), 2.0f));
            _distances_real[x_idx + y_idx * _nFFTx] = _f_w_EE((float(distance)));
        }
    }

    /* v v v 0
     * v v v 0
     * v v v 0
     * 0 0 0 0
     */

    assert(isPowerOfTwo(_nFFT));
    assert(_nX >= 1 && _nY >= 1 && _nZ >= 1);
    assert((_nX >= _nY) && (_nY >= _nZ));
    clFFT_Dim3 n = { static_cast<unsigned int>(_nFFTx),
                     static_cast<unsigned int>(_nFFTy),
                     static_cast<unsigned int>(_nFFTz) };
    clFFT_DataFormat dataFormat = clFFT_SplitComplexFormat;
    clFFT_Dimension dim;

    if (_nY == 1)
    {
        dim = clFFT_1D;
    } else if (_nZ == 1)
    {
        dim = clFFT_2D;
    } else
    {
        dim = clFFT_3D;
    }
    _p_cl = clFFT_CreatePlan(_wrapper.getContextC(), n, dim, dataFormat, &_err);
    handleClError(_err);

    _distances_real_cl = cl::Buffer(_wrapper.getContext(),
                                    CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                    _nFFT * sizeof(float),
                                    _distances_real.get(),
                                    &_err);
    _distances_imag_cl = cl::Buffer(_wrapper.getContext(),
                                    CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                    _nFFT * sizeof(float),
                                    _zeros.get(),
                                    &_err);
    _sVals_real_cl = cl::Buffer(_wrapper.getContext(),
                                CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                _nFFT * sizeof(float),
                                _zeros.get(),
                                &_err);
    _sVals_imag_cl = cl::Buffer(_wrapper.getContext(),
                                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                _nFFT * sizeof(float),
                                _zeros.get(),
                                &_err);
    _convolution_real_cl = cl::Buffer(_wrapper.getContext(),
                                      CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                      _nFFT * sizeof(float),
                                      _zeros.get(),
                                      &_err);
    _convolution_imag_cl = cl::Buffer(_wrapper.getContext(),
                                      CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                      _nFFT * sizeof(float),
                                      _zeros.get(),
                                      &_err);
    _distances_f_real_cl = cl::Buffer(_wrapper.getContext(),
                                      CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                      _nFFT * sizeof(float),
                                      _zeros.get(),
                                      &_err);
    _distances_f_imag_cl = cl::Buffer(_wrapper.getContext(),
                                      CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                      _nFFT * sizeof(float),
                                      _zeros.get(),
                                      &_err);
    _sVals_f_real_cl = cl::Buffer(_wrapper.getContext(),
                                  CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                  _nFFT * sizeof(float),
                                  _zeros.get(),
                                  &_err);
    _sVals_f_imag_cl = cl::Buffer(_wrapper.getContext(),
                                  CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                  _nFFT * sizeof(float),
                                  _zeros.get(),
                                  &_err);
    _convolution_f_real_cl = cl::Buffer(_wrapper.getContext(),
                                        CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                        _nFFT * sizeof(float),
                                        _zeros.get(),
                                        &_err);
    _convolution_f_imag_cl = cl::Buffer(_wrapper.getContext(),
                                        CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                        _nFFT * sizeof(float),
                                        _zeros.get(),
                                        &_err);

    _kernel_convolution = cl::Kernel(_program, "convolution", &_err);
    handleClError(_kernel_convolution.setArg(0, _convolution_f_real_cl));
    handleClError(_kernel_convolution.setArg(1, _convolution_f_imag_cl));
    handleClError(_kernel_convolution.setArg(2, _distances_f_real_cl));
    handleClError(_kernel_convolution.setArg(3, _distances_f_imag_cl));
    handleClError(_kernel_convolution.setArg(4, _sVals_f_real_cl));
    handleClError(_kernel_convolution.setArg(5, _sVals_f_imag_cl));
    handleClError(_kernel_convolution.setArg(6, _scaleFFT));

    handleClError(clFFT_ExecutePlannar(_wrapper.getQueueC(),
                                       _p_cl,
                                       1,
                                       clFFT_Forward,
                                       _distances_real_cl(),
                                       _distances_imag_cl(),
                                       _distances_f_real_cl(),
                                       _distances_f_imag_cl(),
                                       0,
                                       NULL,
                                       NULL));

    _wrapper.getQueue().finish();
}
Пример #6
0
int runTest(clFFT_Dim3 n, int batchSize, clFFT_Direction dir, clFFT_Dimension dim, 
			clFFT_DataFormat dataFormat, int numIter, clFFT_TestType testType)
{	
	cl_int err = CL_SUCCESS;
	int iter;
	double t;
	
	uint64_t t0, t1;
	int mx = log2(n.x);
	int my = log2(n.y);
	int mz = log2(n.z);

	int length = n.x * n.y * n.z * batchSize;
		
	double gflops = 5e-9 * ((double)mx + (double)my + (double)mz) * (double)n.x * (double)n.y * (double)n.z * (double)batchSize * (double)numIter;
	
	clFFT_SplitComplex data_i_split = (clFFT_SplitComplex) { NULL, NULL };
	clFFT_SplitComplex data_cl_split = (clFFT_SplitComplex) { NULL, NULL };
	clFFT_Complex *data_i = NULL;
	clFFT_Complex *data_cl = NULL;
	clFFT_SplitComplexDouble data_iref = (clFFT_SplitComplexDouble) { NULL, NULL }; 
	clFFT_SplitComplexDouble data_oref = (clFFT_SplitComplexDouble) { NULL, NULL };
	
	clFFT_Plan plan = NULL;
	cl_mem data_in = NULL;
	cl_mem data_out = NULL;
	cl_mem data_in_real = NULL;
	cl_mem data_in_imag = NULL;
	cl_mem data_out_real = NULL;
	cl_mem data_out_imag = NULL;
	
	if(dataFormat == clFFT_SplitComplexFormat) {
		data_i_split.real     = (float *) malloc(sizeof(float) * length);
		data_i_split.imag     = (float *) malloc(sizeof(float) * length);
		data_cl_split.real    = (float *) malloc(sizeof(float) * length);
		data_cl_split.imag    = (float *) malloc(sizeof(float) * length);
		if(!data_i_split.real || !data_i_split.imag || !data_cl_split.real || !data_cl_split.imag)
		{
			err = -1;
			log_error("Out-of-Resources\n");
			goto cleanup;
		}
	}
	else {
		data_i  = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length);
		data_cl = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length);
		if(!data_i || !data_cl)
		{
			err = -2;
			log_error("Out-of-Resouces\n");
			goto cleanup;
		}
	}
	
	data_iref.real   = (double *) malloc(sizeof(double) * length);
	data_iref.imag   = (double *) malloc(sizeof(double) * length);
	data_oref.real   = (double *) malloc(sizeof(double) * length);
	data_oref.imag   = (double *) malloc(sizeof(double) * length);	
	if(!data_iref.real || !data_iref.imag || !data_oref.real || !data_oref.imag)
	{
		err = -3;
		log_error("Out-of-Resources\n");
		goto cleanup;
	}

	int i;
	if(dataFormat == clFFT_SplitComplexFormat) {
		for(i = 0; i < length; i++)
		{
			data_i_split.real[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f;
			data_i_split.imag[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f;
			data_cl_split.real[i] = 0.0f;
			data_cl_split.imag[i] = 0.0f;			
			data_iref.real[i] = data_i_split.real[i];
			data_iref.imag[i] = data_i_split.imag[i];
			data_oref.real[i] = data_iref.real[i];
			data_oref.imag[i] = data_iref.imag[i];	
		}
	}
	else {
		for(i = 0; i < length; i++)
		{
			data_i[i].real = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f;
			data_i[i].imag = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f;
			data_cl[i].real = 0.0f;
			data_cl[i].imag = 0.0f;			
			data_iref.real[i] = data_i[i].real;
			data_iref.imag[i] = data_i[i].imag;
			data_oref.real[i] = data_iref.real[i];
			data_oref.imag[i] = data_iref.imag[i];	
		}		
	}
	
	plan = clFFT_CreatePlan( context, n, dim, dataFormat, &err );
	if(!plan || err) 
	{
		log_error("clFFT_CreatePlan failed\n");
		goto cleanup;
	}
	
	//clFFT_DumpPlan(plan, stdout);
	
	if(dataFormat == clFFT_SplitComplexFormat)
	{
		data_in_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.real, &err);
	    if(!data_in_real || err) 
	    {
			log_error("clCreateBuffer failed\n");
			goto cleanup;
	    }
		
		data_in_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.imag, &err);
	    if(!data_in_imag || err) 
	    {
			log_error("clCreateBuffer failed\n");
			goto cleanup;
	    }
		
		if(testType == clFFT_OUT_OF_PLACE)
		{
			data_out_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.real, &err);
			if(!data_out_real || err) 
			{
				log_error("clCreateBuffer failed\n");
				goto cleanup;
			}
			
			data_out_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.imag, &err);
			if(!data_out_imag || err) 
			{
				log_error("clCreateBuffer failed\n");
				goto cleanup;
			}			
		}
		else
		{
			data_out_real = data_in_real;
			data_out_imag = data_in_imag;
		}
	}
	else
	{
	    data_in = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_i, &err);
	    if(!data_in) 
	    {
			log_error("clCreateBuffer failed\n");
			goto cleanup;
	    }
		if(testType == clFFT_OUT_OF_PLACE)
		{
			data_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_cl, &err);
			if(!data_out) 
			{
				log_error("clCreateBuffer failed\n");
				goto cleanup;
			}			
		}
		else
			data_out = data_in;
	}
		
			
	err = CL_SUCCESS;
	
	t0 = mach_absolute_time();
	if(dataFormat == clFFT_SplitComplexFormat)
	{
		for(iter = 0; iter < numIter; iter++)
		    err |= clFFT_ExecutePlannar(queue, plan, batchSize, dir, data_in_real, data_in_imag, data_out_real, data_out_imag, 0, NULL, NULL);
	}
	else
	{
	    for(iter = 0; iter < numIter; iter++) 
			err |= clFFT_ExecuteInterleaved(queue, plan, batchSize, dir, data_in, data_out, 0, NULL, NULL);
	}
	
	err |= clFinish(queue);
	
	if(err) 
	{
		log_error("clFFT_Execute\n");
		goto cleanup;	
	}
	
	t1 = mach_absolute_time(); 
	t = subtractTimes(t1, t0);
	char temp[100];
	sprintf(temp, "GFlops achieved for n = (%d, %d, %d), batchsize = %d", n.x, n.y, n.z, batchSize);
	log_perf(gflops / (float) t, 1, "GFlops/s", "%s", temp);

	if(dataFormat == clFFT_SplitComplexFormat)
	{	
		err |= clEnqueueReadBuffer(queue, data_out_real, CL_TRUE, 0, length*sizeof(float), data_cl_split.real, 0, NULL, NULL);
		err |= clEnqueueReadBuffer(queue, data_out_imag, CL_TRUE, 0, length*sizeof(float), data_cl_split.imag, 0, NULL, NULL);
	}
	else
	{
		err |= clEnqueueReadBuffer(queue, data_out, CL_TRUE, 0, length*sizeof(float)*2, data_cl, 0, NULL, NULL);
	}
	
	if(err) 
	{
		log_error("clEnqueueReadBuffer failed\n");
        goto cleanup;
	}	

	computeReferenceD(&data_oref, n, batchSize, dim, dir);
	
	double diff_avg, diff_max, diff_min;
	if(dataFormat == clFFT_SplitComplexFormat) {
		diff_avg = computeL2Error(&data_cl_split, &data_oref, n.x*n.y*n.z, batchSize, &diff_max, &diff_min);
		if(diff_avg > eps_avg)
			log_error("Test failed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min);
		else
			log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min);			
	}
	else {
		clFFT_SplitComplex result_split;
		result_split.real = (float *) malloc(length*sizeof(float));
		result_split.imag = (float *) malloc(length*sizeof(float));
		convertInterleavedToSplit(&result_split, data_cl, length);
		diff_avg = computeL2Error(&result_split, &data_oref, n.x*n.y*n.z, batchSize, &diff_max, &diff_min);
		
		if(diff_avg > eps_avg)
			log_error("Test failed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min);
		else
			log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min);	
		free(result_split.real);
		free(result_split.imag);
	}
	
cleanup:
	clFFT_DestroyPlan(plan);	
	if(dataFormat == clFFT_SplitComplexFormat) 
	{
		if(data_i_split.real)
			free(data_i_split.real);
		if(data_i_split.imag)
			free(data_i_split.imag);
		if(data_cl_split.real)
			free(data_cl_split.real);
		if(data_cl_split.imag)
			free(data_cl_split.imag);
		
		if(data_in_real)
			clReleaseMemObject(data_in_real);
		if(data_in_imag)
			clReleaseMemObject(data_in_imag);
		if(data_out_real && testType == clFFT_OUT_OF_PLACE)
			clReleaseMemObject(data_out_real);
		if(data_out_imag && clFFT_OUT_OF_PLACE)
			clReleaseMemObject(data_out_imag);
	}
	else 
	{
		if(data_i)
			free(data_i);
		if(data_cl)
			free(data_cl);
		
		if(data_in)
			clReleaseMemObject(data_in);
		if(data_out && testType == clFFT_OUT_OF_PLACE)
			clReleaseMemObject(data_out);
	}
	
	if(data_iref.real)
		free(data_iref.real);
	if(data_iref.imag)
		free(data_iref.imag);		
	if(data_oref.real)
		free(data_oref.real);
	if(data_oref.imag)
		free(data_oref.imag);
	
	return err;
}
void Convolutioner_FrequencyDomain_OpenCL::process(AudioInOutBuffers<float_type>& audio ) {
    
    //
    unsigned int _2B = audio.channelLength_ * 2;
    unsigned int _B  = audio.channelLength_;
    unsigned int _C  = audio.numOfChannels_;                        //numOfChannels
    unsigned int _P  = partitionedIR_.get_numOfPartsPerChannel();   //numOfIRPartsPerChannel
    //.
    
    //_ if >>>latency<<< or >>>number of channels<<< changed:
    //      set partitionedIR
    //      recreate buffers
    //      recreate fft plans
    if ( window_.get_inputBlockSize() != audio.channelLength_ || window_.get_numOfChannels() != audio.numOfChannels_) {
        
        //Setting partitionedIR
        if (window_.get_inputBlockSize() != audio.channelLength_) {
            
            partitionedIR_.setNewIRF( irf_, audio.channelLength_ );
            _P = partitionedIR_.get_numOfPartsPerChannel();
            
            //Recreate, initialize buffers, and set as kernel arguments: PIR
            //recreate
            bufferPIR_R_.recreate(CL_MEM_READ_ONLY, _2B * _C * _P);
            bufferPIR_I_.recreate(CL_MEM_READ_ONLY, _2B * _C * _P);
            //.
            
            //initialize
            bufferPIR_R_.set(partitionedIR_.real_     );
            bufferPIR_I_.set(partitionedIR_.imaginary_);
            //.
            
            //set as kernel argument
            bufferPIR_R_.setAsKernelArgument(0, complexMultiplyAdd_kernel_);
            bufferPIR_I_.setAsKernelArgument(1, complexMultiplyAdd_kernel_);
            //.
            //.(Recreate...)
            
        }
        //.
        
        //Recreate initialize buffers, and set as kernel arguments: transform, FDL, accumulator
        //recreate
        /****/bufferTransform_R_.recreate(CL_MEM_READ_WRITE,    _2B * _C        );
        /****/bufferTransform_I_.recreate(CL_MEM_READ_WRITE,    _2B * _C        );
        /**********/bufferFDL_R_.recreate(CL_MEM_READ_WRITE,    _2B * _C * _P   );
        /**********/bufferFDL_I_.recreate(CL_MEM_READ_WRITE,    _2B * _C * _P   );
        /**/bufferAccumulator_R_.recreate(CL_MEM_READ_WRITE,    _2B * _C        );
        /**/bufferAccumulator_I_.recreate(CL_MEM_READ_WRITE,    _2B * _C        );
        cpu_bufferAccumulator_R_ = new float_type[_2B * _C ];
        cpu_bufferAccumulator_I_ = new float_type[_2B * _C ];
        //.
        
        //initialize FDL with 0
        bufferFDL_R_.fillWithZero();
        bufferFDL_I_.fillWithZero();
        lastInsertedDelayLineIdx = 0;
        //.
        
        //set as kernel argument
        /**********/bufferFDL_R_.setAsKernelArgument(2, complexMultiplyAdd_kernel_);
        /**********/bufferFDL_I_.setAsKernelArgument(3, complexMultiplyAdd_kernel_);
        /**/bufferAccumulator_R_.setAsKernelArgument(4, complexMultiplyAdd_kernel_);
        /**/bufferAccumulator_I_.setAsKernelArgument(5, complexMultiplyAdd_kernel_);
        //.
        //.(Recreate...)
        
        //Recreate plans
        clFFT_Dim3 dim;
        dim.x = _2B;
        dim.y = 1;
        dim.z = 1;
        fftPlan_ = clFFT_CreatePlan(context_, dim, clFFT_1D, clFFT_SplitComplexFormat, &lastCommandStatus_);
        //.
    }
    
    //update each time bufferGlobalParameters because of incrementing of lastInsertedDelayLineIdx
    /*(_2B, _C, _P, pir_C, FDL_LINE)*/
    cpuData_bufferGlobalParameters_[0] = _2B;
    cpuData_bufferGlobalParameters_[1] = _C;
    cpuData_bufferGlobalParameters_[2] = _P;
    cpuData_bufferGlobalParameters_[3] = irf_->numOfChannels_;
    cpuData_bufferGlobalParameters_[4] = lastInsertedDelayLineIdx;
    
    bufferGlobalParameters_.set(cpuData_bufferGlobalParameters_);
    //.
    
    //Update channelsWindow
    window_.update( audio, /*history size*/ _B );
    //.
    
    //Init >>bufferTransform<<
    bufferTransform_R_.set(window_.buffer_.data_);
    for(unsigned int i = 0; i < _2B * _C; ++i)
        cpu_bufferAccumulator_I_[i]=0;
    bufferTransform_I_.set(cpu_bufferAccumulator_I_);
    //.
    
    //Make fft of bufferTransform
    lastCommandStatus_ = clFFT_ExecutePlannar(  cmdQueue_, fftPlan_, _C, clFFT_Forward,
                                              bufferTransform_R_, bufferTransform_I_,
                                              bufferTransform_R_, bufferTransform_I_,
                                              0, NULL, NULL );
    //.
    
    //Copy bufferTransform into bufferFDL (inserting new delay line) (real and imaginary part)
    clEnqueueCopyBuffer(    cmdQueue_, bufferTransform_R_, bufferFDL_R_,
                        0, lastInsertedDelayLineIdx * (_2B * _C ) * sizeof(float_type), (_2B * _C ) * sizeof(float_type),
                        0, NULL, NULL);
    clEnqueueCopyBuffer(    cmdQueue_, bufferTransform_I_, bufferFDL_I_,
                        0, lastInsertedDelayLineIdx * (_2B * _C ) * sizeof(float_type), (_2B * _C ) * sizeof(float_type),
                        0, NULL, NULL);
    //.
    
    //Increment host lastInsertedDelayLine
    lastInsertedDelayLineIdx = (lastInsertedDelayLineIdx + 1 ) % _P;
    //.
    
    //Execute kernel
    size_t globalWorkSize[1];
    globalWorkSize[0] =  _2B * _C /* == window_.get_allLength() */;
    lastCommandStatus_ = clEnqueueNDRangeKernel(cmdQueue_, complexMultiplyAdd_kernel_, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
    if(lastCommandStatus_ == -4) {
        std::cout << "Too much amount of memory must be allocated on the GPU due to lenght of impulse response and number of channels.";
        throw int();
    }
    else if(lastCommandStatus_ != 0) {
        std::cout << "Error while sending clEnqueueNDRangeKernel.";
        throw int();
    }
    //.
    
    //ifft of bufferAccumulator
    lastCommandStatus_ = clFFT_ExecutePlannar(  cmdQueue_, fftPlan_, _C, clFFT_Inverse,
                                              bufferAccumulator_R_, bufferAccumulator_I_,
                                              bufferAccumulator_R_, bufferAccumulator_I_,
                                              0, NULL, NULL );
    //.
    
    //Copy from bufferAccumulator to cpu
    bufferAccumulator_R_.get(cpu_bufferAccumulator_R_);
    //.
    
    //Flushing and finishing
    clFlush(cmdQueue_);
    clFinish(cmdQueue_);
    //.
    
    //Write fftw vector form to audio.outputChannel[number of Channel]
    for (unsigned int channNum = 0; channNum < _C; ++channNum)
        for (unsigned sampleNum = 0; sampleNum < _B; ++sampleNum)
            audio.out_[channNum][sampleNum] = (cpu_bufferAccumulator_R_[channNum*_2B + _B + sampleNum])/_2B;
    //.
}