Beispiel #1
0
void FftClFft::
        compute( Tfr::ChunkData::Ptr input, Tfr::ChunkData::Ptr output, FftDirection direction )
{
    TIME_STFT TaskTimer tt("Fft ClFft");

    unsigned n = input->getNumberOfElements().width;
    unsigned N = output->getNumberOfElements().width;

    if (-1 != direction)
        EXCEPTION_ASSERT( n == N );

    {
        TIME_STFT TaskTimer tt("Computing fft(N=%u, n=%u, direction=%d)", N, n, direction);
        OpenCLContext *opencl = &OpenCLContext::Singleton();
        cl_int fft_error;

        clFFT_Plan plan = CLFFTKernelBuffer::Singleton().getPlan(opencl->getContext(), n, fft_error);
        if (fft_error != CL_SUCCESS)
            throw std::runtime_error("Could not create clFFT compute plan.");

        // Run the fft in OpenCL :)
        // fft kernel needs to have read/write access to output data
        fft_error |= clFFT_ExecuteInterleaved(
                opencl->getCommandQueue(),
                plan, 1, (clFFT_Direction)direction,
                OpenClMemoryStorage::ReadOnly<1>( input ).ptr(),
                OpenClMemoryStorage::ReadWrite<1>( output ).ptr(),
                0, NULL, NULL );

        if (fft_error != CL_SUCCESS)
            throw std::runtime_error("Bad stuff happened during FFT computation.");
    }
}
Beispiel #2
0
void FftClFft::
        compute( Tfr::ChunkData::Ptr input, Tfr::ChunkData::Ptr output, DataStorageSize n, FftDirection direction )
{
    TaskTimer tt("Stft::computeWithClFft( matrix[%d, %d], %s )",
                 input->size().width,
                 input->size().height,
                 direction==FftDirection_Forward?"forward":"backward");

    EXCEPTION_ASSERT( output->numberOfBytes() == input->numberOfBytes() );

    const int batchSize = n.height;

    OpenCLContext *opencl = &OpenCLContext::Singleton();
    cl_int fft_error;

    clFFT_Plan plan = CLFFTKernelBuffer::Singleton().getPlan(opencl->getContext(), n.width, fft_error);
    if(fft_error != CL_SUCCESS)
        throw std::runtime_error("Could not create clFFT compute plan.");

    {
        TaskTimer tt("Calculating batches");

        // Run the fft in OpenCL :)
        fft_error |= clFFT_ExecuteInterleaved(
                opencl->getCommandQueue(),
                plan, batchSize, direction==FftDirection_Forward?clFFT_Forward:clFFT_Inverse,
                OpenClMemoryStorage::ReadOnly<1>( input ).ptr(),
                OpenClMemoryStorage::ReadWrite<1>( output ).ptr(),
                0, NULL, NULL );
        if(fft_error != CL_SUCCESS)
            throw std::runtime_error("Bad stuff happened during FFT computation.");
    }
}
Beispiel #3
0
void dfi_process_sinogram(const char* tiff_input, const char* tiff_output, int center_rotation)
{
  cl_event events[11];

  if(!tiff_input) {
    printf("The filename of input is not valid. (pointer tiff_input = %p)", tiff_input);
    return;
  }

  if(!tiff_output) {
    printf("The filename of output is not valid. (pointer tiff_output = %p)", tiff_output);
    return;
  }

  /////////////////////
  /* Input Data Part */
  /////////////////////

  /* Input a slice properties */
  int bits_per_sample;
  int samples_per_pixel;
  int theta_size;
  int slice_size;

  /* Read the slice */
  clFFT_Complex *data_tiff = tiff_read_complex(tiff_input,
						center_rotation,
						&bits_per_sample,
						&samples_per_pixel,
						&slice_size,
						&theta_size);
  
  //tiff_write_complex("resources/initial-sino.tif", data_tiff, slice_size, theta_size);

  /*
   * OpenCL
   */
  printf("Hey!1\n");
  cl_int status = CL_SUCCESS;
  cl_platform_id platform;

  printf("Hey!1.2\n");
  CL_CHECK_ERROR(clGetPlatformIDs(1, &platform, NULL));

  printf("Hey!2\n");
  cl_device_id devices[10];    // Compute device
  cl_context context;     // Compute context
  cl_uint n_devices = 0;

  printf("@Hey!3\n");
#if GPU
  printf("@Hey!GPU Choosed\n");
  CL_CHECK_ERROR(clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 3, devices, &n_devices));
#else
  printf("@Hey!CPU Choosed\n");
  CL_CHECK_ERROR(clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 3, devices, &n_devices));
#endif

  //cl_device_id current_device = devices[0];
#define current_device devices[0]
  printf("Hey!3.1  n_devices %d\n", n_devices);
  context = clCreateContext(NULL, 1, devices, NULL, NULL, &status);
  printf("Hey!3.2\n");
  CL_CHECK_ERROR(status);

	/*
	 * Device
	 */
	printf("Hey!3.3\n");

	cl_int device_max_cu = 0;
	CL_CHECK_ERROR(clGetDeviceInfo(current_device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &device_max_cu, NULL));
	size_t wg_count = device_max_cu * wavefronts_per_SIMD;
    size_t global_work_size = wg_count * local_work_size;
    printf("Hey!3.4\n");

    /*
     * Queues, Kernels
     */
    cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE;
    cl_command_queue command_queue = clCreateCommandQueue(context, current_device, properties, &status);
    CL_CHECK_ERROR(status);

    printf("Hey!3.5\n");
    cl_kernel kernel_linear_interp = get_kernel("linear_interp", &context, &current_device);
    cl_kernel kernel_zero_ifftshift = get_kernel("zero_ifftshift", &context, &current_device);
    cl_kernel kernel_fftshift = get_kernel("fftshift", &context, &current_device);
    cl_kernel kernel_2dshift = get_kernel("shift2d", &context, &current_device);
    cl_kernel kernel_crop_data = get_kernel("crop_data", &context, &current_device);

    printf("@Hey!3.6\n\n");

	////////////////////////
	/* OpenCL - DFI Part */
	////////////////////////

	/* Reconstruction properties */
	int oversampling_ratio = 2;
	int dx = 1; /* zoom times */
	//int size_s = slice_size * oversampling_ratio;

	int min_theta = 0;
	int max_theta = theta_size - 1;

	int size_zeropad_s = pow(2, ceil(log2((float)slice_size))); /* get length of FFT operations */
	int size_s = size_zeropad_s;

	float d_omega_s = 2 * M_PI / (size_zeropad_s * dx); //normalized ratio [0; 2PI]

  /* Start timer */
  timeval global_tim;
  gettimeofday(&global_tim, NULL);
  double t1_global = global_tim.tv_sec + (global_tim.tv_usec/1000000.0), t2_global = 0.0;

	/////////////////////////////////////
	/* Sinogram shifting + Zeropadding */
	/////////////////////////////////////
	long data_size = slice_size * theta_size * sizeof(clFFT_Complex);
	printf("6 ");
	long zeropad_data_size = theta_size * size_zeropad_s * sizeof(clFFT_Complex);

	/* Buffers */
	cl_mem original_data_buffer = clCreateBuffer(context,
						     CL_MEM_READ_WRITE,
						     data_size,
						     NULL,
						     &status);
	CL_CHECK_ERROR(status);

	CL_CHECK_ERROR(clEnqueueWriteBuffer(command_queue,
					    original_data_buffer,
					    CL_FALSE,
					    0,
					    data_size,
					    data_tiff,
					    0,
					    NULL,
					    &events[0]));

    	cl_mem zeropad_ifftshift_data_buffer = clCreateBuffer(context,
							  CL_MEM_READ_WRITE,
							  zeropad_data_size,
							  NULL,
							  &status);
    CL_CHECK_ERROR(status);

    float *zero_out = (float *)g_malloc0(zeropad_data_size);
    CL_CHECK_ERROR(clEnqueueWriteBuffer(command_queue,
					zeropad_ifftshift_data_buffer,
					CL_FALSE,
					0,
					zeropad_data_size,
					zero_out,
					0,
					NULL,
					&events[1]));

    /* Set arguments */
    CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 0, sizeof(void *), (void *)&original_data_buffer));
    CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 1, sizeof(theta_size), &theta_size));
    CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 2, sizeof(slice_size), &slice_size));

    CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 3, sizeof(void *), (void *)&zeropad_ifftshift_data_buffer));
    CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 4, sizeof(theta_size), &theta_size));
    CL_CHECK_ERROR(clSetKernelArg(kernel_zero_ifftshift, 5, sizeof(size_zeropad_s), &size_zeropad_s));

    /* Run kernel */
    status = clEnqueueNDRangeKernel(command_queue,
				    kernel_zero_ifftshift,
				    1, // work dimensional 1D, 2D, 3D
				    NULL, // offset
				    &global_work_size, // total number of WI
				    &local_work_size, // number of WI in WG
				    2, // number events in wait list
				    events,  // event wait list
				    &events[2]); // event
    CL_CHECK_ERROR(status);

    // Copy result from device to host
    /*
    clFFT_Complex *fur_kernel_sino = (clFFT_Complex *)clEnqueueMapBuffer(command_queue,
    									     	 	 	 	 	 	 	 zeropad_ifftshift_data_buffer,
    									     	 	 	 	 	 	 	 CL_TRUE,
    									     	 	 	 	 	 	 	 CL_MAP_READ,
    									     	 	 	 	 	 	 	 0,
    									     	 	 	 	 	 	 	 zeropad_data_size,
                                                                 	     0, NULL, NULL, NULL );

    clFinish(command_queue);
    tiff_write_complex("resources/zeropad-sino.tif", fur_kernel_sino, size_zeropad_s, theta_size);
    */

    ////////////////////////////////////////////////////////////////////////
    /* Applying 1-D FFT to the each strip of the sinogram and shifting it */
    ////////////////////////////////////////////////////////////////////////

    cl_mem zeropadded_1dfft_data_buffer = clCreateBuffer(context,
							CL_MEM_READ_WRITE,
							zeropad_data_size,
							NULL,
							&status);
    CL_CHECK_ERROR(status);

    /* Setup clAmdFft */
    clFFT_Dim3 sino_fft;
    sino_fft.x = size_zeropad_s;
    sino_fft.y = 1;
    sino_fft.z = 1;

    /* Create FFT plan */
    clFFT_Plan plan_1dfft_sinogram = clFFT_CreatePlan(context,
						      sino_fft,
						      clFFT_1D,
						      clFFT_InterleavedComplexFormat,
						      &status);
    CL_CHECK_ERROR(status);

    /* Execute FFT */
    status = clFFT_ExecuteInterleaved(command_queue,
				      plan_1dfft_sinogram,
				      theta_size,
				      clFFT_Forward,
				      zeropad_ifftshift_data_buffer,
				      zeropadded_1dfft_data_buffer,
				      0,
				      NULL,
				      NULL);
    CL_CHECK_ERROR(status);
    // Free FFT plan
    //clFFT_DestroyPlan(plan_1dfft_sinogram);

    // Copy result from device to host
    /*
    clFFT_Complex *fourier_kernel_sinogram = (clFFT_Complex *)malloc(zeropad_data_size);
    clEnqueueReadBuffer(command_queue, zeropadded_1dfft_data_buffer, CL_TRUE, 0, zeropad_data_size, fourier_kernel_sinogram, 0, NULL, NULL);
    clFinish(command_queue);

	tiff_write_complex("resources/1dfft-sino.tif", fourier_kernel_sinogram, size_zeropad_s, theta_size);
    */

    ///////////////////
    /* Make fftshift */
    ///////////////////

    /* Buffers */
    cl_mem zeropad_fftshift_data_buffer = clCreateBuffer(context,
							CL_MEM_READ_WRITE,
							zeropad_data_size,
							NULL,
							&status);
    CL_CHECK_ERROR(status);

    /* Set arguments */
    CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 0, sizeof(void *), (void *)&zeropadded_1dfft_data_buffer));
    CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 1, sizeof(theta_size), &theta_size));
    CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 2, sizeof(size_zeropad_s), &size_zeropad_s));

    CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 3, sizeof(void *), (void *)&zeropad_fftshift_data_buffer));
    CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 4, sizeof(theta_size), &theta_size));
    CL_CHECK_ERROR(clSetKernelArg(kernel_fftshift, 5, sizeof(size_zeropad_s), &size_zeropad_s));

    /* Run kernel */
    status = clEnqueueNDRangeKernel(command_queue,
				    kernel_fftshift,
				    1, // work dimensional 1D, 2D, 3D
				    NULL, // offset
				    &global_work_size, // total number of WI
				    &local_work_size, // number of WI in WG
				    0, // number events in wait list
				    NULL,  // event wait list
				    &events[3]); // event
    CL_CHECK_ERROR(status);

    /* Copy result from device to host */
    /*
    clFFT_Complex *fur_kernel_fftshift_sino = (clFFT_Complex *)clEnqueueMapBuffer(command_queue,
										      	  zeropad_fftshift_data_buffer,
										      	  CL_TRUE,
      									   	      CL_MAP_READ,0,zeropad_data_size,
      									   	      0, NULL, NULL, NULL );

    clFinish(command_queue);
    tiff_write_complex("resources/fftshift-sino.tif", fur_kernel_fftshift_sino, size_zeropad_s, theta_size);
    */

     ////////////////////////
     /* Data Interpolation */
     ////////////////////////

    /* Performing Interpolation */
    cl_long data_length = size_s * size_s;
    cl_int in_rows = theta_size;
    cl_int in_cols = size_zeropad_s;

    cl_float norm_ratio = d_omega_s/dx;

    cl_float in_rows_first_val = min_theta;
    cl_float in_rows_last_val = max_theta;

    cl_float in_cols_first_val = (-in_cols/2)*norm_ratio;
    cl_float in_cols_last_val = (in_cols/2-1)*norm_ratio;

    cl_int interp_rows = size_s;
    cl_int interp_cols = interp_rows;

    cl_int iparams[5];
    iparams[0] = in_rows;
    iparams[1] = in_cols;
    iparams[2] = dx;
    iparams[3] = interp_rows;
    iparams[4] = interp_cols;

    cl_float fparams[5];
    fparams[0] = in_rows_first_val;
    fparams[1] = in_rows_last_val;
    fparams[2] = in_cols_first_val;
    fparams[3] = in_cols_last_val;
    fparams[4] = norm_ratio;

    /* Buffers */
    cl_mem i_buffer = clCreateBuffer(context,
                                     CL_MEM_READ_ONLY,
                                     sizeof(cl_int) * 5,
                                     NULL,
                                     &status);
    CL_CHECK_ERROR(status);

    CL_CHECK_ERROR(clEnqueueWriteBuffer(command_queue,
					i_buffer,
					CL_FALSE,
					0,
					sizeof(cl_int) * 5,
					iparams,
					0,
					NULL,
					&events[4]));

    cl_mem f_buffer = clCreateBuffer(context,
                                     CL_MEM_READ_ONLY,
                                     sizeof(cl_float) * 5,
                                     NULL,
                                     &status);
    CL_CHECK_ERROR(status);

    CL_CHECK_ERROR(clEnqueueWriteBuffer(command_queue,
					f_buffer,
					CL_FALSE,
					0,
					sizeof(cl_float) * 5,
					fparams,
					0,
					NULL,
					&events[5]));

    cl_mem output_buffer = clCreateBuffer(context,
					  CL_MEM_READ_WRITE,
					  data_length * sizeof(clFFT_Complex),
					  NULL,
					  &status);
    CL_CHECK_ERROR(status);

    /* Set arguments */
    CL_CHECK_ERROR(clSetKernelArg(kernel_linear_interp, 0, sizeof(void *), (void *)&i_buffer));
    CL_CHECK_ERROR(clSetKernelArg(kernel_linear_interp, 1, sizeof(void *), (void *)&f_buffer));
    CL_CHECK_ERROR(clSetKernelArg(kernel_linear_interp, 2, sizeof(void *), (void *)&zeropad_fftshift_data_buffer));
    CL_CHECK_ERROR(clSetKernelArg(kernel_linear_interp, 3, sizeof(void *), (void *)&output_buffer));
    CL_CHECK_ERROR(clSetKernelArg(kernel_linear_interp, 4, sizeof(data_length), &data_length));

    /* Run kernel */
    status = clEnqueueNDRangeKernel(command_queue,
				    kernel_linear_interp,
				    1, // work dimensional 1D, 2D, 3D
				    NULL, // offset
				    &global_work_size, // total number of WI
				    &local_work_size, // nomber of WI in WG
				    3, // num events in wait list
				    events + 3,  // event wait list
				    &events[6]); // event
    CL_CHECK_ERROR(status);
    //clFinish(command_queue);

    // Copy result from device to host
    /*
    clFFT_Complex *interpolated_spectrum = (clFFT_Complex *)clEnqueueMapBuffer(command_queue,
                                                                  	  	   output_buffer,
                                                                  	  	   CL_TRUE,
                                                                  	  	   CL_MAP_READ,
                                                                 	  	   0,
                                                                  	  	   data_length * sizeof(clFFT_Complex),
                                                                  	  	   0, NULL, NULL, NULL );

    clFinish(command_queue);
    tiff_write_complex("resources/interpolated-sino.tif", interpolated_spectrum, size_s, size_s);
    */

    ///////////////////////////////////////////////////
    /* Applying 2-D FFT to the interpolated spectrum */
    ///////////////////////////////////////////////////

    /* Setup 2D IFFT */
    clFFT_Dim3 sino_2dfft;
    sino_2dfft.x = size_s;
    sino_2dfft.y = size_s;
    sino_2dfft.z = 1;

    /* Create 2D IFFT plan */
    clFFT_Plan plan_2difft = clFFT_CreatePlan(context,
					      sino_2dfft,
					      clFFT_2D,
					      clFFT_InterleavedComplexFormat,
					      &status);
    CL_CHECK_ERROR(status);

    /* Execute 2D IFFT */
    cl_mem reconstructed_image_buffer = clCreateBuffer(context,
						       CL_MEM_READ_WRITE,
						       data_length * sizeof(clFFT_Complex),
						       NULL,
						       &status);
    CL_CHECK_ERROR(status);

    status = clFFT_ExecuteInterleaved(command_queue,
				      plan_2difft,
				      1,
				      clFFT_Inverse,
				      output_buffer,
				      reconstructed_image_buffer,
				      0,
				      NULL,
				      NULL);
    CL_CHECK_ERROR(status);


    // Copy result from device to host
    /*
    clFFT_Complex *ifft2d_interpolated_spectrum = (clFFT_Complex *)malloc(data_length * sizeof(clFFT_Complex));
    clEnqueueReadBuffer(command_queue,
    					reconstructed_image_buffer,
    					CL_TRUE,
    					0,
    					data_length * sizeof(clFFT_Complex),
    					ifft2d_interpolated_spectrum,
    					0,
    					NULL,
    					NULL);
    tiff_write_complex("resources/ifft2d_interpolated_spectrum.tif", ifft2d_interpolated_spectrum, size_s, size_s);
    clFinish(command_queue);
    */

    /////////////////////////////////////////////////
    /* Applying 2-D fftshidt to the restored image */
    /////////////////////////////////////////////////

    /* Buffers */
    cl_mem two_dim_fftshifted_data_buffer = clCreateBuffer(context,
							   CL_MEM_READ_WRITE,
							   data_length * sizeof(clFFT_Complex),
							   NULL,
							   &status);
    CL_CHECK_ERROR(status);

    /* Set arguments */
    cl_int inverse_flag = 0;

    CL_CHECK_ERROR(clSetKernelArg(kernel_2dshift, 0, sizeof(void *), (void *)&reconstructed_image_buffer));
    CL_CHECK_ERROR(clSetKernelArg(kernel_2dshift, 1, sizeof(void *), (void *)&two_dim_fftshifted_data_buffer));
    CL_CHECK_ERROR(clSetKernelArg(kernel_2dshift, 2, sizeof(interp_rows), &interp_rows));
    CL_CHECK_ERROR(clSetKernelArg(kernel_2dshift, 3, sizeof(interp_cols), &interp_cols));
    CL_CHECK_ERROR(clSetKernelArg(kernel_2dshift, 4, sizeof(inverse_flag), &inverse_flag));

    /* Run kernel */
    status = clEnqueueNDRangeKernel(command_queue,
				    kernel_2dshift,
				    1, // work dimensional 1D, 2D, 3D
				    NULL, // offset
				    &global_work_size, // total number of WI
				    &local_work_size, // number of WI in WG
				    1, // number events in wait list
				    &events[6],  // event wait list
				    &events[7]); // event
    CL_CHECK_ERROR(status);

    /* Copy result from device to host */
    /*
    clFFT_Complex *two_dim_fftshifted_data = (clFFT_Complex *)clEnqueueMapBuffer(command_queue,
										     	 two_dim_fftshifted_data_buffer,
										     	 CL_TRUE,
										     	 CL_MAP_READ,
										     	 0,
										     	 data_length * sizeof(clFFT_Complex),
										     	 0, NULL, NULL, NULL );


    clFinish(command_queue);
    */

    ////////////////
    /* Crop data  */
    ///////////////
    float lt_offset = 0, rb_offset = 0;

    int dif_sides = interp_cols - slice_size;
    if (dif_sides%2) {
       lt_offset = floor(dif_sides / 2.0);
       rb_offset = ceil(dif_sides / 2.0);
    }
    else {
       lt_offset = rb_offset = dif_sides / 2.0;
    }

    /* Buffers */
    long cropped_data_length = slice_size * slice_size * sizeof(clFFT_Complex);
    cl_mem cropped_restored_image_data_buffer = 
	      clCreateBuffer(context,
			     CL_MEM_READ_WRITE,
			     cropped_data_length,
			     NULL,
			     &status);
    CL_CHECK_ERROR(status);

    /* Set arguments */
    CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 0, sizeof(void *), (void *)&two_dim_fftshifted_data_buffer));
    CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 1, sizeof(void *), (void *)&cropped_restored_image_data_buffer));
    CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 2, sizeof(slice_size), &slice_size));
    CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 3, sizeof(interp_cols), &interp_cols));
    CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 4, sizeof(lt_offset), &lt_offset));
    CL_CHECK_ERROR(clSetKernelArg(kernel_crop_data, 5, sizeof(rb_offset), &rb_offset));

    /* Run kernel */
    status = clEnqueueNDRangeKernel(command_queue,
				    kernel_crop_data,
				    1, // work dimensional 1D, 2D, 3D
				    NULL, // offset
				    &global_work_size, // total number of WI
				    &local_work_size, // number of WI in WG
				    1, // number events in wait list
				    &events[7],  // event wait list
				    &events[8]); // event
    CL_CHECK_ERROR(status);

    CL_CHECK_ERROR(clFinish(command_queue));
    clFFT_DestroyPlan(plan_2difft);
    clFFT_DestroyPlan(plan_1dfft_sinogram);


    //timing
    float ms = 0.0, total_ms = 0.0, global_ms = 0.0, deg = 1.0e-6f;
 
      /* Stop timer */
  gettimeofday(&global_tim, NULL);
  t2_global = global_tim.tv_sec+(global_tim.tv_usec/1000000.0);
  printf("\n(Total time - timeofday) %f seconds elapsed\n", (t2_global-t1_global)*1000.0);


    cl_ulong start, end;

    CL_CHECK_ERROR(clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL));
    CL_CHECK_ERROR(clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));
    ms = (end - start) * deg;
    total_ms += ms;
    printf("\n(Sinogram shifting + Zeropadding  write_op1):%f", ms);

    CL_CHECK_ERROR(clGetEventProfilingInfo(events[1], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL));
    CL_CHECK_ERROR(clGetEventProfilingInfo(events[1], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));
    ms = (end - start) * deg;
    total_ms += ms;
    printf("\n(Sinogram shifting + Zeropadding  write_op2):%f", ms);

    CL_CHECK_ERROR(clGetEventProfilingInfo(events[2], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL));
    CL_CHECK_ERROR(clGetEventProfilingInfo(events[2], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));
    ms = (end - start) * deg;
    total_ms += ms;
    printf("\n(Sinogram shifting + Zeropadding):%f", ms);
    printf("\nTOTAL(Sinogram shifting + Zeropadding):%f\n", total_ms);
    global_ms += total_ms;
    total_ms = 0.0;

    CL_CHECK_ERROR(clGetEventProfilingInfo(events[2], CL_PROFILING_COMMAND_END,sizeof(cl_ulong), &start, NULL));
    CL_CHECK_ERROR(clGetEventProfilingInfo(events[3], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &end, NULL));
    ms = (end - start) * deg;
    total_ms += ms;
    printf("\n(Applying 1-D FFT):%f\n", total_ms);
    global_ms += total_ms;
    total_ms = 0.0;

    CL_CHECK_ERROR(clGetEventProfilingInfo(events[3], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL));
    CL_CHECK_ERROR(clGetEventProfilingInfo(events[3], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));
    ms = (end - start) * deg;
    total_ms += ms;
    printf("\n(Shift 1-D FFT data):%f\n", total_ms);
    global_ms += total_ms;
    total_ms = 0.0;

    CL_CHECK_ERROR(clGetEventProfilingInfo(events[4], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL));
    CL_CHECK_ERROR(clGetEventProfilingInfo(events[4], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));
    ms = (end - start) * deg;
    total_ms += ms;
    printf("\n(Data Interpolation write_op1):%f", ms);

    CL_CHECK_ERROR(clGetEventProfilingInfo(events[5], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL));
    CL_CHECK_ERROR(clGetEventProfilingInfo(events[5], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));
    ms = (end - start) * deg;
    total_ms += ms;
    printf("\n(Data Interpolation write_op2):%f", ms);

    CL_CHECK_ERROR(clGetEventProfilingInfo(events[6], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL));
    CL_CHECK_ERROR(clGetEventProfilingInfo(events[6], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));
    ms = (end - start) * deg;
    total_ms += ms;
    printf("\n(Data Interpolation):%f", ms);
    printf("\nTOTAL(Data Interpolation):%f\n", total_ms);
    global_ms += total_ms;
    total_ms = 0.0;

    CL_CHECK_ERROR(clGetEventProfilingInfo(events[6], CL_PROFILING_COMMAND_END,sizeof(cl_ulong), &start, NULL));
    CL_CHECK_ERROR(clGetEventProfilingInfo(events[7], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &end, NULL));
    ms = (end - start) * deg;
    total_ms += ms;
    printf("\n(Applying 2-D IFFT):%f\n", total_ms);
    global_ms += total_ms;
    total_ms = 0.0;

    CL_CHECK_ERROR(clGetEventProfilingInfo(events[7], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL));
    CL_CHECK_ERROR(clGetEventProfilingInfo(events[7], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));
    ms = (end - start) * deg;
    total_ms += ms;
    printf("\n(Applying 2-D Shift):%f\n", total_ms);
    global_ms += total_ms;
    total_ms = 0.0;

    CL_CHECK_ERROR(clGetEventProfilingInfo(events[8], CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &start, NULL));
    CL_CHECK_ERROR(clGetEventProfilingInfo(events[8], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));
    ms = (end - start) * deg;
    total_ms += ms;
    printf("\n(Cropping data):%f\n", total_ms);
    global_ms += total_ms;
    total_ms = 0.0;

    printf("\nTOTAL TIME:%f\n", global_ms);

    // Copy result from device to host
    clFFT_Complex *cropped_restored_image = 
	(clFFT_Complex *)clEnqueueMapBuffer(command_queue,
					    cropped_restored_image_data_buffer,
					    CL_TRUE,
					    CL_MAP_READ,
					    0,
					    cropped_data_length,
					    0, NULL, NULL, NULL );




    /* Write the restored slice */
    tiff_write_complex(tiff_output, cropped_restored_image, slice_size, slice_size);
}
Beispiel #4
0
int runTest(clFFT_Dim3 n, int batchSize, clFFT_Direction dir, clFFT_Dimension dim, 
			clFFT_DataFormat dataFormat, int numIter, clFFT_TestType testType)
{	
	cl_int err = CL_SUCCESS;
	int iter;
	double t;
	
	uint64_t t0, t1;
	int mx = log2(n.x);
	int my = log2(n.y);
	int mz = log2(n.z);

	int length = n.x * n.y * n.z * batchSize;
		
	double gflops = 5e-9 * ((double)mx + (double)my + (double)mz) * (double)n.x * (double)n.y * (double)n.z * (double)batchSize * (double)numIter;
	
	clFFT_SplitComplex data_i_split = (clFFT_SplitComplex) { NULL, NULL };
	clFFT_SplitComplex data_cl_split = (clFFT_SplitComplex) { NULL, NULL };
	clFFT_Complex *data_i = NULL;
	clFFT_Complex *data_cl = NULL;
	clFFT_SplitComplexDouble data_iref = (clFFT_SplitComplexDouble) { NULL, NULL }; 
	clFFT_SplitComplexDouble data_oref = (clFFT_SplitComplexDouble) { NULL, NULL };
	
	clFFT_Plan plan = NULL;
	cl_mem data_in = NULL;
	cl_mem data_out = NULL;
	cl_mem data_in_real = NULL;
	cl_mem data_in_imag = NULL;
	cl_mem data_out_real = NULL;
	cl_mem data_out_imag = NULL;
	
	if(dataFormat == clFFT_SplitComplexFormat) {
		data_i_split.real     = (float *) malloc(sizeof(float) * length);
		data_i_split.imag     = (float *) malloc(sizeof(float) * length);
		data_cl_split.real    = (float *) malloc(sizeof(float) * length);
		data_cl_split.imag    = (float *) malloc(sizeof(float) * length);
		if(!data_i_split.real || !data_i_split.imag || !data_cl_split.real || !data_cl_split.imag)
		{
			err = -1;
			log_error("Out-of-Resources\n");
			goto cleanup;
		}
	}
	else {
		data_i  = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length);
		data_cl = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length);
		if(!data_i || !data_cl)
		{
			err = -2;
			log_error("Out-of-Resouces\n");
			goto cleanup;
		}
	}
	
	data_iref.real   = (double *) malloc(sizeof(double) * length);
	data_iref.imag   = (double *) malloc(sizeof(double) * length);
	data_oref.real   = (double *) malloc(sizeof(double) * length);
	data_oref.imag   = (double *) malloc(sizeof(double) * length);	
	if(!data_iref.real || !data_iref.imag || !data_oref.real || !data_oref.imag)
	{
		err = -3;
		log_error("Out-of-Resources\n");
		goto cleanup;
	}

	int i;
	if(dataFormat == clFFT_SplitComplexFormat) {
		for(i = 0; i < length; i++)
		{
			data_i_split.real[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f;
			data_i_split.imag[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f;
			data_cl_split.real[i] = 0.0f;
			data_cl_split.imag[i] = 0.0f;			
			data_iref.real[i] = data_i_split.real[i];
			data_iref.imag[i] = data_i_split.imag[i];
			data_oref.real[i] = data_iref.real[i];
			data_oref.imag[i] = data_iref.imag[i];	
		}
	}
	else {
		for(i = 0; i < length; i++)
		{
			data_i[i].real = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f;
			data_i[i].imag = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f;
			data_cl[i].real = 0.0f;
			data_cl[i].imag = 0.0f;			
			data_iref.real[i] = data_i[i].real;
			data_iref.imag[i] = data_i[i].imag;
			data_oref.real[i] = data_iref.real[i];
			data_oref.imag[i] = data_iref.imag[i];	
		}		
	}
	
	plan = clFFT_CreatePlan( context, n, dim, dataFormat, &err );
	if(!plan || err) 
	{
		log_error("clFFT_CreatePlan failed\n");
		goto cleanup;
	}
	
	//clFFT_DumpPlan(plan, stdout);
	
	if(dataFormat == clFFT_SplitComplexFormat)
	{
		data_in_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.real, &err);
	    if(!data_in_real || err) 
	    {
			log_error("clCreateBuffer failed\n");
			goto cleanup;
	    }
		
		data_in_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.imag, &err);
	    if(!data_in_imag || err) 
	    {
			log_error("clCreateBuffer failed\n");
			goto cleanup;
	    }
		
		if(testType == clFFT_OUT_OF_PLACE)
		{
			data_out_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.real, &err);
			if(!data_out_real || err) 
			{
				log_error("clCreateBuffer failed\n");
				goto cleanup;
			}
			
			data_out_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.imag, &err);
			if(!data_out_imag || err) 
			{
				log_error("clCreateBuffer failed\n");
				goto cleanup;
			}			
		}
		else
		{
			data_out_real = data_in_real;
			data_out_imag = data_in_imag;
		}
	}
	else
	{
	    data_in = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_i, &err);
	    if(!data_in) 
	    {
			log_error("clCreateBuffer failed\n");
			goto cleanup;
	    }
		if(testType == clFFT_OUT_OF_PLACE)
		{
			data_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_cl, &err);
			if(!data_out) 
			{
				log_error("clCreateBuffer failed\n");
				goto cleanup;
			}			
		}
		else
			data_out = data_in;
	}
		
			
	err = CL_SUCCESS;
	
	t0 = mach_absolute_time();
	if(dataFormat == clFFT_SplitComplexFormat)
	{
		for(iter = 0; iter < numIter; iter++)
		    err |= clFFT_ExecutePlannar(queue, plan, batchSize, dir, data_in_real, data_in_imag, data_out_real, data_out_imag, 0, NULL, NULL);
	}
	else
	{
	    for(iter = 0; iter < numIter; iter++) 
			err |= clFFT_ExecuteInterleaved(queue, plan, batchSize, dir, data_in, data_out, 0, NULL, NULL);
	}
	
	err |= clFinish(queue);
	
	if(err) 
	{
		log_error("clFFT_Execute\n");
		goto cleanup;	
	}
	
	t1 = mach_absolute_time(); 
	t = subtractTimes(t1, t0);
	char temp[100];
	sprintf(temp, "GFlops achieved for n = (%d, %d, %d), batchsize = %d", n.x, n.y, n.z, batchSize);
	log_perf(gflops / (float) t, 1, "GFlops/s", "%s", temp);

	if(dataFormat == clFFT_SplitComplexFormat)
	{	
		err |= clEnqueueReadBuffer(queue, data_out_real, CL_TRUE, 0, length*sizeof(float), data_cl_split.real, 0, NULL, NULL);
		err |= clEnqueueReadBuffer(queue, data_out_imag, CL_TRUE, 0, length*sizeof(float), data_cl_split.imag, 0, NULL, NULL);
	}
	else
	{
		err |= clEnqueueReadBuffer(queue, data_out, CL_TRUE, 0, length*sizeof(float)*2, data_cl, 0, NULL, NULL);
	}
	
	if(err) 
	{
		log_error("clEnqueueReadBuffer failed\n");
        goto cleanup;
	}	

	computeReferenceD(&data_oref, n, batchSize, dim, dir);
	
	double diff_avg, diff_max, diff_min;
	if(dataFormat == clFFT_SplitComplexFormat) {
		diff_avg = computeL2Error(&data_cl_split, &data_oref, n.x*n.y*n.z, batchSize, &diff_max, &diff_min);
		if(diff_avg > eps_avg)
			log_error("Test failed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min);
		else
			log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min);			
	}
	else {
		clFFT_SplitComplex result_split;
		result_split.real = (float *) malloc(length*sizeof(float));
		result_split.imag = (float *) malloc(length*sizeof(float));
		convertInterleavedToSplit(&result_split, data_cl, length);
		diff_avg = computeL2Error(&result_split, &data_oref, n.x*n.y*n.z, batchSize, &diff_max, &diff_min);
		
		if(diff_avg > eps_avg)
			log_error("Test failed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min);
		else
			log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min);	
		free(result_split.real);
		free(result_split.imag);
	}
	
cleanup:
	clFFT_DestroyPlan(plan);	
	if(dataFormat == clFFT_SplitComplexFormat) 
	{
		if(data_i_split.real)
			free(data_i_split.real);
		if(data_i_split.imag)
			free(data_i_split.imag);
		if(data_cl_split.real)
			free(data_cl_split.real);
		if(data_cl_split.imag)
			free(data_cl_split.imag);
		
		if(data_in_real)
			clReleaseMemObject(data_in_real);
		if(data_in_imag)
			clReleaseMemObject(data_in_imag);
		if(data_out_real && testType == clFFT_OUT_OF_PLACE)
			clReleaseMemObject(data_out_real);
		if(data_out_imag && clFFT_OUT_OF_PLACE)
			clReleaseMemObject(data_out_imag);
	}
	else 
	{
		if(data_i)
			free(data_i);
		if(data_cl)
			free(data_cl);
		
		if(data_in)
			clReleaseMemObject(data_in);
		if(data_out && testType == clFFT_OUT_OF_PLACE)
			clReleaseMemObject(data_out);
	}
	
	if(data_iref.real)
		free(data_iref.real);
	if(data_iref.imag)
		free(data_iref.imag);		
	if(data_oref.real)
		free(data_oref.real);
	if(data_oref.imag)
		free(data_oref.imag);
	
	return err;
}