Example #1
0
  // Split-complex FFT
  // Runs a single split-complex FFT of `length` points through the
  // "zhalfast_f" overlay plugin and blocks until the task has synced.
  //
  //   in_re, in_im    real / imaginary input arrays
  //   out_re, out_im  real / imaginary output arrays
  //   length          number of points in the transform
  //   scale           copied into the parameter block's `scale` field
  //   exponent        -1 selects fwd_fft; any other value selects inv_fft
  //
  // Checked with assert: all four pointers pass is_dma_addr_ok(), and
  // 2 * sizeof(complex<T>) * length fits in VSIP_IMPL_OVERLAY_BUFFER_SIZE.
  void 
  fft(T const* in_re, T const* in_im, T* out_re, T* out_im, 
    length_type length, T scale, int exponent)
  {
    assert(is_dma_addr_ok(in_re));
    assert(is_dma_addr_ok(in_im));
    assert(is_dma_addr_ok(out_re));
    assert(is_dma_addr_ok(out_im));

    // Plugin location and size are kept in function-local statics so
    // load_plugin() is only called while the cached pointer is still null.
    static char* plugin_ea = 0;
    static int   plugin_size;
    if (!plugin_ea)
      load_plugin(plugin_ea, plugin_size, "plugin", "zhalfast_f");

    Overlay_params params;
    Fft_split_params& zp = params.zfft;

    // Plugin identification.
    zp.code_ea   = reinterpret_cast<uintptr_t>(plugin_ea);
    zp.code_size = plugin_size;
    zp.cmd       = overlay_zfft_f;

    // Transform description.
    if (exponent == -1)
      zp.direction = fwd_fft;
    else
      zp.direction = inv_fft;
    zp.size  = length;
    zp.scale = scale;

    // Input / output locations.
    zp.ea_input_re  = ea_from_ptr(in_re);
    zp.ea_input_im  = ea_from_ptr(in_im);
    zp.ea_output_re = ea_from_ptr(out_re);
    zp.ea_output_im = ea_from_ptr(out_im);

    // Block strides do not apply to a single FFT; exactly one chunk
    // is processed, in one workblock.
    zp.in_blk_stride  = 0;
    zp.out_blk_stride = 0;
    zp.chunks_per_wb  = 1;
    zp.chunks_per_spe = 1;

    Task_manager* manager = Task_manager::instance();
    Task task = manager->reserve_iobuf<Plugin_tag, void>(
      VSIP_IMPL_OVERLAY_STACK_SIZE,
      sizeof(Overlay_params),
      VSIP_IMPL_OVERLAY_BUFFER_SIZE,
      VSIP_IMPL_OVERLAY_DTL_SIZE);

    // The buffer-size bound: two complex<T> values per point.
    assert(2*sizeof(complex<T>)*length <= VSIP_IMPL_OVERLAY_BUFFER_SIZE);

    Workblock wb = task.create_workblock(1);
    wb.set_parameters(params);
    wb.enqueue();

    task.sync();
  }
Example #2
0
  // Split-complex FFTM
  // Runs a batch of split-complex FFTs over a rows x cols matrix using the
  // "zhalfast_f" overlay plugin, spreading the work over the SPEs reported
  // by the task manager, and blocks until the task has synced.
  //
  //   in_re, in_im                real / imaginary input arrays
  //   out_re, out_im              real / imaginary output arrays
  //   in_r_stride, in_c_stride    input row / column strides
  //   out_r_stride, out_c_stride  output row / column strides
  //   rows, cols                  matrix dimensions
  //   scale                       copied into the parameter block
  //   exponent                    -1 selects fwd_fft; otherwise inv_fft
  //   axis                        non-zero: `rows` FFTs of `cols` points,
  //                               stepping by the row strides;
  //                               zero: `cols` FFTs of `rows` points,
  //                               stepping by the column strides
  //
  // Strides are multiplied by sizeof(T) when advancing the effective
  // addresses, i.e. they are treated as element counts.
  //
  // Checked with assert: the base pointers and the pointers one stride
  // further pass is_dma_addr_ok(); one workblock's data fits in
  // VSIP_IMPL_OVERLAY_BUFFER_SIZE; and 4 * chunks-per-workblock does not
  // exceed VSIP_IMPL_OVERLAY_DTL_SIZE.
  void 
  fftm(T const* in_re, T const* in_im,
    T* out_re, T* out_im,
    stride_type in_r_stride, stride_type in_c_stride,
    stride_type out_r_stride, stride_type out_c_stride,
    length_type rows, length_type cols, 
    T scale, int exponent, int axis)
  {
    // Resolve the axis once: it decides which stride separates
    // consecutive FFTs, how many FFTs there are and how long each one is.
    bool const by_rows = (axis != 0);
    stride_type const in_hop  = by_rows ? in_r_stride  : in_c_stride;
    stride_type const out_hop = by_rows ? out_r_stride : out_c_stride;

    assert(is_dma_addr_ok(in_re));
    assert(is_dma_addr_ok(in_im));
    assert(is_dma_addr_ok(out_re));
    assert(is_dma_addr_ok(out_im));
    assert(is_dma_addr_ok(in_re  + in_hop));
    assert(is_dma_addr_ok(out_re + out_hop));
    assert(is_dma_addr_ok(in_im  + in_hop));
    assert(is_dma_addr_ok(out_im + out_hop));

    // Plugin location and size are kept in function-local statics so
    // load_plugin() is only called while the cached pointer is still null.
    static char* plugin_ea = 0;
    static int   plugin_size;
    if (!plugin_ea)
      load_plugin(plugin_ea, plugin_size, "plugin", "zhalfast_f");

    Overlay_params params;
    Fft_split_params& zp = params.zfft;

    zp.code_ea   = reinterpret_cast<uintptr_t>(plugin_ea);
    zp.code_size = plugin_size;
    zp.cmd       = overlay_zfft_f;
    if (exponent == -1)
      zp.direction = fwd_fft;
    else
      zp.direction = inv_fft;
    zp.scale     = scale;

    // From here on the strides are carried as length_type values.
    length_type const fft_count = by_rows ? rows : cols;
    length_type const in_step   = in_hop;
    length_type const out_step  = out_hop;
    zp.size = by_rows ? cols : rows;

    zp.ea_input_re    = ea_from_ptr(in_re);
    zp.ea_input_im    = ea_from_ptr(in_im);
    zp.ea_output_re   = ea_from_ptr(out_re);
    zp.ea_output_im   = ea_from_ptr(out_im);
    zp.in_blk_stride  = in_step;
    zp.out_blk_stride = out_step;

    Task_manager* manager = Task_manager::instance();
    length_type const spes = manager->num_spes();

    // A chunk is the data for one FFT.  While a chunk is below 16 KB,
    // several chunks are packed into each workblock to amortize transfer
    // costs, capped at a quarter of the DTL size.
    length_type batch = 1;
    if (zp.size * sizeof(float) < 16384)
      batch = std::min<length_type>(
        16384 / (zp.size * sizeof(float)),
        VSIP_IMPL_OVERLAY_DTL_SIZE / 4);

    // Full workblocks, the even share per SPE, how many SPEs take one
    // more than the even share, and the FFTs that do not fill a whole
    // workblock (they travel in one extra, partially filled workblock).
    length_type const full_wbs      = fft_count / batch;
    length_type const base_share    = full_wbs / spes;
    length_type const bonus_spes    = full_wbs % spes;
    length_type const leftover_ffts = fft_count % batch;
    length_type const tail_wb       = leftover_ffts ? 1 : 0;

    Task task = manager->reserve_iobuf<Plugin_tag, void>(
      VSIP_IMPL_OVERLAY_STACK_SIZE,
      sizeof(Overlay_params),
      VSIP_IMPL_OVERLAY_BUFFER_SIZE,
      VSIP_IMPL_OVERLAY_DTL_SIZE);

    assert(2*sizeof(complex<T>)*batch*zp.size <=
           VSIP_IMPL_OVERLAY_BUFFER_SIZE);
    assert(4*batch <= VSIP_IMPL_OVERLAY_DTL_SIZE);

    // The same value is sent with every workblock set.
    zp.chunks_per_wb = batch;

    // Never use more SPEs than there are workblocks to hand out.
    length_type const active_spes =
      std::min<length_type>(spes, full_wbs + tail_wb);

    for (length_type spe = 0; spe < active_spes; ++spe)
    {
      // When the workblocks do not divide evenly, the first SPEs each
      // take one more.
      length_type wb_count = base_share;
      if (spe < bonus_spes)
        ++wb_count;
      length_type fft_share = wb_count * batch;

      // The partially filled workblock goes to the last SPE, or to the
      // first SPE past the ones that received full workblocks.
      if (tail_wb && (spe == spes - 1 || spe >= full_wbs))
      {
        ++wb_count;
        fft_share += leftover_ffts;
      }

      zp.chunks_per_spe = fft_share;

      Workblock wb = task.create_workblock(wb_count);
      wb.set_parameters(params);
      wb.enqueue();

      // Advance the addresses past the FFTs just handed out so the next
      // SPE starts where this one ends.
      zp.ea_input_re  += sizeof(T) * fft_share * in_step;
      zp.ea_input_im  += sizeof(T) * fft_share * in_step;
      zp.ea_output_re += sizeof(T) * fft_share * out_step;
      zp.ea_output_im += sizeof(T) * fft_share * out_step;
    }

    task.sync();
  }