// Split-complex FFT void fft(T const* in_re, T const* in_im, T* out_re, T* out_im, length_type length, T scale, int exponent) { assert(is_dma_addr_ok(in_re)); assert(is_dma_addr_ok(in_im)); assert(is_dma_addr_ok(out_re)); assert(is_dma_addr_ok(out_im)); static char* code_ea = 0; static int code_size; Overlay_params params; Fft_split_params* fftp = ¶ms.zfft; if (code_ea == 0) load_plugin(code_ea, code_size, "plugin", "zhalfast_f"); fftp->code_ea = (uintptr_t)code_ea; fftp->code_size = code_size; fftp->cmd = overlay_zfft_f; fftp->direction = (exponent == -1 ? fwd_fft : inv_fft); fftp->size = length; fftp->scale = scale; fftp->ea_input_re = ea_from_ptr(in_re); fftp->ea_input_im = ea_from_ptr(in_im); fftp->ea_output_re = ea_from_ptr(out_re); fftp->ea_output_im = ea_from_ptr(out_im); fftp->in_blk_stride = 0; // not applicable in the single FFT case fftp->out_blk_stride = 0; fftp->chunks_per_wb = 1; fftp->chunks_per_spe = 1; Task_manager *mgr = Task_manager::instance(); Task task = mgr->reserve_iobuf<Plugin_tag, void> (VSIP_IMPL_OVERLAY_STACK_SIZE, sizeof(Overlay_params), VSIP_IMPL_OVERLAY_BUFFER_SIZE, VSIP_IMPL_OVERLAY_DTL_SIZE); assert(2*sizeof(complex<T>)*length <= VSIP_IMPL_OVERLAY_BUFFER_SIZE); Workblock block = task.create_workblock(1); block.set_parameters(params); block.enqueue(); task.sync(); }
// Split-complex FFTM void fftm(T const* in_re, T const* in_im, T* out_re, T* out_im, stride_type in_r_stride, stride_type in_c_stride, stride_type out_r_stride, stride_type out_c_stride, length_type rows, length_type cols, T scale, int exponent, int axis) { assert(is_dma_addr_ok(in_re)); assert(is_dma_addr_ok(in_im)); assert(is_dma_addr_ok(out_re)); assert(is_dma_addr_ok(out_im)); assert(is_dma_addr_ok(in_re + (axis != 0 ? in_r_stride : in_c_stride))); assert(is_dma_addr_ok(out_re + (axis != 0 ? out_r_stride : out_c_stride))); assert(is_dma_addr_ok(in_im + (axis != 0 ? in_r_stride : in_c_stride))); assert(is_dma_addr_ok(out_im + (axis != 0 ? out_r_stride : out_c_stride))); static char* code_ea = 0; static int code_size; Overlay_params params; Fft_split_params* fftp = ¶ms.zfft; if (code_ea == 0) load_plugin(code_ea, code_size, "plugin", "zhalfast_f"); fftp->code_ea = (uintptr_t)code_ea; fftp->code_size = code_size; fftp->cmd = overlay_zfft_f; fftp->direction = (exponent == -1 ? fwd_fft : inv_fft); fftp->scale = scale; length_type num_ffts; length_type in_stride; length_type out_stride; if (axis != 0) { num_ffts = rows; in_stride = in_r_stride; out_stride = out_r_stride; fftp->size = cols; } else { num_ffts = cols; in_stride = in_c_stride; out_stride = out_c_stride; fftp->size = rows; } fftp->ea_input_re = ea_from_ptr(in_re); fftp->ea_input_im = ea_from_ptr(in_im); fftp->ea_output_re = ea_from_ptr(out_re); fftp->ea_output_im = ea_from_ptr(out_im); fftp->in_blk_stride = in_stride; fftp->out_blk_stride = out_stride; Task_manager *mgr = Task_manager::instance(); length_type spes = mgr->num_spes(); length_type chunks_per_wb; // A chunk is the amount of data to perform 1 FFT. // // If the chunk size is less than 16 KB, send multiple chunks per // workblock to amortize transfer costs. if (fftp->size * sizeof(float) < 16384) chunks_per_wb = std::min<length_type>( 16384 / (fftp->size * sizeof(float)), VSIP_IMPL_OVERLAY_DTL_SIZE / 4); else chunks_per_wb = 1; length_type num_wb = num_ffts / chunks_per_wb; length_type wb_per_spe = num_wb / spes; length_type extra_wb = (num_ffts % chunks_per_wb) ? 1 : 0; Task task = mgr->reserve_iobuf<Plugin_tag, void> (VSIP_IMPL_OVERLAY_STACK_SIZE, sizeof(Overlay_params), VSIP_IMPL_OVERLAY_BUFFER_SIZE, VSIP_IMPL_OVERLAY_DTL_SIZE); assert(2*sizeof(complex<T>)*chunks_per_wb*fftp->size <= VSIP_IMPL_OVERLAY_BUFFER_SIZE); assert(4*chunks_per_wb <= VSIP_IMPL_OVERLAY_DTL_SIZE); for (length_type i = 0; i < spes && i < num_wb + extra_wb; ++i) { // If wbs don't divide evenly, give the first SPEs one extra. length_type spe_wb = (i < num_wb % spes) ? wb_per_spe + 1 : wb_per_spe; length_type spe_ffts = spe_wb * chunks_per_wb; if (extra_wb && (i == spes-1 || i >= num_wb)) { spe_wb += 1; spe_ffts += num_ffts % chunks_per_wb; } fftp->chunks_per_wb = chunks_per_wb; fftp->chunks_per_spe = spe_ffts; Workblock block = task.create_workblock(spe_wb); block.set_parameters(params); block.enqueue(); fftp->ea_input_re += sizeof(T) * spe_ffts * in_stride; fftp->ea_input_im += sizeof(T) * spe_ffts * in_stride; fftp->ea_output_re += sizeof(T) * spe_ffts * out_stride; fftp->ea_output_im += sizeof(T) * spe_ffts * out_stride; } task.sync(); }