/**
 * @brief Complex-to-complex FFT/IFFT of int16 data.
 * @param[out]  *fout            point to the output buffer
 * @param[in]   *fin             point to the input buffer (may be the same pointer as fout)
 * @param[in]   *twiddles        point to the twiddle buffer handed to the butterfly routine
 * @param[in]   *factors         point to the factor buffer; handed whole to the butterfly
 *                               routine and from index 2 onwards to the bit-reversal routine
 * @param[in]   nfft             number of complex points; only used here to size the
 *                               temporary buffer of the in-place path
 * @param[in]   inverse_fft      non-zero selects the inverse butterfly, zero the forward one
 * @return none.
 *
 * When fin and fout are the same pointer the transform is computed out-of-place
 * into a temporary buffer and copied back afterwards. If that temporary buffer
 * cannot be allocated the function returns without touching fout.
 */
void ne10_fft_c2c_1d_int16_neon (ne10_fft_cpx_int16_t *fout,
                                 ne10_fft_cpx_int16_t *fin,
                                 ne10_fft_cpx_int16_t *twiddles,
                                 ne10_int32_t *factors,
                                 ne10_int32_t nfft,
                                 ne10_int32_t inverse_fft)
{
    if (fin != fout)
    {
        // Out-of-place: bit-reverse the input straight into the output buffer,
        // then run the butterflies on the output buffer.
        ne10_data_bitreversal_int16 (fout, fin, 1, &factors[2]);

        if (inverse_fft)
        {
            ne10_mixed_radix_butterfly_inverse_int16_neon (fout, factors, twiddles);
        }
        else
        {
            ne10_mixed_radix_butterfly_int16_neon (fout, factors, twiddles);
        }
        return;
    }

    // In-place request: the bit reversal needs distinct source and destination,
    // so work in a scratch buffer and copy the result back at the end.
    ne10_fft_cpx_int16_t *tmpbuf_ = (ne10_fft_cpx_int16_t*) NE10_MALLOC (sizeof (ne10_fft_cpx_int16_t) * nfft);

    // Allocation failed: bail out instead of dereferencing a null pointer.
    // The function returns void, so the failure cannot be reported to the caller.
    if (!tmpbuf_)
    {
        return;
    }

    ne10_data_bitreversal_int16 (tmpbuf_, fin, 1, &factors[2]);

    if (inverse_fft)
    {
        ne10_mixed_radix_butterfly_inverse_int16_neon (tmpbuf_, factors, twiddles);
    }
    else
    {
        ne10_mixed_radix_butterfly_int16_neon (tmpbuf_, factors, twiddles);
    }

    memcpy (fout, tmpbuf_, sizeof (ne10_fft_cpx_int16_t) * nfft);

    NE10_FREE (tmpbuf_);
}
* @return none. * The function implements a mixed radix-2/4 FFT (real to complex). The length of 2^N(N is 2, 3, 4, 5, 6 ....etc) is supported. * Otherwise, we alloc a temp buffer(the size is same as input buffer) for storing intermedia. * For the usage of this function, please check test/test_suite_fft_int16.c */ void ne10_fft_r2c_1d_int16_scaled_neon (ne10_fft_cpx_int16_t *fout, ne10_int16_t *fin, ne10_fft_cpx_int16_t *twiddles, ne10_fft_cpx_int16_t *super_twiddles, ne10_int32_t *factors, ne10_int32_t nfft) { ne10_int32_t ncfft = nfft >> 1; /* malloc a temp buffer for cfft */ ne10_fft_cpx_int16_t * tmpbuf_ = (ne10_fft_cpx_int16_t*) NE10_MALLOC (sizeof (ne10_fft_cpx_int16_t) * ncfft); // copy the data from input to output and bit reversal ne10_data_bitreversal_int16 (tmpbuf_, (ne10_fft_cpx_int16_t*) fin, 1, &factors[2]); ne10_mixed_radix_butterfly_int16_neon (tmpbuf_, factors, twiddles); ne10_fft_split_r2c_1d_int16 (fout, tmpbuf_, super_twiddles, ncfft); NE10_FREE (tmpbuf_); } /** * @brief Mixed radix-2/4 IFFT (complex to real) of int16 data. * @param[out] *fout point to the output buffer * @param[in] *fin point to the input buffer * @param[in] *twiddles point to the twiddle buffer
int main(int argc, char **argv) { int input_fd; int output_fd; int fft_dev_fd; int result; IN_SAMPLE_TYPE *in_buf; OUT_SAMPLE_TYPE *out_buf; initialize_everything(argc, argv); /* allocate storage for input, output and config buffers */ in_buf = (IN_SAMPLE_TYPE*) NE10_MALLOC (FFT_POINTS * sizeof(IN_SAMPLE_TYPE)); if(in_buf == NULL) error(1, errno, "in_buf allocation"); out_buf = (OUT_SAMPLE_TYPE*) NE10_MALLOC (FFT_POINTS * sizeof(OUT_SAMPLE_TYPE)); if(out_buf == NULL) error(1, errno, "out_buf allocation"); /* open the input and output files and fft dev */ input_fd = open(g_input_filename, O_RDONLY); if(input_fd < 0) error(1, errno, "opening input file '%s'", g_input_filename); output_fd = open(g_output_filename, O_WRONLY | O_CREAT); if(output_fd < 0) error(1, errno, "opening output file '%s'", g_output_filename); fft_dev_fd = open("/dev/fft", O_RDWR); if(fft_dev_fd < 0) error(1, errno, "opening fft_dev_fd"); /* capture the start value of the GT */ g_start_time = get_gt_value(); /* read the input data */ result = read(input_fd, in_buf, FFT_POINTS * sizeof(IN_SAMPLE_TYPE)); if(result < 0) error(1, errno, "read input file"); if(result != (FFT_POINTS * sizeof(IN_SAMPLE_TYPE))) error(1, 0, "input data size, expected %d but got %d", FFT_POINTS * sizeof(IN_SAMPLE_TYPE), result); /* perform FFT with FPGA hardware, 16 bit input 24/32 bit output */ result = write(fft_dev_fd, in_buf, FFT_POINTS * sizeof(IN_SAMPLE_TYPE)); if(result < 0) error(1, errno, "write to fft_dev_fd"); if (result != (int)(FFT_POINTS * sizeof(IN_SAMPLE_TYPE))) error(1, 0, "fft_dev_fd input data size, expected %d but got %d", FFT_POINTS * sizeof(IN_SAMPLE_TYPE), result); result = read(fft_dev_fd, out_buf, FFT_POINTS * sizeof(OUT_SAMPLE_TYPE)); if(result < 0) error(1, errno, "read from fft_dev_fd"); if (result != (int)(FFT_POINTS * sizeof(OUT_SAMPLE_TYPE))) error(1, 0, "fft_dev_fd output data size, expected %d but got %d", FFT_POINTS * sizeof(OUT_SAMPLE_TYPE), result); /* write the output data */ result = 
write(output_fd, out_buf, FFT_POINTS * sizeof(OUT_SAMPLE_TYPE)); if(result < 0) error(1, errno, "write output file"); if(result != (FFT_POINTS * sizeof(OUT_SAMPLE_TYPE))) error(1, 0, "output data size, expected %d but got %d", FFT_POINTS * sizeof(OUT_SAMPLE_TYPE), result); /* capture the end value of the GT */ g_end_time = get_gt_value(); /* close the input and output files and fft dev */ close(fft_dev_fd); close(output_fd); close(input_fd); /* free storage for input, output and config buffers */ NE10_FREE (out_buf); NE10_FREE (in_buf); print_results(); release_everything(); return 0; }
/**
 * @brief User-callable function to allocate all necessary storage space for the fft.
 * @param[in]   nfft             length of FFT
 * @return      st               point to the FFT config memory. This memory is allocated with malloc.
 *                               NULL is returned when the allocation fails or when the
 *                               length cannot be factored.
 * The function allocate all necessary storage space for the fft. It also factors out the length of FFT and generates the twiddle coeff.
 *
 * The state header, the factor table, the twiddle table and the work buffer
 * live in one allocation, so the caller releases everything by freeing the
 * returned pointer.
 */
ne10_fft_cfg_int32_t ne10_fft_alloc_c2c_int32_neon (ne10_int32_t nfft)
{
    // For input shorter than 16, fall back to c version.
    // We would not get much improvement from NEON for these cases.
    if (nfft < 16)
    {
        return ne10_fft_alloc_c2c_int32_c (nfft);
    }

    ne10_fft_cfg_int32_t st = NULL;
    ne10_uint32_t memneeded = sizeof (ne10_fft_state_int32_t)
                              + sizeof (ne10_int32_t) * (NE10_MAXFACTORS * 2) /* factors */
                              + sizeof (ne10_fft_cpx_int32_t) * nfft          /* twiddle */
                              + sizeof (ne10_fft_cpx_int32_t) * nfft          /* buffer */
                              + NE10_FFT_BYTE_ALIGNMENT;                      /* slack for aligning the tables */

    st = (ne10_fft_cfg_int32_t) NE10_MALLOC (memneeded);

    // Bad allocation.
    if (st == NULL)
    {
        return NULL;
    }

    // Carve the tables out of the single allocation: the aligned address right
    // after the state header holds the factors, followed by the twiddles and
    // then the work buffer.
    uintptr_t address = (uintptr_t) st + sizeof (ne10_fft_state_int32_t);
    NE10_BYTE_ALIGNMENT (address, NE10_FFT_BYTE_ALIGNMENT);
    st->factors = (ne10_int32_t*) address;
    st->twiddles = (ne10_fft_cpx_int32_t*) (st->factors + (NE10_MAXFACTORS * 2));
    st->buffer = st->twiddles + nfft;

    // st->last_twiddles is default NULL.
    // Calling fft_c or fft_neon is decided by this pointers.
    st->last_twiddles = NULL;

    st->nfft = nfft;
    if (nfft % NE10_FFT_PARA_LEVEL == 0)
    {
        // Size of FFT satisfies requirement of NEON optimization.
        st->nfft /= NE10_FFT_PARA_LEVEL;
        st->last_twiddles = st->twiddles + nfft / NE10_FFT_PARA_LEVEL;
    }

    ne10_int32_t result = ne10_factor (st->nfft, st->factors, NE10_FACTOR_DEFAULT);

    // Can not factor.
    if (result == NE10_ERR)
    {
        NE10_FREE (st);
        // Return NULL explicitly so the result does not depend on whether
        // NE10_FREE clears its argument.
        return NULL;
    }

    // Check if radix-8 can be enabled
    ne10_int32_t stage_count = st->factors[0];
    ne10_int32_t algorithm_flag = st->factors[2 * (stage_count + 1)];

    if (algorithm_flag != NE10_FFT_ALG_ANY)
    {
        // Radix-8 path not available: undo the NE10_FFT_PARA_LEVEL split and
        // set the state up for the full length with no last-stage twiddles.
        st->last_twiddles = NULL;
        st->nfft = nfft;
        result = ne10_factor (st->nfft, st->factors, NE10_FACTOR_DEFAULT);

        // Can not factor. Checked like the other ne10_factor calls so the
        // twiddles are never generated from an invalid factor table.
        if (result == NE10_ERR)
        {
            NE10_FREE (st);
            return NULL;
        }

        ne10_fft_generate_twiddles_int32 (st->twiddles, st->factors, st->nfft);
        return st;
    }

    // Enable radix-8.
    result = ne10_factor (st->nfft, st->factors, NE10_FACTOR_EIGHT);
    if (result == NE10_ERR)
    {
        NE10_FREE (st);
        return NULL;
    }
    ne10_fft_generate_twiddles_int32 (st->twiddles, st->factors, st->nfft);

    // Generate super twiddles for the last stage.
    if (nfft % NE10_FFT_PARA_LEVEL == 0)
    {
        // Size of FFT satisfies requirement of NEON optimization.
        ne10_fft_generate_twiddles_line_int32 (st->last_twiddles,
                                               st->nfft,
                                               1,
                                               NE10_FFT_PARA_LEVEL,
                                               nfft);
    }

    return st;
}
int main(int argc, char **argv) { int input_fd; int output_fd; int result; IN_SAMPLE_TYPE *in_buf; OUT_SAMPLE_TYPE *out_buf; CFG_TYPE cfg; int i; initialize_everything(argc, argv); /* allocate storage for input, output and config buffers */ in_buf = (IN_SAMPLE_TYPE*) NE10_MALLOC (FFT_POINTS * sizeof(IN_SAMPLE_TYPE)); if(in_buf == NULL) error(1, errno, "in_buf allocation"); out_buf = (OUT_SAMPLE_TYPE*) NE10_MALLOC (FFT_POINTS * sizeof(OUT_SAMPLE_TYPE)); if(out_buf == NULL) error(1, errno, "out_buf allocation"); cfg = CFG_ALLOC_FUNC(FFT_CALC_POINTS); if(cfg == NULL) error(1, errno, "cfg allocation"); /* open the input and output files */ input_fd = open(g_input_filename, O_RDONLY); if(input_fd < 0) error(1, errno, "opening input file '%s'", g_input_filename); output_fd = open(g_output_filename, O_WRONLY | O_CREAT); if(output_fd < 0) error(1, errno, "opening output file '%s'", g_output_filename); /* capture the start value of the GT */ g_start_time = get_gt_value(); /* read the input data */ result = read(input_fd, in_buf, FFT_POINTS * sizeof(IN_SAMPLE_TYPE)); if(result < 0) error(1, errno, "read input file"); if(result != (FFT_POINTS * sizeof(IN_SAMPLE_TYPE))) error(1, 0, "input data size, expected %d but got %d", FFT_POINTS * sizeof(IN_SAMPLE_TYPE), result); /* compute FFT */ for (i = 0; i < FFT_CALC_ROUNDS ; i++) { FFT_FUNC(out_buf + (i * FFT_CALC_POINTS), in_buf + (i * FFT_CALC_POINTS), cfg, 0, 1); } /* write the output data */ result = write(output_fd, out_buf, FFT_POINTS * sizeof(OUT_SAMPLE_TYPE)); if(result < 0) error(1, errno, "write output file"); if(result != (FFT_POINTS * sizeof(OUT_SAMPLE_TYPE))) error(1, 0, "output data size, expected %d but got %d", FFT_POINTS * sizeof(OUT_SAMPLE_TYPE), result); /* capture the end value of the GT */ g_end_time = get_gt_value(); /* close the input and output files */ close(output_fd); close(input_fd); /* free storage for input, output and config buffers */ NE10_FREE (cfg); NE10_FREE (out_buf); NE10_FREE (in_buf); 
print_results(); release_everything(); return 0; }
int main(int argc, char **argv) { int input_fd; int output_fd; int raw256stream_dev_fd; int result; IN_SAMPLE_TYPE *in_buf; int j; initialize_everything(argc, argv); /* allocate storage for input */ in_buf = (IN_SAMPLE_TYPE*) NE10_MALLOC (FFT_POINTS * sizeof(IN_SAMPLE_TYPE)); if(in_buf == NULL) error(1, errno, "in_buf allocation"); /* open the input and output files and raw256stream device */ input_fd = open(g_input_filename, O_RDONLY); if(input_fd < 0) error(1, errno, "opening input file '%s'", g_input_filename); output_fd = open(g_output_filename, O_WRONLY | O_CREAT); if(output_fd < 0) error(1, errno, "opening output file '%s'", g_output_filename); raw256stream_dev_fd = open("/dev/raw256stream", O_RDWR); if(raw256stream_dev_fd < 0) error(1, errno, "opening raw256stream_dev_fd"); /* read the input data */ result = read(input_fd, in_buf, FFT_CALC_POINTS * sizeof(IN_SAMPLE_TYPE)); if(result < 0) error(1, errno, "read input file"); if(result != (FFT_CALC_POINTS * sizeof(IN_SAMPLE_TYPE))) error(1, 0, "input data size, expected %d but got %d", FFT_CALC_POINTS * sizeof(IN_SAMPLE_TYPE), result); /* write the waveform buffer */ result = write(raw256stream_dev_fd, in_buf, FFT_CALC_POINTS * sizeof(IN_SAMPLE_TYPE)); if(result < 0) error(1, errno, "write waveform buffer"); if(result != (FFT_CALC_POINTS * sizeof(IN_SAMPLE_TYPE))) error(1, 0, "output data size, expected %d but got %d", FFT_CALC_POINTS * sizeof(IN_SAMPLE_TYPE), result); /* capture the start value of the GT */ g_start_time = get_gt_value(); for(j = 0 ; j < FFT_IN_ROUNDS ; j++) { /* read the input stream */ result = read(raw256stream_dev_fd, in_buf, FFT_POINTS * sizeof(IN_SAMPLE_TYPE)); if(result < 0) error(1, errno, "read input stream"); if(result != (FFT_POINTS * sizeof(IN_SAMPLE_TYPE))) error(1, 0, "input data size, expected %d but got %d", FFT_POINTS * sizeof(IN_SAMPLE_TYPE), result); /* write the output data */ result = write(output_fd, in_buf, FFT_POINTS * sizeof(IN_SAMPLE_TYPE)); if(result < 0) error(1, 
errno, "write output file"); if(result != (FFT_POINTS * sizeof(IN_SAMPLE_TYPE))) error(1, 0, "output data size, expected %d but got %d", FFT_POINTS * sizeof(IN_SAMPLE_TYPE), result); } /* capture the end value of the GT */ g_end_time = get_gt_value(); /* close the input and output files and stream device */ close(raw256stream_dev_fd); close(output_fd); close(input_fd); /* free storage for input, output and config buffers */ NE10_FREE (in_buf); print_results(); release_everything(); return 0; }
/**
 * @brief Generic-radix butterfly computed directly from the DFT definition.
 * @param[out]  *Fout            point to the output buffer
 * @param[in]   *Fin             point to the input buffer
 * @param[in]   *twiddles        point to the twiddle table, indexed modulo radix
 * @param[in]   radix            number of points combined by each butterfly
 * @param[in]   in_step          distance between the inputs of one butterfly; also the
 *                               number of butterflies computed
 * @param[in]   out_step         distance between the outputs of one butterfly
 * @param[in]   is_inverse       non-zero computes the inverse transform by conjugating
 *                               the inputs and the outputs
 * @param[in]   is_scaled        non-zero (together with is_inverse, and only when
 *                               NE10_DSP_CFFT_SCALING is defined) scales the inputs by
 *                               1 / (radix * in_step)
 * @return none.
 *
 * A scratch array of radix elements is allocated on every call. If that
 * allocation fails the function returns without writing to Fout.
 */
static inline void ne10_radix_generic_butterfly_float32_c (ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t radix,
        const ne10_int32_t in_step,
        const ne10_int32_t out_step,
        const ne10_int32_t is_inverse,
        const ne10_int32_t is_scaled)
{
    ne10_int32_t q, q1;
    ne10_int32_t f_count = in_step;

    ne10_fft_cpx_float32_t tmp;
    ne10_fft_cpx_float32_t *scratch;
    scratch = (ne10_fft_cpx_float32_t *) NE10_MALLOC (radix * sizeof (ne10_fft_cpx_float32_t));

    // Allocation failed: bail out instead of writing through a null pointer.
    // The function returns void, so the failure cannot be reported.
    if (!scratch)
    {
        return;
    }

    for (; f_count > 0; f_count--)
    {
        // load the radix inputs of this butterfly
        for (q1 = 0; q1 < radix; q1++)
        {
            scratch[q1] = Fin[in_step * q1];
            if (is_inverse)
            {
                // conjugate the input; the output is conjugated again below
                scratch[q1].i = -scratch[q1].i;
#ifdef NE10_DSP_CFFT_SCALING
                if (is_scaled)
                {
                    const ne10_float32_t one_by_nfft = 1.0 / (radix * in_step);
                    scratch[q1].r *= one_by_nfft;
                    scratch[q1].i *= one_by_nfft;
                }
#endif
            }
        } // q1

        // compute Fout[q1 * out_step] from definition
        for (q1 = 0; q1 < radix; q1++)
        {
            // twidx tracks (q * q1) modulo radix incrementally
            ne10_int32_t twidx = 0;
            Fout[q1 * out_step] = scratch[0];
            for (q = 1; q < radix; q++)
            {
                twidx += q1;
                if (twidx >= radix)
                {
                    twidx -= radix;
                }
                NE10_CPX_MUL_F32 (tmp, scratch[q], twiddles[twidx]);
                NE10_CPX_ADDTO (Fout[q1 * out_step], tmp);
            } // q
            if (is_inverse)
            {
                Fout[q1 * out_step].i = -Fout[q1 * out_step].i;
            }
        } // q1

        // advance to the next butterfly
        Fout += radix;
        Fin++;
    }

    NE10_FREE (scratch);
}