/*
 * Write decoded samples into sample_buffer in the requested PCM format.
 *
 * hDecoder      - decoder handle, forwarded to the converter (and used for
 *                 cycle accounting when PROFILE is defined)
 * input         - per-channel sample arrays, forwarded unchanged
 * sample_buffer - destination buffer; viewed as int16_t, int32_t, float32_t
 *                 or double depending on format
 * channels      - forwarded to the converter
 * frame_len     - forwarded to the converter
 * format        - one of the FAAD_FMT_* constants handled below
 *
 * Returns sample_buffer itself.  When format matches none of the handled
 * constants, no converter is called.
 */
void *output_to_PCM(NeAACDecHandle hDecoder,
                    real_t **input, void *sample_buffer, uint8_t channels,
                    uint16_t frame_len, uint8_t format)
{
#ifdef PROFILE
    int64_t start_ts = faad_get_ts();
#endif

    /* Each converter takes the address of a typed alias of sample_buffer. */
    if (format == FAAD_FMT_16BIT)
    {
        int16_t *pcm16 = (int16_t*)sample_buffer;
        to_PCM_16bit(hDecoder, input, channels, frame_len, &pcm16);
    }
    else if (format == FAAD_FMT_24BIT)
    {
        /* 24 bit output uses the same int32_t view as 32 bit output */
        int32_t *pcm24 = (int32_t*)sample_buffer;
        to_PCM_24bit(hDecoder, input, channels, frame_len, &pcm24);
    }
    else if (format == FAAD_FMT_32BIT)
    {
        int32_t *pcm32 = (int32_t*)sample_buffer;
        to_PCM_32bit(hDecoder, input, channels, frame_len, &pcm32);
    }
    else if (format == FAAD_FMT_FLOAT)
    {
        float32_t *pcm_flt = (float32_t*)sample_buffer;
        to_PCM_float(hDecoder, input, channels, frame_len, &pcm_flt);
    }
    else if (format == FAAD_FMT_DOUBLE)
    {
        double *pcm_dbl = (double*)sample_buffer;
        to_PCM_double(hDecoder, input, channels, frame_len, &pcm_dbl);
    }

#ifdef PROFILE
    hDecoder->output_cycles += faad_get_ts() - start_ts;
#endif

    return sample_buffer;
}
/*
 * Inverse MDCT built on a complex IFFT.
 *
 * mdct  - transform state; supplies the size N, the sincos table and the
 *         cfft handle passed to cfftb()
 * X_in  - input coefficients; indices 0 .. N/2-1 are read
 * X_out - output samples; indices 0 .. N-1 are written
 *
 * The scratch array holds 512 complex values and N/4 of them are used, so
 * N/4 must not exceed 512.
 *
 * Steps: pre-twiddle into Z1, in-place IFFT, post-twiddle (plus an extra
 * scale in fixed-point builds when N is not a power of two), then reorder
 * the real and imaginary parts into the four quarters of X_out.
 */
void faad_imdct(mdct_info *mdct, real_t *X_in, real_t *X_out)
{
    uint16_t n;

    ALIGN complex_t Z1[512];
    complex_t *twiddle = mdct->sincos;

    uint16_t N  = mdct->N;
    uint16_t N2 = N >> 1;
    uint16_t N4 = N >> 2;
    uint16_t N8 = N >> 3;

    /* start of each quarter of the output buffer */
    real_t *out_q0 = X_out;
    real_t *out_q1 = X_out + N4;
    real_t *out_q2 = X_out + N2;
    real_t *out_q3 = X_out + N2 + N4;

#if defined(ALLOW_SMALL_FRAMELENGTH) && defined(FIXED_POINT)
    real_t rescale_factor;
    real_t rescale = 0;
#endif

#ifdef PROFILE
    int64_t count1, count2 = faad_get_ts();
#endif

#if defined(ALLOW_SMALL_FRAMELENGTH) && defined(FIXED_POINT)
    /* a size that is not a power of two gets an extra 2048/1920 scale */
    if (N & (N-1))
    {
        rescale = 1;
        rescale_factor = COEF_CONST(1.0666666666666667);
    }
#endif

    /* pre-IFFT complex multiplication */
    for (n = 0; n < N4; n++)
    {
        ComplexMult(&IM(Z1[n]), &RE(Z1[n]),
            X_in[2*n], X_in[N2 - 1 - 2*n],
            RE(twiddle[n]), IM(twiddle[n]));
    }

#ifdef PROFILE
    count1 = faad_get_ts();
#endif

    /* complex IFFT, any non-scaling FFT can be used here */
    cfftb(mdct->cfft, Z1);

#ifdef PROFILE
    count1 = faad_get_ts() - count1;
#endif

    /* post-IFFT complex multiplication */
    for (n = 0; n < N4; n++)
    {
        /* copy first: ComplexMult overwrites Z1[n] while reading these */
        const real_t z_re = RE(Z1[n]);
        const real_t z_im = IM(Z1[n]);

        ComplexMult(&IM(Z1[n]), &RE(Z1[n]),
            z_im, z_re, RE(twiddle[n]), IM(twiddle[n]));

#if defined(ALLOW_SMALL_FRAMELENGTH) && defined(FIXED_POINT)
        /* non-power of 2 MDCT scaling */
        if (rescale)
        {
            RE(Z1[n]) = MUL_C(RE(Z1[n]), rescale_factor);
            IM(Z1[n]) = MUL_C(IM(Z1[n]), rescale_factor);
        }
#endif
    }

    /* reordering: each pass fills four consecutive samples per quarter */
    for (n = 0; n < N8; n += 2)
    {
        const int o = 2*n;

        out_q0[o    ] =  IM(Z1[N8 +     n]);
        out_q0[o + 1] = -RE(Z1[N8 - 1 - n]);
        out_q0[o + 2] =  IM(Z1[N8 + 1 + n]);
        out_q0[o + 3] = -RE(Z1[N8 - 2 - n]);

        out_q1[o    ] =  RE(Z1[         n]);
        out_q1[o + 1] = -IM(Z1[N4 - 1 - n]);
        out_q1[o + 2] =  RE(Z1[     1 + n]);
        out_q1[o + 3] = -IM(Z1[N4 - 2 - n]);

        out_q2[o    ] =  RE(Z1[N8 +     n]);
        out_q2[o + 1] = -IM(Z1[N8 - 1 - n]);
        out_q2[o + 2] =  RE(Z1[N8 + 1 + n]);
        out_q2[o + 3] = -IM(Z1[N8 - 2 - n]);

        out_q3[o    ] = -IM(Z1[         n]);
        out_q3[o + 1] =  RE(Z1[N4 - 1 - n]);
        out_q3[o + 2] = -IM(Z1[     1 + n]);
        out_q3[o + 3] =  RE(Z1[N4 - 2 - n]);
    }

#ifdef PROFILE
    count2 = faad_get_ts() - count2;
    mdct->fft_cycles += count1;
    mdct->cycles += (count2 - count1);
#endif
}
/*
 * Run the reconstruction chain for a single channel element: dequantise,
 * PNS, optional MAIN/LTP prediction, TNS, DRC, filter bank and, when
 * enabled, SBR (with or without PS).
 *
 * hDecoder  - decoder state; per-element data is indexed by
 *             hDecoder->fr_ch_ele, per-channel data by sce->channel
 * ics       - individual channel stream of this element
 * sce       - the element being reconstructed
 * spec_data - quantised spectral data handed to quant_to_spec()
 *
 * Returns 0 on success.  Non-zero results:
 *   21 - output channel count of this element differs from the stored one
 *   23 - SBR is signalled or forced but not allocated for this element
 *   19 - the SBR decoder instance could not be created
 *   any non-zero value returned by allocate_single_channel(),
 *   quant_to_spec() or the SBR frame decoder is passed through unchanged.
 */
uint8_t reconstruct_single_channel(NeAACDecHandle hDecoder, ic_stream *ics,
                                   element *sce, int16_t *spec_data)
{
    uint8_t retval, output_channels;

#ifdef PROFILE
    int64_t count = faad_get_ts();
#endif

    /* always allocate 2 channels, PS can always "suddenly" turn up */
#if (defined(PS_DEC) || defined(DRM_PS))
    output_channels = 2;
#else
    output_channels = 1;
#endif

    if (hDecoder->element_output_channels[hDecoder->fr_ch_ele] == 0)
    {
        /* element_output_channels not set yet */
        hDecoder->element_output_channels[hDecoder->fr_ch_ele] = output_channels;
    }
    else if (hDecoder->element_output_channels[hDecoder->fr_ch_ele] != output_channels)
    {
        /* element inconsistency */
        return 21;
    }

    if (hDecoder->element_alloced[hDecoder->fr_ch_ele] == 0)
    {
        retval = allocate_single_channel(hDecoder, sce->channel, output_channels);
        if (retval > 0)
            return retval;

        hDecoder->element_alloced[hDecoder->fr_ch_ele] = 1;
    }

    /* dequantisation and scaling */
    retval = quant_to_spec(hDecoder, ics, spec_data, spec_coef1, hDecoder->frameLength);
    if (retval > 0)
        return retval;

#ifdef PROFILE
    count = faad_get_ts() - count;
    hDecoder->requant_cycles += count;
#endif

    /* pns decoding */
    pns_decode(ics, NULL, spec_coef1, NULL, hDecoder->frameLength, 0, hDecoder->object_type);

#ifdef MAIN_DEC
    /* MAIN object type prediction */
    if (hDecoder->object_type == MAIN)
    {
        /* intra channel prediction */
        ic_prediction(ics, spec_coef1, hDecoder->pred_stat[sce->channel],
            hDecoder->frameLength, hDecoder->sf_index);

        /* In addition, for scalefactor bands coded by perceptual
           noise substitution the predictors belonging to the
           corresponding spectral coefficients are reset.
        */
        pns_reset_pred_state(ics, hDecoder->pred_stat[sce->channel]);
    }
#endif

#ifdef LTP_DEC
    if (is_ltp_ot(hDecoder->object_type))
    {
#ifdef LD_DEC
        if (hDecoder->object_type == LD)
        {
            /* LD keeps the lag across frames; refresh it only on lag_update */
            if (ics->ltp.data_present)
            {
                if (ics->ltp.lag_update)
                    hDecoder->ltp_lag[sce->channel] = ics->ltp.lag;
            }
            ics->ltp.lag = hDecoder->ltp_lag[sce->channel];
        }
#endif

        /* long term prediction */
        lt_prediction(ics, &(ics->ltp), spec_coef1,
            hDecoder->lt_pred_stat[sce->channel], hDecoder->fb,
            ics->window_shape, hDecoder->window_shape_prev[sce->channel],
            hDecoder->sf_index, hDecoder->object_type, hDecoder->frameLength);
    }
#endif

    /* tns decoding */
    tns_decode_frame(ics, &(ics->tns), hDecoder->sf_index, hDecoder->object_type,
        spec_coef1, hDecoder->frameLength);

    /* drc decoding */
    if (hDecoder->drc->present)
    {
        if (!hDecoder->drc->exclude_mask[sce->channel] || !hDecoder->drc->excluded_chns_present)
            drc_decode(hDecoder->drc, spec_coef1);
    }

    /* filter bank */
#ifdef SSR_DEC
    if (hDecoder->object_type != SSR)
    {
#endif
        ifilter_bank(ics->window_sequence,ics->window_shape,
            hDecoder->window_shape_prev[sce->channel],spec_coef1,
            hDecoder->time_out[sce->channel], hDecoder->fb_intermed[sce->channel],
            hDecoder->object_type, hDecoder->frameLength);
#ifdef SSR_DEC
    }
    else
    {
        ssr_decode(&(ics->ssr), hDecoder->fb, ics->window_sequence, ics->window_shape,
            hDecoder->window_shape_prev[sce->channel], spec_coef1,
            hDecoder->time_out[sce->channel], hDecoder->ssr_overlap[sce->channel],
            hDecoder->ipqf_buffer[sce->channel], hDecoder->prev_fmd[sce->channel],
            hDecoder->frameLength);
    }
#endif

    /* save window shape for next frame */
    hDecoder->window_shape_prev[sce->channel] = ics->window_shape;

#ifdef LTP_DEC
    if (is_ltp_ot(hDecoder->object_type))
    {
        lt_update_state(hDecoder->lt_pred_stat[sce->channel],
            hDecoder->time_out[sce->channel], hDecoder->fb_intermed[sce->channel],
            hDecoder->frameLength, hDecoder->object_type);
    }
#endif

#ifdef SBR_DEC
    if (((hDecoder->sbr_present_flag == 1) || (hDecoder->forceUpSampling == 1))
        && hDecoder->sbr_alloced[hDecoder->fr_ch_ele])
    {
        uint8_t ele = hDecoder->fr_ch_ele;
        uint8_t ch = sce->channel;

        /* following case can happen when forceUpSampling == 1 */
        if (hDecoder->sbr[ele] == NULL)
        {
            hDecoder->sbr[ele] = sbrDecodeInit(hDecoder->frameLength,
                hDecoder->element_id[ele], 2*get_sample_rate(hDecoder->sf_index),
                hDecoder->downSampledSBR
#ifdef DRM
                , 0
#endif
                );
        }

        /* The instance is dereferenced right below, so bail out if it is
           still missing.
           NOTE(review): 19 is used as the failure code here; confirm it
           maps to a suitable message in the decoder's error table. */
        if (hDecoder->sbr[ele] == NULL)
            return 19;

        if (sce->ics1.window_sequence == EIGHT_SHORT_SEQUENCE)
            hDecoder->sbr[ele]->maxAACLine = 8*sce->ics1.swb_offset[max(sce->ics1.max_sfb-1, 0)];
        else
            hDecoder->sbr[ele]->maxAACLine = sce->ics1.swb_offset[max(sce->ics1.max_sfb-1, 0)];

        /* check if any of the PS tools is used */
#if (defined(PS_DEC) || defined(DRM_PS))
        if (hDecoder->ps_used[ele] == 0)
        {
#endif
            retval = sbrDecodeSingleFrame(hDecoder->sbr[ele], hDecoder->time_out[ch],
                hDecoder->postSeekResetFlag, hDecoder->downSampledSBR);
#if (defined(PS_DEC) || defined(DRM_PS))
        }
        else
        {
            retval = sbrDecodeSingleFramePS(hDecoder->sbr[ele], hDecoder->time_out[ch],
                hDecoder->time_out[ch+1], hDecoder->postSeekResetFlag,
                hDecoder->downSampledSBR);
        }
#endif
        if (retval > 0)
            return retval;
    }
    else if (((hDecoder->sbr_present_flag == 1) || (hDecoder->forceUpSampling == 1))
        && !hDecoder->sbr_alloced[hDecoder->fr_ch_ele])
    {
        return 23;
    }
#endif

    /* copy L to R when no PS is used */
#if (defined(PS_DEC) || defined(DRM_PS))
    if (hDecoder->ps_used[hDecoder->fr_ch_ele] == 0)
    {
        uint8_t ele = hDecoder->fr_ch_ele;
        uint8_t ch = sce->channel;
        /* two frame lengths are copied when SBR is allocated for the element */
        uint16_t frame_size = (hDecoder->sbr_alloced[ele]) ? 2 : 1;
        frame_size *= hDecoder->frameLength*sizeof(real_t);

        memcpy(hDecoder->time_out[ch+1], hDecoder->time_out[ch], frame_size);
    }
#endif

    return 0;
}
/*
 * Run the reconstruction chain for a channel pair element: dequantise both
 * channels, PNS, mid/side and intensity stereo, optional MAIN/LTP
 * prediction, TNS, DRC, filter bank and, when enabled, coupled SBR.
 *
 * hDecoder    - decoder state; per-element data is indexed by
 *               hDecoder->fr_ch_ele, per-channel data by cpe->channel and
 *               cpe->paired_channel
 * ics1, ics2  - individual channel streams of the first and second channel
 * cpe         - the element being reconstructed
 * spec_data1,
 * spec_data2  - quantised spectral data of the first and second channel
 *
 * Returns 0 on success.  Non-zero results:
 *   23 - SBR is signalled or forced but not allocated for this element
 *   19 - the SBR decoder instance could not be created
 *   any non-zero value returned by allocate_channel_pair(),
 *   quant_to_spec() or sbrDecodeCoupleFrame() is passed through unchanged.
 */
uint8_t reconstruct_channel_pair(NeAACDecHandle hDecoder, ic_stream *ics1, ic_stream *ics2,
                                 element *cpe, int16_t *spec_data1, int16_t *spec_data2)
{
    uint8_t retval;

#ifdef PROFILE
    int64_t count = faad_get_ts();
#endif

    if (hDecoder->element_alloced[hDecoder->fr_ch_ele] == 0)
    {
        retval = allocate_channel_pair(hDecoder, cpe->channel, (uint8_t)cpe->paired_channel);
        if (retval > 0)
            return retval;

        hDecoder->element_alloced[hDecoder->fr_ch_ele] = 1;
    }

    /* dequantisation and scaling */
    retval = quant_to_spec(hDecoder, ics1, spec_data1, spec_coef1, hDecoder->frameLength);
    if (retval > 0)
        return retval;
    retval = quant_to_spec(hDecoder, ics2, spec_data2, spec_coef2, hDecoder->frameLength);
    if (retval > 0)
        return retval;

#ifdef PROFILE
    count = faad_get_ts() - count;
    hDecoder->requant_cycles += count;
#endif

    /* pns decoding: jointly when an M/S mask is present, else per channel */
    if (ics1->ms_mask_present)
    {
        pns_decode(ics1, ics2, spec_coef1, spec_coef2, hDecoder->frameLength, 1, hDecoder->object_type);
    }
    else
    {
        pns_decode(ics1, NULL, spec_coef1, NULL, hDecoder->frameLength, 0, hDecoder->object_type);
        pns_decode(ics2, NULL, spec_coef2, NULL, hDecoder->frameLength, 0, hDecoder->object_type);
    }

    /* mid/side decoding */
    ms_decode(ics1, ics2, spec_coef1, spec_coef2, hDecoder->frameLength);

    /* intensity stereo decoding */
    is_decode(ics1, ics2, spec_coef1, spec_coef2, hDecoder->frameLength);

#ifdef MAIN_DEC
    /* MAIN object type prediction */
    if (hDecoder->object_type == MAIN)
    {
        /* intra channel prediction */
        ic_prediction(ics1, spec_coef1, hDecoder->pred_stat[cpe->channel],
            hDecoder->frameLength, hDecoder->sf_index);
        ic_prediction(ics2, spec_coef2, hDecoder->pred_stat[cpe->paired_channel],
            hDecoder->frameLength, hDecoder->sf_index);

        /* In addition, for scalefactor bands coded by perceptual
           noise substitution the predictors belonging to the
           corresponding spectral coefficients are reset.
        */
        pns_reset_pred_state(ics1, hDecoder->pred_stat[cpe->channel]);
        pns_reset_pred_state(ics2, hDecoder->pred_stat[cpe->paired_channel]);
    }
#endif

#ifdef LTP_DEC
    if (is_ltp_ot(hDecoder->object_type))
    {
        ltp_info *ltp1 = &(ics1->ltp);
        /* with a common window the second channel's LTP data is in ics2->ltp2 */
        ltp_info *ltp2 = (cpe->common_window) ? &(ics2->ltp2) : &(ics2->ltp);

#ifdef LD_DEC
        if (hDecoder->object_type == LD)
        {
            /* LD keeps the lag across frames; refresh it only on lag_update */
            if (ltp1->data_present)
            {
                if (ltp1->lag_update)
                    hDecoder->ltp_lag[cpe->channel] = ltp1->lag;
            }
            ltp1->lag = hDecoder->ltp_lag[cpe->channel];

            if (ltp2->data_present)
            {
                if (ltp2->lag_update)
                    hDecoder->ltp_lag[cpe->paired_channel] = ltp2->lag;
            }
            ltp2->lag = hDecoder->ltp_lag[cpe->paired_channel];
        }
#endif

        /* long term prediction */
        lt_prediction(ics1, ltp1, spec_coef1,
            hDecoder->lt_pred_stat[cpe->channel], hDecoder->fb,
            ics1->window_shape, hDecoder->window_shape_prev[cpe->channel],
            hDecoder->sf_index, hDecoder->object_type, hDecoder->frameLength);
        lt_prediction(ics2, ltp2, spec_coef2,
            hDecoder->lt_pred_stat[cpe->paired_channel], hDecoder->fb,
            ics2->window_shape, hDecoder->window_shape_prev[cpe->paired_channel],
            hDecoder->sf_index, hDecoder->object_type, hDecoder->frameLength);
    }
#endif

    /* tns decoding */
    tns_decode_frame(ics1, &(ics1->tns), hDecoder->sf_index, hDecoder->object_type,
        spec_coef1, hDecoder->frameLength);
    tns_decode_frame(ics2, &(ics2->tns), hDecoder->sf_index, hDecoder->object_type,
        spec_coef2, hDecoder->frameLength);

    /* drc decoding */
    if (hDecoder->drc->present)
    {
        if (!hDecoder->drc->exclude_mask[cpe->channel] || !hDecoder->drc->excluded_chns_present)
            drc_decode(hDecoder->drc, spec_coef1);
        if (!hDecoder->drc->exclude_mask[cpe->paired_channel] || !hDecoder->drc->excluded_chns_present)
            drc_decode(hDecoder->drc, spec_coef2);
    }

    /* filter bank */
#ifdef SSR_DEC
    if (hDecoder->object_type != SSR)
    {
#endif
        ifilter_bank(ics1->window_sequence,ics1->window_shape,
            hDecoder->window_shape_prev[cpe->channel],spec_coef1,
            hDecoder->time_out[cpe->channel], hDecoder->fb_intermed[cpe->channel],
            hDecoder->object_type, hDecoder->frameLength);
        ifilter_bank(ics2->window_sequence,ics2->window_shape,
            hDecoder->window_shape_prev[cpe->paired_channel], spec_coef2,
            hDecoder->time_out[cpe->paired_channel], hDecoder->fb_intermed[cpe->paired_channel],
            hDecoder->object_type, hDecoder->frameLength);
#ifdef SSR_DEC
    }
    else
    {
        ssr_decode(&(ics1->ssr), hDecoder->fb, ics1->window_sequence, ics1->window_shape,
            hDecoder->window_shape_prev[cpe->channel], spec_coef1,
            hDecoder->time_out[cpe->channel], hDecoder->ssr_overlap[cpe->channel],
            hDecoder->ipqf_buffer[cpe->channel], hDecoder->prev_fmd[cpe->channel],
            hDecoder->frameLength);
        ssr_decode(&(ics2->ssr), hDecoder->fb, ics2->window_sequence, ics2->window_shape,
            hDecoder->window_shape_prev[cpe->paired_channel], spec_coef2,
            hDecoder->time_out[cpe->paired_channel], hDecoder->ssr_overlap[cpe->paired_channel],
            hDecoder->ipqf_buffer[cpe->paired_channel], hDecoder->prev_fmd[cpe->paired_channel],
            hDecoder->frameLength);
    }
#endif

    /* save window shape for next frame */
    hDecoder->window_shape_prev[cpe->channel] = ics1->window_shape;
    hDecoder->window_shape_prev[cpe->paired_channel] = ics2->window_shape;

#ifdef LTP_DEC
    if (is_ltp_ot(hDecoder->object_type))
    {
        lt_update_state(hDecoder->lt_pred_stat[cpe->channel],
            hDecoder->time_out[cpe->channel], hDecoder->fb_intermed[cpe->channel],
            hDecoder->frameLength, hDecoder->object_type);
        lt_update_state(hDecoder->lt_pred_stat[cpe->paired_channel],
            hDecoder->time_out[cpe->paired_channel], hDecoder->fb_intermed[cpe->paired_channel],
            hDecoder->frameLength, hDecoder->object_type);
    }
#endif

#ifdef SBR_DEC
    if (((hDecoder->sbr_present_flag == 1) || (hDecoder->forceUpSampling == 1))
        && hDecoder->sbr_alloced[hDecoder->fr_ch_ele])
    {
        uint8_t ele = hDecoder->fr_ch_ele;
        uint8_t ch0 = cpe->channel;
        uint8_t ch1 = cpe->paired_channel;

        /* following case can happen when forceUpSampling == 1 */
        if (hDecoder->sbr[ele] == NULL)
        {
            hDecoder->sbr[ele] = sbrDecodeInit(hDecoder->frameLength,
                hDecoder->element_id[ele], 2*get_sample_rate(hDecoder->sf_index),
                hDecoder->downSampledSBR
#ifdef DRM
                , 0
#endif
                );
        }

        /* The instance is dereferenced right below, so bail out if it is
           still missing.
           NOTE(review): 19 is used as the failure code here; confirm it
           maps to a suitable message in the decoder's error table. */
        if (hDecoder->sbr[ele] == NULL)
            return 19;

        if (cpe->ics1.window_sequence == EIGHT_SHORT_SEQUENCE)
            hDecoder->sbr[ele]->maxAACLine = 8*cpe->ics1.swb_offset[max(cpe->ics1.max_sfb-1, 0)];
        else
            hDecoder->sbr[ele]->maxAACLine = cpe->ics1.swb_offset[max(cpe->ics1.max_sfb-1, 0)];

        retval = sbrDecodeCoupleFrame(hDecoder->sbr[ele],
            hDecoder->time_out[ch0], hDecoder->time_out[ch1],
            hDecoder->postSeekResetFlag, hDecoder->downSampledSBR);
        if (retval > 0)
            return retval;
    }
    else if (((hDecoder->sbr_present_flag == 1) || (hDecoder->forceUpSampling == 1))
        && !hDecoder->sbr_alloced[hDecoder->fr_ch_ele])
    {
        return 23;
    }
#endif

    return 0;
}
/* Reverse the four lanes of an __m128 (pass a plain variable: v is expanded twice). */
#define FB_SSE_REVERSE(v) _mm_shuffle_ps((v), (v), _MM_SHUFFLE(0, 1, 2, 3))

/*
 * SSE inverse filter bank: IMDCT, windowing and overlap-add, four floats
 * per step using _mm_load_ps/_mm_store_ps.
 *
 * fb                - filter bank state (window tables, mdct256 handle)
 * window_sequence   - ONLY_LONG / LONG_START / EIGHT_SHORT / LONG_STOP
 * window_shape      - index of the window table for the current frame
 * window_shape_prev - index of the window table for the previous frame
 * freq_in           - spectral input
 * time_out          - indices [0, frame_len) receive this frame's output;
 *                     indices [frame_len, 2*frame_len) are read as the
 *                     overlap saved by the previous call and rewritten
 *                     with the overlap for the next one
 * object_type       - LD selects fb->ld_window when LD_DEC is defined
 * frame_len         - frame length (nlong); short blocks are frame_len/8
 *
 * For LD only the long windows are selected; the short-window pointers
 * stay NULL in that case.
 */
void ifilter_bank_sse(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
                      uint8_t window_shape_prev, real_t *freq_in,
                      real_t *time_out, uint8_t object_type, uint16_t frame_len)
{
    int16_t n;
    int16_t blk;
    ALIGN real_t transf_buf[2*1024] = {0};

    const real_t *window_long = NULL;
    const real_t *window_long_prev = NULL;
    const real_t *window_short = NULL;
    const real_t *window_short_prev = NULL;

    uint16_t nlong = frame_len;
    uint16_t nshort = frame_len/8;
    uint16_t trans = nshort/2;

    uint16_t nflat_ls = (nlong-nshort)/2;

#ifdef PROFILE
    int64_t count = faad_get_ts();
#endif

#ifdef LD_DEC
    if (object_type == LD)
    {
        window_long      = fb->ld_window[window_shape];
        window_long_prev = fb->ld_window[window_shape_prev];
    } else {
#endif
        window_long       = fb->long_window[window_shape];
        window_long_prev  = fb->long_window[window_shape_prev];
        window_short      = fb->short_window[window_shape];
        window_short_prev = fb->short_window[window_shape_prev];
#ifdef LD_DEC
    }
#endif

    switch (window_sequence)
    {
    case ONLY_LONG_SEQUENCE:
        imdct_long_sse(fb, freq_in, transf_buf, 2*nlong);

        for (n = 0; n < nlong; n += 4)
        {
            __m128 first  = _mm_load_ps(&transf_buf[n]);
            __m128 w_prev = _mm_load_ps(&window_long_prev[n]);
            __m128 w_cur  = _mm_load_ps(&window_long[nlong-4-n]);
            __m128 carry  = _mm_load_ps(&time_out[nlong+n]);
            __m128 second = _mm_load_ps(&transf_buf[nlong+n]);
            __m128 out, next;

            /* output = windowed first half + saved overlap */
            out = _mm_mul_ps(first, w_prev);
            out = _mm_add_ps(out, carry);

            /* new overlap = second half times the reversed window */
            next = _mm_mul_ps(second, FB_SSE_REVERSE(w_cur));

            _mm_store_ps(&time_out[n], out);
            _mm_store_ps(&time_out[nlong+n], next);
        }
        break;

    case LONG_START_SEQUENCE:
        imdct_long_sse(fb, freq_in, transf_buf, 2*nlong);

        /* output = windowed first half + saved overlap */
        for (n = 0; n < nlong; n += 4)
        {
            __m128 first  = _mm_load_ps(&transf_buf[n]);
            __m128 w_prev = _mm_load_ps(&window_long_prev[n]);
            __m128 carry  = _mm_load_ps(&time_out[nlong+n]);
            __m128 out    = _mm_mul_ps(first, w_prev);

            out = _mm_add_ps(out, carry);
            _mm_store_ps(&time_out[n], out);
        }

        /* new overlap: flat part copied as is ... */
        for (n = 0; n < nflat_ls; n += 4)
        {
            __m128 v = _mm_load_ps(&transf_buf[nlong+n]);
            _mm_store_ps(&time_out[nlong+n], v);
        }

        /* ... then a reversed short window ... */
        for (n = 0; n < nshort; n += 4)
        {
            __m128 v = _mm_load_ps(&transf_buf[nlong+nflat_ls+n]);
            __m128 w = _mm_load_ps(&window_short[nshort-4-n]);
            __m128 next = _mm_mul_ps(v, FB_SSE_REVERSE(w));

            _mm_store_ps(&time_out[nlong+nflat_ls+n], next);
        }

        /* ... then zeros */
        for (n = 0; n < nflat_ls; n += 4)
        {
            __m128 zero = _mm_setzero_ps();
            _mm_store_ps(&time_out[nlong+nflat_ls+nshort+n], zero);
        }
        break;

    case EIGHT_SHORT_SEQUENCE:
        /* one short IMDCT per block; block b fills chunks 2b and 2b+1
           (each nshort long) of transf_buf */
        for (blk = 0; blk < 8; blk++)
        {
            faad_imdct_sse(fb->mdct256, &freq_in[blk*nshort],
                &transf_buf[2*nshort*blk]);
        }

        /* leading flat part: saved overlap only */
        for (n = 0; n < nflat_ls; n += 4)
        {
            __m128 carry = _mm_load_ps(&time_out[nlong+n]);
            _mm_store_ps(&time_out[n], carry);
        }

        /* segment 0: chunk 0 with the previous frame's short window */
        for (n = 0; n < nshort; n += 4)
        {
            __m128 head   = _mm_load_ps(&transf_buf[nshort*0+n]);
            __m128 w_prev = _mm_load_ps(&window_short_prev[n]);
            __m128 carry  = _mm_load_ps(&time_out[nlong+nflat_ls+n]);
            __m128 out    = _mm_mul_ps(head, w_prev);

            out = _mm_add_ps(out, carry);
            _mm_store_ps(&time_out[nflat_ls+n], out);
        }

        /* segments 1..3: chunk 2s-1 (reversed window) + saved overlap
           + chunk 2s (forward window) */
        for (blk = 1; blk <= 3; blk++)
        {
            for (n = 0; n < nshort; n += 4)
            {
                __m128 tail  = _mm_load_ps(&transf_buf[nshort*(2*blk-1)+n]);
                __m128 w_dn  = _mm_load_ps(&window_short[nshort-4-n]);
                __m128 carry = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*blk+n]);
                __m128 head  = _mm_load_ps(&transf_buf[nshort*(2*blk)+n]);
                __m128 w_up  = _mm_load_ps(&window_short[n]);
                __m128 out, rise;

                out  = _mm_mul_ps(tail, FB_SSE_REVERSE(w_dn));
                rise = _mm_mul_ps(head, w_up);
                out  = _mm_add_ps(out, carry);
                out  = _mm_add_ps(out, rise);

                _mm_store_ps(&time_out[nflat_ls+blk*nshort+n], out);
            }
        }

        /* segment 4, first half: still part of this frame's output */
        for (n = 0; n < trans; n += 4)
        {
            __m128 tail  = _mm_load_ps(&transf_buf[nshort*7+n]);
            __m128 w_dn  = _mm_load_ps(&window_short[nshort-4-n]);
            __m128 carry = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*4+n]);
            __m128 head  = _mm_load_ps(&transf_buf[nshort*8+n]);
            __m128 w_up  = _mm_load_ps(&window_short[n]);
            __m128 out, rise;

            out  = _mm_mul_ps(tail, FB_SSE_REVERSE(w_dn));
            rise = _mm_mul_ps(head, w_up);
            out  = _mm_add_ps(out, carry);
            out  = _mm_add_ps(out, rise);

            _mm_store_ps(&time_out[nflat_ls+4*nshort+n], out);
        }

        /* segment 4, second half: lands in the overlap region, nothing added */
        for (n = trans; n < nshort; n += 4)
        {
            __m128 tail = _mm_load_ps(&transf_buf[nshort*7+n]);
            __m128 w_dn = _mm_load_ps(&window_short[nshort-4-n]);
            __m128 head = _mm_load_ps(&transf_buf[nshort*8+n]);
            __m128 w_up = _mm_load_ps(&window_short[n]);
            __m128 fall, rise, next;

            fall = _mm_mul_ps(tail, FB_SSE_REVERSE(w_dn));
            rise = _mm_mul_ps(head, w_up);
            next = _mm_add_ps(fall, rise);

            _mm_store_ps(&time_out[nflat_ls+4*nshort+n], next);
        }

        /* segments 5..7: overlap region, chunk 2s-1 + chunk 2s */
        for (blk = 5; blk <= 7; blk++)
        {
            for (n = 0; n < nshort; n += 4)
            {
                __m128 tail = _mm_load_ps(&transf_buf[nshort*(2*blk-1)+n]);
                __m128 w_dn = _mm_load_ps(&window_short[nshort-4-n]);
                __m128 head = _mm_load_ps(&transf_buf[nshort*(2*blk)+n]);
                __m128 w_up = _mm_load_ps(&window_short[n]);
                __m128 fall, rise, next;

                fall = _mm_mul_ps(tail, FB_SSE_REVERSE(w_dn));
                rise = _mm_mul_ps(head, w_up);
                next = _mm_add_ps(fall, rise);

                _mm_store_ps(&time_out[nflat_ls+blk*nshort+n], next);
            }
        }

        /* segment 8: last chunk with the reversed window only */
        for (n = 0; n < nshort; n += 4)
        {
            __m128 tail = _mm_load_ps(&transf_buf[nshort*15+n]);
            __m128 w_dn = _mm_load_ps(&window_short[nshort-4-n]);
            __m128 next = _mm_mul_ps(tail, FB_SSE_REVERSE(w_dn));

            _mm_store_ps(&time_out[nflat_ls+8*nshort+n], next);
        }

        /* trailing part of the overlap region is zero */
        for (n = 0; n < nflat_ls; n += 4)
        {
            __m128 zero = _mm_setzero_ps();
            _mm_store_ps(&time_out[nlong+nflat_ls+nshort+n], zero);
        }
        break;

    case LONG_STOP_SEQUENCE:
        imdct_long_sse(fb, freq_in, transf_buf, 2*nlong);

        /* leading flat part: saved overlap only */
        for (n = 0; n < nflat_ls; n += 4)
        {
            __m128 carry = _mm_load_ps(&time_out[nlong+n]);
            _mm_store_ps(&time_out[n], carry);
        }

        /* short window slope of the previous frame */
        for (n = 0; n < nshort; n += 4)
        {
            __m128 v      = _mm_load_ps(&transf_buf[nflat_ls+n]);
            __m128 w_prev = _mm_load_ps(&window_short_prev[n]);
            __m128 carry  = _mm_load_ps(&time_out[nlong+nflat_ls+n]);
            __m128 out    = _mm_mul_ps(v, w_prev);

            out = _mm_add_ps(out, carry);
            _mm_store_ps(&time_out[nflat_ls+n], out);
        }

        /* remaining part: unwindowed transform output + saved overlap */
        for (n = 0; n < nflat_ls; n += 4)
        {
            __m128 v     = _mm_load_ps(&transf_buf[nflat_ls+nshort+n]);
            __m128 carry = _mm_load_ps(&time_out[nlong+nflat_ls+nshort+n]);
            __m128 out   = _mm_add_ps(v, carry);

            _mm_store_ps(&time_out[nflat_ls+nshort+n], out);
        }

        /* new overlap = second half times the reversed long window */
        for (n = 0; n < nlong; n += 4)
        {
            __m128 second = _mm_load_ps(&transf_buf[nlong+n]);
            __m128 w_cur  = _mm_load_ps(&window_long[nlong-4-n]);
            __m128 next   = _mm_mul_ps(second, FB_SSE_REVERSE(w_cur));

            _mm_store_ps(&time_out[nlong+n], next);
        }
        break;
    }

#ifdef PROFILE
    count = faad_get_ts() - count;
    fb->cycles += count;
#endif
}

#undef FB_SSE_REVERSE
/*
 * Inverse filter bank: IMDCT, windowing and overlap-add.
 *
 * fb                - filter bank state (window tables, mdct256 handle)
 * window_sequence   - ONLY_LONG / LONG_START / EIGHT_SHORT / LONG_STOP
 * window_shape      - index of the window table for the current frame
 * window_shape_prev - index of the window table for the previous frame
 * freq_in           - spectral input
 * time_out          - receives this frame's output samples
 * overlap           - read as the overlap saved by the previous call, then
 *                     rewritten with the overlap for the next one
 * object_type       - LD selects fb->ld_window when LD_DEC is defined
 * frame_len         - frame length (nlong); short blocks are frame_len/8
 *
 * For LD only the long windows are selected; the short-window pointers
 * stay NULL in that case.  The long-window loops advance in steps of four
 * samples.
 */
void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
                  uint8_t window_shape_prev, real_t *freq_in,
                  real_t *time_out, real_t *overlap,
                  uint8_t object_type, uint16_t frame_len)
{
    int16_t n;
    int16_t lane;
    int16_t blk;
    ALIGN real_t transf_buf[2*1024] = {0};

    const real_t *window_long = NULL;
    const real_t *window_long_prev = NULL;
    const real_t *window_short = NULL;
    const real_t *window_short_prev = NULL;

    uint16_t nlong = frame_len;
    uint16_t nshort = frame_len/8;
    uint16_t trans = nshort/2;

    uint16_t nflat_ls = (nlong-nshort)/2;

#ifdef PROFILE
    int64_t count = faad_get_ts();
#endif

#ifdef LD_DEC
    if (object_type == LD)
    {
        window_long      = fb->ld_window[window_shape];
        window_long_prev = fb->ld_window[window_shape_prev];
    } else {
#endif
        window_long       = fb->long_window[window_shape];
        window_long_prev  = fb->long_window[window_shape_prev];
        window_short      = fb->short_window[window_shape];
        window_short_prev = fb->short_window[window_shape_prev];
#ifdef LD_DEC
    }
#endif

    switch (window_sequence)
    {
    case ONLY_LONG_SEQUENCE:
        imdct_long(fb, freq_in, transf_buf, 2*nlong);

        /* output = saved overlap + windowed first half, four samples per step */
        for (n = 0; n < nlong; n += 4)
        {
            for (lane = 0; lane < 4; lane++)
            {
                time_out[n+lane] = overlap[n+lane] +
                    MUL_F(transf_buf[n+lane],window_long_prev[n+lane]);
            }
        }

        /* new overlap = second half times the window read backwards */
        for (n = 0; n < nlong; n += 4)
        {
            for (lane = 0; lane < 4; lane++)
            {
                overlap[n+lane] =
                    MUL_F(transf_buf[nlong+n+lane],window_long[nlong-1-n-lane]);
            }
        }
        break;

    case LONG_START_SEQUENCE:
        imdct_long(fb, freq_in, transf_buf, 2*nlong);

        /* output = saved overlap + windowed first half, four samples per step */
        for (n = 0; n < nlong; n += 4)
        {
            for (lane = 0; lane < 4; lane++)
            {
                time_out[n+lane] = overlap[n+lane] +
                    MUL_F(transf_buf[n+lane],window_long_prev[n+lane]);
            }
        }

        /* new overlap: flat part, reversed short window, then zeros */
        for (n = 0; n < nflat_ls; n++)
        {
            overlap[n] = transf_buf[nlong+n];
        }
        for (n = 0; n < nshort; n++)
        {
            overlap[nflat_ls+n] =
                MUL_F(transf_buf[nlong+nflat_ls+n],window_short[nshort-n-1]);
        }
        for (n = 0; n < nflat_ls; n++)
        {
            overlap[nflat_ls+nshort+n] = 0;
        }
        break;

    case EIGHT_SHORT_SEQUENCE:
        /* one short IMDCT per block; block b fills chunks 2b and 2b+1
           (each nshort long) of transf_buf */
        for (blk = 0; blk < 8; blk++)
        {
            faad_imdct(fb->mdct256, freq_in+blk*nshort, transf_buf+2*nshort*blk);
        }

        /* leading flat part: saved overlap only */
        for (n = 0; n < nflat_ls; n++)
        {
            time_out[n] = overlap[n];
        }

        /* segments 0..3 and the first half of segment 4 go to time_out */
        for (n = 0; n < nshort; n++)
        {
            time_out[nflat_ls+n] = overlap[nflat_ls+n] +
                MUL_F(transf_buf[nshort*0+n],window_short_prev[n]);

            for (blk = 1; blk <= 3; blk++)
            {
                time_out[nflat_ls+blk*nshort+n] = overlap[nflat_ls+nshort*blk+n] +
                    MUL_F(transf_buf[nshort*(2*blk-1)+n],window_short[nshort-1-n]) +
                    MUL_F(transf_buf[nshort*(2*blk)+n],window_short[n]);
            }

            if (n < trans)
            {
                time_out[nflat_ls+4*nshort+n] = overlap[nflat_ls+nshort*4+n] +
                    MUL_F(transf_buf[nshort*7+n],window_short[nshort-1-n]) +
                    MUL_F(transf_buf[nshort*8+n],window_short[n]);
            }
        }

        /* second half of segment 4 and segments 5..8 become the new overlap */
        for (n = 0; n < nshort; n++)
        {
            if (n >= trans)
            {
                overlap[nflat_ls+4*nshort+n-nlong] =
                    MUL_F(transf_buf[nshort*7+n],window_short[nshort-1-n]) +
                    MUL_F(transf_buf[nshort*8+n],window_short[n]);
            }

            for (blk = 5; blk <= 7; blk++)
            {
                overlap[nflat_ls+blk*nshort+n-nlong] =
                    MUL_F(transf_buf[nshort*(2*blk-1)+n],window_short[nshort-1-n]) +
                    MUL_F(transf_buf[nshort*(2*blk)+n],window_short[n]);
            }

            overlap[nflat_ls+8*nshort+n-nlong] =
                MUL_F(transf_buf[nshort*15+n],window_short[nshort-1-n]);
        }

        /* trailing part of the overlap is zero */
        for (n = 0; n < nflat_ls; n++)
        {
            overlap[nflat_ls+nshort+n] = 0;
        }
        break;

    case LONG_STOP_SEQUENCE:
        imdct_long(fb, freq_in, transf_buf, 2*nlong);

        /* leading flat part: saved overlap only */
        for (n = 0; n < nflat_ls; n++)
        {
            time_out[n] = overlap[n];
        }

        /* short window slope of the previous frame */
        for (n = 0; n < nshort; n++)
        {
            time_out[nflat_ls+n] = overlap[nflat_ls+n] +
                MUL_F(transf_buf[nflat_ls+n],window_short_prev[n]);
        }

        /* remaining part: saved overlap + unwindowed transform output */
        for (n = 0; n < nflat_ls; n++)
        {
            time_out[nflat_ls+nshort+n] = overlap[nflat_ls+nshort+n] +
                transf_buf[nflat_ls+nshort+n];
        }

        /* new overlap = second half times the long window read backwards */
        for (n = 0; n < nlong; n++)
        {
            overlap[n] = MUL_F(transf_buf[nlong+n],window_long[nlong-1-n]);
        }
        break;
    }

#ifdef PROFILE
    count = faad_get_ts() - count;
    fb->cycles += count;
#endif
}
/*
 * Inverse filter bank.
 *
 * Runs the inverse MDCT(s) for one frame, applies the synthesis windows,
 * adds the tail saved from the previous frame and stores the new tail.
 *
 *   fb                 filter bank state (window tables, MDCT handles)
 *   window_sequence    block type of this frame; values other than the four
 *                      handled below leave time_out and overlap untouched
 *   window_shape       window table index for the current frame
 *   window_shape_prev  window table index for the previous frame
 *   freq_in            spectral coefficients of this frame
 *   time_out           time domain output of this frame
 *   overlap            in:  tail stored by the previous call
 *                      out: tail to be consumed by the next call
 *   object_type        only inspected when LD_DEC is defined
 *   frame_len          frame length; the scratch buffer holds 2*1024 values
 */
void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
                  uint8_t window_shape_prev, real_t *freq_in,
                  real_t *time_out, real_t *overlap,
                  uint8_t object_type, uint16_t frame_len)
{
    int16_t i;
    uint8_t j;   /* position inside a group of four samples */
    uint8_t blk; /* short block counter */
    ALIGN real_t transf_buf[2*1024] = {0};

    const real_t *window_long = NULL;
    const real_t *window_long_prev = NULL;
    const real_t *window_short = NULL;
    const real_t *window_short_prev = NULL;

    uint16_t nlong = frame_len;
    uint16_t nshort = frame_len/8;
    uint16_t trans = nshort/2;
    uint16_t nflat_ls = (nlong-nshort)/2;

#ifdef PROFILE
    int64_t count = faad_get_ts();
#endif

    /* pick the window tables (Sine or KBD) for this frame and the last one;
       the LD branch sets no short windows */
#ifdef LD_DEC
    if (object_type == LD)
    {
        window_long      = fb->ld_window[window_shape];
        window_long_prev = fb->ld_window[window_shape_prev];
    } else {
#endif
        window_long       = fb->long_window[window_shape];
        window_long_prev  = fb->long_window[window_shape_prev];
        window_short      = fb->short_window[window_shape];
        window_short_prev = fb->short_window[window_shape_prev];
#ifdef LD_DEC
    }
#endif

    /* disabled debug dump of the input spectrum */
#if 0
    for (i = 0; i < 1024; i++)
    {
        printf("%d\n", freq_in[i]);
    }
#endif

    /* disabled debug dump of the window parameters */
#if 0
    printf("%d %d\n", window_sequence, window_shape);
#endif

    switch (window_sequence)
    {
    case ONLY_LONG_SEQUENCE:
    case LONG_START_SEQUENCE:
        /* both block types share the transform and the first half */
        imdct_long(fb, freq_in, transf_buf, 2*nlong);

        /* first half: previous tail + windowed output, four samples per step */
        for (i = 0; i < nlong; i += 4)
        {
            for (j = 0; j < 4; j++)
            {
                time_out[i+j] = overlap[i+j] +
                    MUL_F(transf_buf[i+j], window_long_prev[i+j]);
            }
        }

        if (window_sequence == ONLY_LONG_SEQUENCE)
        {
            /* second half: long window read back to front */
            for (i = 0; i < nlong; i += 4)
            {
                for (j = 0; j < 4; j++)
                {
                    overlap[i+j] =
                        MUL_F(transf_buf[nlong+i+j], window_long[nlong-1-i-j]);
                }
            }
        } else {
            /* second half: window padded with 1's, short slope, then 0's */
            for (i = 0; i < nflat_ls; i++)
            {
                overlap[i] = transf_buf[nlong+i];
            }
            for (i = 0; i < nshort; i++)
            {
                overlap[nflat_ls+i] =
                    MUL_F(transf_buf[nlong+nflat_ls+i], window_short[nshort-i-1]);
            }
            for (i = 0; i < nflat_ls; i++)
            {
                overlap[nflat_ls+nshort+i] = 0;
            }
        }
        break;

    case EIGHT_SHORT_SEQUENCE:
        /* one short transform per block, each producing 2*nshort values */
        for (blk = 0; blk < 8; blk++)
        {
            faad_imdct(fb->mdct256, freq_in + blk*nshort,
                transf_buf + 2*nshort*blk);
        }

        /* the flat leading part is the previous tail unchanged */
        for (i = 0; i < nflat_ls; i++)
        {
            time_out[i] = overlap[i];
        }

        for (i = 0; i < nshort; i++)
        {
            /* block 0 rises with the previous frame's short window */
            time_out[nflat_ls+i] = overlap[nflat_ls+i] +
                MUL_F(transf_buf[i], window_short_prev[i]);

            /* blocks 1..3: falling half of one transform plus rising half
               of the next */
            for (blk = 1; blk < 4; blk++)
            {
                time_out[nflat_ls+blk*nshort+i] =
                    overlap[nflat_ls+blk*nshort+i] +
                    MUL_F(transf_buf[nshort*(2*blk-1)+i], window_short[nshort-1-i]) +
                    MUL_F(transf_buf[nshort*(2*blk)+i], window_short[i]);
            }

            /* block 4 belongs to this frame only for i < trans */
            if (i < trans)
            {
                time_out[nflat_ls+4*nshort+i] =
                    overlap[nflat_ls+4*nshort+i] +
                    MUL_F(transf_buf[nshort*7+i], window_short[nshort-1-i]) +
                    MUL_F(transf_buf[nshort*8+i], window_short[i]);
            }
        }

        /* everything past frame_len is stored as the next tail */
        for (i = 0; i < nshort; i++)
        {
            /* remainder of block 4 */
            if (i >= trans)
            {
                overlap[nflat_ls+4*nshort+i-nlong] =
                    MUL_F(transf_buf[nshort*7+i], window_short[nshort-1-i]) +
                    MUL_F(transf_buf[nshort*8+i], window_short[i]);
            }

            /* blocks 5..7 */
            for (blk = 5; blk < 8; blk++)
            {
                overlap[nflat_ls+blk*nshort+i-nlong] =
                    MUL_F(transf_buf[nshort*(2*blk-1)+i], window_short[nshort-1-i]) +
                    MUL_F(transf_buf[nshort*(2*blk)+i], window_short[i]);
            }

            /* falling half of the last transform has no successor */
            overlap[nflat_ls+8*nshort+i-nlong] =
                MUL_F(transf_buf[nshort*15+i], window_short[nshort-1-i]);
        }

        /* rest of the tail is zero */
        for (i = 0; i < nflat_ls; i++)
        {
            overlap[nflat_ls+nshort+i] = 0;
        }
        break;

    case LONG_STOP_SEQUENCE:
        imdct_long(fb, freq_in, transf_buf, 2*nlong);

        /* first half: window padded with 0's, short slope, then 1's */
        for (i = 0; i < nflat_ls; i++)
        {
            time_out[i] = overlap[i];
        }
        for (i = 0; i < nshort; i++)
        {
            time_out[nflat_ls+i] = overlap[nflat_ls+i] +
                MUL_F(transf_buf[nflat_ls+i], window_short_prev[i]);
        }
        for (i = 0; i < nflat_ls; i++)
        {
            time_out[nflat_ls+nshort+i] = overlap[nflat_ls+nshort+i] +
                transf_buf[nflat_ls+nshort+i];
        }

        /* second half: long window read back to front */
        for (i = 0; i < nlong; i++)
        {
            overlap[i] = MUL_F(transf_buf[nlong+i], window_long[nlong-1-i]);
        }
        break;
    }

    /* disabled debug dump of the output samples */
#if 0
    for (i = 0; i < 1024; i++)
    {
        printf("%d\n", time_out[i]);
        //printf("0x%.8X\n", time_out[i]);
    }
#endif

#ifdef PROFILE
    count = faad_get_ts() - count;
    fb->cycles += count;
#endif
}
/*
 * SSE implementation of the inverse MDCT.
 *
 * Stages, in order: pre-IFFT complex multiplication with the sincos table,
 * complex IFFT through cfftb_sse(), post-IFFT complex multiplication with the
 * same table, and reordering of the result into X_out.
 *
 *   mdct   supplies the transform length N, the sincos table and the cfft
 *          handle passed to cfftb_sse()
 *   X_in   input; indices 0 .. N/2-1 are read when N/4 is a multiple of 4
 *   X_out  output; indices 0 .. N-1 are written when N/8 is even
 *
 * Every vector access uses _mm_load_ps/_mm_store_ps, which require 16-byte
 * aligned addresses, so X_in, X_out and mdct->sincos must be aligned
 * accordingly. The scratch array holds 512 complex values, so N/4 must not
 * exceed 512.
 * NOTE(review): the vector code treats complex_t as two packed floats and
 * relies on ALIGN to align Z1 - confirm against their definitions.
 */
void faad_imdct_sse(mdct_info *mdct, real_t *X_in, real_t *X_out)
{
    uint16_t k;

    ALIGN complex_t Z1[512];
    complex_t *sincos = mdct->sincos;

    uint16_t N = mdct->N;
    uint16_t N2 = N >> 1;
    uint16_t N4 = N >> 2;
    uint16_t N8 = N >> 3;

#ifdef PROFILE
    /* count2 times the whole call, count1 only the FFT */
    int64_t count1, count2 = faad_get_ts();
#endif

    /* pre-IFFT complex multiplication */
    /* Each pass fills Z1[k] .. Z1[k+3]: the m* registers produce the first
       128-bit store (at Z1[k]), the n* registers the second (at Z1[k+2]). */
    for (k = 0; k < N4; k+=4)
    {
        __m128 m12, m13, m14, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11;
        __m128 n12, n13, n14, n0, n1, n2, n3, n4, n5, n6, n7, n8, n9, n10, n11;

        /* input is taken from both ends of the first N2 samples */
        n12 = _mm_load_ps(&X_in[N2 - 2*k - 8]);
        m12 = _mm_load_ps(&X_in[N2 - 2*k - 4]);
        m13 = _mm_load_ps(&X_in[2*k]);
        n13 = _mm_load_ps(&X_in[2*k + 4]);
        /* four floats of the sincos table each */
        m1 = _mm_load_ps(&RE(sincos[k]));
        n1 = _mm_load_ps(&RE(sincos[k+2]));

        /* m2/n2: sincos values with neighbouring lanes swapped */
        m0 = _mm_shuffle_ps(m12, m13, _MM_SHUFFLE(2,0,1,3));
        m2 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(2,3,0,1));
        m14 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3,1,2,0));
        n0 = _mm_shuffle_ps(n12, n13, _MM_SHUFFLE(2,0,1,3));
        n2 = _mm_shuffle_ps(n1, n1, _MM_SHUFFLE(2,3,0,1));
        n14 = _mm_shuffle_ps(n0, n0, _MM_SHUFFLE(3,1,2,0));

        /* products with the table in original and swapped lane order */
        m3 = _mm_mul_ps(m14, m1);
        n3 = _mm_mul_ps(n14, n1);
        m4 = _mm_mul_ps(m14, m2);
        n4 = _mm_mul_ps(n14, n2);

        /* gather even lanes and odd lanes of the products */
        m5 = _mm_shuffle_ps(m3, m4, _MM_SHUFFLE(2,0,2,0));
        n5 = _mm_shuffle_ps(n3, n4, _MM_SHUFFLE(2,0,2,0));
        m6 = _mm_shuffle_ps(m3, m4, _MM_SHUFFLE(3,1,3,1));
        n6 = _mm_shuffle_ps(n3, n4, _MM_SHUFFLE(3,1,3,1));

        /* sums and differences of the product pairs */
        m7 = _mm_add_ps(m5, m6);
        n7 = _mm_add_ps(n5, n6);
        m8 = _mm_sub_ps(m5, m6);
        n8 = _mm_sub_ps(n5, n6);

        /* keep the upper half of the sums and the lower half of the
           differences, then interleave them for the store */
        m9 = _mm_shuffle_ps(m7, m7, _MM_SHUFFLE(3,2,3,2));
        n9 = _mm_shuffle_ps(n7, n7, _MM_SHUFFLE(3,2,3,2));
        m10 = _mm_shuffle_ps(m8, m8, _MM_SHUFFLE(1,0,1,0));
        n10 = _mm_shuffle_ps(n8, n8, _MM_SHUFFLE(1,0,1,0));

        m11 = _mm_unpacklo_ps(m10, m9);
        n11 = _mm_unpacklo_ps(n10, n9);

        _mm_store_ps(&RE(Z1[k]), m11);
        _mm_store_ps(&RE(Z1[k+2]), n11);
    }

#ifdef PROFILE
    count1 = faad_get_ts();
#endif

    /* complex IFFT, any non-scaling FFT can be used here */
    cfftb_sse(mdct->cfft, Z1);

#ifdef PROFILE
    count1 = faad_get_ts() - count1;
#endif

    /* post-IFFT complex multiplication */
    /* Same multiply/shuffle sequence as the pre-IFFT stage, but the operand
       is loaded straight from Z1 and the result is written back in place. */
    for (k = 0; k < N4; k+=4)
    {
        __m128 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11;
        __m128 n0, n1, n2, n3, n4, n5, n6, n7, n8, n9, n10, n11;

        m0 = _mm_load_ps(&RE(Z1[k]));
        n0 = _mm_load_ps(&RE(Z1[k+2]));
        m1 = _mm_load_ps(&RE(sincos[k]));
        n1 = _mm_load_ps(&RE(sincos[k+2]));

        /* sincos values with neighbouring lanes swapped */
        m2 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(2,3,0,1));
        n2 = _mm_shuffle_ps(n1, n1, _MM_SHUFFLE(2,3,0,1));

        m3 = _mm_mul_ps(m0, m1);
        n3 = _mm_mul_ps(n0, n1);
        m4 = _mm_mul_ps(m0, m2);
        n4 = _mm_mul_ps(n0, n2);

        m5 = _mm_shuffle_ps(m3, m4, _MM_SHUFFLE(2,0,2,0));
        n5 = _mm_shuffle_ps(n3, n4, _MM_SHUFFLE(2,0,2,0));
        m6 = _mm_shuffle_ps(m3, m4, _MM_SHUFFLE(3,1,3,1));
        n6 = _mm_shuffle_ps(n3, n4, _MM_SHUFFLE(3,1,3,1));

        m7 = _mm_add_ps(m5, m6);
        n7 = _mm_add_ps(n5, n6);
        m8 = _mm_sub_ps(m5, m6);
        n8 = _mm_sub_ps(n5, n6);

        m9 = _mm_shuffle_ps(m7, m7, _MM_SHUFFLE(3,2,3,2));
        n9 = _mm_shuffle_ps(n7, n7, _MM_SHUFFLE(3,2,3,2));
        m10 = _mm_shuffle_ps(m8, m8, _MM_SHUFFLE(1,0,1,0));
        n10 = _mm_shuffle_ps(n8, n8, _MM_SHUFFLE(1,0,1,0));

        m11 = _mm_unpacklo_ps(m10, m9);
        n11 = _mm_unpacklo_ps(n10, n9);

        _mm_store_ps(&RE(Z1[k]), m11);
        _mm_store_ps(&RE(Z1[k+2]), n11);
    }

    /* reordering */
    /* Each pass reads four 128-bit groups from Z1 and writes four floats to
       each quarter of X_out (offsets 0, N4, N2 and N2 + N4). */
    for (k = 0; k < N8; k+=2)
    {
        __m128 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m13;
        __m128 n4, n5, n6, n7, n8, n9;
        /* _mm_set_ps lists lanes from highest to lowest:
           neg1 flips the sign of lanes 1 and 3, neg2 flips all four lanes */
        __m128 neg1 = _mm_set_ps(-1.0, 1.0, -1.0, 1.0);
        __m128 neg2 = _mm_set_ps(-1.0, -1.0, -1.0, -1.0);

        /* two groups counted up from Z1[k] and Z1[N8 + k], two counted down
           from Z1[N8 - 2 - k] and Z1[N4 - 2 - k] */
        m0 = _mm_load_ps(&RE(Z1[k]));
        m1 = _mm_load_ps(&RE(Z1[N8 - 2 - k]));
        m2 = _mm_load_ps(&RE(Z1[N8 + k]));
        m3 = _mm_load_ps(&RE(Z1[N4 - 2 - k]));

        /* apply the signs; m2 is used unchanged */
        m10 = _mm_mul_ps(m0, neg1);
        m11 = _mm_mul_ps(m1, neg2);
        m13 = _mm_mul_ps(m3, neg1);

        /* bring even lanes to the low half and odd lanes to the high half */
        m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3,1,2,0));
        n4 = _mm_shuffle_ps(m10, m10, _MM_SHUFFLE(3,1,2,0));
        m4 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(3,1,2,0));
        n5 = _mm_shuffle_ps(m13, m13, _MM_SHUFFLE(3,1,2,0));

        /* combine the low half of one register with the high half of another */
        m6 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(3,2,1,0));
        n6 = _mm_shuffle_ps(n4, n5, _MM_SHUFFLE(3,2,1,0));
        m7 = _mm_shuffle_ps(m5, m4, _MM_SHUFFLE(3,2,1,0));
        n7 = _mm_shuffle_ps(n5, n4, _MM_SHUFFLE(3,2,1,0));

        /* final lane permutation into output order */
        m8 = _mm_shuffle_ps(m6, m6, _MM_SHUFFLE(0,3,1,2));
        n8 = _mm_shuffle_ps(n6, n6, _MM_SHUFFLE(2,1,3,0));
        m9 = _mm_shuffle_ps(m7, m7, _MM_SHUFFLE(2,1,3,0));
        n9 = _mm_shuffle_ps(n7, n7, _MM_SHUFFLE(0,3,1,2));

        _mm_store_ps(&X_out[2*k], m8);
        _mm_store_ps(&X_out[N4 + 2*k], n8);
        _mm_store_ps(&X_out[N2 + 2*k], m9);
        _mm_store_ps(&X_out[N2 + N4 + 2*k], n9);
    }

#ifdef PROFILE
    /* FFT time goes to fft_cycles, the remainder of the call to cycles */
    count2 = faad_get_ts() - count2;
    mdct->fft_cycles += count1;
    mdct->cycles += (count2 - count1);
#endif
}
/*
 * Inverse MDCT, scalar implementation.
 *
 * Stages, in order: multiply the input by the sincos table, run the complex
 * IFFT with cfftb(), multiply by the table again, then reorder the result
 * into X_out with the required sign changes.
 *
 *   mdct   supplies the transform length N, the sincos table and the cfft
 *          handle passed to cfftb()
 *   X_in   input; indices 0 .. N/2-1 are read
 *   X_out  output; indices 0 .. N-1 are written when N/8 is even
 *
 * The scratch array holds 512 complex values, so N/4 must not exceed 512.
 */
void faad_imdct(mdct_info *mdct, real_t *X_in, real_t *X_out)
{
    uint16_t k;

    ALIGN complex_t spec[512];
    complex_t *twid = mdct->sincos;

    uint16_t len     = mdct->N;
    uint16_t half    = len >> 1;
    uint16_t quarter = len >> 2;
    uint16_t eighth  = len >> 3;

#ifdef PROFILE
    /* count2 times the whole call, count1 only the FFT */
    int64_t count1, count2 = faad_get_ts();
#endif

    /* stage 1: pre-IFFT complex multiplication; the operands come from
       opposite ends of the first half of the input */
    for (k = 0; k < quarter; k++)
    {
        ComplexMult(&IM(spec[k]), &RE(spec[k]),
            X_in[2*k], X_in[half - 1 - 2*k],
            RE(twid[k]), IM(twid[k]));
    }

#ifdef PROFILE
    count1 = faad_get_ts();
#endif

    /* stage 2: complex IFFT, any non-scaling FFT can be used here */
    cfftb(mdct->cfft, spec);

#ifdef PROFILE
    count1 = faad_get_ts() - count1;
#endif

    /* stage 3: post-IFFT complex multiplication; the element is copied
       first because ComplexMult writes back into the same slot */
    for (k = 0; k < quarter; k++)
    {
        const real_t re = RE(spec[k]);
        const real_t im = IM(spec[k]);

        ComplexMult(&IM(spec[k]), &RE(spec[k]),
            im, re, RE(twid[k]), IM(twid[k]));
    }

    /* stage 4: reordering, four samples per output quarter and pass */
    for (k = 0; k < eighth; k += 2)
    {
        /* start of this pass inside each output quarter */
        const int q0 = 2*k;
        const int q1 = quarter + q0;
        const int q2 = half + q0;
        const int q3 = half + quarter + q0;

        /* spectrum positions walked upwards ... */
        const int up_lo  = k;
        const int up_mid = eighth + k;
        /* ... and downwards */
        const int dn_mid = eighth - 1 - k;
        const int dn_hi  = quarter - 1 - k;

        X_out[q0    ] =  IM(spec[up_mid]);
        X_out[q0 + 1] = -RE(spec[dn_mid]);
        X_out[q0 + 2] =  IM(spec[up_mid + 1]);
        X_out[q0 + 3] = -RE(spec[dn_mid - 1]);

        X_out[q1    ] =  RE(spec[up_lo]);
        X_out[q1 + 1] = -IM(spec[dn_hi]);
        X_out[q1 + 2] =  RE(spec[up_lo + 1]);
        X_out[q1 + 3] = -IM(spec[dn_hi - 1]);

        X_out[q2    ] =  RE(spec[up_mid]);
        X_out[q2 + 1] = -IM(spec[dn_mid]);
        X_out[q2 + 2] =  RE(spec[up_mid + 1]);
        X_out[q2 + 3] = -IM(spec[dn_mid - 1]);

        X_out[q3    ] = -IM(spec[up_lo]);
        X_out[q3 + 1] =  RE(spec[dn_hi]);
        X_out[q3 + 2] = -IM(spec[up_lo + 1]);
        X_out[q3 + 3] =  RE(spec[dn_hi - 1]);
    }

#ifdef PROFILE
    /* FFT time goes to fft_cycles, the remainder of the call to cycles */
    count2 = faad_get_ts() - count2;
    mdct->fft_cycles += count1;
    mdct->cycles += (count2 - count1);
#endif
}