예제 #1
0
/*
 * Hand the decoded channel data to the converter matching `format`, which
 * writes into `sample_buffer`.
 *
 *   hDecoder       decoder handle, forwarded to the converter (and, under
 *                  PROFILE, charged with the elapsed cycle count)
 *   input          per-channel sample arrays, forwarded untouched
 *   sample_buffer  destination buffer, reinterpreted per output format
 *   channels       forwarded to the converter
 *   frame_len      forwarded to the converter
 *   format         one of the FAAD_FMT_* constants; any other value results
 *                  in no conversion call at all
 *
 * Returns `sample_buffer` exactly as it was passed in.
 */
void *output_to_PCM(NeAACDecHandle hDecoder,
                    real_t **input, void *sample_buffer, uint8_t channels,
                    uint16_t frame_len, uint8_t format)
{
#ifdef PROFILE
    int64_t count = faad_get_ts();
#endif

    /*
     * Each converter takes the address of a typed cursor.  The cursor is a
     * local copy, so whatever the converter does to it never changes the
     * pointer we return.
     */
    if (format == FAAD_FMT_16BIT)
    {
        int16_t *pcm16 = (int16_t*)sample_buffer;
        to_PCM_16bit(hDecoder, input, channels, frame_len, &pcm16);
    }
    else if (format == FAAD_FMT_24BIT)
    {
        int32_t *pcm24 = (int32_t*)sample_buffer;
        to_PCM_24bit(hDecoder, input, channels, frame_len, &pcm24);
    }
    else if (format == FAAD_FMT_32BIT)
    {
        int32_t *pcm32 = (int32_t*)sample_buffer;
        to_PCM_32bit(hDecoder, input, channels, frame_len, &pcm32);
    }
    else if (format == FAAD_FMT_FLOAT)
    {
        float32_t *pcm_float = (float32_t*)sample_buffer;
        to_PCM_float(hDecoder, input, channels, frame_len, &pcm_float);
    }
    else if (format == FAAD_FMT_DOUBLE)
    {
        double *pcm_double = (double*)sample_buffer;
        to_PCM_double(hDecoder, input, channels, frame_len, &pcm_double);
    }

#ifdef PROFILE
    count = faad_get_ts() - count;
    hDecoder->output_cycles += count;
#endif

    return sample_buffer;
}
예제 #2
0
/*
 * Inverse MDCT built around a complex inverse FFT.
 *
 *   mdct   transform state: supplies the length N, the sincos twiddle table
 *          and the cfft handle (plus cycle counters under PROFILE)
 *   X_in   input coefficients; indices 0 .. N/2-1 are read
 *   X_out  output samples; written in four quarter-length sections
 *
 * Stages: twiddle multiply into a scratch buffer, cfftb() in place, a second
 * twiddle multiply, then a sign/index shuffle into X_out.
 *
 * NOTE(review): the scratch buffer is a fixed 512 complex values and N/4
 * entries of it are used, so N is presumably never above 2048 - this is not
 * checked here.
 */
void faad_imdct(mdct_info *mdct, real_t *X_in, real_t *X_out)
{
    uint16_t i;

    complex_t prev;
#ifdef ALLOW_SMALL_FRAMELENGTH
#ifdef FIXED_POINT
    real_t scale, b_scale = 0;
#endif
#endif
    ALIGN complex_t Z1[512];
    complex_t *twiddle = mdct->sincos;

    uint16_t N  = mdct->N;
    uint16_t N2 = N >> 1;
    uint16_t N4 = N >> 2;
    uint16_t N8 = N >> 3;

    /* start of each quarter of the output */
    real_t *out_q0 = X_out;
    real_t *out_q1 = X_out + N4;
    real_t *out_q2 = X_out + N2;
    real_t *out_q3 = X_out + N2 + N4;

#ifdef PROFILE
    int64_t count1, count2 = faad_get_ts();
#endif

#ifdef ALLOW_SMALL_FRAMELENGTH
#ifdef FIXED_POINT
    /* a length that is not a power of two needs an extra gain of 2048/1920 */
    if ((N & (N-1)) != 0)
    {
        b_scale = 1;
        scale = COEF_CONST(1.0666666666666667);
    }
#endif
#endif

    /* stage 1: pair input sample 2i with its mirror and rotate by twiddle[i] */
    for (i = 0; i < N4; i++)
    {
        ComplexMult(&IM(Z1[i]), &RE(Z1[i]),
            X_in[2*i], X_in[N2 - 1 - 2*i], RE(twiddle[i]), IM(twiddle[i]));
    }

#ifdef PROFILE
    count1 = faad_get_ts();
#endif

    /* stage 2: complex IFFT, any non-scaling FFT can be used here */
    cfftb(mdct->cfft, Z1);

#ifdef PROFILE
    count1 = faad_get_ts() - count1;
#endif

    /* stage 3: rotate every FFT output by the same twiddle factor again */
    for (i = 0; i < N4; i++)
    {
        /* work from a copy, ComplexMult overwrites Z1[i] while reading it */
        RE(prev) = RE(Z1[i]);
        IM(prev) = IM(Z1[i]);
        ComplexMult(&IM(Z1[i]), &RE(Z1[i]),
            IM(prev), RE(prev), RE(twiddle[i]), IM(twiddle[i]));

#ifdef ALLOW_SMALL_FRAMELENGTH
#ifdef FIXED_POINT
        /* apply the non-power-of-2 gain chosen above */
        if (b_scale != 0)
        {
            RE(Z1[i]) = MUL_C(RE(Z1[i]), scale);
            IM(Z1[i]) = MUL_C(IM(Z1[i]), scale);
        }
#endif
#endif
    }

    /*
     * stage 4: reordering.  Two complex values are consumed per pass; even
     * output slots walk Z1 forwards, odd slots walk it backwards.
     */
    for (i = 0; i < N8; i += 2)
    {
        const int o = 2*i;   /* first output slot of this pass, per quarter */

        out_q0[o    ] =  IM(Z1[N8 +     i]);
        out_q0[o + 2] =  IM(Z1[N8 + 1 + i]);

        out_q0[o + 1] = -RE(Z1[N8 - 1 - i]);
        out_q0[o + 3] = -RE(Z1[N8 - 2 - i]);

        out_q1[o    ] =  RE(Z1[         i]);
        out_q1[o + 2] =  RE(Z1[     1 + i]);

        out_q1[o + 1] = -IM(Z1[N4 - 1 - i]);
        out_q1[o + 3] = -IM(Z1[N4 - 2 - i]);

        out_q2[o    ] =  RE(Z1[N8 +     i]);
        out_q2[o + 2] =  RE(Z1[N8 + 1 + i]);

        out_q2[o + 1] = -IM(Z1[N8 - 1 - i]);
        out_q2[o + 3] = -IM(Z1[N8 - 2 - i]);

        out_q3[o    ] = -IM(Z1[         i]);
        out_q3[o + 2] = -IM(Z1[     1 + i]);

        out_q3[o + 1] =  RE(Z1[N4 - 1 - i]);
        out_q3[o + 3] =  RE(Z1[N4 - 2 - i]);
    }

#ifdef PROFILE
    /* book the FFT time separately from the rest of the transform */
    count2 = faad_get_ts() - count2;
    mdct->fft_cycles += count1;
    mdct->cycles += (count2 - count1);
#endif
}
예제 #3
0
파일: specrec.c 프로젝트: leavittx/rockbox
/*
 * Turn the quantised spectrum of a single channel element into time-domain
 * samples in hDecoder->time_out[].
 *
 *   hDecoder   decoder state; fr_ch_ele selects the element being processed
 *   ics        individual channel stream info for this element
 *   sce        the element itself; sce->channel is the output channel index
 *   spec_data  quantised spectral data handed to quant_to_spec()
 *
 * Processing order: (lazy) channel allocation, dequantisation, PNS,
 * optional MAIN / LTP prediction, TNS, DRC, filter bank, optional SBR (with
 * or without PS) and finally, in PS-enabled builds where PS is not used for
 * this element, a copy of the channel into the next output channel.
 *
 * Returns 0 on success, otherwise a non-zero error code:
 *   21  the element's output channel count differs from an earlier frame
 *   23  SBR is signalled/forced but no SBR state was allocated for the element
 *   19  sbrDecodeInit() failed to deliver an SBR decoder
 *       NOTE(review): 19 mirrors the code upstream FAAD2 uses for this
 *       condition - confirm against this tree's error table.
 *   any non-zero value propagated from allocate_single_channel(),
 *   quant_to_spec() or the SBR frame decoders.
 *
 * The working spectrum `spec_coef1` is not a parameter; it is defined outside
 * this function.
 */
uint8_t reconstruct_single_channel(NeAACDecHandle hDecoder, ic_stream *ics,
                                   element *sce, int16_t *spec_data)
{
    uint8_t retval, output_channels;

#ifdef PROFILE
    int64_t count = faad_get_ts();
#endif


    /* always allocate 2 channels, PS can always "suddenly" turn up */
#if (defined(PS_DEC) || defined(DRM_PS))
    output_channels = 2;
#else
    output_channels = 1;
#endif

    if (hDecoder->element_output_channels[hDecoder->fr_ch_ele] == 0)
    {
        /* element_output_channels not set yet */
        hDecoder->element_output_channels[hDecoder->fr_ch_ele] = output_channels;
    } else if (hDecoder->element_output_channels[hDecoder->fr_ch_ele] != output_channels) {
        /* element inconsistency */
        return 21;
    }

    /* per-element buffers are allocated on first use only */
    if (hDecoder->element_alloced[hDecoder->fr_ch_ele] == 0)
    {
        retval = allocate_single_channel(hDecoder, sce->channel, output_channels);
        if (retval > 0)
            return retval;

        hDecoder->element_alloced[hDecoder->fr_ch_ele] = 1;
    }


    /* dequantisation and scaling */
    retval = quant_to_spec(hDecoder, ics, spec_data, spec_coef1, hDecoder->frameLength);
    if (retval > 0)
        return retval;

#ifdef PROFILE
    /* only the dequantisation stage is charged to requant_cycles */
    count = faad_get_ts() - count;
    hDecoder->requant_cycles += count;
#endif


    /* pns decoding */
    pns_decode(ics, NULL, spec_coef1, NULL, hDecoder->frameLength, 0, hDecoder->object_type);

#ifdef MAIN_DEC
    /* MAIN object type prediction */
    if (hDecoder->object_type == MAIN)
    {
        /* intra channel prediction */
        ic_prediction(ics, spec_coef1, hDecoder->pred_stat[sce->channel], hDecoder->frameLength,
            hDecoder->sf_index);

        /* In addition, for scalefactor bands coded by perceptual
           noise substitution the predictors belonging to the
           corresponding spectral coefficients are reset.
        */
        pns_reset_pred_state(ics, hDecoder->pred_stat[sce->channel]);
    }
#endif

#ifdef LTP_DEC
    if (is_ltp_ot(hDecoder->object_type))
    {
#ifdef LD_DEC
        if (hDecoder->object_type == LD)
        {
            /* remember a newly transmitted lag, then always use the stored one */
            if (ics->ltp.data_present)
            {
                if (ics->ltp.lag_update)
                    hDecoder->ltp_lag[sce->channel] = ics->ltp.lag;
            }
            ics->ltp.lag = hDecoder->ltp_lag[sce->channel];
        }
#endif

        /* long term prediction */
        lt_prediction(ics, &(ics->ltp), spec_coef1, hDecoder->lt_pred_stat[sce->channel], hDecoder->fb,
            ics->window_shape, hDecoder->window_shape_prev[sce->channel],
            hDecoder->sf_index, hDecoder->object_type, hDecoder->frameLength);
    }
#endif

    /* tns decoding */
    tns_decode_frame(ics, &(ics->tns), hDecoder->sf_index, hDecoder->object_type,
        spec_coef1, hDecoder->frameLength);

    /* drc decoding, skipped only when this channel is explicitly excluded */
    if (hDecoder->drc->present)
    {
        if (!hDecoder->drc->exclude_mask[sce->channel] || !hDecoder->drc->excluded_chns_present)
            drc_decode(hDecoder->drc, spec_coef1);
    }

    /* filter bank */
#ifdef SSR_DEC
    if (hDecoder->object_type != SSR)
    {
#endif
        ifilter_bank(ics->window_sequence,ics->window_shape,
            hDecoder->window_shape_prev[sce->channel],spec_coef1,
            hDecoder->time_out[sce->channel], hDecoder->fb_intermed[sce->channel],
            hDecoder->object_type, hDecoder->frameLength);
#ifdef SSR_DEC
    } else {
        ssr_decode(&(ics->ssr), hDecoder->fb, ics->window_sequence, ics->window_shape,
            hDecoder->window_shape_prev[sce->channel], spec_coef1, hDecoder->time_out[sce->channel],
            hDecoder->ssr_overlap[sce->channel], hDecoder->ipqf_buffer[sce->channel], hDecoder->prev_fmd[sce->channel],
            hDecoder->frameLength);
    }
#endif

    /* save window shape for next frame */
    hDecoder->window_shape_prev[sce->channel] = ics->window_shape;

#ifdef LTP_DEC
    if (is_ltp_ot(hDecoder->object_type))
    {
        lt_update_state(hDecoder->lt_pred_stat[sce->channel], hDecoder->time_out[sce->channel],
            hDecoder->fb_intermed[sce->channel], hDecoder->frameLength, hDecoder->object_type);
    }
#endif

#ifdef SBR_DEC
    if (((hDecoder->sbr_present_flag == 1) || (hDecoder->forceUpSampling == 1))
        && hDecoder->sbr_alloced[hDecoder->fr_ch_ele])
    {
        uint8_t ele = hDecoder->fr_ch_ele;
        uint8_t ch = sce->channel;

        /* following case can happen when forceUpSampling == 1 */
        if (hDecoder->sbr[ele] == NULL)
        {
            hDecoder->sbr[ele] = sbrDecodeInit(hDecoder->frameLength,
                hDecoder->element_id[ele], 2*get_sample_rate(hDecoder->sf_index),
                hDecoder->downSampledSBR
#ifdef DRM
                , 0
#endif
                );
        }

        /* the SBR state is dereferenced right below; bail out instead of
           crashing when initialisation did not produce one */
        if (hDecoder->sbr[ele] == NULL)
            return 19;

        /* tell SBR the highest spectral line the AAC core decoded */
        if (sce->ics1.window_sequence == EIGHT_SHORT_SEQUENCE)
            hDecoder->sbr[ele]->maxAACLine = 8*sce->ics1.swb_offset[max(sce->ics1.max_sfb-1, 0)];
        else
            hDecoder->sbr[ele]->maxAACLine = sce->ics1.swb_offset[max(sce->ics1.max_sfb-1, 0)];

        /* check if any of the PS tools is used */
#if (defined(PS_DEC) || defined(DRM_PS))
        if (hDecoder->ps_used[ele] == 0)
        {
#endif
            retval = sbrDecodeSingleFrame(hDecoder->sbr[ele], hDecoder->time_out[ch],
                hDecoder->postSeekResetFlag, hDecoder->downSampledSBR);
#if (defined(PS_DEC) || defined(DRM_PS))
        } else {
            /* PS produces a second channel in time_out[ch+1] */
            retval = sbrDecodeSingleFramePS(hDecoder->sbr[ele], hDecoder->time_out[ch],
                hDecoder->time_out[ch+1], hDecoder->postSeekResetFlag,
                hDecoder->downSampledSBR);
        }
#endif
        if (retval > 0)
            return retval;
    } else if (((hDecoder->sbr_present_flag == 1) || (hDecoder->forceUpSampling == 1))
        && !hDecoder->sbr_alloced[hDecoder->fr_ch_ele])
    {
        return 23;
    }
#endif

    /* copy L to R when no PS is used */
#if (defined(PS_DEC) || defined(DRM_PS))
    if (hDecoder->ps_used[hDecoder->fr_ch_ele] == 0)
    {
        uint8_t ele = hDecoder->fr_ch_ele;
        uint8_t ch = sce->channel;
        /* twice the frame length is copied when SBR state exists for the element */
        uint16_t frame_size = (hDecoder->sbr_alloced[ele]) ? 2 : 1;
        frame_size *= hDecoder->frameLength*sizeof(real_t);

        memcpy(hDecoder->time_out[ch+1], hDecoder->time_out[ch], frame_size);
    }
#endif

    return 0;
}
예제 #4
0
파일: specrec.c 프로젝트: leavittx/rockbox
/*
 * Turn the quantised spectra of a channel pair element into time-domain
 * samples in hDecoder->time_out[].
 *
 *   hDecoder    decoder state; fr_ch_ele selects the element being processed
 *   ics1, ics2  individual channel stream info of the first / second channel
 *   cpe         the element; cpe->channel and cpe->paired_channel are the
 *               two output channel indices
 *   spec_data1, spec_data2
 *               quantised spectral data handed to quant_to_spec()
 *
 * Processing order: (lazy) channel allocation, dequantisation, PNS,
 * mid/side, intensity stereo, optional MAIN / LTP prediction, TNS, DRC,
 * filter bank and optional coupled SBR decoding.
 *
 * Returns 0 on success, otherwise a non-zero error code:
 *   23  SBR is signalled/forced but no SBR state was allocated for the element
 *   19  sbrDecodeInit() failed to deliver an SBR decoder
 *       NOTE(review): 19 mirrors the code upstream FAAD2 uses for this
 *       condition - confirm against this tree's error table.
 *   any non-zero value propagated from allocate_channel_pair(),
 *   quant_to_spec() or sbrDecodeCoupleFrame().
 *
 * The working spectra `spec_coef1` / `spec_coef2` are not parameters; they
 * are defined outside this function.
 */
uint8_t reconstruct_channel_pair(NeAACDecHandle hDecoder, ic_stream *ics1, ic_stream *ics2,
                                 element *cpe, int16_t *spec_data1, int16_t *spec_data2)
{
    uint8_t retval;

#ifdef PROFILE
    int64_t count = faad_get_ts();
#endif
    /* per-element buffers are allocated on first use only */
    if (hDecoder->element_alloced[hDecoder->fr_ch_ele] == 0)
    {
        retval = allocate_channel_pair(hDecoder, cpe->channel, (uint8_t)cpe->paired_channel);
        if (retval > 0)
            return retval;

        hDecoder->element_alloced[hDecoder->fr_ch_ele] = 1;
    }

    /* dequantisation and scaling */
    retval = quant_to_spec(hDecoder, ics1, spec_data1, spec_coef1, hDecoder->frameLength);
    if (retval > 0)
        return retval;
    retval = quant_to_spec(hDecoder, ics2, spec_data2, spec_coef2, hDecoder->frameLength);
    if (retval > 0)
        return retval;

#ifdef PROFILE
    /* only the dequantisation stage is charged to requant_cycles */
    count = faad_get_ts() - count;
    hDecoder->requant_cycles += count;
#endif


    /* pns decoding: done jointly when an M/S mask is present, else per channel */
    if (ics1->ms_mask_present)
    {
        pns_decode(ics1, ics2, spec_coef1, spec_coef2, hDecoder->frameLength, 1, hDecoder->object_type);
    } else {
        pns_decode(ics1, NULL, spec_coef1, NULL, hDecoder->frameLength, 0, hDecoder->object_type);
        pns_decode(ics2, NULL, spec_coef2, NULL, hDecoder->frameLength, 0, hDecoder->object_type);
    }

    /* mid/side decoding */
    ms_decode(ics1, ics2, spec_coef1, spec_coef2, hDecoder->frameLength);

    /* intensity stereo decoding */
    is_decode(ics1, ics2, spec_coef1, spec_coef2, hDecoder->frameLength);

#ifdef MAIN_DEC
    /* MAIN object type prediction */
    if (hDecoder->object_type == MAIN)
    {
        /* intra channel prediction */
        ic_prediction(ics1, spec_coef1, hDecoder->pred_stat[cpe->channel], hDecoder->frameLength,
            hDecoder->sf_index);
        ic_prediction(ics2, spec_coef2, hDecoder->pred_stat[cpe->paired_channel], hDecoder->frameLength,
            hDecoder->sf_index);

        /* In addition, for scalefactor bands coded by perceptual
           noise substitution the predictors belonging to the
           corresponding spectral coefficients are reset.
        */
        pns_reset_pred_state(ics1, hDecoder->pred_stat[cpe->channel]);
        pns_reset_pred_state(ics2, hDecoder->pred_stat[cpe->paired_channel]);
    }
#endif

#ifdef LTP_DEC
    if (is_ltp_ot(hDecoder->object_type))
    {
        ltp_info *ltp1 = &(ics1->ltp);
        /* with a common window the second channel's LTP data is read from ics2->ltp2 */
        ltp_info *ltp2 = (cpe->common_window) ? &(ics2->ltp2) : &(ics2->ltp);
#ifdef LD_DEC
        if (hDecoder->object_type == LD)
        {
            /* remember a newly transmitted lag, then always use the stored one */
            if (ltp1->data_present)
            {
                if (ltp1->lag_update)
                    hDecoder->ltp_lag[cpe->channel] = ltp1->lag;
            }
            ltp1->lag = hDecoder->ltp_lag[cpe->channel];
            if (ltp2->data_present)
            {
                if (ltp2->lag_update)
                    hDecoder->ltp_lag[cpe->paired_channel] = ltp2->lag;
            }
            ltp2->lag = hDecoder->ltp_lag[cpe->paired_channel];
        }
#endif

        /* long term prediction */
        lt_prediction(ics1, ltp1, spec_coef1, hDecoder->lt_pred_stat[cpe->channel], hDecoder->fb,
            ics1->window_shape, hDecoder->window_shape_prev[cpe->channel],
            hDecoder->sf_index, hDecoder->object_type, hDecoder->frameLength);
        lt_prediction(ics2, ltp2, spec_coef2, hDecoder->lt_pred_stat[cpe->paired_channel], hDecoder->fb,
            ics2->window_shape, hDecoder->window_shape_prev[cpe->paired_channel],
            hDecoder->sf_index, hDecoder->object_type, hDecoder->frameLength);
    }
#endif

    /* tns decoding */
    tns_decode_frame(ics1, &(ics1->tns), hDecoder->sf_index, hDecoder->object_type,
        spec_coef1, hDecoder->frameLength);
    tns_decode_frame(ics2, &(ics2->tns), hDecoder->sf_index, hDecoder->object_type,
        spec_coef2, hDecoder->frameLength);

    /* drc decoding, skipped per channel only when that channel is explicitly excluded */
    if (hDecoder->drc->present)
    {
        if (!hDecoder->drc->exclude_mask[cpe->channel] || !hDecoder->drc->excluded_chns_present)
            drc_decode(hDecoder->drc, spec_coef1);
        if (!hDecoder->drc->exclude_mask[cpe->paired_channel] || !hDecoder->drc->excluded_chns_present)
            drc_decode(hDecoder->drc, spec_coef2);
    }

    /* filter bank */
#ifdef SSR_DEC
    if (hDecoder->object_type != SSR)
    {
#endif
        ifilter_bank(ics1->window_sequence,ics1->window_shape,
            hDecoder->window_shape_prev[cpe->channel],spec_coef1,
            hDecoder->time_out[cpe->channel], hDecoder->fb_intermed[cpe->channel],
            hDecoder->object_type, hDecoder->frameLength);
        ifilter_bank(ics2->window_sequence,ics2->window_shape,
            hDecoder->window_shape_prev[cpe->paired_channel], spec_coef2,
            hDecoder->time_out[cpe->paired_channel], hDecoder->fb_intermed[cpe->paired_channel],
            hDecoder->object_type, hDecoder->frameLength);
#ifdef SSR_DEC
    } else {
        ssr_decode(&(ics1->ssr), hDecoder->fb, ics1->window_sequence, ics1->window_shape,
            hDecoder->window_shape_prev[cpe->channel], spec_coef1, hDecoder->time_out[cpe->channel],
            hDecoder->ssr_overlap[cpe->channel], hDecoder->ipqf_buffer[cpe->channel],
            hDecoder->prev_fmd[cpe->channel], hDecoder->frameLength);
        ssr_decode(&(ics2->ssr), hDecoder->fb, ics2->window_sequence, ics2->window_shape,
            hDecoder->window_shape_prev[cpe->paired_channel], spec_coef2, hDecoder->time_out[cpe->paired_channel],
            hDecoder->ssr_overlap[cpe->paired_channel], hDecoder->ipqf_buffer[cpe->paired_channel],
            hDecoder->prev_fmd[cpe->paired_channel], hDecoder->frameLength);
    }
#endif

    /* save window shape for next frame */
    hDecoder->window_shape_prev[cpe->channel] = ics1->window_shape;
    hDecoder->window_shape_prev[cpe->paired_channel] = ics2->window_shape;

#ifdef LTP_DEC
    if (is_ltp_ot(hDecoder->object_type))
    {
        lt_update_state(hDecoder->lt_pred_stat[cpe->channel], hDecoder->time_out[cpe->channel],
            hDecoder->fb_intermed[cpe->channel], hDecoder->frameLength, hDecoder->object_type);
        lt_update_state(hDecoder->lt_pred_stat[cpe->paired_channel], hDecoder->time_out[cpe->paired_channel],
            hDecoder->fb_intermed[cpe->paired_channel], hDecoder->frameLength, hDecoder->object_type);
    }
#endif

#ifdef SBR_DEC
    if (((hDecoder->sbr_present_flag == 1) || (hDecoder->forceUpSampling == 1))
        && hDecoder->sbr_alloced[hDecoder->fr_ch_ele])
    {
        uint8_t ele = hDecoder->fr_ch_ele;
        uint8_t ch0 = cpe->channel;
        uint8_t ch1 = cpe->paired_channel;

        /* following case can happen when forceUpSampling == 1 */
        if (hDecoder->sbr[ele] == NULL)
        {
            hDecoder->sbr[ele] = sbrDecodeInit(hDecoder->frameLength,
                hDecoder->element_id[ele], 2*get_sample_rate(hDecoder->sf_index),
                hDecoder->downSampledSBR
#ifdef DRM
                , 0
#endif
                );
        }

        /* the SBR state is dereferenced right below; bail out instead of
           crashing when initialisation did not produce one */
        if (hDecoder->sbr[ele] == NULL)
            return 19;

        /* tell SBR the highest spectral line the AAC core decoded */
        if (cpe->ics1.window_sequence == EIGHT_SHORT_SEQUENCE)
            hDecoder->sbr[ele]->maxAACLine = 8*cpe->ics1.swb_offset[max(cpe->ics1.max_sfb-1, 0)];
        else
            hDecoder->sbr[ele]->maxAACLine = cpe->ics1.swb_offset[max(cpe->ics1.max_sfb-1, 0)];

        retval = sbrDecodeCoupleFrame(hDecoder->sbr[ele],
            hDecoder->time_out[ch0], hDecoder->time_out[ch1],
            hDecoder->postSeekResetFlag, hDecoder->downSampledSBR);
        if (retval > 0)
            return retval;
    } else if (((hDecoder->sbr_present_flag == 1) || (hDecoder->forceUpSampling == 1))
        && !hDecoder->sbr_alloced[hDecoder->fr_ch_ele])
    {
        return 23;
    }
#endif

    return 0;
}
예제 #5
0
/*
 * SSE inverse filter bank: IMDCT of one frame of spectral data followed by
 * windowing and overlap-add, performed in place on time_out.
 *
 *   fb                 filter bank state: window tables, mdct256 and (under
 *                      PROFILE) a cycle counter
 *   window_sequence    selects one of the four cases of the switch; any other
 *                      value performs no transform and leaves time_out alone
 *   window_shape       index into the window tables for this frame
 *   window_shape_prev  index into the window tables for the previous frame
 *   freq_in            spectral input handed to the IMDCT routines
 *   time_out           indexed from 0 up to 2*frame_len: [0, frame_len) is
 *                      written as this frame's output, [frame_len,
 *                      2*frame_len) is read as the previous frame's overlap
 *                      and then rewritten as the overlap for the next call
 *   object_type        only examined when LD_DEC is defined
 *   frame_len          long block length; the short block length is
 *                      frame_len/8
 *
 * Every loop advances four floats at a time through _mm_load_ps /
 * _mm_store_ps.
 * NOTE(review): presumably all buffers must be 16-byte aligned and the
 * derived lengths multiples of four - confirm at the callers.
 */
void ifilter_bank_sse(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
                      uint8_t window_shape_prev, real_t *freq_in,
                      real_t *time_out, uint8_t object_type, uint16_t frame_len)
{
    int16_t i;
    /* IMDCT output scratch: room for 2*1024 samples, zero-initialised */
    ALIGN real_t transf_buf[2*1024] = {0};

    const real_t *window_long = NULL;
    const real_t *window_long_prev = NULL;
    const real_t *window_short = NULL;
    const real_t *window_short_prev = NULL;

    uint16_t nlong = frame_len;
    uint16_t nshort = frame_len/8;
    /* half a short block */
    uint16_t trans = nshort/2;

    /* length of the unwindowed region on either side of a short window
       placed in the middle of a long block */
    uint16_t nflat_ls = (nlong-nshort)/2;

#ifdef PROFILE
    int64_t count = faad_get_ts();
#endif

    /* pick the window tables; note that in the LD branch the short window
       pointers stay NULL */
#ifdef LD_DEC
    if (object_type == LD)
    {
        window_long       = fb->ld_window[window_shape];
        window_long_prev  = fb->ld_window[window_shape_prev];
    } else {
#endif
        window_long       = fb->long_window[window_shape];
        window_long_prev  = fb->long_window[window_shape_prev];
        window_short      = fb->short_window[window_shape];
        window_short_prev = fb->short_window[window_shape_prev];
#ifdef LD_DEC
    }
#endif

    switch (window_sequence)
    {
    case ONLY_LONG_SEQUENCE:
        imdct_long_sse(fb, freq_in, transf_buf, 2*nlong);
        /* output  = first half * previous long window + old overlap
           overlap = second half * current long window read back to front */
        for (i = 0; i < nlong; i+=4)
        {
            __m128 m1, m2, m3, m4, m5, m6, m7, m8;

            m1 = _mm_load_ps(&transf_buf[i]);
            m2 = _mm_load_ps(&window_long_prev[i]);
            m6 = _mm_load_ps(&window_long[nlong-4-i]);
            m3 = _mm_load_ps(&time_out[nlong+i]);
            m5 = _mm_load_ps(&transf_buf[nlong+i]);

            m4 = _mm_mul_ps(m1, m2);
            /* reverse the four lanes so the window runs backwards */
            m7 = _mm_shuffle_ps(m6, m6, _MM_SHUFFLE(0, 1, 2, 3));

            m4 = _mm_add_ps(m4, m3);
            m8 = _mm_mul_ps(m5, m7);

            _mm_store_ps(&time_out[i], m4);
            _mm_store_ps(&time_out[nlong+i], m8);
        }
        break;

    case LONG_START_SEQUENCE:
        imdct_long_sse(fb, freq_in, transf_buf, 2*nlong);
        /* output = first half * previous long window + old overlap */
        for (i = 0; i < nlong; i+=4)
        {
            __m128 m1 = _mm_load_ps(&transf_buf[i]);
            __m128 m2 = _mm_load_ps(&window_long_prev[i]);
            __m128 m3 = _mm_load_ps(&time_out[nlong+i]);

            __m128 m4 = _mm_mul_ps(m1, m2);
            m4 = _mm_add_ps(m4, m3);

            _mm_store_ps(&time_out[i], m4);
        }
        /* new overlap, part 1: nflat_ls samples copied unwindowed */
        for (i = 0; i < nflat_ls; i+=4)
        {
            __m128 m1 = _mm_load_ps(&transf_buf[nlong+i]);
            _mm_store_ps(&time_out[nlong+i], m1);
        }
        /* new overlap, part 2: nshort samples * reversed short window */
        for (i = 0; i < nshort; i+=4)
        {
            __m128 m1 = _mm_load_ps(&transf_buf[nlong+nflat_ls+i]);
            __m128 m2 = _mm_load_ps(&window_short[nshort-4-i]);
            __m128 m3, m4;

            m3 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));

            m4 = _mm_mul_ps(m1, m3);

            _mm_store_ps(&time_out[nlong+nflat_ls+i], m4);
        }
        /* new overlap, part 3: the remaining nflat_ls samples are zeroed */
        for (i = 0; i < nflat_ls; i+=4)
        {
            __m128 m1 = _mm_setzero_ps();
            _mm_store_ps(&time_out[nlong+nflat_ls+nshort+i], m1);
        }
        break;

    case EIGHT_SHORT_SEQUENCE:
        /* eight short transforms; block j reads freq_in[j*nshort] and writes
           2*nshort samples at transf_buf[2*nshort*j]; fb->mdct256 is used
           for every block */
        faad_imdct_sse(fb->mdct256, &freq_in[0*nshort], &transf_buf[2*nshort*0]);
        faad_imdct_sse(fb->mdct256, &freq_in[1*nshort], &transf_buf[2*nshort*1]);
        faad_imdct_sse(fb->mdct256, &freq_in[2*nshort], &transf_buf[2*nshort*2]);
        faad_imdct_sse(fb->mdct256, &freq_in[3*nshort], &transf_buf[2*nshort*3]);
        faad_imdct_sse(fb->mdct256, &freq_in[4*nshort], &transf_buf[2*nshort*4]);
        faad_imdct_sse(fb->mdct256, &freq_in[5*nshort], &transf_buf[2*nshort*5]);
        faad_imdct_sse(fb->mdct256, &freq_in[6*nshort], &transf_buf[2*nshort*6]);
        faad_imdct_sse(fb->mdct256, &freq_in[7*nshort], &transf_buf[2*nshort*7]);
        /* the first nflat_ls output samples are the old overlap, unchanged */
        for (i = 0; i < nflat_ls; i+=4)
        {
            __m128 m1 = _mm_load_ps(&time_out[nlong+i]);
            _mm_store_ps(&time_out[i], m1);
        }
        /* first half of block 0 * previous short window + old overlap */
        for (i = 0; i < nshort; i+=4)
        {
            __m128 m1 = _mm_load_ps(&transf_buf[nshort*0+i]);
            __m128 m2 = _mm_load_ps(&window_short_prev[i]);
            __m128 m3 = _mm_load_ps(&time_out[nlong+nflat_ls+i]);

            __m128 m4 = _mm_mul_ps(m1, m2);
            m4 = _mm_add_ps(m4, m3);

            _mm_store_ps(&time_out[nflat_ls+i], m4);
        }
        /* second half of block 0 (reversed window) + first half of block 1
           + old overlap */
        for (i = 0; i < nshort; i+=4)
        {
            __m128 m1, m2, m3, m4, m5, m6, m7, m8;
            m1 = _mm_load_ps(&transf_buf[nshort*1+i]);
            m2 = _mm_load_ps(&window_short[nshort-4-i]);
            m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*1+i]);
            m6 = _mm_load_ps(&transf_buf[nshort*2+i]);
            m7 = _mm_load_ps(&window_short[i]);

            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));

            m4 = _mm_mul_ps(m1, m5);
            m8 = _mm_mul_ps(m6, m7);
            m4 = _mm_add_ps(m4, m3);
            m4 = _mm_add_ps(m4, m8);

            _mm_store_ps(&time_out[nflat_ls+1*nshort+i], m4);
        }
        /* second half of block 1 + first half of block 2 + old overlap */
        for (i = 0; i < nshort; i+=4)
        {
            __m128 m1, m2, m3, m4, m5, m6, m7, m8;
            m1 = _mm_load_ps(&transf_buf[nshort*3+i]);
            m2 = _mm_load_ps(&window_short[nshort-4-i]);
            m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*2+i]);
            m6 = _mm_load_ps(&transf_buf[nshort*4+i]);
            m7 = _mm_load_ps(&window_short[i]);

            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));

            m4 = _mm_mul_ps(m1, m5);
            m8 = _mm_mul_ps(m6, m7);
            m4 = _mm_add_ps(m4, m3);
            m4 = _mm_add_ps(m4, m8);

            _mm_store_ps(&time_out[nflat_ls+2*nshort+i], m4);
        }
        /* second half of block 2 + first half of block 3 + old overlap */
        for (i = 0; i < nshort; i+=4)
        {
            __m128 m1, m2, m3, m4, m5, m6, m7, m8;
            m1 = _mm_load_ps(&transf_buf[nshort*5+i]);
            m2 = _mm_load_ps(&window_short[nshort-4-i]);
            m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*3+i]);
            m6 = _mm_load_ps(&transf_buf[nshort*6+i]);
            m7 = _mm_load_ps(&window_short[i]);

            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));

            m4 = _mm_mul_ps(m1, m5);
            m8 = _mm_mul_ps(m6, m7);
            m4 = _mm_add_ps(m4, m3);
            m4 = _mm_add_ps(m4, m8);

            _mm_store_ps(&time_out[nflat_ls+3*nshort+i], m4);
        }
        /* blocks 3/4, first `trans` samples: these still add the old
           overlap */
        for(i = 0; i < trans; i+=4)
        {
            __m128 m1, m2, m3, m4, m5, m6, m7, m8;
            m1 = _mm_load_ps(&transf_buf[nshort*7+i]);
            m2 = _mm_load_ps(&window_short[nshort-4-i]);
            m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*4+i]);
            m6 = _mm_load_ps(&transf_buf[nshort*8+i]);
            m7 = _mm_load_ps(&window_short[i]);

            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));

            m4 = _mm_mul_ps(m1, m5);
            m8 = _mm_mul_ps(m6, m7);
            m4 = _mm_add_ps(m4, m3);
            m4 = _mm_add_ps(m4, m8);

            _mm_store_ps(&time_out[nflat_ls+4*nshort+i], m4);
        }
        /* blocks 3/4, remaining samples: from here on no old overlap is
           added */
        for (i = trans; i < nshort; i+=4)
        {
            __m128 m1, m2, m3, m4, m5, m6, m7, m8;
            m1 = _mm_load_ps(&transf_buf[nshort*7+i]);
            m2 = _mm_load_ps(&window_short[nshort-4-i]);
            m6 = _mm_load_ps(&transf_buf[nshort*8+i]);
            m7 = _mm_load_ps(&window_short[i]);

            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));

            m4 = _mm_mul_ps(m1, m5);
            m8 = _mm_mul_ps(m6, m7);
            m3 = _mm_add_ps(m4, m8);

            _mm_store_ps(&time_out[nflat_ls+4*nshort+i], m3);
        }
        /* second half of block 4 + first half of block 5 */
        for (i = 0; i < nshort; i+=4)
        {
            __m128 m1, m2, m3, m4, m5, m6, m7, m8;
            m1 = _mm_load_ps(&transf_buf[nshort*9+i]);
            m2 = _mm_load_ps(&window_short[nshort-4-i]);
            m6 = _mm_load_ps(&transf_buf[nshort*10+i]);
            m7 = _mm_load_ps(&window_short[i]);

            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));

            m4 = _mm_mul_ps(m1, m5);
            m8 = _mm_mul_ps(m6, m7);
            m3 = _mm_add_ps(m4, m8);

            _mm_store_ps(&time_out[nflat_ls+5*nshort+i], m3);
        }
        /* second half of block 5 + first half of block 6 */
        for (i = 0; i < nshort; i+=4)
        {
            __m128 m1, m2, m3, m4, m5, m6, m7, m8;
            m1 = _mm_load_ps(&transf_buf[nshort*11+i]);
            m2 = _mm_load_ps(&window_short[nshort-4-i]);
            m6 = _mm_load_ps(&transf_buf[nshort*12+i]);
            m7 = _mm_load_ps(&window_short[i]);

            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));

            m4 = _mm_mul_ps(m1, m5);
            m8 = _mm_mul_ps(m6, m7);
            m3 = _mm_add_ps(m4, m8);

            _mm_store_ps(&time_out[nflat_ls+6*nshort+i], m3);
        }
        /* second half of block 6 + first half of block 7 */
        for (i = 0; i < nshort; i+=4)
        {
            __m128 m1, m2, m3, m4, m5, m6, m7, m8;
            m1 = _mm_load_ps(&transf_buf[nshort*13+i]);
            m2 = _mm_load_ps(&window_short[nshort-4-i]);
            m6 = _mm_load_ps(&transf_buf[nshort*14+i]);
            m7 = _mm_load_ps(&window_short[i]);

            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));

            m4 = _mm_mul_ps(m1, m5);
            m8 = _mm_mul_ps(m6, m7);
            m3 = _mm_add_ps(m4, m8);

            _mm_store_ps(&time_out[nflat_ls+7*nshort+i], m3);
        }
        /* second half of block 7 * reversed short window, nothing added */
        for (i = 0; i < nshort; i+=4)
        {
            __m128 m1, m2, m3, m5;
            m1 = _mm_load_ps(&transf_buf[nshort*15+i]);
            m2 = _mm_load_ps(&window_short[nshort-4-i]);

            m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));

            m3 = _mm_mul_ps(m1, m5);

            _mm_store_ps(&time_out[nflat_ls+8*nshort+i], m3);
        }
        /* the last nflat_ls samples of the new overlap are zeroed */
        for (i = 0; i < nflat_ls; i+=4)
        {
            __m128 m1 = _mm_setzero_ps();
            _mm_store_ps(&time_out[nlong+nflat_ls+nshort+i], m1);
        }
        break;

    case LONG_STOP_SEQUENCE:
        imdct_long_sse(fb, freq_in, transf_buf, 2*nlong);
        /* output, part 1: the old overlap passes through unchanged */
        for (i = 0; i < nflat_ls; i+=4)
        {
            __m128 m1 = _mm_load_ps(&time_out[nlong+i]);
            _mm_store_ps(&time_out[i], m1);
        }
        /* output, part 2: transform * previous short window + old overlap */
        for (i = 0; i < nshort; i+=4)
        {
            __m128 m1 = _mm_load_ps(&transf_buf[nflat_ls+i]);
            __m128 m2 = _mm_load_ps(&window_short_prev[i]);
            __m128 m3 = _mm_load_ps(&time_out[nlong+nflat_ls+i]);

            __m128 m4 = _mm_mul_ps(m1, m2);
            m4 = _mm_add_ps(m4, m3);

            _mm_store_ps(&time_out[nflat_ls+i], m4);
        }
        /* output, part 3: unwindowed transform + old overlap */
        for (i = 0; i < nflat_ls; i+=4)
        {
            __m128 m1 = _mm_load_ps(&transf_buf[nflat_ls+nshort+i]);
            __m128 m2 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort+i]);

            __m128 m3 = _mm_add_ps(m1, m2);

            _mm_store_ps(&time_out[nflat_ls+nshort+i], m3);
        }
        /* new overlap = second half * current long window read back to
           front */
        for (i = 0; i < nlong; i+=4)
        {
            __m128 m1 = _mm_load_ps(&transf_buf[nlong+i]);
            __m128 m2 = _mm_load_ps(&window_long[nlong-4-i]);
            __m128 m3, m4;

            m3 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));

            m4 = _mm_mul_ps(m1, m3);

            _mm_store_ps(&time_out[nlong+i], m4);
        }
		break;
    }

#ifdef PROFILE
    /* charge the elapsed time of the whole call to the filter bank */
    count = faad_get_ts() - count;
    fb->cycles += count;
#endif
}
예제 #6
0
/*
 * Inverse filter bank: converts one frame of spectral coefficients into
 * frame_len time-domain samples by inverse MDCT, windowing and overlap-add.
 *
 *   fb                - filter bank state (window tables, MDCT handles)
 *   window_sequence   - block type of this frame; selects the case below
 *   window_shape      - index into the window tables for this frame
 *   window_shape_prev - index into the window tables for the previous frame
 *   freq_in           - spectral input
 *   time_out          - receives frame_len output samples
 *   overlap           - in:  tail saved by the previous call
 *                       out: windowed tail of this frame for the next call
 *   object_type       - only examined when LD_DEC is defined
 *   frame_len         - samples per frame (nlong); a short block is frame_len/8
 *
 * A window_sequence that matches none of the cases leaves time_out and
 * overlap untouched.
 *
 * NOTE(review): for object_type == LD the short windows stay NULL, so any
 * sequence other than ONLY_LONG_SEQUENCE would dereference NULL here;
 * presumably the caller never passes one for LD streams - confirm.
 */
void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
                  uint8_t window_shape_prev, real_t *freq_in,
                  real_t *time_out, real_t *overlap,
                  uint8_t object_type, uint16_t frame_len)
{
    int16_t i;
    uint8_t j;      /* position inside a group of four samples */
    uint8_t blk;    /* short block / output slot counter */
    ALIGN real_t transf_buf[2*1024] = {0};

    const real_t *window_long = NULL;
    const real_t *window_long_prev = NULL;
    const real_t *window_short = NULL;
    const real_t *window_short_prev = NULL;

    const uint16_t nlong = frame_len;
    const uint16_t nshort = frame_len/8;
    const uint16_t trans = nshort/2;

    /* length of each flat (unwindowed) section around the short window */
    const uint16_t nflat_ls = (nlong-nshort)/2;
    /* first index after the flat section plus the short window */
    const int ramp_end = nflat_ls + nshort;

#ifdef PROFILE
    int64_t count = faad_get_ts();
#endif

    /* pick the window tables for the current and the previous frame */
#ifdef LD_DEC
    if (object_type == LD)
    {
        window_long       = fb->ld_window[window_shape];
        window_long_prev  = fb->ld_window[window_shape_prev];
    }
    else
#endif
    {
        window_long       = fb->long_window[window_shape];
        window_long_prev  = fb->long_window[window_shape_prev];
        window_short      = fb->short_window[window_shape];
        window_short_prev = fb->short_window[window_shape_prev];
    }

    switch (window_sequence)
    {
    case ONLY_LONG_SEQUENCE:
    case LONG_START_SEQUENCE:
        /* both sequences share the long transform and the first-half overlap-add */
        imdct_long(fb, freq_in, transf_buf, 2*nlong);

        /* samples are handled in groups of four, so whole groups are touched
           even if nlong were not a multiple of 4 */
        for (i = 0; i < nlong; i += 4)
        {
            for (j = 0; j < 4; j++)
            {
                time_out[i+j] = overlap[i+j] + MUL_F(transf_buf[i+j],window_long_prev[i+j]);
            }
        }

        if (window_sequence == ONLY_LONG_SEQUENCE)
        {
            /* second half weighted by the time-reversed long window */
            for (i = 0; i < nlong; i += 4)
            {
                for (j = 0; j < 4; j++)
                {
                    overlap[i+j] = MUL_F(transf_buf[nlong+i+j],window_long[nlong-1-j-i]);
                }
            }
        }
        else
        {
            /* LONG_START: flat section copied as is, then the time-reversed
               short window, then zeros */
            for (i = 0; i < nflat_ls; i++)
            {
                overlap[i] = transf_buf[nlong+i];
            }
            for (i = 0; i < nshort; i++)
            {
                overlap[nflat_ls+i] = MUL_F(transf_buf[nlong+nflat_ls+i],window_short[nshort-i-1]);
            }
            for (i = 0; i < nflat_ls; i++)
            {
                overlap[ramp_end+i] = 0;
            }
        }
        break;

    case EIGHT_SHORT_SEQUENCE:
        /* one inverse transform per short block; block blk is written to
           transf_buf starting at 2*nshort*blk, so "half h" below starts at
           transf_buf[h*nshort] */
        for (blk = 0; blk < 8; blk++)
        {
            faad_imdct(fb->mdct256, freq_in+blk*nshort, transf_buf+2*nshort*blk);
        }

        /* leading flat section: previous tail passes through unchanged */
        for (i = 0; i < nflat_ls; i++)
        {
            time_out[i] = overlap[i];
        }

        for (i = 0; i < nshort; i++)
        {
            /* slot 0: half 0 weighted by the previous frame's short window */
            time_out[nflat_ls+i] = overlap[nflat_ls+i] + MUL_F(transf_buf[i],window_short_prev[i]);

            /* slots 1..3: half 2*blk-1 (time-reversed window) plus half 2*blk */
            for (blk = 1; blk <= 3; blk++)
            {
                time_out[nflat_ls+blk*nshort+i] = overlap[nflat_ls+blk*nshort+i]
                    + MUL_F(transf_buf[(2*blk-1)*nshort+i],window_short[nshort-1-i])
                    + MUL_F(transf_buf[2*blk*nshort+i],window_short[i]);
            }

            /* slot 4 straddles the frame boundary: its first trans samples
               still belong to this frame's output */
            if (i < trans)
            {
                time_out[nflat_ls+4*nshort+i] = overlap[nflat_ls+4*nshort+i]
                    + MUL_F(transf_buf[7*nshort+i],window_short[nshort-1-i])
                    + MUL_F(transf_buf[8*nshort+i],window_short[i]);
            }
        }

        for (i = 0; i < nshort; i++)
        {
            /* remainder of slot 4 opens the saved tail */
            if (i >= trans)
            {
                overlap[nflat_ls+4*nshort+i-nlong] =
                      MUL_F(transf_buf[7*nshort+i],window_short[nshort-1-i])
                    + MUL_F(transf_buf[8*nshort+i],window_short[i]);
            }

            /* slots 5..7, stored relative to the start of the next frame */
            for (blk = 5; blk <= 7; blk++)
            {
                overlap[nflat_ls+blk*nshort+i-nlong] =
                      MUL_F(transf_buf[(2*blk-1)*nshort+i],window_short[nshort-1-i])
                    + MUL_F(transf_buf[2*blk*nshort+i],window_short[i]);
            }

            /* slot 8: only the last half (15) contributes */
            overlap[nflat_ls+8*nshort+i-nlong] = MUL_F(transf_buf[15*nshort+i],window_short[nshort-1-i]);
        }

        /* everything after the last short window is zero */
        for (i = 0; i < nflat_ls; i++)
        {
            overlap[ramp_end+i] = 0;
        }
        break;

    case LONG_STOP_SEQUENCE:
        imdct_long(fb, freq_in, transf_buf, 2*nlong);

        /* first half: flat copy, previous short window, then unweighted sum */
        for (i = 0; i < nflat_ls; i++)
        {
            time_out[i] = overlap[i];
        }
        for (i = 0; i < nshort; i++)
        {
            time_out[nflat_ls+i] = overlap[nflat_ls+i] + MUL_F(transf_buf[nflat_ls+i],window_short_prev[i]);
        }
        for (i = 0; i < nflat_ls; i++)
        {
            time_out[ramp_end+i] = overlap[ramp_end+i] + transf_buf[ramp_end+i];
        }

        /* second half weighted by the time-reversed long window */
        for (i = 0; i < nlong; i++)
        {
            overlap[i] = MUL_F(transf_buf[nlong+i],window_long[nlong-1-i]);
        }
        break;
    }

#ifdef PROFILE
    /* account the elapsed timestamp delta to the filter bank */
    count = faad_get_ts() - count;
    fb->cycles += count;
#endif
}
예제 #7
0
/*
 * Inverse filter bank: converts one frame of spectral coefficients into
 * frame_len time-domain samples by inverse MDCT, windowing and overlap-add.
 *
 *   fb                - filter bank state (window tables, MDCT handles)
 *   window_sequence   - block type of this frame; selects the case below
 *   window_shape      - index into the window tables for this frame
 *   window_shape_prev - index into the window tables for the previous frame
 *   freq_in           - spectral input
 *   time_out          - receives frame_len output samples
 *   overlap           - in:  tail saved by the previous call
 *                       out: windowed tail of this frame for the next call
 *   object_type       - only examined when LD_DEC is defined
 *   frame_len         - samples per frame (nlong); a short block is frame_len/8
 *
 * A window_sequence that matches none of the cases leaves time_out and
 * overlap untouched.
 *
 * NOTE(review): for object_type == LD the short windows stay NULL, so any
 * sequence other than ONLY_LONG_SEQUENCE would dereference NULL here;
 * presumably the caller never passes one for LD streams - confirm.
 */
void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
                  uint8_t window_shape_prev, real_t *freq_in,
                  real_t *time_out, real_t *overlap,
                  uint8_t object_type, uint16_t frame_len)
{
    int16_t i;
    uint8_t j;      /* position inside a group of four samples */
    uint8_t blk;    /* short block / output slot counter */
    ALIGN real_t transf_buf[2*1024] = {0};

    const real_t *window_long = NULL;
    const real_t *window_long_prev = NULL;
    const real_t *window_short = NULL;
    const real_t *window_short_prev = NULL;

    const uint16_t nlong = frame_len;
    const uint16_t nshort = frame_len/8;
    const uint16_t trans = nshort/2;

    /* length of each flat (unwindowed) section around the short window */
    const uint16_t nflat_ls = (nlong-nshort)/2;
    /* first index after the flat section plus the short window */
    const int ramp_end = nflat_ls + nshort;

#ifdef PROFILE
    int64_t count = faad_get_ts();
#endif

    /* pick the window tables (selected by shape index) for the current and
       the previous frame */
#ifdef LD_DEC
    if (object_type == LD)
    {
        window_long       = fb->ld_window[window_shape];
        window_long_prev  = fb->ld_window[window_shape_prev];
    }
    else
#endif
    {
        window_long       = fb->long_window[window_shape];
        window_long_prev  = fb->long_window[window_shape_prev];
        window_short      = fb->short_window[window_shape];
        window_short_prev = fb->short_window[window_shape_prev];
    }

    /* disabled debug dump of the spectral input */
#if 0
    for (i = 0; i < 1024; i++)
    {
        printf("%d\n", freq_in[i]);
    }
#endif

    /* disabled debug dump of the block type and window shape */
#if 0
    printf("%d %d\n", window_sequence, window_shape);
#endif

    switch (window_sequence)
    {
    case ONLY_LONG_SEQUENCE:
    case LONG_START_SEQUENCE:
        /* both sequences share the long transform and the first-half overlap-add */
        imdct_long(fb, freq_in, transf_buf, 2*nlong);

        /* previous tail + first half weighted by the previous long window;
           samples are handled in groups of four, so whole groups are touched
           even if nlong were not a multiple of 4 */
        for (i = 0; i < nlong; i += 4)
        {
            for (j = 0; j < 4; j++)
            {
                time_out[i+j] = overlap[i+j] + MUL_F(transf_buf[i+j],window_long_prev[i+j]);
            }
        }

        if (window_sequence == ONLY_LONG_SEQUENCE)
        {
            /* second half weighted by the time-reversed long window,
               saved as overlap for the next frame */
            for (i = 0; i < nlong; i += 4)
            {
                for (j = 0; j < 4; j++)
                {
                    overlap[i+j] = MUL_F(transf_buf[nlong+i+j],window_long[nlong-1-j-i]);
                }
            }
        }
        else
        {
            /* LONG_START: second-half window is padded with 1's (flat copy),
               then the time-reversed short window, then 0's */
            for (i = 0; i < nflat_ls; i++)
            {
                overlap[i] = transf_buf[nlong+i];
            }
            for (i = 0; i < nshort; i++)
            {
                overlap[nflat_ls+i] = MUL_F(transf_buf[nlong+nflat_ls+i],window_short[nshort-i-1]);
            }
            for (i = 0; i < nflat_ls; i++)
            {
                overlap[ramp_end+i] = 0;
            }
        }
        break;

    case EIGHT_SHORT_SEQUENCE:
        /* one inverse transform per short block; block blk is written to
           transf_buf starting at 2*nshort*blk, so "half h" below starts at
           transf_buf[h*nshort] */
        for (blk = 0; blk < 8; blk++)
        {
            faad_imdct(fb->mdct256, freq_in+blk*nshort, transf_buf+2*nshort*blk);
        }

        /* leading flat section: previous tail passes through unchanged */
        for (i = 0; i < nflat_ls; i++)
        {
            time_out[i] = overlap[i];
        }

        /* add previous tail to the windowed short-block output */
        for (i = 0; i < nshort; i++)
        {
            /* slot 0: half 0 weighted by the previous frame's short window */
            time_out[nflat_ls+i] = overlap[nflat_ls+i] + MUL_F(transf_buf[i],window_short_prev[i]);

            /* slots 1..3: half 2*blk-1 (time-reversed window) plus half 2*blk */
            for (blk = 1; blk <= 3; blk++)
            {
                time_out[nflat_ls+blk*nshort+i] = overlap[nflat_ls+blk*nshort+i]
                    + MUL_F(transf_buf[(2*blk-1)*nshort+i],window_short[nshort-1-i])
                    + MUL_F(transf_buf[2*blk*nshort+i],window_short[i]);
            }

            /* slot 4 straddles the frame boundary: its first trans samples
               still belong to this frame's output */
            if (i < trans)
            {
                time_out[nflat_ls+4*nshort+i] = overlap[nflat_ls+4*nshort+i]
                    + MUL_F(transf_buf[7*nshort+i],window_short[nshort-1-i])
                    + MUL_F(transf_buf[8*nshort+i],window_short[i]);
            }
        }

        /* window the remaining halves and save them as overlap for the next frame */
        for (i = 0; i < nshort; i++)
        {
            /* remainder of slot 4 opens the saved tail */
            if (i >= trans)
            {
                overlap[nflat_ls+4*nshort+i-nlong] =
                      MUL_F(transf_buf[7*nshort+i],window_short[nshort-1-i])
                    + MUL_F(transf_buf[8*nshort+i],window_short[i]);
            }

            /* slots 5..7, stored relative to the start of the next frame */
            for (blk = 5; blk <= 7; blk++)
            {
                overlap[nflat_ls+blk*nshort+i-nlong] =
                      MUL_F(transf_buf[(2*blk-1)*nshort+i],window_short[nshort-1-i])
                    + MUL_F(transf_buf[2*blk*nshort+i],window_short[i]);
            }

            /* slot 8: only the last half (15) contributes */
            overlap[nflat_ls+8*nshort+i-nlong] = MUL_F(transf_buf[15*nshort+i],window_short[nshort-1-i]);
        }

        /* everything after the last short window is zero */
        for (i = 0; i < nflat_ls; i++)
        {
            overlap[ramp_end+i] = 0;
        }
        break;

    case LONG_STOP_SEQUENCE:
        imdct_long(fb, freq_in, transf_buf, 2*nlong);

        /* first-half window is padded with 0's (flat copy of the previous
           tail), then the previous short window, then 1's (unweighted sum) */
        for (i = 0; i < nflat_ls; i++)
        {
            time_out[i] = overlap[i];
        }
        for (i = 0; i < nshort; i++)
        {
            time_out[nflat_ls+i] = overlap[nflat_ls+i] + MUL_F(transf_buf[nflat_ls+i],window_short_prev[i]);
        }
        for (i = 0; i < nflat_ls; i++)
        {
            time_out[ramp_end+i] = overlap[ramp_end+i] + transf_buf[ramp_end+i];
        }

        /* second half weighted by the time-reversed long window,
           saved as overlap for the next frame */
        for (i = 0; i < nlong; i++)
        {
            overlap[i] = MUL_F(transf_buf[nlong+i],window_long[nlong-1-i]);
        }
        break;
    }

    /* disabled debug dump of the produced samples */
#if 0
    for (i = 0; i < 1024; i++)
    {
        printf("%d\n", time_out[i]);
        //printf("0x%.8X\n", time_out[i]);
    }
#endif


#ifdef PROFILE
    /* account the elapsed timestamp delta to the filter bank */
    count = faad_get_ts() - count;
    fb->cycles += count;
#endif
}
예제 #8
0
/*
 * SSE implementation of the inverse MDCT. It follows the same four stages
 * as the scalar faad_imdct: pre-IFFT complex multiplication, complex IFFT
 * (cfftb_sse), post-IFFT complex multiplication and output reordering.
 *
 *   mdct  - supplies the transform size N, the sincos twiddle table and
 *           the cfft state handed to cfftb_sse
 *   X_in  - input coefficients; elements 0 .. N/2-1 are read
 *   X_out - output samples; elements 0 .. N-1 are written
 *
 * Every memory access uses the aligned forms _mm_load_ps/_mm_store_ps, so
 * X_in, X_out and the sincos table must be 16-byte aligned.
 *
 * NOTE(review): the loops advance by 4 (and by 2 in the reordering stage)
 * with no tail handling, so N/4 is presumably required to be a multiple
 * of 4 - confirm against the supported transform sizes.
 * NOTE(review): the 128-bit accesses on Z1/sincos assume complex_t is two
 * adjacent floats, i.e. one access covers two complex entries - confirm.
 */
void faad_imdct_sse(mdct_info *mdct, real_t *X_in, real_t *X_out)
{
    uint16_t k;

    /* intermediate complex buffer holding N/4 entries (so N <= 2048) */
    ALIGN complex_t Z1[512];
    complex_t *sincos = mdct->sincos;

    uint16_t N  = mdct->N;
    uint16_t N2 = N >> 1;
    uint16_t N4 = N >> 2;
    uint16_t N8 = N >> 3;

#ifdef PROFILE
    /* count2 times the whole function, count1 only the IFFT call */
    int64_t count1, count2 = faad_get_ts();
#endif

    /* pre-IFFT complex multiplication */
    /* each iteration fills Z1[k] .. Z1[k+3]; the m* registers handle the
       first pair of entries and the n* registers the second pair */
    for (k = 0; k < N4; k+=4)
    {
        __m128 m12, m13, m14, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11;
        __m128 n12, n13, n14, n0, n1, n2, n3, n4, n5, n6, n7, n8, n9, n10, n11;
        /* two 4-float groups walking down from X_in[N2] ... */
        n12 = _mm_load_ps(&X_in[N2 - 2*k - 8]);
        m12 = _mm_load_ps(&X_in[N2 - 2*k - 4]);
        /* ... and two walking up from X_in[0] */
        m13 = _mm_load_ps(&X_in[2*k]);
        n13 = _mm_load_ps(&X_in[2*k + 4]);
        /* twiddle factors for this group of entries */
        m1 = _mm_load_ps(&RE(sincos[k]));
        n1 = _mm_load_ps(&RE(sincos[k+2]));

        /* gather the needed input samples; m2/n2 are the twiddles with the
           two floats of each pair swapped */
        m0 = _mm_shuffle_ps(m12, m13, _MM_SHUFFLE(2,0,1,3));
        m2 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(2,3,0,1));
        m14 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3,1,2,0));
        n0 = _mm_shuffle_ps(n12, n13, _MM_SHUFFLE(2,0,1,3));
        n2 = _mm_shuffle_ps(n1, n1, _MM_SHUFFLE(2,3,0,1));
        n14 = _mm_shuffle_ps(n0, n0, _MM_SHUFFLE(3,1,2,0));

        /* products with the twiddles in both orders */
        m3 = _mm_mul_ps(m14, m1);
        n3 = _mm_mul_ps(n14, n1);
        m4 = _mm_mul_ps(m14, m2);
        n4 = _mm_mul_ps(n14, n2);

        /* split the products into even lanes (m5/n5) and odd lanes (m6/n6) */
        m5 = _mm_shuffle_ps(m3, m4, _MM_SHUFFLE(2,0,2,0));
        n5 = _mm_shuffle_ps(n3, n4, _MM_SHUFFLE(2,0,2,0));
        m6 = _mm_shuffle_ps(m3, m4, _MM_SHUFFLE(3,1,3,1));
        n6 = _mm_shuffle_ps(n3, n4, _MM_SHUFFLE(3,1,3,1));

        /* sums and differences of the paired products */
        m7 = _mm_add_ps(m5, m6);
        n7 = _mm_add_ps(n5, n6);
        m8 = _mm_sub_ps(m5, m6);
        n8 = _mm_sub_ps(n5, n6);

        /* select the lanes that are kept and interleave them for storing */
        m9 = _mm_shuffle_ps(m7, m7, _MM_SHUFFLE(3,2,3,2));
        n9 = _mm_shuffle_ps(n7, n7, _MM_SHUFFLE(3,2,3,2));
        m10 = _mm_shuffle_ps(m8, m8, _MM_SHUFFLE(1,0,1,0));
        n10 = _mm_shuffle_ps(n8, n8, _MM_SHUFFLE(1,0,1,0));

        m11 = _mm_unpacklo_ps(m10, m9);
        n11 = _mm_unpacklo_ps(n10, n9);

        _mm_store_ps(&RE(Z1[k]), m11);
        _mm_store_ps(&RE(Z1[k+2]), n11);
    }

#ifdef PROFILE
    count1 = faad_get_ts();
#endif

    /* complex IFFT, any non-scaling FFT can be used here */
    cfftb_sse(mdct->cfft, Z1);

#ifdef PROFILE
    count1 = faad_get_ts() - count1;
#endif

    /* post-IFFT complex multiplication */
    /* same multiply/shuffle sequence as the pre-IFFT stage, applied in
       place to Z1 with the same twiddle table */
    for (k = 0; k < N4; k+=4)
    {
        __m128 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11;
        __m128 n0, n1, n2, n3, n4, n5, n6, n7, n8, n9, n10, n11;
        m0 = _mm_load_ps(&RE(Z1[k]));
        n0 = _mm_load_ps(&RE(Z1[k+2]));
        m1 = _mm_load_ps(&RE(sincos[k]));
        n1 = _mm_load_ps(&RE(sincos[k+2]));

        /* twiddles with the two floats of each pair swapped */
        m2 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(2,3,0,1));
        n2 = _mm_shuffle_ps(n1, n1, _MM_SHUFFLE(2,3,0,1));

        m3 = _mm_mul_ps(m0, m1);
        n3 = _mm_mul_ps(n0, n1);
        m4 = _mm_mul_ps(m0, m2);
        n4 = _mm_mul_ps(n0, n2);

        m5 = _mm_shuffle_ps(m3, m4, _MM_SHUFFLE(2,0,2,0));
        n5 = _mm_shuffle_ps(n3, n4, _MM_SHUFFLE(2,0,2,0));
        m6 = _mm_shuffle_ps(m3, m4, _MM_SHUFFLE(3,1,3,1));
        n6 = _mm_shuffle_ps(n3, n4, _MM_SHUFFLE(3,1,3,1));

        m7 = _mm_add_ps(m5, m6);
        n7 = _mm_add_ps(n5, n6);
        m8 = _mm_sub_ps(m5, m6);
        n8 = _mm_sub_ps(n5, n6);

        m9 = _mm_shuffle_ps(m7, m7, _MM_SHUFFLE(3,2,3,2));
        n9 = _mm_shuffle_ps(n7, n7, _MM_SHUFFLE(3,2,3,2));
        m10 = _mm_shuffle_ps(m8, m8, _MM_SHUFFLE(1,0,1,0));
        n10 = _mm_shuffle_ps(n8, n8, _MM_SHUFFLE(1,0,1,0));

        m11 = _mm_unpacklo_ps(m10, m9);
        n11 = _mm_unpacklo_ps(n10, n9);

        _mm_store_ps(&RE(Z1[k]), m11);
        _mm_store_ps(&RE(Z1[k+2]), n11);
    }

    /* reordering */
    /* each iteration writes four floats into each quarter of X_out */
    for (k = 0; k < N8; k+=2)
    {
        __m128 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m13;
        __m128 n4, n5, n6, n7, n8, n9;
        /* _mm_set_ps lists lanes from highest to lowest: neg1 negates
           lanes 1 and 3, neg2 negates all four lanes (both are
           loop-invariant) */
        __m128 neg1 = _mm_set_ps(-1.0,  1.0, -1.0,  1.0);
        __m128 neg2 = _mm_set_ps(-1.0, -1.0, -1.0, -1.0);

        /* entries walking up from Z1[0] and Z1[N8], and walking down from
           Z1[N8] and Z1[N4] */
        m0 = _mm_load_ps(&RE(Z1[k]));
        m1 = _mm_load_ps(&RE(Z1[N8 - 2 - k]));
        m2 = _mm_load_ps(&RE(Z1[N8 + k]));
        m3 = _mm_load_ps(&RE(Z1[N4 - 2 - k]));

        /* apply the sign changes */
        m10 = _mm_mul_ps(m0, neg1);
        m11 = _mm_mul_ps(m1, neg2);
        m13 = _mm_mul_ps(m3, neg1);

        /* regroup each register so even lanes come first, then odd lanes */
        m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3,1,2,0));
        n4 = _mm_shuffle_ps(m10, m10, _MM_SHUFFLE(3,1,2,0));
        m4 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(3,1,2,0));
        n5 = _mm_shuffle_ps(m13, m13, _MM_SHUFFLE(3,1,2,0));

        /* combine halves of the forward and backward walking entries */
        m6 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(3,2,1,0));
        n6 = _mm_shuffle_ps(n4, n5, _MM_SHUFFLE(3,2,1,0));
        m7 = _mm_shuffle_ps(m5, m4, _MM_SHUFFLE(3,2,1,0));
        n7 = _mm_shuffle_ps(n5, n4, _MM_SHUFFLE(3,2,1,0));

        /* final lane order for the output */
        m8 = _mm_shuffle_ps(m6, m6, _MM_SHUFFLE(0,3,1,2));
        n8 = _mm_shuffle_ps(n6, n6, _MM_SHUFFLE(2,1,3,0));
        m9 = _mm_shuffle_ps(m7, m7, _MM_SHUFFLE(2,1,3,0));
        n9 = _mm_shuffle_ps(n7, n7, _MM_SHUFFLE(0,3,1,2));

        /* one store per output quarter */
        _mm_store_ps(&X_out[2*k], m8);
        _mm_store_ps(&X_out[N4 + 2*k], n8);
        _mm_store_ps(&X_out[N2 + 2*k], m9);
        _mm_store_ps(&X_out[N2 + N4 + 2*k], n9);
    }

#ifdef PROFILE
    /* FFT time goes to fft_cycles, the remaining time to cycles */
    count2 = faad_get_ts() - count2;
    mdct->fft_cycles += count1;
    mdct->cycles += (count2 - count1);
#endif
}
예제 #9
0
/*
 * Scalar inverse MDCT built on an N/4-point complex IFFT.
 *
 *   mdct  - supplies the transform size N, the sincos twiddle table and
 *           the cfft state handed to cfftb
 *   X_in  - input coefficients; elements 0 .. N/2-1 are read
 *   X_out - output samples; elements 0 .. N-1 are written
 *
 * The intermediate buffer holds 512 complex values, so N must not exceed
 * 2048. The reordering loop emits two entries per step, i.e. it covers
 * the output exactly when N/8 is even.
 */
void faad_imdct(mdct_info *mdct, real_t *X_in, real_t *X_out)
{
    uint16_t k;

    real_t z_re, z_im;
    ALIGN complex_t Z1[512];
    complex_t *sincos = mdct->sincos;

    uint16_t N  = mdct->N;
    uint16_t N2 = N >> 1;
    uint16_t N4 = N >> 2;
    uint16_t N8 = N >> 3;

    /* start of each quarter of the output */
    real_t *q0 = X_out;
    real_t *q1 = X_out + N4;
    real_t *q2 = X_out + N2;
    real_t *q3 = X_out + N2 + N4;

#ifdef PROFILE
    /* count2 times the whole function, count1 only the IFFT call */
    int64_t count1, count2 = faad_get_ts();
#endif

    /* pre-twiddle: pair the k-th even-indexed sample counted from the
       front with the k-th odd-indexed sample counted from the back */
    for (k = 0; k < N4; k++)
    {
        ComplexMult(&IM(Z1[k]), &RE(Z1[k]),
            X_in[2*k], X_in[N2 - 1 - 2*k], RE(sincos[k]), IM(sincos[k]));
    }

#ifdef PROFILE
    count1 = faad_get_ts();
#endif

    /* complex IFFT, any non-scaling FFT can be used here */
    cfftb(mdct->cfft, Z1);

#ifdef PROFILE
    count1 = faad_get_ts() - count1;
#endif

    /* post-twiddle, in place: copy the entry first because ComplexMult
       writes its result back into the same Z1 slot */
    for (k = 0; k < N4; k++)
    {
        z_re = RE(Z1[k]);
        z_im = IM(Z1[k]);
        ComplexMult(&IM(Z1[k]), &RE(Z1[k]),
            z_im, z_re, RE(sincos[k]), IM(sincos[k]));
    }

    /* reordering: every step emits four consecutive samples per quarter */
    for (k = 0; k < N8; k += 2)
    {
        const int o = 2*k;

        q0[o    ] =  IM(Z1[N8 +     k]);
        q0[o + 1] = -RE(Z1[N8 - 1 - k]);
        q0[o + 2] =  IM(Z1[N8 + 1 + k]);
        q0[o + 3] = -RE(Z1[N8 - 2 - k]);

        q1[o    ] =  RE(Z1[         k]);
        q1[o + 1] = -IM(Z1[N4 - 1 - k]);
        q1[o + 2] =  RE(Z1[     1 + k]);
        q1[o + 3] = -IM(Z1[N4 - 2 - k]);

        q2[o    ] =  RE(Z1[N8 +     k]);
        q2[o + 1] = -IM(Z1[N8 - 1 - k]);
        q2[o + 2] =  RE(Z1[N8 + 1 + k]);
        q2[o + 3] = -IM(Z1[N8 - 2 - k]);

        q3[o    ] = -IM(Z1[         k]);
        q3[o + 1] =  RE(Z1[N4 - 1 - k]);
        q3[o + 2] = -IM(Z1[     1 + k]);
        q3[o + 3] =  RE(Z1[N4 - 2 - k]);
    }

#ifdef PROFILE
    /* FFT time goes to fft_cycles, the remaining time to cycles */
    count2 = faad_get_ts() - count2;
    mdct->fft_cycles += count1;
    mdct->cycles += (count2 - count1);
#endif
}