static OPUS_INLINE void calc_corr( const opus_int32 *const input_QS, opus_int64 *const corr_QC, const opus_int offset, const int32x4_t state_QS_s32x4 )
{
    int64x2_t corr_QC_s64x2[ 2 ], t_s64x2[ 2 ];
    const int32x4_t input_QS_s32x4 = vld1q_s32( input_QS + offset );
    corr_QC_s64x2[ 0 ] = vld1q_s64( corr_QC + offset + 0 );
    corr_QC_s64x2[ 1 ] = vld1q_s64( corr_QC + offset + 2 );
    t_s64x2[ 0 ] = vmull_s32( vget_low_s32( state_QS_s32x4 ), vget_low_s32( input_QS_s32x4 ) );
    t_s64x2[ 1 ] = vmull_s32( vget_high_s32( state_QS_s32x4 ), vget_high_s32( input_QS_s32x4 ) );
    corr_QC_s64x2[ 0 ] = vsraq_n_s64( corr_QC_s64x2[ 0 ], t_s64x2[ 0 ], 2 * QS - QC );
    corr_QC_s64x2[ 1 ] = vsraq_n_s64( corr_QC_s64x2[ 1 ], t_s64x2[ 1 ], 2 * QS - QC );
    vst1q_s64( corr_QC + offset + 0, corr_QC_s64x2[ 0 ] );
    vst1q_s64( corr_QC + offset + 2, corr_QC_s64x2[ 1 ] );
}
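For reference, a scalar sketch of what calc_corr accumulates. QS and QC are Q-format constants from the surrounding fixed-point source, and the helper name below is hypothetical; vsraq_n_s64 is a shift-right-and-accumulate, so each product is dropped from Q(2*QS) down to QC before being added:

/* Hypothetical scalar equivalent of calc_corr (assumes QC <= 2 * QS). */
static void calc_corr_scalar( const opus_int32 *input_QS, opus_int64 *corr_QC,
                              const opus_int offset, const opus_int32 *state_QS )
{
    int i;
    for( i = 0; i < 4; i++ ) {
        opus_int64 t = (opus_int64)state_QS[ i ] * input_QS[ offset + i ];
        corr_QC[ offset + i ] += t >> ( 2 * QS - QC );  /* vsraq_n_s64 */
    }
}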
Example #2
/* Compile-only intrinsic test: the arguments are deliberately left
   uninitialized, since only code generation is being checked. */
void test_vmulls32 (void)
{
    int64x2_t out_int64x2_t;
    int32x2_t arg0_int32x2_t;
    int32x2_t arg1_int32x2_t;

    out_int64x2_t = vmull_s32 (arg0_int32x2_t, arg1_int32x2_t);
}
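By contrast, a minimal runnable sketch with concrete operands (hypothetical, any NEON-capable toolchain) shows why the widening multiply matters: the product of two large 32-bit values needs 64 bits.

#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
    int32x2_t a = vdup_n_s32(100000);
    int32x2_t b = vdup_n_s32(300000);
    /* vmull_s32 widens: two s32 lanes in, two s64 lanes out. */
    int64x2_t p = vmull_s32(a, b);  /* 100000 * 300000 = 3e10 > INT32_MAX */
    printf("%lld %lld\n", (long long)vgetq_lane_s64(p, 0),
                          (long long)vgetq_lane_s64(p, 1));
    return 0;
}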
Example #3
inline int64x2_t vmull(const int32x2_t &v0, const int32x2_t &v1) { return vmull_s32(v0, v1); }

unsigned int vp8_variance_halfpixvar16x16_hv_neon(
        const unsigned char *src_ptr,
        int  source_stride,
        const unsigned char *ref_ptr,
        int  recon_stride,
        unsigned int *sse) {
    int i;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
    int16x4_t d0s16, d1s16, d2s16, d3s16, d10s16, d11s16, d12s16, d13s16;
    int16x4_t d18s16, d19s16, d20s16, d21s16, d22s16, d23s16, d24s16, d25s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64, d2s64, d3s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
    uint16x8_t q0u16, q1u16, q5u16, q6u16, q9u16, q10u16, q11u16, q12u16;
    int32x4_t q13s32, q14s32, q15s32;
    int64x2_t q0s64, q1s64, q5s64;

    q13s32 = vdupq_n_s32(0);
    q14s32 = vdupq_n_s32(0);
    q15s32 = vdupq_n_s32(0);

    q0u8 = vld1q_u8(src_ptr);
    q1u8 = vld1q_u8(src_ptr + 16);
    src_ptr += source_stride;
    q1u8 = vextq_u8(q0u8, q1u8, 1);
    q0u8 = vrhaddq_u8(q0u8, q1u8);
    for (i = 0; i < 4; i++) {  // vp8_filt_fpo16x16s_4_0_loop_neon
        q2u8 = vld1q_u8(src_ptr);
        q3u8 = vld1q_u8(src_ptr + 16);
        src_ptr += source_stride;
        q4u8 = vld1q_u8(src_ptr);
        q5u8 = vld1q_u8(src_ptr + 16);
        src_ptr += source_stride;
        q6u8 = vld1q_u8(src_ptr);
        q7u8 = vld1q_u8(src_ptr + 16);
        src_ptr += source_stride;
        q8u8 = vld1q_u8(src_ptr);
        q9u8 = vld1q_u8(src_ptr + 16);
        src_ptr += source_stride;

        q3u8 = vextq_u8(q2u8, q3u8, 1);
        q5u8 = vextq_u8(q4u8, q5u8, 1);
        q7u8 = vextq_u8(q6u8, q7u8, 1);
        q9u8 = vextq_u8(q8u8, q9u8, 1);

        q1u8 = vrhaddq_u8(q2u8, q3u8);
        q2u8 = vrhaddq_u8(q4u8, q5u8);
        q3u8 = vrhaddq_u8(q6u8, q7u8);
        q4u8 = vrhaddq_u8(q8u8, q9u8);
        q0u8 = vrhaddq_u8(q0u8, q1u8);
        q1u8 = vrhaddq_u8(q1u8, q2u8);
        q2u8 = vrhaddq_u8(q2u8, q3u8);
        q3u8 = vrhaddq_u8(q3u8, q4u8);

        q5u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q6u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q7u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q8u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;

        d0u8 = vget_low_u8(q0u8);
        d1u8 = vget_high_u8(q0u8);
        d2u8 = vget_low_u8(q1u8);
        d3u8 = vget_high_u8(q1u8);
        d4u8 = vget_low_u8(q2u8);
        d5u8 = vget_high_u8(q2u8);
        d6u8 = vget_low_u8(q3u8);
        d7u8 = vget_high_u8(q3u8);

        q9u16  = vsubl_u8(d0u8, vget_low_u8(q5u8));
        q10u16 = vsubl_u8(d1u8, vget_high_u8(q5u8));
        q11u16 = vsubl_u8(d2u8, vget_low_u8(q6u8));
        q12u16 = vsubl_u8(d3u8, vget_high_u8(q6u8));
        q0u16  = vsubl_u8(d4u8, vget_low_u8(q7u8));
        q1u16  = vsubl_u8(d5u8, vget_high_u8(q7u8));
        q5u16  = vsubl_u8(d6u8, vget_low_u8(q8u8));
        q6u16  = vsubl_u8(d7u8, vget_high_u8(q8u8));

        d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
        d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q9u16));
        q14s32 = vmlal_s16(q14s32, d18s16, d18s16);
        q15s32 = vmlal_s16(q15s32, d19s16, d19s16);

        d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
        d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q10u16));
        q14s32 = vmlal_s16(q14s32, d20s16, d20s16);
        q15s32 = vmlal_s16(q15s32, d21s16, d21s16);

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q11u16));
        q14s32 = vmlal_s16(q14s32, d22s16, d22s16);
        q15s32 = vmlal_s16(q15s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q12u16));
        q14s32 = vmlal_s16(q14s32, d24s16, d24s16);
        q15s32 = vmlal_s16(q15s32, d25s16, d25s16);

        d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
        d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q0u16));
        q14s32 = vmlal_s16(q14s32, d0s16, d0s16);
        q15s32 = vmlal_s16(q15s32, d1s16, d1s16);

        d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
        d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q1u16));
        q14s32 = vmlal_s16(q14s32, d2s16, d2s16);
        q15s32 = vmlal_s16(q15s32, d3s16, d3s16);

        d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16));
        d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q5u16));
        q14s32 = vmlal_s16(q14s32, d10s16, d10s16);
        q15s32 = vmlal_s16(q15s32, d11s16, d11s16);

        d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16));
        d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q6u16));
        q14s32 = vmlal_s16(q14s32, d12s16, d12s16);
        q15s32 = vmlal_s16(q15s32, d13s16, d13s16);

        q0u8 = q4u8;
    }

    q15s32 = vaddq_s32(q14s32, q15s32);
    q0s64 = vpaddlq_s32(q13s32);
    q1s64 = vpaddlq_s32(q15s32);

    d0s64 = vget_low_s64(q0s64);
    d1s64 = vget_high_s64(q0s64);
    d2s64 = vget_low_s64(q1s64);
    d3s64 = vget_high_s64(q1s64);
    d0s64 = vadd_s64(d0s64, d1s64);
    d1s64 = vadd_s64(d2s64, d3s64);

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}
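The tail of the function reduces the accumulators with vpaddlq_s32/vadd_s64, squares the sum via vmull_s32, and returns variance = sse - sum^2/256 (a 16x16 block has 256 pixels, hence the shift by 8). A scalar sketch of that reduction, with hypothetical names:

#include <stdint.h>

/* Hypothetical scalar equivalent of the reduction tail above. */
static unsigned int variance_tail_256(int32_t sum, uint32_t sum_sq_diff,
                                      unsigned int *sse)
{
    int64_t sq = (int64_t)sum * sum;          /* vmull_s32(d0s64, d0s64) */
    *sse = sum_sq_diff;                       /* vst1_lane_u32 */
    return sum_sq_diff - (uint32_t)(sq >> 8); /* vshr_n_u32(..., 8) */
}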
unsigned int vp8_sub_pixel_variance16x16_neon_func(
        const unsigned char *src_ptr,
        int src_pixels_per_line,
        int xoffset,
        int yoffset,
        const unsigned char *dst_ptr,
        int dst_pixels_per_line,
        unsigned int *sse) {
    int i;
    DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 528);
    unsigned char *tmpp;
    unsigned char *tmpp2;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
    uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8;
    uint8x8_t d19u8, d20u8, d21u8;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64, d2s64, d3s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
    uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
    uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16;
    uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    tmpp2 = tmp + 272;
    tmpp = tmp;
    if (xoffset == 0) {  // secondpass_bfilter16x16_only
        d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
        d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);

        q11u8 = vld1q_u8(src_ptr);
        src_ptr += src_pixels_per_line;
        for (i = 4; i > 0; i--) {
            q12u8 = vld1q_u8(src_ptr);
            src_ptr += src_pixels_per_line;
            q13u8 = vld1q_u8(src_ptr);
            src_ptr += src_pixels_per_line;
            q14u8 = vld1q_u8(src_ptr);
            src_ptr += src_pixels_per_line;
            q15u8 = vld1q_u8(src_ptr);
            src_ptr += src_pixels_per_line;

            __builtin_prefetch(src_ptr);
            __builtin_prefetch(src_ptr + src_pixels_per_line);
            __builtin_prefetch(src_ptr + src_pixels_per_line * 2);

            q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
            q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
            q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
            q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
            q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
            q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
            q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
            q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);

            q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
            q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
            q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
            q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
            q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
            q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
            q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
            q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);

            d2u8 = vqrshrn_n_u16(q1u16, 7);
            d3u8 = vqrshrn_n_u16(q2u16, 7);
            d4u8 = vqrshrn_n_u16(q3u16, 7);
            d5u8 = vqrshrn_n_u16(q4u16, 7);
            d6u8 = vqrshrn_n_u16(q5u16, 7);
            d7u8 = vqrshrn_n_u16(q6u16, 7);
            d8u8 = vqrshrn_n_u16(q7u16, 7);
            d9u8 = vqrshrn_n_u16(q8u16, 7);

            q1u8 = vcombine_u8(d2u8, d3u8);
            q2u8 = vcombine_u8(d4u8, d5u8);
            q3u8 = vcombine_u8(d6u8, d7u8);
            q4u8 = vcombine_u8(d8u8, d9u8);

            q11u8 = q15u8;

            vst1q_u8((uint8_t *)tmpp2, q1u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q2u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q3u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q4u8);
            tmpp2 += 16;
        }
    } else if (yoffset == 0) {  // firstpass_bfilter16x16_only
        d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
        d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);

        for (i = 4; i > 0; i--) {
            d2u8 = vld1_u8(src_ptr);
            d3u8 = vld1_u8(src_ptr + 8);
            d4u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d5u8 = vld1_u8(src_ptr);
            d6u8 = vld1_u8(src_ptr + 8);
            d7u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d8u8 = vld1_u8(src_ptr);
            d9u8 = vld1_u8(src_ptr + 8);
            d10u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d11u8 = vld1_u8(src_ptr);
            d12u8 = vld1_u8(src_ptr + 8);
            d13u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;

            __builtin_prefetch(src_ptr);
            __builtin_prefetch(src_ptr + src_pixels_per_line);
            __builtin_prefetch(src_ptr + src_pixels_per_line * 2);

            q7u16  = vmull_u8(d2u8, d0u8);
            q8u16  = vmull_u8(d3u8, d0u8);
            q9u16  = vmull_u8(d5u8, d0u8);
            q10u16 = vmull_u8(d6u8, d0u8);
            q11u16 = vmull_u8(d8u8, d0u8);
            q12u16 = vmull_u8(d9u8, d0u8);
            q13u16 = vmull_u8(d11u8, d0u8);
            q14u16 = vmull_u8(d12u8, d0u8);

            d2u8  = vext_u8(d2u8, d3u8, 1);
            d5u8  = vext_u8(d5u8, d6u8, 1);
            d8u8  = vext_u8(d8u8, d9u8, 1);
            d11u8 = vext_u8(d11u8, d12u8, 1);

            q7u16  = vmlal_u8(q7u16, d2u8, d1u8);
            q9u16  = vmlal_u8(q9u16, d5u8, d1u8);
            q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
            q13u16 = vmlal_u8(q13u16, d11u8, d1u8);

            d3u8  = vext_u8(d3u8, d4u8, 1);
            d6u8  = vext_u8(d6u8, d7u8, 1);
            d9u8  = vext_u8(d9u8, d10u8, 1);
            d12u8 = vext_u8(d12u8, d13u8, 1);

            q8u16  = vmlal_u8(q8u16,  d3u8, d1u8);
            q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
            q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
            q14u16 = vmlal_u8(q14u16, d12u8, d1u8);

            d14u8 = vqrshrn_n_u16(q7u16, 7);
            d15u8 = vqrshrn_n_u16(q8u16, 7);
            d16u8 = vqrshrn_n_u16(q9u16, 7);
            d17u8 = vqrshrn_n_u16(q10u16, 7);
            d18u8 = vqrshrn_n_u16(q11u16, 7);
            d19u8 = vqrshrn_n_u16(q12u16, 7);
            d20u8 = vqrshrn_n_u16(q13u16, 7);
            d21u8 = vqrshrn_n_u16(q14u16, 7);

            q7u8  = vcombine_u8(d14u8, d15u8);
            q8u8  = vcombine_u8(d16u8, d17u8);
            q9u8  = vcombine_u8(d18u8, d19u8);
            q10u8 = vcombine_u8(d20u8, d21u8);

            vst1q_u8((uint8_t *)tmpp2, q7u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q8u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q9u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q10u8);
            tmpp2 += 16;
        }
    } else {
        d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
        d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);

        d2u8 = vld1_u8(src_ptr);
        d3u8 = vld1_u8(src_ptr + 8);
        d4u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;
        d5u8 = vld1_u8(src_ptr);
        d6u8 = vld1_u8(src_ptr + 8);
        d7u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;
        d8u8 = vld1_u8(src_ptr);
        d9u8 = vld1_u8(src_ptr + 8);
        d10u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;
        d11u8 = vld1_u8(src_ptr);
        d12u8 = vld1_u8(src_ptr + 8);
        d13u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;

        // First Pass: output_height lines x output_width columns (17x16)
        for (i = 3; i > 0; i--) {
            q7u16  = vmull_u8(d2u8, d0u8);
            q8u16  = vmull_u8(d3u8, d0u8);
            q9u16  = vmull_u8(d5u8, d0u8);
            q10u16 = vmull_u8(d6u8, d0u8);
            q11u16 = vmull_u8(d8u8, d0u8);
            q12u16 = vmull_u8(d9u8, d0u8);
            q13u16 = vmull_u8(d11u8, d0u8);
            q14u16 = vmull_u8(d12u8, d0u8);

            d2u8  = vext_u8(d2u8, d3u8, 1);
            d5u8  = vext_u8(d5u8, d6u8, 1);
            d8u8  = vext_u8(d8u8, d9u8, 1);
            d11u8 = vext_u8(d11u8, d12u8, 1);

            q7u16  = vmlal_u8(q7u16, d2u8, d1u8);
            q9u16  = vmlal_u8(q9u16, d5u8, d1u8);
            q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
            q13u16 = vmlal_u8(q13u16, d11u8, d1u8);

            d3u8  = vext_u8(d3u8, d4u8, 1);
            d6u8  = vext_u8(d6u8, d7u8, 1);
            d9u8  = vext_u8(d9u8, d10u8, 1);
            d12u8 = vext_u8(d12u8, d13u8, 1);

            q8u16  = vmlal_u8(q8u16,  d3u8, d1u8);
            q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
            q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
            q14u16 = vmlal_u8(q14u16, d12u8, d1u8);

            d14u8 = vqrshrn_n_u16(q7u16, 7);
            d15u8 = vqrshrn_n_u16(q8u16, 7);
            d16u8 = vqrshrn_n_u16(q9u16, 7);
            d17u8 = vqrshrn_n_u16(q10u16, 7);
            d18u8 = vqrshrn_n_u16(q11u16, 7);
            d19u8 = vqrshrn_n_u16(q12u16, 7);
            d20u8 = vqrshrn_n_u16(q13u16, 7);
            d21u8 = vqrshrn_n_u16(q14u16, 7);

            d2u8 = vld1_u8(src_ptr);
            d3u8 = vld1_u8(src_ptr + 8);
            d4u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d5u8 = vld1_u8(src_ptr);
            d6u8 = vld1_u8(src_ptr + 8);
            d7u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d8u8 = vld1_u8(src_ptr);
            d9u8 = vld1_u8(src_ptr + 8);
            d10u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d11u8 = vld1_u8(src_ptr);
            d12u8 = vld1_u8(src_ptr + 8);
            d13u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;

            q7u8 = vcombine_u8(d14u8, d15u8);
            q8u8 = vcombine_u8(d16u8, d17u8);
            q9u8 = vcombine_u8(d18u8, d19u8);
            q10u8 = vcombine_u8(d20u8, d21u8);

            vst1q_u8((uint8_t *)tmpp, q7u8);
            tmpp += 16;
            vst1q_u8((uint8_t *)tmpp, q8u8);
            tmpp += 16;
            vst1q_u8((uint8_t *)tmpp, q9u8);
            tmpp += 16;
            vst1q_u8((uint8_t *)tmpp, q10u8);
            tmpp += 16;
        }

        // First-pass filtering for the remaining 5 lines
        d14u8 = vld1_u8(src_ptr);
        d15u8 = vld1_u8(src_ptr + 8);
        d16u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;

        q9u16  = vmull_u8(d2u8, d0u8);
        q10u16 = vmull_u8(d3u8, d0u8);
        q11u16 = vmull_u8(d5u8, d0u8);
        q12u16 = vmull_u8(d6u8, d0u8);
        q13u16 = vmull_u8(d8u8, d0u8);
        q14u16 = vmull_u8(d9u8, d0u8);

        d2u8  = vext_u8(d2u8, d3u8, 1);
        d5u8  = vext_u8(d5u8, d6u8, 1);
        d8u8  = vext_u8(d8u8, d9u8, 1);

        q9u16  = vmlal_u8(q9u16, d2u8, d1u8);
        q11u16 = vmlal_u8(q11u16, d5u8, d1u8);
        q13u16 = vmlal_u8(q13u16, d8u8, d1u8);

        d3u8  = vext_u8(d3u8, d4u8, 1);
        d6u8  = vext_u8(d6u8, d7u8, 1);
        d9u8  = vext_u8(d9u8, d10u8, 1);

        q10u16 = vmlal_u8(q10u16, d3u8, d1u8);
        q12u16 = vmlal_u8(q12u16, d6u8, d1u8);
        q14u16 = vmlal_u8(q14u16, d9u8, d1u8);

        q1u16 = vmull_u8(d11u8, d0u8);
        q2u16 = vmull_u8(d12u8, d0u8);
        q3u16 = vmull_u8(d14u8, d0u8);
        q4u16 = vmull_u8(d15u8, d0u8);

        d11u8 = vext_u8(d11u8, d12u8, 1);
        d14u8 = vext_u8(d14u8, d15u8, 1);

        q1u16 = vmlal_u8(q1u16, d11u8, d1u8);
        q3u16 = vmlal_u8(q3u16, d14u8, d1u8);

        d12u8 = vext_u8(d12u8, d13u8, 1);
        d15u8 = vext_u8(d15u8, d16u8, 1);

        q2u16 = vmlal_u8(q2u16, d12u8, d1u8);
        q4u16 = vmlal_u8(q4u16, d15u8, d1u8);

        d10u8 = vqrshrn_n_u16(q9u16, 7);
        d11u8 = vqrshrn_n_u16(q10u16, 7);
        d12u8 = vqrshrn_n_u16(q11u16, 7);
        d13u8 = vqrshrn_n_u16(q12u16, 7);
        d14u8 = vqrshrn_n_u16(q13u16, 7);
        d15u8 = vqrshrn_n_u16(q14u16, 7);
        d16u8 = vqrshrn_n_u16(q1u16, 7);
        d17u8 = vqrshrn_n_u16(q2u16, 7);
        d18u8 = vqrshrn_n_u16(q3u16, 7);
        d19u8 = vqrshrn_n_u16(q4u16, 7);

        q5u8 = vcombine_u8(d10u8, d11u8);
        q6u8 = vcombine_u8(d12u8, d13u8);
        q7u8 = vcombine_u8(d14u8, d15u8);
        q8u8 = vcombine_u8(d16u8, d17u8);
        q9u8 = vcombine_u8(d18u8, d19u8);

        vst1q_u8((uint8_t *)tmpp, q5u8);
        tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q6u8);
        tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q7u8);
        tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q8u8);
        tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q9u8);

        // secondpass_filter
        d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
        d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);

        tmpp = tmp;
        tmpp2 = tmpp + 272;
        q11u8 = vld1q_u8(tmpp);
        tmpp += 16;
        for (i = 4; i > 0; i--) {
            q12u8 = vld1q_u8(tmpp);
            tmpp += 16;
            q13u8 = vld1q_u8(tmpp);
            tmpp += 16;
            q14u8 = vld1q_u8(tmpp);
            tmpp += 16;
            q15u8 = vld1q_u8(tmpp);
            tmpp += 16;

            q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
            q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
            q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
            q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
            q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
            q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
            q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
            q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);

            q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
            q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
            q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
            q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
            q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
            q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
            q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
            q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);

            d2u8 = vqrshrn_n_u16(q1u16, 7);
            d3u8 = vqrshrn_n_u16(q2u16, 7);
            d4u8 = vqrshrn_n_u16(q3u16, 7);
            d5u8 = vqrshrn_n_u16(q4u16, 7);
            d6u8 = vqrshrn_n_u16(q5u16, 7);
            d7u8 = vqrshrn_n_u16(q6u16, 7);
            d8u8 = vqrshrn_n_u16(q7u16, 7);
            d9u8 = vqrshrn_n_u16(q8u16, 7);

            q1u8 = vcombine_u8(d2u8, d3u8);
            q2u8 = vcombine_u8(d4u8, d5u8);
            q3u8 = vcombine_u8(d6u8, d7u8);
            q4u8 = vcombine_u8(d8u8, d9u8);

            q11u8 = q15u8;

            vst1q_u8((uint8_t *)tmpp2, q1u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q2u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q3u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q4u8);
            tmpp2 += 16;
        }
    }

    // sub_pixel_variance16x16_neon
    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    tmpp = tmp + 272;
    for (i = 0; i < 8; i++) {  // sub_pixel_variance16x16_neon_loop
        q0u8 = vld1q_u8(tmpp);
        tmpp += 16;
        q1u8 = vld1q_u8(tmpp);
        tmpp += 16;
        q2u8 = vld1q_u8(dst_ptr);
        dst_ptr += dst_pixels_per_line;
        q3u8 = vld1q_u8(dst_ptr);
        dst_ptr += dst_pixels_per_line;

        d0u8 = vget_low_u8(q0u8);
        d1u8 = vget_high_u8(q0u8);
        d2u8 = vget_low_u8(q1u8);
        d3u8 = vget_high_u8(q1u8);

        q11u16 = vsubl_u8(d0u8, vget_low_u8(q2u8));
        q12u16 = vsubl_u8(d1u8, vget_high_u8(q2u8));
        q13u16 = vsubl_u8(d2u8, vget_low_u8(q3u8));
        q14u16 = vsubl_u8(d3u8, vget_high_u8(q3u8));

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vget_low_s64(q0s64);
    d1s64 = vget_high_s64(q0s64);
    d2s64 = vget_low_s64(q1s64);
    d3s64 = vget_high_s64(q1s64);
    d0s64 = vadd_s64(d0s64, d1s64);
    d1s64 = vadd_s64(d2s64, d3s64);

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}
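Both filter passes above compute the same two-tap bilinear filter: vmull_u8/vmlal_u8 accumulate a*c0 + b*c1 in 16 bits, and vqrshrn_n_u16(..., 7) rounds, shifts by 7, and saturates back to u8. The two taps taken from bilinear_taps_coeff sum to 128, so each output is a weighted average. A scalar sketch of one output pixel (hypothetical helper):

#include <stdint.h>

/* Hypothetical scalar equivalent of one bilinear output pixel. */
static uint8_t bilinear_tap(uint8_t a, uint8_t b, uint8_t c0, uint8_t c1)
{
    uint32_t acc = (uint32_t)a * c0 + (uint32_t)b * c1;  /* vmull/vmlal_u8 */
    acc = (acc + 64) >> 7;                               /* vqrshrn_n_u16(.., 7) */
    return acc > 255 ? 255 : (uint8_t)acc;               /* saturating narrow */
}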
Example #6
static inline void PostShiftAndDivideAndDemodulateNeon(int16_t* inre,
                                                       int16_t* inim,
                                                       int32_t* outre1,
                                                       int32_t* outre2,
                                                       int32_t sh) {
  int k;
  int16_t* p_inre = inre;
  int16_t* p_inim = inim;
  int32_t* p_outre1 = outre1;
  int32_t* p_outre2 = outre2;
  const int16_t* kCosTab = &WebRtcIsacfix_kCosTab1[0];
  const int16_t* kSinTab = &WebRtcIsacfix_kSinTab1[0];
  int32x4_t shift = vdupq_n_s32(-sh - 16);
  // Divide through by the normalizing constant:
  // scale all values with 1/240, i.e. with 273 in Q16.
  // 273/65536 ~= 0.0041656
  // 1/240 ~= 0.0041666
  int16x8_t scale = vdupq_n_s16(273);
  // Sqrt(240) in Q11 is round(15.49193338482967 * 2048) = 31727.
  int factQ19 = 31727 << 16;
  int32x4_t fact = vdupq_n_s32(factQ19);

  for (k = 0; k < FRAMESAMPLES/2; k += 8) {
    int16x8_t inre16x8 = vld1q_s16(p_inre);
    int16x8_t inim16x8 = vld1q_s16(p_inim);
    p_inre += 8;
    p_inim += 8;
    int16x8_t tmpr = vld1q_s16(kCosTab);
    int16x8_t tmpi = vld1q_s16(kSinTab);
    kCosTab += 8;
    kSinTab += 8;
    // By vshl and vmull, we effectively did "<< (-sh - 16)",
    // instead of "<< (-sh)" and ">> 16" as in the C code.
    int32x4_t outre1_0 = vmull_s16(vget_low_s16(inre16x8), vget_low_s16(scale));
    int32x4_t outre2_0 = vmull_s16(vget_low_s16(inim16x8), vget_low_s16(scale));
#if defined(WEBRTC_ARCH_ARM64)
    int32x4_t outre1_1 = vmull_high_s16(inre16x8, scale);
    int32x4_t outre2_1 = vmull_high_s16(inim16x8, scale);
#else
    int32x4_t outre1_1 = vmull_s16(vget_high_s16(inre16x8),
                                   vget_high_s16(scale));
    int32x4_t outre2_1 = vmull_s16(vget_high_s16(inim16x8),
                                   vget_high_s16(scale));
#endif

    outre1_0 = vshlq_s32(outre1_0, shift);
    outre1_1 = vshlq_s32(outre1_1, shift);
    outre2_0 = vshlq_s32(outre2_0, shift);
    outre2_1 = vshlq_s32(outre2_1, shift);

    // Demodulate and separate.
    int32x4_t tmpr_0 = vmovl_s16(vget_low_s16(tmpr));
    int32x4_t tmpi_0 = vmovl_s16(vget_low_s16(tmpi));
#if defined(WEBRTC_ARCH_ARM64)
    int32x4_t tmpr_1 = vmovl_high_s16(tmpr);
    int32x4_t tmpi_1 = vmovl_high_s16(tmpi);
#else
    int32x4_t tmpr_1 = vmovl_s16(vget_high_s16(tmpr));
    int32x4_t tmpi_1 = vmovl_s16(vget_high_s16(tmpi));
#endif

    int64x2_t xr0 = vmull_s32(vget_low_s32(tmpr_0), vget_low_s32(outre1_0));
    int64x2_t xi0 = vmull_s32(vget_low_s32(tmpr_0), vget_low_s32(outre2_0));
    int64x2_t xr2 = vmull_s32(vget_low_s32(tmpr_1), vget_low_s32(outre1_1));
    int64x2_t xi2 = vmull_s32(vget_low_s32(tmpr_1), vget_low_s32(outre2_1));
    xr0 = vmlsl_s32(xr0, vget_low_s32(tmpi_0), vget_low_s32(outre2_0));
    xi0 = vmlal_s32(xi0, vget_low_s32(tmpi_0), vget_low_s32(outre1_0));
    xr2 = vmlsl_s32(xr2, vget_low_s32(tmpi_1), vget_low_s32(outre2_1));
    xi2 = vmlal_s32(xi2, vget_low_s32(tmpi_1), vget_low_s32(outre1_1));

#if defined(WEBRTC_ARCH_ARM64)
    int64x2_t xr1 = vmull_high_s32(tmpr_0, outre1_0);
    int64x2_t xi1 = vmull_high_s32(tmpr_0, outre2_0);
    int64x2_t xr3 = vmull_high_s32(tmpr_1, outre1_1);
    int64x2_t xi3 = vmull_high_s32(tmpr_1, outre2_1);
    xr1 = vmlsl_high_s32(xr1, tmpi_0, outre2_0);
    xi1 = vmlal_high_s32(xi1, tmpi_0, outre1_0);
    xr3 = vmlsl_high_s32(xr3, tmpi_1, outre2_1);
    xi3 = vmlal_high_s32(xi3, tmpi_1, outre1_1);
#else
    int64x2_t xr1 = vmull_s32(vget_high_s32(tmpr_0), vget_high_s32(outre1_0));
    int64x2_t xi1 = vmull_s32(vget_high_s32(tmpr_0), vget_high_s32(outre2_0));
    int64x2_t xr3 = vmull_s32(vget_high_s32(tmpr_1), vget_high_s32(outre1_1));
    int64x2_t xi3 = vmull_s32(vget_high_s32(tmpr_1), vget_high_s32(outre2_1));
    xr1 = vmlsl_s32(xr1, vget_high_s32(tmpi_0), vget_high_s32(outre2_0));
    xi1 = vmlal_s32(xi1, vget_high_s32(tmpi_0), vget_high_s32(outre1_0));
    xr3 = vmlsl_s32(xr3, vget_high_s32(tmpi_1), vget_high_s32(outre2_1));
    xi3 = vmlal_s32(xi3, vget_high_s32(tmpi_1), vget_high_s32(outre1_1));
#endif

    outre1_0 = vcombine_s32(vshrn_n_s64(xr0, 10), vshrn_n_s64(xr1, 10));
    outre2_0 = vcombine_s32(vshrn_n_s64(xi0, 10), vshrn_n_s64(xi1, 10));
    outre1_1 = vcombine_s32(vshrn_n_s64(xr2, 10), vshrn_n_s64(xr3, 10));
    outre2_1 = vcombine_s32(vshrn_n_s64(xi2, 10), vshrn_n_s64(xi3, 10));
    outre1_0 = vqdmulhq_s32(outre1_0, fact);
    outre2_0 = vqdmulhq_s32(outre2_0, fact);
    outre1_1 = vqdmulhq_s32(outre1_1, fact);
    outre2_1 = vqdmulhq_s32(outre2_1, fact);

    vst1q_s32(p_outre1, outre1_0);
    p_outre1 += 4;
    vst1q_s32(p_outre1, outre1_1);
    p_outre1 += 4;
    vst1q_s32(p_outre2, outre2_0);
    p_outre2 += 4;
    vst1q_s32(p_outre2, outre2_1);
    p_outre2 += 4;
  }
}
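A scalar sketch of one lane of the loop body, assuming sh + 16 >= 0 and ignoring the saturation that vqdmulhq_s32 would apply at the extremes (names hypothetical):

#include <stdint.h>

/* Hypothetical scalar equivalent of one sample of the NEON loop. */
static void demod_one(int16_t re, int16_t im, int16_t cosv, int16_t sinv,
                      int sh, int32_t fact /* 31727 << 16 */,
                      int32_t *out_re, int32_t *out_im)
{
    int32_t r = (re * 273) >> (sh + 16);                 /* vmull_s16 + vshlq_s32 */
    int32_t q = (im * 273) >> (sh + 16);
    int64_t xr = (int64_t)cosv * r - (int64_t)sinv * q;  /* vmull/vmlsl_s32 */
    int64_t xi = (int64_t)cosv * q + (int64_t)sinv * r;  /* vmull/vmlal_s32 */
    int32_t xr32 = (int32_t)(xr >> 10);                  /* vshrn_n_s64(..., 10) */
    int32_t xi32 = (int32_t)(xi >> 10);
    /* vqdmulhq_s32(x, fact) == (x * fact * 2) >> 32, without saturation. */
    *out_re = (int32_t)(((int64_t)xr32 * fact * 2) >> 32);
    *out_im = (int32_t)(((int64_t)xi32 * fact * 2) >> 32);
}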
Example #7
unsigned int vp8_variance16x8_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    for (i = 0; i < 4; i++) {  // variance16x8_neon_loop
        q0u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q1u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        __builtin_prefetch(src_ptr);

        q2u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q3u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        __builtin_prefetch(ref_ptr);

        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}
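The reduction is identical to the 16x16 variants above except for vshr_n_u32(..., 7): a 16x8 block has 128 pixels, so the squared sum is divided by 128. A hypothetical smoke test for the function:

#include <string.h>

/* Hypothetical check: a block compared against itself must give
   variance 0 and sse 0. */
static unsigned int check_zero_variance(void)
{
    unsigned char buf[16 * 8];
    unsigned int sse;
    memset(buf, 128, sizeof(buf));
    return vp8_variance16x8_neon(buf, 16, buf, 16, &sse);  /* expect 0 */
}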
void BQ_2I_D32F32C30_TRC_WRA_01(Biquad_Instance_t *pInstance,
                                LVM_INT32         *pDataIn,
                                LVM_INT32         *pDataOut,
                                LVM_INT16         NrSamples)
{
#if !defined(__ARM_HAVE_NEON)
        LVM_INT32 ynL,ynR,templ,tempd;
        LVM_INT16 ii;
        PFilter_State pBiquadState = (PFilter_State) pInstance;

         for (ii = NrSamples; ii != 0; ii--)
         {


            /**************************************************************************
                            PROCESSING OF THE LEFT CHANNEL
            ***************************************************************************/
            /* ynL= ( A2 (Q30) * x(n-2)L (Q0) ) >>30 in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[0],pBiquadState->pDelays[2],ynL,30)

            /* ynL+= ( A1 (Q30) * x(n-1)L (Q0) ) >> 30 in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[1],pBiquadState->pDelays[0],templ,30)
            ynL+=templ;

            /* ynL+= ( A0 (Q30) * x(n)L (Q0) ) >> 30 in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[2],*pDataIn,templ,30)
            ynL+=templ;

             /* ynL+= (-B2 (Q30) * y(n-2)L (Q0) ) >> 30 in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[3],pBiquadState->pDelays[6],templ,30)
            ynL+=templ;

            /* ynL+= (-B1 (Q30) * y(n-1)L (Q0) ) >> 30 in Q0 */
            MUL32x32INTO32(pBiquadState->coefs[4],pBiquadState->pDelays[4],templ,30)
            ynL+=templ;

            /**************************************************************************
                            PROCESSING OF THE RIGHT CHANNEL
            ***************************************************************************/
            /* ynR= ( A2 (Q30) * x(n-2)R (Q0) ) >> 30 in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[0],pBiquadState->pDelays[3],ynR,30)

            /* ynR+= ( A1 (Q30) * x(n-1)R (Q0) ) >> 30  in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[1],pBiquadState->pDelays[1],templ,30)
            ynR+=templ;

            /* ynR+= ( A0 (Q30) * x(n)R (Q0) ) >> 30 in Q0*/
            tempd=*(pDataIn+1);
            MUL32x32INTO32(pBiquadState->coefs[2],tempd,templ,30)
            ynR+=templ;

            /* ynR+= (-B2 (Q30) * y(n-2)R (Q0) ) >> 30 in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[3],pBiquadState->pDelays[7],templ,30)
            ynR+=templ;

            /* ynR+= (-B1 (Q30) * y(n-1)R (Q0) ) >> 30 in Q0 */
            MUL32x32INTO32(pBiquadState->coefs[4],pBiquadState->pDelays[5],templ,30)
            ynR+=templ;

            /**************************************************************************
                            UPDATING THE DELAYS
            ***************************************************************************/
            pBiquadState->pDelays[7]=pBiquadState->pDelays[5]; /* y(n-2)R=y(n-1)R*/
            pBiquadState->pDelays[6]=pBiquadState->pDelays[4]; /* y(n-2)L=y(n-1)L*/
            pBiquadState->pDelays[3]=pBiquadState->pDelays[1]; /* x(n-2)R=x(n-1)R*/
            pBiquadState->pDelays[2]=pBiquadState->pDelays[0]; /* x(n-2)L=x(n-1)L*/
            pBiquadState->pDelays[5]=(LVM_INT32)ynR; /* Update y(n-1)R in Q0*/
            pBiquadState->pDelays[4]=(LVM_INT32)ynL; /* Update y(n-1)L in Q0*/
            pBiquadState->pDelays[0]=(*pDataIn); /* Update x(n-1)L in Q0*/
            pDataIn++;
            pBiquadState->pDelays[1]=(*pDataIn); /* Update x(n-1)R in Q0*/
            pDataIn++;

            /**************************************************************************
                            WRITING THE OUTPUT
            ***************************************************************************/
            *pDataOut=(LVM_INT32)ynL; /* Write Left output in Q0*/
            pDataOut++;
            *pDataOut=(LVM_INT32)ynR; /* Write Right output in Q0*/
            pDataOut++;


        }
#else
        LVM_INT16 ii;
        PFilter_State pBiquadState = (PFilter_State) pInstance;

        int32x2_t A2 = vdup_n_s32(pBiquadState->coefs[0]);
        int32x2_t A1 = vdup_n_s32(pBiquadState->coefs[1]);
        int32x2_t A0 = vdup_n_s32(pBiquadState->coefs[2]);
        int32x2_t B2 = vdup_n_s32(pBiquadState->coefs[3]);    /* holds -B2 */
        int32x2_t B1 = vdup_n_s32(pBiquadState->coefs[4]);    /* holds -B1 */

        int32x2_t X_2 = vld1_s32(&pBiquadState->pDelays[2]);  /* x(n-2) L/R */
        int32x2_t X_1 = vld1_s32(&pBiquadState->pDelays[0]);  /* x(n-1) L/R */
        int32x2_t Y_2 = vld1_s32(&pBiquadState->pDelays[6]);  /* y(n-2) L/R */
        int32x2_t Y_1 = vld1_s32(&pBiquadState->pDelays[4]);  /* y(n-1) L/R */

        for (ii = 0; ii < NrSamples; ii++) {
            /* Both channels in one 64x2 accumulator. */
            int32x2_t s = vld1_s32(pDataIn);
            int64x2_t r = vmull_s32(A2, X_2);
            r = vmlal_s32(r, A1, X_1);
            r = vmlal_s32(r, A0, s);
            r = vmlal_s32(r, B2, Y_2);
            r = vmlal_s32(r, B1, Y_1);
            int32_t ll = (int32_t)(vgetq_lane_s64(r, 0) >> 30);  /* Q30 -> Q0 */
            int32_t rr = (int32_t)(vgetq_lane_s64(r, 1) >> 30);
            pDataIn += 2;
            *pDataOut++ = ll;
            *pDataOut++ = rr;

            /* Shift the delay line. vset_lane_s32 returns the updated
               vector, so its result must be assigned back. */
            X_2 = X_1;                        /* x(n-2) = x(n-1) */
            Y_2 = Y_1;                        /* y(n-2) = y(n-1) */
            X_1 = s;                          /* x(n-1) = x(n)   */
            Y_1 = vset_lane_s32(ll, Y_1, 0);  /* y(n-1)L = ynL   */
            Y_1 = vset_lane_s32(rr, Y_1, 1);  /* y(n-1)R = ynR   */
        }
        vst1_s32(&pBiquadState->pDelays[2], X_2);
        vst1_s32(&pBiquadState->pDelays[0], X_1);
        vst1_s32(&pBiquadState->pDelays[6], Y_2);
        vst1_s32(&pBiquadState->pDelays[4], Y_1);
#endif

}
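For reference, a scalar sketch of the filter both paths implement: a stereo direct-form-I biquad in Q30, with coefs[] holding {A2, A1, A0, -B2, -B1} so every term is accumulated by addition, and pDelays[] laid out as {x(n-1)L, x(n-1)R, x(n-2)L, x(n-2)R, y(n-1)L, y(n-1)R, y(n-2)L, y(n-2)R}. Names are hypothetical:

#include <stdint.h>

/* Hypothetical scalar core for one stereo sample pair. */
static void biquad_step(const int32_t coefs[5], int32_t delays[8],
                        const int32_t *in, int32_t *out)
{
    int ch;
    for (ch = 0; ch < 2; ch++) {
        int64_t acc = (int64_t)coefs[0] * delays[2 + ch]   /*  A2 * x(n-2) */
                    + (int64_t)coefs[1] * delays[0 + ch]   /*  A1 * x(n-1) */
                    + (int64_t)coefs[2] * in[ch]           /*  A0 * x(n)   */
                    + (int64_t)coefs[3] * delays[6 + ch]   /* -B2 * y(n-2) */
                    + (int64_t)coefs[4] * delays[4 + ch];  /* -B1 * y(n-1) */
        int32_t y = (int32_t)(acc >> 30);                  /* Q30 -> Q0 */
        delays[2 + ch] = delays[0 + ch];   /* x(n-2) = x(n-1) */
        delays[6 + ch] = delays[4 + ch];   /* y(n-2) = y(n-1) */
        delays[0 + ch] = in[ch];           /* x(n-1) = x(n)   */
        delays[4 + ch] = y;                /* y(n-1) = y(n)   */
        out[ch] = y;
    }
}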