int32x2_t sub_abs_to_vabd_32()
{
  int32x2_t val1 = vdup_n_s32 (10);
  int32x2_t val2 = vdup_n_s32 (30);
  int32x2_t sres = vsub_s32(val1, val2);
  int32x2_t res = vabs_s32 (sres);

   return res;
}
Пример #2
0
void test_vdup_ns32 (void)
{
  int32x2_t out_int32x2_t;
  int32_t arg0_int32_t;

  out_int32x2_t = vdup_n_s32 (arg0_int32_t);
}
Пример #3
0
inline int32x2_t cv_vrnd_s32_f32(float32x2_t v)
{
    static int32x2_t v_sign = vdup_n_s32(1 << 31),
        v_05 = vreinterpret_s32_f32(vdup_n_f32(0.5f));

    int32x2_t v_addition = vorr_s32(v_05, vand_s32(v_sign, vreinterpret_s32_f32(v)));
    return vcvt_s32_f32(vadd_f32(v, vreinterpret_f32_s32(v_addition)));
}
Пример #4
0
static inline void silk_biquad_alt_stride2_kernel( const int32x4_t A_L_s32x4, const int32x4_t A_U_s32x4, const int32x4_t B_Q28_s32x4, const int32x2_t t_s32x2, const int32x4_t in_s32x4, int32x4_t *S_s32x4, int32x2_t *out32_Q14_s32x2 )
{
    int32x4_t t_s32x4, out32_Q14_s32x4;

    *out32_Q14_s32x2 = vadd_s32( vget_low_s32( *S_s32x4 ), t_s32x2 );              /* silk_SMLAWB( S{0,1}, B_Q28[ 0 ], in{0,1} )                                      */
    *S_s32x4         = vcombine_s32( vget_high_s32( *S_s32x4 ), vdup_n_s32( 0 ) ); /* S{0,1} = S{2,3}; S{2,3} = 0;                                                    */
    *out32_Q14_s32x2 = vshl_n_s32( *out32_Q14_s32x2, 2 );                          /* out32_Q14_{0,1} = silk_LSHIFT( silk_SMLAWB( S{0,1}, B_Q28[ 0 ], in{0,1} ), 2 ); */
    out32_Q14_s32x4  = vcombine_s32( *out32_Q14_s32x2, *out32_Q14_s32x2 );         /* out32_Q14_{0,1,0,1}                                                             */
    t_s32x4          = vqdmulhq_s32( out32_Q14_s32x4, A_L_s32x4 );                 /* silk_SMULWB( out32_Q14_{0,1,0,1}, A{0,0,1,1}_L_Q28 )                            */
    *S_s32x4         = vrsraq_n_s32( *S_s32x4, t_s32x4, 14 );                      /* S{0,1} = S{2,3} + silk_RSHIFT_ROUND();  S{2,3} = silk_RSHIFT_ROUND();           */
    t_s32x4          = vqdmulhq_s32( out32_Q14_s32x4, A_U_s32x4 );                 /* silk_SMULWB( out32_Q14_{0,1,0,1}, A{0,0,1,1}_U_Q28 )                            */
    *S_s32x4         = vaddq_s32( *S_s32x4, t_s32x4 );                             /* S0 = silk_SMLAWB( S{0,1,2,3}, out32_Q14_{0,1,0,1}, A{0,0,1,1}_U_Q28 );          */
    t_s32x4          = vqdmulhq_s32( in_s32x4, B_Q28_s32x4 );                      /* silk_SMULWB( B_Q28[ {1,1,2,2} ], in{0,1,0,1} )                                  */
    *S_s32x4         = vaddq_s32( *S_s32x4, t_s32x4 );                             /* S0 = silk_SMLAWB( S0, B_Q28[ {1,1,2,2} ], in{0,1,0,1} );                        */
}
Пример #5
0
int32x2_t test_vdup_n_s32(int32_t v1) {
  // CHECK: test_vdup_n_s32
  return vdup_n_s32(v1);
  // CHECK: dup {{v[0-9]+}}.2s, {{w[0-9]+}}
}
Пример #6
0
void silk_biquad_alt_stride2_neon(
    const opus_int16            *in,                /* I     input signal                                               */
    const opus_int32            *B_Q28,             /* I     MA coefficients [3]                                        */
    const opus_int32            *A_Q28,             /* I     AR coefficients [2]                                        */
    opus_int32                  *S,                 /* I/O   State vector [4]                                           */
    opus_int16                  *out,               /* O     output signal                                              */
    const opus_int32            len                 /* I     signal length (must be even)                               */
)
{
    /* DIRECT FORM II TRANSPOSED (uses 2 element state vector) */
    opus_int        k            = 0;
    const int32x2_t offset_s32x2 = vdup_n_s32( (1<<14) - 1 );
    const int32x4_t offset_s32x4 = vcombine_s32( offset_s32x2, offset_s32x2 );
    int16x4_t       in_s16x4  = vdup_n_s16( 0 );
    int16x4_t       out_s16x4;
    int32x2_t       A_Q28_s32x2, A_L_s32x2, A_U_s32x2, B_Q28_s32x2, t_s32x2;
    int32x4_t       A_L_s32x4, A_U_s32x4, B_Q28_s32x4, S_s32x4, out32_Q14_s32x4;
    int32x2x2_t     t0_s32x2x2, t1_s32x2x2, t2_s32x2x2, S_s32x2x2;

#ifdef OPUS_CHECK_ASM
    opus_int32 S_c[ 4 ];
    VARDECL( opus_int16, out_c );
    SAVE_STACK;
    ALLOC( out_c, 2 * len, opus_int16 );

    silk_memcpy( &S_c, S, sizeof( S_c ) );
    silk_biquad_alt_stride2_c( in, B_Q28, A_Q28, S_c, out_c, len );
#endif

    /* Negate A_Q28 values and split in two parts */
    A_Q28_s32x2 = vld1_s32( A_Q28 );
    A_Q28_s32x2 = vneg_s32( A_Q28_s32x2 );
    A_L_s32x2   = vshl_n_s32( A_Q28_s32x2, 18 );                                                        /* ( -A_Q28[] & 0x00003FFF ) << 18                                                     */
    A_L_s32x2   = vreinterpret_s32_u32( vshr_n_u32( vreinterpret_u32_s32( A_L_s32x2 ), 3 ) );           /* ( -A_Q28[] & 0x00003FFF ) << 15                                                     */
    A_U_s32x2   = vshr_n_s32( A_Q28_s32x2, 14 );                                                        /* silk_RSHIFT( -A_Q28[], 14 )                                                         */
    A_U_s32x2   = vshl_n_s32( A_U_s32x2, 16 );                                                          /* silk_RSHIFT( -A_Q28[], 14 ) << 16 (Clip two leading bits to conform to C function.) */
    A_U_s32x2   = vshr_n_s32( A_U_s32x2, 1 );                                                           /* silk_RSHIFT( -A_Q28[], 14 ) << 15                                                   */

    B_Q28_s32x2  = vld1_s32( B_Q28 );
    t_s32x2      = vld1_s32( B_Q28 + 1 );
    t0_s32x2x2   = vzip_s32( A_L_s32x2, A_L_s32x2 );
    t1_s32x2x2   = vzip_s32( A_U_s32x2, A_U_s32x2 );
    t2_s32x2x2   = vzip_s32( t_s32x2, t_s32x2 );
    A_L_s32x4    = vcombine_s32( t0_s32x2x2.val[ 0 ], t0_s32x2x2.val[ 1 ] );                            /* A{0,0,1,1}_L_Q28          */
    A_U_s32x4    = vcombine_s32( t1_s32x2x2.val[ 0 ], t1_s32x2x2.val[ 1 ] );                            /* A{0,0,1,1}_U_Q28          */
    B_Q28_s32x4  = vcombine_s32( t2_s32x2x2.val[ 0 ], t2_s32x2x2.val[ 1 ] );                            /* B_Q28[ {1,1,2,2} ]        */
    S_s32x4      = vld1q_s32( S );                                                                      /* S0 = S[ 0 ]; S3 = S[ 3 ]; */
    S_s32x2x2    = vtrn_s32( vget_low_s32( S_s32x4 ), vget_high_s32( S_s32x4 ) );                       /* S2 = S[ 1 ]; S1 = S[ 2 ]; */
    S_s32x4      = vcombine_s32( S_s32x2x2.val[ 0 ], S_s32x2x2.val[ 1 ] );

    for( ; k < len - 1; k += 2 ) {
        int32x4_t in_s32x4[ 2 ], t_s32x4;
        int32x2_t out32_Q14_s32x2[ 2 ];

        /* S[ 2 * i + 0 ], S[ 2 * i + 1 ], S[ 2 * i + 2 ], S[ 2 * i + 3 ]: Q12 */
        in_s16x4      = vld1_s16( &in[ 2 * k ] );                                                       /* in{0,1,2,3} = in[ 2 * k + {0,1,2,3} ]; */
        in_s32x4[ 0 ] = vshll_n_s16( in_s16x4, 15 );                                                    /* in{0,1,2,3} << 15                      */
        t_s32x4       = vqdmulhq_lane_s32( in_s32x4[ 0 ], B_Q28_s32x2, 0 );                             /* silk_SMULWB( B_Q28[ 0 ], in{0,1,2,3} ) */
        in_s32x4[ 1 ] = vcombine_s32( vget_high_s32( in_s32x4[ 0 ] ), vget_high_s32( in_s32x4[ 0 ] ) ); /* in{2,3,2,3} << 15                      */
        in_s32x4[ 0 ] = vcombine_s32( vget_low_s32 ( in_s32x4[ 0 ] ), vget_low_s32 ( in_s32x4[ 0 ] ) ); /* in{0,1,0,1} << 15                      */
        silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_low_s32 ( t_s32x4 ), in_s32x4[ 0 ], &S_s32x4, &out32_Q14_s32x2[ 0 ] );
        silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_high_s32( t_s32x4 ), in_s32x4[ 1 ], &S_s32x4, &out32_Q14_s32x2[ 1 ] );

        /* Scale back to Q0 and saturate */
        out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2[ 0 ], out32_Q14_s32x2[ 1 ] );                   /* out32_Q14_{0,1,2,3}                                                                                        */
        out32_Q14_s32x4 = vaddq_s32( out32_Q14_s32x4, offset_s32x4 );                                   /* out32_Q14_{0,1,2,3} + (1<<14) - 1                                                                          */
        out_s16x4       = vqshrn_n_s32( out32_Q14_s32x4, 14 );                                          /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) )                             */
        vst1_s16( &out[ 2 * k ], out_s16x4 );                                                           /* out[ 2 * k + {0,1,2,3} ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ); */
    }

    /* Process leftover. */
    if( k < len ) {
        int32x4_t in_s32x4;
        int32x2_t out32_Q14_s32x2;

        /* S[ 2 * i + 0 ], S[ 2 * i + 1 ]: Q12 */
        in_s16x4     = vld1_lane_s16( &in[ 2 * k + 0 ], in_s16x4, 0 );                                  /* in{0,1} = in[ 2 * k + {0,1} ];     */
        in_s16x4     = vld1_lane_s16( &in[ 2 * k + 1 ], in_s16x4, 1 );                                  /* in{0,1} = in[ 2 * k + {0,1} ];     */
        in_s32x4     = vshll_n_s16( in_s16x4, 15 );                                                     /* in{0,1} << 15                      */
        t_s32x2      = vqdmulh_lane_s32( vget_low_s32( in_s32x4 ), B_Q28_s32x2, 0 );                    /* silk_SMULWB( B_Q28[ 0 ], in{0,1} ) */
        in_s32x4     = vcombine_s32( vget_low_s32( in_s32x4 ), vget_low_s32( in_s32x4 ) );              /* in{0,1,0,1} << 15                  */
        silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, t_s32x2, in_s32x4, &S_s32x4, &out32_Q14_s32x2 );

        /* Scale back to Q0 and saturate */
        out32_Q14_s32x2 = vadd_s32( out32_Q14_s32x2, offset_s32x2 );                                    /* out32_Q14_{0,1} + (1<<14) - 1                                                              */
        out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2, out32_Q14_s32x2 );                             /* out32_Q14_{0,1,0,1} + (1<<14) - 1                                                          */
        out_s16x4       = vqshrn_n_s32( out32_Q14_s32x4, 14 );                                          /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,0,1} + (1<<14) - 1, 14 ) )             */
        vst1_lane_s16( &out[ 2 * k + 0 ], out_s16x4, 0 );                                               /* out[ 2 * k + 0 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_0 + (1<<14) - 1, 14 ) ); */
        vst1_lane_s16( &out[ 2 * k + 1 ], out_s16x4, 1 );                                               /* out[ 2 * k + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_1 + (1<<14) - 1, 14 ) ); */
    }

    vst1q_lane_s32( &S[ 0 ], S_s32x4, 0 );                                                              /* S[ 0 ] = S0; */
    vst1q_lane_s32( &S[ 1 ], S_s32x4, 2 );                                                              /* S[ 1 ] = S2; */
    vst1q_lane_s32( &S[ 2 ], S_s32x4, 1 );                                                              /* S[ 2 ] = S1; */
    vst1q_lane_s32( &S[ 3 ], S_s32x4, 3 );                                                              /* S[ 3 ] = S3; */

#ifdef OPUS_CHECK_ASM
    silk_assert( !memcmp( S_c, S, sizeof( S_c ) ) );
    silk_assert( !memcmp( out_c, out, 2 * len * sizeof( opus_int16 ) ) );
    RESTORE_STACK;
#endif
}
Пример #7
0
inline   int32x2_t vdup_n(const s32 & val) { return vdup_n_s32(val); }
void mdrc5b_apply_limiter(MDRC5B_LOCAL_STRUCT_T *HeapPtr)
{
    unsigned int LaIdx;
    unsigned int NumMainCh;
    unsigned int Samples;
    unsigned int ch, k, n;
    MMlong       *Ptr;
    MMlong       *Ptr2;

    MMlong       *MemOutPtr;
    MMshort      PeakdB;
    MMlong       PeakMax;
    int          RmsMeasure;
    MMshort      LimiterAtCoef;
    MMshort      LimiterReCoef;
    MMshort      LimiterGainMant[MDRC5B_BLOCK_SIZE + 1];
    MMshort      LimiterGainExp;
    MMshort      LimiterTargetGaindB;
    unsigned int LimiterHoldRem;
    unsigned int LimiterHtSamp;
    MMshort      Exp, TargetGain;
    MMshort      MaxShiftBits;
    unsigned int lookahead_len = (unsigned int) HeapPtr->LimiterLALen;
    unsigned int cpt1, cpt2;
    uint32x2x2_t Temp_u32x2x2;
    uint32x2_t   Ldbits_u32x2, Ldbits2_u32x2;
    uint32x2_t   bsl_u32x2;
    int32x2_t    LimGainMant_32x2;
    int64x2_t    TempX_64x2, MemOut_64x2;
    int64x2_t    Tmp_64x2;
    int64x2_t    LimiterGainExp_64x2, sample_64x2;
    int64x1_t    TempX_64x1, sample_64x1;
    int32_t      *LimiterGainMant_ptr;
    int32x2_t    Tmp_32x2, Ldbits_32x2, n_32x2;
    int32x2_t    TempX_low_32x2, TempX_high_32x2;
    int32x2x2_t  Tmp_32x2x2;
    int64x1_t    Peak_64x1, PeakMax_64x1, Tmp_64x1, diffX_64x1;
    int64x1_t    Peak_scale_pow_64x1, Peak_scale_64x1, Zero_s64x1;
    int64x1_t    MaxShiftBits_neg_64x1, MaxShiftBits_hd_64x1;
    int64x2_t    diffX_64x2;
    uint64x1_t   bsl_u64x1;
    int32x2_t    LimiterPeakCoef_32x2, diffX_low_32x2, diffX_high_32x2;
    int32x2_t    TargetGain_32x2;
    uint32x2x2_t Peak_u32x2x2;
    uint32x2_t   Peak_exp_u32x2, Peak_exp2_u32x2, Peak_mant_u32x2;
    int32x2_t    x_32x2, xn_32x2, PeakdB_32x2, Peak_exp_32x2;
    int32x2_t    LimiterTargetGaindB_32x2, Exp_32x2, LimiterCoef_32x2;
    int32x4_t    Tmp_32x4;


    START_PMU_MEASURE(PMU_MEASURE_MRDC5B_APPLY_LIMITER)

    START_PMU_MEASURE(PMU_MEASURE_MRDC5B_LIMITER_COMPUTE_MAX_SHIFT_LEFT)

    Samples   = (unsigned int) HeapPtr->BlockSize;
    NumMainCh = (unsigned int) HeapPtr->NumMainCh;

    TempX_64x2 = vdupq_n_s64(0);
    for(ch = 0; ch < NumMainCh; ch++)
    {
        Ptr = HeapPtr->MainInBuf[ch];
        // compute the number of bits needs to be shifted to avoid overflow
        for(k = (Samples >> 1); k > 0; k--)
        {
            sample_64x2 = vld1q_s64(Ptr);
            Ptr        +=2;
            sample_64x2 = veorq_s64(sample_64x2, vshrq_n_s64(sample_64x2, 63));
            TempX_64x2  = vorrq_s64(TempX_64x2, sample_64x2);
        }
        if(Samples & 1)
        {
            sample_64x1 = vld1_s64(Ptr);
            sample_64x1 = veor_s64(sample_64x1, vshr_n_s64(sample_64x1, 63));
            TempX_64x2  = vorrq_s64(TempX_64x2, vcombine_s64(sample_64x1, sample_64x1));
        }
    }
    TempX_64x1    = vorr_s64(vget_low_s64(TempX_64x2), vget_high_s64(TempX_64x2));
    Temp_u32x2x2  = vuzp_u32(vreinterpret_u32_s64(TempX_64x1), vreinterpret_u32_s64(TempX_64x1));
    bsl_u32x2     = vceq_u32(Temp_u32x2x2.val[1], vdup_n_u32(0));                  // MSB == 0 ?
    // use clz instead of cls because we are sure that input value is positive
    // and because cls(LSB) could be wrong (if MSB is equal to 0 and bit 31 of LSL is 1)
    // thus clz result will be 1 more than cls result (that's why you may see (Ldbits - 1)
    // instead of Ldbits below)
    Ldbits_u32x2  = vadd_u32(vclz_u32(Temp_u32x2x2.val[0]), vdup_n_u32(32));       // clz(LSB)+32
    Ldbits2_u32x2 = vclz_u32(Temp_u32x2x2.val[1]);                                 // clz(MSB)
    Ldbits_u32x2  = vbsl_u32(bsl_u32x2, Ldbits_u32x2, Ldbits2_u32x2);              // MSB == 0 ? clz(LSB)+32 : clz(MSB)
    bsl_u32x2     = vceq_u32(Ldbits_u32x2, vdup_n_u32(64));                        // Ldbits == 64 ? (i.e. TempX == 0 ?)
    // the aim of MaxShiftBits is that sample will be shifted so that it occupies
    // 24 significant bits for 24 bits samples or 32 significant bits for 32 bits samples
    // but we are in 64 bits architecture on CA9/NEON
    // so we must right shift of ((64 - 24) - (Ldbits - 1)) bits for 24 bits samples
    // or of ((64 - 32) - (Ldbits - 1)) bits for 32 bits samples
    // and we add 1 because it was done this way on MMDSP (I don't know why !)
#ifdef SAMPLES_24_BITS
    // MaxShiftBits = ((64 - 24) - (Ldbits - 1)) + 1
    //              = 42 - Ldbits
    Ldbits_32x2     = vsub_s32(vdup_n_s32(42), vreinterpret_s32_u32(Ldbits_u32x2));
#else // SAMPLES_24_BITS
    // MaxShiftBits = ((64 - 32) - (Ldbits - 1)) + 1
    //              = 34 - Ldbits
    Ldbits_32x2     = vsub_s32(vdup_n_s32(34), vreinterpret_s32_u32(Ldbits_u32x2));
#endif // SAMPLES_24_BITS
    Ldbits_32x2     = vmax_s32(vdup_n_s32(1), Ldbits_32x2);
    Ldbits_32x2     = vbsl_s32(bsl_u32x2, vdup_n_s32(1), Ldbits_32x2);              // if(TempX == 0) Ldbits = 1
    MaxShiftBits    = vget_lane_s32(Ldbits_32x2, 0);

    STOP_PMU_MEASURE(PMU_MEASURE_MRDC5B_LIMITER_COMPUTE_MAX_SHIFT_LEFT)
#ifdef DEBUG_LIMITER_OUTPUT
    if((debug_cpt_samples >= DEBUG_CPT_MIN) && (debug_cpt_samples <= DEBUG_CPT_MAX))
    {
        char string[100];

        debug_write_string("MRDC5B_LIMITER_COMPUTE_MAX_SHIFT_LEFT\n");
        sprintf(string, "MaxShiftBits=%d\n", MaxShiftBits);
        debug_write_string(string);
    }
#endif  // DEBUG_LIMITER_OUTPUT


    START_PMU_MEASURE(PMU_MEASURE_MRDC5B_LIMITER_INSERT_NEW_SUBBAND)

    // insert the new subband samples into the lookahead buffers
    RmsMeasure = HeapPtr->Limiter.RmsMeasure;

    LaIdx = (unsigned int) HeapPtr->LimiterLaIdx;
    if(LaIdx + Samples >= lookahead_len)
    {
        cpt1                  = lookahead_len - LaIdx;
        cpt2                  = Samples - cpt1;
        // update index
        HeapPtr->LimiterLaIdx = (int) cpt2;
    }
    else
    {
        cpt1                  = Samples;
        cpt2                  = 0;
        // update index
        HeapPtr->LimiterLaIdx = (int) (LaIdx + Samples);
    }

    LimiterPeakCoef_32x2  = vdup_n_s32(HeapPtr->LimiterPeakAtCoef);                               // LimiterPeakAtCoef, LimiterPeakAtCoef
    LimiterPeakCoef_32x2  = vset_lane_s32(HeapPtr->LimiterPeakReCoef, LimiterPeakCoef_32x2, 1);   // LimiterPeakAtCoef, LimiterPeakReCoef
    Peak_scale_64x1       = vdup_n_s64(HeapPtr->PrevShiftBits - MaxShiftBits);
    Peak_scale_pow_64x1   = vshl_n_s64(Peak_scale_64x1, 1);
    MaxShiftBits_neg_64x1 = vdup_n_s64(-MaxShiftBits);
#ifdef SAMPLES_24_BITS
    MaxShiftBits_hd_64x1  = vdup_n_s64(24 - MaxShiftBits);
#else // SAMPLES_24_BITS
    MaxShiftBits_hd_64x1  = vdup_n_s64(32 - MaxShiftBits);
#endif // SAMPLES_24_BITS
    PeakMax_64x1          = vdup_n_s64(0);

    for(ch = 0; ch < NumMainCh; ch++)
    {
        Ptr  = HeapPtr->MainInBuf[ch];
        Ptr2 = HeapPtr->LimiterLABuf[ch] + LaIdx;  // go to the first valid sample

        Peak_64x1 = vdup_n_s64(HeapPtr->LimiterPeak[ch]);
        if(RmsMeasure)
        {
            // compensate Peak according to the previous shift bits
            Peak_64x1 = vqrshl_s64(Peak_64x1, Peak_scale_pow_64x1);                                 // neg value => shift right rounding

            // rms measure
            for(k = cpt1; k > 0; k--)
            {
                Tmp_64x1        = vld1_s64(Ptr);
                Ptr++;
                vst1_s64(Ptr2, Tmp_64x1);
                Ptr2++;
                Tmp_64x1        = vqrshl_s64(Tmp_64x1, MaxShiftBits_neg_64x1);
                Tmp_64x2        = vcombine_s64(Tmp_64x1, Tmp_64x1);
                Tmp_32x2x2      = vuzp_s32(vget_low_s32(vreinterpretq_s32_s64(Tmp_64x2)), vget_high_s32(vreinterpretq_s32_s64(Tmp_64x2)));
                Tmp_32x2        = Tmp_32x2x2.val[0];                                                // LSB of Tmp_64x2 (MSB is dummy)
                TempX_64x2      = vqdmull_s32(Tmp_32x2, Tmp_32x2);
                TempX_64x1      = vget_low_s64(TempX_64x2);
                diffX_64x1      = vqsub_s64(Peak_64x1, TempX_64x1);
                bsl_u64x1       = vreinterpret_u64_s64(vshr_n_s64(diffX_64x1, 63));                 // sign(diffX)
                diffX_64x2      = vcombine_s64(diffX_64x1, diffX_64x1);
                diffX_low_32x2  = vshrn_n_s64(vshlq_n_s64(diffX_64x2, 32), 32);                     // wextract_l(diffX), wextract_l(diffX)
                diffX_high_32x2 = vrshrn_n_s64(diffX_64x2, 32);                                     // wround_L(diffX), wround_L(diffX)
                Tmp_64x2        = vmovl_s32(vqrdmulh_s32(LimiterPeakCoef_32x2, diffX_low_32x2));    // (MMlong) wfmulr(wextract_l(diffX), LimiterPeakAtCoef), (MMlong) wfmulr(wextract_l(diffX), LimiterPeakReCoef)
                Tmp_64x2        = vqdmlal_s32(Tmp_64x2, LimiterPeakCoef_32x2, diffX_high_32x2);     // wL_addsat((MMlong) wfmulr(wextract_l(diffX), LimiterPeakAtCoef), wL_fmul(wround_L(diffX), LimiterPeakAtCoef)), wL_addsat((MMlong) wfmulr(wextract_l(diffX), LimiterPeakReCoef), wL_fmul(wround_L(diffX), LimiterPeakReCoef))
                Tmp_64x2        = vqaddq_s64(TempX_64x2, Tmp_64x2);
                Peak_64x1       = vbsl_s64(bsl_u64x1, vget_low_s64(Tmp_64x2), vget_high_s64(Tmp_64x2));
                Tmp_64x1        = vqsub_s64(Peak_64x1, PeakMax_64x1);
                bsl_u64x1       = vreinterpret_u64_s64(vshr_n_s64(Tmp_64x1, 63));                   // sign(Peak_64x1 - PeakMax_64x1)
                PeakMax_64x1    = vbsl_s64(bsl_u64x1, PeakMax_64x1, Peak_64x1);
            }
            Ptr2 = HeapPtr->LimiterLABuf[ch];
            for(k = cpt2; k > 0; k--)
            {
                Tmp_64x1        = vld1_s64(Ptr);
                Ptr++;
                vst1_s64(Ptr2, Tmp_64x1);
                Ptr2++;
                Tmp_64x1        = vqrshl_s64(Tmp_64x1, MaxShiftBits_neg_64x1);
                Tmp_64x2        = vcombine_s64(Tmp_64x1, Tmp_64x1);
                Tmp_32x2x2      = vuzp_s32(vget_low_s32(vreinterpretq_s32_s64(Tmp_64x2)), vget_high_s32(vreinterpretq_s32_s64(Tmp_64x2)));
                Tmp_32x2        = Tmp_32x2x2.val[0];                                                // LSB of Tmp_64x2 (MSB is dummy)
                TempX_64x2      = vqdmull_s32(Tmp_32x2, Tmp_32x2);
                TempX_64x1      = vget_low_s64(TempX_64x2);
                diffX_64x1      = vqsub_s64(Peak_64x1, TempX_64x1);
                bsl_u64x1       = vreinterpret_u64_s64(vshr_n_s64(diffX_64x1, 63));                 // sign(diffX)
                diffX_64x2      = vcombine_s64(diffX_64x1, diffX_64x1);
                diffX_low_32x2  = vshrn_n_s64(vshlq_n_s64(diffX_64x2, 32), 32);                     // wextract_l(diffX), wextract_l(diffX)
                diffX_high_32x2 = vrshrn_n_s64(diffX_64x2, 32);                                     // wround_L(diffX), wround_L(diffX)
                Tmp_64x2        = vmovl_s32(vqrdmulh_s32(LimiterPeakCoef_32x2, diffX_low_32x2));    // (MMlong) wfmulr(wextract_l(diffX), LimiterPeakAtCoef), (MMlong) wfmulr(wextract_l(diffX), LimiterPeakReCoef)
                Tmp_64x2        = vqdmlal_s32(Tmp_64x2, LimiterPeakCoef_32x2, diffX_high_32x2);     // wL_addsat((MMlong) wfmulr(wextract_l(diffX), LimiterPeakAtCoef), wL_fmul(wround_L(diffX), LimiterPeakAtCoef)), wL_addsat((MMlong) wfmulr(wextract_l(diffX), LimiterPeakReCoef), wL_fmul(wround_L(diffX), LimiterPeakReCoef))
                Tmp_64x2        = vqaddq_s64(TempX_64x2, Tmp_64x2);
                Peak_64x1       = vbsl_s64(bsl_u64x1, vget_low_s64(Tmp_64x2), vget_high_s64(Tmp_64x2));
                Tmp_64x1        = vqsub_s64(Peak_64x1, PeakMax_64x1);
                bsl_u64x1       = vreinterpret_u64_s64(vshr_n_s64(Tmp_64x1, 63));                   // sign(Peak_64x1 - PeakMax_64x1)
                PeakMax_64x1    = vbsl_s64(bsl_u64x1, PeakMax_64x1, Peak_64x1);
            }
        }
        else
        {
            // compensate Peak according to the previous shift bits
            Peak_64x1  = vqrshl_s64(Peak_64x1, Peak_scale_64x1);

            // amplitude measure
            Zero_s64x1 = vdup_n_s64(0);
            for(k = cpt1; k > 0; k--)
            {
                Tmp_64x1        = vld1_s64(Ptr);
                Ptr++;
                vst1_s64(Ptr2, Tmp_64x1);
                Ptr2++;
                bsl_u64x1       = vreinterpret_u64_s64(vshr_n_s64(Tmp_64x1, 63));                   // sign(Tmp_64x1)
                TempX_64x1      = vqsub_s64(Zero_s64x1, Tmp_64x1);                                  // -Tmp_64x1
                TempX_64x1      = vbsl_s64(bsl_u64x1, TempX_64x1, Tmp_64x1);
                TempX_64x1      = vqrshl_s64(TempX_64x1, MaxShiftBits_hd_64x1);
                TempX_64x2      = vcombine_s64(TempX_64x1, TempX_64x1);
                diffX_64x1      = vqsub_s64(Peak_64x1, TempX_64x1);
                bsl_u64x1       = vreinterpret_u64_s64(vshr_n_s64(diffX_64x1, 63));                 // sign(diffX)
                diffX_64x2      = vcombine_s64(diffX_64x1, diffX_64x1);
                diffX_low_32x2  = vshrn_n_s64(vshlq_n_s64(diffX_64x2, 32), 32);                     // wextract_l(diffX), wextract_l(diffX)
                diffX_high_32x2 = vrshrn_n_s64(diffX_64x2, 32);                                     // wround_L(diffX), wround_L(diffX)
                Tmp_64x2        = vmovl_s32(vqrdmulh_s32(LimiterPeakCoef_32x2, diffX_low_32x2));    // (MMlong) wfmulr(wextract_l(diffX), LimiterPeakAtCoef), (MMlong) wfmulr(wextract_l(diffX), LimiterPeakReCoef)
                Tmp_64x2        = vqdmlal_s32(Tmp_64x2, LimiterPeakCoef_32x2, diffX_high_32x2);     // wL_addsat((MMlong) wfmulr(wextract_l(diffX), LimiterPeakAtCoef), wL_fmul(wround_L(diffX), LimiterPeakAtCoef)), wL_addsat((MMlong) wfmulr(wextract_l(diffX), LimiterPeakReCoef), wL_fmul(wround_L(diffX), LimiterPeakReCoef))
                Tmp_64x2        = vqaddq_s64(TempX_64x2, Tmp_64x2);
                Peak_64x1       = vbsl_s64(bsl_u64x1, vget_low_s64(Tmp_64x2), vget_high_s64(Tmp_64x2));
                Tmp_64x1        = vqsub_s64(Peak_64x1, PeakMax_64x1);
                bsl_u64x1       = vreinterpret_u64_s64(vshr_n_s64(Tmp_64x1, 63));                   // sign(Peak_64x1 - PeakMax_64x1)
                PeakMax_64x1    = vbsl_s64(bsl_u64x1, PeakMax_64x1, Peak_64x1);
            }
            Ptr2 = HeapPtr->LimiterLABuf[ch];
            for(k = cpt2; k > 0; k--)
            {
                Tmp_64x1        = vld1_s64(Ptr);
                Ptr++;
                vst1_s64(Ptr2, Tmp_64x1);
                Ptr2++;
                bsl_u64x1       = vreinterpret_u64_s64(vshr_n_s64(Tmp_64x1, 63));                   // sign(Tmp_64x1)
                TempX_64x1      = vqsub_s64(Zero_s64x1, Tmp_64x1);                                  // -Tmp_64x1
                TempX_64x1      = vbsl_s64(bsl_u64x1, TempX_64x1, Tmp_64x1);
                TempX_64x1      = vqrshl_s64(TempX_64x1, MaxShiftBits_hd_64x1);
                TempX_64x2      = vcombine_s64(TempX_64x1, TempX_64x1);
                diffX_64x1      = vqsub_s64(Peak_64x1, TempX_64x1);
                bsl_u64x1       = vreinterpret_u64_s64(vshr_n_s64(diffX_64x1, 63));                 // sign(diffX)
                diffX_64x2      = vcombine_s64(diffX_64x1, diffX_64x1);
                diffX_low_32x2  = vshrn_n_s64(vshlq_n_s64(diffX_64x2, 32), 32);                     // wextract_l(diffX), wextract_l(diffX)
                diffX_high_32x2 = vrshrn_n_s64(diffX_64x2, 32);                                     // wround_L(diffX), wround_L(diffX)
                Tmp_64x2        = vmovl_s32(vqrdmulh_s32(LimiterPeakCoef_32x2, diffX_low_32x2));    // (MMlong) wfmulr(wextract_l(diffX), LimiterPeakAtCoef), (MMlong) wfmulr(wextract_l(diffX), LimiterPeakReCoef)
                Tmp_64x2        = vqdmlal_s32(Tmp_64x2, LimiterPeakCoef_32x2, diffX_high_32x2);     // wL_addsat((MMlong) wfmulr(wextract_l(diffX), LimiterPeakAtCoef), wL_fmul(wround_L(diffX), LimiterPeakAtCoef)), wL_addsat((MMlong) wfmulr(wextract_l(diffX), LimiterPeakReCoef), wL_fmul(wround_L(diffX), LimiterPeakReCoef))
                Tmp_64x2        = vqaddq_s64(TempX_64x2, Tmp_64x2);
                Peak_64x1       = vbsl_s64(bsl_u64x1, vget_low_s64(Tmp_64x2), vget_high_s64(Tmp_64x2));
                Tmp_64x1        = vqsub_s64(Peak_64x1, PeakMax_64x1);
                bsl_u64x1       = vreinterpret_u64_s64(vshr_n_s64(Tmp_64x1, 63));                   // sign(Peak_64x1 - PeakMax_64x1)
                PeakMax_64x1    = vbsl_s64(bsl_u64x1, PeakMax_64x1, Peak_64x1);
            }
        }

        HeapPtr->LimiterPeak[ch] = vget_lane_s64(Peak_64x1, 0);                                     // save history
    }  // for(ch = 0...)
    PeakMax                = vget_lane_s64(PeakMax_64x1, 0);
    HeapPtr->PrevShiftBits = MaxShiftBits;

    STOP_PMU_MEASURE(PMU_MEASURE_MRDC5B_LIMITER_INSERT_NEW_SUBBAND)


    if(PeakMax < MDRC5B_ALMOST_ZERO_THRESH)
    {
        PeakdB = (MDRC5B_POWER_DB_MINUS_INF << 16); // 8.16, [-128.0, 127.0] dB
    }
    else
    {
        Peak_u32x2x2    = vuzp_u32(vreinterpret_u32_s64(PeakMax_64x1), vreinterpret_u32_s64(PeakMax_64x1));
        bsl_u32x2       = vceq_u32(Peak_u32x2x2.val[1], vdup_n_u32(0));
        Peak_exp_u32x2  = vadd_u32(vclz_u32(Peak_u32x2x2.val[0]), vdup_n_u32(32));
        Peak_exp2_u32x2 = vclz_u32(Peak_u32x2x2.val[1]);
        Peak_exp_u32x2  = vbsl_u32(bsl_u32x2, Peak_exp_u32x2, Peak_exp2_u32x2);
        Peak_mant_u32x2 = vrshrn_n_u64(vshlq_u64(vreinterpretq_u64_s64(vcombine_s64(PeakMax_64x1, PeakMax_64x1)), vreinterpretq_s64_u64(vmovl_u32(Peak_exp_u32x2))), 32);

        // if(Peak_mant >= sqrt(0.5))
        // {
        //     Peak_exp--;
        //     Peak_mant >>= 1;
        // }
        bsl_u32x2       = vcge_u32(Peak_mant_u32x2, vdup_n_u32(0xB504F334));
        Peak_exp_u32x2  = vbsl_u32(bsl_u32x2, vsub_u32(Peak_exp_u32x2, vdup_n_u32(1)), Peak_exp_u32x2);
        Peak_mant_u32x2 = vbsl_u32(bsl_u32x2, vrshr_n_u32(Peak_mant_u32x2, 1), Peak_mant_u32x2);

        Peak_exp_32x2 = vreinterpret_s32_u32(Peak_exp_u32x2);
#ifdef SAMPLES_24_BITS
        // correction of 16 bits if input samples are 24 bits
        Peak_exp_32x2 = vsub_s32(Peak_exp_32x2, vdup_n_s32(16));
#endif // SAMPLES_24_BITS

        // at this point : sqrt(0.5)/2 <= Peak_mant < sqrt(0.5)
        //
        // ln(1+x) = x - x^2/2 + x^3/3 - x^4/4 + x^5/5 - x^6/6 + x^7/7 - x^8/8 + x^9/9 - x^10/10 ...    accuracy OK if |x| < 0.5
        // sqrt(0.5)/2 <= Peak_mant < sqrt(0.5)  =>  sqrt(0.5)-1 <= 2*Peak_mant-1 < 2*sqrt(0.5)-1
        //                                       =>  ln(Peak_mant) = ln(1+x)-ln(2) with x=2*Peak_mant-1, i.e. |x| < 0.414214...

        // x=2*PeakMax_mant-1 in Q31
        // => sqrt(0.5)-1 <= x < 2*sqrt(0.5)-1
        x_32x2      = vreinterpret_s32_u32(vsub_u32(Peak_mant_u32x2, vdup_n_u32(0x80000000)));

        PeakdB_32x2 = x_32x2;                                                                     // PeakdB = x

        xn_32x2     = vqrdmulh_s32(x_32x2, x_32x2);                                               // xn = x^2
        PeakdB_32x2 = vqsub_s32(PeakdB_32x2, vrshr_n_s32(xn_32x2, 1));                            // PeakdB = x - x^2/2

        xn_32x2     = vqrdmulh_s32(xn_32x2, x_32x2);                                              // xn = x^3
        PeakdB_32x2 = vqadd_s32(PeakdB_32x2, vqrdmulh_s32(xn_32x2, vdup_n_s32(0x2AAAAAAB)));      // PeakdB = x - x^2/2 + x^3/3

        xn_32x2     = vqrdmulh_s32(xn_32x2, x_32x2);                                              // xn = x^4
        PeakdB_32x2 = vqsub_s32(PeakdB_32x2, vrshr_n_s32(xn_32x2, 2));                            // PeakdB = x - x^2/2 + x^3/3 - x^4/4

        xn_32x2     = vqrdmulh_s32(xn_32x2, x_32x2);                                              // xn = x^5
        PeakdB_32x2 = vqadd_s32(PeakdB_32x2, vqrdmulh_s32(xn_32x2, vdup_n_s32(0x1999999A)));      // PeakdB = x - x^2/2 + x^3/3 - x^4/4 + x^5/5

        xn_32x2     = vqrdmulh_s32(xn_32x2, x_32x2);                                              // xn = x^6
        PeakdB_32x2 = vqsub_s32(PeakdB_32x2, vqrdmulh_s32(xn_32x2, vdup_n_s32(0x15555555)));      // PeakdB = x - x^2/2 + x^3/3 - x^4/4 + x^5/5 - x^6/6

        xn_32x2     = vqrdmulh_s32(xn_32x2, x_32x2);                                              // xn = x^7
        PeakdB_32x2 = vqadd_s32(PeakdB_32x2, vqrdmulh_s32(xn_32x2, vdup_n_s32(0x12492492)));      // PeakdB = x - x^2/2 + x^3/3 - x^4/4 + x^5/5 - x^6/6 + x^7/7

        xn_32x2     = vqrdmulh_s32(xn_32x2, x_32x2);                                              // xn = x^8
        PeakdB_32x2 = vqsub_s32(PeakdB_32x2, vrshr_n_s32(xn_32x2, 3));                            // PeakdB = x - x^2/2 + x^3/3 - x^4/4 + x^5/5 - x^6/6 + x^7/7 - x^8/8

        xn_32x2     = vqrdmulh_s32(xn_32x2, x_32x2);                                              // xn = x^9
        PeakdB_32x2 = vqadd_s32(PeakdB_32x2, vqrdmulh_s32(xn_32x2, vdup_n_s32(0x0E38E38E)));      // PeakdB = x - x^2/2 + x^3/3 - x^4/4 + x^5/5 - x^6/6 + x^7/7 - x^8/8 + x^9/9

        xn_32x2     = vqrdmulh_s32(xn_32x2, x_32x2);                                              // xn = x^10
        PeakdB_32x2 = vqsub_s32(PeakdB_32x2, vqrdmulh_s32(xn_32x2, vdup_n_s32(0x0CCCCCCD)));      // PeakdB = x - x^2/2 + x^3/3 - x^4/4 + x^5/5 - x^6/6 + x^7/7 - x^8/8 + x^9/9 - x^10/10

        // at this point : PeakMaxdB contains ln(1+x) in Q31

        if(RmsMeasure)
        {
            // dB(power) = 10*log10(power)

            // PeakMaxdB = 10*log10(PeakMax)+20*log10(2)*(HEADROOM+MaxShiftBits)
            //           = 10*ln(PeakMax)/ln(10)+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits)
            //           = 10/ln(10)*ln(PeakMax_mant*2^(-PeakMax_exp))+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits)
            //           = 10/ln(10)*(ln(PeakMax_mant)-PeakMax_exp*ln(2))+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits)
            //           = 10/ln(10)*ln(PeakMax_mant)-PeakMax_exp*10*ln(2)/ln(10)+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits)
            //           = 10/ln(10)*ln(PeakMax_mant)+10*ln(2)/ln(10)*(2*(HEADROOM+MaxShiftBits)-PeakMax_exp)
            //
            // => RmsdB = 10/ln(10)*ln(1+x)+10*ln(2)/ln(10)*(2*(HEADROOM+MaxShiftBits)-PeakMax_exp)
            // => RmsdB (Q16) = 0x457CB*ln(1+x)+0x302A3*(2*(HEADROOM+MaxShiftBits)-PeakMax_exp)

            // fractional mutiply 0x457CB*ln(1+x) in Q16
            PeakdB_32x2   = vqrdmulh_s32(PeakdB_32x2, vdup_n_s32(0x457CB));

            // PeakdB_exp = 2*(HEADROOM+MaxShiftBits)-PeakdB_exp
            Peak_exp_32x2 = vsub_s32(vdup_n_s32(2 * (HEADROOM + MaxShiftBits)), Peak_exp_32x2);

            // PeakMaxdB final value (integer mac 0x302A3*PeakdB_exp)
            PeakdB_32x2   = vmla_s32(PeakdB_32x2, Peak_exp_32x2, vdup_n_s32(0x302A3));
        }
        else
        {
            // dB(power) = 20*log10(abs)

            // PeakMaxdB = 20*log10(PeakMax)+20*log10(2)*(HEADROOM+MaxShiftBits)
            //           = 20*ln(PeakMax)/ln(10)+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits)
            //           = 20/ln(10)*ln(PeakMax_mant*2^(-PeakMax_exp))+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits)
            //           = 20/ln(10)*(ln(PeakMax_mant)-PeakMax_exp*ln(2))+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits)
            //           = 20/ln(10)*ln(PeakMax_mant)-PeakMax_exp*20*ln(2)/ln(10)+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits)
            //           = 20/ln(10)*ln(PeakMax_mant)+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits-PeakMax_exp)
            //
            // => RmsdB = 20/ln(10)*ln(1+x)+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits-PeakMax_exp)
            // => RmsdB (Q16) = 0x8AF96*ln(1+x)+0x60546*(HEADROOM+MaxShiftBits-PeakMax_exp)

            // fractional mutiply 0x8AF96*ln(1+x) in Q16
            PeakdB_32x2     = vqrdmulh_s32(PeakdB_32x2, vdup_n_s32(0x8AF96));

            // PeakdB_exp = HEADROOM+MaxShiftBits-PeakdB_exp
            Peak_exp_32x2 = vsub_s32(vdup_n_s32(HEADROOM + MaxShiftBits), Peak_exp_32x2);

            // PeakMaxdB final value (integer mac 0x60546*PeakdB_exp)
            PeakdB_32x2     = vmla_s32(PeakdB_32x2, Peak_exp_32x2, vdup_n_s32(0x60546));
        }
        PeakdB = vget_lane_s32(PeakdB_32x2, 0);
    }
#ifdef DEBUG_LIMITER_OUTPUT
    if((debug_cpt_samples >= DEBUG_CPT_MIN) && (debug_cpt_samples <= DEBUG_CPT_MAX))
    {
        char string[100];

        debug_write_string("MRDC5B_LIMITER_PEAKMAX_PEAKDB\n");
        sprintf(string, "PeakMax=0x%012llX, HEADROOM+MaxShiftBits=%d => PeakdB=0x%06X\n",
#ifdef SAMPLES_24_BITS
                        PeakMax & 0xFFFFFFFFFFFFLL,
#else // SAMPLES_24_BITS
                        (PeakMax >> 16) & 0xFFFFFFFFFFFFLL,
#endif // SAMPLES_24_BITS
                        HEADROOM + MaxShiftBits,
                        PeakdB & 0xFFFFFF);
        debug_write_string(string);
    }
Пример #9
0
void idct_dequant_full_2x_neon(
        int16_t *q,
        int16_t *dq,
        unsigned char *dst,
        int stride) {
    unsigned char *dst0, *dst1;
    int32x2_t d28, d29, d30, d31;
    int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
    int16x8_t qEmpty = vdupq_n_s16(0);
    int32x4x2_t q2tmp0, q2tmp1;
    int16x8x2_t q2tmp2, q2tmp3;
    int16x4_t dLow0, dLow1, dHigh0, dHigh1;

    d28 = d29 = d30 = d31 = vdup_n_s32(0);

    // load dq
    q0 = vld1q_s16(dq);
    dq += 8;
    q1 = vld1q_s16(dq);

    // load q
    q2 = vld1q_s16(q);
    vst1q_s16(q, qEmpty);
    q += 8;
    q3 = vld1q_s16(q);
    vst1q_s16(q, qEmpty);
    q += 8;
    q4 = vld1q_s16(q);
    vst1q_s16(q, qEmpty);
    q += 8;
    q5 = vld1q_s16(q);
    vst1q_s16(q, qEmpty);

    // load src from dst
    dst0 = dst;
    dst1 = dst + 4;
    d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
    dst0 += stride;
    d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
    dst1 += stride;
    d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
    dst0 += stride;
    d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
    dst1 += stride;

    d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
    dst0 += stride;
    d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
    dst1 += stride;
    d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
    d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);

    q2 = vmulq_s16(q2, q0);
    q3 = vmulq_s16(q3, q1);
    q4 = vmulq_s16(q4, q0);
    q5 = vmulq_s16(q5, q1);

    // vswp
    dLow0 = vget_low_s16(q2);
    dHigh0 = vget_high_s16(q2);
    dLow1 = vget_low_s16(q4);
    dHigh1 = vget_high_s16(q4);
    q2 = vcombine_s16(dLow0, dLow1);
    q4 = vcombine_s16(dHigh0, dHigh1);

    dLow0 = vget_low_s16(q3);
    dHigh0 = vget_high_s16(q3);
    dLow1 = vget_low_s16(q5);
    dHigh1 = vget_high_s16(q5);
    q3 = vcombine_s16(dLow0, dLow1);
    q5 = vcombine_s16(dHigh0, dHigh1);

    q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
    q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
    q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
    q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);

    q10 = vqaddq_s16(q2, q3);
    q11 = vqsubq_s16(q2, q3);

    q8 = vshrq_n_s16(q8, 1);
    q9 = vshrq_n_s16(q9, 1);

    q4 = vqaddq_s16(q4, q8);
    q5 = vqaddq_s16(q5, q9);

    q2 = vqsubq_s16(q6, q5);
    q3 = vqaddq_s16(q7, q4);

    q4 = vqaddq_s16(q10, q3);
    q5 = vqaddq_s16(q11, q2);
    q6 = vqsubq_s16(q11, q2);
    q7 = vqsubq_s16(q10, q3);

    q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
    q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
    q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
                       vreinterpretq_s16_s32(q2tmp1.val[0]));
    q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
                       vreinterpretq_s16_s32(q2tmp1.val[1]));

    // loop 2
    q8  = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
    q9  = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
    q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
    q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);

    q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
    q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);

    q10 = vshrq_n_s16(q10, 1);
    q11 = vshrq_n_s16(q11, 1);

    q10 = vqaddq_s16(q2tmp2.val[1], q10);
    q11 = vqaddq_s16(q2tmp3.val[1], q11);

    q8 = vqsubq_s16(q8, q11);
    q9 = vqaddq_s16(q9, q10);

    q4 = vqaddq_s16(q2, q9);
    q5 = vqaddq_s16(q3, q8);
    q6 = vqsubq_s16(q3, q8);
    q7 = vqsubq_s16(q2, q9);

    q4 = vrshrq_n_s16(q4, 3);
    q5 = vrshrq_n_s16(q5, 3);
    q6 = vrshrq_n_s16(q6, 3);
    q7 = vrshrq_n_s16(q7, 3);

    q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
    q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
    q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
                       vreinterpretq_s16_s32(q2tmp1.val[0]));
    q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
                       vreinterpretq_s16_s32(q2tmp1.val[1]));

    q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]),
                                          vreinterpret_u8_s32(d28)));
    q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]),
                                          vreinterpret_u8_s32(d29)));
    q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]),
                                          vreinterpret_u8_s32(d30)));
    q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]),
                                          vreinterpret_u8_s32(d31)));

    d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
    d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
    d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
    d31 = vreinterpret_s32_u8(vqmovun_s16(q7));

    dst0 = dst;
    dst1 = dst + 4;
    vst1_lane_s32((int32_t *)dst0, d28, 0);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst1, d28, 1);
    dst1 += stride;
    vst1_lane_s32((int32_t *)dst0, d29, 0);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst1, d29, 1);
    dst1 += stride;

    vst1_lane_s32((int32_t *)dst0, d30, 0);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst1, d30, 1);
    dst1 += stride;
    vst1_lane_s32((int32_t *)dst0, d31, 0);
    vst1_lane_s32((int32_t *)dst1, d31, 1);
    return;
}
Пример #10
0
void BQ_2I_D32F32C30_TRC_WRA_01 (           Biquad_Instance_t       *pInstance,
                                            LVM_INT32                    *pDataIn,
                                            LVM_INT32                    *pDataOut,
                                            LVM_INT16                    NrSamples)


    {
#if !(defined  __ARM_HAVE_NEON)
        LVM_INT32 ynL,ynR,templ,tempd;
        LVM_INT16 ii;
        PFilter_State pBiquadState = (PFilter_State) pInstance;

         for (ii = NrSamples; ii != 0; ii--)
         {


            /**************************************************************************
                            PROCESSING OF THE LEFT CHANNEL
            ***************************************************************************/
            /* ynL= ( A2 (Q30) * x(n-2)L (Q0) ) >>30 in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[0],pBiquadState->pDelays[2],ynL,30)

            /* ynL+= ( A1 (Q30) * x(n-1)L (Q0) ) >> 30 in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[1],pBiquadState->pDelays[0],templ,30)
            ynL+=templ;

            /* ynL+= ( A0 (Q30) * x(n)L (Q0) ) >> 30 in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[2],*pDataIn,templ,30)
            ynL+=templ;

             /* ynL+= (-B2 (Q30) * y(n-2)L (Q0) ) >> 30 in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[3],pBiquadState->pDelays[6],templ,30)
            ynL+=templ;

            /* ynL+= (-B1 (Q30) * y(n-1)L (Q0) ) >> 30 in Q0 */
            MUL32x32INTO32(pBiquadState->coefs[4],pBiquadState->pDelays[4],templ,30)
            ynL+=templ;

            /**************************************************************************
                            PROCESSING OF THE RIGHT CHANNEL
            ***************************************************************************/
            /* ynR= ( A2 (Q30) * x(n-2)R (Q0) ) >> 30 in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[0],pBiquadState->pDelays[3],ynR,30)

            /* ynR+= ( A1 (Q30) * x(n-1)R (Q0) ) >> 30  in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[1],pBiquadState->pDelays[1],templ,30)
            ynR+=templ;

            /* ynR+= ( A0 (Q30) * x(n)R (Q0) ) >> 30 in Q0*/
            tempd=*(pDataIn+1);
            MUL32x32INTO32(pBiquadState->coefs[2],tempd,templ,30)
            ynR+=templ;

            /* ynR+= (-B2 (Q30) * y(n-2)R (Q0) ) >> 30 in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[3],pBiquadState->pDelays[7],templ,30)
            ynR+=templ;

            /* ynR+= (-B1 (Q30) * y(n-1)R (Q0) ) >> 30 in Q0 */
            MUL32x32INTO32(pBiquadState->coefs[4],pBiquadState->pDelays[5],templ,30)
            ynR+=templ;

            /**************************************************************************
                            UPDATING THE DELAYS
            ***************************************************************************/
            pBiquadState->pDelays[7]=pBiquadState->pDelays[5]; /* y(n-2)R=y(n-1)R*/
            pBiquadState->pDelays[6]=pBiquadState->pDelays[4]; /* y(n-2)L=y(n-1)L*/
            pBiquadState->pDelays[3]=pBiquadState->pDelays[1]; /* x(n-2)R=x(n-1)R*/
            pBiquadState->pDelays[2]=pBiquadState->pDelays[0]; /* x(n-2)L=x(n-1)L*/
            pBiquadState->pDelays[5]=(LVM_INT32)ynR; /* Update y(n-1)R in Q0*/
            pBiquadState->pDelays[4]=(LVM_INT32)ynL; /* Update y(n-1)L in Q0*/
            pBiquadState->pDelays[0]=(*pDataIn); /* Update x(n-1)L in Q0*/
            pDataIn++;
            pBiquadState->pDelays[1]=(*pDataIn); /* Update x(n-1)R in Q0*/
            pDataIn++;

            /**************************************************************************
                            WRITING THE OUTPUT
            ***************************************************************************/
            *pDataOut=(LVM_INT32)ynL; /* Write Left output in Q0*/
            pDataOut++;
            *pDataOut=(LVM_INT32)ynR; /* Write Right ouput in Q0*/
            pDataOut++;


        }
#else
        LVM_INT16 ii=0;
	      
		PFilter_State pBiquadState = (PFilter_State) pInstance;

		int32x2_t A2 = vdup_n_s32(pBiquadState->coefs[0]);
		int32x2_t A1 = vdup_n_s32(pBiquadState->coefs[1]);
		int32x2_t A0 = vdup_n_s32(pBiquadState->coefs[2]);
		int32x2_t B2 = vdup_n_s32(pBiquadState->coefs[3]);
		int32x2_t B1 = vdup_n_s32(pBiquadState->coefs[4]);
		
		int32x2_t X_2 = vld1_s32(&pBiquadState->pDelays[2]);
		int32x2_t X_1 = vld1_s32(&pBiquadState->pDelays[0]);
		int32x2_t Y_2 = vld1_s32(&pBiquadState->pDelays[6]);
		int32x2_t Y_1 = vld1_s32(&pBiquadState->pDelays[4]);

		for(ii=0; ii<NrSamples; ii++){
		  int32x2_t s = vld1_s32(pDataIn);
		  int64x2_t r = vmull_s32(A2, X_2);
		  r = vmlal_s32(r, A1, X_1);
		  r = vmlal_s32(r, A0, s);
		  r = vmlal_s32(r, B2, Y_2);
		  r = vmlal_s32(r, B1, Y_1);
		  int32_t ll =(int32_t)( vgetq_lane_s64(r, 0) >> 30);
		  int32_t rr =(int32_t)( vgetq_lane_s64(r, 1) >> 30);
		  pDataIn += 2;
		  *pDataOut ++ = ll;
		  *pDataOut ++ = rr;
		  int32_t tmp1, tmp2;
		  tmp1 = vget_lane_s32(X_1, 0);
		  tmp2 = vget_lane_s32(X_1, 1);
		  vset_lane_s32(tmp1, X_2, 0);
		  vset_lane_s32(tmp2, X_2, 1);
		  tmp1 = vget_lane_s32(Y_1, 0);
		  tmp2 = vget_lane_s32(Y_1, 1);
		  vset_lane_s32(tmp1, Y_2, 0);
		  vset_lane_s32(tmp2, Y_2, 1);

		  vset_lane_s32(ll, Y_1, 0);
		  vset_lane_s32(rr, Y_1, 1);
		  
		  tmp1 = vget_lane_s32(s, 0);
		  tmp2 = vget_lane_s32(s, 1);
		  vset_lane_s32(tmp1, X_1, 0);
		  vset_lane_s32(tmp2, X_1, 1);
		}
        vst1_s32(&pBiquadState->pDelays[2], X_2);
        vst1_s32(&pBiquadState->pDelays[0], X_1);
        vst1_s32(&pBiquadState->pDelays[6], Y_2);
        vst1_s32(&pBiquadState->pDelays[4], Y_1);
#endif         

    }