Ejemplo n.º 1
0
/*
 * Return the sum of all elements in an array of 16-bit signed integers.
 *
 * array: pointer to the first element; must reference at least `size` values.
 * size:  number of elements to add up; must be non-negative.
 *
 * Four partial totals are kept, one per vector lane, and combined at the end.
 * The lanes are 32 bits wide: with 16-bit lanes a partial total wraps as soon
 * as it leaves the int16_t range, which silently corrupts the result.
 * Elements left over when size is not a multiple of 4 are added one by one.
 */
int sum_array(int16_t *array, int size)
{
     /* one 32-bit running total per lane, all starting at zero */
     int32x4_t acc = vdupq_n_s32(0);
     int64x2_t pair_sums;
     int64x1_t total_vec;
     int64_t total;

     /* a negative element count is a caller error */
     assert(size >= 0);

     /* counting down on size; stop as soon as fewer than 4 elements remain */
     for (; size >= 4; size -= 4)
     {
          /* load 4 values, widen them to 32 bits and add them to the lanes */
          acc = vaddw_s16(acc, vld1_s16(array));
          /* advance the array pointer past the 4 elements just consumed */
          array += 4;
     }

     /* fold the 4 lane totals: pairwise widen to 64 bits, then add the halves */
     pair_sums = vpaddlq_s32(acc);
     total_vec = vadd_s64(vget_low_s64(pair_sums), vget_high_s64(pair_sums));
     total = vget_lane_s64(total_vec, 0);

     /* scalar tail for the last 0..3 elements */
     for (; size > 0; size--)
     {
          total += *array++;
     }

     /* return the total as an integer */
     return (int)total;
}
Ejemplo n.º 2
0
// Codegen test for vget_lane_s64: returns lane 0 of a one-lane int64x1_t
// vector as a scalar int64_t.
// The pattern comments inside the body look like LLVM FileCheck directives
// (presumably matched against the compiler output by the test harness, so
// they must stay verbatim). They expect the function's name to appear and an
// fmov from a d register into an x register.
int64_t test_vget_lane_s64(int64x1_t v1) {
  // CHECK: test_vget_lane_s64
  return vget_lane_s64(v1, 0);
  // CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
}
Ejemplo n.º 3
0
// Codegen test for vget_lane_s64: returns lane 0 of a one-lane int64x1_t
// vector as a scalar int64_t.
// The pattern comments inside the body look like LLVM FileCheck directives
// (presumably matched against the compiler output by the test harness, so
// they must stay verbatim). They pin the expected output to the function
// label followed directly by "fmov x0, d0" and "ret".
int64_t test_vget_lane_s64(int64x1_t a) {
  // CHECK-LABEL: test_vget_lane_s64:
  // CHECK-NEXT:  fmov x0, d0
  // CHECK-NEXT:  ret
  return vget_lane_s64(a, 0);
}
/* NEON-intrinsics implementation of the warped autocorrelation.                                      */
/*                                                                                                    */
/* Writes order + 1 correlation values to corr[] and the applied scaling to *scale.                   */
/* When MAX_SHAPE_LPC_ORDER > 24 or order < 6 the call is forwarded unchanged to                      */
/* silk_warped_autocorrelation_FIX_c(); otherwise the intrinsics path below is taken.                 */
/* With OPUS_CHECK_ASM defined, the result is additionally compared against the C implementation.     */
/* calc_corr() and calc_state() are helpers defined outside this function.                            */
void silk_warped_autocorrelation_FIX_neon(
          opus_int32                *corr,                                  /* O    Result [order + 1]                                                          */
          opus_int                  *scale,                                 /* O    Scaling of the correlation vector                                           */
    const opus_int16                *input,                                 /* I    Input data to correlate                                                     */
    const opus_int                  warping_Q16,                            /* I    Warping coefficient                                                         */
    const opus_int                  length,                                 /* I    Length of input                                                             */
    const opus_int                  order                                   /* I    Correlation order (even)                                                    */
)
{
    if( ( MAX_SHAPE_LPC_ORDER > 24 ) || ( order < 6 ) ) {
        silk_warped_autocorrelation_FIX_c( corr, scale, input, warping_Q16, length, order );
    } else {
        opus_int       n, i, lsh;
        opus_int64     corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; /* In reverse order */
        opus_int64     corr_QC_orderT;
        int64x2_t      lsh_s64x2;
        /* order rounded up to the next multiple of 4 */
        const opus_int orderT = ( order + 3 ) & ~3;
        opus_int64     *corr_QCT;
        opus_int32     *input_QS;
        VARDECL( opus_int32, input_QST );
        VARDECL( opus_int32, state );
        SAVE_STACK;

        /* Order must be even */
        silk_assert( ( order & 1 ) == 0 );
        silk_assert( 2 * QS - QC >= 0 );

        /* Room for the samples plus MAX_SHAPE_LPC_ORDER words of padding on each side.              */
        /* NOTE(review): the padding stores below write 6 * 4 = 24 words per side, which only fits   */
        /* this allocation if MAX_SHAPE_LPC_ORDER == 24 (the guard above only ensures <= 24).        */
        /* Confirm against the definition of MAX_SHAPE_LPC_ORDER.                                    */
        ALLOC( input_QST, length + 2 * MAX_SHAPE_LPC_ORDER, opus_int32 );

        input_QS = input_QST;
        /* input_QS has zero paddings in the beginning and end. */
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;

        /* Loop over samples */
        /* Eight 16-bit samples per iteration are widened to 32 bits and shifted left by QS. */
        for( n = 0; n < length - 7; n += 8, input_QS += 8 ) {
            const int16x8_t t0_s16x4 = vld1q_s16( input + n );
            vst1q_s32( input_QS + 0, vshll_n_s16( vget_low_s16( t0_s16x4 ), QS ) );
            vst1q_s32( input_QS + 4, vshll_n_s16( vget_high_s16( t0_s16x4 ), QS ) );
        }
        /* Scalar tail for the remaining ( length % 8 ) samples. */
        for( ; n < length; n++, input_QS++ ) {
            input_QS[ 0 ] = silk_LSHIFT32( (opus_int32)input[ n ], QS );
        }
        /* Trailing zero padding: again six 4-lane stores. */
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        /* Reposition input_QS to orderT words before offset MAX_SHAPE_LPC_ORDER of the buffer. */
        input_QS = input_QST + MAX_SHAPE_LPC_ORDER - orderT;

        /* The following loop runs ( length + order ) times, with ( order ) extra epilogues.                  */
        /* The zero paddings in input_QS guarantee corr_QC's correctness even with the extra epilogues.       */
        /* The values of state_QS will be polluted by the extra epilogues, however they are temporary values. */

        /* Keep the C code here to help understand the intrinsics optimization. */
        /*
        {
            opus_int32 state_QS[ 2 ][ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 };
            opus_int32 *state_QST[ 3 ];
            state_QST[ 0 ] = state_QS[ 0 ];
            state_QST[ 1 ] = state_QS[ 1 ];
            for( n = 0; n < length + order; n++, input_QS++ ) {
                state_QST[ 0 ][ orderT ] = input_QS[ orderT ];
                for( i = 0; i < orderT; i++ ) {
                    corr_QC[ i ] += silk_RSHIFT64( silk_SMULL( state_QST[ 0 ][ i ], input_QS[ i ] ), 2 * QS - QC );
                    state_QST[ 1 ][ i ] = silk_SMLAWB( state_QST[ 1 ][ i + 1 ], state_QST[ 0 ][ i ] - state_QST[ 0 ][ i + 1 ], warping_Q16 );
                }
                state_QST[ 2 ] = state_QST[ 0 ];
                state_QST[ 0 ] = state_QST[ 1 ];
                state_QST[ 1 ] = state_QST[ 2 ];
            }
        }
        */

        {
            /* Warping coefficient broadcast to all lanes, pre-shifted left by 15                      */
            /* (presumably to suit the multiply used inside calc_state(), which is not visible here).  */
            const int32x4_t warping_Q16_s32x4 = vdupq_n_s32( warping_Q16 << 15 );
            /* The first pass reads its newest-tap values from the padded input; later passes read state[]. */
            const opus_int32 *in = input_QS + orderT;
            /* Number of taps still to be processed; decreases by 8 per pass. */
            opus_int o = orderT;
            int32x4_t state_QS_s32x4[ 3 ][ 2 ];

            ALLOC( state, length + orderT, opus_int32 );
            state_QS_s32x4[ 2 ][ 1 ] = vdupq_n_s32( 0 );

            /* Calculate 8 taps of all inputs in each loop. */
            do {
                state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 0 ][ 1 ] =
                state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 1 ][ 1 ] = vdupq_n_s32( 0 );
                n = 0;
                do {
                    calc_corr( input_QS + n, corr_QC, o - 8, state_QS_s32x4[ 0 ][ 0 ] );
                    calc_corr( input_QS + n, corr_QC, o - 4, state_QS_s32x4[ 0 ][ 1 ] );
                    state_QS_s32x4[ 2 ][ 1 ] = vld1q_s32( in + n );
                    /* Lane 0 of the current state is saved to state[ n ]; it becomes the input ( in = state ) of the next pass. */
                    vst1q_lane_s32( state + n, state_QS_s32x4[ 0 ][ 0 ], 0 );
                    /* Shift the 8-lane state window by one lane, pulling in one lane of the freshly loaded vector. */
                    state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 0 ][ 1 ], 1 );
                    state_QS_s32x4[ 2 ][ 1 ] = vextq_s32( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], 1 );
                    state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 );
                    state_QS_s32x4[ 0 ][ 1 ] = calc_state( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], state_QS_s32x4[ 1 ][ 1 ], warping_Q16_s32x4 );
                    state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ];
                    state_QS_s32x4[ 1 ][ 1 ] = state_QS_s32x4[ 2 ][ 1 ];
                } while( ++n < ( length + order ) );
                in = state;
                o -= 8;
            } while( o > 4 );

            /* orderT is a multiple of 4, so after the 8-tap passes either 0 or 4 taps remain. */
            if( o ) {
                /* Calculate the last 4 taps of all inputs. */
                opus_int32 *stateT = state;
                silk_assert( o == 4 );
                state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 1 ][ 0 ] = vdupq_n_s32( 0 );
                n = length + order;
                do {
                    calc_corr( input_QS, corr_QC, 0, state_QS_s32x4[ 0 ][ 0 ] );
                    /* Read the value left by the previous pass before the same slot is overwritten just below. */
                    state_QS_s32x4[ 2 ][ 0 ] = vld1q_s32( stateT );
                    vst1q_lane_s32( stateT, state_QS_s32x4[ 0 ][ 0 ], 0 );
                    state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], 1 );
                    state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 );
                    state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ];
                    input_QS++;
                    stateT++;
                } while( --n );
            }
        }

        {
            /* Sum of squares of the raw input, accumulated in 64 bits: 8 samples per vector iteration, */
            /* then a scalar tail. The result, shifted left by QC, is stored in corr_QC[ orderT ].      */
            const opus_int16 *inputT = input;
            int32x4_t t_s32x4;
            int64x1_t t_s64x1;
            int64x2_t t_s64x2 = vdupq_n_s64( 0 );
            for( n = 0; n <= length - 8; n += 8 ) {
                int16x8_t input_s16x8 = vld1q_s16( inputT );
                t_s32x4 = vmull_s16( vget_low_s16( input_s16x8 ), vget_low_s16( input_s16x8 ) );
                t_s32x4 = vmlal_s16( t_s32x4, vget_high_s16( input_s16x8 ), vget_high_s16( input_s16x8 ) );
                t_s64x2 = vaddw_s32( t_s64x2, vget_low_s32( t_s32x4 ) );
                t_s64x2 = vaddw_s32( t_s64x2, vget_high_s32( t_s32x4 ) );
                inputT += 8;
            }
            t_s64x1 = vadd_s64( vget_low_s64( t_s64x2 ), vget_high_s64( t_s64x2 ) );
            corr_QC_orderT = vget_lane_s64( t_s64x1, 0 );
            for( ; n < length; n++ ) {
                corr_QC_orderT += silk_SMULL( input[ n ], input[ n ] );
            }
            corr_QC_orderT = silk_LSHIFT64( corr_QC_orderT, QC );
            corr_QC[ orderT ] = corr_QC_orderT;
        }

        /* Skip the ( orderT - order ) extra entries that exist only because the order was rounded up. */
        corr_QCT = corr_QC + orderT - order;
        /* Shift amount derived from the leading-zero count of the sum-of-squares term, then clamped. */
        lsh = silk_CLZ64( corr_QC_orderT ) - 35;
        lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC );
        *scale = -( QC + lsh );
        silk_assert( *scale >= -30 && *scale <= 12 );
        lsh_s64x2 = vdupq_n_s64( lsh );
        /* Scale, narrow to 32 bits and store four values per iteration in reversed order:            */
        /* vcombine( QC1, QC0 ) followed by vrev64q yields corr_QCT[ i + 3 ], [ i + 2 ], [ i + 1 ],   */
        /* [ i ], written to corr[ order - i - 3 ] .. corr[ order - i ]. lsh may be negative; the     */
        /* scalar tail below branches on its sign instead.                                            */
        for( i = 0; i <= order - 3; i += 4 ) {
            int32x4_t corr_s32x4;
            int64x2_t corr_QC0_s64x2, corr_QC1_s64x2;
            corr_QC0_s64x2 = vld1q_s64( corr_QCT + i );
            corr_QC1_s64x2 = vld1q_s64( corr_QCT + i + 2 );
            corr_QC0_s64x2 = vshlq_s64( corr_QC0_s64x2, lsh_s64x2 );
            corr_QC1_s64x2 = vshlq_s64( corr_QC1_s64x2, lsh_s64x2 );
            corr_s32x4     = vcombine_s32( vmovn_s64( corr_QC1_s64x2 ), vmovn_s64( corr_QC0_s64x2 ) );
            corr_s32x4     = vrev64q_s32( corr_s32x4 );
            vst1q_s32( corr + order - i - 3, corr_s32x4 );
        }
        /* Remaining entries (up to and including index order) are scaled one at a time. */
        if( lsh >= 0 ) {
            for( ; i < order + 1; i++ ) {
                corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_LSHIFT64( corr_QCT[ i ], lsh ) );
            }
        } else {
            for( ; i < order + 1; i++ ) {
                corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64( corr_QCT[ i ], -lsh ) );
            }
        }
        silk_assert( corr_QCT[ order ] >= 0 ); /* If breaking, decrease QC*/
        RESTORE_STACK;
    }

#ifdef OPUS_CHECK_ASM
    /* Self-check: recompute with the C implementation and require identical corr[] and scale. */
    {
        opus_int32 corr_c[ MAX_SHAPE_LPC_ORDER + 1 ];
        opus_int   scale_c;
        silk_warped_autocorrelation_FIX_c( corr_c, &scale_c, input, warping_Q16, length, order );
        silk_assert( !memcmp( corr_c, corr, sizeof( corr_c[ 0 ] ) * ( order + 1 ) ) );
        silk_assert( scale_c == *scale );
    }
#endif
}
void mdrc5b_apply_limiter(MDRC5B_LOCAL_STRUCT_T *HeapPtr)
{
    unsigned int LaIdx;
    unsigned int NumMainCh;
    unsigned int Samples;
    unsigned int ch, k, n;
    MMlong       *Ptr;
    MMlong       *Ptr2;

    MMlong       *MemOutPtr;
    MMshort      PeakdB;
    MMlong       PeakMax;
    int          RmsMeasure;
    MMshort      LimiterAtCoef;
    MMshort      LimiterReCoef;
    MMshort      LimiterGainMant[MDRC5B_BLOCK_SIZE + 1];
    MMshort      LimiterGainExp;
    MMshort      LimiterTargetGaindB;
    unsigned int LimiterHoldRem;
    unsigned int LimiterHtSamp;
    MMshort      Exp, TargetGain;
    MMshort      MaxShiftBits;
    unsigned int lookahead_len = (unsigned int) HeapPtr->LimiterLALen;
    unsigned int cpt1, cpt2;
    uint32x2x2_t Temp_u32x2x2;
    uint32x2_t   Ldbits_u32x2, Ldbits2_u32x2;
    uint32x2_t   bsl_u32x2;
    int32x2_t    LimGainMant_32x2;
    int64x2_t    TempX_64x2, MemOut_64x2;
    int64x2_t    Tmp_64x2;
    int64x2_t    LimiterGainExp_64x2, sample_64x2;
    int64x1_t    TempX_64x1, sample_64x1;
    int32_t      *LimiterGainMant_ptr;
    int32x2_t    Tmp_32x2, Ldbits_32x2, n_32x2;
    int32x2_t    TempX_low_32x2, TempX_high_32x2;
    int32x2x2_t  Tmp_32x2x2;
    int64x1_t    Peak_64x1, PeakMax_64x1, Tmp_64x1, diffX_64x1;
    int64x1_t    Peak_scale_pow_64x1, Peak_scale_64x1, Zero_s64x1;
    int64x1_t    MaxShiftBits_neg_64x1, MaxShiftBits_hd_64x1;
    int64x2_t    diffX_64x2;
    uint64x1_t   bsl_u64x1;
    int32x2_t    LimiterPeakCoef_32x2, diffX_low_32x2, diffX_high_32x2;
    int32x2_t    TargetGain_32x2;
    uint32x2x2_t Peak_u32x2x2;
    uint32x2_t   Peak_exp_u32x2, Peak_exp2_u32x2, Peak_mant_u32x2;
    int32x2_t    x_32x2, xn_32x2, PeakdB_32x2, Peak_exp_32x2;
    int32x2_t    LimiterTargetGaindB_32x2, Exp_32x2, LimiterCoef_32x2;
    int32x4_t    Tmp_32x4;


    START_PMU_MEASURE(PMU_MEASURE_MRDC5B_APPLY_LIMITER)

    START_PMU_MEASURE(PMU_MEASURE_MRDC5B_LIMITER_COMPUTE_MAX_SHIFT_LEFT)

    Samples   = (unsigned int) HeapPtr->BlockSize;
    NumMainCh = (unsigned int) HeapPtr->NumMainCh;

    TempX_64x2 = vdupq_n_s64(0);
    for(ch = 0; ch < NumMainCh; ch++)
    {
        Ptr = HeapPtr->MainInBuf[ch];
        // compute the number of bits needs to be shifted to avoid overflow
        for(k = (Samples >> 1); k > 0; k--)
        {
            sample_64x2 = vld1q_s64(Ptr);
            Ptr        +=2;
            sample_64x2 = veorq_s64(sample_64x2, vshrq_n_s64(sample_64x2, 63));
            TempX_64x2  = vorrq_s64(TempX_64x2, sample_64x2);
        }
        if(Samples & 1)
        {
            sample_64x1 = vld1_s64(Ptr);
            sample_64x1 = veor_s64(sample_64x1, vshr_n_s64(sample_64x1, 63));
            TempX_64x2  = vorrq_s64(TempX_64x2, vcombine_s64(sample_64x1, sample_64x1));
        }
    }
    TempX_64x1    = vorr_s64(vget_low_s64(TempX_64x2), vget_high_s64(TempX_64x2));
    Temp_u32x2x2  = vuzp_u32(vreinterpret_u32_s64(TempX_64x1), vreinterpret_u32_s64(TempX_64x1));
    bsl_u32x2     = vceq_u32(Temp_u32x2x2.val[1], vdup_n_u32(0));                  // MSB == 0 ?
    // use clz instead of cls because we are sure that input value is positive
    // and because cls(LSB) could be wrong (if MSB is equal to 0 and bit 31 of LSL is 1)
    // thus clz result will be 1 more than cls result (that's why you may see (Ldbits - 1)
    // instead of Ldbits below)
    Ldbits_u32x2  = vadd_u32(vclz_u32(Temp_u32x2x2.val[0]), vdup_n_u32(32));       // clz(LSB)+32
    Ldbits2_u32x2 = vclz_u32(Temp_u32x2x2.val[1]);                                 // clz(MSB)
    Ldbits_u32x2  = vbsl_u32(bsl_u32x2, Ldbits_u32x2, Ldbits2_u32x2);              // MSB == 0 ? clz(LSB)+32 : clz(MSB)
    bsl_u32x2     = vceq_u32(Ldbits_u32x2, vdup_n_u32(64));                        // Ldbits == 64 ? (i.e. TempX == 0 ?)
    // the aim of MaxShiftBits is that sample will be shifted so that it occupies
    // 24 significant bits for 24 bits samples or 32 significant bits for 32 bits samples
    // but we are in 64 bits architecture on CA9/NEON
    // so we must right shift of ((64 - 24) - (Ldbits - 1)) bits for 24 bits samples
    // or of ((64 - 32) - (Ldbits - 1)) bits for 32 bits samples
    // and we add 1 because it was done this way on MMDSP (I don't know why !)
#ifdef SAMPLES_24_BITS
    // MaxShiftBits = ((64 - 24) - (Ldbits - 1)) + 1
    //              = 42 - Ldbits
    Ldbits_32x2     = vsub_s32(vdup_n_s32(42), vreinterpret_s32_u32(Ldbits_u32x2));
#else // SAMPLES_24_BITS
    // MaxShiftBits = ((64 - 32) - (Ldbits - 1)) + 1
    //              = 34 - Ldbits
    Ldbits_32x2     = vsub_s32(vdup_n_s32(34), vreinterpret_s32_u32(Ldbits_u32x2));
#endif // SAMPLES_24_BITS
    Ldbits_32x2     = vmax_s32(vdup_n_s32(1), Ldbits_32x2);
    Ldbits_32x2     = vbsl_s32(bsl_u32x2, vdup_n_s32(1), Ldbits_32x2);              // if(TempX == 0) Ldbits = 1
    MaxShiftBits    = vget_lane_s32(Ldbits_32x2, 0);

    STOP_PMU_MEASURE(PMU_MEASURE_MRDC5B_LIMITER_COMPUTE_MAX_SHIFT_LEFT)
#ifdef DEBUG_LIMITER_OUTPUT
    if((debug_cpt_samples >= DEBUG_CPT_MIN) && (debug_cpt_samples <= DEBUG_CPT_MAX))
    {
        char string[100];

        debug_write_string("MRDC5B_LIMITER_COMPUTE_MAX_SHIFT_LEFT\n");
        sprintf(string, "MaxShiftBits=%d\n", MaxShiftBits);
        debug_write_string(string);
    }
#endif  // DEBUG_LIMITER_OUTPUT


    START_PMU_MEASURE(PMU_MEASURE_MRDC5B_LIMITER_INSERT_NEW_SUBBAND)

    // insert the new subband samples into the lookahead buffers
    RmsMeasure = HeapPtr->Limiter.RmsMeasure;

    LaIdx = (unsigned int) HeapPtr->LimiterLaIdx;
    if(LaIdx + Samples >= lookahead_len)
    {
        cpt1                  = lookahead_len - LaIdx;
        cpt2                  = Samples - cpt1;
        // update index
        HeapPtr->LimiterLaIdx = (int) cpt2;
    }
    else
    {
        cpt1                  = Samples;
        cpt2                  = 0;
        // update index
        HeapPtr->LimiterLaIdx = (int) (LaIdx + Samples);
    }

    LimiterPeakCoef_32x2  = vdup_n_s32(HeapPtr->LimiterPeakAtCoef);                               // LimiterPeakAtCoef, LimiterPeakAtCoef
    LimiterPeakCoef_32x2  = vset_lane_s32(HeapPtr->LimiterPeakReCoef, LimiterPeakCoef_32x2, 1);   // LimiterPeakAtCoef, LimiterPeakReCoef
    Peak_scale_64x1       = vdup_n_s64(HeapPtr->PrevShiftBits - MaxShiftBits);
    Peak_scale_pow_64x1   = vshl_n_s64(Peak_scale_64x1, 1);
    MaxShiftBits_neg_64x1 = vdup_n_s64(-MaxShiftBits);
#ifdef SAMPLES_24_BITS
    MaxShiftBits_hd_64x1  = vdup_n_s64(24 - MaxShiftBits);
#else // SAMPLES_24_BITS
    MaxShiftBits_hd_64x1  = vdup_n_s64(32 - MaxShiftBits);
#endif // SAMPLES_24_BITS
    PeakMax_64x1          = vdup_n_s64(0);

    for(ch = 0; ch < NumMainCh; ch++)
    {
        Ptr  = HeapPtr->MainInBuf[ch];
        Ptr2 = HeapPtr->LimiterLABuf[ch] + LaIdx;  // go to the first valid sample

        Peak_64x1 = vdup_n_s64(HeapPtr->LimiterPeak[ch]);
        if(RmsMeasure)
        {
            // compensate Peak according to the previous shift bits
            Peak_64x1 = vqrshl_s64(Peak_64x1, Peak_scale_pow_64x1);                                 // neg value => shift right rounding

            // rms measure
            for(k = cpt1; k > 0; k--)
            {
                Tmp_64x1        = vld1_s64(Ptr);
                Ptr++;
                vst1_s64(Ptr2, Tmp_64x1);
                Ptr2++;
                Tmp_64x1        = vqrshl_s64(Tmp_64x1, MaxShiftBits_neg_64x1);
                Tmp_64x2        = vcombine_s64(Tmp_64x1, Tmp_64x1);
                Tmp_32x2x2      = vuzp_s32(vget_low_s32(vreinterpretq_s32_s64(Tmp_64x2)), vget_high_s32(vreinterpretq_s32_s64(Tmp_64x2)));
                Tmp_32x2        = Tmp_32x2x2.val[0];                                                // LSB of Tmp_64x2 (MSB is dummy)
                TempX_64x2      = vqdmull_s32(Tmp_32x2, Tmp_32x2);
                TempX_64x1      = vget_low_s64(TempX_64x2);
                diffX_64x1      = vqsub_s64(Peak_64x1, TempX_64x1);
                bsl_u64x1       = vreinterpret_u64_s64(vshr_n_s64(diffX_64x1, 63));                 // sign(diffX)
                diffX_64x2      = vcombine_s64(diffX_64x1, diffX_64x1);
                diffX_low_32x2  = vshrn_n_s64(vshlq_n_s64(diffX_64x2, 32), 32);                     // wextract_l(diffX), wextract_l(diffX)
                diffX_high_32x2 = vrshrn_n_s64(diffX_64x2, 32);                                     // wround_L(diffX), wround_L(diffX)
                Tmp_64x2        = vmovl_s32(vqrdmulh_s32(LimiterPeakCoef_32x2, diffX_low_32x2));    // (MMlong) wfmulr(wextract_l(diffX), LimiterPeakAtCoef), (MMlong) wfmulr(wextract_l(diffX), LimiterPeakReCoef)
                Tmp_64x2        = vqdmlal_s32(Tmp_64x2, LimiterPeakCoef_32x2, diffX_high_32x2);     // wL_addsat((MMlong) wfmulr(wextract_l(diffX), LimiterPeakAtCoef), wL_fmul(wround_L(diffX), LimiterPeakAtCoef)), wL_addsat((MMlong) wfmulr(wextract_l(diffX), LimiterPeakReCoef), wL_fmul(wround_L(diffX), LimiterPeakReCoef))
                Tmp_64x2        = vqaddq_s64(TempX_64x2, Tmp_64x2);
                Peak_64x1       = vbsl_s64(bsl_u64x1, vget_low_s64(Tmp_64x2), vget_high_s64(Tmp_64x2));
                Tmp_64x1        = vqsub_s64(Peak_64x1, PeakMax_64x1);
                bsl_u64x1       = vreinterpret_u64_s64(vshr_n_s64(Tmp_64x1, 63));                   // sign(Peak_64x1 - PeakMax_64x1)
                PeakMax_64x1    = vbsl_s64(bsl_u64x1, PeakMax_64x1, Peak_64x1);
            }
            Ptr2 = HeapPtr->LimiterLABuf[ch];
            for(k = cpt2; k > 0; k--)
            {
                Tmp_64x1        = vld1_s64(Ptr);
                Ptr++;
                vst1_s64(Ptr2, Tmp_64x1);
                Ptr2++;
                Tmp_64x1        = vqrshl_s64(Tmp_64x1, MaxShiftBits_neg_64x1);
                Tmp_64x2        = vcombine_s64(Tmp_64x1, Tmp_64x1);
                Tmp_32x2x2      = vuzp_s32(vget_low_s32(vreinterpretq_s32_s64(Tmp_64x2)), vget_high_s32(vreinterpretq_s32_s64(Tmp_64x2)));
                Tmp_32x2        = Tmp_32x2x2.val[0];                                                // LSB of Tmp_64x2 (MSB is dummy)
                TempX_64x2      = vqdmull_s32(Tmp_32x2, Tmp_32x2);
                TempX_64x1      = vget_low_s64(TempX_64x2);
                diffX_64x1      = vqsub_s64(Peak_64x1, TempX_64x1);
                bsl_u64x1       = vreinterpret_u64_s64(vshr_n_s64(diffX_64x1, 63));                 // sign(diffX)
                diffX_64x2      = vcombine_s64(diffX_64x1, diffX_64x1);
                diffX_low_32x2  = vshrn_n_s64(vshlq_n_s64(diffX_64x2, 32), 32);                     // wextract_l(diffX), wextract_l(diffX)
                diffX_high_32x2 = vrshrn_n_s64(diffX_64x2, 32);                                     // wround_L(diffX), wround_L(diffX)
                Tmp_64x2        = vmovl_s32(vqrdmulh_s32(LimiterPeakCoef_32x2, diffX_low_32x2));    // (MMlong) wfmulr(wextract_l(diffX), LimiterPeakAtCoef), (MMlong) wfmulr(wextract_l(diffX), LimiterPeakReCoef)
                Tmp_64x2        = vqdmlal_s32(Tmp_64x2, LimiterPeakCoef_32x2, diffX_high_32x2);     // wL_addsat((MMlong) wfmulr(wextract_l(diffX), LimiterPeakAtCoef), wL_fmul(wround_L(diffX), LimiterPeakAtCoef)), wL_addsat((MMlong) wfmulr(wextract_l(diffX), LimiterPeakReCoef), wL_fmul(wround_L(diffX), LimiterPeakReCoef))
                Tmp_64x2        = vqaddq_s64(TempX_64x2, Tmp_64x2);
                Peak_64x1       = vbsl_s64(bsl_u64x1, vget_low_s64(Tmp_64x2), vget_high_s64(Tmp_64x2));
                Tmp_64x1        = vqsub_s64(Peak_64x1, PeakMax_64x1);
                bsl_u64x1       = vreinterpret_u64_s64(vshr_n_s64(Tmp_64x1, 63));                   // sign(Peak_64x1 - PeakMax_64x1)
                PeakMax_64x1    = vbsl_s64(bsl_u64x1, PeakMax_64x1, Peak_64x1);
            }
        }
        else
        {
            // compensate Peak according to the previous shift bits
            Peak_64x1  = vqrshl_s64(Peak_64x1, Peak_scale_64x1);

            // amplitude measure
            Zero_s64x1 = vdup_n_s64(0);
            for(k = cpt1; k > 0; k--)
            {
                Tmp_64x1        = vld1_s64(Ptr);
                Ptr++;
                vst1_s64(Ptr2, Tmp_64x1);
                Ptr2++;
                bsl_u64x1       = vreinterpret_u64_s64(vshr_n_s64(Tmp_64x1, 63));                   // sign(Tmp_64x1)
                TempX_64x1      = vqsub_s64(Zero_s64x1, Tmp_64x1);                                  // -Tmp_64x1
                TempX_64x1      = vbsl_s64(bsl_u64x1, TempX_64x1, Tmp_64x1);
                TempX_64x1      = vqrshl_s64(TempX_64x1, MaxShiftBits_hd_64x1);
                TempX_64x2      = vcombine_s64(TempX_64x1, TempX_64x1);
                diffX_64x1      = vqsub_s64(Peak_64x1, TempX_64x1);
                bsl_u64x1       = vreinterpret_u64_s64(vshr_n_s64(diffX_64x1, 63));                 // sign(diffX)
                diffX_64x2      = vcombine_s64(diffX_64x1, diffX_64x1);
                diffX_low_32x2  = vshrn_n_s64(vshlq_n_s64(diffX_64x2, 32), 32);                     // wextract_l(diffX), wextract_l(diffX)
                diffX_high_32x2 = vrshrn_n_s64(diffX_64x2, 32);                                     // wround_L(diffX), wround_L(diffX)
                Tmp_64x2        = vmovl_s32(vqrdmulh_s32(LimiterPeakCoef_32x2, diffX_low_32x2));    // (MMlong) wfmulr(wextract_l(diffX), LimiterPeakAtCoef), (MMlong) wfmulr(wextract_l(diffX), LimiterPeakReCoef)
                Tmp_64x2        = vqdmlal_s32(Tmp_64x2, LimiterPeakCoef_32x2, diffX_high_32x2);     // wL_addsat((MMlong) wfmulr(wextract_l(diffX), LimiterPeakAtCoef), wL_fmul(wround_L(diffX), LimiterPeakAtCoef)), wL_addsat((MMlong) wfmulr(wextract_l(diffX), LimiterPeakReCoef), wL_fmul(wround_L(diffX), LimiterPeakReCoef))
                Tmp_64x2        = vqaddq_s64(TempX_64x2, Tmp_64x2);
                Peak_64x1       = vbsl_s64(bsl_u64x1, vget_low_s64(Tmp_64x2), vget_high_s64(Tmp_64x2));
                Tmp_64x1        = vqsub_s64(Peak_64x1, PeakMax_64x1);
                bsl_u64x1       = vreinterpret_u64_s64(vshr_n_s64(Tmp_64x1, 63));                   // sign(Peak_64x1 - PeakMax_64x1)
                PeakMax_64x1    = vbsl_s64(bsl_u64x1, PeakMax_64x1, Peak_64x1);
            }
            Ptr2 = HeapPtr->LimiterLABuf[ch];
            for(k = cpt2; k > 0; k--)
            {
                Tmp_64x1        = vld1_s64(Ptr);
                Ptr++;
                vst1_s64(Ptr2, Tmp_64x1);
                Ptr2++;
                bsl_u64x1       = vreinterpret_u64_s64(vshr_n_s64(Tmp_64x1, 63));                   // sign(Tmp_64x1)
                TempX_64x1      = vqsub_s64(Zero_s64x1, Tmp_64x1);                                  // -Tmp_64x1
                TempX_64x1      = vbsl_s64(bsl_u64x1, TempX_64x1, Tmp_64x1);
                TempX_64x1      = vqrshl_s64(TempX_64x1, MaxShiftBits_hd_64x1);
                TempX_64x2      = vcombine_s64(TempX_64x1, TempX_64x1);
                diffX_64x1      = vqsub_s64(Peak_64x1, TempX_64x1);
                bsl_u64x1       = vreinterpret_u64_s64(vshr_n_s64(diffX_64x1, 63));                 // sign(diffX)
                diffX_64x2      = vcombine_s64(diffX_64x1, diffX_64x1);
                diffX_low_32x2  = vshrn_n_s64(vshlq_n_s64(diffX_64x2, 32), 32);                     // wextract_l(diffX), wextract_l(diffX)
                diffX_high_32x2 = vrshrn_n_s64(diffX_64x2, 32);                                     // wround_L(diffX), wround_L(diffX)
                Tmp_64x2        = vmovl_s32(vqrdmulh_s32(LimiterPeakCoef_32x2, diffX_low_32x2));    // (MMlong) wfmulr(wextract_l(diffX), LimiterPeakAtCoef), (MMlong) wfmulr(wextract_l(diffX), LimiterPeakReCoef)
                Tmp_64x2        = vqdmlal_s32(Tmp_64x2, LimiterPeakCoef_32x2, diffX_high_32x2);     // wL_addsat((MMlong) wfmulr(wextract_l(diffX), LimiterPeakAtCoef), wL_fmul(wround_L(diffX), LimiterPeakAtCoef)), wL_addsat((MMlong) wfmulr(wextract_l(diffX), LimiterPeakReCoef), wL_fmul(wround_L(diffX), LimiterPeakReCoef))
                Tmp_64x2        = vqaddq_s64(TempX_64x2, Tmp_64x2);
                Peak_64x1       = vbsl_s64(bsl_u64x1, vget_low_s64(Tmp_64x2), vget_high_s64(Tmp_64x2));
                Tmp_64x1        = vqsub_s64(Peak_64x1, PeakMax_64x1);
                bsl_u64x1       = vreinterpret_u64_s64(vshr_n_s64(Tmp_64x1, 63));                   // sign(Peak_64x1 - PeakMax_64x1)
                PeakMax_64x1    = vbsl_s64(bsl_u64x1, PeakMax_64x1, Peak_64x1);
            }
        }

        HeapPtr->LimiterPeak[ch] = vget_lane_s64(Peak_64x1, 0);                                     // save history
    }  // for(ch = 0...)
    PeakMax                = vget_lane_s64(PeakMax_64x1, 0);
    HeapPtr->PrevShiftBits = MaxShiftBits;

    STOP_PMU_MEASURE(PMU_MEASURE_MRDC5B_LIMITER_INSERT_NEW_SUBBAND)


    if(PeakMax < MDRC5B_ALMOST_ZERO_THRESH)
    {
        PeakdB = (MDRC5B_POWER_DB_MINUS_INF << 16); // 8.16, [-128.0, 127.0] dB
    }
    else
    {
        Peak_u32x2x2    = vuzp_u32(vreinterpret_u32_s64(PeakMax_64x1), vreinterpret_u32_s64(PeakMax_64x1));
        bsl_u32x2       = vceq_u32(Peak_u32x2x2.val[1], vdup_n_u32(0));
        Peak_exp_u32x2  = vadd_u32(vclz_u32(Peak_u32x2x2.val[0]), vdup_n_u32(32));
        Peak_exp2_u32x2 = vclz_u32(Peak_u32x2x2.val[1]);
        Peak_exp_u32x2  = vbsl_u32(bsl_u32x2, Peak_exp_u32x2, Peak_exp2_u32x2);
        Peak_mant_u32x2 = vrshrn_n_u64(vshlq_u64(vreinterpretq_u64_s64(vcombine_s64(PeakMax_64x1, PeakMax_64x1)), vreinterpretq_s64_u64(vmovl_u32(Peak_exp_u32x2))), 32);

        // if(Peak_mant >= sqrt(0.5))
        // {
        //     Peak_exp--;
        //     Peak_mant >>= 1;
        // }
        bsl_u32x2       = vcge_u32(Peak_mant_u32x2, vdup_n_u32(0xB504F334));
        Peak_exp_u32x2  = vbsl_u32(bsl_u32x2, vsub_u32(Peak_exp_u32x2, vdup_n_u32(1)), Peak_exp_u32x2);
        Peak_mant_u32x2 = vbsl_u32(bsl_u32x2, vrshr_n_u32(Peak_mant_u32x2, 1), Peak_mant_u32x2);

        Peak_exp_32x2 = vreinterpret_s32_u32(Peak_exp_u32x2);
#ifdef SAMPLES_24_BITS
        // correction of 16 bits if input samples are 24 bits
        Peak_exp_32x2 = vsub_s32(Peak_exp_32x2, vdup_n_s32(16));
#endif // SAMPLES_24_BITS

        // at this point : sqrt(0.5)/2 <= Peak_mant < sqrt(0.5)
        //
        // ln(1+x) = x - x^2/2 + x^3/3 - x^4/4 + x^5/5 - x^6/6 + x^7/7 - x^8/8 + x^9/9 - x^10/10 ...    accuracy OK if |x| < 0.5
        // sqrt(0.5)/2 <= Peak_mant < sqrt(0.5)  =>  sqrt(0.5)-1 <= 2*Peak_mant-1 < 2*sqrt(0.5)-1
        //                                       =>  ln(Peak_mant) = ln(1+x)-ln(2) with x=2*Peak_mant-1, i.e. |x| < 0.414214...

        // x=2*PeakMax_mant-1 in Q31
        // => sqrt(0.5)-1 <= x < 2*sqrt(0.5)-1
        x_32x2      = vreinterpret_s32_u32(vsub_u32(Peak_mant_u32x2, vdup_n_u32(0x80000000)));

        PeakdB_32x2 = x_32x2;                                                                     // PeakdB = x

        xn_32x2     = vqrdmulh_s32(x_32x2, x_32x2);                                               // xn = x^2
        PeakdB_32x2 = vqsub_s32(PeakdB_32x2, vrshr_n_s32(xn_32x2, 1));                            // PeakdB = x - x^2/2

        xn_32x2     = vqrdmulh_s32(xn_32x2, x_32x2);                                              // xn = x^3
        PeakdB_32x2 = vqadd_s32(PeakdB_32x2, vqrdmulh_s32(xn_32x2, vdup_n_s32(0x2AAAAAAB)));      // PeakdB = x - x^2/2 + x^3/3

        xn_32x2     = vqrdmulh_s32(xn_32x2, x_32x2);                                              // xn = x^4
        PeakdB_32x2 = vqsub_s32(PeakdB_32x2, vrshr_n_s32(xn_32x2, 2));                            // PeakdB = x - x^2/2 + x^3/3 - x^4/4

        xn_32x2     = vqrdmulh_s32(xn_32x2, x_32x2);                                              // xn = x^5
        PeakdB_32x2 = vqadd_s32(PeakdB_32x2, vqrdmulh_s32(xn_32x2, vdup_n_s32(0x1999999A)));      // PeakdB = x - x^2/2 + x^3/3 - x^4/4 + x^5/5

        xn_32x2     = vqrdmulh_s32(xn_32x2, x_32x2);                                              // xn = x^6
        PeakdB_32x2 = vqsub_s32(PeakdB_32x2, vqrdmulh_s32(xn_32x2, vdup_n_s32(0x15555555)));      // PeakdB = x - x^2/2 + x^3/3 - x^4/4 + x^5/5 - x^6/6

        xn_32x2     = vqrdmulh_s32(xn_32x2, x_32x2);                                              // xn = x^7
        PeakdB_32x2 = vqadd_s32(PeakdB_32x2, vqrdmulh_s32(xn_32x2, vdup_n_s32(0x12492492)));      // PeakdB = x - x^2/2 + x^3/3 - x^4/4 + x^5/5 - x^6/6 + x^7/7

        xn_32x2     = vqrdmulh_s32(xn_32x2, x_32x2);                                              // xn = x^8
        PeakdB_32x2 = vqsub_s32(PeakdB_32x2, vrshr_n_s32(xn_32x2, 3));                            // PeakdB = x - x^2/2 + x^3/3 - x^4/4 + x^5/5 - x^6/6 + x^7/7 - x^8/8

        xn_32x2     = vqrdmulh_s32(xn_32x2, x_32x2);                                              // xn = x^9
        PeakdB_32x2 = vqadd_s32(PeakdB_32x2, vqrdmulh_s32(xn_32x2, vdup_n_s32(0x0E38E38E)));      // PeakdB = x - x^2/2 + x^3/3 - x^4/4 + x^5/5 - x^6/6 + x^7/7 - x^8/8 + x^9/9

        xn_32x2     = vqrdmulh_s32(xn_32x2, x_32x2);                                              // xn = x^10
        PeakdB_32x2 = vqsub_s32(PeakdB_32x2, vqrdmulh_s32(xn_32x2, vdup_n_s32(0x0CCCCCCD)));      // PeakdB = x - x^2/2 + x^3/3 - x^4/4 + x^5/5 - x^6/6 + x^7/7 - x^8/8 + x^9/9 - x^10/10

        // at this point : PeakMaxdB contains ln(1+x) in Q31

        if(RmsMeasure)
        {
            // dB(power) = 10*log10(power)

            // PeakMaxdB = 10*log10(PeakMax)+20*log10(2)*(HEADROOM+MaxShiftBits)
            //           = 10*ln(PeakMax)/ln(10)+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits)
            //           = 10/ln(10)*ln(PeakMax_mant*2^(-PeakMax_exp))+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits)
            //           = 10/ln(10)*(ln(PeakMax_mant)-PeakMax_exp*ln(2))+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits)
            //           = 10/ln(10)*ln(PeakMax_mant)-PeakMax_exp*10*ln(2)/ln(10)+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits)
            //           = 10/ln(10)*ln(PeakMax_mant)+10*ln(2)/ln(10)*(2*(HEADROOM+MaxShiftBits)-PeakMax_exp)
            //
            // => RmsdB = 10/ln(10)*ln(1+x)+10*ln(2)/ln(10)*(2*(HEADROOM+MaxShiftBits)-PeakMax_exp)
            // => RmsdB (Q16) = 0x457CB*ln(1+x)+0x302A3*(2*(HEADROOM+MaxShiftBits)-PeakMax_exp)

            // fractional multiply 0x457CB*ln(1+x) in Q16
            PeakdB_32x2   = vqrdmulh_s32(PeakdB_32x2, vdup_n_s32(0x457CB));

            // PeakdB_exp = 2*(HEADROOM+MaxShiftBits)-PeakdB_exp
            Peak_exp_32x2 = vsub_s32(vdup_n_s32(2 * (HEADROOM + MaxShiftBits)), Peak_exp_32x2);

            // PeakMaxdB final value (integer mac 0x302A3*PeakdB_exp)
            PeakdB_32x2   = vmla_s32(PeakdB_32x2, Peak_exp_32x2, vdup_n_s32(0x302A3));
        }
        else
        {
            // dB(power) = 20*log10(abs)

            // PeakMaxdB = 20*log10(PeakMax)+20*log10(2)*(HEADROOM+MaxShiftBits)
            //           = 20*ln(PeakMax)/ln(10)+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits)
            //           = 20/ln(10)*ln(PeakMax_mant*2^(-PeakMax_exp))+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits)
            //           = 20/ln(10)*(ln(PeakMax_mant)-PeakMax_exp*ln(2))+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits)
            //           = 20/ln(10)*ln(PeakMax_mant)-PeakMax_exp*20*ln(2)/ln(10)+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits)
            //           = 20/ln(10)*ln(PeakMax_mant)+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits-PeakMax_exp)
            //
            // => RmsdB = 20/ln(10)*ln(1+x)+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits-PeakMax_exp)
            // => RmsdB (Q16) = 0x8AF96*ln(1+x)+0x60546*(HEADROOM+MaxShiftBits-PeakMax_exp)

            // fractional multiply 0x8AF96*ln(1+x) in Q16
            PeakdB_32x2     = vqrdmulh_s32(PeakdB_32x2, vdup_n_s32(0x8AF96));

            // PeakdB_exp = HEADROOM+MaxShiftBits-PeakdB_exp
            Peak_exp_32x2 = vsub_s32(vdup_n_s32(HEADROOM + MaxShiftBits), Peak_exp_32x2);

            // PeakMaxdB final value (integer mac 0x60546*PeakdB_exp)
            PeakdB_32x2     = vmla_s32(PeakdB_32x2, Peak_exp_32x2, vdup_n_s32(0x60546));
        }
        PeakdB = vget_lane_s32(PeakdB_32x2, 0);
    }
#ifdef DEBUG_LIMITER_OUTPUT
    if((debug_cpt_samples >= DEBUG_CPT_MIN) && (debug_cpt_samples <= DEBUG_CPT_MAX))
    {
        char string[100];

        debug_write_string("MRDC5B_LIMITER_PEAKMAX_PEAKDB\n");
        sprintf(string, "PeakMax=0x%012llX, HEADROOM+MaxShiftBits=%d => PeakdB=0x%06X\n",
#ifdef SAMPLES_24_BITS
                        PeakMax & 0xFFFFFFFFFFFFLL,
#else // SAMPLES_24_BITS
                        (PeakMax >> 16) & 0xFFFFFFFFFFFFLL,
#endif // SAMPLES_24_BITS
                        HEADROOM + MaxShiftBits,
                        PeakdB & 0xFFFFFF);
        debug_write_string(string);
    }
Ejemplo n.º 6
0
// Codegen test for the vget_lane_s64 intrinsic: the function returns lane 0
// of the single-lane 64-bit vector `a` as a scalar int64_t.
// NOTE(review): the directive comments below look like LLVM FileCheck patterns
// describing the expected IR (two bitcasts followed by an extractelement of
// element 0, then a ret of that value) - confirm against the test's RUN line.
// Keep the function body unchanged so the patterns continue to match.
// CHECK-LABEL: define i64 @test_vget_lane_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
// CHECK:   ret i64 [[VGET_LANE]]
int64_t test_vget_lane_s64(int64x1_t a) {
  // Lane index 0 is the only lane an int64x1_t has.
  return vget_lane_s64(a, 0);
}
Ejemplo n.º 7
0
// Dot product of two signed 8-bit images.
//
// _size                  width and height (in elements) shared by both images
// src0Base, src0Stride   first image and its row stride, as consumed by internal::getRowPtr
// src1Base, src1Stride   second image and its row stride, as consumed by internal::getRowPtr
//
// Returns the sum over every row and column of src0[i] * src1[i] as f64.
// When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
f64 dotProduct(const Size2D &_size,
               const s8 * src0Base, ptrdiff_t src0Stride,
               const s8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // When both images share a stride equal to the row width, treat the whole
    // image as a single long row so the vector loop runs without row breaks.
    if (src0Stride == src1Stride &&
        src0Stride == static_cast<ptrdiff_t>(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }

// It is possible to accumulate up to 131071 schar multiplication results in sint32 without overflow
// We process 16 elements and accumulate two new elements per step. So we could handle 131071/2*16 elements
#define DOT_INT_BLOCKSIZE 131070*8
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const s8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const s8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t i = 0;
        // 64-bit per-row accumulator; the 32-bit block sums are folded into it.
        int64x2_t ws = vmovq_n_s64(0);

        while(i + 16 <= size.width)
        {
            // Bound each block by the signed block size defined above so the
            // 32-bit lane accumulators s1/s2 cannot overflow before they are
            // widened into ws.
            size_t lim = std::min(i + DOT_INT_BLOCKSIZE, size.width) - 16;

            int32x4_t s1 = vmovq_n_s32(0);
            int32x4_t s2 = vmovq_n_s32(0);

            for (; i <= lim; i += 16)
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);

                int8x16_t vs1 = vld1q_s8(src0 + i);
                int8x16_t vs2 = vld1q_s8(src1 + i);

                // Widening 8x8 -> 16-bit multiplies of the low and high halves.
                int16x8_t vdot1 = vmull_s8(vget_low_s8(vs1), vget_low_s8(vs2));
                int16x8_t vdot2 = vmull_s8(vget_high_s8(vs1), vget_high_s8(vs2));

                // Pairwise add adjacent 16-bit products and accumulate in 32-bit lanes.
                s1 = vpadalq_s16(s1, vdot1);
                s2 = vpadalq_s16(s2, vdot2);
            }

            // Widen the finished block into the 64-bit accumulator.
            ws = vpadalq_s32(ws, s1);
            ws = vpadalq_s32(ws, s2);
        }

        // At most one remaining group of 8 elements.
        if(i + 8 <= size.width)
        {
            int8x8_t vs1 = vld1_s8(src0 + i);
            int8x8_t vs2 = vld1_s8(src1 + i);

            ws = vpadalq_s32(ws, vpaddlq_s16(vmull_s8(vs1, vs2)));
            i += 8;
        }

        // Horizontal sum of the two 64-bit lanes.
        result += static_cast<double>(vget_lane_s64(vadd_s64(vget_low_s64(ws), vget_high_s64(ws)), 0));

        // Scalar tail for the last (fewer than 8) elements of the row.
        for (; i < size.width; ++i)
            result += s32(src0[i]) * s32(src1[i]);
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}