/* Compute reflection coefficients from input signal */
void silk_burg_modified_sse4_1(
    opus_int32                  *res_nrg,           /* O    Residual energy                                             */
    opus_int                    *res_nrg_Q,         /* O    Residual energy Q value                                     */
    opus_int32                  A_Q16[],            /* O    Prediction coefficients (length order)                      */
    const opus_int16            x[],                /* I    Input signal, length: nb_subfr * (D + subfr_length)       */
    const opus_int32            minInvGain_Q30,     /* I    Inverse of max prediction gain                              */
    const opus_int              subfr_length,       /* I    Input signal subframe length (incl. D preceding samples)    */
    const opus_int              nb_subfr,           /* I    Number of subframes stacked in x                            */
    const opus_int              D,                  /* I    Order                                                       */
    int                         arch                /* I    Run-time architecture                                       */
)
{
    opus_int         k, n, s, lz, rshifts, rshifts_extra, reached_max_gain;
    opus_int32       C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2;
    const opus_int16 *x_ptr;
    opus_int32       C_first_row[ SILK_MAX_ORDER_LPC ];
    opus_int32       C_last_row[  SILK_MAX_ORDER_LPC ];
    opus_int32       Af_QA[       SILK_MAX_ORDER_LPC ];
    opus_int32       CAf[ SILK_MAX_ORDER_LPC + 1 ];
    opus_int32       CAb[ SILK_MAX_ORDER_LPC + 1 ];
    opus_int32       xcorr[ SILK_MAX_ORDER_LPC ];

    __m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210;
    __m128i CONST1 = _mm_set1_epi32(1);

    silk_assert(subfr_length * nb_subfr <= MAX_FRAME_SIZE);

    /* Compute autocorrelations, added over subframes */
    silk_sum_sqr_shift(&C0, &rshifts, x, nb_subfr * subfr_length);
    if(rshifts > MAX_RSHIFTS) {
        C0 = silk_LSHIFT32(C0, rshifts - MAX_RSHIFTS);
        silk_assert(C0 > 0);
        rshifts = MAX_RSHIFTS;
    } else {
        lz = silk_CLZ32(C0) - 1;
        rshifts_extra = N_BITS_HEAD_ROOM - lz;
        if(rshifts_extra > 0) {
            rshifts_extra = silk_min(rshifts_extra, MAX_RSHIFTS - rshifts);
            C0 = silk_RSHIFT32(C0, rshifts_extra);
        } else {
            rshifts_extra = silk_max(rshifts_extra, MIN_RSHIFTS - rshifts);
            C0 = silk_LSHIFT32(C0, -rshifts_extra);
        }
        rshifts += rshifts_extra;
    }
    CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL(SILK_FIX_CONST(FIND_LPC_COND_FAC, 32), C0) + 1;                                /* Q(-rshifts) */
    silk_memset(C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof(opus_int32));
    if(rshifts > 0) {
        for(s = 0; s < nb_subfr; s++) {
            x_ptr = x + s * subfr_length;
            for(n = 1; n < D + 1; n++) {
                C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
                    silk_inner_prod16_aligned_64(x_ptr, x_ptr + n, subfr_length - n, arch), rshifts);
            }
        }
    } else {
        for(s = 0; s < nb_subfr; s++) {
            int i;
            opus_int32 d;
            x_ptr = x + s * subfr_length;
            celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D, arch);
            for(n = 1; n < D + 1; n++) {
               for (i = n + subfr_length - D, d = 0; i < subfr_length; i++)
                  d = MAC16_16(d, x_ptr[ i ], x_ptr[ i - n ]);
               xcorr[ n - 1 ] += d;
            }
            for(n = 1; n < D + 1; n++) {
                C_first_row[ n - 1 ] += silk_LSHIFT32(xcorr[ n - 1 ], -rshifts);
            }
        }
    }
    silk_memcpy(C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof(opus_int32));

    /* Initialize */
    CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL(SILK_FIX_CONST(FIND_LPC_COND_FAC, 32), C0) + 1;                                /* Q(-rshifts) */

    invGain_Q30 = (opus_int32)1 << 30;
    reached_max_gain = 0;
    for(n = 0; n < D; n++) {
        /* Update first row of correlation matrix (without first element) */
        /* Update last row of correlation matrix (without last element, stored in reversed order) */
        /* Update C * Af */
        /* Update C * flipud(Af) (stored in reversed order) */
        if(rshifts > -2) {
            for(s = 0; s < nb_subfr; s++) {
                x_ptr = x + s * subfr_length;
                x1  = -silk_LSHIFT32((opus_int32)x_ptr[ n ],                    16 - rshifts);        /* Q(16-rshifts) */
                x2  = -silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], 16 - rshifts);        /* Q(16-rshifts) */
                tmp1 = silk_LSHIFT32((opus_int32)x_ptr[ n ],                    QA - 16);             /* Q(QA-16) */
                tmp2 = silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], QA - 16);             /* Q(QA-16) */
                for(k = 0; k < n; k++) {
                    C_first_row[ k ] = silk_SMLAWB(C_first_row[ k ], x1, x_ptr[ n - k - 1 ]           ); /* Q(-rshifts) */
                    C_last_row[ k ]  = silk_SMLAWB(C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ]); /* Q(-rshifts) */
                    Atmp_QA = Af_QA[ k ];
                    tmp1 = silk_SMLAWB(tmp1, Atmp_QA, x_ptr[ n - k - 1 ]           );                 /* Q(QA-16) */
                    tmp2 = silk_SMLAWB(tmp2, Atmp_QA, x_ptr[ subfr_length - n + k ]);                 /* Q(QA-16) */
                }
                tmp1 = silk_LSHIFT32(-tmp1, 32 - QA - rshifts);                                       /* Q(16-rshifts) */
                tmp2 = silk_LSHIFT32(-tmp2, 32 - QA - rshifts);                                       /* Q(16-rshifts) */
                for(k = 0; k <= n; k++) {
                    CAf[ k ] = silk_SMLAWB(CAf[ k ], tmp1, x_ptr[ n - k ]                   );        /* Q(-rshift) */
                    CAb[ k ] = silk_SMLAWB(CAb[ k ], tmp2, x_ptr[ subfr_length - n + k - 1 ]);        /* Q(-rshift) */
                }
            }
        } else {
            for(s = 0; s < nb_subfr; s++) {
                x_ptr = x + s * subfr_length;
                x1  = -silk_LSHIFT32((opus_int32)x_ptr[ n ],                    -rshifts);            /* Q(-rshifts) */
                x2  = -silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], -rshifts);            /* Q(-rshifts) */
                tmp1 = silk_LSHIFT32((opus_int32)x_ptr[ n ],                    17);                  /* Q17 */
                tmp2 = silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], 17);                  /* Q17 */

                X1_3210 = _mm_set1_epi32(x1);
                X2_3210 = _mm_set1_epi32(x2);
                TMP1_3210 = _mm_setzero_si128();
                TMP2_3210 = _mm_setzero_si128();
                for(k = 0; k < n - 3; k += 4) {
                    PTR_3210   = OP_CVTEPI16_EPI32_M64(&x_ptr[ n - k - 1 - 3 ]);
                    SUBFR_3210 = OP_CVTEPI16_EPI32_M64(&x_ptr[ subfr_length - n + k ]);
                    FIRST_3210 = _mm_loadu_si128((__m128i *)&C_first_row[ k ]);
                    PTR_3210   = _mm_shuffle_epi32(PTR_3210,  _MM_SHUFFLE(0, 1, 2, 3));
                    LAST_3210  = _mm_loadu_si128((__m128i *)&C_last_row[ k ]);
                    ATMP_3210  = _mm_loadu_si128((__m128i *)&Af_QA[ k ]);

                    T1_3210 = _mm_mullo_epi32(PTR_3210, X1_3210);
                    T2_3210 = _mm_mullo_epi32(SUBFR_3210, X2_3210);

                    ATMP_3210 = _mm_srai_epi32(ATMP_3210, 7);
                    ATMP_3210 = _mm_add_epi32(ATMP_3210, CONST1);
                    ATMP_3210 = _mm_srai_epi32(ATMP_3210, 1);

                    FIRST_3210 = _mm_add_epi32(FIRST_3210, T1_3210);
                    LAST_3210 = _mm_add_epi32(LAST_3210, T2_3210);

                    PTR_3210   = _mm_mullo_epi32(ATMP_3210, PTR_3210);
                    SUBFR_3210   = _mm_mullo_epi32(ATMP_3210, SUBFR_3210);

                    _mm_storeu_si128((__m128i *)&C_first_row[ k ], FIRST_3210);
                    _mm_storeu_si128((__m128i *)&C_last_row[ k ], LAST_3210);

                    TMP1_3210 = _mm_add_epi32(TMP1_3210, PTR_3210);
                    TMP2_3210 = _mm_add_epi32(TMP2_3210, SUBFR_3210);
                }

                TMP1_3210 = _mm_add_epi32(TMP1_3210, _mm_unpackhi_epi64(TMP1_3210, TMP1_3210));
                TMP2_3210 = _mm_add_epi32(TMP2_3210, _mm_unpackhi_epi64(TMP2_3210, TMP2_3210));
                TMP1_3210 = _mm_add_epi32(TMP1_3210, _mm_shufflelo_epi16(TMP1_3210, 0x0E));
                TMP2_3210 = _mm_add_epi32(TMP2_3210, _mm_shufflelo_epi16(TMP2_3210, 0x0E));

                tmp1 += _mm_cvtsi128_si32(TMP1_3210);
                tmp2 += _mm_cvtsi128_si32(TMP2_3210);

                for(; k < n; k++) {
                    C_first_row[ k ] = silk_MLA(C_first_row[ k ], x1, x_ptr[ n - k - 1 ]           ); /* Q(-rshifts) */
                    C_last_row[ k ]  = silk_MLA(C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ]); /* Q(-rshifts) */
                    Atmp1 = silk_RSHIFT_ROUND(Af_QA[ k ], QA - 17);                                   /* Q17 */
                    tmp1 = silk_MLA(tmp1, x_ptr[ n - k - 1 ],            Atmp1);                      /* Q17 */
                    tmp2 = silk_MLA(tmp2, x_ptr[ subfr_length - n + k ], Atmp1);                      /* Q17 */
                }

                tmp1 = -tmp1;                /* Q17 */
                tmp2 = -tmp2;                /* Q17 */

                {
                    __m128i xmm_tmp1, xmm_tmp2;
                    __m128i xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1;
                    __m128i xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1;

                    xmm_tmp1 = _mm_set1_epi32(tmp1);
                    xmm_tmp2 = _mm_set1_epi32(tmp2);

                    for(k = 0; k <= n - 3; k += 4) {
                        xmm_x_ptr_n_k_x2x0 = OP_CVTEPI16_EPI32_M64(&x_ptr[ n - k - 3 ]);
                        xmm_x_ptr_sub_x2x0 = OP_CVTEPI16_EPI32_M64(&x_ptr[ subfr_length - n + k - 1 ]);

                        xmm_x_ptr_n_k_x2x0 = _mm_shuffle_epi32(xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE(0, 1, 2, 3));

                        xmm_x_ptr_n_k_x2x0 = _mm_slli_epi32(xmm_x_ptr_n_k_x2x0, -rshifts - 1);
                        xmm_x_ptr_sub_x2x0 = _mm_slli_epi32(xmm_x_ptr_sub_x2x0, -rshifts - 1);

                        /* equal shift right 4 bytes, xmm_x_ptr_n_k_x3x1 = _mm_srli_si128(xmm_x_ptr_n_k_x2x0, 4)*/
                        xmm_x_ptr_n_k_x3x1 = _mm_shuffle_epi32(xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE(0, 3, 2, 1));
                        xmm_x_ptr_sub_x3x1 = _mm_shuffle_epi32(xmm_x_ptr_sub_x2x0, _MM_SHUFFLE(0, 3, 2, 1));

                        xmm_x_ptr_n_k_x2x0 = _mm_mul_epi32(xmm_x_ptr_n_k_x2x0, xmm_tmp1);
                        xmm_x_ptr_n_k_x3x1 = _mm_mul_epi32(xmm_x_ptr_n_k_x3x1, xmm_tmp1);
                        xmm_x_ptr_sub_x2x0 = _mm_mul_epi32(xmm_x_ptr_sub_x2x0, xmm_tmp2);
                        xmm_x_ptr_sub_x3x1 = _mm_mul_epi32(xmm_x_ptr_sub_x3x1, xmm_tmp2);

                        xmm_x_ptr_n_k_x2x0 = _mm_srli_epi64(xmm_x_ptr_n_k_x2x0, 16);
                        xmm_x_ptr_n_k_x3x1 = _mm_slli_epi64(xmm_x_ptr_n_k_x3x1, 16);
                        xmm_x_ptr_sub_x2x0 = _mm_srli_epi64(xmm_x_ptr_sub_x2x0, 16);
                        xmm_x_ptr_sub_x3x1 = _mm_slli_epi64(xmm_x_ptr_sub_x3x1, 16);

                        xmm_x_ptr_n_k_x2x0 = _mm_blend_epi16(xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1, 0xCC);
                        xmm_x_ptr_sub_x2x0 = _mm_blend_epi16(xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1, 0xCC);

                        X1_3210  = _mm_loadu_si128((__m128i *)&CAf[ k ]);
                        PTR_3210 = _mm_loadu_si128((__m128i *)&CAb[ k ]);

                        X1_3210  = _mm_add_epi32(X1_3210, xmm_x_ptr_n_k_x2x0);
                        PTR_3210 = _mm_add_epi32(PTR_3210, xmm_x_ptr_sub_x2x0);

                        _mm_storeu_si128((__m128i *)&CAf[ k ], X1_3210);
                        _mm_storeu_si128((__m128i *)&CAb[ k ], PTR_3210);
                    }

                    for(; k <= n; k++) {
                        CAf[ k ] = silk_SMLAWW(CAf[ k ], tmp1,
                            silk_LSHIFT32((opus_int32)x_ptr[ n - k ], -rshifts - 1));                    /* Q(-rshift) */
                        CAb[ k ] = silk_SMLAWW(CAb[ k ], tmp2,
                            silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n + k - 1 ], -rshifts - 1)); /* Q(-rshift) */
                    }
                }
            }
        }

        /* Calculate nominator and denominator for the next order reflection (parcor) coefficient */
        tmp1 = C_first_row[ n ];                                                                        /* Q(-rshifts) */
        tmp2 = C_last_row[ n ];                                                                         /* Q(-rshifts) */
        num  = 0;                                                                                       /* Q(-rshifts) */
        nrg  = silk_ADD32(CAb[ 0 ], CAf[ 0 ]);                                                        /* Q(1-rshifts) */
        for(k = 0; k < n; k++) {
            Atmp_QA = Af_QA[ k ];
            lz = silk_CLZ32(silk_abs(Atmp_QA)) - 1;
            lz = silk_min(32 - QA, lz);
            Atmp1 = silk_LSHIFT32(Atmp_QA, lz);                                                       /* Q(QA + lz) */

            tmp1 = silk_ADD_LSHIFT32(tmp1, silk_SMMUL(C_last_row[  n - k - 1 ], Atmp1), 32 - QA - lz);  /* Q(-rshifts) */
            tmp2 = silk_ADD_LSHIFT32(tmp2, silk_SMMUL(C_first_row[ n - k - 1 ], Atmp1), 32 - QA - lz);  /* Q(-rshifts) */
            num  = silk_ADD_LSHIFT32(num,  silk_SMMUL(CAb[ n - k ],             Atmp1), 32 - QA - lz);  /* Q(-rshifts) */
            nrg  = silk_ADD_LSHIFT32(nrg,  silk_SMMUL(silk_ADD32(CAb[ k + 1 ], CAf[ k + 1 ]),
                                                                                Atmp1), 32 - QA - lz);    /* Q(1-rshifts) */
        }
        CAf[ n + 1 ] = tmp1;                                                                            /* Q(-rshifts) */
        CAb[ n + 1 ] = tmp2;                                                                            /* Q(-rshifts) */
        num = silk_ADD32(num, tmp2);                                                                  /* Q(-rshifts) */
        num = silk_LSHIFT32(-num, 1);                                                                 /* Q(1-rshifts) */

        /* Calculate the next order reflection (parcor) coefficient */
        if(silk_abs(num) < nrg) {
            rc_Q31 = silk_DIV32_varQ(num, nrg, 31);
        } else {
            rc_Q31 = (num > 0) ? silk_int32_MAX : silk_int32_MIN;
        }

        /* Update inverse prediction gain */
        tmp1 = ((opus_int32)1 << 30) - silk_SMMUL(rc_Q31, rc_Q31);
        tmp1 = silk_LSHIFT(silk_SMMUL(invGain_Q30, tmp1), 2);
        if(tmp1 <= minInvGain_Q30) {
            /* Max prediction gain exceeded; set reflection coefficient such that max prediction gain is exactly hit */
            tmp2 = ((opus_int32)1 << 30) - silk_DIV32_varQ(minInvGain_Q30, invGain_Q30, 30);            /* Q30 */
            rc_Q31 = silk_SQRT_APPROX(tmp2);                                                  /* Q15 */
            /* Newton-Raphson iteration */
            rc_Q31 = silk_RSHIFT32(rc_Q31 + silk_DIV32(tmp2, rc_Q31), 1);                   /* Q15 */
            rc_Q31 = silk_LSHIFT32(rc_Q31, 16);                                               /* Q31 */
            if(num < 0) {
                /* Ensure adjusted reflection coefficients has the original sign */
                rc_Q31 = -rc_Q31;
            }
            invGain_Q30 = minInvGain_Q30;
            reached_max_gain = 1;
        } else {
            invGain_Q30 = tmp1;
        }

        /* Update the AR coefficients */
        for(k = 0; k < (n + 1) >> 1; k++) {
            tmp1 = Af_QA[ k ];                                                                  /* QA */
            tmp2 = Af_QA[ n - k - 1 ];                                                          /* QA */
            Af_QA[ k ]         = silk_ADD_LSHIFT32(tmp1, silk_SMMUL(tmp2, rc_Q31), 1);      /* QA */
            Af_QA[ n - k - 1 ] = silk_ADD_LSHIFT32(tmp2, silk_SMMUL(tmp1, rc_Q31), 1);      /* QA */
        }
        Af_QA[ n ] = silk_RSHIFT32(rc_Q31, 31 - QA);                                          /* QA */

        if(reached_max_gain) {
            /* Reached max prediction gain; set remaining coefficients to zero and exit loop */
            for(k = n + 1; k < D; k++) {
                Af_QA[ k ] = 0;
            }
            break;
        }

        /* Update C * Af and C * Ab */
        for(k = 0; k <= n + 1; k++) {
            tmp1 = CAf[ k ];                                                                    /* Q(-rshifts) */
            tmp2 = CAb[ n - k + 1 ];                                                            /* Q(-rshifts) */
            CAf[ k ]         = silk_ADD_LSHIFT32(tmp1, silk_SMMUL(tmp2, rc_Q31), 1);        /* Q(-rshifts) */
            CAb[ n - k + 1 ] = silk_ADD_LSHIFT32(tmp2, silk_SMMUL(tmp1, rc_Q31), 1);        /* Q(-rshifts) */
        }
    }

    if(reached_max_gain) {
        for(k = 0; k < D; k++) {
            /* Scale coefficients */
            A_Q16[ k ] = -silk_RSHIFT_ROUND(Af_QA[ k ], QA - 16);
        }
        /* Subtract energy of preceding samples from C0 */
        if(rshifts > 0) {
            for(s = 0; s < nb_subfr; s++) {
                x_ptr = x + s * subfr_length;
                C0 -= (opus_int32)silk_RSHIFT64(silk_inner_prod16_aligned_64(x_ptr, x_ptr, D, arch), rshifts);
            }
        } else {
            for(s = 0; s < nb_subfr; s++) {
                x_ptr = x + s * subfr_length;
                C0 -= silk_LSHIFT32(silk_inner_prod_aligned(x_ptr, x_ptr, D, arch), -rshifts);
            }
        }
        /* Approximate residual energy */
        *res_nrg = silk_LSHIFT(silk_SMMUL(invGain_Q30, C0), 2);
        *res_nrg_Q = -rshifts;
    } else {
        /* Return residual energy */
        nrg  = CAf[ 0 ];                                                                            /* Q(-rshifts) */
        tmp1 = (opus_int32)1 << 16;                                                                             /* Q16 */
        for(k = 0; k < D; k++) {
            Atmp1 = silk_RSHIFT_ROUND(Af_QA[ k ], QA - 16);                                       /* Q16 */
            nrg  = silk_SMLAWW(nrg, CAf[ k + 1 ], Atmp1);                                         /* Q(-rshifts) */
            tmp1 = silk_SMLAWW(tmp1, Atmp1, Atmp1);                                               /* Q16 */
            A_Q16[ k ] = -Atmp1;
        }
        *res_nrg = silk_SMLAWW(nrg, silk_SMMUL(SILK_FIX_CONST(FIND_LPC_COND_FAC, 32), C0), -tmp1);/* Q(-rshifts) */
        *res_nrg_Q = -rshifts;
    }
}
Example #2
0
int _celt_autocorr(
                   const opus_val16 *x,   /*  in: [0...n-1] samples x   */
                   opus_val32       *ac,  /* out: [0...lag-1] ac values */
                   const opus_val16       *window,
                   int          overlap,
                   int          lag,
                   int          n,
                   int          arch
                  )
{
   opus_val32 d;
   int i, k;
   int fastN=n-lag;
   int shift;
   const opus_val16 *xptr;
   VARDECL(opus_val16, xx);
   SAVE_STACK;
   ALLOC(xx, n, opus_val16);
   celt_assert(n>0);
   celt_assert(overlap>=0);
   if (overlap == 0)
   {
      xptr = x;
   } else {
      for (i=0;i<n;i++)
         xx[i] = x[i];
      for (i=0;i<overlap;i++)
      {
         xx[i] = MULT16_16_Q15(x[i],window[i]);
         xx[n-i-1] = MULT16_16_Q15(x[n-i-1],window[i]);
      }
      xptr = xx;
   }
   shift=0;
#ifdef OPUS_FIXED_POINT
   {
      opus_val32 ac0;
      ac0 = 1+(n<<7);
      if (n&1) ac0 += SHR32(MULT16_16(xptr[0],xptr[0]),9);
      for(i=(n&1);i<n;i+=2)
      {
         ac0 += SHR32(MULT16_16(xptr[i],xptr[i]),9);
         ac0 += SHR32(MULT16_16(xptr[i+1],xptr[i+1]),9);
      }

      shift = celt_ilog2(ac0)-30+10;
      shift = (shift)/2;
      if (shift>0)
      {
         for(i=0;i<n;i++)
            xx[i] = PSHR32(xptr[i], shift);
         xptr = xx;
      } else
         shift = 0;
   }
#endif
   celt_pitch_xcorr(xptr, xptr, ac, fastN, lag+1, arch);
   for (k=0;k<=lag;k++)
   {
      for (i = k+fastN, d = 0; i < n; i++)
         d = MAC16_16(d, xptr[i], xptr[i-k]);
      ac[k] += d;
   }
#ifdef OPUS_FIXED_POINT
   shift = 2*shift;
   if (shift<=0)
      ac[0] += SHL32((opus_int32)1, -shift);
   if (ac[0] < 268435456)
   {
      int shift2 = 29 - EC_ILOG(ac[0]);
      for (i=0;i<=lag;i++)
         ac[i] = SHL32(ac[i], shift2);
      shift -= shift2;
   } else if (ac[0] >= 536870912)
   {
      int shift2=1;
      if (ac[0] >= 1073741824)
         shift2++;
      for (i=0;i<=lag;i++)
         ac[i] = SHR32(ac[i], shift2);
      shift += shift2;
   }
#endif

   RESTORE_STACK;
   return shift;
}
/* Compute reflection coefficients from input signal */
void silk_burg_modified_c(
    opus_int32                  *res_nrg,           /* O    Residual energy                                             */
    opus_int                    *res_nrg_Q,         /* O    Residual energy Q value                                     */
    opus_int32                  A_Q16[],            /* O    Prediction coefficients (length order)                      */
    const opus_int16            x[],                /* I    Input signal, length: nb_subfr * (D + subfr_length)       */
    const opus_int32            minInvGain_Q30,     /* I    Inverse of max prediction gain                              */
    const opus_int              subfr_length,       /* I    Input signal subframe length (incl. D preceding samples)    */
    const opus_int              nb_subfr,           /* I    Number of subframes stacked in x                            */
    const opus_int              D,                  /* I    Order                                                       */
    int                         arch                /* I    Run-time architecture                                       */
)
{
    opus_int         k, n, s, lz, rshifts, reached_max_gain;
    opus_int32       C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2;
    const opus_int16 *x_ptr;
    opus_int32       C_first_row[ SILK_MAX_ORDER_LPC ];
    opus_int32       C_last_row[  SILK_MAX_ORDER_LPC ];
    opus_int32       Af_QA[       SILK_MAX_ORDER_LPC ];
    opus_int32       CAf[ SILK_MAX_ORDER_LPC + 1 ];
    opus_int32       CAb[ SILK_MAX_ORDER_LPC + 1 ];
    opus_int32       xcorr[ SILK_MAX_ORDER_LPC ];
    opus_int64       C0_64;

    silk_assert(subfr_length * nb_subfr <= MAX_FRAME_SIZE);

    /* Compute autocorrelations, added over subframes */
    C0_64 = silk_inner_prod16_aligned_64(x, x, subfr_length*nb_subfr, arch);
    lz = silk_CLZ64(C0_64);
    rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;
    if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS;
    if (rshifts < MIN_RSHIFTS) rshifts = MIN_RSHIFTS;

    if (rshifts > 0) {
        C0 = (opus_int32)silk_RSHIFT64(C0_64, rshifts);
    } else {
        C0 = silk_LSHIFT32((opus_int32)C0_64, -rshifts);
    }

    CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL(SILK_FIX_CONST(FIND_LPC_COND_FAC, 32), C0) + 1;                                /* Q(-rshifts) */
    silk_memset(C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof(opus_int32));
    if(rshifts > 0) {
        for(s = 0; s < nb_subfr; s++) {
            x_ptr = x + s * subfr_length;
            for(n = 1; n < D + 1; n++) {
                C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
                    silk_inner_prod16_aligned_64(x_ptr, x_ptr + n, subfr_length - n, arch), rshifts);
            }
        }
    } else {
        for(s = 0; s < nb_subfr; s++) {
            int i;
            opus_int32 d;
            x_ptr = x + s * subfr_length;
            celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D, arch);
            for(n = 1; n < D + 1; n++) {
               for (i = n + subfr_length - D, d = 0; i < subfr_length; i++)
                  d = MAC16_16(d, x_ptr[ i ], x_ptr[ i - n ]);
               xcorr[ n - 1 ] += d;
            }
            for(n = 1; n < D + 1; n++) {
                C_first_row[ n - 1 ] += silk_LSHIFT32(xcorr[ n - 1 ], -rshifts);
            }
        }
    }
    silk_memcpy(C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof(opus_int32));

    /* Initialize */
    CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL(SILK_FIX_CONST(FIND_LPC_COND_FAC, 32), C0) + 1;                                /* Q(-rshifts) */

    invGain_Q30 = (opus_int32)1 << 30;
    reached_max_gain = 0;
    for(n = 0; n < D; n++) {
        /* Update first row of correlation matrix (without first element) */
        /* Update last row of correlation matrix (without last element, stored in reversed order) */
        /* Update C * Af */
        /* Update C * flipud(Af) (stored in reversed order) */
        if(rshifts > -2) {
            for(s = 0; s < nb_subfr; s++) {
                x_ptr = x + s * subfr_length;
                x1  = -silk_LSHIFT32((opus_int32)x_ptr[ n ],                    16 - rshifts);        /* Q(16-rshifts) */
                x2  = -silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], 16 - rshifts);        /* Q(16-rshifts) */
                tmp1 = silk_LSHIFT32((opus_int32)x_ptr[ n ],                    QA - 16);             /* Q(QA-16) */
                tmp2 = silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], QA - 16);             /* Q(QA-16) */
                for(k = 0; k < n; k++) {
                    C_first_row[ k ] = silk_SMLAWB(C_first_row[ k ], x1, x_ptr[ n - k - 1 ]           ); /* Q(-rshifts) */
                    C_last_row[ k ]  = silk_SMLAWB(C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ]); /* Q(-rshifts) */
                    Atmp_QA = Af_QA[ k ];
                    tmp1 = silk_SMLAWB(tmp1, Atmp_QA, x_ptr[ n - k - 1 ]           );                 /* Q(QA-16) */
                    tmp2 = silk_SMLAWB(tmp2, Atmp_QA, x_ptr[ subfr_length - n + k ]);                 /* Q(QA-16) */
                }
                tmp1 = silk_LSHIFT32(-tmp1, 32 - QA - rshifts);                                       /* Q(16-rshifts) */
                tmp2 = silk_LSHIFT32(-tmp2, 32 - QA - rshifts);                                       /* Q(16-rshifts) */
                for(k = 0; k <= n; k++) {
                    CAf[ k ] = silk_SMLAWB(CAf[ k ], tmp1, x_ptr[ n - k ]                   );        /* Q(-rshift) */
                    CAb[ k ] = silk_SMLAWB(CAb[ k ], tmp2, x_ptr[ subfr_length - n + k - 1 ]);        /* Q(-rshift) */
                }
            }
        } else {
            for(s = 0; s < nb_subfr; s++) {
                x_ptr = x + s * subfr_length;
                x1  = -silk_LSHIFT32((opus_int32)x_ptr[ n ],                    -rshifts);            /* Q(-rshifts) */
                x2  = -silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], -rshifts);            /* Q(-rshifts) */
                tmp1 = silk_LSHIFT32((opus_int32)x_ptr[ n ],                    17);                  /* Q17 */
                tmp2 = silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], 17);                  /* Q17 */
                for(k = 0; k < n; k++) {
                    C_first_row[ k ] = silk_MLA(C_first_row[ k ], x1, x_ptr[ n - k - 1 ]           ); /* Q(-rshifts) */
                    C_last_row[ k ]  = silk_MLA(C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ]); /* Q(-rshifts) */
                    Atmp1 = silk_RSHIFT_ROUND(Af_QA[ k ], QA - 17);                                   /* Q17 */
                    tmp1 = silk_MLA(tmp1, x_ptr[ n - k - 1 ],            Atmp1);                      /* Q17 */
                    tmp2 = silk_MLA(tmp2, x_ptr[ subfr_length - n + k ], Atmp1);                      /* Q17 */
                }
                tmp1 = -tmp1;                                                                           /* Q17 */
                tmp2 = -tmp2;                                                                           /* Q17 */
                for(k = 0; k <= n; k++) {
                    CAf[ k ] = silk_SMLAWW(CAf[ k ], tmp1,
                        silk_LSHIFT32((opus_int32)x_ptr[ n - k ], -rshifts - 1));                    /* Q(-rshift) */
                    CAb[ k ] = silk_SMLAWW(CAb[ k ], tmp2,
                        silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n + k - 1 ], -rshifts - 1)); /* Q(-rshift) */
                }
            }
        }

        /* Calculate nominator and denominator for the next order reflection (parcor) coefficient */
        tmp1 = C_first_row[ n ];                                                                        /* Q(-rshifts) */
        tmp2 = C_last_row[ n ];                                                                         /* Q(-rshifts) */
        num  = 0;                                                                                       /* Q(-rshifts) */
        nrg  = silk_ADD32(CAb[ 0 ], CAf[ 0 ]);                                                        /* Q(1-rshifts) */
        for(k = 0; k < n; k++) {
            Atmp_QA = Af_QA[ k ];
            lz = silk_CLZ32(silk_abs(Atmp_QA)) - 1;
            lz = silk_min(32 - QA, lz);
            Atmp1 = silk_LSHIFT32(Atmp_QA, lz);                                                       /* Q(QA + lz) */

            tmp1 = silk_ADD_LSHIFT32(tmp1, silk_SMMUL(C_last_row[  n - k - 1 ], Atmp1), 32 - QA - lz);  /* Q(-rshifts) */
            tmp2 = silk_ADD_LSHIFT32(tmp2, silk_SMMUL(C_first_row[ n - k - 1 ], Atmp1), 32 - QA - lz);  /* Q(-rshifts) */
            num  = silk_ADD_LSHIFT32(num,  silk_SMMUL(CAb[ n - k ],             Atmp1), 32 - QA - lz);  /* Q(-rshifts) */
            nrg  = silk_ADD_LSHIFT32(nrg,  silk_SMMUL(silk_ADD32(CAb[ k + 1 ], CAf[ k + 1 ]),
                                                                                Atmp1), 32 - QA - lz);    /* Q(1-rshifts) */
        }
        CAf[ n + 1 ] = tmp1;                                                                            /* Q(-rshifts) */
        CAb[ n + 1 ] = tmp2;                                                                            /* Q(-rshifts) */
        num = silk_ADD32(num, tmp2);                                                                  /* Q(-rshifts) */
        num = silk_LSHIFT32(-num, 1);                                                                 /* Q(1-rshifts) */

        /* Calculate the next order reflection (parcor) coefficient */
        if(silk_abs(num) < nrg) {
            rc_Q31 = silk_DIV32_varQ(num, nrg, 31);
        } else {
            rc_Q31 = (num > 0) ? silk_int32_MAX : silk_int32_MIN;
        }

        /* Update inverse prediction gain */
        tmp1 = ((opus_int32)1 << 30) - silk_SMMUL(rc_Q31, rc_Q31);
        tmp1 = silk_LSHIFT(silk_SMMUL(invGain_Q30, tmp1), 2);
        if(tmp1 <= minInvGain_Q30) {
            /* Max prediction gain exceeded; set reflection coefficient such that max prediction gain is exactly hit */
            tmp2 = ((opus_int32)1 << 30) - silk_DIV32_varQ(minInvGain_Q30, invGain_Q30, 30);            /* Q30 */
            rc_Q31 = silk_SQRT_APPROX(tmp2);                                                  /* Q15 */
            /* Newton-Raphson iteration */
            rc_Q31 = silk_RSHIFT32(rc_Q31 + silk_DIV32(tmp2, rc_Q31), 1);                   /* Q15 */
            rc_Q31 = silk_LSHIFT32(rc_Q31, 16);                                               /* Q31 */
            if(num < 0) {
                /* Ensure adjusted reflection coefficients has the original sign */
                rc_Q31 = -rc_Q31;
            }
            invGain_Q30 = minInvGain_Q30;
            reached_max_gain = 1;
        } else {
            invGain_Q30 = tmp1;
        }

        /* Update the AR coefficients */
        for(k = 0; k < (n + 1) >> 1; k++) {
            tmp1 = Af_QA[ k ];                                                                  /* QA */
            tmp2 = Af_QA[ n - k - 1 ];                                                          /* QA */
            Af_QA[ k ]         = silk_ADD_LSHIFT32(tmp1, silk_SMMUL(tmp2, rc_Q31), 1);      /* QA */
            Af_QA[ n - k - 1 ] = silk_ADD_LSHIFT32(tmp2, silk_SMMUL(tmp1, rc_Q31), 1);      /* QA */
        }
        Af_QA[ n ] = silk_RSHIFT32(rc_Q31, 31 - QA);                                          /* QA */

        if(reached_max_gain) {
            /* Reached max prediction gain; set remaining coefficients to zero and exit loop */
            for(k = n + 1; k < D; k++) {
                Af_QA[ k ] = 0;
            }
            break;
        }

        /* Update C * Af and C * Ab */
        for(k = 0; k <= n + 1; k++) {
            tmp1 = CAf[ k ];                                                                    /* Q(-rshifts) */
            tmp2 = CAb[ n - k + 1 ];                                                            /* Q(-rshifts) */
            CAf[ k ]         = silk_ADD_LSHIFT32(tmp1, silk_SMMUL(tmp2, rc_Q31), 1);        /* Q(-rshifts) */
            CAb[ n - k + 1 ] = silk_ADD_LSHIFT32(tmp2, silk_SMMUL(tmp1, rc_Q31), 1);        /* Q(-rshifts) */
        }
    }

    if(reached_max_gain) {
        for(k = 0; k < D; k++) {
            /* Scale coefficients */
            A_Q16[ k ] = -silk_RSHIFT_ROUND(Af_QA[ k ], QA - 16);
        }
        /* Subtract energy of preceding samples from C0 */
        if(rshifts > 0) {
            for(s = 0; s < nb_subfr; s++) {
                x_ptr = x + s * subfr_length;
                C0 -= (opus_int32)silk_RSHIFT64(silk_inner_prod16_aligned_64(x_ptr, x_ptr, D, arch), rshifts);
            }
        } else {
            for(s = 0; s < nb_subfr; s++) {
                x_ptr = x + s * subfr_length;
                C0 -= silk_LSHIFT32(silk_inner_prod_aligned(x_ptr, x_ptr, D, arch), -rshifts);
            }
        }
        /* Approximate residual energy */
        *res_nrg = silk_LSHIFT(silk_SMMUL(invGain_Q30, C0), 2);
        *res_nrg_Q = -rshifts;
    } else {
        /* Return residual energy */
        nrg  = CAf[ 0 ];                                                                            /* Q(-rshifts) */
        tmp1 = (opus_int32)1 << 16;                                                                             /* Q16 */
        for(k = 0; k < D; k++) {
            Atmp1 = silk_RSHIFT_ROUND(Af_QA[ k ], QA - 16);                                       /* Q16 */
            nrg  = silk_SMLAWW(nrg, CAf[ k + 1 ], Atmp1);                                         /* Q(-rshifts) */
            tmp1 = silk_SMLAWW(tmp1, Atmp1, Atmp1);                                               /* Q16 */
            A_Q16[ k ] = -Atmp1;
        }
        *res_nrg = silk_SMLAWW(nrg, silk_SMMUL(SILK_FIX_CONST(FIND_LPC_COND_FAC, 32), C0), -tmp1);/* Q(-rshifts) */
        *res_nrg_Q = -rshifts;
    }
}