Example #1
File: armdct.c  Project: AlecGamble/daala
OD_SIMD_INLINE void od_load4(const od_coeff *x, int xstride,
 int32x4_t *t0, int32x4_t *t1, int32x4_t *t2, int32x4_t *t3) {
  *t0 = vld1q_s32((const int *)(x + 0*xstride));
  *t1 = vld1q_s32((const int *)(x + 1*xstride));
  *t2 = vld1q_s32((const int *)(x + 2*xstride));
  *t3 = vld1q_s32((const int *)(x + 3*xstride));
}
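A hypothetical call site (the buffer and helper name below are invented for illustration) passes the pointer to the top-left coefficient and the row stride in coefficients, and receives one 4-lane vector per row:

/* Hypothetical caller: load a contiguous 4x4 block of od_coeff values (row stride 4). */
static void load_4x4_block(const od_coeff block[16], int32x4_t row[4]) {
  od_load4(block, 4, &row[0], &row[1], &row[2], &row[3]);
}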
Example #2
File: la.c  Project: dribbroc/HONEI-Droid
void * scaled_sumi_thread_NEON(void * argument)
{
    jsize i = 0;
    struct scaled_sumfneon_thread_data * data = (struct scaled_sumfneon_thread_data *) argument;
    int32_t * r = (int32_t *)data->r;
    const int32_t * x = (const int32_t *)data->x;
    const int32_t * y = (const int32_t *)data->y;
    const int32_t a = (const int32_t)data->a;
    const jsize size = data->size;

    int32x4_t rx4, xx4, yx4, ax4;

    ax4 = vdupq_n_s32(a);

    for (i = 0; i < size; i += 4)
    {
        xx4 = vld1q_s32(&(x[i]));
        yx4 = vld1q_s32(&(y[i]));

        rx4 = vmlaq_s32(xx4, ax4, yx4);

        vst1q_s32(&(r[i]), rx4);
    }

    return NULL; /* pthread worker: no result is returned */
}
Example #3
File: add.c  Project: Winddoing/MyCode
static void add_int_neon(int* dst, int* src1, int* src2, int count)
{
  int i;
  for (i = 0; i < count; i += 4) {
      int32x4_t in1, in2, out;
      in1 = vld1q_s32(src1);
      src1 += 4;
      in2 = vld1q_s32(src2);
      src2 += 4;
      out = vaddq_s32(in1, in2);
      vst1q_s32(dst, out);
      dst += 4;
    }
}
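A minimal caller might look like the following (hypothetical data; count is assumed to be a multiple of 4, since the loop above has no scalar tail):

/* Hypothetical usage of add_int_neon: element-wise sum of two small arrays. */
static void add_example(void)
{
  int a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  int b[8] = {10, 20, 30, 40, 50, 60, 70, 80};
  int sum[8];
  add_int_neon(sum, a, b, 8); /* sum[i] = a[i] + b[i] for i = 0..7 */
}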
Example #4
/* s32x4 mm mul */
void mw_neon_mm_mul_s32x4(int * A, int Row, int T, int * B, int Col, int * C)
{
	int i, k, j;

	int32x4_t neon_b, neon_c;
	int32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	int32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i+=4)
	{

		for (k = 0; k < Col; k+=1)
		{
			neon_c = vmovq_n_s32(0);

			for (j = 0; j < T; j+=4)
			{

				int j_T = j * T + i;
				int k_Row = k * Row;

				neon_a0 = vld1q_s32(A + j_T);
				j_T+=Row;
				neon_a1 = vld1q_s32(A + j_T);
				j_T+=Row;
				neon_a2 = vld1q_s32(A + j_T);
				j_T+=Row;
				neon_a3 = vld1q_s32(A + j_T);

				neon_b = vld1q_s32(B + k_Row + j);
				neon_b0 = vdupq_n_s32(vgetq_lane_s32(neon_b, 0));
				neon_b1 = vdupq_n_s32(vgetq_lane_s32(neon_b, 1));
				neon_b2 = vdupq_n_s32(vgetq_lane_s32(neon_b, 2));
				neon_b3 = vdupq_n_s32(vgetq_lane_s32(neon_b, 3));

				neon_c = vaddq_s32(vmulq_s32(neon_a0, neon_b0), neon_c);
				neon_c = vaddq_s32(vmulq_s32(neon_a1, neon_b1), neon_c);
				neon_c = vaddq_s32(vmulq_s32(neon_a2, neon_b2), neon_c);
				neon_c = vaddq_s32(vmulq_s32(neon_a3, neon_b3), neon_c);

				vst1q_lane_s32(C + k_Row + i, neon_c, 0);
				vst1q_lane_s32(C + k_Row + i + 1, neon_c, 1);
				vst1q_lane_s32(C + k_Row + i + 2, neon_c, 2);
				vst1q_lane_s32(C + k_Row + i + 3, neon_c, 3);

			}
		}
	}
}
Example #5
/* s32x4 mv mul */
void mw_neon_mv_mul_s32x4(int * A, int Row, int T, int * B, int * C)
{
	int i = 0;
	int k = 0;

	int32x4_t neon_b, neon_c;
	int32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	int32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i+=4)
	{
		neon_c = vmovq_n_s32(0);

		for (k = 0; k < T; k+=4)
		{
			int j = k * T + i;

			neon_a0 = vld1q_s32(A + j);
			j+=Row;
			neon_a1 = vld1q_s32(A + j);
			j+=Row;
			neon_a2 = vld1q_s32(A + j);
			j+=Row;
			neon_a3 = vld1q_s32(A + j);

			neon_b = vld1q_s32(B + k);
			neon_b0 = vdupq_n_s32(vgetq_lane_s32(neon_b, 0));
			neon_b1 = vdupq_n_s32(vgetq_lane_s32(neon_b, 1));
			neon_b2 = vdupq_n_s32(vgetq_lane_s32(neon_b, 2));
			neon_b3 = vdupq_n_s32(vgetq_lane_s32(neon_b, 3));

			neon_c = vaddq_s32(vmulq_s32(neon_a0, neon_b0), neon_c);
			neon_c = vaddq_s32(vmulq_s32(neon_a1, neon_b1), neon_c);
			neon_c = vaddq_s32(vmulq_s32(neon_a2, neon_b2), neon_c);
			neon_c = vaddq_s32(vmulq_s32(neon_a3, neon_b3), neon_c);

		}

		vst1q_s32(C + i, neon_c);
	}
}
Example #6
/* s32x4 saturated sub */
void mw_neon_mm_qsub_s32x4(int * A, int Row, int Col, int * B, int * C)
{
	int32x4_t neon_a, neon_b, neon_c;
	int size = Row * Col;
	int i = 0;
	int k = 0;

	for (i = 4; i <= size ; i+=4)
	{
		k = i - 4;
		neon_a = vld1q_s32(A + k);
		neon_b = vld1q_s32(B + k);
		neon_c = vqsubq_s32(neon_a, neon_b);
		vst1q_s32(C + k, neon_c);
	}

	k = i - 4;
    for (i = 0; i < size % 4; i++)
	{
		C[k + i] = A[k + i] - B[k + i];
	}
}
Example #7
static OPUS_INLINE void calc_corr( const opus_int32 *const input_QS, opus_int64 *const corr_QC, const opus_int offset, const int32x4_t state_QS_s32x4 )
{
    int64x2_t corr_QC_s64x2[ 2 ], t_s64x2[ 2 ];
    const int32x4_t input_QS_s32x4 = vld1q_s32( input_QS + offset );
    corr_QC_s64x2[ 0 ] = vld1q_s64( corr_QC + offset + 0 );
    corr_QC_s64x2[ 1 ] = vld1q_s64( corr_QC + offset + 2 );
    t_s64x2[ 0 ] = vmull_s32( vget_low_s32( state_QS_s32x4 ), vget_low_s32( input_QS_s32x4 ) );
    t_s64x2[ 1 ] = vmull_s32( vget_high_s32( state_QS_s32x4 ), vget_high_s32( input_QS_s32x4 ) );
    corr_QC_s64x2[ 0 ] = vsraq_n_s64( corr_QC_s64x2[ 0 ], t_s64x2[ 0 ], 2 * QS - QC );
    corr_QC_s64x2[ 1 ] = vsraq_n_s64( corr_QC_s64x2[ 1 ], t_s64x2[ 1 ], 2 * QS - QC );
    vst1q_s64( corr_QC + offset + 0, corr_QC_s64x2[ 0 ] );
    vst1q_s64( corr_QC + offset + 2, corr_QC_s64x2[ 1 ] );
}
Example #8
void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
  int32x4_t c0 = vld1q_s32(input);
  int32x4_t c1 = vld1q_s32(input + 4);
  int32x4_t c2 = vld1q_s32(input + 8);
  int32x4_t c3 = vld1q_s32(input + 12);
  int16x8_t a0, a1;

  if (bd == 8) {
    const int16x4_t cospis = vld1_s16(kCospi);

    // Rows
    a0 = vcombine_s16(vmovn_s32(c0), vmovn_s32(c1));
    a1 = vcombine_s16(vmovn_s32(c2), vmovn_s32(c3));
    idct4x4_16_kernel_bd8(cospis, &a0, &a1);

    // Columns
    a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1));
    idct4x4_16_kernel_bd8(cospis, &a0, &a1);
    a0 = vrshrq_n_s16(a0, 4);
    a1 = vrshrq_n_s16(a1, 4);
  } else {
    const int32x4_t cospis = vld1q_s32(kCospi32);

    if (bd == 10) {
      idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3);
      idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3);
    } else {
      idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3);
      idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3);
    }
    a0 = vcombine_s16(vqrshrn_n_s32(c0, 4), vqrshrn_n_s32(c1, 4));
    a1 = vcombine_s16(vqrshrn_n_s32(c3, 4), vqrshrn_n_s32(c2, 4));
  }

  highbd_idct4x4_1_add_kernel1(&dest, stride, a0, max);
  highbd_idct4x4_1_add_kernel2(&dest, stride, a1, max);
}
Example #9
template <bool useShift>
inline void ClampBufferToS16(s16 *out, const s32 *in, size_t size, s8 volShift) {
#ifdef _M_SSE
	// Size will always be 16-byte aligned as the hwBlockSize is.
	while (size >= 8) {
		__m128i in1 = _mm_loadu_si128((__m128i *)in);
		__m128i in2 = _mm_loadu_si128((__m128i *)(in + 4));
		__m128i packed = _mm_packs_epi32(in1, in2);
		if (useShift) {
			packed = _mm_srai_epi16(packed, volShift);
		}
		_mm_storeu_si128((__m128i *)out, packed);
		out += 8;
		in += 8;
		size -= 8;
	}
#elif PPSSPP_ARCH(ARM_NEON)
	int16x4_t signedVolShift = vdup_n_s16 (-volShift); // Can only dynamic-shift right, but by a signed integer
	while (size >= 8) {
		int32x4_t in1 = vld1q_s32(in);
		int32x4_t in2 = vld1q_s32(in + 4);
		int16x4_t packed1 = vqmovn_s32(in1);
		int16x4_t packed2 = vqmovn_s32(in2);
		if (useShift) {
			packed1 = vshl_s16(packed1, signedVolShift);
			packed2 = vshl_s16(packed2, signedVolShift);
		}
		vst1_s16(out, packed1);
		vst1_s16(out + 4, packed2);
		out += 8;
		in += 8;
		size -= 8;
	}
#endif
	// This does the remainder if SIMD was used, otherwise it does it all.
	for (size_t i = 0; i < size; i++) {
		out[i] = clamp_s16(useShift ? (in[i] >> volShift) : in[i]);
	}
}
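ClampBufferToS16 is a template on useShift, so the shift variant is selected at compile time; a hypothetical caller that clamps n samples and halves the volume (shift by 1) might look like:

// Hypothetical caller: clamp n s32 samples to s16 at half volume.
inline void MixToS16(s16 *dst, const s32 *src, size_t n) {
	ClampBufferToS16<true>(dst, src, n, 1);
}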
Example #10
static inline void yuv2rgb_4x2(const uint8_t *y1, const uint8_t *y2, const uint8_t *u, const uint8_t *v, int16_t *r1, int16_t *g1, int16_t *b1, int16_t *r2, int16_t *g2, int16_t *b2){
	int32x4_t ry1;
	int32x4_t ry2;
	int32x4_t rvug;
	int32x4_t rvr;
	int32x4_t rub;
	int32x4_t rr1,rg1,rb1,rr2,rg2,rb2;
	int32x4_t max;

	LOAD_Y_PREMULTS(0)
	LOAD_Y_PREMULTS(1)
	LOAD_Y_PREMULTS(2)
	LOAD_Y_PREMULTS(3)

	LOAD_UV_PREMULTS(0)
	LOAD_UV_PREMULTS(1)

	max=vld1q_s32(yuvmax);
	/*the following does not work */
	//max=vdupq_n_s32(255);

	rr1=vaddq_s32(ry1,rvr);
	rr2=vaddq_s32(ry2,rvr);
	rg1=vaddq_s32(ry1,rvug);
	rg2=vaddq_s32(ry2,rvug);
	rb1=vaddq_s32(ry1,rub);
	rb2=vaddq_s32(ry2,rub);



	rr1=vminq_s32(vabsq_s32(rr1),max);
	rr2=vminq_s32(vabsq_s32(rr2),max);
	rg1=vminq_s32(vabsq_s32(rg1),max);
	rg2=vminq_s32(vabsq_s32(rg2),max);
	rb1=vminq_s32(vabsq_s32(rb1),max);
	rb2=vminq_s32(vabsq_s32(rb2),max);

	vst1_s16(r1,vqshrn_n_s32(rr1,13));
	vst1_s16(r2,vqshrn_n_s32(rr2,13));

	vst1_s16(g1,vqshrn_n_s32(rg1,13));
	vst1_s16(g2,vqshrn_n_s32(rg2,13));

	vst1_s16(b1,vqshrn_n_s32(rb1,13));
	vst1_s16(b2,vqshrn_n_s32(rb2,13));
}
Example #11
int
test_vreinterpretq_f64_s32 ()
{
  int32x4_t a;
  float64x2_t b;
  int32_t c[4] = { 0x54442D18, 0x400921FB, 0x8B145769, 0x4005BF0A };
  float64_t d[2] = { PI_F64, E_F64 };
  float64_t e[2];
  int i;

  a = vld1q_s32 (c);
  b = wrap_vreinterpretq_f64_s32 (a);
  vst1q_f64 (e, b);
  for (i = 0; i < 2; i++)
    if (!DOUBLE_EQUALS (d[i], e[i], __DBL_EPSILON__))
      return 1;
  return 0;
}
Example #12
static INLINE void load_4x8_s32_dual(const tran_low_t *input,
                                     int32x4_t *const in0, int32x4_t *const in1,
                                     int32x4_t *const in2, int32x4_t *const in3,
                                     int32x4_t *const in4, int32x4_t *const in5,
                                     int32x4_t *const in6,
                                     int32x4_t *const in7) {
  *in0 = vld1q_s32(input);
  input += 32;
  *in1 = vld1q_s32(input);
  input += 32;
  *in2 = vld1q_s32(input);
  input += 32;
  *in3 = vld1q_s32(input);
  input += 32;
  *in4 = vld1q_s32(input);
  input += 32;
  *in5 = vld1q_s32(input);
  input += 32;
  *in6 = vld1q_s32(input);
  input += 32;
  *in7 = vld1q_s32(input);
}
Example #13
void silk_biquad_alt_stride2_neon(
    const opus_int16            *in,                /* I     input signal                                               */
    const opus_int32            *B_Q28,             /* I     MA coefficients [3]                                        */
    const opus_int32            *A_Q28,             /* I     AR coefficients [2]                                        */
    opus_int32                  *S,                 /* I/O   State vector [4]                                           */
    opus_int16                  *out,               /* O     output signal                                              */
    const opus_int32            len                 /* I     signal length (must be even)                               */
)
{
    /* DIRECT FORM II TRANSPOSED (uses 2 element state vector) */
    opus_int        k            = 0;
    const int32x2_t offset_s32x2 = vdup_n_s32( (1<<14) - 1 );
    const int32x4_t offset_s32x4 = vcombine_s32( offset_s32x2, offset_s32x2 );
    int16x4_t       in_s16x4  = vdup_n_s16( 0 );
    int16x4_t       out_s16x4;
    int32x2_t       A_Q28_s32x2, A_L_s32x2, A_U_s32x2, B_Q28_s32x2, t_s32x2;
    int32x4_t       A_L_s32x4, A_U_s32x4, B_Q28_s32x4, S_s32x4, out32_Q14_s32x4;
    int32x2x2_t     t0_s32x2x2, t1_s32x2x2, t2_s32x2x2, S_s32x2x2;

#ifdef OPUS_CHECK_ASM
    opus_int32 S_c[ 4 ];
    VARDECL( opus_int16, out_c );
    SAVE_STACK;
    ALLOC( out_c, 2 * len, opus_int16 );

    silk_memcpy( &S_c, S, sizeof( S_c ) );
    silk_biquad_alt_stride2_c( in, B_Q28, A_Q28, S_c, out_c, len );
#endif

    /* Negate A_Q28 values and split in two parts */
    A_Q28_s32x2 = vld1_s32( A_Q28 );
    A_Q28_s32x2 = vneg_s32( A_Q28_s32x2 );
    A_L_s32x2   = vshl_n_s32( A_Q28_s32x2, 18 );                                                        /* ( -A_Q28[] & 0x00003FFF ) << 18                                                     */
    A_L_s32x2   = vreinterpret_s32_u32( vshr_n_u32( vreinterpret_u32_s32( A_L_s32x2 ), 3 ) );           /* ( -A_Q28[] & 0x00003FFF ) << 15                                                     */
    A_U_s32x2   = vshr_n_s32( A_Q28_s32x2, 14 );                                                        /* silk_RSHIFT( -A_Q28[], 14 )                                                         */
    A_U_s32x2   = vshl_n_s32( A_U_s32x2, 16 );                                                          /* silk_RSHIFT( -A_Q28[], 14 ) << 16 (Clip two leading bits to conform to C function.) */
    A_U_s32x2   = vshr_n_s32( A_U_s32x2, 1 );                                                           /* silk_RSHIFT( -A_Q28[], 14 ) << 15                                                   */

    B_Q28_s32x2  = vld1_s32( B_Q28 );
    t_s32x2      = vld1_s32( B_Q28 + 1 );
    t0_s32x2x2   = vzip_s32( A_L_s32x2, A_L_s32x2 );
    t1_s32x2x2   = vzip_s32( A_U_s32x2, A_U_s32x2 );
    t2_s32x2x2   = vzip_s32( t_s32x2, t_s32x2 );
    A_L_s32x4    = vcombine_s32( t0_s32x2x2.val[ 0 ], t0_s32x2x2.val[ 1 ] );                            /* A{0,0,1,1}_L_Q28          */
    A_U_s32x4    = vcombine_s32( t1_s32x2x2.val[ 0 ], t1_s32x2x2.val[ 1 ] );                            /* A{0,0,1,1}_U_Q28          */
    B_Q28_s32x4  = vcombine_s32( t2_s32x2x2.val[ 0 ], t2_s32x2x2.val[ 1 ] );                            /* B_Q28[ {1,1,2,2} ]        */
    S_s32x4      = vld1q_s32( S );                                                                      /* S0 = S[ 0 ]; S3 = S[ 3 ]; */
    S_s32x2x2    = vtrn_s32( vget_low_s32( S_s32x4 ), vget_high_s32( S_s32x4 ) );                       /* S2 = S[ 1 ]; S1 = S[ 2 ]; */
    S_s32x4      = vcombine_s32( S_s32x2x2.val[ 0 ], S_s32x2x2.val[ 1 ] );

    for( ; k < len - 1; k += 2 ) {
        int32x4_t in_s32x4[ 2 ], t_s32x4;
        int32x2_t out32_Q14_s32x2[ 2 ];

        /* S[ 2 * i + 0 ], S[ 2 * i + 1 ], S[ 2 * i + 2 ], S[ 2 * i + 3 ]: Q12 */
        in_s16x4      = vld1_s16( &in[ 2 * k ] );                                                       /* in{0,1,2,3} = in[ 2 * k + {0,1,2,3} ]; */
        in_s32x4[ 0 ] = vshll_n_s16( in_s16x4, 15 );                                                    /* in{0,1,2,3} << 15                      */
        t_s32x4       = vqdmulhq_lane_s32( in_s32x4[ 0 ], B_Q28_s32x2, 0 );                             /* silk_SMULWB( B_Q28[ 0 ], in{0,1,2,3} ) */
        in_s32x4[ 1 ] = vcombine_s32( vget_high_s32( in_s32x4[ 0 ] ), vget_high_s32( in_s32x4[ 0 ] ) ); /* in{2,3,2,3} << 15                      */
        in_s32x4[ 0 ] = vcombine_s32( vget_low_s32 ( in_s32x4[ 0 ] ), vget_low_s32 ( in_s32x4[ 0 ] ) ); /* in{0,1,0,1} << 15                      */
        silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_low_s32 ( t_s32x4 ), in_s32x4[ 0 ], &S_s32x4, &out32_Q14_s32x2[ 0 ] );
        silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_high_s32( t_s32x4 ), in_s32x4[ 1 ], &S_s32x4, &out32_Q14_s32x2[ 1 ] );

        /* Scale back to Q0 and saturate */
        out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2[ 0 ], out32_Q14_s32x2[ 1 ] );                   /* out32_Q14_{0,1,2,3}                                                                                        */
        out32_Q14_s32x4 = vaddq_s32( out32_Q14_s32x4, offset_s32x4 );                                   /* out32_Q14_{0,1,2,3} + (1<<14) - 1                                                                          */
        out_s16x4       = vqshrn_n_s32( out32_Q14_s32x4, 14 );                                          /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) )                             */
        vst1_s16( &out[ 2 * k ], out_s16x4 );                                                           /* out[ 2 * k + {0,1,2,3} ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ); */
    }

    /* Process leftover. */
    if( k < len ) {
        int32x4_t in_s32x4;
        int32x2_t out32_Q14_s32x2;

        /* S[ 2 * i + 0 ], S[ 2 * i + 1 ]: Q12 */
        in_s16x4     = vld1_lane_s16( &in[ 2 * k + 0 ], in_s16x4, 0 );                                  /* in{0,1} = in[ 2 * k + {0,1} ];     */
        in_s16x4     = vld1_lane_s16( &in[ 2 * k + 1 ], in_s16x4, 1 );                                  /* in{0,1} = in[ 2 * k + {0,1} ];     */
        in_s32x4     = vshll_n_s16( in_s16x4, 15 );                                                     /* in{0,1} << 15                      */
        t_s32x2      = vqdmulh_lane_s32( vget_low_s32( in_s32x4 ), B_Q28_s32x2, 0 );                    /* silk_SMULWB( B_Q28[ 0 ], in{0,1} ) */
        in_s32x4     = vcombine_s32( vget_low_s32( in_s32x4 ), vget_low_s32( in_s32x4 ) );              /* in{0,1,0,1} << 15                  */
        silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, t_s32x2, in_s32x4, &S_s32x4, &out32_Q14_s32x2 );

        /* Scale back to Q0 and saturate */
        out32_Q14_s32x2 = vadd_s32( out32_Q14_s32x2, offset_s32x2 );                                    /* out32_Q14_{0,1} + (1<<14) - 1                                                              */
        out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2, out32_Q14_s32x2 );                             /* out32_Q14_{0,1,0,1} + (1<<14) - 1                                                          */
        out_s16x4       = vqshrn_n_s32( out32_Q14_s32x4, 14 );                                          /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,0,1} + (1<<14) - 1, 14 ) )             */
        vst1_lane_s16( &out[ 2 * k + 0 ], out_s16x4, 0 );                                               /* out[ 2 * k + 0 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_0 + (1<<14) - 1, 14 ) ); */
        vst1_lane_s16( &out[ 2 * k + 1 ], out_s16x4, 1 );                                               /* out[ 2 * k + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_1 + (1<<14) - 1, 14 ) ); */
    }

    vst1q_lane_s32( &S[ 0 ], S_s32x4, 0 );                                                              /* S[ 0 ] = S0; */
    vst1q_lane_s32( &S[ 1 ], S_s32x4, 2 );                                                              /* S[ 1 ] = S2; */
    vst1q_lane_s32( &S[ 2 ], S_s32x4, 1 );                                                              /* S[ 2 ] = S1; */
    vst1q_lane_s32( &S[ 3 ], S_s32x4, 3 );                                                              /* S[ 3 ] = S3; */

#ifdef OPUS_CHECK_ASM
    silk_assert( !memcmp( S_c, S, sizeof( S_c ) ) );
    silk_assert( !memcmp( out_c, out, 2 * len * sizeof( opus_int16 ) ) );
    RESTORE_STACK;
#endif
}
Example #14
void qt_blend_argb32_on_argb32_neon(uchar *destPixels, int dbpl,
                                    const uchar *srcPixels, int sbpl,
                                    int w, int h,
                                    int const_alpha)
{
    const uint *src = (const uint *) srcPixels;
    uint *dst = (uint *) destPixels;
    int16x8_t half = vdupq_n_s16(0x80);
    int16x8_t full = vdupq_n_s16(0xff);
    if (const_alpha == 256) {
        for (int y = 0; y < h; ++y) {
            int x = 0;
            for (; x < w-3; x += 4) {
                int32x4_t src32 = vld1q_s32((int32_t *)&src[x]);
                if ((src[x] & src[x+1] & src[x+2] & src[x+3]) >= 0xff000000) {
                    // all opaque
                    vst1q_s32((int32_t *)&dst[x], src32);
                } else if (src[x] | src[x+1] | src[x+2] | src[x+3]) {
                    int32x4_t dst32 = vld1q_s32((int32_t *)&dst[x]);

                    const uint8x16_t src8 = vreinterpretq_u8_s32(src32);
                    const uint8x16_t dst8 = vreinterpretq_u8_s32(dst32);

                    const uint8x8_t src8_low = vget_low_u8(src8);
                    const uint8x8_t dst8_low = vget_low_u8(dst8);

                    const uint8x8_t src8_high = vget_high_u8(src8);
                    const uint8x8_t dst8_high = vget_high_u8(dst8);

                    const int16x8_t src16_low = vreinterpretq_s16_u16(vmovl_u8(src8_low));
                    const int16x8_t dst16_low = vreinterpretq_s16_u16(vmovl_u8(dst8_low));

                    const int16x8_t src16_high = vreinterpretq_s16_u16(vmovl_u8(src8_high));
                    const int16x8_t dst16_high = vreinterpretq_s16_u16(vmovl_u8(dst8_high));

                    const int16x8_t result16_low = qvsource_over_s16(src16_low, dst16_low, half, full);
                    const int16x8_t result16_high = qvsource_over_s16(src16_high, dst16_high, half, full);

                    const int32x2_t result32_low = vreinterpret_s32_s8(vmovn_s16(result16_low));
                    const int32x2_t result32_high = vreinterpret_s32_s8(vmovn_s16(result16_high));

                    vst1q_s32((int32_t *)&dst[x], vcombine_s32(result32_low, result32_high));
                }
            }
            for (; x<w; ++x) {
                uint s = src[x];
                if (s >= 0xff000000)
                    dst[x] = s;
                else if (s != 0)
                    dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
            }
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    } else if (const_alpha != 0) {
        const_alpha = (const_alpha * 255) >> 8;
        int16x8_t const_alpha16 = vdupq_n_s16(const_alpha);
        for (int y = 0; y < h; ++y) {
            int x = 0;
            for (; x < w-3; x += 4) {
                if (src[x] | src[x+1] | src[x+2] | src[x+3]) {
                    int32x4_t src32 = vld1q_s32((int32_t *)&src[x]);
                    int32x4_t dst32 = vld1q_s32((int32_t *)&dst[x]);

                    const uint8x16_t src8 = vreinterpretq_u8_s32(src32);
                    const uint8x16_t dst8 = vreinterpretq_u8_s32(dst32);

                    const uint8x8_t src8_low = vget_low_u8(src8);
                    const uint8x8_t dst8_low = vget_low_u8(dst8);

                    const uint8x8_t src8_high = vget_high_u8(src8);
                    const uint8x8_t dst8_high = vget_high_u8(dst8);

                    const int16x8_t src16_low = vreinterpretq_s16_u16(vmovl_u8(src8_low));
                    const int16x8_t dst16_low = vreinterpretq_s16_u16(vmovl_u8(dst8_low));

                    const int16x8_t src16_high = vreinterpretq_s16_u16(vmovl_u8(src8_high));
                    const int16x8_t dst16_high = vreinterpretq_s16_u16(vmovl_u8(dst8_high));

                    const int16x8_t srcalpha16_low = qvbyte_mul_s16(src16_low, const_alpha16, half);
                    const int16x8_t srcalpha16_high = qvbyte_mul_s16(src16_high, const_alpha16, half);

                    const int16x8_t result16_low = qvsource_over_s16(srcalpha16_low, dst16_low, half, full);
                    const int16x8_t result16_high = qvsource_over_s16(srcalpha16_high, dst16_high, half, full);

                    const int32x2_t result32_low = vreinterpret_s32_s8(vmovn_s16(result16_low));
                    const int32x2_t result32_high = vreinterpret_s32_s8(vmovn_s16(result16_high));

                    vst1q_s32((int32_t *)&dst[x], vcombine_s32(result32_low, result32_high));
                }
            }
            for (; x<w; ++x) {
                uint s = src[x];
                if (s != 0) {
                    s = BYTE_MUL(s, const_alpha);
                    dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
                }
            }
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    }
}
Example #15
void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest,
                                   int stride, int tx_type, int bd) {
  int32x4_t a[16];
  int16x8_t c[8];

  a[0] = vld1q_s32(input);
  a[1] = vld1q_s32(input + 4);
  a[2] = vld1q_s32(input + 8);
  a[3] = vld1q_s32(input + 12);
  a[4] = vld1q_s32(input + 16);
  a[5] = vld1q_s32(input + 20);
  a[6] = vld1q_s32(input + 24);
  a[7] = vld1q_s32(input + 28);
  a[8] = vld1q_s32(input + 32);
  a[9] = vld1q_s32(input + 36);
  a[10] = vld1q_s32(input + 40);
  a[11] = vld1q_s32(input + 44);
  a[12] = vld1q_s32(input + 48);
  a[13] = vld1q_s32(input + 52);
  a[14] = vld1q_s32(input + 56);
  a[15] = vld1q_s32(input + 60);

  if (bd == 8) {
    c[0] = vcombine_s16(vmovn_s32(a[0]), vmovn_s32(a[1]));
    c[1] = vcombine_s16(vmovn_s32(a[2]), vmovn_s32(a[3]));
    c[2] = vcombine_s16(vmovn_s32(a[4]), vmovn_s32(a[5]));
    c[3] = vcombine_s16(vmovn_s32(a[6]), vmovn_s32(a[7]));
    c[4] = vcombine_s16(vmovn_s32(a[8]), vmovn_s32(a[9]));
    c[5] = vcombine_s16(vmovn_s32(a[10]), vmovn_s32(a[11]));
    c[6] = vcombine_s16(vmovn_s32(a[12]), vmovn_s32(a[13]));
    c[7] = vcombine_s16(vmovn_s32(a[14]), vmovn_s32(a[15]));

    switch (tx_type) {
      case DCT_DCT: {
        const int16x8_t cospis = vld1q_s16(kCospi);
        const int16x4_t cospis0 = vget_low_s16(cospis);   // cospi 0, 8, 16, 24
        const int16x4_t cospis1 = vget_high_s16(cospis);  // cospi 4, 12, 20, 28

        idct8x8_64_1d_bd8(cospis0, cospis1, c);
        idct8x8_64_1d_bd8(cospis0, cospis1, c);
        break;
      }

      case ADST_DCT: {
        const int16x8_t cospis = vld1q_s16(kCospi);
        const int16x4_t cospis0 = vget_low_s16(cospis);   // cospi 0, 8, 16, 24
        const int16x4_t cospis1 = vget_high_s16(cospis);  // cospi 4, 12, 20, 28

        idct8x8_64_1d_bd8(cospis0, cospis1, c);
        transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6],
                          &c[7]);
        iadst8(c);
        break;
      }

      case DCT_ADST: {
        const int16x8_t cospis = vld1q_s16(kCospi);
        const int16x4_t cospis0 = vget_low_s16(cospis);   // cospi 0, 8, 16, 24
        const int16x4_t cospis1 = vget_high_s16(cospis);  // cospi 4, 12, 20, 28

        transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6],
                          &c[7]);
        iadst8(c);
        idct8x8_64_1d_bd8(cospis0, cospis1, c);
        break;
      }

      default: {
        transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6],
                          &c[7]);
        iadst8(c);
        transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6],
                          &c[7]);
        iadst8(c);
        break;
      }
    }

    c[0] = vrshrq_n_s16(c[0], 5);
    c[1] = vrshrq_n_s16(c[1], 5);
    c[2] = vrshrq_n_s16(c[2], 5);
    c[3] = vrshrq_n_s16(c[3], 5);
    c[4] = vrshrq_n_s16(c[4], 5);
    c[5] = vrshrq_n_s16(c[5], 5);
    c[6] = vrshrq_n_s16(c[6], 5);
    c[7] = vrshrq_n_s16(c[7], 5);
  } else {
    switch (tx_type) {
      case DCT_DCT: {
        const int32x4_t cospis0 = vld1q_s32(kCospi32);  // cospi 0, 8, 16, 24
        const int32x4_t cospis1 =
            vld1q_s32(kCospi32 + 4);  // cospi 4, 12, 20, 28

        if (bd == 10) {
          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
                                 &a[4], &a[5], &a[6], &a[7]);
          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
                                 &a[12], &a[13], &a[14], &a[15]);
          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
                                 &a[2], &a[10], &a[3], &a[11]);
          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
                                 &a[6], &a[14], &a[7], &a[15]);
        } else {
          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
                                 &a[4], &a[5], &a[6], &a[7]);
          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
                                 &a[12], &a[13], &a[14], &a[15]);
          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
                                 &a[2], &a[10], &a[3], &a[11]);
          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
                                 &a[6], &a[14], &a[7], &a[15]);
        }
        break;
      }

      case ADST_DCT: {
        const int32x4_t cospis0 = vld1q_s32(kCospi32);  // cospi 0, 8, 16, 24
        const int32x4_t cospis1 =
            vld1q_s32(kCospi32 + 4);  // cospi 4, 12, 20, 28

        if (bd == 10) {
          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
                                 &a[4], &a[5], &a[6], &a[7]);
          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
                                 &a[12], &a[13], &a[14], &a[15]);
          transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3],
                            &a[11]);
          iadst8_bd10(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]);
          transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
                            &a[15]);
          iadst8_bd10(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
                      &a[15]);
        } else {
          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
                                 &a[4], &a[5], &a[6], &a[7]);
          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
                                 &a[12], &a[13], &a[14], &a[15]);
          transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3],
                            &a[11]);
          iadst8_bd12(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]);
          transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
                            &a[15]);
          iadst8_bd12(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
                      &a[15]);
        }
        break;
      }

      case DCT_ADST: {
        const int32x4_t cospis0 = vld1q_s32(kCospi32);  // cospi 0, 8, 16, 24
        const int32x4_t cospis1 =
            vld1q_s32(kCospi32 + 4);  // cospi 4, 12, 20, 28

        if (bd == 10) {
          transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
                            &a[7]);
          iadst8_bd10(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
          transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13],
                            &a[14], &a[15]);
          iadst8_bd10(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
                      &a[15]);
          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
                                 &a[2], &a[10], &a[3], &a[11]);
          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
                                 &a[6], &a[14], &a[7], &a[15]);
        } else {
          transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
                            &a[7]);
          iadst8_bd12(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
          transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13],
                            &a[14], &a[15]);
          iadst8_bd12(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
                      &a[15]);
          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
                                 &a[2], &a[10], &a[3], &a[11]);
          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
                                 &a[6], &a[14], &a[7], &a[15]);
        }
        break;
      }

      default: {
        assert(tx_type == ADST_ADST);
        if (bd == 10) {
          transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
                            &a[7]);
          iadst8_bd10(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
          transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13],
                            &a[14], &a[15]);
          iadst8_bd10(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
                      &a[15]);
          transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3],
                            &a[11]);
          iadst8_bd10(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]);
          transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
                            &a[15]);
          iadst8_bd10(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
                      &a[15]);
        } else {
          transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
                            &a[7]);
          iadst8_bd12(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
          transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13],
                            &a[14], &a[15]);
          iadst8_bd12(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
                      &a[15]);
          transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3],
                            &a[11]);
          iadst8_bd12(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]);
          transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
                            &a[15]);
          iadst8_bd12(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
                      &a[15]);
        }
        break;
      }
    }

    c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5));
    c[1] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5));
    c[2] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5));
    c[3] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5));
    c[4] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5));
    c[5] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5));
    c[6] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5));
    c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5));
  }
  highbd_add8x8(c, dest, stride, bd);
}
Example #16
void silk_warped_autocorrelation_FIX_neon(
          opus_int32                *corr,                                  /* O    Result [order + 1]                                                          */
          opus_int                  *scale,                                 /* O    Scaling of the correlation vector                                           */
    const opus_int16                *input,                                 /* I    Input data to correlate                                                     */
    const opus_int                  warping_Q16,                            /* I    Warping coefficient                                                         */
    const opus_int                  length,                                 /* I    Length of input                                                             */
    const opus_int                  order                                   /* I    Correlation order (even)                                                    */
)
{
    if( ( MAX_SHAPE_LPC_ORDER > 24 ) || ( order < 6 ) ) {
        silk_warped_autocorrelation_FIX_c( corr, scale, input, warping_Q16, length, order );
    } else {
        opus_int       n, i, lsh;
        opus_int64     corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; /* In reverse order */
        opus_int64     corr_QC_orderT;
        int64x2_t      lsh_s64x2;
        const opus_int orderT = ( order + 3 ) & ~3;
        opus_int64     *corr_QCT;
        opus_int32     *input_QS;
        VARDECL( opus_int32, input_QST );
        VARDECL( opus_int32, state );
        SAVE_STACK;

        /* Order must be even */
        silk_assert( ( order & 1 ) == 0 );
        silk_assert( 2 * QS - QC >= 0 );

        ALLOC( input_QST, length + 2 * MAX_SHAPE_LPC_ORDER, opus_int32 );

        input_QS = input_QST;
        /* input_QS has zero paddings in the beginning and end. */
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;

        /* Loop over samples */
        for( n = 0; n < length - 7; n += 8, input_QS += 8 ) {
            const int16x8_t t0_s16x8 = vld1q_s16( input + n );
            vst1q_s32( input_QS + 0, vshll_n_s16( vget_low_s16( t0_s16x8 ), QS ) );
            vst1q_s32( input_QS + 4, vshll_n_s16( vget_high_s16( t0_s16x8 ), QS ) );
        }
        for( ; n < length; n++, input_QS++ ) {
            input_QS[ 0 ] = silk_LSHIFT32( (opus_int32)input[ n ], QS );
        }
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS = input_QST + MAX_SHAPE_LPC_ORDER - orderT;

        /* The following loop runs ( length + order ) times, with ( order ) extra epilogues.                  */
        /* The zero paddings in input_QS guarantee corr_QC's correctness even with the extra epilogues.       */
        /* The values of state_QS will be polluted by the extra epilogues, however they are temporary values. */

        /* Keep the C code here to help understand the intrinsics optimization. */
        /*
        {
            opus_int32 state_QS[ 2 ][ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 };
            opus_int32 *state_QST[ 3 ];
            state_QST[ 0 ] = state_QS[ 0 ];
            state_QST[ 1 ] = state_QS[ 1 ];
            for( n = 0; n < length + order; n++, input_QS++ ) {
                state_QST[ 0 ][ orderT ] = input_QS[ orderT ];
                for( i = 0; i < orderT; i++ ) {
                    corr_QC[ i ] += silk_RSHIFT64( silk_SMULL( state_QST[ 0 ][ i ], input_QS[ i ] ), 2 * QS - QC );
                    state_QST[ 1 ][ i ] = silk_SMLAWB( state_QST[ 1 ][ i + 1 ], state_QST[ 0 ][ i ] - state_QST[ 0 ][ i + 1 ], warping_Q16 );
                }
                state_QST[ 2 ] = state_QST[ 0 ];
                state_QST[ 0 ] = state_QST[ 1 ];
                state_QST[ 1 ] = state_QST[ 2 ];
            }
        }
        */

        {
            const int32x4_t warping_Q16_s32x4 = vdupq_n_s32( warping_Q16 << 15 );
            const opus_int32 *in = input_QS + orderT;
            opus_int o = orderT;
            int32x4_t state_QS_s32x4[ 3 ][ 2 ];

            ALLOC( state, length + orderT, opus_int32 );
            state_QS_s32x4[ 2 ][ 1 ] = vdupq_n_s32( 0 );

            /* Calculate 8 taps of all inputs in each loop. */
            do {
                state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 0 ][ 1 ] =
                state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 1 ][ 1 ] = vdupq_n_s32( 0 );
                n = 0;
                do {
                    calc_corr( input_QS + n, corr_QC, o - 8, state_QS_s32x4[ 0 ][ 0 ] );
                    calc_corr( input_QS + n, corr_QC, o - 4, state_QS_s32x4[ 0 ][ 1 ] );
                    state_QS_s32x4[ 2 ][ 1 ] = vld1q_s32( in + n );
                    vst1q_lane_s32( state + n, state_QS_s32x4[ 0 ][ 0 ], 0 );
                    state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 0 ][ 1 ], 1 );
                    state_QS_s32x4[ 2 ][ 1 ] = vextq_s32( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], 1 );
                    state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 );
                    state_QS_s32x4[ 0 ][ 1 ] = calc_state( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], state_QS_s32x4[ 1 ][ 1 ], warping_Q16_s32x4 );
                    state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ];
                    state_QS_s32x4[ 1 ][ 1 ] = state_QS_s32x4[ 2 ][ 1 ];
                } while( ++n < ( length + order ) );
                in = state;
                o -= 8;
            } while( o > 4 );

            if( o ) {
                /* Calculate the last 4 taps of all inputs. */
                opus_int32 *stateT = state;
                silk_assert( o == 4 );
                state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 1 ][ 0 ] = vdupq_n_s32( 0 );
                n = length + order;
                do {
                    calc_corr( input_QS, corr_QC, 0, state_QS_s32x4[ 0 ][ 0 ] );
                    state_QS_s32x4[ 2 ][ 0 ] = vld1q_s32( stateT );
                    vst1q_lane_s32( stateT, state_QS_s32x4[ 0 ][ 0 ], 0 );
                    state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], 1 );
                    state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 );
                    state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ];
                    input_QS++;
                    stateT++;
                } while( --n );
            }
        }

        {
            const opus_int16 *inputT = input;
            int32x4_t t_s32x4;
            int64x1_t t_s64x1;
            int64x2_t t_s64x2 = vdupq_n_s64( 0 );
            for( n = 0; n <= length - 8; n += 8 ) {
                int16x8_t input_s16x8 = vld1q_s16( inputT );
                t_s32x4 = vmull_s16( vget_low_s16( input_s16x8 ), vget_low_s16( input_s16x8 ) );
                t_s32x4 = vmlal_s16( t_s32x4, vget_high_s16( input_s16x8 ), vget_high_s16( input_s16x8 ) );
                t_s64x2 = vaddw_s32( t_s64x2, vget_low_s32( t_s32x4 ) );
                t_s64x2 = vaddw_s32( t_s64x2, vget_high_s32( t_s32x4 ) );
                inputT += 8;
            }
            t_s64x1 = vadd_s64( vget_low_s64( t_s64x2 ), vget_high_s64( t_s64x2 ) );
            corr_QC_orderT = vget_lane_s64( t_s64x1, 0 );
            for( ; n < length; n++ ) {
                corr_QC_orderT += silk_SMULL( input[ n ], input[ n ] );
            }
            corr_QC_orderT = silk_LSHIFT64( corr_QC_orderT, QC );
            corr_QC[ orderT ] = corr_QC_orderT;
        }

        corr_QCT = corr_QC + orderT - order;
        lsh = silk_CLZ64( corr_QC_orderT ) - 35;
        lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC );
        *scale = -( QC + lsh );
        silk_assert( *scale >= -30 && *scale <= 12 );
        lsh_s64x2 = vdupq_n_s64( lsh );
        for( i = 0; i <= order - 3; i += 4 ) {
            int32x4_t corr_s32x4;
            int64x2_t corr_QC0_s64x2, corr_QC1_s64x2;
            corr_QC0_s64x2 = vld1q_s64( corr_QCT + i );
            corr_QC1_s64x2 = vld1q_s64( corr_QCT + i + 2 );
            corr_QC0_s64x2 = vshlq_s64( corr_QC0_s64x2, lsh_s64x2 );
            corr_QC1_s64x2 = vshlq_s64( corr_QC1_s64x2, lsh_s64x2 );
            corr_s32x4     = vcombine_s32( vmovn_s64( corr_QC1_s64x2 ), vmovn_s64( corr_QC0_s64x2 ) );
            corr_s32x4     = vrev64q_s32( corr_s32x4 );
            vst1q_s32( corr + order - i - 3, corr_s32x4 );
        }
        if( lsh >= 0 ) {
            for( ; i < order + 1; i++ ) {
                corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_LSHIFT64( corr_QCT[ i ], lsh ) );
            }
        } else {
            for( ; i < order + 1; i++ ) {
                corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64( corr_QCT[ i ], -lsh ) );
            }
        }
        silk_assert( corr_QCT[ order ] >= 0 ); /* If breaking, decrease QC*/
        RESTORE_STACK;
    }

#ifdef OPUS_CHECK_ASM
    {
        opus_int32 corr_c[ MAX_SHAPE_LPC_ORDER + 1 ];
        opus_int   scale_c;
        silk_warped_autocorrelation_FIX_c( corr_c, &scale_c, input, warping_Q16, length, order );
        silk_assert( !memcmp( corr_c, corr, sizeof( corr_c[ 0 ] ) * ( order + 1 ) ) );
        silk_assert( scale_c == *scale );
    }
#endif
}
Example #17
void test_vld1Qs32 (void)
{
  int32x4_t out_int32x4_t;

  out_int32x4_t = vld1q_s32 (0);
}
Example #18
inline   int32x4_t vld1q(const s32 * ptr) { return vld1q_s32(ptr); }
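The wrapper gives the typed NEON loads one overloaded name so templated code can call vld1q for any element type; a companion overload for unsigned 32-bit data (assuming a matching u32 typedef exists) would follow the same pattern:

inline   uint32x4_t vld1q(const u32 * ptr) { return vld1q_u32(ptr); }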
Example #19
static INLINE void load_8x8_s32_dual(
    const tran_low_t *input, int32x4x2_t *const in0, int32x4x2_t *const in1,
    int32x4x2_t *const in2, int32x4x2_t *const in3, int32x4x2_t *const in4,
    int32x4x2_t *const in5, int32x4x2_t *const in6, int32x4x2_t *const in7) {
  in0->val[0] = vld1q_s32(input);
  in0->val[1] = vld1q_s32(input + 4);
  input += 32;
  in1->val[0] = vld1q_s32(input);
  in1->val[1] = vld1q_s32(input + 4);
  input += 32;
  in2->val[0] = vld1q_s32(input);
  in2->val[1] = vld1q_s32(input + 4);
  input += 32;
  in3->val[0] = vld1q_s32(input);
  in3->val[1] = vld1q_s32(input + 4);
  input += 32;
  in4->val[0] = vld1q_s32(input);
  in4->val[1] = vld1q_s32(input + 4);
  input += 32;
  in5->val[0] = vld1q_s32(input);
  in5->val[1] = vld1q_s32(input + 4);
  input += 32;
  in6->val[0] = vld1q_s32(input);
  in6->val[1] = vld1q_s32(input + 4);
  input += 32;
  in7->val[0] = vld1q_s32(input);
  in7->val[1] = vld1q_s32(input + 4);
}
Example #20
void ne10_img_vresize_linear_neon (const int** src, unsigned char* dst, const short* beta, int width)
{
    const int *S0 = src[0], *S1 = src[1];

    int32x4_t qS0_0123, qS0_4567, qS1_0123, qS1_4567;
    int32x4_t qT_0123, qT_4567;
    int16x4_t dT_0123, dT_4567;
    uint16x8_t qT_01234567;
    uint8x8_t dT_01234567, dDst_01234567;

    int32x2_t dBeta = vdup_n_s32 (0);
    dBeta = vset_lane_s32 ( (int) (beta[0]), dBeta, 0);
    dBeta = vset_lane_s32 ( (int) (beta[1]), dBeta, 1);

    int32x4_t qDelta, qMin, qMax;
    qDelta = vdupq_n_s32 (DELTA);
    qMin = vdupq_n_s32 (0);
    qMax = vdupq_n_s32 (255);

    int x = 0;
    for (; x <= width - 8; x += 8)
    {
        qS0_0123 = vld1q_s32 (&S0[x]);
        qS0_4567 = vld1q_s32 (&S0[x + 4]);
        qS1_0123 = vld1q_s32 (&S1[x]);
        qS1_4567 = vld1q_s32 (&S1[x + 4]);

        qT_0123 = vmulq_lane_s32 (qS0_0123, dBeta, 0);
        qT_4567 = vmulq_lane_s32 (qS0_4567, dBeta, 0);
        qT_0123 = vmlaq_lane_s32 (qT_0123, qS1_0123, dBeta, 1);
        qT_4567 = vmlaq_lane_s32 (qT_4567, qS1_4567, dBeta, 1);

        qT_0123 = vaddq_s32 (qT_0123, qDelta);
        qT_4567 = vaddq_s32 (qT_4567, qDelta);

        qT_0123 = vshrq_n_s32 (qT_0123, BITS);
        qT_4567 = vshrq_n_s32 (qT_4567, BITS);

        qT_0123 = vmaxq_s32 (qT_0123, qMin);
        qT_4567 = vmaxq_s32 (qT_4567, qMin);
        qT_0123 = vminq_s32 (qT_0123, qMax);
        qT_4567 = vminq_s32 (qT_4567, qMax);

        dT_0123 = vmovn_s32 (qT_0123);
        dT_4567 = vmovn_s32 (qT_4567);
        qT_01234567 = vreinterpretq_u16_s16 (vcombine_s16 (dT_0123, dT_4567));
        dT_01234567 = vmovn_u16 (qT_01234567);

        vst1_u8 (&dst[x], dT_01234567);
    }

    if (x < width)
    {
        uint8x8_t dMask;
        dMask = vld1_u8 ( (uint8_t *) (&ne10_img_vresize_linear_mask_residual_table[ (width - x - 1)]));
        dDst_01234567 = vld1_u8 (&dst[x]);

        qS0_0123 = vld1q_s32 (&S0[x]);
        qS0_4567 = vld1q_s32 (&S0[x + 4]);
        qS1_0123 = vld1q_s32 (&S1[x]);
        qS1_4567 = vld1q_s32 (&S1[x + 4]);

        qT_0123 = vmulq_lane_s32 (qS0_0123, dBeta, 0);
        qT_4567 = vmulq_lane_s32 (qS0_4567, dBeta, 0);
        qT_0123 = vmlaq_lane_s32 (qT_0123, qS1_0123, dBeta, 1);
        qT_4567 = vmlaq_lane_s32 (qT_4567, qS1_4567, dBeta, 1);

        qT_0123 = vaddq_s32 (qT_0123, qDelta);
        qT_4567 = vaddq_s32 (qT_4567, qDelta);

        qT_0123 = vshrq_n_s32 (qT_0123, BITS);
        qT_4567 = vshrq_n_s32 (qT_4567, BITS);

        qT_0123 = vmaxq_s32 (qT_0123, qMin);
        qT_4567 = vmaxq_s32 (qT_4567, qMin);
        qT_0123 = vminq_s32 (qT_0123, qMax);
        qT_4567 = vminq_s32 (qT_4567, qMax);

        dT_0123 = vmovn_s32 (qT_0123);
        dT_4567 = vmovn_s32 (qT_4567);
        qT_01234567 = vreinterpretq_u16_s16 (vcombine_s16 (dT_0123, dT_4567));
        dT_01234567 = vmovn_u16 (qT_01234567);

        dMask = vbsl_u8 (dMask, dT_01234567, dDst_01234567);
        vst1_u8 (&dst[x], dMask);
    }
}
Example #21
// Contains a function for the core loop in the normalized lattice MA
// filter routine for iSAC codec, optimized for ARM Neon platform.
// It does:
//  for 0 <= n < HALF_SUBFRAMELEN - 1:
//    *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));
//    *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
// Output is not bit-exact with the reference C code, due to the replacement
// of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon
// instructions. The difference should not be bigger than 1.
void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,  // Filter coefficient
                                    int16_t input1,  // Filter coefficient
                                    int32_t input2,  // Inverse coefficient
                                    int32_t* ptr0,   // Sample buffer
                                    int32_t* ptr1,   // Sample buffer
                                    int32_t* ptr2)   // Sample buffer
{
  int n = 0;
  int loop = (HALF_SUBFRAMELEN - 1) >> 3;
  int loop_tail = (HALF_SUBFRAMELEN - 1) & 0x7;

  int32x4_t input0_v = vdupq_n_s32((int32_t)input0 << 16);
  int32x4_t input1_v = vdupq_n_s32((int32_t)input1 << 16);
  int32x4_t input2_v = vdupq_n_s32(input2);
  int32x4_t tmp0a, tmp1a, tmp2a, tmp3a;
  int32x4_t tmp0b, tmp1b, tmp2b, tmp3b;
  int32x4_t ptr0va, ptr1va, ptr2va;
  int32x4_t ptr0vb, ptr1vb, ptr2vb;

  // Unroll to process 8 samples at once.
  for (n = 0; n < loop; n++) {
    ptr0va = vld1q_s32(ptr0);
    ptr0vb = vld1q_s32(ptr0 + 4);
    ptr0 += 8;

    ptr2va = vld1q_s32(ptr2);
    ptr2vb = vld1q_s32(ptr2 + 4);

    // Calculate tmp0 = (*ptr0) * input0.
    tmp0a = vqrdmulhq_s32(ptr0va, input0_v);
    tmp0b = vqrdmulhq_s32(ptr0vb, input0_v);

    // Calculate tmp1 = (*ptr0) * input1.
    tmp1a = vqrdmulhq_s32(ptr0va, input1_v);
    tmp1b = vqrdmulhq_s32(ptr0vb, input1_v);

    // Calculate tmp2 = tmp0 + *(ptr2).
    tmp2a = vaddq_s32(tmp0a, ptr2va);
    tmp2b = vaddq_s32(tmp0b, ptr2vb);
    tmp2a = vshlq_n_s32(tmp2a, 15);
    tmp2b = vshlq_n_s32(tmp2b, 15);

    // Calculate *ptr2 = input2 * tmp2.
    ptr2va = vqrdmulhq_s32(tmp2a, input2_v);
    ptr2vb = vqrdmulhq_s32(tmp2b, input2_v);

    vst1q_s32(ptr2, ptr2va);
    vst1q_s32(ptr2 + 4, ptr2vb);
    ptr2 += 8;

    // Calculate tmp3 = ptr2v * input0.
    tmp3a = vqrdmulhq_s32(ptr2va, input0_v);
    tmp3b = vqrdmulhq_s32(ptr2vb, input0_v);

    // Calculate *ptr1 = tmp1 + tmp3.
    ptr1va = vaddq_s32(tmp1a, tmp3a);
    ptr1vb = vaddq_s32(tmp1b, tmp3b);

    vst1q_s32(ptr1, ptr1va);
    vst1q_s32(ptr1 + 4, ptr1vb);
    ptr1 += 8;
  }

  // Process four more samples.
  if (loop_tail & 0x4) {
    ptr0va = vld1q_s32(ptr0);
    ptr2va = vld1q_s32(ptr2);
    ptr0 += 4;

    // Calculate tmp0 = (*ptr0) * input0.
    tmp0a = vqrdmulhq_s32(ptr0va, input0_v);

    // Calculate tmp1 = (*ptr0) * input1.
    tmp1a = vqrdmulhq_s32(ptr0va, input1_v);

    // Calculate tmp2 = tmp0 + *(ptr2).
    tmp2a = vaddq_s32(tmp0a, ptr2va);
    tmp2a = vshlq_n_s32(tmp2a, 15);

    // Calculate *ptr2 = input2 * tmp2.
    ptr2va = vqrdmulhq_s32(tmp2a, input2_v);

    vst1q_s32(ptr2, ptr2va);
    ptr2 += 4;

    // Calculate tmp3 = *(ptr2) * input0.
    tmp3a = vqrdmulhq_s32(ptr2va, input0_v);

    // Calculate *ptr1 = tmp1 + tmp3.
    ptr1va = vaddq_s32(tmp1a, tmp3a);

    vst1q_s32(ptr1, ptr1va);
    ptr1 += 4;
  }

  // Process two more samples.
  if (loop_tail & 0x2) {
    int32x2_t ptr0v_tail, ptr2v_tail, ptr1v_tail;
    int32x2_t tmp0_tail, tmp1_tail, tmp2_tail, tmp3_tail;
    ptr0v_tail = vld1_s32(ptr0);
    ptr2v_tail = vld1_s32(ptr2);
    ptr0 += 2;

    // Calculate tmp0 = (*ptr0) * input0.
    tmp0_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input0_v));

    // Calculate tmp1 = (*ptr0) * input1.
    tmp1_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input1_v));

    // Calculate tmp2 = tmp0 + *(ptr2).
    tmp2_tail = vadd_s32(tmp0_tail, ptr2v_tail);
    tmp2_tail = vshl_n_s32(tmp2_tail, 15);

    // Calculate *ptr2 = input2 * tmp2.
    ptr2v_tail = vqrdmulh_s32(tmp2_tail, vget_low_s32(input2_v));

    vst1_s32(ptr2, ptr2v_tail);
    ptr2 += 2;

    // Calculate tmp3 = *(ptr2) * input0.
    tmp3_tail = vqrdmulh_s32(ptr2v_tail, vget_low_s32(input0_v));

    // Calculate *ptr1 = tmp1 + tmp3.
    ptr1v_tail = vadd_s32(tmp1_tail, tmp3_tail);

    vst1_s32(ptr1, ptr1v_tail);
    ptr1 += 2;
  }

  // Process one more sample.
  if (loop_tail & 0x1) {
    int16_t t16a = (int16_t)(input2 >> 16);
    int16_t t16b = (int16_t)input2;
    if (t16b < 0) t16a++;
    int32_t tmp32a;
    int32_t tmp32b;

    // Calculate *ptr2 = input2 * (*ptr2 + input0 * (*ptr0)).
    tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr0);
    tmp32b = *ptr2 + tmp32a;
    *ptr2 = (int32_t)(WEBRTC_SPL_MUL(t16a, tmp32b) +
                       (WEBRTC_SPL_MUL_16_32_RSFT16(t16b, tmp32b)));

    // Calculate *ptr1 = input1 * (*ptr0) + input0 * (*ptr2).
    tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input1, *ptr0);
    tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr2);
    *ptr1 = tmp32a + tmp32b;
  }
}
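For reference, a simplified scalar model of the loop this function vectorizes (the exact fixed-point rounding of the WebRTC macros is omitted; HALF_SUBFRAMELEN is assumed to be defined elsewhere):

// Simplified scalar sketch of the filtered loop above; rounding differs slightly
// from both the NEON code and the bit-exact reference C implementation.
static void FilterMaLoopRef(int16_t input0, int16_t input1, int32_t input2,
                            int32_t* ptr0, int32_t* ptr1, int32_t* ptr2) {
  int n;
  for (n = 0; n < HALF_SUBFRAMELEN - 1; n++) {
    // *ptr2 = input2 * (*ptr2 + input0 * (*ptr0)), with Q15/Q16 scaling.
    int64_t t = (((int64_t)input0 * ptr0[n]) >> 15) + ptr2[n];
    ptr2[n] = (int32_t)(((int64_t)input2 * t) >> 16);
    // *ptr1 = input1 * (*ptr0) + input0 * (*ptr2), with Q15 scaling.
    ptr1[n] = (int32_t)((((int64_t)input1 * ptr0[n]) >> 15) +
                        (((int64_t)input0 * ptr2[n]) >> 15));
  }
}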