void test_vshll_ns16 (void) { int32x4_t out_int32x4_t; int16x4_t arg0_int16x4_t; out_int32x4_t = vshll_n_s16 (arg0_int16x4_t, 1); }
void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore* aecm) { assert((uintptr_t)(aecm->channelStored) % 16 == 0); assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0); assert((uintptr_t)(aecm->channelAdapt32) % 32 == 0); // The C code of following optimized code. // for (i = 0; i < PART_LEN1; i++) { // aecm->channelAdapt16[i] = aecm->channelStored[i]; // aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32( // (int32_t)aecm->channelStored[i], 16); // } int16_t* start_stored_p = aecm->channelStored; int16_t* start_adapt16_p = aecm->channelAdapt16; int32_t* start_adapt32_p = aecm->channelAdapt32; const int16_t* end_stored_p = start_stored_p + PART_LEN; int16x8_t stored_v; int32x4_t adapt32_v_low, adapt32_v_high; while (start_stored_p < end_stored_p) { stored_v = vld1q_s16(start_stored_p); vst1q_s16(start_adapt16_p, stored_v); adapt32_v_low = vshll_n_s16(vget_low_s16(stored_v), 16); adapt32_v_high = vshll_n_s16(vget_high_s16(stored_v), 16); vst1q_s32(start_adapt32_p, adapt32_v_low); vst1q_s32(start_adapt32_p + 4, adapt32_v_high); start_stored_p += 8; start_adapt16_p += 8; start_adapt32_p += 8; } aecm->channelAdapt16[PART_LEN] = aecm->channelStored[PART_LEN]; aecm->channelAdapt32[PART_LEN] = (int32_t)aecm->channelStored[PART_LEN] << 16; }
void UpsampleRgbaLinePairNEON(const uint8_t *top_y, const uint8_t *bottom_y, const uint8_t *top_u, const uint8_t *top_v, const uint8_t *cur_u, const uint8_t *cur_v, uint8_t *top_dst, uint8_t *bottom_dst, int len) { int block; uint8_t uv_buf[2 * 32 + 15]; uint8_t *const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); const int uv_len = (len + 1) >> 1; const int num_blocks = (uv_len - 1) >> 3; const int leftover = uv_len - num_blocks * 8; const int last_pos = 1 + 16 * num_blocks; const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; const int16x4_t cf16 = vld1_s16(coef); const int32x2_t cf32 = vmov_n_s32(76283); const uint8x8_t u16 = vmov_n_u8(16); const uint8x8_t u128 = vmov_n_u8(128); for (block = 0; block < num_blocks; ++block) { { uint8x8_t a = vld1_u8(top_u); uint8x8_t b = vld1_u8(top_u + 1); uint8x8_t c = vld1_u8(cur_u); uint8x8_t d = vld1_u8(cur_u + 1); uint16x8_t al = vshll_n_u8(a, 1); uint16x8_t bl = vshll_n_u8(b, 1); uint16x8_t cl = vshll_n_u8(c, 1); uint16x8_t dl = vshll_n_u8(d, 1); uint8x8_t diag1, diag2; uint16x8_t sl; sl = vaddl_u8(a, b); sl = vaddw_u8(sl, c); sl = vaddw_u8(sl, d); al = vaddq_u16(sl, al); bl = vaddq_u16(sl, bl); al = vaddq_u16(al, dl); bl = vaddq_u16(bl, cl); diag2 = vshrn_n_u16(al, 3); diag1 = vshrn_n_u16(bl, 3); a = vrhadd_u8(a, diag1); b = vrhadd_u8(b, diag2); c = vrhadd_u8(c, diag2); d = vrhadd_u8(d, diag1); { const uint8x8x2_t a_b = {{ a, b }}; const uint8x8x2_t c_d = {{ c, d }}; vst2_u8(r_uv, a_b); vst2_u8(r_uv + 32, c_d); } } { uint8x8_t a = vld1_u8(top_v); uint8x8_t b = vld1_u8(top_v + 1); uint8x8_t c = vld1_u8(cur_v); uint8x8_t d = vld1_u8(cur_v + 1); uint16x8_t al = vshll_n_u8(a, 1); uint16x8_t bl = vshll_n_u8(b, 1); uint16x8_t cl = vshll_n_u8(c, 1); uint16x8_t dl = vshll_n_u8(d, 1); uint8x8_t diag1, diag2; uint16x8_t sl; sl = vaddl_u8(a, b); sl = vaddw_u8(sl, c); sl = vaddw_u8(sl, d); al = vaddq_u16(sl, al); bl = vaddq_u16(sl, bl); al = vaddq_u16(al, dl); bl = vaddq_u16(bl, cl); diag2 = vshrn_n_u16(al, 3); diag1 = vshrn_n_u16(bl, 3); a = vrhadd_u8(a, diag1); b = vrhadd_u8(b, diag2); c = vrhadd_u8(c, diag2); d = vrhadd_u8(d, diag1); { const uint8x8x2_t a_b = {{ a, b }}; const uint8x8x2_t c_d = {{ c, d }}; vst2_u8(r_uv + 16, a_b); vst2_u8(r_uv + 16 + 32, c_d); } } { if (top_y) { { int i; for (i = 0; i < 16; i += 8) { int off = ((16 * block + 1) + i) * 4; uint8x8_t y = vld1_u8(top_y + (16 * block + 1) + i); uint8x8_t u = vld1_u8((r_uv) + i); uint8x8_t v = vld1_u8((r_uv) + i + 16); int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16)); int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128)); int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128)); int16x8_t ud = vshlq_n_s16(uu, 1); int16x8_t vd = vshlq_n_s16(vv, 1); int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1), vget_low_s16(vd), cf16, 0); int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1), vget_high_s16(vd), cf16, 0); int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16), vrshrn_n_s32(vrh, 16)); int32x4_t vl = vmovl_s16(vget_low_s16(vv)); int32x4_t vh = vmovl_s16(vget_high_s16(vv)); int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu), cf16, 1); int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1); int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv), cf16, 2); int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2); int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16), vrshrn_n_s32(gch, 16)); int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1), vget_low_s16(ud), cf16, 3); int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1), vget_high_s16(ud), cf16, 3); int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16), vrshrn_n_s32(ubh, 16)); int32x4_t rl = vaddl_s16(vget_low_s16(yy), vget_low_s16(vr)); int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr)); int32x4_t gl = vsubl_s16(vget_low_s16(yy), vget_low_s16(gc)); int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc)); int32x4_t bl = vaddl_s16(vget_low_s16(yy), vget_low_s16(ub)); int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub)); rl = vmulq_lane_s32(rl, cf32, 0); rh = vmulq_lane_s32(rh, cf32, 0); gl = vmulq_lane_s32(gl, cf32, 0); gh = vmulq_lane_s32(gh, cf32, 0); bl = vmulq_lane_s32(bl, cf32, 0); bh = vmulq_lane_s32(bh, cf32, 0); y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16), vrshrn_n_s32(rh, 16))); u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16), vrshrn_n_s32(gh, 16))); v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16), vrshrn_n_s32(bh, 16))); do { const uint8x8x4_t r_g_b_v255 = {{ y, u, v, vmov_n_u8(255) }}; vst4_u8(top_dst + off, r_g_b_v255); } while (0); } } } if (bottom_y) { { int i; for (i = 0; i < 16; i += 8) { int off = ((16 * block + 1) + i) * 4; uint8x8_t y = vld1_u8(bottom_y + (16 * block + 1) + i); uint8x8_t u = vld1_u8(((r_uv) + 32) + i); uint8x8_t v = vld1_u8(((r_uv) + 32) + i + 16); int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16)); int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128)); int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128)); int16x8_t ud = vshlq_n_s16(uu, 1); int16x8_t vd = vshlq_n_s16(vv, 1); int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1), vget_low_s16(vd), cf16, 0); int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1), vget_high_s16(vd), cf16, 0); int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16), vrshrn_n_s32(vrh, 16)); int32x4_t vl = vmovl_s16(vget_low_s16(vv)); int32x4_t vh = vmovl_s16(vget_high_s16(vv)); int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu), cf16, 1); int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1); int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv), cf16, 2); int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2); int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16), vrshrn_n_s32(gch, 16)); int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1), vget_low_s16(ud), cf16, 3); int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1), vget_high_s16(ud), cf16, 3); int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16), vrshrn_n_s32(ubh, 16)); int32x4_t rl = vaddl_s16(vget_low_s16(yy), vget_low_s16(vr)); int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr)); int32x4_t gl = vsubl_s16(vget_low_s16(yy), vget_low_s16(gc)); int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc)); int32x4_t bl = vaddl_s16(vget_low_s16(yy), vget_low_s16(ub)); int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub)); rl = vmulq_lane_s32(rl, cf32, 0); rh = vmulq_lane_s32(rh, cf32, 0); gl = vmulq_lane_s32(gl, cf32, 0); gh = vmulq_lane_s32(gh, cf32, 0); bl = vmulq_lane_s32(bl, cf32, 0); bh = vmulq_lane_s32(bh, cf32, 0); y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16), vrshrn_n_s32(rh, 16))); u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16), vrshrn_n_s32(gh, 16))); v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16), vrshrn_n_s32(bh, 16))); do { const uint8x8x4_t r_g_b_v255 = {{ y, u, v, vmov_n_u8(255) }}; vst4_u8(bottom_dst + off, r_g_b_v255); } while (0); } } } } } }
void silk_biquad_alt_stride2_neon( const opus_int16 *in, /* I input signal */ const opus_int32 *B_Q28, /* I MA coefficients [3] */ const opus_int32 *A_Q28, /* I AR coefficients [2] */ opus_int32 *S, /* I/O State vector [4] */ opus_int16 *out, /* O output signal */ const opus_int32 len /* I signal length (must be even) */ ) { /* DIRECT FORM II TRANSPOSED (uses 2 element state vector) */ opus_int k = 0; const int32x2_t offset_s32x2 = vdup_n_s32( (1<<14) - 1 ); const int32x4_t offset_s32x4 = vcombine_s32( offset_s32x2, offset_s32x2 ); int16x4_t in_s16x4 = vdup_n_s16( 0 ); int16x4_t out_s16x4; int32x2_t A_Q28_s32x2, A_L_s32x2, A_U_s32x2, B_Q28_s32x2, t_s32x2; int32x4_t A_L_s32x4, A_U_s32x4, B_Q28_s32x4, S_s32x4, out32_Q14_s32x4; int32x2x2_t t0_s32x2x2, t1_s32x2x2, t2_s32x2x2, S_s32x2x2; #ifdef OPUS_CHECK_ASM opus_int32 S_c[ 4 ]; VARDECL( opus_int16, out_c ); SAVE_STACK; ALLOC( out_c, 2 * len, opus_int16 ); silk_memcpy( &S_c, S, sizeof( S_c ) ); silk_biquad_alt_stride2_c( in, B_Q28, A_Q28, S_c, out_c, len ); #endif /* Negate A_Q28 values and split in two parts */ A_Q28_s32x2 = vld1_s32( A_Q28 ); A_Q28_s32x2 = vneg_s32( A_Q28_s32x2 ); A_L_s32x2 = vshl_n_s32( A_Q28_s32x2, 18 ); /* ( -A_Q28[] & 0x00003FFF ) << 18 */ A_L_s32x2 = vreinterpret_s32_u32( vshr_n_u32( vreinterpret_u32_s32( A_L_s32x2 ), 3 ) ); /* ( -A_Q28[] & 0x00003FFF ) << 15 */ A_U_s32x2 = vshr_n_s32( A_Q28_s32x2, 14 ); /* silk_RSHIFT( -A_Q28[], 14 ) */ A_U_s32x2 = vshl_n_s32( A_U_s32x2, 16 ); /* silk_RSHIFT( -A_Q28[], 14 ) << 16 (Clip two leading bits to conform to C function.) */ A_U_s32x2 = vshr_n_s32( A_U_s32x2, 1 ); /* silk_RSHIFT( -A_Q28[], 14 ) << 15 */ B_Q28_s32x2 = vld1_s32( B_Q28 ); t_s32x2 = vld1_s32( B_Q28 + 1 ); t0_s32x2x2 = vzip_s32( A_L_s32x2, A_L_s32x2 ); t1_s32x2x2 = vzip_s32( A_U_s32x2, A_U_s32x2 ); t2_s32x2x2 = vzip_s32( t_s32x2, t_s32x2 ); A_L_s32x4 = vcombine_s32( t0_s32x2x2.val[ 0 ], t0_s32x2x2.val[ 1 ] ); /* A{0,0,1,1}_L_Q28 */ A_U_s32x4 = vcombine_s32( t1_s32x2x2.val[ 0 ], t1_s32x2x2.val[ 1 ] ); /* A{0,0,1,1}_U_Q28 */ B_Q28_s32x4 = vcombine_s32( t2_s32x2x2.val[ 0 ], t2_s32x2x2.val[ 1 ] ); /* B_Q28[ {1,1,2,2} ] */ S_s32x4 = vld1q_s32( S ); /* S0 = S[ 0 ]; S3 = S[ 3 ]; */ S_s32x2x2 = vtrn_s32( vget_low_s32( S_s32x4 ), vget_high_s32( S_s32x4 ) ); /* S2 = S[ 1 ]; S1 = S[ 2 ]; */ S_s32x4 = vcombine_s32( S_s32x2x2.val[ 0 ], S_s32x2x2.val[ 1 ] ); for( ; k < len - 1; k += 2 ) { int32x4_t in_s32x4[ 2 ], t_s32x4; int32x2_t out32_Q14_s32x2[ 2 ]; /* S[ 2 * i + 0 ], S[ 2 * i + 1 ], S[ 2 * i + 2 ], S[ 2 * i + 3 ]: Q12 */ in_s16x4 = vld1_s16( &in[ 2 * k ] ); /* in{0,1,2,3} = in[ 2 * k + {0,1,2,3} ]; */ in_s32x4[ 0 ] = vshll_n_s16( in_s16x4, 15 ); /* in{0,1,2,3} << 15 */ t_s32x4 = vqdmulhq_lane_s32( in_s32x4[ 0 ], B_Q28_s32x2, 0 ); /* silk_SMULWB( B_Q28[ 0 ], in{0,1,2,3} ) */ in_s32x4[ 1 ] = vcombine_s32( vget_high_s32( in_s32x4[ 0 ] ), vget_high_s32( in_s32x4[ 0 ] ) ); /* in{2,3,2,3} << 15 */ in_s32x4[ 0 ] = vcombine_s32( vget_low_s32 ( in_s32x4[ 0 ] ), vget_low_s32 ( in_s32x4[ 0 ] ) ); /* in{0,1,0,1} << 15 */ silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_low_s32 ( t_s32x4 ), in_s32x4[ 0 ], &S_s32x4, &out32_Q14_s32x2[ 0 ] ); silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_high_s32( t_s32x4 ), in_s32x4[ 1 ], &S_s32x4, &out32_Q14_s32x2[ 1 ] ); /* Scale back to Q0 and saturate */ out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2[ 0 ], out32_Q14_s32x2[ 1 ] ); /* out32_Q14_{0,1,2,3} */ out32_Q14_s32x4 = vaddq_s32( out32_Q14_s32x4, offset_s32x4 ); /* out32_Q14_{0,1,2,3} + (1<<14) - 1 */ out_s16x4 = vqshrn_n_s32( out32_Q14_s32x4, 14 ); /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ) */ vst1_s16( &out[ 2 * k ], out_s16x4 ); /* out[ 2 * k + {0,1,2,3} ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ); */ } /* Process leftover. */ if( k < len ) { int32x4_t in_s32x4; int32x2_t out32_Q14_s32x2; /* S[ 2 * i + 0 ], S[ 2 * i + 1 ]: Q12 */ in_s16x4 = vld1_lane_s16( &in[ 2 * k + 0 ], in_s16x4, 0 ); /* in{0,1} = in[ 2 * k + {0,1} ]; */ in_s16x4 = vld1_lane_s16( &in[ 2 * k + 1 ], in_s16x4, 1 ); /* in{0,1} = in[ 2 * k + {0,1} ]; */ in_s32x4 = vshll_n_s16( in_s16x4, 15 ); /* in{0,1} << 15 */ t_s32x2 = vqdmulh_lane_s32( vget_low_s32( in_s32x4 ), B_Q28_s32x2, 0 ); /* silk_SMULWB( B_Q28[ 0 ], in{0,1} ) */ in_s32x4 = vcombine_s32( vget_low_s32( in_s32x4 ), vget_low_s32( in_s32x4 ) ); /* in{0,1,0,1} << 15 */ silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, t_s32x2, in_s32x4, &S_s32x4, &out32_Q14_s32x2 ); /* Scale back to Q0 and saturate */ out32_Q14_s32x2 = vadd_s32( out32_Q14_s32x2, offset_s32x2 ); /* out32_Q14_{0,1} + (1<<14) - 1 */ out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2, out32_Q14_s32x2 ); /* out32_Q14_{0,1,0,1} + (1<<14) - 1 */ out_s16x4 = vqshrn_n_s32( out32_Q14_s32x4, 14 ); /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,0,1} + (1<<14) - 1, 14 ) ) */ vst1_lane_s16( &out[ 2 * k + 0 ], out_s16x4, 0 ); /* out[ 2 * k + 0 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_0 + (1<<14) - 1, 14 ) ); */ vst1_lane_s16( &out[ 2 * k + 1 ], out_s16x4, 1 ); /* out[ 2 * k + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_1 + (1<<14) - 1, 14 ) ); */ } vst1q_lane_s32( &S[ 0 ], S_s32x4, 0 ); /* S[ 0 ] = S0; */ vst1q_lane_s32( &S[ 1 ], S_s32x4, 2 ); /* S[ 1 ] = S2; */ vst1q_lane_s32( &S[ 2 ], S_s32x4, 1 ); /* S[ 2 ] = S1; */ vst1q_lane_s32( &S[ 3 ], S_s32x4, 3 ); /* S[ 3 ] = S3; */ #ifdef OPUS_CHECK_ASM silk_assert( !memcmp( S_c, S, sizeof( S_c ) ) ); silk_assert( !memcmp( out_c, out, 2 * len * sizeof( opus_int16 ) ) ); RESTORE_STACK; #endif }
void WebRtcIsacfix_AllpassFilter2FixDec16Neon( int16_t* data_ch1, // Input and output in channel 1, in Q0 int16_t* data_ch2, // Input and output in channel 2, in Q0 const int16_t* factor_ch1, // Scaling factor for channel 1, in Q15 const int16_t* factor_ch2, // Scaling factor for channel 2, in Q15 const int length, // Length of the data buffers int32_t* filter_state_ch1, // Filter state for channel 1, in Q16 int32_t* filter_state_ch2) { // Filter state for channel 2, in Q16 assert(length % 2 == 0); int n = 0; int16x4_t factorv; int16x4_t datav; int32x4_t statev; int32x2_t tmp; // Load factor_ch1 and factor_ch2. tmp = vld1_dup_s32((int32_t*)factor_ch1); tmp = vld1_lane_s32((int32_t*)factor_ch2, tmp, 1); factorv = vreinterpret_s16_s32(tmp); // Load filter_state_ch1[0] and filter_state_ch2[0]. statev = vld1q_dup_s32(filter_state_ch1); statev = vld1q_lane_s32(filter_state_ch2, statev, 2); // Loop unrolling preprocessing. int32x4_t a; int16x4_t tmp1, tmp2; // Load data_ch1[0] and data_ch2[0]. datav = vld1_dup_s16(data_ch1); datav = vld1_lane_s16(data_ch2, datav, 2); a = vqdmlal_s16(statev, datav, factorv); tmp1 = vshrn_n_s32(a, 16); // Update filter_state_ch1[0] and filter_state_ch2[0]. statev = vqdmlsl_s16(vshll_n_s16(datav, 16), tmp1, factorv); // Load filter_state_ch1[1] and filter_state_ch2[1]. statev = vld1q_lane_s32(filter_state_ch1 + 1, statev, 1); statev = vld1q_lane_s32(filter_state_ch2 + 1, statev, 3); // Load data_ch1[1] and data_ch2[1]. tmp1 = vld1_lane_s16(data_ch1 + 1, tmp1, 1); tmp1 = vld1_lane_s16(data_ch2 + 1, tmp1, 3); datav = vrev32_s16(tmp1); // Loop unrolling processing. for (n = 0; n < length - 2; n += 2) { a = vqdmlal_s16(statev, datav, factorv); tmp1 = vshrn_n_s32(a, 16); // Store data_ch1[n] and data_ch2[n]. vst1_lane_s16(data_ch1 + n, tmp1, 1); vst1_lane_s16(data_ch2 + n, tmp1, 3); // Update filter_state_ch1[0], filter_state_ch1[1] // and filter_state_ch2[0], filter_state_ch2[1]. statev = vqdmlsl_s16(vshll_n_s16(datav, 16), tmp1, factorv); // Load data_ch1[n + 2] and data_ch2[n + 2]. tmp1 = vld1_lane_s16(data_ch1 + n + 2, tmp1, 1); tmp1 = vld1_lane_s16(data_ch2 + n + 2, tmp1, 3); datav = vrev32_s16(tmp1); a = vqdmlal_s16(statev, datav, factorv); tmp2 = vshrn_n_s32(a, 16); // Store data_ch1[n + 1] and data_ch2[n + 1]. vst1_lane_s16(data_ch1 + n + 1, tmp2, 1); vst1_lane_s16(data_ch2 + n + 1, tmp2, 3); // Update filter_state_ch1[0], filter_state_ch1[1] // and filter_state_ch2[0], filter_state_ch2[1]. statev = vqdmlsl_s16(vshll_n_s16(datav, 16), tmp2, factorv); // Load data_ch1[n + 3] and data_ch2[n + 3]. tmp2 = vld1_lane_s16(data_ch1 + n + 3, tmp2, 1); tmp2 = vld1_lane_s16(data_ch2 + n + 3, tmp2, 3); datav = vrev32_s16(tmp2); } // Loop unrolling post-processing. a = vqdmlal_s16(statev, datav, factorv); tmp1 = vshrn_n_s32(a, 16); // Store data_ch1[n] and data_ch2[n]. vst1_lane_s16(data_ch1 + n, tmp1, 1); vst1_lane_s16(data_ch2 + n, tmp1, 3); // Update filter_state_ch1[0], filter_state_ch1[1] // and filter_state_ch2[0], filter_state_ch2[1]. statev = vqdmlsl_s16(vshll_n_s16(datav, 16), tmp1, factorv); // Store filter_state_ch1[0] and filter_state_ch2[0]. vst1q_lane_s32(filter_state_ch1, statev, 0); vst1q_lane_s32(filter_state_ch2, statev, 2); datav = vrev32_s16(tmp1); a = vqdmlal_s16(statev, datav, factorv); tmp2 = vshrn_n_s32(a, 16); // Store data_ch1[n + 1] and data_ch2[n + 1]. vst1_lane_s16(data_ch1 + n + 1, tmp2, 1); vst1_lane_s16(data_ch2 + n + 1, tmp2, 3); // Update filter_state_ch1[1] and filter_state_ch2[1]. statev = vqdmlsl_s16(vshll_n_s16(datav, 16), tmp2, factorv); // Store filter_state_ch1[1] and filter_state_ch2[1]. vst1q_lane_s32(filter_state_ch1 + 1, statev, 1); vst1q_lane_s32(filter_state_ch2 + 1, statev, 3); }
int32x4_t test_vshll_n_s16_2 (int16x4_t __a) { return vshll_n_s16 (__a, 16); }
int32x4_t test_vshll_n_s16 (int16x4_t __a) { return vshll_n_s16 (__a, 3); }
void silk_warped_autocorrelation_FIX_neon( opus_int32 *corr, /* O Result [order + 1] */ opus_int *scale, /* O Scaling of the correlation vector */ const opus_int16 *input, /* I Input data to correlate */ const opus_int warping_Q16, /* I Warping coefficient */ const opus_int length, /* I Length of input */ const opus_int order /* I Correlation order (even) */ ) { if( ( MAX_SHAPE_LPC_ORDER > 24 ) || ( order < 6 ) ) { silk_warped_autocorrelation_FIX_c( corr, scale, input, warping_Q16, length, order ); } else { opus_int n, i, lsh; opus_int64 corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; /* In reverse order */ opus_int64 corr_QC_orderT; int64x2_t lsh_s64x2; const opus_int orderT = ( order + 3 ) & ~3; opus_int64 *corr_QCT; opus_int32 *input_QS; VARDECL( opus_int32, input_QST ); VARDECL( opus_int32, state ); SAVE_STACK; /* Order must be even */ silk_assert( ( order & 1 ) == 0 ); silk_assert( 2 * QS - QC >= 0 ); ALLOC( input_QST, length + 2 * MAX_SHAPE_LPC_ORDER, opus_int32 ); input_QS = input_QST; /* input_QS has zero paddings in the beginning and end. */ vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; /* Loop over samples */ for( n = 0; n < length - 7; n += 8, input_QS += 8 ) { const int16x8_t t0_s16x4 = vld1q_s16( input + n ); vst1q_s32( input_QS + 0, vshll_n_s16( vget_low_s16( t0_s16x4 ), QS ) ); vst1q_s32( input_QS + 4, vshll_n_s16( vget_high_s16( t0_s16x4 ), QS ) ); } for( ; n < length; n++, input_QS++ ) { input_QS[ 0 ] = silk_LSHIFT32( (opus_int32)input[ n ], QS ); } vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS = input_QST + MAX_SHAPE_LPC_ORDER - orderT; /* The following loop runs ( length + order ) times, with ( order ) extra epilogues. */ /* The zero paddings in input_QS guarantee corr_QC's correctness even with the extra epilogues. */ /* The values of state_QS will be polluted by the extra epilogues, however they are temporary values. */ /* Keep the C code here to help understand the intrinsics optimization. */ /* { opus_int32 state_QS[ 2 ][ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; opus_int32 *state_QST[ 3 ]; state_QST[ 0 ] = state_QS[ 0 ]; state_QST[ 1 ] = state_QS[ 1 ]; for( n = 0; n < length + order; n++, input_QS++ ) { state_QST[ 0 ][ orderT ] = input_QS[ orderT ]; for( i = 0; i < orderT; i++ ) { corr_QC[ i ] += silk_RSHIFT64( silk_SMULL( state_QST[ 0 ][ i ], input_QS[ i ] ), 2 * QS - QC ); state_QST[ 1 ][ i ] = silk_SMLAWB( state_QST[ 1 ][ i + 1 ], state_QST[ 0 ][ i ] - state_QST[ 0 ][ i + 1 ], warping_Q16 ); } state_QST[ 2 ] = state_QST[ 0 ]; state_QST[ 0 ] = state_QST[ 1 ]; state_QST[ 1 ] = state_QST[ 2 ]; } } */ { const int32x4_t warping_Q16_s32x4 = vdupq_n_s32( warping_Q16 << 15 ); const opus_int32 *in = input_QS + orderT; opus_int o = orderT; int32x4_t state_QS_s32x4[ 3 ][ 2 ]; ALLOC( state, length + orderT, opus_int32 ); state_QS_s32x4[ 2 ][ 1 ] = vdupq_n_s32( 0 ); /* Calculate 8 taps of all inputs in each loop. */ do { state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 0 ][ 1 ] = state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 1 ][ 1 ] = vdupq_n_s32( 0 ); n = 0; do { calc_corr( input_QS + n, corr_QC, o - 8, state_QS_s32x4[ 0 ][ 0 ] ); calc_corr( input_QS + n, corr_QC, o - 4, state_QS_s32x4[ 0 ][ 1 ] ); state_QS_s32x4[ 2 ][ 1 ] = vld1q_s32( in + n ); vst1q_lane_s32( state + n, state_QS_s32x4[ 0 ][ 0 ], 0 ); state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 0 ][ 1 ], 1 ); state_QS_s32x4[ 2 ][ 1 ] = vextq_s32( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], 1 ); state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 ); state_QS_s32x4[ 0 ][ 1 ] = calc_state( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], state_QS_s32x4[ 1 ][ 1 ], warping_Q16_s32x4 ); state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ]; state_QS_s32x4[ 1 ][ 1 ] = state_QS_s32x4[ 2 ][ 1 ]; } while( ++n < ( length + order ) ); in = state; o -= 8; } while( o > 4 ); if( o ) { /* Calculate the last 4 taps of all inputs. */ opus_int32 *stateT = state; silk_assert( o == 4 ); state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 1 ][ 0 ] = vdupq_n_s32( 0 ); n = length + order; do { calc_corr( input_QS, corr_QC, 0, state_QS_s32x4[ 0 ][ 0 ] ); state_QS_s32x4[ 2 ][ 0 ] = vld1q_s32( stateT ); vst1q_lane_s32( stateT, state_QS_s32x4[ 0 ][ 0 ], 0 ); state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], 1 ); state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 ); state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ]; input_QS++; stateT++; } while( --n ); } } { const opus_int16 *inputT = input; int32x4_t t_s32x4; int64x1_t t_s64x1; int64x2_t t_s64x2 = vdupq_n_s64( 0 ); for( n = 0; n <= length - 8; n += 8 ) { int16x8_t input_s16x8 = vld1q_s16( inputT ); t_s32x4 = vmull_s16( vget_low_s16( input_s16x8 ), vget_low_s16( input_s16x8 ) ); t_s32x4 = vmlal_s16( t_s32x4, vget_high_s16( input_s16x8 ), vget_high_s16( input_s16x8 ) ); t_s64x2 = vaddw_s32( t_s64x2, vget_low_s32( t_s32x4 ) ); t_s64x2 = vaddw_s32( t_s64x2, vget_high_s32( t_s32x4 ) ); inputT += 8; } t_s64x1 = vadd_s64( vget_low_s64( t_s64x2 ), vget_high_s64( t_s64x2 ) ); corr_QC_orderT = vget_lane_s64( t_s64x1, 0 ); for( ; n < length; n++ ) { corr_QC_orderT += silk_SMULL( input[ n ], input[ n ] ); } corr_QC_orderT = silk_LSHIFT64( corr_QC_orderT, QC ); corr_QC[ orderT ] = corr_QC_orderT; } corr_QCT = corr_QC + orderT - order; lsh = silk_CLZ64( corr_QC_orderT ) - 35; lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC ); *scale = -( QC + lsh ); silk_assert( *scale >= -30 && *scale <= 12 ); lsh_s64x2 = vdupq_n_s64( lsh ); for( i = 0; i <= order - 3; i += 4 ) { int32x4_t corr_s32x4; int64x2_t corr_QC0_s64x2, corr_QC1_s64x2; corr_QC0_s64x2 = vld1q_s64( corr_QCT + i ); corr_QC1_s64x2 = vld1q_s64( corr_QCT + i + 2 ); corr_QC0_s64x2 = vshlq_s64( corr_QC0_s64x2, lsh_s64x2 ); corr_QC1_s64x2 = vshlq_s64( corr_QC1_s64x2, lsh_s64x2 ); corr_s32x4 = vcombine_s32( vmovn_s64( corr_QC1_s64x2 ), vmovn_s64( corr_QC0_s64x2 ) ); corr_s32x4 = vrev64q_s32( corr_s32x4 ); vst1q_s32( corr + order - i - 3, corr_s32x4 ); } if( lsh >= 0 ) { for( ; i < order + 1; i++ ) { corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_LSHIFT64( corr_QCT[ i ], lsh ) ); } } else { for( ; i < order + 1; i++ ) { corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64( corr_QCT[ i ], -lsh ) ); } } silk_assert( corr_QCT[ order ] >= 0 ); /* If breaking, decrease QC*/ RESTORE_STACK; } #ifdef OPUS_CHECK_ASM { opus_int32 corr_c[ MAX_SHAPE_LPC_ORDER + 1 ]; opus_int scale_c; silk_warped_autocorrelation_FIX_c( corr_c, &scale_c, input, warping_Q16, length, order ); silk_assert( !memcmp( corr_c, corr, sizeof( corr_c[ 0 ] ) * ( order + 1 ) ) ); silk_assert( scale_c == *scale ); } #endif }