static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis, int32x4_t *const a0, int32x4_t *const a1, int32x4_t *const a2, int32x4_t *const a3) { int32x4_t b0, b1, b2, b3; transpose_s32_4x4(a0, a1, a2, a3); b0 = vaddq_s32(*a0, *a2); b1 = vsubq_s32(*a0, *a2); b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0); b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0); b2 = vmulq_lane_s32(*a1, vget_high_s32(cospis), 1); b3 = vmulq_lane_s32(*a1, vget_low_s32(cospis), 1); b2 = vmlsq_lane_s32(b2, *a3, vget_low_s32(cospis), 1); b3 = vmlaq_lane_s32(b3, *a3, vget_high_s32(cospis), 1); b0 = vrshrq_n_s32(b0, DCT_CONST_BITS); b1 = vrshrq_n_s32(b1, DCT_CONST_BITS); b2 = vrshrq_n_s32(b2, DCT_CONST_BITS); b3 = vrshrq_n_s32(b3, DCT_CONST_BITS); *a0 = vaddq_s32(b0, b3); *a1 = vaddq_s32(b1, b2); *a2 = vsubq_s32(b1, b2); *a3 = vsubq_s32(b0, b3); }
unsigned int vpx_get4x4sse_cs_neon( const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride) { int16x4_t d22s16, d24s16, d26s16, d28s16; int64x1_t d0s64; uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; int32x4_t q7s32, q8s32, q9s32, q10s32; uint16x8_t q11u16, q12u16, q13u16, q14u16; int64x2_t q1s64; d0u8 = vld1_u8(src_ptr); src_ptr += source_stride; d4u8 = vld1_u8(ref_ptr); ref_ptr += recon_stride; d1u8 = vld1_u8(src_ptr); src_ptr += source_stride; d5u8 = vld1_u8(ref_ptr); ref_ptr += recon_stride; d2u8 = vld1_u8(src_ptr); src_ptr += source_stride; d6u8 = vld1_u8(ref_ptr); ref_ptr += recon_stride; d3u8 = vld1_u8(src_ptr); src_ptr += source_stride; d7u8 = vld1_u8(ref_ptr); ref_ptr += recon_stride; q11u16 = vsubl_u8(d0u8, d4u8); q12u16 = vsubl_u8(d1u8, d5u8); q13u16 = vsubl_u8(d2u8, d6u8); q14u16 = vsubl_u8(d3u8, d7u8); d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16)); d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16)); d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16)); d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16)); q7s32 = vmull_s16(d22s16, d22s16); q8s32 = vmull_s16(d24s16, d24s16); q9s32 = vmull_s16(d26s16, d26s16); q10s32 = vmull_s16(d28s16, d28s16); q7s32 = vaddq_s32(q7s32, q8s32); q9s32 = vaddq_s32(q9s32, q10s32); q9s32 = vaddq_s32(q7s32, q9s32); q1s64 = vpaddlq_s32(q9s32); d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); }
/* s32x4 mm mul */ void mw_neon_mm_mul_s32x4(int * A, int Row, int T, int * B, int Col, int * C) { int i, k, j; int32x4_t neon_b, neon_c; int32x4_t neon_a0, neon_a1, neon_a2, neon_a3; int32x4_t neon_b0, neon_b1, neon_b2, neon_b3; for (i = 0; i < Row; i+=4) { for (k = 0; k < Col; k+=1) { neon_c = vmovq_n_s32(0); for (j = 0; j < T; j+=4) { int j_T = j * T + i; int k_Row = k * Row; neon_a0 = vld1q_s32(A + j_T); j_T+=Row; neon_a1 = vld1q_s32(A + j_T); j_T+=Row; neon_a2 = vld1q_s32(A + j_T); j_T+=Row; neon_a3 = vld1q_s32(A + j_T); neon_b = vld1q_s32(B + k_Row + j); neon_b0 = vdupq_n_s32(vgetq_lane_s32(neon_b, 0)); neon_b1 = vdupq_n_s32(vgetq_lane_s32(neon_b, 1)); neon_b2 = vdupq_n_s32(vgetq_lane_s32(neon_b, 2)); neon_b3 = vdupq_n_s32(vgetq_lane_s32(neon_b, 3)); neon_c = vaddq_s32(vmulq_s32(neon_a0, neon_b0), neon_c); neon_c = vaddq_s32(vmulq_s32(neon_a1, neon_b1), neon_c); neon_c = vaddq_s32(vmulq_s32(neon_a2, neon_b2), neon_c); neon_c = vaddq_s32(vmulq_s32(neon_a3, neon_b3), neon_c); vst1q_lane_s32(C + k_Row + i, neon_c, 0); vst1q_lane_s32(C + k_Row + i + 1, neon_c, 1); vst1q_lane_s32(C + k_Row + i + 2, neon_c, 2); vst1q_lane_s32(C + k_Row + i + 3, neon_c, 3); } } } }
static inline void silk_biquad_alt_stride2_kernel( const int32x4_t A_L_s32x4, const int32x4_t A_U_s32x4, const int32x4_t B_Q28_s32x4, const int32x2_t t_s32x2, const int32x4_t in_s32x4, int32x4_t *S_s32x4, int32x2_t *out32_Q14_s32x2 ) { int32x4_t t_s32x4, out32_Q14_s32x4; *out32_Q14_s32x2 = vadd_s32( vget_low_s32( *S_s32x4 ), t_s32x2 ); /* silk_SMLAWB( S{0,1}, B_Q28[ 0 ], in{0,1} ) */ *S_s32x4 = vcombine_s32( vget_high_s32( *S_s32x4 ), vdup_n_s32( 0 ) ); /* S{0,1} = S{2,3}; S{2,3} = 0; */ *out32_Q14_s32x2 = vshl_n_s32( *out32_Q14_s32x2, 2 ); /* out32_Q14_{0,1} = silk_LSHIFT( silk_SMLAWB( S{0,1}, B_Q28[ 0 ], in{0,1} ), 2 ); */ out32_Q14_s32x4 = vcombine_s32( *out32_Q14_s32x2, *out32_Q14_s32x2 ); /* out32_Q14_{0,1,0,1} */ t_s32x4 = vqdmulhq_s32( out32_Q14_s32x4, A_L_s32x4 ); /* silk_SMULWB( out32_Q14_{0,1,0,1}, A{0,0,1,1}_L_Q28 ) */ *S_s32x4 = vrsraq_n_s32( *S_s32x4, t_s32x4, 14 ); /* S{0,1} = S{2,3} + silk_RSHIFT_ROUND(); S{2,3} = silk_RSHIFT_ROUND(); */ t_s32x4 = vqdmulhq_s32( out32_Q14_s32x4, A_U_s32x4 ); /* silk_SMULWB( out32_Q14_{0,1,0,1}, A{0,0,1,1}_U_Q28 ) */ *S_s32x4 = vaddq_s32( *S_s32x4, t_s32x4 ); /* S0 = silk_SMLAWB( S{0,1,2,3}, out32_Q14_{0,1,0,1}, A{0,0,1,1}_U_Q28 ); */ t_s32x4 = vqdmulhq_s32( in_s32x4, B_Q28_s32x4 ); /* silk_SMULWB( B_Q28[ {1,1,2,2} ], in{0,1,0,1} ) */ *S_s32x4 = vaddq_s32( *S_s32x4, t_s32x4 ); /* S0 = silk_SMLAWB( S0, B_Q28[ {1,1,2,2} ], in{0,1,0,1} ); */ }
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) { int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val)); int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val)); int32x4x2_t cd = vtrnq_s32(c, d); return v_int32x4(vaddq_s32(cd.val[0], cd.val[1])); }
static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int w, int h, unsigned int *sse, int *sum) { int i, j; int16x8_t v_sum = vdupq_n_s16(0); int32x4_t v_sse_lo = vdupq_n_s32(0); int32x4_t v_sse_hi = vdupq_n_s32(0); for (i = 0; i < h; ++i) { for (j = 0; j < w; j += 8) { const uint8x8_t v_a = vld1_u8(&a[j]); const uint8x8_t v_b = vld1_u8(&b[j]); const uint16x8_t v_diff = vsubl_u8(v_a, v_b); const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); v_sum = vaddq_s16(v_sum, sv_diff); v_sse_lo = vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff)); v_sse_hi = vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff)); } a += a_stride; b += b_stride; } *sum = horizontal_add_s16x8(v_sum); *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); }
void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) { int i; if (count >= 8) { /* SkFixed is 16.16 fixed point */ SkFixed dx2 = dx+dx; SkFixed dx4 = dx2+dx2; SkFixed dx8 = dx4+dx4; /* now build fx/fx+dx/fx+2dx/fx+3dx */ SkFixed fx1, fx2, fx3; int32x4_t lbase, hbase; uint16_t *dst16 = (uint16_t *)dst; fx1 = fx+dx; fx2 = fx1+dx; fx3 = fx2+dx; /* avoid an 'lbase unitialized' warning */ lbase = vdupq_n_s32(fx); lbase = vsetq_lane_s32(fx1, lbase, 1); lbase = vsetq_lane_s32(fx2, lbase, 2); lbase = vsetq_lane_s32(fx3, lbase, 3); hbase = vaddq_s32(lbase, vdupq_n_s32(dx4)); /* take upper 16 of each, store, and bump everything */ do { int32x4_t lout, hout; uint16x8_t hi16; lout = lbase; hout = hbase; /* gets hi's of all louts then hi's of all houts */ asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout)); hi16 = vreinterpretq_u16_s32(hout); vst1q_u16(dst16, hi16); /* on to the next */ lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8)); hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8)); dst16 += 8; count -= 8; fx += dx8; } while (count >= 8); dst = (uint32_t *) dst16; }
static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16, int16x4_t *d5s16, int16x8_t *q3s16, int16x8_t *q8s16, int16x8_t *q9s16) { int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16; int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32; d6s16 = vget_low_s16(*q3s16); d16s16 = vget_low_s16(*q8s16); d17s16 = vget_high_s16(*q8s16); d18s16 = vget_low_s16(*q9s16); d19s16 = vget_high_s16(*q9s16); q10s32 = vmull_s16(*d3s16, d16s16); q11s32 = vmull_s16(*d4s16, d16s16); q12s32 = vmull_s16(d6s16, d17s16); q13s32 = vmull_s16(*d5s16, d18s16); q14s32 = vmull_s16(*d3s16, d18s16); q15s32 = vmovl_s16(d16s16); q15s32 = vaddw_s16(q15s32, d19s16); q8s32 = vmull_s16(*d4s16, d19s16); q15s32 = vsubw_s16(q15s32, d18s16); q9s32 = vmull_s16(*d5s16, d19s16); q10s32 = vaddq_s32(q10s32, q13s32); q10s32 = vaddq_s32(q10s32, q8s32); q11s32 = vsubq_s32(q11s32, q14s32); q8s32 = vdupq_n_s32(sinpi_3_9); q11s32 = vsubq_s32(q11s32, q9s32); q15s32 = vmulq_s32(q15s32, q8s32); q13s32 = vaddq_s32(q10s32, q12s32); q10s32 = vaddq_s32(q10s32, q11s32); q14s32 = vaddq_s32(q11s32, q12s32); q10s32 = vsubq_s32(q10s32, q12s32); d16s16 = vqrshrn_n_s32(q13s32, 14); d17s16 = vqrshrn_n_s32(q14s32, 14); d18s16 = vqrshrn_n_s32(q15s32, 14); d19s16 = vqrshrn_n_s32(q10s32, 14); *q8s16 = vcombine_s16(d16s16, d17s16); *q9s16 = vcombine_s16(d18s16, d19s16); return; }
static INLINE void iadst_half_butterfly_bd10_neon(int32x4_t *const x, const int32x2_t c) { const int32x4_t sum = vaddq_s32(x[0], x[1]); const int32x4_t sub = vsubq_s32(x[0], x[1]); x[0] = vmulq_lane_s32(sum, c, 0); x[1] = vmulq_lane_s32(sub, c, 0); x[0] = vrshrq_n_s32(x[0], DCT_CONST_BITS); x[1] = vrshrq_n_s32(x[1], DCT_CONST_BITS); }
OD_SIMD_INLINE void od_idct4_kernel(int32x4_t *y0, int32x4_t *y1, int32x4_t *y2, int32x4_t *y3) { int32x4_t t0 = *y0; int32x4_t t1 = *y1; int32x4_t t2 = *y2; int32x4_t t3 = *y3; int32x4_t t2h; od_transpose4(&t0, &t1, &t2, &t3); t3 = vaddq_s32(t3, OD_DCT_MUL(t1, 18293, 8192, 14)); t1 = vsubq_s32(t1, OD_DCT_MUL(t3, 21407, 16384, 15)); t3 = vaddq_s32(t3, OD_DCT_MUL(t1, 23013, 16384, 15)); t2 = vsubq_s32(t0, t2); t2h = OD_UNBIASED_RSHIFT(t2, 1); t0 = vsubq_s32(t0, vsubq_s32(t2h, OD_UNBIASED_RSHIFT(t3, 1))); t1 = vsubq_s32(t2h, t1); *y0 = t0; *y1 = vsubq_s32(t2, t1); *y2 = t1; *y3 = vsubq_s32(t0, t3); }
/* s32x4 mv mul */ void mw_neon_mv_mul_s32x4(int * A, int Row, int T, int * B, int * C) { int i = 0; int k = 0; int32x4_t neon_b, neon_c; int32x4_t neon_a0, neon_a1, neon_a2, neon_a3; int32x4_t neon_b0, neon_b1, neon_b2, neon_b3; for (i = 0; i < Row; i+=4) { neon_c = vmovq_n_s32(0); for (k = 0; k < T; k+=4) { int j = k * T + i; neon_a0 = vld1q_s32(A + j); j+=Row; neon_a1 = vld1q_s32(A + j); j+=Row; neon_a2 = vld1q_s32(A + j); j+=Row; neon_a3 = vld1q_s32(A + j); neon_b = vld1q_s32(B + k); neon_b0 = vdupq_n_s32(vgetq_lane_s32(neon_b, 0)); neon_b1 = vdupq_n_s32(vgetq_lane_s32(neon_b, 1)); neon_b2 = vdupq_n_s32(vgetq_lane_s32(neon_b, 2)); neon_b3 = vdupq_n_s32(vgetq_lane_s32(neon_b, 3)); neon_c = vaddq_s32(vmulq_s32(neon_a0, neon_b0), neon_c); neon_c = vaddq_s32(vmulq_s32(neon_a1, neon_b1), neon_c); neon_c = vaddq_s32(vmulq_s32(neon_a2, neon_b2), neon_c); neon_c = vaddq_s32(vmulq_s32(neon_a3, neon_b3), neon_c); } vst1q_s32(C + i, neon_c); } }
static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis, int32x4_t *const a0, int32x4_t *const a1, int32x4_t *const a2, int32x4_t *const a3) { int32x4_t b0, b1, b2, b3; int64x2_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11; transpose_s32_4x4(a0, a1, a2, a3); b0 = vaddq_s32(*a0, *a2); b1 = vsubq_s32(*a0, *a2); c0 = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0); c1 = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0); c2 = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0); c3 = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0); c4 = vmull_lane_s32(vget_low_s32(*a1), vget_high_s32(cospis), 1); c5 = vmull_lane_s32(vget_high_s32(*a1), vget_high_s32(cospis), 1); c6 = vmull_lane_s32(vget_low_s32(*a1), vget_low_s32(cospis), 1); c7 = vmull_lane_s32(vget_high_s32(*a1), vget_low_s32(cospis), 1); c8 = vmull_lane_s32(vget_low_s32(*a3), vget_low_s32(cospis), 1); c9 = vmull_lane_s32(vget_high_s32(*a3), vget_low_s32(cospis), 1); c10 = vmull_lane_s32(vget_low_s32(*a3), vget_high_s32(cospis), 1); c11 = vmull_lane_s32(vget_high_s32(*a3), vget_high_s32(cospis), 1); c4 = vsubq_s64(c4, c8); c5 = vsubq_s64(c5, c9); c6 = vaddq_s64(c6, c10); c7 = vaddq_s64(c7, c11); b0 = vcombine_s32(vrshrn_n_s64(c0, DCT_CONST_BITS), vrshrn_n_s64(c1, DCT_CONST_BITS)); b1 = vcombine_s32(vrshrn_n_s64(c2, DCT_CONST_BITS), vrshrn_n_s64(c3, DCT_CONST_BITS)); b2 = vcombine_s32(vrshrn_n_s64(c4, DCT_CONST_BITS), vrshrn_n_s64(c5, DCT_CONST_BITS)); b3 = vcombine_s32(vrshrn_n_s64(c6, DCT_CONST_BITS), vrshrn_n_s64(c7, DCT_CONST_BITS)); *a0 = vaddq_s32(b0, b3); *a1 = vaddq_s32(b1, b2); *a2 = vsubq_s32(b1, b2); *a3 = vsubq_s32(b0, b3); }
static void add_int_neon(int* dst, int* src1, int* src2, int count) { int i; for (i = 0; i < count; i += 4) { int32x4_t in1, in2, out; in1 = vld1q_s32(src1); src1 += 4; in2 = vld1q_s32(src2); src2 += 4; out = vaddq_s32(in1, in2); vst1q_s32(dst, out); dst += 4; } }
OD_SIMD_INLINE void od_fdct4_kernel(int32x4_t *x0, int32x4_t *x1, int32x4_t *x2, int32x4_t *x3) { /*9 adds, 2 shifts, 3 "muls".*/ int32x4_t t0 = *x0; int32x4_t t2 = *x1; int32x4_t t1 = *x2; int32x4_t t3 = *x3; int32x4_t t2h; /*+1/-1 butterflies:*/ t3 = vsubq_s32(t0, t3); t2 = vaddq_s32(t2, t1); t2h = OD_UNBIASED_RSHIFT(t2, 1); t1 = vsubq_s32(t2h, t1); t0 = vsubq_s32(t0, OD_UNBIASED_RSHIFT(t3, 1)); /*+ Embedded 2-point type-II DCT.*/ t0 = vaddq_s32(t0, t2h); t2 = vsubq_s32(t0, t2); /*+ Embedded 2-point type-IV DST.*/ /*23013/32768 ~= 4*sin(\frac{\pi}{8}) - 2*tan(\frac{\pi}{8}) ~= 0.70230660471416898931046248770220*/ od_overflow_check_epi32(t1, 23013, 16384, 0); t3 = vsubq_s32(t3, OD_DCT_MUL(t1, 23013, 16384, 15)); /*21407/32768~=\sqrt{1/2}*cos(\frac{\pi}{8})) ~=0.65328148243818826392832158671359*/ od_overflow_check_epi32(t3, 21407, 16384, 1); t1 = vaddq_s32(t1, OD_DCT_MUL(t3, 21407, 16384, 15)); /*18293/16384 ~= 4*sin(\frac{\pi}{8}) - tan(\frac{\pi}{8}) ~= 1.1165201670872640381121512119119*/ od_overflow_check_epi32(t3, 18293, 8192, 2); t3 = vsubq_s32(t3, OD_DCT_MUL(t1, 18293, 8192, 14)); od_transpose4(&t0, &t1, &t2, &t3); *x0 = t0; *x1 = t1; *x2 = t2; *x3 = t3; }
static INLINE void iadst_half_butterfly_bd12_neon(int32x4_t *const x, const int32x2_t c) { const int32x4_t sum = vaddq_s32(x[0], x[1]); const int32x4_t sub = vsubq_s32(x[0], x[1]); const int64x2_t t0_lo = vmull_lane_s32(vget_low_s32(sum), c, 0); const int64x2_t t1_lo = vmull_lane_s32(vget_low_s32(sub), c, 0); const int64x2_t t0_hi = vmull_lane_s32(vget_high_s32(sum), c, 0); const int64x2_t t1_hi = vmull_lane_s32(vget_high_s32(sub), c, 0); const int32x2_t out0_lo = vrshrn_n_s64(t0_lo, DCT_CONST_BITS); const int32x2_t out1_lo = vrshrn_n_s64(t1_lo, DCT_CONST_BITS); const int32x2_t out0_hi = vrshrn_n_s64(t0_hi, DCT_CONST_BITS); const int32x2_t out1_hi = vrshrn_n_s64(t1_hi, DCT_CONST_BITS); x[0] = vcombine_s32(out0_lo, out0_hi); x[1] = vcombine_s32(out1_lo, out1_hi); }
/* s32x4 add */ void mw_neon_mm_add_s32x4(int * A, int Row, int Col, int * B, int * C) { int32x4_t neon_a, neon_b, neon_c; int size = Row * Col; int i = 0; int k = 0; for (i = 4; i <= size ; i+=4) { k = i - 4; neon_a = vld1q_s32(A + k); neon_b = vld1q_s32(B + k); neon_c = vaddq_s32(neon_a, neon_b); vst1q_s32(C + k, neon_c); } k = i - 4; for (i = 0; i < size % 4; i++) { C[k + i] = A[k + i] + B[k + i]; } }
static inline void yuv2rgb_4x2(const uint8_t *y1, const uint8_t *y2, const uint8_t *u, const uint8_t *v, int16_t *r1, int16_t *g1, int16_t *b1, int16_t *r2, int16_t *g2, int16_t *b2){ int32x4_t ry1; int32x4_t ry2; int32x4_t rvug; int32x4_t rvr; int32x4_t rub; int32x4_t rr1,rg1,rb1,rr2,rg2,rb2; int32x4_t max; LOAD_Y_PREMULTS(0) LOAD_Y_PREMULTS(1) LOAD_Y_PREMULTS(2) LOAD_Y_PREMULTS(3) LOAD_UV_PREMULTS(0) LOAD_UV_PREMULTS(1) max=vld1q_s32(yuvmax); /*the following does not work */ //max=vdupq_n_s32(255); rr1=vaddq_s32(ry1,rvr); rr2=vaddq_s32(ry2,rvr); rg1=vaddq_s32(ry1,rvug); rg2=vaddq_s32(ry2,rvug); rb1=vaddq_s32(ry1,rub); rb2=vaddq_s32(ry2,rub); rr1=vminq_s32(vabsq_s32(rr1),max); rr2=vminq_s32(vabsq_s32(rr2),max); rg1=vminq_s32(vabsq_s32(rg1),max); rg2=vminq_s32(vabsq_s32(rg2),max); rb1=vminq_s32(vabsq_s32(rb1),max); rb2=vminq_s32(vabsq_s32(rb2),max); vst1_s16(r1,vqshrn_n_s32(rr1,13)); vst1_s16(r2,vqshrn_n_s32(rr2,13)); vst1_s16(g1,vqshrn_n_s32(rg1,13)); vst1_s16(g2,vqshrn_n_s32(rg2,13)); vst1_s16(b1,vqshrn_n_s32(rb1,13)); vst1_s16(b2,vqshrn_n_s32(rb2,13)); }
static INLINE void iadst8_bd12(int32x4_t *const io0, int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, int32x4_t *const io7) { const int32x4_t c0 = create_s32x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64); const int32x4_t c1 = create_s32x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64); const int32x4_t c2 = create_s32x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64); int32x4_t x[8], t[4]; int64x2_t s[8][2]; x[0] = *io7; x[1] = *io0; x[2] = *io5; x[3] = *io2; x[4] = *io3; x[5] = *io4; x[6] = *io1; x[7] = *io6; // stage 1 iadst_butterfly_lane_0_1_bd12_neon(x[0], x[1], vget_low_s32(c0), s[0], s[1]); iadst_butterfly_lane_0_1_bd12_neon(x[2], x[3], vget_high_s32(c0), s[2], s[3]); iadst_butterfly_lane_0_1_bd12_neon(x[4], x[5], vget_low_s32(c1), s[4], s[5]); iadst_butterfly_lane_0_1_bd12_neon(x[6], x[7], vget_high_s32(c1), s[6], s[7]); x[0] = add_dct_const_round_shift_low_8_bd12(s[0], s[4]); x[1] = add_dct_const_round_shift_low_8_bd12(s[1], s[5]); x[2] = add_dct_const_round_shift_low_8_bd12(s[2], s[6]); x[3] = add_dct_const_round_shift_low_8_bd12(s[3], s[7]); x[4] = sub_dct_const_round_shift_low_8_bd12(s[0], s[4]); x[5] = sub_dct_const_round_shift_low_8_bd12(s[1], s[5]); x[6] = sub_dct_const_round_shift_low_8_bd12(s[2], s[6]); x[7] = sub_dct_const_round_shift_low_8_bd12(s[3], s[7]); // stage 2 t[0] = x[0]; t[1] = x[1]; t[2] = x[2]; t[3] = x[3]; iadst_butterfly_lane_0_1_bd12_neon(x[4], x[5], vget_high_s32(c2), s[4], s[5]); iadst_butterfly_lane_1_0_bd12_neon(x[7], x[6], vget_high_s32(c2), s[7], s[6]); x[0] = vaddq_s32(t[0], t[2]); x[1] = vaddq_s32(t[1], t[3]); x[2] = vsubq_s32(t[0], t[2]); x[3] = vsubq_s32(t[1], t[3]); x[4] = add_dct_const_round_shift_low_8_bd12(s[4], s[6]); x[5] = add_dct_const_round_shift_low_8_bd12(s[5], s[7]); x[6] = sub_dct_const_round_shift_low_8_bd12(s[4], s[6]); x[7] = sub_dct_const_round_shift_low_8_bd12(s[5], s[7]); // stage 3 iadst_half_butterfly_bd12_neon(x + 2, vget_low_s32(c2)); iadst_half_butterfly_bd12_neon(x + 6, vget_low_s32(c2)); *io0 = x[0]; *io1 = vnegq_s32(x[4]); *io2 = x[6]; *io3 = vnegq_s32(x[2]); *io4 = x[3]; *io5 = vnegq_s32(x[7]); *io6 = x[5]; *io7 = vnegq_s32(x[1]); }
static INLINE int32x4_t add_dct_const_round_shift_low_8_bd10(const int32x4_t in0, const int32x4_t in1) { const int32x4_t sum = vaddq_s32(in0, in1); return vrshrq_n_s32(sum, DCT_CONST_BITS); }
unsigned int vp8_sub_pixel_variance16x16_neon_func( const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { int i; DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 528); unsigned char *tmpp; unsigned char *tmpp2; uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8; uint8x8_t d19u8, d20u8, d21u8; int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; uint32x2_t d0u32, d10u32; int64x1_t d0s64, d1s64, d2s64, d3s64; uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8; uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16; uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16; int32x4_t q8s32, q9s32, q10s32; int64x2_t q0s64, q1s64, q5s64; tmpp2 = tmp + 272; tmpp = tmp; if (xoffset == 0) { // secondpass_bfilter16x16_only d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]); d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]); q11u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; for (i = 4; i > 0; i--) { q12u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; q13u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; q14u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; q15u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; __builtin_prefetch(src_ptr); __builtin_prefetch(src_ptr + src_pixels_per_line); __builtin_prefetch(src_ptr + src_pixels_per_line * 2); q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); d2u8 = vqrshrn_n_u16(q1u16, 7); d3u8 = vqrshrn_n_u16(q2u16, 7); d4u8 = vqrshrn_n_u16(q3u16, 7); d5u8 = vqrshrn_n_u16(q4u16, 7); d6u8 = vqrshrn_n_u16(q5u16, 7); d7u8 = vqrshrn_n_u16(q6u16, 7); d8u8 = vqrshrn_n_u16(q7u16, 7); d9u8 = vqrshrn_n_u16(q8u16, 7); q1u8 = vcombine_u8(d2u8, d3u8); q2u8 = vcombine_u8(d4u8, d5u8); q3u8 = vcombine_u8(d6u8, d7u8); q4u8 = vcombine_u8(d8u8, d9u8); q11u8 = q15u8; vst1q_u8((uint8_t *)tmpp2, q1u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q2u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q3u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q4u8); tmpp2 += 16; } } else if (yoffset == 0) { // firstpass_bfilter16x16_only d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]); d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]); for (i = 4; i > 0 ; i--) { d2u8 = vld1_u8(src_ptr); d3u8 = vld1_u8(src_ptr + 8); d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d5u8 = vld1_u8(src_ptr); d6u8 = vld1_u8(src_ptr + 8); d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d8u8 = vld1_u8(src_ptr); d9u8 = vld1_u8(src_ptr + 8); d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d11u8 = vld1_u8(src_ptr); d12u8 = vld1_u8(src_ptr + 8); d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; __builtin_prefetch(src_ptr); __builtin_prefetch(src_ptr + src_pixels_per_line); __builtin_prefetch(src_ptr + src_pixels_per_line * 2); q7u16 = vmull_u8(d2u8, d0u8); q8u16 = vmull_u8(d3u8, d0u8); q9u16 = vmull_u8(d5u8, d0u8); q10u16 = vmull_u8(d6u8, d0u8); q11u16 = vmull_u8(d8u8, d0u8); q12u16 = vmull_u8(d9u8, d0u8); q13u16 = vmull_u8(d11u8, d0u8); q14u16 = vmull_u8(d12u8, d0u8); d2u8 = vext_u8(d2u8, d3u8, 1); d5u8 = vext_u8(d5u8, d6u8, 1); d8u8 = vext_u8(d8u8, d9u8, 1); d11u8 = vext_u8(d11u8, d12u8, 1); q7u16 = vmlal_u8(q7u16, d2u8, d1u8); q9u16 = vmlal_u8(q9u16, d5u8, d1u8); q11u16 = vmlal_u8(q11u16, d8u8, d1u8); q13u16 = vmlal_u8(q13u16, d11u8, d1u8); d3u8 = vext_u8(d3u8, d4u8, 1); d6u8 = vext_u8(d6u8, d7u8, 1); d9u8 = vext_u8(d9u8, d10u8, 1); d12u8 = vext_u8(d12u8, d13u8, 1); q8u16 = vmlal_u8(q8u16, d3u8, d1u8); q10u16 = vmlal_u8(q10u16, d6u8, d1u8); q12u16 = vmlal_u8(q12u16, d9u8, d1u8); q14u16 = vmlal_u8(q14u16, d12u8, d1u8); d14u8 = vqrshrn_n_u16(q7u16, 7); d15u8 = vqrshrn_n_u16(q8u16, 7); d16u8 = vqrshrn_n_u16(q9u16, 7); d17u8 = vqrshrn_n_u16(q10u16, 7); d18u8 = vqrshrn_n_u16(q11u16, 7); d19u8 = vqrshrn_n_u16(q12u16, 7); d20u8 = vqrshrn_n_u16(q13u16, 7); d21u8 = vqrshrn_n_u16(q14u16, 7); q7u8 = vcombine_u8(d14u8, d15u8); q8u8 = vcombine_u8(d16u8, d17u8); q9u8 = vcombine_u8(d18u8, d19u8); q10u8 = vcombine_u8(d20u8, d21u8); vst1q_u8((uint8_t *)tmpp2, q7u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q8u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q9u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q10u8); tmpp2 += 16; } } else { d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]); d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]); d2u8 = vld1_u8(src_ptr); d3u8 = vld1_u8(src_ptr + 8); d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d5u8 = vld1_u8(src_ptr); d6u8 = vld1_u8(src_ptr + 8); d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d8u8 = vld1_u8(src_ptr); d9u8 = vld1_u8(src_ptr + 8); d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d11u8 = vld1_u8(src_ptr); d12u8 = vld1_u8(src_ptr + 8); d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; // First Pass: output_height lines x output_width columns (17x16) for (i = 3; i > 0; i--) { q7u16 = vmull_u8(d2u8, d0u8); q8u16 = vmull_u8(d3u8, d0u8); q9u16 = vmull_u8(d5u8, d0u8); q10u16 = vmull_u8(d6u8, d0u8); q11u16 = vmull_u8(d8u8, d0u8); q12u16 = vmull_u8(d9u8, d0u8); q13u16 = vmull_u8(d11u8, d0u8); q14u16 = vmull_u8(d12u8, d0u8); d2u8 = vext_u8(d2u8, d3u8, 1); d5u8 = vext_u8(d5u8, d6u8, 1); d8u8 = vext_u8(d8u8, d9u8, 1); d11u8 = vext_u8(d11u8, d12u8, 1); q7u16 = vmlal_u8(q7u16, d2u8, d1u8); q9u16 = vmlal_u8(q9u16, d5u8, d1u8); q11u16 = vmlal_u8(q11u16, d8u8, d1u8); q13u16 = vmlal_u8(q13u16, d11u8, d1u8); d3u8 = vext_u8(d3u8, d4u8, 1); d6u8 = vext_u8(d6u8, d7u8, 1); d9u8 = vext_u8(d9u8, d10u8, 1); d12u8 = vext_u8(d12u8, d13u8, 1); q8u16 = vmlal_u8(q8u16, d3u8, d1u8); q10u16 = vmlal_u8(q10u16, d6u8, d1u8); q12u16 = vmlal_u8(q12u16, d9u8, d1u8); q14u16 = vmlal_u8(q14u16, d12u8, d1u8); d14u8 = vqrshrn_n_u16(q7u16, 7); d15u8 = vqrshrn_n_u16(q8u16, 7); d16u8 = vqrshrn_n_u16(q9u16, 7); d17u8 = vqrshrn_n_u16(q10u16, 7); d18u8 = vqrshrn_n_u16(q11u16, 7); d19u8 = vqrshrn_n_u16(q12u16, 7); d20u8 = vqrshrn_n_u16(q13u16, 7); d21u8 = vqrshrn_n_u16(q14u16, 7); d2u8 = vld1_u8(src_ptr); d3u8 = vld1_u8(src_ptr + 8); d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d5u8 = vld1_u8(src_ptr); d6u8 = vld1_u8(src_ptr + 8); d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d8u8 = vld1_u8(src_ptr); d9u8 = vld1_u8(src_ptr + 8); d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d11u8 = vld1_u8(src_ptr); d12u8 = vld1_u8(src_ptr + 8); d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; q7u8 = vcombine_u8(d14u8, d15u8); q8u8 = vcombine_u8(d16u8, d17u8); q9u8 = vcombine_u8(d18u8, d19u8); q10u8 = vcombine_u8(d20u8, d21u8); vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q9u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q10u8); tmpp += 16; } // First-pass filtering for rest 5 lines d14u8 = vld1_u8(src_ptr); d15u8 = vld1_u8(src_ptr + 8); d16u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; q9u16 = vmull_u8(d2u8, d0u8); q10u16 = vmull_u8(d3u8, d0u8); q11u16 = vmull_u8(d5u8, d0u8); q12u16 = vmull_u8(d6u8, d0u8); q13u16 = vmull_u8(d8u8, d0u8); q14u16 = vmull_u8(d9u8, d0u8); d2u8 = vext_u8(d2u8, d3u8, 1); d5u8 = vext_u8(d5u8, d6u8, 1); d8u8 = vext_u8(d8u8, d9u8, 1); q9u16 = vmlal_u8(q9u16, d2u8, d1u8); q11u16 = vmlal_u8(q11u16, d5u8, d1u8); q13u16 = vmlal_u8(q13u16, d8u8, d1u8); d3u8 = vext_u8(d3u8, d4u8, 1); d6u8 = vext_u8(d6u8, d7u8, 1); d9u8 = vext_u8(d9u8, d10u8, 1); q10u16 = vmlal_u8(q10u16, d3u8, d1u8); q12u16 = vmlal_u8(q12u16, d6u8, d1u8); q14u16 = vmlal_u8(q14u16, d9u8, d1u8); q1u16 = vmull_u8(d11u8, d0u8); q2u16 = vmull_u8(d12u8, d0u8); q3u16 = vmull_u8(d14u8, d0u8); q4u16 = vmull_u8(d15u8, d0u8); d11u8 = vext_u8(d11u8, d12u8, 1); d14u8 = vext_u8(d14u8, d15u8, 1); q1u16 = vmlal_u8(q1u16, d11u8, d1u8); q3u16 = vmlal_u8(q3u16, d14u8, d1u8); d12u8 = vext_u8(d12u8, d13u8, 1); d15u8 = vext_u8(d15u8, d16u8, 1); q2u16 = vmlal_u8(q2u16, d12u8, d1u8); q4u16 = vmlal_u8(q4u16, d15u8, d1u8); d10u8 = vqrshrn_n_u16(q9u16, 7); d11u8 = vqrshrn_n_u16(q10u16, 7); d12u8 = vqrshrn_n_u16(q11u16, 7); d13u8 = vqrshrn_n_u16(q12u16, 7); d14u8 = vqrshrn_n_u16(q13u16, 7); d15u8 = vqrshrn_n_u16(q14u16, 7); d16u8 = vqrshrn_n_u16(q1u16, 7); d17u8 = vqrshrn_n_u16(q2u16, 7); d18u8 = vqrshrn_n_u16(q3u16, 7); d19u8 = vqrshrn_n_u16(q4u16, 7); q5u8 = vcombine_u8(d10u8, d11u8); q6u8 = vcombine_u8(d12u8, d13u8); q7u8 = vcombine_u8(d14u8, d15u8); q8u8 = vcombine_u8(d16u8, d17u8); q9u8 = vcombine_u8(d18u8, d19u8); vst1q_u8((uint8_t *)tmpp, q5u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q6u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q9u8); // secondpass_filter d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]); d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]); tmpp = tmp; tmpp2 = tmpp + 272; q11u8 = vld1q_u8(tmpp); tmpp += 16; for (i = 4; i > 0; i--) { q12u8 = vld1q_u8(tmpp); tmpp += 16; q13u8 = vld1q_u8(tmpp); tmpp += 16; q14u8 = vld1q_u8(tmpp); tmpp += 16; q15u8 = vld1q_u8(tmpp); tmpp += 16; q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); d2u8 = vqrshrn_n_u16(q1u16, 7); d3u8 = vqrshrn_n_u16(q2u16, 7); d4u8 = vqrshrn_n_u16(q3u16, 7); d5u8 = vqrshrn_n_u16(q4u16, 7); d6u8 = vqrshrn_n_u16(q5u16, 7); d7u8 = vqrshrn_n_u16(q6u16, 7); d8u8 = vqrshrn_n_u16(q7u16, 7); d9u8 = vqrshrn_n_u16(q8u16, 7); q1u8 = vcombine_u8(d2u8, d3u8); q2u8 = vcombine_u8(d4u8, d5u8); q3u8 = vcombine_u8(d6u8, d7u8); q4u8 = vcombine_u8(d8u8, d9u8); q11u8 = q15u8; vst1q_u8((uint8_t *)tmpp2, q1u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q2u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q3u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q4u8); tmpp2 += 16; } } // sub_pixel_variance16x16_neon q8s32 = vdupq_n_s32(0); q9s32 = vdupq_n_s32(0); q10s32 = vdupq_n_s32(0); tmpp = tmp + 272; for (i = 0; i < 8; i++) { // sub_pixel_variance16x16_neon_loop q0u8 = vld1q_u8(tmpp); tmpp += 16; q1u8 = vld1q_u8(tmpp); tmpp += 16; q2u8 = vld1q_u8(dst_ptr); dst_ptr += dst_pixels_per_line; q3u8 = vld1q_u8(dst_ptr); dst_ptr += dst_pixels_per_line; d0u8 = vget_low_u8(q0u8); d1u8 = vget_high_u8(q0u8); d2u8 = vget_low_u8(q1u8); d3u8 = vget_high_u8(q1u8); q11u16 = vsubl_u8(d0u8, vget_low_u8(q2u8)); q12u16 = vsubl_u8(d1u8, vget_high_u8(q2u8)); q13u16 = vsubl_u8(d2u8, vget_low_u8(q3u8)); q14u16 = vsubl_u8(d3u8, vget_high_u8(q3u8)); d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); q9s32 = vmlal_s16(q9s32, d22s16, d22s16); q10s32 = vmlal_s16(q10s32, d23s16, d23s16); d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); q9s32 = vmlal_s16(q9s32, d24s16, d24s16); q10s32 = vmlal_s16(q10s32, d25s16, d25s16); d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); q9s32 = vmlal_s16(q9s32, d26s16, d26s16); q10s32 = vmlal_s16(q10s32, d27s16, d27s16); d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); q9s32 = vmlal_s16(q9s32, d28s16, d28s16); q10s32 = vmlal_s16(q10s32, d29s16, d29s16); } q10s32 = vaddq_s32(q10s32, q9s32); q0s64 = vpaddlq_s32(q8s32); q1s64 = vpaddlq_s32(q10s32); d0s64 = vget_low_s64(q0s64); d1s64 = vget_high_s64(q0s64); d2s64 = vget_low_s64(q1s64); d3s64 = vget_high_s64(q1s64); d0s64 = vadd_s64(d0s64, d1s64); d1s64 = vadd_s64(d2s64, d3s64); q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); return vget_lane_u32(d0u32, 0); }
// Update the noise estimation information. static void UpdateNoiseEstimateNeon(NoiseSuppressionFixedC* inst, int offset) { const int16_t kExp2Const = 11819; // Q13 int16_t* ptr_noiseEstLogQuantile = NULL; int16_t* ptr_noiseEstQuantile = NULL; int16x4_t kExp2Const16x4 = vdup_n_s16(kExp2Const); int32x4_t twentyOne32x4 = vdupq_n_s32(21); int32x4_t constA32x4 = vdupq_n_s32(0x1fffff); int32x4_t constB32x4 = vdupq_n_s32(0x200000); int16_t tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset, inst->magnLen); // Guarantee a Q-domain as high as possible and still fit in int16 inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(kExp2Const, tmp16, 21); int32x4_t qNoise32x4 = vdupq_n_s32(inst->qNoise); for (ptr_noiseEstLogQuantile = &inst->noiseEstLogQuantile[offset], ptr_noiseEstQuantile = &inst->noiseEstQuantile[0]; ptr_noiseEstQuantile < &inst->noiseEstQuantile[inst->magnLen - 3]; ptr_noiseEstQuantile += 4, ptr_noiseEstLogQuantile += 4) { // tmp32no2 = kExp2Const * inst->noiseEstLogQuantile[offset + i]; int16x4_t v16x4 = vld1_s16(ptr_noiseEstLogQuantile); int32x4_t v32x4B = vmull_s16(v16x4, kExp2Const16x4); // tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac int32x4_t v32x4A = vandq_s32(v32x4B, constA32x4); v32x4A = vorrq_s32(v32x4A, constB32x4); // tmp16 = (int16_t)(tmp32no2 >> 21); v32x4B = vshrq_n_s32(v32x4B, 21); // tmp16 -= 21;// shift 21 to get result in Q0 v32x4B = vsubq_s32(v32x4B, twentyOne32x4); // tmp16 += (int16_t) inst->qNoise; // shift to get result in Q(qNoise) v32x4B = vaddq_s32(v32x4B, qNoise32x4); // if (tmp16 < 0) { // tmp32no1 >>= -tmp16; // } else { // tmp32no1 <<= tmp16; // } v32x4B = vshlq_s32(v32x4A, v32x4B); // tmp16 = WebRtcSpl_SatW32ToW16(tmp32no1); v16x4 = vqmovn_s32(v32x4B); //inst->noiseEstQuantile[i] = tmp16; vst1_s16(ptr_noiseEstQuantile, v16x4); } // Last iteration: // inst->quantile[i]=exp(inst->lquantile[offset+i]); // in Q21 int32_t tmp32no2 = kExp2Const * *ptr_noiseEstLogQuantile; int32_t tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac tmp16 = (int16_t)(tmp32no2 >> 21); tmp16 -= 21;// shift 21 to get result in Q0 tmp16 += (int16_t) inst->qNoise; //shift to get result in Q(qNoise) if (tmp16 < 0) { tmp32no1 >>= -tmp16; } else {
void silk_biquad_alt_stride2_neon( const opus_int16 *in, /* I input signal */ const opus_int32 *B_Q28, /* I MA coefficients [3] */ const opus_int32 *A_Q28, /* I AR coefficients [2] */ opus_int32 *S, /* I/O State vector [4] */ opus_int16 *out, /* O output signal */ const opus_int32 len /* I signal length (must be even) */ ) { /* DIRECT FORM II TRANSPOSED (uses 2 element state vector) */ opus_int k = 0; const int32x2_t offset_s32x2 = vdup_n_s32( (1<<14) - 1 ); const int32x4_t offset_s32x4 = vcombine_s32( offset_s32x2, offset_s32x2 ); int16x4_t in_s16x4 = vdup_n_s16( 0 ); int16x4_t out_s16x4; int32x2_t A_Q28_s32x2, A_L_s32x2, A_U_s32x2, B_Q28_s32x2, t_s32x2; int32x4_t A_L_s32x4, A_U_s32x4, B_Q28_s32x4, S_s32x4, out32_Q14_s32x4; int32x2x2_t t0_s32x2x2, t1_s32x2x2, t2_s32x2x2, S_s32x2x2; #ifdef OPUS_CHECK_ASM opus_int32 S_c[ 4 ]; VARDECL( opus_int16, out_c ); SAVE_STACK; ALLOC( out_c, 2 * len, opus_int16 ); silk_memcpy( &S_c, S, sizeof( S_c ) ); silk_biquad_alt_stride2_c( in, B_Q28, A_Q28, S_c, out_c, len ); #endif /* Negate A_Q28 values and split in two parts */ A_Q28_s32x2 = vld1_s32( A_Q28 ); A_Q28_s32x2 = vneg_s32( A_Q28_s32x2 ); A_L_s32x2 = vshl_n_s32( A_Q28_s32x2, 18 ); /* ( -A_Q28[] & 0x00003FFF ) << 18 */ A_L_s32x2 = vreinterpret_s32_u32( vshr_n_u32( vreinterpret_u32_s32( A_L_s32x2 ), 3 ) ); /* ( -A_Q28[] & 0x00003FFF ) << 15 */ A_U_s32x2 = vshr_n_s32( A_Q28_s32x2, 14 ); /* silk_RSHIFT( -A_Q28[], 14 ) */ A_U_s32x2 = vshl_n_s32( A_U_s32x2, 16 ); /* silk_RSHIFT( -A_Q28[], 14 ) << 16 (Clip two leading bits to conform to C function.) */ A_U_s32x2 = vshr_n_s32( A_U_s32x2, 1 ); /* silk_RSHIFT( -A_Q28[], 14 ) << 15 */ B_Q28_s32x2 = vld1_s32( B_Q28 ); t_s32x2 = vld1_s32( B_Q28 + 1 ); t0_s32x2x2 = vzip_s32( A_L_s32x2, A_L_s32x2 ); t1_s32x2x2 = vzip_s32( A_U_s32x2, A_U_s32x2 ); t2_s32x2x2 = vzip_s32( t_s32x2, t_s32x2 ); A_L_s32x4 = vcombine_s32( t0_s32x2x2.val[ 0 ], t0_s32x2x2.val[ 1 ] ); /* A{0,0,1,1}_L_Q28 */ A_U_s32x4 = vcombine_s32( t1_s32x2x2.val[ 0 ], t1_s32x2x2.val[ 1 ] ); /* A{0,0,1,1}_U_Q28 */ B_Q28_s32x4 = vcombine_s32( t2_s32x2x2.val[ 0 ], t2_s32x2x2.val[ 1 ] ); /* B_Q28[ {1,1,2,2} ] */ S_s32x4 = vld1q_s32( S ); /* S0 = S[ 0 ]; S3 = S[ 3 ]; */ S_s32x2x2 = vtrn_s32( vget_low_s32( S_s32x4 ), vget_high_s32( S_s32x4 ) ); /* S2 = S[ 1 ]; S1 = S[ 2 ]; */ S_s32x4 = vcombine_s32( S_s32x2x2.val[ 0 ], S_s32x2x2.val[ 1 ] ); for( ; k < len - 1; k += 2 ) { int32x4_t in_s32x4[ 2 ], t_s32x4; int32x2_t out32_Q14_s32x2[ 2 ]; /* S[ 2 * i + 0 ], S[ 2 * i + 1 ], S[ 2 * i + 2 ], S[ 2 * i + 3 ]: Q12 */ in_s16x4 = vld1_s16( &in[ 2 * k ] ); /* in{0,1,2,3} = in[ 2 * k + {0,1,2,3} ]; */ in_s32x4[ 0 ] = vshll_n_s16( in_s16x4, 15 ); /* in{0,1,2,3} << 15 */ t_s32x4 = vqdmulhq_lane_s32( in_s32x4[ 0 ], B_Q28_s32x2, 0 ); /* silk_SMULWB( B_Q28[ 0 ], in{0,1,2,3} ) */ in_s32x4[ 1 ] = vcombine_s32( vget_high_s32( in_s32x4[ 0 ] ), vget_high_s32( in_s32x4[ 0 ] ) ); /* in{2,3,2,3} << 15 */ in_s32x4[ 0 ] = vcombine_s32( vget_low_s32 ( in_s32x4[ 0 ] ), vget_low_s32 ( in_s32x4[ 0 ] ) ); /* in{0,1,0,1} << 15 */ silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_low_s32 ( t_s32x4 ), in_s32x4[ 0 ], &S_s32x4, &out32_Q14_s32x2[ 0 ] ); silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_high_s32( t_s32x4 ), in_s32x4[ 1 ], &S_s32x4, &out32_Q14_s32x2[ 1 ] ); /* Scale back to Q0 and saturate */ out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2[ 0 ], out32_Q14_s32x2[ 1 ] ); /* out32_Q14_{0,1,2,3} */ out32_Q14_s32x4 = vaddq_s32( out32_Q14_s32x4, offset_s32x4 ); /* out32_Q14_{0,1,2,3} + (1<<14) - 1 */ out_s16x4 = vqshrn_n_s32( out32_Q14_s32x4, 14 ); /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ) */ vst1_s16( &out[ 2 * k ], out_s16x4 ); /* out[ 2 * k + {0,1,2,3} ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ); */ } /* Process leftover. */ if( k < len ) { int32x4_t in_s32x4; int32x2_t out32_Q14_s32x2; /* S[ 2 * i + 0 ], S[ 2 * i + 1 ]: Q12 */ in_s16x4 = vld1_lane_s16( &in[ 2 * k + 0 ], in_s16x4, 0 ); /* in{0,1} = in[ 2 * k + {0,1} ]; */ in_s16x4 = vld1_lane_s16( &in[ 2 * k + 1 ], in_s16x4, 1 ); /* in{0,1} = in[ 2 * k + {0,1} ]; */ in_s32x4 = vshll_n_s16( in_s16x4, 15 ); /* in{0,1} << 15 */ t_s32x2 = vqdmulh_lane_s32( vget_low_s32( in_s32x4 ), B_Q28_s32x2, 0 ); /* silk_SMULWB( B_Q28[ 0 ], in{0,1} ) */ in_s32x4 = vcombine_s32( vget_low_s32( in_s32x4 ), vget_low_s32( in_s32x4 ) ); /* in{0,1,0,1} << 15 */ silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, t_s32x2, in_s32x4, &S_s32x4, &out32_Q14_s32x2 ); /* Scale back to Q0 and saturate */ out32_Q14_s32x2 = vadd_s32( out32_Q14_s32x2, offset_s32x2 ); /* out32_Q14_{0,1} + (1<<14) - 1 */ out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2, out32_Q14_s32x2 ); /* out32_Q14_{0,1,0,1} + (1<<14) - 1 */ out_s16x4 = vqshrn_n_s32( out32_Q14_s32x4, 14 ); /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,0,1} + (1<<14) - 1, 14 ) ) */ vst1_lane_s16( &out[ 2 * k + 0 ], out_s16x4, 0 ); /* out[ 2 * k + 0 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_0 + (1<<14) - 1, 14 ) ); */ vst1_lane_s16( &out[ 2 * k + 1 ], out_s16x4, 1 ); /* out[ 2 * k + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_1 + (1<<14) - 1, 14 ) ); */ } vst1q_lane_s32( &S[ 0 ], S_s32x4, 0 ); /* S[ 0 ] = S0; */ vst1q_lane_s32( &S[ 1 ], S_s32x4, 2 ); /* S[ 1 ] = S2; */ vst1q_lane_s32( &S[ 2 ], S_s32x4, 1 ); /* S[ 2 ] = S1; */ vst1q_lane_s32( &S[ 3 ], S_s32x4, 3 ); /* S[ 3 ] = S3; */ #ifdef OPUS_CHECK_ASM silk_assert( !memcmp( S_c, S, sizeof( S_c ) ) ); silk_assert( !memcmp( out_c, out, 2 * len * sizeof( opus_int16 ) ) ); RESTORE_STACK; #endif }
inline int32x4_t vaddq(const int32x4_t & v0, const int32x4_t & v1) { return vaddq_s32(v0, v1); }
static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, int16x8_t *q10s16, int16x8_t *q11s16, int16x8_t *q12s16, int16x8_t *q13s16, int16x8_t *q14s16, int16x8_t *q15s16) { int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; int16x8_t q2s16, q4s16, q5s16, q6s16; int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32; int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32; d16s16 = vget_low_s16(*q8s16); d17s16 = vget_high_s16(*q8s16); d18s16 = vget_low_s16(*q9s16); d19s16 = vget_high_s16(*q9s16); d20s16 = vget_low_s16(*q10s16); d21s16 = vget_high_s16(*q10s16); d22s16 = vget_low_s16(*q11s16); d23s16 = vget_high_s16(*q11s16); d24s16 = vget_low_s16(*q12s16); d25s16 = vget_high_s16(*q12s16); d26s16 = vget_low_s16(*q13s16); d27s16 = vget_high_s16(*q13s16); d28s16 = vget_low_s16(*q14s16); d29s16 = vget_high_s16(*q14s16); d30s16 = vget_low_s16(*q15s16); d31s16 = vget_high_s16(*q15s16); d14s16 = vdup_n_s16((int16_t)cospi_2_64); d15s16 = vdup_n_s16((int16_t)cospi_30_64); q1s32 = vmull_s16(d30s16, d14s16); q2s32 = vmull_s16(d31s16, d14s16); q3s32 = vmull_s16(d30s16, d15s16); q4s32 = vmull_s16(d31s16, d15s16); d30s16 = vdup_n_s16((int16_t)cospi_18_64); d31s16 = vdup_n_s16((int16_t)cospi_14_64); q1s32 = vmlal_s16(q1s32, d16s16, d15s16); q2s32 = vmlal_s16(q2s32, d17s16, d15s16); q3s32 = vmlsl_s16(q3s32, d16s16, d14s16); q4s32 = vmlsl_s16(q4s32, d17s16, d14s16); q5s32 = vmull_s16(d22s16, d30s16); q6s32 = vmull_s16(d23s16, d30s16); q7s32 = vmull_s16(d22s16, d31s16); q8s32 = vmull_s16(d23s16, d31s16); q5s32 = vmlal_s16(q5s32, d24s16, d31s16); q6s32 = vmlal_s16(q6s32, d25s16, d31s16); q7s32 = vmlsl_s16(q7s32, d24s16, d30s16); q8s32 = vmlsl_s16(q8s32, d25s16, d30s16); q11s32 = vaddq_s32(q1s32, q5s32); q12s32 = vaddq_s32(q2s32, q6s32); q1s32 = vsubq_s32(q1s32, q5s32); q2s32 = vsubq_s32(q2s32, q6s32); d22s16 = vqrshrn_n_s32(q11s32, 14); d23s16 = vqrshrn_n_s32(q12s32, 14); *q11s16 = vcombine_s16(d22s16, d23s16); q12s32 = vaddq_s32(q3s32, q7s32); q15s32 = vaddq_s32(q4s32, q8s32); q3s32 = vsubq_s32(q3s32, q7s32); q4s32 = vsubq_s32(q4s32, q8s32); d2s16 = vqrshrn_n_s32(q1s32, 14); d3s16 = vqrshrn_n_s32(q2s32, 14); d24s16 = vqrshrn_n_s32(q12s32, 14); d25s16 = vqrshrn_n_s32(q15s32, 14); d6s16 = vqrshrn_n_s32(q3s32, 14); d7s16 = vqrshrn_n_s32(q4s32, 14); *q12s16 = vcombine_s16(d24s16, d25s16); d0s16 = vdup_n_s16((int16_t)cospi_10_64); d1s16 = vdup_n_s16((int16_t)cospi_22_64); q4s32 = vmull_s16(d26s16, d0s16); q5s32 = vmull_s16(d27s16, d0s16); q2s32 = vmull_s16(d26s16, d1s16); q6s32 = vmull_s16(d27s16, d1s16); d30s16 = vdup_n_s16((int16_t)cospi_26_64); d31s16 = vdup_n_s16((int16_t)cospi_6_64); q4s32 = vmlal_s16(q4s32, d20s16, d1s16); q5s32 = vmlal_s16(q5s32, d21s16, d1s16); q2s32 = vmlsl_s16(q2s32, d20s16, d0s16); q6s32 = vmlsl_s16(q6s32, d21s16, d0s16); q0s32 = vmull_s16(d18s16, d30s16); q13s32 = vmull_s16(d19s16, d30s16); q0s32 = vmlal_s16(q0s32, d28s16, d31s16); q13s32 = vmlal_s16(q13s32, d29s16, d31s16); q10s32 = vmull_s16(d18s16, d31s16); q9s32 = vmull_s16(d19s16, d31s16); q10s32 = vmlsl_s16(q10s32, d28s16, d30s16); q9s32 = vmlsl_s16(q9s32, d29s16, d30s16); q14s32 = vaddq_s32(q2s32, q10s32); q15s32 = vaddq_s32(q6s32, q9s32); q2s32 = vsubq_s32(q2s32, q10s32); q6s32 = vsubq_s32(q6s32, q9s32); d28s16 = vqrshrn_n_s32(q14s32, 14); d29s16 = vqrshrn_n_s32(q15s32, 14); d4s16 = vqrshrn_n_s32(q2s32, 14); d5s16 = vqrshrn_n_s32(q6s32, 14); *q14s16 = vcombine_s16(d28s16, d29s16); q9s32 = vaddq_s32(q4s32, q0s32); q10s32 = vaddq_s32(q5s32, q13s32); q4s32 = vsubq_s32(q4s32, q0s32); q5s32 = vsubq_s32(q5s32, q13s32); d30s16 = vdup_n_s16((int16_t)cospi_8_64); d31s16 = vdup_n_s16((int16_t)cospi_24_64); d18s16 = vqrshrn_n_s32(q9s32, 14); d19s16 = vqrshrn_n_s32(q10s32, 14); d8s16 = vqrshrn_n_s32(q4s32, 14); d9s16 = vqrshrn_n_s32(q5s32, 14); *q9s16 = vcombine_s16(d18s16, d19s16); q5s32 = vmull_s16(d2s16, d30s16); q6s32 = vmull_s16(d3s16, d30s16); q7s32 = vmull_s16(d2s16, d31s16); q0s32 = vmull_s16(d3s16, d31s16); q5s32 = vmlal_s16(q5s32, d6s16, d31s16); q6s32 = vmlal_s16(q6s32, d7s16, d31s16); q7s32 = vmlsl_s16(q7s32, d6s16, d30s16); q0s32 = vmlsl_s16(q0s32, d7s16, d30s16); q1s32 = vmull_s16(d4s16, d30s16); q3s32 = vmull_s16(d5s16, d30s16); q10s32 = vmull_s16(d4s16, d31s16); q2s32 = vmull_s16(d5s16, d31s16); q1s32 = vmlsl_s16(q1s32, d8s16, d31s16); q3s32 = vmlsl_s16(q3s32, d9s16, d31s16); q10s32 = vmlal_s16(q10s32, d8s16, d30s16); q2s32 = vmlal_s16(q2s32, d9s16, d30s16); *q8s16 = vaddq_s16(*q11s16, *q9s16); *q11s16 = vsubq_s16(*q11s16, *q9s16); q4s16 = vaddq_s16(*q12s16, *q14s16); *q12s16 = vsubq_s16(*q12s16, *q14s16); q14s32 = vaddq_s32(q5s32, q1s32); q15s32 = vaddq_s32(q6s32, q3s32); q5s32 = vsubq_s32(q5s32, q1s32); q6s32 = vsubq_s32(q6s32, q3s32); d18s16 = vqrshrn_n_s32(q14s32, 14); d19s16 = vqrshrn_n_s32(q15s32, 14); d10s16 = vqrshrn_n_s32(q5s32, 14); d11s16 = vqrshrn_n_s32(q6s32, 14); *q9s16 = vcombine_s16(d18s16, d19s16); q1s32 = vaddq_s32(q7s32, q10s32); q3s32 = vaddq_s32(q0s32, q2s32); q7s32 = vsubq_s32(q7s32, q10s32); q0s32 = vsubq_s32(q0s32, q2s32); d28s16 = vqrshrn_n_s32(q1s32, 14); d29s16 = vqrshrn_n_s32(q3s32, 14); d14s16 = vqrshrn_n_s32(q7s32, 14); d15s16 = vqrshrn_n_s32(q0s32, 14); *q14s16 = vcombine_s16(d28s16, d29s16); d30s16 = vdup_n_s16((int16_t)cospi_16_64); d22s16 = vget_low_s16(*q11s16); d23s16 = vget_high_s16(*q11s16); q2s32 = vmull_s16(d22s16, d30s16); q3s32 = vmull_s16(d23s16, d30s16); q13s32 = vmull_s16(d22s16, d30s16); q1s32 = vmull_s16(d23s16, d30s16); d24s16 = vget_low_s16(*q12s16); d25s16 = vget_high_s16(*q12s16); q2s32 = vmlal_s16(q2s32, d24s16, d30s16); q3s32 = vmlal_s16(q3s32, d25s16, d30s16); q13s32 = vmlsl_s16(q13s32, d24s16, d30s16); q1s32 = vmlsl_s16(q1s32, d25s16, d30s16); d4s16 = vqrshrn_n_s32(q2s32, 14); d5s16 = vqrshrn_n_s32(q3s32, 14); d24s16 = vqrshrn_n_s32(q13s32, 14); d25s16 = vqrshrn_n_s32(q1s32, 14); q2s16 = vcombine_s16(d4s16, d5s16); *q12s16 = vcombine_s16(d24s16, d25s16); q13s32 = vmull_s16(d10s16, d30s16); q1s32 = vmull_s16(d11s16, d30s16); q11s32 = vmull_s16(d10s16, d30s16); q0s32 = vmull_s16(d11s16, d30s16); q13s32 = vmlal_s16(q13s32, d14s16, d30s16); q1s32 = vmlal_s16(q1s32, d15s16, d30s16); q11s32 = vmlsl_s16(q11s32, d14s16, d30s16); q0s32 = vmlsl_s16(q0s32, d15s16, d30s16); d20s16 = vqrshrn_n_s32(q13s32, 14); d21s16 = vqrshrn_n_s32(q1s32, 14); d12s16 = vqrshrn_n_s32(q11s32, 14); d13s16 = vqrshrn_n_s32(q0s32, 14); *q10s16 = vcombine_s16(d20s16, d21s16); q6s16 = vcombine_s16(d12s16, d13s16); q5s16 = vdupq_n_s16(0); *q9s16 = vsubq_s16(q5s16, *q9s16); *q11s16 = vsubq_s16(q5s16, q2s16); *q13s16 = vsubq_s16(q5s16, q6s16); *q15s16 = vsubq_s16(q5s16, q4s16); return; }
bool decode_yuv_neon(unsigned char* out, unsigned char const* y, unsigned char const* uv, int width, int height, unsigned char fill_alpha=0xff) { // pre-condition : width, height must be even if (0!=(width&1) || width<2 || 0!=(height&1) || height<2 || !out || !y || !uv) return false; // in & out pointers unsigned char* dst = out; // constants int const stride = width*trait::bytes_per_pixel; int const itHeight = height>>1; int const itWidth = width>>3; uint8x8_t const Yshift = vdup_n_u8(16); int16x8_t const half = vdupq_n_u16(128); int32x4_t const rounding = vdupq_n_s32(128); // tmp variable uint16x8_t t; // pixel block to temporary store 8 pixels typename trait::PixelBlock pblock = trait::init_pixelblock(fill_alpha); for (int j=0; j<itHeight; ++j, y+=width, dst+=stride) { for (int i=0; i<itWidth; ++i, y+=8, uv+=8, dst+=(8*trait::bytes_per_pixel)) { t = vmovl_u8(vqsub_u8(vld1_u8(y), Yshift)); int32x4_t const Y00 = vmulq_n_u32(vmovl_u16(vget_low_u16(t)), 298); int32x4_t const Y01 = vmulq_n_u32(vmovl_u16(vget_high_u16(t)), 298); t = vmovl_u8(vqsub_u8(vld1_u8(y+width), Yshift)); int32x4_t const Y10 = vmulq_n_u32(vmovl_u16(vget_low_u16(t)), 298); int32x4_t const Y11 = vmulq_n_u32(vmovl_u16(vget_high_u16(t)), 298); // trait::loadvu pack 4 sets of uv into a uint8x8_t, layout : { v0,u0, v1,u1, v2,u2, v3,u3 } t = vsubq_s16((int16x8_t)vmovl_u8(trait::loadvu(uv)), half); // UV.val[0] : v0, v1, v2, v3 // UV.val[1] : u0, u1, u2, u3 int16x4x2_t const UV = vuzp_s16(vget_low_s16(t), vget_high_s16(t)); // tR : 128+409V // tG : 128-100U-208V // tB : 128+516U int32x4_t const tR = vmlal_n_s16(rounding, UV.val[0], 409); int32x4_t const tG = vmlal_n_s16(vmlal_n_s16(rounding, UV.val[0], -208), UV.val[1], -100); int32x4_t const tB = vmlal_n_s16(rounding, UV.val[1], 516); int32x4x2_t const R = vzipq_s32(tR, tR); // [tR0, tR0, tR1, tR1] [ tR2, tR2, tR3, tR3] int32x4x2_t const G = vzipq_s32(tG, tG); // [tG0, tG0, tG1, tG1] [ tG2, tG2, tG3, tG3] int32x4x2_t const B = vzipq_s32(tB, tB); // [tB0, tB0, tB1, tB1] [ tB2, tB2, tB3, tB3] // upper 8 pixels trait::store_pixel_block(dst, pblock, vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(R.val[0], Y00)), vqmovun_s32(vaddq_s32(R.val[1], Y01))), 8), vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(G.val[0], Y00)), vqmovun_s32(vaddq_s32(G.val[1], Y01))), 8), vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(B.val[0], Y00)), vqmovun_s32(vaddq_s32(B.val[1], Y01))), 8)); // lower 8 pixels trait::store_pixel_block(dst+stride, pblock, vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(R.val[0], Y10)), vqmovun_s32(vaddq_s32(R.val[1], Y11))), 8), vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(G.val[0], Y10)), vqmovun_s32(vaddq_s32(G.val[1], Y11))), 8), vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(B.val[0], Y10)), vqmovun_s32(vaddq_s32(B.val[1], Y11))), 8)); } } return true; }
inline v_int32x4 v_floor(const v_float32x4& a) { int32x4_t a1 = vcvtq_s32_f32(a.val); uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val); return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask))); }
static float32x4_t vpowq_f32(float32x4_t a, float32x4_t b) { // a^b = exp2(b * log2(a)) // exp2(x) and log2(x) are calculated using polynomial approximations. float32x4_t log2_a, b_log2_a, a_exp_b; // Calculate log2(x), x = a. { // To calculate log2(x), we decompose x like this: // x = y * 2^n // n is an integer // y is in the [1.0, 2.0) range // // log2(x) = log2(y) + n // n can be evaluated by playing with float representation. // log2(y) in a small range can be approximated, this code uses an order // five polynomial approximation. The coefficients have been // estimated with the Remez algorithm and the resulting // polynomial has a maximum relative error of 0.00086%. // Compute n. // This is done by masking the exponent, shifting it into the top bit of // the mantissa, putting eight into the biased exponent (to shift/ // compensate the fact that the exponent has been shifted in the top/ // fractional part and finally getting rid of the implicit leading one // from the mantissa by substracting it out. const uint32x4_t vec_float_exponent_mask = vdupq_n_u32(0x7F800000); const uint32x4_t vec_eight_biased_exponent = vdupq_n_u32(0x43800000); const uint32x4_t vec_implicit_leading_one = vdupq_n_u32(0x43BF8000); const uint32x4_t two_n = vandq_u32(vreinterpretq_u32_f32(a), vec_float_exponent_mask); const uint32x4_t n_1 = vshrq_n_u32(two_n, kShiftExponentIntoTopMantissa); const uint32x4_t n_0 = vorrq_u32(n_1, vec_eight_biased_exponent); const float32x4_t n = vsubq_f32(vreinterpretq_f32_u32(n_0), vreinterpretq_f32_u32(vec_implicit_leading_one)); // Compute y. const uint32x4_t vec_mantissa_mask = vdupq_n_u32(0x007FFFFF); const uint32x4_t vec_zero_biased_exponent_is_one = vdupq_n_u32(0x3F800000); const uint32x4_t mantissa = vandq_u32(vreinterpretq_u32_f32(a), vec_mantissa_mask); const float32x4_t y = vreinterpretq_f32_u32(vorrq_u32(mantissa, vec_zero_biased_exponent_is_one)); // Approximate log2(y) ~= (y - 1) * pol5(y). // pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0 const float32x4_t C5 = vdupq_n_f32(-3.4436006e-2f); const float32x4_t C4 = vdupq_n_f32(3.1821337e-1f); const float32x4_t C3 = vdupq_n_f32(-1.2315303f); const float32x4_t C2 = vdupq_n_f32(2.5988452f); const float32x4_t C1 = vdupq_n_f32(-3.3241990f); const float32x4_t C0 = vdupq_n_f32(3.1157899f); float32x4_t pol5_y = C5; pol5_y = vmlaq_f32(C4, y, pol5_y); pol5_y = vmlaq_f32(C3, y, pol5_y); pol5_y = vmlaq_f32(C2, y, pol5_y); pol5_y = vmlaq_f32(C1, y, pol5_y); pol5_y = vmlaq_f32(C0, y, pol5_y); const float32x4_t y_minus_one = vsubq_f32(y, vreinterpretq_f32_u32(vec_zero_biased_exponent_is_one)); const float32x4_t log2_y = vmulq_f32(y_minus_one, pol5_y); // Combine parts. log2_a = vaddq_f32(n, log2_y); } // b * log2(a) b_log2_a = vmulq_f32(b, log2_a); // Calculate exp2(x), x = b * log2(a). { // To calculate 2^x, we decompose x like this: // x = n + y // n is an integer, the value of x - 0.5 rounded down, therefore // y is in the [0.5, 1.5) range // // 2^x = 2^n * 2^y // 2^n can be evaluated by playing with float representation. // 2^y in a small range can be approximated, this code uses an order two // polynomial approximation. The coefficients have been estimated // with the Remez algorithm and the resulting polynomial has a // maximum relative error of 0.17%. // To avoid over/underflow, we reduce the range of input to ]-127, 129]. const float32x4_t max_input = vdupq_n_f32(129.f); const float32x4_t min_input = vdupq_n_f32(-126.99999f); const float32x4_t x_min = vminq_f32(b_log2_a, max_input); const float32x4_t x_max = vmaxq_f32(x_min, min_input); // Compute n. const float32x4_t half = vdupq_n_f32(0.5f); const float32x4_t x_minus_half = vsubq_f32(x_max, half); const int32x4_t x_minus_half_floor = vcvtq_s32_f32(x_minus_half); // Compute 2^n. const int32x4_t float_exponent_bias = vdupq_n_s32(127); const int32x4_t two_n_exponent = vaddq_s32(x_minus_half_floor, float_exponent_bias); const float32x4_t two_n = vreinterpretq_f32_s32(vshlq_n_s32(two_n_exponent, kFloatExponentShift)); // Compute y. const float32x4_t y = vsubq_f32(x_max, vcvtq_f32_s32(x_minus_half_floor)); // Approximate 2^y ~= C2 * y^2 + C1 * y + C0. const float32x4_t C2 = vdupq_n_f32(3.3718944e-1f); const float32x4_t C1 = vdupq_n_f32(6.5763628e-1f); const float32x4_t C0 = vdupq_n_f32(1.0017247f); float32x4_t exp2_y = C2; exp2_y = vmlaq_f32(C1, y, exp2_y); exp2_y = vmlaq_f32(C0, y, exp2_y); // Combine parts. a_exp_b = vmulq_f32(exp2_y, two_n); } return a_exp_b; }
unsigned int vp8_variance_halfpixvar16x16_hv_neon( const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { int i; uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; int16x4_t d0s16, d1s16, d2s16, d3s16, d10s16, d11s16, d12s16, d13s16; int16x4_t d18s16, d19s16, d20s16, d21s16, d22s16, d23s16, d24s16, d25s16; uint32x2_t d0u32, d10u32; int64x1_t d0s64, d1s64, d2s64, d3s64; uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8; uint16x8_t q0u16, q1u16, q5u16, q6u16, q9u16, q10u16, q11u16, q12u16; int32x4_t q13s32, q14s32, q15s32; int64x2_t q0s64, q1s64, q5s64; q13s32 = vdupq_n_s32(0); q14s32 = vdupq_n_s32(0); q15s32 = vdupq_n_s32(0); q0u8 = vld1q_u8(src_ptr); q1u8 = vld1q_u8(src_ptr + 16); src_ptr += source_stride; q1u8 = vextq_u8(q0u8, q1u8, 1); q0u8 = vrhaddq_u8(q0u8, q1u8); for (i = 0; i < 4; i++) { // vp8_filt_fpo16x16s_4_0_loop_neon q2u8 = vld1q_u8(src_ptr); q3u8 = vld1q_u8(src_ptr + 16); src_ptr += source_stride; q4u8 = vld1q_u8(src_ptr); q5u8 = vld1q_u8(src_ptr + 16); src_ptr += source_stride; q6u8 = vld1q_u8(src_ptr); q7u8 = vld1q_u8(src_ptr + 16); src_ptr += source_stride; q8u8 = vld1q_u8(src_ptr); q9u8 = vld1q_u8(src_ptr + 16); src_ptr += source_stride; q3u8 = vextq_u8(q2u8, q3u8, 1); q5u8 = vextq_u8(q4u8, q5u8, 1); q7u8 = vextq_u8(q6u8, q7u8, 1); q9u8 = vextq_u8(q8u8, q9u8, 1); q1u8 = vrhaddq_u8(q2u8, q3u8); q2u8 = vrhaddq_u8(q4u8, q5u8); q3u8 = vrhaddq_u8(q6u8, q7u8); q4u8 = vrhaddq_u8(q8u8, q9u8); q0u8 = vrhaddq_u8(q0u8, q1u8); q1u8 = vrhaddq_u8(q1u8, q2u8); q2u8 = vrhaddq_u8(q2u8, q3u8); q3u8 = vrhaddq_u8(q3u8, q4u8); q5u8 = vld1q_u8(ref_ptr); ref_ptr += recon_stride; q6u8 = vld1q_u8(ref_ptr); ref_ptr += recon_stride; q7u8 = vld1q_u8(ref_ptr); ref_ptr += recon_stride; q8u8 = vld1q_u8(ref_ptr); ref_ptr += recon_stride; d0u8 = vget_low_u8(q0u8); d1u8 = vget_high_u8(q0u8); d2u8 = vget_low_u8(q1u8); d3u8 = vget_high_u8(q1u8); d4u8 = vget_low_u8(q2u8); d5u8 = vget_high_u8(q2u8); d6u8 = vget_low_u8(q3u8); d7u8 = vget_high_u8(q3u8); q9u16 = vsubl_u8(d0u8, vget_low_u8(q5u8)); q10u16 = vsubl_u8(d1u8, vget_high_u8(q5u8)); q11u16 = vsubl_u8(d2u8, vget_low_u8(q6u8)); q12u16 = vsubl_u8(d3u8, vget_high_u8(q6u8)); q0u16 = vsubl_u8(d4u8, vget_low_u8(q7u8)); q1u16 = vsubl_u8(d5u8, vget_high_u8(q7u8)); q5u16 = vsubl_u8(d6u8, vget_low_u8(q8u8)); q6u16 = vsubl_u8(d7u8, vget_high_u8(q8u8)); d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q9u16)); q14s32 = vmlal_s16(q14s32, d18s16, d18s16); q15s32 = vmlal_s16(q15s32, d19s16, d19s16); d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q10u16)); q14s32 = vmlal_s16(q14s32, d20s16, d20s16); q15s32 = vmlal_s16(q15s32, d21s16, d21s16); d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q11u16)); q14s32 = vmlal_s16(q14s32, d22s16, d22s16); q15s32 = vmlal_s16(q15s32, d23s16, d23s16); d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q12u16)); q14s32 = vmlal_s16(q14s32, d24s16, d24s16); q15s32 = vmlal_s16(q15s32, d25s16, d25s16); d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16)); d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16)); q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q0u16)); q14s32 = vmlal_s16(q14s32, d0s16, d0s16); q15s32 = vmlal_s16(q15s32, d1s16, d1s16); d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16)); d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16)); q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q1u16)); q14s32 = vmlal_s16(q14s32, d2s16, d2s16); q15s32 = vmlal_s16(q15s32, d3s16, d3s16); d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16)); d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16)); q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q5u16)); q14s32 = vmlal_s16(q14s32, d10s16, d10s16); q15s32 = vmlal_s16(q15s32, d11s16, d11s16); d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16)); d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16)); q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q6u16)); q14s32 = vmlal_s16(q14s32, d12s16, d12s16); q15s32 = vmlal_s16(q15s32, d13s16, d13s16); q0u8 = q4u8; } q15s32 = vaddq_s32(q14s32, q15s32); q0s64 = vpaddlq_s32(q13s32); q1s64 = vpaddlq_s32(q15s32); d0s64 = vget_low_s64(q0s64); d1s64 = vget_high_s64(q0s64); d2s64 = vget_low_s64(q1s64); d3s64 = vget_high_s64(q1s64); d0s64 = vadd_s64(d0s64, d1s64); d1s64 = vadd_s64(d2s64, d3s64); q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); return vget_lane_u32(d0u32, 0); }
void ne10_img_vresize_linear_neon (const int** src, unsigned char* dst, const short* beta, int width) { const int *S0 = src[0], *S1 = src[1]; int32x4_t qS0_0123, qS0_4567, qS1_0123, qS1_4567; int32x4_t qT_0123, qT_4567; int16x4_t dT_0123, dT_4567; uint16x8_t qT_01234567; uint8x8_t dT_01234567, dDst_01234567; int32x2_t dBeta; dBeta = vset_lane_s32 ( (int) (beta[0]), dBeta, 0); dBeta = vset_lane_s32 ( (int) (beta[1]), dBeta, 1); int32x4_t qDelta, qMin, qMax; qDelta = vdupq_n_s32 (DELTA); qMin = vdupq_n_s32 (0); qMax = vdupq_n_s32 (255); int x = 0; for (; x <= width - 8; x += 8) { qS0_0123 = vld1q_s32 (&S0[x]); qS0_4567 = vld1q_s32 (&S0[x + 4]); qS1_0123 = vld1q_s32 (&S1[x]); qS1_4567 = vld1q_s32 (&S1[x + 4]); qT_0123 = vmulq_lane_s32 (qS0_0123, dBeta, 0); qT_4567 = vmulq_lane_s32 (qS0_4567, dBeta, 0); qT_0123 = vmlaq_lane_s32 (qT_0123, qS1_0123, dBeta, 1); qT_4567 = vmlaq_lane_s32 (qT_4567, qS1_4567, dBeta, 1); qT_0123 = vaddq_s32 (qT_0123, qDelta); qT_4567 = vaddq_s32 (qT_4567, qDelta); qT_0123 = vshrq_n_s32 (qT_0123, BITS); qT_4567 = vshrq_n_s32 (qT_4567, BITS); qT_0123 = vmaxq_s32 (qT_0123, qMin); qT_4567 = vmaxq_s32 (qT_4567, qMin); qT_0123 = vminq_s32 (qT_0123, qMax); qT_4567 = vminq_s32 (qT_4567, qMax); dT_0123 = vmovn_s32 (qT_0123); dT_4567 = vmovn_s32 (qT_4567); qT_01234567 = vreinterpretq_u16_s16 (vcombine_s16 (dT_0123, dT_4567)); dT_01234567 = vmovn_u16 (qT_01234567); vst1_u8 (&dst[x], dT_01234567); } if (x < width) { uint8x8_t dMask; dMask = vld1_u8 ( (uint8_t *) (&ne10_img_vresize_linear_mask_residual_table[ (width - x - 1)])); dDst_01234567 = vld1_u8 (&dst[x]); qS0_0123 = vld1q_s32 (&S0[x]); qS0_4567 = vld1q_s32 (&S0[x + 4]); qS1_0123 = vld1q_s32 (&S1[x]); qS1_4567 = vld1q_s32 (&S1[x + 4]); qT_0123 = vmulq_lane_s32 (qS0_0123, dBeta, 0); qT_4567 = vmulq_lane_s32 (qS0_4567, dBeta, 0); qT_0123 = vmlaq_lane_s32 (qT_0123, qS1_0123, dBeta, 1); qT_4567 = vmlaq_lane_s32 (qT_4567, qS1_4567, dBeta, 1); qT_0123 = vaddq_s32 (qT_0123, qDelta); qT_4567 = vaddq_s32 (qT_4567, qDelta); qT_0123 = vshrq_n_s32 (qT_0123, BITS); qT_4567 = vshrq_n_s32 (qT_4567, BITS); qT_0123 = vmaxq_s32 (qT_0123, qMin); qT_4567 = vmaxq_s32 (qT_4567, qMin); qT_0123 = vminq_s32 (qT_0123, qMax); qT_4567 = vminq_s32 (qT_4567, qMax); dT_0123 = vmovn_s32 (qT_0123); dT_4567 = vmovn_s32 (qT_4567); qT_01234567 = vreinterpretq_u16_s16 (vcombine_s16 (dT_0123, dT_4567)); dT_01234567 = vmovn_u16 (qT_01234567); dMask = vbsl_u8 (dMask, dT_01234567, dDst_01234567); vst1_u8 (&dst[x], dMask); } }
static void PCorr2Q32(const int16_t *in, int32_t *logcorQ8) { int16_t scaling,n,k; int32_t ysum32,csum32, lys, lcs; int32_t oneQ8; const int16_t *x, *inptr; oneQ8 = WEBRTC_SPL_LSHIFT_W32((int32_t)1, 8); // 1.00 in Q8 x = in + PITCH_MAX_LAG/2 + 2; scaling = WebRtcSpl_GetScalingSquare ((int16_t *) in, PITCH_CORR_LEN2, PITCH_CORR_LEN2); ysum32 = 1; csum32 = 0; x = in + PITCH_MAX_LAG/2 + 2; for (n = 0; n < PITCH_CORR_LEN2; n++) { ysum32 += WEBRTC_SPL_MUL_16_16_RSFT( (int16_t) in[n],(int16_t) in[n], scaling); // Q0 csum32 += WEBRTC_SPL_MUL_16_16_RSFT((int16_t) x[n],(int16_t) in[n], scaling); // Q0 } logcorQ8 += PITCH_LAG_SPAN2 - 1; lys=Log2Q8((uint32_t) ysum32); // Q8 lys=WEBRTC_SPL_RSHIFT_W32(lys, 1); //sqrt(ysum); if (csum32>0) { lcs=Log2Q8((uint32_t) csum32); // 2log(csum) in Q8 if (lcs>(lys + oneQ8) ){ // csum/sqrt(ysum) > 2 in Q8 *logcorQ8 = lcs - lys; // log2(csum/sqrt(ysum)) } else { *logcorQ8 = oneQ8; // 1.00 } } else { *logcorQ8 = 0; } for (k = 1; k < PITCH_LAG_SPAN2; k++) { inptr = &in[k]; ysum32 -= WEBRTC_SPL_MUL_16_16_RSFT( (int16_t) in[k-1],(int16_t) in[k-1], scaling); ysum32 += WEBRTC_SPL_MUL_16_16_RSFT( (int16_t) in[PITCH_CORR_LEN2 + k - 1],(int16_t) in[PITCH_CORR_LEN2 + k - 1], scaling); #ifdef WEBRTC_ARCH_ARM_NEON { int32_t vbuff[4]; int32x4_t int_32x4_sum = vmovq_n_s32(0); // Can't shift a Neon register to right with a non-constant shift value. int32x4_t int_32x4_scale = vdupq_n_s32(-scaling); // Assert a codition used in loop unrolling at compile-time. COMPILE_ASSERT(PITCH_CORR_LEN2 %4 == 0); for (n = 0; n < PITCH_CORR_LEN2; n += 4) { int16x4_t int_16x4_x = vld1_s16(&x[n]); int16x4_t int_16x4_in = vld1_s16(&inptr[n]); int32x4_t int_32x4 = vmull_s16(int_16x4_x, int_16x4_in); int_32x4 = vshlq_s32(int_32x4, int_32x4_scale); int_32x4_sum = vaddq_s32(int_32x4_sum, int_32x4); } // Use vector store to avoid long stall from data trasferring // from vector to general register. vst1q_s32(vbuff, int_32x4_sum); csum32 = vbuff[0] + vbuff[1]; csum32 += vbuff[2]; csum32 += vbuff[3]; } #else csum32 = 0; if(scaling == 0) { for (n = 0; n < PITCH_CORR_LEN2; n++) { csum32 += x[n] * inptr[n]; } } else { for (n = 0; n < PITCH_CORR_LEN2; n++) { csum32 += (x[n] * inptr[n]) >> scaling; } } #endif logcorQ8--; lys=Log2Q8((uint32_t)ysum32); // Q8 lys=WEBRTC_SPL_RSHIFT_W32(lys, 1); //sqrt(ysum); if (csum32>0) { lcs=Log2Q8((uint32_t) csum32); // 2log(csum) in Q8 if (lcs>(lys + oneQ8) ){ // csum/sqrt(ysum) > 2 *logcorQ8 = lcs - lys; // log2(csum/sqrt(ysum)) } else { *logcorQ8 = oneQ8; // 1.00 } } else { *logcorQ8 = 0; } } }