void test_vst1s16 (void) { int16_t *arg0_int16_t; int16x4_t arg1_int16x4_t; vst1_s16 (arg0_int16_t, arg1_int16x4_t); }
inline void ClampBufferToS16(s16 *out, const s32 *in, size_t size, s8 volShift) { #ifdef _M_SSE // Size will always be 16-byte aligned as the hwBlockSize is. while (size >= 8) { __m128i in1 = _mm_loadu_si128((__m128i *)in); __m128i in2 = _mm_loadu_si128((__m128i *)(in + 4)); __m128i packed = _mm_packs_epi32(in1, in2); if (useShift) { packed = _mm_srai_epi16(packed, volShift); } _mm_storeu_si128((__m128i *)out, packed); out += 8; in += 8; size -= 8; } #elif PPSSPP_ARCH(ARM_NEON) int16x4_t signedVolShift = vdup_n_s16 (-volShift); // Can only dynamic-shift right, but by a signed integer while (size >= 8) { int32x4_t in1 = vld1q_s32(in); int32x4_t in2 = vld1q_s32(in + 4); int16x4_t packed1 = vqmovn_s32(in1); int16x4_t packed2 = vqmovn_s32(in2); if (useShift) { packed1 = vshl_s16(packed1, signedVolShift); packed2 = vshl_s16(packed2, signedVolShift); } vst1_s16(out, packed1); vst1_s16(out + 4, packed2); out += 8; in += 8; size -= 8; } #endif // This does the remainder if SIMD was used, otherwise it does it all. for (size_t i = 0; i < size; i++) { out[i] = clamp_s16(useShift ? (in[i] >> volShift) : in[i]); } }
test_vdup_lane_s16 () { int16x4_t a; int16x4_t b; int i; /* Only two first cases are interesting. */ int16_t c[4] = { 0, 1, 2, 3 }; int16_t d[4]; a = vld1_s16 (c); b = wrap_vdup_lane_s16_0 (a); vst1_s16 (d, b); for (i = 0; i < 4; i++) if (c[0] != d[i]) return 1; b = wrap_vdup_lane_s16_1 (a); vst1_s16 (d, b); for (i = 0; i < 4; i++) if (c[1] != d[i]) return 1; return 0; }
static void inline ff_dct_unquantize_h263_neon(int qscale, int qadd, int nCoeffs, int16_t *block) { int16x8_t q0s16, q2s16, q3s16, q8s16, q10s16, q11s16, q13s16; int16x8_t q14s16, q15s16, qzs16; int16x4_t d0s16, d2s16, d3s16, dzs16; uint16x8_t q1u16, q9u16; uint16x4_t d1u16; dzs16 = vdup_n_s16(0); qzs16 = vdupq_n_s16(0); q15s16 = vdupq_n_s16(qscale << 1); q14s16 = vdupq_n_s16(qadd); q13s16 = vnegq_s16(q14s16); if (nCoeffs > 4) { for (; nCoeffs > 8; nCoeffs -= 16, block += 16) { q0s16 = vld1q_s16(block); q3s16 = vreinterpretq_s16_u16(vcltq_s16(q0s16, qzs16)); q8s16 = vld1q_s16(block + 8); q1u16 = vceqq_s16(q0s16, qzs16); q2s16 = vmulq_s16(q0s16, q15s16); q11s16 = vreinterpretq_s16_u16(vcltq_s16(q8s16, qzs16)); q10s16 = vmulq_s16(q8s16, q15s16); q3s16 = vbslq_s16(vreinterpretq_u16_s16(q3s16), q13s16, q14s16); q11s16 = vbslq_s16(vreinterpretq_u16_s16(q11s16), q13s16, q14s16); q2s16 = vaddq_s16(q2s16, q3s16); q9u16 = vceqq_s16(q8s16, qzs16); q10s16 = vaddq_s16(q10s16, q11s16); q0s16 = vbslq_s16(q1u16, q0s16, q2s16); q8s16 = vbslq_s16(q9u16, q8s16, q10s16); vst1q_s16(block, q0s16); vst1q_s16(block + 8, q8s16); } } if (nCoeffs <= 0) return; d0s16 = vld1_s16(block); d3s16 = vreinterpret_s16_u16(vclt_s16(d0s16, dzs16)); d1u16 = vceq_s16(d0s16, dzs16); d2s16 = vmul_s16(d0s16, vget_high_s16(q15s16)); d3s16 = vbsl_s16(vreinterpret_u16_s16(d3s16), vget_high_s16(q13s16), vget_high_s16(q14s16)); d2s16 = vadd_s16(d2s16, d3s16); d0s16 = vbsl_s16(d1u16, d0s16, d2s16); vst1_s16(block, d0s16); }
test_vreinterpret_s16_f64 () { float64x1_t a; int16x4_t b; float64_t c[1] = { PI_F64 }; int16_t d[4] = { 0x2D18, 0x5444, 0x21FB, 0x4009 }; int16_t e[4]; int i; a = vld1_f64 (c); b = wrap_vreinterpret_s16_f64 (a); vst1_s16 (e, b); for (i = 0; i < 4; i++) if (d[i] != e[i]) return 1; return 0; };
static inline void yuv2rgb_4x2(const uint8_t *y1, const uint8_t *y2, const uint8_t *u, const uint8_t *v, int16_t *r1, int16_t *g1, int16_t *b1, int16_t *r2, int16_t *g2, int16_t *b2){ int32x4_t ry1; int32x4_t ry2; int32x4_t rvug; int32x4_t rvr; int32x4_t rub; int32x4_t rr1,rg1,rb1,rr2,rg2,rb2; int32x4_t max; LOAD_Y_PREMULTS(0) LOAD_Y_PREMULTS(1) LOAD_Y_PREMULTS(2) LOAD_Y_PREMULTS(3) LOAD_UV_PREMULTS(0) LOAD_UV_PREMULTS(1) max=vld1q_s32(yuvmax); /*the following does not work */ //max=vdupq_n_s32(255); rr1=vaddq_s32(ry1,rvr); rr2=vaddq_s32(ry2,rvr); rg1=vaddq_s32(ry1,rvug); rg2=vaddq_s32(ry2,rvug); rb1=vaddq_s32(ry1,rub); rb2=vaddq_s32(ry2,rub); rr1=vminq_s32(vabsq_s32(rr1),max); rr2=vminq_s32(vabsq_s32(rr2),max); rg1=vminq_s32(vabsq_s32(rg1),max); rg2=vminq_s32(vabsq_s32(rg2),max); rb1=vminq_s32(vabsq_s32(rb1),max); rb2=vminq_s32(vabsq_s32(rb2),max); vst1_s16(r1,vqshrn_n_s32(rr1,13)); vst1_s16(r2,vqshrn_n_s32(rr2,13)); vst1_s16(g1,vqshrn_n_s32(rg1,13)); vst1_s16(g2,vqshrn_n_s32(rg2,13)); vst1_s16(b1,vqshrn_n_s32(rb1,13)); vst1_s16(b2,vqshrn_n_s32(rb2,13)); }
// Update the noise estimation information. static void UpdateNoiseEstimateNeon(NoiseSuppressionFixedC* inst, int offset) { const int16_t kExp2Const = 11819; // Q13 int16_t* ptr_noiseEstLogQuantile = NULL; int16_t* ptr_noiseEstQuantile = NULL; int16x4_t kExp2Const16x4 = vdup_n_s16(kExp2Const); int32x4_t twentyOne32x4 = vdupq_n_s32(21); int32x4_t constA32x4 = vdupq_n_s32(0x1fffff); int32x4_t constB32x4 = vdupq_n_s32(0x200000); int16_t tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset, inst->magnLen); // Guarantee a Q-domain as high as possible and still fit in int16 inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(kExp2Const, tmp16, 21); int32x4_t qNoise32x4 = vdupq_n_s32(inst->qNoise); for (ptr_noiseEstLogQuantile = &inst->noiseEstLogQuantile[offset], ptr_noiseEstQuantile = &inst->noiseEstQuantile[0]; ptr_noiseEstQuantile < &inst->noiseEstQuantile[inst->magnLen - 3]; ptr_noiseEstQuantile += 4, ptr_noiseEstLogQuantile += 4) { // tmp32no2 = kExp2Const * inst->noiseEstLogQuantile[offset + i]; int16x4_t v16x4 = vld1_s16(ptr_noiseEstLogQuantile); int32x4_t v32x4B = vmull_s16(v16x4, kExp2Const16x4); // tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac int32x4_t v32x4A = vandq_s32(v32x4B, constA32x4); v32x4A = vorrq_s32(v32x4A, constB32x4); // tmp16 = (int16_t)(tmp32no2 >> 21); v32x4B = vshrq_n_s32(v32x4B, 21); // tmp16 -= 21;// shift 21 to get result in Q0 v32x4B = vsubq_s32(v32x4B, twentyOne32x4); // tmp16 += (int16_t) inst->qNoise; // shift to get result in Q(qNoise) v32x4B = vaddq_s32(v32x4B, qNoise32x4); // if (tmp16 < 0) { // tmp32no1 >>= -tmp16; // } else { // tmp32no1 <<= tmp16; // } v32x4B = vshlq_s32(v32x4A, v32x4B); // tmp16 = WebRtcSpl_SatW32ToW16(tmp32no1); v16x4 = vqmovn_s32(v32x4B); //inst->noiseEstQuantile[i] = tmp16; vst1_s16(ptr_noiseEstQuantile, v16x4); } // Last iteration: // inst->quantile[i]=exp(inst->lquantile[offset+i]); // in Q21 int32_t tmp32no2 = kExp2Const * *ptr_noiseEstLogQuantile; int32_t tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac tmp16 = (int16_t)(tmp32no2 >> 21); tmp16 -= 21;// shift 21 to get result in Q0 tmp16 += (int16_t) inst->qNoise; //shift to get result in Q(qNoise) if (tmp16 < 0) { tmp32no1 >>= -tmp16; } else {
void silk_biquad_alt_stride2_neon( const opus_int16 *in, /* I input signal */ const opus_int32 *B_Q28, /* I MA coefficients [3] */ const opus_int32 *A_Q28, /* I AR coefficients [2] */ opus_int32 *S, /* I/O State vector [4] */ opus_int16 *out, /* O output signal */ const opus_int32 len /* I signal length (must be even) */ ) { /* DIRECT FORM II TRANSPOSED (uses 2 element state vector) */ opus_int k = 0; const int32x2_t offset_s32x2 = vdup_n_s32( (1<<14) - 1 ); const int32x4_t offset_s32x4 = vcombine_s32( offset_s32x2, offset_s32x2 ); int16x4_t in_s16x4 = vdup_n_s16( 0 ); int16x4_t out_s16x4; int32x2_t A_Q28_s32x2, A_L_s32x2, A_U_s32x2, B_Q28_s32x2, t_s32x2; int32x4_t A_L_s32x4, A_U_s32x4, B_Q28_s32x4, S_s32x4, out32_Q14_s32x4; int32x2x2_t t0_s32x2x2, t1_s32x2x2, t2_s32x2x2, S_s32x2x2; #ifdef OPUS_CHECK_ASM opus_int32 S_c[ 4 ]; VARDECL( opus_int16, out_c ); SAVE_STACK; ALLOC( out_c, 2 * len, opus_int16 ); silk_memcpy( &S_c, S, sizeof( S_c ) ); silk_biquad_alt_stride2_c( in, B_Q28, A_Q28, S_c, out_c, len ); #endif /* Negate A_Q28 values and split in two parts */ A_Q28_s32x2 = vld1_s32( A_Q28 ); A_Q28_s32x2 = vneg_s32( A_Q28_s32x2 ); A_L_s32x2 = vshl_n_s32( A_Q28_s32x2, 18 ); /* ( -A_Q28[] & 0x00003FFF ) << 18 */ A_L_s32x2 = vreinterpret_s32_u32( vshr_n_u32( vreinterpret_u32_s32( A_L_s32x2 ), 3 ) ); /* ( -A_Q28[] & 0x00003FFF ) << 15 */ A_U_s32x2 = vshr_n_s32( A_Q28_s32x2, 14 ); /* silk_RSHIFT( -A_Q28[], 14 ) */ A_U_s32x2 = vshl_n_s32( A_U_s32x2, 16 ); /* silk_RSHIFT( -A_Q28[], 14 ) << 16 (Clip two leading bits to conform to C function.) */ A_U_s32x2 = vshr_n_s32( A_U_s32x2, 1 ); /* silk_RSHIFT( -A_Q28[], 14 ) << 15 */ B_Q28_s32x2 = vld1_s32( B_Q28 ); t_s32x2 = vld1_s32( B_Q28 + 1 ); t0_s32x2x2 = vzip_s32( A_L_s32x2, A_L_s32x2 ); t1_s32x2x2 = vzip_s32( A_U_s32x2, A_U_s32x2 ); t2_s32x2x2 = vzip_s32( t_s32x2, t_s32x2 ); A_L_s32x4 = vcombine_s32( t0_s32x2x2.val[ 0 ], t0_s32x2x2.val[ 1 ] ); /* A{0,0,1,1}_L_Q28 */ A_U_s32x4 = vcombine_s32( t1_s32x2x2.val[ 0 ], t1_s32x2x2.val[ 1 ] ); /* A{0,0,1,1}_U_Q28 */ B_Q28_s32x4 = vcombine_s32( t2_s32x2x2.val[ 0 ], t2_s32x2x2.val[ 1 ] ); /* B_Q28[ {1,1,2,2} ] */ S_s32x4 = vld1q_s32( S ); /* S0 = S[ 0 ]; S3 = S[ 3 ]; */ S_s32x2x2 = vtrn_s32( vget_low_s32( S_s32x4 ), vget_high_s32( S_s32x4 ) ); /* S2 = S[ 1 ]; S1 = S[ 2 ]; */ S_s32x4 = vcombine_s32( S_s32x2x2.val[ 0 ], S_s32x2x2.val[ 1 ] ); for( ; k < len - 1; k += 2 ) { int32x4_t in_s32x4[ 2 ], t_s32x4; int32x2_t out32_Q14_s32x2[ 2 ]; /* S[ 2 * i + 0 ], S[ 2 * i + 1 ], S[ 2 * i + 2 ], S[ 2 * i + 3 ]: Q12 */ in_s16x4 = vld1_s16( &in[ 2 * k ] ); /* in{0,1,2,3} = in[ 2 * k + {0,1,2,3} ]; */ in_s32x4[ 0 ] = vshll_n_s16( in_s16x4, 15 ); /* in{0,1,2,3} << 15 */ t_s32x4 = vqdmulhq_lane_s32( in_s32x4[ 0 ], B_Q28_s32x2, 0 ); /* silk_SMULWB( B_Q28[ 0 ], in{0,1,2,3} ) */ in_s32x4[ 1 ] = vcombine_s32( vget_high_s32( in_s32x4[ 0 ] ), vget_high_s32( in_s32x4[ 0 ] ) ); /* in{2,3,2,3} << 15 */ in_s32x4[ 0 ] = vcombine_s32( vget_low_s32 ( in_s32x4[ 0 ] ), vget_low_s32 ( in_s32x4[ 0 ] ) ); /* in{0,1,0,1} << 15 */ silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_low_s32 ( t_s32x4 ), in_s32x4[ 0 ], &S_s32x4, &out32_Q14_s32x2[ 0 ] ); silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_high_s32( t_s32x4 ), in_s32x4[ 1 ], &S_s32x4, &out32_Q14_s32x2[ 1 ] ); /* Scale back to Q0 and saturate */ out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2[ 0 ], out32_Q14_s32x2[ 1 ] ); /* out32_Q14_{0,1,2,3} */ out32_Q14_s32x4 = vaddq_s32( out32_Q14_s32x4, offset_s32x4 ); /* out32_Q14_{0,1,2,3} + (1<<14) - 1 */ out_s16x4 = vqshrn_n_s32( out32_Q14_s32x4, 14 ); /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ) */ vst1_s16( &out[ 2 * k ], out_s16x4 ); /* out[ 2 * k + {0,1,2,3} ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ); */ } /* Process leftover. */ if( k < len ) { int32x4_t in_s32x4; int32x2_t out32_Q14_s32x2; /* S[ 2 * i + 0 ], S[ 2 * i + 1 ]: Q12 */ in_s16x4 = vld1_lane_s16( &in[ 2 * k + 0 ], in_s16x4, 0 ); /* in{0,1} = in[ 2 * k + {0,1} ]; */ in_s16x4 = vld1_lane_s16( &in[ 2 * k + 1 ], in_s16x4, 1 ); /* in{0,1} = in[ 2 * k + {0,1} ]; */ in_s32x4 = vshll_n_s16( in_s16x4, 15 ); /* in{0,1} << 15 */ t_s32x2 = vqdmulh_lane_s32( vget_low_s32( in_s32x4 ), B_Q28_s32x2, 0 ); /* silk_SMULWB( B_Q28[ 0 ], in{0,1} ) */ in_s32x4 = vcombine_s32( vget_low_s32( in_s32x4 ), vget_low_s32( in_s32x4 ) ); /* in{0,1,0,1} << 15 */ silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, t_s32x2, in_s32x4, &S_s32x4, &out32_Q14_s32x2 ); /* Scale back to Q0 and saturate */ out32_Q14_s32x2 = vadd_s32( out32_Q14_s32x2, offset_s32x2 ); /* out32_Q14_{0,1} + (1<<14) - 1 */ out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2, out32_Q14_s32x2 ); /* out32_Q14_{0,1,0,1} + (1<<14) - 1 */ out_s16x4 = vqshrn_n_s32( out32_Q14_s32x4, 14 ); /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,0,1} + (1<<14) - 1, 14 ) ) */ vst1_lane_s16( &out[ 2 * k + 0 ], out_s16x4, 0 ); /* out[ 2 * k + 0 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_0 + (1<<14) - 1, 14 ) ); */ vst1_lane_s16( &out[ 2 * k + 1 ], out_s16x4, 1 ); /* out[ 2 * k + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_1 + (1<<14) - 1, 14 ) ); */ } vst1q_lane_s32( &S[ 0 ], S_s32x4, 0 ); /* S[ 0 ] = S0; */ vst1q_lane_s32( &S[ 1 ], S_s32x4, 2 ); /* S[ 1 ] = S2; */ vst1q_lane_s32( &S[ 2 ], S_s32x4, 1 ); /* S[ 2 ] = S1; */ vst1q_lane_s32( &S[ 3 ], S_s32x4, 3 ); /* S[ 3 ] = S3; */ #ifdef OPUS_CHECK_ASM silk_assert( !memcmp( S_c, S, sizeof( S_c ) ) ); silk_assert( !memcmp( out_c, out, 2 * len * sizeof( opus_int16 ) ) ); RESTORE_STACK; #endif }
inline void vst1(s16 * ptr, const int16x4_t & v) { return vst1_s16(ptr, v); }
static inline void PostShiftAndSeparateNeon(int16_t* inre, int16_t* inim, int16_t* outre, int16_t* outim, int32_t sh) { int k; int16_t* inre1 = inre; int16_t* inre2 = &inre[FRAMESAMPLES/2 - 4]; int16_t* inim1 = inim; int16_t* inim2 = &inim[FRAMESAMPLES/2 - 4]; int16_t* outre1 = outre; int16_t* outre2 = &outre[FRAMESAMPLES/2 - 4]; int16_t* outim1 = outim; int16_t* outim2 = &outim[FRAMESAMPLES/2 - 4]; const int16_t* kSinTab1 = &WebRtcIsacfix_kSinTab2[0]; const int16_t* kSinTab2 = &WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4 -4]; // By vshl, we effectively did "<< (-sh - 23)", instead of "<< (-sh)", // ">> 14" and then ">> 9" as in the C code. int32x4_t shift = vdupq_n_s32(-sh - 23); for (k = 0; k < FRAMESAMPLES/4; k += 4) { int16x4_t tmpi = vld1_s16(kSinTab1); kSinTab1 += 4; int16x4_t tmpr = vld1_s16(kSinTab2); kSinTab2 -= 4; int16x4_t inre_0 = vld1_s16(inre1); inre1 += 4; int16x4_t inre_1 = vld1_s16(inre2); inre2 -= 4; int16x4_t inim_0 = vld1_s16(inim1); inim1 += 4; int16x4_t inim_1 = vld1_s16(inim2); inim2 -= 4; tmpr = vneg_s16(tmpr); inre_1 = vrev64_s16(inre_1); inim_1 = vrev64_s16(inim_1); tmpr = vrev64_s16(tmpr); int16x4_t xr = vqadd_s16(inre_0, inre_1); int16x4_t xi = vqsub_s16(inim_0, inim_1); int16x4_t yr = vqadd_s16(inim_0, inim_1); int16x4_t yi = vqsub_s16(inre_1, inre_0); int32x4_t outr0 = vmull_s16(tmpr, xr); int32x4_t outi0 = vmull_s16(tmpi, xr); int32x4_t outr1 = vmull_s16(tmpi, yr); int32x4_t outi1 = vmull_s16(tmpi, yi); outr0 = vmlsl_s16(outr0, tmpi, xi); outi0 = vmlal_s16(outi0, tmpr, xi); outr1 = vmlal_s16(outr1, tmpr, yi); outi1 = vmlsl_s16(outi1, tmpr, yr); outr0 = vshlq_s32(outr0, shift); outi0 = vshlq_s32(outi0, shift); outr1 = vshlq_s32(outr1, shift); outi1 = vshlq_s32(outi1, shift); outr1 = vnegq_s32(outr1); int16x4_t outre_0 = vmovn_s32(outr0); int16x4_t outim_0 = vmovn_s32(outi0); int16x4_t outre_1 = vmovn_s32(outr1); int16x4_t outim_1 = vmovn_s32(outi1); outre_1 = vrev64_s16(outre_1); outim_1 = vrev64_s16(outim_1); vst1_s16(outre1, outre_0); outre1 += 4; vst1_s16(outim1, outim_0); outim1 += 4; vst1_s16(outre2, outre_1); outre2 -= 4; vst1_s16(outim2, outim_1); outim2 -= 4; } }