void test_vtrns32 (void) { int32x2x2_t out_int32x2x2_t; int32x2_t arg0_int32x2_t; int32x2_t arg1_int32x2_t; out_int32x2x2_t = vtrn_s32 (arg0_int32x2_t, arg1_int32x2_t); }
void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride) { int i; uint32x2_t d6u32 = vdup_n_u32(0); uint8x8_t d1u8; int16x4_t d2, d3, d4, d5, d10, d11, d12, d13; uint16x8_t q1u16; int16x8_t q1s16, q2s16, q3s16, q4s16; int32x2x2_t v2tmp0, v2tmp1; int16x4x2_t v2tmp2, v2tmp3; d2 = vld1_s16(input); d3 = vld1_s16(input + 4); d4 = vld1_s16(input + 8); d5 = vld1_s16(input + 12); // 1st for loop q1s16 = vcombine_s16(d2, d4); // Swap d3 d4 here q2s16 = vcombine_s16(d3, d5); q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2); q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1); d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 q3s16 = vshrq_n_s16(q3s16, 1); q4s16 = vshrq_n_s16(q4s16, 1); q3s16 = vqaddq_s16(q3s16, q2s16); q4s16 = vqaddq_s16(q4s16, q2s16); d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1 d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1 d2 = vqadd_s16(d12, d11); d3 = vqadd_s16(d13, d10); d4 = vqsub_s16(d13, d10); d5 = vqsub_s16(d12, d11); v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]), vreinterpret_s16_s32(v2tmp1.val[0])); v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]), vreinterpret_s16_s32(v2tmp1.val[1])); // 2nd for loop q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]); q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]); q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2); q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1); d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 q3s16 = vshrq_n_s16(q3s16, 1); q4s16 = vshrq_n_s16(q4s16, 1); q3s16 = vqaddq_s16(q3s16, q2s16); q4s16 = vqaddq_s16(q4s16, q2s16); d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1 d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1 d2 = vqadd_s16(d12, d11); d3 = vqadd_s16(d13, d10); d4 = vqsub_s16(d13, d10); d5 = vqsub_s16(d12, d11); d2 = vrshr_n_s16(d2, 3); d3 = vrshr_n_s16(d3, 3); d4 = vrshr_n_s16(d4, 3); d5 = vrshr_n_s16(d5, 3); v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]), vreinterpret_s16_s32(v2tmp1.val[0])); v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]), vreinterpret_s16_s32(v2tmp1.val[1])); q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]); q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]); // dc_only_idct_add for (i = 0; i < 2; i++, q1s16 = q2s16) { d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0); pred_ptr += pred_stride; d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1); pred_ptr += pred_stride; q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16), vreinterpret_u8_u32(d6u32)); d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0); dst_ptr += dst_stride; vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1); dst_ptr += dst_stride; } return; }
int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) { // CHECK-LABEL: test_vtrn_s32 return vtrn_s32(a, b); // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}} // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}} }
void silk_biquad_alt_stride2_neon( const opus_int16 *in, /* I input signal */ const opus_int32 *B_Q28, /* I MA coefficients [3] */ const opus_int32 *A_Q28, /* I AR coefficients [2] */ opus_int32 *S, /* I/O State vector [4] */ opus_int16 *out, /* O output signal */ const opus_int32 len /* I signal length (must be even) */ ) { /* DIRECT FORM II TRANSPOSED (uses 2 element state vector) */ opus_int k = 0; const int32x2_t offset_s32x2 = vdup_n_s32( (1<<14) - 1 ); const int32x4_t offset_s32x4 = vcombine_s32( offset_s32x2, offset_s32x2 ); int16x4_t in_s16x4 = vdup_n_s16( 0 ); int16x4_t out_s16x4; int32x2_t A_Q28_s32x2, A_L_s32x2, A_U_s32x2, B_Q28_s32x2, t_s32x2; int32x4_t A_L_s32x4, A_U_s32x4, B_Q28_s32x4, S_s32x4, out32_Q14_s32x4; int32x2x2_t t0_s32x2x2, t1_s32x2x2, t2_s32x2x2, S_s32x2x2; #ifdef OPUS_CHECK_ASM opus_int32 S_c[ 4 ]; VARDECL( opus_int16, out_c ); SAVE_STACK; ALLOC( out_c, 2 * len, opus_int16 ); silk_memcpy( &S_c, S, sizeof( S_c ) ); silk_biquad_alt_stride2_c( in, B_Q28, A_Q28, S_c, out_c, len ); #endif /* Negate A_Q28 values and split in two parts */ A_Q28_s32x2 = vld1_s32( A_Q28 ); A_Q28_s32x2 = vneg_s32( A_Q28_s32x2 ); A_L_s32x2 = vshl_n_s32( A_Q28_s32x2, 18 ); /* ( -A_Q28[] & 0x00003FFF ) << 18 */ A_L_s32x2 = vreinterpret_s32_u32( vshr_n_u32( vreinterpret_u32_s32( A_L_s32x2 ), 3 ) ); /* ( -A_Q28[] & 0x00003FFF ) << 15 */ A_U_s32x2 = vshr_n_s32( A_Q28_s32x2, 14 ); /* silk_RSHIFT( -A_Q28[], 14 ) */ A_U_s32x2 = vshl_n_s32( A_U_s32x2, 16 ); /* silk_RSHIFT( -A_Q28[], 14 ) << 16 (Clip two leading bits to conform to C function.) */ A_U_s32x2 = vshr_n_s32( A_U_s32x2, 1 ); /* silk_RSHIFT( -A_Q28[], 14 ) << 15 */ B_Q28_s32x2 = vld1_s32( B_Q28 ); t_s32x2 = vld1_s32( B_Q28 + 1 ); t0_s32x2x2 = vzip_s32( A_L_s32x2, A_L_s32x2 ); t1_s32x2x2 = vzip_s32( A_U_s32x2, A_U_s32x2 ); t2_s32x2x2 = vzip_s32( t_s32x2, t_s32x2 ); A_L_s32x4 = vcombine_s32( t0_s32x2x2.val[ 0 ], t0_s32x2x2.val[ 1 ] ); /* A{0,0,1,1}_L_Q28 */ A_U_s32x4 = vcombine_s32( t1_s32x2x2.val[ 0 ], t1_s32x2x2.val[ 1 ] ); /* A{0,0,1,1}_U_Q28 */ B_Q28_s32x4 = vcombine_s32( t2_s32x2x2.val[ 0 ], t2_s32x2x2.val[ 1 ] ); /* B_Q28[ {1,1,2,2} ] */ S_s32x4 = vld1q_s32( S ); /* S0 = S[ 0 ]; S3 = S[ 3 ]; */ S_s32x2x2 = vtrn_s32( vget_low_s32( S_s32x4 ), vget_high_s32( S_s32x4 ) ); /* S2 = S[ 1 ]; S1 = S[ 2 ]; */ S_s32x4 = vcombine_s32( S_s32x2x2.val[ 0 ], S_s32x2x2.val[ 1 ] ); for( ; k < len - 1; k += 2 ) { int32x4_t in_s32x4[ 2 ], t_s32x4; int32x2_t out32_Q14_s32x2[ 2 ]; /* S[ 2 * i + 0 ], S[ 2 * i + 1 ], S[ 2 * i + 2 ], S[ 2 * i + 3 ]: Q12 */ in_s16x4 = vld1_s16( &in[ 2 * k ] ); /* in{0,1,2,3} = in[ 2 * k + {0,1,2,3} ]; */ in_s32x4[ 0 ] = vshll_n_s16( in_s16x4, 15 ); /* in{0,1,2,3} << 15 */ t_s32x4 = vqdmulhq_lane_s32( in_s32x4[ 0 ], B_Q28_s32x2, 0 ); /* silk_SMULWB( B_Q28[ 0 ], in{0,1,2,3} ) */ in_s32x4[ 1 ] = vcombine_s32( vget_high_s32( in_s32x4[ 0 ] ), vget_high_s32( in_s32x4[ 0 ] ) ); /* in{2,3,2,3} << 15 */ in_s32x4[ 0 ] = vcombine_s32( vget_low_s32 ( in_s32x4[ 0 ] ), vget_low_s32 ( in_s32x4[ 0 ] ) ); /* in{0,1,0,1} << 15 */ silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_low_s32 ( t_s32x4 ), in_s32x4[ 0 ], &S_s32x4, &out32_Q14_s32x2[ 0 ] ); silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_high_s32( t_s32x4 ), in_s32x4[ 1 ], &S_s32x4, &out32_Q14_s32x2[ 1 ] ); /* Scale back to Q0 and saturate */ out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2[ 0 ], out32_Q14_s32x2[ 1 ] ); /* out32_Q14_{0,1,2,3} */ out32_Q14_s32x4 = vaddq_s32( out32_Q14_s32x4, offset_s32x4 ); /* out32_Q14_{0,1,2,3} + (1<<14) - 1 */ out_s16x4 = vqshrn_n_s32( out32_Q14_s32x4, 14 ); /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ) */ vst1_s16( &out[ 2 * k ], out_s16x4 ); /* out[ 2 * k + {0,1,2,3} ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ); */ } /* Process leftover. */ if( k < len ) { int32x4_t in_s32x4; int32x2_t out32_Q14_s32x2; /* S[ 2 * i + 0 ], S[ 2 * i + 1 ]: Q12 */ in_s16x4 = vld1_lane_s16( &in[ 2 * k + 0 ], in_s16x4, 0 ); /* in{0,1} = in[ 2 * k + {0,1} ]; */ in_s16x4 = vld1_lane_s16( &in[ 2 * k + 1 ], in_s16x4, 1 ); /* in{0,1} = in[ 2 * k + {0,1} ]; */ in_s32x4 = vshll_n_s16( in_s16x4, 15 ); /* in{0,1} << 15 */ t_s32x2 = vqdmulh_lane_s32( vget_low_s32( in_s32x4 ), B_Q28_s32x2, 0 ); /* silk_SMULWB( B_Q28[ 0 ], in{0,1} ) */ in_s32x4 = vcombine_s32( vget_low_s32( in_s32x4 ), vget_low_s32( in_s32x4 ) ); /* in{0,1,0,1} << 15 */ silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, t_s32x2, in_s32x4, &S_s32x4, &out32_Q14_s32x2 ); /* Scale back to Q0 and saturate */ out32_Q14_s32x2 = vadd_s32( out32_Q14_s32x2, offset_s32x2 ); /* out32_Q14_{0,1} + (1<<14) - 1 */ out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2, out32_Q14_s32x2 ); /* out32_Q14_{0,1,0,1} + (1<<14) - 1 */ out_s16x4 = vqshrn_n_s32( out32_Q14_s32x4, 14 ); /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,0,1} + (1<<14) - 1, 14 ) ) */ vst1_lane_s16( &out[ 2 * k + 0 ], out_s16x4, 0 ); /* out[ 2 * k + 0 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_0 + (1<<14) - 1, 14 ) ); */ vst1_lane_s16( &out[ 2 * k + 1 ], out_s16x4, 1 ); /* out[ 2 * k + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_1 + (1<<14) - 1, 14 ) ); */ } vst1q_lane_s32( &S[ 0 ], S_s32x4, 0 ); /* S[ 0 ] = S0; */ vst1q_lane_s32( &S[ 1 ], S_s32x4, 2 ); /* S[ 1 ] = S2; */ vst1q_lane_s32( &S[ 2 ], S_s32x4, 1 ); /* S[ 2 ] = S1; */ vst1q_lane_s32( &S[ 3 ], S_s32x4, 3 ); /* S[ 3 ] = S3; */ #ifdef OPUS_CHECK_ASM silk_assert( !memcmp( S_c, S, sizeof( S_c ) ) ); silk_assert( !memcmp( out_c, out, 2 * len * sizeof( opus_int16 ) ) ); RESTORE_STACK; #endif }
void vp8_short_fdct4x4_neon( int16_t *input, int16_t *output, int pitch) { int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; int16x4_t d16s16, d17s16, d26s16, dEmptys16; uint16x4_t d4u16; int16x8_t q0s16, q1s16; int32x4_t q9s32, q10s32, q11s32, q12s32; int16x4x2_t v2tmp0, v2tmp1; int32x2x2_t v2tmp2, v2tmp3; d16s16 = vdup_n_s16(5352); d17s16 = vdup_n_s16(2217); q9s32 = vdupq_n_s32(14500); q10s32 = vdupq_n_s32(7500); q11s32 = vdupq_n_s32(12000); q12s32 = vdupq_n_s32(51000); // Part one pitch >>= 1; d0s16 = vld1_s16(input); input += pitch; d1s16 = vld1_s16(input); input += pitch; d2s16 = vld1_s16(input); input += pitch; d3s16 = vld1_s16(input); v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16), vreinterpret_s32_s16(d2s16)); v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16), vreinterpret_s32_s16(d3s16)); v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), // d0 vreinterpret_s16_s32(v2tmp3.val[0])); // d1 v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), // d2 vreinterpret_s16_s32(v2tmp3.val[1])); // d3 d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]); d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]); d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]); d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]); d4s16 = vshl_n_s16(d4s16, 3); d5s16 = vshl_n_s16(d5s16, 3); d6s16 = vshl_n_s16(d6s16, 3); d7s16 = vshl_n_s16(d7s16, 3); d0s16 = vadd_s16(d4s16, d5s16); d2s16 = vsub_s16(d4s16, d5s16); q9s32 = vmlal_s16(q9s32, d7s16, d16s16); q10s32 = vmlal_s16(q10s32, d7s16, d17s16); q9s32 = vmlal_s16(q9s32, d6s16, d17s16); q10s32 = vmlsl_s16(q10s32, d6s16, d16s16); d1s16 = vshrn_n_s32(q9s32, 12); d3s16 = vshrn_n_s32(q10s32, 12); // Part two v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16), vreinterpret_s32_s16(d2s16)); v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16), vreinterpret_s32_s16(d3s16)); v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), // d0 vreinterpret_s16_s32(v2tmp3.val[0])); // d1 v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), // d2 vreinterpret_s16_s32(v2tmp3.val[1])); // d3 d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]); d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]); d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]); d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]); d26s16 = vdup_n_s16(7); d4s16 = vadd_s16(d4s16, d26s16); d0s16 = vadd_s16(d4s16, d5s16); d2s16 = vsub_s16(d4s16, d5s16); q11s32 = vmlal_s16(q11s32, d7s16, d16s16); q12s32 = vmlal_s16(q12s32, d7s16, d17s16); dEmptys16 = vdup_n_s16(0); d4u16 = vceq_s16(d7s16, dEmptys16); d0s16 = vshr_n_s16(d0s16, 4); d2s16 = vshr_n_s16(d2s16, 4); q11s32 = vmlal_s16(q11s32, d6s16, d17s16); q12s32 = vmlsl_s16(q12s32, d6s16, d16s16); d4u16 = vmvn_u16(d4u16); d1s16 = vshrn_n_s32(q11s32, 16); d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d4u16)); d3s16 = vshrn_n_s32(q12s32, 16); q0s16 = vcombine_s16(d0s16, d1s16); q1s16 = vcombine_s16(d2s16, d3s16); vst1q_s16(output, q0s16); vst1q_s16(output + 8, q1s16); return; }