/* Exercises the vreinterpret_s16_s32 intrinsic: a 2 x 32-bit signed vector
   is reinterpreted as a 4 x 16-bit signed vector.  The result is not used.

   NOTE(review): the source operand is read without ever being initialised;
   this looks like a compile-only check where the value does not matter --
   confirm before running it for its result.  */
void test_vreinterprets16_s32 (void)
{
  int32x2_t src_s32;
  int16x4_t dst_s16 = vreinterpret_s16_s32 (src_s32);
}
void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride) { int i; uint32x2_t d6u32 = vdup_n_u32(0); uint8x8_t d1u8; int16x4_t d2, d3, d4, d5, d10, d11, d12, d13; uint16x8_t q1u16; int16x8_t q1s16, q2s16, q3s16, q4s16; int32x2x2_t v2tmp0, v2tmp1; int16x4x2_t v2tmp2, v2tmp3; d2 = vld1_s16(input); d3 = vld1_s16(input + 4); d4 = vld1_s16(input + 8); d5 = vld1_s16(input + 12); // 1st for loop q1s16 = vcombine_s16(d2, d4); // Swap d3 d4 here q2s16 = vcombine_s16(d3, d5); q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2); q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1); d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 q3s16 = vshrq_n_s16(q3s16, 1); q4s16 = vshrq_n_s16(q4s16, 1); q3s16 = vqaddq_s16(q3s16, q2s16); q4s16 = vqaddq_s16(q4s16, q2s16); d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1 d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1 d2 = vqadd_s16(d12, d11); d3 = vqadd_s16(d13, d10); d4 = vqsub_s16(d13, d10); d5 = vqsub_s16(d12, d11); v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]), vreinterpret_s16_s32(v2tmp1.val[0])); v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]), vreinterpret_s16_s32(v2tmp1.val[1])); // 2nd for loop q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]); q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]); q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2); q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1); d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 q3s16 = vshrq_n_s16(q3s16, 1); q4s16 = vshrq_n_s16(q4s16, 1); q3s16 = vqaddq_s16(q3s16, q2s16); q4s16 = vqaddq_s16(q4s16, q2s16); d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); 
// c1 d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1 d2 = vqadd_s16(d12, d11); d3 = vqadd_s16(d13, d10); d4 = vqsub_s16(d13, d10); d5 = vqsub_s16(d12, d11); d2 = vrshr_n_s16(d2, 3); d3 = vrshr_n_s16(d3, 3); d4 = vrshr_n_s16(d4, 3); d5 = vrshr_n_s16(d5, 3); v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]), vreinterpret_s16_s32(v2tmp1.val[0])); v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]), vreinterpret_s16_s32(v2tmp1.val[1])); q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]); q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]); // dc_only_idct_add for (i = 0; i < 2; i++, q1s16 = q2s16) { d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0); pred_ptr += pred_stride; d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1); pred_ptr += pred_stride; q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16), vreinterpret_u8_u32(d6u32)); d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0); dst_ptr += dst_stride; vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1); dst_ptr += dst_stride; } return; }
void WebRtcIsacfix_AllpassFilter2FixDec16Neon( int16_t* data_ch1, // Input and output in channel 1, in Q0 int16_t* data_ch2, // Input and output in channel 2, in Q0 const int16_t* factor_ch1, // Scaling factor for channel 1, in Q15 const int16_t* factor_ch2, // Scaling factor for channel 2, in Q15 const int length, // Length of the data buffers int32_t* filter_state_ch1, // Filter state for channel 1, in Q16 int32_t* filter_state_ch2) { // Filter state for channel 2, in Q16 assert(length % 2 == 0); int n = 0; int16x4_t factorv; int16x4_t datav; int32x4_t statev; int32x2_t tmp; // Load factor_ch1 and factor_ch2. tmp = vld1_dup_s32((int32_t*)factor_ch1); tmp = vld1_lane_s32((int32_t*)factor_ch2, tmp, 1); factorv = vreinterpret_s16_s32(tmp); // Load filter_state_ch1[0] and filter_state_ch2[0]. statev = vld1q_dup_s32(filter_state_ch1); statev = vld1q_lane_s32(filter_state_ch2, statev, 2); // Loop unrolling preprocessing. int32x4_t a; int16x4_t tmp1, tmp2; // Load data_ch1[0] and data_ch2[0]. datav = vld1_dup_s16(data_ch1); datav = vld1_lane_s16(data_ch2, datav, 2); a = vqdmlal_s16(statev, datav, factorv); tmp1 = vshrn_n_s32(a, 16); // Update filter_state_ch1[0] and filter_state_ch2[0]. statev = vqdmlsl_s16(vshll_n_s16(datav, 16), tmp1, factorv); // Load filter_state_ch1[1] and filter_state_ch2[1]. statev = vld1q_lane_s32(filter_state_ch1 + 1, statev, 1); statev = vld1q_lane_s32(filter_state_ch2 + 1, statev, 3); // Load data_ch1[1] and data_ch2[1]. tmp1 = vld1_lane_s16(data_ch1 + 1, tmp1, 1); tmp1 = vld1_lane_s16(data_ch2 + 1, tmp1, 3); datav = vrev32_s16(tmp1); // Loop unrolling processing. for (n = 0; n < length - 2; n += 2) { a = vqdmlal_s16(statev, datav, factorv); tmp1 = vshrn_n_s32(a, 16); // Store data_ch1[n] and data_ch2[n]. vst1_lane_s16(data_ch1 + n, tmp1, 1); vst1_lane_s16(data_ch2 + n, tmp1, 3); // Update filter_state_ch1[0], filter_state_ch1[1] // and filter_state_ch2[0], filter_state_ch2[1]. 
statev = vqdmlsl_s16(vshll_n_s16(datav, 16), tmp1, factorv); // Load data_ch1[n + 2] and data_ch2[n + 2]. tmp1 = vld1_lane_s16(data_ch1 + n + 2, tmp1, 1); tmp1 = vld1_lane_s16(data_ch2 + n + 2, tmp1, 3); datav = vrev32_s16(tmp1); a = vqdmlal_s16(statev, datav, factorv); tmp2 = vshrn_n_s32(a, 16); // Store data_ch1[n + 1] and data_ch2[n + 1]. vst1_lane_s16(data_ch1 + n + 1, tmp2, 1); vst1_lane_s16(data_ch2 + n + 1, tmp2, 3); // Update filter_state_ch1[0], filter_state_ch1[1] // and filter_state_ch2[0], filter_state_ch2[1]. statev = vqdmlsl_s16(vshll_n_s16(datav, 16), tmp2, factorv); // Load data_ch1[n + 3] and data_ch2[n + 3]. tmp2 = vld1_lane_s16(data_ch1 + n + 3, tmp2, 1); tmp2 = vld1_lane_s16(data_ch2 + n + 3, tmp2, 3); datav = vrev32_s16(tmp2); } // Loop unrolling post-processing. a = vqdmlal_s16(statev, datav, factorv); tmp1 = vshrn_n_s32(a, 16); // Store data_ch1[n] and data_ch2[n]. vst1_lane_s16(data_ch1 + n, tmp1, 1); vst1_lane_s16(data_ch2 + n, tmp1, 3); // Update filter_state_ch1[0], filter_state_ch1[1] // and filter_state_ch2[0], filter_state_ch2[1]. statev = vqdmlsl_s16(vshll_n_s16(datav, 16), tmp1, factorv); // Store filter_state_ch1[0] and filter_state_ch2[0]. vst1q_lane_s32(filter_state_ch1, statev, 0); vst1q_lane_s32(filter_state_ch2, statev, 2); datav = vrev32_s16(tmp1); a = vqdmlal_s16(statev, datav, factorv); tmp2 = vshrn_n_s32(a, 16); // Store data_ch1[n + 1] and data_ch2[n + 1]. vst1_lane_s16(data_ch1 + n + 1, tmp2, 1); vst1_lane_s16(data_ch2 + n + 1, tmp2, 3); // Update filter_state_ch1[1] and filter_state_ch2[1]. statev = vqdmlsl_s16(vshll_n_s16(datav, 16), tmp2, factorv); // Store filter_state_ch1[1] and filter_state_ch2[1]. vst1q_lane_s32(filter_state_ch1 + 1, statev, 1); vst1q_lane_s32(filter_state_ch2 + 1, statev, 3); }
void vp8_short_fdct4x4_neon( int16_t *input, int16_t *output, int pitch) { int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; int16x4_t d16s16, d17s16, d26s16, dEmptys16; uint16x4_t d4u16; int16x8_t q0s16, q1s16; int32x4_t q9s32, q10s32, q11s32, q12s32; int16x4x2_t v2tmp0, v2tmp1; int32x2x2_t v2tmp2, v2tmp3; d16s16 = vdup_n_s16(5352); d17s16 = vdup_n_s16(2217); q9s32 = vdupq_n_s32(14500); q10s32 = vdupq_n_s32(7500); q11s32 = vdupq_n_s32(12000); q12s32 = vdupq_n_s32(51000); // Part one pitch >>= 1; d0s16 = vld1_s16(input); input += pitch; d1s16 = vld1_s16(input); input += pitch; d2s16 = vld1_s16(input); input += pitch; d3s16 = vld1_s16(input); v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16), vreinterpret_s32_s16(d2s16)); v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16), vreinterpret_s32_s16(d3s16)); v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), // d0 vreinterpret_s16_s32(v2tmp3.val[0])); // d1 v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), // d2 vreinterpret_s16_s32(v2tmp3.val[1])); // d3 d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]); d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]); d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]); d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]); d4s16 = vshl_n_s16(d4s16, 3); d5s16 = vshl_n_s16(d5s16, 3); d6s16 = vshl_n_s16(d6s16, 3); d7s16 = vshl_n_s16(d7s16, 3); d0s16 = vadd_s16(d4s16, d5s16); d2s16 = vsub_s16(d4s16, d5s16); q9s32 = vmlal_s16(q9s32, d7s16, d16s16); q10s32 = vmlal_s16(q10s32, d7s16, d17s16); q9s32 = vmlal_s16(q9s32, d6s16, d17s16); q10s32 = vmlsl_s16(q10s32, d6s16, d16s16); d1s16 = vshrn_n_s32(q9s32, 12); d3s16 = vshrn_n_s32(q10s32, 12); // Part two v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16), vreinterpret_s32_s16(d2s16)); v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16), vreinterpret_s32_s16(d3s16)); v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), // d0 vreinterpret_s16_s32(v2tmp3.val[0])); // d1 v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), // d2 
vreinterpret_s16_s32(v2tmp3.val[1])); // d3 d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]); d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]); d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]); d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]); d26s16 = vdup_n_s16(7); d4s16 = vadd_s16(d4s16, d26s16); d0s16 = vadd_s16(d4s16, d5s16); d2s16 = vsub_s16(d4s16, d5s16); q11s32 = vmlal_s16(q11s32, d7s16, d16s16); q12s32 = vmlal_s16(q12s32, d7s16, d17s16); dEmptys16 = vdup_n_s16(0); d4u16 = vceq_s16(d7s16, dEmptys16); d0s16 = vshr_n_s16(d0s16, 4); d2s16 = vshr_n_s16(d2s16, 4); q11s32 = vmlal_s16(q11s32, d6s16, d17s16); q12s32 = vmlsl_s16(q12s32, d6s16, d16s16); d4u16 = vmvn_u16(d4u16); d1s16 = vshrn_n_s32(q11s32, 16); d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d4u16)); d3s16 = vshrn_n_s32(q12s32, 16); q0s16 = vcombine_s16(d0s16, d1s16); q1s16 = vcombine_s16(d2s16, d3s16); vst1q_s16(output, q0s16); vst1q_s16(output + 8, q1s16); return; }