void test_vtrnQu32 (void) { uint32x4x2_t out_uint32x4x2_t; uint32x4_t arg0_uint32x4_t; uint32x4_t arg1_uint32x4_t; out_uint32x4x2_t = vtrnq_u32 (arg0_uint32x4_t, arg1_uint32x4_t); }
uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) { // CHECK-LABEL: test_vtrnq_u32 return vtrnq_u32(a, b); // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s }
void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, // unused int y_step_q4, // unused int w, int h) { int width; const uint8_t *s, *psrc; uint8_t *d, *pdst; uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8; uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32; uint8x16_t q12u8, q13u8, q14u8, q15u8; int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16; int16x4_t d24s16, d25s16, d26s16, d27s16; uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; int16x8_t q0s16; uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; int32x4_t q1s32, q2s32, q14s32, q15s32; uint16x8x2_t q0x2u16; uint8x8x2_t d0x2u8, d1x2u8; uint32x2x2_t d0x2u32; uint16x4x2_t d0x2u16, d1x2u16; uint32x4x2_t q0x2u32; assert(x_step_q4 == 16); (void)x_step_q4; (void)y_step_q4; (void)filter_y; q0s16 = vld1q_s16(filter_x); src -= 3; // adjust for taps for (; h > 0; h -= 4, src += src_stride * 4, dst += dst_stride * 4) { // loop_horiz_v s = src; d24u8 = vld1_u8(s); s += src_stride; d25u8 = vld1_u8(s); s += src_stride; d26u8 = vld1_u8(s); s += src_stride; d27u8 = vld1_u8(s); q12u8 = vcombine_u8(d24u8, d25u8); q13u8 = vcombine_u8(d26u8, d27u8); q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8)); d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1])); d0x2u8 = vtrn_u8(d24u8, d25u8); d1x2u8 = vtrn_u8(d26u8, d27u8); __builtin_prefetch(src + src_stride * 4); __builtin_prefetch(src + src_stride * 5); __builtin_prefetch(src + src_stride * 6); q8u16 = vmovl_u8(d0x2u8.val[0]); q9u16 = vmovl_u8(d0x2u8.val[1]); q10u16 = vmovl_u8(d1x2u8.val[0]); q11u16 = vmovl_u8(d1x2u8.val[1]); d16u16 = vget_low_u16(q8u16); d17u16 = vget_high_u16(q8u16); d18u16 = vget_low_u16(q9u16); d19u16 = vget_high_u16(q9u16); q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18 q9u16 = vcombine_u16(d17u16, d19u16); d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 for (width = w, psrc = src + 7, pdst = dst; width > 0; width -= 4, psrc += 4, pdst += 4) { // loop_horiz s = psrc; d28u32 = vld1_dup_u32((const uint32_t *)s); s += src_stride; d29u32 = vld1_dup_u32((const uint32_t *)s); s += src_stride; d31u32 = vld1_dup_u32((const uint32_t *)s); s += src_stride; d30u32 = vld1_dup_u32((const uint32_t *)s); __builtin_prefetch(psrc + 64); d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32)); d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32)); d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 vreinterpret_u8_u16(d1x2u16.val[0])); // d29 d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 vreinterpret_u8_u16(d1x2u16.val[1])); // d30 __builtin_prefetch(psrc + 64 + src_stride); q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8)); d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); q12u16 = vmovl_u8(d28u8); q13u16 = vmovl_u8(d29u8); __builtin_prefetch(psrc + 64 + src_stride * 2); d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16, d23s16, d24s16, q0s16); q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16, d24s16, d26s16, q0s16); q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16, d26s16, d27s16, q0s16); q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16, d27s16, d25s16, q0s16); __builtin_prefetch(psrc + 60 + src_stride * 3); d2u16 = vqrshrun_n_s32(q1s32, 7); d3u16 = vqrshrun_n_s32(q2s32, 7); d4u16 = vqrshrun_n_s32(q14s32, 7); d5u16 = vqrshrun_n_s32(q15s32, 7); q1u16 = vcombine_u16(d2u16, d3u16); q2u16 = vcombine_u16(d4u16, d5u16); d2u8 = vqmovn_u16(q1u16); d3u8 = vqmovn_u16(q2u16); d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8)); d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), vreinterpret_u32_u16(d0x2u16.val[1])); d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), vreinterpret_u8_u32(d0x2u32.val[1])); d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]); d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]); d = pdst; vst1_lane_u32((uint32_t *)d, d2u32, 0); d += dst_stride; vst1_lane_u32((uint32_t *)d, d3u32, 0); d += dst_stride; vst1_lane_u32((uint32_t *)d, d2u32, 1); d += dst_stride; vst1_lane_u32((uint32_t *)d, d3u32, 1); q8u16 = q9u16; d20s16 = d23s16; q11u16 = q12u16; q9u16 = q13u16; d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); } } return; }
void vp8_mbloop_filter_vertical_edge_uv_neon( unsigned char *u, int pitch, unsigned char blimit, unsigned char limit, unsigned char thresh, unsigned char *v) { unsigned char *us, *ud; unsigned char *vs, *vd; uint8x16_t qblimit, qlimit, qthresh, q3, q4; uint8x16_t q5, q6, q7, q8, q9, q10; uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; uint8x8_t d15, d16, d17, d18, d19, d20, d21; uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; qblimit = vdupq_n_u8(blimit); qlimit = vdupq_n_u8(limit); qthresh = vdupq_n_u8(thresh); us = u - 4; vs = v - 4; d6 = vld1_u8(us); us += pitch; d7 = vld1_u8(vs); vs += pitch; d8 = vld1_u8(us); us += pitch; d9 = vld1_u8(vs); vs += pitch; d10 = vld1_u8(us); us += pitch; d11 = vld1_u8(vs); vs += pitch; d12 = vld1_u8(us); us += pitch; d13 = vld1_u8(vs); vs += pitch; d14 = vld1_u8(us); us += pitch; d15 = vld1_u8(vs); vs += pitch; d16 = vld1_u8(us); us += pitch; d17 = vld1_u8(vs); vs += pitch; d18 = vld1_u8(us); us += pitch; d19 = vld1_u8(vs); vs += pitch; d20 = vld1_u8(us); d21 = vld1_u8(vs); q3 = vcombine_u8(d6, d7); q4 = vcombine_u8(d8, d9); q5 = vcombine_u8(d10, d11); q6 = vcombine_u8(d12, d13); q7 = vcombine_u8(d14, d15); q8 = vcombine_u8(d16, d17); q9 = vcombine_u8(d18, d19); q10 = vcombine_u8(d20, d21); q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), vreinterpretq_u16_u32(q2tmp2.val[0])); q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), vreinterpretq_u16_u32(q2tmp3.val[0])); q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), vreinterpretq_u16_u32(q2tmp2.val[1])); q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), vreinterpretq_u16_u32(q2tmp3.val[1])); q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), vreinterpretq_u8_u16(q2tmp5.val[0])); q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), vreinterpretq_u8_u16(q2tmp5.val[1])); q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), vreinterpretq_u8_u16(q2tmp7.val[0])); q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), vreinterpretq_u8_u16(q2tmp7.val[1])); q3 = q2tmp8.val[0]; q4 = q2tmp8.val[1]; q5 = q2tmp9.val[0]; q6 = q2tmp9.val[1]; q7 = q2tmp10.val[0]; q8 = q2tmp10.val[1]; q9 = q2tmp11.val[0]; q10 = q2tmp11.val[1]; vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9, q10, &q4, &q5, &q6, &q7, &q8, &q9); q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), vreinterpretq_u16_u32(q2tmp2.val[0])); q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), vreinterpretq_u16_u32(q2tmp3.val[0])); q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), vreinterpretq_u16_u32(q2tmp2.val[1])); q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), vreinterpretq_u16_u32(q2tmp3.val[1])); q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), vreinterpretq_u8_u16(q2tmp5.val[0])); q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), vreinterpretq_u8_u16(q2tmp5.val[1])); q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), vreinterpretq_u8_u16(q2tmp7.val[0])); q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), vreinterpretq_u8_u16(q2tmp7.val[1])); q3 = q2tmp8.val[0]; q4 = q2tmp8.val[1]; q5 = q2tmp9.val[0]; q6 = q2tmp9.val[1]; q7 = q2tmp10.val[0]; q8 = q2tmp10.val[1]; q9 = q2tmp11.val[0]; q10 = q2tmp11.val[1]; ud = u - 4; vst1_u8(ud, vget_low_u8(q3)); ud += pitch; vst1_u8(ud, vget_low_u8(q4)); ud += pitch; vst1_u8(ud, vget_low_u8(q5)); ud += pitch; vst1_u8(ud, vget_low_u8(q6)); ud += pitch; vst1_u8(ud, vget_low_u8(q7)); ud += pitch; vst1_u8(ud, vget_low_u8(q8)); ud += pitch; vst1_u8(ud, vget_low_u8(q9)); ud += pitch; vst1_u8(ud, vget_low_u8(q10)); vd = v - 4; vst1_u8(vd, vget_high_u8(q3)); vd += pitch; vst1_u8(vd, vget_high_u8(q4)); vd += pitch; vst1_u8(vd, vget_high_u8(q5)); vd += pitch; vst1_u8(vd, vget_high_u8(q6)); vd += pitch; vst1_u8(vd, vget_high_u8(q7)); vd += pitch; vst1_u8(vd, vget_high_u8(q8)); vd += pitch; vst1_u8(vd, vget_high_u8(q9)); vd += pitch; vst1_u8(vd, vget_high_u8(q10)); return; }