/*
 * Load eight rows of 8 bytes (rows `pitch` apart) and transpose the first
 * four columns: on return, x.val[c] holds column c of rows 0..7, with the
 * top row in lane 0.
 *
 * after vtrn_u32
 * 00 01 02 03 | 40 41 42 43
 * 10 11 12 13 | 50 51 52 53
 * 20 21 22 23 | 60 61 62 63
 * 30 31 32 33 | 70 71 72 73
 * ---
 * after vtrn_u16
 * 00 01 20 21 | 40 41 60 61
 * 02 03 22 23 | 42 43 62 63
 * 10 11 30 31 | 50 51 70 71
 * 12 13 32 33 | 52 52 72 73
 * ---
 * after vtrn_u8
 * 00 10 20 30 | 40 50 60 70
 * 01 11 21 31 | 41 51 61 71
 * 02 12 22 32 | 42 52 62 72
 * 03 13 23 33 | 43 53 63 73
 */
static INLINE uint8x8x4_t read_4x8(unsigned char *src, int pitch) {
  uint8x8x4_t out;

  const uint8x8_t row0 = vld1_u8(src);
  const uint8x8_t row1 = vld1_u8(src + pitch * 1);
  const uint8x8_t row2 = vld1_u8(src + pitch * 2);
  const uint8x8_t row3 = vld1_u8(src + pitch * 3);
  const uint8x8_t row4 = vld1_u8(src + pitch * 4);
  const uint8x8_t row5 = vld1_u8(src + pitch * 5);
  const uint8x8_t row6 = vld1_u8(src + pitch * 6);
  const uint8x8_t row7 = vld1_u8(src + pitch * 7);

  /* Pair rows four apart and transpose at 32-bit granularity. */
  const uint32x2x2_t t04 =
      vtrn_u32(vreinterpret_u32_u8(row0), vreinterpret_u32_u8(row4));
  const uint32x2x2_t t15 =
      vtrn_u32(vreinterpret_u32_u8(row1), vreinterpret_u32_u8(row5));
  const uint32x2x2_t t26 =
      vtrn_u32(vreinterpret_u32_u8(row2), vreinterpret_u32_u8(row6));
  const uint32x2x2_t t37 =
      vtrn_u32(vreinterpret_u32_u8(row3), vreinterpret_u32_u8(row7));

  /* Only the low halves (val[0]) are carried forward: those contain the
   * first four columns of every row, which is all this helper returns. */
  const uint16x4x2_t t02 = vtrn_u16(vreinterpret_u16_u32(t04.val[0]),
                                    vreinterpret_u16_u32(t26.val[0]));
  const uint16x4x2_t t13 = vtrn_u16(vreinterpret_u16_u32(t15.val[0]),
                                    vreinterpret_u16_u32(t37.val[0]));

  /* Final byte-level transpose produces the four column vectors. */
  const uint8x8x2_t col01 = vtrn_u8(vreinterpret_u8_u16(t02.val[0]),
                                    vreinterpret_u8_u16(t13.val[0]));
  const uint8x8x2_t col23 = vtrn_u8(vreinterpret_u8_u16(t02.val[1]),
                                    vreinterpret_u8_u16(t13.val[1]));

  out.val[0] = col01.val[0];
  out.val[1] = col01.val[1];
  out.val[2] = col23.val[0];
  out.val[3] = col23.val[1];
  return out;
}
/*
 * Transposing store: `result` holds two 8-pixel rows; after a byte
 * interleave, each 16-bit lane of the two interleaved vectors is one
 * vertical 2-byte pair, which is written out as eight 2-byte rows at
 * dst, `pitch` apart.
 *
 * uint8x8x2_t result
 * 00 01 02 03 | 04 05 06 07
 * 10 11 12 13 | 14 15 16 17
 * ---
 * after vtrn_u8
 * 00 10 02 12 | 04 14 06 16
 * 01 11 03 13 | 05 15 07 17
 */
static INLINE void write_2x4(unsigned char *dst, int pitch,
                             const uint8x8x2_t result) {
  const uint8x8x2_t zipped = vtrn_u8(result.val[0], result.val[1]);
  /* Even output rows (0,2,4,6) live in val[0], odd rows (1,3,5,7) in
   * val[1]; lane k of each vector is the pair for row 2k / 2k+1. */
  const uint16x4_t rows_even = vreinterpret_u16_u8(zipped.val[0]);
  const uint16x4_t rows_odd = vreinterpret_u16_u8(zipped.val[1]);

  vst1_lane_u16((uint16_t *)(dst + 0 * pitch), rows_even, 0);
  vst1_lane_u16((uint16_t *)(dst + 1 * pitch), rows_odd, 0);
  vst1_lane_u16((uint16_t *)(dst + 2 * pitch), rows_even, 1);
  vst1_lane_u16((uint16_t *)(dst + 3 * pitch), rows_odd, 1);
  vst1_lane_u16((uint16_t *)(dst + 4 * pitch), rows_even, 2);
  vst1_lane_u16((uint16_t *)(dst + 5 * pitch), rows_odd, 2);
  vst1_lane_u16((uint16_t *)(dst + 6 * pitch), rows_even, 3);
  vst1_lane_u16((uint16_t *)(dst + 7 * pitch), rows_odd, 3);
}
/*
 * Vertical loop filter across an 8-row edge, NEON version.
 * Loads 8 pixels per row from columns -4..+3 around the vertical edge at
 * `src`, transposes the 8x8 block so each column becomes a vector, runs
 * mbloop_filter_neon on the column vectors, and writes the six filtered
 * columns back: four via vst4 lane stores at src-3, two via vst2 lane
 * stores at src+1.
 */
void aom_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
  int i;
  uint8_t *s;
  uint8x8_t dblimit, dlimit, dthresh;
  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
  uint8x8_t d16u8, d17u8, d18u8;
  uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
  uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
  uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
  uint8x8x4_t d4Result;
  uint8x8x2_t d2Result;

  /* Load the three 8-byte filter-threshold vectors. */
  dblimit = vld1_u8(blimit);
  dlimit = vld1_u8(limit);
  dthresh = vld1_u8(thresh);

  /* Single iteration: one 8-row block is filtered. */
  for (i = 0; i < 1; i++) {
    /* Start of row block i, 4 columns left of the edge. */
    s = src + (i * (pitch << 3)) - 4;
    d3u8 = vld1_u8(s);
    s += pitch;
    d4u8 = vld1_u8(s);
    s += pitch;
    d5u8 = vld1_u8(s);
    s += pitch;
    d6u8 = vld1_u8(s);
    s += pitch;
    d7u8 = vld1_u8(s);
    s += pitch;
    d16u8 = vld1_u8(s);
    s += pitch;
    d17u8 = vld1_u8(s);
    s += pitch;
    d18u8 = vld1_u8(s);

    /* 8x8 byte transpose: vtrn at 32-, then 16-, then 8-bit granularity
     * turns the eight row vectors into eight column vectors. */
    d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));
    d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));
    d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));
    d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));

    d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
                      vreinterpret_u16_u32(d2tmp2.val[0]));
    d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
                      vreinterpret_u16_u32(d2tmp3.val[0]));
    d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
                      vreinterpret_u16_u32(d2tmp2.val[1]));
    d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
                      vreinterpret_u16_u32(d2tmp3.val[1]));

    d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
                     vreinterpret_u8_u16(d2tmp5.val[0]));
    d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
                     vreinterpret_u8_u16(d2tmp5.val[1]));
    d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
                      vreinterpret_u8_u16(d2tmp7.val[0]));
    d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
                      vreinterpret_u8_u16(d2tmp7.val[1]));

    /* d3u8..d18u8 now hold columns -4..+3 of the block. */
    d3u8 = d2tmp8.val[0];
    d4u8 = d2tmp8.val[1];
    d5u8 = d2tmp9.val[0];
    d6u8 = d2tmp9.val[1];
    d7u8 = d2tmp10.val[0];
    d16u8 = d2tmp10.val[1];
    d17u8 = d2tmp11.val[0];
    d18u8 = d2tmp11.val[1];

    /* Filter the eight column vectors; six filtered columns come back
     * in d0u8..d5u8. */
    mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
                       d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,
                       &d5u8);

    d4Result.val[0] = d0u8;
    d4Result.val[1] = d1u8;
    d4Result.val[2] = d2u8;
    d4Result.val[3] = d3u8;

    d2Result.val[0] = d4u8;
    d2Result.val[1] = d5u8;

    /* Transposing store: lane k of the 4-vector group is row k's four
     * bytes at columns -3..0. */
    s = src - 3;
    vst4_lane_u8(s, d4Result, 0);
    s += pitch;
    vst4_lane_u8(s, d4Result, 1);
    s += pitch;
    vst4_lane_u8(s, d4Result, 2);
    s += pitch;
    vst4_lane_u8(s, d4Result, 3);
    s += pitch;
    vst4_lane_u8(s, d4Result, 4);
    s += pitch;
    vst4_lane_u8(s, d4Result, 5);
    s += pitch;
    vst4_lane_u8(s, d4Result, 6);
    s += pitch;
    vst4_lane_u8(s, d4Result, 7);

    /* Remaining two filtered columns at +1..+2. */
    s = src + 1;
    vst2_lane_u8(s, d2Result, 0);
    s += pitch;
    vst2_lane_u8(s, d2Result, 1);
    s += pitch;
    vst2_lane_u8(s, d2Result, 2);
    s += pitch;
    vst2_lane_u8(s, d2Result, 3);
    s += pitch;
    vst2_lane_u8(s, d2Result, 4);
    s += pitch;
    vst2_lane_u8(s, d2Result, 5);
    s += pitch;
    vst2_lane_u8(s, d2Result, 6);
    s += pitch;
    vst2_lane_u8(s, d2Result, 7);
  }
  return;
}
// Codegen test: verifies that an 8x8-bit vector transpose intrinsic lowers to
// a trn1/trn2 instruction pair.  The CHECK comments below are FileCheck
// directives and must not be edited or reordered.
uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
  // CHECK-LABEL: test_vtrn_u8
  return vtrn_u8(a, b);
  // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
/*
 * Vertical loop filter, NEON version.
 * For each of `count` 8-row blocks, loads 8 pixels per row from columns
 * -4..+3 around the vertical edge at `src`, transposes the 8x8 block so
 * columns become vectors, runs vp9_loop_filter_neon, and writes the four
 * filtered columns (-2..+1) back with a transposing vst4 lane store.
 *
 * Fixes two iteration bugs:
 *  - the load base used `src - (i + 1) * 4`, drifting further left each
 *    iteration even though `src` already advances by pitch8 per block;
 *    it must be `src - 4`.
 *  - the lane stores advanced `src` itself, so the loop-header increment
 *    `src += pitch8` no longer pointed at the next block; stores now go
 *    through the scratch pointer `s` (matching aom_lpf_vertical_8_neon).
 * Behavior is unchanged for count == 1.
 */
void vp9_lpf_vertical_4_neon(unsigned char *src, int pitch,
                             unsigned char *blimit, unsigned char *limit,
                             unsigned char *thresh, int count) {
  int i, pitch8;
  uint8_t *s;
  uint8x8_t dblimit, dlimit, dthresh;
  uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
  uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
  uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
  uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
  uint8x8x4_t d4Result;

  if (count == 0)  // end_vp9_lf_h_edge
    return;

  /* Load the three 8-byte filter-threshold vectors. */
  dblimit = vld1_u8(blimit);
  dlimit = vld1_u8(limit);
  dthresh = vld1_u8(thresh);

  pitch8 = pitch * 8;
  for (i = 0; i < count; i++, src += pitch8) {
    /* Start of this row block, 4 columns left of the edge.  `src` already
     * advances by pitch8 per iteration, so no extra i-dependent offset. */
    s = src - 4;
    d3u8 = vld1_u8(s);
    s += pitch;
    d4u8 = vld1_u8(s);
    s += pitch;
    d5u8 = vld1_u8(s);
    s += pitch;
    d6u8 = vld1_u8(s);
    s += pitch;
    d7u8 = vld1_u8(s);
    s += pitch;
    d16u8 = vld1_u8(s);
    s += pitch;
    d17u8 = vld1_u8(s);
    s += pitch;
    d18u8 = vld1_u8(s);

    /* 8x8 byte transpose: vtrn at 32-, then 16-, then 8-bit granularity
     * turns the eight row vectors into eight column vectors. */
    d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));
    d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));
    d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));
    d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));

    d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
                      vreinterpret_u16_u32(d2tmp2.val[0]));
    d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
                      vreinterpret_u16_u32(d2tmp3.val[0]));
    d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
                      vreinterpret_u16_u32(d2tmp2.val[1]));
    d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
                      vreinterpret_u16_u32(d2tmp3.val[1]));

    d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
                     vreinterpret_u8_u16(d2tmp5.val[0]));
    d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
                     vreinterpret_u8_u16(d2tmp5.val[1]));
    d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
                      vreinterpret_u8_u16(d2tmp7.val[0]));
    d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
                      vreinterpret_u8_u16(d2tmp7.val[1]));

    /* d3u8..d18u8 now hold columns -4..+3 of the block. */
    d3u8 = d2tmp8.val[0];
    d4u8 = d2tmp8.val[1];
    d5u8 = d2tmp9.val[0];
    d6u8 = d2tmp9.val[1];
    d7u8 = d2tmp10.val[0];
    d16u8 = d2tmp10.val[1];
    d17u8 = d2tmp11.val[0];
    d18u8 = d2tmp11.val[1];

    /* Filter; the four filtered columns come back in d4u8..d7u8. */
    vp9_loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8,
                         d7u8, d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8,
                         &d7u8);

    d4Result.val[0] = d4u8;
    d4Result.val[1] = d5u8;
    d4Result.val[2] = d6u8;
    d4Result.val[3] = d7u8;

    /* Transposing store via s (not src, which must only advance via the
     * loop header): lane k is row k's four bytes at columns -2..+1. */
    s = src - 2;
    vst4_lane_u8(s, d4Result, 0);
    s += pitch;
    vst4_lane_u8(s, d4Result, 1);
    s += pitch;
    vst4_lane_u8(s, d4Result, 2);
    s += pitch;
    vst4_lane_u8(s, d4Result, 3);
    s += pitch;
    vst4_lane_u8(s, d4Result, 4);
    s += pitch;
    vst4_lane_u8(s, d4Result, 5);
    s += pitch;
    vst4_lane_u8(s, d4Result, 6);
    s += pitch;
    vst4_lane_u8(s, d4Result, 7);
  }
  return;
}
/*
 * Horizontal 8-tap convolution, NEON version, for a fixed filter step
 * (x_step_q4 == 16, i.e. one output per input pixel).
 * Processes the image in 4-row x 4-column tiles: the outer loop sets up
 * the first 8 columns of context for 4 rows, the inner loop slides right
 * 4 pixels at a time, computing 4x4 outputs per pass with MULTIPLY_BY_Q0
 * and Q7 rounding (vqrshrun_n_s32(..., 7)).
 * filter_y and y_step_q4 are unused (horizontal pass only).
 *
 * NOTE(review): the variable naming (d*/q* plus "vswp"/"vmov" comments)
 * mirrors a hand-written register assignment; the transposes and the
 * cross-iteration register reuse at the bottom of the inner loop are
 * order-sensitive.
 */
void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y,  // unused
                              int y_step_q4,            // unused
                              int w, int h) {
  int width;
  const uint8_t *s, *psrc;
  uint8_t *d, *pdst;
  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
  uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
  uint8x16_t q12u8, q13u8, q14u8, q15u8;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16;
  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
  int16x8_t q0s16;
  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
  int32x4_t q1s32, q2s32, q14s32, q15s32;
  uint16x8x2_t q0x2u16;
  uint8x8x2_t d0x2u8, d1x2u8;
  uint32x2x2_t d0x2u32;
  uint16x4x2_t d0x2u16, d1x2u16;
  uint32x4x2_t q0x2u32;

  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_y;

  /* Load the 8 filter coefficients. */
  q0s16 = vld1q_s16(filter_x);

  src -= 3;  // adjust for taps

  for (; h > 0; h -= 4, src += src_stride * 4,
                dst += dst_stride * 4) {  // loop_horiz_v
    /* Load the first 8 bytes of 4 consecutive rows. */
    s = src;
    d24u8 = vld1_u8(s);
    s += src_stride;
    d25u8 = vld1_u8(s);
    s += src_stride;
    d26u8 = vld1_u8(s);
    s += src_stride;
    d27u8 = vld1_u8(s);

    q12u8 = vcombine_u8(d24u8, d25u8);
    q13u8 = vcombine_u8(d26u8, d27u8);

    /* 16- then 8-bit transposes rearrange the 4x8 tile so consecutive
     * source pixels land in lanes MULTIPLY_BY_Q0 expects. */
    q0x2u16 =
        vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
    d0x2u8 = vtrn_u8(d24u8, d25u8);
    d1x2u8 = vtrn_u8(d26u8, d27u8);

    __builtin_prefetch(src + src_stride * 4);
    __builtin_prefetch(src + src_stride * 5);
    __builtin_prefetch(src + src_stride * 6);

    /* Widen to 16 bits for the multiply-accumulate. */
    q8u16 = vmovl_u8(d0x2u8.val[0]);
    q9u16 = vmovl_u8(d0x2u8.val[1]);
    q10u16 = vmovl_u8(d1x2u8.val[0]);
    q11u16 = vmovl_u8(d1x2u8.val[1]);

    d16u16 = vget_low_u16(q8u16);
    d17u16 = vget_high_u16(q8u16);
    d18u16 = vget_low_u16(q9u16);
    d19u16 = vget_high_u16(q9u16);
    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
    q9u16 = vcombine_u16(d17u16, d19u16);

    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21
    for (width = w, psrc = src + 7, pdst = dst; width > 0;
         width -= 4, psrc += 4, pdst += 4) {  // loop_horiz
      /* Next 4 bytes of each of the 4 rows (note the 29/31/30 load order
       * matches the transpose pairing below). */
      s = psrc;
      d28u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d29u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d31u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d30u32 = vld1_dup_u32((const uint32_t *)s);

      __builtin_prefetch(psrc + 64);

      d0x2u16 =
          vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
      d1x2u16 =
          vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30

      __builtin_prefetch(psrc + 64 + src_stride);

      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
      q0x2u32 =
          vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));

      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
      q12u16 = vmovl_u8(d28u8);
      q13u16 = vmovl_u8(d29u8);

      __builtin_prefetch(psrc + 64 + src_stride * 2);

      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));

      /* One 8-tap dot product per output; each call shifts the tap
       * window one pixel to the right. */
      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
                             d23s16, d24s16, q0s16);
      q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
                             d24s16, d26s16, q0s16);
      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
                              d26s16, d27s16, q0s16);
      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
                              d27s16, d25s16, q0s16);

      __builtin_prefetch(psrc + 60 + src_stride * 3);

      /* Round, shift by 7 (Q7 filter), and narrow with saturation. */
      d2u16 = vqrshrun_n_s32(q1s32, 7);
      d3u16 = vqrshrun_n_s32(q2s32, 7);
      d4u16 = vqrshrun_n_s32(q14s32, 7);
      d5u16 = vqrshrun_n_s32(q15s32, 7);

      q1u16 = vcombine_u16(d2u16, d3u16);
      q2u16 = vcombine_u16(d4u16, d5u16);

      d2u8 = vqmovn_u16(q1u16);
      d3u8 = vqmovn_u16(q2u16);

      /* Transpose the 4x4 result back to row order for the stores. */
      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
                         vreinterpret_u32_u16(d0x2u16.val[1]));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
                       vreinterpret_u8_u32(d0x2u32.val[1]));

      d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
      d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);

      d = pdst;
      vst1_lane_u32((uint32_t *)d, d2u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d2u32, 1);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 1);

      /* Slide the tap window: reuse the right half of this iteration's
       * source data as the left half of the next one. */
      q8u16 = q9u16;
      d20s16 = d23s16;
      q11u16 = q12u16;
      q9u16 = q13u16;
      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
    }
  }
  return;
}