// Rounded average of an 8x8 block of bytes.
//
// s: pointer to the first byte of the top row.
// p: distance, in bytes, between the starts of consecutive rows.
//
// Eight rows of eight bytes are accumulated into eight 16-bit lanes; the
// lanes are then reduced by horizontal_add_u16x8 (defined elsewhere,
// presumably a sum across all lanes -- confirm), 32 is added for rounding
// and the result is shifted right by 6 (i.e. divided by 64).
unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) {
  // Rows 0 and 1 are widened to 16 bits and added in a single step.
  uint16x8_t acc = vaddl_u8(vld1_u8(s), vld1_u8(s + p));
  int row;

  // Fold the remaining rows 2..7 into the widened accumulator.
  for (row = 2; row < 8; ++row) {
    acc = vaddw_u8(acc, vld1_u8(s + row * p));
  }

  return (horizontal_add_u16x8(acc) + 32) >> 6;
}
unsigned int aom_avg_4x4_neon(const uint8_t *s, int p) { uint16x8_t v_sum; uint32x2_t v_s0 = vdup_n_u32(0); uint32x2_t v_s1 = vdup_n_u32(0); v_s0 = vld1_lane_u32((const uint32_t *)s, v_s0, 0); v_s0 = vld1_lane_u32((const uint32_t *)(s + p), v_s0, 1); v_s1 = vld1_lane_u32((const uint32_t *)(s + 2 * p), v_s1, 0); v_s1 = vld1_lane_u32((const uint32_t *)(s + 3 * p), v_s1, 1); v_sum = vaddl_u8(vreinterpret_u8_u32(v_s0), vreinterpret_u8_u32(v_s1)); return (horizontal_add_u16x8(v_sum) + 8) >> 4; }
// Sums the bytes of one row starting at ref.
//
// ref:   pointer to the first byte to accumulate.
// width: number of bytes to cover; the loop consumes 16 bytes per iteration,
//        so a width that is not a multiple of 16 still triggers a full
//        16-byte load for the final partial group.
//
// Returns the value produced by horizontal_add_u16x8 (defined elsewhere,
// presumably a sum across all lanes -- confirm) converted to int16_t.
//
// NOTE(review): accumulation happens in 16-bit lanes and the result type is
// int16_t; this presumes width is small enough that neither overflows --
// confirm against callers.
int16_t vp9_int_pro_col_neon(uint8_t const *ref, const int width) {
  uint16x8_t acc = vdupq_n_u16(0);
  int x;

  for (x = 0; x < width; x += 16) {
    const uint8x16_t px = vld1q_u8(ref + x);
    // Widen-add the low eight bytes, then the high eight bytes.
    acc = vaddw_u8(vaddw_u8(acc, vget_low_u8(px)), vget_high_u8(px));
  }

  return horizontal_add_u16x8(acc);
}