// 'do_above' and 'do_left' facilitate branch removal when inlined. static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left, int do_above, int do_left) { uint16x8_t sum_top; uint16x8_t sum_left; uint8x8_t dc0; if (do_above) { const uint8x16_t A0 = vld1q_u8(above); // top row const uint8x16_t A1 = vld1q_u8(above + 16); const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top const uint16x8_t p1 = vpaddlq_u8(A1); const uint16x8_t p2 = vaddq_u16(p0, p1); const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); const uint16x4_t p4 = vpadd_u16(p3, p3); const uint16x4_t p5 = vpadd_u16(p4, p4); sum_top = vcombine_u16(p5, p5); } if (do_left) { const uint8x16_t L0 = vld1q_u8(left); // left row const uint8x16_t L1 = vld1q_u8(left + 16); const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left const uint16x8_t p1 = vpaddlq_u8(L1); const uint16x8_t p2 = vaddq_u16(p0, p1); const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); const uint16x4_t p4 = vpadd_u16(p3, p3); const uint16x4_t p5 = vpadd_u16(p4, p4); sum_left = vcombine_u16(p5, p5); } if (do_above && do_left) { const uint16x8_t sum = vaddq_u16(sum_left, sum_top); dc0 = vrshrn_n_u16(sum, 6); } else if (do_above) { dc0 = vrshrn_n_u16(sum_top, 5); } else if (do_left) { dc0 = vrshrn_n_u16(sum_left, 5); } else { dc0 = vdup_n_u8(0x80); } { const uint8x16_t dc = vdupq_lane_u8(dc0, 0); int i; for (i = 0; i < 32; ++i) { vst1q_u8(dst + i * stride, dc); vst1q_u8(dst + i * stride + 16, dc); } } }
void vp9_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int j, k; uint8x8_t d2u8 = vdup_n_u8(0); uint8x16_t q0u8 = vdupq_n_u8(0); uint8x16_t q1u8 = vdupq_n_u8(0); (void)above; for (k = 0; k < 2; k++, left += 16) { q1u8 = vld1q_u8(left); d2u8 = vget_low_u8(q1u8); for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { q0u8 = vdupq_lane_u8(d2u8, 0); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 1); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 2); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 3); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 4); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 5); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 6); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 7); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; } } }
// 'do_above' and 'do_left' facilitate branch removal when inlined. static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left, int do_above, int do_left) { uint16x8_t sum_top; uint16x8_t sum_left; uint8x8_t dc0; if (do_above) { const uint8x16_t A = vld1q_u8(above); // top row const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); const uint16x4_t p2 = vpadd_u16(p1, p1); const uint16x4_t p3 = vpadd_u16(p2, p2); sum_top = vcombine_u16(p3, p3); } if (do_left) { const uint8x16_t L = vld1q_u8(left); // left row const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); const uint16x4_t p2 = vpadd_u16(p1, p1); const uint16x4_t p3 = vpadd_u16(p2, p2); sum_left = vcombine_u16(p3, p3); } if (do_above && do_left) { const uint16x8_t sum = vaddq_u16(sum_left, sum_top); dc0 = vrshrn_n_u16(sum, 5); } else if (do_above) { dc0 = vrshrn_n_u16(sum_top, 4); } else if (do_left) { dc0 = vrshrn_n_u16(sum_left, 4); } else { dc0 = vdup_n_u8(0x80); } { const uint8x16_t dc = vdupq_lane_u8(dc0, 0); int i; for (i = 0; i < 16; ++i) { vst1q_u8(dst + i * stride, dc); } } }
// Returns a 16-lane vector in which every lane is a copy of lane 5 of the
// 8-lane input 'v1', using the vdupq_lane_u8 intrinsic.
// NOTE(review): the inline comments below look like FileCheck-style directives
// (expected function label and a 'dup' instruction); presumably they are
// matched against generated assembly, so their text is left untouched.
uint8x16_t test_vdupq_lane_u8(uint8x8_t v1) { // CHECK: test_vdupq_lane_u8 return vdupq_lane_u8(v1, 5); // CHECK: dup {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5] }