// Chooses between *c0 and *c1 by comparing how far each lies from *c2.
// The distance is the sum, over the four bytes of the 32-bit value, of the
// per-byte absolute difference. Returns *c0 when
//   sum|c1 - c2| - sum|c0 - c2| <= 0
// and *c1 otherwise.
static WEBP_INLINE uint32_t Select(const uint32_t* const c0,
                                   const uint32_t* const c1,
                                   const uint32_t* const c2) {
  // Each 32-bit input is widened to 64 bits, so it fills the low four byte
  // lanes and the upper four lanes are zero.
  const uint8x8_t a = vreinterpret_u8_u64(vcreate_u64(*c0));
  const uint8x8_t b = vreinterpret_u8_u64(vcreate_u64(*c1));
  const uint8x8_t c = vreinterpret_u8_u64(vcreate_u64(*c2));

  // Per-byte absolute differences, added pairwise into 16-bit lanes.
  // Lanes 0 and 1 cover the four real bytes; lanes 2 and 3 are zero.
  const int16x4_t dist_bc = vreinterpret_s16_u16(vpaddl_u8(vabd_u8(b, c)));
  const int16x4_t dist_ac = vreinterpret_s16_u16(vpaddl_u8(vabd_u8(a, c)));

  // Lane 0 of 'delta' is sum|b - c| - sum|a - c| over all four bytes.
  const int32x2_t delta = vpaddl_s16(vsub_s16(dist_bc, dist_ac));

  if (vget_lane_s32(delta, 0) > 0) {
    return *c1;
  }
  return *c0;
}
// Chooses between *c0 and *c1 by comparing how far each lies from *c2.
// The distance is the sum, over the four bytes of the 32-bit value, of the
// per-byte absolute difference. Returns *c0 when
//   sum|c1 - c2| - sum|c0 - c2| <= 0
// and *c1 otherwise.
static WEBP_INLINE uint32_t Select(const uint32_t* const c0,
                                   const uint32_t* const c1,
                                   const uint32_t* const c2) {
  // Build the single-lane 64-bit vectors with vcreate_u64() rather than a
  // brace initializer: uint64x1_t has exactly one lane, so a two-element
  // initializer list has an excess element. The 32-bit value is zero-extended,
  // so the low four byte lanes hold the input and the upper four are zero.
  const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0));
  const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1));
  const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2));
  const uint8x8_t bc = vabd_u8(p1, p2);   // |b-c|
  const uint8x8_t ac = vabd_u8(p0, p2);   // |a-c|
  // Pairwise add widens the byte differences to 16 bits; lanes 0 and 1 cover
  // the four real bytes.
  const int16x4_t sum_bc = vreinterpret_s16_u16(vpaddl_u8(bc));
  const int16x4_t sum_ac = vreinterpret_s16_u16(vpaddl_u8(ac));
  // Lane 0 of 'diff' is sum|b-c| - sum|a-c| over all four bytes.
  const int32x2_t diff = vpaddl_s16(vsub_s16(sum_bc, sum_ac));
  int32_t pa_minus_pb;
  vst1_lane_s32(&pa_minus_pb, diff, 0);   // extract lane 0 through memory
  return (pa_minus_pb <= 0) ? *c0 : *c1;
}
/* Invokes the vpaddl_u8 intrinsic (pairwise add of adjacent 8-bit lanes into
   16-bit lanes) once and discards the result.
   NOTE(review): the input vector is never assigned before it is read, so its
   contents are indeterminate; presumably only the call itself matters here --
   confirm before relying on the value.  */
void test_vpaddlu8 (void)
{
  uint8x8_t src_bytes;
  uint16x4_t pair_sums = vpaddl_u8 (src_bytes);
}
// Fills the 8x8 block at 'dst' (rows are 'stride' bytes apart) with a single
// DC value derived from the border pixels:
//   - do_above && do_left: (sum of 8 bytes at 'above' + sum of 8 bytes at
//     'left' + 8) >> 4
//   - only do_above:       (sum of 8 bytes at 'above' + 4) >> 3
//   - only do_left:        (sum of 8 bytes at 'left' + 4) >> 3
//   - neither:             0x80
// 'above' is only read when 'do_above' is non-zero, and 'left' only when
// 'do_left' is non-zero.
// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
                          const uint8_t *left, int do_above, int do_left) {
  // Each sum is only assigned (and only read) when its flag is set.
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x8_t A = vld1_u8(above);  // top row
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);  // every lane holds the total
    sum_top = vcombine_u16(p2, p2);
  }

  if (do_left) {
    const uint8x8_t L = vld1_u8(left);   // left border
    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);  // every lane holds the total
    sum_left = vcombine_u16(p2, p2);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 4);       // rounded divide by 16 samples
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 3);   // rounded divide by 8 samples
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 3);  // rounded divide by 8 samples
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 8; ++i) {
      // Byte-wise store: 'dst + i * stride' has no guaranteed alignment beyond
      // uint8_t, so avoid casting it to a uint32_t pointer. The 8 bytes
      // written are the same either way.
      vst1_u8(dst + i * stride, dc);
    }
  }
}