static WEBP_INLINE uint32_t ClampedAddSubtractFull(const uint32_t* const c0, const uint32_t* const c1, const uint32_t* const c2) { const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0)); const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1)); const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2)); const uint16x8_t sum0 = vaddl_u8(p0, p1); // add and widen const uint16x8_t sum1 = vqsubq_u16(sum0, vmovl_u8(p2)); // widen and subtract const uint8x8_t out = vqmovn_u16(sum1); // narrow and clamp return vget_lane_u32(vreinterpret_u32_u8(out), 0); }
static WEBP_INLINE uint32_t ClampedAddSubtractFull(const uint32_t* const c0, const uint32_t* const c1, const uint32_t* const c2) { const uint64x1_t C0 = { *c0, 0 }, C1 = { *c1, 0 }, C2 = { *c2, 0 }; const uint8x8_t p0 = vreinterpret_u8_u64(C0); const uint8x8_t p1 = vreinterpret_u8_u64(C1); const uint8x8_t p2 = vreinterpret_u8_u64(C2); const uint16x8_t sum0 = vaddl_u8(p0, p1); // add and widen const uint16x8_t sum1 = vqsubq_u16(sum0, vmovl_u8(p2)); // widen and subtract const uint8x8_t out = vqmovn_u16(sum1); // narrow and clamp uint32_t ret; vst1_lane_u32(&ret, vreinterpret_u32_u8(out), 0); return ret; }
/* u16x8 saturated sub */ void mw_neon_mm_qsub_u16x8(unsigned short * A, int Row, int Col, unsigned short * B, unsigned short * C) { uint16x8_t neon_a, neon_b, neon_c; int size = Row * Col; int i = 0; int k = 0; for (i = 8; i <= size ; i+=8) { k = i - 8; neon_a = vld1q_u16(A + k); neon_b = vld1q_u16(B + k); neon_c = vqsubq_u16(neon_a, neon_b); vst1q_u16(C + k, neon_c); } k = i - 8; for (i = 0; i < size % 8; i++) { C[k + i] = A[k + i] + B[k + i]; } }
inline uint16x8_t vqsubq(const uint16x8_t & v0, const uint16x8_t & v1) { return vqsubq_u16(v0, v1); }