// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2) static WEBP_INLINE void DoFilter6(__m128i *p2, __m128i* p1, __m128i *p0, __m128i* q0, __m128i* q1, __m128i *q2, const __m128i* mask, int hev_thresh) { __m128i a, not_hev; const __m128i sign_bit = _mm_set1_epi8(0x80); // compute hev mask GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev); // convert to signed values FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); FLIP_SIGN_BIT2(*p2, *q2); GET_BASE_DELTA(*p1, *p0, *q0, *q1, a); { // do simple filter on pixels with hev const __m128i m = _mm_andnot_si128(not_hev, *mask); const __m128i f = _mm_and_si128(a, m); DO_SIMPLE_FILTER(*p0, *q0, f); } { // do strong filter on pixels with not hev const __m128i zero = _mm_setzero_si128(); const __m128i nine = _mm_set1_epi16(0x0900); const __m128i sixty_three = _mm_set1_epi16(63); const __m128i m = _mm_and_si128(not_hev, *mask); const __m128i f = _mm_and_si128(a, m); const __m128i f_lo = _mm_unpacklo_epi8(zero, f); const __m128i f_hi = _mm_unpackhi_epi8(zero, f); const __m128i f9_lo = _mm_mulhi_epi16(f_lo, nine); // Filter (lo) * 9 const __m128i f9_hi = _mm_mulhi_epi16(f_hi, nine); // Filter (hi) * 9 const __m128i f18_lo = _mm_add_epi16(f9_lo, f9_lo); // Filter (lo) * 18 const __m128i f18_hi = _mm_add_epi16(f9_hi, f9_hi); // Filter (hi) * 18 const __m128i a2_lo = _mm_add_epi16(f9_lo, sixty_three); // Filter * 9 + 63 const __m128i a2_hi = _mm_add_epi16(f9_hi, sixty_three); // Filter * 9 + 63 const __m128i a1_lo = _mm_add_epi16(f18_lo, sixty_three); // F... * 18 + 63 const __m128i a1_hi = _mm_add_epi16(f18_hi, sixty_three); // F... * 18 + 63 const __m128i a0_lo = _mm_add_epi16(f18_lo, a2_lo); // Filter * 27 + 63 const __m128i a0_hi = _mm_add_epi16(f18_hi, a2_hi); // Filter * 27 + 63 UPDATE_2PIXELS(*p2, *q2, a2_lo, a2_hi); UPDATE_2PIXELS(*p1, *q1, a1_lo, a1_hi); UPDATE_2PIXELS(*p0, *q0, a0_lo, a0_hi); } // unoffset FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); FLIP_SIGN_BIT2(*p2, *q2); }
// Applies filter on 2 pixels (p0 and q0) static WEBP_INLINE void DoFilter2(__m128i* const p1, __m128i* const p0, __m128i* const q0, __m128i* const q1, int thresh) { __m128i a, mask; const __m128i sign_bit = _mm_set1_epi8(0x80); // convert p1/q1 to int8_t (for GetBaseDelta) __m128i p1s = _mm_xor_si128(*p1, sign_bit); // insieme: dropped const __m128i q1s = _mm_xor_si128(*q1, sign_bit); // insieme: dropped const NeedsFilter(p1, p0, q0, q1, thresh, &mask); FLIP_SIGN_BIT2(*p0, *q0); GetBaseDelta(&p1s, p0, q0, &q1s, &a); a = _mm_and_si128(a, mask); // mask filter values we don't care about DoSimpleFilter(p0, q0, &a); FLIP_SIGN_BIT2(*p0, *q0); }
// Applies filter on 2 pixels (p0 and q0) static WEBP_INLINE void DoFilter2(const __m128i* p1, __m128i* p0, __m128i* q0, const __m128i* q1, int thresh) { __m128i a, mask; const __m128i sign_bit = _mm_set1_epi8(0x80); const __m128i p1s = _mm_xor_si128(*p1, sign_bit); const __m128i q1s = _mm_xor_si128(*q1, sign_bit); NeedsFilter(p1, p0, q0, q1, thresh, &mask); // convert to signed values FLIP_SIGN_BIT2(*p0, *q0); GET_BASE_DELTA(p1s, *p0, *q0, q1s, a); a = _mm_and_si128(a, mask); // mask filter values we don't care about DO_SIMPLE_FILTER(*p0, *q0, a); // unoffset FLIP_SIGN_BIT2(*p0, *q0); }
// Applies filter on 4 pixels (p1, p0, q0 and q1) static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0, __m128i* const q0, __m128i* const q1, const __m128i* const mask, int hev_thresh) { const __m128i sign_bit = _mm_set1_epi8(0x80); const __m128i k64 = _mm_set1_epi8(0x40); const __m128i zero = _mm_setzero_si128(); __m128i not_hev; __m128i t1, t2, t3; // compute hev mask GetNotHEV(p1, p0, q0, q1, hev_thresh, ¬_hev); // convert to signed values FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); t1 = _mm_subs_epi8(*p1, *q1); // p1 - q1 t1 = _mm_andnot_si128(not_hev, t1); // hev(p1 - q1) t2 = _mm_subs_epi8(*q0, *p0); // q0 - p0 t1 = _mm_adds_epi8(t1, t2); // hev(p1 - q1) + 1 * (q0 - p0) t1 = _mm_adds_epi8(t1, t2); // hev(p1 - q1) + 2 * (q0 - p0) t1 = _mm_adds_epi8(t1, t2); // hev(p1 - q1) + 3 * (q0 - p0) t1 = _mm_and_si128(t1, *mask); // mask filter values we don't care about t2 = _mm_set1_epi8(3); t3 = _mm_set1_epi8(4); t2 = _mm_adds_epi8(t1, t2); // 3 * (q0 - p0) + (p1 - q1) + 3 t3 = _mm_adds_epi8(t1, t3); // 3 * (q0 - p0) + (p1 - q1) + 4 SignedShift8b(&t2); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3 SignedShift8b(&t3); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3 *p0 = _mm_adds_epi8(*p0, t2); // p0 += t2 *q0 = _mm_subs_epi8(*q0, t3); // q0 -= t3 FLIP_SIGN_BIT2(*p0, *q0); // this is equivalent to signed (a + 1) >> 1 calculation t2 = _mm_add_epi8(t3, sign_bit); t3 = _mm_avg_epu8(t2, zero); t3 = _mm_sub_epi8(t3, k64); t3 = _mm_and_si128(not_hev, t3); // if !hev *q1 = _mm_subs_epi8(*q1, t3); // q1 -= t3 *p1 = _mm_adds_epi8(*p1, t3); // p1 += t3 FLIP_SIGN_BIT2(*p1, *q1); }
// Updates values of 2 pixels at MB edge during complex filtering. // Update operations: // q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)] // Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip). static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi, const __m128i* const a0_lo, const __m128i* const a0_hi) { const __m128i a1_lo = _mm_srai_epi16(*a0_lo, 7); const __m128i a1_hi = _mm_srai_epi16(*a0_hi, 7); const __m128i delta = _mm_packs_epi16(a1_lo, a1_hi); const __m128i sign_bit = _mm_set1_epi8(0x80); *pi = _mm_adds_epi8(*pi, delta); *qi = _mm_subs_epi8(*qi, delta); FLIP_SIGN_BIT2(*pi, *qi); }
// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2) static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1, __m128i* const p0, __m128i* const q0, __m128i* const q1, __m128i* const q2, const __m128i* const mask, int hev_thresh) { const __m128i zero = _mm_setzero_si128(); const __m128i sign_bit = _mm_set1_epi8(0x80); __m128i a, not_hev; // compute hev mask GetNotHEV(p1, p0, q0, q1, hev_thresh, ¬_hev); FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); FLIP_SIGN_BIT2(*p2, *q2); GetBaseDelta(p1, p0, q0, q1, &a); { // do simple filter on pixels with hev const __m128i m = _mm_andnot_si128(not_hev, *mask); __m128i f = _mm_and_si128(a, m); // insieme: dropped const DoSimpleFilter(p0, q0, &f); } { // do strong filter on pixels with not hev const __m128i k9 = _mm_set1_epi16(0x0900); const __m128i k63 = _mm_set1_epi16(63); const __m128i m = _mm_and_si128(not_hev, *mask); const __m128i f = _mm_and_si128(a, m); const __m128i f_lo = _mm_unpacklo_epi8(zero, f); const __m128i f_hi = _mm_unpackhi_epi8(zero, f); const __m128i f9_lo = _mm_mulhi_epi16(f_lo, k9); // Filter (lo) * 9 const __m128i f9_hi = _mm_mulhi_epi16(f_hi, k9); // Filter (hi) * 9 __m128i a2_lo = _mm_add_epi16(f9_lo, k63); // Filter * 9 + 63 // insieme: dropped const __m128i a2_hi = _mm_add_epi16(f9_hi, k63); // Filter * 9 + 63 // insieme: dropped const __m128i a1_lo = _mm_add_epi16(a2_lo, f9_lo); // Filter * 18 + 63 // insieme: dropped const __m128i a1_hi = _mm_add_epi16(a2_hi, f9_hi); // Filter * 18 + 63 // insieme: dropped const __m128i a0_lo = _mm_add_epi16(a1_lo, f9_lo); // Filter * 27 + 63 // insieme: dropped const __m128i a0_hi = _mm_add_epi16(a1_hi, f9_hi); // Filter * 27 + 63 // insieme: dropped const Update2Pixels(p2, q2, &a2_lo, &a2_hi); Update2Pixels(p1, q1, &a1_lo, &a1_hi); Update2Pixels(p0, q0, &a0_lo, &a0_hi); } }