예제 #1
0
// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
//
// Every pixel argument points to one __m128i of pixel bytes and is updated in
// place; *mask is only read. Lanes where *mask is zero receive a zero filter
// value. For the remaining lanes, the not_hev mask produced by GET_NOTHEV from
// hev_thresh selects between two paths:
//   - not_hev clear: simple filter, applied to p0/q0 only;
//   - not_hev set:   strong filter, applied to all six pixels with weights
//                    27, 18 and 9 (plus a rounding term of 63).
// NOTE(review): GET_NOTHEV, FLIP_SIGN_BIT2/4, GET_BASE_DELTA, DO_SIMPLE_FILTER
// and UPDATE_2PIXELS are macros defined outside this view; what is said about
// them below follows the existing comments and should be confirmed against
// their definitions.
static WEBP_INLINE void DoFilter6(__m128i *p2, __m128i* p1, __m128i *p0,
                                  __m128i* q0, __m128i* q1, __m128i *q2,
                                  const __m128i* mask, int hev_thresh) {
    __m128i a, not_hev;
    // Not referenced directly in this function; presumably picked up by name
    // inside the FLIP_SIGN_BIT* macros, so it must keep this name -- TODO confirm.
    const __m128i sign_bit = _mm_set1_epi8(0x80);

    // compute hev mask (done before the sign flip, i.e. on the unsigned pixels)
    GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev);

    // convert to signed values
    FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
    FLIP_SIGN_BIT2(*p2, *q2);

    // base filter value, written to 'a'
    GET_BASE_DELTA(*p1, *p0, *q0, *q1, a);

    {   // do simple filter on pixels with hev
        // m = ~not_hev & mask: lanes that are selected and have hev
        const __m128i m = _mm_andnot_si128(not_hev, *mask);
        const __m128i f = _mm_and_si128(a, m);
        DO_SIMPLE_FILTER(*p0, *q0, f);
    }
    {   // do strong filter on pixels with not hev
        const __m128i zero = _mm_setzero_si128();
        // 9 << 8: combined with the unpack below, the high 16 bits of the
        // product are exactly (filter byte) * 9.
        const __m128i nine = _mm_set1_epi16(0x0900);
        const __m128i sixty_three = _mm_set1_epi16(63);

        // m = not_hev & mask: lanes that are selected and do not have hev
        const __m128i m = _mm_and_si128(not_hev, *mask);
        const __m128i f = _mm_and_si128(a, m);
        // Interleaving with zero as the low byte widens each filter byte to a
        // 16-bit lane holding (byte << 8), keeping its sign.
        const __m128i f_lo = _mm_unpacklo_epi8(zero, f);
        const __m128i f_hi = _mm_unpackhi_epi8(zero, f);

        const __m128i f9_lo = _mm_mulhi_epi16(f_lo, nine);   // Filter (lo) * 9
        const __m128i f9_hi = _mm_mulhi_epi16(f_hi, nine);   // Filter (hi) * 9
        const __m128i f18_lo = _mm_add_epi16(f9_lo, f9_lo);  // Filter (lo) * 18
        const __m128i f18_hi = _mm_add_epi16(f9_hi, f9_hi);  // Filter (hi) * 18

        const __m128i a2_lo = _mm_add_epi16(f9_lo, sixty_three);  // Filter * 9 + 63
        const __m128i a2_hi = _mm_add_epi16(f9_hi, sixty_three);  // Filter * 9 + 63

        const __m128i a1_lo = _mm_add_epi16(f18_lo, sixty_three);  // Filter * 18 + 63
        const __m128i a1_hi = _mm_add_epi16(f18_hi, sixty_three);  // Filter * 18 + 63

        const __m128i a0_lo = _mm_add_epi16(f18_lo, a2_lo);  // Filter * 27 + 63
        const __m128i a0_hi = _mm_add_epi16(f18_hi, a2_hi);  // Filter * 27 + 63

        // Outermost pair gets the smallest weight, innermost pair the largest.
        // NOTE(review): UPDATE_2PIXELS presumably scales these 16-bit values
        // back to bytes and adds/subtracts them on the p/q pair -- confirm.
        UPDATE_2PIXELS(*p2, *q2, a2_lo, a2_hi);
        UPDATE_2PIXELS(*p1, *q1, a1_lo, a1_hi);
        UPDATE_2PIXELS(*p0, *q0, a0_lo, a0_hi);
    }

    // unoffset: same flip macros as above, applied a second time
    FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
    FLIP_SIGN_BIT2(*p2, *q2);
}
예제 #2
0
// Applies filter on 4 pixels (p1, p0, q0 and q1)
//
// Every pixel argument points to one __m128i of pixel bytes and is updated in
// place; *mask is only read and zeroes the filter value in lanes that must not
// be touched. The not_hev mask produced by GET_NOTHEV from hev_thresh decides
// (a) whether the (p1 - q1) term contributes to the filter value (only where
// not_hev is clear) and (b) whether p1/q1 are adjusted (only where not_hev is
// set).
// NOTE(review): GET_NOTHEV, FLIP_SIGN_BIT4 and SIGNED_SHIFT_N are macros
// defined outside this view. The comments below assume SIGNED_SHIFT_N(x, n) is
// a per-byte arithmetic right shift of x by n, as the existing comments
// indicate -- confirm against its definition.
static WEBP_INLINE void DoFilter4(__m128i* p1, __m128i *p0,
                                  __m128i* q0, __m128i* q1,
                                  const __m128i* mask, int hev_thresh) {
    __m128i not_hev;
    __m128i t1, t2, t3;
    // Not referenced directly in this function; presumably picked up by name
    // inside FLIP_SIGN_BIT4, so it must keep this name -- TODO confirm.
    const __m128i sign_bit = _mm_set1_epi8(0x80);

    // compute hev mask (done before the sign flip, i.e. on the unsigned pixels)
    GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev);

    // convert to signed values
    FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);

    // The three additions are saturating, so their order is significant.
    t1 = _mm_subs_epi8(*p1, *q1);        // p1 - q1
    t1 = _mm_andnot_si128(not_hev, t1);  // hev(p1 - q1)
    t2 = _mm_subs_epi8(*q0, *p0);        // q0 - p0
    t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 1 * (q0 - p0)
    t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 2 * (q0 - p0)
    t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 3 * (q0 - p0)
    t1 = _mm_and_si128(t1, *mask);       // mask filter values we don't care about

    // Do +4 side
    t2 = _mm_set1_epi8(4);
    t2 = _mm_adds_epi8(t1, t2);        // 3 * (q0 - p0) + hev(p1 - q1) + 4
    SIGNED_SHIFT_N(t2, 3);             // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
    t3 = t2;                           // save t2 (the shifted +4 value) for the p1/q1 step
    *q0 = _mm_subs_epi8(*q0, t2);      // q0 -= t2

    // Now do +3 side
    t2 = _mm_set1_epi8(3);
    t2 = _mm_adds_epi8(t1, t2);        // +3 instead of +4
    SIGNED_SHIFT_N(t2, 3);             // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
    *p0 = _mm_adds_epi8(*p0, t2);      // p0 += t2

    // Outer-pixel adjustment: round the saved +4-side value and halve it.
    t2 = _mm_set1_epi8(1);
    t3 = _mm_adds_epi8(t3, t2);
    // t3 = (((3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3) + 1) >> 1
    // (not the same as a single ">> 4": the +1 rounding happens after the >> 3)
    SIGNED_SHIFT_N(t3, 1);

    t3 = _mm_and_si128(not_hev, t3);   // if !hev
    *q1 = _mm_subs_epi8(*q1, t3);      // q1 -= t3
    *p1 = _mm_adds_epi8(*p1, t3);      // p1 += t3

    // unoffset: same flip macro as above, applied a second time
    FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
}
예제 #3
0
파일: dec_sse2.c 프로젝트: 8l/insieme
// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
//
// All six pixel registers are modified in place; *mask is read-only. Lanes
// with a zero mask get a zero filter value. Among the others, not_hev (from
// GetNotHEV and hev_thresh) routes each lane either to the simple filter
// (p0/q0 only) or to the strong filter (all three p/q pairs, weights 27/18/9).
// NOTE(review): GetNotHEV, GetBaseDelta, DoSimpleFilter, Update2Pixels and the
// FLIP_SIGN_BIT* macros are defined outside this view. There is no explicit
// sign un-flip at the end of this function; presumably the helpers restore the
// unsigned representation -- confirm against their definitions.
static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1,
                                  __m128i* const p0, __m128i* const q0,
                                  __m128i* const q1, __m128i* const q2,
                                  const __m128i* const mask, int hev_thresh) {
    const __m128i zero = _mm_setzero_si128();
    // Kept under this exact name: it is not used directly here and is
    // presumably referenced from inside the FLIP_SIGN_BIT* macros.
    const __m128i sign_bit = _mm_set1_epi8(0x80);
    __m128i not_hev;
    __m128i delta;

    // hev classification is taken from the pixels before the sign flip
    GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);

    // switch all six registers to the signed representation
    FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
    FLIP_SIGN_BIT2(*p2, *q2);

    // base filter value shared by both paths
    GetBaseDelta(p1, p0, q0, q1, &delta);

    {   // simple filter: selected lanes that have hev (~not_hev & mask)
        // non-const because its address is handed to the helper
        // (const was dropped in this port)
        __m128i hev_delta =
            _mm_and_si128(delta, _mm_andnot_si128(not_hev, *mask));
        DoSimpleFilter(p0, q0, &hev_delta);
    }

    {   // strong filter: selected lanes without hev (not_hev & mask)
        const __m128i times9 = _mm_set1_epi16(0x0900);   // 9 << 8
        const __m128i rounding = _mm_set1_epi16(63);

        const __m128i smooth_delta =
            _mm_and_si128(delta, _mm_and_si128(not_hev, *mask));

        // Interleaving with zero as the low byte yields (byte << 8) per
        // 16-bit lane; the high half of the product with (9 << 8) is byte * 9.
        const __m128i d9_lo =
            _mm_mulhi_epi16(_mm_unpacklo_epi8(zero, smooth_delta), times9);
        const __m128i d9_hi =
            _mm_mulhi_epi16(_mm_unpackhi_epi8(zero, smooth_delta), times9);

        // Weighted values are non-const because their addresses are handed
        // to Update2Pixels (const was dropped in this port).
        __m128i w9_lo = _mm_add_epi16(d9_lo, rounding);   // delta * 9 + 63
        __m128i w18_lo = _mm_add_epi16(w9_lo, d9_lo);     // delta * 18 + 63
        __m128i w27_lo = _mm_add_epi16(w18_lo, d9_lo);    // delta * 27 + 63

        __m128i w9_hi = _mm_add_epi16(d9_hi, rounding);   // delta * 9 + 63
        __m128i w18_hi = _mm_add_epi16(w9_hi, d9_hi);     // delta * 18 + 63
        __m128i w27_hi = _mm_add_epi16(w18_hi, d9_hi);    // delta * 27 + 63

        // outermost pair gets the smallest weight, innermost the largest
        Update2Pixels(p2, q2, &w9_lo, &w9_hi);
        Update2Pixels(p1, q1, &w18_lo, &w18_hi);
        Update2Pixels(p0, q0, &w27_lo, &w27_hi);
    }
}
예제 #4
0
파일: dec_sse2.c 프로젝트: 8l/insieme
// Applies filter on 4 pixels (p1, p0, q0 and q1)
//
// The four pixel registers are modified in place; *mask is read-only and
// zeroes the filter value in lanes that must stay untouched. not_hev (from
// GetNotHEV and hev_thresh) controls two things: the (p1 - q1) term only
// contributes where not_hev is clear, and p1/q1 are only adjusted where
// not_hev is set.
// NOTE(review): GetNotHEV, SignedShift8b and the FLIP_SIGN_BIT* macros are
// defined outside this view; per the original comments SignedShift8b performs
// a per-byte signed ">> 3" in place -- confirm against its definition.
static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
                                  __m128i* const q0, __m128i* const q1,
                                  const __m128i* const mask, int hev_thresh) {
    // Kept under this exact name: besides its direct use below it is
    // presumably referenced from inside the FLIP_SIGN_BIT* macros.
    const __m128i sign_bit = _mm_set1_epi8(0x80);
    const __m128i k64 = _mm_set1_epi8(0x40);
    const __m128i zero = _mm_setzero_si128();
    __m128i not_hev;
    __m128i base, step_p0, step_q0, step_outer;

    // hev classification is taken from the pixels before the sign flip
    GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);

    // switch to the signed representation
    FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);

    {   // base = mask & (hev(p1 - q1) + 3 * (q0 - p0)), built with saturating
        // adds whose order is kept exactly as is
        const __m128i outer_diff =
            _mm_andnot_si128(not_hev, _mm_subs_epi8(*p1, *q1));
        const __m128i inner_diff = _mm_subs_epi8(*q0, *p0);
        base = _mm_adds_epi8(outer_diff, inner_diff);
        base = _mm_adds_epi8(base, inner_diff);
        base = _mm_adds_epi8(base, inner_diff);
        base = _mm_and_si128(base, *mask);
    }

    // inner pixels: p0 gets the "+3" variant, q0 the "+4" variant
    step_p0 = _mm_adds_epi8(base, _mm_set1_epi8(3));
    step_q0 = _mm_adds_epi8(base, _mm_set1_epi8(4));
    SignedShift8b(&step_p0);           // (base + 3) >> 3
    SignedShift8b(&step_q0);           // (base + 4) >> 3
    *p0 = _mm_adds_epi8(*p0, step_p0);
    *q0 = _mm_subs_epi8(*q0, step_q0);
    FLIP_SIGN_BIT2(*p0, *q0);          // p0/q0 are final: back to unsigned

    // Signed (step_q0 + 1) >> 1 without a signed byte shift: bias into the
    // unsigned range, take the rounding average with zero, remove half the bias.
    step_outer = _mm_avg_epu8(_mm_add_epi8(step_q0, sign_bit), zero);
    step_outer = _mm_sub_epi8(step_outer, k64);

    // outer pixels are only adjusted where there is no hev
    step_outer = _mm_and_si128(not_hev, step_outer);
    *q1 = _mm_subs_epi8(*q1, step_outer);
    *p1 = _mm_adds_epi8(*p1, step_outer);
    FLIP_SIGN_BIT2(*p1, *q1);          // p1/q1 are final: back to unsigned
}