예제 #1
0
// Filters the horizontal edge located at 'p' (macroblock-edge variant).
// Four rows are loaded on each side of the edge (p3..p0 above, q0..q3 below);
// after filtering, the three rows nearest the edge on each side are written
// back, each as one unaligned 128-bit (16-byte) store.
//   p          : start of the first row below the edge (the q0 row)
//   stride     : byte offset between consecutive rows
//   thresh, ithresh, hev_thresh : thresholds handed on unchanged to
//                                 COMPLEX_FL_MASK and DoFilter6
static void VFilter16SSE2(uint8_t* p, int stride,
                          int thresh, int ithresh, int hev_thresh) {
    uint8_t* const above = p - 4 * stride;   // start of the p3 row
    __m128i outer;                           // holds p3 first, q3 afterwards
    __m128i row_p2, row_p1, row_p0;
    __m128i row_q0, row_q1, row_q2;
    __m128i flt_mask;

    // Rows above the edge (p3, p2, p1, p0) start the mask computation.
    LOAD_H_EDGES4(above, stride, outer, row_p2, row_p1, row_p0);
    MAX_DIFF1(outer, row_p2, row_p1, row_p0, flt_mask);

    // Rows below the edge (q0, q1, q2, q3) are folded into the same mask.
    // 'outer' is free to be reused for q3 since p3 is no longer needed.
    LOAD_H_EDGES4(p, stride, row_q0, row_q1, row_q2, outer);
    MAX_DIFF2(outer, row_q2, row_q1, row_q0, flt_mask);

    COMPLEX_FL_MASK(row_p1, row_p0, row_q0, row_q1, thresh, ithresh, flt_mask);
    DoFilter6(&row_p2, &row_p1, &row_p0, &row_q0, &row_q1, &row_q2,
              &flt_mask, hev_thresh);

    // Write back p2..p0 above the edge, then q0..q2 below it.
    _mm_storeu_si128((__m128i*)(p - 3 * stride), row_p2);
    _mm_storeu_si128((__m128i*)(p - 2 * stride), row_p1);
    _mm_storeu_si128((__m128i*)(p - 1 * stride), row_p0);
    _mm_storeu_si128((__m128i*)(p + 0 * stride), row_q0);
    _mm_storeu_si128((__m128i*)(p + 1 * stride), row_q1);
    _mm_storeu_si128((__m128i*)(p + 2 * stride), row_q2);
}
예제 #2
0
// Filters three inner horizontal edges, spaced four rows apart.
// On entry 'p' is the row four rows above the first edge to filter; for each
// edge, four rows on each side are loaded and the two rows nearest the edge
// on each side are written back as unaligned 128-bit (16-byte) stores.
//   p          : start of the first row of the first four-row span
//   stride     : byte offset between consecutive rows
//   thresh, ithresh, hev_thresh : thresholds handed on unchanged to
//                                 COMPLEX_FL_MASK and DoFilter4
static void VFilter16iSSE2(uint8_t* p, int stride,
                           int thresh, int ithresh, int hev_thresh) {
    __m128i far_row, mid_row;              // scratch: p3/p2, then q3/q2
    __m128i in_p1, in_p0, in_q0, in_q1;    // the four rows that get filtered
    __m128i flt_mask;
    int edges_left = 3;

    while (edges_left > 0) {
        uint8_t* const edge = p + 4 * stride;   // start of the q0 row

        // Rows above the edge: p3, p2, p1, p0.
        LOAD_H_EDGES4(p, stride, far_row, mid_row, in_p1, in_p0);
        MAX_DIFF1(far_row, mid_row, in_p1, in_p0, flt_mask);

        // Rows below the edge: q0, q1, q2, q3 (scratch registers reused).
        LOAD_H_EDGES4(edge, stride, in_q0, in_q1, mid_row, far_row);
        MAX_DIFF2(far_row, mid_row, in_q1, in_q0, flt_mask);

        COMPLEX_FL_MASK(in_p1, in_p0, in_q0, in_q1, thresh, ithresh, flt_mask);
        DoFilter4(&in_p1, &in_p0, &in_q0, &in_q1, &flt_mask, hev_thresh);

        // Write back the two rows on each side of the edge.
        _mm_storeu_si128((__m128i*)(edge - 2 * stride), in_p1);
        _mm_storeu_si128((__m128i*)(edge - 1 * stride), in_p0);
        _mm_storeu_si128((__m128i*)(edge + 0 * stride), in_q0);
        _mm_storeu_si128((__m128i*)(edge + 1 * stride), in_q1);

        // The rows below this edge become the rows above the next one.
        p = edge;
        --edges_left;
    }
}
예제 #3
0
파일: dec_sse2.c 프로젝트: 8l/insieme
// Filters three inner horizontal edges, spaced four rows apart, while
// loading every four-row span only once: the rows fetched below one edge are
// kept in registers and serve as the rows above the next edge.
//   p          : start of the first row of the first four-row span
//   stride     : byte offset between consecutive rows
//   thresh, ithresh, hev_thresh : thresholds handed on unchanged to
//                                 ComplexMask and DoFilter4
static void VFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
    // State carried from one iteration to the next: the four rows above the
    // edge currently being processed (p3, p2, p1, p0 in that order).
    __m128i cur3, cur2, cur1, cur0;
    int edge;

    // Prologue: fetch the span above the first edge.
    LOAD_H_EDGES4(p, stride, cur3, cur2, cur1, cur0);

    for (edge = 0; edge < 3; ++edge) {
        __m128i flt_mask, next1, next0;
        uint8_t* const out = p + 2 * stride;   // start of the p1 row
        p += 4 * stride;                       // now the q0 row

        // First half of the mask, from the rows above the edge.
        MAX_DIFF1(cur3, cur2, cur1, cur0, flt_mask);
        // Fetch the span below the edge: cur3/cur2 now receive q0/q1, and
        // next1/next0 receive q2/q3.
        LOAD_H_EDGES4(p, stride, cur3, cur2, next1, next0);
        MAX_DIFF2(cur3, cur2, next1, next0, flt_mask);

        // cur3 and cur2 play the q0/q1 role for this edge and, after being
        // passed through the filter, stay in place as p3/p2 for the next
        // edge. The q2/q3 rows held in next1/next0 become p1/p0 at the end
        // of the iteration.
        ComplexMask(&cur1, &cur0, &cur3, &cur2, thresh, ithresh, &flt_mask);
        DoFilter4(&cur1, &cur0, &cur3, &cur2, &flt_mask, hev_thresh);

        // Write back p1, p0, q0, q1.
        _mm_storeu_si128((__m128i*)(out + 0 * stride), cur1);
        _mm_storeu_si128((__m128i*)(out + 1 * stride), cur0);
        _mm_storeu_si128((__m128i*)(out + 2 * stride), cur3);
        _mm_storeu_si128((__m128i*)(out + 3 * stride), cur2);

        // Rotate: q2/q3 of this edge are p1/p0 of the next one.
        cur1 = next1;
        cur0 = next0;
    }
}