Example #1
// on three inner edges
static void VFilter16iSSE2(uint8_t* p, int stride,
                           int thresh, int ithresh, int hev_thresh) {
    int k;
    __m128i mask;
    __m128i t1, t2, p1, p0, q0, q1;

    for (k = 3; k > 0; --k) {
        // Load p3, p2, p1, p0
        LOAD_H_EDGES4(p, stride, t2, t1, p1, p0);
        MAX_DIFF1(t2, t1, p1, p0, mask);

        p += 4 * stride;

        // Load q0, q1, q2, q3
        LOAD_H_EDGES4(p, stride, q0, q1, t1, t2);
        MAX_DIFF2(t2, t1, q1, q0, mask);

        COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
        DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);

        // Store
        _mm_storeu_si128((__m128i*)&p[-2 * stride], p1);
        _mm_storeu_si128((__m128i*)&p[-1 * stride], p0);
        _mm_storeu_si128((__m128i*)&p[0 * stride], q0);
        _mm_storeu_si128((__m128i*)&p[1 * stride], q1);
    }
}
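
The LOAD_H_EDGES4 macro is defined elsewhere in libwebp's src/dsp/dec_sse2.c and is not shown in these snippets. Judging from the matching _mm_storeu_si128() stores above, it presumably amounts to four unaligned 16-byte row loads, roughly as in the following sketch (the exact form is an assumption, not the library's verbatim code):

#include <emmintrin.h>  // SSE2 intrinsics

// Hypothetical sketch of LOAD_H_EDGES4: load four consecutive 16-pixel rows.
#define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) do {              \
    (e1) = _mm_loadu_si128((const __m128i*)&(p)[0 * (stride)]);    \
    (e2) = _mm_loadu_si128((const __m128i*)&(p)[1 * (stride)]);    \
    (e3) = _mm_loadu_si128((const __m128i*)&(p)[2 * (stride)]);    \
    (e4) = _mm_loadu_si128((const __m128i*)&(p)[3 * (stride)]);    \
} while (0)
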
Example #2
static void VFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
                          int thresh, int ithresh, int hev_thresh) {
    __m128i mask;
    __m128i t1, t2, p1, p0, q0, q1;

    // Load p3, p2, p1, p0
    LOADUV_H_EDGES4(u, v, stride, t2, t1, p1, p0);
    MAX_DIFF1(t2, t1, p1, p0, mask);

    u += 4 * stride;
    v += 4 * stride;

    // Load q0, q1, q2, q3
    LOADUV_H_EDGES4(u, v, stride, q0, q1, t1, t2);
    MAX_DIFF2(t2, t1, q1, q0, mask);

    COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);

    // Store
    STOREUV(p1, u, v, -2 * stride);
    STOREUV(p0, u, v, -1 * stride);
    STOREUV(q0, u, v, 0 * stride);
    STOREUV(q1, u, v, 1 * stride);
}
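
LOADUV_H_EDGES4 and STOREUV are likewise defined elsewhere in dec_sse2.c. Since the chroma planes are 8 pixels wide, each 128-bit register here presumably carries one u row in its low 8 bytes and the matching v row in its high 8 bytes. A hypothetical sketch of a per-row load helper and of STOREUV under that assumption (LOADUV_H_EDGES4 would apply the load to four consecutive rows); it uses the same <emmintrin.h> intrinsics:

// Hypothetical sketch (assumed layout: u row in the low half, v row in the
// high half of each register).
#define LOADUV_H_EDGE(e, u, v, offset) do {                              \
    const __m128i U = _mm_loadl_epi64((const __m128i*)&(u)[(offset)]);   \
    const __m128i V = _mm_loadl_epi64((const __m128i*)&(v)[(offset)]);   \
    (e) = _mm_unpacklo_epi64(U, V);                                      \
} while (0)

#define STOREUV(e, u, v, offset) do {                                    \
    _mm_storel_epi64((__m128i*)&(u)[(offset)], (e));                     \
    _mm_storel_epi64((__m128i*)&(v)[(offset)], _mm_srli_si128((e), 8));  \
} while (0)
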
Example #3
static void HFilter16iSSE2(uint8_t* p, int stride,
                           int thresh, int ithresh, int hev_thresh) {
    int k;
    uint8_t* b;
    __m128i mask;
    __m128i t1, t2, p1, p0, q0, q1;

    for (k = 3; k > 0; --k) {
        b = p;
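        // Load16x4() presumably reads a 16-row x 4-column block and transposes it,
        // so that each output register holds one pixel column; this is what lets
        // the same DoFilter4() code work across a vertical edge.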
        Load16x4(b, b + 8 * stride, stride, &t2, &t1, &p1, &p0);  // p3, p2, p1, p0
        MAX_DIFF1(t2, t1, p1, p0, mask);

        b += 4;  // beginning of q0
        Load16x4(b, b + 8 * stride, stride, &q0, &q1, &t1, &t2);  // q0, q1, q2, q3
        MAX_DIFF2(t2, t1, q1, q0, mask);

        COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
        DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);

        b -= 2;  // beginning of p1
        Store16x4(b, b + 8 * stride, stride, &p1, &p0, &q0, &q1);

        p += 4;
    }
}
Example #4
// on macroblock edges
static void VFilter16SSE2(uint8_t* p, int stride,
                          int thresh, int ithresh, int hev_thresh) {
    __m128i t1;
    __m128i mask;
    __m128i p2, p1, p0, q0, q1, q2;

    // Load p3, p2, p1, p0
    LOAD_H_EDGES4(p - 4 * stride, stride, t1, p2, p1, p0);
    MAX_DIFF1(t1, p2, p1, p0, mask);

    // Load q0, q1, q2, q3
    LOAD_H_EDGES4(p, stride, q0, q1, q2, t1);
    MAX_DIFF2(t1, q2, q1, q0, mask);

    COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
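    // DoFilter6() is the stronger, macroblock-edge filter: unlike DoFilter4(),
    // it also updates p2 and q2, hence the six stores below.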
    DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

    // Store
    _mm_storeu_si128((__m128i*)&p[-3 * stride], p2);
    _mm_storeu_si128((__m128i*)&p[-2 * stride], p1);
    _mm_storeu_si128((__m128i*)&p[-1 * stride], p0);
    _mm_storeu_si128((__m128i*)&p[0 * stride], q0);
    _mm_storeu_si128((__m128i*)&p[1 * stride], q1);
    _mm_storeu_si128((__m128i*)&p[2 * stride], q2);
}
Example #5
static void HFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
    int k;
    __m128i p3, p2, p1, p0;   // loop invariants

    Load16x4(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0);  // prologue

    for (k = 3; k > 0; --k) {
        __m128i mask, tmp1, tmp2;
        uint8_t* const b = p + 2;   // beginning of p1

        p += 4;  // beginning of q0 (and next span)

        MAX_DIFF1(p3, p2, p1, p0, mask);   // compute partial mask
        Load16x4(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2);
        MAX_DIFF2(p3, p2, tmp1, tmp2, mask);

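        // As in VFilter16i() (Example #6), p3/p2 now hold q0/q1 of the current
        // edge and will be re-used as p3/p2 of the next span; tmp1/tmp2 (q2/q3)
        // become the next p1/p0 in the rotation below.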
        ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
        DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);

        Store16x4(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride);

        // rotate samples
        p1 = tmp1;
        p0 = tmp2;
    }
}
Example #6
// on three inner edges
static void VFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
    int k;
    __m128i p3, p2, p1, p0;   // loop invariants

    LOAD_H_EDGES4(p, stride, p3, p2, p1, p0);  // prologue

    for (k = 3; k > 0; --k) {
        __m128i mask, tmp1, tmp2;
        uint8_t* const b = p + 2 * stride;   // beginning of p1
        p += 4 * stride;

        MAX_DIFF1(p3, p2, p1, p0, mask);   // compute partial mask
        LOAD_H_EDGES4(p, stride, p3, p2, tmp1, tmp2);
        MAX_DIFF2(p3, p2, tmp1, tmp2, mask);

        // p3 and p2 are not just temporary variables here: they will be
        // re-used for next span. And q2/q3 will become p1/p0 accordingly.
        ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
        DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);

        // Store
        _mm_storeu_si128((__m128i*)&b[0 * stride], p1);
        _mm_storeu_si128((__m128i*)&b[1 * stride], p0);
        _mm_storeu_si128((__m128i*)&b[2 * stride], p3);
        _mm_storeu_si128((__m128i*)&b[3 * stride], p2);

        // rotate samples
        p1 = tmp1;
        p0 = tmp2;
    }
}
Example #7
static void HFilter16SSE2(uint8_t* p, int stride,
                          int thresh, int ithresh, int hev_thresh) {
    __m128i mask;
    __m128i p3, p2, p1, p0, q0, q1, q2, q3;

    uint8_t* const b = p - 4;
    Load16x4(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0);  // p3, p2, p1, p0
    MAX_DIFF1(p3, p2, p1, p0, mask);

    Load16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);  // q0, q1, q2, q3
    MAX_DIFF2(q3, q2, q1, q0, mask);

    COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
    DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

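    // p3 and q3 are left unchanged by DoFilter6(), but storing them back lets
    // the whole 8-column strip be written with just two Store16x4() calls.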
    Store16x4(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0);
    Store16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);
}
Example #8
static void HFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
                          int thresh, int ithresh, int hev_thresh) {
    __m128i mask;
    __m128i t1, t2, p1, p0, q0, q1;
    Load16x4(u, v, stride, &t2, &t1, &p1, &p0);   // p3, p2, p1, p0
    MAX_DIFF1(t2, t1, p1, p0, mask);

    u += 4;  // beginning of q0
    v += 4;
    Load16x4(u, v, stride, &q0, &q1, &t1, &t2);  // q0, q1, q2, q3
    MAX_DIFF2(t2, t1, q1, q0, mask);

    COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);

    u -= 2;  // beginning of p1
    v -= 2;
    Store16x4(u, v, stride, &p1, &p0, &q0, &q1);
}
Example #9
static void HFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
                         int thresh, int ithresh, int hev_thresh) {
    __m128i mask;
    __m128i p3, p2, p1, p0, q0, q1, q2, q3;

    uint8_t* const tu = u - 4;
    uint8_t* const tv = v - 4;
    Load16x4(tu, tv, stride, &p3, &p2, &p1, &p0);  // p3, p2, p1, p0
    MAX_DIFF1(p3, p2, p1, p0, mask);

    Load16x4(u, v, stride, &q0, &q1, &q2, &q3);    // q0, q1, q2, q3
    MAX_DIFF2(q3, q2, q1, q0, mask);

    COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
    DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

    Store16x4(tu, tv, stride, &p3, &p2, &p1, &p0);
    Store16x4(u, v, stride, &q0, &q1, &q2, &q3);
}
Example #10
// 8-pixels wide variant, for chroma filtering
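// This is the chroma macroblock-edge filter (DoFilter6, with the p samples
// loaded from u - 4 * stride); the inner-edge counterpart is VFilter8iSSE2
// in Example #2.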
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
    __m128i mask;
    __m128i t1, p2, p1, p0, q0, q1, q2;

    // Load p3, p2, p1, p0
    LOADUV_H_EDGES4(u - 4 * stride, v - 4 * stride, stride, t1, p2, p1, p0);
    MAX_DIFF1(t1, p2, p1, p0, mask);

    // Load q0, q1, q2, q3
    LOADUV_H_EDGES4(u, v, stride, q0, q1, q2, t1);
    MAX_DIFF2(t1, q2, q1, q0, mask);

    ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
    DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

    // Store
    STOREUV(p2, u, v, -3 * stride);
    STOREUV(p1, u, v, -2 * stride);
    STOREUV(p0, u, v, -1 * stride);
    STOREUV(q0, u, v, 0 * stride);
    STOREUV(q1, u, v, 1 * stride);
    STOREUV(q2, u, v, 2 * stride);
}