// on three inner edges static void VFilter16iSSE2(uint8_t* p, int stride, int thresh, int ithresh, int hev_thresh) { int k; __m128i mask; __m128i t1, t2, p1, p0, q0, q1; for (k = 3; k > 0; --k) { // Load p3, p2, p1, p0 LOAD_H_EDGES4(p, stride, t2, t1, p1, p0); MAX_DIFF1(t2, t1, p1, p0, mask); p += 4 * stride; // Load q0, q1, q2, q3 LOAD_H_EDGES4(p, stride, q0, q1, t1, t2); MAX_DIFF2(t2, t1, q1, q0, mask); COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask); DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh); // Store _mm_storeu_si128((__m128i*)&p[-2 * stride], p1); _mm_storeu_si128((__m128i*)&p[-1 * stride], p0); _mm_storeu_si128((__m128i*)&p[0 * stride], q0); _mm_storeu_si128((__m128i*)&p[1 * stride], q1); } }
// Complex in-loop filter on the single inner horizontal edge (between rows
// 3|4) of the two 8x8 chroma planes. U and V are processed together —
// presumably packed side-by-side into one 16-byte register by
// LOADUV_H_EDGES4 and split again by STOREUV (confirm against their
// definitions). Parameters mirror VFilter16iSSE2.
static void VFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
                          int thresh, int ithresh, int hev_thresh) {
  __m128i mask;                       // filter on/off decision per pixel
  __m128i t1, t2, p1, p0, q0, q1;

  // Load p3, p2, p1, p0 (t2/t1 hold p3/p2, only needed for the mask)
  LOADUV_H_EDGES4(u, v, stride, t2, t1, p1, p0);
  MAX_DIFF1(t2, t1, p1, p0, mask);    // start accumulating the mask
  u += 4 * stride;                    // advance both planes to q0
  v += 4 * stride;

  // Load q0, q1, q2, q3 (t1/t2 hold q2/q3)
  LOADUV_H_EDGES4(u, v, stride, q0, q1, t1, t2);
  MAX_DIFF2(t2, t1, q1, q0, mask);    // finish accumulating the mask

  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);

  // Store the four filtered rows back into both planes.
  STOREUV(p1, u, v, -2 * stride);
  STOREUV(p0, u, v, -1 * stride);
  STOREUV(q0, u, v, 0 * stride);
  STOREUV(q1, u, v, 1 * stride);
}
// Complex in-loop filter on the three inner *vertical* edges of a 16x16 luma
// macroblock (between columns 3|4, 7|8 and 11|12). Columns are gathered into
// registers via Load16x4 / scattered back via Store16x4 — presumably a
// 16-rows-by-4-columns transpose; confirm against their definitions.
// Parameters mirror VFilter16iSSE2.
static void HFilter16iSSE2(uint8_t* p, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  int k;
  uint8_t* b;                         // cursor within the current 4-column span
  __m128i mask;
  __m128i t1, t2, p1, p0, q0, q1;

  for (k = 3; k > 0; --k) {           // one iteration per inner edge
    b = p;
    Load16x4(b, b + 8 * stride, stride, &t2, &t1, &p1, &p0);  // p3, p2, p1, p0
    MAX_DIFF1(t2, t1, p1, p0, mask);  // start accumulating the mask

    b += 4;                           // beginning of q0
    Load16x4(b, b + 8 * stride, stride, &q0, &q1, &t1, &t2);  // q0, q1, q2, q3
    MAX_DIFF2(t2, t1, q1, q0, mask);  // finish accumulating the mask

    COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);

    b -= 2;                           // beginning of p1
    Store16x4(b, b + 8 * stride, stride, &p1, &p0, &q0, &q1);

    p += 4;                           // next edge, 4 columns to the right
  }
}
// on macroblock edges static void VFilter16SSE2(uint8_t* p, int stride, int thresh, int ithresh, int hev_thresh) { __m128i t1; __m128i mask; __m128i p2, p1, p0, q0, q1, q2; // Load p3, p2, p1, p0 LOAD_H_EDGES4(p - 4 * stride, stride, t1, p2, p1, p0); MAX_DIFF1(t1, p2, p1, p0, mask); // Load q0, q1, q2, q3 LOAD_H_EDGES4(p, stride, q0, q1, q2, t1); MAX_DIFF2(t1, q2, q1, q0, mask); COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask); DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh); // Store _mm_storeu_si128((__m128i*)&p[-3 * stride], p2); _mm_storeu_si128((__m128i*)&p[-2 * stride], p1); _mm_storeu_si128((__m128i*)&p[-1 * stride], p0); _mm_storeu_si128((__m128i*)&p[0 * stride], q0); _mm_storeu_si128((__m128i*)&p[1 * stride], q1); _mm_storeu_si128((__m128i*)&p[2 * stride], q2); }
// Pipelined variant of the inner-vertical-edge luma filter: the four columns
// loaded for the q side of one edge are reused as the p side of the next
// edge, so each 4-column span is transposed only once. After the second
// Load16x4 below, p3/p2 hold the freshly loaded q0/q1 (kept for the next
// span) and tmp1/tmp2 hold q2/q3 (rotated into p1/p0 at loop end).
// NOTE(review): Store16x4 here takes registers first, then pointers+stride —
// the opposite order from the *SSE2-suffixed functions in this file; confirm
// which helper signature is actually in scope.
static void HFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  int k;
  __m128i p3, p2, p1, p0;   // loop invariants: carried from span to span

  Load16x4(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0);  // prologue

  for (k = 3; k > 0; --k) {           // one iteration per inner edge
    __m128i mask, tmp1, tmp2;
    uint8_t* const b = p + 2;   // beginning of p1
    p += 4;  // beginning of q0 (and next span)

    MAX_DIFF1(p3, p2, p1, p0, mask);   // compute partial mask
    Load16x4(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2);
    MAX_DIFF2(p3, p2, tmp1, tmp2, mask);  // finish the mask with q0..q3

    // Here p3/p2 actually contain q0/q1 for the current edge.
    ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
    DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
    Store16x4(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride);

    // rotate samples: q2/q3 become p1/p0 of the next span
    p1 = tmp1;
    p0 = tmp2;
  }
}
// on three inner edges static void VFilter16i(uint8_t* p, int stride, int thresh, int ithresh, int hev_thresh) { int k; __m128i p3, p2, p1, p0; // loop invariants LOAD_H_EDGES4(p, stride, p3, p2, p1, p0); // prologue for (k = 3; k > 0; --k) { __m128i mask, tmp1, tmp2; uint8_t* const b = p + 2 * stride; // beginning of p1 p += 4 * stride; MAX_DIFF1(p3, p2, p1, p0, mask); // compute partial mask LOAD_H_EDGES4(p, stride, p3, p2, tmp1, tmp2); MAX_DIFF2(p3, p2, tmp1, tmp2, mask); // p3 and p2 are not just temporary variables here: they will be // re-used for next span. And q2/q3 will become p1/p0 accordingly. ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask); DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh); // Store _mm_storeu_si128((__m128i*)&b[0 * stride], p1); _mm_storeu_si128((__m128i*)&b[1 * stride], p0); _mm_storeu_si128((__m128i*)&b[2 * stride], p3); _mm_storeu_si128((__m128i*)&b[3 * stride], p2); // rotate samples p1 = tmp1; p0 = tmp2; } }
// Complex in-loop filter on the *macroblock* left edge of a 16x16 luma
// block: p points at the first column of the current macroblock, so columns
// to its left (p - 4 ..) belong to the neighbor. Uses the stronger 6-tap
// DoFilter6 and writes back all eight transposed columns.
// Parameters mirror VFilter16iSSE2.
static void HFilter16SSE2(uint8_t* p, int stride,
                          int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;

  uint8_t* const b = p - 4;           // leftmost of the 4 neighbor columns
  Load16x4(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0);  // p3, p2, p1, p0
  MAX_DIFF1(p3, p2, p1, p0, mask);    // start accumulating the mask

  Load16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);  // q0, q1, q2, q3
  MAX_DIFF2(q3, q2, q1, q0, mask);    // finish accumulating the mask

  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

  // Store both 4-column groups back (p3/q3 are unmodified by DoFilter6 but
  // rewritten as part of the transposed store).
  Store16x4(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0);
  Store16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);
}
// Complex in-loop filter on the single inner *vertical* edge (between
// columns 3|4) of the two 8x8 chroma planes. Load16x4/Store16x4 take the U
// and V plane pointers directly — presumably 8 rows from each plane fill the
// 16 lanes; confirm against the helper definitions.
// Parameters mirror VFilter16iSSE2.
static void HFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
                          int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i t1, t2, p1, p0, q0, q1;

  Load16x4(u, v, stride, &t2, &t1, &p1, &p0);   // p3, p2, p1, p0
  MAX_DIFF1(t2, t1, p1, p0, mask);    // start accumulating the mask

  u += 4;  // beginning of q0
  v += 4;
  Load16x4(u, v, stride, &q0, &q1, &t1, &t2);   // q0, q1, q2, q3
  MAX_DIFF2(t2, t1, q1, q0, mask);    // finish accumulating the mask

  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);

  u -= 2;  // beginning of p1
  v -= 2;
  Store16x4(u, v, stride, &p1, &p0, &q0, &q1);
}
// Complex in-loop filter on the *macroblock* left edge of the two 8x8 chroma
// planes: u/v point at the first column of the current block, so the four
// columns to the left (u - 4, v - 4) belong to the neighbor. Uses the
// stronger 6-tap DoFilter6. Parameters mirror VFilter16iSSE2.
static void HFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
                         int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;

  uint8_t* const tu = u - 4;          // neighbor columns, U plane
  uint8_t* const tv = v - 4;          // neighbor columns, V plane
  Load16x4(tu, tv, stride, &p3, &p2, &p1, &p0);  // p3, p2, p1, p0
  MAX_DIFF1(p3, p2, p1, p0, mask);    // start accumulating the mask

  Load16x4(u, v, stride, &q0, &q1, &q2, &q3);    // q0, q1, q2, q3
  MAX_DIFF2(q3, q2, q1, q0, mask);    // finish accumulating the mask

  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

  // Store both 4-column groups back into each plane.
  Store16x4(tu, tv, stride, &p3, &p2, &p1, &p0);
  Store16x4(u, v, stride, &q0, &q1, &q2, &q3);
}
// 8-pixels wide variant, for chroma filtering static void VFilter8(uint8_t* u, uint8_t* v, int stride, int thresh, int ithresh, int hev_thresh) { __m128i mask; __m128i t1, p2, p1, p0, q0, q1, q2; // Load p3, p2, p1, p0 LOADUV_H_EDGES4(u - 4 * stride, v - 4 * stride, stride, t1, p2, p1, p0); MAX_DIFF1(t1, p2, p1, p0, mask); // Load q0, q1, q2, q3 LOADUV_H_EDGES4(u, v, stride, q0, q1, q2, t1); MAX_DIFF2(t1, q2, q1, q0, mask); ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask); DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh); // Store STOREUV(p2, u, v, -3 * stride); STOREUV(p1, u, v, -2 * stride); STOREUV(p0, u, v, -1 * stride); STOREUV(q0, u, v, 0 * stride); STOREUV(q1, u, v, 1 * stride); STOREUV(q2, u, v, 2 * stride); }