// Filters the horizontal edge lying 4 rows below the incoming 'u' and 'v'
// pointers (the 'i' suffix presumably stands for "inner" edge -- confirm
// against the caller). Both planes are handled in the same registers.
//
//   u, v        first of the rows handed to LOADUV_H_EDGES4 for the 'p' side
//   stride      distance in bytes between consecutive rows
//   thresh,
//   ithresh     thresholds forwarded to COMPLEX_FL_MASK
//   hev_thresh  threshold forwarded to DoFilter4
//
// Only p1, p0, q0 and q1 are written back; the outer rows on each side are
// used solely to build the filtering mask.
static void VFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
                          int thresh, int ithresh, int hev_thresh) {
  // Base of the 'q' side. The q loads and every store are expressed
  // relative to these pointers.
  uint8_t* const u_edge = u + 4 * stride;
  uint8_t* const v_edge = v + 4 * stride;
  __m128i p3, p2, p1, p0;
  __m128i q0, q1, q2, q3;
  __m128i mask;

  // 'p' side: p3, p2, p1, p0. MAX_DIFF1 starts the mask.
  LOADUV_H_EDGES4(u, v, stride, p3, p2, p1, p0);
  MAX_DIFF1(p3, p2, p1, p0, mask);

  // 'q' side: q0, q1, q2, q3. MAX_DIFF2 is applied to the same mask.
  LOADUV_H_EDGES4(u_edge, v_edge, stride, q0, q1, q2, q3);
  MAX_DIFF2(q3, q2, q1, q0, mask);

  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);

  // Write back the two rows on each side of the edge.
  STOREUV(p1, u_edge, v_edge, -2 * stride);
  STOREUV(p0, u_edge, v_edge, -1 * stride);
  STOREUV(q0, u_edge, v_edge, 0 * stride);
  STOREUV(q1, u_edge, v_edge, 1 * stride);
}
// 8-pixels wide variant, for chroma filtering static void VFilter8(uint8_t* u, uint8_t* v, int stride, int thresh, int ithresh, int hev_thresh) { __m128i mask; __m128i t1, p2, p1, p0, q0, q1, q2; // Load p3, p2, p1, p0 LOADUV_H_EDGES4(u - 4 * stride, v - 4 * stride, stride, t1, p2, p1, p0); MAX_DIFF1(t1, p2, p1, p0, mask); // Load q0, q1, q2, q3 LOADUV_H_EDGES4(u, v, stride, q0, q1, q2, t1); MAX_DIFF2(t1, q2, q1, q0, mask); ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask); DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh); // Store STOREUV(p2, u, v, -3 * stride); STOREUV(p1, u, v, -2 * stride); STOREUV(p0, u, v, -1 * stride); STOREUV(q0, u, v, 0 * stride); STOREUV(q1, u, v, 1 * stride); STOREUV(q2, u, v, 2 * stride); }