// Converts one row of `len` samples from separate y/u/v planes into packed
// 3-byte pixels written consecutively to `dst` (3 * len bytes in total).
//
//   y, u, v : input planes. `y` advances by one per pixel; `u` and `v` advance
//             only after odd-indexed pixels, so each u/v sample is shared by
//             two consecutive pixels.
//   dst     : output buffer, advanced by 3 bytes per pixel.
//   len     : number of pixels in the row. Values <= 0 are a no-op.
//
// All but the last two pixels go through YuvToBgrSSE2(), which (per the
// original comment) stores 8 bytes at `dst` even though only 3 are kept. The
// loop therefore stops two pixels early so that over-wide store never reaches
// past the end of the row, and the last one or two pixels are produced by
// VP8YuvToBgr() instead.
static void YuvToBgrRowSSE2(const uint8_t* y,
                            const uint8_t* u, const uint8_t* v,
                            uint8_t* dst, int len) {
  int n;
  // Empty row: without this guard the tail code below would still read
  // y[0]/u[0]/v[0] and write one pixel into dst.
  if (len <= 0) return;
  for (n = 0; n + 2 < len; ++n) {   // we directly stomp the *dst memory
    YuvToBgrSSE2(y[0], u[0], v[0], dst);  // stomps 8 bytes
    dst += 3;
    ++y;
    // Step to the next chroma sample only after an odd-indexed pixel.
    u += (n & 1);
    v += (n & 1);
  }
  // Tail: the remaining one or two pixels.
  VP8YuvToBgr(y[0], u[0], v[0], dst + 0);
  if (len > 1) {
    // If the pixel just written had an odd index, the final pixel starts a
    // new chroma pair and must use the next u/v sample.
    VP8YuvToBgr(y[1], u[n & 1], v[n & 1], dst + 3);
  }
}
// Converts one row of `len` samples from separate y/u/v planes into packed
// 3-byte pixels written consecutively to `dst`.
//
//   y, u, v : input planes; u/v hold one sample per two y samples.
//   dst     : output buffer, 3 bytes per pixel.
//   len     : number of pixels in the row.
//
// Full groups of 32 pixels take the vector path; whatever is left (0..31
// pixels) is converted one pixel at a time with VP8YuvToBgr().
static void YuvToBgrRow_SSE41(const uint8_t* y,
                              const uint8_t* u, const uint8_t* v,
                              uint8_t* dst, int len) {
  int pos = 0;

  // Vector path: 32 luma samples and 16 u/v samples per iteration.
  while (pos + 32 <= len) {
    __m128i red0, red1, red2, red3;
    __m128i grn0, grn1, grn2, grn3;
    __m128i blu0, blu1, blu2, blu3;
    __m128i plane0, plane1, plane2, plane3, plane4, plane5;

    // Four conversions, each fed inputs offset by 8 luma / 4 chroma samples.
    YUV420ToRGB_SSE41(y +  0, u +  0, v +  0, &red0, &grn0, &blu0);
    YUV420ToRGB_SSE41(y +  8, u +  4, v +  4, &red1, &grn1, &blu1);
    YUV420ToRGB_SSE41(y + 16, u +  8, v +  8, &red2, &grn2, &blu2);
    YUV420ToRGB_SSE41(y + 24, u + 12, v + 12, &red3, &grn3, &blu3);

    // Narrow the 16-bit lanes to 8 bits; the six registers now hold the
    // channels in planar order: blue, blue, green, green, red, red.
    plane0 = _mm_packus_epi16(blu0, blu1);
    plane1 = _mm_packus_epi16(blu2, blu3);
    plane2 = _mm_packus_epi16(grn0, grn1);
    plane3 = _mm_packus_epi16(grn2, grn3);
    plane4 = _mm_packus_epi16(red0, red1);
    plane5 = _mm_packus_epi16(red2, red3);

    // Interleave the planes into BGRBGR... and store them at dst.
    PlanarTo24b_SSE41(&plane0, &plane1, &plane2, &plane3, &plane4, &plane5,
                      dst);

    y += 32;
    u += 16;
    v += 16;
    dst += 32 * 3;
    pos += 32;
  }

  // Scalar path for the leftover pixels.
  while (pos < len) {
    // u/v move forward only after an odd-indexed pixel.
    const int odd = pos & 1;
    VP8YuvToBgr(*y, *u, *v, dst);
    dst += 3;
    ++y;
    u += odd;
    v += odd;
    ++pos;
  }
}