// Converts one row of 'len' pixels from planar Y/U/V samples to packed
// 3-bytes-per-pixel output in 'dst'.
//   y       : luma samples, one per pixel.
//   u, v    : chroma samples, one per pair of horizontally adjacent pixels
//             (the pointers advance only after odd-indexed pixels).
//   dst     : output, advanced by 3 bytes per pixel; presumably sized for
//             3 * len bytes -- confirm against the caller.
//   len     : number of pixels to convert; len <= 0 is a no-op.
static void YuvToRgbRowSSE2(const uint8_t* y, const uint8_t* u,
                            const uint8_t* v, uint8_t* dst, int len) {
  int n;
  // The tail below converts at least one pixel unconditionally, so an empty
  // (or negative) row must bail out here to avoid touching y[0]/u[0]/v[0]
  // and writing 3 bytes to 'dst'.
  if (len <= 0) return;
  // The SSE2 helper stomps 8 bytes at 'dst' although only 3 are kept, so it
  // is used for all but the last two pixels: the final iteration (n = len - 3)
  // writes up to byte offset 3 * len - 2, staying inside a 3 * len byte row.
  for (n = 0; n + 2 < len; ++n) {
    YuvToRgbSSE2(y[0], u[0], v[0], dst);  // stomps 8 bytes
    dst += 3;
    ++y;
    // Step to the next chroma sample once both pixels sharing it are done.
    u += (n & 1);
    v += (n & 1);
  }
  // Last one or two pixels: use the exact-width converter so nothing is
  // written past the end of the row.
  VP8YuvToRgb(y[0], u[0], v[0], dst);
  if (len > 1) {
    // 'n & 1' selects the next chroma sample when the previous pixel was the
    // second of its pair.
    VP8YuvToRgb(y[1], u[n & 1], v[n & 1], dst + 3);
  }
}
// Converts one row of 'len' pixels from planar Y/U/V samples to packed
// 3-bytes-per-pixel output in 'dst'. Chroma is shared by each pair of
// horizontally adjacent pixels. Full groups of 32 pixels go through the
// SSE4.1 helpers; whatever remains is converted one pixel at a time.
static void YuvToRgbRow_SSE41(const uint8_t* y, const uint8_t* u,
                              const uint8_t* v, uint8_t* dst, int len) {
  int pos = 0;

  // Vector path: 32 pixels per pass, handled as four helper calls whose
  // inputs are 8 luma / 4 chroma samples apart.
  while (pos + 32 <= len) {
    __m128i red_a, red_b, red_c, red_d;
    __m128i grn_a, grn_b, grn_c, grn_d;
    __m128i blu_a, blu_b, blu_c, blu_d;
    __m128i r_lo, r_hi, g_lo, g_hi, b_lo, b_hi;

    YUV420ToRGB_SSE41(y, u, v, &red_a, &grn_a, &blu_a);
    YUV420ToRGB_SSE41(y + 8, u + 4, v + 4, &red_b, &grn_b, &blu_b);
    YUV420ToRGB_SSE41(y + 16, u + 8, v + 8, &red_c, &grn_c, &blu_c);
    YUV420ToRGB_SSE41(y + 24, u + 12, v + 12, &red_d, &grn_d, &blu_d);

    // Narrow the 16-bit channel values to 8 bits, giving two registers per
    // channel (planar RRRR.. GGGG.. BBBB.. layout).
    r_lo = _mm_packus_epi16(red_a, red_b);
    r_hi = _mm_packus_epi16(red_c, red_d);
    g_lo = _mm_packus_epi16(grn_a, grn_b);
    g_hi = _mm_packus_epi16(grn_c, grn_d);
    b_lo = _mm_packus_epi16(blu_a, blu_b);
    b_hi = _mm_packus_epi16(blu_c, blu_d);

    // Interleave the planes into RGBRGBRGB... and write them to 'dst'.
    PlanarTo24b_SSE41(&r_lo, &r_hi, &g_lo, &g_hi, &b_lo, &b_hi, dst);

    y += 32;
    u += 16;
    v += 16;
    dst += 32 * 3;
    pos += 32;
  }

  // Scalar tail: finish the remaining (fewer than 32) pixels.
  while (pos < len) {
    VP8YuvToRgb(*y, *u, *v, dst);
    dst += 3;
    ++y;
    // Move to the next chroma sample after the second pixel of each pair.
    if (pos & 1) {
      ++u;
      ++v;
    }
    ++pos;
  }
}