static void ConvertBGR24ToY_SSE41(const uint8_t* bgr, uint8_t* y, int width) { const int max_width = width & ~31; int i; for (i = 0; i < max_width; bgr += 3 * 16 * 2) { __m128i bgr_plane[6]; int j; RGB24PackedToPlanar_SSE41(bgr, bgr_plane); for (j = 0; j < 2; ++j, i += 16) { const __m128i zero = _mm_setzero_si128(); __m128i r, g, b, Y0, Y1; // Convert to 16-bit Y. b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero); g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero); r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero); ConvertRGBToY_SSE41(&r, &g, &b, &Y0); // Convert to 16-bit Y. b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero); g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero); r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero); ConvertRGBToY_SSE41(&r, &g, &b, &Y1); // Cast to 8-bit and store. STORE_16(_mm_packus_epi16(Y0, Y1), y + i); } } for (; i < width; ++i, bgr += 3) { // left-over y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF); } }
static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) { int i; for (i = 0; i + 8 <= width; i += 8, bgr += 3 * 8) { const uint8x8x3_t BGR = vld3_u8(bgr); const uint8x8_t Y = ConvertRGBToY_NEON(BGR.val[2], BGR.val[1], BGR.val[0]); vst1_u8(y + i, Y); } for (; i < width; ++i, bgr += 3) { // left-over y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF); } }
static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) { int i; for (i = 0; i + 8 <= width; i += 8, rgb += 3 * 8) { const uint8x8x3_t RGB = vld3_u8(rgb); const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[0], RGB.val[1], RGB.val[2]); vst1_u8(y + i, Y); } for (; i < width; ++i, rgb += 3) { // left-over y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF); } }
void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) { const int red = (background_rgb >> 16) & 0xff; const int green = (background_rgb >> 8) & 0xff; const int blue = (background_rgb >> 0) & 0xff; int x, y; if (pic == NULL) return; if (!pic->use_argb) { const int uv_width = (pic->width >> 1); // omit last pixel during u/v loop const int Y0 = VP8RGBToY(red, green, blue, YUV_HALF); // VP8RGBToU/V expects the u/v values summed over four pixels const int U0 = VP8RGBToU(4 * red, 4 * green, 4 * blue, 4 * YUV_HALF); const int V0 = VP8RGBToV(4 * red, 4 * green, 4 * blue, 4 * YUV_HALF); const int has_alpha = pic->colorspace & WEBP_CSP_ALPHA_BIT; if (!has_alpha || pic->a == NULL) return; // nothing to do for (y = 0; y < pic->height; ++y) { // Luma blending uint8_t* const y_ptr = pic->y + y * pic->y_stride; uint8_t* const a_ptr = pic->a + y * pic->a_stride; for (x = 0; x < pic->width; ++x) { const int alpha = a_ptr[x]; if (alpha < 0xff) { y_ptr[x] = BLEND(Y0, y_ptr[x], a_ptr[x]); } } // Chroma blending every even line if ((y & 1) == 0) { uint8_t* const u = pic->u + (y >> 1) * pic->uv_stride; uint8_t* const v = pic->v + (y >> 1) * pic->uv_stride; uint8_t* const a_ptr2 = (y + 1 == pic->height) ? a_ptr : a_ptr + pic->a_stride; for (x = 0; x < uv_width; ++x) { // Average four alpha values into a single blending weight. // TODO(skal): might lead to visible contouring. Can we do better? const int alpha = a_ptr[2 * x + 0] + a_ptr[2 * x + 1] + a_ptr2[2 * x + 0] + a_ptr2[2 * x + 1]; u[x] = BLEND_10BIT(U0, u[x], alpha); v[x] = BLEND_10BIT(V0, v[x], alpha); } if (pic->width & 1) { // rightmost pixel const int alpha = 2 * (a_ptr[2 * x + 0] + a_ptr2[2 * x + 0]); u[x] = BLEND_10BIT(U0, u[x], alpha); v[x] = BLEND_10BIT(V0, v[x], alpha); } } memset(a_ptr, 0xff, pic->width); } } else {