static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels, uint8_t* dst) { const __m128i* in = (const __m128i*)src; __m128i* out = (__m128i*)dst; while (num_pixels >= 32) { // Load the BGRA buffers. __m128i in0 = _mm_loadu_si128(in + 0); __m128i in1 = _mm_loadu_si128(in + 1); __m128i in2 = _mm_loadu_si128(in + 2); __m128i in3 = _mm_loadu_si128(in + 3); __m128i in4 = _mm_loadu_si128(in + 4); __m128i in5 = _mm_loadu_si128(in + 5); __m128i in6 = _mm_loadu_si128(in + 6); __m128i in7 = _mm_loadu_si128(in + 7); VP8L32bToPlanar_SSE2(&in0, &in1, &in2, &in3); VP8L32bToPlanar_SSE2(&in4, &in5, &in6, &in7); // At this points, in1/in5 contains red only, in2/in6 green only ... // Pack the colors in 24b RGB. VP8PlanarTo24b_SSE2(&in1, &in5, &in2, &in6, &in3, &in7); _mm_storeu_si128(out + 0, in1); _mm_storeu_si128(out + 1, in5); _mm_storeu_si128(out + 2, in2); _mm_storeu_si128(out + 3, in6); _mm_storeu_si128(out + 4, in3); _mm_storeu_si128(out + 5, in7); in += 8; out += 6; num_pixels -= 32; } // left-overs if (num_pixels > 0) { VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out); } }
// NEON conversion of 'num_pixels' 32-bit BGRA pixels from 'src' into packed
// 24-bit RGB bytes at 'dst'. Pixels are processed 16 at a time; the remaining
// (num_pixels & 15) pixels are handed to VP8LConvertBGRAToRGB_C.
static void ConvertBGRAToRGB(const uint32_t* src,
                             int num_pixels, uint8_t* dst) {
  const uint32_t* const end = src + (num_pixels & ~15);
  for (; src < end; src += 16) {
    // De-interleaving load: 64 bytes are split into four 16-byte vectors,
    // val[k] receiving byte k of every pixel. The cast keeps the 'const'
    // qualifier of 'src' (the intrinsic only reads through the pointer).
    const uint8x16x4_t pixel = vld4q_u8((const uint8_t*)src);
    // Keep byte lanes 2, 1, 0 in that order (R, G, B going by the BGRA byte
    // order in the function's name); lane 3 is dropped.
    const uint8x16x3_t tmp = { { pixel.val[2], pixel.val[1], pixel.val[0] } };
    // Interleaving store: writes 16 pixels * 3 bytes = 48 bytes.
    vst3q_u8(dst, tmp);
    dst += 16 * 3;
  }
  VP8LConvertBGRAToRGB_C(src, num_pixels & 15, dst);  // left-overs
}
// NEON conversion of 'num_pixels' 32-bit BGRA pixels from 'src' into packed
// 24-bit bytes at 'dst', built on 64-bit loads and vtbl4 table lookups.
// Pixels are processed 8 at a time; the remaining (num_pixels & 7) pixels are
// handed to VP8LConvertBGRAToRGB_C.
static void ConvertBGRAToRGB(const uint32_t* src,
                             int num_pixels, uint8_t* dst) {
  // Three 8-byte index vectors, one per 8 output bytes.
  // NOTE(review): kRGBShuffle is defined elsewhere; presumably it lists the
  // source byte offsets producing R,G,B order -- confirm against the table.
  const uint8x8_t idx0 = vld1_u8(kRGBShuffle[0]);
  const uint8x8_t idx1 = vld1_u8(kRGBShuffle[1]);
  const uint8x8_t idx2 = vld1_u8(kRGBShuffle[2]);
  const uint32_t* const simd_end = src + (num_pixels & ~7);

  while (src < simd_end) {
    const uint8_t* const bytes = (const uint8_t*)src;
    uint8x8x4_t table;
    // Load 8 pixels (32 bytes) as four consecutive 8-byte vectors forming
    // the lookup table.
    INIT_VECTOR4(table,
                 vld1_u8(bytes + 0),
                 vld1_u8(bytes + 8),
                 vld1_u8(bytes + 16),
                 vld1_u8(bytes + 24));
    // Each lookup gathers 8 output bytes out of the 32-byte table;
    // 3 lookups * 8 bytes = 24 bytes = 8 pixels * 3 bytes.
    vst1_u8(dst + 0, vtbl4_u8(table, idx0));
    vst1_u8(dst + 8, vtbl4_u8(table, idx1));
    vst1_u8(dst + 16, vtbl4_u8(table, idx2));
    src += 8;
    dst += 24;
  }

  // Left-overs (fewer than 8 pixels) use the plain C implementation.
  VP8LConvertBGRAToRGB_C(src, num_pixels & 7, dst);
}