// NEON version: adds the green channel of every pixel to its red and blue
// channels, modifying 'argb_data' in place.
//   argb_data:  pixels to transform; overwritten with the result.
//   num_pixels: number of 32-bit pixels in 'argb_data'.
// Pixels are handled four at a time (one 128-bit register); whatever is left
// over (num_pixels & 3) is delegated to VP8LAddGreenToBlueAndRed_C().
static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
  const int num_simd_pixels = num_pixels & ~3;  // largest multiple of 4
  const int num_left_over = num_pixels & 3;
  const uint32_t* const simd_end = argb_data + num_simd_pixels;
  // kGreenShuffle is defined elsewhere in the file; presumably it routes each
  // pixel's green byte to the blue and red positions (and zeroes the others)
  // -- TODO confirm against its definition.
  const uint8x8_t green_selector = vld1_u8(kGreenShuffle);

  while (argb_data < simd_end) {
    uint8_t* const bytes = (uint8_t*)argb_data;
    const uint8x16_t pixels = vld1q_u8(bytes);
    // vtbl1_u8 only operates on 64-bit vectors: look up each half separately
    // and glue the two results back together.
    const uint8x8_t greens_lo = vtbl1_u8(vget_low_u8(pixels), green_selector);
    const uint8x8_t greens_hi = vtbl1_u8(vget_high_u8(pixels), green_selector);
    const uint8x16_t greens = vcombine_u8(greens_lo, greens_hi);
    // Per-byte add, so each channel wraps modulo 256 independently.
    vst1q_u8(bytes, vaddq_u8(pixels, greens));
    argb_data += 4;
  }
  // Fall through and finish off the remaining 0..3 pixels with plain C.
  VP8LAddGreenToBlueAndRed_C(argb_data, num_left_over);
}
// SSE2 version: adds the green channel of every pixel to its red and blue
// channels, modifying 'argb_data' in place.
//   argb_data:  pixels to transform; overwritten with the result.
//   num_pixels: number of 32-bit pixels in 'argb_data'.
// Pixels are handled four at a time with unaligned loads/stores; the pixels
// that do not fill a whole register are passed to
// VP8LAddGreenToBlueAndRed_C().
static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
  const __m128i green_mask = _mm_set1_epi32(0x0000ff00);
  int i = 0;

  while (i + 4 <= num_pixels) {
    __m128i* const pixels = (__m128i*)&argb_data[i];
    const __m128i argb = _mm_loadu_si128(pixels);
    // Isolate green, then copy it into the red and blue byte lanes.
    const __m128i green = _mm_and_si128(argb, green_mask);   // 00g0|00g0|...
    const __m128i green_in_red = _mm_slli_epi32(green, 8);   // 0g00|0g00|...
    const __m128i green_in_blue = _mm_srli_epi32(green, 8);  // 000g|000g|...
    const __m128i addend =
        _mm_or_si128(green_in_red, green_in_blue);           // 0g0g|0g0g|...
    // Per-byte add: red and blue wrap modulo 256 without carrying into the
    // neighbouring channel; alpha and green get +0.
    _mm_storeu_si128(pixels, _mm_add_epi8(argb, addend));
    i += 4;
  }
  // Fall through and finish off the remaining 0..3 pixels with plain C.
  VP8LAddGreenToBlueAndRed_C(argb_data + i, num_pixels - i);
}
// SSE2 version: for every pixel of 'src', adds the green channel to the red
// and blue channels and writes the result to 'dst'.
//   src:        input pixels (read only).
//   num_pixels: number of 32-bit pixels to process.
//   dst:        output pixels; receives 'num_pixels' values.
// Pixels are handled four at a time with unaligned loads/stores; if some
// pixels remain afterwards they are passed to VP8LAddGreenToBlueAndRed_C().
static void AddGreenToBlueAndRed_SSE2(const uint32_t* const src,
                                      int num_pixels, uint32_t* dst) {
  int i = 0;

  while (i + 4 <= num_pixels) {
    const __m128i argb = _mm_loadu_si128((const __m128i*)(src + i));
    // Move the high byte of each 16-bit lane down: per pixel, alpha and green
    // end up in the low byte of their respective lanes.
    const __m128i alpha_green = _mm_srli_epi16(argb, 8);  // 0 a 0 g
    // Duplicate the green lane over the alpha lane, first in the low 64 bits
    // and then in the high 64 bits.
    const __m128i greens_lo =
        _mm_shufflelo_epi16(alpha_green, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i greens =
        _mm_shufflehi_epi16(greens_lo, _MM_SHUFFLE(2, 2, 0, 0));  // 0g0g
    // Per-byte add: red and blue wrap modulo 256 independently.
    _mm_storeu_si128((__m128i*)(dst + i), _mm_add_epi8(argb, greens));
    i += 4;
  }
  // Everything was consumed by the vector loop: nothing left to do.
  if (i == num_pixels) return;
  // Otherwise finish off the tail with plain C.
  VP8LAddGreenToBlueAndRed_C(src + i, num_pixels - i, dst + i);
}