static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v, int src_width, int do_store) { int i; for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) { const uint8x16x4_t RGB = vld4q_u8((const uint8_t*)&argb[i]); const uint16x8_t R = vpaddlq_u8(RGB.val[2]); // pair-wise adds const uint16x8_t G = vpaddlq_u8(RGB.val[1]); const uint16x8_t B = vpaddlq_u8(RGB.val[0]); int16x8_t U_tmp, V_tmp; CONVERT_RGB_TO_UV(R, G, B, 1, U_tmp, V_tmp); { const uint8x8_t U = vqrshrun_n_s16(U_tmp, 1); const uint8x8_t V = vqrshrun_n_s16(V_tmp, 1); if (do_store) { vst1_u8(u, U); vst1_u8(v, V); } else { const uint8x8_t prev_u = vld1_u8(u); const uint8x8_t prev_v = vld1_u8(v); vst1_u8(u, vrhadd_u8(U, prev_u)); vst1_u8(v, vrhadd_u8(V, prev_v)); } } } if (i < src_width) { // left-over WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store); } }
// Converts a row of packed 32-bit pixels into horizontally subsampled U and V
// samples using SSE4.1: every 2 input pixels yield 1 U byte and 1 V byte.
//
//   argb      - input pixels.
//   u, v      - output rows; 16 bytes of each are written per 32 input pixels.
//   src_width - number of input pixels.
//   do_store  - non-zero: overwrite u/v with the new samples.
//               zero: replace u/v with the rounded average of the new sample
//               and the value already stored there.
//
// Full groups of 32 pixels are handled with SIMD; whatever remains is
// forwarded to WebPConvertARGBToUV_C().
static void ConvertARGBToUV_SSE41(const uint32_t* argb,
                                  uint8_t* u, uint8_t* v,
                                  int src_width, int do_store) {
  // Largest multiple of 32 that does not exceed src_width.
  const int max_width = src_width & ~31;
  int i;
  for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
    __m128i rgb[6], U0, V0, U1, V1;

    // First 16 pixels -> 8 U / 8 V values held as 16-bit lanes.
    // NOTE(review): the helpers are defined elsewhere; presumably rgb[0..1],
    // rgb[2..3] and rgb[4..5] hold the three colour planes and each
    // HorizontalAddPack call sums adjacent samples -- confirm.
    RGB32PackedToPlanar_SSE41(&argb[i], rgb);
    HorizontalAddPack_SSE41(&rgb[0], &rgb[1], &rgb[0]);
    HorizontalAddPack_SSE41(&rgb[2], &rgb[3], &rgb[2]);
    HorizontalAddPack_SSE41(&rgb[4], &rgb[5], &rgb[4]);
    ConvertRGBToUV_SSE41(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);

    // Next 16 pixels, same sequence, results in U1 / V1.
    RGB32PackedToPlanar_SSE41(&argb[i + 16], rgb);
    HorizontalAddPack_SSE41(&rgb[0], &rgb[1], &rgb[0]);
    HorizontalAddPack_SSE41(&rgb[2], &rgb[3], &rgb[2]);
    HorizontalAddPack_SSE41(&rgb[4], &rgb[5], &rgb[4]);
    ConvertRGBToUV_SSE41(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);

    // Narrow both halves to unsigned 8-bit with saturation: 16 samples each.
    U0 = _mm_packus_epi16(U0, U1);
    V0 = _mm_packus_epi16(V0, V1);
    if (!do_store) {
      // Blend with the existing samples: (new + prev + 1) >> 1 per byte.
      const __m128i prev_u = LOAD_16(u);
      const __m128i prev_v = LOAD_16(v);
      U0 = _mm_avg_epu8(U0, prev_u);
      V0 = _mm_avg_epu8(V0, prev_v);
    }
    STORE_16(U0, u);
    STORE_16(V0, v);
  }
  // Left-over pixels (fewer than 32) go through the plain-C implementation.
  if (i < src_width) {
    WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
  }
}