static inline __m128i hardlight_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                          const __m128i& sa, const __m128i& da) {
    // if (2 * sc <= sa)
    __m128i tmp1 = _mm_slli_epi32(sc, 1);
    __m128i cmp1 = _mm_cmpgt_epi32(tmp1, sa);
    __m128i rc1 = _mm_mullo_epi16(sc, dc);                // sc * dc
    rc1 = _mm_slli_epi32(rc1, 1);                         // 2 * sc * dc
    rc1 = _mm_andnot_si128(cmp1, rc1);

    // else
    tmp1 = _mm_mullo_epi16(sa, da);
    __m128i tmp2 = Multiply32_SSE2(_mm_sub_epi32(da, dc), _mm_sub_epi32(sa, sc));
    tmp2 = _mm_slli_epi32(tmp2, 1);
    __m128i rc2 = _mm_sub_epi32(tmp1, tmp2);
    rc2 = _mm_and_si128(cmp1, rc2);

    __m128i rc = _mm_or_si128(rc1, rc2);

    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
    tmp1 = _mm_mullo_epi16(sc, ida);
    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);
    tmp2 = _mm_mullo_epi16(dc, isa);

    rc = _mm_add_epi32(rc, tmp1);
    rc = _mm_add_epi32(rc, tmp2);
    return clamp_div255round_SSE2(rc);
}
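// For reference while reading the lane logic above, here is a scalar sketch of the
// per-channel hard-light formula it vectorizes. It is assumed to mirror the portable
// hardlight_byte() in SkXfermode.cpp; the _ref names and the local clamp/divide helper
// below are illustrative stand-ins, not part of Skia's API.
static inline int clamp_div255round_ref(int prod) {
    if (prod < 0)         prod = 0;            // clamp to [0, 255*255], as
    if (prod > 255 * 255) prod = 255 * 255;    // clamp_div255round_SSE2() does
    prod += 128;                               // rounded divide by 255
    return (prod + (prod >> 8)) >> 8;
}

static inline int hardlight_byte_ref(int sc, int dc, int sa, int da) {
    int rc = (2 * sc <= sa) ? 2 * sc * dc                          // multiply-like half
                            : sa * da - 2 * (da - dc) * (sa - sc); // screen-like half
    return clamp_div255round_ref(rc + sc * (255 - da) + dc * (255 - sa));
}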
static inline __m128i exclusion_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                          const __m128i&, __m128i&) {
    __m128i tmp1 = _mm_mullo_epi16(_mm_set1_epi32(255), sc); // 255 * sc
    __m128i tmp2 = _mm_mullo_epi16(_mm_set1_epi32(255), dc); // 255 * dc
    tmp1 = _mm_add_epi32(tmp1, tmp2);

    tmp2 = _mm_mullo_epi16(sc, dc);                          // sc * dc
    tmp2 = _mm_slli_epi32(tmp2, 1);                          // 2 * sc * dc

    __m128i r = _mm_sub_epi32(tmp1, tmp2);
    return clamp_div255round_SSE2(r);
}
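// Scalar sketch of the exclusion formula above: 255*sc + 255*dc - 2*sc*dc, followed
// by a rounded divide by 255. Assumed to match the portable exclusion_byte(); the
// _ref name is illustrative and reuses the clamp_div255round_ref() helper sketched
// after hardlight_byte_SSE2().
static inline int exclusion_byte_ref(int sc, int dc, int /*sa*/, int /*da*/) {
    return clamp_div255round_ref(255 * sc + 255 * dc - 2 * sc * dc);
}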
static inline __m128i colorburn_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                          const __m128i& sa, const __m128i& da) {
    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);

    // if (dc == da)
    __m128i cmp1 = _mm_cmpeq_epi32(dc, da);
    __m128i tmp1 = _mm_mullo_epi16(sa, da);
    __m128i tmp2 = _mm_mullo_epi16(sc, ida);
    __m128i tmp3 = _mm_mullo_epi16(dc, isa);
    __m128i rc1 = _mm_add_epi32(tmp1, tmp2);
    rc1 = _mm_add_epi32(rc1, tmp3);
    rc1 = clamp_div255round_SSE2(rc1);
    rc1 = _mm_and_si128(cmp1, rc1);

    // else if (0 == sc)
    __m128i cmp2 = _mm_cmpeq_epi32(sc, _mm_setzero_si128());
    __m128i rc2 = SkAlphaMulAlpha_SSE2(dc, isa);
    __m128i cmp = _mm_andnot_si128(cmp1, cmp2);
    rc2 = _mm_and_si128(cmp, rc2);

    // else
    __m128i cmp3 = _mm_or_si128(cmp1, cmp2);
    __m128i tmp4 = _mm_sub_epi32(da, dc);
    tmp4 = Multiply32_SSE2(tmp4, sa);
    tmp4 = shim_mm_div_epi32(tmp4, sc);
    __m128i tmp5 = _mm_sub_epi32(da, SkMin32_SSE2(da, tmp4));
    tmp5 = Multiply32_SSE2(sa, tmp5);
    __m128i rc3 = _mm_add_epi32(tmp5, tmp2);
    rc3 = _mm_add_epi32(rc3, tmp3);
    rc3 = clamp_div255round_SSE2(rc3);
    rc3 = _mm_andnot_si128(cmp3, rc3);

    __m128i rc = _mm_or_si128(rc1, rc2);
    rc = _mm_or_si128(rc, rc3);
    return rc;
}
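// Scalar sketch of the three color-burn cases that the masks above select between.
// Assumed to mirror the portable colorburn_byte(); the _ref names are illustrative.
static inline int colorburn_byte_ref(int sc, int dc, int sa, int da) {
    int rc;
    if (dc == da) {
        rc = sa * da;
    } else if (0 == sc) {
        return clamp_div255round_ref(dc * (255 - sa));   // SkAlphaMulAlpha(dc, 255 - sa)
    } else {
        int tmp = (da - dc) * sa / sc;
        rc = sa * (da - (da < tmp ? da : tmp));          // sa * (da - min(da, tmp))
    }
    return clamp_div255round_ref(rc + sc * (255 - da) + dc * (255 - sa));
}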
static inline __m128i colordodge_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                           const __m128i& sa, const __m128i& da) {
    __m128i diff = _mm_sub_epi32(sa, sc);
    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);

    // if (0 == dc)
    __m128i cmp1 = _mm_cmpeq_epi32(dc, _mm_setzero_si128());
    __m128i rc1 = _mm_and_si128(cmp1, SkAlphaMulAlpha_SSE2(sc, ida));

    // else if (0 == diff)
    __m128i cmp2 = _mm_cmpeq_epi32(diff, _mm_setzero_si128());
    __m128i cmp = _mm_andnot_si128(cmp1, cmp2);
    __m128i tmp1 = _mm_mullo_epi16(sa, da);
    __m128i tmp2 = _mm_mullo_epi16(sc, ida);
    __m128i tmp3 = _mm_mullo_epi16(dc, isa);
    __m128i rc2 = _mm_add_epi32(tmp1, tmp2);
    rc2 = _mm_add_epi32(rc2, tmp3);
    rc2 = clamp_div255round_SSE2(rc2);
    rc2 = _mm_and_si128(cmp, rc2);

    // else
    __m128i cmp3 = _mm_or_si128(cmp1, cmp2);
    __m128i value = _mm_mullo_epi16(dc, sa);
    diff = shim_mm_div_epi32(value, diff);
    __m128i tmp4 = SkMin32_SSE2(da, diff);
    tmp4 = Multiply32_SSE2(sa, tmp4);
    __m128i rc3 = _mm_add_epi32(tmp4, tmp2);
    rc3 = _mm_add_epi32(rc3, tmp3);
    rc3 = clamp_div255round_SSE2(rc3);
    rc3 = _mm_andnot_si128(cmp3, rc3);

    __m128i rc = _mm_or_si128(rc1, rc2);
    rc = _mm_or_si128(rc, rc3);
    return rc;
}
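// Scalar sketch of the color-dodge cases above. Assumed to mirror the portable
// colordodge_byte(); the _ref names are illustrative.
static inline int colordodge_byte_ref(int sc, int dc, int sa, int da) {
    int diff = sa - sc;
    int rc;
    if (0 == dc) {
        return clamp_div255round_ref(sc * (255 - da));   // SkAlphaMulAlpha(sc, 255 - da)
    } else if (0 == diff) {
        rc = sa * da;
    } else {
        diff = dc * sa / diff;
        rc = sa * (da < diff ? da : diff);               // sa * min(da, diff)
    }
    return clamp_div255round_ref(rc + sc * (255 - da) + dc * (255 - sa));
}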
static inline __m128i blendfunc_multiply_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                                   const __m128i& sa, const __m128i& da) {
    // sc * (255 - da)
    __m128i ret1 = _mm_sub_epi32(_mm_set1_epi32(255), da);
    ret1 = _mm_mullo_epi16(sc, ret1);

    // dc * (255 - sa)
    __m128i ret2 = _mm_sub_epi32(_mm_set1_epi32(255), sa);
    ret2 = _mm_mullo_epi16(dc, ret2);

    // sc * dc
    __m128i ret3 = _mm_mullo_epi16(sc, dc);

    __m128i ret = _mm_add_epi32(ret1, ret2);
    ret = _mm_add_epi32(ret, ret3);
    return clamp_div255round_SSE2(ret);
}
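// Scalar sketch of the multiply blend above: sc*dc plus the two "over the uncovered
// area" terms, then a rounded divide by 255. The _ref name is illustrative.
static inline int blendfunc_multiply_byte_ref(int sc, int dc, int sa, int da) {
    return clamp_div255round_ref(sc * dc + sc * (255 - da) + dc * (255 - sa));
}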
// Portable version overlay_byte() is in SkXfermode.cpp.
static inline __m128i overlay_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                        const __m128i& sa, const __m128i& da) {
    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
    __m128i tmp1 = _mm_mullo_epi16(sc, ida);
    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);
    __m128i tmp2 = _mm_mullo_epi16(dc, isa);
    __m128i tmp = _mm_add_epi32(tmp1, tmp2);

    __m128i cmp = _mm_cmpgt_epi32(_mm_slli_epi32(dc, 1), da);

    __m128i rc1 = _mm_slli_epi32(sc, 1);                      // 2 * sc
    rc1 = Multiply32_SSE2(rc1, dc);                           // * dc

    __m128i rc2 = _mm_mullo_epi16(sa, da);                    // sa * da
    __m128i tmp3 = _mm_slli_epi32(_mm_sub_epi32(da, dc), 1);  // 2 * (da - dc)
    tmp3 = Multiply32_SSE2(tmp3, _mm_sub_epi32(sa, sc));      // * (sa - sc)
    rc2 = _mm_sub_epi32(rc2, tmp3);

    __m128i rc = _mm_or_si128(_mm_andnot_si128(cmp, rc1),
                              _mm_and_si128(cmp, rc2));
    return clamp_div255round_SSE2(_mm_add_epi32(rc, tmp));
}
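// Scalar sketch of the overlay selection above, assumed to mirror the portable
// overlay_byte() referenced in the comment; the _ref name is illustrative.
static inline int overlay_byte_ref(int sc, int dc, int sa, int da) {
    int tmp = sc * (255 - da) + dc * (255 - sa);
    int rc = (2 * dc <= da) ? 2 * sc * dc                          // multiply-like half
                            : sa * da - 2 * (da - dc) * (sa - sc); // screen-like half
    return clamp_div255round_ref(rc + tmp);
}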
static inline __m128i softlight_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                          const __m128i& sa, const __m128i& da) {
    __m128i tmp1, tmp2, tmp3;

    // int m = da ? dc * 256 / da : 0;
    __m128i cmp = _mm_cmpeq_epi32(da, _mm_setzero_si128());
    __m128i m = _mm_slli_epi32(dc, 8);
    __m128 x = _mm_cvtepi32_ps(m);
    __m128 y = _mm_cvtepi32_ps(da);
    m = _mm_cvttps_epi32(_mm_div_ps(x, y));
    m = _mm_andnot_si128(cmp, m);

    // if (2 * sc <= sa)
    tmp1 = _mm_slli_epi32(sc, 1);                          // 2 * sc
    __m128i cmp1 = _mm_cmpgt_epi32(tmp1, sa);
    tmp1 = _mm_sub_epi32(tmp1, sa);                        // 2 * sc - sa
    tmp2 = _mm_sub_epi32(_mm_set1_epi32(256), m);          // 256 - m
    tmp1 = Multiply32_SSE2(tmp1, tmp2);
    tmp1 = _mm_srai_epi32(tmp1, 8);
    tmp1 = _mm_add_epi32(sa, tmp1);
    tmp1 = Multiply32_SSE2(dc, tmp1);
    __m128i rc1 = _mm_andnot_si128(cmp1, tmp1);

    // else if (4 * dc <= da)
    tmp2 = _mm_slli_epi32(dc, 2);                          // dc * 4
    __m128i cmp2 = _mm_cmpgt_epi32(tmp2, da);
    __m128i i = _mm_slli_epi32(m, 2);                      // 4 * m
    __m128i j = _mm_add_epi32(i, _mm_set1_epi32(256));     // 4 * m + 256
    __m128i k = Multiply32_SSE2(i, j);                     // 4 * m * (4 * m + 256)
    __m128i t = _mm_sub_epi32(m, _mm_set1_epi32(256));     // m - 256
    i = Multiply32_SSE2(k, t);                             // 4 * m * (4 * m + 256) * (m - 256)
    i = _mm_srai_epi32(i, 16);                             // >> 16
    j = Multiply32_SSE2(_mm_set1_epi32(7), m);             // 7 * m
    tmp2 = _mm_add_epi32(i, j);
    i = Multiply32_SSE2(dc, sa);                           // dc * sa
    j = _mm_slli_epi32(sc, 1);                             // 2 * sc
    j = _mm_sub_epi32(j, sa);                              // 2 * sc - sa
    j = Multiply32_SSE2(da, j);                            // da * (2 * sc - sa)
    tmp2 = Multiply32_SSE2(j, tmp2);                       // * tmp
    tmp2 = _mm_srai_epi32(tmp2, 8);                        // >> 8
    tmp2 = _mm_add_epi32(i, tmp2);
    cmp = _mm_andnot_si128(cmp2, cmp1);
    __m128i rc2 = _mm_and_si128(cmp, tmp2);
    __m128i rc = _mm_or_si128(rc1, rc2);

    // else
    tmp3 = sqrt_unit_byte_SSE2(m);
    tmp3 = _mm_sub_epi32(tmp3, m);
    tmp3 = Multiply32_SSE2(j, tmp3);                       // j = da * (2 * sc - sa)
    tmp3 = _mm_srai_epi32(tmp3, 8);
    tmp3 = _mm_add_epi32(i, tmp3);                         // i = dc * sa
    cmp = _mm_and_si128(cmp1, cmp2);
    __m128i rc3 = _mm_and_si128(cmp, tmp3);
    rc = _mm_or_si128(rc, rc3);

    tmp1 = _mm_sub_epi32(_mm_set1_epi32(255), da);         // 255 - da
    tmp1 = _mm_mullo_epi16(sc, tmp1);
    tmp2 = _mm_sub_epi32(_mm_set1_epi32(255), sa);         // 255 - sa
    tmp2 = _mm_mullo_epi16(dc, tmp2);
    rc = _mm_add_epi32(rc, tmp1);
    rc = _mm_add_epi32(rc, tmp2);
    return clamp_div255round_SSE2(rc);
}
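// Scalar sketch of the three soft-light cases selected above, with m = dc*256/da
// (0 when da == 0). This is read directly off the SSE2 code and assumed to mirror the
// portable softlight_byte(); the _ref names and the integer sqrt stand-in are
// illustrative only.
static inline int sqrt_unit_byte_ref(int m) {
    // Stand-in assumed to approximate what sqrt_unit_byte_SSE2() computes:
    // sqrt(m/256) rescaled to the 0..256 range, i.e. floor(sqrt(m * 256)).
    int r = 0;
    while ((r + 1) * (r + 1) <= m * 256) {
        ++r;
    }
    return r;
}

static inline int softlight_byte_ref(int sc, int dc, int sa, int da) {
    int m = da ? dc * 256 / da : 0;
    int rc;
    if (2 * sc <= sa) {
        rc = dc * (sa + (((2 * sc - sa) * (256 - m)) >> 8));
    } else if (4 * dc <= da) {
        int tmp = ((4 * m * (4 * m + 256) * (m - 256)) >> 16) + 7 * m;
        rc = dc * sa + ((da * (2 * sc - sa) * tmp) >> 8);
    } else {
        rc = dc * sa + ((da * (2 * sc - sa) * (sqrt_unit_byte_ref(m) - m)) >> 8);
    }
    return clamp_div255round_ref(rc + sc * (255 - da) + dc * (255 - sa));
}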