Example #1
0
// Hard-light blend of one colour channel, evaluated independently in each
// 32-bit lane.
//
// sc/dc are the source/destination channel values and sa/da the matching
// alphas. NOTE(review): presumably every lane holds a value in 0..255; the
// _mm_mullo_epi16 products below only equal the full product under that
// assumption -- confirm against callers.
//
// Per lane:
//     blend = (2 * sc <= sa) ? 2 * sc * dc
//                            : sa * da - 2 * (da - dc) * (sa - sc)
// and the value returned is
//     clamp_div255round_SSE2(blend + sc * (255 - da) + dc * (255 - sa)).
static inline __m128i hardlight_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                          const __m128i& sa, const __m128i& da) {
    const __m128i k255 = _mm_set1_epi32(255);

    // All-ones in the lanes where 2 * sc > sa, i.e. where the "else" formula
    // has to be used.
    const __m128i useElse = _mm_cmpgt_epi32(_mm_slli_epi32(sc, 1), sa);

    // 2 * sc * dc
    const __m128i lowBranch = _mm_slli_epi32(_mm_mullo_epi16(sc, dc), 1);

    // sa * da - 2 * (da - dc) * (sa - sc)
    const __m128i saTimesDa = _mm_mullo_epi16(sa, da);
    const __m128i twiceDiffs = _mm_slli_epi32(
            Multiply32_SSE2(_mm_sub_epi32(da, dc), _mm_sub_epi32(sa, sc)), 1);
    const __m128i highBranch = _mm_sub_epi32(saTimesDa, twiceDiffs);

    // Lane-wise select between the two formulas.
    const __m128i blend = _mm_or_si128(_mm_andnot_si128(useElse, lowBranch),
                                       _mm_and_si128(useElse, highBranch));

    // sc * (255 - da) and dc * (255 - sa)
    const __m128i srcTerm = _mm_mullo_epi16(sc, _mm_sub_epi32(k255, da));
    const __m128i dstTerm = _mm_mullo_epi16(dc, _mm_sub_epi32(k255, sa));

    __m128i sum = _mm_add_epi32(blend, srcTerm);
    sum = _mm_add_epi32(sum, dstTerm);
    return clamp_div255round_SSE2(sum);
}
// Exclusion blend of one colour channel, evaluated independently in each
// 32-bit lane:
//     clamp_div255round_SSE2(255 * sc + 255 * dc - 2 * sc * dc)
//
// The two trailing arguments are accepted but never read.
// NOTE(review): the fourth parameter is a non-const reference, unlike the
// third; the signature is kept as-is so existing callers are unaffected.
static inline __m128i exclusion_byte_SSE2(const __m128i& sc, const __m128i& dc,
        const __m128i&, __m128i&) {
    const __m128i k255 = _mm_set1_epi32(255);

    // 255 * sc + 255 * dc
    const __m128i scaledSum = _mm_add_epi32(_mm_mullo_epi16(k255, sc),
                                            _mm_mullo_epi16(k255, dc));

    // 2 * sc * dc
    const __m128i twiceProduct = _mm_slli_epi32(_mm_mullo_epi16(sc, dc), 1);

    return clamp_div255round_SSE2(_mm_sub_epi32(scaledSum, twiceProduct));
}
Example #3
0
// Color-burn blend of one colour channel, evaluated independently in each
// 32-bit lane.
//
// All three cases are computed for every lane and then merged with masks, so
// exactly one of them contributes to each lane of the result:
//   dc == da  : clamp_div255round_SSE2(sa * da + sc * (255 - da) + dc * (255 - sa))
//   sc == 0   : SkAlphaMulAlpha_SSE2(dc, 255 - sa)
//   otherwise : clamp_div255round_SSE2(sa * (da - min(da, (da - dc) * sa / sc))
//                                      + sc * (255 - da) + dc * (255 - sa))
// NOTE(review): the division goes through shim_mm_div_epi32 and the minimum
// through SkMin32_SSE2, both defined elsewhere -- presumably lane-wise integer
// divide / min; confirm.
static inline __m128i colorburn_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                          const __m128i& sa, const __m128i& da) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i k255 = _mm_set1_epi32(255);
    const __m128i invDa = _mm_sub_epi32(k255, da);
    const __m128i invSa = _mm_sub_epi32(k255, sa);

    // Lane masks for the case analysis.
    const __m128i dcEqualsDa = _mm_cmpeq_epi32(dc, da);
    const __m128i scIsZero = _mm_cmpeq_epi32(sc, zero);
    const __m128i zeroSrcOnly = _mm_andnot_si128(dcEqualsDa, scIsZero);
    const __m128i eitherSpecial = _mm_or_si128(dcEqualsDa, scIsZero);

    // Terms shared by the first and the last case.
    const __m128i srcTerm = _mm_mullo_epi16(sc, invDa);   // sc * (255 - da)
    const __m128i dstTerm = _mm_mullo_epi16(dc, invSa);   // dc * (255 - sa)

    // dc == da
    __m128i equalCase = _mm_add_epi32(_mm_mullo_epi16(sa, da), srcTerm);
    equalCase = _mm_add_epi32(equalCase, dstTerm);
    equalCase = _mm_and_si128(dcEqualsDa, clamp_div255round_SSE2(equalCase));

    // sc == 0 (and dc != da)
    const __m128i zeroSrcCase =
            _mm_and_si128(zeroSrcOnly, SkAlphaMulAlpha_SSE2(dc, invSa));

    // Everything else. Lanes with sc == 0 also pass through the division, but
    // their result is discarded by the eitherSpecial mask below.
    __m128i quotient = Multiply32_SSE2(_mm_sub_epi32(da, dc), sa);
    quotient = shim_mm_div_epi32(quotient, sc);
    const __m128i burned =
            Multiply32_SSE2(sa, _mm_sub_epi32(da, SkMin32_SSE2(da, quotient)));
    __m128i generalCase = _mm_add_epi32(burned, srcTerm);
    generalCase = _mm_add_epi32(generalCase, dstTerm);
    generalCase = _mm_andnot_si128(eitherSpecial, clamp_div255round_SSE2(generalCase));

    return _mm_or_si128(_mm_or_si128(equalCase, zeroSrcCase), generalCase);
}
Example #4
0
// Color-dodge blend of one colour channel, evaluated independently in each
// 32-bit lane.
//
// All three cases are computed for every lane and then merged with masks, so
// exactly one of them contributes to each lane of the result:
//   dc == 0   : SkAlphaMulAlpha_SSE2(sc, 255 - da)
//   sa == sc  : clamp_div255round_SSE2(sa * da + sc * (255 - da) + dc * (255 - sa))
//   otherwise : clamp_div255round_SSE2(sa * min(da, dc * sa / (sa - sc))
//                                      + sc * (255 - da) + dc * (255 - sa))
// NOTE(review): the division goes through shim_mm_div_epi32 and the minimum
// through SkMin32_SSE2, both defined elsewhere -- presumably lane-wise integer
// divide / min; confirm.
static inline __m128i colordodge_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                           const __m128i& sa, const __m128i& da) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i k255 = _mm_set1_epi32(255);
    const __m128i saMinusSc = _mm_sub_epi32(sa, sc);
    const __m128i invDa = _mm_sub_epi32(k255, da);
    const __m128i invSa = _mm_sub_epi32(k255, sa);

    // Lane masks for the case analysis.
    const __m128i dcIsZero = _mm_cmpeq_epi32(dc, zero);
    const __m128i diffIsZero = _mm_cmpeq_epi32(saMinusSc, zero);
    const __m128i diffZeroOnly = _mm_andnot_si128(dcIsZero, diffIsZero);
    const __m128i eitherSpecial = _mm_or_si128(dcIsZero, diffIsZero);

    // dc == 0
    const __m128i zeroDstCase =
            _mm_and_si128(dcIsZero, SkAlphaMulAlpha_SSE2(sc, invDa));

    // Terms shared by the remaining two cases.
    const __m128i srcTerm = _mm_mullo_epi16(sc, invDa);   // sc * (255 - da)
    const __m128i dstTerm = _mm_mullo_epi16(dc, invSa);   // dc * (255 - sa)

    // sa == sc (and dc != 0)
    __m128i saturatedCase = _mm_add_epi32(_mm_mullo_epi16(sa, da), srcTerm);
    saturatedCase = _mm_add_epi32(saturatedCase, dstTerm);
    saturatedCase = _mm_and_si128(diffZeroOnly, clamp_div255round_SSE2(saturatedCase));

    // Everything else. Lanes with sa == sc also pass through the division, but
    // their result is discarded by the eitherSpecial mask below.
    const __m128i quotient =
            shim_mm_div_epi32(_mm_mullo_epi16(dc, sa), saMinusSc);
    const __m128i dodged = Multiply32_SSE2(sa, SkMin32_SSE2(da, quotient));
    __m128i generalCase = _mm_add_epi32(dodged, srcTerm);
    generalCase = _mm_add_epi32(generalCase, dstTerm);
    generalCase = _mm_andnot_si128(eitherSpecial, clamp_div255round_SSE2(generalCase));

    return _mm_or_si128(_mm_or_si128(zeroDstCase, saturatedCase), generalCase);
}
// Multiply blend of one colour channel, evaluated independently in each
// 32-bit lane:
//     clamp_div255round_SSE2(sc * (255 - da) + dc * (255 - sa) + sc * dc)
//
// NOTE(review): the _mm_mullo_epi16 products only equal the full product when
// every lane holds a value in 0..255 -- presumably guaranteed by callers.
static inline __m128i blendfunc_multiply_byte_SSE2(const __m128i& sc, const __m128i& dc,
        const __m128i& sa, const __m128i& da) {
    const __m128i k255 = _mm_set1_epi32(255);

    const __m128i srcOutsideDst = _mm_mullo_epi16(sc, _mm_sub_epi32(k255, da));
    const __m128i dstOutsideSrc = _mm_mullo_epi16(dc, _mm_sub_epi32(k255, sa));
    const __m128i product = _mm_mullo_epi16(sc, dc);

    const __m128i total =
            _mm_add_epi32(_mm_add_epi32(srcOutsideDst, dstOutsideSrc), product);
    return clamp_div255round_SSE2(total);
}
Example #6
0
// Overlay blend of one colour channel, evaluated independently in each 32-bit
// lane. The portable version, overlay_byte(), is in SkXfermode.cpp.
//
// Per lane:
//     blend = (2 * dc <= da) ? 2 * sc * dc
//                            : sa * da - 2 * (da - dc) * (sa - sc)
// and the value returned is
//     clamp_div255round_SSE2(blend + sc * (255 - da) + dc * (255 - sa)).
static inline __m128i overlay_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                        const __m128i& sa, const __m128i& da) {
    const __m128i k255 = _mm_set1_epi32(255);

    // sc * (255 - da) + dc * (255 - sa)
    const __m128i srcTerm = _mm_mullo_epi16(sc, _mm_sub_epi32(k255, da));
    const __m128i dstTerm = _mm_mullo_epi16(dc, _mm_sub_epi32(k255, sa));
    const __m128i outside = _mm_add_epi32(srcTerm, dstTerm);

    // All-ones in the lanes where 2 * dc > da, i.e. where the "else" formula
    // has to be used.
    const __m128i useElse = _mm_cmpgt_epi32(_mm_slli_epi32(dc, 1), da);

    // 2 * sc * dc
    const __m128i lowBranch = Multiply32_SSE2(_mm_slli_epi32(sc, 1), dc);

    // sa * da - 2 * (da - dc) * (sa - sc)
    const __m128i twiceDaMinusDc = _mm_slli_epi32(_mm_sub_epi32(da, dc), 1);
    const __m128i highBranch = _mm_sub_epi32(
            _mm_mullo_epi16(sa, da),
            Multiply32_SSE2(twiceDaMinusDc, _mm_sub_epi32(sa, sc)));

    // Lane-wise select between the two formulas.
    const __m128i blend = _mm_or_si128(_mm_andnot_si128(useElse, lowBranch),
                                       _mm_and_si128(useElse, highBranch));

    return clamp_div255round_SSE2(_mm_add_epi32(blend, outside));
}
Example #7
0
// Soft-light blend of one colour channel, evaluated independently in each
// 32-bit lane.
//
// m is dc * 256 / da (0 when da == 0). One of three formulas is then selected
// per lane:
//   2 * sc <= sa : dc * (sa + (((2 * sc - sa) * (256 - m)) >> 8))
//   4 * dc <= da : dc * sa + ((da * (2 * sc - sa) *
//                     (((4 * m * (4 * m + 256) * (m - 256)) >> 16) + 7 * m)) >> 8)
//   otherwise    : dc * sa + ((da * (2 * sc - sa) *
//                     (sqrt_unit_byte_SSE2(m) - m)) >> 8)
// The selected value plus sc * (255 - da) + dc * (255 - sa) is passed through
// clamp_div255round_SSE2() and returned.
//
// All three formulas are computed for every lane; masks decide which one
// survives. NOTE(review): Multiply32_SSE2 and sqrt_unit_byte_SSE2 are defined
// elsewhere -- presumably a lane-wise 32-bit multiply and a fixed-point square
// root; confirm.
static inline __m128i softlight_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                          const __m128i& sa, const __m128i& da) {
    __m128i tmp1, tmp2, tmp3;

    // int m = da ? dc * 256 / da : 0;
    // The quotient is computed in single-precision float and truncated back to
    // an integer; lanes where da == 0 are zeroed by the mask afterwards.
    __m128i cmp = _mm_cmpeq_epi32(da, _mm_setzero_si128());
    __m128i m = _mm_slli_epi32(dc, 8);
    __m128 x = _mm_cvtepi32_ps(m);
    __m128 y = _mm_cvtepi32_ps(da);
    m = _mm_cvttps_epi32(_mm_div_ps(x, y));
    m = _mm_andnot_si128(cmp, m);

    // if (2 * sc <= sa)
    // cmp1 is all-ones where 2 * sc > sa, so rc1 keeps only the lanes that
    // satisfy the condition above.
    tmp1 = _mm_slli_epi32(sc, 1);                      // 2 * sc
    __m128i cmp1 = _mm_cmpgt_epi32(tmp1, sa);
    tmp1 = _mm_sub_epi32(tmp1, sa);                    // 2 * sc - sa
    tmp2 = _mm_sub_epi32(_mm_set1_epi32(256), m);      // 256 - m
    tmp1 = Multiply32_SSE2(tmp1, tmp2);
    tmp1 = _mm_srai_epi32(tmp1, 8);
    tmp1 = _mm_add_epi32(sa, tmp1);
    tmp1 = Multiply32_SSE2(dc, tmp1);
    __m128i rc1 = _mm_andnot_si128(cmp1, tmp1);

    // else if (4 * dc <= da)
    // cmp2 is all-ones where 4 * dc > da.
    tmp2 = _mm_slli_epi32(dc, 2);                      // dc * 4
    __m128i cmp2 = _mm_cmpgt_epi32(tmp2, da);
    __m128i i = _mm_slli_epi32(m, 2);                  // 4 * m
    __m128i j = _mm_add_epi32(i, _mm_set1_epi32(256)); // 4 * m + 256
    __m128i k = Multiply32_SSE2(i, j);                 // 4 * m * (4 * m + 256)
    __m128i t = _mm_sub_epi32(m, _mm_set1_epi32(256)); // m - 256
    i = Multiply32_SSE2(k, t);                         // 4 * m * (4 * m + 256) * (m - 256)
    i = _mm_srai_epi32(i, 16);                         // >> 16
    j = Multiply32_SSE2(_mm_set1_epi32(7), m);         // 7 * m
    tmp2 = _mm_add_epi32(i, j);
    // From here on i and j are reused for dc * sa and da * (2 * sc - sa); the
    // final "else" branch below reads them as well.
    i = Multiply32_SSE2(dc, sa);                       // dc * sa
    j = _mm_slli_epi32(sc, 1);                         // 2 * sc
    j = _mm_sub_epi32(j, sa);                          // 2 * sc - sa
    j = Multiply32_SSE2(da, j);                        // da * (2 * sc - sa)
    tmp2 = Multiply32_SSE2(j, tmp2);                   // * tmp
    tmp2 = _mm_srai_epi32(tmp2, 8);                    // >> 8
    tmp2 = _mm_add_epi32(i, tmp2);
    // Select lanes where 2 * sc > sa and 4 * dc <= da.
    cmp = _mm_andnot_si128(cmp2, cmp1);
    __m128i rc2 = _mm_and_si128(cmp, tmp2);
    __m128i rc = _mm_or_si128(rc1, rc2);

    // else
    tmp3 = sqrt_unit_byte_SSE2(m);
    tmp3 = _mm_sub_epi32(tmp3, m);
    tmp3 = Multiply32_SSE2(j, tmp3);                   // j = da * (2 * sc - sa)
    tmp3 = _mm_srai_epi32(tmp3, 8);
    tmp3 = _mm_add_epi32(i, tmp3);                     // i = dc * sa
    // Select lanes where 2 * sc > sa and 4 * dc > da.
    cmp = _mm_and_si128(cmp1, cmp2);
    __m128i rc3 = _mm_and_si128(cmp, tmp3);
    rc = _mm_or_si128(rc, rc3);

    // Add sc * (255 - da) + dc * (255 - sa) to the selected value.
    tmp1 = _mm_sub_epi32(_mm_set1_epi32(255), da);     // 255 - da
    tmp1 = _mm_mullo_epi16(sc, tmp1);
    tmp2 = _mm_sub_epi32(_mm_set1_epi32(255), sa);     // 255 - sa
    tmp2 = _mm_mullo_epi16(dc, tmp2);
    rc = _mm_add_epi32(rc, tmp1);
    rc = _mm_add_epi32(rc, tmp2);
    return clamp_div255round_SSE2(rc);
}