template <DstType D> void src_1(const SkXfermode*, uint32_t dst[], const SkPM4f* src, int count, const SkAlpha aa[]) { const Sk4f s4 = src->to4f_pmorder(); if (aa) { if (D == kLinear_Dst) { // operate in bias-255 space for src and dst const Sk4f& s4_255 = s4 * Sk4f(255); while (count >= 4) { Sk4f aa4 = SkNx_cast<float>(Sk4b::Load(aa)) * Sk4f(1/255.f); Sk4f r0 = lerp(s4_255, to_4f(dst[0]), Sk4f(aa4[0])) + Sk4f(0.5f); Sk4f r1 = lerp(s4_255, to_4f(dst[1]), Sk4f(aa4[1])) + Sk4f(0.5f); Sk4f r2 = lerp(s4_255, to_4f(dst[2]), Sk4f(aa4[2])) + Sk4f(0.5f); Sk4f r3 = lerp(s4_255, to_4f(dst[3]), Sk4f(aa4[3])) + Sk4f(0.5f); Sk4f_ToBytes((uint8_t*)dst, r0, r1, r2, r3); dst += 4; aa += 4; count -= 4; } } else { // kSRGB while (count >= 4) { Sk4f aa4 = SkNx_cast<float>(Sk4b::Load(aa)) * Sk4f(1/255.0f); /* If we ever natively support convert 255_linear -> 255_srgb, then perhaps * it would be faster (and possibly allow more code sharing with kLinear) to * stay in that space. */ Sk4f r0 = lerp(s4, load_dst<D>(dst[0]), Sk4f(aa4[0])); Sk4f r1 = lerp(s4, load_dst<D>(dst[1]), Sk4f(aa4[1])); Sk4f r2 = lerp(s4, load_dst<D>(dst[2]), Sk4f(aa4[2])); Sk4f r3 = lerp(s4, load_dst<D>(dst[3]), Sk4f(aa4[3])); Sk4f_ToBytes((uint8_t*)dst, linear_unit_to_srgb_255f(r0), linear_unit_to_srgb_255f(r1), linear_unit_to_srgb_255f(r2), linear_unit_to_srgb_255f(r3)); dst += 4; aa += 4; count -= 4; } } for (int i = 0; i < count; ++i) { unsigned a = aa[i]; Sk4f d4 = load_dst<D>(dst[i]); dst[i] = store_dst<D>(lerp(s4, d4, a)); } } else { sk_memset32(dst, store_dst<D>(s4), count); } }
// Convert premultiplied float colors (asserted to be in unit range) into
// 8888 linear pixels, processing four at a time with a scalar cleanup loop.
static void pm4f_to_linear_32(SkPMColor dst[], const SkPM4f src[], int count) {
    for (; count >= 4; src += 4, dst += 4, count -= 4) {
        src[0].assertIsUnit();
        src[1].assertIsUnit();
        src[2].assertIsUnit();
        src[3].assertIsUnit();
        Sk4f_ToBytes((uint8_t*)dst,
                     scale_255_round(src[0]), scale_255_round(src[1]),
                     scale_255_round(src[2]), scale_255_round(src[3]));
    }
    // 0..3 leftovers.
    for (int i = 0; i < count; ++i) {
        src[i].assertIsUnit();
        SkNx_cast<uint8_t>(scale_255_round(src[i])).store((uint8_t*)&dst[i]);
    }
}
static void srcover_linear_dst_1(const SkXfermode*, uint32_t dst[], const SkPM4f* src, int count, const SkAlpha aa[]) { const Sk4f s4 = src->to4f_pmorder(); const Sk4f dst_scale = Sk4f(1 - get_alpha(s4)); if (aa) { for (int i = 0; i < count; ++i) { unsigned a = aa[i]; if (0 == a) { continue; } Sk4f d4 = Sk4f_fromL32(dst[i]); Sk4f r4; if (a != 0xFF) { Sk4f s4_aa = scale_by_coverage(s4, a); r4 = s4_aa + d4 * Sk4f(1 - get_alpha(s4_aa)); } else { r4 = s4 + d4 * dst_scale; } dst[i] = Sk4f_toL32(r4); } } else { const Sk4f s4_255 = s4 * Sk4f(255) + Sk4f(0.5f); // +0.5 to pre-bias for rounding while (count >= 4) { Sk4f d0 = to_4f(dst[0]); Sk4f d1 = to_4f(dst[1]); Sk4f d2 = to_4f(dst[2]); Sk4f d3 = to_4f(dst[3]); Sk4f_ToBytes((uint8_t*)dst, s4_255 + d0 * dst_scale, s4_255 + d1 * dst_scale, s4_255 + d2 * dst_scale, s4_255 + d3 * dst_scale); dst += 4; count -= 4; } for (int i = 0; i < count; ++i) { Sk4f d4 = to_4f(dst[i]); dst[i] = to_4b(s4_255 + d4 * dst_scale); } } }