void filter_span(const float array[], const T src[], int count, T dst[]) {
    // c0-c3 are already in [0,1].
    const Sk4f c0 = Sk4f::Load(array + 0);
    const Sk4f c1 = Sk4f::Load(array + 4);
    const Sk4f c2 = Sk4f::Load(array + 8);
    const Sk4f c3 = Sk4f::Load(array + 12);
    // c4 (the translate vector) is in [0, 255].  Bring it back to [0,1].
    const Sk4f c4 = Sk4f::Load(array + 16)*Sk4f(1.0f/255);

    // todo: we could cache this in the constructor...
    T matrix_translate_pmcolor = Adaptor::From4f(premul(clamp_0_1(c4)));

    for (int i = 0; i < count; i++) {
        Sk4f srcf = Adaptor::To4f(src[i]);
        float srcA = srcf.kth<SkPM4f::A>();

        if (0 == srcA) {
            dst[i] = matrix_translate_pmcolor;
            continue;
        }
        if (1 != srcA) {
            srcf = unpremul(srcf);
        }

        Sk4f r4 = SkNx_dup<SK_R32_SHIFT/8>(srcf);
        Sk4f g4 = SkNx_dup<SK_G32_SHIFT/8>(srcf);
        Sk4f b4 = SkNx_dup<SK_B32_SHIFT/8>(srcf);
        Sk4f a4 = SkNx_dup<SK_A32_SHIFT/8>(srcf);

        // apply matrix
        Sk4f dst4 = c0 * r4 + c1 * g4 + c2 * b4 + c3 * a4 + c4;

        dst[i] = Adaptor::From4f(premul(clamp_0_1(dst4)));
    }
}
void filter_span(const float array[], const T src[], int count, T dst[]) {
    const Sk4f c0 = Sk4f::Load(array + 0);
    const Sk4f c1 = Sk4f::Load(array + 4);
    const Sk4f c2 = Sk4f::Load(array + 8);
    const Sk4f c3 = Sk4f::Load(array + 12);
    const Sk4f c4 = Sk4f::Load(array + 16);

    // todo: we could cache this in the constructor...
    T matrix_translate_pmcolor = Adaptor::From4f(premul(clamp_0_1(c4)));

    for (int i = 0; i < count; i++) {
        Sk4f srcf = Adaptor::To4f(src[i]);
        float srcA = srcf[SkPM4f::A];

        if (0 == srcA) {
            dst[i] = matrix_translate_pmcolor;
            continue;
        }
        if (1 != srcA) {
            srcf = unpremul(srcf);
        }

        Sk4f r4 = srcf[Adaptor::R];
        Sk4f g4 = srcf[Adaptor::G];
        Sk4f b4 = srcf[Adaptor::B];
        Sk4f a4 = srcf[Adaptor::A];
        // apply matrix
        Sk4f dst4 = c0 * r4 + c1 * g4 + c2 * b4 + c3 * a4 + c4;

        dst[i] = Adaptor::From4f(premul(clamp_0_1(dst4)));
    }
}