// Computes the per-pixel gradient magnitude of `blur` and, when mode == 0, a
// quantized gradient direction, processing four pixels per iteration.
//
// blur      - input plane, rows bgStride floats apart. The function WRITES to
//             index -1 and index `width` of every row it visits (edge
//             replication), so each row needs one writable float on both sides.
//             Rows are read with aligned loads at x = 0, 4, 8, ...
//             NOTE(review): this presumably requires 16-byte aligned rows and a
//             stride padded to a multiple of 4 floats - confirm with the caller.
// gradient  - output magnitude plane, advanced by bgStride per row.
// direction - output direction plane, advanced by stride per row; only written
//             when mode == 0.
// width     - number of pixels per row.
// height    - number of rows.
// mode      - 0 additionally stores the direction bin; any other value stores
//             the magnitude only.
// op        - operator selecting the 3x3 weights:
//               0     -> plain central differences (right - left, top - bottom)
//               1     -> weights 1,1,1 on each side, result halved
//               2     -> weights 1,2,1 on each side (Sobel weights)
//               other -> weights 3,10,3 on each side (Scharr weights)
static void detectEdge(float * blur, float * gradient, unsigned * direction, const int width, const int height, const int stride, const int bgStride,
                       const int mode, const int op) noexcept {
    // Three-row sliding window. The row above the first row is the first row itself.
    float * srcpp = blur;
    float * srcp = blur;
    // A single-row plane has no row below it: reuse row 0 instead of touching
    // memory at blur + bgStride, matching the replication done at the bottom edge.
    float * srcpn = (height > 1) ? blur + bgStride : blur;

    // Replicate the left/right edge pixels of the first row into its padding.
    srcp[-1] = srcp[0];
    srcp[width] = srcp[width - 1];

    for (int y = 0; y < height; y++) {
        // Same edge replication for the row that is about to be used as "bottom".
        srcpn[-1] = srcpn[0];
        srcpn[width] = srcpn[width - 1];

        for (int x = 0; x < width; x += 4) {
            // The eight neighbours of pixels x..x+3; the +/-1 offsets use unaligned loads.
            const Vec4f topLeft = Vec4f().load(srcpp + x - 1);
            const Vec4f top = Vec4f().load_a(srcpp + x);
            const Vec4f topRight = Vec4f().load(srcpp + x + 1);
            const Vec4f left = Vec4f().load(srcp + x - 1);
            const Vec4f right = Vec4f().load(srcp + x + 1);
            const Vec4f bottomLeft = Vec4f().load(srcpn + x - 1);
            const Vec4f bottom = Vec4f().load_a(srcpn + x);
            const Vec4f bottomRight = Vec4f().load(srcpn + x + 1);

            Vec4f gx, gy;

            if (op == 0) {
                gx = right - left;
                gy = top - bottom;
            } else if (op == 1) {
                gx = (topRight + right + bottomRight - topLeft - left - bottomLeft) * 0.5f;
                gy = (topLeft + top + topRight - bottomLeft - bottom - bottomRight) * 0.5f;
            } else if (op == 2) {
                gx = topRight + mul_add(2.f, right, bottomRight) - topLeft - mul_add(2.f, left, bottomLeft);
                gy = topLeft + mul_add(2.f, top, topRight) - bottomLeft - mul_add(2.f, bottom, bottomRight);
            } else {
                gx = mul_add(3.f, topRight, mul_add(10.f, right, 3.f * bottomRight)) - mul_add(3.f, topLeft, mul_add(10.f, left, 3.f * bottomLeft));
                gy = mul_add(3.f, topLeft, mul_add(10.f, top, 3.f * topRight)) - mul_add(3.f, bottomLeft, mul_add(10.f, bottom, 3.f * bottomRight));
            }

            // Magnitude: sqrt(gx^2 + gy^2).
            sqrt(mul_add(gx, gx, gy * gy)).stream(gradient + x);

            if (mode == 0) {
                // Fold negative angles upwards by M_PIF, then scale by 4 * M_1_PIF and
                // round to the nearest integer bin. Assuming M_PIF is pi and M_1_PIF is
                // 1/pi, this yields bins 0..4 of 45 degrees each.
                Vec4f dr = atan2(gy, gx);
                dr = if_add(dr < 0.f, dr, M_PIF);

                const Vec4ui bin = Vec4ui(truncate_to_int(mul_add(dr, 4.f * M_1_PIF, 0.5f)));
                // Bin 4 wraps around to bin 0.
                select(bin >= 4, zero_128b(), bin).stream(direction + x);
            }
        }

        // Slide the window down one row. "bottom" stops advancing once it has
        // reached the last row, so the last row is used as its own lower neighbour.
        srcpp = srcp;
        srcp = srcpn;
        if (y < height - 2)
            srcpn += bgStride;
        gradient += bgStride;
        direction += stride;
    }
}
void binarizeCE(const float * srcp, uint16_t * dstp, const int width, const int height, const int srcStride, const int dstStride,
                const uint16_t peak, const float lower, const float upper) noexcept {
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x += 8) {
            const Vec8ib mask_8ib = Vec8ib(Vec8f().load_a(srcp + x) == fltMax);
            const Vec8sb mask = Vec8sb(compress_saturated(mask_8ib.get_low(), mask_8ib.get_high()));
            select(mask, Vec8us(peak), zero_128b()).stream(dstp + x);
        }

        srcp += srcStride;
        dstp += dstStride;
    }
}
void binarizeCE(const float * srcp, uint8_t * dstp, const int width, const int height, const int srcStride, const int dstStride,
                const uint16_t peak, const float lower, const float upper) noexcept {
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x += 16) {
            const Vec4ib mask_4ib_0 = Vec4ib(Vec4f().load_a(srcp + x) == fltMax);
            const Vec4ib mask_4ib_1 = Vec4ib(Vec4f().load_a(srcp + x + 4) == fltMax);
            const Vec4ib mask_4ib_2 = Vec4ib(Vec4f().load_a(srcp + x + 8) == fltMax);
            const Vec4ib mask_4ib_3 = Vec4ib(Vec4f().load_a(srcp + x + 12) == fltMax);
            const Vec8sb mask_8sb_0 = Vec8sb(compress_saturated(mask_4ib_0, mask_4ib_1));
            const Vec8sb mask_8sb_1 = Vec8sb(compress_saturated(mask_4ib_2, mask_4ib_3));
            const Vec16cb mask = Vec16cb(compress_saturated(mask_8sb_0, mask_8sb_1));
            select(mask, Vec16uc(255), zero_128b()).stream(dstp + x);
        }

        srcp += srcStride;
        dstp += dstStride;
    }
}
void processCL_sse2(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * dst, VSFrameRef ** pad, const int field_n, const EEDI3CLData * d, const VSAPI * vsapi) {
    for (int plane = 0; plane < d->vi.format->numPlanes; plane++) {
        if (d->process[plane]) {
            copyPad<T>(src, pad[plane], plane, 1 - field_n, d->dh, vsapi);

            const int srcWidth = vsapi->getFrameWidth(pad[plane], 0);
            const int dstWidth = vsapi->getFrameWidth(dst, plane);
            const int srcHeight = vsapi->getFrameHeight(pad[plane], 0);
            const int dstHeight = vsapi->getFrameHeight(dst, plane);
            const int srcStride = vsapi->getStride(pad[plane], 0) / sizeof(T);
            const int dstStride = vsapi->getStride(dst, plane) / sizeof(T);
            const T * _srcp = reinterpret_cast<const T *>(vsapi->getReadPtr(pad[plane], 0));
            T * VS_RESTRICT _dstp = reinterpret_cast<T *>(vsapi->getWritePtr(dst, plane));

            const auto threadId = std::this_thread::get_id();
            auto queue = d->queue.at(threadId);
            auto calculateConnectionCosts = d->calculateConnectionCosts.at(threadId);
            auto srcImage = d->src.at(threadId);
            auto _ccosts = d->ccosts.at(threadId);
            float * pcosts = d->pcosts.at(threadId) + d->mdisVector;
            int * _pbackt = d->pbackt.at(threadId) + d->mdisVector;
            int * fpath = d->fpath.at(threadId);
            int * _dmap = d->dmap.at(threadId);
            float * tline = d->tline.at(threadId);

            const size_t globalWorkSize[] = { static_cast<size_t>((dstWidth + 15) & -16), static_cast<size_t>(d->vectorSize) };
            constexpr size_t localWorkSize[] = { 16, 4 };
            const int bufferSize = dstWidth * d->tpitchVector * sizeof(cl_float);

            vs_bitblt(_dstp + dstStride * (1 - field_n), vsapi->getStride(dst, plane) * 2,
                      _srcp + srcStride * (4 + 1 - field_n) + 12, vsapi->getStride(pad[plane], 0) * 2,
                      dstWidth * sizeof(T), dstHeight / 2);

            queue.enqueue_write_image(srcImage, compute::dim(0, 0), compute::dim(srcWidth, srcHeight), _srcp, vsapi->getStride(pad[plane], 0));

            for (int y = 4 + field_n; y < srcHeight - 4; y += 2 * d->vectorSize) {
                const int off = (y - 4 - field_n) >> 1;

                calculateConnectionCosts.set_args(srcImage, _ccosts, dstWidth, srcHeight - 4, y);
                queue.enqueue_nd_range_kernel(calculateConnectionCosts, 2, nullptr, globalWorkSize, localWorkSize);

                float * ccosts = reinterpret_cast<float *>(queue.enqueue_map_buffer(_ccosts, CL_MAP_READ, 0, bufferSize)) + d->mdisVector;

                // calculate path costs
                Vec4f().load(ccosts).store_a(pcosts);
                for (int x = 1; x < dstWidth; x++) {
                    const float * tT = ccosts + d->tpitchVector * x;
                    const float * ppT = pcosts + d->tpitchVector * (x - 1);
                    float * pT = pcosts + d->tpitchVector * x;
                    int * piT = _pbackt + d->tpitchVector * (x - 1);

                    const int umax = std::min({ x, dstWidth - 1 - x, d->mdis });
                    const int umax2 = std::min({ x - 1, dstWidth - x, d->mdis });

                    for (int u = -umax; u <= umax; u++) {
                        Vec4i idx = zero_128b();
                        Vec4f bval = FLT_MAX;

                        for (int v = std::max(-umax2, u - 1); v <= std::min(umax2, u + 1); v++) {
                            const Vec4f z = Vec4f().load_a(ppT + v * d->vectorSize) + d->gamma * std::abs(u - v);
                            const Vec4f ccost = min(z, FLT_MAX * 0.9f);
                            idx = select(Vec4ib(ccost < bval), v, idx);
                            bval = min(ccost, bval);
                        }

                        const Vec4f z = bval + Vec4f().load(tT + u * d->vectorSize);
                        min(z, FLT_MAX * 0.9f).store_a(pT + u * d->vectorSize);
                        idx.stream(piT + u * d->vectorSize);
                    }
                }

                for (int vs = 0; vs < d->vectorSize; vs++) {
                    const int realY = 4 + field_n + 2 * (off + vs);
                    if (realY >= srcHeight - 4)
                        break;

                    const T * srcp = _srcp + srcStride * realY + 12;
                    T * dstp = _dstp + dstStride * (field_n + 2 * (off + vs));
                    int * dmap = _dmap + dstWidth * (off + vs);

                    const T * src3p = srcp - srcStride * 3;
                    const T * src1p = srcp - srcStride;
                    const T * src1n = srcp + srcStride;
                    const T * src3n = srcp + srcStride * 3;

                    const int * pbackt = _pbackt + vs;

                    // backtrack
                    fpath[dstWidth - 1] = 0;
                    for (int x = dstWidth - 2; x >= 0; x--)
                        fpath[x] = pbackt[(d->tpitch * x + fpath[x + 1]) * d->vectorSize];

                    interpolate<T>(src3p, src1p, src1n, src3n, fpath, dmap, dstp, dstWidth, d->ucubic, d->peak);
                }

                queue.enqueue_unmap_buffer(_ccosts, ccosts - d->mdisVector);
            }

            if (d->vcheck) {
                const T * srcp = _srcp + srcStride * (4 + field_n) + 12;
                const T * scpp = nullptr;
                if (d->sclip)
                    scpp = reinterpret_cast<const T *>(vsapi->getReadPtr(scp, plane)) + dstStride * field_n;
                T * dstp = _dstp + dstStride * field_n;;

                vCheck<T>(srcp, scpp, dstp, _dmap, tline, field_n, dstWidth, srcHeight, srcStride, dstStride, d->vcheck, d->vthresh2, d->rcpVthresh0, d->rcpVthresh1, d->rcpVthresh2, d->peak);
            }
        }
    }