static void detectEdge(float * blur, float * gradient, unsigned * direction, const int width, const int height, const int stride, const int bgStride, const int mode, const int op) noexcept { float * srcpp = blur; float * srcp = blur; float * srcpn = blur + bgStride; srcp[-1] = srcp[0]; srcp[width] = srcp[width - 1]; for (int y = 0; y < height; y++) { srcpn[-1] = srcpn[0]; srcpn[width] = srcpn[width - 1]; for (int x = 0; x < width; x += 4) { const Vec4f topLeft = Vec4f().load(srcpp + x - 1); const Vec4f top = Vec4f().load_a(srcpp + x); const Vec4f topRight = Vec4f().load(srcpp + x + 1); const Vec4f left = Vec4f().load(srcp + x - 1); const Vec4f right = Vec4f().load(srcp + x + 1); const Vec4f bottomLeft = Vec4f().load(srcpn + x - 1); const Vec4f bottom = Vec4f().load_a(srcpn + x); const Vec4f bottomRight = Vec4f().load(srcpn + x + 1); Vec4f gx, gy; if (op == 0) { gx = right - left; gy = top - bottom; } else if (op == 1) { gx = (topRight + right + bottomRight - topLeft - left - bottomLeft) * 0.5f; gy = (topLeft + top + topRight - bottomLeft - bottom - bottomRight) * 0.5f; } else if (op == 2) { gx = topRight + mul_add(2.f, right, bottomRight) - topLeft - mul_add(2.f, left, bottomLeft); gy = topLeft + mul_add(2.f, top, topRight) - bottomLeft - mul_add(2.f, bottom, bottomRight); } else { gx = mul_add(3.f, topRight, mul_add(10.f, right, 3.f * bottomRight)) - mul_add(3.f, topLeft, mul_add(10.f, left, 3.f * bottomLeft)); gy = mul_add(3.f, topLeft, mul_add(10.f, top, 3.f * topRight)) - mul_add(3.f, bottomLeft, mul_add(10.f, bottom, 3.f * bottomRight)); } sqrt(mul_add(gx, gx, gy * gy)).stream(gradient + x); if (mode == 0) { Vec4f dr = atan2(gy, gx); dr = if_add(dr < 0.f, dr, M_PIF); const Vec4ui bin = Vec4ui(truncate_to_int(mul_add(dr, 4.f * M_1_PIF, 0.5f))); select(bin >= 4, zero_128b(), bin).stream(direction + x); } } srcpp = srcp; srcp = srcpn; if (y < height - 2) srcpn += bgStride; gradient += bgStride; direction += stride; } }
void binarizeCE(const float * srcp, uint16_t * dstp, const int width, const int height, const int srcStride, const int dstStride, const uint16_t peak, const float lower, const float upper) noexcept { for (int y = 0; y < height; y++) { for (int x = 0; x < width; x += 8) { const Vec8ib mask_8ib = Vec8ib(Vec8f().load_a(srcp + x) == fltMax); const Vec8sb mask = Vec8sb(compress_saturated(mask_8ib.get_low(), mask_8ib.get_high())); select(mask, Vec8us(peak), zero_128b()).stream(dstp + x); } srcp += srcStride; dstp += dstStride; } }
void binarizeCE(const float * srcp, uint8_t * dstp, const int width, const int height, const int srcStride, const int dstStride, const uint16_t peak, const float lower, const float upper) noexcept { for (int y = 0; y < height; y++) { for (int x = 0; x < width; x += 16) { const Vec4ib mask_4ib_0 = Vec4ib(Vec4f().load_a(srcp + x) == fltMax); const Vec4ib mask_4ib_1 = Vec4ib(Vec4f().load_a(srcp + x + 4) == fltMax); const Vec4ib mask_4ib_2 = Vec4ib(Vec4f().load_a(srcp + x + 8) == fltMax); const Vec4ib mask_4ib_3 = Vec4ib(Vec4f().load_a(srcp + x + 12) == fltMax); const Vec8sb mask_8sb_0 = Vec8sb(compress_saturated(mask_4ib_0, mask_4ib_1)); const Vec8sb mask_8sb_1 = Vec8sb(compress_saturated(mask_4ib_2, mask_4ib_3)); const Vec16cb mask = Vec16cb(compress_saturated(mask_8sb_0, mask_8sb_1)); select(mask, Vec16uc(255), zero_128b()).stream(dstp + x); } srcp += srcStride; dstp += dstStride; } }
void processCL_sse2(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * dst, VSFrameRef ** pad, const int field_n, const EEDI3CLData * d, const VSAPI * vsapi) { for (int plane = 0; plane < d->vi.format->numPlanes; plane++) { if (d->process[plane]) { copyPad<T>(src, pad[plane], plane, 1 - field_n, d->dh, vsapi); const int srcWidth = vsapi->getFrameWidth(pad[plane], 0); const int dstWidth = vsapi->getFrameWidth(dst, plane); const int srcHeight = vsapi->getFrameHeight(pad[plane], 0); const int dstHeight = vsapi->getFrameHeight(dst, plane); const int srcStride = vsapi->getStride(pad[plane], 0) / sizeof(T); const int dstStride = vsapi->getStride(dst, plane) / sizeof(T); const T * _srcp = reinterpret_cast<const T *>(vsapi->getReadPtr(pad[plane], 0)); T * VS_RESTRICT _dstp = reinterpret_cast<T *>(vsapi->getWritePtr(dst, plane)); const auto threadId = std::this_thread::get_id(); auto queue = d->queue.at(threadId); auto calculateConnectionCosts = d->calculateConnectionCosts.at(threadId); auto srcImage = d->src.at(threadId); auto _ccosts = d->ccosts.at(threadId); float * pcosts = d->pcosts.at(threadId) + d->mdisVector; int * _pbackt = d->pbackt.at(threadId) + d->mdisVector; int * fpath = d->fpath.at(threadId); int * _dmap = d->dmap.at(threadId); float * tline = d->tline.at(threadId); const size_t globalWorkSize[] = { static_cast<size_t>((dstWidth + 15) & -16), static_cast<size_t>(d->vectorSize) }; constexpr size_t localWorkSize[] = { 16, 4 }; const int bufferSize = dstWidth * d->tpitchVector * sizeof(cl_float); vs_bitblt(_dstp + dstStride * (1 - field_n), vsapi->getStride(dst, plane) * 2, _srcp + srcStride * (4 + 1 - field_n) + 12, vsapi->getStride(pad[plane], 0) * 2, dstWidth * sizeof(T), dstHeight / 2); queue.enqueue_write_image(srcImage, compute::dim(0, 0), compute::dim(srcWidth, srcHeight), _srcp, vsapi->getStride(pad[plane], 0)); for (int y = 4 + field_n; y < srcHeight - 4; y += 2 * d->vectorSize) { const int off = (y - 4 - field_n) >> 1; calculateConnectionCosts.set_args(srcImage, _ccosts, dstWidth, srcHeight - 4, y); queue.enqueue_nd_range_kernel(calculateConnectionCosts, 2, nullptr, globalWorkSize, localWorkSize); float * ccosts = reinterpret_cast<float *>(queue.enqueue_map_buffer(_ccosts, CL_MAP_READ, 0, bufferSize)) + d->mdisVector; // calculate path costs Vec4f().load(ccosts).store_a(pcosts); for (int x = 1; x < dstWidth; x++) { const float * tT = ccosts + d->tpitchVector * x; const float * ppT = pcosts + d->tpitchVector * (x - 1); float * pT = pcosts + d->tpitchVector * x; int * piT = _pbackt + d->tpitchVector * (x - 1); const int umax = std::min({ x, dstWidth - 1 - x, d->mdis }); const int umax2 = std::min({ x - 1, dstWidth - x, d->mdis }); for (int u = -umax; u <= umax; u++) { Vec4i idx = zero_128b(); Vec4f bval = FLT_MAX; for (int v = std::max(-umax2, u - 1); v <= std::min(umax2, u + 1); v++) { const Vec4f z = Vec4f().load_a(ppT + v * d->vectorSize) + d->gamma * std::abs(u - v); const Vec4f ccost = min(z, FLT_MAX * 0.9f); idx = select(Vec4ib(ccost < bval), v, idx); bval = min(ccost, bval); } const Vec4f z = bval + Vec4f().load(tT + u * d->vectorSize); min(z, FLT_MAX * 0.9f).store_a(pT + u * d->vectorSize); idx.stream(piT + u * d->vectorSize); } } for (int vs = 0; vs < d->vectorSize; vs++) { const int realY = 4 + field_n + 2 * (off + vs); if (realY >= srcHeight - 4) break; const T * srcp = _srcp + srcStride * realY + 12; T * dstp = _dstp + dstStride * (field_n + 2 * (off + vs)); int * dmap = _dmap + dstWidth * (off + vs); const T * src3p = srcp - srcStride * 3; const T * src1p = srcp - srcStride; const T * src1n = srcp + srcStride; const T * src3n = srcp + srcStride * 3; const int * pbackt = _pbackt + vs; // backtrack fpath[dstWidth - 1] = 0; for (int x = dstWidth - 2; x >= 0; x--) fpath[x] = pbackt[(d->tpitch * x + fpath[x + 1]) * d->vectorSize]; interpolate<T>(src3p, src1p, src1n, src3n, fpath, dmap, dstp, dstWidth, d->ucubic, d->peak); } queue.enqueue_unmap_buffer(_ccosts, ccosts - d->mdisVector); } if (d->vcheck) { const T * srcp = _srcp + srcStride * (4 + field_n) + 12; const T * scpp = nullptr; if (d->sclip) scpp = reinterpret_cast<const T *>(vsapi->getReadPtr(scp, plane)) + dstStride * field_n; T * dstp = _dstp + dstStride * field_n;; vCheck<T>(srcp, scpp, dstp, _dmap, tline, field_n, dstWidth, srcHeight, srcStride, dstStride, d->vcheck, d->vthresh2, d->rcpVthresh0, d->rcpVthresh1, d->rcpVthresh2, d->peak); } } }