int operator () (const short * src0, const uchar * mask, int * dst, int len, int cn) const
    {
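        // Vectorized sum of signed 16-bit elements into 32-bit per-channel
        // accumulators. Masked input and channel counts other than 1/2/4 are
        // declined here (return 0 = no elements processed), leaving them to a
        // scalar fallback path.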
        if (mask || (cn != 1 && cn != 2 && cn != 4))
            return 0;
        len *= cn;

        int x = 0;
        v_int32 v_sum = vx_setzero_s32();

        for (; x <= len - v_int16::nlanes; x += v_int16::nlanes)
        {
            v_int32 v_src0, v_src1;
            v_expand(vx_load(src0 + x), v_src0, v_src1);
            v_sum += v_src0 + v_src1;
        }
        if (x <= len - v_int32::nlanes)
        {
            v_sum += vx_load_expand(src0 + x);
            x += v_int32::nlanes;
        }

        if (cn == 1)
            *dst += v_reduce_sum(v_sum);
        else
        {
            int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_int32::nlanes];
            v_store_aligned(ar, v_sum);
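            // cn divides nlanes, so lane i always holds channel (i % cn);
            // scatter the lane sums back into the per-channel accumulators.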
            for (int i = 0; i < v_int32::nlanes; ++i)
                dst[i % cn] += ar[i];
        }
        v_cleanup();

        return x / cn;
    }
Example #2
inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    x = v_float64x2(tab[idx[0]], tab[idx[1]]);
    y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
}
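Example #3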
    int operator () (const schar * src0, const uchar * mask, int * dst, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2 && cn != 4))
            return 0;
        len *= cn;

        int x = 0;
        v_int32 v_sum = vx_setzero_s32();

        int len0 = len & -v_int8::nlanes;
        while (x < len0)
        {
            const int len_tmp = min(x + 256*v_int16::nlanes, len0);
            v_int16 v_sum16 = vx_setzero_s16();
            for (; x < len_tmp; x += v_int8::nlanes)
            {
                v_int16 v_src0, v_src1;
                v_expand(vx_load(src0 + x), v_src0, v_src1);
                v_sum16 += v_src0 + v_src1;
            }
            v_int32 v_half0, v_half1;
            v_expand(v_sum16, v_half0, v_half1);
            v_sum += v_half0 + v_half1;
        }
        if (x <= len - v_int16::nlanes)
        {
            v_int32 v_half0, v_half1;
            v_expand(vx_load_expand(src0 + x), v_half0, v_half1);
            v_sum += v_half0 + v_half1;
            x += v_int16::nlanes;
        }
        if (x <= len - v_int32::nlanes)
        {
            v_sum += vx_load_expand_q(src0 + x);
            x += v_int32::nlanes;
        }

        if (cn == 1)
            *dst += v_reduce_sum(v_sum);
        else
        {
            int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_int32::nlanes];
            v_store_aligned(ar, v_sum);
            for (int i = 0; i < v_int32::nlanes; ++i)
                dst[i % cn] += ar[i];
        }
        v_cleanup();

        return x / cn;
    }
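The 8-bit path above caps each inner block at 256*v_int16::nlanes elements so the 16-bit partial sums cannot overflow. A minimal bound check, a sketch assuming v_int8::nlanes == 2*v_int16::nlanes and worst-case int8 lane values in [-128, 127]:

#include <climits>

constexpr int kSteps   = 256 / 2;   // inner iterations per block: 256*nlanes16 / nlanes8
constexpr int kPosGain = 2 * 127;   // two expanded int8 lanes per step, each <= 127
constexpr int kNegGain = 2 * 128;   // two expanded int8 lanes per step, each >= -128
static_assert(kSteps * kPosGain <= SHRT_MAX, "positive block sums fit in int16");
static_assert(kSteps * kNegGain <= 32768,    "negative block sums fit in int16 (|SHRT_MIN|)");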
Example #4
void exp64f( const double *_x, double *y, int n )
{
    CV_INSTRUMENT_REGION();

    const double* const expTab = cv::details::getExpTab64f();

    const double
    A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0,
    A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0,
    A3 = .24022650695886477918181338054308 / EXPPOLY_32F_A0,
    A2 = .55504108793649567998466049042729e-1 / EXPPOLY_32F_A0,
    A1 = .96180973140732918010002372686186e-2 / EXPPOLY_32F_A0,
    A0 = .13369713757180123244806654839424e-2 / EXPPOLY_32F_A0;
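    // A0..A5 are (up to the EXPPOLY_32F_A0 scaling) essentially the Taylor
    // coefficients of 2^x, i.e. powers of ln 2 over factorials (A4 = ln 2,
    // A3 = ln^2(2)/2, ...), with the low-order terms minimax-adjusted.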

    int i = 0;
    const Cv64suf* x = (const Cv64suf*)_x;
    double minval = (-exp_max_val/exp_prescale);
    double maxval = (exp_max_val/exp_prescale);

#if CV_SIMD_64F
    const int VECSZ = v_float64::nlanes;
    const v_float64 vprescale = vx_setall_f64(exp_prescale);
    const v_float64 vpostscale = vx_setall_f64(exp_postscale);
    const v_float64 vminval = vx_setall_f64(minval);
    const v_float64 vmaxval = vx_setall_f64(maxval);

    const v_float64 vA1 = vx_setall_f64(A1);
    const v_float64 vA2 = vx_setall_f64(A2);
    const v_float64 vA3 = vx_setall_f64(A3);
    const v_float64 vA4 = vx_setall_f64(A4);
    const v_float64 vA5 = vx_setall_f64(A5);

    const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
    bool y_aligned = (size_t)(void*)y % 32 == 0;

    for( ; i < n; i += VECSZ*2 )
    {
        if( i + VECSZ*2 > n )
        {
            if( i == 0 || _x == y )
                break;
            i = n - VECSZ*2;
            y_aligned = false;
        }

        v_float64 xf0 = vx_load(&x[i].f), xf1 = vx_load(&x[i + VECSZ].f);

        xf0 = v_min(v_max(xf0, vminval), vmaxval);
        xf1 = v_min(v_max(xf1, vminval), vmaxval);

        xf0 *= vprescale;
        xf1 *= vprescale;

        v_int32 xi0 = v_round(xf0);
        v_int32 xi1 = v_round(xf1);
        xf0 = (xf0 - v_cvt_f64(xi0))*vpostscale;
        xf1 = (xf1 - v_cvt_f64(xi1))*vpostscale;

        v_float64 yf0 = v_lut(expTab, xi0 & vidxmask);
        v_float64 yf1 = v_lut(expTab, xi1 & vidxmask);

        v_int32 v0 = vx_setzero_s32(), v1023 = vx_setall_s32(1023), v2047 = vx_setall_s32(2047);
        xi0 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi0) + v1023, v0), v2047);
        xi1 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi1) + v1023, v0), v2047);

        v_int64 xq0, xq1, dummy;
        v_expand(xi0, xq0, dummy);
        v_expand(xi1, xq1, dummy);

        yf0 *= v_reinterpret_as_f64(v_shl<52>(xq0));
        yf1 *= v_reinterpret_as_f64(v_shl<52>(xq1));

        v_float64 zf0 = xf0 + vA1;
        v_float64 zf1 = xf1 + vA1;

        zf0 = v_fma(zf0, xf0, vA2);
        zf1 = v_fma(zf1, xf1, vA2);

        zf0 = v_fma(zf0, xf0, vA3);
        zf1 = v_fma(zf1, xf1, vA3);

        zf0 = v_fma(zf0, xf0, vA4);
        zf1 = v_fma(zf1, xf1, vA4);

        zf0 = v_fma(zf0, xf0, vA5);
        zf1 = v_fma(zf1, xf1, vA5);

        zf0 *= yf0;
        zf1 *= yf1;

        if( y_aligned )
        {
            v_store_aligned(y + i, zf0);
            v_store_aligned(y + i + VECSZ, zf1);
        }
        else
        {
            v_store(y + i, zf0);
            v_store(y + i + VECSZ, zf1);
        }
    }
    vx_cleanup();
#endif

    for( ; i < n; i++ )
    {
        double x0 = x[i].f;
        x0 = std::min(std::max(x0, minval), maxval);
        x0 *= exp_prescale;
        Cv64suf buf;

        int xi = saturate_cast<int>(x0);
        x0 = (x0 - xi)*exp_postscale;

        int t = (xi >> EXPTAB_SCALE) + 1023;
        t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
        buf.i = (int64)t << 52;

        y[i] = buf.f * expTab[xi & EXPTAB_MASK] * (((((A0*x0 + A1)*x0 + A2)*x0 + A3)*x0 + A4)*x0 + A5);
    }
}
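The 2^n factor above is built directly from exponent bits: the clamped integer part is biased by 1023, shifted into the exponent field with v_shl<52>, and reinterpreted as a double. A standalone sketch of that trick (pow2i is a hypothetical name, not part of the snippet):

#include <cstdint>
#include <cstring>
#include <cmath>
#include <cassert>

// For -1022 <= n <= 1023, the IEEE-754 double with bit pattern
// (n + 1023) << 52 (biased exponent, zero mantissa) is exactly 2^n.
static double pow2i(int n)
{
    uint64_t bits = (uint64_t)(n + 1023) << 52;
    double d;
    std::memcpy(&d, &bits, sizeof d);
    return d;
}

int main()
{
    assert(pow2i(10) == std::ldexp(1.0, 10));   // 1024
    assert(pow2i(-3) == 0.125);
    return 0;
}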
Example #5
void exp32f( const float *_x, float *y, int n )
{
    CV_INSTRUMENT_REGION();

    const float* const expTab_f = cv::details::getExpTab32f();

    const float
    A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0),
    A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0),
    A2 = (float)(.2402265109513301490103372422686535526573 / EXPPOLY_32F_A0),
    A1 = (float)(.5550339366753125211915322047004666939128e-1 / EXPPOLY_32F_A0);
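    // Float variant of the 2^x polynomial from exp64f above, truncated to degree 4.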

    int i = 0;
    const Cv32suf* x = (const Cv32suf*)_x;
    float minval = (float)(-exp_max_val/exp_prescale);
    float maxval = (float)(exp_max_val/exp_prescale);
    float postscale = (float)exp_postscale;

#if CV_SIMD
    const int VECSZ = v_float32::nlanes;
    const v_float32 vprescale = vx_setall_f32((float)exp_prescale);
    const v_float32 vpostscale = vx_setall_f32((float)exp_postscale);
    const v_float32 vminval = vx_setall_f32(minval);
    const v_float32 vmaxval = vx_setall_f32(maxval);

    const v_float32 vA1 = vx_setall_f32(A1);
    const v_float32 vA2 = vx_setall_f32(A2);
    const v_float32 vA3 = vx_setall_f32(A3);
    const v_float32 vA4 = vx_setall_f32(A4);

    const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
    bool y_aligned = (size_t)(void*)y % 32 == 0;

    for( ; i < n; i += VECSZ*2 )
    {
        if( i + VECSZ*2 > n )
        {
            if( i == 0 || _x == y )
                break;
            i = n - VECSZ*2;
            y_aligned = false;
        }

        v_float32 xf0 = vx_load(&x[i].f), xf1 = vx_load(&x[i + VECSZ].f);

        xf0 = v_min(v_max(xf0, vminval), vmaxval);
        xf1 = v_min(v_max(xf1, vminval), vmaxval);

        xf0 *= vprescale;
        xf1 *= vprescale;

        v_int32 xi0 = v_round(xf0);
        v_int32 xi1 = v_round(xf1);
        xf0 = (xf0 - v_cvt_f32(xi0))*vpostscale;
        xf1 = (xf1 - v_cvt_f32(xi1))*vpostscale;

        v_float32 yf0 = v_lut(expTab_f, xi0 & vidxmask);
        v_float32 yf1 = v_lut(expTab_f, xi1 & vidxmask);

        v_int32 v0 = vx_setzero_s32(), v127 = vx_setall_s32(127), v255 = vx_setall_s32(255);
        xi0 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi0) + v127, v0), v255);
        xi1 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi1) + v127, v0), v255);

        yf0 *= v_reinterpret_as_f32(v_shl<23>(xi0));
        yf1 *= v_reinterpret_as_f32(v_shl<23>(xi1));

        v_float32 zf0 = xf0 + vA1;
        v_float32 zf1 = xf1 + vA1;

        zf0 = v_fma(zf0, xf0, vA2);
        zf1 = v_fma(zf1, xf1, vA2);

        zf0 = v_fma(zf0, xf0, vA3);
        zf1 = v_fma(zf1, xf1, vA3);

        zf0 = v_fma(zf0, xf0, vA4);
        zf1 = v_fma(zf1, xf1, vA4);

        zf0 *= yf0;
        zf1 *= yf1;

        if( y_aligned )
        {
            v_store_aligned(y + i, zf0);
            v_store_aligned(y + i + VECSZ, zf1);
        }
        else
        {
            v_store(y + i, zf0);
            v_store(y + i + VECSZ, zf1);
        }
    }
    vx_cleanup();
#endif

    for( ; i < n; i++ )
    {
        float x0 = x[i].f;
        x0 = std::min(std::max(x0, minval), maxval);
        x0 *= (float)exp_prescale;
        Cv32suf buf;

        int xi = saturate_cast<int>(x0);
        x0 = (x0 - xi)*postscale;

        int t = (xi >> EXPTAB_SCALE) + 127;
        t = !(t & ~255) ? t : t < 0 ? 0 : 255;
        buf.i = t << 23;

        y[i] = buf.f * expTab_f[xi & EXPTAB_MASK] * ((((x0 + A1)*x0 + A2)*x0 + A3)*x0 + A4);
    }
}
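A quick smoke test for the function above (a hypothetical harness; assumes exp32f and its table and constants are linked from this snippet):

#include <cmath>
#include <cstdio>

int main()
{
    float xs[] = { -5.f, -1.f, 0.f, 0.5f, 1.f, 2.f, 10.f };
    const int n = (int)(sizeof(xs) / sizeof(xs[0]));
    float ys[sizeof(xs) / sizeof(xs[0])];
    exp32f(xs, ys, n);
    for (int k = 0; k < n; k++)
        std::printf("exp(%g) = %g (std::exp: %g)\n", xs[k], ys[k], std::exp(xs[k]));
    return 0;
}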
Example #6
    TheTest & test_loadstore()
    {
        AlignedData<R> data;
        AlignedData<R> out;

        // check if addresses are aligned and unaligned respectively
        EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16);
        EXPECT_NE((size_t)0, (size_t)&data.u.d % 16);
        EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16);
        EXPECT_NE((size_t)0, (size_t)&out.u.d % 16);

        // check some initialization methods
        R r1 = data.a;
        R r2 = v_load(data.u.d);
        R r3 = v_load_aligned(data.a.d);
        R r4(r2);
        EXPECT_EQ(data.a[0], r1.get0());
        EXPECT_EQ(data.u[0], r2.get0());
        EXPECT_EQ(data.a[0], r3.get0());
        EXPECT_EQ(data.u[0], r4.get0());

        R r_low = v_load_low((LaneType*)data.u.d);
        EXPECT_EQ(data.u[0], r_low.get0());
        v_store(out.u.d, r_low);
        for (int i = 0; i < R::nlanes/2; ++i)
        {
            EXPECT_EQ((LaneType)data.u[i], (LaneType)out.u[i]);
        }

        R r_low_align8byte = v_load_low((LaneType*)((char*)data.u.d + 8));
        EXPECT_EQ(data.u[R::nlanes/2], r_low_align8byte.get0());
        v_store(out.u.d, r_low_align8byte);
        for (int i = 0; i < R::nlanes/2; ++i)
        {
            EXPECT_EQ((LaneType)data.u[i + R::nlanes/2], (LaneType)out.u[i]);
        }

        // check some store methods
        out.u.clear();
        out.a.clear();
        v_store(out.u.d, r1);
        v_store_aligned(out.a.d, r2);
        EXPECT_EQ(data.a, out.a);
        EXPECT_EQ(data.u, out.u);

        // check more store methods
        Data<R> d, res(0);
        R r5 = d;
        v_store_high(res.mid(), r5);
        v_store_low(res.d, r5);
        EXPECT_EQ(d, res);

        // check halves load correctness
        res.clear();
        R r6 = v_load_halves(d.d, d.mid());
        v_store(res.d, r6);
        EXPECT_EQ(d, res);

        // zero, all
        Data<R> resZ = V_RegTrait128<LaneType>::zero();
        Data<R> resV = V_RegTrait128<LaneType>::all(8);
        for (int i = 0; i < R::nlanes; ++i)
        {
            EXPECT_EQ((LaneType)0, resZ[i]);
            EXPECT_EQ((LaneType)8, resV[i]);
        }

        // reinterpret_as
        v_uint8x16 vu8 = v_reinterpret_as_u8(r1); out.a.clear(); v_store((uchar*)out.a.d, vu8); EXPECT_EQ(data.a, out.a);
        v_int8x16 vs8 = v_reinterpret_as_s8(r1); out.a.clear(); v_store((schar*)out.a.d, vs8); EXPECT_EQ(data.a, out.a);
        v_uint16x8 vu16 = v_reinterpret_as_u16(r1); out.a.clear(); v_store((ushort*)out.a.d, vu16); EXPECT_EQ(data.a, out.a);
        v_int16x8 vs16 = v_reinterpret_as_s16(r1); out.a.clear(); v_store((short*)out.a.d, vs16); EXPECT_EQ(data.a, out.a);
        v_uint32x4 vu32 = v_reinterpret_as_u32(r1); out.a.clear(); v_store((unsigned*)out.a.d, vu32); EXPECT_EQ(data.a, out.a);
        v_int32x4 vs32 = v_reinterpret_as_s32(r1); out.a.clear(); v_store((int*)out.a.d, vs32); EXPECT_EQ(data.a, out.a);
        v_uint64x2 vu64 = v_reinterpret_as_u64(r1); out.a.clear(); v_store((uint64*)out.a.d, vu64); EXPECT_EQ(data.a, out.a);
        v_int64x2 vs64 = v_reinterpret_as_s64(r1); out.a.clear(); v_store((int64*)out.a.d, vs64); EXPECT_EQ(data.a, out.a);
        v_float32x4 vf32 = v_reinterpret_as_f32(r1); out.a.clear(); v_store((float*)out.a.d, vf32); EXPECT_EQ(data.a, out.a);
#if CV_SIMD128_64F
        v_float64x2 vf64 = v_reinterpret_as_f64(r1); out.a.clear(); v_store((double*)out.a.d, vf64); EXPECT_EQ(data.a, out.a);
#endif

        return *this;
    }
Example #7
inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    return v_float64x2(tab[idx[0]], tab[idx[1]]);
}
Example #8
inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}
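A small usage sketch for these scalar v_lut fallbacks, assuming a 128-bit SIMD build with OpenCV's universal intrinsics header available:

#include "opencv2/core/hal/intrin.hpp"

void lut_demo()
{
    float tab[8] = { 0.f, 10.f, 20.f, 30.f, 40.f, 50.f, 60.f, 70.f };
    cv::v_int32x4 idx(1, 3, 5, 7);
    cv::v_float32x4 vals = cv::v_lut(tab, idx);   // lanes: 10, 30, 50, 70
    float out[4];
    cv::v_store(out, vals);                       // out = {10, 30, 50, 70}
}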
Example #9
    void operator()(const Range &boundaries) const
    {
        CV_TRACE_FUNCTION();

        Mat dx, dy;
        AutoBuffer<short> dxMax(0), dyMax(0);
        std::deque<uchar*> stack, borderPeaksLocal;
        const int rowStart = max(0, boundaries.start - 1), rowEnd = min(src.rows, boundaries.end + 1);
        int *_mag_p, *_mag_a, *_mag_n;
        short *_dx, *_dy, *_dx_a = NULL, *_dy_a = NULL, *_dx_n = NULL, *_dy_n = NULL;
        uchar *_pmap;
        double scale = 1.0;

        CV_TRACE_REGION("gradient")
        if(needGradient)
        {
            if (aperture_size == 7)
            {
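                // (presumably) scale down the large aperture-7 Sobel responses
                // so they stay within CV_16S range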
                scale = 1 / 16.0;
            }
            Sobel(src.rowRange(rowStart, rowEnd), dx, CV_16S, 1, 0, aperture_size, scale, 0, BORDER_REPLICATE);
            Sobel(src.rowRange(rowStart, rowEnd), dy, CV_16S, 0, 1, aperture_size, scale, 0, BORDER_REPLICATE);
        }
        else
        {
            dx = src.rowRange(rowStart, rowEnd);
            dy = src2.rowRange(rowStart, rowEnd);
        }

        CV_TRACE_REGION_NEXT("magnitude");
        if(cn > 1)
        {
            dxMax.allocate(2 * dx.cols);
            dyMax.allocate(2 * dy.cols);
            _dx_a = (short*)dxMax;
            _dx_n = _dx_a + dx.cols;
            _dy_a = (short*)dyMax;
            _dy_n = _dy_a + dy.cols;
        }

        // _mag_p: previous row, _mag_a: actual row, _mag_n: next row
#if CV_SIMD128
        AutoBuffer<int> buffer(3 * (mapstep * cn + CV_MALLOC_SIMD128));
        _mag_p = alignPtr((int*)buffer + 1, CV_MALLOC_SIMD128);
        _mag_a = alignPtr(_mag_p + mapstep * cn, CV_MALLOC_SIMD128);
        _mag_n = alignPtr(_mag_a + mapstep * cn, CV_MALLOC_SIMD128);
#else
        AutoBuffer<int> buffer(3 * (mapstep * cn));
        _mag_p = (int*)buffer + 1;
        _mag_a = _mag_p + mapstep * cn;
        _mag_n = _mag_a + mapstep * cn;
#endif

        // Zero the whole next row on the first pass (the ring buffer is not yet
        // full); otherwise just its left and right border elements
        if(rowStart == boundaries.start)
            memset(_mag_n - 1, 0, mapstep * sizeof(int));
        else
            _mag_n[src.cols] = _mag_n[-1] = 0;

        _mag_a[src.cols] = _mag_a[-1] = _mag_p[src.cols] = _mag_p[-1] = 0;

        // calculate magnitude and angle of gradient, perform non-maxima suppression.
        // fill the map with one of the following values:
        //   0 - the pixel might belong to an edge
        //   1 - the pixel can not belong to an edge
        //   2 - the pixel does belong to an edge
        for (int i = rowStart; i <= boundaries.end; ++i)
        {
            // Scroll the ring buffer
            std::swap(_mag_n, _mag_a);
            std::swap(_mag_n, _mag_p);

            if(i < rowEnd)
            {
                // Next row calculation
                _dx = dx.ptr<short>(i - rowStart);
                _dy = dy.ptr<short>(i - rowStart);

                if (L2gradient)
                {
                    int j = 0, width = src.cols * cn;
#if CV_SIMD128
                    if (haveSIMD)
                    {
                        for ( ; j <= width - 8; j += 8)
                        {
                            v_int16x8 v_dx = v_load((const short*)(_dx + j));
                            v_int16x8 v_dy = v_load((const short*)(_dy + j));

                            v_int32x4 v_dxp_low, v_dxp_high;
                            v_int32x4 v_dyp_low, v_dyp_high;
                            v_expand(v_dx, v_dxp_low, v_dxp_high);
                            v_expand(v_dy, v_dyp_low, v_dyp_high);

                            v_store_aligned((int *)(_mag_n + j), v_dxp_low*v_dxp_low+v_dyp_low*v_dyp_low);
                            v_store_aligned((int *)(_mag_n + j + 4), v_dxp_high*v_dxp_high+v_dyp_high*v_dyp_high);
                        }
                    }
#endif
                    for ( ; j < width; ++j)
                        _mag_n[j] = int(_dx[j])*_dx[j] + int(_dy[j])*_dy[j];
                }
                else
                {
                    int j = 0, width = src.cols * cn;
#if CV_SIMD128
                    if (haveSIMD)
                    {
                        for(; j <= width - 8; j += 8)
                        {
                            v_int16x8 v_dx = v_load((const short *)(_dx + j));
                            v_int16x8 v_dy = v_load((const short *)(_dy + j));

                            v_dx = v_reinterpret_as_s16(v_abs(v_dx));
                            v_dy = v_reinterpret_as_s16(v_abs(v_dy));

                            v_int32x4 v_dx_ml, v_dy_ml, v_dx_mh, v_dy_mh;
                            v_expand(v_dx, v_dx_ml, v_dx_mh);
                            v_expand(v_dy, v_dy_ml, v_dy_mh);

                            v_store_aligned((int *)(_mag_n + j), v_dx_ml + v_dy_ml);
                            v_store_aligned((int *)(_mag_n + j + 4), v_dx_mh + v_dy_mh);
                        }
                    }
#endif
                    for ( ; j < width; ++j)
                        _mag_n[j] = std::abs(int(_dx[j])) + std::abs(int(_dy[j]));
                }

                if(cn > 1)
                {
                    std::swap(_dx_n, _dx_a);
                    std::swap(_dy_n, _dy_a);

                    for(int j = 0, jn = 0; j < src.cols; ++j, jn += cn)
                    {
                        int maxIdx = jn;
                        for(int k = 1; k < cn; ++k)
                            if(_mag_n[jn + k] > _mag_n[maxIdx]) maxIdx = jn + k;

                        _mag_n[j] = _mag_n[maxIdx];
                        _dx_n[j] = _dx[maxIdx];
                        _dy_n[j] = _dy[maxIdx];
                    }

                    _mag_n[src.cols] = 0;
                }

                // at the very beginning we do not have a complete ring
                // buffer of 3 magnitude rows for non-maxima suppression
                if (i <= boundaries.start)
                    continue;
            }
            else
            {
                memset(_mag_n - 1, 0, mapstep * sizeof(int));

                if(cn > 1)
                {
                    std::swap(_dx_n, _dx_a);
                    std::swap(_dy_n, _dy_a);
                }
            }

            // From here on the actual src row is (i - 1)
            // Set left and right border to 1
#if CV_SIMD128
            if(haveSIMD)
                _pmap = map.ptr<uchar>(i) + CV_MALLOC_SIMD128;
            else
#endif
                _pmap = map.ptr<uchar>(i) + 1;

            _pmap[src.cols] = _pmap[-1] = 1;

            if(cn == 1)
            {
                _dx = dx.ptr<short>(i - rowStart - 1);
                _dy = dy.ptr<short>(i - rowStart - 1);
            }
            else
            {
                _dx = _dx_a;
                _dy = _dy_a;
            }

            const int TG22 = 13573;
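            // TG22 = tan(22.5°) in Q15 fixed point (0.4142135... * 2^15 ≈ 13573).
            // Below, |dy| << 15 is compared against |dx|*TG22 and against
            // tg67x = tg22x + (|dx| << 16), which is |dx|*tan(67.5°) in the same
            // scale, since tan(67.5°) = tan(22.5°) + 2.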
            int j = 0;
#if CV_SIMD128
            if (haveSIMD)
            {
                const v_int32x4 v_low = v_setall_s32(low);
                const v_int8x16 v_one = v_setall_s8(1);

                for (; j <= src.cols - 32; j += 32)
                {
                    v_int32x4 v_m1 = v_load_aligned((const int*)(_mag_a + j));
                    v_int32x4 v_m2 = v_load_aligned((const int*)(_mag_a + j + 4));
                    v_int32x4 v_m3 = v_load_aligned((const int*)(_mag_a + j + 8));
                    v_int32x4 v_m4 = v_load_aligned((const int*)(_mag_a + j + 12));

                    v_int32x4 v_cmp1 = v_m1 > v_low;
                    v_int32x4 v_cmp2 = v_m2 > v_low;
                    v_int32x4 v_cmp3 = v_m3 > v_low;
                    v_int32x4 v_cmp4 = v_m4 > v_low;

                    v_m1 = v_load_aligned((const int*)(_mag_a + j + 16));
                    v_m2 = v_load_aligned((const int*)(_mag_a + j + 20));
                    v_m3 = v_load_aligned((const int*)(_mag_a + j + 24));
                    v_m4 = v_load_aligned((const int*)(_mag_a + j + 28));

                    v_store_aligned((signed char*)(_pmap + j), v_one);
                    v_store_aligned((signed char*)(_pmap + j + 16), v_one);

                    v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2);
                    v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4);

                    v_cmp1 = v_m1 > v_low;
                    v_cmp2 = v_m2 > v_low;
                    v_cmp3 = v_m3 > v_low;
                    v_cmp4 = v_m4 > v_low;

                    v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81);

                    v_cmp80 = v_pack(v_cmp1, v_cmp2);
                    v_cmp81 = v_pack(v_cmp3, v_cmp4);

                    unsigned int mask = v_signmask(v_cmp);

                    v_cmp = v_pack(v_cmp80, v_cmp81);
                    mask |= v_signmask(v_cmp) << 16;

                    if (mask)
                    {
                        int k = j;

                        do
                        {
                            int l = trailingZeros32(mask);
                            k += l;
                            mask >>= l;

                            int m = _mag_a[k];
                            short xs = _dx[k];
                            short ys = _dy[k];
                            int x = (int)std::abs(xs);
                            int y = (int)std::abs(ys) << 15;

                            int tg22x = x * TG22;

                            if (y < tg22x)
                            {
                                if (m > _mag_a[k - 1] && m >= _mag_a[k + 1])
                                {
                                    CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                }
                            }
                            else
                            {
                                int tg67x = tg22x + (x << 16);
                                if (y > tg67x)
                                {
                                    if (m > _mag_p[k] && m >= _mag_n[k])
                                    {
                                        CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                    }
                                }
                                else
                                {
                                    int s = (xs ^ ys) < 0 ? -1 : 1;
                                    if(m > _mag_p[k - s] && m > _mag_n[k + s])
                                    {
                                        CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                    }
                                }
                            }
                            ++k;
                        } while((mask >>= 1));
                    }
                }

                if (j <= src.cols - 16)
                {
                    v_int32x4 v_m1 = v_load_aligned((const int*)(_mag_a + j));
                    v_int32x4 v_m2 = v_load_aligned((const int*)(_mag_a + j + 4));
                    v_int32x4 v_m3 = v_load_aligned((const int*)(_mag_a + j + 8));
                    v_int32x4 v_m4 = v_load_aligned((const int*)(_mag_a + j + 12));

                    v_store_aligned((signed char*)(_pmap + j), v_one);

                    v_int32x4 v_cmp1 = v_m1 > v_low;
                    v_int32x4 v_cmp2 = v_m2 > v_low;
                    v_int32x4 v_cmp3 = v_m3 > v_low;
                    v_int32x4 v_cmp4 = v_m4 > v_low;

                    v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2);
                    v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4);

                    v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81);
                    unsigned int mask = v_signmask(v_cmp);

                    if (mask)
                    {
                        int k = j;

                        do
                        {
                            int l = trailingZeros32(mask);
                            k += l;
                            mask >>= l;

                            int m = _mag_a[k];
                            short xs = _dx[k];
                            short ys = _dy[k];
                            int x = (int)std::abs(xs);
                            int y = (int)std::abs(ys) << 15;

                            int tg22x = x * TG22;

                            if (y < tg22x)
                            {
                                if (m > _mag_a[k - 1] && m >= _mag_a[k + 1])
                                {
                                    CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                }
                            }
                            else
                            {
                                int tg67x = tg22x + (x << 16);
                                if (y > tg67x)
                                {
                                    if (m > _mag_p[k] && m >= _mag_n[k])
                                    {
                                        CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                    }
                                }
                                else
                                {
                                    int s = (xs ^ ys) < 0 ? -1 : 1;
                                    if(m > _mag_p[k - s] && m > _mag_n[k + s])
                                    {
                                        CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                    }
                                }
                            }
                            ++k;
                        } while((mask >>= 1));
                    }
                    j += 16;
                }
            }
#endif
            for (; j < src.cols; j++)
            {
                int m = _mag_a[j];

                if (m > low)
                {
                    short xs = _dx[j];
                    short ys = _dy[j];
                    int x = (int)std::abs(xs);
                    int y = (int)std::abs(ys) << 15;

                    int tg22x = x * TG22;

                    if (y < tg22x)
                    {
                        if (m > _mag_a[j - 1] && m >= _mag_a[j + 1])
                        {
                            CANNY_CHECK(m, high, (_pmap+j), stack);
                        }
                    }
                    else
                    {
                        int tg67x = tg22x + (x << 16);
                        if (y > tg67x)
                        {
                            if (m > _mag_p[j] && m >= _mag_n[j])
                            {
                                CANNY_CHECK(m, high, (_pmap+j), stack);
                            }
                        }
                        else
                        {
                            int s = (xs ^ ys) < 0 ? -1 : 1;
                            if(m > _mag_p[j - s] && m > _mag_n[j + s])
                            {
                                CANNY_CHECK(m, high, (_pmap+j), stack);
                            }
                        }
                    }
                }
                _pmap[j] = 1;
            }
        }
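For reference, a standalone check of the sector constant used in the non-maxima suppression above (a sketch; only the TG22 value is taken from the snippet):

#include <cmath>
#include <cstdio>

int main()
{
    const double pi = 3.14159265358979323846;
    double tg22 = std::tan(22.5 * pi / 180.0);            // 0.4142135...
    std::printf("%d\n", (int)(tg22 * (1 << 15) + 0.5));   // prints 13573
    return 0;
}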