コード例 #1
0
// Row-by-row conversion of a 2D buffer: every destination element receives
// saturate_cast<_Td>(|src * a + b|).
//
//   src, dst     - first element of the source / destination buffer
//   sstep, dstep - row strides, given in bytes (converted to element counts below)
//   size         - size.width elements per row, size.height rows
//   a, b         - multiplier and offset applied before taking the absolute value
//
// When CV_SIMD is enabled each row is processed two vector registers at a time
// and the scalar loop only handles whatever the vector loop left over.
template<typename _Ts, typename _Td> inline void
cvtabs_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
            Size size, float a, float b )
{
#if CV_SIMD
    v_float32 v_scale = vx_setall_f32(a);
    v_float32 v_shift = vx_setall_f32(b);
    const int chunk = v_float32::nlanes*2;
#endif
    // strides arrive in bytes; pointer arithmetic below needs elements
    sstep /= sizeof(src[0]);
    dstep /= sizeof(dst[0]);

    for( int row = 0; row < size.height; ++row )
    {
        int col = 0;
#if CV_SIMD
        while( col < size.width )
        {
            if( col > size.width - chunk )
            {
                // Not enough elements left for a full chunk. Re-processing the
                // last full chunk is only done when the row holds at least one
                // chunk and the operation is not in-place; otherwise the scalar
                // loop below finishes the row.
                if( col == 0 || src == (_Ts*)dst )
                    break;
                col = size.width - chunk;
            }
            v_float32 lo, hi;
            vx_load_pair_as(src + col, lo, hi);
            lo = v_fma(lo, v_scale, v_shift);
            hi = v_fma(hi, v_scale, v_shift);
            v_store_pair_as(dst + col, v_abs(lo), v_abs(hi));
            col += chunk;
        }
#endif
        while( col < size.width )
        {
            dst[col] = saturate_cast<_Td>(std::abs(src[col]*a + b));
            ++col;
        }

        src += sstep;
        dst += dstep;
    }
}
コード例 #2
0
/*
 * Sums v_abs() of the element-wise difference (visitor - local) over the
 * first NUMPTOS entries of the two m_list arrays and returns the total.
 */
double match_hexadecagon(hexadecagon_t* local,hexadecagon_t* visitor)
{
  double total = 0;
  int idx = 0;

  while (idx < NUMPTOS)
  {
    total += v_abs(visitor->m_list[idx] - local->m_list[idx]);
    ++idx;
  }

  return total;
}
コード例 #3
0
ファイル: bhi_kepler.c プロジェクト: cmaureir/bhint
/*
 * Retrieve constants of motion for particle _pos_ relative to particle 0,
 * treating the pair as a 2-body problem reduced to a 1-body problem
 * (only the mass of particle 0 is passed on to get_constants()).
 *
 * Uses the predicted positions / velocities (xp, vp) of both particles.
 *
 * Outputs:
 *   *red_e = v_abs(e)      (e as filled in by get_constants())
 *   *red_a = a
 *   *red_t = 2*pi / omega
 *   *red_j = v_abs(j)      (j as filled in by get_constants())
 */
void get_reduced(struct particle parts[], int pos, double *red_e, double *red_a, double *red_t, double *red_j)
{
    _enter_function(_UL_KEPLER, _UL_KEPLER_GET_REDUCED);
    double rel_x[3], rel_v[3], j_vec[3], e_vec[3], semi_major, ang_freq;
    int k;

    // position and velocity of particle _pos_ as seen from particle 0
    for(k = 0; k < 3; k++)
    {
        rel_x[k] = parts[pos].xp[k] - parts[0].xp[k];
        rel_v[k] = parts[pos].vp[k] - parts[0].vp[k];
    }

    get_constants(rel_x, rel_v, parts[0].m, j_vec, e_vec, &semi_major, &ang_freq);

    *red_j = v_abs(j_vec);
    *red_e = v_abs(e_vec);
    *red_a = semi_major;
    *red_t = 2*M_PI/ang_freq;

    _exit_function();
}
コード例 #4
0
ファイル: test_intrin_utils.hpp プロジェクト: gini/opencv
    // Lane-by-lane check of v_sqrt, v_invsqrt and v_abs against scalar references.
    //
    // dataA holds the default Data<R> contents; dataD is a second default
    // Data<R> multiplied by -1, so v_abs is exercised on negated input.
    // Returns *this so calls can be chained.
    TheTest & test_sqrt_abs()
    {
        Data<R> dataA, dataD;
        dataD *= -1.0;
        R a = dataA, d = dataD;

        Data<R> resB = v_sqrt(a), resC = v_invsqrt(a), resE = v_abs(d);
        for (int i = 0; i < R::nlanes; ++i)
        {
            EXPECT_COMPARE_EQ((float)std::sqrt(dataA[i]), (float)resB[i]);
            EXPECT_COMPARE_EQ(1/(float)std::sqrt(dataA[i]), (float)resC[i]);
            // The reference is built from the vector that was actually fed to
            // v_abs, and std::abs guarantees a floating-point overload (a bare
            // abs() may resolve to the int version and truncate).
            EXPECT_COMPARE_EQ((float)std::abs(dataD[i]), (float)resE[i]);
        }

        return *this;
    }
コード例 #5
0
ファイル: test_intrin_utils.hpp プロジェクト: gini/opencv
    // Verifies v_abs on the difference of two registers: the result register
    // is the unsigned counterpart of R and every lane has to equal the scalar
    // std::abs of the corresponding lane difference. Returns *this for chaining.
    TheTest & test_abs()
    {
        typedef typename V_RegTrait128<LaneType>::u_reg UnsignedReg;
        typedef typename UnsignedReg::lane_type UnsignedLane;

        Data<R> dataA;
        Data<R> dataB(10);

        R lhs = dataA;
        R rhs = dataB;
        lhs = lhs - rhs;

        Data<UnsignedReg> resC = v_abs(lhs);

        int lane = 0;
        while (lane < UnsignedReg::nlanes)
        {
            EXPECT_EQ((UnsignedLane)std::abs(dataA[lane] - dataB[lane]), resC[lane]);
            ++lane;
        }

        return *this;
    }
コード例 #6
0
// Feeds one frame of magnitudes into the curve and returns the summed
// per-bin value for bins 0 .. m_lastPerceivedBin.
//
// Steps, in order:
//   1. v_convert mag into m_tmpbuf and v_square it,
//   2. v_subtract m_tmpbuf from/into m_mag, then v_abs and v_sqrt m_mag
//      (exact operand semantics of the v_* helpers are defined elsewhere),
//   3. sum the resulting m_mag entries,
//   4. v_copy m_tmpbuf into m_mag so it is available on the next call.
//
// The increment argument is not used by this implementation.
double
SpectralDifferenceAudioCurve::processDouble(const double *mag, int increment)
{
    const int binCount = m_lastPerceivedBin + 1;

    v_convert(m_tmpbuf, mag, binCount);
    v_square(m_tmpbuf, binCount);
    v_subtract(m_mag, m_tmpbuf, binCount);
    v_abs(m_mag, binCount);
    v_sqrt(m_mag, binCount);

    double total = 0.0;
    int bin = 0;
    while (bin < binCount) {
        total += m_mag[bin];
        ++bin;
    }

    v_copy(m_mag, m_tmpbuf, binCount);
    return total;
}
コード例 #7
0
ファイル: bhi_timestep.c プロジェクト: cmaureir/bhint
/*
 * Check whether particle _p_ approaches particle _pk_ too quickly during
 * its next step and shrink the timestep if so.
 *
 * The relative position x, velocity v and acceleration a of _p_ with respect
 * to _pk_ are built first (extrapolating _pk_ to p->t when it is not active).
 * Then, in a loop:
 *   - if the squared separation after one step differs from the current one
 *     by more than a factor MAX_APPROACH_FACTOR_2 (either way), p->dt is
 *     halved and the check is repeated;
 *   - if the current squared separation is below (3 * C_2G_C2 * (m_p + m_pk))^2
 *     a collision is reported;
 *   - if the estimated time of closest approach lies inside the next step,
 *     the squared closest distance is evaluated; it may trigger a collision
 *     report, an optional close-encounter warning (WARN_CLOSEENC) and a
 *     further halving of p->dt.
 *
 * parts : particle array; only parts[0] is read, and only when WARN_CLOSEENC
 *         is defined.
 * p     : particle being stepped; p->dt may be halved repeatedly.
 * pk    : the other particle (must differ from p); pk->dt may be halved when
 *         SYNCHRONIZE_APPROACHING_TIMESTEPS is defined and pk is not active.
 *
 * Returns 1 if a collision was reported, 0 otherwise.
 */
int check_fast_approaches(  struct particle *parts,
                            struct particle *p, struct particle *pk/*,
                            double r_2*/)
{
    _enter_function(_UL_TIMESTEP, _UL_TIMESTEP_CHECK_FAST_APPROACHES);
    int i, collision=0;
    double temp, r_close_2=-1., r_temp_2, t_close, dt=.0, dt2=.0, dt3=.0, dt4=.0, dt5=.0;
    double *px=p->x, *pv=p->v, *pkx, *pkv, r_2;
    double x[3], v[3], a[3], a_[3];

    assert(p != pk);
    add_over(1, &count_approach_checks, &count_approach_checks_over);

    if(pk->active)
    {
        pkx = pk->x;
        pkv = pk->v;
    }

    else
        // have to use predicted x, v and corrected derivatives
    {
#ifndef USE_GRAPE
        pkx = pk->xp;
#else
        pkx = pk->x;
#endif
        pkv = pk->v;
        // Taylor coefficients dt^n / n! for extrapolating pk from pk->t to p->t
        dt = p->t - pk->t;
        dt2 = .5 * dt * dt;
        dt3 = dt * dt2 * _1_3;
        dt4 = .25 * dt * dt3;
        dt5 = .2 * dt * dt4;
    }

    for(i = 0; i < 3; i++)
    {
        // Without USE_GRAPE this condition is always true: pkx already points
        // at the predicted position (pk->xp) for an inactive pk, so no explicit
        // extrapolation is needed. With USE_GRAPE an inactive pk is extrapolated
        // from pk->x in the else branch.
        if( 1
#ifdef USE_GRAPE
                && pk->active
#endif
          )
        {
            x[i]  = px[i] - pkx[i];
        }
        else
        {
            x[i]  = px[i] - (pkx[i]
                             + dt * pk->v[i]
                             + dt2 * (pk->a[i] + pk->ha[i])
                             + dt3 * (pk->a_[i] + pk->ha_[i])
                             + dt4 * (pk->a_2[i] + pk->ha_2[i])
#ifndef USE_GRAPE
                             + dt5 * (pk->a_3[i] + pk->ha_3[i])
#endif
                            );
        }

        if(pk->active)
        {
            v[i] = pv[i] - pkv[i];
            a[i]  = p->ha[i]  + p->a[i]  - pk->ha[i]  - pk->a[i];
            a_[i] = p->ha_[i] + p->a_[i] - pk->ha_[i] - pk->a_[i];
        }

        else
        {
            // velocity, acceleration and jerk of pk extrapolated to p->t
            v[i] = pv[i] - (pkv[i]
                            + dt * (pk->a[i] + pk->ha[i])
                            + dt2 * (pk->a_[i] + pk->ha_[i])
                            + dt3 * (pk->a_2[i] + pk->ha_2[i])
#ifndef USE_GRAPE
                            + dt4 * (pk->a_3[i] + pk->ha_3[i])
#endif
                           );
            a[i]  = p->ha[i]  + p->a[i]  - (pk->a[i] + pk->ha[i]
                                            + dt * (pk->a_[i] + pk->ha_[i])
                                            + dt2 * (pk->a_2[i] + pk->ha_2[i])
#ifndef USE_GRAPE
                                            + dt3 * (pk->a_3[i] + pk->ha_3[i])
#endif
                                           );
            a_[i] = p->ha_[i] + p->a_[i] - (pk->a_[i] + pk->ha_[i]
                                            + dt * (pk->a_2[i] + pk->ha_2[i])
#ifndef USE_GRAPE
                                            + dt2* (pk->a_3[i] + pk->ha_3[i])
#endif
                                           );
        }
    }

    r_2 = scal_prod(x, x);
    // linear approximation of time of closest encounter
    //t_close = -scal_prod(x, v) / scal_prod(v, v);
    // 2nd order approximation of time of closest encounter
    //
    // Minimising |x + v t + a t^2 / 2|^2 over t gives the cubic
    //   t^3 + 3 (v.a)/a^2 t^2 + 2 (v^2 + x.a)/a^2 t + 2 (x.v)/a^2 = 0.
    // Substituting t = y + dy with dy = -(v.a)/a^2 yields the depressed cubic
    //   y^3 + _p y + _q = 0,
    // which is solved below; _d = 4 _p^3 + 27 _q^2 selects the case.
    // NOTE(review): a2 == 0 (vanishing relative acceleration) is not guarded
    // against and would divide by zero here.

    double xv, xa, v2, va, a2, _p, _q, _p3, _q2, _d, _u, _v, dy, t2, t3, _1_a2;
    xv = scal_prod(x, v);
    xa = scal_prod(x, a);
    v2 = scal_prod(v, v);
    va = scal_prod(v, a);
    a2 = scal_prod(a, a);
    _1_a2 = 1. / a2;
    dy = - va * _1_a2;
    _p = (a2 * 2.* (v2 + xa) - 3. * va * va) * (_1_a2 * _1_a2);
    _p3 = _p * _p * _p;
    _q = (2. * va * va * va - va * 2.*(v2 + xa) * a2 + 2. * xv * a2 * a2) * (_1_a2 * _1_a2 * _1_a2);
    _q2 = _q * _q;
    _d = 4. * _p3 + 27. * _q2;

    if(_d > 0)
    {
        // one real root (Cardano)
        _u = -.5 * _q;
        _v = sqrt(.25 * _q2 + _p3 * _1_27);
        t_close = cbrt(_u + _v) + cbrt(_u - _v) + dy;
    }
    else if(_d == 0)
    {
        // double root cbrt(q/2) and simple root cbrt(-4q); take the smallest positive
        t_close = cbrt(.5 * _q)  + dy;
        t3      = cbrt(-4. * _q) + dy;
        if(t3 > 0 && (t3 < t_close || t_close <= 0))
        {
            t_close = t3;
        }
    }

    else // _d < 0
    {
        // three real roots, trigonometric form:
        //   y_k = 2 sqrt(-p/3) * cos(phi - 2 pi k / 3),
        //   phi = acos(-q/2 * sqrt(-27/p^3)) / 3,
        // rewritten using cos(phi -/+ 2pi/3) = -cos(phi +/- pi/3).
        // The amplitude has to multiply the cosine of the angle, not the
        // angle itself.
        _u = sqrt(-4. *_1_3 * _p);
        _v = acos(-.5 * _q * sqrt(-27. / _p3)) * _1_3;
        t_close =  _u * cos(_v)                + dy;
        t2      = -_u * cos(_v + M_PI * _1_3) + dy;
        t3      = -_u * cos(_v - M_PI * _1_3) + dy;
        // keep the smallest positive root
        if(t2 > 0 && (t2 < t_close || t_close <= 0)) t_close = t2;
        if(t3 > 0 && (t3 < t_close || t_close <= 0)) t_close = t3;
    }
    while(1)
    {
        // check distance after next step
        r_temp_2 = .0;
        for(i = 0; i < DIMENSIONS; i++)
        {
            temp = x[i] + p->dt * (v[i] + p->dt * .5 * (a[i] /*+ p->dt / 3. * a_[i]*/));
            r_temp_2 += temp * temp;
        }

        // separation changes by more than the allowed factor within one step
        if(r_2 > r_temp_2 * MAX_APPROACH_FACTOR_2 || r_2 * MAX_APPROACH_FACTOR_2 < r_temp_2)
        {
            add_over(1, &count_approach_reduce_t, &count_approach_reduce_t_over);
#ifdef DEBUG_ALL
            fprintf(get_file(FILE_DEBUG),
                    "\t# halving dt: approach  m%d - m%d: \tr(t=%1.6e)=%1.2e\t\tr(t=%1.6e)=%1.2e\n",
                    pk->name, p->name,
                    t_total(p->t), convert_length(sqrt(r_2), 0),
                    t_total(p->t+p->dt), convert_length(sqrt(r_temp_2), 0));
            fflush(get_file(FILE_DEBUG));
#endif
            p->dt *= .5;

#ifdef SYNCHRONIZE_APPROACHING_TIMESTEPS
            if(!pk->active)
            {
                while(pk->htlast + .5 * pk->dt > p->t + p->dt + DT_TOLERANCE)
                {
                    pk->dt *= .5;
#ifdef DEBUG_ALL
                    fprintf(get_file(FILE_DEBUG),
                            "#### [t=%1.12e] shrinking timestep for m%d as of close encounter with m%d to %e ####\n",
                            t_total(p->t),
                            pk->name,
                            p->name,
                            t_total(pk->dt));
                    fflush(get_file(FILE_DEBUG));
#endif
                }
            }
#endif

            // re-check with the reduced timestep
            continue;
        }

        if(r_2 < 9. * C_2G_C2 * C_2G_C2 * (pk->m + p->m) * (pk->m + p->m))
        {
            // collision in 3 Schwarzschild-radii
            collision = 1;
            fprintf(get_file(FILE_WARNING), "#### [t=%1.12e] COLLISION of m%d and m%d at %1.12e: %e (r_S = %e) ####\n",
                    t_total(p->t),
                    p->name,
                    pk->name,
                    t_total(p->t + t_close),
                    convert_length(sqrt(r_2), 0),
                    convert_length(C_2G_C2 * (pk->m + p->m), 0));
            fflush(get_file(FILE_WARNING));
        }


        if(t_close > .0 && t_close < p->dt)
        {
            // close encounter will happen _during_ next step, now calculate distance
            // (computed once only: r_close_2 starts at -1 and t_close does not change)
            if(r_close_2 <.0)
            {
                r_close_2 = .0;
                for(i = 0; i < DIMENSIONS; i++)
                {
                    temp = (x[i] + t_close * (v[i] + .5 * t_close * a[i]));
                    r_close_2 += temp * temp;
                }
            }
            if(r_close_2 < square(3. * C_2G_C2 * (pk->m + p->m)))
            {
                // collision in 3 Schwarzschild-radii
                collision = 1;
                fprintf(get_file(FILE_WARNING), "#### [t=%1.12e] COLLISION of m%d and m%d at %1.12e: %e (r_S = %e) ####\n",
                        t_total(p->t),
                        p->name,
                        pk->name,
                        t_total(p->t + t_close),
                        convert_length(sqrt(r_close_2), 0),
                        convert_length(C_2G_C2 * (pk->m + p->m), 0));
                fflush(get_file(FILE_WARNING));
            }

            // approach to small multiple of impact parameter:
            // r'_12 < warn_fact * b = warn_fact * 2 * r_1 * m / M

#ifdef WARN_CLOSEENC
            if(r_close_2 * parts->m * parts->m < square(WARN_APPROACH_FACT * 2 * pk->m) * scal_prod(p->xp, p->xp)
                    && (N_MAX_DETAIL < -1 || p->name <= N_MAX_DETAIL || pk->name <= N_MAX_DETAIL)
              )
            {
                fprintf(get_file(FILE_WARNING),
                        "\t# predicted close encounter m%d - m%d: \tr(t=%1.6e)=%1.2e\t\tr(t=%1.6e)=%1.2e=%1.2fb\tp.dt=%1.2e\tpk->dt=%1.2e (%1.2e el.) [%d:%d]\n",
                        pk->name, p->name,
                        t_total(p->t), convert_length(sqrt(r_2), 0),
                        t_total(p->t + t_close), convert_length(sqrt(r_close_2), 0),
                        sqrt(r_close_2) / (2. * v_abs(p->xp) * pk->m) * parts->m,
                        convert_time(p->dt, 0),
                        convert_time(pk->dt, 0),
                        convert_time(p->t - pk->t, 0),
                        pk->nearestneighbour, p->nearestneighbour);
                fprintf(get_file(FILE_WARNING),
                        " PCE %1.12e\t%d\t%e\t%1.10e\t%1.10e\t%1.10e\t%1.10e\t%1.10e\t%1.10e\t%d\t%e\t%1.10e\t%1.10e\t%1.10e\t%1.10e\t%1.10e\t%1.10e\n",
                        t_total(p->t),
                        pk->name, convert_mass(pk->m, 0),
                        convert_length(pkx[0], 0), convert_length(pkx[1], 0), convert_length(pkx[2], 0),
                        convert_length(convert_time(pkv[0], 1), 0), convert_length(convert_time(pkv[1], 1), 0), convert_length(convert_time(pkv[2], 1), 0),
                        p->name, convert_mass(p->m, 0),
                        convert_length(px[0], 0), convert_length(px[1], 0), convert_length(px[2], 0),
                        convert_length(convert_time(pv[0], 1), 0), convert_length(convert_time(pv[1], 1), 0), convert_length(convert_time(pv[2], 1), 0)
                       );
                fflush(get_file(FILE_WARNING));
            }
#endif

            // closest distance is much smaller than the current one: refine the step
            if(r_2 > MAX_APPROACH_FACTOR_2 * r_close_2)
            {
                add_over(1, &count_approach_reduce_t, &count_approach_reduce_t_over);
#ifdef DEBUG_ALL
                fprintf(get_file(FILE_DEBUG),
                        "\t# halving dt: encounter m%d - m%d: \tr(t=%1.6e)=%1.2e\t\tr(t=%1.6e)=%1.2e\tstep: t=%1.6e\n",
                        pk->name, p->name,
                        t_total(p->t), convert_length(sqrt(r_2), 0),
                        t_total(p->t + t_close), convert_length(sqrt(r_close_2), 0),
                        t_total(p->t + p->dt));
                fflush(get_file(FILE_DEBUG));
#endif
                p->dt *= .5;

#ifdef SYNCHRONIZE_APPROACHING_TIMESTEPS
                if(!pk->active)
                    while(pk->htlast + .5 * pk->dt > p->t + p->dt + DT_TOLERANCE)
                    {
                        pk->dt *= .5;
#ifdef DEBUG_ALL
                        // format must consume all four arguments (time, two names, dt)
                        fprintf(get_file(FILE_DEBUG),
                                "#### [t=%1.12e] shrinking timestep for m%d as of close encounter with m%d to %e ####\n",
                                t_total(p->t),
                                pk->name,
                                p->name,
                                t_total(pk->dt));
                        fflush(get_file(FILE_DEBUG));
#endif
                    }
#endif
                continue;
            }
        }
        break;
    }

    _exit_function();
    return collision;
}
コード例 #8
0
ファイル: bhi_kepler.c プロジェクト: cmaureir/bhint
/*
 * Calculate Kepler position and velocity for given timestep _dt_
 * for particle no. _pos_.
 * _xp_ and _vp_ will be updated.
 *
 * The orbit is taken relative to parts[0], using only the mass of parts[0]
 * as central mass. _pcount_ is not used by this function.
 *
 * Optional outputs (each skipped when the pointer is NULL):
 *   curr_a, curr_e : semi-major axis and eccentricity of the current orbit
 *   out_a          : acceleration at the new position
 *   out_a_, out_a_2, out_a_3 : its 1st, 2nd and 3rd time derivatives;
 *                    each one is only filled if all lower-order outputs
 *                    before it were requested as well.
 *
 * Side effects besides xp / vp: p1->is_elliptical is updated and a warning
 * is written when the orbit type changes (or is exactly parabolic).
 */
void step_kepler_1(struct particle parts[], int pcount, int pos, double dt,
                   double *out_a, double *out_a_, double *out_a_2, double *out_a_3,
                   double *curr_a, double *curr_e)
{
    _enter_function(_UL_KEPLER, _UL_KEPLER_STEP_KEPLER_1);
    int i;
    struct particle *p0 = parts, *p1 = parts + pos;
    double r_[3], v_[3], j_[3], ecc_[3], a_[3], b_[3], _1_r2, afact, v_r_, v_v_, r_a_, v_a_, r_a__;
    double ecc, a, r, v, b, omega, e, mean, cosp, sinp;
    double m_c=p0->m, _cosp_ecc, e2, _1_ecc, _cosp_1, de_dt;//+p1->m;

    // get relative position / motion
    for(i = 0; i < 3; i++)
    {
        r_[i] = p1->xp[i] - p0->xp[i];
        v_[i] = p1->vp[i] - p0->vp[i];
    }

    // calculate ellipse constants
    // (fills j_, ecc_, a, omega; presumably angular momentum vector,
    //  eccentricity vector, semi-major axis and mean motion - confirm
    //  against get_constants())
    get_constants(r_, v_, m_c, j_, ecc_, &a, &omega);
    //printf("#  [%d]:\t%e\t%e\t%e\n", pos, v_abs(ecc_), a, omega);

    ecc = v_abs(ecc_);
    // b_ = a * sqrt(|1-e²|) * (j_ x e_) / |j_ x e_|
    vec_prod(j_, ecc_, b_);
    b = a * sqrt(fabs(1-ecc*ecc)) / v_abs(b_);
    // NOTE(review): ecc == 0 (circular orbit) divides by zero below - not guarded
    for(i = 0; i < 3; i++)
    {
        a_[i]  = a*ecc_[i]/ecc;            // semi major vector
        b_[i] *= b;                        // semi minor vector
    }

    if(curr_a != NULL) *curr_a = a;
    if(curr_e != NULL) *curr_e = ecc;

    if(ecc < 1)
    // elliptical orbit
    {
        if(!p1->is_elliptical)
        {
            fprintf(get_file(FILE_WARNING),
                    "#### [t=%1.12e] Particle #%d captured onto elliptical orbit with e=%e ####\n",
                    t_total(p1->t), pos, ecc);
                    p1->is_elliptical = 1;
        }
        // calculate eccentric anomaly e at t+dt
        // first the current one: cos(e) = (a - |r_|) / (ecc * a), clamped to [-1, 1]
        e = (a - v_abs(r_)) / (ecc * a);
        if(e >= 1.0) e = .0;
        else if(e <= -1.0) e = M_PI;
        else e = acos(e);
        // acos() only covers [0, pi]; the sign of r_ . b_ picks the half of the orbit
        if(scal_prod(r_, b_) < 0)
            e = 2*M_PI - e;
        // mean anomaly now (Kepler's equation) advanced by dt * omega
        mean = (e - ecc*sin(e)) + dt * omega;
        // only reduced from above; a negative value is passed on unchanged
        while(mean >= 2. * M_PI)
            mean -= 2. * M_PI;

        e = solve_kepler(mean, ecc);

        cosp = cos(e);
        sinp = sin(e);
        _cosp_ecc = cosp - ecc;
        de_dt = omega / (1. - ecc * cosp);
        if(ecc > .99)
        {
            // Highly eccentric orbit with e within 1e-3 of 0 (or of 2*pi):
            // cosp - ecc and 1 - ecc * cosp are recomputed from a series to
            // avoid cancellation.
            e2 = (e > 2. * M_PI - 1e-3) ? e - 2. * M_PI : e;
            if(e2 < 1e-3)
            {
                e2 *= e2;
                // presumably equals 1 - ecc, obtained from j^2 = m * a * (1 - ecc^2) - confirm
                _1_ecc    = scal_prod(j_, j_)/(p0->m*a*(1+ecc));
                // Taylor series of cos(e) - 1 up to e^6
                _cosp_1   =  - .5 * e2 * (1 - e2 / 12. * (1 - e2 / 30.));
                _cosp_ecc = _1_ecc + _cosp_1;
                de_dt     = omega / (_1_ecc - ecc * _cosp_1);
            }
        }
        for(i = 0; i < DIMENSIONS; i++)
        {
            r_[i] =   a_[i] * _cosp_ecc + b_[i] * sinp ;  // new location
            v_[i] = (-a_[i] * sinp      + b_[i] * cosp) * de_dt;   // direction of v only
        }
    }
    else
    // hyperbolic orbit  // parabolic?
    {
        if(p1->is_elliptical)
        {
            fprintf(get_file(FILE_WARNING), "#### [t=%1.12e+%1.12e] Particle #%d thrown onto hyperbolic orbit with e=%e (E=%e, a=%e) ####\n",
                    t_total(p1->t), convert_time(dt, 0), pos, ecc, p1->energy, convert_length(a, 0));
            p1->is_elliptical = 0;
        }
        if(ecc == 1)
            fprintf(get_file(FILE_WARNING), "# # # %e\tParabolic orbit of m%d treated as hyperbolic: e=%e\t(x=%e)\n",
                    t_total(p1->t), pos, ecc, convert_length(v_abs(p1->xp), 0));

        // calculate eccentric anomaly e at t+dt
        // current one from cosh(e) = (a + |r_|) / (ecc * a); its sign follows r_ . v_
        e = (a + v_abs(r_)) / (ecc * a);
        if(e < 1.0) e = .0;
        else if(scal_prod(r_, v_) < 0) e = -acosh(e);
        else e = acosh(e);

        // hyperbolic mean anomaly ecc * sinh(e) - e advanced by dt * omega
        e = kepler(ecc, ecc * sinh(e) - e + dt * omega);
        cosp = cosh(e);
        sinp = sinh(e);
        de_dt = omega / (ecc * cosp - 1.);
        for(i = 0; i < DIMENSIONS; i++)
        {
            r_[i] =   a_[i] * (ecc - cosp)  + b_[i] * sinp;  // new location
            v_[i] = (-a_[i] * sinp          + b_[i] * cosp) * de_dt;  // direction of v only
        }
    }

    // get |v_| from j_ = r_ x v_
    // NOTE(review): the final value of v is never used afterwards - the
    // rescaling of v_ in the loop below is commented out.
    v = v_abs(v_);
    r = v_abs(r_);
    v = v_abs(j_) / (r * v * sin(acos(scal_prod(r_, v_)/ (r * v))));

    for(i = 0; i < DIMENSIONS; i++)
    {
        //v_[i] *= v;
        // total motion relative to fix central mass
        p1->xp[i] = p0->xp[i] + r_[i];
        p1->vp[i] = p0->vp[i] + v_[i];
    }

    // NOTE(review): the indentation below is misleading. Each unbraced
    // for loop consists of a single assignment; the following if statement
    // is a sibling of that loop, not part of its body.
    if(out_a != NULL)
    {
        _1_r2 = 1. / scal_prod(r_, r_);
        // afact = -m / |r|^3, so out_a = -m * r_ / |r|^3
        afact = - m_c * _1_r2 * sqrt(_1_r2);
        //printf("4  %e %e %e\n", *(out_a), *(out_a+1), *(out_a+2));
        for(i = 0; i < DIMENSIONS; i++)
            out_a[i] = afact * r_[i];
            if(out_a_ != NULL)
            {
                // first derivative of the acceleration
                v_r_ = scal_prod(v_, r_);
                for(i = 0; i < DIMENSIONS; i++)
                    out_a_[i] = afact * (v_[i] - 3 * _1_r2 * v_r_ * r_[i]);
                    if(out_a_2 != NULL)
                    {
                        // second derivative of the acceleration
                        v_v_ = scal_prod(v_, v_);
                        r_a_ = scal_prod(r_, out_a);
                        for(i = 0; i < DIMENSIONS; i++)
                            out_a_2[i] = afact * (out_a[i] - 3. * _1_r2 * (v_r_ * (2. * v_[i] - 5. * v_r_ * r_[i] * _1_r2)
                                         + (v_v_ + r_a_) * r_[i]));
                        if(out_a_3 != NULL)
                        {
                            // third derivative of the acceleration
                            v_a_  = scal_prod(v_, out_a);
                            r_a__  = scal_prod(r_, out_a_);
                            for(i = 0; i < DIMENSIONS; i++)
                                out_a_3[i] = afact * (out_a_[i]
                                            - 3. * _1_r2 * (3. * v_r_ * out_a[i]
                                            + 3. * (v_v_ + r_a_)
                                            * (v_[i] - 5. * v_r_ * _1_r2 * r_[i])
                                            + (3. * v_a_ + r_a__) * r_[i]
                                            + v_r_ * v_r_ * _1_r2
                                            * (-15. * v_[i] + 35. * v_r_ * _1_r2 * r_[i])));
                        }
                    }
            }
    }

    _exit_function();
}
コード例 #9
0
ファイル: canny.cpp プロジェクト: cyberCBM/DetectO
    void operator()(const Range &boundaries) const
    {
        CV_TRACE_FUNCTION();

        Mat dx, dy;
        AutoBuffer<short> dxMax(0), dyMax(0);
        std::deque<uchar*> stack, borderPeaksLocal;
        const int rowStart = max(0, boundaries.start - 1), rowEnd = min(src.rows, boundaries.end + 1);
        int *_mag_p, *_mag_a, *_mag_n;
        short *_dx, *_dy, *_dx_a = NULL, *_dy_a = NULL, *_dx_n = NULL, *_dy_n = NULL;
        uchar *_pmap;
        double scale = 1.0;

        CV_TRACE_REGION("gradient")
        if(needGradient)
        {
            if (aperture_size == 7)
            {
                scale = 1 / 16.0;
            }
            Sobel(src.rowRange(rowStart, rowEnd), dx, CV_16S, 1, 0, aperture_size, scale, 0, BORDER_REPLICATE);
            Sobel(src.rowRange(rowStart, rowEnd), dy, CV_16S, 0, 1, aperture_size, scale, 0, BORDER_REPLICATE);
        }
        else
        {
            dx = src.rowRange(rowStart, rowEnd);
            dy = src2.rowRange(rowStart, rowEnd);
        }

        CV_TRACE_REGION_NEXT("magnitude");
        if(cn > 1)
        {
            dxMax.allocate(2 * dx.cols);
            dyMax.allocate(2 * dy.cols);
            _dx_a = (short*)dxMax;
            _dx_n = _dx_a + dx.cols;
            _dy_a = (short*)dyMax;
            _dy_n = _dy_a + dy.cols;
        }

        // _mag_p: previous row, _mag_a: actual row, _mag_n: next row
#if CV_SIMD128
        AutoBuffer<int> buffer(3 * (mapstep * cn + CV_MALLOC_SIMD128));
        _mag_p = alignPtr((int*)buffer + 1, CV_MALLOC_SIMD128);
        _mag_a = alignPtr(_mag_p + mapstep * cn, CV_MALLOC_SIMD128);
        _mag_n = alignPtr(_mag_a + mapstep * cn, CV_MALLOC_SIMD128);
#else
        AutoBuffer<int> buffer(3 * (mapstep * cn));
        _mag_p = (int*)buffer + 1;
        _mag_a = _mag_p + mapstep * cn;
        _mag_n = _mag_a + mapstep * cn;
#endif

        // For the first time when just 2 rows are filled and for left and right borders
        if(rowStart == boundaries.start)
            memset(_mag_n - 1, 0, mapstep * sizeof(int));
        else
            _mag_n[src.cols] = _mag_n[-1] = 0;

        _mag_a[src.cols] = _mag_a[-1] = _mag_p[src.cols] = _mag_p[-1] = 0;

        // calculate magnitude and angle of gradient, perform non-maxima suppression.
        // fill the map with one of the following values:
        //   0 - the pixel might belong to an edge
        //   1 - the pixel can not belong to an edge
        //   2 - the pixel does belong to an edge
        for (int i = rowStart; i <= boundaries.end; ++i)
        {
            // Scroll the ring buffer
            std::swap(_mag_n, _mag_a);
            std::swap(_mag_n, _mag_p);

            if(i < rowEnd)
            {
                // Next row calculation
                _dx = dx.ptr<short>(i - rowStart);
                _dy = dy.ptr<short>(i - rowStart);

                if (L2gradient)
                {
                    int j = 0, width = src.cols * cn;
#if CV_SIMD128
                    if (haveSIMD)
                    {
                       for ( ; j <= width - 8; j += 8)
                        {
                            v_int16x8 v_dx = v_load((const short*)(_dx + j));
                            v_int16x8 v_dy = v_load((const short*)(_dy + j));

                            v_int32x4 v_dxp_low, v_dxp_high;
                            v_int32x4 v_dyp_low, v_dyp_high;
                            v_expand(v_dx, v_dxp_low, v_dxp_high);
                            v_expand(v_dy, v_dyp_low, v_dyp_high);

                            v_store_aligned((int *)(_mag_n + j), v_dxp_low*v_dxp_low+v_dyp_low*v_dyp_low);
                            v_store_aligned((int *)(_mag_n + j + 4), v_dxp_high*v_dxp_high+v_dyp_high*v_dyp_high);
                        }
                    }
#endif
                    for ( ; j < width; ++j)
                        _mag_n[j] = int(_dx[j])*_dx[j] + int(_dy[j])*_dy[j];
                }
                else
                {
                    int j = 0, width = src.cols * cn;
#if CV_SIMD128
                    if (haveSIMD)
                    {
                        for(; j <= width - 8; j += 8)
                        {
                            v_int16x8 v_dx = v_load((const short *)(_dx + j));
                            v_int16x8 v_dy = v_load((const short *)(_dy + j));

                            v_dx = v_reinterpret_as_s16(v_abs(v_dx));
                            v_dy = v_reinterpret_as_s16(v_abs(v_dy));

                            v_int32x4 v_dx_ml, v_dy_ml, v_dx_mh, v_dy_mh;
                            v_expand(v_dx, v_dx_ml, v_dx_mh);
                            v_expand(v_dy, v_dy_ml, v_dy_mh);

                            v_store_aligned((int *)(_mag_n + j), v_dx_ml + v_dy_ml);
                            v_store_aligned((int *)(_mag_n + j + 4), v_dx_mh + v_dy_mh);
                        }
                    }
#endif
                    for ( ; j < width; ++j)
                        _mag_n[j] = std::abs(int(_dx[j])) + std::abs(int(_dy[j]));
                }

                if(cn > 1)
                {
                    std::swap(_dx_n, _dx_a);
                    std::swap(_dy_n, _dy_a);

                    for(int j = 0, jn = 0; j < src.cols; ++j, jn += cn)
                    {
                        int maxIdx = jn;
                        for(int k = 1; k < cn; ++k)
                            if(_mag_n[jn + k] > _mag_n[maxIdx]) maxIdx = jn + k;

                        _mag_n[j] = _mag_n[maxIdx];
                        _dx_n[j] = _dx[maxIdx];
                        _dy_n[j] = _dy[maxIdx];
                    }

                    _mag_n[src.cols] = 0;
                }

                // at the very beginning we do not have a complete ring
                // buffer of 3 magnitude rows for non-maxima suppression
                if (i <= boundaries.start)
                    continue;
            }
            else
            {
                memset(_mag_n - 1, 0, mapstep * sizeof(int));

                if(cn > 1)
                {
                    std::swap(_dx_n, _dx_a);
                    std::swap(_dy_n, _dy_a);
                }
            }

            // From here actual src row is (i - 1)
            // Set left and right border to 1
#if CV_SIMD128
            if(haveSIMD)
                _pmap = map.ptr<uchar>(i) + CV_MALLOC_SIMD128;
            else
#endif
                _pmap = map.ptr<uchar>(i) + 1;

            _pmap[src.cols] =_pmap[-1] = 1;

            if(cn == 1)
            {
                _dx = dx.ptr<short>(i - rowStart - 1);
                _dy = dy.ptr<short>(i - rowStart - 1);
            }
            else
            {
                _dx = _dx_a;
                _dy = _dy_a;
            }

            const int TG22 = 13573;
            int j = 0;
#if CV_SIMD128
            if (haveSIMD)
            {
                const v_int32x4 v_low = v_setall_s32(low);
                const v_int8x16 v_one = v_setall_s8(1);

                for (; j <= src.cols - 32; j += 32)
                {
                    v_int32x4 v_m1 = v_load_aligned((const int*)(_mag_a + j));
                    v_int32x4 v_m2 = v_load_aligned((const int*)(_mag_a + j + 4));
                    v_int32x4 v_m3 = v_load_aligned((const int*)(_mag_a + j + 8));
                    v_int32x4 v_m4 = v_load_aligned((const int*)(_mag_a + j + 12));

                    v_int32x4 v_cmp1 = v_m1 > v_low;
                    v_int32x4 v_cmp2 = v_m2 > v_low;
                    v_int32x4 v_cmp3 = v_m3 > v_low;
                    v_int32x4 v_cmp4 = v_m4 > v_low;

                    v_m1 = v_load_aligned((const int*)(_mag_a + j + 16));
                    v_m2 = v_load_aligned((const int*)(_mag_a + j + 20));
                    v_m3 = v_load_aligned((const int*)(_mag_a + j + 24));
                    v_m4 = v_load_aligned((const int*)(_mag_a + j + 28));

                    v_store_aligned((signed char*)(_pmap + j), v_one);
                    v_store_aligned((signed char*)(_pmap + j + 16), v_one);

                    v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2);
                    v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4);

                    v_cmp1 = v_m1 > v_low;
                    v_cmp2 = v_m2 > v_low;
                    v_cmp3 = v_m3 > v_low;
                    v_cmp4 = v_m4 > v_low;

                    v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81);

                    v_cmp80 = v_pack(v_cmp1, v_cmp2);
                    v_cmp81 = v_pack(v_cmp3, v_cmp4);

                    unsigned int mask = v_signmask(v_cmp);

                    v_cmp = v_pack(v_cmp80, v_cmp81);
                    mask |= v_signmask(v_cmp) << 16;

                    if (mask)
                    {
                        int k = j;

                        do
                        {
                            int l = trailingZeros32(mask);
                            k += l;
                            mask >>= l;

                            int m = _mag_a[k];
                            short xs = _dx[k];
                            short ys = _dy[k];
                            int x = (int)std::abs(xs);
                            int y = (int)std::abs(ys) << 15;

                            int tg22x = x * TG22;

                            if (y < tg22x)
                            {
                                if (m > _mag_a[k - 1] && m >= _mag_a[k + 1])
                                {
                                    CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                }
                            }
                            else
                            {
                                int tg67x = tg22x + (x << 16);
                                if (y > tg67x)
                                {
                                    if (m > _mag_p[k] && m >= _mag_n[k])
                                    {
                                        CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                    }
                                }
                                else
                                {
                                    int s = (xs ^ ys) < 0 ? -1 : 1;
                                    if(m > _mag_p[k - s] && m > _mag_n[k + s])
                                    {
                                        CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                    }
                                }
                            }
                            ++k;
                        } while((mask >>= 1));
                    }
                }

                if (j <= src.cols - 16)
                {
                    v_int32x4 v_m1 = v_load_aligned((const int*)(_mag_a + j));
                    v_int32x4 v_m2 = v_load_aligned((const int*)(_mag_a + j + 4));
                    v_int32x4 v_m3 = v_load_aligned((const int*)(_mag_a + j + 8));
                    v_int32x4 v_m4 = v_load_aligned((const int*)(_mag_a + j + 12));

                    v_store_aligned((signed char*)(_pmap + j), v_one);

                    v_int32x4 v_cmp1 = v_m1 > v_low;
                    v_int32x4 v_cmp2 = v_m2 > v_low;
                    v_int32x4 v_cmp3 = v_m3 > v_low;
                    v_int32x4 v_cmp4 = v_m4 > v_low;

                    v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2);
                    v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4);

                    v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81);
                    unsigned int mask = v_signmask(v_cmp);

                    if (mask)
                    {
                        int k = j;

                        do
                        {
                            int l = trailingZeros32(mask);
                            k += l;
                            mask >>= l;

                            int m = _mag_a[k];
                            short xs = _dx[k];
                            short ys = _dy[k];
                            int x = (int)std::abs(xs);
                            int y = (int)std::abs(ys) << 15;

                            int tg22x = x * TG22;

                            if (y < tg22x)
                            {
                                if (m > _mag_a[k - 1] && m >= _mag_a[k + 1])
                                {
                                    CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                }
                            }
                            else
                            {
                                int tg67x = tg22x + (x << 16);
                                if (y > tg67x)
                                {
                                    if (m > _mag_p[k] && m >= _mag_n[k])
                                    {
                                        CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                    }
                                }
                                else
                                {
                                    int s = (xs ^ ys) < 0 ? -1 : 1;
                                    if(m > _mag_p[k - s] && m > _mag_n[k + s])
                                    {
                                        CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                    }
                                }
                            }
                            ++k;
                        } while((mask >>= 1));
                    }
                    j += 16;
                }
            }
#endif
            for (; j < src.cols; j++)
            {
                int m = _mag_a[j];

                if (m > low)
                {
                    short xs = _dx[j];
                    short ys = _dy[j];
                    int x = (int)std::abs(xs);
                    int y = (int)std::abs(ys) << 15;

                    int tg22x = x * TG22;

                    if (y < tg22x)
                    {
                        if (m > _mag_a[j - 1] && m >= _mag_a[j + 1])
                        {
                            CANNY_CHECK(m, high, (_pmap+j), stack);
                        }
                    }
                    else
                    {
                        int tg67x = tg22x + (x << 16);
                        if (y > tg67x)
                        {
                            if (m > _mag_p[j] && m >= _mag_n[j])
                            {
                                CANNY_CHECK(m, high, (_pmap+j), stack);
                            }
                        }
                        else
                        {
                            int s = (xs ^ ys) < 0 ? -1 : 1;
                            if(m > _mag_p[j - s] && m > _mag_n[j + s])
                            {
                                CANNY_CHECK(m, high, (_pmap+j), stack);
                            }
                        }
                    }
                }
                _pmap[j] = 1;
            }
        }