// Exercises the narrowing "pack" universal intrinsics for one lane type:
//   v_pack / v_rshr_pack             — pack two wide registers into one narrow one,
//                                      plain and with rounding right-shift by s;
//   v_pack_store / v_rshr_pack_store — pack a single wide register and store the
//                                      narrow low half to memory.
// NOTE(review): `s` (the shift amount) is not declared anywhere in this span —
// presumably a `template <int s>` parameter on the enclosing declaration, which is
// outside the visible source; confirm upstream.
TheTest & test_pack()
{
    // Rx2: the register type with lanes twice as wide as LaneType; packing Rx2
    // lanes narrows them (with saturation) back to LaneType.
    typedef typename V_RegTrait128<LaneType>::w_reg Rx2;
    typedef typename Rx2::lane_type w_type;
    Data<Rx2> dataA, dataB;
    // Shift dataA negative for signed lane types so saturation is exercised on
    // both ends of the range; dataB is scaled up to force saturation on overflow.
    dataA += std::numeric_limits<LaneType>::is_signed ? -10 : 10;
    dataB *= 10;
    Rx2 a = dataA, b = dataB;

    Data<R> resC = v_pack(a, b);            // plain saturating pack of a|b
    Data<R> resD = v_rshr_pack<s>(a, b);    // rounding-shift-by-s, then pack

    // The *_store variants write only the low half of the destination; the
    // buffers start zeroed so the untouched high half can be checked below.
    Data<R> resE(0);
    v_pack_store(resE.d, b);

    Data<R> resF(0);
    v_rshr_pack_store<s>(resF.d, b);

    const int n = Rx2::nlanes;
    // Rounding term for the shift variants: result lane = (v + 2^(s-1)) >> s.
    const w_type add = (w_type)1 << (s - 1);
    for (int i = 0; i < n; ++i)
    {
        // v_pack: first operand fills lanes [0, n), second fills [n, 2n).
        EXPECT_EQ(saturate_cast<LaneType>(dataA[i]), resC[i]);
        EXPECT_EQ(saturate_cast<LaneType>(dataB[i]), resC[i + n]);
        // v_rshr_pack: same layout, with round-to-nearest shift applied first.
        EXPECT_EQ(saturate_cast<LaneType>((dataA[i] + add) >> s), resD[i]);
        EXPECT_EQ(saturate_cast<LaneType>((dataB[i] + add) >> s), resD[i + n]);
        // *_store variants: low half holds packed b, high half must stay zero.
        EXPECT_EQ(saturate_cast<LaneType>(dataB[i]), resE[i]);
        EXPECT_EQ((LaneType)0, resE[i + n]);
        EXPECT_EQ(saturate_cast<LaneType>((dataB[i] + add) >> s), resF[i]);
        EXPECT_EQ((LaneType)0, resF[i + n]);
    }
    return *this;
}
// Computes y[i] = natural log of x[i] for n doubles, using a table-driven method:
// the IEEE-754 bits of x are split into exponent e and mantissa m, so that
//   log(x) = e*ln(2) + log(t) + P(m/t - 1)
// where t is the table point nearest the mantissa and P is a degree-8 polynomial
// (coefficients A0..A7) evaluated on the small residual.
// NOTE(review): table entries come in pairs — logTab[idx] is added as the log of
// the table point and logTab[idx + 1] multiplies the residual (presumably 1/t);
// confirm against cv::details::getLogTab64f().
void log64f( const double *x, double *y, int n )
{
    CV_INSTRUMENT_REGION();

    const double* const logTab = cv::details::getLogTab64f();

    // Mask selecting the mantissa bits below the table-index field.
    const int64 LOGTAB_MASK2_64F = ((int64)1 << (52 - LOGTAB_SCALE)) - 1;
    // Polynomial coefficients; A1..A5 are written with full double precision.
    const double
    A7 = 1.0,
    A6 = -0.5,
    A5 = 0.333333333333333314829616256247390992939472198486328125,
    A4 = -0.25,
    A3 = 0.2,
    A2 = -0.1666666666666666574148081281236954964697360992431640625,
    A1 = 0.1428571428571428769682682968777953647077083587646484375,
    A0 = -0.125;

    int i = 0;
#if CV_SIMD_64F
    const int VECSZ = v_float64::nlanes;
    const v_float64 vln2 = vx_setall_f64(ln_2);

    const v_float64
        vA0 = vx_setall_f64(A0), vA1 = vx_setall_f64(A1),
        vA2 = vx_setall_f64(A2), vA3 = vx_setall_f64(A3),
        vA4 = vx_setall_f64(A4), vA5 = vx_setall_f64(A5),
        vA6 = vx_setall_f64(A6), vA7 = vx_setall_f64(A7);

    for( ; i < n; i += VECSZ )
    {
        // Tail handling: re-process an overlapping full vector at n - VECSZ.
        // Not possible in-place (x == y: earlier results would be re-read) or
        // when n < VECSZ; those cases fall through to the scalar loop.
        if( i + VECSZ > n )
        {
            if( i == 0 || x == y )
                break;
            i = n - VECSZ;
        }

        // Reinterpret doubles as int64 bit patterns.
        v_int64 h0 = vx_load((const int64*)x + i);
        // Unbiased exponent: bits 52..62, minus the IEEE-754 bias 1023.
        v_int32 yi0 = v_pack(v_shr<52>(h0), vx_setzero_s64());
        yi0 = (yi0 & vx_setall_s32(0x7ff)) - vx_setall_s32(1023);

        // Rebuild the mantissa with a fixed exponent of 0 (biased 1023),
        // i.e. normalize the input into [1, 2).
        v_int64 xi0 = (h0 & vx_setall_s64(LOGTAB_MASK2_64F)) | vx_setall_s64((int64)1023 << 52);
        // Top mantissa bits select the table entry (pairs, hence the *2 mask).
        h0 = v_shr<52 - LOGTAB_SCALE - 1>(h0);
        v_int32 idx = v_pack(h0, h0) & vx_setall_s32(LOGTAB_MASK*2);

        v_float64 xf0, yf0;
        v_lut_deinterleave(logTab, idx, yf0, xf0);  // yf0 = logTab[idx], xf0 = logTab[idx+1]

        // y = exponent * ln(2) + log of the table point.
        yf0 = v_fma(v_cvt_f64(yi0), vln2, yf0);

        // Last-interval correction: the comparison mask is all-ones (-1) where
        // idx == 510, so delta = -1/512 there and 0 elsewhere — matching the
        // scalar tail's (idx == 510 ? -1./512 : 0.).
        v_float64 delta = v_cvt_f64(idx == vx_setall_s32(510))*vx_setall_f64(1./512);
        // Residual argument for the polynomial: (mantissa - 1) * logTab[idx+1] + delta.
        xf0 = v_fma(v_reinterpret_as_f64(xi0) - vx_setall_f64(1.), xf0, delta);

        // Evaluate the polynomial as two interleaved Horner chains in xq = x^2
        // (even/odd coefficients), then merge; this shortens the FMA dependency chain.
        v_float64 xq = xf0*xf0;
        v_float64 zf0 = v_fma(xq, vA0, vA2);
        v_float64 zf1 = v_fma(xq, vA1, vA3);
        zf0 = v_fma(zf0, xq, vA4);
        zf1 = v_fma(zf1, xq, vA5);
        zf0 = v_fma(zf0, xq, vA6);
        zf1 = v_fma(zf1, xq, vA7);
        zf1 = v_fma(zf1, xf0, yf0);
        zf0 = v_fma(zf0, xq, zf1);

        v_store(y + i, zf0);
    }
#endif

    // Scalar path: same algorithm, one element at a time (also handles the
    // vector loop's leftover tail).
    for( ; i < n; i++ )
    {
        Cv64suf buf;
        int64 i0 = ((const int64*)x)[i];

        // Mantissa with forced biased exponent 1023 => value in [1, 2).
        buf.i = (i0 & LOGTAB_MASK2_64F) | ((int64)1023 << 52);
        int idx = (int)(i0 >> (52 - LOGTAB_SCALE - 1)) & (LOGTAB_MASK*2);

        double y0 = (((int)(i0 >> 52) & 0x7ff) - 1023) * ln_2 + logTab[idx];
        double x0 = (buf.f - 1.)*logTab[idx + 1] + (idx == 510 ? -1./512 : 0.);

        double xq = x0*x0;
        // Same even/odd split as the SIMD path, merged in one expression.
        y[i] = (((A0*xq + A2)*xq + A4)*xq + A6)*xq + (((A1*xq + A3)*xq + A5)*xq + A7)*x0 + y0;
    }
}
// Per-band worker for parallel Canny (stage 1): computes the gradient, the
// per-pixel magnitude, and performs non-maxima suppression for the rows in
// `boundaries`, writing one of three values into `map` for each pixel:
//   0 - might belong to an edge (weak), 1 - cannot belong, 2 - does belong (strong).
// Members used here (src, src2, map, mapstep, cn, low, high, aperture_size,
// L2gradient, needGradient, haveSIMD) belong to the enclosing parallel body,
// which is not visible in this span.
// NOTE(review): this span ends at the close of the row loop — the function body
// continues beyond the visible source (e.g. `borderPeaksLocal` is declared but
// never used here); do not treat this as the complete definition.
void operator()(const Range &boundaries) const
{
    CV_TRACE_FUNCTION();

    Mat dx, dy;
    AutoBuffer<short> dxMax(0), dyMax(0);
    std::deque<uchar*> stack, borderPeaksLocal;
    // Extend the band by one row on each side (clamped to the image) so NMS has
    // its vertical neighbours available at the band borders.
    const int rowStart = max(0, boundaries.start - 1), rowEnd = min(src.rows, boundaries.end + 1);
    int *_mag_p, *_mag_a, *_mag_n;
    short *_dx, *_dy, *_dx_a = NULL, *_dy_a = NULL, *_dx_n = NULL, *_dy_n = NULL;
    uchar *_pmap;
    double scale = 1.0;

    CV_TRACE_REGION("gradient")
    if(needGradient)
    {
        // ksize-7 Sobel produces larger responses; scale down to keep shorts in range.
        if (aperture_size == 7)
        {
            scale = 1 / 16.0;
        }
        Sobel(src.rowRange(rowStart, rowEnd), dx, CV_16S, 1, 0, aperture_size, scale, 0, BORDER_REPLICATE);
        Sobel(src.rowRange(rowStart, rowEnd), dy, CV_16S, 0, 1, aperture_size, scale, 0, BORDER_REPLICATE);
    }
    else
    {
        // Caller supplied precomputed derivatives: src holds dx, src2 holds dy.
        dx = src.rowRange(rowStart, rowEnd);
        dy = src2.rowRange(rowStart, rowEnd);
    }

    CV_TRACE_REGION_NEXT("magnitude");
    if(cn > 1)
    {
        // Multi-channel input: per-row scratch to hold the dx/dy of the channel
        // with the largest magnitude ("actual" and "next" rows).
        dxMax.allocate(2 * dx.cols);
        dyMax.allocate(2 * dy.cols);
        _dx_a = (short*)dxMax;
        _dx_n = _dx_a + dx.cols;
        _dy_a = (short*)dyMax;
        _dy_n = _dy_a + dy.cols;
    }

    // _mag_p: previous row, _mag_a: actual row, _mag_n: next row
    // (three-row ring buffer; each row has one sentinel cell on each side,
    // hence the +1 offset and the [-1]/[src.cols] writes below).
#if CV_SIMD128
    AutoBuffer<int> buffer(3 * (mapstep * cn + CV_MALLOC_SIMD128));
    _mag_p = alignPtr((int*)buffer + 1, CV_MALLOC_SIMD128);
    _mag_a = alignPtr(_mag_p + mapstep * cn, CV_MALLOC_SIMD128);
    _mag_n = alignPtr(_mag_a + mapstep * cn, CV_MALLOC_SIMD128);
#else
    AutoBuffer<int> buffer(3 * (mapstep * cn));
    _mag_p = (int*)buffer + 1;
    _mag_a = _mag_p + mapstep * cn;
    _mag_n = _mag_a + mapstep * cn;
#endif

    // For the first time when just 2 rows are filled and for left and right borders
    if(rowStart == boundaries.start)
        memset(_mag_n - 1, 0, mapstep * sizeof(int));
    else
        _mag_n[src.cols] = _mag_n[-1] = 0;

    _mag_a[src.cols] = _mag_a[-1] = _mag_p[src.cols] = _mag_p[-1] = 0;

    // calculate magnitude and angle of gradient, perform non-maxima suppression.
    // fill the map with one of the following values:
    //   0 - the pixel might belong to an edge
    //   1 - the pixel can not belong to an edge
    //   2 - the pixel does belong to an edge
    for (int i = rowStart; i <= boundaries.end; ++i)
    {
        // Scroll the ring buffer: old next -> actual, old actual -> previous;
        // _mag_n now points at the slot to be overwritten with the new row.
        std::swap(_mag_n, _mag_a);
        std::swap(_mag_n, _mag_p);

        if(i < rowEnd)
        {
            // Next row calculation
            _dx = dx.ptr<short>(i - rowStart);
            _dy = dy.ptr<short>(i - rowStart);

            if (L2gradient)
            {
                // L2 magnitude (kept squared: dx^2 + dy^2, no sqrt — thresholds
                // are compared in the same squared domain).
                int j = 0, width = src.cols * cn;
#if CV_SIMD128
                if (haveSIMD)
                {
                    // 8 shorts at a time: widen to int32 and square-accumulate.
                    for ( ; j <= width - 8; j += 8)
                    {
                        v_int16x8 v_dx = v_load((const short*)(_dx + j));
                        v_int16x8 v_dy = v_load((const short*)(_dy + j));

                        v_int32x4 v_dxp_low, v_dxp_high;
                        v_int32x4 v_dyp_low, v_dyp_high;
                        v_expand(v_dx, v_dxp_low, v_dxp_high);
                        v_expand(v_dy, v_dyp_low, v_dyp_high);

                        v_store_aligned((int *)(_mag_n + j), v_dxp_low*v_dxp_low+v_dyp_low*v_dyp_low);
                        v_store_aligned((int *)(_mag_n + j + 4), v_dxp_high*v_dxp_high+v_dyp_high*v_dyp_high);
                    }
                }
#endif
                // Scalar tail (and full fallback without SIMD).
                for ( ; j < width; ++j)
                    _mag_n[j] = int(_dx[j])*_dx[j] + int(_dy[j])*_dy[j];
            }
            else
            {
                // L1 magnitude: |dx| + |dy|.
                int j = 0, width = src.cols * cn;
#if CV_SIMD128
                if (haveSIMD)
                {
                    for(; j <= width - 8; j += 8)
                    {
                        v_int16x8 v_dx = v_load((const short *)(_dx + j));
                        v_int16x8 v_dy = v_load((const short *)(_dy + j));

                        v_dx = v_reinterpret_as_s16(v_abs(v_dx));
                        v_dy = v_reinterpret_as_s16(v_abs(v_dy));

                        v_int32x4 v_dx_ml, v_dy_ml, v_dx_mh, v_dy_mh;
                        v_expand(v_dx, v_dx_ml, v_dx_mh);
                        v_expand(v_dy, v_dy_ml, v_dy_mh);

                        v_store_aligned((int *)(_mag_n + j), v_dx_ml + v_dy_ml);
                        v_store_aligned((int *)(_mag_n + j + 4), v_dx_mh + v_dy_mh);
                    }
                }
#endif
                for ( ; j < width; ++j)
                    _mag_n[j] = std::abs(int(_dx[j])) + std::abs(int(_dy[j]));
            }

            if(cn > 1)
            {
                // Reduce multi-channel data to one value per pixel: keep the
                // channel with the largest magnitude, plus its dx/dy for NMS.
                std::swap(_dx_n, _dx_a);
                std::swap(_dy_n, _dy_a);

                for(int j = 0, jn = 0; j < src.cols; ++j, jn += cn)
                {
                    int maxIdx = jn;
                    for(int k = 1; k < cn; ++k)
                        if(_mag_n[jn + k] > _mag_n[maxIdx]) maxIdx = jn + k;

                    _mag_n[j] = _mag_n[maxIdx];
                    _dx_n[j] = _dx[maxIdx];
                    _dy_n[j] = _dy[maxIdx];
                }
                _mag_n[src.cols] = 0;
            }

            // at the very beginning we do not have a complete ring
            // buffer of 3 magnitude rows for non-maxima suppression
            if (i <= boundaries.start)
                continue;
        }
        else
        {
            // Past the last gradient row: feed a zero row so the final real row
            // still gets suppressed against a defined "next" neighbour.
            memset(_mag_n - 1, 0, mapstep * sizeof(int));

            if(cn > 1)
            {
                std::swap(_dx_n, _dx_a);
                std::swap(_dy_n, _dy_a);
            }
        }

        // From here actual src row is (i - 1)
        // Set left and right border to 1
#if CV_SIMD128
        if(haveSIMD)
            _pmap = map.ptr<uchar>(i) + CV_MALLOC_SIMD128;
        else
#endif
            _pmap = map.ptr<uchar>(i) + 1;

        _pmap[src.cols] =_pmap[-1] = 1;

        if(cn == 1)
        {
            _dx = dx.ptr<short>(i - rowStart - 1);
            _dy = dy.ptr<short>(i - rowStart - 1);
        }
        else
        {
            // Multi-channel: use the max-channel dx/dy saved for the actual row.
            _dx = _dx_a;
            _dy = _dy_a;
        }

        // TG22 ~= tan(22.5 deg) * 2^15; |dy| is pre-shifted by 15 below so the
        // sector tests stay in integer arithmetic.
        const int TG22 = 13573;
        int j = 0;
#if CV_SIMD128
        if (haveSIMD)
        {
            const v_int32x4 v_low = v_setall_s32(low);
            const v_int8x16 v_one = v_setall_s8(1);

            // Process 32 pixels per iteration: compare magnitudes against the
            // low threshold, pack the comparison results down to one byte per
            // pixel, and use the sign mask to visit only above-threshold pixels.
            for (; j <= src.cols - 32; j += 32)
            {
                v_int32x4 v_m1 = v_load_aligned((const int*)(_mag_a + j));
                v_int32x4 v_m2 = v_load_aligned((const int*)(_mag_a + j + 4));
                v_int32x4 v_m3 = v_load_aligned((const int*)(_mag_a + j + 8));
                v_int32x4 v_m4 = v_load_aligned((const int*)(_mag_a + j + 12));

                v_int32x4 v_cmp1 = v_m1 > v_low;
                v_int32x4 v_cmp2 = v_m2 > v_low;
                v_int32x4 v_cmp3 = v_m3 > v_low;
                v_int32x4 v_cmp4 = v_m4 > v_low;

                v_m1 = v_load_aligned((const int*)(_mag_a + j + 16));
                v_m2 = v_load_aligned((const int*)(_mag_a + j + 20));
                v_m3 = v_load_aligned((const int*)(_mag_a + j + 24));
                v_m4 = v_load_aligned((const int*)(_mag_a + j + 28));

                // Default all 32 map bytes to 1 ("not an edge"); the per-pixel
                // checks below overwrite the survivors.
                v_store_aligned((signed char*)(_pmap + j), v_one);
                v_store_aligned((signed char*)(_pmap + j + 16), v_one);

                v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2);
                v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4);

                v_cmp1 = v_m1 > v_low;
                v_cmp2 = v_m2 > v_low;
                v_cmp3 = v_m3 > v_low;
                v_cmp4 = v_m4 > v_low;

                v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81);

                v_cmp80 = v_pack(v_cmp1, v_cmp2);
                v_cmp81 = v_pack(v_cmp3, v_cmp4);

                // One bit per pixel: bit set <=> magnitude > low.
                unsigned int mask = v_signmask(v_cmp);

                v_cmp = v_pack(v_cmp80, v_cmp81);
                mask |= v_signmask(v_cmp) << 16;

                if (mask)
                {
                    int k = j;

                    do
                    {
                        // Skip directly to the next set bit (candidate pixel).
                        int l = trailingZeros32(mask);
                        k += l;
                        mask >>= l;

                        int m = _mag_a[k];
                        short xs = _dx[k];
                        short ys = _dy[k];
                        int x = (int)std::abs(xs);
                        int y = (int)std::abs(ys) << 15;

                        int tg22x = x * TG22;

                        // Gradient direction sector selects the NMS neighbours:
                        // < 22.5 deg: horizontal neighbours.
                        if (y < tg22x)
                        {
                            if (m > _mag_a[k - 1] && m >= _mag_a[k + 1])
                            {
                                // CANNY_CHECK_SIMD: project macro — classifies the
                                // pixel as weak/strong vs `high` and records strong
                                // ones on `stack`; see its definition.
                                CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                            }
                        }
                        else
                        {
                            // tan(67.5) * 2^15 = tan(22.5) * 2^15 + 2 * x * 2^15.
                            int tg67x = tg22x + (x << 16);
                            // > 67.5 deg: vertical neighbours.
                            if (y > tg67x)
                            {
                                if (m > _mag_p[k] && m >= _mag_n[k])
                                {
                                    CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                }
                            }
                            else
                            {
                                // Diagonal: sign of dx*dy picks which diagonal.
                                int s = (xs ^ ys) < 0 ? -1 : 1;
                                if(m > _mag_p[k - s] && m > _mag_n[k + s])
                                {
                                    CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                }
                            }
                        }
                        ++k;
                    } while((mask >>= 1));
                }
            }

            // One extra 16-pixel chunk if at least 16 columns remain.
            if (j <= src.cols - 16)
            {
                v_int32x4 v_m1 = v_load_aligned((const int*)(_mag_a + j));
                v_int32x4 v_m2 = v_load_aligned((const int*)(_mag_a + j + 4));
                v_int32x4 v_m3 = v_load_aligned((const int*)(_mag_a + j + 8));
                v_int32x4 v_m4 = v_load_aligned((const int*)(_mag_a + j + 12));

                v_store_aligned((signed char*)(_pmap + j), v_one);

                v_int32x4 v_cmp1 = v_m1 > v_low;
                v_int32x4 v_cmp2 = v_m2 > v_low;
                v_int32x4 v_cmp3 = v_m3 > v_low;
                v_int32x4 v_cmp4 = v_m4 > v_low;

                v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2);
                v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4);

                v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81);
                unsigned int mask = v_signmask(v_cmp);

                if (mask)
                {
                    int k = j;

                    do
                    {
                        int l = trailingZeros32(mask);
                        k += l;
                        mask >>= l;

                        int m = _mag_a[k];
                        short xs = _dx[k];
                        short ys = _dy[k];
                        int x = (int)std::abs(xs);
                        int y = (int)std::abs(ys) << 15;

                        int tg22x = x * TG22;

                        if (y < tg22x)
                        {
                            if (m > _mag_a[k - 1] && m >= _mag_a[k + 1])
                            {
                                CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                            }
                        }
                        else
                        {
                            int tg67x = tg22x + (x << 16);
                            if (y > tg67x)
                            {
                                if (m > _mag_p[k] && m >= _mag_n[k])
                                {
                                    CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                }
                            }
                            else
                            {
                                int s = (xs ^ ys) < 0 ? -1 : 1;
                                if(m > _mag_p[k - s] && m > _mag_n[k + s])
                                {
                                    CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                }
                            }
                        }
                        ++k;
                    } while((mask >>= 1));
                }
                j += 16;
            }
        }
#endif
        // Scalar pass over the remaining columns (or the whole row without SIMD);
        // same sector logic as above, one pixel at a time.
        for (; j < src.cols; j++)
        {
            int m = _mag_a[j];

            if (m > low)
            {
                short xs = _dx[j];
                short ys = _dy[j];
                int x = (int)std::abs(xs);
                int y = (int)std::abs(ys) << 15;

                int tg22x = x * TG22;

                if (y < tg22x)
                {
                    if (m > _mag_a[j - 1] && m >= _mag_a[j + 1])
                    {
                        CANNY_CHECK(m, high, (_pmap+j), stack);
                    }
                }
                else
                {
                    int tg67x = tg22x + (x << 16);

                    if (y > tg67x)
                    {
                        if (m > _mag_p[j] && m >= _mag_n[j])
                        {
                            CANNY_CHECK(m, high, (_pmap+j), stack);
                        }
                    }
                    else
                    {
                        int s = (xs ^ ys) < 0 ? -1 : 1;

                        if(m > _mag_p[j - s] && m > _mag_n[j + s])
                        {
                            CANNY_CHECK(m, high, (_pmap+j), stack);
                        }
                    }
                }
            }
            // Suppressed / below-threshold pixels are marked "can not be an edge".
            // NOTE(review): CANNY_CHECK presumably jumps past this store for
            // accepted pixels (the SIMD path pre-fills with 1 instead) — confirm
            // against the macro definition.
            _pmap[j] = 1;
        }
    }
}