TheTest & test_mask()
{
    // Lane patterns: srcA has exactly lane 1 negated, srcB stays non-negative,
    // srcC is fully negated; srcD/srcE hold the constants 1 and 2 for v_select.
    Data<R> srcA, srcB, srcC, srcD(1), srcE(2);
    srcA[1] *= (LaneType)-1;
    srcC *= (LaneType)-1;
    R vecA = srcA, vecB = srcB, vecC = srcC, vecD = srcD, vecE = srcE;

    // Only lane 1 of vecA is negative, so the sign mask must be exactly 0b10.
    int m = v_signmask(vecA);
    EXPECT_EQ(2, m);

    // all-lanes / any-lane negativity predicates
    EXPECT_EQ(false, v_check_all(vecA));
    EXPECT_EQ(false, v_check_all(vecB));
    EXPECT_EQ(true, v_check_all(vecC));

    EXPECT_EQ(true, v_check_any(vecA));
    EXPECT_EQ(false, v_check_any(vecB));
    EXPECT_EQ(true, v_check_any(vecC));

    typedef V_TypeTraits<LaneType> Traits;
    typedef typename Traits::int_type int_type;

    // v_select must act as a bitwise mux per lane: (mask & d) | (~mask & e).
    R selected = v_select(vecB, vecD, vecE);
    Data<R> selData = selected;
    for (int lane = 0; lane < R::nlanes; ++lane)
    {
        int_type maskBits = Traits::reinterpret_int(srcB[lane]);
        int_type expected = (Traits::reinterpret_int(srcD[lane]) & maskBits)
                          | (Traits::reinterpret_int(srcE[lane]) & ~maskBits);
        EXPECT_EQ(expected, Traits::reinterpret_int(selData[lane]));
    }

    return *this;
}
inline int v_signmask(const v_float32x4& a) { return v_signmask(v_reinterpret_as_u32(a)); }
void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bool nonmax_suppression) { Mat img = _img.getMat(); const int K = patternSize/2, N = patternSize + K + 1; int i, j, k, pixel[25]; makeOffsets(pixel, (int)img.step, patternSize); #if CV_SIMD128 const int quarterPatternSize = patternSize/4; v_uint8x16 delta = v_setall_u8(0x80), t = v_setall_u8((char)threshold), K16 = v_setall_u8((char)K); bool hasSimd = hasSIMD128(); #if CV_TRY_AVX2 Ptr<opt_AVX2::FAST_t_patternSize16_AVX2> fast_t_impl_avx2; if(CV_CPU_HAS_SUPPORT_AVX2) fast_t_impl_avx2 = opt_AVX2::FAST_t_patternSize16_AVX2::getImpl(img.cols, threshold, nonmax_suppression, pixel); #endif #endif keypoints.clear(); threshold = std::min(std::max(threshold, 0), 255); uchar threshold_tab[512]; for( i = -255; i <= 255; i++ ) threshold_tab[i+255] = (uchar)(i < -threshold ? 1 : i > threshold ? 2 : 0); AutoBuffer<uchar> _buf((img.cols+16)*3*(sizeof(int) + sizeof(uchar)) + 128); uchar* buf[3]; buf[0] = _buf.data(); buf[1] = buf[0] + img.cols; buf[2] = buf[1] + img.cols; int* cpbuf[3]; cpbuf[0] = (int*)alignPtr(buf[2] + img.cols, sizeof(int)) + 1; cpbuf[1] = cpbuf[0] + img.cols + 1; cpbuf[2] = cpbuf[1] + img.cols + 1; memset(buf[0], 0, img.cols*3); for(i = 3; i < img.rows-2; i++) { const uchar* ptr = img.ptr<uchar>(i) + 3; uchar* curr = buf[(i - 3)%3]; int* cornerpos = cpbuf[(i - 3)%3]; memset(curr, 0, img.cols); int ncorners = 0; if( i < img.rows - 3 ) { j = 3; #if CV_SIMD128 if( hasSimd ) { if( patternSize == 16 ) { #if CV_TRY_AVX2 if (fast_t_impl_avx2) fast_t_impl_avx2->process(j, ptr, curr, cornerpos, ncorners); #endif //vz if (j <= (img.cols - 27)) //it doesn't make sense using vectors for less than 8 elements { for (; j < img.cols - 16 - 3; j += 16, ptr += 16) { v_uint8x16 v = v_load(ptr); v_int8x16 v0 = v_reinterpret_as_s8((v + t) ^ delta); v_int8x16 v1 = v_reinterpret_as_s8((v - t) ^ delta); v_int8x16 x0 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[0]), delta)); v_int8x16 x1 = 
v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[quarterPatternSize]), delta)); v_int8x16 x2 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[2*quarterPatternSize]), delta)); v_int8x16 x3 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[3*quarterPatternSize]), delta)); v_int8x16 m0, m1; m0 = (v0 < x0) & (v0 < x1); m1 = (x0 < v1) & (x1 < v1); m0 = m0 | ((v0 < x1) & (v0 < x2)); m1 = m1 | ((x1 < v1) & (x2 < v1)); m0 = m0 | ((v0 < x2) & (v0 < x3)); m1 = m1 | ((x2 < v1) & (x3 < v1)); m0 = m0 | ((v0 < x3) & (v0 < x0)); m1 = m1 | ((x3 < v1) & (x0 < v1)); m0 = m0 | m1; int mask = v_signmask(m0); if( mask == 0 ) continue; if( (mask & 255) == 0 ) { j -= 8; ptr -= 8; continue; } v_int8x16 c0 = v_setzero_s8(); v_int8x16 c1 = v_setzero_s8(); v_uint8x16 max0 = v_setzero_u8(); v_uint8x16 max1 = v_setzero_u8(); for( k = 0; k < N; k++ ) { v_int8x16 x = v_reinterpret_as_s8(v_load((ptr + pixel[k])) ^ delta); m0 = v0 < x; m1 = x < v1; c0 = v_sub_wrap(c0, m0) & m0; c1 = v_sub_wrap(c1, m1) & m1; max0 = v_max(max0, v_reinterpret_as_u8(c0)); max1 = v_max(max1, v_reinterpret_as_u8(c1)); } max0 = v_max(max0, max1); int m = v_signmask(K16 < max0); for( k = 0; m > 0 && k < 16; k++, m >>= 1 ) { if(m & 1) { cornerpos[ncorners++] = j+k; if(nonmax_suppression) curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold); } } } } } } #endif for( ; j < img.cols - 3; j++, ptr++ ) { int v = ptr[0]; const uchar* tab = &threshold_tab[0] - v + 255; int d = tab[ptr[pixel[0]]] | tab[ptr[pixel[8]]]; if( d == 0 ) continue; d &= tab[ptr[pixel[2]]] | tab[ptr[pixel[10]]]; d &= tab[ptr[pixel[4]]] | tab[ptr[pixel[12]]]; d &= tab[ptr[pixel[6]]] | tab[ptr[pixel[14]]]; if( d == 0 ) continue; d &= tab[ptr[pixel[1]]] | tab[ptr[pixel[9]]]; d &= tab[ptr[pixel[3]]] | tab[ptr[pixel[11]]]; d &= tab[ptr[pixel[5]]] | tab[ptr[pixel[13]]]; d &= tab[ptr[pixel[7]]] | tab[ptr[pixel[15]]]; if( d & 1 ) { int vt = v - threshold, count = 0; for( k = 0; k < N; k++ ) { int x = ptr[pixel[k]]; if(x < vt) { if( ++count > K 
) { cornerpos[ncorners++] = j; if(nonmax_suppression) curr[j] = (uchar)cornerScore<patternSize>(ptr, pixel, threshold); break; } } else count = 0; } } if( d & 2 ) { int vt = v + threshold, count = 0; for( k = 0; k < N; k++ ) { int x = ptr[pixel[k]]; if(x > vt) { if( ++count > K ) { cornerpos[ncorners++] = j; if(nonmax_suppression) curr[j] = (uchar)cornerScore<patternSize>(ptr, pixel, threshold); break; } } else count = 0; } } } } cornerpos[-1] = ncorners; if( i == 3 ) continue; const uchar* prev = buf[(i - 4 + 3)%3]; const uchar* pprev = buf[(i - 5 + 3)%3]; cornerpos = cpbuf[(i - 4 + 3)%3]; ncorners = cornerpos[-1]; for( k = 0; k < ncorners; k++ ) { j = cornerpos[k]; int score = prev[j]; if( !nonmax_suppression || (score > prev[j+1] && score > prev[j-1] && score > pprev[j-1] && score > pprev[j] && score > pprev[j+1] && score > curr[j-1] && score > curr[j] && score > curr[j+1]) ) { keypoints.push_back(KeyPoint((float)j, (float)(i-1), 7.f, -1, (float)score)); } } }
inline int v_signmask(const v_int16x8& a) { return v_signmask(v_reinterpret_as_u16(a)); }
inline int v_signmask(const v_int8x16& a) { return v_signmask(v_reinterpret_as_u8(a)); }
inline int v_signmask(const v_float64x2& a) { return v_signmask(v_reinterpret_as_s64(a)); }
inline int v_signmask(const v_uint32x4& a) { return v_signmask(v_reinterpret_as_s32(a)); }
void operator()(const Range &boundaries) const { CV_TRACE_FUNCTION(); Mat dx, dy; AutoBuffer<short> dxMax(0), dyMax(0); std::deque<uchar*> stack, borderPeaksLocal; const int rowStart = max(0, boundaries.start - 1), rowEnd = min(src.rows, boundaries.end + 1); int *_mag_p, *_mag_a, *_mag_n; short *_dx, *_dy, *_dx_a = NULL, *_dy_a = NULL, *_dx_n = NULL, *_dy_n = NULL; uchar *_pmap; double scale = 1.0; CV_TRACE_REGION("gradient") if(needGradient) { if (aperture_size == 7) { scale = 1 / 16.0; } Sobel(src.rowRange(rowStart, rowEnd), dx, CV_16S, 1, 0, aperture_size, scale, 0, BORDER_REPLICATE); Sobel(src.rowRange(rowStart, rowEnd), dy, CV_16S, 0, 1, aperture_size, scale, 0, BORDER_REPLICATE); } else { dx = src.rowRange(rowStart, rowEnd); dy = src2.rowRange(rowStart, rowEnd); } CV_TRACE_REGION_NEXT("magnitude"); if(cn > 1) { dxMax.allocate(2 * dx.cols); dyMax.allocate(2 * dy.cols); _dx_a = (short*)dxMax; _dx_n = _dx_a + dx.cols; _dy_a = (short*)dyMax; _dy_n = _dy_a + dy.cols; } // _mag_p: previous row, _mag_a: actual row, _mag_n: next row #if CV_SIMD128 AutoBuffer<int> buffer(3 * (mapstep * cn + CV_MALLOC_SIMD128)); _mag_p = alignPtr((int*)buffer + 1, CV_MALLOC_SIMD128); _mag_a = alignPtr(_mag_p + mapstep * cn, CV_MALLOC_SIMD128); _mag_n = alignPtr(_mag_a + mapstep * cn, CV_MALLOC_SIMD128); #else AutoBuffer<int> buffer(3 * (mapstep * cn)); _mag_p = (int*)buffer + 1; _mag_a = _mag_p + mapstep * cn; _mag_n = _mag_a + mapstep * cn; #endif // For the first time when just 2 rows are filled and for left and right borders if(rowStart == boundaries.start) memset(_mag_n - 1, 0, mapstep * sizeof(int)); else _mag_n[src.cols] = _mag_n[-1] = 0; _mag_a[src.cols] = _mag_a[-1] = _mag_p[src.cols] = _mag_p[-1] = 0; // calculate magnitude and angle of gradient, perform non-maxima suppression. 
// fill the map with one of the following values: // 0 - the pixel might belong to an edge // 1 - the pixel can not belong to an edge // 2 - the pixel does belong to an edge for (int i = rowStart; i <= boundaries.end; ++i) { // Scroll the ring buffer std::swap(_mag_n, _mag_a); std::swap(_mag_n, _mag_p); if(i < rowEnd) { // Next row calculation _dx = dx.ptr<short>(i - rowStart); _dy = dy.ptr<short>(i - rowStart); if (L2gradient) { int j = 0, width = src.cols * cn; #if CV_SIMD128 if (haveSIMD) { for ( ; j <= width - 8; j += 8) { v_int16x8 v_dx = v_load((const short*)(_dx + j)); v_int16x8 v_dy = v_load((const short*)(_dy + j)); v_int32x4 v_dxp_low, v_dxp_high; v_int32x4 v_dyp_low, v_dyp_high; v_expand(v_dx, v_dxp_low, v_dxp_high); v_expand(v_dy, v_dyp_low, v_dyp_high); v_store_aligned((int *)(_mag_n + j), v_dxp_low*v_dxp_low+v_dyp_low*v_dyp_low); v_store_aligned((int *)(_mag_n + j + 4), v_dxp_high*v_dxp_high+v_dyp_high*v_dyp_high); } } #endif for ( ; j < width; ++j) _mag_n[j] = int(_dx[j])*_dx[j] + int(_dy[j])*_dy[j]; } else { int j = 0, width = src.cols * cn; #if CV_SIMD128 if (haveSIMD) { for(; j <= width - 8; j += 8) { v_int16x8 v_dx = v_load((const short *)(_dx + j)); v_int16x8 v_dy = v_load((const short *)(_dy + j)); v_dx = v_reinterpret_as_s16(v_abs(v_dx)); v_dy = v_reinterpret_as_s16(v_abs(v_dy)); v_int32x4 v_dx_ml, v_dy_ml, v_dx_mh, v_dy_mh; v_expand(v_dx, v_dx_ml, v_dx_mh); v_expand(v_dy, v_dy_ml, v_dy_mh); v_store_aligned((int *)(_mag_n + j), v_dx_ml + v_dy_ml); v_store_aligned((int *)(_mag_n + j + 4), v_dx_mh + v_dy_mh); } } #endif for ( ; j < width; ++j) _mag_n[j] = std::abs(int(_dx[j])) + std::abs(int(_dy[j])); } if(cn > 1) { std::swap(_dx_n, _dx_a); std::swap(_dy_n, _dy_a); for(int j = 0, jn = 0; j < src.cols; ++j, jn += cn) { int maxIdx = jn; for(int k = 1; k < cn; ++k) if(_mag_n[jn + k] > _mag_n[maxIdx]) maxIdx = jn + k; _mag_n[j] = _mag_n[maxIdx]; _dx_n[j] = _dx[maxIdx]; _dy_n[j] = _dy[maxIdx]; } _mag_n[src.cols] = 0; } // at the very beginning we do 
not have a complete ring // buffer of 3 magnitude rows for non-maxima suppression if (i <= boundaries.start) continue; } else { memset(_mag_n - 1, 0, mapstep * sizeof(int)); if(cn > 1) { std::swap(_dx_n, _dx_a); std::swap(_dy_n, _dy_a); } } // From here actual src row is (i - 1) // Set left and right border to 1 #if CV_SIMD128 if(haveSIMD) _pmap = map.ptr<uchar>(i) + CV_MALLOC_SIMD128; else #endif _pmap = map.ptr<uchar>(i) + 1; _pmap[src.cols] =_pmap[-1] = 1; if(cn == 1) { _dx = dx.ptr<short>(i - rowStart - 1); _dy = dy.ptr<short>(i - rowStart - 1); } else { _dx = _dx_a; _dy = _dy_a; } const int TG22 = 13573; int j = 0; #if CV_SIMD128 if (haveSIMD) { const v_int32x4 v_low = v_setall_s32(low); const v_int8x16 v_one = v_setall_s8(1); for (; j <= src.cols - 32; j += 32) { v_int32x4 v_m1 = v_load_aligned((const int*)(_mag_a + j)); v_int32x4 v_m2 = v_load_aligned((const int*)(_mag_a + j + 4)); v_int32x4 v_m3 = v_load_aligned((const int*)(_mag_a + j + 8)); v_int32x4 v_m4 = v_load_aligned((const int*)(_mag_a + j + 12)); v_int32x4 v_cmp1 = v_m1 > v_low; v_int32x4 v_cmp2 = v_m2 > v_low; v_int32x4 v_cmp3 = v_m3 > v_low; v_int32x4 v_cmp4 = v_m4 > v_low; v_m1 = v_load_aligned((const int*)(_mag_a + j + 16)); v_m2 = v_load_aligned((const int*)(_mag_a + j + 20)); v_m3 = v_load_aligned((const int*)(_mag_a + j + 24)); v_m4 = v_load_aligned((const int*)(_mag_a + j + 28)); v_store_aligned((signed char*)(_pmap + j), v_one); v_store_aligned((signed char*)(_pmap + j + 16), v_one); v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2); v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4); v_cmp1 = v_m1 > v_low; v_cmp2 = v_m2 > v_low; v_cmp3 = v_m3 > v_low; v_cmp4 = v_m4 > v_low; v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81); v_cmp80 = v_pack(v_cmp1, v_cmp2); v_cmp81 = v_pack(v_cmp3, v_cmp4); unsigned int mask = v_signmask(v_cmp); v_cmp = v_pack(v_cmp80, v_cmp81); mask |= v_signmask(v_cmp) << 16; if (mask) { int k = j; do { int l = trailingZeros32(mask); k += l; mask >>= l; int m = _mag_a[k]; short xs = _dx[k]; 
short ys = _dy[k]; int x = (int)std::abs(xs); int y = (int)std::abs(ys) << 15; int tg22x = x * TG22; if (y < tg22x) { if (m > _mag_a[k - 1] && m >= _mag_a[k + 1]) { CANNY_CHECK_SIMD(m, high, (_pmap+k), stack); } } else { int tg67x = tg22x + (x << 16); if (y > tg67x) { if (m > _mag_p[k] && m >= _mag_n[k]) { CANNY_CHECK_SIMD(m, high, (_pmap+k), stack); } } else { int s = (xs ^ ys) < 0 ? -1 : 1; if(m > _mag_p[k - s] && m > _mag_n[k + s]) { CANNY_CHECK_SIMD(m, high, (_pmap+k), stack); } } } ++k; } while((mask >>= 1)); } } if (j <= src.cols - 16) { v_int32x4 v_m1 = v_load_aligned((const int*)(_mag_a + j)); v_int32x4 v_m2 = v_load_aligned((const int*)(_mag_a + j + 4)); v_int32x4 v_m3 = v_load_aligned((const int*)(_mag_a + j + 8)); v_int32x4 v_m4 = v_load_aligned((const int*)(_mag_a + j + 12)); v_store_aligned((signed char*)(_pmap + j), v_one); v_int32x4 v_cmp1 = v_m1 > v_low; v_int32x4 v_cmp2 = v_m2 > v_low; v_int32x4 v_cmp3 = v_m3 > v_low; v_int32x4 v_cmp4 = v_m4 > v_low; v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2); v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4); v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81); unsigned int mask = v_signmask(v_cmp); if (mask) { int k = j; do { int l = trailingZeros32(mask); k += l; mask >>= l; int m = _mag_a[k]; short xs = _dx[k]; short ys = _dy[k]; int x = (int)std::abs(xs); int y = (int)std::abs(ys) << 15; int tg22x = x * TG22; if (y < tg22x) { if (m > _mag_a[k - 1] && m >= _mag_a[k + 1]) { CANNY_CHECK_SIMD(m, high, (_pmap+k), stack); } } else { int tg67x = tg22x + (x << 16); if (y > tg67x) { if (m > _mag_p[k] && m >= _mag_n[k]) { CANNY_CHECK_SIMD(m, high, (_pmap+k), stack); } } else { int s = (xs ^ ys) < 0 ? 
-1 : 1; if(m > _mag_p[k - s] && m > _mag_n[k + s]) { CANNY_CHECK_SIMD(m, high, (_pmap+k), stack); } } } ++k; } while((mask >>= 1)); } j += 16; } } #endif for (; j < src.cols; j++) { int m = _mag_a[j]; if (m > low) { short xs = _dx[j]; short ys = _dy[j]; int x = (int)std::abs(xs); int y = (int)std::abs(ys) << 15; int tg22x = x * TG22; if (y < tg22x) { if (m > _mag_a[j - 1] && m >= _mag_a[j + 1]) { CANNY_CHECK(m, high, (_pmap+j), stack); } } else { int tg67x = tg22x + (x << 16); if (y > tg67x) { if (m > _mag_p[j] && m >= _mag_n[j]) { CANNY_CHECK(m, high, (_pmap+j), stack); } } else { int s = (xs ^ ys) < 0 ? -1 : 1; if(m > _mag_p[j - s] && m > _mag_n[j + s]) { CANNY_CHECK(m, high, (_pmap+j), stack); } } } } _pmap[j] = 1; } }