// Vectorized sum of signed 8-bit data: accumulate blocks in 16-bit lanes,
// then widen into the 32-bit accumulator between blocks.
int operator () (const schar * src0, const uchar * mask, int * dst, int len, int cn) const
{
    if (mask || (cn != 1 && cn != 2 && cn != 4))
        return 0;
    len *= cn;

    int x = 0;
    v_int32 v_sum = vx_setzero_s32();

    int len0 = len & -v_int8::nlanes;
    while (x < len0)
    {
        // Cap the block so the 16-bit partial sums cannot overflow (see note below)
        const int len_tmp = min(x + 256*v_int16::nlanes, len0);
        v_int16 v_sum16 = vx_setzero_s16();
        for (; x < len_tmp; x += v_int8::nlanes)
        {
            v_int16 v_src0, v_src1;
            v_expand(vx_load(src0 + x), v_src0, v_src1);
            v_sum16 += v_src0 + v_src1;
        }
        v_int32 v_half0, v_half1;
        v_expand(v_sum16, v_half0, v_half1);
        v_sum += v_half0 + v_half1;
    }
    // Tail: one 16-lane step, then one quarter-width step
    if (x <= len - v_int16::nlanes)
    {
        v_int32 v_half0, v_half1;
        v_expand(vx_load_expand(src0 + x), v_half0, v_half1);
        v_sum += v_half0 + v_half1;
        x += v_int16::nlanes;
    }
    if (x <= len - v_int32::nlanes)
    {
        v_sum += vx_load_expand_q(src0 + x);
        x += v_int32::nlanes;
    }

    if (cn == 1)
        *dst += v_reduce_sum(v_sum);
    else
    {
        // cn is 2 or 4 and divides v_int32::nlanes, so lane i belongs to channel i % cn
        int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_int32::nlanes];
        v_store_aligned(ar, v_sum);
        for (int i = 0; i < v_int32::nlanes; ++i)
            dst[i % cn] += ar[i];
    }
    v_cleanup();
    return x / cn;
}
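A note on the 256*v_int16::nlanes block cap above: the inner loop advances by v_int8::nlanes = 2*v_int16::nlanes per pass, so one block runs at most 128 iterations, and each iteration adds two widened schar values to every 16-bit lane. The extreme lane totals are therefore 128 * (127 + 127) = 32512 and 128 * (-128 - 128) = -32768, both representable in int16; the block sum is then expanded into the 32-bit accumulator before the next block begins.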
// Same scheme for signed 16-bit data: v_expand widens pairs of 16-bit lanes
// straight into the 32-bit accumulator, so no block cap is required here.
int operator () (const short * src0, const uchar * mask, int * dst, int len, int cn) const
{
    if (mask || (cn != 1 && cn != 2 && cn != 4))
        return 0;
    len *= cn;

    int x = 0;
    v_int32 v_sum = vx_setzero_s32();

    for (; x <= len - v_int16::nlanes; x += v_int16::nlanes)
    {
        v_int32 v_src0, v_src1;
        v_expand(vx_load(src0 + x), v_src0, v_src1);
        v_sum += v_src0 + v_src1;
    }
    if (x <= len - v_int32::nlanes)
    {
        v_sum += vx_load_expand(src0 + x);
        x += v_int32::nlanes;
    }

    if (cn == 1)
        *dst += v_reduce_sum(v_sum);
    else
    {
        int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_int32::nlanes];
        v_store_aligned(ar, v_sum);
        for (int i = 0; i < v_int32::nlanes; ++i)
            dst[i % cn] += ar[i];
    }
    v_cleanup();
    return x / cn;
}
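As a plain-C++ reference for what both overloads compute (our reading of the surrounding functor contract: accumulate per-channel totals into dst and report how many pixels were consumed, leaving any remaining tail to the scalar caller), a hypothetical scalar model might look like this:

// Hypothetical scalar model of the vectorized sums above (not OpenCV API):
// every element is added to its channel's accumulator; the return value is
// the number of whole pixels processed, matching the "return x / cn" above.
template <typename T>
int sum_scalar_model(const T* src, int* dst, int len, int cn)
{
    for (int x = 0; x < len * cn; ++x)
        dst[x % cn] += (int)src[x];
    return len;
}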
// v_expand and v_load_expand
TheTest & test_expand()
{
    typedef typename V_RegTrait128<LaneType>::w_reg Rx2;
    Data<R> dataA;
    R a = dataA;

    Data<Rx2> resB = v_load_expand(dataA.d);

    Rx2 c, d;
    v_expand(a, c, d);

    Data<Rx2> resC = c, resD = d;
    const int n = Rx2::nlanes;
    for (int i = 0; i < n; ++i)
    {
        EXPECT_EQ(dataA[i], resB[i]);       // v_load_expand widens the low half
        EXPECT_EQ(dataA[i], resC[i]);       // v_expand: low half into c
        EXPECT_EQ(dataA[i + n], resD[i]);   // v_expand: high half into d
    }
    return *this;
}
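To make the lane mapping the assertions check concrete, here is a minimal sketch using the fixed 128-bit types (the demo function and buffer are ours, not part of the test suite):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

void expand_demo()
{
    uchar buf[16];
    for (int i = 0; i < 16; ++i) buf[i] = (uchar)i;

    v_uint8x16 a = v_load(buf);
    v_uint16x8 lo, hi;
    v_expand(a, lo, hi);                 // lo = {0..7}, hi = {8..15}, lanes widened to 16 bit

    v_uint16x8 lo2 = v_load_expand(buf); // loads 8 bytes and widens: equals lo
    (void)lo2;
}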
void exp64f( const double *_x, double *y, int n )
{
    CV_INSTRUMENT_REGION();

    const double* const expTab = cv::details::getExpTab64f();

    const double
        A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0,
        A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0,
        A3 = .24022650695886477918181338054308 / EXPPOLY_32F_A0,
        A2 = .55504108793649567998466049042729e-1 / EXPPOLY_32F_A0,
        A1 = .96180973140732918010002372686186e-2 / EXPPOLY_32F_A0,
        A0 = .13369713757180123244806654839424e-2 / EXPPOLY_32F_A0;

    int i = 0;
    const Cv64suf* x = (const Cv64suf*)_x;
    double minval = (-exp_max_val/exp_prescale);
    double maxval = (exp_max_val/exp_prescale);

#if CV_SIMD_64F
    const int VECSZ = v_float64::nlanes;
    const v_float64 vprescale = vx_setall_f64(exp_prescale);
    const v_float64 vpostscale = vx_setall_f64(exp_postscale);
    const v_float64 vminval = vx_setall_f64(minval);
    const v_float64 vmaxval = vx_setall_f64(maxval);

    const v_float64 vA1 = vx_setall_f64(A1);
    const v_float64 vA2 = vx_setall_f64(A2);
    const v_float64 vA3 = vx_setall_f64(A3);
    const v_float64 vA4 = vx_setall_f64(A4);
    const v_float64 vA5 = vx_setall_f64(A5);

    const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
    bool y_aligned = (size_t)(void*)y % 32 == 0;

    for( ; i < n; i += VECSZ*2 )
    {
        if( i + VECSZ*2 > n )
        {
            if( i == 0 || _x == y )
                break;
            i = n - VECSZ*2;
            y_aligned = false;
        }

        v_float64 xf0 = vx_load(&x[i].f), xf1 = vx_load(&x[i + VECSZ].f);

        xf0 = v_min(v_max(xf0, vminval), vmaxval);
        xf1 = v_min(v_max(xf1, vminval), vmaxval);

        xf0 *= vprescale;
        xf1 *= vprescale;

        v_int32 xi0 = v_round(xf0);
        v_int32 xi1 = v_round(xf1);
        xf0 = (xf0 - v_cvt_f64(xi0))*vpostscale;
        xf1 = (xf1 - v_cvt_f64(xi1))*vpostscale;

        v_float64 yf0 = v_lut(expTab, xi0 & vidxmask);
        v_float64 yf1 = v_lut(expTab, xi1 & vidxmask);

        v_int32 v0 = vx_setzero_s32(), v1023 = vx_setall_s32(1023), v2047 = vx_setall_s32(2047);
        xi0 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi0) + v1023, v0), v2047);
        xi1 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi1) + v1023, v0), v2047);

        v_int64 xq0, xq1, dummy;
        v_expand(xi0, xq0, dummy);
        v_expand(xi1, xq1, dummy);
        yf0 *= v_reinterpret_as_f64(v_shl<52>(xq0));
        yf1 *= v_reinterpret_as_f64(v_shl<52>(xq1));

        v_float64 zf0 = xf0 + vA1;
        v_float64 zf1 = xf1 + vA1;

        zf0 = v_fma(zf0, xf0, vA2);
        zf1 = v_fma(zf1, xf1, vA2);

        zf0 = v_fma(zf0, xf0, vA3);
        zf1 = v_fma(zf1, xf1, vA3);

        zf0 = v_fma(zf0, xf0, vA4);
        zf1 = v_fma(zf1, xf1, vA4);

        zf0 = v_fma(zf0, xf0, vA5);
        zf1 = v_fma(zf1, xf1, vA5);

        zf0 *= yf0;
        zf1 *= yf1;

        if( y_aligned )
        {
            v_store_aligned(y + i, zf0);
            v_store_aligned(y + i + VECSZ, zf1);
        }
        else
        {
            v_store(y + i, zf0);
            v_store(y + i + VECSZ, zf1);
        }
    }
    vx_cleanup();
#endif

    for( ; i < n; i++ )
    {
        double x0 = x[i].f;
        x0 = std::min(std::max(x0, minval), maxval);
        x0 *= exp_prescale;
        Cv64suf buf;

        int xi = saturate_cast<int>(x0);
        x0 = (x0 - xi)*exp_postscale;

        int t = (xi >> EXPTAB_SCALE) + 1023;
        t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
        buf.i = (int64)t << 52;

        y[i] = buf.f * expTab[xi & EXPTAB_MASK] * (((((A0*x0 + A1)*x0 + A2)*x0 + A3)*x0 + A4)*x0 + A5);
    }
}
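Reading the scalar tail off the code: with exp_prescale = 2^EXPTAB_SCALE / ln 2, the argument is split as x * exp_prescale ~ xi + f with xi integer, so that

    exp(x) = 2^(xi >> EXPTAB_SCALE) * expTab[xi & EXPTAB_MASK] * P(x0),   x0 = f * exp_postscale,

where the integer power of two is built directly in the exponent field of an IEEE-754 double ((xi >> EXPTAB_SCALE) + 1023 shifted left by 52 bits, i.e. the v_shl<52> and (int64)t << 52 lines), the low EXPTAB_SCALE bits of xi select a table entry, and P is the degree-5 Horner polynomial with coefficients A0..A5 (pre-divided by EXPPOLY_32F_A0, which is presumably folded into the table entries). Clamping t to [0, 2047] keeps the constructed exponent field valid, and the earlier min/max clamp on x bounds xi.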
void operator()(const Range &boundaries) const
{
    CV_TRACE_FUNCTION();

    Mat dx, dy;
    AutoBuffer<short> dxMax(0), dyMax(0);
    std::deque<uchar*> stack, borderPeaksLocal;
    const int rowStart = max(0, boundaries.start - 1), rowEnd = min(src.rows, boundaries.end + 1);
    int *_mag_p, *_mag_a, *_mag_n;
    short *_dx, *_dy, *_dx_a = NULL, *_dy_a = NULL, *_dx_n = NULL, *_dy_n = NULL;
    uchar *_pmap;
    double scale = 1.0;

    CV_TRACE_REGION("gradient")
    if(needGradient)
    {
        if (aperture_size == 7)
        {
            scale = 1 / 16.0;
        }
        Sobel(src.rowRange(rowStart, rowEnd), dx, CV_16S, 1, 0, aperture_size, scale, 0, BORDER_REPLICATE);
        Sobel(src.rowRange(rowStart, rowEnd), dy, CV_16S, 0, 1, aperture_size, scale, 0, BORDER_REPLICATE);
    }
    else
    {
        dx = src.rowRange(rowStart, rowEnd);
        dy = src2.rowRange(rowStart, rowEnd);
    }

    CV_TRACE_REGION_NEXT("magnitude");
    if(cn > 1)
    {
        dxMax.allocate(2 * dx.cols);
        dyMax.allocate(2 * dy.cols);
        _dx_a = (short*)dxMax;
        _dx_n = _dx_a + dx.cols;
        _dy_a = (short*)dyMax;
        _dy_n = _dy_a + dy.cols;
    }

    // _mag_p: previous row, _mag_a: actual row, _mag_n: next row
#if CV_SIMD128
    AutoBuffer<int> buffer(3 * (mapstep * cn + CV_MALLOC_SIMD128));
    _mag_p = alignPtr((int*)buffer + 1, CV_MALLOC_SIMD128);
    _mag_a = alignPtr(_mag_p + mapstep * cn, CV_MALLOC_SIMD128);
    _mag_n = alignPtr(_mag_a + mapstep * cn, CV_MALLOC_SIMD128);
#else
    AutoBuffer<int> buffer(3 * (mapstep * cn));
    _mag_p = (int*)buffer + 1;
    _mag_a = _mag_p + mapstep * cn;
    _mag_n = _mag_a + mapstep * cn;
#endif

    // For the first time when just 2 rows are filled and for left and right borders
    if(rowStart == boundaries.start)
        memset(_mag_n - 1, 0, mapstep * sizeof(int));
    else
        _mag_n[src.cols] = _mag_n[-1] = 0;

    _mag_a[src.cols] = _mag_a[-1] = _mag_p[src.cols] = _mag_p[-1] = 0;

    // calculate magnitude and angle of gradient, perform non-maxima suppression.
    // fill the map with one of the following values:
    //   0 - the pixel might belong to an edge
    //   1 - the pixel can not belong to an edge
    //   2 - the pixel does belong to an edge
    for (int i = rowStart; i <= boundaries.end; ++i)
    {
        // Scroll the ring buffer
        std::swap(_mag_n, _mag_a);
        std::swap(_mag_n, _mag_p);

        if(i < rowEnd)
        {
            // Next row calculation
            _dx = dx.ptr<short>(i - rowStart);
            _dy = dy.ptr<short>(i - rowStart);

            if (L2gradient)
            {
                int j = 0, width = src.cols * cn;
#if CV_SIMD128
                if (haveSIMD)
                {
                    for ( ; j <= width - 8; j += 8)
                    {
                        v_int16x8 v_dx = v_load((const short*)(_dx + j));
                        v_int16x8 v_dy = v_load((const short*)(_dy + j));

                        v_int32x4 v_dxp_low, v_dxp_high;
                        v_int32x4 v_dyp_low, v_dyp_high;
                        v_expand(v_dx, v_dxp_low, v_dxp_high);
                        v_expand(v_dy, v_dyp_low, v_dyp_high);

                        v_store_aligned((int *)(_mag_n + j), v_dxp_low*v_dxp_low+v_dyp_low*v_dyp_low);
                        v_store_aligned((int *)(_mag_n + j + 4), v_dxp_high*v_dxp_high+v_dyp_high*v_dyp_high);
                    }
                }
#endif
                for ( ; j < width; ++j)
                    _mag_n[j] = int(_dx[j])*_dx[j] + int(_dy[j])*_dy[j];
            }
            else
            {
                int j = 0, width = src.cols * cn;
#if CV_SIMD128
                if (haveSIMD)
                {
                    for(; j <= width - 8; j += 8)
                    {
                        v_int16x8 v_dx = v_load((const short *)(_dx + j));
                        v_int16x8 v_dy = v_load((const short *)(_dy + j));

                        v_dx = v_reinterpret_as_s16(v_abs(v_dx));
                        v_dy = v_reinterpret_as_s16(v_abs(v_dy));

                        v_int32x4 v_dx_ml, v_dy_ml, v_dx_mh, v_dy_mh;
                        v_expand(v_dx, v_dx_ml, v_dx_mh);
                        v_expand(v_dy, v_dy_ml, v_dy_mh);

                        v_store_aligned((int *)(_mag_n + j), v_dx_ml + v_dy_ml);
                        v_store_aligned((int *)(_mag_n + j + 4), v_dx_mh + v_dy_mh);
                    }
                }
#endif
                for ( ; j < width; ++j)
                    _mag_n[j] = std::abs(int(_dx[j])) + std::abs(int(_dy[j]));
            }

            if(cn > 1)
            {
                std::swap(_dx_n, _dx_a);
                std::swap(_dy_n, _dy_a);

                for(int j = 0, jn = 0; j < src.cols; ++j, jn += cn)
                {
                    int maxIdx = jn;
                    for(int k = 1; k < cn; ++k)
                        if(_mag_n[jn + k] > _mag_n[maxIdx]) maxIdx = jn + k;

                    _mag_n[j] = _mag_n[maxIdx];
                    _dx_n[j] = _dx[maxIdx];
                    _dy_n[j] = _dy[maxIdx];
                }
                _mag_n[src.cols] = 0;
            }

            // at the very beginning we do not have a complete ring
            // buffer of 3 magnitude rows for non-maxima suppression
            if (i <= boundaries.start)
                continue;
        }
        else
        {
            memset(_mag_n - 1, 0, mapstep * sizeof(int));

            if(cn > 1)
            {
                std::swap(_dx_n, _dx_a);
                std::swap(_dy_n, _dy_a);
            }
        }

        // From here actual src row is (i - 1)
        // Set left and right border to 1
#if CV_SIMD128
        if(haveSIMD)
            _pmap = map.ptr<uchar>(i) + CV_MALLOC_SIMD128;
        else
#endif
            _pmap = map.ptr<uchar>(i) + 1;

        _pmap[src.cols] = _pmap[-1] = 1;

        if(cn == 1)
        {
            _dx = dx.ptr<short>(i - rowStart - 1);
            _dy = dy.ptr<short>(i - rowStart - 1);
        }
        else
        {
            _dx = _dx_a;
            _dy = _dy_a;
        }

        const int TG22 = 13573;
        int j = 0;
#if CV_SIMD128
        if (haveSIMD)
        {
            const v_int32x4 v_low = v_setall_s32(low);
            const v_int8x16 v_one = v_setall_s8(1);

            for (; j <= src.cols - 32; j += 32)
            {
                v_int32x4 v_m1 = v_load_aligned((const int*)(_mag_a + j));
                v_int32x4 v_m2 = v_load_aligned((const int*)(_mag_a + j + 4));
                v_int32x4 v_m3 = v_load_aligned((const int*)(_mag_a + j + 8));
                v_int32x4 v_m4 = v_load_aligned((const int*)(_mag_a + j + 12));

                v_int32x4 v_cmp1 = v_m1 > v_low;
                v_int32x4 v_cmp2 = v_m2 > v_low;
                v_int32x4 v_cmp3 = v_m3 > v_low;
                v_int32x4 v_cmp4 = v_m4 > v_low;

                v_m1 = v_load_aligned((const int*)(_mag_a + j + 16));
                v_m2 = v_load_aligned((const int*)(_mag_a + j + 20));
                v_m3 = v_load_aligned((const int*)(_mag_a + j + 24));
                v_m4 = v_load_aligned((const int*)(_mag_a + j + 28));

                v_store_aligned((signed char*)(_pmap + j), v_one);
                v_store_aligned((signed char*)(_pmap + j + 16), v_one);

                v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2);
                v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4);

                v_cmp1 = v_m1 > v_low;
                v_cmp2 = v_m2 > v_low;
                v_cmp3 = v_m3 > v_low;
                v_cmp4 = v_m4 > v_low;

                v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81);

                v_cmp80 = v_pack(v_cmp1, v_cmp2);
                v_cmp81 = v_pack(v_cmp3, v_cmp4);

                unsigned int mask = v_signmask(v_cmp);
                v_cmp = v_pack(v_cmp80, v_cmp81);
                mask |= v_signmask(v_cmp) << 16;

                if (mask)
                {
                    int k = j;

                    do
                    {
                        int l = trailingZeros32(mask);
                        k += l;
                        mask >>= l;

                        int m = _mag_a[k];
                        short xs = _dx[k];
                        short ys = _dy[k];
                        int x = (int)std::abs(xs);
                        int y = (int)std::abs(ys) << 15;

                        int tg22x = x * TG22;

                        if (y < tg22x)
                        {
                            if (m > _mag_a[k - 1] && m >= _mag_a[k + 1])
                            {
                                CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                            }
                        }
                        else
                        {
                            int tg67x = tg22x + (x << 16);
                            if (y > tg67x)
                            {
                                if (m > _mag_p[k] && m >= _mag_n[k])
                                {
                                    CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                }
                            }
                            else
                            {
                                int s = (xs ^ ys) < 0 ? -1 : 1;
                                if(m > _mag_p[k - s] && m > _mag_n[k + s])
                                {
                                    CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                }
                            }
                        }
                        ++k;
                    } while((mask >>= 1));
                }
            }

            if (j <= src.cols - 16)
            {
                v_int32x4 v_m1 = v_load_aligned((const int*)(_mag_a + j));
                v_int32x4 v_m2 = v_load_aligned((const int*)(_mag_a + j + 4));
                v_int32x4 v_m3 = v_load_aligned((const int*)(_mag_a + j + 8));
                v_int32x4 v_m4 = v_load_aligned((const int*)(_mag_a + j + 12));

                v_store_aligned((signed char*)(_pmap + j), v_one);

                v_int32x4 v_cmp1 = v_m1 > v_low;
                v_int32x4 v_cmp2 = v_m2 > v_low;
                v_int32x4 v_cmp3 = v_m3 > v_low;
                v_int32x4 v_cmp4 = v_m4 > v_low;

                v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2);
                v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4);

                v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81);
                unsigned int mask = v_signmask(v_cmp);

                if (mask)
                {
                    int k = j;

                    do
                    {
                        int l = trailingZeros32(mask);
                        k += l;
                        mask >>= l;

                        int m = _mag_a[k];
                        short xs = _dx[k];
                        short ys = _dy[k];
                        int x = (int)std::abs(xs);
                        int y = (int)std::abs(ys) << 15;

                        int tg22x = x * TG22;

                        if (y < tg22x)
                        {
                            if (m > _mag_a[k - 1] && m >= _mag_a[k + 1])
                            {
                                CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                            }
                        }
                        else
                        {
                            int tg67x = tg22x + (x << 16);
                            if (y > tg67x)
                            {
                                if (m > _mag_p[k] && m >= _mag_n[k])
                                {
                                    CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                }
                            }
                            else
                            {
                                int s = (xs ^ ys) < 0 ? -1 : 1;
                                if(m > _mag_p[k - s] && m > _mag_n[k + s])
                                {
                                    CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                }
                            }
                        }
                        ++k;
                    } while((mask >>= 1));
                }
                j += 16;
            }
        }
#endif
        for (; j < src.cols; j++)
        {
            int m = _mag_a[j];

            if (m > low)
            {
                short xs = _dx[j];
                short ys = _dy[j];
                int x = (int)std::abs(xs);
                int y = (int)std::abs(ys) << 15;

                int tg22x = x * TG22;

                if (y < tg22x)
                {
                    if (m > _mag_a[j - 1] && m >= _mag_a[j + 1])
                    {
                        CANNY_CHECK(m, high, (_pmap+j), stack);
                    }
                }
                else
                {
                    int tg67x = tg22x + (x << 16);
                    if (y > tg67x)
                    {
                        if (m > _mag_p[j] && m >= _mag_n[j])
                        {
                            CANNY_CHECK(m, high, (_pmap+j), stack);
                        }
                    }
                    else
                    {
                        int s = (xs ^ ys) < 0 ? -1 : 1;
                        if(m > _mag_p[j - s] && m > _mag_n[j + s])
                        {
                            CANNY_CHECK(m, high, (_pmap+j), stack);
                        }
                    }
                }
            }
            _pmap[j] = 1;
        }
    }
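The bit-mask walk in both SIMD branches above is an instance of a generic sparse-iteration idiom: build a per-pixel comparison mask, then visit only the set bits. Extracted into a standalone sketch (trailing_zeros32 is a portable stand-in for the trailingZeros32 helper the code assumes):

#include <cstdint>

// Portable stand-in for trailingZeros32(): index of the lowest set bit (v != 0).
static int trailing_zeros32(uint32_t v)
{
    int n = 0;
    while (!(v & 1)) { v >>= 1; ++n; }
    return n;
}

// Visit exactly the set bits of 'mask', mirroring the do/while pattern above:
// jump over runs of zeros, process the candidate column, then shift the
// handled bit out and continue.
template <typename F>
void for_each_set_bit(uint32_t mask, int base, F process)
{
    if (!mask) return;
    int k = base;
    do
    {
        int l = trailing_zeros32(mask);
        k += l;        // advance to the next candidate column
        mask >>= l;    // discard the skipped zeros
        process(k);    // e.g. the scalar orientation test + CANNY_CHECK
        ++k;
    } while ((mask >>= 1)); // shift out the bit just processed
}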
void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy,
                      int ksize, int borderType )
{
    CV_INSTRUMENT_REGION()

    // Prepare InputArray src
    Mat src = _src.getMat();
    CV_Assert( !src.empty() );
    CV_Assert( src.type() == CV_8UC1 );
    CV_Assert( borderType == BORDER_DEFAULT || borderType == BORDER_REPLICATE );

    // Prepare OutputArrays dx, dy
    _dx.create( src.size(), CV_16SC1 );
    _dy.create( src.size(), CV_16SC1 );
    Mat dx = _dx.getMat(), dy = _dy.getMat();

    // TODO: Allow for other kernel sizes
    CV_Assert(ksize == 3);

    // Get dimensions
    const int H = src.rows, W = src.cols;

    // Row, column indices
    int i = 0, j = 0;

    // Handle border types
    int i_top = 0,      // Case for H == 1 && W == 1 && BORDER_REPLICATE
        i_bottom = H - 1,
        j_offl = 0,     // j offset from 0th pixel to reach -1st pixel
        j_offr = 0;     // j offset from W-1th pixel to reach Wth pixel

    if ( borderType == BORDER_DEFAULT ) // Equiv. to BORDER_REFLECT_101
    {
        if ( H > 1 )
        {
            i_top = 1;
            i_bottom = H - 2;
        }
        if ( W > 1 )
        {
            j_offl = 1;
            j_offr = -1;
        }
    }

    // Pointer to row vectors
    uchar *p_src, *c_src, *n_src; // previous, current, next row
    short *c_dx, *c_dy;

    int i_start = 0;
    int j_start = 0;
#if CV_SIMD128 && CV_SSE2
    if(hasSIMD128())
    {
        uchar *m_src;
        short *n_dx, *n_dy;

        // Characters in variable names have the following meanings:
        //  u: unsigned char
        //  s: signed int
        //
        //  [row][column]
        //  m: offset -1
        //  n: offset  0
        //  p: offset  1
        // Example: umn is offset -1 in row and offset 0 in column
        for ( i = 0; i < H - 1; i += 2 )
        {
            if ( i == 0 ) p_src = src.ptr<uchar>(i_top);
            else          p_src = src.ptr<uchar>(i-1);

            c_src = src.ptr<uchar>(i);
            n_src = src.ptr<uchar>(i+1);

            if ( i == H - 2 ) m_src = src.ptr<uchar>(i_bottom);
            else              m_src = src.ptr<uchar>(i+2);

            c_dx = dx.ptr<short>(i);
            c_dy = dy.ptr<short>(i);
            n_dx = dx.ptr<short>(i+1);
            n_dy = dy.ptr<short>(i+1);

            v_uint8x16 v_select_m = v_uint8x16(0, 0, 0, 0, 0, 0, 0, 0,
                                               0, 0, 0, 0, 0, 0, 0, 0xFF);

            // Process rest of columns 16-column chunks at a time
            for ( j = 1; j < W - 16; j += 16 )
            {
                // Load top row for 3x3 Sobel filter
                v_uint8x16 v_um = v_load(&p_src[j-1]);
                v_uint8x16 v_up = v_load(&p_src[j+1]);
                // TODO: Replace _mm_slli_si128 with hal method
                v_uint8x16 v_un = v_select(v_select_m, v_uint8x16(_mm_slli_si128(v_up.val, 1)),
                                           v_uint8x16(_mm_srli_si128(v_um.val, 1)));
                v_uint16x8 v_um1, v_um2, v_un1, v_un2, v_up1, v_up2;
                v_expand(v_um, v_um1, v_um2);
                v_expand(v_un, v_un1, v_un2);
                v_expand(v_up, v_up1, v_up2);
                v_int16x8 v_s1m1 = v_reinterpret_as_s16(v_um1);
                v_int16x8 v_s1m2 = v_reinterpret_as_s16(v_um2);
                v_int16x8 v_s1n1 = v_reinterpret_as_s16(v_un1);
                v_int16x8 v_s1n2 = v_reinterpret_as_s16(v_un2);
                v_int16x8 v_s1p1 = v_reinterpret_as_s16(v_up1);
                v_int16x8 v_s1p2 = v_reinterpret_as_s16(v_up2);

                // Load second row for 3x3 Sobel filter
                v_um = v_load(&c_src[j-1]);
                v_up = v_load(&c_src[j+1]);
                // TODO: Replace _mm_slli_si128 with hal method
                v_un = v_select(v_select_m, v_uint8x16(_mm_slli_si128(v_up.val, 1)),
                                v_uint8x16(_mm_srli_si128(v_um.val, 1)));
                v_expand(v_um, v_um1, v_um2);
                v_expand(v_un, v_un1, v_un2);
                v_expand(v_up, v_up1, v_up2);
                v_int16x8 v_s2m1 = v_reinterpret_as_s16(v_um1);
                v_int16x8 v_s2m2 = v_reinterpret_as_s16(v_um2);
                v_int16x8 v_s2n1 = v_reinterpret_as_s16(v_un1);
                v_int16x8 v_s2n2 = v_reinterpret_as_s16(v_un2);
                v_int16x8 v_s2p1 = v_reinterpret_as_s16(v_up1);
                v_int16x8 v_s2p2 = v_reinterpret_as_s16(v_up2);

                // Load third row for 3x3 Sobel filter
                v_um = v_load(&n_src[j-1]);
                v_up = v_load(&n_src[j+1]);
                // TODO: Replace _mm_slli_si128 with hal method
                v_un = v_select(v_select_m, v_uint8x16(_mm_slli_si128(v_up.val, 1)),
                                v_uint8x16(_mm_srli_si128(v_um.val, 1)));
                v_expand(v_um, v_um1, v_um2);
                v_expand(v_un, v_un1, v_un2);
                v_expand(v_up, v_up1, v_up2);
                v_int16x8 v_s3m1 = v_reinterpret_as_s16(v_um1);
                v_int16x8 v_s3m2 = v_reinterpret_as_s16(v_um2);
                v_int16x8 v_s3n1 = v_reinterpret_as_s16(v_un1);
                v_int16x8 v_s3n2 = v_reinterpret_as_s16(v_un2);
                v_int16x8 v_s3p1 = v_reinterpret_as_s16(v_up1);
                v_int16x8 v_s3p2 = v_reinterpret_as_s16(v_up2);

                // dx & dy for rows 1, 2, 3
                v_int16x8 v_sdx1, v_sdy1;
                spatialGradientKernel<v_int16x8>( v_sdx1, v_sdy1,
                                                  v_s1m1, v_s1n1, v_s1p1,
                                                  v_s2m1,         v_s2p1,
                                                  v_s3m1, v_s3n1, v_s3p1 );

                v_int16x8 v_sdx2, v_sdy2;
                spatialGradientKernel<v_int16x8>( v_sdx2, v_sdy2,
                                                  v_s1m2, v_s1n2, v_s1p2,
                                                  v_s2m2,         v_s2p2,
                                                  v_s3m2, v_s3n2, v_s3p2 );

                // Store
                v_store(&c_dx[j],   v_sdx1);
                v_store(&c_dx[j+8], v_sdx2);
                v_store(&c_dy[j],   v_sdy1);
                v_store(&c_dy[j+8], v_sdy2);

                // Load fourth row for 3x3 Sobel filter
                v_um = v_load(&m_src[j-1]);
                v_up = v_load(&m_src[j+1]);
                // TODO: Replace _mm_slli_si128 with hal method
                v_un = v_select(v_select_m, v_uint8x16(_mm_slli_si128(v_up.val, 1)),
                                v_uint8x16(_mm_srli_si128(v_um.val, 1)));
                v_expand(v_um, v_um1, v_um2);
                v_expand(v_un, v_un1, v_un2);
                v_expand(v_up, v_up1, v_up2);
                v_int16x8 v_s4m1 = v_reinterpret_as_s16(v_um1);
                v_int16x8 v_s4m2 = v_reinterpret_as_s16(v_um2);
                v_int16x8 v_s4n1 = v_reinterpret_as_s16(v_un1);
                v_int16x8 v_s4n2 = v_reinterpret_as_s16(v_un2);
                v_int16x8 v_s4p1 = v_reinterpret_as_s16(v_up1);
                v_int16x8 v_s4p2 = v_reinterpret_as_s16(v_up2);

                // dx & dy for rows 2, 3, 4
                spatialGradientKernel<v_int16x8>( v_sdx1, v_sdy1,
                                                  v_s2m1, v_s2n1, v_s2p1,
                                                  v_s3m1,         v_s3p1,
                                                  v_s4m1, v_s4n1, v_s4p1 );

                spatialGradientKernel<v_int16x8>( v_sdx2, v_sdy2,
                                                  v_s2m2, v_s2n2, v_s2p2,
                                                  v_s3m2,         v_s3p2,
                                                  v_s4m2, v_s4n2, v_s4p2 );

                // Store
                v_store(&n_dx[j],   v_sdx1);
                v_store(&n_dx[j+8], v_sdx2);
                v_store(&n_dy[j],   v_sdy1);
                v_store(&n_dy[j+8], v_sdy2);
            }
        }
    }
    i_start = i;
    j_start = j;
#endif
    int j_p, j_n;
    uchar v00, v01, v02, v10, v11, v12, v20, v21, v22;
    for ( i = 0; i < H; i++ )
    {
        if ( i == 0 ) p_src = src.ptr<uchar>(i_top);
        else          p_src = src.ptr<uchar>(i-1);

        c_src = src.ptr<uchar>(i);

        if ( i == H - 1 ) n_src = src.ptr<uchar>(i_bottom);
        else              n_src = src.ptr<uchar>(i+1);

        c_dx = dx.ptr<short>(i);
        c_dy = dy.ptr<short>(i);

        // Process left-most column
        j = 0;
        j_p = j + j_offl;
        j_n = 1;
        if ( j_n >= W ) j_n = j + j_offr;
        v00 = p_src[j_p]; v01 = p_src[j]; v02 = p_src[j_n];
        v10 = c_src[j_p]; v11 = c_src[j]; v12 = c_src[j_n];
        v20 = n_src[j_p]; v21 = n_src[j]; v22 = n_src[j_n];
        spatialGradientKernel<short>( c_dx[0], c_dy[0], v00, v01, v02, v10,
                                      v12, v20, v21, v22 );
        v00 = v01; v10 = v11; v20 = v21;
        v01 = v02; v11 = v12; v21 = v22;

        // Process middle columns
        j = i >= i_start ? 1 : j_start;
        j_p = j - 1;
        v00 = p_src[j_p]; v01 = p_src[j];
        v10 = c_src[j_p]; v11 = c_src[j];
        v20 = n_src[j_p]; v21 = n_src[j];
        for ( ; j < W - 1; j++ )
        {
            // Get values for next column
            j_n = j + 1;
            v02 = p_src[j_n];
            v12 = c_src[j_n];
            v22 = n_src[j_n];
            spatialGradientKernel<short>( c_dx[j], c_dy[j], v00, v01, v02, v10,
                                          v12, v20, v21, v22 );

            // Move values back one column for next iteration
            v00 = v01; v10 = v11; v20 = v21;
            v01 = v02; v11 = v12; v21 = v22;
        }

        // Process right-most column
        if ( j < W )
        {
            j_n = j + j_offr;
            v02 = p_src[j_n];
            v12 = c_src[j_n];
            v22 = n_src[j_n];
            spatialGradientKernel<short>( c_dx[j], c_dy[j], v00, v01, v02, v10,
                                          v12, v20, v21, v22 );
        }
    }
}
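For context, the public entry point wraps all of the above; a minimal usage sketch (the function name gradient_demo is ours):

#include <opencv2/imgproc.hpp>

// spatialGradient is a fused Sobel dx/dy for CV_8UC1 input with ksize == 3;
// it fills two CV_16SC1 matrices in one pass over the image.
void gradient_demo(const cv::Mat& gray) // gray must be CV_8UC1
{
    cv::Mat dx, dy;
    cv::spatialGradient(gray, dx, dy);  // defaults: ksize = 3, BORDER_DEFAULT
}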