// Exercises the load, store, broadcast and reinterpret intrinsics for the
// register type R and records mismatches through EXPECT_* macros.
// Returns *this.
TheTest & test_loadstore()
{
    AlignedData<R> src;
    AlignedData<R> dst;
    const int halfLanes = R::nlanes / 2;

    // The ".a" members must sit on a 16-byte boundary, the ".u" members must not.
    EXPECT_EQ((size_t)0, (size_t)&src.a.d % 16);
    EXPECT_NE((size_t)0, (size_t)&src.u.d % 16);
    EXPECT_EQ((size_t)0, (size_t)&dst.a.d % 16);
    EXPECT_NE((size_t)0, (size_t)&dst.u.d % 16);

    // Several ways of initializing a register; lane 0 must match the source.
    R regFromData = src.a;
    R regUnaligned = v_load(src.u.d);
    R regAligned = v_load_aligned(src.a.d);
    R regCopy(regUnaligned);
    EXPECT_EQ(src.a[0], regFromData.get0());
    EXPECT_EQ(src.u[0], regUnaligned.get0());
    EXPECT_EQ(src.a[0], regAligned.get0());
    EXPECT_EQ(src.u[0], regCopy.get0());

    // Low-half load from the start of the unaligned buffer.
    R regLow = v_load_low((LaneType*)src.u.d);
    EXPECT_EQ(src.u[0], regLow.get0());
    v_store(dst.u.d, regLow);
    for (int lane = 0; lane < halfLanes; ++lane)
    {
        EXPECT_EQ((LaneType)src.u[lane], (LaneType)dst.u[lane]);
    }

    // Low-half load starting 8 bytes into the unaligned buffer; the loaded
    // lanes are compared against the second half of the source.
    R regLowOffset8 = v_load_low((LaneType*)((char*)src.u.d + 8));
    EXPECT_EQ(src.u[halfLanes], regLowOffset8.get0());
    v_store(dst.u.d, regLowOffset8);
    for (int lane = 0; lane < halfLanes; ++lane)
    {
        EXPECT_EQ((LaneType)src.u[lane + halfLanes], (LaneType)dst.u[lane]);
    }

    // Full-register stores, unaligned and aligned.
    dst.u.clear();
    dst.a.clear();
    v_store(dst.u.d, regFromData);
    v_store_aligned(dst.a.d, regUnaligned);
    EXPECT_EQ(src.a, dst.a);
    EXPECT_EQ(src.u, dst.u);

    // Storing the high and low halves separately must rebuild the whole vector.
    Data<R> expected, rebuilt(0);
    R regExpected = expected;
    v_store_high(rebuilt.mid(), regExpected);
    v_store_low(rebuilt.d, regExpected);
    EXPECT_EQ(expected, rebuilt);

    // Loading two halves from separate pointers must also rebuild it.
    rebuilt.clear();
    R regHalves = v_load_halves(expected.d, expected.mid());
    v_store(rebuilt.d, regHalves);
    EXPECT_EQ(expected, rebuilt);

    // zero() and all(8) must fill every lane.
    Data<R> zeros = V_RegTrait128<LaneType>::zero();
    Data<R> eights = V_RegTrait128<LaneType>::all(8);
    for (int lane = 0; lane < R::nlanes; ++lane)
    {
        EXPECT_EQ((LaneType)0, zeros[lane]);
        EXPECT_EQ((LaneType)8, eights[lane]);
    }

    // Reinterpreting as each register type and storing through the matching
    // pointer type must reproduce the original aligned data.
    v_uint8x16 asU8 = v_reinterpret_as_u8(regFromData);
    dst.a.clear();
    v_store((uchar*)dst.a.d, asU8);
    EXPECT_EQ(src.a, dst.a);

    v_int8x16 asS8 = v_reinterpret_as_s8(regFromData);
    dst.a.clear();
    v_store((schar*)dst.a.d, asS8);
    EXPECT_EQ(src.a, dst.a);

    v_uint16x8 asU16 = v_reinterpret_as_u16(regFromData);
    dst.a.clear();
    v_store((ushort*)dst.a.d, asU16);
    EXPECT_EQ(src.a, dst.a);

    v_int16x8 asS16 = v_reinterpret_as_s16(regFromData);
    dst.a.clear();
    v_store((short*)dst.a.d, asS16);
    EXPECT_EQ(src.a, dst.a);

    v_uint32x4 asU32 = v_reinterpret_as_u32(regFromData);
    dst.a.clear();
    v_store((unsigned*)dst.a.d, asU32);
    EXPECT_EQ(src.a, dst.a);

    v_int32x4 asS32 = v_reinterpret_as_s32(regFromData);
    dst.a.clear();
    v_store((int*)dst.a.d, asS32);
    EXPECT_EQ(src.a, dst.a);

    v_uint64x2 asU64 = v_reinterpret_as_u64(regFromData);
    dst.a.clear();
    v_store((uint64*)dst.a.d, asU64);
    EXPECT_EQ(src.a, dst.a);

    v_int64x2 asS64 = v_reinterpret_as_s64(regFromData);
    dst.a.clear();
    v_store((int64*)dst.a.d, asS64);
    EXPECT_EQ(src.a, dst.a);

    v_float32x4 asF32 = v_reinterpret_as_f32(regFromData);
    dst.a.clear();
    v_store((float*)dst.a.d, asF32);
    EXPECT_EQ(src.a, dst.a);

#if CV_SIMD128_64F
    v_float64x2 asF64 = v_reinterpret_as_f64(regFromData);
    dst.a.clear();
    v_store((double*)dst.a.d, asF64);
    EXPECT_EQ(src.a, dst.a);
#endif

    return *this;
}
inline bool v_check_any(const v_uint16x8& a) { return v_check_any(v_reinterpret_as_s16(a)); }
void operator()(const Range &boundaries) const { CV_TRACE_FUNCTION(); Mat dx, dy; AutoBuffer<short> dxMax(0), dyMax(0); std::deque<uchar*> stack, borderPeaksLocal; const int rowStart = max(0, boundaries.start - 1), rowEnd = min(src.rows, boundaries.end + 1); int *_mag_p, *_mag_a, *_mag_n; short *_dx, *_dy, *_dx_a = NULL, *_dy_a = NULL, *_dx_n = NULL, *_dy_n = NULL; uchar *_pmap; double scale = 1.0; CV_TRACE_REGION("gradient") if(needGradient) { if (aperture_size == 7) { scale = 1 / 16.0; } Sobel(src.rowRange(rowStart, rowEnd), dx, CV_16S, 1, 0, aperture_size, scale, 0, BORDER_REPLICATE); Sobel(src.rowRange(rowStart, rowEnd), dy, CV_16S, 0, 1, aperture_size, scale, 0, BORDER_REPLICATE); } else { dx = src.rowRange(rowStart, rowEnd); dy = src2.rowRange(rowStart, rowEnd); } CV_TRACE_REGION_NEXT("magnitude"); if(cn > 1) { dxMax.allocate(2 * dx.cols); dyMax.allocate(2 * dy.cols); _dx_a = (short*)dxMax; _dx_n = _dx_a + dx.cols; _dy_a = (short*)dyMax; _dy_n = _dy_a + dy.cols; } // _mag_p: previous row, _mag_a: actual row, _mag_n: next row #if CV_SIMD128 AutoBuffer<int> buffer(3 * (mapstep * cn + CV_MALLOC_SIMD128)); _mag_p = alignPtr((int*)buffer + 1, CV_MALLOC_SIMD128); _mag_a = alignPtr(_mag_p + mapstep * cn, CV_MALLOC_SIMD128); _mag_n = alignPtr(_mag_a + mapstep * cn, CV_MALLOC_SIMD128); #else AutoBuffer<int> buffer(3 * (mapstep * cn)); _mag_p = (int*)buffer + 1; _mag_a = _mag_p + mapstep * cn; _mag_n = _mag_a + mapstep * cn; #endif // For the first time when just 2 rows are filled and for left and right borders if(rowStart == boundaries.start) memset(_mag_n - 1, 0, mapstep * sizeof(int)); else _mag_n[src.cols] = _mag_n[-1] = 0; _mag_a[src.cols] = _mag_a[-1] = _mag_p[src.cols] = _mag_p[-1] = 0; // calculate magnitude and angle of gradient, perform non-maxima suppression. 
// fill the map with one of the following values: // 0 - the pixel might belong to an edge // 1 - the pixel can not belong to an edge // 2 - the pixel does belong to an edge for (int i = rowStart; i <= boundaries.end; ++i) { // Scroll the ring buffer std::swap(_mag_n, _mag_a); std::swap(_mag_n, _mag_p); if(i < rowEnd) { // Next row calculation _dx = dx.ptr<short>(i - rowStart); _dy = dy.ptr<short>(i - rowStart); if (L2gradient) { int j = 0, width = src.cols * cn; #if CV_SIMD128 if (haveSIMD) { for ( ; j <= width - 8; j += 8) { v_int16x8 v_dx = v_load((const short*)(_dx + j)); v_int16x8 v_dy = v_load((const short*)(_dy + j)); v_int32x4 v_dxp_low, v_dxp_high; v_int32x4 v_dyp_low, v_dyp_high; v_expand(v_dx, v_dxp_low, v_dxp_high); v_expand(v_dy, v_dyp_low, v_dyp_high); v_store_aligned((int *)(_mag_n + j), v_dxp_low*v_dxp_low+v_dyp_low*v_dyp_low); v_store_aligned((int *)(_mag_n + j + 4), v_dxp_high*v_dxp_high+v_dyp_high*v_dyp_high); } } #endif for ( ; j < width; ++j) _mag_n[j] = int(_dx[j])*_dx[j] + int(_dy[j])*_dy[j]; } else { int j = 0, width = src.cols * cn; #if CV_SIMD128 if (haveSIMD) { for(; j <= width - 8; j += 8) { v_int16x8 v_dx = v_load((const short *)(_dx + j)); v_int16x8 v_dy = v_load((const short *)(_dy + j)); v_dx = v_reinterpret_as_s16(v_abs(v_dx)); v_dy = v_reinterpret_as_s16(v_abs(v_dy)); v_int32x4 v_dx_ml, v_dy_ml, v_dx_mh, v_dy_mh; v_expand(v_dx, v_dx_ml, v_dx_mh); v_expand(v_dy, v_dy_ml, v_dy_mh); v_store_aligned((int *)(_mag_n + j), v_dx_ml + v_dy_ml); v_store_aligned((int *)(_mag_n + j + 4), v_dx_mh + v_dy_mh); } } #endif for ( ; j < width; ++j) _mag_n[j] = std::abs(int(_dx[j])) + std::abs(int(_dy[j])); } if(cn > 1) { std::swap(_dx_n, _dx_a); std::swap(_dy_n, _dy_a); for(int j = 0, jn = 0; j < src.cols; ++j, jn += cn) { int maxIdx = jn; for(int k = 1; k < cn; ++k) if(_mag_n[jn + k] > _mag_n[maxIdx]) maxIdx = jn + k; _mag_n[j] = _mag_n[maxIdx]; _dx_n[j] = _dx[maxIdx]; _dy_n[j] = _dy[maxIdx]; } _mag_n[src.cols] = 0; } // at the very beginning we do 
not have a complete ring // buffer of 3 magnitude rows for non-maxima suppression if (i <= boundaries.start) continue; } else { memset(_mag_n - 1, 0, mapstep * sizeof(int)); if(cn > 1) { std::swap(_dx_n, _dx_a); std::swap(_dy_n, _dy_a); } } // From here actual src row is (i - 1) // Set left and right border to 1 #if CV_SIMD128 if(haveSIMD) _pmap = map.ptr<uchar>(i) + CV_MALLOC_SIMD128; else #endif _pmap = map.ptr<uchar>(i) + 1; _pmap[src.cols] =_pmap[-1] = 1; if(cn == 1) { _dx = dx.ptr<short>(i - rowStart - 1); _dy = dy.ptr<short>(i - rowStart - 1); } else { _dx = _dx_a; _dy = _dy_a; } const int TG22 = 13573; int j = 0; #if CV_SIMD128 if (haveSIMD) { const v_int32x4 v_low = v_setall_s32(low); const v_int8x16 v_one = v_setall_s8(1); for (; j <= src.cols - 32; j += 32) { v_int32x4 v_m1 = v_load_aligned((const int*)(_mag_a + j)); v_int32x4 v_m2 = v_load_aligned((const int*)(_mag_a + j + 4)); v_int32x4 v_m3 = v_load_aligned((const int*)(_mag_a + j + 8)); v_int32x4 v_m4 = v_load_aligned((const int*)(_mag_a + j + 12)); v_int32x4 v_cmp1 = v_m1 > v_low; v_int32x4 v_cmp2 = v_m2 > v_low; v_int32x4 v_cmp3 = v_m3 > v_low; v_int32x4 v_cmp4 = v_m4 > v_low; v_m1 = v_load_aligned((const int*)(_mag_a + j + 16)); v_m2 = v_load_aligned((const int*)(_mag_a + j + 20)); v_m3 = v_load_aligned((const int*)(_mag_a + j + 24)); v_m4 = v_load_aligned((const int*)(_mag_a + j + 28)); v_store_aligned((signed char*)(_pmap + j), v_one); v_store_aligned((signed char*)(_pmap + j + 16), v_one); v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2); v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4); v_cmp1 = v_m1 > v_low; v_cmp2 = v_m2 > v_low; v_cmp3 = v_m3 > v_low; v_cmp4 = v_m4 > v_low; v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81); v_cmp80 = v_pack(v_cmp1, v_cmp2); v_cmp81 = v_pack(v_cmp3, v_cmp4); unsigned int mask = v_signmask(v_cmp); v_cmp = v_pack(v_cmp80, v_cmp81); mask |= v_signmask(v_cmp) << 16; if (mask) { int k = j; do { int l = trailingZeros32(mask); k += l; mask >>= l; int m = _mag_a[k]; short xs = _dx[k]; 
short ys = _dy[k]; int x = (int)std::abs(xs); int y = (int)std::abs(ys) << 15; int tg22x = x * TG22; if (y < tg22x) { if (m > _mag_a[k - 1] && m >= _mag_a[k + 1]) { CANNY_CHECK_SIMD(m, high, (_pmap+k), stack); } } else { int tg67x = tg22x + (x << 16); if (y > tg67x) { if (m > _mag_p[k] && m >= _mag_n[k]) { CANNY_CHECK_SIMD(m, high, (_pmap+k), stack); } } else { int s = (xs ^ ys) < 0 ? -1 : 1; if(m > _mag_p[k - s] && m > _mag_n[k + s]) { CANNY_CHECK_SIMD(m, high, (_pmap+k), stack); } } } ++k; } while((mask >>= 1)); } } if (j <= src.cols - 16) { v_int32x4 v_m1 = v_load_aligned((const int*)(_mag_a + j)); v_int32x4 v_m2 = v_load_aligned((const int*)(_mag_a + j + 4)); v_int32x4 v_m3 = v_load_aligned((const int*)(_mag_a + j + 8)); v_int32x4 v_m4 = v_load_aligned((const int*)(_mag_a + j + 12)); v_store_aligned((signed char*)(_pmap + j), v_one); v_int32x4 v_cmp1 = v_m1 > v_low; v_int32x4 v_cmp2 = v_m2 > v_low; v_int32x4 v_cmp3 = v_m3 > v_low; v_int32x4 v_cmp4 = v_m4 > v_low; v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2); v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4); v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81); unsigned int mask = v_signmask(v_cmp); if (mask) { int k = j; do { int l = trailingZeros32(mask); k += l; mask >>= l; int m = _mag_a[k]; short xs = _dx[k]; short ys = _dy[k]; int x = (int)std::abs(xs); int y = (int)std::abs(ys) << 15; int tg22x = x * TG22; if (y < tg22x) { if (m > _mag_a[k - 1] && m >= _mag_a[k + 1]) { CANNY_CHECK_SIMD(m, high, (_pmap+k), stack); } } else { int tg67x = tg22x + (x << 16); if (y > tg67x) { if (m > _mag_p[k] && m >= _mag_n[k]) { CANNY_CHECK_SIMD(m, high, (_pmap+k), stack); } } else { int s = (xs ^ ys) < 0 ? 
-1 : 1; if(m > _mag_p[k - s] && m > _mag_n[k + s]) { CANNY_CHECK_SIMD(m, high, (_pmap+k), stack); } } } ++k; } while((mask >>= 1)); } j += 16; } } #endif for (; j < src.cols; j++) { int m = _mag_a[j]; if (m > low) { short xs = _dx[j]; short ys = _dy[j]; int x = (int)std::abs(xs); int y = (int)std::abs(ys) << 15; int tg22x = x * TG22; if (y < tg22x) { if (m > _mag_a[j - 1] && m >= _mag_a[j + 1]) { CANNY_CHECK(m, high, (_pmap+j), stack); } } else { int tg67x = tg22x + (x << 16); if (y > tg67x) { if (m > _mag_p[j] && m >= _mag_n[j]) { CANNY_CHECK(m, high, (_pmap+j), stack); } } else { int s = (xs ^ ys) < 0 ? -1 : 1; if(m > _mag_p[j - s] && m > _mag_n[j + s]) { CANNY_CHECK(m, high, (_pmap+j), stack); } } } } _pmap[j] = 1; } }
inline int v_signmask(const v_uint16x8& a) { return v_signmask(v_reinterpret_as_s16(a)); }
void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy, int ksize, int borderType ) { CV_INSTRUMENT_REGION() // Prepare InputArray src Mat src = _src.getMat(); CV_Assert( !src.empty() ); CV_Assert( src.type() == CV_8UC1 ); CV_Assert( borderType == BORDER_DEFAULT || borderType == BORDER_REPLICATE ); // Prepare OutputArrays dx, dy _dx.create( src.size(), CV_16SC1 ); _dy.create( src.size(), CV_16SC1 ); Mat dx = _dx.getMat(), dy = _dy.getMat(); // TODO: Allow for other kernel sizes CV_Assert(ksize == 3); // Get dimensions const int H = src.rows, W = src.cols; // Row, column indices int i = 0, j = 0; // Handle border types int i_top = 0, // Case for H == 1 && W == 1 && BORDER_REPLICATE i_bottom = H - 1, j_offl = 0, // j offset from 0th pixel to reach -1st pixel j_offr = 0; // j offset from W-1th pixel to reach Wth pixel if ( borderType == BORDER_DEFAULT ) // Equiv. to BORDER_REFLECT_101 { if ( H > 1 ) { i_top = 1; i_bottom = H - 2; } if ( W > 1 ) { j_offl = 1; j_offr = -1; } } // Pointer to row vectors uchar *p_src, *c_src, *n_src; // previous, current, next row short *c_dx, *c_dy; int i_start = 0; int j_start = 0; #if CV_SIMD128 && CV_SSE2 if(hasSIMD128()) { uchar *m_src; short *n_dx, *n_dy; // Characters in variable names have the following meanings: // u: unsigned char // s: signed int // // [row][column] // m: offset -1 // n: offset 0 // p: offset 1 // Example: umn is offset -1 in row and offset 0 in column for ( i = 0; i < H - 1; i += 2 ) { if ( i == 0 ) p_src = src.ptr<uchar>(i_top); else p_src = src.ptr<uchar>(i-1); c_src = src.ptr<uchar>(i); n_src = src.ptr<uchar>(i+1); if ( i == H - 2 ) m_src = src.ptr<uchar>(i_bottom); else m_src = src.ptr<uchar>(i+2); c_dx = dx.ptr<short>(i); c_dy = dy.ptr<short>(i); n_dx = dx.ptr<short>(i+1); n_dy = dy.ptr<short>(i+1); v_uint8x16 v_select_m = v_uint8x16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF); // Process rest of columns 16-column chunks at a time for ( j = 1; j < W - 16; j += 16 ) { // Load top 
row for 3x3 Sobel filter v_uint8x16 v_um = v_load(&p_src[j-1]); v_uint8x16 v_up = v_load(&p_src[j+1]); // TODO: Replace _mm_slli_si128 with hal method v_uint8x16 v_un = v_select(v_select_m, v_uint8x16(_mm_slli_si128(v_up.val, 1)), v_uint8x16(_mm_srli_si128(v_um.val, 1))); v_uint16x8 v_um1, v_um2, v_un1, v_un2, v_up1, v_up2; v_expand(v_um, v_um1, v_um2); v_expand(v_un, v_un1, v_un2); v_expand(v_up, v_up1, v_up2); v_int16x8 v_s1m1 = v_reinterpret_as_s16(v_um1); v_int16x8 v_s1m2 = v_reinterpret_as_s16(v_um2); v_int16x8 v_s1n1 = v_reinterpret_as_s16(v_un1); v_int16x8 v_s1n2 = v_reinterpret_as_s16(v_un2); v_int16x8 v_s1p1 = v_reinterpret_as_s16(v_up1); v_int16x8 v_s1p2 = v_reinterpret_as_s16(v_up2); // Load second row for 3x3 Sobel filter v_um = v_load(&c_src[j-1]); v_up = v_load(&c_src[j+1]); // TODO: Replace _mm_slli_si128 with hal method v_un = v_select(v_select_m, v_uint8x16(_mm_slli_si128(v_up.val, 1)), v_uint8x16(_mm_srli_si128(v_um.val, 1))); v_expand(v_um, v_um1, v_um2); v_expand(v_un, v_un1, v_un2); v_expand(v_up, v_up1, v_up2); v_int16x8 v_s2m1 = v_reinterpret_as_s16(v_um1); v_int16x8 v_s2m2 = v_reinterpret_as_s16(v_um2); v_int16x8 v_s2n1 = v_reinterpret_as_s16(v_un1); v_int16x8 v_s2n2 = v_reinterpret_as_s16(v_un2); v_int16x8 v_s2p1 = v_reinterpret_as_s16(v_up1); v_int16x8 v_s2p2 = v_reinterpret_as_s16(v_up2); // Load third row for 3x3 Sobel filter v_um = v_load(&n_src[j-1]); v_up = v_load(&n_src[j+1]); // TODO: Replace _mm_slli_si128 with hal method v_un = v_select(v_select_m, v_uint8x16(_mm_slli_si128(v_up.val, 1)), v_uint8x16(_mm_srli_si128(v_um.val, 1))); v_expand(v_um, v_um1, v_um2); v_expand(v_un, v_un1, v_un2); v_expand(v_up, v_up1, v_up2); v_int16x8 v_s3m1 = v_reinterpret_as_s16(v_um1); v_int16x8 v_s3m2 = v_reinterpret_as_s16(v_um2); v_int16x8 v_s3n1 = v_reinterpret_as_s16(v_un1); v_int16x8 v_s3n2 = v_reinterpret_as_s16(v_un2); v_int16x8 v_s3p1 = v_reinterpret_as_s16(v_up1); v_int16x8 v_s3p2 = v_reinterpret_as_s16(v_up2); // dx & dy for rows 1, 2, 3 
v_int16x8 v_sdx1, v_sdy1; spatialGradientKernel<v_int16x8>( v_sdx1, v_sdy1, v_s1m1, v_s1n1, v_s1p1, v_s2m1, v_s2p1, v_s3m1, v_s3n1, v_s3p1 ); v_int16x8 v_sdx2, v_sdy2; spatialGradientKernel<v_int16x8>( v_sdx2, v_sdy2, v_s1m2, v_s1n2, v_s1p2, v_s2m2, v_s2p2, v_s3m2, v_s3n2, v_s3p2 ); // Store v_store(&c_dx[j], v_sdx1); v_store(&c_dx[j+8], v_sdx2); v_store(&c_dy[j], v_sdy1); v_store(&c_dy[j+8], v_sdy2); // Load fourth row for 3x3 Sobel filter v_um = v_load(&m_src[j-1]); v_up = v_load(&m_src[j+1]); // TODO: Replace _mm_slli_si128 with hal method v_un = v_select(v_select_m, v_uint8x16(_mm_slli_si128(v_up.val, 1)), v_uint8x16(_mm_srli_si128(v_um.val, 1))); v_expand(v_um, v_um1, v_um2); v_expand(v_un, v_un1, v_un2); v_expand(v_up, v_up1, v_up2); v_int16x8 v_s4m1 = v_reinterpret_as_s16(v_um1); v_int16x8 v_s4m2 = v_reinterpret_as_s16(v_um2); v_int16x8 v_s4n1 = v_reinterpret_as_s16(v_un1); v_int16x8 v_s4n2 = v_reinterpret_as_s16(v_un2); v_int16x8 v_s4p1 = v_reinterpret_as_s16(v_up1); v_int16x8 v_s4p2 = v_reinterpret_as_s16(v_up2); // dx & dy for rows 2, 3, 4 spatialGradientKernel<v_int16x8>( v_sdx1, v_sdy1, v_s2m1, v_s2n1, v_s2p1, v_s3m1, v_s3p1, v_s4m1, v_s4n1, v_s4p1 ); spatialGradientKernel<v_int16x8>( v_sdx2, v_sdy2, v_s2m2, v_s2n2, v_s2p2, v_s3m2, v_s3p2, v_s4m2, v_s4n2, v_s4p2 ); // Store v_store(&n_dx[j], v_sdx1); v_store(&n_dx[j+8], v_sdx2); v_store(&n_dy[j], v_sdy1); v_store(&n_dy[j+8], v_sdy2); } } } i_start = i; j_start = j; #endif int j_p, j_n; uchar v00, v01, v02, v10, v11, v12, v20, v21, v22; for ( i = 0; i < H; i++ ) { if ( i == 0 ) p_src = src.ptr<uchar>(i_top); else p_src = src.ptr<uchar>(i-1); c_src = src.ptr<uchar>(i); if ( i == H - 1 ) n_src = src.ptr<uchar>(i_bottom); else n_src = src.ptr<uchar>(i+1); c_dx = dx.ptr<short>(i); c_dy = dy.ptr<short>(i); // Process left-most column j = 0; j_p = j + j_offl; j_n = 1; if ( j_n >= W ) j_n = j + j_offr; v00 = p_src[j_p]; v01 = p_src[j]; v02 = p_src[j_n]; v10 = c_src[j_p]; v11 = c_src[j]; v12 = c_src[j_n]; v20 = 
n_src[j_p]; v21 = n_src[j]; v22 = n_src[j_n]; spatialGradientKernel<short>( c_dx[0], c_dy[0], v00, v01, v02, v10, v12, v20, v21, v22 ); v00 = v01; v10 = v11; v20 = v21; v01 = v02; v11 = v12; v21 = v22; // Process middle columns j = i >= i_start ? 1 : j_start; j_p = j - 1; v00 = p_src[j_p]; v01 = p_src[j]; v10 = c_src[j_p]; v11 = c_src[j]; v20 = n_src[j_p]; v21 = n_src[j]; for ( ; j < W - 1; j++ ) { // Get values for next column j_n = j + 1; v02 = p_src[j_n]; v12 = c_src[j_n]; v22 = n_src[j_n]; spatialGradientKernel<short>( c_dx[j], c_dy[j], v00, v01, v02, v10, v12, v20, v21, v22 ); // Move values back one column for next iteration v00 = v01; v10 = v11; v20 = v21; v01 = v02; v11 = v12; v21 = v22; } // Process right-most column if ( j < W ) { j_n = j + j_offr; v02 = p_src[j_n]; v12 = c_src[j_n]; v22 = n_src[j_n]; spatialGradientKernel<short>( c_dx[j], c_dy[j], v00, v01, v02, v10, v12, v20, v21, v22 ); } } }