Пример #1
0
void loadHWMesh(const void* file_data_ptr, u32 name_hash, GPUMesh& gpu_mesh) {
	auto file_data = static_cast<const char*>(file_data_ptr);

	auto mesh_header = reinterpret_cast<const HWMeshHeader*>(file_data);
	assert(std::memcmp(mesh_header->magic, "HWMESH", 6) == 0);
	assert(mesh_header->version == HWMeshHeader::HWMESH_VERSION);

	auto mesh_index = reinterpret_cast<const HWMeshIndex*>(file_data + sizeof(HWMeshHeader));
	auto entry = std::find_if(mesh_index, mesh_index + mesh_header->num_meshes, [name_hash](const HWMeshIndex& i) { return i.name_hash == name_hash; });
	assert(entry != mesh_index + mesh_header->num_meshes);

	auto mesh_data = reinterpret_cast<const HWMeshData*>(file_data + entry->file_offset);
	assert(mesh_data->num_submeshes <= HWMeshData::MAX_SUBMESHES);
	assert(mesh_data->vertex_format == vertex_fmt::FMT_POS3F_NORM3F_TEX2F);
	assert(mesh_data->num_submeshes == 1); // for now
	gpu_mesh.indices_count = mesh_data->submesh_indices_size[0];

	auto vertex_data = reinterpret_cast<const char*>(file_data + entry->file_offset + sizeof(HWMeshData));
	vertex_data = alignPtr(vertex_data, 16);

	auto index_data = vertex_data + mesh_data->vertex_data_size;
	index_data = alignPtr(index_data, 4);

	gpu_mesh.loadVertexData(vertex_data, mesh_data->vertex_data_size, (vertex_fmt::VertexFormat)mesh_data->vertex_format);
	gpu_mesh.loadIndices(index_data, mesh_data->index_data_size, mesh_data->index_type);
}
Пример #2
0
bool TransTable::resize( size_t sizeBytes )
{
	size_t sizeEntries = (sizeBytes + sizeof(TransEntry)-1)/sizeof(TransEntry);
	if ( sizeEntries <= buckets )
	{
		// do dummy alloc (buckets entries)
		dummyAlloc();
		return 1;
	}
	if ( !roundPow2( sizeEntries ) )
		return 0;					// bad size
	if ( size == sizeEntries )
		return 1;
	// realloc!
	dealloc();
	allocEntries = new(std::nothrow) TransEntry[ sizeEntries + alignSize/sizeof(TransEntry) ];
	if ( !allocEntries )
	{
		dummyAlloc();
		return 0;
	}
	// align entries
	entries = static_cast<TransEntry *>(alignPtr( allocEntries, alignSize ));
	size = sizeEntries;
	return 1;
}
Пример #3
0
static void* cvAlloc( size_t size)
{
    uint8_t* udata = (uint8_t*)malloc(size + sizeof(void*) + 32);
    if(!udata) return 0;
    uint8_t** adata = (uint8_t**)alignPtr((uint8_t**)udata + 1, 32);
    adata[-1] = udata;
    return adata;
}
Пример #4
0
void* fastMalloc( size_t size )
{
    uchar* udata = (uchar*)malloc(size + sizeof(void*) + CV_MALLOC_ALIGN);
    if(!udata)
        return OutOfMemoryError(size);
    uchar** adata = alignPtr((uchar**)udata + 1, CV_MALLOC_ALIGN);
    adata[-1] = udata;
    return adata;
}
Пример #5
0
    void operator()( const Range& range ) const
    {
        const float inf = 1e15f;
        int i, i1 = range.start, i2 = range.end;
        int n = dst->cols;
        AutoBuffer<uchar> _buf((n+2)*2*sizeof(float) + (n+2)*sizeof(int));
        float* f = (float*)(uchar*)_buf;
        float* z = f + n;
        int* v = alignPtr((int*)(z + n + 1), sizeof(int));

        for( i = i1; i < i2; i++ )
        {
            float* d = dst->ptr<float>(i);
            int p, q, k;

            v[0] = 0;
            z[0] = -inf;
            z[1] = inf;
            f[0] = d[0];

            for( q = 1, k = 0; q < n; q++ )
            {
                float fq = d[q];
                f[q] = fq;

                for(;;k--)
                {
                    p = v[k];
                    float s = (fq + sqr_tab[q] - d[p] - sqr_tab[p])*inv_tab[q - p];
                    if( s > z[k] )
                    {
                        k++;
                        v[k] = q;
                        z[k] = s;
                        z[k+1] = inf;
                        break;
                    }
                }
            }

            for( q = 0, k = 0; q < n; q++ )
            {
                while( z[k+1] < q )
                    k++;
                p = v[k];
                d[q] = std::sqrt(sqr_tab[std::abs(q - p)] + f[p]);
            }
        }
    }
Пример #6
0
void cv::split(const Mat& src, Mat* mv)
{
    int k, depth = src.depth(), cn = src.channels();
    if( cn == 1 )
    {
        src.copyTo(mv[0]);
        return;
    }

    SplitFunc func = splitTab[depth];
    CV_Assert( func != 0 );
    
    int esz = (int)src.elemSize(), esz1 = (int)src.elemSize1();
    int blocksize0 = (BLOCK_SIZE + esz-1)/esz;
    AutoBuffer<uchar> _buf((cn+1)*(sizeof(Mat*) + sizeof(uchar*)) + 16);
    const Mat** arrays = (const Mat**)(uchar*)_buf;
    uchar** ptrs = (uchar**)alignPtr(arrays + cn + 1, 16);
    
    arrays[0] = &src;
    for( k = 0; k < cn; k++ )
    {
        mv[k].create(src.dims, src.size, depth);
        arrays[k+1] = &mv[k];
    }
    
    NAryMatIterator it(arrays, ptrs, cn+1);
    int total = (int)it.size, blocksize = cn <= 4 ? total : std::min(total, blocksize0);
    
    for( size_t i = 0; i < it.nplanes; i++, ++it )
    {
        for( int j = 0; j < total; j += blocksize )
        {
            int bsz = std::min(total - j, blocksize);
            func( ptrs[0], &ptrs[1], bsz, cn );
            
            if( j + blocksize < total )
            {
                ptrs[0] += bsz*esz;
                for( k = 0; k < cn; k++ )
                    ptrs[k+1] += bsz*esz1;
            }
        }
    }
}
Пример #7
0
Mat& Mat::setTo(InputArray _value, InputArray _mask)
{
    if( !data )
        return *this;
    
    Mat value = _value.getMat(), mask = _mask.getMat();
    
    CV_Assert( checkScalar(value, type(), _value.kind(), _InputArray::MAT ));
    CV_Assert( mask.empty() || mask.type() == CV_8U );
    
    size_t esz = elemSize();
    BinaryFunc copymask = getCopyMaskFunc(esz);
    
    const Mat* arrays[] = { this, !mask.empty() ? &mask : 0, 0 };
    uchar* ptrs[2]={0,0};
    NAryMatIterator it(arrays, ptrs);
    int total = (int)it.size, blockSize0 = std::min(total, (int)((BLOCK_SIZE + esz-1)/esz));
    AutoBuffer<uchar> _scbuf(blockSize0*esz + 32);
    uchar* scbuf = alignPtr((uchar*)_scbuf, (int)sizeof(double));
    convertAndUnrollScalar( value, type(), scbuf, blockSize0 );
    
    for( size_t i = 0; i < it.nplanes; i++, ++it )
    {
        for( int j = 0; j < total; j += blockSize0 )
        {
            Size sz(std::min(blockSize0, total - j), 1);
            size_t blockSize = sz.width*esz;
            if( ptrs[1] )
            {
                copymask(scbuf, 0, ptrs[1], 0, ptrs[0], 0, sz, &esz);
                ptrs[1] += sz.width;
            }
            else
                memcpy(ptrs[0], scbuf, blockSize);
            ptrs[0] += blockSize;
        }
    }
    return *this;
}
Пример #8
0
template<class CastOp, class VecOp> void
pyrUp_( const Mat& _src, Mat& _dst, int)
{
    const int PU_SZ = 3;
    typedef typename CastOp::type1 WT;
    typedef typename CastOp::rtype T;

    Size ssize = _src.size(), dsize = _dst.size();
    int cn = _src.channels();
    int bufstep = (int)alignSize((dsize.width+1)*cn, 16);
    AutoBuffer<WT> _buf(bufstep*PU_SZ + 16);
    WT* buf = alignPtr((WT*)_buf, 16);
    AutoBuffer<int> _dtab(ssize.width*cn);
    int* dtab = _dtab;
    WT* rows[PU_SZ];
    CastOp castOp;
    VecOp vecOp;

    CV_Assert( std::abs(dsize.width - ssize.width*2) == dsize.width % 2 &&
               std::abs(dsize.height - ssize.height*2) == dsize.height % 2);
    int k, x, sy0 = -PU_SZ/2, sy = sy0;

    ssize.width *= cn;
    dsize.width *= cn;

    for( x = 0; x < ssize.width; x++ )
        dtab[x] = (x/cn)*2*cn + x % cn;

    for( int y = 0; y < ssize.height; y++ )
    {
        T* dst0 = (T*)(_dst.data + _dst.step*y*2);
        T* dst1 = (T*)(_dst.data + _dst.step*(y*2+1));
        WT *row0, *row1, *row2;

        if( y*2+1 >= dsize.height )
            dst1 = dst0;

        // fill the ring buffer (horizontal convolution and decimation)
        for( ; sy <= y + 1; sy++ )
        {
            WT* row = buf + ((sy - sy0) % PU_SZ)*bufstep;
            int _sy = borderInterpolate(sy*2, dsize.height, BORDER_REFLECT_101)/2;
            const T* src = (const T*)(_src.data + _src.step*_sy);

            if( ssize.width == cn )
            {
                for( x = 0; x < cn; x++ )
                    row[x] = row[x + cn] = src[x]*8;
                continue;
            }

            for( x = 0; x < cn; x++ )
            {
                int dx = dtab[x];
                WT t0 = src[x]*6 + src[x + cn]*2;
                WT t1 = (src[x] + src[x + cn])*4;
                row[dx] = t0; row[dx + cn] = t1;
                dx = dtab[ssize.width - cn + x];
                int sx = ssize.width - cn + x;
                t0 = src[sx - cn] + src[sx]*7;
                t1 = src[sx]*8;
                row[dx] = t0; row[dx + cn] = t1;
            }

            for( x = cn; x < ssize.width - cn; x++ )
            {
                int dx = dtab[x];
                WT t0 = src[x-cn] + src[x]*6 + src[x+cn];
                WT t1 = (src[x] + src[x+cn])*4;
                row[dx] = t0;
                row[dx+cn] = t1;
            }
        }

        // do vertical convolution and decimation and write the result to the destination image
        for( k = 0; k < PU_SZ; k++ )
            rows[k] = buf + ((y - PU_SZ/2 + k - sy0) % PU_SZ)*bufstep;
        row0 = rows[0]; row1 = rows[1]; row2 = rows[2];

        x = vecOp(rows, dst0, (int)_dst.step, dsize.width);
        for( ; x < dsize.width; x++ )
        {
            T t1 = castOp((row1[x] + row2[x])*4);
            T t0 = castOp(row0[x] + row1[x]*6 + row2[x]);
            dst1[x] = t1; dst0[x] = t0;
        }
    }
}
Пример #9
0
template<class CastOp, class VecOp> void
pyrDown_( const Mat& _src, Mat& _dst, int borderType )
{
    const int PD_SZ = 5;
    typedef typename CastOp::type1 WT;
    typedef typename CastOp::rtype T;

    CV_Assert( !_src.empty() );
    Size ssize = _src.size(), dsize = _dst.size();
    int cn = _src.channels();
    int bufstep = (int)alignSize(dsize.width*cn, 16);
    AutoBuffer<WT> _buf(bufstep*PD_SZ + 16);
    WT* buf = alignPtr((WT*)_buf, 16);
    int tabL[CV_CN_MAX*(PD_SZ+2)], tabR[CV_CN_MAX*(PD_SZ+2)];
    AutoBuffer<int> _tabM(dsize.width*cn);
    int* tabM = _tabM;
    WT* rows[PD_SZ];
    CastOp castOp;
    VecOp vecOp;

    CV_Assert( ssize.width > 0 && ssize.height > 0 &&
               std::abs(dsize.width*2 - ssize.width) <= 2 &&
               std::abs(dsize.height*2 - ssize.height) <= 2 );
    int k, x, sy0 = -PD_SZ/2, sy = sy0, width0 = std::min((ssize.width-PD_SZ/2-1)/2 + 1, dsize.width);

    for( x = 0; x <= PD_SZ+1; x++ )
    {
        int sx0 = borderInterpolate(x - PD_SZ/2, ssize.width, borderType)*cn;
        int sx1 = borderInterpolate(x + width0*2 - PD_SZ/2, ssize.width, borderType)*cn;
        for( k = 0; k < cn; k++ )
        {
            tabL[x*cn + k] = sx0 + k;
            tabR[x*cn + k] = sx1 + k;
        }
    }

    ssize.width *= cn;
    dsize.width *= cn;
    width0 *= cn;

    for( x = 0; x < dsize.width; x++ )
        tabM[x] = (x/cn)*2*cn + x % cn;

    for( int y = 0; y < dsize.height; y++ )
    {
        T* dst = (T*)(_dst.data + _dst.step*y);
        WT *row0, *row1, *row2, *row3, *row4;

        // fill the ring buffer (horizontal convolution and decimation)
        for( ; sy <= y*2 + 2; sy++ )
        {
            WT* row = buf + ((sy - sy0) % PD_SZ)*bufstep;
            int _sy = borderInterpolate(sy, ssize.height, borderType);
            const T* src = (const T*)(_src.data + _src.step*_sy);
            int limit = cn;
            const int* tab = tabL;

            for( x = 0;;)
            {
                for( ; x < limit; x++ )
                {
                    row[x] = src[tab[x+cn*2]]*6 + (src[tab[x+cn]] + src[tab[x+cn*3]])*4 +
                        src[tab[x]] + src[tab[x+cn*4]];
                }

                if( x == dsize.width )
                    break;

                if( cn == 1 )
                {
                    for( ; x < width0; x++ )
                        row[x] = src[x*2]*6 + (src[x*2 - 1] + src[x*2 + 1])*4 +
                            src[x*2 - 2] + src[x*2 + 2];
                }
                else if( cn == 3 )
                {
                    for( ; x < width0; x += 3 )
                    {
                        const T* s = src + x*2;
                        WT t0 = s[0]*6 + (s[-3] + s[3])*4 + s[-6] + s[6];
                        WT t1 = s[1]*6 + (s[-2] + s[4])*4 + s[-5] + s[7];
                        WT t2 = s[2]*6 + (s[-1] + s[5])*4 + s[-4] + s[8];
                        row[x] = t0; row[x+1] = t1; row[x+2] = t2;
                    }
                }
                else if( cn == 4 )
                {
                    for( ; x < width0; x += 4 )
                    {
                        const T* s = src + x*2;
                        WT t0 = s[0]*6 + (s[-4] + s[4])*4 + s[-8] + s[8];
                        WT t1 = s[1]*6 + (s[-3] + s[5])*4 + s[-7] + s[9];
                        row[x] = t0; row[x+1] = t1;
                        t0 = s[2]*6 + (s[-2] + s[6])*4 + s[-6] + s[10];
                        t1 = s[3]*6 + (s[-1] + s[7])*4 + s[-5] + s[11];
                        row[x+2] = t0; row[x+3] = t1;
                    }
                }
                else
                {
                    for( ; x < width0; x++ )
                    {
                        int sx = tabM[x];
                        row[x] = src[sx]*6 + (src[sx - cn] + src[sx + cn])*4 +
                            src[sx - cn*2] + src[sx + cn*2];
                    }
                }

                limit = dsize.width;
                tab = tabR - x;
            }
        }

        // do vertical convolution and decimation and write the result to the destination image
        for( k = 0; k < PD_SZ; k++ )
            rows[k] = buf + ((y*2 - PD_SZ/2 + k - sy0) % PD_SZ)*bufstep;
        row0 = rows[0]; row1 = rows[1]; row2 = rows[2]; row3 = rows[3]; row4 = rows[4];

        x = vecOp(rows, dst, (int)_dst.step, dsize.width);
        for( ; x < dsize.width; x++ )
            dst[x] = castOp(row2[x]*6 + (row1[x] + row3[x])*4 + row0[x] + row4[x]);
    }
}
Пример #10
0
static Rect maskBoundingRect( const Mat& img )
{
    CV_Assert( img.depth() <= CV_8S && img.channels() == 1 );

    Size size = img.size();
    int xmin = size.width, ymin = -1, xmax = -1, ymax = -1, i, j, k;

    for( i = 0; i < size.height; i++ )
    {
        const uchar* _ptr = img.ptr(i);
        const uchar* ptr = (const uchar*)alignPtr(_ptr, 4);
        int have_nz = 0, k_min, offset = (int)(ptr - _ptr);
        j = 0;
        offset = MIN(offset, size.width);
        for( ; j < offset; j++ )
            if( _ptr[j] )
            {
                have_nz = 1;
                break;
            }
        if( j < offset )
        {
            if( j < xmin )
                xmin = j;
            if( j > xmax )
                xmax = j;
        }
        if( offset < size.width )
        {
            xmin -= offset;
            xmax -= offset;
            size.width -= offset;
            j = 0;
            for( ; j <= xmin - 4; j += 4 )
                if( *((int*)(ptr+j)) )
                    break;
            for( ; j < xmin; j++ )
                if( ptr[j] )
                {
                    xmin = j;
                    if( j > xmax )
                        xmax = j;
                    have_nz = 1;
                    break;
                }
            k_min = MAX(j-1, xmax);
            k = size.width - 1;
            for( ; k > k_min && (k&3) != 3; k-- )
                if( ptr[k] )
                    break;
            if( k > k_min && (k&3) == 3 )
            {
                for( ; k > k_min+3; k -= 4 )
                    if( *((int*)(ptr+k-3)) )
                        break;
            }
            for( ; k > k_min; k-- )
                if( ptr[k] )
                {
                    xmax = k;
                    have_nz = 1;
                    break;
                }
            if( !have_nz )
            {
                j &= ~3;
                for( ; j <= k - 3; j += 4 )
                    if( *((int*)(ptr+j)) )
                        break;
                for( ; j <= k; j++ )
                    if( ptr[j] )
                    {
                        have_nz = 1;
                        break;
                    }
            }
            xmin += offset;
            xmax += offset;
            size.width += offset;
        }
        if( have_nz )
        {
            if( ymin < 0 )
                ymin = i;
            ymax = i;
        }
    }

    if( xmin >= size.width )
        xmin = ymin = 0;
    return Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1);
}
Пример #11
0
/*
** seeColon (for proctologists only)
** Walks a colon definition, decompiling
** on the fly. Knows about primitive control structures.
*/
static void seeColon(FICL_VM *pVM, CELL *pc)
{
	char *cp;
    CELL *param0 = pc;
    FICL_DICT *pd = vmGetDict(pVM);
	FICL_WORD *pSemiParen = ficlLookup(pVM->pSys, "(;)");
    assert(pSemiParen);

    for (; pc->p != pSemiParen; pc++)
    {
        FICL_WORD *pFW = (FICL_WORD *)(pc->p);

        cp = pVM->pad;
		if ((void *)pc == (void *)pVM->ip)
			*cp++ = '>';
		else
			*cp++ = ' ';
        cp += sprintf(cp, "%3d   ", pc-param0);
        
        if (isAFiclWord(pd, pFW))
        {
            WORDKIND kind = ficlWordClassify(pFW);
            CELL c;

            switch (kind)
            {
            case LITERAL:
                c = *++pc;
                if (isAFiclWord(pd, c.p))
                {
                    FICL_WORD *pLit = (FICL_WORD *)c.p;
                    sprintf(cp, "%.*s ( %#lx literal )", 
                        pLit->nName, pLit->name, (unsigned long)c.u);
                }
                else
                    sprintf(cp, "literal %ld (%#lx)",
                        (long)c.i, (unsigned long)c.u);
                break;
            case STRINGLIT:
                {
                    FICL_STRING *sp = (FICL_STRING *)(void *)++pc;
                    pc = (CELL *)alignPtr(sp->text + sp->count + 1) - 1;
                    sprintf(cp, "s\" %.*s\"", sp->count, sp->text);
                }
                break;
            case CSTRINGLIT:
                {
                    FICL_STRING *sp = (FICL_STRING *)(void *)++pc;
                    pc = (CELL *)alignPtr(sp->text + sp->count + 1) - 1;
                    sprintf(cp, "c\" %.*s\"", sp->count, sp->text);
                }
                break;
            case IF:
                c = *++pc;
                if (c.i > 0)
                    sprintf(cp, "if / while (branch %d)", pc+c.i-param0);
                else
                    sprintf(cp, "until (branch %d)",      pc+c.i-param0);
                break;                                                           
            case BRANCH:
                c = *++pc;
                if (c.i == 0)
                    sprintf(cp, "repeat (branch %d)",     pc+c.i-param0);
                else if (c.i == 1)
                    sprintf(cp, "else (branch %d)",       pc+c.i-param0);
                else
                    sprintf(cp, "endof (branch %d)",       pc+c.i-param0);
                break;

            case OF:
                c = *++pc;
                sprintf(cp, "of (branch %d)",       pc+c.i-param0);
                break;

            case QDO:
                c = *++pc;
                sprintf(cp, "?do (leave %d)",  (CELL *)c.p-param0);
                break;
            case DO:
                c = *++pc;
                sprintf(cp, "do (leave %d)", (CELL *)c.p-param0);
                break;
            case LOOP:
                c = *++pc;
                sprintf(cp, "loop (branch %d)", pc+c.i-param0);
                break;
            case PLOOP:
                c = *++pc;
                sprintf(cp, "+loop (branch %d)", pc+c.i-param0);
                break;
            default:
                sprintf(cp, "%.*s", pFW->nName, pFW->name);
                break;
            }
 
        }
        else /* probably not a word - punt and print value */
        {
            sprintf(cp, "%ld ( %#lx )", (long)pc->i, (unsigned long)pc->u);
        }

		vmTextOut(pVM, pVM->pad, 1);
    }

    vmTextOut(pVM, ";", 1);
}
Пример #12
0
void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bool nonmax_suppression)
{
    Mat img = _img.getMat();
    const int K = patternSize/2, N = patternSize + K + 1;
    int i, j, k, pixel[25];
    makeOffsets(pixel, (int)img.step, patternSize);

#if CV_SIMD128
    const int quarterPatternSize = patternSize/4;
    v_uint8x16 delta = v_setall_u8(0x80), t = v_setall_u8((char)threshold), K16 = v_setall_u8((char)K);
    bool hasSimd = hasSIMD128();
#if CV_TRY_AVX2
    Ptr<opt_AVX2::FAST_t_patternSize16_AVX2> fast_t_impl_avx2;
    if(CV_CPU_HAS_SUPPORT_AVX2)
        fast_t_impl_avx2 = opt_AVX2::FAST_t_patternSize16_AVX2::getImpl(img.cols, threshold, nonmax_suppression, pixel);
#endif

#endif

    keypoints.clear();

    threshold = std::min(std::max(threshold, 0), 255);

    uchar threshold_tab[512];
    for( i = -255; i <= 255; i++ )
        threshold_tab[i+255] = (uchar)(i < -threshold ? 1 : i > threshold ? 2 : 0);

    AutoBuffer<uchar> _buf((img.cols+16)*3*(sizeof(int) + sizeof(uchar)) + 128);
    uchar* buf[3];
    buf[0] = _buf.data(); buf[1] = buf[0] + img.cols; buf[2] = buf[1] + img.cols;
    int* cpbuf[3];
    cpbuf[0] = (int*)alignPtr(buf[2] + img.cols, sizeof(int)) + 1;
    cpbuf[1] = cpbuf[0] + img.cols + 1;
    cpbuf[2] = cpbuf[1] + img.cols + 1;
    memset(buf[0], 0, img.cols*3);

    for(i = 3; i < img.rows-2; i++)
    {
        const uchar* ptr = img.ptr<uchar>(i) + 3;
        uchar* curr = buf[(i - 3)%3];
        int* cornerpos = cpbuf[(i - 3)%3];
        memset(curr, 0, img.cols);
        int ncorners = 0;

        if( i < img.rows - 3 )
        {
            j = 3;
#if CV_SIMD128
            if( hasSimd )
            {
                if( patternSize == 16 )
                {
#if CV_TRY_AVX2
                    if (fast_t_impl_avx2)
                        fast_t_impl_avx2->process(j, ptr, curr, cornerpos, ncorners);
#endif
                    //vz if (j <= (img.cols - 27)) //it doesn't make sense using vectors for less than 8 elements
                    {
                        for (; j < img.cols - 16 - 3; j += 16, ptr += 16)
                        {
                            v_uint8x16 v = v_load(ptr);
                            v_int8x16 v0 = v_reinterpret_as_s8((v + t) ^ delta);
                            v_int8x16 v1 = v_reinterpret_as_s8((v - t) ^ delta);

                            v_int8x16 x0 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[0]), delta));
                            v_int8x16 x1 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[quarterPatternSize]), delta));
                            v_int8x16 x2 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[2*quarterPatternSize]), delta));
                            v_int8x16 x3 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[3*quarterPatternSize]), delta));

                            v_int8x16 m0, m1;
                            m0 = (v0 < x0) & (v0 < x1);
                            m1 = (x0 < v1) & (x1 < v1);
                            m0 = m0 | ((v0 < x1) & (v0 < x2));
                            m1 = m1 | ((x1 < v1) & (x2 < v1));
                            m0 = m0 | ((v0 < x2) & (v0 < x3));
                            m1 = m1 | ((x2 < v1) & (x3 < v1));
                            m0 = m0 | ((v0 < x3) & (v0 < x0));
                            m1 = m1 | ((x3 < v1) & (x0 < v1));
                            m0 = m0 | m1;

                            int mask = v_signmask(m0);
                            if( mask == 0 )
                                continue;
                            if( (mask & 255) == 0 )
                            {
                                j -= 8;
                                ptr -= 8;
                                continue;
                            }

                            v_int8x16 c0 = v_setzero_s8();
                            v_int8x16 c1 = v_setzero_s8();
                            v_uint8x16 max0 = v_setzero_u8();
                            v_uint8x16 max1 = v_setzero_u8();
                            for( k = 0; k < N; k++ )
                            {
                                v_int8x16 x = v_reinterpret_as_s8(v_load((ptr + pixel[k])) ^ delta);
                                m0 = v0 < x;
                                m1 = x < v1;

                                c0 = v_sub_wrap(c0, m0) & m0;
                                c1 = v_sub_wrap(c1, m1) & m1;

                                max0 = v_max(max0, v_reinterpret_as_u8(c0));
                                max1 = v_max(max1, v_reinterpret_as_u8(c1));
                            }

                            max0 = v_max(max0, max1);
                            int m = v_signmask(K16 < max0);

                            for( k = 0; m > 0 && k < 16; k++, m >>= 1 )
                            {
                                if(m & 1)
                                {
                                    cornerpos[ncorners++] = j+k;
                                    if(nonmax_suppression)
                                        curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold);
                                }
                            }
                        }
                    }
                }
            }
#endif
            for( ; j < img.cols - 3; j++, ptr++ )
            {
                int v = ptr[0];
                const uchar* tab = &threshold_tab[0] - v + 255;
                int d = tab[ptr[pixel[0]]] | tab[ptr[pixel[8]]];

                if( d == 0 )
                    continue;

                d &= tab[ptr[pixel[2]]] | tab[ptr[pixel[10]]];
                d &= tab[ptr[pixel[4]]] | tab[ptr[pixel[12]]];
                d &= tab[ptr[pixel[6]]] | tab[ptr[pixel[14]]];

                if( d == 0 )
                    continue;

                d &= tab[ptr[pixel[1]]] | tab[ptr[pixel[9]]];
                d &= tab[ptr[pixel[3]]] | tab[ptr[pixel[11]]];
                d &= tab[ptr[pixel[5]]] | tab[ptr[pixel[13]]];
                d &= tab[ptr[pixel[7]]] | tab[ptr[pixel[15]]];

                if( d & 1 )
                {
                    int vt = v - threshold, count = 0;

                    for( k = 0; k < N; k++ )
                    {
                        int x = ptr[pixel[k]];
                        if(x < vt)
                        {
                            if( ++count > K )
                            {
                                cornerpos[ncorners++] = j;
                                if(nonmax_suppression)
                                    curr[j] = (uchar)cornerScore<patternSize>(ptr, pixel, threshold);
                                break;
                            }
                        }
                        else
                            count = 0;
                    }
                }

                if( d & 2 )
                {
                    int vt = v + threshold, count = 0;

                    for( k = 0; k < N; k++ )
                    {
                        int x = ptr[pixel[k]];
                        if(x > vt)
                        {
                            if( ++count > K )
                            {
                                cornerpos[ncorners++] = j;
                                if(nonmax_suppression)
                                    curr[j] = (uchar)cornerScore<patternSize>(ptr, pixel, threshold);
                                break;
                            }
                        }
                        else
                            count = 0;
                    }
                }
            }
        }

        cornerpos[-1] = ncorners;

        if( i == 3 )
            continue;

        const uchar* prev = buf[(i - 4 + 3)%3];
        const uchar* pprev = buf[(i - 5 + 3)%3];
        cornerpos = cpbuf[(i - 4 + 3)%3];
        ncorners = cornerpos[-1];

        for( k = 0; k < ncorners; k++ )
        {
            j = cornerpos[k];
            int score = prev[j];
            if( !nonmax_suppression ||
               (score > prev[j+1] && score > prev[j-1] &&
                score > pprev[j-1] && score > pprev[j] && score > pprev[j+1] &&
                score > curr[j-1] && score > curr[j] && score > curr[j+1]) )
            {
                keypoints.push_back(KeyPoint((float)j, (float)(i-1), 7.f, -1, (float)score));
            }
        }
    }
Пример #13
0
void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bool nonmax_suppression)
{
    Mat img = _img.getMat();
    const int K = patternSize/2, N = patternSize + K + 1;
#if CV_SSE2
    const int quarterPatternSize = patternSize/4;
    (void)quarterPatternSize;
#endif
    int i, j, k, pixel[25];
    makeOffsets(pixel, (int)img.step, patternSize);

    keypoints.clear();

    threshold = std::min(std::max(threshold, 0), 255);

#if CV_SSE2
    __m128i delta = _mm_set1_epi8(-128), t = _mm_set1_epi8((char)threshold), K16 = _mm_set1_epi8((char)K);
    (void)K16;
    (void)delta;
    (void)t;
#endif
    uchar threshold_tab[512];
    for( i = -255; i <= 255; i++ )
        threshold_tab[i+255] = (uchar)(i < -threshold ? 1 : i > threshold ? 2 : 0);

    AutoBuffer<uchar> _buf((img.cols+16)*3*(sizeof(int) + sizeof(uchar)) + 128);
    uchar* buf[3];
    buf[0] = _buf; buf[1] = buf[0] + img.cols; buf[2] = buf[1] + img.cols;
    int* cpbuf[3];
    cpbuf[0] = (int*)alignPtr(buf[2] + img.cols, sizeof(int)) + 1;
    cpbuf[1] = cpbuf[0] + img.cols + 1;
    cpbuf[2] = cpbuf[1] + img.cols + 1;
    memset(buf[0], 0, img.cols*3);

    for(i = 3; i < img.rows-2; i++)
    {
        const uchar* ptr = img.ptr<uchar>(i) + 3;
        uchar* curr = buf[(i - 3)%3];
        int* cornerpos = cpbuf[(i - 3)%3];
        memset(curr, 0, img.cols);
        int ncorners = 0;

        if( i < img.rows - 3 )
        {
            j = 3;
    #if CV_SSE2
            if( patternSize == 16 )
            {
                for(; j < img.cols - 16 - 3; j += 16, ptr += 16)
                {
                    __m128i m0, m1;
                    __m128i v0 = _mm_loadu_si128((const __m128i*)ptr);
                    __m128i v1 = _mm_xor_si128(_mm_subs_epu8(v0, t), delta);
                    v0 = _mm_xor_si128(_mm_adds_epu8(v0, t), delta);

                    __m128i x0 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[0])), delta);
                    __m128i x1 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[quarterPatternSize])), delta);
                    __m128i x2 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[2*quarterPatternSize])), delta);
                    __m128i x3 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[3*quarterPatternSize])), delta);
                    m0 = _mm_and_si128(_mm_cmpgt_epi8(x0, v0), _mm_cmpgt_epi8(x1, v0));
                    m1 = _mm_and_si128(_mm_cmpgt_epi8(v1, x0), _mm_cmpgt_epi8(v1, x1));
                    m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x1, v0), _mm_cmpgt_epi8(x2, v0)));
                    m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x1), _mm_cmpgt_epi8(v1, x2)));
                    m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x2, v0), _mm_cmpgt_epi8(x3, v0)));
                    m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x2), _mm_cmpgt_epi8(v1, x3)));
                    m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x3, v0), _mm_cmpgt_epi8(x0, v0)));
                    m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x3), _mm_cmpgt_epi8(v1, x0)));
                    m0 = _mm_or_si128(m0, m1);
                    int mask = _mm_movemask_epi8(m0);
                    if( mask == 0 )
                        continue;
                    if( (mask & 255) == 0 )
                    {
                        j -= 8;
                        ptr -= 8;
                        continue;
                    }

                    __m128i c0 = _mm_setzero_si128(), c1 = c0, max0 = c0, max1 = c0;
                    for( k = 0; k < N; k++ )
                    {
                        __m128i x = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(ptr + pixel[k])), delta);
                        m0 = _mm_cmpgt_epi8(x, v0);
                        m1 = _mm_cmpgt_epi8(v1, x);

                        c0 = _mm_and_si128(_mm_sub_epi8(c0, m0), m0);
                        c1 = _mm_and_si128(_mm_sub_epi8(c1, m1), m1);

                        max0 = _mm_max_epu8(max0, c0);
                        max1 = _mm_max_epu8(max1, c1);
                    }

                    max0 = _mm_max_epu8(max0, max1);
                    int m = _mm_movemask_epi8(_mm_cmpgt_epi8(max0, K16));

                    for( k = 0; m > 0 && k < 16; k++, m >>= 1 )
                        if(m & 1)
                        {
                            cornerpos[ncorners++] = j+k;
                            if(nonmax_suppression)
                                curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold);
                        }
                }
            }
    #endif
            for( ; j < img.cols - 3; j++, ptr++ )
            {
                int v = ptr[0];
                const uchar* tab = &threshold_tab[0] - v + 255;
                int d = tab[ptr[pixel[0]]] | tab[ptr[pixel[8]]];

                if( d == 0 )
                    continue;

                d &= tab[ptr[pixel[2]]] | tab[ptr[pixel[10]]];
                d &= tab[ptr[pixel[4]]] | tab[ptr[pixel[12]]];
                d &= tab[ptr[pixel[6]]] | tab[ptr[pixel[14]]];

                if( d == 0 )
                    continue;

                d &= tab[ptr[pixel[1]]] | tab[ptr[pixel[9]]];
                d &= tab[ptr[pixel[3]]] | tab[ptr[pixel[11]]];
                d &= tab[ptr[pixel[5]]] | tab[ptr[pixel[13]]];
                d &= tab[ptr[pixel[7]]] | tab[ptr[pixel[15]]];

                if( d & 1 )
                {
                    int vt = v - threshold, count = 0;

                    for( k = 0; k < N; k++ )
                    {
                        int x = ptr[pixel[k]];
                        if(x < vt)
                        {
                            if( ++count > K )
                            {
                                cornerpos[ncorners++] = j;
                                if(nonmax_suppression)
                                    curr[j] = (uchar)cornerScore<patternSize>(ptr, pixel, threshold);
                                break;
                            }
                        }
                        else
                            count = 0;
                    }
                }

                if( d & 2 )
                {
                    int vt = v + threshold, count = 0;

                    for( k = 0; k < N; k++ )
                    {
                        int x = ptr[pixel[k]];
                        if(x > vt)
                        {
                            if( ++count > K )
                            {
                                cornerpos[ncorners++] = j;
                                if(nonmax_suppression)
                                    curr[j] = (uchar)cornerScore<patternSize>(ptr, pixel, threshold);
                                break;
                            }
                        }
                        else
                            count = 0;
                    }
                }
            }
        }

        cornerpos[-1] = ncorners;

        if( i == 3 )
            continue;

        const uchar* prev = buf[(i - 4 + 3)%3];
        const uchar* pprev = buf[(i - 5 + 3)%3];
        cornerpos = cpbuf[(i - 4 + 3)%3];
        ncorners = cornerpos[-1];

        for( k = 0; k < ncorners; k++ )
        {
            j = cornerpos[k];
            int score = prev[j];
            if( !nonmax_suppression ||
               (score > prev[j+1] && score > prev[j-1] &&
                score > pprev[j-1] && score > pprev[j] && score > pprev[j+1] &&
                score > curr[j-1] && score > curr[j] && score > curr[j+1]) )
            {
                keypoints.push_back(KeyPoint((float)j, (float)(i-1), 7.f, -1, (float)score));
            }
        }
    }
Пример #14
0
void cv::merge(const Mat* mv, size_t n, OutputArray _dst)
{
    CV_Assert( mv && n > 0 );
    
    int depth = mv[0].depth();
    bool allch1 = true;
    int k, cn = 0;
    size_t i;
    
    for( i = 0; i < n; i++ )
    {
        CV_Assert(mv[i].size == mv[0].size && mv[i].depth() == depth);
        allch1 = allch1 && mv[i].channels() == 1;
        cn += mv[i].channels();
    }
    
    CV_Assert( 0 < cn && cn <= CV_CN_MAX );
    _dst.create(mv[0].dims, mv[0].size, CV_MAKETYPE(depth, cn));
    Mat dst = _dst.getMat();
    
    if( n == 1 )
    {
        mv[0].copyTo(dst);
        return;
    }
    
    if( !allch1 )
    {
        AutoBuffer<int> pairs(cn*2);
        int j, ni=0;
        
        for( i = 0, j = 0; i < n; i++, j += ni )
        {
            ni = mv[i].channels();
            for( k = 0; k < ni; k++ )
            {
                pairs[(j+k)*2] = j + k;
                pairs[(j+k)*2+1] = j + k;
            }
        }
        mixChannels( mv, n, &dst, 1, &pairs[0], cn );
        return;
    }
        
    size_t esz = dst.elemSize(), esz1 = dst.elemSize1();
    int blocksize0 = (int)((BLOCK_SIZE + esz-1)/esz);
    AutoBuffer<uchar> _buf((cn+1)*(sizeof(Mat*) + sizeof(uchar*)) + 16);
    const Mat** arrays = (const Mat**)(uchar*)_buf;
    uchar** ptrs = (uchar**)alignPtr(arrays + cn + 1, 16);
    
    arrays[0] = &dst;
    for( k = 0; k < cn; k++ )
        arrays[k+1] = &mv[k];
    
    NAryMatIterator it(arrays, ptrs, cn+1);
    int total = (int)it.size, blocksize = cn <= 4 ? total : std::min(total, blocksize0);
    MergeFunc func = mergeTab[depth];
    
    for( i = 0; i < it.nplanes; i++, ++it )
    {
        for( int j = 0; j < total; j += blocksize )
        {
            int bsz = std::min(total - j, blocksize);
            func( (const uchar**)&ptrs[1], ptrs[0], bsz, cn );
            
            if( j + blocksize < total )
            {
                ptrs[0] += bsz*esz;
                for( int k = 0; k < cn; k++ )
                    ptrs[k+1] += bsz*esz1;
            }
        }
    }
}
Пример #15
0
/**************************************************************************
                        d i c t A l i g n
** Align the dictionary's free space pointer
**************************************************************************/
void dictAlign(FICL_DICT *pDict)
{
    pDict->here = alignPtr(pDict->here);
}
Пример #16
0
    void operator()(const Range &boundaries) const
    {
        CV_TRACE_FUNCTION();

        Mat dx, dy;
        AutoBuffer<short> dxMax(0), dyMax(0);
        std::deque<uchar*> stack, borderPeaksLocal;
        const int rowStart = max(0, boundaries.start - 1), rowEnd = min(src.rows, boundaries.end + 1);
        int *_mag_p, *_mag_a, *_mag_n;
        short *_dx, *_dy, *_dx_a = NULL, *_dy_a = NULL, *_dx_n = NULL, *_dy_n = NULL;
        uchar *_pmap;
        double scale = 1.0;

        CV_TRACE_REGION("gradient")
        if(needGradient)
        {
            if (aperture_size == 7)
            {
                scale = 1 / 16.0;
            }
            Sobel(src.rowRange(rowStart, rowEnd), dx, CV_16S, 1, 0, aperture_size, scale, 0, BORDER_REPLICATE);
            Sobel(src.rowRange(rowStart, rowEnd), dy, CV_16S, 0, 1, aperture_size, scale, 0, BORDER_REPLICATE);
        }
        else
        {
            dx = src.rowRange(rowStart, rowEnd);
            dy = src2.rowRange(rowStart, rowEnd);
        }

        CV_TRACE_REGION_NEXT("magnitude");
        if(cn > 1)
        {
            dxMax.allocate(2 * dx.cols);
            dyMax.allocate(2 * dy.cols);
            _dx_a = (short*)dxMax;
            _dx_n = _dx_a + dx.cols;
            _dy_a = (short*)dyMax;
            _dy_n = _dy_a + dy.cols;
        }

        // _mag_p: previous row, _mag_a: actual row, _mag_n: next row
#if CV_SIMD128
        AutoBuffer<int> buffer(3 * (mapstep * cn + CV_MALLOC_SIMD128));
        _mag_p = alignPtr((int*)buffer + 1, CV_MALLOC_SIMD128);
        _mag_a = alignPtr(_mag_p + mapstep * cn, CV_MALLOC_SIMD128);
        _mag_n = alignPtr(_mag_a + mapstep * cn, CV_MALLOC_SIMD128);
#else
        AutoBuffer<int> buffer(3 * (mapstep * cn));
        _mag_p = (int*)buffer + 1;
        _mag_a = _mag_p + mapstep * cn;
        _mag_n = _mag_a + mapstep * cn;
#endif

        // For the first time when just 2 rows are filled and for left and right borders
        if(rowStart == boundaries.start)
            memset(_mag_n - 1, 0, mapstep * sizeof(int));
        else
            _mag_n[src.cols] = _mag_n[-1] = 0;

        _mag_a[src.cols] = _mag_a[-1] = _mag_p[src.cols] = _mag_p[-1] = 0;

        // calculate magnitude and angle of gradient, perform non-maxima suppression.
        // fill the map with one of the following values:
        //   0 - the pixel might belong to an edge
        //   1 - the pixel can not belong to an edge
        //   2 - the pixel does belong to an edge
        for (int i = rowStart; i <= boundaries.end; ++i)
        {
            // Scroll the ring buffer
            std::swap(_mag_n, _mag_a);
            std::swap(_mag_n, _mag_p);

            if(i < rowEnd)
            {
                // Next row calculation
                _dx = dx.ptr<short>(i - rowStart);
                _dy = dy.ptr<short>(i - rowStart);

                if (L2gradient)
                {
                    int j = 0, width = src.cols * cn;
#if CV_SIMD128
                    if (haveSIMD)
                    {
                       for ( ; j <= width - 8; j += 8)
                        {
                            v_int16x8 v_dx = v_load((const short*)(_dx + j));
                            v_int16x8 v_dy = v_load((const short*)(_dy + j));

                            v_int32x4 v_dxp_low, v_dxp_high;
                            v_int32x4 v_dyp_low, v_dyp_high;
                            v_expand(v_dx, v_dxp_low, v_dxp_high);
                            v_expand(v_dy, v_dyp_low, v_dyp_high);

                            v_store_aligned((int *)(_mag_n + j), v_dxp_low*v_dxp_low+v_dyp_low*v_dyp_low);
                            v_store_aligned((int *)(_mag_n + j + 4), v_dxp_high*v_dxp_high+v_dyp_high*v_dyp_high);
                        }
                    }
#endif
                    for ( ; j < width; ++j)
                        _mag_n[j] = int(_dx[j])*_dx[j] + int(_dy[j])*_dy[j];
                }
                else
                {
                    int j = 0, width = src.cols * cn;
#if CV_SIMD128
                    if (haveSIMD)
                    {
                        for(; j <= width - 8; j += 8)
                        {
                            v_int16x8 v_dx = v_load((const short *)(_dx + j));
                            v_int16x8 v_dy = v_load((const short *)(_dy + j));

                            v_dx = v_reinterpret_as_s16(v_abs(v_dx));
                            v_dy = v_reinterpret_as_s16(v_abs(v_dy));

                            v_int32x4 v_dx_ml, v_dy_ml, v_dx_mh, v_dy_mh;
                            v_expand(v_dx, v_dx_ml, v_dx_mh);
                            v_expand(v_dy, v_dy_ml, v_dy_mh);

                            v_store_aligned((int *)(_mag_n + j), v_dx_ml + v_dy_ml);
                            v_store_aligned((int *)(_mag_n + j + 4), v_dx_mh + v_dy_mh);
                        }
                    }
#endif
                    for ( ; j < width; ++j)
                        _mag_n[j] = std::abs(int(_dx[j])) + std::abs(int(_dy[j]));
                }

                if(cn > 1)
                {
                    std::swap(_dx_n, _dx_a);
                    std::swap(_dy_n, _dy_a);

                    for(int j = 0, jn = 0; j < src.cols; ++j, jn += cn)
                    {
                        int maxIdx = jn;
                        for(int k = 1; k < cn; ++k)
                            if(_mag_n[jn + k] > _mag_n[maxIdx]) maxIdx = jn + k;

                        _mag_n[j] = _mag_n[maxIdx];
                        _dx_n[j] = _dx[maxIdx];
                        _dy_n[j] = _dy[maxIdx];
                    }

                    _mag_n[src.cols] = 0;
                }

                // at the very beginning we do not have a complete ring
                // buffer of 3 magnitude rows for non-maxima suppression
                if (i <= boundaries.start)
                    continue;
            }
            else
            {
                memset(_mag_n - 1, 0, mapstep * sizeof(int));

                if(cn > 1)
                {
                    std::swap(_dx_n, _dx_a);
                    std::swap(_dy_n, _dy_a);
                }
            }

            // From here actual src row is (i - 1)
            // Set left and right border to 1
#if CV_SIMD128
            if(haveSIMD)
                _pmap = map.ptr<uchar>(i) + CV_MALLOC_SIMD128;
            else
#endif
                _pmap = map.ptr<uchar>(i) + 1;

            _pmap[src.cols] =_pmap[-1] = 1;

            if(cn == 1)
            {
                _dx = dx.ptr<short>(i - rowStart - 1);
                _dy = dy.ptr<short>(i - rowStart - 1);
            }
            else
            {
                _dx = _dx_a;
                _dy = _dy_a;
            }

            const int TG22 = 13573;
            int j = 0;
#if CV_SIMD128
            if (haveSIMD)
            {
                const v_int32x4 v_low = v_setall_s32(low);
                const v_int8x16 v_one = v_setall_s8(1);

                for (; j <= src.cols - 32; j += 32)
                {
                    v_int32x4 v_m1 = v_load_aligned((const int*)(_mag_a + j));
                    v_int32x4 v_m2 = v_load_aligned((const int*)(_mag_a + j + 4));
                    v_int32x4 v_m3 = v_load_aligned((const int*)(_mag_a + j + 8));
                    v_int32x4 v_m4 = v_load_aligned((const int*)(_mag_a + j + 12));

                    v_int32x4 v_cmp1 = v_m1 > v_low;
                    v_int32x4 v_cmp2 = v_m2 > v_low;
                    v_int32x4 v_cmp3 = v_m3 > v_low;
                    v_int32x4 v_cmp4 = v_m4 > v_low;

                    v_m1 = v_load_aligned((const int*)(_mag_a + j + 16));
                    v_m2 = v_load_aligned((const int*)(_mag_a + j + 20));
                    v_m3 = v_load_aligned((const int*)(_mag_a + j + 24));
                    v_m4 = v_load_aligned((const int*)(_mag_a + j + 28));

                    v_store_aligned((signed char*)(_pmap + j), v_one);
                    v_store_aligned((signed char*)(_pmap + j + 16), v_one);

                    v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2);
                    v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4);

                    v_cmp1 = v_m1 > v_low;
                    v_cmp2 = v_m2 > v_low;
                    v_cmp3 = v_m3 > v_low;
                    v_cmp4 = v_m4 > v_low;

                    v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81);

                    v_cmp80 = v_pack(v_cmp1, v_cmp2);
                    v_cmp81 = v_pack(v_cmp3, v_cmp4);

                    unsigned int mask = v_signmask(v_cmp);

                    v_cmp = v_pack(v_cmp80, v_cmp81);
                    mask |= v_signmask(v_cmp) << 16;

                    if (mask)
                    {
                        int k = j;

                        do
                        {
                            int l = trailingZeros32(mask);
                            k += l;
                            mask >>= l;

                            int m = _mag_a[k];
                            short xs = _dx[k];
                            short ys = _dy[k];
                            int x = (int)std::abs(xs);
                            int y = (int)std::abs(ys) << 15;

                            int tg22x = x * TG22;

                            if (y < tg22x)
                            {
                                if (m > _mag_a[k - 1] && m >= _mag_a[k + 1])
                                {
                                    CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                }
                            }
                            else
                            {
                                int tg67x = tg22x + (x << 16);
                                if (y > tg67x)
                                {
                                    if (m > _mag_p[k] && m >= _mag_n[k])
                                    {
                                        CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                    }
                                }
                                else
                                {
                                    int s = (xs ^ ys) < 0 ? -1 : 1;
                                    if(m > _mag_p[k - s] && m > _mag_n[k + s])
                                    {
                                        CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                    }
                                }
                            }
                            ++k;
                        } while((mask >>= 1));
                    }
                }

                if (j <= src.cols - 16)
                {
                    v_int32x4 v_m1 = v_load_aligned((const int*)(_mag_a + j));
                    v_int32x4 v_m2 = v_load_aligned((const int*)(_mag_a + j + 4));
                    v_int32x4 v_m3 = v_load_aligned((const int*)(_mag_a + j + 8));
                    v_int32x4 v_m4 = v_load_aligned((const int*)(_mag_a + j + 12));

                    v_store_aligned((signed char*)(_pmap + j), v_one);

                    v_int32x4 v_cmp1 = v_m1 > v_low;
                    v_int32x4 v_cmp2 = v_m2 > v_low;
                    v_int32x4 v_cmp3 = v_m3 > v_low;
                    v_int32x4 v_cmp4 = v_m4 > v_low;

                    v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2);
                    v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4);

                    v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81);
                    unsigned int mask = v_signmask(v_cmp);

                    if (mask)
                    {
                        int k = j;

                        do
                        {
                            int l = trailingZeros32(mask);
                            k += l;
                            mask >>= l;

                            int m = _mag_a[k];
                            short xs = _dx[k];
                            short ys = _dy[k];
                            int x = (int)std::abs(xs);
                            int y = (int)std::abs(ys) << 15;

                            int tg22x = x * TG22;

                            if (y < tg22x)
                            {
                                if (m > _mag_a[k - 1] && m >= _mag_a[k + 1])
                                {
                                    CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                }
                            }
                            else
                            {
                                int tg67x = tg22x + (x << 16);
                                if (y > tg67x)
                                {
                                    if (m > _mag_p[k] && m >= _mag_n[k])
                                    {
                                        CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                    }
                                }
                                else
                                {
                                    int s = (xs ^ ys) < 0 ? -1 : 1;
                                    if(m > _mag_p[k - s] && m > _mag_n[k + s])
                                    {
                                        CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                    }
                                }
                            }
                            ++k;
                        } while((mask >>= 1));
                    }
                    j += 16;
                }
            }
#endif
            for (; j < src.cols; j++)
            {
                int m = _mag_a[j];

                if (m > low)
                {
                    short xs = _dx[j];
                    short ys = _dy[j];
                    int x = (int)std::abs(xs);
                    int y = (int)std::abs(ys) << 15;

                    int tg22x = x * TG22;

                    if (y < tg22x)
                    {
                        if (m > _mag_a[j - 1] && m >= _mag_a[j + 1])
                        {
                            CANNY_CHECK(m, high, (_pmap+j), stack);
                        }
                    }
                    else
                    {
                        int tg67x = tg22x + (x << 16);
                        if (y > tg67x)
                        {
                            if (m > _mag_p[j] && m >= _mag_n[j])
                            {
                                CANNY_CHECK(m, high, (_pmap+j), stack);
                            }
                        }
                        else
                        {
                            int s = (xs ^ ys) < 0 ? -1 : 1;
                            if(m > _mag_p[j - s] && m > _mag_n[j + s])
                            {
                                CANNY_CHECK(m, high, (_pmap+j), stack);
                            }
                        }
                    }
                }
                _pmap[j] = 1;
            }
        }