C++ (Cpp) _mm256_setr_epi32示例

示例#1

0

显示文件

文件： SimdAvx2BgrToGray.cpp 项目： pozdneev/Simd

        template <bool align> void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride)
        {
            assert(width >= A);
            if(align)
                assert(Aligned(gray) && Aligned(grayStride) && Aligned(bgr) && Aligned(bgrStride));

            size_t alignedWidth = AlignLo(width, A);

            __m256i _permuteBody = _mm256_setr_epi32(0, 1, 2, 0, 3, 4, 5, 0);
            __m256i _permuteTail = _mm256_setr_epi32(2, 3, 4, 0, 5, 6, 7, 0);

            __m256i _shuffle = _mm256_setr_epi8(
                0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1,
                0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1);

            for(size_t row = 0; row < height; ++row)
            {
                for(size_t col = 0; col < alignedWidth; col += A)
                    Store<align>((__m256i*)(gray + col), BgrToGray<align>(bgr + 3*col, _permuteBody, _permuteTail, _shuffle));
                if(width != alignedWidth)
                    Store<false>((__m256i*)(gray + width - A), BgrToGray<false>(bgr + 3*(width - A), _permuteBody, _permuteTail, _shuffle));
                bgr += bgrStride;
                gray += grayStride;
            }
        }

示例#2

0

显示文件

文件： avx2.cpp 项目： WojciechMula/toys

void bitmask_avx2(uint32_t* ptr, size_t n, uint32_t key, uint8_t* out) {

    uint32_t* output = (uint32_t*)out;

    const size_t N = 8*4; // unrolled 4 times
    const size_t chunks = n / N;
    const size_t tail   = n % N;

    const __m256i vkey = _mm256_set1_epi32(key);
    
    for (size_t i=0; i < chunks; i++) {
        
        const __m256i in0 = _mm256_loadu_si256((const __m256i*)(ptr + i*N + 0*8));
        const __m256i in1 = _mm256_loadu_si256((const __m256i*)(ptr + i*N + 1*8));
        const __m256i in2 = _mm256_loadu_si256((const __m256i*)(ptr + i*N + 2*8));
        const __m256i in3 = _mm256_loadu_si256((const __m256i*)(ptr + i*N + 3*8));

        const __m256i eq0 = _mm256_cmpeq_epi32(in0, vkey);
        const __m256i eq1 = _mm256_cmpeq_epi32(in1, vkey);
        const __m256i eq2 = _mm256_cmpeq_epi32(in2, vkey);
        const __m256i eq3 = _mm256_cmpeq_epi32(in3, vkey);

        // eq0 = [a0 a1 a2 a3 a4 a5 a6 a7] (packed dword)
        // eq1 = [b0 b1 b2 b3 b4 b5 b6 b7] (packed dword)
        // eq2 = [c0 c1 c2 c3 c4 c5 c6 c7] (packed dword)
        // eq3 = [d0 d1 d2 d3 d4 d5 d6 d7] (packed dword)

        //  t0 = [a0 a1 a2 a3 c0 c1 c2 c3 a4 a5 a6 a7 c4 c5 c6 c7] (packed word)
        const __m256i t0  = _mm256_packs_epi32(eq0, eq2);
        // m02 = [a0 a1 a2 a3 a4 a5 a6 a7 c0 c1 c2 c3 c4 c5 c6 c7] (packed word)
        const __m256i m02 = _mm256_permutevar8x32_epi32(t0,
                                _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7));

        //  t0 = [b0 b1 b2 b3 d0 d1 d2 d3 b4 b5 b6 b7 d4 d5 d6 d7] (packed word)
        const __m256i t1 = _mm256_packs_epi32(eq1, eq3);
        // m13 = [b0 b1 b2 b3 b4 b5 b6 b7 d0 d1 d2 d3 d4 d5 d6 d7] (packed word)
        const __m256i m13 = _mm256_permutevar8x32_epi32(t1,
                                _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7));

        // m   = [a0..7 b0..7 c0..7 d0..7] (packed byte)
        const __m256i m   = _mm256_packs_epi16(m02, m13);

        *output++ = _mm256_movemask_epi8(m);
    }

    if (tail > 0) {
        bitmask_better_2(ptr + chunks*N, tail, key, out + chunks*N);
    }
}

示例#3

0

显示文件

文件： avx2.cpp 项目： WojciechMula/toys

bool is_sorted_avx2(int32_t* a, size_t n) {

    const __m256i shuffle_pattern = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 7);

    size_t i = 0;
    while (i < n - 8) {
        // curr = [ a0 | a1 | a2 | a3 | a4 | a5 | a6 | a7 ]
        const __m256i curr = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(a + i));
        // next = [ a1 | a2 | a3 | a4 | a5 | a6 | a7 | a7 ]
        const __m256i next = _mm256_permutevar8x32_epi32(curr, shuffle_pattern);

        // Note: the last element of curr and next is a7, thus for this element 
        //       the comparison result is always zero.
        //
        // In fact, the first 7 elements are being tested.
        const __m256i mask = _mm256_cmpgt_epi32(curr, next);
        if (!_mm256_testz_si256(mask, mask)) {
            return false;
        }

        i += 7;
    }

    for (/**/; i + 1 < n; i++) {
        if (a[i] > a[i + 1])
            return false;
    }

    return true;
}

示例#4

0

显示文件

文件： main.cpp 项目： JamesDunne/live-mixer

// Main audio processing callback.
// NOTE: Called on a separate thread from main() thread.
ASIOTime *bufferSwitchTimeInfo(ASIOTime *timeInfo, long index, ASIOBool processNow)
{
    // Buffer size (in samples):
    long buffSize = drv.preferredSize;

    // Assume the buffer size is an even multiple of 32-bytes:
    assert((buffSize % sizeof(vec4_d64)) == 0);

    for (long i = 0; i < buffSize; ++i)
    {
        assert(index == 0 || index == 1);

        // Process 8 channels of 32-bit samples per iteration:
        for (long n = 0; n < inputChannels / 8; ++n)
        {
            const long ci = n * 8;
            const long co = drv.inputBuffers + ci;

            // Stripe input samples into a vector:
            const vec8_i32 inpSamples = _mm256_setr_epi32(
                ((long *)drv.bufferInfos[ci + 0].buffers[index])[i],
                ((long *)drv.bufferInfos[ci + 1].buffers[index])[i],
                ((long *)drv.bufferInfos[ci + 2].buffers[index])[i],
                ((long *)drv.bufferInfos[ci + 3].buffers[index])[i],
                ((long *)drv.bufferInfos[ci + 4].buffers[index])[i],
                ((long *)drv.bufferInfos[ci + 5].buffers[index])[i],
                ((long *)drv.bufferInfos[ci + 6].buffers[index])[i],
                ((long *)drv.bufferInfos[ci + 7].buffers[index])[i]
            );

            // Process audio effects:
            vec8_i32 outSamples;
            processEffects(inpSamples, outSamples, n * 2);
            // Copy outputs to output channel buffers:
            const long *outputs32 = (const long *)&outSamples;
            ((long *)drv.bufferInfos[co + 0].buffers[index])[i] = outputs32[0];
            ((long *)drv.bufferInfos[co + 1].buffers[index])[i] = outputs32[1];
            ((long *)drv.bufferInfos[co + 2].buffers[index])[i] = outputs32[2];
            ((long *)drv.bufferInfos[co + 3].buffers[index])[i] = outputs32[3];
            ((long *)drv.bufferInfos[co + 4].buffers[index])[i] = outputs32[4];
            ((long *)drv.bufferInfos[co + 5].buffers[index])[i] = outputs32[5];
            ((long *)drv.bufferInfos[co + 6].buffers[index])[i] = outputs32[6];
            ((long *)drv.bufferInfos[co + 7].buffers[index])[i] = outputs32[7];
        }
    }

    if (drv.postOutput)
        ASIOOutputReady();

    return 0L;
}

示例#5

0

显示文件

文件： codec_avx2.c 项目： BurningEnlightenment/base64-cmake

static inline __m256i
enc_reshuffle (__m256i in)
{
	// Spread out 32-bit words over both halves of the input register:
	in = _mm256_permutevar8x32_epi32(in, _mm256_setr_epi32(
		0, 1, 2, -1,
		3, 4, 5, -1));

	// Slice into 32-bit chunks and operate on all chunks in parallel.
	// All processing is done within the 32-bit chunk. First, shuffle:
	// before: [eeeeeeff|ccdddddd|bbbbcccc|aaaaaabb]
	// after:  [00000000|aaaaaabb|bbbbcccc|ccdddddd]
	in = _mm256_shuffle_epi8(in, _mm256_set_epi8(
		-1, 9, 10, 11,
		-1, 6,  7,  8,
		-1, 3,  4,  5,
		-1, 0,  1,  2,
		-1, 9, 10, 11,
		-1, 6,  7,  8,
		-1, 3,  4,  5,
		-1, 0,  1,  2));

	// cd      = [00000000|00000000|0000cccc|ccdddddd]
	const __m256i cd = _mm256_and_si256(in, _mm256_set1_epi32(0x00000FFF));

	// ab      = [0000aaaa|aabbbbbb|00000000|00000000]
	const __m256i ab = _mm256_and_si256(_mm256_slli_epi32(in, 4), _mm256_set1_epi32(0x0FFF0000));

	// merged  = [0000aaaa|aabbbbbb|0000cccc|ccdddddd]
	const __m256i merged = _mm256_or_si256(ab, cd);

	// bd      = [00000000|00bbbbbb|00000000|00dddddd]
	const __m256i bd = _mm256_and_si256(merged, _mm256_set1_epi32(0x003F003F));

	// ac      = [00aaaaaa|00000000|00cccccc|00000000]
	const __m256i ac = _mm256_and_si256(_mm256_slli_epi32(merged, 2), _mm256_set1_epi32(0x3F003F00));

	// indices = [00aaaaaa|00bbbbbb|00cccccc|00dddddd]
	const __m256i indices = _mm256_or_si256(ac, bd);

	// return  = [00dddddd|00cccccc|00bbbbbb|00aaaaaa]
	return _mm256_bswap_epi32(indices);
}

示例#6

0

显示文件

文件： avx2.cpp 项目： WojciechMula/toys

bool is_sorted_avx2_unrolled4(int32_t* a, size_t n) {

    const __m256i shuffle_pattern = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 7);

    size_t i = 0;
    while (i < n - (4*7 + 1)) {
        const __m256i curr0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(a + i + 0*7));
        const __m256i curr1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(a + i + 1*7));
        const __m256i curr2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(a + i + 2*7));
        const __m256i curr3 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(a + i + 3*7));

        const __m256i next0 = _mm256_permutevar8x32_epi32(curr0, shuffle_pattern);
        const __m256i next1 = _mm256_permutevar8x32_epi32(curr1, shuffle_pattern);
        const __m256i next2 = _mm256_permutevar8x32_epi32(curr2, shuffle_pattern);
        const __m256i next3 = _mm256_permutevar8x32_epi32(curr3, shuffle_pattern);

        const __m256i mask0 = _mm256_cmpgt_epi32(curr0, next0);
        const __m256i mask1 = _mm256_cmpgt_epi32(curr1, next1);
        const __m256i mask2 = _mm256_cmpgt_epi32(curr2, next2);
        const __m256i mask3 = _mm256_cmpgt_epi32(curr3, next3);

        const __m256i mask = _mm256_or_si256(mask0, 
                             _mm256_or_si256(mask1, 
                             _mm256_or_si256(mask2, mask3)));

        if (!_mm256_testz_si256(mask, mask)) {
            return false;
        }

        i += 7*4;
    }

    for (/**/; i + 1 < n; i++) {
        if (a[i] > a[i + 1])
            return false;
    }

    return true;
}

示例#7

0

显示文件

文件： codec_avx2.c 项目： BurningEnlightenment/base64-cmake

static inline __m256i
dec_reshuffle (__m256i in)
{
	// Shuffle bytes to 32-bit bigendian:
	in = _mm256_bswap_epi32(in);

	// Mask in a single byte per shift:
	__m256i mask = _mm256_set1_epi32(0x3F000000);

	// Pack bytes together:
	__m256i out = _mm256_slli_epi32(_mm256_and_si256(in, mask), 2);
	mask = _mm256_srli_epi32(mask, 8);

	out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 4));
	mask = _mm256_srli_epi32(mask, 8);

	out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 6));
	mask = _mm256_srli_epi32(mask, 8);

	out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 8));

	// Pack bytes together within 32-bit words, discarding words 3 and 7:
	out = _mm256_shuffle_epi8(out, _mm256_setr_epi8(
		 3,  2,  1,
		 7,  6,  5,
		11, 10,  9,
		15, 14, 13,
		-1, -1, -1, -1,
		 3,  2,  1,
		 7,  6,  5,
		11, 10,  9,
		15, 14, 13,
		-1, -1, -1, -1));

	// Pack 32-bit words together, squashing empty words 3 and 7:
	return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(
		0, 1, 2, 4, 5, 6, -1, -1));
}

示例#8

0

显示文件

文件： fma.c 项目： pzemtsov/MandelView

void calculate_fma_float (unsigned char * out, double X0, double Y0, double scale, unsigned YSTART, unsigned SX, unsigned SY)
{
    __m256 dd = _mm256_set1_ps ((float) scale);
    __m256 XX0 = _mm256_set1_ps ((float) X0);

    for (unsigned j = YSTART; j < SY; j++)	{
        __m256 y0 = _mm256_set1_ps (j*(float) scale + (float) Y0);
        for (unsigned i = 0; i < SX; i += 8)	{
            __m256i ind = _mm256_setr_epi32 (i, i + 1, i + 2, i + 3, i + 4, i + 5, i + 6, i + 7);
            __m256 x0 = _mm256_fmadd_ps (dd, _mm256_cvtepi32_ps (ind), XX0);
            __m256 x = x0;
            __m256 y = y0;
            __m256i counts = _mm256_setzero_si256 ();
            __m256i cmp_mask = _mm256_set1_epi32 (0xFFFFFFFFu);

            for (unsigned n = 0; n < 255; n++)	{
                __m256 x2 = _mm256_mul_ps (x, x);
                __m256 y2 = _mm256_mul_ps (y, y);
                __m256 abs = _mm256_add_ps (x2, y2);
                __m256i cmp = _mm256_castps_si256 (_mm256_cmp_ps (abs, _mm256_set1_ps (4), 1));
                cmp_mask = _mm256_and_si256 (cmp_mask, cmp);
                if (_mm256_testz_si256 (cmp_mask, cmp_mask)) {
                    break;
                }
                counts = _mm256_sub_epi32 (counts, cmp_mask);
                __m256 t = _mm256_add_ps (x, x);
                y = _mm256_fmadd_ps (t, y, y0);
                x = _mm256_add_ps (_mm256_sub_ps (x2, y2), x0);
            }
            __m256i result = _mm256_shuffle_epi8 (counts, _mm256_setr_epi8 (0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12));
            __m128i result128 = _128i_shuffle (_mm256_extractf128_si256 (result, 0), _mm256_extractf128_si256 (result, 1), 0, 0, 0, 0);
            result128 = _mm_shuffle_epi32 (result128, combine_4_2bits (0, 2, 0, 2));
            _mm_storel_epi64 ((__m128i*) out, result128);
            out += 8;
        }
    }
}

示例#9

0

显示文件

文件： nx_avx_double.hpp 项目： mywoodstock/nxsimd

 inline vector4db::vector4db(bool b0, bool b1, bool b2, bool b3)
     : m_value(_mm256_castsi256_pd(
               _mm256_setr_epi32(-(int)b0, -(int)b0, -(int)b1, -(int)b1,
                                 -(int)b2, -(int)b2, -(int)b3, -(int)b3)))
 {
 }

示例#10

0

显示文件

文件： TransformedAABBoxAVX.cpp 项目： jianguo90hou/OcclusionCulling

//-----------------------------------------------------------------------------------------
// Rasterize the occludee AABB and depth test it against the CPU rasterized depth buffer
// If any of the rasterized AABB pixels passes the depth test exit early and mark the occludee
// as visible. If all rasterized AABB pixels are occluded then the occludee is culled
//-----------------------------------------------------------------------------------------
bool TransformedAABBoxAVX::RasterizeAndDepthTestAABBox(UINT *pRenderTargetPixels, const __m128 pXformedPos[], UINT idx)
{
	// Set DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster)
	// Denormal are zero (DAZ) is bit 6 and Flush to zero (FZ) is bit 15. 
	// so to enable the two to have to set bits 6 and 15 which 1000 0000 0100 0000 = 0x8040
	_mm_setcsr( _mm_getcsr() | 0x8040 );

	__m256i colOffset = _mm256_setr_epi32(0, 1, 2, 3, 0, 1, 2, 3);
	__m256i rowOffset = _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1);
	float* pDepthBuffer = (float*)pRenderTargetPixels;
	
	// Rasterize the AABB triangles 4 at a time
	for(UINT i = 0; i < AABB_TRIANGLES; i += SSE)
	{
		vFloat4 xformedPos[3];
		Gather(xformedPos, i, pXformedPos, idx);

		// use fixed-point only for X and Y.  Avoid work for Z and W.
        __m128i fxPtX[3], fxPtY[3];
		for(int m = 0; m < 3; m++)
		{
			fxPtX[m] = _mm_cvtps_epi32(xformedPos[m].X);
			fxPtY[m] = _mm_cvtps_epi32(xformedPos[m].Y);
		}

		// Fab(x, y) =     Ax       +       By     +      C              = 0
		// Fab(x, y) = (ya - yb)x   +   (xb - xa)y + (xa * yb - xb * ya) = 0
		// Compute A = (ya - yb) for the 3 line segments that make up each triangle
		__m128i A0 = _mm_sub_epi32(fxPtY[1], fxPtY[2]);
		__m128i A1 = _mm_sub_epi32(fxPtY[2], fxPtY[0]);
		__m128i A2 = _mm_sub_epi32(fxPtY[0], fxPtY[1]);

		// Compute B = (xb - xa) for the 3 line segments that make up each triangle
		__m128i B0 = _mm_sub_epi32(fxPtX[2], fxPtX[1]);
		__m128i B1 = _mm_sub_epi32(fxPtX[0], fxPtX[2]);
		__m128i B2 = _mm_sub_epi32(fxPtX[1], fxPtX[0]);

		// Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
		__m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[1], fxPtY[2]), _mm_mullo_epi32(fxPtX[2], fxPtY[1]));
		__m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[2], fxPtY[0]), _mm_mullo_epi32(fxPtX[0], fxPtY[2]));
		__m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[0], fxPtY[1]), _mm_mullo_epi32(fxPtX[1], fxPtY[0]));

		// Compute triangle area
		__m128i triArea = _mm_mullo_epi32(B2, A1);
		triArea = _mm_sub_epi32(triArea, _mm_mullo_epi32(B1, A2));
		__m128 oneOverTriArea = _mm_rcp_ps(_mm_cvtepi32_ps(triArea));

		__m128 Z[3];
		Z[0] = xformedPos[0].Z;
		Z[1] = _mm_mul_ps(_mm_sub_ps(xformedPos[1].Z, Z[0]), oneOverTriArea);
		Z[2] = _mm_mul_ps(_mm_sub_ps(xformedPos[2].Z, Z[0]), oneOverTriArea);
		
		// Use bounding box traversal strategy to determine which pixels to rasterize 
		//__m128i startX = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~1));
		__m128i startX = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~3));
		__m128i endX = HelperSSE::Min(HelperSSE::Max(HelperSSE::Max(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(SCREENW - 1));

		__m128i startY = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~1));
		__m128i endY = HelperSSE::Min(HelperSSE::Max(HelperSSE::Max(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(SCREENH - 1));

		// Now we have 4 triangles set up.  Rasterize them each individually.
        for(int lane=0; lane < SSE; lane++)
        {
			// Skip triangle if area is zero 
			if(triArea.m128i_i32[lane] <= 0)
			{
				continue;
			}

			// Extract this triangle's properties from the SIMD versions
			__m256 zz[3];
			for (int vv = 0; vv < 3; vv++)
			{
				zz[vv] = _mm256_set1_ps(Z[vv].m128_f32[lane]);
			}

			int startXx = startX.m128i_i32[lane];
			int endXx = endX.m128i_i32[lane];
			int startYy = startY.m128i_i32[lane];
			int endYy = endY.m128i_i32[lane];

			__m256i aa0 = _mm256_set1_epi32(A0.m128i_i32[lane]);
			__m256i aa1 = _mm256_set1_epi32(A1.m128i_i32[lane]);
			__m256i aa2 = _mm256_set1_epi32(A2.m128i_i32[lane]);

			__m256i bb0 = _mm256_set1_epi32(B0.m128i_i32[lane]);
			__m256i bb1 = _mm256_set1_epi32(B1.m128i_i32[lane]);
			__m256i bb2 = _mm256_set1_epi32(B2.m128i_i32[lane]);

			__m256i aa0Inc = _mm256_slli_epi32(aa0, 2);
			__m256i aa1Inc = _mm256_slli_epi32(aa1, 2);
			__m256i aa2Inc = _mm256_slli_epi32(aa2, 2);

			__m256i bb0Inc = _mm256_slli_epi32(bb0, 1);
			__m256i bb1Inc = _mm256_slli_epi32(bb1, 1);
			__m256i bb2Inc = _mm256_slli_epi32(bb2, 1);

			__m256i row, col;

			// Traverse pixels in 2x4 blocks and store 2x4 pixel quad depths contiguously in memory ==> 2*X
			// This method provides better performance
			int	rowIdx = (startYy * SCREENW + 2 * startXx);

			col = _mm256_add_epi32(colOffset, _mm256_set1_epi32(startXx));
			__m256i aa0Col = _mm256_mullo_epi32(aa0, col);
			__m256i aa1Col = _mm256_mullo_epi32(aa1, col);
			__m256i aa2Col = _mm256_mullo_epi32(aa2, col);

			row = _mm256_add_epi32(rowOffset, _mm256_set1_epi32(startYy));
			__m256i bb0Row = _mm256_add_epi32(_mm256_mullo_epi32(bb0, row), _mm256_set1_epi32(C0.m128i_i32[lane]));
			__m256i bb1Row = _mm256_add_epi32(_mm256_mullo_epi32(bb1, row), _mm256_set1_epi32(C1.m128i_i32[lane]));
			__m256i bb2Row = _mm256_add_epi32(_mm256_mullo_epi32(bb2, row), _mm256_set1_epi32(C2.m128i_i32[lane]));

			__m256i sum0Row = _mm256_add_epi32(aa0Col, bb0Row);
			__m256i sum1Row = _mm256_add_epi32(aa1Col, bb1Row);
			__m256i sum2Row = _mm256_add_epi32(aa2Col, bb2Row);

			__m256 zx = _mm256_mul_ps(_mm256_cvtepi32_ps(aa1Inc), zz[1]);
			zx = _mm256_add_ps(zx, _mm256_mul_ps(_mm256_cvtepi32_ps(aa2Inc), zz[2]));

			// Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
			for (int r = startYy; r < endYy; r += 2,
				rowIdx += 2 * SCREENW,
				sum0Row = _mm256_add_epi32(sum0Row, bb0Inc),
				sum1Row = _mm256_add_epi32(sum1Row, bb1Inc),
				sum2Row = _mm256_add_epi32(sum2Row, bb2Inc))
			{
				// Compute barycentric coordinates 
				int index = rowIdx;
				__m256i alpha = sum0Row;
				__m256i beta = sum1Row;
				__m256i gama = sum2Row;

				//Compute barycentric-interpolated depth
				__m256 depth = zz[0];
				depth = _mm256_add_ps(depth, _mm256_mul_ps(_mm256_cvtepi32_ps(beta), zz[1]));
				depth = _mm256_add_ps(depth, _mm256_mul_ps(_mm256_cvtepi32_ps(gama), zz[2]));
				__m256i anyOut = _mm256_setzero_si256();

				for (int c = startXx; c < endXx; c += 4,
					index += 8,
					alpha = _mm256_add_epi32(alpha, aa0Inc),
					beta = _mm256_add_epi32(beta, aa1Inc),
					gama = _mm256_add_epi32(gama, aa2Inc),
					depth = _mm256_add_ps(depth, zx))
				{
					//Test Pixel inside triangle
					__m256i mask = _mm256_or_si256(_mm256_or_si256(alpha, beta), gama);

					__m256 previousDepthValue = _mm256_loadu_ps(&pDepthBuffer[index]);
					__m256 depthMask = _mm256_cmp_ps(depth, previousDepthValue, 0x1D);
					__m256i finalMask = _mm256_andnot_si256(mask, _mm256_castps_si256(depthMask));
					anyOut = _mm256_or_si256(anyOut, finalMask);
				}//for each column	

				if (!_mm256_testz_si256(anyOut, _mm256_set1_epi32(0x80000000)))
				{
					return true; //early exit
				}
			}// for each row
		}// for each triangle
	}// for each set of SIMD# triangles

	return false;
}