예제 #1
0
template <class T> inline uint8_t generator_iter<T>::count()
{
  uint8_t nbr = _mm_popcnt_u32(mask); // popcnt returns a 8 bits value
  for (ind_t ib = iblock+1; ib < bound; ib++)
    nbr += _mm_popcnt_u32(movemask_epi8(m.blocks[ib] == block1));
  return nbr;
};
예제 #2
0
파일: FastTn.cpp 프로젝트: ktprime/ktprime
static int getCpuInfo(  )
{
#ifdef _WIN32
	SYSTEM_INFO si;
	GetSystemInfo(&si);

	if (si.dwNumberOfProcessors > Config.Threads) {
		Config.Threads = si.dwNumberOfProcessors;
	}

	int popcnt = 0;
#if POPCNT && _MSC_VER
	__try {
		popcnt = _mm_popcnt_u32(1);
	}
	__except (filterFunc(GetExceptionCode())) {
		popcnt = 0;
	}
#elif POPCNT
	popcnt = _mm_popcnt_u32(1);
#endif

	printf("Cpu cores = %ld, SSE4 with Popcnt = %d\n",
		si.dwNumberOfProcessors, popcnt);

	return si.dwNumberOfProcessors;
#endif
}
예제 #3
0
/* 1が立っているビットを算出 */
int CountBit(BitBoard bit)
{
	int l_moves = bit & 0x00000000FFFFFFFF;
	int h_moves = (bit & 0xFFFFFFFF00000000) >> 32;

	int count = _mm_popcnt_u32(l_moves);
	count += _mm_popcnt_u32(h_moves);

	/*l_moves -= (l_moves >> 1) & 0x55555555;
	l_moves = (l_moves & 0x33333333) + ((l_moves >> 2) & 0x33333333);
	l_moves = (l_moves + (l_moves >> 4)) & 0x0F0F0F0F;
	l_moves += l_moves >> 8;
	l_moves += l_moves >> 16;
	l_moves &= 0x0000003f;

	h_moves -= (h_moves >> 1) & 0x55555555;
	h_moves = (h_moves & 0x33333333) + ((h_moves >> 2) & 0x33333333);
	h_moves = (h_moves + (h_moves >> 4)) & 0x0F0F0F0F;
	h_moves += h_moves >> 8;
	h_moves += h_moves >> 16;
	h_moves &= 0x0000003f;

	return l_moves + h_moves;*/

	return count;
}
예제 #4
0
    //////////////////////////////////////////////////////////////////////////
    // @brief processes a single decl from the streamout stream. Reads 4 components from the input
    //        stream and writes N components to the output buffer given the componentMask or if
    //        a hole, just increments the buffer pointer
    // @param pStream - pointer to current attribute
    // @param pOutBuffers - pointers to the current location of each output buffer
    // @param decl - input decl
    void buildDecl(Value* pStream, Value* pOutBuffers[4], const STREAMOUT_DECL& decl)
    {
        // @todo add this to x86 macros
        Function* maskStore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskstore_ps);

        uint32_t numComponents = _mm_popcnt_u32(decl.componentMask);
        uint32_t packedMask = (1 << numComponents) - 1;
        if (!decl.hole)
        {
            // increment stream pointer to correct slot
            Value* pAttrib = GEP(pStream, C(4 * decl.attribSlot));

            // load 4 components from stream
            Type* simd4Ty = VectorType::get(IRB()->getFloatTy(), 4);
            Type* simd4PtrTy = PointerType::get(simd4Ty, 0);
            pAttrib = BITCAST(pAttrib, simd4PtrTy);
            Value *vattrib = LOAD(pAttrib);

            // shuffle/pack enabled components
            Value* vpackedAttrib = VSHUFFLE(vattrib, vattrib, PackMask(decl.componentMask));

            // store to output buffer
            // cast SO buffer to i8*, needed by maskstore
            Value* pOut = BITCAST(pOutBuffers[decl.bufferIndex], PointerType::get(mInt8Ty, 0));

            // cast input to <4xfloat>
            Value* src = BITCAST(vpackedAttrib, simd4Ty);
            CALL(maskStore, {pOut, ToMask(packedMask), src});
        }

        // increment SO buffer
        pOutBuffers[decl.bufferIndex] = GEP(pOutBuffers[decl.bufferIndex], C(numComponents));
    }
예제 #5
0
int _normoptimized(const unsigned char* a, const unsigned char* b, const int n)
{
#ifdef USE_PENTIUM4
	int _ddt = 0;
	unsigned int _dis, _dis2, _dis3, _dis4;
	long int _cnt, _cnt2, _cnt3, _cnt4;
	for (int _i = 0; _i < n; _i+=16){
		// xor 128 bits
		__m128i a0 = _mm_loadu_si128((const __m128i*)(a + _i));
		__m128i b0 = _mm_loadu_si128((const __m128i*)(b + _i));
		b0 = _mm_xor_si128(a0, b0);
		__m128i d = _mm_srli_si128(b0, 4);
	    __m128i e = _mm_srli_si128(d,4);
	    __m128i f = _mm_srli_si128(e,4);
	    _dis = _mm_cvtsi128_si32(b0);
	    _dis2 = _mm_cvtsi128_si32(d);
	    _dis3 = _mm_cvtsi128_si32(e);
	    _dis4 = _mm_cvtsi128_si32(f);
	    // now count
	    _cnt = _mm_popcnt_u32(_dis);
	    _cnt2 = _mm_popcnt_u32(_dis2);
	    _cnt3 = _mm_popcnt_u32(_dis3);
	    _cnt4 = _mm_popcnt_u32(_dis4);
	    _ddt += _cnt + _cnt2 + _cnt3 + _cnt4;
	}
	return _ddt;
#else	
	int _ddt = 0;
	unsigned long int _dis, _dis2;
	long int _cnt, _cnt2;
	for (int _i = 0; _i < n; _i+=16){
		// xor 128 bits
		__m128i a0 = _mm_loadu_si128((const __m128i*)(a + _i));
		__m128i b0 = _mm_loadu_si128((const __m128i*)(b + _i));
		b0 = _mm_xor_si128(a0, b0);
		a0 = _mm_srli_si128(b0,8);
		_dis = _mm_cvtsi128_si64(b0);
		_dis2 = _mm_cvtsi128_si64(a0);
		_cnt = _mm_popcnt_u64(_dis);
		_cnt2 = _mm_popcnt_u64(_dis2);
		_ddt +=  _cnt + _cnt2;		// other commmands don't give any advantage
	}
	return _ddt;
#endif
}
예제 #6
0
파일: main.cpp 프로젝트: CCJY/coliru
uint seqRank ( uint * vector , byte searchedByte , uint position ){
    register uint i , cont = 0;
    __m128i patt , window , returnValue ;
    byte * c1 , patt_code [16];
    uint d = position > >4 , r = position & 0 xf ;
    for ( i =0; i <16; i ++)
        patt_code [i ]= searchedByte ;
    long long * pat_array = ( long long *) patt_code ;
    patt = _mm_set_epi64x ( pat_array [1] , pat_array [0]) ;
    long long * text_array = ( long long *) vector ;
    for ( i =0; i <d; i ++) {
        window = _mm_set_epi64x ( text_array [1] , text_array [0]) ;
        returnValue = _mm_cmpestrm ( patt , 16 , window , 16 , mode ) ;
        cont += _mm_popcnt_u32 ( _mm_extract_epi32 ( returnValue ,0) );
        text_array += 2;
    }
    window = _mm_set_epi64x ( text_array [1] , text_array [0]) ;
    returnValue = _mm_cmpestrm ( patt , r , window , r , mode );
    cont += _mm_popcnt_u32 ( _mm_extract_epi32 ( returnValue ,0) ) +r -16;
    return cont ;
}
예제 #7
0
void SGMStereo::addPixelwiseHamming(const int* leftCensusRow, const int* rightCensusRow) {
	for (int x = 0; x < disparityTotal_; ++x) {
		int leftCencusCode = leftCensusRow[x];
		int hammingDistance = 0;
		for (int d = 0; d <= x; ++d) {
			int rightCensusCode = rightCensusRow[x - d];
			hammingDistance = static_cast<int>(_mm_popcnt_u32(static_cast<unsigned int>(leftCencusCode^rightCensusCode)));
			pixelwiseCostRow_[disparityTotal_*x + d] += static_cast<unsigned char>(hammingDistance*censusWeightFactor_);
		}
		hammingDistance = static_cast<unsigned char>(hammingDistance*censusWeightFactor_);
		for (int d = x + 1; d < disparityTotal_; ++d) {
			pixelwiseCostRow_[disparityTotal_*x + d] += hammingDistance;
		}
	}
	for (int x = disparityTotal_; x < width_; ++x) {
		int leftCencusCode = leftCensusRow[x];
		for (int d = 0; d < disparityTotal_; ++d) {
			int rightCensusCode = rightCensusRow[x - d];
			int hammingDistance = static_cast<int>(_mm_popcnt_u32(static_cast<unsigned int>(leftCencusCode^rightCensusCode)));
			pixelwiseCostRow_[disparityTotal_*x + d] += static_cast<unsigned char>(hammingDistance*censusWeightFactor_);
		}
	}
}
예제 #8
0
void SwrSetLinkage(
    HANDLE hContext,
    uint32_t mask,
    const uint8_t* pMap)
{
    API_STATE* pState = GetDrawState(GetContext(hContext));

    static const uint8_t IDENTITY_MAP[] =
    {
         0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    };
    static_assert(sizeof(IDENTITY_MAP) == sizeof(pState->linkageMap),
        "Update for new value of MAX_ATTRIBUTES");

    pState->linkageMask = mask;
    pState->linkageCount = _mm_popcnt_u32(mask);

    if (!pMap)
    {
        pMap = IDENTITY_MAP;
    }
    memcpy(pState->linkageMap, pMap, pState->linkageCount);
}
예제 #9
0
void BackendSingleSample(DRAW_CONTEXT*        pDC,
                         uint32_t             workerId,
                         uint32_t             x,
                         uint32_t             y,
                         SWR_TRIANGLE_DESC&   work,
                         RenderOutputBuffers& renderBuffers)
{
    RDTSC_BEGIN(BESingleSampleBackend, pDC->drawId);
    RDTSC_BEGIN(BESetup, pDC->drawId);

    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    const API_STATE& state = GetApiState(pDC);

    BarycentricCoeffs coeffs;
    SetupBarycentricCoeffs(&coeffs, work);

    SWR_PS_CONTEXT             psContext;
    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
    SetupPixelShaderContext<T>(&psContext, samplePos, work);

    uint8_t *pDepthBuffer, *pStencilBuffer;
    SetupRenderBuffers(psContext.pColorBuffer,
                       &pDepthBuffer,
                       &pStencilBuffer,
                       state.colorHottileEnable,
                       renderBuffers);

    RDTSC_END(BESetup, 1);

    psContext.vY.UL     = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));

    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));

    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
    {
        psContext.vX.UL     = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));

        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));

        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
        {
#if USE_8x2_TILE_BACKEND
            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
#endif
            simdmask coverageMask = work.coverageMask[0] & MASK;

            if (coverageMask)
            {
                if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
                {
                    static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
                                  "Unsupported depth hot tile format");

                    const simdscalar z =
                        _simd_load_ps(reinterpret_cast<const float*>(pDepthBuffer));

                    const float minz = state.depthBoundsState.depthBoundsTestMinValue;
                    const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;

                    coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
                }

                if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
                {
                    const uint64_t* pCoverageMask =
                        (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
                            ? &work.innerCoverageMask
                            : &work.coverageMask[0];

                    generateInputCoverage<T, T::InputCoverage>(
                        pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
                }

                RDTSC_BEGIN(BEBarycentric, pDC->drawId);

                CalcPixelBarycentrics(coeffs, psContext);

                CalcCentroid<T, true>(
                    &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);

                // interpolate and quantize z
                psContext.vZ = vplaneps(
                    coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);

                RDTSC_END(BEBarycentric, 1);

                // interpolate user clip distance if available
                if (state.backendState.clipDistanceMask)
                {
                    coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask,
                                                         work.pUserClipBuffer,
                                                         psContext.vI.center,
                                                         psContext.vJ.center);
                }

                simdscalar vCoverageMask   = _simd_vmask_ps(coverageMask);
                simdscalar depthPassMask   = vCoverageMask;
                simdscalar stencilPassMask = vCoverageMask;

                // Early-Z?
                if (T::bCanEarlyZ)
                {
                    RDTSC_BEGIN(BEEarlyDepthTest, pDC->drawId);
                    depthPassMask = DepthStencilTest(&state,
                                                     work.triFlags.frontFacing,
                                                     work.triFlags.viewportIndex,
                                                     psContext.vZ,
                                                     pDepthBuffer,
                                                     vCoverageMask,
                                                     pStencilBuffer,
                                                     &stencilPassMask);
                    AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask),
                                                               _simd_movemask_ps(stencilPassMask),
                                                               _simd_movemask_ps(vCoverageMask)));
                    RDTSC_END(BEEarlyDepthTest, 0);

                    // early-exit if no pixels passed depth or earlyZ is forced on
                    if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
                    {
                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
                                          &state.depthStencilState,
                                          work.triFlags.frontFacing,
                                          psContext.vZ,
                                          pDepthBuffer,
                                          depthPassMask,
                                          vCoverageMask,
                                          pStencilBuffer,
                                          stencilPassMask);

                        if (!_simd_movemask_ps(depthPassMask))
                        {
                            goto Endtile;
                        }
                    }
                }

                psContext.sampleIndex = 0;
                psContext.activeMask  = _simd_castps_si(vCoverageMask);

                // execute pixel shader
                RDTSC_BEGIN(BEPixelShader, pDC->drawId);
                state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext);
                RDTSC_END(BEPixelShader, 0);

                // update stats
                UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
                AR_EVENT(PSStats(psContext.stats.numInstExecuted));

                vCoverageMask = _simd_castsi_ps(psContext.activeMask);

                // late-Z
                if (!T::bCanEarlyZ)
                {
                    RDTSC_BEGIN(BELateDepthTest, pDC->drawId);
                    depthPassMask = DepthStencilTest(&state,
                                                     work.triFlags.frontFacing,
                                                     work.triFlags.viewportIndex,
                                                     psContext.vZ,
                                                     pDepthBuffer,
                                                     vCoverageMask,
                                                     pStencilBuffer,
                                                     &stencilPassMask);
                    AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask),
                                                              _simd_movemask_ps(stencilPassMask),
                                                              _simd_movemask_ps(vCoverageMask)));
                    RDTSC_END(BELateDepthTest, 0);

                    if (!_simd_movemask_ps(depthPassMask))
                    {
                        // need to call depth/stencil write for stencil write
                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
                                          &state.depthStencilState,
                                          work.triFlags.frontFacing,
                                          psContext.vZ,
                                          pDepthBuffer,
                                          depthPassMask,
                                          vCoverageMask,
                                          pStencilBuffer,
                                          stencilPassMask);
                        goto Endtile;
                    }
                }
                else
                {
                    // for early z, consolidate discards from shader
                    // into depthPassMask
                    depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask);
                }

                uint32_t statMask  = _simd_movemask_ps(depthPassMask);
                uint32_t statCount = _mm_popcnt_u32(statMask);
                UPDATE_STAT_BE(DepthPassCount, statCount);

                // output merger
                RDTSC_BEGIN(BEOutputMerger, pDC->drawId);
#if USE_8x2_TILE_BACKEND
                OutputMerger8x2(pDC,
                                psContext,
                                psContext.pColorBuffer,
                                0,
                                &state.blendState,
                                state.pfnBlendFunc,
                                vCoverageMask,
                                depthPassMask,
                                state.psState.renderTargetMask,
                                useAlternateOffset,
                                workerId);
#else
                OutputMerger4x2(pDC,
                                psContext,
                                psContext.pColorBuffer,
                                0,
                                &state.blendState,
                                state.pfnBlendFunc,
                                vCoverageMask,
                                depthPassMask,
                                state.psState.renderTargetMask,
                                workerId,
                                workerId);
#endif

                // do final depth write after all pixel kills
                if (!state.psState.forceEarlyZ)
                {
                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
                                      &state.depthStencilState,
                                      work.triFlags.frontFacing,
                                      psContext.vZ,
                                      pDepthBuffer,
                                      depthPassMask,
                                      vCoverageMask,
                                      pStencilBuffer,
                                      stencilPassMask);
                }
                RDTSC_END(BEOutputMerger, 0);
            }

        Endtile:
            RDTSC_BEGIN(BEEndTile, pDC->drawId);

            work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
            if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
            {
                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
            }

#if USE_8x2_TILE_BACKEND
            if (useAlternateOffset)
            {
                DWORD    rt;
                uint32_t rtMask = state.colorHottileEnable;
                while (_BitScanForward(&rt, rtMask))
                {
                    rtMask &= ~(1 << rt);
                    psContext.pColorBuffer[rt] +=
                        (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
                }
            }
#else
            DWORD rt;
            uint32_t rtMask = state.colorHottileEnable;
            while (_BitScanForward(&rt, rtMask))
            {
                rtMask &= ~(1 << rt);
                psContext.pColorBuffer[rt] +=
                    (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
            }
#endif
            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
            pStencilBuffer +=
                (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;

            RDTSC_END(BEEndTile, 0);

            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL, dx);
            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
        }

        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL, dy);
        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
    }
                            0x4,0x5,0x6,0x7,0x8,0x9,0xa,0xb,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
                            0x0,0x1,0x2,0x3,0x8,0x9,0xa,0xb,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
                            0x8,0x9,0xa,0xb,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
                            0x0,0x1,0x2,0x3,0x4,0x5,0x6,0x7,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
                            0x4,0x5,0x6,0x7,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
                            0x0,0x1,0x2,0x3,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
                            0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF
                           };


// write vector new, while omitting repeated values assuming that previously written vector was "old"
static int store_unique(__m128i old,__m128i new, uint32_t *  output) {
    __m128i vecTmp = _mm_alignr_epi8(new, old, 16-4);
    int M = _mm_movemask_epi8(_mm_cmpeq_epi32(vecTmp,new));//_pdep_u32(,0x1111);
    M=_pext_u32(M,0x1111);
    int numberofnewvalues = 4 - _mm_popcnt_u32(M);
    __m128i key =  _mm_lddqu_si128((const __m128i* )uniqshuf + M);
    __m128i val =  _mm_shuffle_epi8(new,key);
    _mm_storeu_si128((__m128i* )output,val);
    return numberofnewvalues;
}

// working in-place, this function overwrites the repeated values
static uint32_t unique(uint32_t *  out, uint32_t len) {
    uint32_t pos = 1;
    for(uint32_t i = 1; i < len; ++i) {
        if(out[i] != out[i-1]) {
            out[pos++] = out[i];
        }
    }
    return pos;
void Decoder::ADMMDecoder_deg_6_7_2_3_6()
{
	int maxIter          = maxIteration;
	float mu             = 5.5f; 
	float tableau[12]    = { 0.0f };

    if ((mBlocklength == 576) && (mNChecks == 288))
    {
     	mu          = 3.37309f;//penalty
        tableau[2]  = 0.00001f;
	tableau[3]  = 2.00928f;
	tableau[6]  = 4.69438f;

    }
    else if((mBlocklength == 2304) && (mNChecks == 1152) )
    {
    	mu          = 3.81398683f;//penalty
        tableau[2]  = 0.29669288f; 
	tableau[3]  = 0.46964023f;
	tableau[6]  = 3.19548154f;
    }
    else
    {
    	mu          = 5.5;//penalty
        tableau[2]  = 0.8f;
	tableau[3]  = 0.8f;
	tableau[6]  = 0.8f;
    }

    const float rho      = 1.9f;    //over relaxation parameter;
    const float un_m_rho = 1.0 - rho;
    const auto  _rho      = _mm256_set1_ps(      rho );
    const auto  _un_m_rho = _mm256_set1_ps( un_m_rho );
    float tableaX[12];

    //
    // ON PRECALCULE LES CONSTANTES
    //
	#pragma  unroll
    for (int i = 0; i < 7; i++)
    {
        tableaX[i] = tableau[ i ] / mu;
    }
	const auto t_mu    = _mm256_set1_ps ( mu );

	const auto t2_amu  = _mm256_set1_ps (        tableau[ 2 ] / mu   );
	const auto t3_amu  = _mm256_set1_ps (        tableau[ 3 ] / mu   );
	const auto t6_amu  = _mm256_set1_ps (        tableau[ 6 ] / mu   );

	const auto t2_2amu = _mm256_set1_ps ( 2.0f * tableau[ 2 ] / mu );
	const auto t3_2amu = _mm256_set1_ps ( 2.0f * tableau[ 3 ] / mu );
	const auto t6_2amu = _mm256_set1_ps ( 2.0f * tableau[ 6 ] / mu );

	const auto t2_deg  = _mm256_set1_ps ( 2.0f );
	const auto t3_deg  = _mm256_set1_ps ( 3.0f );
	const auto t6_deg  = _mm256_set1_ps ( 6.0f );

	const auto zero    = _mm256_set1_ps ( 0.0f );
	const auto un      = _mm256_set1_ps ( 1.0f );
    const __m256 a     = _mm256_set1_ps ( 0.0f );
    const __m256 b     = _mm256_set1_ps ( 0.5f );

    //////////////////////////////////////////////////////////////////////////////////////
	#pragma  unroll
	for( int j = 0; j < _mPCheckMapSize; j+=8 )
    {
	_mm256_store_ps(&Lambda  [j],         a);
        _mm256_store_ps(&zReplica[j],         b);
        _mm256_store_ps(&latestProjVector[j], b);
    }
    //////////////////////////////////////////////////////////////////////////////////////

	for(int i = 0; i < maxIter; i++)
	{
        int ptr    = 0;
		mIteration = i + 1;

    	//
    	// MEASURE OF THE VN EXECUTION TIME
    	//
		#ifdef PROFILE_ON
				const auto start = timer();
		#endif

        //
		// VN processing kernel
		//
		#pragma  unroll
		for (int j = 0; j < _mBlocklength; j++)
        {
            const int degVn = VariableDegree[j];
            float M[8] __attribute__((aligned(64)));

            if( degVn == 2 ){
#if 1
            	const int dVN = 2;
            	for(int qq = 0; qq < 8; qq++) 
		{
    				M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);      ptr   += 1;
    				#pragma  unroll
    				for(int k = 1; k < dVN; k++) 
				{
    					M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); ptr   += 1;
    				}
    		}
    		const auto m      = _mm256_loadu_ps( M );
    		const auto llr    = _mm256_loadu_ps( &_LogLikelihoodRatio[j] );
    		const auto t1     = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu));
    		const auto xx     = _mm256_div_ps(_mm256_sub_ps(t1, t2_amu), _mm256_sub_ps(t2_deg, t2_2amu));
    		const auto vMin   = _mm256_max_ps(_mm256_min_ps(xx, un) , zero);
    		_mm256_storeu_ps(&OutputFromDecoder[j], vMin);
    		j += 7;
#else
            	const int degVN = 2;
                float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
		#pragma unroll
		for(int k = 1; k < degVN; k++)
			temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]);
		ptr  += degVN;
	        const float _amu_    = tableaX[ degVN ];
	        const float _2_amu_  = _amu_+ _amu_;
	        const float llr  = _LogLikelihoodRatio[j];
	        const float t    = temp - llr / mu;
	        const float xx   = (t  -  _amu_)/(degVn - _2_amu_);
	        const float vMax = std::min(xx,   1.0f);
	        const float vMin = std::max(vMax, 0.0f);
		OutputFromDecoder[j] = vMin;
#endif
            }else if( degVn == 3 ){
#if 1
            	const int dVN = 3;
            	for(int qq = 0; qq < 8; qq++) 
		{
    			M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);      ptr   += 1;
    			#pragma  unroll
    			for(int k = 1; k < dVN; k++) 
			{
    				M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); ptr   += 1;
    			}
    		}
    		const auto m      = _mm256_loadu_ps( M );
    		const auto llr    = _mm256_loadu_ps( &_LogLikelihoodRatio[j] );
    		const auto t1     = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu));
    		const auto xx     = _mm256_div_ps(_mm256_sub_ps(t1, t3_amu), _mm256_sub_ps(t3_deg, t3_2amu));
    		const auto vMin   = _mm256_max_ps(_mm256_min_ps(xx, un) , zero);
    		_mm256_storeu_ps(&OutputFromDecoder[j], vMin);
    		j += 7;
#else
    		const int degVN = 3;
                float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
		#pragma unroll
		for(int k = 1; k < degVN; k++)
			temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]);
		ptr  += degVN;
	        const float _amu_    = tableaX[ degVN ];
	        const float _2_amu_  = _amu_+ _amu_;
	        const float llr  = _LogLikelihoodRatio[j];
	        const float t    = temp - llr / mu;
	        const float xx   = (t  -  _amu_)/(degVn - _2_amu_);
	        const float vMax = std::min(xx,   1.0f);
	        const float vMin = std::max(vMax, 0.0f);
		OutputFromDecoder[j] = vMin;
#endif
		}else if( degVn == 6 ){
#if 1
            	const int dVN = 6;
            	for(int qq = 0; qq < 8; qq++) 
		{
    			M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);      ptr   += 1;
    			#pragma  unroll
    			for(int k = 1; k < dVN; k++) 
			{
    				M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); ptr   += 1;
    			}
    		}
    		const auto m      = _mm256_loadu_ps( M );
    		const auto llr    = _mm256_loadu_ps( &_LogLikelihoodRatio[j] );
    		const auto t1     = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu));
    		const auto xx     = _mm256_div_ps(_mm256_sub_ps(t1, t6_amu), _mm256_sub_ps(t6_deg, t6_2amu));
    		const auto vMin   = _mm256_max_ps(_mm256_min_ps(xx, un) , zero);
    		_mm256_storeu_ps(&OutputFromDecoder[j], vMin);
    		j += 7;
#else
    		const int degVN = 6;
                float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
		#pragma unroll
		for(int k = 1; k < degVN; k++)
			temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]);
		ptr  += degVN;
	        const float _amu_    = tableaX[ degVN ];
	        const float _2_amu_  = _amu_+ _amu_;
	        const float llr  = _LogLikelihoodRatio[j];
	        const float t    = temp - llr / mu;
	        const float xx   = (t  -  _amu_)/(degVn - _2_amu_);
	        const float vMax = std::min(xx,   1.0f);
	        const float vMin = std::max(vMax, 0.0f);
		OutputFromDecoder[j] = vMin;
#endif
            }
        }

    	//
    	// MEASURE OF THE VN EXECUTION TIME
    	//
		#ifdef PROFILE_ON
				t_vn   += (timer() - start);
		#endif

		//
		// CN processing kernel
		//
	int CumSumCheckDegree = 0; // cumulative position of currect edge in factor graph
        int allVerified       = 0;
	float vector_before_proj[8] __attribute__((aligned(64)));

        const auto zero    = _mm256_set1_ps ( 0.0f    );
        const auto mask_6  = _mm256_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
        const auto mask_7  = _mm256_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
        const auto  dot5   = _mm256_set1_ps(     0.5f );

    	//
    	// MEASURE OF THE CN EXECUTION TIME
    	//
		#ifdef PROFILE_ON
				const auto starT = timer();
		#endif

    	const auto seuilProj = _mm256_set1_ps( 1e-5f );
        for(int j = 0; j < _mNChecks; j++)
		{
            if( CheckDegree[j] == 6 ){
            	const int  cDeg6       = 0x3F;
                const auto offsets    = _mm256_loadu_si256  ((const __m256i*)&t_col1  [CumSumCheckDegree]);
                const auto xpred      = _mm256_mask_i32gather_ps (zero, OutputFromDecoder, offsets, _mm256_castsi256_ps(mask_6), 4);
    		const auto synd       = _mm256_cmp_ps( xpred, dot5,   _CMP_GT_OS );
    		int test              = (_mm256_movemask_ps( synd ) & cDeg6);  // deg 6
    		const auto syndrom    = _mm_popcnt_u32( test );
    		const auto _Replica   = _mm256_loadu_ps( &zReplica[CumSumCheckDegree]);
    		const auto _ambda     = _mm256_loadu_ps( &Lambda  [CumSumCheckDegree]);
    		const auto v1         = _mm256_mul_ps  (xpred,      _rho );
    		const auto v2         = _mm256_mul_ps  ( _Replica, _un_m_rho );
    		const auto v3         = _mm256_add_ps  ( v1, v2 );
    		const auto vect_proj  = _mm256_sub_ps  ( v3, _ambda );

                //
                // ON REALISE LA PROJECTION !!!
                //
                allVerified       += ( syndrom & 0x01 );

    	    	//
    	    	// MEASURE OF THE PROJECTION EXECUTION TIME
    	    	//
                #ifdef PROFILE_ON
					const auto START = timer();
		#endif
    		const auto latest    = _mm256_loadu_ps(&latestProjVector[CumSumCheckDegree]);
    		const auto different = _mm256_sub_ps ( vect_proj, latest );
    		const auto maskAbsol = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    		const auto absolute  = _mm256_and_ps ( different, maskAbsol );
    	        const auto despass   = _mm256_cmp_ps( absolute, seuilProj, _CMP_GT_OS );
    	        int skip = (_mm256_movemask_ps( despass ) & cDeg6) == 0x00; // degree 6

    	        if( skip == false )
    	        {
    	        	const auto _ztemp  = mp.projection_deg6( vect_proj );
    	    		const auto _ztemp1 = _mm256_sub_ps(_ztemp,    xpred );
    	    		const auto _ztemp2 = _mm256_sub_ps(_ztemp, _Replica );
	    	    	const auto _ztemp3 = _mm256_mul_ps(_ztemp1, _rho);
	    	    	const auto _ztemp4 = _mm256_mul_ps(_ztemp2, _un_m_rho);
    			const auto nLambda = _mm256_add_ps( _ambda,  _ztemp3 );
    			const auto mLambda = _mm256_add_ps( nLambda, _ztemp4 );
    	    		_mm256_maskstore_ps(&  Lambda[CumSumCheckDegree],         mask_6,   mLambda);
    	    		_mm256_maskstore_ps(&zReplica[CumSumCheckDegree],         mask_6,    _ztemp);
    	        }
	    	_mm256_maskstore_ps(&latestProjVector[CumSumCheckDegree], mask_6, vect_proj);

    	    	//
    	    	// MEASURE OF THE PROJECTION EXECUTION TIME
    	    	//
    	        #ifdef PROFILE_ON
					t_pj   += (timer() - START);
		#endif
                CumSumCheckDegree += 6;

            }else if( CheckDegree[j] == 7 )
	    {
            	const int  cDeg7       = 0x7F;
                const auto offsets    = _mm256_loadu_si256  ((const __m256i*)&t_col1  [CumSumCheckDegree]);
                const auto xpred      = _mm256_mask_i32gather_ps (zero, OutputFromDecoder, offsets, _mm256_castsi256_ps(mask_7), 4);
    		const auto synd       = _mm256_cmp_ps( xpred, dot5,   _CMP_GT_OS );
    		const int  test       = (_mm256_movemask_ps( synd ) & cDeg7); // deg 7
    		const auto syndrom    = _mm_popcnt_u32( test );
    		const auto _Replica   = _mm256_loadu_ps( &zReplica[CumSumCheckDegree]);
    		const auto _ambda     = _mm256_loadu_ps( &Lambda  [CumSumCheckDegree]);
    		const auto v1         = _mm256_mul_ps  ( xpred,    _rho );
    		const auto v2         = _mm256_mul_ps  ( _Replica, _un_m_rho );
    		const auto v3         = _mm256_add_ps  ( v1, v2 );
    		const auto vect_proj  = _mm256_sub_ps  ( v3, _ambda );

                //
                // ON REALISE LA PROJECTION !!!
                //
                allVerified         += ( syndrom & 0x01 );

    	    	//
    	    	// MEASURE OF THE PROJECTION EXECUTION TIME
    	    	//
                #ifdef PROFILE_ON
					const auto START = timer();
		#endif
    		const auto latest    = _mm256_loadu_ps(&latestProjVector[CumSumCheckDegree]);
    		const auto different = _mm256_sub_ps ( vect_proj, latest );
    		const auto maskAbsol = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    		const auto absolute  = _mm256_and_ps ( different, maskAbsol );
    	        const auto despass   = _mm256_cmp_ps( absolute, seuilProj, _CMP_GT_OS );
    	        int skip = (_mm256_movemask_ps( despass ) & cDeg7) == 0x00; // degree 7

    	        if( skip == false )
    	        {
			const auto _ztemp  = mp.projection_deg7( vect_proj );
    	    		const auto _ztemp1 = _mm256_sub_ps(_ztemp,    xpred );
    	    		const auto _ztemp2 = _mm256_sub_ps(_ztemp, _Replica );
    	    		const auto _ztemp3 = _mm256_mul_ps(_ztemp1, _rho);
    	    		const auto _ztemp4 = _mm256_mul_ps(_ztemp2, _un_m_rho);
    			const auto nLambda = _mm256_add_ps( _ambda,  _ztemp3 );
    			const auto mLambda = _mm256_add_ps( nLambda, _ztemp4 );
    	    		_mm256_maskstore_ps(&  Lambda        [CumSumCheckDegree], mask_7,   mLambda);
    	    		_mm256_maskstore_ps(&zReplica        [CumSumCheckDegree], mask_7,    _ztemp);
    	        }
	    	_mm256_maskstore_ps(&latestProjVector[CumSumCheckDegree], mask_7, vect_proj);

    	    	//
    	    	// MEASURE OF THE PROJECTION EXECUTION TIME
    	    	//
    	        #ifdef PROFILE_ON
							t_pj   += (timer() - START);
		#endif

                CumSumCheckDegree += 7;

            }else{
                exit( 0 );
            }
        }

    	//
    	// MEASURE OF THE CN LOOP EXECUTION TIME
    	//
        #ifdef PROFILE_ON
				t_cn   += (timer() - starT);
	#endif
	#ifdef PROFILE_ON
	t_ex += 1;
		//FILE *ft=fopen("time.txt","a");
		//fprintf(ft,"%d \n", t_cn/t_ex);
		//fprintf(ft,"%d %d %d \n", t_cn, t_vn, t_pj);
		//fclose(ft);
	#endif
		if(allVerified == 0)
		{
			mAlgorithmConverge = true;
			mValidCodeword     = true;
			break;
		}
	}

	//
	// MEASURE OF THE NUMBER OF EXECUTION
	//
//	#ifdef PROFILE_ON
//		t_ex += 1;
//	#endif

}
예제 #12
0
파일: backend.cpp 프로젝트: chemecse/mesa
void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(BENullBackend, pDC->drawId);
    ///@todo: handle center multisample pattern
    AR_BEGIN(BESetup, pDC->drawId);

    const API_STATE &state = GetApiState(pDC);

    BarycentricCoeffs coeffs;
    SetupBarycentricCoeffs(&coeffs, work);

    uint8_t *pDepthBuffer, *pStencilBuffer;
    SetupRenderBuffers(NULL, &pDepthBuffer, &pStencilBuffer, 0, renderBuffers);

    SWR_PS_CONTEXT psContext;
    // skip SetupPixelShaderContext(&psContext, ...); // not needed here

    AR_END(BESetup, 0);

    simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));

    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
    {
        simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));

        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));

        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
        {
            // iterate over active samples
            unsigned long sample = 0;
            uint32_t sampleMask = state.blendState.sampleMask;
            while (_BitScanForward(&sample, sampleMask))
            {
                sampleMask &= ~(1 << sample);

                simdmask coverageMask = work.coverageMask[sample] & MASK;

                if (coverageMask)
                {
                    // offset depth/stencil buffers current sample
                    uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
                    uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);

                    if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
                    {
                        static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");

                        const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));

                        const float minz = state.depthBoundsState.depthBoundsTestMinValue;
                        const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;

                        coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
                    }

                    AR_BEGIN(BEBarycentric, pDC->drawId);

                    // calculate per sample positions
                    psContext.vX.sample = _simd_add_ps(vXSamplePosUL, samplePos.vX(sample));
                    psContext.vY.sample = _simd_add_ps(vYSamplePosUL, samplePos.vY(sample));

                    CalcSampleBarycentrics(coeffs, psContext);

                    // interpolate and quantize z
                    psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
                    psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);

                    AR_END(BEBarycentric, 0);

                    // interpolate user clip distance if available
                    if (state.backendState.clipDistanceMask)
                    {
                        coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
                    }

                    simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
                    simdscalar stencilPassMask = vCoverageMask;

                    AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
                    simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
                        psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
                    AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
                        pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
                    AR_END(BEEarlyDepthTest, 0);

                    uint32_t statMask = _simd_movemask_ps(depthPassMask);
                    uint32_t statCount = _mm_popcnt_u32(statMask);
                    UPDATE_STAT_BE(DepthPassCount, statCount);
                }

            Endtile:
                ATTR_UNUSED;
                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
            }

            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;

            vXSamplePosUL = _simd_add_ps(vXSamplePosUL, dx);
        }

        vYSamplePosUL = _simd_add_ps(vYSamplePosUL, dy);
    }

    AR_END(BENullBackend, 0);
}
예제 #13
0
void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(BESampleRateBackend, pDC->drawId);
    AR_BEGIN(BESetup, pDC->drawId);

    const API_STATE &state = GetApiState(pDC);

    BarycentricCoeffs coeffs;
    SetupBarycentricCoeffs(&coeffs, work);

    SWR_PS_CONTEXT psContext;
    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
    SetupPixelShaderContext<T>(&psContext, samplePos, work);

    uint8_t *pDepthBuffer, *pStencilBuffer;
    SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers);

    AR_END(BESetup, 0);

    psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));

    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));

    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
    {
        psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));

        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));

        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
        {
#if USE_8x2_TILE_BACKEND
            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
#endif
            if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
            {
                const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];

                generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
            }

            AR_BEGIN(BEBarycentric, pDC->drawId);

            CalcPixelBarycentrics(coeffs, psContext);

            CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);

            AR_END(BEBarycentric, 0);

            for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
            {
                simdmask coverageMask = work.coverageMask[sample] & MASK;

                if (coverageMask)
                {
                    // offset depth/stencil buffers current sample
                    uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
                    uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);

                    if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
                    {
                        static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");

                        const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));

                        const float minz = state.depthBoundsState.depthBoundsTestMinValue;
                        const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;

                        coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
                    }

                    AR_BEGIN(BEBarycentric, pDC->drawId);

                    // calculate per sample positions
                    psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
                    psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));

                    CalcSampleBarycentrics(coeffs, psContext);

                    // interpolate and quantize z
                    psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
                    psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);

                    AR_END(BEBarycentric, 0);

                    // interpolate user clip distance if available
                    if (state.backendState.clipDistanceMask)
                    {
                        coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
                    }

                    simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
                    simdscalar depthPassMask = vCoverageMask;
                    simdscalar stencilPassMask = vCoverageMask;

                    // Early-Z?
                    if (T::bCanEarlyZ)
                    {
                        AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
                            psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
                        AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
                        AR_END(BEEarlyDepthTest, 0);

                        // early-exit if no samples passed depth or earlyZ is forced on.
                        if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
                        {
                            DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
                                pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);

                            if (!_simd_movemask_ps(depthPassMask))
                            {
                                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
                                continue;
                            }
                        }
                    }

                    psContext.sampleIndex = sample;
                    psContext.activeMask = _simd_castps_si(vCoverageMask);

                    // execute pixel shader
                    AR_BEGIN(BEPixelShader, pDC->drawId);
                    UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
                    state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
                    AR_END(BEPixelShader, 0);

                    vCoverageMask = _simd_castsi_ps(psContext.activeMask);

                    // late-Z
                    if (!T::bCanEarlyZ)
                    {
                        AR_BEGIN(BELateDepthTest, pDC->drawId);
                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
                            psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
                        AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
                        AR_END(BELateDepthTest, 0);

                        if (!_simd_movemask_ps(depthPassMask))
                        {
                            // need to call depth/stencil write for stencil write
                            DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
                                pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);

                            work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
                            continue;
                        }
                    }

                    uint32_t statMask = _simd_movemask_ps(depthPassMask);
                    uint32_t statCount = _mm_popcnt_u32(statMask);
                    UPDATE_STAT_BE(DepthPassCount, statCount);

                    // output merger
                    AR_BEGIN(BEOutputMerger, pDC->drawId);
#if USE_8x2_TILE_BACKEND
                    OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, useAlternateOffset);
#else
                    OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask);
#endif

                    // do final depth write after all pixel kills
                    if (!state.psState.forceEarlyZ)
                    {
                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
                            pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
                    }
                    AR_END(BEOutputMerger, 0);
                }