/// Returns the number of set bits in `mask` plus, for every block index in
/// (iblock, bound), the number of set bits in the byte mask produced by
/// comparing `m.blocks[index]` with `block1`.
///
/// The sum is accumulated in a `uint8_t`, so it wraps modulo 256.
template <class T>
inline uint8_t generator_iter<T>::count()
{
    // Start with the bits still pending in the current block's mask.
    uint8_t total = _mm_popcnt_u32(mask);

    // Add the matches of every remaining block.
    ind_t blockIndex = iblock + 1;
    while (blockIndex < bound) {
        total += _mm_popcnt_u32(movemask_epi8(m.blocks[blockIndex] == block1));
        blockIndex++;
    }
    return total;
};
/*
 * Queries the processor count and probes for the POPCNT instruction.
 *
 * Windows: the processor count comes from GetSystemInfo(). Config.Threads is
 * raised to that count when it is currently smaller. When POPCNT is enabled,
 * one popcnt instruction is executed as a probe; under MSVC the probe runs
 * inside __try/__except so that an exception accepted by filterFunc() leaves
 * the flag at 0. The result is printed to stdout.
 *
 * Other platforms: nothing is probed or printed and 1 is returned, so the
 * function always yields a defined value (previously control fell off the end
 * of a non-void function, which is undefined behaviour).
 *
 * Returns the number of processors reported by the OS (Windows) or 1.
 */
static int getCpuInfo( )
{
#ifdef _WIN32
    SYSTEM_INFO si;
    GetSystemInfo(&si);
    if (si.dwNumberOfProcessors > Config.Threads)
    {
        Config.Threads = si.dwNumberOfProcessors;
    }

    int popcnt = 0;
#if POPCNT && _MSC_VER
    __try
    {
        popcnt = _mm_popcnt_u32(1);
    }
    __except (filterFunc(GetExceptionCode()))
    {
        popcnt = 0;
    }
#elif POPCNT
    popcnt = _mm_popcnt_u32(1);
#endif
    /* dwNumberOfProcessors is a DWORD (unsigned long), hence %lu. */
    printf("Cpu cores = %lu, SSE4 with Popcnt = %d\n", si.dwNumberOfProcessors, popcnt);
    return si.dwNumberOfProcessors;
#else
    /* No detection implemented here: report a single core. */
    return 1;
#endif
}
/* Count the bits that are set to 1 in a 64-bit board. */
int CountBit(BitBoard bit)
{
	/* Split the board into its lower and upper 32-bit halves. */
	const int lower_half = bit & 0x00000000FFFFFFFF;
	const int upper_half = (bit & 0xFFFFFFFF00000000) >> 32;

	/* One hardware popcount per half (a commented-out bit-twiddling
	   variant of the same computation used to live here). */
	return _mm_popcnt_u32(lower_half) + _mm_popcnt_u32(upper_half);
}
////////////////////////////////////////////////////////////////////////// // @brief processes a single decl from the streamout stream. Reads 4 components from the input // stream and writes N components to the output buffer given the componentMask or if // a hole, just increments the buffer pointer // @param pStream - pointer to current attribute // @param pOutBuffers - pointers to the current location of each output buffer // @param decl - input decl void buildDecl(Value* pStream, Value* pOutBuffers[4], const STREAMOUT_DECL& decl) { // @todo add this to x86 macros Function* maskStore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskstore_ps); uint32_t numComponents = _mm_popcnt_u32(decl.componentMask); uint32_t packedMask = (1 << numComponents) - 1; if (!decl.hole) { // increment stream pointer to correct slot Value* pAttrib = GEP(pStream, C(4 * decl.attribSlot)); // load 4 components from stream Type* simd4Ty = VectorType::get(IRB()->getFloatTy(), 4); Type* simd4PtrTy = PointerType::get(simd4Ty, 0); pAttrib = BITCAST(pAttrib, simd4PtrTy); Value *vattrib = LOAD(pAttrib); // shuffle/pack enabled components Value* vpackedAttrib = VSHUFFLE(vattrib, vattrib, PackMask(decl.componentMask)); // store to output buffer // cast SO buffer to i8*, needed by maskstore Value* pOut = BITCAST(pOutBuffers[decl.bufferIndex], PointerType::get(mInt8Ty, 0)); // cast input to <4xfloat> Value* src = BITCAST(vpackedAttrib, simd4Ty); CALL(maskStore, {pOut, ToMask(packedMask), src}); } // increment SO buffer pOutBuffers[decl.bufferIndex] = GEP(pOutBuffers[decl.bufferIndex], C(numComponents)); }
int _normoptimized(const unsigned char* a, const unsigned char* b, const int n) { #ifdef USE_PENTIUM4 int _ddt = 0; unsigned int _dis, _dis2, _dis3, _dis4; long int _cnt, _cnt2, _cnt3, _cnt4; for (int _i = 0; _i < n; _i+=16){ // xor 128 bits __m128i a0 = _mm_loadu_si128((const __m128i*)(a + _i)); __m128i b0 = _mm_loadu_si128((const __m128i*)(b + _i)); b0 = _mm_xor_si128(a0, b0); __m128i d = _mm_srli_si128(b0, 4); __m128i e = _mm_srli_si128(d,4); __m128i f = _mm_srli_si128(e,4); _dis = _mm_cvtsi128_si32(b0); _dis2 = _mm_cvtsi128_si32(d); _dis3 = _mm_cvtsi128_si32(e); _dis4 = _mm_cvtsi128_si32(f); // now count _cnt = _mm_popcnt_u32(_dis); _cnt2 = _mm_popcnt_u32(_dis2); _cnt3 = _mm_popcnt_u32(_dis3); _cnt4 = _mm_popcnt_u32(_dis4); _ddt += _cnt + _cnt2 + _cnt3 + _cnt4; } return _ddt; #else int _ddt = 0; unsigned long int _dis, _dis2; long int _cnt, _cnt2; for (int _i = 0; _i < n; _i+=16){ // xor 128 bits __m128i a0 = _mm_loadu_si128((const __m128i*)(a + _i)); __m128i b0 = _mm_loadu_si128((const __m128i*)(b + _i)); b0 = _mm_xor_si128(a0, b0); a0 = _mm_srli_si128(b0,8); _dis = _mm_cvtsi128_si64(b0); _dis2 = _mm_cvtsi128_si64(a0); _cnt = _mm_popcnt_u64(_dis); _cnt2 = _mm_popcnt_u64(_dis2); _ddt += _cnt + _cnt2; // other commmands don't give any advantage } return _ddt; #endif }
/*
 * Sequential rank over a byte sequence using the SSE4.2 string-compare
 * instruction.
 *
 * The first `position` bytes starting at `vector` are compared, 16 bytes at a
 * time, against a register filled with `searchedByte`, and the set bits of
 * each comparison mask are summed.
 *
 * The last, partial window is compared with explicit length r = position % 16
 * and the popcount is corrected by (r - 16).
 * NOTE(review): this correction presumes that `mode` (defined elsewhere, must
 * be a compile-time constant) sets the mask bit for every position beyond the
 * explicit length, as an "equal each" bit-mask compare does — confirm against
 * the definition of `mode`.
 * NOTE(review): the final window always loads 16 bytes, even when r == 0, so
 * the buffer must be readable up to the next 16-byte boundary after `position`.
 *
 * vector        start of the byte sequence (no alignment required)
 * searchedByte  byte value to count
 * position      number of leading bytes to inspect
 * returns       accumulated count as described above
 */
uint seqRank(uint *vector, byte searchedByte, uint position)
{
    uint i, cont = 0;
    __m128i patt, window, returnValue;
    uint d = position >> 4, r = position & 0xf;   /* full windows, leftover bytes */

    /* 16 copies of the searched byte */
    patt = _mm_set1_epi8((char) searchedByte);

    const __m128i *text = (const __m128i *) vector;
    for (i = 0; i < d; i++) {
        window = _mm_loadu_si128(text);
        returnValue = _mm_cmpestrm(patt, 16, window, 16, mode);
        cont += _mm_popcnt_u32(_mm_extract_epi32(returnValue, 0));
        text++;
    }

    /* partial window: only the first r bytes are significant */
    window = _mm_loadu_si128(text);
    returnValue = _mm_cmpestrm(patt, r, window, r, mode);
    cont += _mm_popcnt_u32(_mm_extract_epi32(returnValue, 0)) + r - 16;
    return cont;
}
void SGMStereo::addPixelwiseHamming(const int* leftCensusRow, const int* rightCensusRow) { for (int x = 0; x < disparityTotal_; ++x) { int leftCencusCode = leftCensusRow[x]; int hammingDistance = 0; for (int d = 0; d <= x; ++d) { int rightCensusCode = rightCensusRow[x - d]; hammingDistance = static_cast<int>(_mm_popcnt_u32(static_cast<unsigned int>(leftCencusCode^rightCensusCode))); pixelwiseCostRow_[disparityTotal_*x + d] += static_cast<unsigned char>(hammingDistance*censusWeightFactor_); } hammingDistance = static_cast<unsigned char>(hammingDistance*censusWeightFactor_); for (int d = x + 1; d < disparityTotal_; ++d) { pixelwiseCostRow_[disparityTotal_*x + d] += hammingDistance; } } for (int x = disparityTotal_; x < width_; ++x) { int leftCencusCode = leftCensusRow[x]; for (int d = 0; d < disparityTotal_; ++d) { int rightCensusCode = rightCensusRow[x - d]; int hammingDistance = static_cast<int>(_mm_popcnt_u32(static_cast<unsigned int>(leftCencusCode^rightCensusCode))); pixelwiseCostRow_[disparityTotal_*x + d] += static_cast<unsigned char>(hammingDistance*censusWeightFactor_); } } }
/// Stores the attribute linkage for the current draw state.
///
/// @param hContext  context handle used to look up the draw state
/// @param mask      linkage mask; the number of set bits becomes linkageCount
/// @param pMap      linkage map with at least linkageCount entries, or null
///                  to use the identity mapping 0..31
void SwrSetLinkage(
    HANDLE hContext,
    uint32_t mask,
    const uint8_t* pMap)
{
    API_STATE* pState = GetDrawState(GetContext(hContext));

    // Default map: attribute i links to slot i.
    static const uint8_t IDENTITY_MAP[] =
    {
         0,  1,  2,  3,  4,  5,  6,  7,
         8,  9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23,
        24, 25, 26, 27, 28, 29, 30, 31,
    };
    static_assert(sizeof(IDENTITY_MAP) == sizeof(pState->linkageMap),
        "Update for new value of MAX_ATTRIBUTES");

    pState->linkageMask = mask;
    pState->linkageCount = _mm_popcnt_u32(mask);

    // Only the first linkageCount entries of the chosen map are copied.
    const uint8_t* pSource = pMap ? pMap : IDENTITY_MAP;
    memcpy(pState->linkageMap, pSource, pState->linkageCount);
}
// Single-sample pixel backend for one raster tile.
//
// Walks the tile starting at (x, y) in steps of SIMD_TILE_X_DIM x
// SIMD_TILE_Y_DIM. For every step whose coverage is non-zero it runs, in
// order: optional depth-bounds test, input coverage generation, barycentric
// and depth interpolation, optional user-clip masking, early depth/stencil
// test (when T::bCanEarlyZ), the pixel shader, late depth/stencil test
// (when !T::bCanEarlyZ), the output merger and the final depth/stencil
// write. After every step (covered or not) the coverage masks are shifted and
// the color/depth/stencil pointers are advanced.
//
// NOTE(review): `T` is used below but no template header is visible in this
// excerpt, and the function's closing brace is not part of it either.
//
// @param pDC           draw context (provides drawId and the SWR context)
// @param workerId      index of the worker thread; selects the per-worker private data
// @param x, y          tile origin used to seed the pixel position vectors
// @param work          triangle descriptor; its coverage masks are consumed (shifted) here
// @param renderBuffers hot-tile buffers handed to SetupRenderBuffers
void BackendSingleSample(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC& work, RenderOutputBuffers& renderBuffers)
{
    RDTSC_BEGIN(BESingleSampleBackend, pDC->drawId);
    RDTSC_BEGIN(BESetup, pDC->drawId);

    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    const API_STATE& state = GetApiState(pDC);

    BarycentricCoeffs coeffs;
    SetupBarycentricCoeffs(&coeffs, work);

    SWR_PS_CONTEXT psContext;
    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
    SetupPixelShaderContext<T>(&psContext, samplePos, work);

    uint8_t *pDepthBuffer, *pStencilBuffer;
    SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers);

    RDTSC_END(BESetup, 1);

    // per-lane Y positions (upper-left and center) of the first SIMD tile row
    psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));

    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));

    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
    {
        // restart the X positions at the left edge of the tile for every row
        psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));

        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));

        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
        {
#if USE_8x2_TILE_BACKEND
            // true on every other SIMD tile in X
            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
#endif
            // coverage bits of the current SIMD tile (low bits of the running mask)
            simdmask coverageMask = work.coverageMask[0] & MASK;

            if (coverageMask)
            {
                if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
                {
                    static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");

                    // depth bounds test works on the depth values already stored in the hot tile
                    const simdscalar z = _simd_load_ps(reinterpret_cast<const float*>(pDepthBuffer));

                    const float minz = state.depthBoundsState.depthBoundsTestMinValue;
                    const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;

                    coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
                }

                if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
                {
                    // inner-conservative input coverage uses its own mask
                    const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];

                    generateInputCoverage<T, T::InputCoverage>( pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
                }

                RDTSC_BEGIN(BEBarycentric, pDC->drawId);

                CalcPixelBarycentrics(coeffs, psContext);

                CalcCentroid<T, true>( &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);

                // interpolate and quantize z
                psContext.vZ = vplaneps( coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);

                RDTSC_END(BEBarycentric, 1);

                // interpolate user clip distance if available
                if (state.backendState.clipDistanceMask)
                {
                    coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center);
                }

                // expand the bit mask into a per-lane vector mask
                simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
                simdscalar depthPassMask = vCoverageMask;
                simdscalar stencilPassMask = vCoverageMask;

                // Early-Z?
                if (T::bCanEarlyZ)
                {
                    RDTSC_BEGIN(BEEarlyDepthTest, pDC->drawId);
                    depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
                    AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
                    RDTSC_END(BEEarlyDepthTest, 0);

                    // early-exit if no pixels passed depth or earlyZ is forced on
                    if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
                    {
                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);

                        if (!_simd_movemask_ps(depthPassMask))
                        {
                            goto Endtile;
                        }
                    }
                }

                psContext.sampleIndex = 0;
                psContext.activeMask = _simd_castps_si(vCoverageMask);

                // execute pixel shader
                RDTSC_BEGIN(BEPixelShader, pDC->drawId);
                state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext);
                RDTSC_END(BEPixelShader, 0);

                // update stats
                UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
                AR_EVENT(PSStats(psContext.stats.numInstExecuted));

                // pick up the active mask as left by the shader invocation
                vCoverageMask = _simd_castsi_ps(psContext.activeMask);

                // late-Z
                if (!T::bCanEarlyZ)
                {
                    RDTSC_BEGIN(BELateDepthTest, pDC->drawId);
                    depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
                    AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
                    RDTSC_END(BELateDepthTest, 0);

                    if (!_simd_movemask_ps(depthPassMask))
                    {
                        // need to call depth/stencil write for stencil write
                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
                        goto Endtile;
                    }
                }
                else
                {
                    // for early z, consolidate discards from shader
                    // into depthPassMask
                    depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask);
                }

                // count the lanes that passed the depth test
                uint32_t statMask = _simd_movemask_ps(depthPassMask);
                uint32_t statCount = _mm_popcnt_u32(statMask);
                UPDATE_STAT_BE(DepthPassCount, statCount);

                // output merger
                RDTSC_BEGIN(BEOutputMerger, pDC->drawId);
#if USE_8x2_TILE_BACKEND
                OutputMerger8x2(pDC, psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, useAlternateOffset, workerId);
#else
                // NOTE(review): workerId is passed twice here; check against the
                // OutputMerger4x2 signature (not visible in this excerpt).
                OutputMerger4x2(pDC, psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, workerId, workerId);
#endif

                // do final depth write after all pixel kills
                if (!state.psState.forceEarlyZ)
                {
                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
                }
                RDTSC_END(BEOutputMerger, 0);
            }

        Endtile:
            // reached for every SIMD tile, covered or not: advance masks and pointers
            RDTSC_BEGIN(BEEndTile, pDC->drawId);

            // drop the coverage bits of the SIMD tile just processed
            work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
            if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
            {
                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
            }

#if USE_8x2_TILE_BACKEND
            // color pointers advance by two SIMD tiles, but only on every other step
            if (useAlternateOffset)
            {
                DWORD rt;
                uint32_t rtMask = state.colorHottileEnable;
                while (_BitScanForward(&rt, rtMask))
                {
                    rtMask &= ~(1 << rt);
                    psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
                }
            }
#else
            // advance the color pointer of every enabled render target by one SIMD tile
            DWORD rt;
            uint32_t rtMask = state.colorHottileEnable;
            while (_BitScanForward(&rt, rtMask))
            {
                rtMask &= ~(1 << rt);
                psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
            }
#endif
            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;

            RDTSC_END(BEEndTile, 0);

            // step the pixel positions to the next SIMD tile in X
            psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
        }

        // step the pixel positions to the next SIMD tile row
        psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
    }
0x4,0x5,0x6,0x7,0x8,0x9,0xa,0xb,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
0x0,0x1,0x2,0x3,0x8,0x9,0xa,0xb,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
0x8,0x9,0xa,0xb,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
0x0,0x1,0x2,0x3,0x4,0x5,0x6,0x7,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
0x4,0x5,0x6,0x7,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
0x0,0x1,0x2,0x3,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF
};
// NOTE(review): the rows above are the tail of a table whose declaration lies
// outside this excerpt; presumably it is `uniqshuf`, the 16-byte shuffle keys
// indexed by store_unique below.

// Write vector `new` to `output` while omitting repeated values, assuming that
// the previously written vector was `old`.
//
// A 32-bit lane of `new` is a repeat when it equals the value right before it
// (the preceding lane of `new`, or the last lane of `old` for lane 0). The
// remaining lanes are moved to the front by a byte shuffle.
//
// Always stores 16 bytes at `output` (unaligned store), whatever the count
// returned; only the first `return value` 32-bit entries are new values.
//
// Returns the number of non-repeated values (0..4).
static int store_unique(__m128i old,__m128i new, uint32_t * output) {
    // vecTmp = { old[3], new[0], new[1], new[2] }: each lane's predecessor
    __m128i vecTmp = _mm_alignr_epi8(new, old, 16-4);
    // movemask yields 4 identical bits per 32-bit lane that equals its predecessor
    int M = _mm_movemask_epi8(_mm_cmpeq_epi32(vecTmp,new));//_pdep_u32(,0x1111);
    // keep one bit per lane -> 4-bit index of the repeated lanes
    M=_pext_u32(M,0x1111);
    int numberofnewvalues = 4 - _mm_popcnt_u32(M);
    // select the 16-byte shuffle key for this repeat pattern
    __m128i key = _mm_lddqu_si128((const __m128i* )uniqshuf + M);
    __m128i val = _mm_shuffle_epi8(new,key);
    _mm_storeu_si128((__m128i* )output,val);
    return numberofnewvalues;
}

// Working in-place, this function overwrites the repeated values: every
// element that differs from its immediate predecessor is kept and compacted
// towards the front of `out`. Element 0 is always kept, so the returned length
// starts at 1.
// NOTE(review): the function's closing brace is not part of this excerpt.
static uint32_t unique(uint32_t * out, uint32_t len) {
    uint32_t pos = 1;
    for(uint32_t i = 1; i < len; ++i) {
        if(out[i] != out[i-1]) {
            out[pos++] = out[i];
        }
    }
    return pos;
// ADMM decoding loop using AVX/AVX2, specialised for check nodes of degree 6
// or 7 and variable nodes of degree 2, 3 or 6.
//
// Each of the at most maxIteration iterations runs
//   1. the variable-node (VN) kernel, which writes OutputFromDecoder from
//      zReplica, Lambda and _LogLikelihoodRatio, and
//   2. the check-node (CN) kernel, which gathers the VN outputs of every
//      check, counts checks with odd hard-decision parity, and updates
//      zReplica / Lambda through mp.projection_deg6 / mp.projection_deg7
//      when the vector to project moved by more than 1e-5 in some component.
// The loop stops early, setting mAlgorithmConverge and mValidCodeword, as soon
// as no check has odd parity. mIteration holds the 1-based index of the last
// iteration started.
//
// A check node with a degree other than 6 or 7 terminates the process through
// exit(0).
void Decoder::ADMMDecoder_deg_6_7_2_3_6()
{
    int maxIter = maxIteration;
    float mu = 5.5f;
    float tableau[12] = { 0.0f };   // per-degree penalty coefficients, indexed by VN degree

    // mu and the per-degree coefficients depend on the code dimensions
    if ((mBlocklength == 576) && (mNChecks == 288))
    {
        mu = 3.37309f;//penalty
        tableau[2] = 0.00001f;
        tableau[3] = 2.00928f;
        tableau[6] = 4.69438f;
    }
    else if((mBlocklength == 2304) && (mNChecks == 1152) )
    {
        mu = 3.81398683f;//penalty
        tableau[2] = 0.29669288f;
        tableau[3] = 0.46964023f;
        tableau[6] = 3.19548154f;
    }
    else
    {
        mu = 5.5;//penalty
        tableau[2] = 0.8f;
        tableau[3] = 0.8f;
        tableau[6] = 0.8f;
    }

    const float rho = 1.9f; //over relaxation parameter;
    const float un_m_rho = 1.0 - rho;   // "un moins rho" = 1 - rho
    const auto _rho = _mm256_set1_ps( rho );
    const auto _un_m_rho = _mm256_set1_ps( un_m_rho );
    float tableaX[12];   // tableau[i] / mu; only read by the scalar (#else) VN paths

    //
    // PRECOMPUTE THE CONSTANTS
    //
    #pragma unroll
    for (int i = 0; i < 7; i++)
    {
        tableaX[i] = tableau[ i ] / mu;
    }

    // broadcast constants for the vectorised VN kernel
    const auto t_mu = _mm256_set1_ps ( mu );
    const auto t2_amu = _mm256_set1_ps ( tableau[ 2 ] / mu );
    const auto t3_amu = _mm256_set1_ps ( tableau[ 3 ] / mu );
    const auto t6_amu = _mm256_set1_ps ( tableau[ 6 ] / mu );
    const auto t2_2amu = _mm256_set1_ps ( 2.0f * tableau[ 2 ] / mu );
    const auto t3_2amu = _mm256_set1_ps ( 2.0f * tableau[ 3 ] / mu );
    const auto t6_2amu = _mm256_set1_ps ( 2.0f * tableau[ 6 ] / mu );
    const auto t2_deg = _mm256_set1_ps ( 2.0f );
    const auto t3_deg = _mm256_set1_ps ( 3.0f );
    const auto t6_deg = _mm256_set1_ps ( 6.0f );
    const auto zero = _mm256_set1_ps ( 0.0f );
    const auto un = _mm256_set1_ps ( 1.0f );
    const __m256 a = _mm256_set1_ps ( 0.0f );
    const __m256 b = _mm256_set1_ps ( 0.5f );

    //////////////////////////////////////////////////////////////////////////////////////
    // Initial state: Lambda = 0, zReplica = latestProjVector = 0.5.
    // Uses aligned stores, 8 floats at a time.
    // NOTE(review): assumes the three arrays are 32-byte aligned and padded to
    // a multiple of 8 floats — confirm at the allocation site.
    #pragma unroll
    for( int j = 0; j < _mPCheckMapSize; j+=8 )
    {
        _mm256_store_ps(&Lambda [j], a);
        _mm256_store_ps(&zReplica[j], b);
        _mm256_store_ps(&latestProjVector[j], b);
    }
    //////////////////////////////////////////////////////////////////////////////////////

    for(int i = 0; i < maxIter; i++)
    {
        int ptr = 0;   // running index into t_row, shared by all variable nodes
        mIteration = i + 1;

        //
        // MEASURE OF THE VN EXECUTION TIME
        //
        #ifdef PROFILE_ON
        const auto start = timer();
        #endif

        //
        // VN processing kernel
        //
        // With the "#if 1" paths active, each pass handles 8 consecutive
        // variable nodes (j advances by 7 inside plus 1 from the loop). The
        // branch is chosen from VariableDegree[j] alone.
        // NOTE(review): presumably all 8 nodes of a group share that degree
        // and _mBlocklength is a multiple of 8 — confirm.
        #pragma unroll
        for (int j = 0; j < _mBlocklength; j++)
        {
            const int degVn = VariableDegree[j];
            float M[8] __attribute__((aligned(64)));

            if( degVn == 2 ){
#if 1
                const int dVN = 2;
                // M[qq] = sum over the dVN edges of node j+qq of (zReplica + Lambda)
                for(int qq = 0; qq < 8; qq++)
                {
                    M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                    ptr += 1;
                    #pragma unroll
                    for(int k = 1; k < dVN; k++)
                    {
                        M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                        ptr += 1;
                    }
                }
                const auto m = _mm256_loadu_ps( M );
                const auto llr = _mm256_loadu_ps( &_LogLikelihoodRatio[j] );
                const auto t1 = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu));
                const auto xx = _mm256_div_ps(_mm256_sub_ps(t1, t2_amu), _mm256_sub_ps(t2_deg, t2_2amu));
                // clamp to [0, 1]
                const auto vMin = _mm256_max_ps(_mm256_min_ps(xx, un) , zero);
                _mm256_storeu_ps(&OutputFromDecoder[j], vMin);
                j += 7;
#else
                // scalar reference path (disabled): one variable node per pass
                const int degVN = 2;
                float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                #pragma unroll
                for(int k = 1; k < degVN; k++)
                    temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]);
                ptr += degVN;
                const float _amu_ = tableaX[ degVN ];
                const float _2_amu_ = _amu_+ _amu_;
                const float llr = _LogLikelihoodRatio[j];
                const float t = temp - llr / mu;
                const float xx = (t - _amu_)/(degVn - _2_amu_);
                const float vMax = std::min(xx, 1.0f);
                const float vMin = std::max(vMax, 0.0f);
                OutputFromDecoder[j] = vMin;
#endif
            }else if( degVn == 3 ){
#if 1
                const int dVN = 3;
                // same computation as the degree-2 branch, with 3 edges per node
                for(int qq = 0; qq < 8; qq++)
                {
                    M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                    ptr += 1;
                    #pragma unroll
                    for(int k = 1; k < dVN; k++)
                    {
                        M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                        ptr += 1;
                    }
                }
                const auto m = _mm256_loadu_ps( M );
                const auto llr = _mm256_loadu_ps( &_LogLikelihoodRatio[j] );
                const auto t1 = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu));
                const auto xx = _mm256_div_ps(_mm256_sub_ps(t1, t3_amu), _mm256_sub_ps(t3_deg, t3_2amu));
                const auto vMin = _mm256_max_ps(_mm256_min_ps(xx, un) , zero);
                _mm256_storeu_ps(&OutputFromDecoder[j], vMin);
                j += 7;
#else
                // scalar reference path (disabled)
                const int degVN = 3;
                float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                #pragma unroll
                for(int k = 1; k < degVN; k++)
                    temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]);
                ptr += degVN;
                const float _amu_ = tableaX[ degVN ];
                const float _2_amu_ = _amu_+ _amu_;
                const float llr = _LogLikelihoodRatio[j];
                const float t = temp - llr / mu;
                const float xx = (t - _amu_)/(degVn - _2_amu_);
                const float vMax = std::min(xx, 1.0f);
                const float vMin = std::max(vMax, 0.0f);
                OutputFromDecoder[j] = vMin;
#endif
            }else if( degVn == 6 ){
#if 1
                const int dVN = 6;
                // same computation as the degree-2 branch, with 6 edges per node
                for(int qq = 0; qq < 8; qq++)
                {
                    M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                    ptr += 1;
                    #pragma unroll
                    for(int k = 1; k < dVN; k++)
                    {
                        M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                        ptr += 1;
                    }
                }
                const auto m = _mm256_loadu_ps( M );
                const auto llr = _mm256_loadu_ps( &_LogLikelihoodRatio[j] );
                const auto t1 = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu));
                const auto xx = _mm256_div_ps(_mm256_sub_ps(t1, t6_amu), _mm256_sub_ps(t6_deg, t6_2amu));
                const auto vMin = _mm256_max_ps(_mm256_min_ps(xx, un) , zero);
                _mm256_storeu_ps(&OutputFromDecoder[j], vMin);
                j += 7;
#else
                // scalar reference path (disabled)
                const int degVN = 6;
                float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                #pragma unroll
                for(int k = 1; k < degVN; k++)
                    temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]);
                ptr += degVN;
                const float _amu_ = tableaX[ degVN ];
                const float _2_amu_ = _amu_+ _amu_;
                const float llr = _LogLikelihoodRatio[j];
                const float t = temp - llr / mu;
                const float xx = (t - _amu_)/(degVn - _2_amu_);
                const float vMax = std::min(xx, 1.0f);
                const float vMin = std::max(vMax, 0.0f);
                OutputFromDecoder[j] = vMin;
#endif
            }
        }

        //
        // MEASURE OF THE VN EXECUTION TIME
        //
        #ifdef PROFILE_ON
        t_vn += (timer() - start);
        #endif

        //
        // CN processing kernel
        //
        int CumSumCheckDegree = 0; // cumulative position of current edge in factor graph
        int allVerified = 0;       // number of checks whose hard-decision parity is odd
        float vector_before_proj[8] __attribute__((aligned(64)));   // not used below
        const auto zero = _mm256_set1_ps ( 0.0f );   // shadows the outer `zero`
        // lane masks enabling the low 6 / low 7 lanes (set_epi32 lists the highest lane first)
        const auto mask_6 = _mm256_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
        const auto mask_7 = _mm256_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
        const auto dot5 = _mm256_set1_ps( 0.5f );

        //
        // MEASURE OF THE CN EXECUTION TIME
        //
        #ifdef PROFILE_ON
        const auto starT = timer();
        #endif

        // projection threshold ("seuil"): skip the projection when no component moved by more than this
        const auto seuilProj = _mm256_set1_ps( 1e-5f );

        for(int j = 0; j < _mNChecks; j++)
        {
            if( CheckDegree[j] == 6 ){
                const int cDeg6 = 0x3F;   // bit mask of the 6 valid lanes
                // NOTE(review): the unmasked 8-wide loads below read 2 entries
                // past this check's 6 edges; the arrays presumably carry enough
                // padding for the last check — confirm.
                const auto offsets = _mm256_loadu_si256 ((const __m256i*)&t_col1 [CumSumCheckDegree]);
                const auto xpred = _mm256_mask_i32gather_ps (zero, OutputFromDecoder, offsets, _mm256_castsi256_ps(mask_6), 4);
                // hard decision: a lane counts as 1 when its VN output is > 0.5
                const auto synd = _mm256_cmp_ps( xpred, dot5, _CMP_GT_OS );
                int test = (_mm256_movemask_ps( synd ) & cDeg6); // deg 6
                const auto syndrom = _mm_popcnt_u32( test );
                const auto _Replica = _mm256_loadu_ps( &zReplica[CumSumCheckDegree]);
                const auto _ambda = _mm256_loadu_ps( &Lambda [CumSumCheckDegree]);
                // vect_proj = rho * xpred + (1 - rho) * zReplica - Lambda
                const auto v1 = _mm256_mul_ps (xpred, _rho );
                const auto v2 = _mm256_mul_ps ( _Replica, _un_m_rho );
                const auto v3 = _mm256_add_ps ( v1, v2 );
                const auto vect_proj = _mm256_sub_ps ( v3, _ambda );

                //
                // PERFORM THE PROJECTION
                //
                // odd number of ones -> this check is not satisfied
                allVerified += ( syndrom & 0x01 );

                //
                // MEASURE OF THE PROJECTION EXECUTION TIME
                //
                #ifdef PROFILE_ON
                const auto START = timer();
                #endif
                // |vect_proj - latestProjVector| per lane (sign bit cleared)
                const auto latest = _mm256_loadu_ps(&latestProjVector[CumSumCheckDegree]);
                const auto different = _mm256_sub_ps ( vect_proj, latest );
                const auto maskAbsol = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
                const auto absolute = _mm256_and_ps ( different, maskAbsol );
                const auto despass = _mm256_cmp_ps( absolute, seuilProj, _CMP_GT_OS );
                // skip when none of the 6 valid lanes exceeds the threshold
                int skip = (_mm256_movemask_ps( despass ) & cDeg6) == 0x00; // degree 6

                if( skip == false )
                {
                    // Lambda += rho * (z - xpred) + (1 - rho) * (z - zReplica); zReplica = z
                    const auto _ztemp = mp.projection_deg6( vect_proj );
                    const auto _ztemp1 = _mm256_sub_ps(_ztemp, xpred );
                    const auto _ztemp2 = _mm256_sub_ps(_ztemp, _Replica );
                    const auto _ztemp3 = _mm256_mul_ps(_ztemp1, _rho);
                    const auto _ztemp4 = _mm256_mul_ps(_ztemp2, _un_m_rho);
                    const auto nLambda = _mm256_add_ps( _ambda, _ztemp3 );
                    const auto mLambda = _mm256_add_ps( nLambda, _ztemp4 );
                    _mm256_maskstore_ps(& Lambda[CumSumCheckDegree], mask_6, mLambda);
                    _mm256_maskstore_ps(&zReplica[CumSumCheckDegree], mask_6, _ztemp);
                }
                // remember the vector for the next iteration's threshold test
                _mm256_maskstore_ps(&latestProjVector[CumSumCheckDegree], mask_6, vect_proj);

                //
                // MEASURE OF THE PROJECTION EXECUTION TIME
                //
                #ifdef PROFILE_ON
                t_pj += (timer() - START);
                #endif
                CumSumCheckDegree += 6;

            }else if( CheckDegree[j] == 7 )
            {
                // same processing as the degree-6 branch, with 7 valid lanes
                const int cDeg7 = 0x7F;
                const auto offsets = _mm256_loadu_si256 ((const __m256i*)&t_col1 [CumSumCheckDegree]);
                const auto xpred = _mm256_mask_i32gather_ps (zero, OutputFromDecoder, offsets, _mm256_castsi256_ps(mask_7), 4);
                const auto synd = _mm256_cmp_ps( xpred, dot5, _CMP_GT_OS );
                const int test = (_mm256_movemask_ps( synd ) & cDeg7); // deg 7
                const auto syndrom = _mm_popcnt_u32( test );
                const auto _Replica = _mm256_loadu_ps( &zReplica[CumSumCheckDegree]);
                const auto _ambda = _mm256_loadu_ps( &Lambda [CumSumCheckDegree]);
                const auto v1 = _mm256_mul_ps ( xpred, _rho );
                const auto v2 = _mm256_mul_ps ( _Replica, _un_m_rho );
                const auto v3 = _mm256_add_ps ( v1, v2 );
                const auto vect_proj = _mm256_sub_ps ( v3, _ambda );

                //
                // PERFORM THE PROJECTION
                //
                allVerified += ( syndrom & 0x01 );

                //
                // MEASURE OF THE PROJECTION EXECUTION TIME
                //
                #ifdef PROFILE_ON
                const auto START = timer();
                #endif
                const auto latest = _mm256_loadu_ps(&latestProjVector[CumSumCheckDegree]);
                const auto different = _mm256_sub_ps ( vect_proj, latest );
                const auto maskAbsol = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
                const auto absolute = _mm256_and_ps ( different, maskAbsol );
                const auto despass = _mm256_cmp_ps( absolute, seuilProj, _CMP_GT_OS );
                int skip = (_mm256_movemask_ps( despass ) & cDeg7) == 0x00; // degree 7

                if( skip == false )
                {
                    const auto _ztemp = mp.projection_deg7( vect_proj );
                    const auto _ztemp1 = _mm256_sub_ps(_ztemp, xpred );
                    const auto _ztemp2 = _mm256_sub_ps(_ztemp, _Replica );
                    const auto _ztemp3 = _mm256_mul_ps(_ztemp1, _rho);
                    const auto _ztemp4 = _mm256_mul_ps(_ztemp2, _un_m_rho);
                    const auto nLambda = _mm256_add_ps( _ambda, _ztemp3 );
                    const auto mLambda = _mm256_add_ps( nLambda, _ztemp4 );
                    _mm256_maskstore_ps(& Lambda [CumSumCheckDegree], mask_7, mLambda);
                    _mm256_maskstore_ps(&zReplica [CumSumCheckDegree], mask_7, _ztemp);
                }
                _mm256_maskstore_ps(&latestProjVector[CumSumCheckDegree], mask_7, vect_proj);

                //
                // MEASURE OF THE PROJECTION EXECUTION TIME
                //
                #ifdef PROFILE_ON
                t_pj += (timer() - START);
                #endif
                CumSumCheckDegree += 7;

            }else{
                // unsupported check degree: terminates the whole process
                // NOTE(review): exits with status 0 and without a message — confirm this is intended.
                exit( 0 );
            }
        }

        //
        // MEASURE OF THE CN LOOP EXECUTION TIME
        //
        #ifdef PROFILE_ON
        t_cn += (timer() - starT);
        #endif

        #ifdef PROFILE_ON
        t_ex += 1;
        //FILE *ft=fopen("time.txt","a");
        //fprintf(ft,"%d \n", t_cn/t_ex);
        //fprintf(ft,"%d %d %d \n", t_cn, t_vn, t_pj);
        //fclose(ft);
        #endif

        // every check has even parity: stop iterating
        if(allVerified == 0)
        {
            mAlgorithmConverge = true;
            mValidCodeword = true;
            break;
        }
    }

    //
    // MEASURE OF THE NUMBER OF EXECUTION
    //
    // #ifdef PROFILE_ON
    // t_ex += 1;
    // #endif
}
// Pixel backend used when no pixel shader runs: only depth/stencil are
// tested and written.
//
// Walks the tile starting at (x, y) in steps of SIMD_TILE_X_DIM x
// SIMD_TILE_Y_DIM. For every step, each sample enabled in
// state.blendState.sampleMask is processed: optional depth-bounds test,
// per-sample barycentrics and depth interpolation, optional user-clip
// masking, depth/stencil test followed immediately by the depth/stencil
// write, and the DepthPassCount statistic. The sample's coverage mask is
// shifted after each sample, the depth/stencil pointers after each step.
//
// @param pDC           draw context (provides drawId and the SWR context)
// @param workerId      index of the worker thread; not referenced directly in this body
// @param x, y          tile origin used to seed the sample position vectors
// @param work          triangle descriptor; its per-sample coverage masks are consumed (shifted) here
// @param renderBuffers hot-tile buffers handed to SetupRenderBuffers
void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
{
    // NOTE(review): pContext is not referenced directly below; presumably the
    // AR_* / UPDATE_STAT_BE macros use it — confirm before removing.
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(BENullBackend, pDC->drawId);
    ///@todo: handle center multisample pattern
    AR_BEGIN(BESetup, pDC->drawId);

    const API_STATE &state = GetApiState(pDC);

    BarycentricCoeffs coeffs;
    SetupBarycentricCoeffs(&coeffs, work);

    // no color buffers are requested (NULL color pointer array, hot-tile enable mask 0)
    uint8_t *pDepthBuffer, *pStencilBuffer;
    SetupRenderBuffers(NULL, &pDepthBuffer, &pStencilBuffer, 0, renderBuffers);

    SWR_PS_CONTEXT psContext;
    // skip SetupPixelShaderContext(&psContext, ...); // not needed here

    AR_END(BESetup, 0);

    // per-lane upper-left Y positions of the first SIMD tile row
    simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));

    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
    {
        // restart the X positions at the left edge of the tile for every row
        simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));

        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));

        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
        {
            // iterate over active samples
            unsigned long sample = 0;
            uint32_t sampleMask = state.blendState.sampleMask;
            while (_BitScanForward(&sample, sampleMask))
            {
                // clear the bit of the sample being processed
                sampleMask &= ~(1 << sample);

                // coverage bits of the current SIMD tile for this sample
                simdmask coverageMask = work.coverageMask[sample] & MASK;

                if (coverageMask)
                {
                    // offset depth/stencil buffers current sample
                    uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
                    uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);

                    if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
                    {
                        static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");

                        // depth bounds test works on the depth values already stored in the hot tile
                        const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));

                        const float minz = state.depthBoundsState.depthBoundsTestMinValue;
                        const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;

                        coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
                    }

                    AR_BEGIN(BEBarycentric, pDC->drawId);

                    // calculate per sample positions
                    psContext.vX.sample = _simd_add_ps(vXSamplePosUL, samplePos.vX(sample));
                    psContext.vY.sample = _simd_add_ps(vYSamplePosUL, samplePos.vY(sample));

                    CalcSampleBarycentrics(coeffs, psContext);

                    // interpolate and quantize z
                    psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
                    psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);

                    AR_END(BEBarycentric, 0);

                    // interpolate user clip distance if available
                    if (state.backendState.clipDistanceMask)
                    {
                        coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
                    }

                    // expand the bit mask into a per-lane vector mask
                    simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
                    simdscalar stencilPassMask = vCoverageMask;

                    // test and write back to back: no shader runs in between
                    AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
                    simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
                    AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
                    AR_END(BEEarlyDepthTest, 0);

                    // count the lanes that passed the depth test
                    uint32_t statMask = _simd_movemask_ps(depthPassMask);
                    uint32_t statCount = _mm_popcnt_u32(statMask);
                    UPDATE_STAT_BE(DepthPassCount, statCount);
                }

            // no goto targets this label in this function
            Endtile:
                ATTR_UNUSED;
                // drop the coverage bits of the SIMD tile just processed
                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
            }

            // advance the depth/stencil pointers by one SIMD tile
            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;

            vXSamplePosUL = _simd_add_ps(vXSamplePosUL, dx);
        }

        vYSamplePosUL = _simd_add_ps(vYSamplePosUL, dy);
    }

    AR_END(BENullBackend, 0);
}
void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) { SWR_CONTEXT *pContext = pDC->pContext; AR_BEGIN(BESampleRateBackend, pDC->drawId); AR_BEGIN(BESetup, pDC->drawId); const API_STATE &state = GetApiState(pDC); BarycentricCoeffs coeffs; SetupBarycentricCoeffs(&coeffs, work); SWR_PS_CONTEXT psContext; const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions; SetupPixelShaderContext<T>(&psContext, samplePos, work); uint8_t *pDepthBuffer, *pStencilBuffer; SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers); AR_END(BESetup, 0); psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y))); psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y))); const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM)); for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) { psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x))); psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x))); const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM)); for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { #if USE_8x2_TILE_BACKEND const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); #endif if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) { const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? 
&work.innerCoverageMask : &work.coverageMask[0]; generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask); } AR_BEGIN(BEBarycentric, pDC->drawId); CalcPixelBarycentrics(coeffs, psContext); CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask); AR_END(BEBarycentric, 0); for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++) { simdmask coverageMask = work.coverageMask[sample] & MASK; if (coverageMask) { // offset depth/stencil buffers current sample uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable) { static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample)); const float minz = state.depthBoundsState.depthBoundsTestMinValue; const float maxz = state.depthBoundsState.depthBoundsTestMaxValue; coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz); } AR_BEGIN(BEBarycentric, pDC->drawId); // calculate per sample positions psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample)); psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample)); CalcSampleBarycentrics(coeffs, psContext); // interpolate and quantize z psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); AR_END(BEBarycentric, 0); // interpolate user clip distance if available if (state.backendState.clipDistanceMask) { coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample); } simdscalar vCoverageMask = _simd_vmask_ps(coverageMask); simdscalar depthPassMask = vCoverageMask; simdscalar stencilPassMask 
= vCoverageMask; // Early-Z? if (T::bCanEarlyZ) { AR_BEGIN(BEEarlyDepthTest, pDC->drawId); depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask))); AR_END(BEEarlyDepthTest, 0); // early-exit if no samples passed depth or earlyZ is forced on. if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask)) { DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); if (!_simd_movemask_ps(depthPassMask)) { work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); continue; } } } psContext.sampleIndex = sample; psContext.activeMask = _simd_castps_si(vCoverageMask); // execute pixel shader AR_BEGIN(BEPixelShader, pDC->drawId); UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask))); state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); AR_END(BEPixelShader, 0); vCoverageMask = _simd_castsi_ps(psContext.activeMask); // late-Z if (!T::bCanEarlyZ) { AR_BEGIN(BELateDepthTest, pDC->drawId); depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask))); AR_END(BELateDepthTest, 0); if (!_simd_movemask_ps(depthPassMask)) { // need to call depth/stencil write for stencil write DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); 
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); continue; } } uint32_t statMask = _simd_movemask_ps(depthPassMask); uint32_t statCount = _mm_popcnt_u32(statMask); UPDATE_STAT_BE(DepthPassCount, statCount); // output merger AR_BEGIN(BEOutputMerger, pDC->drawId); #if USE_8x2_TILE_BACKEND OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, useAlternateOffset); #else OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask); #endif // do final depth write after all pixel kills if (!state.psState.forceEarlyZ) { DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); } AR_END(BEOutputMerger, 0); }