/*
========================
CopyBuffer

Copies numBytes from src to dst using SSE2 non-temporal (streaming) stores so
the destination is written without polluting the cache. Both pointers must be
16-byte aligned. Any tail that is not a multiple of 16 bytes falls back to
plain 4-byte and then 1-byte copies.
========================
*/
void CopyBuffer( byte * dst, const byte * src, int numBytes ) {
	assert_16_byte_aligned( dst );
	assert_16_byte_aligned( src );

	int offset = 0;

	// main loop: move 128 bytes (8 SSE registers) per iteration;
	// all loads are issued before any store
	for ( ; offset + 128 <= numBytes; offset += 128 ) {
		__m128i r[8];
		for ( int j = 0; j < 8; j++ ) {
			r[j] = _mm_load_si128( (__m128i *)&src[offset + j * 16] );
		}
		for ( int j = 0; j < 8; j++ ) {
			_mm_stream_si128( (__m128i *)&dst[offset + j * 16], r[j] );
		}
	}

	// remaining whole 16-byte chunks
	for ( ; offset + 16 <= numBytes; offset += 16 ) {
		_mm_stream_si128( (__m128i *)&dst[offset], _mm_load_si128( (__m128i *)&src[offset] ) );
	}

	// remaining whole 4-byte words
	for ( ; offset + 4 <= numBytes; offset += 4 ) {
		*(uint32 *)&dst[offset] = *(const uint32 *)&src[offset];
	}

	// trailing bytes
	for ( ; offset < numBytes; offset++ ) {
		dst[offset] = src[offset];
	}

	// make the streaming stores globally visible before returning
	_mm_sfence();
}
/*
========================
idSnapshotProcessor::SubmitPendingSnap

Fills in the shared lzw in/out job data and the delta-job submit info, then
kicks off the write-delta jobs for the pending snapshot. objMemory and lzwData
must be 16-byte aligned because they are consumed by the jobs.
========================
*/
void idSnapshotProcessor::SubmitPendingSnap( int visIndex, uint8* objMemory, int objMemorySize, lzwCompressionData_t* lzwData ) {
	assert_16_byte_aligned( objMemory );
	assert_16_byte_aligned( lzwData );
	assert( hasPendingSnap );
	// no deltas may be left over from a previous submission
	assert( jobMemory->lzwInOutData.numlzwDeltas == 0 );
	assert( net_optimalSnapDeltaSize.GetInteger() < jobMemory_t::MAX_LZW_MEM - 128 ); // Leave padding

	// point the job i/o block at the persistent job buffers
	jobMemory->lzwInOutData.lzwDeltas = jobMemory->lzwDeltas.Ptr();
	jobMemory->lzwInOutData.maxlzwDeltas = jobMemory->lzwDeltas.Num();
	jobMemory->lzwInOutData.lzwMem = jobMemory->lzwMem.Ptr();
#ifdef STRESS_LZW_MEM
	// debug build option: artificially shrink the lzw memory to stress overflow handling
	jobMemory->lzwInOutData.maxlzwMem = g_maxlwMem;
#else
	jobMemory->lzwInOutData.maxlzwMem = jobMemory_t::MAX_LZW_MEM;
#endif
	jobMemory->lzwInOutData.lzwDmaOut = jobMemory_t::MAX_LZW_MEM;
	// reset per-submission counters
	jobMemory->lzwInOutData.numlzwDeltas = 0;
	jobMemory->lzwInOutData.lzwBytes = 0;
	jobMemory->lzwInOutData.optimalLength = net_optimalSnapDeltaSize.GetInteger();
	jobMemory->lzwInOutData.snapSequence = snapSequence;
	jobMemory->lzwInOutData.lastObjId = 0;
	jobMemory->lzwInOutData.lzwData = lzwData;

	// describe the buffers the delta jobs may write into
	idSnapShot::submitDeltaJobsInfo_t submitInfo;
	submitInfo.objParms = jobMemory->objParms.Ptr();
	submitInfo.maxObjParms = jobMemory->objParms.Num();
	submitInfo.headers = jobMemory->headers.Ptr();
	submitInfo.maxHeaders = jobMemory->headers.Num();
	submitInfo.objMemory = objMemory;
	submitInfo.maxObjMemory = objMemorySize;
	submitInfo.lzwParms = jobMemory->lzwParms.Ptr();
	submitInfo.maxDeltaParms = jobMemory->lzwParms.Num();
	// Use a copy of base state to avoid race conditions.
	// The main thread could change it behind the jobs backs.
	submittedState = baseState;
	submittedTemplateStates = templateStates;
	submitInfo.templateStates = &submittedTemplateStates;
	submitInfo.oldSnap = &submittedState;
	submitInfo.visIndex = visIndex;
	submitInfo.baseSequence = baseSequence;
	submitInfo.lzwInOutData = &jobMemory->lzwInOutData;

	pendingSnap.SubmitWriteDeltaToJobs( submitInfo );
}
/*
========================
idSnapshotProcessor::idSnapshotProcessor

Heap-allocates the job memory used by the snapshot delta jobs and
resets the processor to its initial state.
========================
*/
idSnapshotProcessor::idSnapshotProcessor() {
	//assert( mem.IsGlobalHeap() );

	jobMemory = static_cast< jobMemory_t* >( Mem_Alloc( sizeof( jobMemory_t ), TAG_NETWORKING ) );

	// the job buffers are consumed by SIMD jobs and therefore
	// must all be 16-byte aligned
	assert_16_byte_aligned( jobMemory );
	assert_16_byte_aligned( jobMemory->objParms.Ptr() );
	assert_16_byte_aligned( jobMemory->headers.Ptr() );
	assert_16_byte_aligned( jobMemory->lzwParms.Ptr() );

	Reset( true );
}
/*
=====================
R_CopyDecalSurface

Appends the decal's vertices and indexes to the given output arrays using
streaming (write-combined) stores. The vertex color is replaced with a color
computed from the per-vertex depth fade and the supplied fade color; the
indexes are rebased by numVerts.
=====================
*/
static void R_CopyDecalSurface( idDrawVert * verts, int numVerts, triIndex_t * indexes, int numIndexes, const decal_t * decal, const float fadeColor[4] ) {
	// the append positions and source arrays must be 16-byte aligned for the
	// aligned loads / streaming stores below
	assert_16_byte_aligned( &verts[numVerts] );
	assert_16_byte_aligned( &indexes[numIndexes] );
	assert_16_byte_aligned( decal->indexes );
	assert_16_byte_aligned( decal->verts );
	assert( ( ( decal->numVerts * sizeof( idDrawVert ) ) & 15 ) == 0 );
	assert( ( ( decal->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );
	assert_16_byte_aligned( fadeColor );

	// numVerts broadcast to 4 ints, then packed to 8 shorts for index rebasing
	const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
	const __m128i vector_short_num_verts = _mm_packs_epi32( vector_int_num_verts, vector_int_num_verts );
	const __m128 vector_fade_color = _mm_load_ps( fadeColor );
	// selects the 32-bit lane holding the color dword in the second 16 bytes of an idDrawVert
	const __m128i vector_color_mask = _mm_set_epi32( 0, -1, 0, 0 );

	// copy vertices and apply depth/time based fading
	assert_offsetof( idDrawVert, color, 6 * 4 );	// the color mask above depends on this layout
	for ( int i = 0; i < decal->numVerts; i++ ) {
		const idDrawVert &srcVert = decal->verts[i];
		idDrawVert &dstVert = verts[numVerts + i];

		// load the 32-byte vertex as two 16-byte registers
		__m128i v0 = _mm_load_si128( (const __m128i *)( (byte *)&srcVert + 0 ) );
		__m128i v1 = _mm_load_si128( (const __m128i *)( (byte *)&srcVert + 16 ) );

		// fadeColor scaled by this vertex's depth fade, converted to 4 bytes
		__m128 depthFade = _mm_splat_ps( _mm_load_ss( decal->vertDepthFade + i ), 0 );
		__m128 timeDepthFade = _mm_mul_ps( depthFade, vector_fade_color );
		__m128i colorInt = _mm_cvtps_epi32( timeDepthFade );
		__m128i colorShort = _mm_packs_epi32( colorInt, colorInt );	// 32-bit -> 16-bit with saturation
		__m128i colorByte = _mm_packus_epi16( colorShort, colorShort );	// 16-bit -> 8-bit unsigned saturation
		// merge the computed color into the color lane
		// NOTE(review): this ORs over the source color rather than replacing it —
		// presumably the source color lane is zero; confirm against decal creation
		v1 = _mm_or_si128( v1, _mm_and_si128( colorByte, vector_color_mask ) );

		// non-temporal stores: do not pull the destination into the cache
		_mm_stream_si128( (__m128i *)( (byte *)&dstVert + 0 ), v0 );
		_mm_stream_si128( (__m128i *)( (byte *)&dstVert + 16 ), v1 );
	}

	// copy indexes, rebased by numVerts, 8 shorts at a time
	assert( ( decal->numIndexes & 7 ) == 0 );
	assert( sizeof( triIndex_t ) == 2 );
	for ( int i = 0; i < decal->numIndexes; i += 8 ) {
		__m128i vi = _mm_load_si128( (const __m128i *)&decal->indexes[i] );
		vi = _mm_add_epi16( vi, vector_short_num_verts );
		_mm_stream_si128( (__m128i *)&indexes[numIndexes + i], vi );
	}

	// make the streaming stores globally visible
	_mm_sfence();
}
/* ======================== idJointBuffer::AllocBufferObject ======================== */ bool idJointBuffer::AllocBufferObject( const float * joints, int numAllocJoints ) { assert( apiObject == NULL ); assert_16_byte_aligned( joints ); if ( numAllocJoints <= 0 ) { idLib::Error( "idJointBuffer::AllocBufferObject: joints = %i", numAllocJoints ); } numJoints = numAllocJoints; bool allocationFailed = false; const int numBytes = GetAllocedSize(); GLuint buffer = 0; qglGenBuffersARB( 1, &buffer ); qglBindBufferARB( GL_UNIFORM_BUFFER, buffer ); qglBufferDataARB( GL_UNIFORM_BUFFER, numBytes, NULL, GL_STREAM_DRAW_ARB ); qglBindBufferARB( GL_UNIFORM_BUFFER, 0); apiObject = reinterpret_cast< void * >( buffer ); if ( r_showBuffers.GetBool() ) { idLib::Printf( "joint buffer alloc %p, api %p (%i joints)\n", this, GetAPIObject(), GetNumJoints() ); } // copy the data if ( joints != NULL ) { Update( joints, numAllocJoints ); } return !allocationFailed; }
/* ======================== idJointBuffer::Update ======================== */ void idJointBuffer::Update( const float * joints, int numUpdateJoints ) const { assert( apiObject != NULL ); assert( IsMapped() == false ); assert_16_byte_aligned( joints ); assert( ( GetOffset() & 15 ) == 0 ); if ( numUpdateJoints > numJoints ) { idLib::FatalError( "idJointBuffer::Update: size overrun, %i > %i\n", numUpdateJoints, numJoints ); } const int numBytes = numUpdateJoints * 3 * 4 * sizeof( float ); qglBindBufferARB( GL_UNIFORM_BUFFER, reinterpret_cast< GLuint >( apiObject ) ); qglBufferSubDataARB( GL_UNIFORM_BUFFER, GetOffset(), (GLsizeiptrARB)numBytes, joints ); }
/*
========================
idJointBuffer::Reference

Makes this joint buffer a non-owning alias of a sub-range of 'other',
starting jointRefOffset bytes into the other buffer's range and covering
numRefJoints joints. Frees anything this buffer currently owns first.
========================
*/
void idJointBuffer::Reference( const idJointBuffer & other, int jointRefOffset, int numRefJoints ) {
	assert( IsMapped() == false );
	assert( other.IsMapped() == false );
	assert( other.GetAPIObject() != NULL );
	assert( jointRefOffset >= 0 );
	assert( numRefJoints >= 0 );
	// the referenced range must fit entirely inside the other buffer
	assert( jointRefOffset + numRefJoints * sizeof( idJointMat ) <= other.GetNumJoints() * sizeof( idJointMat ) );
	assert_16_byte_aligned( numRefJoints * 3 * 4 * sizeof( float ) );

	// release any buffer this object currently owns before aliasing
	FreeBufferObject();
	numJoints = numRefJoints;
	offsetInOtherBuffer = other.GetOffset() + jointRefOffset;
	apiObject = other.apiObject;
	// setting offsetInOtherBuffer marks this buffer as a reference
	assert( OwnsBuffer() == false );
}
/* ======================== idIndexBuffer::AllocBufferObject ======================== */ bool idIndexBuffer::AllocBufferObject( const void * data, int allocSize ) { assert( apiObject == NULL ); assert_16_byte_aligned( data ); if ( allocSize <= 0 ) { idLib::Error( "idIndexBuffer::AllocBufferObject: allocSize = %i", allocSize ); } size = allocSize; bool allocationFailed = false; int numBytes = GetAllocedSize(); // clear out any previous error qglGetError(); GLuint bufferObject = 0xFFFF; qglGenBuffersARB( 1, & bufferObject ); if ( bufferObject == 0xFFFF ) { GLenum error = qglGetError(); idLib::FatalError( "idIndexBuffer::AllocBufferObject: failed - GL_Error %d", error ); } qglBindBufferARB( GL_ELEMENT_ARRAY_BUFFER_ARB, bufferObject ); // these are rewritten every frame qglBufferDataARB( GL_ELEMENT_ARRAY_BUFFER_ARB, numBytes, NULL, bufferUsage ); apiObject = reinterpret_cast< void * >( bufferObject ); GLenum err = qglGetError(); if ( err == GL_OUT_OF_MEMORY ) { idLib::Warning( "idIndexBuffer:AllocBufferObject: allocation failed" ); allocationFailed = true; } if ( r_showBuffers.GetBool() ) { idLib::Printf( "index buffer alloc %p, api %p (%i bytes)\n", this, GetAPIObject(), GetSize() ); } // copy the data if ( data != NULL ) { Update( data, allocSize ); } return !allocationFailed; }
/* ======================== idVertexBuffer::Update ======================== */ void idVertexBuffer::Update( const void * data, int updateSize ) const { assert( apiObject != NULL ); assert( IsMapped() == false ); assert_16_byte_aligned( data ); assert( ( GetOffset() & 15 ) == 0 ); if ( updateSize > size ) { idLib::FatalError( "idVertexBuffer::Update: size overrun, %i > %i\n", updateSize, GetSize() ); } int numBytes = ( updateSize + 15 ) & ~15; GLuint bufferObject = reinterpret_cast< GLuint >( apiObject ); qglBindBufferARB( GL_ARRAY_BUFFER_ARB, bufferObject ); qglBufferSubDataARB( GL_ARRAY_BUFFER_ARB, GetOffset(), (GLsizeiptrARB)numBytes, data ); /* void * buffer = MapBuffer( BM_WRITE ); CopyBuffer( (byte *)buffer + GetOffset(), (byte *)data, numBytes ); UnmapBuffer(); */ }
/*
============
R_DecalPointCullStatic

For each vertex, tests its position against the six decal planes and writes
one byte of cull bits per vertex: bit N is set when the vertex is on the
positive side of plane N. Processes four vertices at a time with SSE by
transposing four positions into X/Y/Z lanes.
============
*/
static void R_DecalPointCullStatic( byte * cullBits, const idPlane * planes, const idDrawVert * verts, const int numVerts ) {
	assert_16_byte_aligned( cullBits );
	assert_16_byte_aligned( verts );

	// double-buffered streamed prefetch of the vertices, 16 at a time
	idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );

	const __m128 vector_float_zero = { 0.0f, 0.0f, 0.0f, 0.0f };
	// one cull bit per plane
	const __m128i vector_int_mask0 = _mm_set1_epi32( 1 << 0 );
	const __m128i vector_int_mask1 = _mm_set1_epi32( 1 << 1 );
	const __m128i vector_int_mask2 = _mm_set1_epi32( 1 << 2 );
	const __m128i vector_int_mask3 = _mm_set1_epi32( 1 << 3 );
	const __m128i vector_int_mask4 = _mm_set1_epi32( 1 << 4 );
	const __m128i vector_int_mask5 = _mm_set1_epi32( 1 << 5 );

	// load the six planes (unaligned) and broadcast each component
	const __m128 p0 = _mm_loadu_ps( planes[0].ToFloatPtr() );
	const __m128 p1 = _mm_loadu_ps( planes[1].ToFloatPtr() );
	const __m128 p2 = _mm_loadu_ps( planes[2].ToFloatPtr() );
	const __m128 p3 = _mm_loadu_ps( planes[3].ToFloatPtr() );
	const __m128 p4 = _mm_loadu_ps( planes[4].ToFloatPtr() );
	const __m128 p5 = _mm_loadu_ps( planes[5].ToFloatPtr() );

	const __m128 p0X = _mm_splat_ps( p0, 0 );
	const __m128 p0Y = _mm_splat_ps( p0, 1 );
	const __m128 p0Z = _mm_splat_ps( p0, 2 );
	const __m128 p0W = _mm_splat_ps( p0, 3 );

	const __m128 p1X = _mm_splat_ps( p1, 0 );
	const __m128 p1Y = _mm_splat_ps( p1, 1 );
	const __m128 p1Z = _mm_splat_ps( p1, 2 );
	const __m128 p1W = _mm_splat_ps( p1, 3 );

	const __m128 p2X = _mm_splat_ps( p2, 0 );
	const __m128 p2Y = _mm_splat_ps( p2, 1 );
	const __m128 p2Z = _mm_splat_ps( p2, 2 );
	const __m128 p2W = _mm_splat_ps( p2, 3 );

	const __m128 p3X = _mm_splat_ps( p3, 0 );
	const __m128 p3Y = _mm_splat_ps( p3, 1 );
	const __m128 p3Z = _mm_splat_ps( p3, 2 );
	const __m128 p3W = _mm_splat_ps( p3, 3 );

	const __m128 p4X = _mm_splat_ps( p4, 0 );
	const __m128 p4Y = _mm_splat_ps( p4, 1 );
	const __m128 p4Z = _mm_splat_ps( p4, 2 );
	const __m128 p4W = _mm_splat_ps( p4, 3 );

	const __m128 p5X = _mm_splat_ps( p5, 0 );
	const __m128 p5Y = _mm_splat_ps( p5, 1 );
	const __m128 p5Z = _mm_splat_ps( p5, 2 );
	const __m128 p5W = _mm_splat_ps( p5, 3 );

	for ( int i = 0; i < numVerts; ) {
		const int nextNumVerts = vertsODS.FetchNextBatch() - 4;
		for ( ; i <= nextNumVerts; i += 4 ) {
			const __m128 v0 = _mm_load_ps( vertsODS[i + 0].xyz.ToFloatPtr() );
			const __m128 v1 = _mm_load_ps( vertsODS[i + 1].xyz.ToFloatPtr() );
			const __m128 v2 = _mm_load_ps( vertsODS[i + 2].xyz.ToFloatPtr() );
			const __m128 v3 = _mm_load_ps( vertsODS[i + 3].xyz.ToFloatPtr() );

			// 4x4 transpose: gather the same component of four vertices into one register
			const __m128 r0 = _mm_unpacklo_ps( v0, v2 );	// v0.x, v2.x, v0.y, v2.y
			const __m128 r1 = _mm_unpackhi_ps( v0, v2 );	// v0.z, v2.z, v0.w, v2.w
			const __m128 r2 = _mm_unpacklo_ps( v1, v3 );	// v1.x, v3.x, v1.y, v3.y
			const __m128 r3 = _mm_unpackhi_ps( v1, v3 );	// v1.z, v3.z, v1.w, v3.w

			const __m128 vX = _mm_unpacklo_ps( r0, r2 );	// v0.x, v1.x, v2.x, v3.x
			const __m128 vY = _mm_unpackhi_ps( r0, r2 );	// v0.y, v1.y, v2.y, v3.y
			const __m128 vZ = _mm_unpacklo_ps( r1, r3 );	// v0.z, v1.z, v2.z, v3.z

			// signed distance of the four points to each of the six planes
			const __m128 d0 = _mm_madd_ps( vX, p0X, _mm_madd_ps( vY, p0Y, _mm_madd_ps( vZ, p0Z, p0W ) ) );
			const __m128 d1 = _mm_madd_ps( vX, p1X, _mm_madd_ps( vY, p1Y, _mm_madd_ps( vZ, p1Z, p1W ) ) );
			const __m128 d2 = _mm_madd_ps( vX, p2X, _mm_madd_ps( vY, p2Y, _mm_madd_ps( vZ, p2Z, p2W ) ) );
			const __m128 d3 = _mm_madd_ps( vX, p3X, _mm_madd_ps( vY, p3Y, _mm_madd_ps( vZ, p3Z, p3W ) ) );
			const __m128 d4 = _mm_madd_ps( vX, p4X, _mm_madd_ps( vY, p4Y, _mm_madd_ps( vZ, p4Z, p4W ) ) );
			const __m128 d5 = _mm_madd_ps( vX, p5X, _mm_madd_ps( vY, p5Y, _mm_madd_ps( vZ, p5Z, p5W ) ) );

			// all-ones lane when the point is on the positive side of the plane
			__m128i c0 = __m128c( _mm_cmpgt_ps( d0, vector_float_zero ) );
			__m128i c1 = __m128c( _mm_cmpgt_ps( d1, vector_float_zero ) );
			__m128i c2 = __m128c( _mm_cmpgt_ps( d2, vector_float_zero ) );
			__m128i c3 = __m128c( _mm_cmpgt_ps( d3, vector_float_zero ) );
			__m128i c4 = __m128c( _mm_cmpgt_ps( d4, vector_float_zero ) );
			__m128i c5 = __m128c( _mm_cmpgt_ps( d5, vector_float_zero ) );

			// reduce each comparison to its plane bit, then OR the six together
			c0 = _mm_and_si128( c0, vector_int_mask0 );
			c1 = _mm_and_si128( c1, vector_int_mask1 );
			c2 = _mm_and_si128( c2, vector_int_mask2 );
			c3 = _mm_and_si128( c3, vector_int_mask3 );
			c4 = _mm_and_si128( c4, vector_int_mask4 );
			c5 = _mm_and_si128( c5, vector_int_mask5 );

			c0 = _mm_or_si128( c0, c1 );
			c2 = _mm_or_si128( c2, c3 );
			c4 = _mm_or_si128( c4, c5 );
			c0 = _mm_or_si128( c0, c2 );
			c0 = _mm_or_si128( c0, c4 );

			// narrow the four 32-bit results to four bytes and store them at once
			__m128i s0 = _mm_packs_epi32( c0, c0 );
			__m128i b0 = _mm_packus_epi16( s0, s0 );
			*(unsigned int *)&cullBits[i] = _mm_cvtsi128_si32( b0 );
		}
	}
}
/*
========================
idSnapShot::SubmitObjectJob

Queues one object-delta job, advancing the shared parm/header/dest cursors
(passed by reference) as it goes. If the destination memory or header pool
would overflow, the pending jobs are flushed first via SubmitLZWJob and the
header/dest cursors are rewound so the buffers are reused.
========================
*/
void idSnapShot::SubmitObjectJob( const submitDeltaJobsInfo_t& submitDeltaJobsInfo, objectState_t* newState, objectState_t* oldState, objParms_t*& baseObjParm, objParms_t*& curObjParm, objHeader_t*& curHeader, uint8*& curObjDest, lzwParm_t*& curlzwParm ) {
	// at least one of the two states must exist (create, delete, or delta)
	assert( newState != NULL || oldState != NULL );
	assert_16_byte_aligned( curHeader );
	assert_16_byte_aligned( curObjDest );

	// size of the new state data, padded so the dest cursor stays 16-byte aligned
	int32 dataSize = newState != NULL ? newState->buffer.Size() : 0;
	int totalSize = OBJ_DEST_SIZE_ALIGN16( dataSize );

	if( curObjParm - submitDeltaJobsInfo.objParms >= submitDeltaJobsInfo.maxObjParms ) {
		idLib::Error( "Out of parms for snapshot jobs.\n" );
	}

	// Check to see if we are out of dest write space, and need to flush the jobs
	bool needToSubmit = ( curObjDest - submitDeltaJobsInfo.objMemory ) + totalSize >= submitDeltaJobsInfo.maxObjMemory;
	needToSubmit |= ( curHeader - submitDeltaJobsInfo.headers >= submitDeltaJobsInfo.maxHeaders );

	if( needToSubmit ) {
		// If this obj will put us over the limit, then submit the jobs now, and start over re-using the same buffers
		SubmitLZWJob( submitDeltaJobsInfo, baseObjParm, curObjParm, curlzwParm, true );
		// rewind the header and dest cursors; curObjParm is handled by SubmitLZWJob
		curHeader = submitDeltaJobsInfo.headers;
		curObjDest = submitDeltaJobsInfo.objMemory;
	}

	// Setup obj parms
	assert( submitDeltaJobsInfo.visIndex < 256 );	// visIndex is stored in a small field
	curObjParm->visIndex = submitDeltaJobsInfo.visIndex;
	curObjParm->destHeader = curHeader;
	curObjParm->dest = curObjDest;

	// clear both states first; only the ones supplied below are filled in
	memset( &curObjParm->newState, 0, sizeof( curObjParm->newState ) );
	memset( &curObjParm->oldState, 0, sizeof( curObjParm->oldState ) );

	if( newState != NULL ) {
		assert( newState->buffer.Size() <= 65535 );	// size is stored in 16 bits
		curObjParm->newState.valid = 1;
		curObjParm->newState.data = newState->buffer.Ptr();
		curObjParm->newState.size = newState->buffer.Size();
		curObjParm->newState.objectNum = newState->objectNum;
		curObjParm->newState.visMask = newState->visMask;
	}

	if( oldState != NULL ) {
		assert( oldState->buffer.Size() <= 65535 );	// size is stored in 16 bits
		curObjParm->oldState.valid = 1;
		curObjParm->oldState.data = oldState->buffer.Ptr();
		curObjParm->oldState.size = oldState->buffer.Size();
		curObjParm->oldState.objectNum = oldState->objectNum;
		curObjParm->oldState.visMask = oldState->visMask;
	}

	// the job consumes these with aligned SIMD loads
	assert_16_byte_aligned( curObjParm );
	assert_16_byte_aligned( curObjParm->newState.data );
	assert_16_byte_aligned( curObjParm->oldState.data );

	SnapshotObjectJob( curObjParm );

	// Advance past header + data
	curObjDest += totalSize;

	// Advance parm pointer
	curObjParm++;

	// Advance header pointer
	curHeader++;
}
/*
====================
R_CopyOverlaySurface

Appends the overlay's vertices and indexes to the given output arrays.
Each output vertex is the referenced source vertex with its texture
coordinates replaced by the overlay's st; indexes are rebased by numVerts.
The SIMD path uses streaming stores; the scalar path is the fallback.
====================
*/
static void R_CopyOverlaySurface( idDrawVert* verts, int numVerts, triIndex_t* indexes, int numIndexes, const overlay_t* overlay, const idDrawVert* sourceVerts )
{
	// append positions and source arrays must be 16-byte aligned for the
	// aligned loads / streaming stores in the SIMD path
	assert_16_byte_aligned( &verts[numVerts] );
	assert_16_byte_aligned( &indexes[numIndexes] );
	assert_16_byte_aligned( overlay->verts );
	assert_16_byte_aligned( overlay->indexes );
	assert( ( ( overlay->numVerts * sizeof( idDrawVert ) ) & 15 ) == 0 );
	assert( ( ( overlay->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );

#if defined(USE_INTRINSICS)
	// keeps the first three dwords (position + part of st), clears the last
	// dword of the first 16 bytes so the overlay st can be OR'ed in
	const __m128i vector_int_clear_last = _mm_set_epi32( 0, -1, -1, -1 );
	// numVerts broadcast to 4 ints, then packed to 8 shorts for index rebasing
	const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
	const __m128i vector_short_num_verts = _mm_packs_epi32( vector_int_num_verts, vector_int_num_verts );

	// copy vertices
	for( int i = 0; i < overlay->numVerts; i++ )
	{
		const overlayVertex_t& overlayVert = overlay->verts[i];
		const idDrawVert& srcVert = sourceVerts[overlayVert.vertexNum];
		idDrawVert& dstVert = verts[numVerts + i];

		// load the 32-byte source vertex as two 16-byte registers
		__m128i v0 = _mm_load_si128( ( const __m128i* )( ( byte* )&srcVert + 0 ) );
		__m128i v1 = _mm_load_si128( ( const __m128i* )( ( byte* )&srcVert + 16 ) );

		// move the overlay st into the highest 32-bit lane of v0
		__m128i st = _mm_cvtsi32_si128( *( unsigned int* )overlayVert.st );
		st = _mm_shuffle_epi32( st, _MM_SHUFFLE( 0, 1, 2, 3 ) );
		v0 = _mm_and_si128( v0, vector_int_clear_last );
		v0 = _mm_or_si128( v0, st );

		// non-temporal stores: do not pull the destination into the cache
		_mm_stream_si128( ( __m128i* )( ( byte* )&dstVert + 0 ), v0 );
		_mm_stream_si128( ( __m128i* )( ( byte* )&dstVert + 16 ), v1 );
	}

	// copy indexes, rebased by numVerts, 8 shorts at a time
	assert( ( overlay->numIndexes & 7 ) == 0 );
	assert( sizeof( triIndex_t ) == 2 );
	for( int i = 0; i < overlay->numIndexes; i += 8 )
	{
		__m128i vi = _mm_load_si128( ( const __m128i* )&overlay->indexes[i] );
		vi = _mm_add_epi16( vi, vector_short_num_verts );
		_mm_stream_si128( ( __m128i* )&indexes[numIndexes + i], vi );
	}

	// make the streaming stores globally visible
	_mm_sfence();

#else
	// copy vertices
	for( int i = 0; i < overlay->numVerts; i++ )
	{
		const overlayVertex_t& overlayVert = overlay->verts[i];

		// NOTE: bad out-of-order write-combined write, SIMD code does the right thing
		verts[numVerts + i] = sourceVerts[overlayVert.vertexNum];
		// RB begin
		verts[numVerts + i].SetTexCoordS( overlayVert.st[0] );
		verts[numVerts + i].SetTexCoordT( overlayVert.st[1] );
		// RB end
	}

	// copy indexes, two at a time to keep the writes 32 bits wide
	for( int i = 0; i < overlay->numIndexes; i += 2 )
	{
		assert( overlay->indexes[i + 0] < overlay->numVerts && overlay->indexes[i + 1] < overlay->numVerts );
		WriteIndexPair( &indexes[numIndexes + i], numVerts + overlay->indexes[i + 0], numVerts + overlay->indexes[i + 1] );
	}
#endif
}
/*
====================
R_OverlayPointCullSkinned

For each skinned vertex, computes its signed distance to the two overlay
planes. The distances are written out as half-float texture coordinates
(S from plane 0, T from plane 1), and four cull bits per vertex are set
when d0 < 0, d1 < 0, 1-d0 < 0, or 1-d1 < 0 respectively, i.e. when the
texcoord falls outside [0,1]. The SIMD path processes four vertices at a
time; the scalar path is the fallback.
====================
*/
static void R_OverlayPointCullSkinned( byte* cullBits, halfFloat_t* texCoordS, halfFloat_t* texCoordT, const idPlane* planes, const idDrawVert* verts, const int numVerts, const idJointMat* joints )
{
	assert_16_byte_aligned( cullBits );
	assert_16_byte_aligned( texCoordS );
	assert_16_byte_aligned( texCoordT );
	assert_16_byte_aligned( verts );

#if defined(USE_INTRINSICS)
	// double-buffered streamed prefetch of the vertices, 16 at a time
	idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );

	const __m128 vector_float_zero = { 0.0f, 0.0f, 0.0f, 0.0f };
	const __m128 vector_float_one = { 1.0f, 1.0f, 1.0f, 1.0f };
	// one cull bit per half-space test
	const __m128i vector_int_mask0 = _mm_set1_epi32( 1 << 0 );
	const __m128i vector_int_mask1 = _mm_set1_epi32( 1 << 1 );
	const __m128i vector_int_mask2 = _mm_set1_epi32( 1 << 2 );
	const __m128i vector_int_mask3 = _mm_set1_epi32( 1 << 3 );

	// load the two planes (unaligned) and broadcast each component
	const __m128 p0 = _mm_loadu_ps( planes[0].ToFloatPtr() );
	const __m128 p1 = _mm_loadu_ps( planes[1].ToFloatPtr() );

	const __m128 p0X = _mm_splat_ps( p0, 0 );
	const __m128 p0Y = _mm_splat_ps( p0, 1 );
	const __m128 p0Z = _mm_splat_ps( p0, 2 );
	const __m128 p0W = _mm_splat_ps( p0, 3 );

	const __m128 p1X = _mm_splat_ps( p1, 0 );
	const __m128 p1Y = _mm_splat_ps( p1, 1 );
	const __m128 p1Z = _mm_splat_ps( p1, 2 );
	const __m128 p1W = _mm_splat_ps( p1, 3 );

	for( int i = 0; i < numVerts; )
	{
		const int nextNumVerts = vertsODS.FetchNextBatch() - 4;
		for( ; i <= nextNumVerts; i += 4 )
		{
			// skin four positions with the joint matrices
			const __m128 v0 = LoadSkinnedDrawVertPosition( vertsODS[i + 0], joints );
			const __m128 v1 = LoadSkinnedDrawVertPosition( vertsODS[i + 1], joints );
			const __m128 v2 = LoadSkinnedDrawVertPosition( vertsODS[i + 2], joints );
			const __m128 v3 = LoadSkinnedDrawVertPosition( vertsODS[i + 3], joints );

			// 4x4 transpose: gather the same component of four vertices into one register
			const __m128 r0 = _mm_unpacklo_ps( v0, v2 );	// v0.x, v2.x, v0.y, v2.y
			const __m128 r1 = _mm_unpackhi_ps( v0, v2 );	// v0.z, v2.z, v0.w, v2.w
			const __m128 r2 = _mm_unpacklo_ps( v1, v3 );	// v1.x, v3.x, v1.y, v3.y
			const __m128 r3 = _mm_unpackhi_ps( v1, v3 );	// v1.z, v3.z, v1.w, v3.w

			const __m128 vX = _mm_unpacklo_ps( r0, r2 );	// v0.x, v1.x, v2.x, v3.x
			const __m128 vY = _mm_unpackhi_ps( r0, r2 );	// v0.y, v1.y, v2.y, v3.y
			const __m128 vZ = _mm_unpacklo_ps( r1, r3 );	// v0.z, v1.z, v2.z, v3.z

			// signed distances to the two planes, and their complements against 1
			const __m128 d0 = _mm_madd_ps( vX, p0X, _mm_madd_ps( vY, p0Y, _mm_madd_ps( vZ, p0Z, p0W ) ) );
			const __m128 d1 = _mm_madd_ps( vX, p1X, _mm_madd_ps( vY, p1Y, _mm_madd_ps( vZ, p1Z, p1W ) ) );
			const __m128 d2 = _mm_sub_ps( vector_float_one, d0 );
			const __m128 d3 = _mm_sub_ps( vector_float_one, d1 );

			// the distances double as the overlay texcoords, stored as 4 half-floats
			__m128i flt16S = FastF32toF16( __m128c( d0 ) );
			__m128i flt16T = FastF32toF16( __m128c( d1 ) );

			_mm_storel_epi64( ( __m128i* )&texCoordS[i], flt16S );
			_mm_storel_epi64( ( __m128i* )&texCoordT[i], flt16T );

			// all-ones lane when the distance is negative (outside [0,1])
			__m128i c0 = __m128c( _mm_cmplt_ps( d0, vector_float_zero ) );
			__m128i c1 = __m128c( _mm_cmplt_ps( d1, vector_float_zero ) );
			__m128i c2 = __m128c( _mm_cmplt_ps( d2, vector_float_zero ) );
			__m128i c3 = __m128c( _mm_cmplt_ps( d3, vector_float_zero ) );

			// reduce each comparison to its bit, OR together, narrow to 4 bytes
			c0 = _mm_and_si128( c0, vector_int_mask0 );
			c1 = _mm_and_si128( c1, vector_int_mask1 );
			c2 = _mm_and_si128( c2, vector_int_mask2 );
			c3 = _mm_and_si128( c3, vector_int_mask3 );

			c0 = _mm_or_si128( c0, c1 );
			c2 = _mm_or_si128( c2, c3 );
			c0 = _mm_or_si128( c0, c2 );

			c0 = _mm_packs_epi32( c0, c0 );
			c0 = _mm_packus_epi16( c0, c0 );

			*( unsigned int* )&cullBits[i] = _mm_cvtsi128_si32( c0 );
		}
	}

#else
	// scalar fallback: one vertex at a time, same outputs as the SIMD path
	idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );

	for( int i = 0; i < numVerts; )
	{
		const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
		for( ; i <= nextNumVerts; i++ )
		{
			const idVec3 transformed = Scalar_LoadSkinnedDrawVertPosition( vertsODS[i], joints );

			const float d0 = planes[0].Distance( transformed );
			const float d1 = planes[1].Distance( transformed );
			const float d2 = 1.0f - d0;
			const float d3 = 1.0f - d1;

			halfFloat_t s = Scalar_FastF32toF16( d0 );
			halfFloat_t t = Scalar_FastF32toF16( d1 );

			texCoordS[i] = s;
			texCoordT[i] = t;

			// sign bit of each distance becomes one cull bit
			byte bits;
			bits = IEEE_FLT_SIGNBITSET( d0 ) << 0;
			bits |= IEEE_FLT_SIGNBITSET( d1 ) << 1;
			bits |= IEEE_FLT_SIGNBITSET( d2 ) << 2;
			bits |= IEEE_FLT_SIGNBITSET( d3 ) << 3;

			cullBits[i] = bits;
		}
	}
#endif
}