コード例 #1
0
ファイル: BufferObject.cpp プロジェクト: Deepfreeze32/taken
/*
========================
CopyBuffer

Copies numBytes bytes from src to dst. Both pointers must be 16-byte aligned.
The data is moved with aligned SSE2 loads and non-temporal (streaming) stores:
first in 128-byte blocks, then in single 16-byte vectors. Whatever is left is
copied as 32-bit words and finally as individual bytes. A store fence is
issued before returning so the streamed stores are ordered ahead of any
stores that follow.
========================
*/
void CopyBuffer( byte * dst, const byte * src, int numBytes ) {
	assert_16_byte_aligned( dst );
	assert_16_byte_aligned( src );

	int pos = 0;

	// 128 bytes per pass: all eight loads are issued before the eight streaming stores
	while ( pos + 128 <= numBytes ) {
		const __m128i * in = (const __m128i *)( src + pos );
		__m128i * out = (__m128i *)( dst + pos );

		const __m128i r0 = _mm_load_si128( in + 0 );
		const __m128i r1 = _mm_load_si128( in + 1 );
		const __m128i r2 = _mm_load_si128( in + 2 );
		const __m128i r3 = _mm_load_si128( in + 3 );
		const __m128i r4 = _mm_load_si128( in + 4 );
		const __m128i r5 = _mm_load_si128( in + 5 );
		const __m128i r6 = _mm_load_si128( in + 6 );
		const __m128i r7 = _mm_load_si128( in + 7 );

		_mm_stream_si128( out + 0, r0 );
		_mm_stream_si128( out + 1, r1 );
		_mm_stream_si128( out + 2, r2 );
		_mm_stream_si128( out + 3, r3 );
		_mm_stream_si128( out + 4, r4 );
		_mm_stream_si128( out + 5, r5 );
		_mm_stream_si128( out + 6, r6 );
		_mm_stream_si128( out + 7, r7 );

		pos += 128;
	}

	// remaining whole 16-byte vectors
	while ( pos + 16 <= numBytes ) {
		const __m128i chunk = _mm_load_si128( (const __m128i *)( src + pos ) );
		_mm_stream_si128( (__m128i *)( dst + pos ), chunk );
		pos += 16;
	}

	// remaining whole 32-bit words
	while ( pos + 4 <= numBytes ) {
		*(uint32 *)( dst + pos ) = *(const uint32 *)( src + pos );
		pos += 4;
	}

	// trailing bytes
	while ( pos < numBytes ) {
		dst[pos] = src[pos];
		pos++;
	}

	// order the non-temporal stores before anything written after this call
	_mm_sfence();
}
コード例 #2
0
/*
========================
idSnapshotProcessor::SubmitPendingSnap
========================
*/
void idSnapshotProcessor::SubmitPendingSnap( int visIndex, uint8* objMemory, int objMemorySize, lzwCompressionData_t* lzwData )
{

	assert_16_byte_aligned( objMemory );
	assert_16_byte_aligned( lzwData );
	
	assert( hasPendingSnap );
	assert( jobMemory->lzwInOutData.numlzwDeltas == 0 );
	
	assert( net_optimalSnapDeltaSize.GetInteger() < jobMemory_t::MAX_LZW_MEM - 128 );		// Leave padding
	
	jobMemory->lzwInOutData.lzwDeltas		= jobMemory->lzwDeltas.Ptr();
	jobMemory->lzwInOutData.maxlzwDeltas	= jobMemory->lzwDeltas.Num();
	jobMemory->lzwInOutData.lzwMem			= jobMemory->lzwMem.Ptr();
	
#ifdef STRESS_LZW_MEM
	jobMemory->lzwInOutData.maxlzwMem		= g_maxlwMem;
#else
	jobMemory->lzwInOutData.maxlzwMem		= jobMemory_t::MAX_LZW_MEM;
#endif
	
	jobMemory->lzwInOutData.lzwDmaOut		= jobMemory_t::MAX_LZW_MEM;
	jobMemory->lzwInOutData.numlzwDeltas	= 0;
	jobMemory->lzwInOutData.lzwBytes		= 0;
	jobMemory->lzwInOutData.optimalLength	= net_optimalSnapDeltaSize.GetInteger();
	jobMemory->lzwInOutData.snapSequence	= snapSequence;
	jobMemory->lzwInOutData.lastObjId		= 0;
	jobMemory->lzwInOutData.lzwData			= lzwData;
	
	idSnapShot::submitDeltaJobsInfo_t submitInfo;
	
	submitInfo.objParms			= jobMemory->objParms.Ptr();
	submitInfo.maxObjParms		= jobMemory->objParms.Num();
	submitInfo.headers			= jobMemory->headers.Ptr();
	submitInfo.maxHeaders		= jobMemory->headers.Num();
	submitInfo.objMemory		= objMemory;
	submitInfo.maxObjMemory		= objMemorySize;
	submitInfo.lzwParms			= jobMemory->lzwParms.Ptr();
	submitInfo.maxDeltaParms	= jobMemory->lzwParms.Num();
	
	
	// Use a copy of base state to avoid race conditions.
	// The main thread could change it behind the jobs backs.
	submittedState				= baseState;
	submittedTemplateStates		= templateStates;
	
	submitInfo.templateStates	= &submittedTemplateStates;
	
	submitInfo.oldSnap			= &submittedState;
	submitInfo.visIndex			= visIndex;
	submitInfo.baseSequence		= baseSequence;
	
	submitInfo.lzwInOutData		= &jobMemory->lzwInOutData;
	
	pendingSnap.SubmitWriteDeltaToJobs( submitInfo );
}
コード例 #3
0
/*
========================
idSnapshotProcessor::idSnapshotProcessor

Allocates the job memory block with Mem_Alloc (tagged TAG_NETWORKING), checks
that the block and the arrays inside it are 16-byte aligned, and then calls
Reset( true ).
========================
*/
idSnapshotProcessor::idSnapshotProcessor()
{
	//assert( mem.IsGlobalHeap() );
	
	jobMemory_t* const allocated = ( jobMemory_t* )Mem_Alloc( sizeof( jobMemory_t ), TAG_NETWORKING );
	jobMemory = allocated;
	
	// the block itself and each of its arrays must be 16-byte aligned
	assert_16_byte_aligned( jobMemory );
	assert_16_byte_aligned( jobMemory->objParms.Ptr() );
	assert_16_byte_aligned( jobMemory->headers.Ptr() );
	assert_16_byte_aligned( jobMemory->lzwParms.Ptr() );
	
	Reset( true );
}
コード例 #4
0
ファイル: ModelDecal.cpp プロジェクト: Deepfreeze32/taken
/*
=====================
R_CopyDecalSurface

Appends one decal's vertices and indexes to the output arrays, starting at
verts[numVerts] and indexes[numIndexes].

Each vertex is copied as two 16-byte halves. A color computed from the
per-vertex depth fade times fadeColor is OR-ed into the color dword of the
second half. Indexes are copied eight at a time with numVerts added to each.

The destination positions, the decal arrays and fadeColor must be 16-byte
aligned, and the decal's vertex and index data sizes must be multiples of 16
bytes. All stores are streaming stores and are fenced before returning.
=====================
*/
static void R_CopyDecalSurface( idDrawVert * verts, int numVerts, triIndex_t * indexes, int numIndexes,
									const decal_t * decal, const float fadeColor[4] ) {
	assert_16_byte_aligned( &verts[numVerts] );
	assert_16_byte_aligned( &indexes[numIndexes] );
	assert_16_byte_aligned( decal->indexes );
	assert_16_byte_aligned( decal->verts );
	assert( ( ( decal->numVerts * sizeof( idDrawVert ) ) & 15 ) == 0 );
	assert( ( ( decal->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );
	assert_16_byte_aligned( fadeColor );

	// numVerts replicated into every 32-bit lane, then saturate-packed to 16-bit lanes
	const __m128i baseVertex32 = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
	const __m128i baseVertex16 = _mm_packs_epi32( baseVertex32, baseVertex32 );
	const __m128 fadeColorVec = _mm_load_ps( fadeColor );
	// selects dword 2 of a 16-byte half
	const __m128i colorLaneMask = _mm_set_epi32( 0, -1, 0, 0 );

	// copy vertices and apply depth/time based fading
	// color sits at byte offset 24, i.e. dword 2 of the second 16-byte half
	assert_offsetof( idDrawVert, color, 6 * 4 );
	idDrawVert * const dstVerts = verts + numVerts;
	for ( int v = 0; v < decal->numVerts; v++ ) {
		const byte * srcBytes = (const byte *)&decal->verts[v];
		byte * dstBytes = (byte *)&dstVerts[v];

		const __m128i firstHalf = _mm_load_si128( (const __m128i *)( srcBytes +  0 ) );
		__m128i secondHalf = _mm_load_si128( (const __m128i *)( srcBytes + 16 ) );

		// this vertex's depth fade replicated across all four lanes
		const __m128 depthFadeVec = _mm_splat_ps( _mm_load_ss( decal->vertDepthFade + v ), 0 );

		// float color -> int32 -> int16 -> uint8, saturating at each narrowing step
		const __m128 fadedColor = _mm_mul_ps( depthFadeVec, fadeColorVec );
		const __m128i color32 = _mm_cvtps_epi32( fadedColor );
		const __m128i color16 = _mm_packs_epi32( color32, color32 );
		const __m128i color8 = _mm_packus_epi16( color16, color16 );

		// OR the packed color into the color dword of the second half
		secondHalf = _mm_or_si128( secondHalf, _mm_and_si128( color8, colorLaneMask ) );

		_mm_stream_si128( (__m128i *)( dstBytes +  0 ), firstHalf );
		_mm_stream_si128( (__m128i *)( dstBytes + 16 ), secondHalf );
	}

	// copy indexes, rebasing each one by numVerts
	assert( ( decal->numIndexes & 7 ) == 0 );
	assert( sizeof( triIndex_t ) == 2 );
	triIndex_t * const dstIndexes = indexes + numIndexes;
	for ( int k = 0; k < decal->numIndexes; k += 8 ) {
		const __m128i srcIdx = _mm_load_si128( (const __m128i *)&decal->indexes[k] );
		const __m128i rebased = _mm_add_epi16( srcIdx, baseVertex16 );

		_mm_stream_si128( (__m128i *)&dstIndexes[k], rebased );
	}

	_mm_sfence();

}
コード例 #5
0
ファイル: BufferObject.cpp プロジェクト: Deepfreeze32/taken
/*
========================
idJointBuffer::AllocBufferObject
========================
*/
bool idJointBuffer::AllocBufferObject( const float * joints, int numAllocJoints ) {
	assert( apiObject == NULL );
	assert_16_byte_aligned( joints );

	if ( numAllocJoints <= 0 ) {
		idLib::Error( "idJointBuffer::AllocBufferObject: joints = %i", numAllocJoints );
	}

	numJoints = numAllocJoints;

	bool allocationFailed = false;

	const int numBytes = GetAllocedSize();

	GLuint buffer = 0;
	qglGenBuffersARB( 1, &buffer );
	qglBindBufferARB( GL_UNIFORM_BUFFER, buffer );
	qglBufferDataARB( GL_UNIFORM_BUFFER, numBytes, NULL, GL_STREAM_DRAW_ARB );
	qglBindBufferARB( GL_UNIFORM_BUFFER, 0);
	apiObject = reinterpret_cast< void * >( buffer );

	if ( r_showBuffers.GetBool() ) {
		idLib::Printf( "joint buffer alloc %p, api %p (%i joints)\n", this, GetAPIObject(), GetNumJoints() );
	}

	// copy the data
	if ( joints != NULL ) {
		Update( joints, numAllocJoints );
	}

	return !allocationFailed;
}
コード例 #6
0
ファイル: BufferObject.cpp プロジェクト: Deepfreeze32/taken
/*
========================
idJointBuffer::Update
========================
*/
void idJointBuffer::Update( const float * joints, int numUpdateJoints ) const {
	assert( apiObject != NULL );
	assert( IsMapped() == false );
	assert_16_byte_aligned( joints );
	assert( ( GetOffset() & 15 ) == 0 );

	if ( numUpdateJoints > numJoints ) {
		idLib::FatalError( "idJointBuffer::Update: size overrun, %i > %i\n", numUpdateJoints, numJoints );
	}

	const int numBytes = numUpdateJoints * 3 * 4 * sizeof( float );

	qglBindBufferARB( GL_UNIFORM_BUFFER, reinterpret_cast< GLuint >( apiObject ) );
	qglBufferSubDataARB( GL_UNIFORM_BUFFER, GetOffset(), (GLsizeiptrARB)numBytes, joints );
}
コード例 #7
0
ファイル: BufferObject.cpp プロジェクト: Deepfreeze32/taken
/*
========================
idJointBuffer::Reference

Makes this buffer refer to a range of another joint buffer rather than owning
storage of its own. Any buffer object currently held is released first; after
that this buffer shares other's API object, starts at other's offset plus
jointRefOffset, and reports numRefJoints joints.

Neither buffer may be mapped, 'other' must have an API object, and the
referenced range must fit inside 'other'.
========================
*/
void idJointBuffer::Reference( const idJointBuffer & other, int jointRefOffset, int numRefJoints ) {
	// neither side may be mapped, and there has to be something to reference
	assert( !IsMapped() );
	assert( !other.IsMapped() );
	assert( other.GetAPIObject() != NULL );

	// the requested range has to be non-negative and fit inside the other buffer
	assert( jointRefOffset >= 0 );
	assert( numRefJoints >= 0 );
	assert( jointRefOffset + numRefJoints * sizeof( idJointMat ) <= other.GetNumJoints() * sizeof( idJointMat ) );

	// the referenced size in bytes has to be a multiple of 16
	assert_16_byte_aligned( numRefJoints * 3 * 4 * sizeof( float ) );

	// drop whatever this buffer held before pointing it at the other buffer
	FreeBufferObject();

	numJoints = numRefJoints;
	offsetInOtherBuffer = other.GetOffset() + jointRefOffset;
	apiObject = other.apiObject;

	assert( !OwnsBuffer() );
}
コード例 #8
0
ファイル: BufferObject.cpp プロジェクト: Deepfreeze32/taken
/*
========================
idIndexBuffer::AllocBufferObject
========================
*/
bool idIndexBuffer::AllocBufferObject( const void * data, int allocSize ) {
	assert( apiObject == NULL );
	assert_16_byte_aligned( data );

	if ( allocSize <= 0 ) {
		idLib::Error( "idIndexBuffer::AllocBufferObject: allocSize = %i", allocSize );
	}

	size = allocSize;

	bool allocationFailed = false;

	int numBytes = GetAllocedSize();


	// clear out any previous error
	qglGetError();

	GLuint bufferObject = 0xFFFF;
	qglGenBuffersARB( 1, & bufferObject );
	if ( bufferObject == 0xFFFF ) {
		GLenum error = qglGetError();
		idLib::FatalError( "idIndexBuffer::AllocBufferObject: failed - GL_Error %d", error );
	}
	qglBindBufferARB( GL_ELEMENT_ARRAY_BUFFER_ARB, bufferObject );

	// these are rewritten every frame
	qglBufferDataARB( GL_ELEMENT_ARRAY_BUFFER_ARB, numBytes, NULL, bufferUsage );
	apiObject = reinterpret_cast< void * >( bufferObject );

	GLenum err = qglGetError();
	if ( err == GL_OUT_OF_MEMORY ) {
		idLib::Warning( "idIndexBuffer:AllocBufferObject: allocation failed" );
		allocationFailed = true;
	}


	if ( r_showBuffers.GetBool() ) {
		idLib::Printf( "index buffer alloc %p, api %p (%i bytes)\n", this, GetAPIObject(), GetSize() );
	}

	// copy the data
	if ( data != NULL ) {
		Update( data, allocSize );
	}

	return !allocationFailed;
}
コード例 #9
0
ファイル: BufferObject.cpp プロジェクト: Deepfreeze32/taken
/*
========================
idVertexBuffer::Update

Uploads updateSize bytes from 'data' into the GL array buffer at this
buffer's offset. The upload length is updateSize rounded up to the next
multiple of 16, so 'data' has to be readable for that padded length.

The buffer must be allocated and not mapped, 'data' must be 16-byte aligned
and the buffer offset must be a multiple of 16. An updateSize larger than the
buffer size is reported through idLib::FatalError.
========================
*/
void idVertexBuffer::Update( const void * data, int updateSize ) const {
	assert( apiObject != NULL );
	assert( IsMapped() == false );
	assert_16_byte_aligned( data );
	assert( ( GetOffset() & 15 ) == 0 );

	if ( updateSize > size ) {
		idLib::FatalError( "idVertexBuffer::Update: size overrun, %i > %i\n", updateSize, GetSize() );
	}

	// round the upload up to a whole number of 16-byte units
	int numBytes = ( updateSize + 15 ) & ~15;

	// apiObject carries the GL buffer name inside a pointer. Convert through a
	// pointer-sized integer first: casting a pointer straight to the 32-bit
	// GLuint is ill-formed where pointers are wider than 32 bits.
	GLuint bufferObject = static_cast< GLuint >( reinterpret_cast< GLintptrARB >( apiObject ) );
	qglBindBufferARB( GL_ARRAY_BUFFER_ARB, bufferObject );
	qglBufferSubDataARB( GL_ARRAY_BUFFER_ARB, GetOffset(), (GLsizeiptrARB)numBytes, data );
/*
	disabled alternative: map the buffer and copy with CopyBuffer

	void * buffer = MapBuffer( BM_WRITE );
	CopyBuffer( (byte *)buffer + GetOffset(), (byte *)data, numBytes );
	UnmapBuffer();
*/
}
コード例 #10
0
ファイル: ModelDecal.cpp プロジェクト: Deepfreeze32/taken
/*
============
R_DecalPointCullStatic

Computes a cull byte for every vertex against six planes and writes it to
cullBits. Bit n of a vertex's byte is set when the expression
x * plane[n].a + y * plane[n].b + z * plane[n].c + plane[n].d evaluated by
the _mm_madd_ps chain compares greater than zero.

Vertices are processed four at a time and four cull bytes are written with
one 32-bit store. cullBits and verts must be 16-byte aligned.

NOTE(review): the inner loop only handles whole groups of four vertices; how
a vertex count that is not a multiple of four is covered depends on
idODSStreamedArray::FetchNextBatch, which is not visible here - confirm.
============
*/
static void R_DecalPointCullStatic( byte * cullBits, const idPlane * planes, const idDrawVert * verts, const int numVerts ) {
	assert_16_byte_aligned( cullBits );
	assert_16_byte_aligned( verts );


	// streamed view over the vertex array; vertices are read through it in batches
	idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );

	const __m128 vector_float_zero	= { 0.0f, 0.0f, 0.0f, 0.0f };
	// one single-bit mask per plane, replicated into all four 32-bit lanes
	const __m128i vector_int_mask0	= _mm_set1_epi32( 1 << 0 );
	const __m128i vector_int_mask1	= _mm_set1_epi32( 1 << 1 );
	const __m128i vector_int_mask2	= _mm_set1_epi32( 1 << 2 );
	const __m128i vector_int_mask3	= _mm_set1_epi32( 1 << 3 );
	const __m128i vector_int_mask4	= _mm_set1_epi32( 1 << 4 );
	const __m128i vector_int_mask5	= _mm_set1_epi32( 1 << 5 );

	// the planes are loaded unaligned; no alignment is asserted for them
	const __m128 p0 = _mm_loadu_ps( planes[0].ToFloatPtr() );
	const __m128 p1 = _mm_loadu_ps( planes[1].ToFloatPtr() );
	const __m128 p2 = _mm_loadu_ps( planes[2].ToFloatPtr() );
	const __m128 p3 = _mm_loadu_ps( planes[3].ToFloatPtr() );
	const __m128 p4 = _mm_loadu_ps( planes[4].ToFloatPtr() );
	const __m128 p5 = _mm_loadu_ps( planes[5].ToFloatPtr() );

	// broadcast each plane component across a whole vector so that four
	// vertices can be tested against the same plane at once
	const __m128 p0X = _mm_splat_ps( p0, 0 );
	const __m128 p0Y = _mm_splat_ps( p0, 1 );
	const __m128 p0Z = _mm_splat_ps( p0, 2 );
	const __m128 p0W = _mm_splat_ps( p0, 3 );

	const __m128 p1X = _mm_splat_ps( p1, 0 );
	const __m128 p1Y = _mm_splat_ps( p1, 1 );
	const __m128 p1Z = _mm_splat_ps( p1, 2 );
	const __m128 p1W = _mm_splat_ps( p1, 3 );

	const __m128 p2X = _mm_splat_ps( p2, 0 );
	const __m128 p2Y = _mm_splat_ps( p2, 1 );
	const __m128 p2Z = _mm_splat_ps( p2, 2 );
	const __m128 p2W = _mm_splat_ps( p2, 3 );

	const __m128 p3X = _mm_splat_ps( p3, 0 );
	const __m128 p3Y = _mm_splat_ps( p3, 1 );
	const __m128 p3Z = _mm_splat_ps( p3, 2 );
	const __m128 p3W = _mm_splat_ps( p3, 3 );

	const __m128 p4X = _mm_splat_ps( p4, 0 );
	const __m128 p4Y = _mm_splat_ps( p4, 1 );
	const __m128 p4Z = _mm_splat_ps( p4, 2 );
	const __m128 p4W = _mm_splat_ps( p4, 3 );

	const __m128 p5X = _mm_splat_ps( p5, 0 );
	const __m128 p5Y = _mm_splat_ps( p5, 1 );
	const __m128 p5Z = _mm_splat_ps( p5, 2 );
	const __m128 p5W = _mm_splat_ps( p5, 3 );

	// outer loop: one fetched batch per pass; i is only advanced by the inner loop
	for ( int i = 0; i < numVerts; ) {

		// last start index at which a full group of four is still inside the fetched range
		const int nextNumVerts = vertsODS.FetchNextBatch() - 4;

		for ( ; i <= nextNumVerts; i += 4 ) {
			// aligned 16-byte load starting at each vertex's xyz
			const __m128 v0 = _mm_load_ps( vertsODS[i + 0].xyz.ToFloatPtr() );
			const __m128 v1 = _mm_load_ps( vertsODS[i + 1].xyz.ToFloatPtr() );
			const __m128 v2 = _mm_load_ps( vertsODS[i + 2].xyz.ToFloatPtr() );
			const __m128 v3 = _mm_load_ps( vertsODS[i + 3].xyz.ToFloatPtr() );

			// transpose the four vertices into X, Y and Z vectors; the fourth
			// component of each load is not used
			const __m128 r0 = _mm_unpacklo_ps( v0, v2 );	// v0.x, v2.x, v0.z, v2.z
			const __m128 r1 = _mm_unpackhi_ps( v0, v2 );	// v0.y, v2.y, v0.w, v2.w
			const __m128 r2 = _mm_unpacklo_ps( v1, v3 );	// v1.x, v3.x, v1.z, v3.z
			const __m128 r3 = _mm_unpackhi_ps( v1, v3 );	// v1.y, v3.y, v1.w, v3.w

			const __m128 vX = _mm_unpacklo_ps( r0, r2 );	// v0.x, v1.x, v2.x, v3.x
			const __m128 vY = _mm_unpackhi_ps( r0, r2 );	// v0.y, v1.y, v2.y, v3.y
			const __m128 vZ = _mm_unpacklo_ps( r1, r3 );	// v0.z, v1.z, v2.z, v3.z

			// plane expression for four vertices at once
			// (presumably _mm_madd_ps( a, b, c ) is a * b + c - the macro is defined elsewhere)
			const __m128 d0 = _mm_madd_ps( vX, p0X, _mm_madd_ps( vY, p0Y, _mm_madd_ps( vZ, p0Z, p0W ) ) );
			const __m128 d1 = _mm_madd_ps( vX, p1X, _mm_madd_ps( vY, p1Y, _mm_madd_ps( vZ, p1Z, p1W ) ) );
			const __m128 d2 = _mm_madd_ps( vX, p2X, _mm_madd_ps( vY, p2Y, _mm_madd_ps( vZ, p2Z, p2W ) ) );
			const __m128 d3 = _mm_madd_ps( vX, p3X, _mm_madd_ps( vY, p3Y, _mm_madd_ps( vZ, p3Z, p3W ) ) );
			const __m128 d4 = _mm_madd_ps( vX, p4X, _mm_madd_ps( vY, p4Y, _mm_madd_ps( vZ, p4Z, p4W ) ) );
			const __m128 d5 = _mm_madd_ps( vX, p5X, _mm_madd_ps( vY, p5Y, _mm_madd_ps( vZ, p5Z, p5W ) ) );

			// all-ones lanes where the result is greater than zero
			__m128i c0 = __m128c( _mm_cmpgt_ps( d0, vector_float_zero ) );
			__m128i c1 = __m128c( _mm_cmpgt_ps( d1, vector_float_zero ) );
			__m128i c2 = __m128c( _mm_cmpgt_ps( d2, vector_float_zero ) );
			__m128i c3 = __m128c( _mm_cmpgt_ps( d3, vector_float_zero ) );
			__m128i c4 = __m128c( _mm_cmpgt_ps( d4, vector_float_zero ) );
			__m128i c5 = __m128c( _mm_cmpgt_ps( d5, vector_float_zero ) );

			// reduce each comparison result to that plane's single bit
			c0 = _mm_and_si128( c0, vector_int_mask0 );
			c1 = _mm_and_si128( c1, vector_int_mask1 );
			c2 = _mm_and_si128( c2, vector_int_mask2 );
			c3 = _mm_and_si128( c3, vector_int_mask3 );
			c4 = _mm_and_si128( c4, vector_int_mask4 );
			c5 = _mm_and_si128( c5, vector_int_mask5 );

			// merge the six plane bits into one value per vertex
			c0 = _mm_or_si128( c0, c1 );
			c2 = _mm_or_si128( c2, c3 );
			c4 = _mm_or_si128( c4, c5 );

			c0 = _mm_or_si128( c0, c2 );
			c0 = _mm_or_si128( c0, c4 );

			// narrow the four 32-bit results to four bytes
			__m128i s0 = _mm_packs_epi32( c0, c0 );
			__m128i b0 = _mm_packus_epi16( s0, s0 );

			// store the cull bytes of vertices i .. i+3 with a single 32-bit write
			*(unsigned int *)&cullBits[i] = _mm_cvtsi128_si32( b0 );
		}
	}

}
コード例 #11
0
void idSnapShot::SubmitObjectJob(	const submitDeltaJobsInfo_t& 	submitDeltaJobsInfo,
									objectState_t* 					newState,
									objectState_t* 					oldState,
									objParms_t*&					baseObjParm,
									objParms_t*&					curObjParm,
									objHeader_t*&					curHeader,
									uint8*&						curObjDest,
									lzwParm_t*&					curlzwParm
								)
{
	assert( newState != NULL || oldState != NULL );
	assert_16_byte_aligned( curHeader );
	assert_16_byte_aligned( curObjDest );
	
	int32 dataSize = newState != NULL ? newState->buffer.Size() : 0;
	int totalSize = OBJ_DEST_SIZE_ALIGN16( dataSize );
	
	if( curObjParm - submitDeltaJobsInfo.objParms >= submitDeltaJobsInfo.maxObjParms )
	{
		idLib::Error( "Out of parms for snapshot jobs.\n" );
	}
	
	// Check to see if we are out of dest write space, and need to flush the jobs
	bool needToSubmit = ( curObjDest - submitDeltaJobsInfo.objMemory ) + totalSize >= submitDeltaJobsInfo.maxObjMemory;
	needToSubmit |= ( curHeader - submitDeltaJobsInfo.headers >= submitDeltaJobsInfo.maxHeaders );
	
	if( needToSubmit )
	{
		// If this obj will put us over the limit, then submit the jobs now, and start over re-using the same buffers
		SubmitLZWJob( submitDeltaJobsInfo, baseObjParm, curObjParm, curlzwParm, true );
		curHeader	= submitDeltaJobsInfo.headers;
		curObjDest	= submitDeltaJobsInfo.objMemory;
	}
	
	// Setup obj parms
	assert( submitDeltaJobsInfo.visIndex < 256 );
	curObjParm->visIndex	= submitDeltaJobsInfo.visIndex;
	curObjParm->destHeader	= curHeader;
	curObjParm->dest		= curObjDest;
	
	memset( &curObjParm->newState, 0, sizeof( curObjParm->newState ) );
	memset( &curObjParm->oldState, 0, sizeof( curObjParm->oldState ) );
	
	if( newState != NULL )
	{
		assert( newState->buffer.Size() <= 65535 );
		
		curObjParm->newState.valid		= 1;
		curObjParm->newState.data		= newState->buffer.Ptr();
		curObjParm->newState.size		= newState->buffer.Size();
		curObjParm->newState.objectNum	= newState->objectNum;
		curObjParm->newState.visMask	= newState->visMask;
	}
	
	if( oldState != NULL )
	{
		assert( oldState->buffer.Size() <= 65535 );
		
		curObjParm->oldState.valid		= 1;
		curObjParm->oldState.data		= oldState->buffer.Ptr();
		curObjParm->oldState.size		= oldState->buffer.Size();
		curObjParm->oldState.objectNum	= oldState->objectNum;
		curObjParm->oldState.visMask	= oldState->visMask;
	}
	
	assert_16_byte_aligned( curObjParm );
	assert_16_byte_aligned( curObjParm->newState.data );
	assert_16_byte_aligned( curObjParm->oldState.data );
	
	SnapshotObjectJob( curObjParm );
	
	// Advance past header + data
	curObjDest += totalSize;
	
	// Advance parm pointer
	curObjParm++;
	
	// Advance header pointer
	curHeader++;
}
コード例 #12
0
/*
====================
R_CopyOverlaySurface

Appends one overlay's vertices and indexes to the output arrays, starting at
verts[numVerts] and indexes[numIndexes].

Every overlay vertex names a source vertex in sourceVerts. That vertex is
copied and its texture coordinates are replaced with the overlay's st.
Indexes are copied with numVerts added to each.

The destination positions and the overlay arrays must be 16-byte aligned, and
the overlay's vertex and index data sizes must be multiples of 16 bytes.
====================
*/
static void R_CopyOverlaySurface( idDrawVert* verts, int numVerts, triIndex_t* indexes, int numIndexes, const overlay_t* overlay, const idDrawVert* sourceVerts )
{
	assert_16_byte_aligned( &verts[numVerts] );
	assert_16_byte_aligned( &indexes[numIndexes] );
	assert_16_byte_aligned( overlay->verts );
	assert_16_byte_aligned( overlay->indexes );
	assert( ( ( overlay->numVerts * sizeof( idDrawVert ) ) & 15 ) == 0 );
	assert( ( ( overlay->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );
	
#if defined(USE_INTRINSICS)
	
	// keeps dwords 0..2 of a 16-byte half and clears dword 3
	const __m128i keepFirstThreeDwords = _mm_set_epi32( 0, -1, -1, -1 );
	// numVerts replicated into every 32-bit lane, then saturate-packed to 16-bit lanes
	const __m128i baseVertex32 = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
	const __m128i baseVertex16 = _mm_packs_epi32( baseVertex32, baseVertex32 );
	
	// copy vertices
	for( int v = 0; v < overlay->numVerts; v++ )
	{
		const overlayVertex_t& ov = overlay->verts[v];
		const byte* srcBytes = ( const byte* )&sourceVerts[ov.vertexNum];
		byte* dstBytes = ( byte* )&verts[numVerts + v];
		
		__m128i firstHalf = _mm_load_si128( ( const __m128i* )( srcBytes +  0 ) );
		const __m128i secondHalf = _mm_load_si128( ( const __m128i* )( srcBytes + 16 ) );
		
		// load both st values as one 32-bit value and move it from dword 0 to dword 3
		__m128i packedST = _mm_cvtsi32_si128( *( unsigned int* )ov.st );
		packedST = _mm_shuffle_epi32( packedST, _MM_SHUFFLE( 0, 1, 2, 3 ) );
		
		// replace the last dword of the first half with the overlay's st
		firstHalf = _mm_and_si128( firstHalf, keepFirstThreeDwords );
		firstHalf = _mm_or_si128( firstHalf, packedST );
		
		_mm_stream_si128( ( __m128i* )( dstBytes +  0 ), firstHalf );
		_mm_stream_si128( ( __m128i* )( dstBytes + 16 ), secondHalf );
	}
	
	// copy indexes, eight at a time, rebasing each one by numVerts
	assert( ( overlay->numIndexes & 7 ) == 0 );
	assert( sizeof( triIndex_t ) == 2 );
	for( int k = 0; k < overlay->numIndexes; k += 8 )
	{
		const __m128i srcIdx = _mm_load_si128( ( const __m128i* )&overlay->indexes[k] );
		const __m128i rebased = _mm_add_epi16( srcIdx, baseVertex16 );
		
		_mm_stream_si128( ( __m128i* )&indexes[numIndexes + k], rebased );
	}
	
	// order the streaming stores before returning
	_mm_sfence();
	
#else
	
	// copy vertices
	for( int v = 0; v < overlay->numVerts; v++ )
	{
		const overlayVertex_t& ov = overlay->verts[v];
	
		// NOTE: bad out-of-order write-combined write, SIMD code does the right thing
		verts[numVerts + v] = sourceVerts[ov.vertexNum];
	
		// RB begin
		verts[numVerts + v].SetTexCoordS( ov.st[0] );
		verts[numVerts + v].SetTexCoordT( ov.st[1] );
		// RB end
	}
	
	// copy indexes, two at a time, rebasing each one by numVerts
	for( int k = 0; k < overlay->numIndexes; k += 2 )
	{
		assert( overlay->indexes[k + 0] < overlay->numVerts && overlay->indexes[k + 1] < overlay->numVerts );
		WriteIndexPair( &indexes[numIndexes + k], numVerts + overlay->indexes[k + 0], numVerts + overlay->indexes[k + 1] );
	}
	
#endif
}
コード例 #13
0
/*
====================
R_OverlayPointCullSkinned

For every vertex, evaluates two planes at the vertex position returned by
LoadSkinnedDrawVertPosition (Scalar_LoadSkinnedDrawVertPosition in the scalar
path) and writes three per-vertex outputs:

	texCoordS[i]	d0 converted to a half float
	texCoordT[i]	d1 converted to a half float
	cullBits[i]		bit 0: d0 below zero, bit 1: d1 below zero,
					bit 2: ( 1 - d0 ) below zero, bit 3: ( 1 - d1 ) below zero

where d0 / d1 are the values computed for planes[0] / planes[1].

cullBits, texCoordS, texCoordT and verts must be 16-byte aligned. The
USE_INTRINSICS path handles four vertices per iteration, the fallback path
one. Note that the intrinsics path tests "less than zero" while the fallback
tests the sign bit.

NOTE(review): the intrinsics inner loop only handles whole groups of four
vertices; how a vertex count that is not a multiple of four is covered
depends on idODSStreamedArray::FetchNextBatch, which is not visible here -
confirm.
====================
*/
static void R_OverlayPointCullSkinned( byte* cullBits, halfFloat_t* texCoordS, halfFloat_t* texCoordT, const idPlane* planes, const idDrawVert* verts, const int numVerts, const idJointMat* joints )
{
	assert_16_byte_aligned( cullBits );
	assert_16_byte_aligned( texCoordS );
	assert_16_byte_aligned( texCoordT );
	assert_16_byte_aligned( verts );
	
#if defined(USE_INTRINSICS)
	// streamed view over the vertex array; vertices are read through it in batches
	idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
	
	const __m128 vector_float_zero	= { 0.0f, 0.0f, 0.0f, 0.0f };
	const __m128 vector_float_one	= { 1.0f, 1.0f, 1.0f, 1.0f };
	// one single-bit mask per test, replicated into all four 32-bit lanes
	const __m128i vector_int_mask0	= _mm_set1_epi32( 1 << 0 );
	const __m128i vector_int_mask1	= _mm_set1_epi32( 1 << 1 );
	const __m128i vector_int_mask2	= _mm_set1_epi32( 1 << 2 );
	const __m128i vector_int_mask3	= _mm_set1_epi32( 1 << 3 );
	
	// the planes are loaded unaligned; no alignment is asserted for them
	const __m128 p0 = _mm_loadu_ps( planes[0].ToFloatPtr() );
	const __m128 p1 = _mm_loadu_ps( planes[1].ToFloatPtr() );
	
	// broadcast each plane component so four vertices can be evaluated at once
	const __m128 p0X = _mm_splat_ps( p0, 0 );
	const __m128 p0Y = _mm_splat_ps( p0, 1 );
	const __m128 p0Z = _mm_splat_ps( p0, 2 );
	const __m128 p0W = _mm_splat_ps( p0, 3 );
	
	const __m128 p1X = _mm_splat_ps( p1, 0 );
	const __m128 p1Y = _mm_splat_ps( p1, 1 );
	const __m128 p1Z = _mm_splat_ps( p1, 2 );
	const __m128 p1W = _mm_splat_ps( p1, 3 );
	
	// outer loop: one fetched batch per pass; i is only advanced by the inner loop
	for( int i = 0; i < numVerts; )
	{
	
		// last start index at which a full group of four is still inside the fetched range
		const int nextNumVerts = vertsODS.FetchNextBatch() - 4;
		
		for( ; i <= nextNumVerts; i += 4 )
		{
			// positions of four consecutive vertices as returned by the skinning helper
			const __m128 v0 = LoadSkinnedDrawVertPosition( vertsODS[i + 0], joints );
			const __m128 v1 = LoadSkinnedDrawVertPosition( vertsODS[i + 1], joints );
			const __m128 v2 = LoadSkinnedDrawVertPosition( vertsODS[i + 2], joints );
			const __m128 v3 = LoadSkinnedDrawVertPosition( vertsODS[i + 3], joints );
			
			// transpose the four positions into X, Y and Z vectors
			const __m128 r0 = _mm_unpacklo_ps( v0, v2 );	// v0.x, v2.x, v0.z, v2.z
			const __m128 r1 = _mm_unpackhi_ps( v0, v2 );	// v0.y, v2.y, v0.w, v2.w
			const __m128 r2 = _mm_unpacklo_ps( v1, v3 );	// v1.x, v3.x, v1.z, v3.z
			const __m128 r3 = _mm_unpackhi_ps( v1, v3 );	// v1.y, v3.y, v1.w, v3.w
			
			const __m128 vX = _mm_unpacklo_ps( r0, r2 );	// v0.x, v1.x, v2.x, v3.x
			const __m128 vY = _mm_unpackhi_ps( r0, r2 );	// v0.y, v1.y, v2.y, v3.y
			const __m128 vZ = _mm_unpacklo_ps( r1, r3 );	// v0.z, v1.z, v2.z, v3.z
			
			// d0 / d1: plane expressions for four vertices at once
			// (presumably _mm_madd_ps( a, b, c ) is a * b + c - the macro is defined elsewhere)
			// d2 / d3: one minus those values
			const __m128 d0 = _mm_madd_ps( vX, p0X, _mm_madd_ps( vY, p0Y, _mm_madd_ps( vZ, p0Z, p0W ) ) );
			const __m128 d1 = _mm_madd_ps( vX, p1X, _mm_madd_ps( vY, p1Y, _mm_madd_ps( vZ, p1Z, p1W ) ) );
			const __m128 d2 = _mm_sub_ps( vector_float_one, d0 );
			const __m128 d3 = _mm_sub_ps( vector_float_one, d1 );
			
			// d0 / d1 become the half-float texture coordinates
			__m128i flt16S = FastF32toF16( __m128c( d0 ) );
			__m128i flt16T = FastF32toF16( __m128c( d1 ) );
			
			// store the low 64 bits: four halfFloat_t values for vertices i .. i+3
			_mm_storel_epi64( ( __m128i* )&texCoordS[i], flt16S );
			_mm_storel_epi64( ( __m128i* )&texCoordT[i], flt16T );
			
			// all-ones lanes where the value is less than zero
			__m128i c0 = __m128c( _mm_cmplt_ps( d0, vector_float_zero ) );
			__m128i c1 = __m128c( _mm_cmplt_ps( d1, vector_float_zero ) );
			__m128i c2 = __m128c( _mm_cmplt_ps( d2, vector_float_zero ) );
			__m128i c3 = __m128c( _mm_cmplt_ps( d3, vector_float_zero ) );
			
			// reduce each comparison result to that test's single bit
			c0 = _mm_and_si128( c0, vector_int_mask0 );
			c1 = _mm_and_si128( c1, vector_int_mask1 );
			c2 = _mm_and_si128( c2, vector_int_mask2 );
			c3 = _mm_and_si128( c3, vector_int_mask3 );
			
			// merge the four bits into one value per vertex
			c0 = _mm_or_si128( c0, c1 );
			c2 = _mm_or_si128( c2, c3 );
			c0 = _mm_or_si128( c0, c2 );
			
			// narrow the four 32-bit results to four bytes
			c0 = _mm_packs_epi32( c0, c0 );
			c0 = _mm_packus_epi16( c0, c0 );
			
			// store the cull bytes of vertices i .. i+3 with a single 32-bit write
			*( unsigned int* )&cullBits[i] = _mm_cvtsi128_si32( c0 );
		}
	}
	
#else
	
	// scalar fallback: same outputs, one vertex at a time
	idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
	
	for( int i = 0; i < numVerts; )
	{
	
		// last index available in the fetched range
		const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
	
		for( ; i <= nextNumVerts; i++ )
		{
			const idVec3 transformed = Scalar_LoadSkinnedDrawVertPosition( vertsODS[i], joints );
	
			// plane distances, and one minus each of them
			const float d0 = planes[0].Distance( transformed );
			const float d1 = planes[1].Distance( transformed );
			const float d2 = 1.0f - d0;
			const float d3 = 1.0f - d1;
	
			// d0 / d1 become the half-float texture coordinates
			halfFloat_t s = Scalar_FastF32toF16( d0 );
			halfFloat_t t = Scalar_FastF32toF16( d1 );
	
			texCoordS[i] = s;
			texCoordT[i] = t;
	
			// one bit per value, taken from the float's sign bit
			byte bits;
			bits  = IEEE_FLT_SIGNBITSET( d0 ) << 0;
			bits |= IEEE_FLT_SIGNBITSET( d1 ) << 1;
			bits |= IEEE_FLT_SIGNBITSET( d2 ) << 2;
			bits |= IEEE_FLT_SIGNBITSET( d3 ) << 3;
	
			cullBits[i] = bits;
		}
	}
	
#endif
}