Ejemplo n.º 1
0
// Converts four luxel-space (s, t) coordinate pairs into four world-space positions.
//
// l     - lighting info supplying the luxel origin, the two luxel-to-world axis vectors
//         and the face whose lightmap texture minimums offset the coordinates.
// s, t  - four luxel coordinates per axis, one per SIMD lane.
// world - receives luxelOrigin + (s + mins[0]) * axis[0] + (t + mins[1]) * axis[1].
void LuxelSpaceToWorld( lightinfo_t const *l, fltx4 s, fltx4 t, FourVectors &world )
{
	// Offset both coordinates by the face's lightmap minimums (expressed in luxels).
	fltx4 sOffset = AddSIMD( s, ReplicateX4( l->face->m_LightmapTextureMinsInLuxels[0] ) );
	fltx4 tOffset = AddSIMD( t, ReplicateX4( l->face->m_LightmapTextureMinsInLuxels[1] ) );

	// Scale each luxel-to-world axis by its per-lane coordinate.
	FourVectors alongS;
	alongS.DuplicateVector( l->luxelToWorldSpace[0] );
	alongS *= sOffset;

	FourVectors alongT;
	alongT.DuplicateVector( l->luxelToWorldSpace[1] );
	alongT *= tOffset;

	// Start at the luxel origin and add the s contribution, then the t contribution.
	world.DuplicateVector( l->luxelOrigin );
	world += alongS;
	world += alongT;
}
Ejemplo n.º 2
0
void LightDesc_t::ComputeLightAtPointsForDirectional(
	const FourVectors &pos, const FourVectors &normal,
	FourVectors &color, bool DoHalfLambert ) const
{
	FourVectors delta;
	delta.DuplicateVector(m_Direction);
//	delta.VectorNormalizeFast();
	fltx4 strength=delta*normal;
	if (DoHalfLambert)
	{
		strength=AddSIMD(MulSIMD(strength,Four_PointFives),Four_PointFives);
	}
	else
		strength=MaxSIMD(Four_Zeros,delta*normal);
		
	color.x=AddSIMD(color.x,MulSIMD(strength,ReplicateX4(m_Color.x)));
	color.y=AddSIMD(color.y,MulSIMD(strength,ReplicateX4(m_Color.y)));
	color.z=AddSIMD(color.z,MulSIMD(strength,ReplicateX4(m_Color.z)));
}
Ejemplo n.º 3
0
// Converts four pixels of SIMD-format linear RGB into packed 32-bit pixel values.
//
// intens      - linear intensities; x/y/z hold red/green/blue for four pixels.
// p1 .. p4    - destinations for SIMD lanes 0 .. 3 respectively.
//
// Each channel is raised to IGAMMA, scaled by 255 and clamped to at most 255, then turned
// into an integer by adding Four_MagicNumbers and masking with PIXMASK. Red lands in bits
// 0-7, green in bits 8-15 and blue in bits 16-23 of each output word.
//
// NOTE: do not modify this routine casually - the original author warns that introducing
// anything that generates FPU stack references here would be bad news.
void MapLinearIntensities(FourVectors const &intens,uint32 *p1, uint32 *p2, uint32 *p3, uint32 *p4)
{
	static fltx4 pixscale={255.0,255.0,255.0,255.0};

	// gamma-correct, scale to 0..255 and clamp the upper end
	fltx4 red   = PowSIMD( intens.x, IGAMMA );
	fltx4 green = PowSIMD( intens.y, IGAMMA );
	fltx4 blue  = PowSIMD( intens.z, IGAMMA );
	red   = MinSIMD( pixscale, MulSIMD( pixscale, red ) );
	green = MinSIMD( pixscale, MulSIMD( pixscale, green ) );
	blue  = MinSIMD( pixscale, MulSIMD( pixscale, blue ) );

	// float -> integer bits via the magic-number add, keeping only the pixel bits
	red   = AddSIMD( red, Four_MagicNumbers );
	green = AddSIMD( green, Four_MagicNumbers );
	blue  = AddSIMD( blue, Four_MagicNumbers );
	red   = AndSIMD( red, PIXMASK );
	green = AndSIMD( green, PIXMASK );
	blue  = AndSIMD( blue, PIXMASK );

	// pack one lane per output pixel
	*p1 = ( SubInt( red, 0 ) ) | ( SubInt( green, 0 ) << 8 ) | ( SubInt( blue, 0 ) << 16 );
	*p2 = ( SubInt( red, 1 ) ) | ( SubInt( green, 1 ) << 8 ) | ( SubInt( blue, 1 ) << 16 );
	*p3 = ( SubInt( red, 2 ) ) | ( SubInt( green, 2 ) << 8 ) | ( SubInt( blue, 2 ) << 16 );
	*p4 = ( SubInt( red, 3 ) ) | ( SubInt( green, 3 ) << 8 ) | ( SubInt( blue, 3 ) << 16 );
}
Ejemplo n.º 4
0
	// Produces four random values (one per SIMD lane) and advances the generator state.
	//
	// The result is the sum of the two table entries addressed by m_pRand_K and m_pRand_J,
	// with 1.0 subtracted from any lane that reached 1.0 or more. The result is written back
	// to the K slot, then both pointers step backwards through m_RandY, wrapping to entry 54.
	inline fltx4 RandSIMD( void )
	{
		fltx4 next = AddSIMD( *m_pRand_K, *m_pRand_J );

		// lanes >= 1.0 get exactly 1.0 subtracted; all other lanes subtract 0
		fltx4 carry = AndSIMD( Four_Ones, CmpGeSIMD( next, Four_Ones ) );
		next = SubSIMD( next, carry );

		// the new value replaces the K entry it was derived from
		*m_pRand_K = next;

		// step both cursors down one slot, wrapping past the start of the table
		--m_pRand_J;
		if ( m_pRand_J < m_RandY )
		{
			m_pRand_J = m_RandY + 54;
		}

		--m_pRand_K;
		if ( m_pRand_K < m_RandY )
		{
			m_pRand_K = m_RandY + 54;
		}

		return next;
	}
Ejemplo n.º 5
0
void RayTracingEnvironment::RenderScene(
	int width, int height,								   // width and height of desired rendering
	int stride,											 // actual width in pixels of target buffer
	uint32 *output_buffer,									// pointer to destination 
	Vector CameraOrigin,									// eye position
	Vector ULCorner,										// word space coordinates of upper left
															// monitor corner
	Vector URCorner,										// top right corner
	Vector LLCorner,										// lower left
	Vector LRCorner,										// lower right
	RayTraceLightingMode_t lmode)
{
	// first, compute deltas
	Vector dxvector=URCorner;
	dxvector-=ULCorner;
	dxvector*=(1.0/width);
	Vector dxvectortimes2=dxvector;
	dxvectortimes2+=dxvector;

	Vector dyvector=LLCorner;
	dyvector-=ULCorner;
	dyvector*=(1.0/height);


	// block_offsets-relative offsets for eahc of the 4 pixels in the block, in sse format
	FourVectors block_offsets;
	block_offsets.LoadAndSwizzle(Vector(0,0,0),dxvector,dyvector,dxvector+dyvector);
	
	FourRays myrays;
	myrays.origin.DuplicateVector(CameraOrigin);
	
	// tmprays is used fo rthe case when we cannot trace 4 rays at once.
	FourRays tmprays;
	tmprays.origin.DuplicateVector(CameraOrigin);

	// now, we will ray trace pixels. we will do the rays in a 2x2 pattern
	for(int y=0;y<height;y+=2)
	{
		Vector SLoc=dyvector;
		SLoc*=((float) y);
		SLoc+=ULCorner;
		uint32 *dest=output_buffer+y*stride;
		for(int x=0;x<width;x+=2)
		{
			myrays.direction.DuplicateVector(SLoc);
			myrays.direction+=block_offsets;
			myrays.direction.VectorNormalize();
			
			RayTracingResult rslt;
			Trace4Rays(myrays,all_zeros,TraceLimit, &rslt);
			if ((rslt.HitIds[0]==-1) && (rslt.HitIds[1]==-1) && 
				(rslt.HitIds[2]==-1) && (rslt.HitIds[3]==-1))
				MapLinearIntensities(BackgroundColor,dest,dest+1,dest+stride,dest+stride+1);
			else
			{
				// make sure normal points back towards ray origin
				fltx4 ndoti=rslt.surface_normal*myrays.direction;
				fltx4 bad_dirs=AndSIMD(CmpGtSIMD(ndoti,Four_Zeros),
										   LoadAlignedSIMD((float *) signmask));

				// flip signs of all "wrong" normals
				rslt.surface_normal.x=XorSIMD(bad_dirs,rslt.surface_normal.x);
				rslt.surface_normal.y=XorSIMD(bad_dirs,rslt.surface_normal.y);
				rslt.surface_normal.z=XorSIMD(bad_dirs,rslt.surface_normal.z);

				FourVectors intens;
				intens.DuplicateVector(Vector(0,0,0));
				// set up colors
				FourVectors surf_colors;
				surf_colors.DuplicateVector(Vector(0,0,0));
				for(int i=0;i<4;i++)
				{
					if (rslt.HitIds[i]>=0)
					{
						surf_colors.X(i)=TriangleColors[rslt.HitIds[i]].x;
						surf_colors.Y(i)=TriangleColors[rslt.HitIds[i]].y;
						surf_colors.Z(i)=TriangleColors[rslt.HitIds[i]].z;
					}

				}
				FourVectors surface_pos=myrays.direction;
				surface_pos*=rslt.HitDistance;
				surface_pos+=myrays.origin;
				
				switch(lmode)
				{
					case DIRECT_LIGHTING:
					{
						// light all points
						for(int l=0;l<LightList.Count();l++)
						{
							LightList[l].ComputeLightAtPoints(surface_pos,rslt.surface_normal,
															  intens);
						}
					}
					break;

					case DIRECT_LIGHTING_WITH_SHADOWS:
					{
						// light all points
						for(int l=0;l<LightList.Count();l++)
						{
							FourVectors ldir;
							ldir.DuplicateVector(LightList[l].m_Position);
							ldir-=surface_pos;
							fltx4 MaxT=ldir.length();
							ldir.VectorNormalizeFast();
							// now, compute shadow flag
							FourRays myrays;
							myrays.origin=surface_pos;
							FourVectors epsilon=ldir;
							epsilon*=0.01;
							myrays.origin+=epsilon;
							myrays.direction=ldir;
							RayTracingResult shadowtest;
							Trace4Rays(myrays,Four_Zeros,MaxT, &shadowtest);
							fltx4 unshadowed=CmpGtSIMD(shadowtest.HitDistance,MaxT);
							if (! (IsAllZeros(unshadowed)))
							{
								FourVectors tmp;
								tmp.DuplicateVector(Vector(0,0,0));
								LightList[l].ComputeLightAtPoints(surface_pos,rslt.surface_normal,
																  tmp);
								intens.x=AddSIMD(intens.x,AndSIMD(tmp.x,unshadowed));
								intens.y=AddSIMD(intens.y,AndSIMD(tmp.y,unshadowed));
								intens.z=AddSIMD(intens.z,AndSIMD(tmp.z,unshadowed));
							}
						}
					}
					break;
				}
				// now, mask off non-hitting pixels
				intens.VProduct(surf_colors);
				fltx4 no_hit_mask=CmpGtSIMD(rslt.HitDistance,TraceLimit);
				
				intens.x=OrSIMD(AndSIMD(BackgroundColor.x,no_hit_mask),
								   AndNotSIMD(no_hit_mask,intens.x));
				intens.y=OrSIMD(AndSIMD(BackgroundColor.y,no_hit_mask),
								   AndNotSIMD(no_hit_mask,intens.y));
				intens.z=OrSIMD(AndSIMD(BackgroundColor.y,no_hit_mask),
								   AndNotSIMD(no_hit_mask,intens.z));

				MapLinearIntensities(intens,dest,dest+1,dest+stride,dest+stride+1);
			}
			dest+=2;
			SLoc+=dxvectortimes2;
		}
	}
}
Ejemplo n.º 6
0
// Evaluates interpolated lattice noise at four 3D points at once.
//
// x, y, z - coordinates of the four sample points, one point per SIMD lane.
//
// Each coordinate is converted to an integer whose low 8 bits are used as the
// interpolation fraction (scaled by 1/256) and whose remaining bits select the lattice
// cell. The eight surrounding lattice values are fetched with GetLatticePointValue and
// blended with trilinear interpolation. The interpolated value v is returned as
// 2 * (v - 0.5).
fltx4 NoiseSIMD( const fltx4 & x, const fltx4 & y, const fltx4 & z )
{
	// use magic to convert to integer index: adding Four_MagicNumbers and masking with
	// MASK255 leaves an integer bit pattern in each lane that is read back with SubInt below
	fltx4 x_idx = AndSIMD( MASK255, AddSIMD( x, Four_MagicNumbers ) );
	fltx4 y_idx = AndSIMD( MASK255, AddSIMD( y, Four_MagicNumbers ) );
	fltx4 z_idx = AndSIMD( MASK255, AddSIMD( z, Four_MagicNumbers ) );

	// latticeXYZ holds, for all four points, the lattice value at cell corner offset (X,Y,Z)
	fltx4 lattice000 = Four_Zeros, lattice001 = Four_Zeros, lattice010 = Four_Zeros, lattice011 = Four_Zeros;
	fltx4 lattice100 = Four_Zeros, lattice101 = Four_Zeros, lattice110 = Four_Zeros, lattice111 = Four_Zeros;

	// FIXME: Converting the input vectors to int indices will cause load-hit-stores (48 bytes)
	//        Converting the indexed noise values back to vectors will cause more (128 bytes)
	//        The noise table could store vectors if we chunked it into 2x2x2 blocks.
	fltx4 xfrac = Four_Zeros, yfrac = Four_Zeros, zfrac = Four_Zeros;

	// DOPASS(i) processes SIMD lane i: it splits each integer coordinate into a fraction
	// (low 8 bits / 256) and a lattice index (value >> 8), then gathers the eight corner
	// lattice values for that lane. The macro remains defined after this function.
#define DOPASS(i)															\
    {	unsigned int xi = SubInt( x_idx, i );								\
		unsigned int yi = SubInt( y_idx, i );								\
		unsigned int zi = SubInt( z_idx, i );								\
		SubFloat( xfrac, i ) = (xi & 0xff)*(1.0/256.0);						\
		SubFloat( yfrac, i ) = (yi & 0xff)*(1.0/256.0);						\
		SubFloat( zfrac, i ) = (zi & 0xff)*(1.0/256.0);						\
		xi>>=8;																\
		yi>>=8;																\
		zi>>=8;																\
																			\
		SubFloat( lattice000, i ) = GetLatticePointValue( xi,yi,zi );		\
		SubFloat( lattice001, i ) = GetLatticePointValue( xi,yi,zi+1 );		\
		SubFloat( lattice010, i ) = GetLatticePointValue( xi,yi+1,zi );		\
		SubFloat( lattice011, i ) = GetLatticePointValue( xi,yi+1,zi+1 );	\
		SubFloat( lattice100, i ) = GetLatticePointValue( xi+1,yi,zi );		\
		SubFloat( lattice101, i ) = GetLatticePointValue( xi+1,yi,zi+1 );	\
		SubFloat( lattice110, i ) = GetLatticePointValue( xi+1,yi+1,zi );	\
		SubFloat( lattice111, i ) = GetLatticePointValue( xi+1,yi+1,zi+1 );	\
    }

	DOPASS( 0 );
	DOPASS( 1 );
	DOPASS( 2 );
	DOPASS( 3 );

	// now, we have 8 lattice values for each of four points as m128s, and interpolant values for
	// each axis in m128 form in [xyz]frac. Perform the trilinear interpolation as SIMD ops

	// first, do x interpolation: collapse the 8 corners to 4 values
	fltx4 l2d00 = AddSIMD( lattice000, MulSIMD( xfrac, SubSIMD( lattice100, lattice000 ) ) );
	fltx4 l2d01 = AddSIMD( lattice001, MulSIMD( xfrac, SubSIMD( lattice101, lattice001 ) ) );
	fltx4 l2d10 = AddSIMD( lattice010, MulSIMD( xfrac, SubSIMD( lattice110, lattice010 ) ) );
	fltx4 l2d11 = AddSIMD( lattice011, MulSIMD( xfrac, SubSIMD( lattice111, lattice011 ) ) );

	// now, do y interpolation: collapse 4 values to 2
	fltx4 l1d0 = AddSIMD( l2d00, MulSIMD( yfrac, SubSIMD( l2d10, l2d00 ) ) );
	fltx4 l1d1 = AddSIMD( l2d01, MulSIMD( yfrac, SubSIMD( l2d11, l2d01 ) ) );

	// final z interpolation
	fltx4 rslt = AddSIMD( l1d0, MulSIMD( zfrac, SubSIMD( l1d1, l1d0 ) ) );

	// recenter and rescale: returns 2 * (rslt - 0.5), which maps 0..1 onto -1..1
	// NOTE(review): presumably GetLatticePointValue yields values in 0..1 - confirm
	return MulSIMD( Four_Twos, SubSIMD( rslt, Four_PointFives ) );


}
// Distributes the lights to render into the m_hPreSortedLights buckets and records
// whether any volumetrics need drawing (m_bDrawVolumetrics).
//
// A light goes into a "fullscreen" bucket when the view origin lies inside the light's
// naive bounds grown by (m_flzNear + 2) on every axis; otherwise it goes into the matching
// "world" bucket. Spot lights that pass the bounds test are additionally checked against
// their spot frustum using a box of the same size around the view origin.
//
// NOTE(review): this listing ends before the matching #endif and the function's closing
// brace, so the tail of the function is not visible here.
void CLightingManager::SortLights()
{
#if DEBUG
	// the buckets are expected to be empty on entry
	for ( int i = 0; i < LSORT_COUNT; i++ )
		Assert( m_hPreSortedLights[ i ].Count() == 0 );
#endif

	m_bDrawVolumetrics = false;

	// margin applied around the view origin / light bounds
	float zNear = m_flzNear + 2;

	Vector vecBloat( zNear, zNear, zNear );

	// box around the camera used for the spot frustum test
	Vector camMins( m_vecViewOrigin - vecBloat );
	Vector camMaxs( m_vecViewOrigin + vecBloat );

#if DEFCFG_USE_SSE
	fltx4 zNearX4 = ReplicateX4( zNear );

	// each presort entry carries SIMD bounds plus up to s.count light pointers
	for( int i = 0; i < m_uiSortDataCount; i++ )
	{
		def_light_presortdatax4_t& s = m_pSortDataX4[i];

		// grow the bounds of every lane by the margin
		fltx4 adjustedMins[3] = { SubSIMD( s.bounds_min_naive[0], zNearX4 ),
			SubSIMD( s.bounds_min_naive[1], zNearX4 ),
			SubSIMD( s.bounds_min_naive[2], zNearX4 ) };

		fltx4 adjustedMaxs[3] = { AddSIMD( s.bounds_max_naive[0], zNearX4 ),
			AddSIMD( s.bounds_max_naive[1], zNearX4 ),
			AddSIMD( s.bounds_max_naive[2], zNearX4 ) };

		fltx4 needsFullscreen = IsPointInBoundsX4( m_vecViewOrigin,
			adjustedMins, adjustedMaxs );

		//Jack: this is terrible I know
		// NOTE(review): this inner 'i' shadows the outer loop index; inside this loop
		// 'i' is the lane / light index within the current presort entry.
		// NOTE(review): the _isnan() checks presumably rely on IsPointInBoundsX4 returning
		// an all-ones lane mask for "inside", which reads back as a NaN float - confirm.
		for( int i = 0; i < s.count; i++ )
		{
			if( s.lights[i]->IsSpot() )
			{
				if( _isnan( SubFloat( needsFullscreen, i ) ) )
				{
					// inside the bounds: fullscreen only if the frustum does not cull the camera box
					if( !s.lights[i]->spotFrustum.CullBox( camMins, camMaxs ) )
					{
						m_hPreSortedLights[LSORT_SPOT_FULLSCREEN].AddToTail( s.lights[i] );
					}
					else
					{
						m_hPreSortedLights[LSORT_SPOT_WORLD].AddToTail( s.lights[i] );
					}
				}
				else
				{
					m_hPreSortedLights[LSORT_SPOT_WORLD].AddToTail( s.lights[i] );
				}
			}
			else
			{
				if( _isnan( SubFloat( needsFullscreen, i ) ) )
				{
					m_hPreSortedLights[LSORT_POINT_FULLSCREEN].AddToTail( s.lights[i] );
				}
				else
				{
					m_hPreSortedLights[LSORT_POINT_WORLD].AddToTail( s.lights[i] );
				}
			}
		}

		// NOTE(review): this ANDs hasVolumetrics with itself, which leaves it unchanged.
		// The non-SSE path below requires HasShadow() && HasVolumetrics(); a shadow mask
		// may have been intended as the second operand - confirm.
		fltx4 volume = AndSIMD( s.hasVolumetrics, s.hasVolumetrics );

		m_bDrawVolumetrics = m_bDrawVolumetrics || !IsAllZeros( volume );
	}
#else
	// scalar path: one light at a time
	FOR_EACH_VEC_FAST( def_light_t*, m_hRenderLights, l )
	{
		bool bNeedsFullscreen = IsPointInBounds( m_vecViewOrigin,
			l->bounds_min_naive - vecBloat,
			l->bounds_max_naive + vecBloat );

		// spot lights also need the camera box to survive their frustum cull test
		if ( bNeedsFullscreen && l->IsSpot() )
		{
			bNeedsFullscreen = !l->spotFrustum.CullBox( camMins, camMaxs );
		}

		const bool bVolume = ( l->HasShadow() && l->HasVolumetrics() );

		m_bDrawVolumetrics = m_bDrawVolumetrics || bVolume;

		// bucket index is lighttype * 2, plus 1 for the fullscreen variant
		Assert( l->iLighttype * 2 + 1 < LSORT_COUNT );
		m_hPreSortedLights[ l->iLighttype * 2 + (int)bNeedsFullscreen ].AddToTail( l );
	}
Ejemplo n.º 8
0
void LightDesc_t::ComputeLightAtPoints( const FourVectors &pos, const FourVectors &normal,
										FourVectors &color, bool DoHalfLambert ) const
{
	FourVectors delta;
	Assert((m_Type==MATERIAL_LIGHT_POINT) || (m_Type==MATERIAL_LIGHT_SPOT) || (m_Type==MATERIAL_LIGHT_DIRECTIONAL));
	switch (m_Type)
	{
		case MATERIAL_LIGHT_POINT:
		case MATERIAL_LIGHT_SPOT:
			delta.DuplicateVector(m_Position);
			delta-=pos;
			break;
				
		case MATERIAL_LIGHT_DIRECTIONAL:
			ComputeLightAtPointsForDirectional( pos, normal, color, DoHalfLambert );
			return;

		default:
			return;
	}

	fltx4 dist2 = delta*delta;

	dist2=MaxSIMD( Four_Ones, dist2 );

	fltx4 falloff;

	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION0 )
	{
		falloff = ReplicateX4(m_Attenuation0);
	}
	else
		falloff= Four_Epsilons;

	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION1 )
	{
		falloff=AddSIMD(falloff,MulSIMD(ReplicateX4(m_Attenuation1),SqrtEstSIMD(dist2)));
	}

	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION2 )
	{
		falloff=AddSIMD(falloff,MulSIMD(ReplicateX4(m_Attenuation2),dist2));
	}

	falloff=ReciprocalEstSIMD(falloff);
	// Cull out light beyond this radius
	// now, zero out elements for which dist2 was > range^2. !!speed!! lights should store dist^2 in sse format
	if (m_Range != 0.f)
	{
		fltx4 RangeSquared=ReplicateX4(m_RangeSquared); // !!speed!!
		falloff=AndSIMD(falloff,CmpLtSIMD(dist2,RangeSquared));
	}

	delta.VectorNormalizeFast();
	fltx4 strength=delta*normal;
	if (DoHalfLambert)
	{
		strength=AddSIMD(MulSIMD(strength,Four_PointFives),Four_PointFives);
	}
	else
		strength=MaxSIMD(Four_Zeros,delta*normal);
		
	switch(m_Type)
	{
		case MATERIAL_LIGHT_POINT:
			// half-lambert
			break;
				
		case MATERIAL_LIGHT_SPOT:
		{
			fltx4 dot2=SubSIMD(Four_Zeros,delta*m_Direction); // dot position with spot light dir for cone falloff


			fltx4 cone_falloff_scale=MulSIMD(ReplicateX4(m_OneOverThetaDotMinusPhiDot),
												 SubSIMD(dot2,ReplicateX4(m_PhiDot)));
			cone_falloff_scale=MinSIMD(cone_falloff_scale,Four_Ones);
			
			if ((m_Falloff!=0.0) && (m_Falloff!=1.0))
			{
				// !!speed!! could compute integer exponent needed by powsimd and store in light
				cone_falloff_scale=PowSIMD(cone_falloff_scale,m_Falloff);
			}
			strength=MulSIMD(cone_falloff_scale,strength);

			// now, zero out lighting where dot2<phidot. This will mask out any invalid results
			// from pow function, etc
			fltx4 OutsideMask=CmpGtSIMD(dot2,ReplicateX4(m_PhiDot)); // outside light cone?
			strength=AndSIMD(OutsideMask,strength);
		}
		break;

		default:
			break;
	}
	strength=MulSIMD(strength,falloff);
	color.x=AddSIMD(color.x,MulSIMD(strength,ReplicateX4(m_Color.x)));
	color.y=AddSIMD(color.y,MulSIMD(strength,ReplicateX4(m_Color.y)));
	color.z=AddSIMD(color.z,MulSIMD(strength,ReplicateX4(m_Color.z)));
}
Ejemplo n.º 9
0
// Traces four rays at once through the kd-tree and reports the closest hit per ray.
//
// rays              - origins and directions of the four rays.
// TMin, TMax        - per-ray parametric interval to search; it is first clipped against
//                     the scene bounds (m_MinBound / m_MaxBound).
// DirectionSignMask - bits 0, 1 and 2 choose, for split axes 0, 1 and 2, which child of a
//                     node is visited first (bit set: child 1 is the front child).
// rslt_out          - receives the results. HitIds starts as all 0xff bytes, HitDistance as
//                     1.0e23 and surface_normal as zero; lanes that hit are overwritten
//                     with the triangle number, distance and triangle normal.
// skip_id           - triangles whose m_nTriangleID equals this value are not tested.
// pCallback         - consulted for triangles flagged FCACHETRI_TRANSPARENT; when its
//                     VisitTriangle_ShouldContinue returns true the hit is discarded.
//
// NOTE(review): IsAnyNegative is used throughout as "is any lane of this comparison mask
// set" - presumably it tests the lane sign bits; confirm against its definition.
void RayTracingEnvironment::Trace4Rays(const FourRays &rays, fltx4 TMin, fltx4 TMax,
									   int DirectionSignMask, RayTracingResult *rslt_out,
									   int32 skip_id, ITransparentTriangleCallback *pCallback)
{
	rays.Check();

	// every byte 0xff -> each id reads as -1 ("no hit")
	memset(rslt_out->HitIds,0xff,sizeof(rslt_out->HitIds));

	rslt_out->HitDistance=ReplicateX4(1.0e23);

	rslt_out->surface_normal.DuplicateVector(Vector(0.,0.,0.));
	FourVectors OneOverRayDir=rays.direction;
	OneOverRayDir.MakeReciprocalSaturate();
	
	// now, clip rays against bounding box: intersect [TMin,TMax] with each axis slab
	for(int c=0;c<3;c++)
	{
		fltx4 isect_min_t=
			MulSIMD(SubSIMD(ReplicateX4(m_MinBound[c]),rays.origin[c]),OneOverRayDir[c]);
		fltx4 isect_max_t=
			MulSIMD(SubSIMD(ReplicateX4(m_MaxBound[c]),rays.origin[c]),OneOverRayDir[c]);
		TMin=MaxSIMD(TMin,MinSIMD(isect_min_t,isect_max_t));
		TMax=MinSIMD(TMax,MaxSIMD(isect_min_t,isect_max_t));
	}
	fltx4 active=CmpLeSIMD(TMin,TMax);					// mask of which rays are active
	if (! IsAnyNegative(active) )
		return;												// missed bounding box

	// mailboxids[slot] remembers the last triangle number tested in that slot, so a
	// triangle referenced by several leaves is usually tested only once
	int32 mailboxids[MAILBOX_HASH_SIZE];					// used to avoid redundant triangle tests
	memset(mailboxids,0xff,sizeof(mailboxids));				// !!speed!! keep around?

	int front_idx[3],back_idx[3];							// based on ray direction, whether to
															// visit left or right node first

	if (DirectionSignMask & 1)
	{
		back_idx[0]=0;
		front_idx[0]=1;
	}
		else
	{
		back_idx[0]=1;
		front_idx[0]=0;
	}
	if (DirectionSignMask & 2)
	{
		back_idx[1]=0;
		front_idx[1]=1;
	}
	else
	{
		back_idx[1]=1;
		front_idx[1]=0;
	}
	if (DirectionSignMask & 4)
	{
		back_idx[2]=0;
		front_idx[2]=1;
	}
	else
	{
		back_idx[2]=1;
		front_idx[2]=0;
	}
		
	// explicit traversal stack; it grows downward from the end of NodeQueue and is empty
	// when stack_ptr points one past the last element
	NodeToVisit NodeQueue[MAX_NODE_STACK_LEN];
	CacheOptimizedKDNode const *CurNode=&(OptimizedKDTree[0]);
	NodeToVisit *stack_ptr=&NodeQueue[MAX_NODE_STACK_LEN];
	while(1)
	{
		while (CurNode->NodeType() != KDNODE_STATE_LEAF)		// traverse until next leaf
		{	   
			// for interior nodes the node type doubles as the split axis index
			int split_plane_number=CurNode->NodeType();
			// the two children are adjacent; FrontChild points at the first of the pair
			CacheOptimizedKDNode const *FrontChild=&(OptimizedKDTree[CurNode->LeftChild()]);
			
			fltx4 dist_to_sep_plane=						// dist=(split-org)/dir
				MulSIMD(
					SubSIMD(ReplicateX4(CurNode->SplittingPlaneValue),
							   rays.origin[split_plane_number]),OneOverRayDir[split_plane_number]);
			// note: this declaration shadows the function-level 'active'
			fltx4 active=CmpLeSIMD(TMin,TMax);			// mask of which rays are active

			// now, decide how to traverse children. can either do front,back, or do front and push
			// back.
			fltx4 hits_front=AndSIMD(active,CmpGeSIMD(dist_to_sep_plane,TMin));
			if (! IsAnyNegative(hits_front))
			{
				// missed the front. only traverse back
				//printf("only visit back %d\n",CurNode->LeftChild()+back_idx[split_plane_number]);
				CurNode=FrontChild+back_idx[split_plane_number];
				TMin=MaxSIMD(TMin, dist_to_sep_plane);

			}
			else
			{
				fltx4 hits_back=AndSIMD(active,CmpLeSIMD(dist_to_sep_plane,TMax));
				if (! IsAnyNegative(hits_back) )
				{
					// missed the back - only need to traverse front node
					//printf("only visit front %d\n",CurNode->LeftChild()+front_idx[split_plane_number]);
					CurNode=FrontChild+front_idx[split_plane_number];
					TMax=MinSIMD(TMax, dist_to_sep_plane);
				}
				else
				{
					// at least some rays hit both nodes.
					// must push far, traverse near
 					//printf("visit %d,%d\n",CurNode->LeftChild()+front_idx[split_plane_number],
 					//	   CurNode->LeftChild()+back_idx[split_plane_number]);
					assert(stack_ptr>NodeQueue);			// guards against stack overflow
					--stack_ptr;
					stack_ptr->node=FrontChild+back_idx[split_plane_number];
					stack_ptr->TMin=MaxSIMD(TMin,dist_to_sep_plane);
					stack_ptr->TMax=TMax;
					CurNode=FrontChild+front_idx[split_plane_number];
					TMax=MinSIMD(TMax,dist_to_sep_plane);
				}
			}
		}
		// hit a leaf! must do intersection check
		int ntris=CurNode->NumberOfTrianglesInLeaf();
		if (ntris)
		{
			int32 const *tlist=&(TriangleIndexList[CurNode->TriangleIndexStart()]);
			// each 'continue' below jumps to the while(--ntris) test, i.e. on to the next triangle
			do
			{
				int tnum=*(tlist++);
				//printf("try tri %d\n",tnum);
				// check mailbox
				int mbox_slot=tnum & (MAILBOX_HASH_SIZE-1);
				TriIntersectData_t const *tri = &( OptimizedTriangleList[tnum].m_Data.m_IntersectData );
				if ( ( mailboxids[mbox_slot] != tnum ) && ( tri->m_nTriangleID != skip_id ) )
				{
					n_intersection_calculations++;
					mailboxids[mbox_slot] = tnum;
					// compute plane intersection


					FourVectors N;
					N.x = ReplicateX4( tri->m_flNx );
					N.y = ReplicateX4( tri->m_flNy );
					N.z = ReplicateX4( tri->m_flNz );

					fltx4 DDotN = rays.direction * N;
					// mask off zero or near zero (ray parallel to surface)
					fltx4 did_hit = OrSIMD( CmpGtSIMD( DDotN,FourEpsilons ),
											CmpLtSIMD( DDotN, FourNegativeEpsilons ) );

					fltx4 numerator=SubSIMD( ReplicateX4( tri->m_flD ), rays.origin * N );

					fltx4 isect_t=DivSIMD( numerator,DDotN );
					// now, we have the distance to the plane. lets update our mask:
					// the hit must be in front of the origin and closer than the best hit so far
					did_hit = AndSIMD( did_hit, CmpGtSIMD( isect_t, FourZeros ) );
					//did_hit=AndSIMD(did_hit,CmpLtSIMD(isect_t,TMax));
					did_hit = AndSIMD( did_hit, CmpLtSIMD( isect_t, rslt_out->HitDistance ) );

					if ( ! IsAnyNegative( did_hit ) )
						continue;

					// now, check 3 edges, using the hit point's two selected coordinates
					fltx4 hitc1 = AddSIMD( rays.origin[tri->m_nCoordSelect0],
										MulSIMD( isect_t, rays.direction[ tri->m_nCoordSelect0] ) );
					fltx4 hitc2 = AddSIMD( rays.origin[tri->m_nCoordSelect1],
										   MulSIMD( isect_t, rays.direction[tri->m_nCoordSelect1] ) );
					
					// do barycentric coordinate check
					// B0 = e[0]*hitc1 + e[1]*hitc2 + e[2], must be >= 0
					fltx4 B0 = MulSIMD( ReplicateX4( tri->m_ProjectedEdgeEquations[0] ), hitc1 );

					B0 = AddSIMD(
						B0,
						MulSIMD( ReplicateX4( tri->m_ProjectedEdgeEquations[1] ), hitc2 ) );
					B0 = AddSIMD(
						B0, ReplicateX4( tri->m_ProjectedEdgeEquations[2] ) );

					did_hit = AndSIMD( did_hit, CmpGeSIMD( B0, FourZeros ) );

					// B1 = e[3]*hitc1 + e[4]*hitc2 + e[5], must be >= 0
					fltx4 B1 = MulSIMD( ReplicateX4( tri->m_ProjectedEdgeEquations[3] ), hitc1 );
					B1 = AddSIMD(
						B1,
						MulSIMD( ReplicateX4( tri->m_ProjectedEdgeEquations[4]), hitc2 ) );

					B1 = AddSIMD(
						B1, ReplicateX4( tri->m_ProjectedEdgeEquations[5] ) );
					
					did_hit = AndSIMD( did_hit, CmpGeSIMD( B1, FourZeros ) );

					// B0 + B1 must not exceed 1
					fltx4 B2 = AddSIMD( B1, B0 );
					did_hit = AndSIMD( did_hit, CmpLeSIMD( B2, Four_Ones ) );

					if ( ! IsAnyNegative( did_hit ) )
						continue;

					// if the triangle is transparent
					if ( tri->m_nFlags & FCACHETRI_TRANSPARENT )
					{
						if ( pCallback )
						{
							// assuming a triangle indexed as v0, v1, v2
							// the projected edge equations are set up such that the vert opposite the first
							// equation is v2, and the vert opposite the second equation is v0
							// Therefore we pass them back in 1, 2, 0 order
							// Also B2 is currently B1 + B0 and needs to be 1 - (B1+B0) in order to be a real
							// barycentric coordinate.  Compute that now and pass it to the callback
							fltx4 b2 = SubSIMD( Four_Ones, B2 );
							if ( pCallback->VisitTriangle_ShouldContinue( *tri, rays, &did_hit, &B1, &b2, &B0, tnum ) )
							{
								// the callback asked to continue: drop this hit for all lanes
								did_hit = Four_Zeros;
							}
						}
					}
					// now, set the hit_id and closest_hit fields for any enabled rays.
					// each field is a per-lane select: new value where did_hit, old value elsewhere
					fltx4 replicated_n = ReplicateIX4(tnum);
					StoreAlignedSIMD((float *) rslt_out->HitIds,
								 OrSIMD(AndSIMD(replicated_n,did_hit),
										   AndNotSIMD(did_hit,LoadAlignedSIMD(
															 (float *) rslt_out->HitIds))));
					rslt_out->HitDistance=OrSIMD(AndSIMD(isect_t,did_hit),
									 AndNotSIMD(did_hit,rslt_out->HitDistance));

					rslt_out->surface_normal.x=OrSIMD(
						AndSIMD(N.x,did_hit),
						AndNotSIMD(did_hit,rslt_out->surface_normal.x));
					rslt_out->surface_normal.y=OrSIMD(
						AndSIMD(N.y,did_hit),
						AndNotSIMD(did_hit,rslt_out->surface_normal.y));
					rslt_out->surface_normal.z=OrSIMD(
						AndSIMD(N.z,did_hit),
						AndNotSIMD(did_hit,rslt_out->surface_normal.z));
					
				}
			} while (--ntris);
			// now, check if all rays have terminated.
			// despite its name, a 'raydone' lane is set where TMax <= HitDistance, i.e. where
			// the recorded hit is not nearer than this leaf's far bound. when no lane is set,
			// every ray already has a hit nearer than TMax and traversal can stop.
			fltx4 raydone=CmpLeSIMD(TMax,rslt_out->HitDistance);
			if (! IsAnyNegative(raydone))
			{
				return;
			}
		}
		
		// nothing left on the stack: traversal is complete
 		if (stack_ptr==&NodeQueue[MAX_NODE_STACK_LEN])
		{
			return;
		}
		// pop stack!
		CurNode=stack_ptr->node;
		TMin=stack_ptr->TMin;
		TMax=stack_ptr->TMax;
		stack_ptr++;
	}
}