// Tests a single point against the bounds stored in boundsMin / boundsMax, one test per
// SIMD lane.
//   point     - the point to test; each coordinate is broadcast to all four lanes.
//   boundsMin - lower limits, indexed by axis: [0] is compared with x, [1] with y, [2] with z.
//   boundsMax - upper limits, same layout as boundsMin.
// Returns the AND of the six comparison results (min <= coordinate <= max on every axis,
// both ends inclusive). Presumably each lane describes a separate box and a lane of the
// result is "all bits set" when the point is inside that box - confirm against the
// CmpGeSIMD / CmpLeSIMD definitions.
inline fltx4 IsPointInBoundsX4( const Vector point, fltx4 boundsMin[3], fltx4 boundsMax[3] )
{
	// Broadcast each coordinate of the point across a full SIMD register.
	fltx4 px = ReplicateX4( point.x );
	fltx4 py = ReplicateX4( point.y );
	fltx4 pz = ReplicateX4( point.z );

	// Per-axis containment: coordinate lies within [min, max] for that axis.
	fltx4 insideX = AndSIMD( CmpGeSIMD( px, boundsMin[0] ), CmpLeSIMD( px, boundsMax[0] ) );
	fltx4 insideY = AndSIMD( CmpGeSIMD( py, boundsMin[1] ), CmpLeSIMD( py, boundsMax[1] ) );
	fltx4 insideZ = AndSIMD( CmpGeSIMD( pz, boundsMin[2] ), CmpLeSIMD( pz, boundsMax[2] ) );

	// The point is in bounds only where all three axes agree.
	fltx4 insideXY = AndSIMD( insideX, insideY );
	return AndSIMD( insideXY, insideZ );
}
// Traces a bundle of four rays through the kd-tree (OptimizedKDTree) and records, per ray,
// the closest triangle hit found.
//
//   rays              - origins and directions of the four rays.
//   TMin, TMax        - per-ray parametric interval to search. Passed by value; the local copies
//                       are narrowed during clipping and traversal.
//   DirectionSignMask - bit 0/1/2 selects, for the x/y/z split axis, which of the two children of
//                       a node is visited first (see front_idx / back_idx below). Presumably the
//                       bits encode the sign of the ray directions per axis - confirm with callers.
//   rslt_out          - receives HitIds, HitDistance and surface_normal for each ray. Initialised
//                       here to "no hit": HitIds bytes all 0xff, HitDistance 1.0e23, normal 0.
//   skip_id           - triangles whose m_nTriangleID equals this value are never tested.
//   pCallback         - optional; consulted only for triangles flagged FCACHETRI_TRANSPARENT.
//                       When it is null such triangles are recorded like any other hit.
//
// There is no return value; all results are written through rslt_out.
//
// NOTE(review): IsAnyNegative() is used throughout as "at least one lane of this compare mask
// is set" - this relies on the compare helpers producing a sign bit in passing lanes; confirm
// against their definitions.
void RayTracingEnvironment::Trace4Rays(const FourRays &rays, fltx4 TMin, fltx4 TMax,
									   int DirectionSignMask, RayTracingResult *rslt_out,
									   int32 skip_id, ITransparentTriangleCallback *pCallback)
{
	// validation of the ray bundle; implementation not visible from here
	rays.Check();

	// every byte of HitIds becomes 0xff (i.e. -1 per entry if the ids are signed 32-bit ints)
	memset(rslt_out->HitIds,0xff,sizeof(rslt_out->HitIds));

	// "infinitely far" sentinel; any real hit must be closer than this to be recorded
	rslt_out->HitDistance=ReplicateX4(1.0e23);

	rslt_out->surface_normal.DuplicateVector(Vector(0.,0.,0.));
	// per-component 1/direction, computed once so plane distances need only a multiply.
	// presumably MakeReciprocalSaturate clamps the result for (near-)zero components - TODO confirm
	FourVectors OneOverRayDir=rays.direction;
	OneOverRayDir.MakeReciprocalSaturate();
	
	// now, clip rays against bounding box
	// (slab test: for each axis, intersect [TMin,TMax] with the interval between the ray's
	// crossings of the m_MinBound and m_MaxBound planes)
	for(int c=0;c<3;c++)
	{
		fltx4 isect_min_t=
			MulSIMD(SubSIMD(ReplicateX4(m_MinBound[c]),rays.origin[c]),OneOverRayDir[c]);
		fltx4 isect_max_t=
			MulSIMD(SubSIMD(ReplicateX4(m_MaxBound[c]),rays.origin[c]),OneOverRayDir[c]);
		TMin=MaxSIMD(TMin,MinSIMD(isect_min_t,isect_max_t));
		TMax=MinSIMD(TMax,MaxSIMD(isect_min_t,isect_max_t));
	}
	fltx4 active=CmpLeSIMD(TMin,TMax);					// mask of which rays are active
	if (! IsAnyNegative(active) )
		return;												// missed bounding box

	// mailbox: one slot per (tnum mod MAILBOX_HASH_SIZE) remembering the last triangle tested
	// there. all bytes 0xff means no triangle index matches initially.
	int32 mailboxids[MAILBOX_HASH_SIZE];					// used to avoid redundant triangle tests
	memset(mailboxids,0xff,sizeof(mailboxids));				// !!speed!! keep around?

	int front_idx[3],back_idx[3];							// based on ray direction, whether to
															// visit left or right node first

	// the values are offsets (0 or 1) added to a node's first child; the two children are
	// addressed as adjacent entries of OptimizedKDTree.
	if (DirectionSignMask & 1)
	{
		back_idx[0]=0;
		front_idx[0]=1;
	}
		else
	{
		back_idx[0]=1;
		front_idx[0]=0;
	}
	if (DirectionSignMask & 2)
	{
		back_idx[1]=0;
		front_idx[1]=1;
	}
	else
	{
		back_idx[1]=1;
		front_idx[1]=0;
	}
	if (DirectionSignMask & 4)
	{
		back_idx[2]=0;
		front_idx[2]=1;
	}
	else
	{
		back_idx[2]=1;
		front_idx[2]=0;
	}
		
	// explicit traversal stack. it grows downwards: empty when stack_ptr points one past the
	// last element, push is "--stack_ptr then write", pop is "read then stack_ptr++".
	NodeToVisit NodeQueue[MAX_NODE_STACK_LEN];
	CacheOptimizedKDNode const *CurNode=&(OptimizedKDTree[0]);
	NodeToVisit *stack_ptr=&NodeQueue[MAX_NODE_STACK_LEN];
	while(1)
	{
		while (CurNode->NodeType() != KDNODE_STATE_LEAF)		// traverse until next leaf
		{	   
			// for interior nodes the node type is used directly as the split axis index
			int split_plane_number=CurNode->NodeType();
			// despite the name, this is the first of the two adjacent children; front/back
			// is chosen by adding front_idx / back_idx to it
			CacheOptimizedKDNode const *FrontChild=&(OptimizedKDTree[CurNode->LeftChild()]);
			
			fltx4 dist_to_sep_plane=						// dist=(split-org)/dir
				MulSIMD(
					SubSIMD(ReplicateX4(CurNode->SplittingPlaneValue),
							   rays.origin[split_plane_number]),OneOverRayDir[split_plane_number]);
			// NOTE(review): shadows the outer 'active'; recomputed because TMin/TMax change per node
			fltx4 active=CmpLeSIMD(TMin,TMax);			// mask of which rays are active

			// now, decide how to traverse children. can either do front,back, or do front and push
			// back.
			// a ray touches the front child if the split plane is at or beyond its TMin
			fltx4 hits_front=AndSIMD(active,CmpGeSIMD(dist_to_sep_plane,TMin));
			if (! IsAnyNegative(hits_front))
			{
				// missed the front. only traverse back
				//printf("only visit back %d\n",CurNode->LeftChild()+back_idx[split_plane_number]);
				CurNode=FrontChild+back_idx[split_plane_number];
				// the back child only covers the part of the interval past the plane
				TMin=MaxSIMD(TMin, dist_to_sep_plane);

			}
			else
			{
				// a ray touches the back child if the split plane is at or before its TMax
				fltx4 hits_back=AndSIMD(active,CmpLeSIMD(dist_to_sep_plane,TMax));
				if (! IsAnyNegative(hits_back) )
				{
					// missed the back - only need to traverse front node
					//printf("only visit front %d\n",CurNode->LeftChild()+front_idx[split_plane_number]);
					CurNode=FrontChild+front_idx[split_plane_number];
					// the front child only covers the part of the interval before the plane
					TMax=MinSIMD(TMax, dist_to_sep_plane);
				}
				else
				{
					// at least some rays hit both nodes.
					// must push far, traverse near
 					//printf("visit %d,%d\n",CurNode->LeftChild()+front_idx[split_plane_number],
 					//	   CurNode->LeftChild()+back_idx[split_plane_number]);
					// NOTE(review): stack overflow is only guarded by this assert, so a build
					// with asserts disabled would write below NodeQueue if the tree is deeper
					// than MAX_NODE_STACK_LEN
					assert(stack_ptr>NodeQueue);
					--stack_ptr;
					// the far child is saved together with the interval it will be searched over
					stack_ptr->node=FrontChild+back_idx[split_plane_number];
					stack_ptr->TMin=MaxSIMD(TMin,dist_to_sep_plane);
					stack_ptr->TMax=TMax;
					CurNode=FrontChild+front_idx[split_plane_number];
					TMax=MinSIMD(TMax,dist_to_sep_plane);
				}
			}
		}
		// hit a leaf! must do intersection check
		int ntris=CurNode->NumberOfTrianglesInLeaf();
		if (ntris)
		{
			// the leaf's triangles are a run of ntris indices inside TriangleIndexList
			int32 const *tlist=&(TriangleIndexList[CurNode->TriangleIndexStart()]);
			do
			{
				// note: each 'continue' below jumps to the while(--ntris) condition, so the
				// triangle counter is still decremented for rejected triangles
				int tnum=*(tlist++);
				//printf("try tri %d\n",tnum);
				// check mailbox
				// (the mask assumes MAILBOX_HASH_SIZE is a power of two - TODO confirm)
				int mbox_slot=tnum & (MAILBOX_HASH_SIZE-1);
				TriIntersectData_t const *tri = &( OptimizedTriangleList[tnum].m_Data.m_IntersectData );
				// skip triangles already tested for this bundle (same tnum in its mailbox slot)
				// and the triangle the caller asked to ignore
				if ( ( mailboxids[mbox_slot] != tnum ) && ( tri->m_nTriangleID != skip_id ) )
				{
					n_intersection_calculations++;
					mailboxids[mbox_slot] = tnum;
					// compute plane intersection


					// triangle plane normal, broadcast so it can be used against all four rays
					FourVectors N;
					N.x = ReplicateX4( tri->m_flNx );
					N.y = ReplicateX4( tri->m_flNy );
					N.z = ReplicateX4( tri->m_flNz );

					// operator* on two FourVectors yields a fltx4, used here as the per-ray
					// dot product of direction and normal
					fltx4 DDotN = rays.direction * N;
					// mask off zero or near zero (ray parallel to surface)
					fltx4 did_hit = OrSIMD( CmpGtSIMD( DDotN,FourEpsilons ),
											CmpLtSIMD( DDotN, FourNegativeEpsilons ) );

					// plane equation N.P = m_flD with P = origin + t*direction gives
					// t = (m_flD - origin.N) / (direction.N)
					fltx4 numerator=SubSIMD( ReplicateX4( tri->m_flD ), rays.origin * N );

					// lanes with a near-zero denominator produce garbage here, but they are
					// already cleared in did_hit
					fltx4 isect_t=DivSIMD( numerator,DDotN );
					// now, we have the distance to the plane. lets update our mask
					// keep only intersections in front of the origin...
					did_hit = AndSIMD( did_hit, CmpGtSIMD( isect_t, FourZeros ) );
					//did_hit=AndSIMD(did_hit,CmpLtSIMD(isect_t,TMax));
					// ...and closer than the best hit recorded so far for that ray
					did_hit = AndSIMD( did_hit, CmpLtSIMD( isect_t, rslt_out->HitDistance ) );

					if ( ! IsAnyNegative( did_hit ) )
						continue;

					// now, check 3 edges
					// only two coordinates of the hit point are needed; which two is selected
					// per triangle by m_nCoordSelect0 / m_nCoordSelect1
					fltx4 hitc1 = AddSIMD( rays.origin[tri->m_nCoordSelect0],
										MulSIMD( isect_t, rays.direction[ tri->m_nCoordSelect0] ) );
					fltx4 hitc2 = AddSIMD( rays.origin[tri->m_nCoordSelect1],
										   MulSIMD( isect_t, rays.direction[tri->m_nCoordSelect1] ) );
					
					// do barycentric coordinate check
					// B0 = E[0]*hitc1 + E[1]*hitc2 + E[2]
					fltx4 B0 = MulSIMD( ReplicateX4( tri->m_ProjectedEdgeEquations[0] ), hitc1 );

					B0 = AddSIMD(
						B0,
						MulSIMD( ReplicateX4( tri->m_ProjectedEdgeEquations[1] ), hitc2 ) );
					B0 = AddSIMD(
						B0, ReplicateX4( tri->m_ProjectedEdgeEquations[2] ) );

					did_hit = AndSIMD( did_hit, CmpGeSIMD( B0, FourZeros ) );

					// B1 = E[3]*hitc1 + E[4]*hitc2 + E[5]
					fltx4 B1 = MulSIMD( ReplicateX4( tri->m_ProjectedEdgeEquations[3] ), hitc1 );
					B1 = AddSIMD(
						B1,
						MulSIMD( ReplicateX4( tri->m_ProjectedEdgeEquations[4]), hitc2 ) );

					B1 = AddSIMD(
						B1, ReplicateX4( tri->m_ProjectedEdgeEquations[5] ) );
					
					did_hit = AndSIMD( did_hit, CmpGeSIMD( B1, FourZeros ) );

					// third edge: inside only if B0 + B1 <= 1 (B2 here holds the sum, not the
					// third barycentric coordinate)
					fltx4 B2 = AddSIMD( B1, B0 );
					did_hit = AndSIMD( did_hit, CmpLeSIMD( B2, Four_Ones ) );

					if ( ! IsAnyNegative( did_hit ) )
						continue;

					// if the triangle is transparent
					if ( tri->m_nFlags & FCACHETRI_TRANSPARENT )
					{
						if ( pCallback )
						{
							// assuming a triangle indexed as v0, v1, v2
							// the projected edge equations are set up such that the vert opposite the first
							// equation is v2, and the vert opposite the second equation is v0
							// Therefore we pass them back in 1, 2, 0 order
							// Also B2 is currently B1 + B0 and needs to be 1 - (B1+B0) in order to be a real
							// barycentric coordinate.  Compute that now and pass it to the callback
							fltx4 b2 = SubSIMD( Four_Ones, B2 );
							// did_hit is passed by pointer, so the callback may also edit the mask.
							// a true return clears the mask for all four rays: nothing is recorded
							// for this triangle and the rays carry on.
							if ( pCallback->VisitTriangle_ShouldContinue( *tri, rays, &did_hit, &B1, &b2, &B0, tnum ) )
							{
								did_hit = Four_Zeros;
							}
						}
					}
					// now, set the hit_id and closest_hit fields for any enabled rays
					// each store below is a masked select: lanes set in did_hit take the new
					// value, all other lanes keep what rslt_out already holds. with an all-zero
					// mask the stores leave rslt_out unchanged.
					fltx4 replicated_n = ReplicateIX4(tnum);
					StoreAlignedSIMD((float *) rslt_out->HitIds,
								 OrSIMD(AndSIMD(replicated_n,did_hit),
										   AndNotSIMD(did_hit,LoadAlignedSIMD(
															 (float *) rslt_out->HitIds))));
					rslt_out->HitDistance=OrSIMD(AndSIMD(isect_t,did_hit),
									 AndNotSIMD(did_hit,rslt_out->HitDistance));

					rslt_out->surface_normal.x=OrSIMD(
						AndSIMD(N.x,did_hit),
						AndNotSIMD(did_hit,rslt_out->surface_normal.x));
					rslt_out->surface_normal.y=OrSIMD(
						AndSIMD(N.y,did_hit),
						AndNotSIMD(did_hit,rslt_out->surface_normal.y));
					rslt_out->surface_normal.z=OrSIMD(
						AndSIMD(N.z,did_hit),
						AndNotSIMD(did_hit,rslt_out->surface_normal.z));
					
				}
			} while (--ntris);
			// now, check if all rays have terminated
			// despite its name, 'raydone' marks rays that are NOT finished: lanes where the
			// end of this leaf's interval is not beyond the recorded hit distance. traversal
			// stops only when no lane is left in that state.
			fltx4 raydone=CmpLeSIMD(TMax,rslt_out->HitDistance);
			if (! IsAnyNegative(raydone))
			{
				return;
			}
		}
		
		// an empty stack means every node that needed visiting has been visited
 		if (stack_ptr==&NodeQueue[MAX_NODE_STACK_LEN])
		{
			return;
		}
		// pop stack!
		// restores the deferred far child together with the interval saved when it was pushed
		CurNode=stack_ptr->node;
		TMin=stack_ptr->TMin;
		TMax=stack_ptr->TMax;
		stack_ptr++;
	}
}