Example #1
0
File: sa.c Project: drigz/5R1
/*
 * Evaluates
 *     | cos^4(x) + cos^4(y) - 2*cos^2(x)*cos^2(y) | / sqrt(x^2 + 2*y^2)
 * for the point (x, y).
 *
 * No guard is applied for x == 0 && y == 0, where the denominator is zero.
 */
real bump(real x, real y)
{
    const real cos_x  = cosr(x);
    const real cos2_x = cos_x * cos_x;    /* cos^2(x) */
    const real cos_y  = cosr(y);
    const real cos2_y = cos_y * cos_y;    /* cos^2(y) */

    return fabsr((cos2_x*cos2_x + cos2_y*cos2_y - 2*cos2_x*cos2_y) / sqrtr(x*x+2*y*y));
}
Example #2
0
/*
 * Pick m of the n 2D points in p[] so that the chosen points are spread as
 * evenly as possible in angle around the centroid of the polygon they form.
 *
 *   n     number of points in p; the scratch arrays below hold 8 entries,
 *         so n must not exceed 8
 *   p     2*n coordinates, p[2*i] is x and p[2*i+1] is y of point i
 *   m     number of indices written to iret
 *   i0    index of the point that is always kept; stored in iret[0]
 *   iret  output: m point indices
 *
 * If no unused point can be matched for a slot (all angle differences are
 * NaN because the polygon has zero area, or m > n), that slot is set to i0
 * rather than being left unwritten.
 */
void cullPoints (s32 n, f32 p[], s32 m, s32 i0, s32 iret[])
{
	// compute the centroid of the polygon in cx,cy
	s32 i,j;
	f32 a,cx,cy,q;
	f32 A[8];
	s32 avail[8];
	if (n==1) {
		cx = p[0];
		cy = p[1];
	}
	else if (n==2) {
		cx = (f32)(0.5)*(p[0] + p[2]);
		cy = (f32)(0.5)*(p[1] + p[3]);
	}
	else {
		// area-weighted centroid: q is the cross product of consecutive vertices
		a = 0;
		cx = 0;
		cy = 0;
		for (i=0; i<(n-1); i++) {
			q = p[i*2]*p[i*2+3] - p[i*2+2]*p[i*2+1];
			a += q;
			cx += q*(p[i*2]+p[i*2+2]);
			cy += q*(p[i*2+1]+p[i*2+3]);
		}
		// closing edge from the last vertex back to the first
		q = p[n*2-2]*p[1] - p[0]*p[n*2-1];
		a = 1.f/((f32)(3.0)*(a+q));
		cx = a*(cx + q*(p[n*2-2]+p[0]));
		cy = a*(cy + q*(p[n*2-1]+p[1]));
	}

	// compute the angle of each point w.r.t. the centroid
	
	for (i=0; i<n; i++) A[i] = atan2r(p[i*2+1]-cy,p[i*2]-cx);

	// search for points that have angles closest to A[i0] + i*(2*pi/m).
	
	for (i=0; i<n; i++) avail[i] = 1;
	avail[i0] = 0;
	iret[0] = i0;
	iret++;
	for (j=1; j<m; j++) {
		f32 maxdiff=1e9,diff;
		a = (f32)(j)*(2*N3DPi/m) + A[i0];
		if (a > N3DPi) a -= 2*N3DPi;

		// Fallback value: without it *iret stays uninitialised whenever no
		// available point satisfies diff < maxdiff, and avail[*iret] below
		// would be indexed with garbage.
		*iret = i0;

		for (i=0; i<n; i++) {
			if (avail[i]) {
				diff = fabsr (A[i]-a);
				if (diff > N3DPi) diff = 2*N3DPi - diff;
				if (diff < maxdiff) {
					maxdiff = diff;
					*iret = i;
				}
			}
		}
		avail[*iret] = 0;
		iret++;
	}
}
Example #3
0
/*
 * Fills z[0..n-1] with the n inner Lobatto nodes (the end points -1 and 1
 * are not included).
 *
 * The first n/2 nodes are found by Newton iteration on
 * legendre_d1(n+1, .) / legendre_d2(n+1, .), starting from
 * cos((n-i)*PI/(n+1)); the remaining ones are obtained by mirroring about 0.
 */
static void lobatto_nodes_aux(real *z, int n)
{
  const int order = n+1;
  const int half = n/2;
  int lo, hi;

  for(lo=0; lo<half; ++lo) {
    real root = cosr( (n-lo)*PI/order );
    real prev;
    /* iterate until the update is below the relative tolerance;
       -root*EPS is used as the threshold */
    for(;;) {
      prev = root;
      root -= legendre_d1(order,root)/legendre_d2(order,root);
      if(!(fabsr(root-prev)>-root*EPS)) break;
    }
    /* one more Newton step after the loop has converged */
    z[lo] = root - legendre_d1(order,root)/legendre_d2(order,root);
  }

  /* odd n: the middle node is exactly zero */
  if((n&1) != 0) z[half]=0;

  /* upper half is the negated lower half, walked in opposite directions */
  hi = (n+1)/2;
  lo = half-1;
  while(hi<n) {
    z[hi] = -z[lo];
    ++hi;
    --lo;
  }
}
Example #4
0
/*
 * Fills z[0..n-1] with n Gauss nodes.
 *
 * The first n/2 nodes are found by Newton iteration on
 * legendre(n, .) / legendre_d1(n, .), starting from
 * cos((2n-2i-1)*(PI/2)/n); the remaining ones are obtained by mirroring
 * about 0.
 */
void gauss_nodes(real *z, int n)
{
  const int half = n/2;
  int lo, hi;

  for(lo=0; lo<half; ++lo) {
    real root = cosr( (2*n-2*lo-1)*(PI/2)/n );
    real prev;
    /* iterate until the update is below the relative tolerance;
       -root*EPS is used as the threshold */
    for(;;) {
      prev = root;
      root -= legendre(n,root)/legendre_d1(n,root);
      if(!(fabsr(root-prev)>-root*EPS)) break;
    }
    /* one more Newton step after the loop has converged */
    z[lo] = root - legendre(n,root)/legendre_d1(n,root);
  }

  /* odd n: the middle node is exactly zero */
  if((n&1) != 0) z[half]=0;

  /* upper half is the negated lower half, walked in opposite directions */
  hi = (n+1)/2;
  lo = half-1;
  while(hi<n) {
    z[hi] = -z[lo];
    ++hi;
    --lo;
  }
}
Example #5
0
File: sa.c Project: drigz/5R1
/*
 * Runs one simulated-annealing search over (x, y) that looks for a large
 * value of bump() and returns the best objective value recorded.
 *
 * seed is passed to srandom() before anything else. The search starts at
 * (5, 5) and evaluates bump() for at most 5000-1 trial points. Behaviour is
 * configured through variables defined outside this function
 * (initial_temp_method, initial_temp, init_step_size, temp_length,
 * penalty_weight, step_method, temp_decay_method, temp_decay).
 *
 * The returned value is the objective at the start point or at a later
 * trial point whose penalty term was exactly zero.
 */
real sa(unsigned seed)
{
    srandom(seed);

    // T == INFINITY marks the initial phase in which the starting
    // temperature has not been chosen yet.
    real T;
    if (initial_temp_method != constant)
        T = INFINITY;
    else
        T = initial_temp;

    real step_size_x = init_step_size, step_size_y = init_step_size;
    real pos_x = 5, pos_y = 5;
    real obj = bump(pos_x, pos_y);
    real obj_pen = obj;     // objective including the penalty term

    // budget of objective evaluations left
    int samples_remaining = 5000-1;

    // obj_d: penalised-objective change of each accepted move since the
    // last temperature change; accepts: penalised objective of each
    // accepted point since the last temperature change.
    real obj_d[5000];
    int n_obj_d = 0;
    real accepts[5000];
    int n_accepts = 0;

    // trial/acceptance counters for the current temperature
    int num_trials = 0, num_acceptances = 0;
    int initial_trials = 500;
    int max_trials = temp_length;
    int max_acceptances = 0.6*temp_length;

    // weights used in the step-size update for the parks step method
    real alpha = 0.1, omega = 2.1;

    real best_obj = obj;
    real best_x = pos_x, best_y = pos_y;
    // value of samples_remaining when the best point was found (or when
    // the search last restarted from it)
    int best_time = samples_remaining;

    while (samples_remaining > 0)
    {
        // restart from the best point when more than 500 samples have been
        // spent since best_time was last set
        if (best_time - samples_remaining > 500)
        {
            best_time = samples_remaining;
            pos_x = best_x;
            pos_y = best_y;
            obj = best_obj;
            obj_pen = best_obj + penalty_weight * penalty(pos_x, pos_y) / T;

            step_size_x = step_size_y = init_step_size;
        }

        // propose a step; presumably randf() is uniform on [0,1) and
        // randn() is standard normal -- confirm against their definitions
        real step_x, step_y;
        if (step_method == gaussian)
        {
            step_x = step_size_x * randn();
            step_y = step_size_y * randn();
        }
        else
        {
            step_x = step_size_x * (2*randf()-1);
            step_y = step_size_y * (2*randf()-1);
        }

        real new_x = pos_x+step_x, new_y = pos_y+step_y;

        real new_pen = penalty_weight * penalty(new_x, new_y);

        // in the initial phase, proposals with a non-zero penalty are
        // dropped without being counted as a sample or a trial
        if (T == INFINITY && new_pen != 0)
            continue;

        real new_obj = bump(new_x, new_y);
        real new_obj_pen = new_obj + new_pen / T;
        samples_remaining--;

        num_trials++;

        // only zero-penalty points can become the recorded best
        if (new_pen == 0)
        {
            if (new_obj > best_obj)
            {
                best_obj = new_obj;
                best_x = new_x;
                best_y = new_y;
                best_time = samples_remaining;
            }
        }

        // acceptance probability; the parks method additionally divides
        // the exponent by the length of the proposed step
        real p;
        if (step_method == parks)
        {
            real step_norm = sqrtr(step_x*step_x + step_y*step_y);
            p = exp(- (obj_pen - new_obj_pen) / (T * step_norm));
        }
        else
        {
            p = exp(- (obj_pen - new_obj_pen) / T);
        }

        if (randf() < p)
        {
            // move accepted: record the change and adopt the new point
            num_acceptances++;
            obj_d[n_obj_d++] = new_obj_pen - obj_pen;
            pos_x = new_x;
            pos_y = new_y;
            obj = new_obj;
            obj_pen = new_obj_pen;

            accepts[n_accepts++] = new_obj_pen;

            // parks method: blend the step size with the size of the
            // accepted step (not done during the initial phase)
            if (T != INFINITY && step_method == parks)
            {
                step_size_x = (1-alpha)*step_size_x + alpha*omega*fabsr(step_x);
                step_size_y = (1-alpha)*step_size_y + alpha*omega*fabsr(step_y);
            }
        }

        bool reduced_T = false;

        if (T == INFINITY)
        {
            // end of the initial phase: derive the starting temperature
            // from the changes recorded in obj_d
            if (num_trials >= initial_trials)
            {
                if (initial_temp_method == kirkpatrick)
                    T = T_kirkpatrick(obj_d, n_obj_d);
                else if (initial_temp_method == white)
                    T = std(obj_d, n_obj_d);
                else
                {
                    fprintf(stderr, "unknown temp method");
                    exit(1);
                }
                reduced_T = true;
            }
        }
        else if (num_trials >= max_trials || num_acceptances >= max_acceptances)
        {
            // enough trials or acceptances at this temperature: lower it
            if (temp_decay_method == huang)
            {
                // factor = exp(-0.7*T/std(accepts)), never below 0.5;
                // 0.5 is used when fewer than two values are available
                real factor;
                if (n_accepts < 2)
                    factor = 0.5;
                else
                {
                    factor = exp(-0.7*T/std(accepts, n_accepts));
                    if (factor < 0.5)
                        factor = 0.5;
                }
                T *= factor;
            }
            else
                T *= temp_decay;
            reduced_T = true;
        }

        // a temperature change resets the per-temperature statistics;
        // accepts[] is re-seeded with the current penalised objective
        if (reduced_T)
        {
            //printf("%g\n", T);
            n_obj_d = 0;
            num_trials = 0;
            num_acceptances = 0;
            n_accepts = 1;
            accepts[0] = obj_pen;
        }
    }

    return best_obj;
}
Example #6
0
// Box-versus-plane contact generation.
//
// o1 is asserted to be of class Col_BBox and o2 of class Col_Plane. Up to
// three contacts are written to gpkContactHolder[0..2] (position, normal and
// depth each); the return value is the number of contacts written, or 0 when
// the early-exit depth test finds the depth negative.
//
// NOTE(review): assumes box->mpkWorld->rot holds the three box axes at
// offsets 0, 4 and 8 and that plane->mkPlane is the plane normal followed by
// an offset at index PND -- confirm against the type definitions.
s32 xColBoxPlane(iCollisionObject* o1,iCollisionObject* o2)
{
	X_Assert(o1->muClass==iCollisionObject::Col_BBox);
	X_Assert(o2->muClass==iCollisionObject::Col_Plane);
	cBBox	*box   = (cBBox*)o1;
	cPlane	*plane = (cPlane*)o2;
	const f32* r = box->mpkWorld->rot;
	const f32* n = plane->mkPlane;

	// a = plane normal multiplied with the box rotation, then scaled
	// component-wise by the half sides and doubled; b1..b3 are the
	// absolute values of its components
	vec4_t a,p;
	f32 b1,b2,b3;
	xMul0_344(a,n,r);	// this is safe
	xQ4Mul(a,a,box->mkHalfSide);
	xQ4Scale(a,2.0f);
	b1 = fabsr(a[0]);
	b2 = fabsr(a[1]);
	b3 = fabsr(a[2]);
	// early exit test
	f32 depth;
	depth = n[PND] + 0.5f*(b1+b2+b3) - xVDotR(n,box->mpkWorld->pos);
	if (X_IsFloatNeg(depth)) return 0;

	// find number of contacts requested
	// NOTE(review): maxc is not referenced again in this function
	const s32 maxc = 3;
	// find deepest point
	xQ4CpyMac(p,box->mpkWorld->pos);

	// P1 moves p along box axis i by the half side (Add or Sub variant);
	// PALL picks the variant from the sign of a[i]
#define P1(i,op) xQ4ScaleN##op(p,&r[i*4],box->mkHalfSide[i]);
#define PALL(i) if (a[i]>0) { P1(i,Sub) } else { P1(i,Add) }
	PALL(0);
	PALL(1);
	PALL(2);
#undef P1
#undef PALL
	// the deepest point is the first contact point
	xQ4CpyMac(gpkContactHolder[0].pos,p);
	xQ4CpyMac(gpkContactHolder[0].norm,n);
	gpkContactHolder[0].depth = depth;	// depth of the deepest corner
	s32 ret = 1;		// ret is number of contact points found so far
	// get the second and third contact points by starting from `p' and going
	// along the two sides with the smallest projected length.
	// P1(i,j,op): contact i position = p moved a full side (2*half side)
	// along box axis j.
	// PALL(ctact,side,sideinc): subtract b<sideinc> from depth, jump to done
	// if that makes depth negative, otherwise emit contact ctact along axis
	// `side` and count it. sideinc is the 1-based axis number.
#define P1(i,j,op) \
	xQ4CpyMac(gpkContactHolder[i].pos,p);\
	xQ4ScaleN##op(gpkContactHolder[i].pos,&r[j*4],2*box->mkHalfSide[j]);
#define PALL(ctact,side,sideinc) \
	depth -= b ## sideinc; \
	if (depth < 0) goto done; \
	if (a[sideinc-1] > 0) { P1(ctact,side,Add) } else { P1(ctact,side,Sub) } \
	gpkContactHolder[ctact].depth = depth; \
	ret++;

	// the normals of contacts 1 and 2 are written up front, whether or not
	// those contacts end up being generated
	xQ4CpyMac(gpkContactHolder[1].norm,n);
	xQ4CpyMac(gpkContactHolder[2].norm,n);
	// choose the side with the smallest projection for contact 1, then the
	// smaller of the remaining two for contact 2
	if (b1 < b2) 
	{
		if (b3 < b1) goto use_side_3; 
		else 
		{
			PALL(1,0,1);	// use side 1
			if (b2 < b3) goto contact2_2; 
			else goto contact2_3;
		}
	}
	else 
	{
		if (b3 < b2) 
		{
use_side_3:	// use side 3
			PALL(1,2,3);
			if (b1 < b2) goto contact2_1; 
			else goto contact2_2;
		}
		else 
		{
			PALL(1,1,2);	// use side 2
			if (b1 < b3) goto contact2_1; 
			else goto contact2_3;
		}
	}

contact2_1: PALL(2,0,1); goto done;
contact2_2: PALL(2,1,2); goto done;
contact2_3: PALL(2,2,3); goto done;
#undef P1
#undef PALL

done:
	return ret;
}
Example #7
0
// Box-versus-box collision using the 15 separating-axis tests, followed by
// contact generation.
//
//   p1, r1, sz1   position, rotation and size vector of box 1
//   p2, r2, sz2   position, rotation and size vector of box 2
//   c             output array of contacts (pos and depth are filled in);
//                 at most N3DMaxContact entries are written
//   normal        output: contact normal
//
// Returns 0 when a separating axis is found (no collision), otherwise the
// number of contacts written to c.
//
// NOTE(review): assumes r1/r2 store the box axes at offsets 0, 4 and 8 and
// that sz1/sz2 are half extents -- confirm against the callers.
s32 cBoxCollide( 
				const f32* p1,const f32* r1,const f32* sz1, 
				const f32* p2,const f32* r2,const f32* sz2,
				cContact* c,f32* normal)
{
	const f32 fudge_factor = 1.05f;
	vec4_t p,pp1,pp2,nC; //secured_v
	mat34_t r,q;
	f32	s2,s;
	s32 code;
	const f32* nR;
	//mat34_t r1,r2;
	s32 invertnormal,i,j;
	nC[3] = 0;
	//xMTrans_34(r1,r1t);
	//xMTrans_34(r2,r2t);

	// p = vector between the box centres; pp1/pp2 = p expressed relative to
	// box 1 / box 2
	xQ4Sub(p,p2,p1);
	xMul0_344(pp1,p,r1);
	xMul0_344(pp2,p,r2);
	// r(i,j) = row(r1,i). row(r2,j)
	xMul0_34(r,r1,r2);
	// q = component-wise absolute value of r
	q[0] = fabsr(r[0]); q[1] = fabsr(r[1]); q[2] = fabsr(r[2]); q[3] = 0;
	q[4] = fabsr(r[4]); q[5] = fabsr(r[5]); q[6] = fabsr(r[6]); q[7] = 0;
	q[8] = fabsr(r[8]); q[9] = fabsr(r[9]); q[10] = fabsr(r[10]); q[11] = 0;

	// Face-axis test: e1 is the projected centre distance, e2 the summed
	// projected extents. s2 > 0 means the axis separates the boxes. The axis
	// with the largest (least negative) s2 is remembered in nR/code.
#define XSATEST(e1,e2,n,cc) \
	s2 = fabsr(e1) - (e2); \
	if (s2 > 0 ) return 0;	\
	if (s2 > s) \
	{ \
	s = s2; nR = (n); \
	invertnormal = ((e1) < 0); \
	code = (cc); \
	}

	s =-N3DInfinity;
	invertnormal = 0;
	code = 0;
	// note that nC has not been used yet
	// we use nC for sz1 rough calculation 
	// here
	// separating axis = u1,u2,u3
	xMul0_344(nC,sz2,q);		// 3 multiplication with SSE
	xQ4Add(nC,nC,sz1);		// one addition with SSE
	//TODO: HERE we need to take a descision, we can
	//well over escape 3 multiplication for 1, in case
	//we use xVDot for each test, but for average case
	//I guess this will work better
	XSATEST(pp1[0],nC[0],r1+0,1);
	XSATEST(pp1[1],nC[1],r1+4,2);
	XSATEST(pp1[2],nC[2],r1+8,3);

	xMul1_344(nC,sz1,q);		// 3 multiplication with SSE
	xQ4Add(nC,nC,sz2);		// one addition with SSE
	// separating axis = v1,v2,v3
	XSATEST(pp2[0],nC[0],r2+0,4);
	XSATEST(pp2[1],nC[1],r2+4,5);
	XSATEST(pp2[2],nC[2],r2+8,6);

	// note that nC has not been used yet

	// Edge-cross-edge test: (n1,n2,n3) is the unnormalised axis relative to
	// box 1; s2 is divided by its length before comparison. Axes of zero
	// length (parallel edges) are skipped. fudge_factor biases the choice
	// towards the face axes tested above.
#undef XSATEST
#define XSATEST(expr1,e2,n1,n2,n3,cc) \
	s2 = fabsr(expr1) - (e2); \
	if (s2 > 0) return 0; \
	l = sqrtr((n1)*(n1) + (n2)*(n2) + (n3)*(n3)); \
	if (l > 0) { \
	s2 /= l; \
	if (s2*fudge_factor > s) { \
	s = s2; \
	nR = 0; \
	nC[0] = (n1)/l; nC[1] = (n2)/l; nC[2] = (n3)/l; \
	invertnormal = ((expr1) < 0); \
	code = (cc); \
	} \
	}

	{
		// we need some temp vectors here
		vec4_t tmp1,tmp2;
		f32  l;
		// with SSE these are effectively
		xQ4ScaleS(tmp1,(f32*)&r[4],pp1[2]);	// one multiplication
		xQ4ScaleNSub(tmp1,(f32*)&r[8],pp1[1]);	// one addition and one multiplication
		xQ4ScaleS(tmp2,(f32*)&q[8],sz1[1]);		// one multiplication
		xQ4ScaleNAdd(tmp2,(f32*)&q[4],sz1[2]); // one addition and one multiplication

		// separating axis = u1 x (v1,v2,v3)
		XSATEST(tmp1[0],(tmp2[0]+sz2[1]*q[2]+sz2[2]*q[1]),0,-r[8],r[4],7);
		XSATEST(tmp1[1],(tmp2[1]+sz2[0]*q[2]+sz2[2]*q[0]),0,-r[9],r[5],8);
		XSATEST(tmp1[2],(tmp2[2]+sz2[0]*q[1]+sz2[1]*q[0]),0,-r[10],r[6],9);

		xQ4ScaleS(tmp1,(f32*)&r[8],pp1[0]);	// one multiplication
		xQ4ScaleNSub(tmp1,(f32*)&r[0],pp1[2]);	// one addition and one multiplication
		xQ4ScaleS(tmp2,(f32*)&q[8],sz1[0]);		// one multiplication
		xQ4ScaleNAdd(tmp2,(f32*)&q[0],sz1[2]); // one addition and one multiplication

		// separating axis = u2 x (v1,v2,v3)
		XSATEST(tmp1[0],(tmp2[0]+sz2[1]*q[6]+sz2[2]*q[5]),r[8],0,-r[0],10);
		XSATEST(tmp1[1],(tmp2[1]+sz2[0]*q[6]+sz2[2]*q[4]),r[9],0,-r[1],11);
		XSATEST(tmp1[2],(tmp2[2]+sz2[0]*q[5]+sz2[1]*q[4]),r[10],0,-r[2],12);

		xQ4ScaleS(tmp1,(f32*)&r[0],pp1[1]);	// one multiplication
		xQ4ScaleNSub(tmp1,(f32*)&r[4],pp1[0]);	// one addition and one multiplication
		xQ4ScaleS(tmp2,(f32*)&q[4],sz1[0]);		// one multiplication
		xQ4ScaleNAdd(tmp2,(f32*)&q[0],sz1[1]); // one addition and one multiplication

		// separating axis = u3 x (v1,v2,v3)
		XSATEST(tmp1[0],(tmp2[0]+sz2[1]*q[10]+sz2[2]*q[9]),-r[4],-r[0],0,13);
		XSATEST(tmp1[1],(tmp2[1]+sz2[0]*q[10]+sz2[2]*q[8]),-r[5],-r[1],0,14);
		XSATEST(tmp1[2],(tmp2[2]+sz2[0]*q[9]+sz2[1]*q[8]),-r[6],-r[2],0,15);
	}

#undef XSATEST
	if(!code) return 0;
	// the winning axis is either one of the box axes (nR) or a cross-product
	// axis stored in nC relative to box 1
	if(nR)
	{
		xQ4CpyMac(normal,nR);
	}
	else
	{
		xMul1_344(normal,nC,r1);
	}
	if(invertnormal)
	{
		xQ4Scale(normal,-1.0f);
	//	normal[0] = -normal[0];
	//	normal[1] = -normal[1];
	//	normal[2] = -normal[2];
	}

	// s was a negative separation; from here on it is the penetration depth
	s = -s;

	if (code > 6) 
	{
		// an edge from box 1 touches an edge from box 2.
		// find a point pa on the intersecting edge of box 1
		vec4_t	pa,pb,ua,ub;
		f32		alpha,beta;

		xQ4CpyMac(pa,p1);
		xQ4CpyMac(pb,p2);

		xMul0_344(ua,normal,r1);		

		for (j=0; j<3; j++) 
		{
			// add sign
			if( ISFLOATNEGETIVE(ua[j]) )
			{
				xQ4ScaleNSub(pa,&r1[j*4],sz1[j]);
			}
			else 
			{
				xQ4ScaleNAdd(pa,&r1[j*4],sz1[j]);
			}
		}

		// find a point pb on the intersecting edge of box 2
		// NOTE(review): the same sign rule as for box 1 is applied here;
		// verify that box 2 should not use the opposite sign.
		xMul0_344(ua,normal,r2);		
		for (j=0; j<3; j++) 
		{			
			// add sign
			if( ISFLOATNEGETIVE(ua[j]) )
			{
				xQ4ScaleNSub(pb,&r2[j*4],sz2[j]);
			}
			else 
			{
				xQ4ScaleNAdd(pb,&r2[j*4],sz2[j]);
			}
		}

		// highly doubted
		// edge directions: codes 7..15 encode (axis of box 1)*3 + (axis of box 2)
		xQ4CpyMac(ua,&r1[(((code)-7)/3)*4]);
		xQ4CpyMac(ub,&r2[(((code)-7)%3)*4]);
	
		{
			// line closest approach (pa,ua,pb,ub,&alpha,&beta);
			vec4_t p;// secured_v
			f32 uaub,q1,q2,d;

			xQ4Sub(p,pb,pa);
			xVDot(&uaub,ua,ub);
			xVDot(&q1,ua,p);
			xVDot(&q2,ub,p);
			d = 1-uaub*uaub;
			// nearly parallel edges: keep the points found above
			if (d <= 0.0001f) 
			{
				alpha = 0;
				beta  = 0;
			}
			else 
			{
				d = 1/d;
				alpha = (q1 - uaub*q2)*d;
				beta  = (uaub*q1 - q2)*d;
			}
		}

		// the single contact is the midpoint of the two closest points
		xQ4ScaleNAdd(pa,ua,alpha);
		xQ4ScaleNAdd(pb,ub,beta);
		xQ4Add(c->pos,pa,pb);
		xQ4Scale(c->pos,0.5f);
		c->depth = s;
		return 1;
	}

	// okay, we have sz1 face-something intersection (because the separating
	// axis is perpendicular to sz1 face). define face 'sz1' to be the reference
	// face (i.e. the normal vector is perpendicular to this) and face 'sz2' to be
	// the incident face (the closest face of the other box).  
	{
		vec4_t center; // secured_v
		const f32 *ra,*rb;
		const f32 *pa,*pb,*sa,*sb;
		s32 lanr,a1,a2;
		f32 quad[8];	// 2D coordinate of incident face (x,y pairs)
		// nr = normal vector of reference face dotted with axes of incident box.
		// anr = absolute values of nr.
		vec4_t normal2,nr,anr;	// secured_v
		if (code <= 3) 
		{
			// reference face belongs to box 1
			ra = r1;
			rb = r2;
			pa = (f32*)p1;
			pb = (f32*)p2;
			sa = (f32*)sz1;
			sb = (f32*)sz2;
			xQ4CpyMac(normal2,normal);
		}
		else 
		{    
			// reference face belongs to box 2: swap roles and flip the normal
			ra = r2;
			rb = r1;
			pa = (f32*)p2;
			pb = (f32*)p1;
			sa = (f32*)sz2;
			sb = (f32*)sz1;
			normal2[0] = -normal[0];
			normal2[1] = -normal[1];
			normal2[2] = -normal[2];
			normal2[3] = 0;
		}

		xMul0_344(nr,normal2,rb);
		anr[0] = fabsr (nr[0]);
		anr[1] = fabsr (nr[1]);
		anr[2] = fabsr (nr[2]);
		anr[3] = 0; // secured_v

		// find the largest compontent of anr: this corresponds to the normal
		// for the indident face. the other axis numbers of the indicent face
		// are stored in a1,a2.

		if (anr[1] > anr[0]) 
		{
			if (anr[1] > anr[2]) 
			{    
				a1 = 0;
				lanr = 1;
				a2 = 2;
			}
			else 
			{
				a1 = 0;
				a2 = 1;
				lanr = 2;
			}
		}
		else 
		{    
			if (anr[0] > anr[2]) 
			{
				lanr = 0;
				a1 = 1;
				a2 = 2;
			}
			else 
			{      
				a1 = 0;
				a2 = 1;
				lanr = 2;
			}
		}
		// compute center point of incident face, in reference-face coordinates
		xQ4Sub(center,pb,pa);
		if (nr[lanr] < 0) 
		{
			xQ4ScaleNAdd(center,&rb[4*lanr],sb[lanr]);
		}
		else 
		{   
			xQ4ScaleNSub(center,&rb[4*lanr],sb[lanr]);
		}

		// find the normal and non-normal axis numbers of the reference box
		{
			vec4_t point[8];		// penetrating contact points
			vec4_t m_;
			s32 codeN,code1,code2;
			f32 c1,c2;
			f32 rect[2];
			f32 ret[16];
			s32 inr;
			s32 cnum = 0;			// number of penetrating contact points found

			f32 dep[8];			// depths for those points
			f32 det1;

			// codeN = axis of the reference box along the normal,
			// code1/code2 = the two remaining (in-face) axes
			if (code <= 3) 
				codeN = code-1; 
			else 
				codeN = code-4;
			if (codeN==0) 
			{
				code1 = 1;
				code2 = 2;
			}
			else if (codeN==1) 
			{
				code1 = 0;
				code2 = 2;
			}
			else 
			{
				code1 = 0;
				code2 = 1;
			}

			// find the four corners of the incident face, in reference-face coordinates
			{
				// c1,c2 = incident face centre projected on the two in-face
				// axes of the reference box. c2 must use code2; projecting
				// both on code1 collapsed the quad onto a diagonal.
				xVDot(&c1,center,&ra[code1*4]);
				xVDot(&c2,center,&ra[code2*4]);
			}
			// r(i,j) = col(r1,i). col(r2,j)
			// m_ = 2x2 matrix mapping incident-face axes (a1,a2) to
			// reference-face axes (code1,code2); r is indexed transposed
			// when box 2 owns the reference face
			if(code <= 3)
			{
				m_[0] = r[code1*4+a1];
				m_[1] = r[code1*4+a2];
				m_[2] = r[code2*4+a1];
				m_[3] = r[code2*4+a2];
			}
			else
			{
				m_[0] = r[a1*4+code1];
				m_[1] = r[a2*4+code1];
				m_[2] = r[a1*4+code2];
				m_[3] = r[a2*4+code2];
			}
			{
				f32 k1 = m_[0]*sb[a1];
				f32 k2 = m_[2]*sb[a1];
				f32 k3 = m_[1]*sb[a2];
				f32 k4 = m_[3]*sb[a2];
				quad[0] = c1 - k1 - k3;
				quad[1] = c2 - k2 - k4;
				quad[2] = c1 - k1 + k3;
				quad[3] = c2 - k2 + k4;
				quad[4] = c1 + k1 + k3;
				quad[5] = c2 + k2 + k4;
				quad[6] = c1 + k1 - k3;
				quad[7] = c2 + k2 - k4;	
			}

			// find the size of the reference face	
			rect[0] = sa[code1];
			rect[1] = sa[code2];
			
			// intersect the incident and reference faces
			{
				// s32 n = intersectRectQuad (rect,quad,ret);
				//(rect[2],p[8],ret[16])
				// q (and r) contain nq (and nr) coordinate points for the current (and
				// chopped) polygons
				// (these q and r shadow the outer matrices of the same name)
				s32 nq=4;
				f32 buffer[16];
				f32 *q = quad;
				f32 *r = ret;
				s32 dir,sign;
				for (dir=0; dir <= 1; dir++) 
				{
					// direction notation: xy[0] = x axis, xy[1] = y axis
					for (sign=-1; sign <= 1; sign += 2) 
					{
						// chop q along the line xy[dir] = sign*rect[dir]
						f32 *pq = q;
						f32 *pr = r;
						inr = 0;
						for (i=nq; i > 0; i--) 
						{
							f32 *nextq;
							// go through all points in q and all lines between adjacent points
							if (sign*pq[dir] < rect[dir]) 
							{
								// this point is inside the chopping line
								pr[0] = pq[0];
								pr[1] = pq[1];
								pr += 2;
								inr++;
								// stop at 8 points: the output buffers hold 16 floats
								if (inr & 8) 
								{
									q = r;
									goto done;
								}
							}
							nextq = (i > 1) ? pq+2 : q;
							if ((sign*pq[dir] < rect[dir]) ^ (sign*nextq[dir] < rect[dir])) 
							{
								// this line crosses the chopping line
								pr[1-dir] = pq[1-dir] + (nextq[1-dir]-pq[1-dir]) /
									(nextq[dir]-pq[dir]) * (sign*rect[dir]-pq[dir]);
								pr[dir] = sign*rect[dir];
								pr += 2;
								inr++;
								if (inr & 8) 
								{
									q = r;
									goto done;
								}
							}
							pq += 2;
						}
						// ping-pong between ret and buffer for the next chop
						q = r;
						r = (q==ret) ? buffer : ret;
						nq = inr;
					}
				}
done:
				if (q != ret) 
					copyDwords(ret,q,inr*2);
			}

			if (inr < 1) return 0;		// this should never happen

			// convert the intersection points into reference-face coordinates,
			// and compute the contact position and depth for each point. only keep
			// those points that have sz1 positive (penetrating) depth. delete points in
			// the 'ret' array as necessary so that 'point' and 'ret' correspond.
			det1 = 1/(m_[0]*m_[3] - m_[1]*m_[2]);
			xQ4Scale(m_,det1);
			for (j=0; j < inr; j++) 
			{
				f32 k1 =  m_[3]*(ret[j*2]-c1) - m_[1]*(ret[j*2+1]-c2);
				f32 k2 = -m_[2]*(ret[j*2]-c1) + m_[0]*(ret[j*2+1]-c2);
				xQ4CpyMac(point[cnum],center);
				xQ4ScaleNAdd(point[cnum],&rb[a1*4],k1);
				xQ4ScaleNAdd(point[cnum],&rb[a2*4],k2);
				dep[cnum] = sa[codeN] - xVDotR(normal2,point[cnum]);
				if(dep[cnum] >= 0) 
				{
					ret[cnum*2] = ret[j*2];
					ret[cnum*2+1] = ret[j*2+1];
					cnum++;
				}
			}

			if (cnum < 1) 
				return 0;	// this should never happen
			// we can't generate more contacts than we actually have
			if (cnum <= N3DMaxContact) 
			{
				// we have less contacts than we need, so we use them all
				for (j=0; j < cnum; j++) 
				{
					cContact *con = c+j;
					xQ4Add(con->pos,point[j],pa);
					con->depth = dep[j];	// this seems ok
				}
			}
			else 
			{
				// we have more contacts than are wanted, some of them must be culled.
				// find the deepest point, it is always the first contact.
				s32 i1 = 0;
				f32 maxdepth = dep[0];
				s32 iret[8];
				for (i=1; i<cnum; i++) 
				{
					if (dep[i] > maxdepth) 
					{
						maxdepth = dep[i];
						i1 = i;
					}
				}

				{
					// compute the centroid of the polygon in cx,cy
					f32 a,cx,cy,q;
					f32 ang[8];
					s32 avail[8];
					if (cnum==1) 
					{
						cx = ret[0];
						cy = ret[1];
					}
					else if (cnum==2) 
					{
						cx = 0.5f*(ret[0] + ret[2]);
						cy = 0.5f*(ret[1] + ret[3]);
					}
					else 
					{
						a = 0;
						cx = 0;
						cy = 0;
						for (i=0;i<(cnum-1); i++) 
						{      
							q = ret[i*2]*ret[i*2+3] - ret[i*2+2]*ret[i*2+1];
							a += q;
							cx += q*(ret[i*2]+ret[i*2+2]);
							cy += q*(ret[i*2+1]+ret[i*2+3]);
						}
						q = ret[cnum*2-2]*ret[1] - ret[0]*ret[cnum*2-1];
						a = 1/((3*(a+q)));
						cx = a*(cx + q*(ret[cnum*2-2]+ret[0]));
						cy = a*(cy + q*(ret[cnum*2-1]+ret[1]));
					}

					// compute the angle of each point w.r.t. the centroid
					// (the 2D points live in ret; the outer vector p holds the
					// centre offset and has only four components)
					for (i=0; i<cnum; i++) 
						ang[i] = atan2r(ret[i*2+1]-cy,ret[i*2]-cx);
					// search for points that have angles closest to sz1[i1] + i*(2*pi/m).

					for (i=0; i<cnum; i++) avail[i] = 1;
					avail[i1] = 0;
					iret[0] = i1;
					{
						s32* piret = iret;
						piret++;
						for (j=1; j<N3DMaxContact; j++) 
						{
							f32 maxdiff=1e9,diff;
							a = (f32)j*(2*N3DPi/N3DMaxContact) + ang[i1];
							if(a > N3DPi) a -= 2*N3DPi;

							// fallback so the slot is never left unwritten when
							// every comparison below fails (e.g. NaN angles from
							// a zero-area polygon)
							*piret = i1;
							
							for (i=0; i<cnum; i++) 
							{
								if (avail[i]) 
								{	
									diff = fabsr (ang[i]-a);
									if (diff > N3DPi) diff = 2*N3DPi - diff;
									if (diff < maxdiff) 
									{
										maxdiff = diff;
										*piret = i;
									}
								}
							}
							avail[*piret] = 0;
							piret++;
						}
					}
				}

				for (j=0; j < N3DMaxContact; j++) 
				{
					cContact *con = c+j;
					xQ4Add(con->pos,point[iret[j]],pa);
					con->depth = dep[iret[j]];	// this seems ok
				}
				cnum = N3DMaxContact;
			}
			return cnum;
		}
	}
	// return_code = code;	
}
int main(int argc, char** argv)
{
    int iter_max = 1000;
    
    const real tol = 1.0e-5;
    
    memset(A, 0, NY * NX * sizeof(real));
    
    // set rhs
    for (int iy = 1; iy < NY-1; iy++)
    {
        for( int ix = 1; ix < NX-1; ix++ )
        {
            const real x = -1.0 + (2.0*ix/(NX-1));
            const real y = -1.0 + (2.0*iy/(NY-1));
            rhs[iy][ix] = expr(-10.0*(x*x + y*y));
        }
    }
    
    printf("Jacobi relaxation Calculation: %d x %d mesh\n", NY, NX);

    StartTimer();
    int iter  = 0;
    real error = 1.0;
    
    #pragma acc data copy(A) copyin(rhs) create(Anew)
    while ( error > tol && iter < iter_max )
    {
        error = 0.0;

        #pragma acc kernels
        for (int iy = 1; iy < NY-1; iy++)
        {
            for( int ix = 1; ix < NX-1; ix++ )
            {
                Anew[iy][ix] = -0.25 * (rhs[iy][ix] - ( A[iy][ix+1] + A[iy][ix-1]
                                                       + A[iy-1][ix] + A[iy+1][ix] ));
                error = fmaxr( error, fabsr(Anew[iy][ix]-A[iy][ix]));
            }
        }
        
        #pragma acc kernels
        for (int iy = 1; iy < NY-1; iy++)
        {
            for( int ix = 1; ix < NX-1; ix++ )
            {
                A[iy][ix] = Anew[iy][ix];
            }
        }
        
        //Periodic boundary conditions
        #pragma acc kernels
        for( int ix = 1; ix < NX-1; ix++ )
        {
                A[0][ix]      = A[(NY-2)][ix];
                A[(NY-1)][ix] = A[1][ix];
        }
        #pragma acc kernels
        for (int iy = 1; iy < NY-1; iy++)
        {
                A[iy][0]      = A[iy][(NX-2)];
                A[iy][(NX-1)] = A[iy][1];
        }
        
        if((iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
        
        iter++;
    }

    double runtime = GetTimer();

    printf( "%dx%d: 1 GPU: %8.4f s\n", NY,NX, runtime/ 1000.0 );

    return 0;
}
/*
 * MPI + OpenACC Jacobi relaxation driver.
 *
 * The NY x NX grid is split over a sizey x sizex layout of ranks. Every rank
 * first runs the serial reference solver (poisson2d_serial), then the
 * parallel solver on its own [iy_start,iy_end) x [ix_start,ix_end) tile,
 * exchanging halo rows and columns with its periodic neighbours after each
 * sweep. Rank 0 prints progress and, if check_results() succeeds, the timing
 * summary.
 *
 * Returns -1 (after finalizing MPI) when more than MAX_MPI_SIZE ranks are
 * used, 0 otherwise.
 */
int main(int argc, char** argv)
{
    int iter_max = 1000;
    
    const real tol = 1.0e-5;

    int rank = 0;
    int size = 1;

    //Initialize MPI and determine rank and size
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    
    if ( size > MAX_MPI_SIZE )
    {
        if ( 0 == rank )
        {
            fprintf(stderr,"ERROR: Only up to %d MPI ranks are supported.\n",MAX_MPI_SIZE);
        }
        // MPI was initialized above, so it must be shut down on this path too
        MPI_Finalize();
        return -1;
    }
    
    // 2D process layout; rank = ranky*sizex + rankx
    dim2 size2d = size_to_2Dsize(size);
    int sizex = size2d.x;
    int sizey = size2d.y;
    assert(sizex*sizey == size);
    
    int rankx = rank%sizex;
    int ranky = rank/sizex;

    memset(A, 0, NY * NX * sizeof(real));
    memset(Aref, 0, NY * NX * sizeof(real));
    
    // set rhs
    for (int iy = 1; iy < NY-1; iy++)
    {
        for( int ix = 1; ix < NX-1; ix++ )
        {
            const real x = -1.0 + (2.0*ix/(NX-1));
            const real y = -1.0 + (2.0*iy/(NY-1));
            rhs[iy][ix] = expr(-10.0*(x*x + y*y));
        }
    }
    
#if _OPENACC
    acc_device_t device_type = acc_get_device_type();
    if ( acc_device_nvidia == device_type )
    {
        int ngpus=acc_get_num_devices(acc_device_nvidia);
        
        // ranks are assigned to devices round-robin
        int devicenum=rank%ngpus;
        acc_set_device_num(devicenum,acc_device_nvidia);
    }
    // Call acc_init after acc_set_device_num to avoid multiple contexts on device 0 in multi GPU systems
    acc_init(device_type);
#endif /*_OPENACC*/

    // Ensure correctness if NX%sizex != 0
    int chunk_sizex = ceil( (1.0*NX)/sizex );

    int ix_start = rankx * chunk_sizex;
    int ix_end   = ix_start + chunk_sizex;

    // Do not process boundaries
    ix_start = max( ix_start, 1 );
    ix_end = min( ix_end, NX - 1 );

    // Ensure correctness if NY%sizey != 0
    int chunk_sizey = ceil( (1.0*NY)/sizey );

    int iy_start = ranky * chunk_sizey;
    int iy_end   = iy_start + chunk_sizey;

    // Do not process boundaries
    iy_start = max( iy_start, 1 );
    iy_end = min( iy_end, NY - 1 );

    if ( rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", NY, NX);

    if ( rank == 0) printf("Calculate reference solution and time serial execution.\n");
    StartTimer();
    poisson2d_serial( rank, iter_max, tol );
    double runtime_serial = GetTimer();

    //Wait for all processes to ensure correct timing of the parallel version
    MPI_Barrier( MPI_COMM_WORLD );
    if ( rank == 0) printf("Parallel execution.\n");
    StartTimer();
    int iter  = 0;
    real error = 1.0;
    
    #pragma acc data copy(A) copyin(rhs) create(Anew,to_left,from_left,to_right,from_right)
    while ( error > tol && iter < iter_max )
    {
        error = 0.0;

        // stencil update on this rank's tile, tracking the largest change
        #pragma acc kernels
        for (int iy = iy_start; iy < iy_end; iy++)
        {
            for( int ix = ix_start; ix < ix_end; ix++ )
            {
                Anew[iy][ix] = -0.25 * (rhs[iy][ix] - ( A[iy][ix+1] + A[iy][ix-1]
                                                       + A[iy-1][ix] + A[iy+1][ix] ));
                error = fmaxr( error, fabsr(Anew[iy][ix]-A[iy][ix]));
            }
        }
        
        // every rank needs the global maximum so that all leave the loop together
        real globalerror = 0.0;
        MPI_Allreduce( &error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD );
        error = globalerror;
        
        #pragma acc kernels
        for (int iy = iy_start; iy < iy_end; iy++)
        {
            for( int ix = ix_start; ix < ix_end; ix++ )
            {
                A[iy][ix] = Anew[iy][ix];
            }
        }

        //Periodic boundary conditions
        int topy    = (ranky == 0) ? (sizey-1) : ranky-1;
        int bottomy = (ranky == (sizey-1)) ? 0 : ranky+1;
        int top    = topy    * sizex + rankx;
        int bottom = bottomy * sizex + rankx;
        // rows are contiguous in A, so they are exchanged in place
        #pragma acc host_data use_device( A )
        {
            //1. Sent row iy_start (first modified row) to top receive lower boundary (iy_end) from bottom
            MPI_Sendrecv( &A[iy_start][ix_start], (ix_end-ix_start), MPI_REAL_TYPE, top   , 0, &A[iy_end][ix_start], (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE );

            //2. Sent row (iy_end-1) (last modified row) to bottom receive upper boundary (iy_start-1) from top
            MPI_Sendrecv( &A[(iy_end-1)][ix_start], (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0, &A[(iy_start-1)][ix_start], (ix_end-ix_start), MPI_REAL_TYPE, top   , 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE );
        }
        
        int leftx  = (rankx == 0) ? (sizex-1) : rankx-1;
        int rightx = (rankx == (sizex-1)) ? 0 : rankx+1;
        int left   = ranky * sizex + leftx;
        int right  = ranky * sizex + rightx;
        // columns are strided in A, so they are packed into to_left/to_right first
        #pragma acc kernels
        for( int iy = iy_start; iy < iy_end; iy++ )
        {
                to_left[iy]  = A[iy][ix_start];
                to_right[iy] = A[iy][ix_end-1];
        }
        #pragma acc host_data use_device( to_left, from_left, to_right, from_right )
        {
            //1. Sent to_left starting from first modified row (iy_start) to last modified row to left and receive the same rows into from_right from right 
            MPI_Sendrecv( to_left+iy_start, (iy_end-iy_start), MPI_REAL_TYPE, left   , 0, from_right+iy_start, (iy_end-iy_start), MPI_REAL_TYPE, right, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE );

            //2. Sent to_right starting from first modified row (iy_start) to last modified row to right and receive the same rows into from_left from left
            MPI_Sendrecv( to_right+iy_start, (iy_end-iy_start), MPI_REAL_TYPE, right , 0, from_left+iy_start, (iy_end-iy_start), MPI_REAL_TYPE, left  , 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE );
        }
        // unpack the received columns into the halo columns of A
        #pragma acc kernels
        for( int iy = iy_start; iy < iy_end; iy++ )
        {
                A[iy][ix_start-1] = from_left[iy];
                A[iy][ix_end]     = from_right[iy];
        }
        
        if(rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
        
        iter++;
    }

    MPI_Barrier( MPI_COMM_WORLD );
    double runtime = GetTimer();

    if (check_results( rank, ix_start, ix_end, iy_start, iy_end, tol ) && rank == 0)
    {
        printf( "Num GPUs: %d with a (%d,%d) layout.\n", size, sizey,sizex );
        // a literal percent sign must be written as %% in a printf format
        printf( "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n", NY,NX, runtime_serial/ 1000.0, size, runtime/ 1000.0, runtime_serial/runtime, runtime_serial/(size*runtime)*100 );
    }

    MPI_Finalize();
    return 0;
}
Example #10
0
// Box-box collision test based on the separating-axis theorem.
//
// Tests the 15 candidate separating axes (the 3 face normals of each box and
// the 9 pairwise edge cross products). If any axis separates the boxes the
// function returns 0; otherwise the axis with the smallest penetration becomes
// the contact normal and contact points are generated.
//
// p1, p2       - box centers (3 components are read).
// R1, R2       - box orientations; axis j is read from R[j*4 .. j*4+2].
// side1, side2 - full side lengths of each box (halved internally).
// contact      - output array; at most 4 entries are written (pos and depth),
//                so the caller must provide room for 4.
// normal       - output contact normal in global coordinates. Its sign is
//                flipped when the projection of p2 - p1 onto the chosen axis
//                is negative.
//
// Returns the number of contacts written to 'contact': 0 when the boxes do
// not overlap, 1 for an edge-edge contact, 1..4 for a face contact.
s32 dBoxBox (const f32* p1, const f32* R1, const f32* side1,
			 const f32* p2, const f32* R2, const f32* side2,
			 cContact* contact,f32* normal)
{
	s32 maxc = 4;	// upper bound on the number of contacts generated
	const f32 fudge_factor = (f32)(1.05);
	vec4_t p,pp,normalC;
	const f32 *normalR = 0;
	f32 depth;
	f32 A[3],B[3],R11,R12,R13,R21,R22,R23,R31,R32,R33,
		Q11,Q12,Q13,Q21,Q22,Q23,Q31,Q32,Q33,s,s2,l;
	s32 i,j,invert_normal,code;
	const f32 *Ra,*Rb,*pa,*pb,*Sa,*Sb;
	vec4_t normal2,nr,anr;vec4_t center;
	s32 lanr,a1,a2;
	s32 codeN,code1,code2;
	f32 expr1;	// scratch value used by the first TST macro

	// get vector from centers of box 1 to box 2, relative to box 1
	p[0] = p2[0] - p1[0];
	p[1] = p2[1] - p1[1];
	p[2] = p2[2] - p1[2];
	p[3] = 0;
	xMul0_344 (pp,p,R1);		// get pp = p relative to body 1

	// get side lengths / 2
	A[0] = side1[0]*(f32)(0.5);
	A[1] = side1[1]*(f32)(0.5);
	A[2] = side1[2]*(f32)(0.5);
	B[0] = side2[0]*(f32)(0.5);
	B[1] = side2[1]*(f32)(0.5);
	B[2] = side2[2]*(f32)(0.5);

	// Rij is R1'*R2, i.e. the relative rotation between R1 and R2
	R11 = dDOT(R1+0,R2+0); R12 = dDOT(R1+0,R2+4); R13 = dDOT(R1+0,R2+8);
	R21 = dDOT(R1+4,R2+0); R22 = dDOT(R1+4,R2+4); R23 = dDOT(R1+4,R2+8);
	R31 = dDOT(R1+8,R2+0); R32 = dDOT(R1+8,R2+4); R33 = dDOT(R1+8,R2+8);

	Q11 = fabsr(R11); Q12 = fabsr(R12); Q13 = fabsr(R13);
	Q21 = fabsr(R21); Q22 = fabsr(R22); Q23 = fabsr(R23);
	Q31 = fabsr(R31); Q32 = fabsr(R32); Q33 = fabsr(R33);

	// for all 15 possible separating axes:
	//   * see if the axis separates the boxes. if so, return 0.
	//   * find the depth of the penetration along the separating axis (s2).
	//     s2 is negative when the boxes overlap on that axis.
	//   * if this is the largest s2 (i.e. the shallowest penetration) so far,
	//     record it.
	// the normal vector will be set to the separating axis with the smallest
	// depth. note: normalR is set to point to an axis of R1 or R2 if that is
	// the smallest depth normal so far. otherwise normalR is 0 and normalC is
	// set to a vector relative to body 1. invert_normal is 1 if the sign of
	// the normal should be flipped.

	// face-normal axes. note that this macro may return from dBoxBox.
#define TST(expr_1,expr2,norm,cc) \
	do { \
		expr1 = (expr_1); \
		s2 = fabsr(expr1) - (expr2); \
		if (s2 > 0) return 0; \
		if (s2 > s) { \
			s = s2; \
			normalR = norm; \
			invert_normal = ((expr1) < 0); \
			code = (cc); \
		} \
	} while (0)

	s = -N3DInfinity;
	invert_normal = 0;
	code = 0;

	// separating axis = u1,u2,u3
	TST (pp[0],(A[0] + B[0]*Q11 + B[1]*Q12 + B[2]*Q13),R1+0,1);
	TST (pp[1],(A[1] + B[0]*Q21 + B[1]*Q22 + B[2]*Q23),R1+4,2);
	TST (pp[2],(A[2] + B[0]*Q31 + B[1]*Q32 + B[2]*Q33),R1+8,3);

	// separating axis = v1,v2,v3
	TST (dDOT(R2+0,p),(A[0]*Q11 + A[1]*Q21 + A[2]*Q31 + B[0]),R2+0,4);
	TST (dDOT(R2+4,p),(A[0]*Q12 + A[1]*Q22 + A[2]*Q32 + B[1]),R2+4,5);
	TST (dDOT(R2+8,p),(A[0]*Q13 + A[1]*Q23 + A[2]*Q33 + B[2]),R2+8,6);

	// note: cross product axes need to be scaled when s is computed.
	// normal (n1,n2,n3) is relative to box 1.
	// the fudge factor scales the (negative) s2 before the comparison, which
	// biases the choice towards the face axes tested above.
	// this macro may also return from dBoxBox.
#undef TST
#define TST(expr1,expr2,n1,n2,n3,cc) \
	do { \
		s2 = fabsr(expr1) - (expr2); \
		if (s2 > 0) return 0; \
		l = fsqrtr((n1)*(n1) + (n2)*(n2) + (n3)*(n3)); \
		if (l > 0) { \
			s2 /= l; \
			if (s2*fudge_factor > s) { \
				s = s2; \
				normalR = 0; \
				normalC[0] = (n1)/l; normalC[1] = (n2)/l; normalC[2] = (n3)/l; \
				invert_normal = ((expr1) < 0); \
				code = (cc); \
			} \
		} \
	} while (0)

	// separating axis = u1 x (v1,v2,v3)
	TST(pp[2]*R21-pp[1]*R31,(A[1]*Q31+A[2]*Q21+B[1]*Q13+B[2]*Q12),0,-R31,R21,7);
	TST(pp[2]*R22-pp[1]*R32,(A[1]*Q32+A[2]*Q22+B[0]*Q13+B[2]*Q11),0,-R32,R22,8);
	TST(pp[2]*R23-pp[1]*R33,(A[1]*Q33+A[2]*Q23+B[0]*Q12+B[1]*Q11),0,-R33,R23,9);

	// separating axis = u2 x (v1,v2,v3)
	TST(pp[0]*R31-pp[2]*R11,(A[0]*Q31+A[2]*Q11+B[1]*Q23+B[2]*Q22),R31,0,-R11,10);
	TST(pp[0]*R32-pp[2]*R12,(A[0]*Q32+A[2]*Q12+B[0]*Q23+B[2]*Q21),R32,0,-R12,11);
	TST(pp[0]*R33-pp[2]*R13,(A[0]*Q33+A[2]*Q13+B[0]*Q22+B[1]*Q21),R33,0,-R13,12);

	// separating axis = u3 x (v1,v2,v3)
	TST(pp[1]*R11-pp[0]*R21,(A[0]*Q21+A[1]*Q11+B[1]*Q33+B[2]*Q32),-R21,R11,0,13);
	TST(pp[1]*R12-pp[0]*R22,(A[0]*Q22+A[1]*Q12+B[0]*Q33+B[2]*Q31),-R22,R12,0,14);
	TST(pp[1]*R13-pp[0]*R23,(A[0]*Q23+A[1]*Q13+B[0]*Q32+B[1]*Q31),-R23,R13,0,15);

#undef TST

	if (!code) return 0;

	// if we get to this point, the boxes interpenetrate. compute the normal
	// in global coordinates.
	if (normalR) {
		normal[0] = normalR[0];
		normal[1] = normalR[1];
		normal[2] = normalR[2];
	}
	else {
		xMul1_344 (normal,normalC,R1);
	}
	if (invert_normal) {
		normal[0] = -normal[0];
		normal[1] = -normal[1];
		normal[2] = -normal[2];
	}
	depth = -s;

	// compute contact point(s)

	if (code > 6) {
		// an edge from box 1 touches an edge from box 2.
		// find a point ea on the intersecting edge of box 1
		vec4_t ea,eb;
		f32 sign;f32 alpha,beta;
		vec4_t ua,ub;
		for (i=0; i<3; i++) ea[i] = p1[i];
		for (j=0; j<3; j++) {
			sign = (dDOT(normal,(R1+j*4)) > 0) ? (f32)(1.0) : (f32)(-1.0);
			xQ4ScaleNAdd(ea,&R1[j*4],sign * A[j]);
		}

		// find a point eb on the intersecting edge of box 2
		for (i=0; i<3; i++) eb[i] = p2[i];
		for (j=0; j<3; j++) {
			sign = (dDOT(normal,(R2+j*4)) > 0) ? (f32)(-1.0) : (f32)(1.0);
			xQ4ScaleNAdd(eb,&R2[j*4],sign * B[j]);
		}

		// edge directions: codes 7..15 encode (axis of box 1)*3 + (axis of
		// box 2), offset by 7.
		for (i=0; i<3; i++) ua[i] = R1[(((code)-7)/3)*4 + i];
		for (i=0; i<3; i++) ub[i] = R2[(((code)-7)%3)*4 + i];

		// move each point along its edge by the parameter returned for it,
		// then report the midpoint as the single contact.
		dLineClosestApproach (ea,ua,eb,ub,&alpha,&beta);
		for (i=0; i<3; i++) ea[i] += ua[i]*alpha;
		for (i=0; i<3; i++) eb[i] += ub[i]*beta;

		for (i=0; i<3; i++) contact[0].pos[i] = (f32)(0.5)*(ea[i]+eb[i]);
		contact[0].depth = depth;
		return 1;
	}

	// okay, we have a face-something intersection (because the separating
	// axis is perpendicular to a face). define face 'a' to be the reference
	// face (i.e. the normal vector is perpendicular to this) and face 'b' to be
	// the incident face (the closest face of the other box).

	if (code <= 3) {
		Ra = R1;
		Rb = R2;
		pa = p1;
		pb = p2;
		Sa = A;
		Sb = B;
	}
	else {
		Ra = R2;
		Rb = R1;
		pa = p2;
		pb = p1;
		Sa = B;
		Sb = A;
	}

	// nr = normal vector of reference face dotted with axes of incident box.
	// anr = absolute values of nr.

	if (code <= 3) {
		normal2[0] = normal[0];
		normal2[1] = normal[1];
		normal2[2] = normal[2];
	}
	else {
		normal2[0] = -normal[0];
		normal2[1] = -normal[1];
		normal2[2] = -normal[2];
	}
	xMul0_344 (nr,normal2,Rb);
	anr[0] = fabsr (nr[0]);
	anr[1] = fabsr (nr[1]);
	anr[2] = fabsr (nr[2]);

	// find the largest component of anr: this corresponds to the normal
	// for the incident face. the other axis numbers of the incident face
	// are stored in a1,a2.
	if (anr[1] > anr[0]) {
		if (anr[1] > anr[2]) {
			a1 = 0;
			lanr = 1;
			a2 = 2;
		}
		else {
			a1 = 0;
			a2 = 1;
			lanr = 2;
		}
	}
	else {
		if (anr[0] > anr[2]) {
			lanr = 0;
			a1 = 1;
			a2 = 2;
		}
		else {
			a1 = 0;
			a2 = 1;
			lanr = 2;
		}
	}

	// compute center point of incident face, in reference-face coordinates

	if (nr[lanr] < 0) {
		for (i=0; i<3; i++) center[i] = pb[i] - pa[i] + Sb[lanr] * Rb[i+lanr*4];
	}
	else {
		for (i=0; i<3; i++) center[i] = pb[i] - pa[i] - Sb[lanr] * Rb[i+lanr*4];
	}

	// find the normal and non-normal axis numbers of the reference box
	if (code <= 3) codeN = code-1; else codeN = code-4;
	if (codeN==0) {
		code1 = 1;
		code2 = 2;
	}
	else if (codeN==1) {
		code1 = 0;
		code2 = 2;
	}
	else {
		code1 = 0;
		code2 = 1;
	}

	// find the four corners of the incident face, in reference-face coordinates
	f32 quad[8];	// 2D coordinate of incident face (x,y pairs)
	f32 c1,c2,m11,m12,m21,m22;
	c1 = dDOT (center,(Ra+code1*4));
	c2 = dDOT (center,(Ra+code2*4));
	// optimize this? - we have already computed this data above, but it is not
	// stored in an easy-to-index format. for now it's quicker just to recompute
	// the four dot products.
	m11 = dDOT (Ra+code1*4,Rb+a1*4);
	m12 = dDOT (Ra+code1*4,Rb+a2*4);
	m21 = dDOT (Ra+code2*4,Rb+a1*4);
	m22 = dDOT (Ra+code2*4,Rb+a2*4);
	{
		f32 k1 = m11*Sb[a1];
		f32 k2 = m21*Sb[a1];
		f32 k3 = m12*Sb[a2];
		f32 k4 = m22*Sb[a2];
		quad[0] = c1 - k1 - k3;
		quad[1] = c2 - k2 - k4;
		quad[2] = c1 - k1 + k3;
		quad[3] = c2 - k2 + k4;
		quad[4] = c1 + k1 + k3;
		quad[5] = c2 + k2 + k4;
		quad[6] = c1 + k1 - k3;
		quad[7] = c2 + k2 - k4;
	}

	// find the size of the reference face
	f32 rect[2];
	rect[0] = Sa[code1];
	rect[1] = Sa[code2];

	// intersect the incident and reference faces
	f32 ret[16];
	s32 n = intersectRectQuad (rect,quad,ret);
	if (n < 1) return 0;		// this should never happen

	// convert the intersection points into reference-face coordinates,
	// and compute the contact position and depth for each point. only keep
	// those points that have a positive (penetrating) depth. delete points in
	// the 'ret' array as necessary so that 'point' and 'ret' correspond.
	f32 point[3*8];		// penetrating contact points
	f32 dep[8];			// depths for those points
	f32 det1 = 1.f/(m11*m22 - m12*m21);
	m11 *= det1;
	m12 *= det1;
	m21 *= det1;
	m22 *= det1;
	s32 cnum = 0;			// number of penetrating contact points found
	for (j=0; j < n; j++) {
		f32 k1 =  m22*(ret[j*2]-c1) - m12*(ret[j*2+1]-c2);
		f32 k2 = -m21*(ret[j*2]-c1) + m11*(ret[j*2+1]-c2);
		for (i=0; i<3; i++) point[cnum*3+i] =
			center[i] + k1*Rb[i+a1*4] + k2*Rb[i+a2*4];
		dep[cnum] = Sa[codeN] - dDOT(normal2,point+cnum*3);
		if (dep[cnum] >= 0) {
			// keep this point: slot cnum stays occupied. a rejected point is
			// simply overwritten by the next candidate.
			ret[cnum*2] = ret[j*2];
			ret[cnum*2+1] = ret[j*2+1];
			cnum++;
		}
	}
	if (cnum < 1) return 0;	// this should never happen

	// we can't generate more contacts than we actually have
	if (maxc > cnum) maxc = cnum;
	if (maxc < 1) maxc = 1;

	if (cnum <= maxc) {
		// we have less contacts than we need, so we use them all
		for (j=0; j < cnum; j++) {
			cContact *con = &contact[j];
			for (i=0; i<3; i++) con->pos[i] = point[j*3+i] + pa[i];
			con->depth = dep[j];
		}
	}
	else {
		// we have more contacts than are wanted, some of them must be culled.
		// find the deepest point, it is always the first contact.
		s32 i1 = 0;
		f32 maxdepth = dep[0];
		for (i=1; i<cnum; i++) {
			if (dep[i] > maxdepth) {
				maxdepth = dep[i];
				i1 = i;
			}
		}

		// contact 0 is the deepest point, so it can never be culled.
		{
			cContact *con = &contact[0];
			for (i=0; i<3; i++) con->pos[i] = point[i1*3+i] + pa[i];
			con->depth = dep[i1];
		}

		// fill the remaining slots with the other points in their original
		// order, skipping the one already emitted. since cnum > maxc here,
		// 'src' never exceeds cnum-1.
		s32 src = 0;
		for (j=1; j < maxc; j++) {
			cContact *con = &contact[j];
			if (src == i1) src++;
			for (i=0; i<3; i++) con->pos[i] = point[src*3+i] + pa[i];
			con->depth = dep[src];
			src++;
		}
		cnum = maxc;
	}
	return cnum;
}