/*
 * Bump objective evaluated at (x, y):
 *
 *   | cos^4(x) + cos^4(y) - 2*cos^2(x)*cos^2(y) | / sqrt(x^2 + 2*y^2)
 *
 * The denominator is zero at the origin; no guard is applied here.
 */
real bump(real x, real y)
{
    const real cos_x = cosr(x);
    const real cos_y = cosr(y);
    const real cx2 = cos_x * cos_x;   /* cos^2(x) */
    const real cy2 = cos_y * cos_y;   /* cos^2(y) */

    return fabsr((cx2*cx2 + cy2*cy2 - 2*cx2*cy2) / sqrtr(x*x + 2*y*y));
}
/*
 * Pick m of the n vertices of a 2D polygon so that they are spread out in
 * angle around the polygon centroid.
 *
 *   n    - number of vertices in p; the local work arrays hold at most 8
 *   p    - vertex coordinates as (x,y) pairs: p[2*i], p[2*i+1]
 *   m    - number of indices written to iret
 *   i0   - index of the vertex that is always selected first
 *   iret - output array of m vertex indices; iret[0] == i0
 *
 * For j = 1..m-1 the still-unused vertex whose angle (measured from the
 * centroid) is closest to A[i0] + j*(2*pi/m) is selected.
 */
void cullPoints (s32 n, f32 p[], s32 m, s32 i0, s32 iret[])
{
  s32 i,j;
  f32 a,cx,cy,q;
  f32 A[8];
  s32 avail[8];

  // compute the centroid of the polygon in cx,cy
  if (n==1) {
    cx = p[0];
    cy = p[1];
  }
  else if (n==2) {
    cx = (f32)(0.5)*(p[0] + p[2]);
    cy = (f32)(0.5)*(p[1] + p[3]);
  }
  else {
    a = 0;
    cx = 0;
    cy = 0;
    for (i=0; i<(n-1); i++) {
      q = p[i*2]*p[i*2+3] - p[i*2+2]*p[i*2+1];
      a += q;
      cx += q*(p[i*2]+p[i*2+2]);
      cy += q*(p[i*2+1]+p[i*2+3]);
    }
    // closing edge: last vertex back to the first one
    q = p[n*2-2]*p[1] - p[0]*p[n*2-1];
    a = 1.f/((f32)(3.0)*(a+q));
    cx = a*(cx + q*(p[n*2-2]+p[0]));
    cy = a*(cy + q*(p[n*2-1]+p[1]));
  }

  // compute the angle of each point w.r.t. the centroid
  for (i=0; i<n; i++) A[i] = atan2r(p[i*2+1]-cy,p[i*2]-cx);

  // search for points that have angles closest to A[i0] + i*(2*pi/m).
  for (i=0; i<n; i++) avail[i] = 1;
  avail[i0] = 0;
  iret[0] = i0;
  iret++;
  for (j=1; j<m; j++) {
    f32 maxdiff=1e9,diff;
    a = (f32)(j)*(2*N3DPi/m) + A[i0];
    if (a > N3DPi) a -= 2*N3DPi;

    // Seed the slot with a valid index. If the search below finds no
    // candidate (m > n, or every angle is NaN because the polygon has zero
    // area) the slot would otherwise be read uninitialised and used to
    // index avail[].
    *iret = i0;

    for (i=0; i<n; i++) {
      if (avail[i]) {
        diff = fabsr (A[i]-a);
        if (diff > N3DPi) diff = 2*N3DPi - diff;   // wrap to [0, pi]
        if (diff < maxdiff) {
          maxdiff = diff;
          *iret = i;
        }
      }
    }
    avail[*iret] = 0;
    iret++;
  }
}
/*
 * n inner lobatto nodes (excluding -1,1)
 *
 * Fills z[0..n-1]. The first n/2 entries are found by repeating the update
 * x -= legendre_d1(np,x)/legendre_d2(np,x) (np = n+1) from a cosine starting
 * guess until the step is no larger than -x*EPS; one more update is applied
 * when storing. The middle entry is 0 when n is odd, and the upper half is
 * the negated mirror image of the lower half.
 */
static void lobatto_nodes_aux(real *z, int n)
{
  const int np = n+1;
  const int half = n/2;
  int k;

  for(k=0; k<half; ++k) {
    real x = cosr( (n-k)*PI/np );
    real prev;
    for(;;) {
      prev = x;
      x -= legendre_d1(np,x)/legendre_d2(np,x);
      if(!(fabsr(x-prev)>-x*EPS)) break;
    }
    z[k] = x - legendre_d1(np,x)/legendre_d2(np,x);
  }

  if(n&1) z[n/2]=0;

  /* z[n-1-k] mirrors z[k] with the sign flipped */
  for(k=half-1; k>=0; --k) z[n-1-k] = -z[k];
}
/*
 * n nodes
 *
 * Fills z[0..n-1]. The first n/2 entries are found by repeating the update
 * x -= legendre(n,x)/legendre_d1(n,x) from a cosine starting guess until the
 * step is no larger than -x*EPS; one more update is applied when storing.
 * The middle entry is 0 when n is odd, and the upper half is the negated
 * mirror image of the lower half.
 */
void gauss_nodes(real *z, int n)
{
  const int half = n/2;
  int k;

  for(k=0; k<half; ++k) {
    real x = cosr( (2*n-2*k-1)*(PI/2)/n );
    real prev;
    for(;;) {
      prev = x;
      x -= legendre(n,x)/legendre_d1(n,x);
      if(!(fabsr(x-prev)>-x*EPS)) break;
    }
    z[k] = x - legendre(n,x)/legendre_d1(n,x);
  }

  if(n&1) z[n/2]=0;

  /* z[n-1-k] mirrors z[k] with the sign flipped */
  for(k=half-1; k>=0; --k) z[n-1-k] = -z[k];
}
real sa(unsigned seed) { srandom(seed); real T; if (initial_temp_method != constant) T = INFINITY; else T = initial_temp; real step_size_x = init_step_size, step_size_y = init_step_size; real pos_x = 5, pos_y = 5; real obj = bump(pos_x, pos_y); real obj_pen = obj; int samples_remaining = 5000-1; real obj_d[5000]; int n_obj_d = 0; real accepts[5000]; int n_accepts = 0; int num_trials = 0, num_acceptances = 0; int initial_trials = 500; int max_trials = temp_length; int max_acceptances = 0.6*temp_length; real alpha = 0.1, omega = 2.1; real best_obj = obj; real best_x = pos_x, best_y = pos_y; int best_time = samples_remaining; while (samples_remaining > 0) { if (best_time - samples_remaining > 500) { best_time = samples_remaining; pos_x = best_x; pos_y = best_y; obj = best_obj; obj_pen = best_obj + penalty_weight * penalty(pos_x, pos_y) / T; step_size_x = step_size_y = init_step_size; } real step_x, step_y; if (step_method == gaussian) { step_x = step_size_x * randn(); step_y = step_size_y * randn(); } else { step_x = step_size_x * (2*randf()-1); step_y = step_size_y * (2*randf()-1); } real new_x = pos_x+step_x, new_y = pos_y+step_y; real new_pen = penalty_weight * penalty(new_x, new_y); if (T == INFINITY && new_pen != 0) continue; real new_obj = bump(new_x, new_y); real new_obj_pen = new_obj + new_pen / T; samples_remaining--; num_trials++; if (new_pen == 0) { if (new_obj > best_obj) { best_obj = new_obj; best_x = new_x; best_y = new_y; best_time = samples_remaining; } } real p; if (step_method == parks) { real step_norm = sqrtr(step_x*step_x + step_y*step_y); p = exp(- (obj_pen - new_obj_pen) / (T * step_norm)); } else { p = exp(- (obj_pen - new_obj_pen) / T); } if (randf() < p) { num_acceptances++; obj_d[n_obj_d++] = new_obj_pen - obj_pen; pos_x = new_x; pos_y = new_y; obj = new_obj; obj_pen = new_obj_pen; accepts[n_accepts++] = new_obj_pen; if (T != INFINITY && step_method == parks) { step_size_x = (1-alpha)*step_size_x + alpha*omega*fabsr(step_x); step_size_y = 
(1-alpha)*step_size_y + alpha*omega*fabsr(step_y); } } bool reduced_T = false; if (T == INFINITY) { if (num_trials >= initial_trials) { if (initial_temp_method == kirkpatrick) T = T_kirkpatrick(obj_d, n_obj_d); else if (initial_temp_method == white) T = std(obj_d, n_obj_d); else { fprintf(stderr, "unknown temp method"); exit(1); } reduced_T = true; } } else if (num_trials >= max_trials || num_acceptances >= max_acceptances) { if (temp_decay_method == huang) { real factor; if (n_accepts < 2) factor = 0.5; else { factor = exp(-0.7*T/std(accepts, n_accepts)); if (factor < 0.5) factor = 0.5; } T *= factor; } else T *= temp_decay; reduced_T = true; } if (reduced_T) { //printf("%g\n", T); n_obj_d = 0; num_trials = 0; num_acceptances = 0; n_accepts = 1; accepts[0] = obj_pen; } } return best_obj; }
// Box-versus-plane contact generation.
//
// o1 must be a Col_BBox and o2 a Col_Plane (both asserted). Up to three
// contacts are written to gpkContactHolder[0..2]: the deepest box corner
// first, then the corners reached from it along the two box sides with the
// smallest projected length onto the plane normal, for as long as the
// remaining depth stays non-negative.
//
// Returns the number of contacts written, or 0 when the early-exit depth
// test reports a negative depth.
s32 xColBoxPlane(iCollisionObject* o1,iCollisionObject* o2)
{
	X_Assert(o1->muClass==iCollisionObject::Col_BBox);
	X_Assert(o2->muClass==iCollisionObject::Col_Plane);

	cBBox *box = (cBBox*)o1;
	cPlane *plane = (cPlane*)o2;
	const f32* r = box->mpkWorld->rot;
	const f32* n = plane->mkPlane;

	// a[i]: plane normal projected on box axis i, scaled by the full side
	// length (2 * half side); b[i] is its magnitude
	vec4_t a,p;
	f32 b[3];
	xMul0_344(a,n,r);
	// this is safe
	xQ4Mul(a,a,box->mkHalfSide);
	xQ4Scale(a,2.0f);
	b[0] = fabsr(a[0]);
	b[1] = fabsr(a[1]);
	b[2] = fabsr(a[2]);

	// early exit test
	f32 depth;
	depth = n[PND] + 0.5f*(b[0]+b[1]+b[2]) - xVDotR(n,box->mpkWorld->pos);
	if (X_IsFloatNeg(depth)) return 0;

	// find number of contacts requested
	const s32 maxc = 3;

	// find deepest point: start at the box centre and walk half a side
	// along each axis, against the sign of the projected normal
	xQ4CpyMac(p,box->mpkWorld->pos);
	for (s32 axis=0; axis<3; axis++) {
		if (a[axis]>0) { xQ4ScaleNSub(p,&r[axis*4],box->mkHalfSide[axis]); }
		else           { xQ4ScaleNAdd(p,&r[axis*4],box->mkHalfSide[axis]); }
	}

	// the deepest point is the first contact point
	xQ4CpyMac(gpkContactHolder[0].pos,p);
	xQ4CpyMac(gpkContactHolder[0].norm,n);
	gpkContactHolder[0].depth = depth;
	s32 ret = 1; // ret is number of contact points found so far

	xQ4CpyMac(gpkContactHolder[1].norm,n);
	xQ4CpyMac(gpkContactHolder[2].norm,n);

	// get the second and third contact points by starting from `p' and going
	// along the two sides with the smallest projected length.
	s32 first,second;
	if (b[0] < b[1]) {
		if (b[2] < b[0]) {
			first = 2;
			second = 0;
		}
		else {
			first = 0;
			second = (b[1] < b[2]) ? 1 : 2;
		}
	}
	else {
		if (b[2] < b[1]) {
			first = 2;
			second = 1;
		}
		else {
			first = 1;
			second = (b[0] < b[2]) ? 0 : 2;
		}
	}

	const s32 order[2] = { first, second };
	for (s32 k=0; k<2; k++) {
		const s32 side = order[k];
		const s32 ctact = k+1;

		// moving a full side along this axis costs b[side] of depth
		depth -= b[side];
		if (depth < 0) break;

		xQ4CpyMac(gpkContactHolder[ctact].pos,p);
		if (a[side] > 0) { xQ4ScaleNAdd(gpkContactHolder[ctact].pos,&r[side*4],2*box->mkHalfSide[side]); }
		else             { xQ4ScaleNSub(gpkContactHolder[ctact].pos,&r[side*4],2*box->mkHalfSide[side]); }
		gpkContactHolder[ctact].depth = depth;
		ret++;
	}

	return ret;
}
// Box-versus-box collision using the 15-axis separating axis test.
//
//   p1,p2   - box centres
//   r1,r2   - box rotations; row j (r + 4*j) is used as box axis j
//   sz1,sz2 - box extents along each axis (NOTE(review): presumably half
//             sizes, they are used where dBoxBox uses side*0.5 - confirm)
//   c       - output contacts (at most N3DMaxContact are written)
//   normal  - output contact normal
//
// Returns 0 when a separating axis exists, 1 for an edge-edge contact and
// otherwise the number of face contacts written to c.
s32 cBoxCollide( const f32* p1,const f32* r1,const f32* sz1,
                 const f32* p2,const f32* r2,const f32* sz2,
                 cContact* c,f32* normal)
{
	const f32 fudge_factor = 1.05f;
	vec4_t p,pp1,pp2,nC; //secured_v
	mat34_t r,q;
	f32 s2,s;
	s32 code;
	const f32* nR = 0;
	s32 invertnormal,i,j;

	nC[3] = 0;

	// p = vector between the centres; pp1/pp2 = p in the frame of box 1/2
	xQ4Sub(p,p2,p1);
	xMul0_344(pp1,p,r1);
	xMul0_344(pp2,p,r2);

	// r(i,j) = row(r1,i). row(r2,j); q holds the absolute values
	xMul0_34(r,r1,r2);

	q[0] = fabsr(r[0]); q[1] = fabsr(r[1]); q[2] = fabsr(r[2]); q[3] = 0;
	q[4] = fabsr(r[4]); q[5] = fabsr(r[5]); q[6] = fabsr(r[6]); q[7] = 0;
	q[8] = fabsr(r[8]); q[9] = fabsr(r[9]); q[10] = fabsr(r[10]); q[11] = 0;

	// face axes: s2 is the (negative) penetration along the axis; the axis
	// with the largest s2 (smallest penetration) is remembered
#define XSATEST(e1,e2,n,cc) \
	s2 = fabsr(e1) - (e2); \
	if (s2 > 0 ) return 0; \
	if (s2 > s) \
	{ \
		s = s2; nR = (n); \
		invertnormal = ((e1) < 0); \
		code = (cc); \
	}

	s =-N3DInfinity;
	invertnormal = 0;
	code = 0;

	// note that nC has not been used yet, so it serves as scratch storage
	// for the projected extents here
	// separating axis = u1,u2,u3
	xMul0_344(nC,sz2,q);
	xQ4Add(nC,nC,sz1);

	XSATEST(pp1[0],nC[0],r1+0,1);
	XSATEST(pp1[1],nC[1],r1+4,2);
	XSATEST(pp1[2],nC[2],r1+8,3);

	xMul1_344(nC,sz1,q);
	xQ4Add(nC,nC,sz2);

	// separating axis = v1,v2,v3
	XSATEST(pp2[0],nC[0],r2+0,4);
	XSATEST(pp2[1],nC[1],r2+4,5);
	XSATEST(pp2[2],nC[2],r2+8,6);

#undef XSATEST
	// edge-edge axes: (n1,n2,n3) is the cross product axis relative to box 1;
	// it has to be normalised before the depth can be compared
#define XSATEST(expr1,e2,n1,n2,n3,cc) \
	s2 = fabsr(expr1) - (e2); \
	if (s2 > 0) return 0; \
	l = sqrtr((n1)*(n1) + (n2)*(n2) + (n3)*(n3)); \
	if (l > 0) { \
		s2 /= l; \
		if (s2*fudge_factor > s) { \
			s = s2; \
			nR = 0; \
			nC[0] = (n1)/l; nC[1] = (n2)/l; nC[2] = (n3)/l; \
			invertnormal = ((expr1) < 0); \
			code = (cc); \
		} \
	}

	{
		// we need some temp vectors here
		vec4_t tmp1,tmp2;
		f32 l;

		xQ4ScaleS(tmp1,(f32*)&r[4],pp1[2]);
		xQ4ScaleNSub(tmp1,(f32*)&r[8],pp1[1]);
		xQ4ScaleS(tmp2,(f32*)&q[8],sz1[1]);
		xQ4ScaleNAdd(tmp2,(f32*)&q[4],sz1[2]);

		// separating axis = u1 x (v1,v2,v3)
		XSATEST(tmp1[0],(tmp2[0]+sz2[1]*q[2]+sz2[2]*q[1]),0,-r[8],r[4],7);
		XSATEST(tmp1[1],(tmp2[1]+sz2[0]*q[2]+sz2[2]*q[0]),0,-r[9],r[5],8);
		XSATEST(tmp1[2],(tmp2[2]+sz2[0]*q[1]+sz2[1]*q[0]),0,-r[10],r[6],9);

		xQ4ScaleS(tmp1,(f32*)&r[8],pp1[0]);
		xQ4ScaleNSub(tmp1,(f32*)&r[0],pp1[2]);
		xQ4ScaleS(tmp2,(f32*)&q[8],sz1[0]);
		xQ4ScaleNAdd(tmp2,(f32*)&q[0],sz1[2]);

		// separating axis = u2 x (v1,v2,v3)
		XSATEST(tmp1[0],(tmp2[0]+sz2[1]*q[6]+sz2[2]*q[5]),r[8],0,-r[0],10);
		XSATEST(tmp1[1],(tmp2[1]+sz2[0]*q[6]+sz2[2]*q[4]),r[9],0,-r[1],11);
		XSATEST(tmp1[2],(tmp2[2]+sz2[0]*q[5]+sz2[1]*q[4]),r[10],0,-r[2],12);

		xQ4ScaleS(tmp1,(f32*)&r[0],pp1[1]);
		xQ4ScaleNSub(tmp1,(f32*)&r[4],pp1[0]);
		xQ4ScaleS(tmp2,(f32*)&q[4],sz1[0]);
		xQ4ScaleNAdd(tmp2,(f32*)&q[0],sz1[1]);

		// separating axis = u3 x (v1,v2,v3)
		// u3 x vk = (-r[4+k], r[k], 0): the tested expression
		// pp1[1]*r[k] - pp1[0]*r[4+k] is pp1 dotted with exactly this
		// vector, so the second component must be +r[k].
		XSATEST(tmp1[0],(tmp2[0]+sz2[1]*q[10]+sz2[2]*q[9]),-r[4],r[0],0,13);
		XSATEST(tmp1[1],(tmp2[1]+sz2[0]*q[10]+sz2[2]*q[8]),-r[5],r[1],0,14);
		XSATEST(tmp1[2],(tmp2[2]+sz2[0]*q[9]+sz2[1]*q[8]),-r[6],r[2],0,15);
	}
#undef XSATEST

	if(!code) return 0;

	// the boxes interpenetrate: compute the normal in global coordinates
	if(nR)
	{
		xQ4CpyMac(normal,nR);
	}
	else
	{
		xMul1_344(normal,nC,r1);
	}
	if(invertnormal)
	{
		xQ4Scale(normal,-1.0f);
	}
	s = -s;   // s is now the penetration depth

	if (code > 6)
	{
		// an edge from box 1 touches an edge from box 2.
		vec4_t pa,pb,ua,ub;
		f32 alpha,beta;
		xQ4CpyMac(pa,p1);
		xQ4CpyMac(pb,p2);

		// find a point pa on the intersecting edge of box 1:
		// the corner of box 1 that lies furthest along the normal
		xMul0_344(ua,normal,r1);
		for (j=0; j<3; j++)
		{
			if( ISFLOATNEGETIVE(ua[j]) )
			{
				xQ4ScaleNSub(pa,&r1[j*4],sz1[j]);
			}
			else
			{
				xQ4ScaleNAdd(pa,&r1[j*4],sz1[j]);
			}
		}

		// find a point pb on the intersecting edge of box 2:
		// the corner of box 2 that lies furthest AGAINST the normal, i.e.
		// the signs are the opposite of the ones used for box 1
		xMul0_344(ua,normal,r2);
		for (j=0; j<3; j++)
		{
			if( ISFLOATNEGETIVE(ua[j]) )
			{
				xQ4ScaleNAdd(pb,&r2[j*4],sz2[j]);
			}
			else
			{
				xQ4ScaleNSub(pb,&r2[j*4],sz2[j]);
			}
		}

		// edge directions: codes 7..15 encode (axis of box 1)*3 + (axis of box 2)
		xQ4CpyMac(ua,&r1[(((code)-7)/3)*4]);
		xQ4CpyMac(ub,&r2[(((code)-7)%3)*4]);
		{
			// line closest approach (pa,ua,pb,ub,&alpha,&beta);
			vec4_t p;// secured_v
			f32 uaub,q1,q2,d;
			xQ4Sub(p,pb,pa);
			xVDot(&uaub,ua,ub);
			xVDot(&q1,ua,p);
			xVDot(&q2,ub,p);
			d = 1-uaub*uaub;
			if (d <= 0.0001f)
			{
				// (nearly) parallel edges: keep the corner points
				alpha = 0;
				beta = 0;
			}
			else
			{
				d = 1/d;
				alpha = (q1 - uaub*q2)*d;
				beta = (uaub*q1 - q2)*d;
			}
		}
		xQ4ScaleNAdd(pa,ua,alpha);
		xQ4ScaleNAdd(pb,ub,beta);

		// the contact is the midpoint of the two closest points
		xQ4Add(c->pos,pa,pb);
		xQ4Scale(c->pos,0.5f);
		c->depth = s;
		return 1;
	}

	// okay, we have a face-something intersection (because the separating
	// axis is perpendicular to a face). define face 'a' to be the reference
	// face (i.e. the normal vector is perpendicular to this) and face 'b' to be
	// the incident face (the closest face of the other box).
	{
		vec4_t center; // secured_v
		const f32 *ra,*rb;
		const f32 *pa,*pb,*sa,*sb;
		s32 lanr,a1,a2;
		f32 quad[8]; // 2D coordinate of incident face (x,y pairs)
		// nr = normal vector of reference face dotted with axes of incident box.
		// anr = absolute values of nr.
		vec4_t normal2,nr,anr; // secured_v

		if (code <= 3)
		{
			ra = r1; rb = r2;
			pa = (f32*)p1; pb = (f32*)p2;
			sa = (f32*)sz1; sb = (f32*)sz2;
			xQ4CpyMac(normal2,normal);
		}
		else
		{
			ra = r2; rb = r1;
			pa = (f32*)p2; pb = (f32*)p1;
			sa = (f32*)sz2; sb = (f32*)sz1;
			normal2[0] = -normal[0];
			normal2[1] = -normal[1];
			normal2[2] = -normal[2];
			normal2[3] = 0;
		}

		xMul0_344(nr,normal2,rb);
		anr[0] = fabsr (nr[0]);
		anr[1] = fabsr (nr[1]);
		anr[2] = fabsr (nr[2]);
		anr[3] = 0; // secured_v

		// find the largest component of anr: this corresponds to the normal
		// for the incident face. the other axis numbers of the incident face
		// are stored in a1,a2.
		if (anr[1] > anr[0])
		{
			if (anr[1] > anr[2])
			{
				a1 = 0; lanr = 1; a2 = 2;
			}
			else
			{
				a1 = 0; a2 = 1; lanr = 2;
			}
		}
		else
		{
			if (anr[0] > anr[2])
			{
				lanr = 0; a1 = 1; a2 = 2;
			}
			else
			{
				a1 = 0; a2 = 1; lanr = 2;
			}
		}

		// compute center point of incident face, in reference-face coordinates
		xQ4Sub(center,pb,pa);
		if (nr[lanr] < 0)
		{
			xQ4ScaleNAdd(center,&rb[4*lanr],sb[lanr]);
		}
		else
		{
			xQ4ScaleNSub(center,&rb[4*lanr],sb[lanr]);
		}

		{
			vec4_t point[8]; // penetrating contact points
			vec4_t m_;       // 2x2 projection matrix: m11,m12,m21,m22
			s32 codeN,code1,code2;
			f32 c1,c2;
			f32 rect[2];
			f32 ret[16];
			s32 inr;
			s32 cnum = 0; // number of penetrating contact points found
			f32 dep[8];   // depths for those points
			f32 det1;

			// find the normal and non-normal axis numbers of the reference box
			if (code <= 3) codeN = code-1; else codeN = code-4;
			if (codeN==0)
			{
				code1 = 1; code2 = 2;
			}
			else if (codeN==1)
			{
				code1 = 0; code2 = 2;
			}
			else
			{
				code1 = 0; code2 = 1;
			}

			// find the four corners of the incident face, in reference-face
			// coordinates: c1 and c2 are the centre projected on the two
			// in-plane axes of the reference face (code1 and code2)
			{
				xVDot(&c1,center,&ra[code1*4]);
				xVDot(&c2,center,&ra[code2*4]);
			}

			// r(i,j) = row(r1,i). row(r2,j), so the lookup is transposed
			// when box 2 owns the reference face
			if(code <= 3)
			{
				m_[0] = r[code1*4+a1];
				m_[1] = r[code1*4+a2];
				m_[2] = r[code2*4+a1];
				m_[3] = r[code2*4+a2];
			}
			else
			{
				m_[0] = r[a1*4+code1];
				m_[1] = r[a2*4+code1];
				m_[2] = r[a1*4+code2];
				m_[3] = r[a2*4+code2];
			}

			{
				f32 k1 = m_[0]*sb[a1];
				f32 k2 = m_[2]*sb[a1];
				f32 k3 = m_[1]*sb[a2];
				f32 k4 = m_[3]*sb[a2];
				quad[0] = c1 - k1 - k3;
				quad[1] = c2 - k2 - k4;
				quad[2] = c1 - k1 + k3;
				quad[3] = c2 - k2 + k4;
				quad[4] = c1 + k1 + k3;
				quad[5] = c2 + k2 + k4;
				quad[6] = c1 + k1 - k3;
				quad[7] = c2 + k2 - k4;
			}

			// find the size of the reference face
			rect[0] = sa[code1];
			rect[1] = sa[code2];

			// intersect the incident and reference faces
			{
				// inlined intersectRectQuad (rect,quad,ret)
				// q (and r) contain nq (and nr) coordinate points for the current (and
				// chopped) polygons. NOTE: q and r shadow the matrices of the
				// same name inside this block.
				s32 nq=4;
				f32 buffer[16];
				f32 *q = quad;
				f32 *r = ret;
				s32 dir,sign;
				for (dir=0; dir <= 1; dir++)
				{
					// direction notation: xy[0] = x axis, xy[1] = y axis
					for (sign=-1; sign <= 1; sign += 2)
					{
						// chop q along the line xy[dir] = sign*rect[dir]
						f32 *pq = q;
						f32 *pr = r;
						inr = 0;
						for (i=nq; i > 0; i--)
						{
							f32 *nextq;
							// go through all points in q and all lines between adjacent points
							if (sign*pq[dir] < rect[dir])
							{
								// this point is inside the chopping line
								pr[0] = pq[0];
								pr[1] = pq[1];
								pr += 2;
								inr++;
								if (inr & 8)
								{
									q = r;
									goto done;
								}
							}
							nextq = (i > 1) ? pq+2 : q;
							if ((sign*pq[dir] < rect[dir]) ^ (sign*nextq[dir] < rect[dir]))
							{
								// this line crosses the chopping line
								pr[1-dir] = pq[1-dir] + (nextq[1-dir]-pq[1-dir]) /
									(nextq[dir]-pq[dir]) * (sign*rect[dir]-pq[dir]);
								pr[dir] = sign*rect[dir];
								pr += 2;
								inr++;
								if (inr & 8)
								{
									q = r;
									goto done;
								}
							}
							pq += 2;
						}
						// ping-pong between ret and buffer
						q = r;
						r = (q==ret) ? buffer : ret;
						nq = inr;
					}
				}
done:
				if (q != ret) copyDwords(ret,q,inr*2);
			}
			if (inr < 1) return 0; // this should never happen

			// convert the intersection points into reference-face coordinates,
			// and compute the contact position and depth for each point. only keep
			// those points that have a positive (penetrating) depth. delete points in
			// the 'ret' array as necessary so that 'point' and 'ret' correspond.
			det1 = 1/(m_[0]*m_[3] - m_[1]*m_[2]);
			xQ4Scale(m_,det1);
			for (j=0; j < inr; j++)
			{
				f32 k1 = m_[3]*(ret[j*2]-c1) - m_[1]*(ret[j*2+1]-c2);
				f32 k2 = -m_[2]*(ret[j*2]-c1) + m_[0]*(ret[j*2+1]-c2);
				xQ4CpyMac(point[cnum],center);
				xQ4ScaleNAdd(point[cnum],&rb[a1*4],k1);
				xQ4ScaleNAdd(point[cnum],&rb[a2*4],k2);
				dep[cnum] = sa[codeN] - xVDotR(normal2,point[cnum]);
				if(dep[cnum] >= 0)
				{
					ret[cnum*2] = ret[j*2];
					ret[cnum*2+1] = ret[j*2+1];
					cnum++;
				}
			}
			if (cnum < 1) return 0; // this should never happen

			// we can't generate more contacts than we actually have
			if (cnum <= N3DMaxContact)
			{
				// we have less contacts than we need, so we use them all
				for (j=0; j < cnum; j++)
				{
					cContact *con = c+j;
					xQ4Add(con->pos,point[j],pa);
					con->depth = dep[j];
				}
			}
			else
			{
				// we have more contacts than are wanted, some of them must be culled.
				// find the deepest point, it is always the first contact.
				s32 i1 = 0;
				f32 maxdepth = dep[0];
				s32 iret[8];
				for (i=1; i<cnum; i++)
				{
					if (dep[i] > maxdepth)
					{
						maxdepth = dep[i];
						i1 = i;
					}
				}

				{
					// compute the centroid of the polygon in cx,cy
					f32 a,cx,cy,q;
					f32 ang[8];
					s32 avail[8];
					if (cnum==1)
					{
						cx = ret[0];
						cy = ret[1];
					}
					else if (cnum==2)
					{
						cx = 0.5f*(ret[0] + ret[2]);
						cy = 0.5f*(ret[1] + ret[3]);
					}
					else
					{
						a = 0;
						cx = 0;
						cy = 0;
						for (i=0;i<(cnum-1); i++)
						{
							q = ret[i*2]*ret[i*2+3] - ret[i*2+2]*ret[i*2+1];
							a += q;
							cx += q*(ret[i*2]+ret[i*2+2]);
							cy += q*(ret[i*2+1]+ret[i*2+3]);
						}
						q = ret[cnum*2-2]*ret[1] - ret[0]*ret[cnum*2-1];
						a = 1/((3*(a+q)));
						cx = a*(cx + q*(ret[cnum*2-2]+ret[0]));
						cy = a*(cy + q*(ret[cnum*2-1]+ret[1]));
					}

					// compute the angle of each point w.r.t. the centroid.
					// the polygon vertices live in ret (2D pairs), not in the
					// 4-float centre offset p.
					for (i=0; i<cnum; i++)
						ang[i] = atan2r(ret[i*2+1]-cy,ret[i*2]-cx);

					// search for points that have angles closest to ang[i1] + i*(2*pi/m).
					for (i=0; i<cnum; i++) avail[i] = 1;
					avail[i1] = 0;
					iret[0] = i1;
					{
						s32* piret = iret;
						piret++;
						for (j=1; j<N3DMaxContact; j++)
						{
							f32 maxdiff=1e9,diff;
							a = (f32)j*(2*N3DPi/N3DMaxContact) + ang[i1];
							if(a > N3DPi) a -= 2*N3DPi;
							for (i=0; i<cnum; i++)
							{
								if (avail[i])
								{
									diff = fabsr (ang[i]-a);
									if (diff > N3DPi) diff = 2*N3DPi - diff;
									if (diff < maxdiff)
									{
										maxdiff = diff;
										*piret = i;
									}
								}
							}
							avail[*piret] = 0;
							piret++;
						}
					}
				}

				for (j=0; j < N3DMaxContact; j++)
				{
					cContact *con = c+j;
					xQ4Add(con->pos,point[iret[j]],pa);
					con->depth = dep[iret[j]];
				}
				cnum = N3DMaxContact;
			}
			return cnum;
		}
	}
	// return_code = code;
}
int main(int argc, char** argv) { int iter_max = 1000; const real tol = 1.0e-5; memset(A, 0, NY * NX * sizeof(real)); // set rhs for (int iy = 1; iy < NY-1; iy++) { for( int ix = 1; ix < NX-1; ix++ ) { const real x = -1.0 + (2.0*ix/(NX-1)); const real y = -1.0 + (2.0*iy/(NY-1)); rhs[iy][ix] = expr(-10.0*(x*x + y*y)); } } printf("Jacobi relaxation Calculation: %d x %d mesh\n", NY, NX); StartTimer(); int iter = 0; real error = 1.0; #pragma acc data copy(A) copyin(rhs) create(Anew) while ( error > tol && iter < iter_max ) { error = 0.0; #pragma acc kernels for (int iy = 1; iy < NY-1; iy++) { for( int ix = 1; ix < NX-1; ix++ ) { Anew[iy][ix] = -0.25 * (rhs[iy][ix] - ( A[iy][ix+1] + A[iy][ix-1] + A[iy-1][ix] + A[iy+1][ix] )); error = fmaxr( error, fabsr(Anew[iy][ix]-A[iy][ix])); } } #pragma acc kernels for (int iy = 1; iy < NY-1; iy++) { for( int ix = 1; ix < NX-1; ix++ ) { A[iy][ix] = Anew[iy][ix]; } } //Periodic boundary conditions #pragma acc kernels for( int ix = 1; ix < NX-1; ix++ ) { A[0][ix] = A[(NY-2)][ix]; A[(NY-1)][ix] = A[1][ix]; } #pragma acc kernels for (int iy = 1; iy < NY-1; iy++) { A[iy][0] = A[iy][(NX-2)]; A[iy][(NX-1)] = A[iy][1]; } if((iter % 100) == 0) printf("%5d, %0.6f\n", iter, error); iter++; } double runtime = GetTimer(); printf( "%dx%d: 1 GPU: %8.4f s\n", NY,NX, runtime/ 1000.0 ); return 0; }
int main(int argc, char** argv) { int iter_max = 1000; const real tol = 1.0e-5; int rank = 0; int size = 1; //Initialize MPI and determine rank and size MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size); if ( size > MAX_MPI_SIZE ) { if ( 0 == rank ) { fprintf(stderr,"ERROR: Only up to %d MPI ranks are supported.\n",MAX_MPI_SIZE); } return -1; } dim2 size2d = size_to_2Dsize(size); int sizex = size2d.x; int sizey = size2d.y; assert(sizex*sizey == size); int rankx = rank%sizex; int ranky = rank/sizex; memset(A, 0, NY * NX * sizeof(real)); memset(Aref, 0, NY * NX * sizeof(real)); // set rhs for (int iy = 1; iy < NY-1; iy++) { for( int ix = 1; ix < NX-1; ix++ ) { const real x = -1.0 + (2.0*ix/(NX-1)); const real y = -1.0 + (2.0*iy/(NY-1)); rhs[iy][ix] = expr(-10.0*(x*x + y*y)); } } #if _OPENACC acc_device_t device_type = acc_get_device_type(); if ( acc_device_nvidia == device_type ) { int ngpus=acc_get_num_devices(acc_device_nvidia); int devicenum=rank%ngpus; acc_set_device_num(devicenum,acc_device_nvidia); } // Call acc_init after acc_set_device_num to avoid multiple contexts on device 0 in multi GPU systems acc_init(device_type); #endif /*_OPENACC*/ // Ensure correctness if NX%sizex != 0 int chunk_sizex = ceil( (1.0*NX)/sizex ); int ix_start = rankx * chunk_sizex; int ix_end = ix_start + chunk_sizex; // Do not process boundaries ix_start = max( ix_start, 1 ); ix_end = min( ix_end, NX - 1 ); // Ensure correctness if NY%sizey != 0 int chunk_sizey = ceil( (1.0*NY)/sizey ); int iy_start = ranky * chunk_sizey; int iy_end = iy_start + chunk_sizey; // Do not process boundaries iy_start = max( iy_start, 1 ); iy_end = min( iy_end, NY - 1 ); if ( rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", NY, NX); if ( rank == 0) printf("Calculate reference solution and time serial execution.\n"); StartTimer(); poisson2d_serial( rank, iter_max, tol ); double runtime_serial = GetTimer(); //Wait for all processes to ensure 
correct timing of the parallel version MPI_Barrier( MPI_COMM_WORLD ); if ( rank == 0) printf("Parallel execution.\n"); StartTimer(); int iter = 0; real error = 1.0; #pragma acc data copy(A) copyin(rhs) create(Anew,to_left,from_left,to_right,from_right) while ( error > tol && iter < iter_max ) { error = 0.0; #pragma acc kernels for (int iy = iy_start; iy < iy_end; iy++) { for( int ix = ix_start; ix < ix_end; ix++ ) { Anew[iy][ix] = -0.25 * (rhs[iy][ix] - ( A[iy][ix+1] + A[iy][ix-1] + A[iy-1][ix] + A[iy+1][ix] )); error = fmaxr( error, fabsr(Anew[iy][ix]-A[iy][ix])); } } real globalerror = 0.0; MPI_Allreduce( &error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD ); error = globalerror; #pragma acc kernels for (int iy = iy_start; iy < iy_end; iy++) { for( int ix = ix_start; ix < ix_end; ix++ ) { A[iy][ix] = Anew[iy][ix]; } } //Periodic boundary conditions int topy = (ranky == 0) ? (sizey-1) : ranky-1; int bottomy = (ranky == (sizey-1)) ? 0 : ranky+1; int top = topy * sizex + rankx; int bottom = bottomy * sizex + rankx; #pragma acc host_data use_device( A ) { //1. Sent row iy_start (first modified row) to top receive lower boundary (iy_end) from bottom MPI_Sendrecv( &A[iy_start][ix_start], (ix_end-ix_start), MPI_REAL_TYPE, top , 0, &A[iy_end][ix_start], (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE ); //2. Sent row (iy_end-1) (last modified row) to bottom receive upper boundary (iy_start-1) from top MPI_Sendrecv( &A[(iy_end-1)][ix_start], (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0, &A[(iy_start-1)][ix_start], (ix_end-ix_start), MPI_REAL_TYPE, top , 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE ); } int leftx = (rankx == 0) ? (sizex-1) : rankx-1; int rightx = (rankx == (sizex-1)) ? 
0 : rankx+1; int left = ranky * sizex + leftx; int right = ranky * sizex + rightx; #pragma acc kernels for( int iy = iy_start; iy < iy_end; iy++ ) { to_left[iy] = A[iy][ix_start]; to_right[iy] = A[iy][ix_end-1]; } #pragma acc host_data use_device( to_left, from_left, to_right, from_right ) { //1. Sent to_left starting from first modified row (iy_start) to last modified row to left and receive the same rows into from_right from right MPI_Sendrecv( to_left+iy_start, (iy_end-iy_start), MPI_REAL_TYPE, left , 0, from_right+iy_start, (iy_end-iy_start), MPI_REAL_TYPE, right, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE ); //2. Sent to_right starting from first modified row (iy_start) to last modified row to left and receive the same rows into from_left from left MPI_Sendrecv( to_right+iy_start, (iy_end-iy_start), MPI_REAL_TYPE, right , 0, from_left+iy_start, (iy_end-iy_start), MPI_REAL_TYPE, left , 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE ); } #pragma acc kernels for( int iy = iy_start; iy < iy_end; iy++ ) { A[iy][ix_start-1] = from_left[iy]; A[iy][ix_end] = from_right[iy]; } if(rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error); iter++; } MPI_Barrier( MPI_COMM_WORLD ); double runtime = GetTimer(); if (check_results( rank, ix_start, ix_end, iy_start, iy_end, tol ) && rank == 0) { printf( "Num GPUs: %d with a (%d,%d) layout.\n", size, sizey,sizex ); printf( "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%\n", NY,NX, runtime_serial/ 1000.0, size, runtime/ 1000.0, runtime_serial/runtime, runtime_serial/(size*runtime)*100 ); } MPI_Finalize(); return 0; }
// Box-versus-box collision using the 15-axis separating axis test.
//
//   p1,p2       - box centres
//   R1,R2       - box rotations; row j (R + 4*j) is used as box axis j
//   side1,side2 - full side lengths (halved internally)
//   contact     - output contacts (at most 4 are written)
//   normal      - output contact normal (components 0..2 are written)
//
// Returns 0 when a separating axis exists, 1 for an edge-edge contact and
// otherwise the number of face contacts written to `contact`.
s32 dBoxBox (const f32* p1, const f32* R1, const f32* side1,
             const f32* p2, const f32* R2, const f32* side2,
             cContact* contact,f32* normal)
{
  s32 maxc = 4;
  const f32 fudge_factor = (f32)(1.05);
  vec4_t p,pp,normalC;
  const f32 *normalR = 0;
  f32 depth;
  f32 A[3],B[3],R11,R12,R13,R21,R22,R23,R31,R32,R33,
    Q11,Q12,Q13,Q21,Q22,Q23,Q31,Q32,Q33,s,s2,l;
  s32 i,j,invert_normal,code;
  const f32 *Ra,*Rb,*pa,*pb,*Sa,*Sb;
  vec4_t normal2,nr,anr;
  vec4_t center;
  s32 lanr,a1,a2;
  s32 codeN,code1,code2;

  // get vector from centers of box 1 to box 2, relative to box 1
  p[0] = p2[0] - p1[0];
  p[1] = p2[1] - p1[1];
  p[2] = p2[2] - p1[2];
  p[3] = 0;
  xMul0_344 (pp,p,R1);		// get pp = p relative to body 1

  // get side lengths / 2
  A[0] = side1[0]*(f32)(0.5);
  A[1] = side1[1]*(f32)(0.5);
  A[2] = side1[2]*(f32)(0.5);
  B[0] = side2[0]*(f32)(0.5);
  B[1] = side2[1]*(f32)(0.5);
  B[2] = side2[2]*(f32)(0.5);

  // Rij is R1'*R2, i.e. the relative rotation between R1 and R2
  R11 = dDOT(R1+0,R2+0); R12 = dDOT(R1+0,R2+4); R13 = dDOT(R1+0,R2+8);
  R21 = dDOT(R1+4,R2+0); R22 = dDOT(R1+4,R2+4); R23 = dDOT(R1+4,R2+8);
  R31 = dDOT(R1+8,R2+0); R32 = dDOT(R1+8,R2+4); R33 = dDOT(R1+8,R2+8);

  Q11 = fabsr(R11); Q12 = fabsr(R12); Q13 = fabsr(R13);
  Q21 = fabsr(R21); Q22 = fabsr(R22); Q23 = fabsr(R23);
  Q31 = fabsr(R31); Q32 = fabsr(R32); Q33 = fabsr(R33);

  // for all 15 possible separating axes:
  //   * see if the axis separates the boxes. if so, return 0.
  //   * find the depth of the penetration along the separating axis (s2)
  //   * if this is the largest depth so far, record it.
  // the normal vector will be set to the separating axis with the smallest
  // depth. note: normalR is set to point to a row of R1 or R2 if that is
  // the smallest depth normal so far. otherwise normalR is 0 and normalC is
  // set to a vector relative to body 1. invert_normal is 1 if the sign of
  // the normal should be flipped.

#define TST(expr_1,expr2,norm,cc) \
  expr1 = expr_1;\
  s2 = fabsr(expr1) - (expr2); \
  if (s2 > 0) return 0; \
  if (s2 > s) { \
    s = s2; \
    normalR = norm; \
    invert_normal = ((expr1) < 0); \
    code = (cc); \
  }

  f32 expr1;
  s = -N3DInfinity;
  invert_normal = 0;
  code = 0;

  // separating axis = u1,u2,u3
  TST (pp[0],(A[0] + B[0]*Q11 + B[1]*Q12 + B[2]*Q13),R1+0,1);
  TST (pp[1],(A[1] + B[0]*Q21 + B[1]*Q22 + B[2]*Q23),R1+4,2);
  TST (pp[2],(A[2] + B[0]*Q31 + B[1]*Q32 + B[2]*Q33),R1+8,3);

  // separating axis = v1,v2,v3
  TST (dDOT(R2+0,p),(A[0]*Q11 + A[1]*Q21 + A[2]*Q31 + B[0]),R2+0,4);
  TST (dDOT(R2+4,p),(A[0]*Q12 + A[1]*Q22 + A[2]*Q32 + B[1]),R2+4,5);
  TST (dDOT(R2+8,p),(A[0]*Q13 + A[1]*Q23 + A[2]*Q33 + B[2]),R2+8,6);

  // note: cross product axes need to be scaled when s is computed.
  // normal (n1,n2,n3) is relative to box 1.
#undef TST
#define TST(expr1,expr2,n1,n2,n3,cc) \
  s2 = fabsr(expr1) - (expr2); \
  if (s2 > 0) return 0; \
  l = fsqrtr((n1)*(n1) + (n2)*(n2) + (n3)*(n3)); \
  if (l > 0) { \
    s2 /= l; \
    if (s2*fudge_factor > s) { \
      s = s2; \
      normalR = 0; \
      normalC[0] = (n1)/l; normalC[1] = (n2)/l; normalC[2] = (n3)/l; \
      invert_normal = ((expr1) < 0); \
      code = (cc); \
    } \
  }

  // separating axis = u1 x (v1,v2,v3)
  TST(pp[2]*R21-pp[1]*R31,(A[1]*Q31+A[2]*Q21+B[1]*Q13+B[2]*Q12),0,-R31,R21,7);
  TST(pp[2]*R22-pp[1]*R32,(A[1]*Q32+A[2]*Q22+B[0]*Q13+B[2]*Q11),0,-R32,R22,8);
  TST(pp[2]*R23-pp[1]*R33,(A[1]*Q33+A[2]*Q23+B[0]*Q12+B[1]*Q11),0,-R33,R23,9);

  // separating axis = u2 x (v1,v2,v3)
  TST(pp[0]*R31-pp[2]*R11,(A[0]*Q31+A[2]*Q11+B[1]*Q23+B[2]*Q22),R31,0,-R11,10);
  TST(pp[0]*R32-pp[2]*R12,(A[0]*Q32+A[2]*Q12+B[0]*Q23+B[2]*Q21),R32,0,-R12,11);
  TST(pp[0]*R33-pp[2]*R13,(A[0]*Q33+A[2]*Q13+B[0]*Q22+B[1]*Q21),R33,0,-R13,12);

  // separating axis = u3 x (v1,v2,v3)
  TST(pp[1]*R11-pp[0]*R21,(A[0]*Q21+A[1]*Q11+B[1]*Q33+B[2]*Q32),-R21,R11,0,13);
  TST(pp[1]*R12-pp[0]*R22,(A[0]*Q22+A[1]*Q12+B[0]*Q33+B[2]*Q31),-R22,R12,0,14);
  TST(pp[1]*R13-pp[0]*R23,(A[0]*Q23+A[1]*Q13+B[0]*Q32+B[1]*Q31),-R23,R13,0,15);

#undef TST

  if (!code) return 0;

  // if we get to this point, the boxes interpenetrate. compute the normal
  // in global coordinates.
  if (normalR) {
    normal[0] = normalR[0];
    normal[1] = normalR[1];
    normal[2] = normalR[2];
  }
  else {
    xMul1_344 (normal,normalC,R1);
  }
  if (invert_normal) {
    normal[0] = -normal[0];
    normal[1] = -normal[1];
    normal[2] = -normal[2];
  }
  depth = -s;

  // compute contact point(s)

  if (code > 6) {
    // an edge from box 1 touches an edge from box 2.
    // find a point pa on the intersecting edge of box 1
    vec4_t pa,pb;
    f32 sign;
    f32 alpha,beta;
    vec4_t ua,ub;
    for (i=0; i<3; i++) pa[i] = p1[i];
    for (j=0; j<3; j++) {
      sign = (dDOT(normal,(R1+j*4)) > 0) ? (f32)(1.0) : (f32)(-1.0);
      xQ4ScaleNAdd(pa,&R1[j*4],sign * A[j]);
    }

    // find a point pb on the intersecting edge of box 2
    // (opposite signs: the corner facing box 1)
    for (i=0; i<3; i++) pb[i] = p2[i];
    for (j=0; j<3; j++) {
      sign = (dDOT(normal,(R2+j*4)) > 0) ? (f32)(-1.0) : (f32)(1.0);
      xQ4ScaleNAdd(pb,&R2[j*4],sign * B[j]);
    }

    // edge directions: codes 7..15 encode (axis of box 1)*3 + (axis of box 2)
    for (i=0; i<3; i++) ua[i] = R1[(((code)-7)/3)*4 + i];
    for (i=0; i<3; i++) ub[i] = R2[(((code)-7)%3)*4 + i];

    dLineClosestApproach (pa,ua,pb,ub,&alpha,&beta);
    for (i=0; i<3; i++) pa[i] += ua[i]*alpha;
    for (i=0; i<3; i++) pb[i] += ub[i]*beta;

    // the contact is the midpoint of the two closest points
    for (i=0; i<3; i++) contact[0].pos[i] = (f32)(0.5)*(pa[i]+pb[i]);
    contact[0].depth = depth;
    return 1;
  }

  // okay, we have a face-something intersection (because the separating
  // axis is perpendicular to a face). define face 'a' to be the reference
  // face (i.e. the normal vector is perpendicular to this) and face 'b' to be
  // the incident face (the closest face of the other box).

  if (code <= 3) {
    Ra = R1;
    Rb = R2;
    pa = p1;
    pb = p2;
    Sa = A;
    Sb = B;
  }
  else {
    Ra = R2;
    Rb = R1;
    pa = p2;
    pb = p1;
    Sa = B;
    Sb = A;
  }

  // nr = normal vector of reference face dotted with axes of incident box.
  // anr = absolute values of nr.
  if (code <= 3) {
    normal2[0] = normal[0];
    normal2[1] = normal[1];
    normal2[2] = normal[2];
  }
  else {
    normal2[0] = -normal[0];
    normal2[1] = -normal[1];
    normal2[2] = -normal[2];
  }
  xMul0_344 (nr,normal2,Rb);
  anr[0] = fabsr (nr[0]);
  anr[1] = fabsr (nr[1]);
  anr[2] = fabsr (nr[2]);

  // find the largest component of anr: this corresponds to the normal
  // for the incident face. the other axis numbers of the incident face
  // are stored in a1,a2.
  if (anr[1] > anr[0]) {
    if (anr[1] > anr[2]) {
      a1 = 0;
      lanr = 1;
      a2 = 2;
    }
    else {
      a1 = 0;
      a2 = 1;
      lanr = 2;
    }
  }
  else {
    if (anr[0] > anr[2]) {
      lanr = 0;
      a1 = 1;
      a2 = 2;
    }
    else {
      a1 = 0;
      a2 = 1;
      lanr = 2;
    }
  }

  // compute center point of incident face, in reference-face coordinates
  if (nr[lanr] < 0) {
    for (i=0; i<3; i++) center[i] = pb[i] - pa[i] + Sb[lanr] * Rb[i+lanr*4];
  }
  else {
    for (i=0; i<3; i++) center[i] = pb[i] - pa[i] - Sb[lanr] * Rb[i+lanr*4];
  }

  // find the normal and non-normal axis numbers of the reference box
  if (code <= 3) codeN = code-1; else codeN = code-4;
  if (codeN==0) {
    code1 = 1;
    code2 = 2;
  }
  else if (codeN==1) {
    code1 = 0;
    code2 = 2;
  }
  else {
    code1 = 0;
    code2 = 1;
  }

  // find the four corners of the incident face, in reference-face coordinates
  f32 quad[8];	// 2D coordinate of incident face (x,y pairs)
  f32 c1,c2,m11,m12,m21,m22;
  c1 = dDOT (center,(Ra+code1*4));
  c2 = dDOT (center,(Ra+code2*4));
  // optimize this? - we have already computed this data above, but it is not
  // stored in an easy-to-index format. for now it's quicker just to recompute
  // the four dot products.
  m11 = dDOT (Ra+code1*4,Rb+a1*4);
  m12 = dDOT (Ra+code1*4,Rb+a2*4);
  m21 = dDOT (Ra+code2*4,Rb+a1*4);
  m22 = dDOT (Ra+code2*4,Rb+a2*4);
  {
    f32 k1 = m11*Sb[a1];
    f32 k2 = m21*Sb[a1];
    f32 k3 = m12*Sb[a2];
    f32 k4 = m22*Sb[a2];
    quad[0] = c1 - k1 - k3;
    quad[1] = c2 - k2 - k4;
    quad[2] = c1 - k1 + k3;
    quad[3] = c2 - k2 + k4;
    quad[4] = c1 + k1 + k3;
    quad[5] = c2 + k2 + k4;
    quad[6] = c1 + k1 - k3;
    quad[7] = c2 + k2 - k4;
  }

  // find the size of the reference face
  f32 rect[2];
  rect[0] = Sa[code1];
  rect[1] = Sa[code2];

  // intersect the incident and reference faces
  f32 ret[16];
  s32 n = intersectRectQuad (rect,quad,ret);
  if (n < 1) return 0;		// this should never happen

  // convert the intersection points into reference-face coordinates,
  // and compute the contact position and depth for each point. only keep
  // those points that have a positive (penetrating) depth. delete points in
  // the 'ret' array as necessary so that 'point' and 'ret' correspond.
  f32 point[3*8];		// penetrating contact points
  f32 dep[8];			// depths for those points
  f32 det1 = 1.f/(m11*m22 - m12*m21);
  m11 *= det1;
  m12 *= det1;
  m21 *= det1;
  m22 *= det1;
  s32 cnum = 0;			// number of penetrating contact points found
  for (j=0; j < n; j++) {
    f32 k1 =  m22*(ret[j*2]-c1) - m12*(ret[j*2+1]-c2);
    f32 k2 = -m21*(ret[j*2]-c1) + m11*(ret[j*2+1]-c2);
    for (i=0; i<3; i++) point[cnum*3+i] =
			  center[i] + k1*Rb[i+a1*4] + k2*Rb[i+a2*4];
    dep[cnum] = Sa[codeN] - dDOT(normal2,point+cnum*3);
    if (dep[cnum] >= 0) {
      ret[cnum*2] = ret[j*2];
      ret[cnum*2+1] = ret[j*2+1];
      cnum++;
    }
  }
  if (cnum < 1) return 0;	// this should never happen

  // we can't generate more contacts than we actually have
  if (maxc > cnum) maxc = cnum;
  if (maxc < 1) maxc = 1;

  if (cnum <= maxc) {
    // we have less contacts than we need, so we use them all
    for (j=0; j < cnum; j++) {
      cContact *con = &contact[j];
      for (i=0; i<3; i++) con->pos[i] = point[j*3+i] + pa[i];
      con->depth = dep[j];
    }
  }
  else {
    // we have more contacts than are wanted, some of them must be culled.
    // find the deepest point, it is always the first contact.
    s32 i1 = 0;
    f32 maxdepth = dep[0];
    for (i=1; i<cnum; i++) {
      if (dep[i] > maxdepth) {
	maxdepth = dep[i];
	i1 = i;
      }
    }

    // select maxc points, starting with the deepest one and spreading the
    // rest around the contact polygon. simply taking the first maxc points
    // could drop the deepest contact.
    s32 iret[8];
    cullPoints (cnum,ret,maxc,i1,iret);

    for (j=0; j < maxc; j++) {
      cContact *con = &contact[j];
      for (i=0; i<3; i++) con->pos[i] = point[iret[j]*3+i] + pa[i];
      con->depth = dep[iret[j]];
    }
    cnum = maxc;
  }

  return cnum;
}