/*
 * Deposit the current of a buffer of 3D virtual particles onto the grid,
 * processing two particles per iteration with SSE2 double-precision vectors.
 *
 *   current - base of the current grid; every cell holds three doubles
 *             (j1, j2, j3)
 *   size    - size[0] is used as the stride (in cells) between rows and
 *             size[0]*size[1] as the stride between planes
 *   offset  - per-direction offsets; (offset[d] - OFFSET) cells are added to
 *             the grid base in each direction
 *   norm    - per-direction factors multiplied into the particle charge
 *   part    - virtual particle buffer; np particles are read from part->buf
 *
 * SPLINE, WL and LOAD2VP3D are defined elsewhere; their outputs are used here
 * as the spline weights, longitudinal weights and unpacked particle data.
 */
inline void DEP_CURRENT_3D
(double * const current, int const * const size, int const * const offset,
 double * const norm, t_vpbuf3D * const part)
{
  typedef struct Current { double j1, j2, j3; } t_current;

  // Weights: longitudinal (per direction) and perpendicular (per plane)
  dvec vwl1[ORDER], vwl2[ORDER], vwl3[ORDER];
  dvec vwp1[NP][NP], vwp2[NP][NP], vwp3[NP][NP];

  // Spline weights at the initial (0) and final (1) positions
  __m128d vs0x[NP], vs1x[NP], vs0y[NP], vs1y[NP], vs0z[NP], vs1z[NP];

  // Data of the two particles being processed
  __m128d vx0, vx1, vy0, vy1, vz0, vz1, vq;
  __m128d vqnx, vqny, vqnz;
  DECLARE_ALIGNED_16( int ix[4] );
  DECLARE_ALIGNED_16( int iy[4] );
  DECLARE_ALIGNED_16( int iz[4] );

  // Grid strides, in cells
  int const strideY = size[0];
  int const strideZ = strideY * size[1];

  // Grid origin shifted by the requested offsets (3 doubles per cell)
  t_current * const grid = (t_current *) ( current + ( offset[0] - OFFSET ) * 3 +
                                                     ( offset[1] - OFFSET ) * 3 * strideY +
                                                     ( offset[2] - OFFSET ) * 3 * strideZ );

  __m128d const vnorm1  = _mm_set1_pd( norm[0] );
  __m128d const vnorm2  = _mm_set1_pd( norm[1] );
  __m128d const vnorm3  = _mm_set1_pd( norm[2] );
  __m128d const oneHalf = _mm_set1_pd( 0.5 );

  int const nparts = part -> np;

  // An odd particle count is completed with zero-charge dummy particles so
  // that the main loop can always load a full pair
  if ( nparts % 2 != 0 ) {
    int const npad = 2 - nparts % 2;
    for ( int n = 0; n < npad; n++ ) {
      t_vp3D * dummy = (t_vp3D *) part -> p + n;

      dummy -> x1 = 0.;  dummy -> x0 = dummy -> x1;
      dummy -> y1 = 0.;  dummy -> y0 = dummy -> y1;
      dummy -> z1 = 0.;  dummy -> z0 = dummy -> z1;
      dummy -> q  = 0.;
      dummy -> iz = 1;
      dummy -> iy = dummy -> iz;
      dummy -> ix = dummy -> iy;
    }
  }

  double * vp = (double *) part -> buf;

  // Each pair of particles takes 2 * 10 doubles in the buffer
  for ( int ip = 0; ip < nparts; ip += 2, vp += 2 * 10 ) {

    // Unpack positions, charge and cell indices of 2 particles
    LOAD2VP3D( vp, vx0, vx1, vy0, vy1, vz0, vz1, vq, ix, iy, iz );

    // Spline weights at both ends of the motion
    SPLINE( vx0, vs0x );  SPLINE( vx1, vs1x );
    SPLINE( vy0, vs0y );  SPLINE( vy1, vs1y );
    SPLINE( vz0, vs0z );  SPLINE( vz1, vs1z );

    // Charge scaled by the per-direction normalization
    vqnx = _mm_mul_pd( vq, vnorm1 );
    vqny = _mm_mul_pd( vq, vnorm2 );
    vqnz = _mm_mul_pd( vq, vnorm3 );

    // Longitudinal weights
    WL( vqnx, vx0, vx1, (__m128d *) vwl1 );
    WL( vqny, vy0, vy1, (__m128d *) vwl2 );
    WL( vqnz, vz0, vz1, (__m128d *) vwl3 );

    // Perpendicular weights, for each pair of transverse directions (a,b):
    //   wp[m][n] = s0a[n]*s0b[m] + s1a[n]*s1b[m]
    //            + 0.5*( s0a[n]*s1b[m] + s1a[n]*s0b[m] )
    for ( int m = 0; m < NP; m++ ) {
      for ( int n = 0; n < NP; n++ ) {
        __m128d same, cross;

        // (y,z) plane -> used for j1
        same  = _mm_add_pd( _mm_mul_pd( vs0y[n], vs0z[m] ), _mm_mul_pd( vs1y[n], vs1z[m] ) );
        cross = _mm_add_pd( _mm_mul_pd( vs0y[n], vs1z[m] ), _mm_mul_pd( vs1y[n], vs0z[m] ) );
        vwp1[m][n].v2 = _mm_add_pd( same, _mm_mul_pd( cross, oneHalf ) );

        // (x,z) plane -> used for j2
        same  = _mm_add_pd( _mm_mul_pd( vs0x[n], vs0z[m] ), _mm_mul_pd( vs1x[n], vs1z[m] ) );
        cross = _mm_add_pd( _mm_mul_pd( vs0x[n], vs1z[m] ), _mm_mul_pd( vs1x[n], vs0z[m] ) );
        vwp2[m][n].v2 = _mm_add_pd( same, _mm_mul_pd( cross, oneHalf ) );

        // (x,y) plane -> used for j3
        same  = _mm_add_pd( _mm_mul_pd( vs0x[n], vs0y[m] ), _mm_mul_pd( vs1x[n], vs1y[m] ) );
        cross = _mm_add_pd( _mm_mul_pd( vs0x[n], vs1y[m] ), _mm_mul_pd( vs1x[n], vs0y[m] ) );
        vwp3[m][n].v2 = _mm_add_pd( same, _mm_mul_pd( cross, oneHalf ) );
      }
    }

    // Scatter to the grid one particle (vector lane) at a time
    for ( int lane = 0; lane < 2; lane++ ) {

      t_current * const cell = grid + ix[lane] + iy[lane] * strideY + iz[lane] * strideZ;

      // j1: ORDER points along x, NP points along y and z
      for ( int kz = 0; kz < NP; kz++ ) {
        for ( int ky = 0; ky < NP; ky++ ) {
          t_current * const row = cell + ky*strideY + kz*strideZ;
          for ( int kx = 0; kx < ORDER; kx++ ) {
            row[kx].j1 += vwl1[kx].v[lane] * vwp1[kz][ky].v[lane];
          }
        }
      }

      // j2: ORDER points along y, NP points along x and z
      for ( int kz = 0; kz < NP; kz++ ) {
        for ( int ky = 0; ky < ORDER; ky++ ) {
          t_current * const row = cell + ky*strideY + kz*strideZ;
          for ( int kx = 0; kx < NP; kx++ ) {
            row[kx].j2 += vwl2[ky].v[lane] * vwp2[kz][kx].v[lane];
          }
        }
      }

      // j3: ORDER points along z, NP points along x and y
      for ( int kz = 0; kz < ORDER; kz++ ) {
        for ( int ky = 0; ky < NP; ky++ ) {
          t_current * const row = cell + ky*strideY + kz*strideZ;
          for ( int kx = 0; kx < NP; kx++ ) {
            row[kx].j3 += vwl3[kz].v[lane] * vwp3[ky][kx].v[lane];
          }
        }
      }
    }
  }
}
/*
** Accumulate potential, acceleration and the dtGrav timestep criterion for
** the particles of bucket iBucket.
**
** Phase 1: every ACTIVE particle of the bucket interacts with the three
**          interaction lists held in pkd (particles ilp, softened cells ilcs,
**          Newtonian cells ilcn).
** Phase 2: every pair of particles inside the bucket interacts directly
**          (pairs in which neither particle is ACTIVE are skipped).
**
** Parameters:
**   pkd     - context providing kdNodes, pStore, the interaction lists and
**             their lengths, and the mdl handle
**   iBucket - index into pkd->kdNodes of the bucket to process
**   iOrder  - passed to QEVAL/MEVAL and used to index nMultiFlop[5]
**             NOTE(review): no range check is done here; presumably callers
**             guarantee 0 <= iOrder <= 4 - confirm
**
** Returns an estimate of the number of floating point operations performed.
**
** NOTE(review): when NATIVE_SQRT is false the scratch arrays d2a and sqrttmp
** are used but not declared in this function; they must be visible from an
** enclosing scope - confirm where they are defined and that they are large
** enough for the longest interaction list.
*/
int pkdBucketInteract(PKD pkd,int iBucket,int iOrder) {
    const KDN * const pkdn = &pkd->kdNodes[iBucket];
    PARTICLE * p = &pkd->pStore[pkdn->pLower];
    // get vals
    int nActive = 0;
    int n = pkdn->pUpper - pkdn->pLower + 1;  /* number of particles in the bucket */
    int nPart = pkd->nPart;
    int nCellSoft = pkd->nCellSoft;
    int nCellNewt = pkd->nCellNewt;
#ifdef COMPLETE_LOCAL
    int nMultiFlop[5] = MEVAL_FLOP;
#else
    int nMultiFlop[5] = QEVAL_FLOP;
#endif
    const ILP * const ilp = pkd->ilp;
    const ILCS * const ilcs = pkd->ilcs;
    const ILCN * const ilcn = pkd->ilcn;
    int i;

    /*
    ** Now process the interaction lists (particles, softened cells and
    ** Newtonian cells) for each active particle.
    */
    for (i=0; i<n; ++i) {
        int j;
        double fPot,ax,ay,az;
        double x,y,z,dx,dy,dz,d2,h,twoh,a,b,c,d;
        double dir2,qirx,qiry,qirz,qir,tr,qir3;
        double idt2; /* reciprocal square of symmetric timestep */
        double gam[6];
        double dir;

        if (!TYPEQueryACTIVE(&(p[i]))) continue;
        ++nActive;
        /* Contributions are summed locally and added to the particle at the end */
        ax = 0.0;
        ay = 0.0;
        az = 0.0;
        fPot = 0.0;
        x = p[i].r[0];
        y = p[i].r[1];
        z = p[i].r[2];
        h = p[i].fSoft;
        /*
        ** Scoring for Part (+,*)
        **  Without sqrt = (10,8)
        **  1/sqrt est.  = (6,11)
        **  SPLINEM      = (0,3)   for soft = (8,30)
        **  Total        = (16,22)            (24,49)
        **               = 38      for soft = 73
        */
#if !(NATIVE_SQRT)
        /* Squared distances go to d2a; v_sqrt1 fills sqrttmp from them in one call */
        for (j=0;j<nPart;++j) {
            dx = x - ilp[j].x;
            dy = y - ilp[j].y;
            dz = z - ilp[j].z;
            d2a[j] = dx*dx + dy*dy + dz*dz;
        }
        if (nPart>0) v_sqrt1(nPart,d2a,sqrttmp);
#endif
        for (j=0; j<nPart; ++j) {
            dx = x - ilp[j].x;
            dy = y - ilp[j].y;
            dz = z - ilp[j].z;
            twoh = h + ilp[j].h;
#if (NATIVE_SQRT)
            d2 = dx*dx + dy*dy + dz*dz;
            SPLINE(d2,twoh,a,b);
#else
            SPLINEM(sqrttmp[j],d2a[j],twoh,a,b);
#endif
            /* dtGrav keeps the largest idt2 seen; b is still unweighted by mass here */
            idt2 = (p[i].fMass + ilp[j].m)*b;
            if (idt2 > p[i].dtGrav) p[i].dtGrav = idt2;
            a *= ilp[j].m;
            b *= ilp[j].m;
            fPot -= a;
            ax -= dx*b;
            ay -= dy*b;
            az -= dz*b;
        }
        /*
        ** Scoring for CellSoft (+,*)
        **  Without sqrt = (27,29)
        **  1/sqrt est.  = (6,11)
        **  SPLINEQ      = (0,9)   for soft = (13,62)
        **  Total        = (33,49)            (46,102)
        **               = 82      for soft = 148
        */
#if !(NATIVE_SQRT)
        for (j=0;j<nCellSoft;++j) {
            dx = x - ilcs[j].x;
            dy = y - ilcs[j].y;
            dz = z - ilcs[j].z;
            d2a[j] = dx*dx + dy*dy + dz*dz;
        }
        if (nCellSoft>0) v_sqrt1(nCellSoft,d2a,sqrttmp);
#endif
        for (j=0;j<nCellSoft;++j) {
            dx = x - ilcs[j].x;
            dy = y - ilcs[j].y;
            dz = z - ilcs[j].z;
            twoh = h + ilcs[j].h;
#if (NATIVE_SQRT)
            d2 = dx*dx + dy*dy + dz*dz;
            dir = 1.0/sqrt(d2);
            SPLINEQ(dir,d2,twoh,a,b,c,d);
#else
            SPLINEQ(sqrttmp[j],d2a[j],twoh,a,b,c,d);
#endif
            /* (qirx,qiry,qirz) = symmetric tensor {xx,xy,xz,yy,yz,zz} applied to (dx,dy,dz) */
            qirx = ilcs[j].xx*dx + ilcs[j].xy*dy + ilcs[j].xz*dz;
            qiry = ilcs[j].xy*dx + ilcs[j].yy*dy + ilcs[j].yz*dz;
            qirz = ilcs[j].xz*dx + ilcs[j].yz*dy + ilcs[j].zz*dz;
            qir = 0.5*(qirx*dx + qiry*dy + qirz*dz);
            tr = 0.5*(ilcs[j].xx + ilcs[j].yy + ilcs[j].zz);
            qir3 = b*ilcs[j].m + d*qir - c*tr;
            fPot -= a*ilcs[j].m + c*qir - b*tr;
            ax -= qir3*dx - c*qirx;
            ay -= qir3*dy - c*qiry;
            az -= qir3*dz - c*qirz;
            idt2 = (p[i].fMass + ilcs[j].m)*b;
            if (idt2 > p[i].dtGrav) p[i].dtGrav = idt2;
        }
        /*
        ** Try a cache check to improve responsiveness?
        */
        mdlCacheCheck(pkd->mdl);
        /*
        ** Scoring for CellNewt (+,*)
        **  Without sqrt = (5,13)
        **  1/sqrt est.  = (6,11)
        **  Subtotal     = (11,24)  = 35
        **  Qeval (Hex)  = 277 (see qeval.h)
        **  Total        = (85,227) = 312 Flops/Newt-Interact
        */
#if !(NATIVE_SQRT)
        for (j=0;j<nCellNewt;++j) {
            dx = x - ilcn[j].x;
            dy = y - ilcn[j].y;
            dz = z - ilcn[j].z;
            d2a[j] = dx*dx + dy*dy + dz*dz;
        }
        if (nCellNewt>0) v_sqrt1(nCellNewt,d2a,sqrttmp);
#endif
        for (j=0;j<nCellNewt;++j) {
            dx = x - ilcn[j].x;
            dy = y - ilcn[j].y;
            dz = z - ilcn[j].z;
#if (NATIVE_SQRT)
            d2 = dx*dx + dy*dy + dz*dz;
            gam[0] = 1.0/sqrt(d2);
#else
            gam[0] = sqrttmp[j];
#endif
            /* gam[k] = (2k-1) * gam[k-1] * dir2, with dir2 = gam[0]^2 */
            dir2 = gam[0]*gam[0];
            gam[1] = gam[0]*dir2;
            gam[2] = 3*gam[1]*dir2;
            gam[3] = 5*gam[2]*dir2;
            gam[4] = 7*gam[3]*dir2;
            gam[5] = 9*gam[4]*dir2;
            /* The evaluation macro is handed ax, ay, az and fPot to accumulate into */
#ifdef COMPLETE_LOCAL
            MEVAL(iOrder,ilcn[j],gam,dx,dy,dz,ax,ay,az,fPot);
#else
            QEVAL(iOrder,ilcn[j],gam,dx,dy,dz,ax,ay,az,fPot); /* RP-DEBUG: acceleration alteration #3; source of voids! */
#endif
            idt2 = (p[i].fMass + ilcn[j].m)*gam[1];
            if (idt2 > p[i].dtGrav) p[i].dtGrav = idt2;
        }
        p[i].fPot += fPot;
        p[i].a[0] += ax;
        p[i].a[1] += ay;
        p[i].a[2] += az;
        /*
        ** Try a cache check to improve responsiveness?
        */
        mdlCacheCheck(pkd->mdl);
    }
    /*
    ** Do the intra-bucket interactions.
    ** Scoring (+,*):
    **  without sqrt = (14,17)
    **  sqrt est.    = (6,11)
    **  SPLINE       = (0,3)   for soft = (8,30)
    **  Total        = (20,31)            (28,58)
    **               = 51      for soft = 86
    ** Multiplied by (n*(n-1)/2)!
    */
    for (i=0;i<n-1;++i) {
        int j;
        double dx,dy,dz,twoh,a,b,d2;
        double idt2; /* reciprocal square of symmetric timestep */
#ifdef COLLISIONS
        double repelPrefactor; /* RP 6-22-09 */
#ifdef REPEL_MARK_II
        double dvx,dvy,dvz,dxDotDv;
        double totalMass;
#endif
#endif
        for (j=i+1;j<n;++j) {
            /* Each unordered pair is visited once; only ACTIVE members are updated */
            if (!TYPEQueryACTIVE(&(p[i])) && !TYPEQueryACTIVE(&(p[j]))) continue;
            /* Separation points from particle i to particle j */
            dx = p[j].r[0] - p[i].r[0];
            dy = p[j].r[1] - p[i].r[1];
            dz = p[j].r[2] - p[i].r[2];
            d2 = dx*dx + dy*dy + dz*dz;
            twoh = p[i].fSoft + p[j].fSoft;
#ifdef COLLISIONS
            /* RP 6-22-09: In case of repel-method overlap correction, compute linear repulsive force: */
            if (pkd->bRepel && d2 < (twoh)*(twoh)) {
                mdlassert(pkd->mdl,d2 > 0.0); /* Particles not allowed to perfectly overlap */
                /*
                ** Code below, in REPEL_MARK_II, is commented pending
                ** future improvements.  It is intended to remove
                ** approach velocity from overlapping particles;
                ** however, in practice, it only removes velocities
                ** from UNAGGREGATED particles.  The edits to velocity
                ** are ignored for particles in aggs, since an update
                ** never occurs.  This is inconsistent, as a free
                ** particle may overlap an agg, leading to a loss of
                ** momentum conservation.  Perhaps this can be
                ** resolved in the future.
                */
#ifdef REPEL_MARK_II
                /*RP-DEBUG: (7/23/09) If particles are moving *toward* one another, nullify that motion */
                dvx = p[j].v[0] - p[i].v[0];
                dvy = p[j].v[1] - p[i].v[1];
                dvz = p[j].v[2] - p[i].v[2];
                dxDotDv = dx*dvx + dy*dvy + dz*dvz;
                if (dxDotDv < 0.0) /* That is, if particles are approaching one another */
                {
                    dxDotDv /= d2; /* For convenience */
                    totalMass = p[j].fMass+p[i].fMass;
                    /*
                    ** NOTE(review): each particle's velocity change is
                    ** weighted by its OWN mass fraction.  The relative
                    ** approach velocity is removed, but for unequal
                    ** masses the summed momentum change is
                    ** dx*dxDotDv*(m_i^2 - m_j^2)/totalMass, i.e. not
                    ** zero.  Swapping the two mass factors would
                    ** conserve momentum - confirm the intent before
                    ** enabling REPEL_MARK_II.
                    */
                    p[j].v[0] -= dx*dxDotDv*p[j].fMass/totalMass;
                    p[j].v[1] -= dy*dxDotDv*p[j].fMass/totalMass;
                    p[j].v[2] -= dz*dxDotDv*p[j].fMass/totalMass;
                    p[i].v[0] += dx*dxDotDv*p[i].fMass/totalMass;
                    p[i].v[1] += dy*dxDotDv*p[i].fMass/totalMass;
                    p[i].v[2] += dz*dxDotDv*p[i].fMass/totalMass;
#ifdef SLIDING_PATCH /* RP-DEBUG-dPy revision 11/5/09 */
                    /* Calculate Py based on repelled velocity */
                    /* NOTE: Uses a full timestep as the 'event time' */
                    p[i].dPy = p[i].v[1] + 2.0*pkd->PP->dOrbFreq* (p[i].r[0] - (p[i].v[0]*pkd->PP->dDelta/2.0)); /*RP-DEBUG-dPy*/
                    p[j].dPy = p[j].v[1] + 2.0*pkd->PP->dOrbFreq* (p[j].r[0] - (p[j].v[0]*pkd->PP->dDelta/2.0)); /*RP-DEBUG-dPy*/
#endif /* SLIDING_PATCH */
                }
#endif /* REPEL_MARK_II */
                /* Compute & apply repulsive force */
                /* Note: 1. pkd->dRepelFac is computed as 'k' in master.c: k = user_constant*dt^-2 */
                /*       2. Using repulsive force law: F = -m*kx */
                a = 1.0/sqrt(d2);
                b = -pkd->dRepelFac;
                /* (twoh/distance) - 1: positive inside this branch since d2 < twoh^2 */
                repelPrefactor = (twoh*a) - 1.0;
                idt2 = -(p[i].fMass + p[j].fMass)*b; /* RP-DEBUG: b = -dt^-2 now (It has the same units as below.)*/
                /* The potential still uses the plain 1/distance value held in a */
                if (TYPEQueryACTIVE(&(p[j]))) {
                    p[j].fPot -= a*p[i].fMass;
                    p[j].a[0] -= dx*repelPrefactor*b;
                    p[j].a[1] -= dy*repelPrefactor*b;
                    p[j].a[2] -= dz*repelPrefactor*b;
                    if (idt2 > p[j].dtGrav) p[j].dtGrav = idt2;
                }
                if (TYPEQueryACTIVE(&(p[i]))) {
                    p[i].fPot -= a*p[j].fMass;
                    p[i].a[0] += dx*repelPrefactor*b;
                    p[i].a[1] += dy*repelPrefactor*b;
                    p[i].a[2] += dz*repelPrefactor*b;
                    if (idt2 > p[i].dtGrav) p[i].dtGrav = idt2;
                }
            }
            else {
#endif /* COLLISIONS */
                /* Softened pairwise gravity, applied with opposite signs to the two particles */
                SPLINE(d2,twoh,a,b);
                idt2 = (p[i].fMass + p[j].fMass)*b;
                if (TYPEQueryACTIVE(&(p[j]))) {
                    p[j].fPot -= a*p[i].fMass;
                    p[j].a[0] -= dx*b*p[i].fMass;
                    p[j].a[1] -= dy*b*p[i].fMass;
                    p[j].a[2] -= dz*b*p[i].fMass;
                    if (idt2 > p[j].dtGrav) p[j].dtGrav = idt2;
                }
                if (TYPEQueryACTIVE(&(p[i]))) {
                    p[i].fPot -= a*p[j].fMass;
                    p[i].a[0] += dx*b*p[j].fMass;
                    p[i].a[1] += dy*b*p[j].fMass;
                    p[i].a[2] += dz*b*p[j].fMass;
                    if (idt2 > p[i].dtGrav) p[i].dtGrav = idt2;
                }
#ifdef COLLISIONS
            }
#endif
        }
    }
    /*
    ** Compute the nFlop estimate.
    ** Per active particle: 38 for every list particle and bucket particle,
    ** 82 per softened cell, 35 + nMultiFlop[iOrder] per Newtonian cell.
    */
    int nFlop = nActive*((nPart + n)*38 + nCellSoft*82 + nCellNewt*(35 + nMultiFlop[iOrder]));
    return(nFlop);
}
/*
 * Deposit the current of a buffer of 2D virtual particles onto the grid,
 * processing two particles per iteration with SSE2 double-precision vectors.
 *
 *   current - base of the current grid; every cell holds three doubles
 *             (j1, j2, j3)
 *   size    - size[0] is used as the stride (in cells) between grid rows
 *   offset  - per-direction offsets; (offset[d] - OFFSET) cells are added to
 *             the grid base in each direction
 *   norm    - norm[0], norm[1] are multiplied into the particle charge for
 *             the j1 and j2 deposits
 *   part    - virtual particle buffer; np particles are read from part->buf
 *
 * SPLINE, WL and LOAD2VP2D are defined elsewhere; their outputs are used here
 * as the spline weights, longitudinal weights and unpacked particle data.
 */
inline void DEP_CURRENT_2D
(double * const current, int const * const size, int const * const offset,
 double * const norm, t_vpbuf2D * const part)
{
  typedef struct Current { double j1, j2, j3; } t_current;

  // Weights: longitudinal, perpendicular, and the complete j3 contribution
  dvec vwl1[ORDER], vwl2[ORDER];
  dvec vwp1[NP], vwp2[NP];
  dvec vj3[NP][NP];

  // Spline weights at the initial (0) and final (1) positions
  __m128d vs0x[NP], vs1x[NP], vs0y[NP], vs1y[NP];

  // Data of the two particles being processed
  __m128d vx0, vx1, vy0, vy1, vq, vvz;
  __m128d vqnx, vqny, vqvz;
  DECLARE_ALIGNED_16( int ix[4] );
  DECLARE_ALIGNED_16( int iy[4] );

  // Stride between grid rows, in cells
  const int stride = size[0];

  // Grid origin shifted by the requested offsets (3 doubles per cell)
  t_current * const grid = (t_current *) ( current + ( offset[0] - OFFSET ) * 3 +
                                                     ( offset[1] - OFFSET ) * 3 * stride );

  __m128d const vnorm1   = _mm_set1_pd( norm[0] );
  __m128d const vnorm2   = _mm_set1_pd( norm[1] );
  __m128d const oneThird = _mm_set1_pd( 1.0/3.0 );
  __m128d const oneHalf  = _mm_set1_pd( 0.5 );

  int const nparts = part -> np;

  // An odd particle count is completed with zero-charge dummy particles so
  // that the main loop can always load a full pair
  if ( nparts % 2 != 0 ) {
    int const npad = 2 - nparts % 2;
    for ( int n = 0; n < npad; n++ ) {
      t_vp2D * dummy = ((t_vp2D *) part -> p) + n;

      dummy -> x1 = 0.;  dummy -> x0 = dummy -> x1;
      dummy -> y1 = 0.;  dummy -> y0 = dummy -> y1;
      dummy -> vz = 0.;  dummy -> q  = dummy -> vz;
      dummy -> iy = 1;
      dummy -> ix = dummy -> iy;
    }
  }

  double * vp = (double *) part -> buf;

  // Each pair of particles takes 16 doubles in the buffer
  for ( int ip = 0; ip < nparts; ip += 2, vp += 16 ) {

    // Unpack positions, charge, vz and cell indices of 2 particles
    LOAD2VP2D( vp, vx0, vx1, vy0, vy1, vq, vvz, ix, iy );

    // Spline weights at both ends of the motion
    SPLINE( vx0, vs0x );  SPLINE( vx1, vs1x );
    SPLINE( vy0, vs0y );  SPLINE( vy1, vs1y );

    // Charge scaled for each component; the out-of-plane one is q * vz / 3
    vqnx = _mm_mul_pd( vq, vnorm1 );
    vqny = _mm_mul_pd( vq, vnorm2 );
    vqvz = _mm_mul_pd( vq, vvz );
    vqvz = _mm_mul_pd( vqvz, oneThird );

    // Longitudinal weights
    WL( vqnx, vx0, vx1, (__m128d *) vwl1 );
    WL( vqny, vy0, vy1, (__m128d *) vwl2 );

    // Perpendicular weights: sum of initial and final spline weights
    for ( int n = 0; n < NP; n++ ) {
      vwp1[n].v2 = _mm_add_pd( vs0y[n], vs1y[n] );
      vwp2[n].v2 = _mm_add_pd( vs0x[n], vs1x[n] );
    }

    // j3 contribution, stored as vj3[x index][y index]:
    //   qvz * ( s0x*s0y + s1x*s1y + 0.5*( s0x*s1y + s1x*s0y ) )
    for ( int ky = 0; ky < NP; ky++ ) {
      for ( int kx = 0; kx < NP; kx++ ) {
        __m128d const same  = _mm_add_pd( _mm_mul_pd( vs0x[kx], vs0y[ky] ),
                                          _mm_mul_pd( vs1x[kx], vs1y[ky] ) );
        __m128d const cross = _mm_add_pd( _mm_mul_pd( vs0x[kx], vs1y[ky] ),
                                          _mm_mul_pd( vs1x[kx], vs0y[ky] ) );

        vj3[kx][ky].v2 = _mm_mul_pd( vqvz, _mm_add_pd( same, _mm_mul_pd( cross, oneHalf ) ) );
      }
    }

    // Scatter to the grid one particle (vector lane) at a time
    for ( int lane = 0; lane < 2; lane++ ) {

      t_current * const cell = grid + ix[lane] + iy[lane] * stride;

      // j1: ORDER points along x, NP points along y
      for ( int ky = 0; ky < NP; ky++ ) {
        t_current * const row = cell + ky*stride;
        for ( int kx = 0; kx < ORDER; kx++ ) {
          row[kx].j1 += vwl1[kx].v[lane] * vwp1[ky].v[lane];
        }
      }

      // j2: NP points along x, ORDER points along y
      for ( int ky = 0; ky < ORDER; ky++ ) {
        t_current * const row = cell + ky*stride;
        for ( int kx = 0; kx < NP; kx++ ) {
          row[kx].j2 += vwl2[ky].v[lane] * vwp2[kx].v[lane];
        }
      }

      // j3: NP points along both directions
      for ( int ky = 0; ky < NP; ky++ ) {
        t_current * const row = cell + ky*stride;
        for ( int kx = 0; kx < NP; kx++ ) {
          row[kx].j3 += (vj3[kx][ky]).v[lane];
        }
      }
    }
  }
}
/**
 * Rebuilds spline_ by sampling a curve through the given control points.
 *
 * Every run of four consecutive control points defines one segment. A segment
 * is sampled `division` times, where `division` is the larger of the x and y
 * distances between the segment's SPLINE values at u = 0 and u = 1, truncated
 * to an integer. Samples are taken at u = j / division for j in
 * [0, division), so u = 1 itself is never emitted and a segment spanning less
 * than one unit contributes no samples.
 *
 * The new array is built completely before it replaces spline_. On any
 * failure spline_ is left untouched, and passing the current spline_ as cp
 * is safe.
 *
 * @param cp  Array of Point objects; at least four are required.
 * @return    true when spline_ was rebuilt; false if cp is null, holds fewer
 *            than four entries, holds an entry that is not a Point, or the
 *            result array could not be created.
 */
bool CPath::makeSpline(Array* cp)
{
    if (!cp)
        return false;

    const int segmentCount = (int)cp->count() - 3;
    if (segmentCount < 1)
        return false;

    // Autoreleased: if we bail out below it is reclaimed without extra work.
    Array* samples = Array::create();
    if (!samples)
        return false;

    for (int i = 0; i < segmentCount; ++i)
    {
        // Fetch the four control points of this segment, rejecting anything
        // that is not a Point instead of dereferencing a failed cast.
        Point* ctrl[4];
        for (int k = 0; k < 4; ++k)
        {
            ctrl[k] = dynamic_cast<Point*>(cp->getObjectAtIndex(i + k));
            if (!ctrl[k])
                return false;
        }

        Point startPoint;
        Point endPoint;
        startPoint.x = SPLINE(0.f, 0.f, 0.f, ctrl[0]->x, ctrl[1]->x, ctrl[2]->x, ctrl[3]->x);
        startPoint.y = SPLINE(0.f, 0.f, 0.f, ctrl[0]->y, ctrl[1]->y, ctrl[2]->y, ctrl[3]->y);
        endPoint.x = SPLINE(1.f, 1.f, 1.f, ctrl[0]->x, ctrl[1]->x, ctrl[2]->x, ctrl[3]->x);
        endPoint.y = SPLINE(1.f, 1.f, 1.f, ctrl[0]->y, ctrl[1]->y, ctrl[2]->y, ctrl[3]->y);

        // Roughly one sample per unit of distance along the dominant axis
        // (the floating point result is truncated to int).
        const int division = MAX(fabs(startPoint.x - endPoint.x),
                                 fabs(startPoint.y - endPoint.y));

        for (int j = 0; j < division; ++j)
        {
            const float u = (float)j / division;
            const float u_2 = u * u;
            const float u_3 = u_2 * u;

            Point* curveData = new Point;
            curveData->autorelease();  // the array takes its own reference below

            // Position
            curveData->x = SPLINE(u, u_2, u_3, ctrl[0]->x, ctrl[1]->x, ctrl[2]->x, ctrl[3]->x);
            curveData->y = SPLINE(u, u_2, u_3, ctrl[0]->y, ctrl[1]->y, ctrl[2]->y, ctrl[3]->y);

            samples->addObject(curveData);
        }
    }

    // Retain the new array before releasing the old one, so the swap stays
    // safe even when cp and spline_ are the same object.
    CC_SAFE_RETAIN(samples);
    CC_SAFE_RELEASE(spline_);
    spline_ = samples;

    return true;
}
// Shader VM handler SO_pspline: the whole body is produced by the AUTOFUNC and
// SPLINE macros (defined elsewhere), instantiated here with type_point and the
// SO_pspline member of the environment m_pEnv.
// NOTE(review): presumably the point-valued spline shadeop - confirm against the macro definitions.
void CqShaderVM::SO_pspline() { AUTOFUNC; SPLINE( type_point, m_pEnv->SO_pspline ); }
// Shader VM handler SO_cspline: the whole body is produced by the AUTOFUNC and
// SPLINE macros (defined elsewhere), instantiated here with type_color and the
// SO_cspline member of the environment m_pEnv.
// NOTE(review): presumably the color-valued spline shadeop - confirm against the macro definitions.
void CqShaderVM::SO_cspline() { AUTOFUNC; SPLINE( type_color, m_pEnv->SO_cspline ); }
// Shader VM handler SO_fspline: the whole body is produced by the AUTOFUNC and
// SPLINE macros (defined elsewhere), instantiated here with type_float and the
// SO_fspline member of the environment m_pEnv.
// NOTE(review): presumably the float-valued spline shadeop - confirm against the macro definitions.
void CqShaderVM::SO_fspline() { AUTOFUNC; SPLINE( type_float, m_pEnv->SO_fspline ); }