void Timestep::get_transform_vectors(float A[3], float B[3], float C[3]) const { // notes: a, b, c are side lengths of the unit cell // alpha = angle between b and c // beta = angle between a and c // gamma = angle between a and b // convert from degrees to radians double cosBC = cos(DEGTORAD(alpha)); double cosAC = cos(DEGTORAD(beta)); double cosAB = cos(DEGTORAD(gamma)); double sinAB = sin(DEGTORAD(gamma)); // A will lie along the positive x axis. // B will lie in the x-y plane // The origin will be (0,0,0). float Ax = (float) (a_length); float Bx = (float) (b_length * cosAB); float By = (float) (b_length * sinAB); float Cx=0, Cy=0, Cz=0; // If sinAB is zero, then we can't determine C uniquely since it's defined // in terms of the angle between A and B. if (sinAB > 0) { Cx = (float) cosAC; Cy = (float) ((cosBC - cosAC * cosAB) / sinAB); Cz = sqrtf(1.0f - Cx*Cx - Cy*Cy); } Cx *= c_length; Cy *= c_length; Cz *= c_length; vec_zero(A); A[0] = Ax; vec_zero(B); B[0] = Bx; B[1] = By; vec_zero(C); C[0] = Cx; C[1] = Cy; C[2] = Cz; }
/*
 * Resolve a collision between two oriented boxes by testing in both
 * directions.
 *
 * box_o_o_collision(a, b) is evaluated first; if vec_zero() reports its result
 * as zero, that result is returned immediately.  Otherwise the reverse test
 * box_o_o_collision(b, a) is evaluated and inverted with vec_inv(); if that is
 * zero it is returned, else vec_smallest() picks between the two candidates.
 */
vec_t box_oriented_collision(box_t a, box_t b){
	vec_t forward;
	vec_t backward;

	forward = box_o_o_collision(a, b);
	if (vec_zero(forward))
		return forward;

	backward = vec_inv(box_o_o_collision(b, a));
	if (vec_zero(backward))
		return backward;

	return vec_smallest(forward, backward);
}
void mesh_init_vertex_normals(struct mesh *mesh) { struct polygon *p, *poly_end; struct vertex *v, *vtx_end; struct vector *poly_normal; int *idx; int nvtx; vtx_end = &mesh->vtx[mesh->nvtx]; for (v = mesh->vtx; v != vtx_end; v++) vec_zero(&v->normal); poly_end = &mesh->poly[mesh->npoly]; for (p = mesh->poly; p != poly_end; p++) { poly_normal = &p->normal; idx = p->vtx_index; nvtx = p->nvtx; while (nvtx--) vec_add_to(&mesh->vtx[*idx++].normal, poly_normal); } for (v = mesh->vtx; v != vtx_end; v++) vec_normalize(&v->normal); }
void ui_set_modelview_matrix(struct matrix *mv) { struct matrix ry, rx, rm; struct vector p, o; struct matrix c, t, r, v; mat_make_rotation_around_y(&ry, ui.rotation); mat_make_rotation_around_x(&rx, ui.tilt); mat_mul_copy(&rm, &ry, &rx); vec_zero(&o); /* look at */ vec_set(&p, 0.f, 0.f, -ui.distance); /* camera position */ mat_rotate(&p, &rm); mat_make_look_at(&c, &o, &p); vec_neg(&p); mat_make_translation_from_vec(&t, &p); mat_mul_copy(&v, &c, &t); mat_make_rotation_around_x(&r, -M_PI/2.); mat_mul_copy(mv, &v, &r); }
/*
 * Return `a` scaled by 1/vec_len(a).  A vector for which vec_zero() is true
 * is returned unchanged, so no division by its length takes place.
 */
vec_t vec_normalize(vec_t a){
	if (!vec_zero(a))
		a = vec_scale(a, 1.0/vec_len(a));
	return a;
}
void hotcold_gradient_lerp(float pucker_sum, float *rgb) { vec_zero(rgb); // set default color to black // hot to cold color map // Red (1, 0, 0) -> Yellow (1, 1, 0) -> Green (0, 1, 0) -> Cyan (0, 1, 1) -> blue (0, 0, 1) float red[3] = {1.0f, 0.0f, 0.0f}; float yellow[3] = {1.0f, 1.0f, 0.0f}; float yellow2[3] = {0.8f, 1.0f, 0.0f}; float green[3] = {0.0f, 1.0f, 0.0f}; float green2[3] = {0.6f, 1.0f, 0.0f}; float cyan[3] = {0.0f, 1.0f, 1.0f}; float cyan2[3] = {0.0f, 1.0f, 0.8f}; float blue[3] = {0.0f, 0.0f, 1.0f}; if (pucker_sum < 0.25f) { lerp_color_range(rgb, pucker_sum, 0.00f, 0.25f, red, yellow); } else if (pucker_sum < 0.45f) { vec_copy(rgb, yellow); } else if (pucker_sum < 0.55f) { lerp_color_range(rgb, pucker_sum, 0.45f, 0.55f, yellow, green2); } else if (pucker_sum < 0.75f) { lerp_color_range(rgb, pucker_sum, 0.55f, 0.75f, green, cyan2); } else { lerp_color_range(rgb, pucker_sum, 0.75f, 1.00f, cyan, blue); } clamp_color(rgb); // clamp color values to legal range }
void hotcold_gradient(float pucker_sum, float *rgb) { vec_zero(rgb); // set default color to black // hot to cold color map // Red (1, 0, 0) -> Yellow (1, 1, 0) -> Green (0, 1, 0) -> Cyan (0, 1, 1) -> blue (0, 0, 1) -> magenta (1, 0, 1) if (pucker_sum < 0.40f) { //MK - envelopes here rgb[0] = 1.0f; // red rgb[1] = pucker_sum * 2.5f; // MK from red increasing green -> yellow - adjusted multiplier for large range rgb[2] = 0.0f; } else if (pucker_sum < 0.56f) { rgb[0] = 1.0f - (pucker_sum - 0.40f) * 6.25f; // from Yellow, decrease red -> green adjusted multiplier for small range rgb[1] = 1.0f; rgb[2] = 0.0f; } else if (pucker_sum < 0.64f) { rgb[0] = 0.0f; rgb[1] = 1.0f; //green rgb[2] = (pucker_sum - 0.56f) * 12.5f; // from green, increasing blue -> cyan, adjusted multiplier for small range } else if (pucker_sum < 0.76f) { rgb[0] = 0.0f; rgb[1] = 1.0f - (pucker_sum - 0.64f) * 5.0f; // from cyan, decrease green -> blue, adjusted multiplier for small range rgb[2] = 1.0f; } else { rgb[0] = (pucker_sum - 0.76f) * 0.8f; // from blue, increase red to get magenta, adjusted multiplier for very large range rgb[1] = 0.0f; rgb[2] = 1.0f; } clamp_color(rgb); // clamp color values to legal range }
/* Multiply two n x n matrices, C = A * B, after transposing B in place.
 *
 * Transposing B serves two purposes:
 *
 * - both operands are then walked along rows, which improves cache hits
 * - contiguous rows make it possible to use GCC vector extensions, which
 *   increases the likelihood that SIMD instructions are generated
 *
 * Note that GCC 6 at -O3 is smart enough to use SIMD even for the naive CPU
 * method.  However this version was still way faster.
 *
 * B is declared const but is temporarily modified: it is transposed on entry
 * and transposed back before returning.  `cache` is unused.
 */
void mat_mul_cpu_trans_vec(const F *A, const F *B, F *C, size_t n, Cache *cache) {
    /* number of leading elements per row that fit in whole vectors */
    const size_t vec_end = (n / VECTOR_NELEMS) * VECTOR_NELEMS;
    size_t row, col, k;
    Vec acc, va, vb;
    F tail;

    UNUSED(cache);
    mat_trans((F*)B, n);

    for (row = 0; row < n; ++row) {
        const size_t a_off = row * n;
        for (col = 0; col < n; ++col) {
            const size_t b_off = col * n;

            /* vectorized part of the dot product */
            vec_zero(&acc, VECTOR_NELEMS);
            for (k = 0; k < vec_end; k += VECTOR_NELEMS) {
                vec_load(&va, VECTOR_NELEMS, A, a_off + k);
                vec_load(&vb, VECTOR_NELEMS, B, b_off + k);
                acc += va * vb;
            }

            /* scalar remainder when n is not a multiple of the vector width */
            tail = 0.0;
            while (k < n) {
                tail += A[a_off + k] * B[b_off + k];
                ++k;
            }

            C[a_off + col] = vec_sum(acc, VECTOR_NELEMS) + tail;
        }
    }

    /* restore B to its original layout */
    mat_trans((F*)B, n);
}
// Calculate Cremer-Pople Pucker Parameters and convert these to a ring colour void cremer_pople_ring_color(SmallRing &ring, float *framepos, float *rgb) { int N = ring.num(); //the number of atoms in the current ring float *xring = new float[N]; float *yring = new float[N]; float *zring = new float[N]; float *displ = new float[N]; float *q = new float[N]; float *phi = new float[N]; float Q; int m; float *atompos; int curatomid; vec_zero(rgb); // set default color to black for (int i=0; i<N; i++) { curatomid = ring[i]; atompos = framepos + 3*curatomid; // pointer arithmetic is evil :) xring[i] = atompos[0]; yring[i] = atompos[1]; zring[i] = atompos[2]; } atom_displ_from_mean_plane(xring, yring, zring, displ, N); if (N==6) { //special case - pyranose rings if (cremer_pople_params(N, displ, q, phi, m, Q)) { float cosTheta = q[2]/Q; float theta = acosf(cosTheta); float sinTheta = sinf(theta); // Q is the puckering amplitude - i.e. the intensity of the pucker. // multiply by Q to show intensity, particularly for rings with // little pucker (black) // NOTE -using abs - polar positions therefore equivalent float intensity = Q; rgb[0] = fabsf(sinTheta)*intensity; rgb[1] = fabsf(cosTheta)*intensity; rgb[2] = fabsf(sinf(3.0f*phi[1])*sinTheta)*intensity; } } else if (N==5) { //special case - furanose rings if (cremer_pople_params(N, displ, q, phi, m, Q)) { rgb[0] = 0; rgb[1] = 0; rgb[2] = Q; } } // clamp color values to legal range clamp_color(rgb); delete [] xring; delete [] yring; delete [] zring; delete [] displ; delete [] q; delete [] phi; }
// call this function from a level to enable the free camera movement void def_move() { VECTOR force,speed,dist; ANGLE aforce,aspeed; // initialize speed and distance vec_zero(speed); vec_zero(aspeed); vec_zero(dist); if (1 > def_camera) def_camera = 1; if (1 < run_mode && run_mode < 5) def_camera = 2; // prevent player movement in entity viewer mode while (def_camera) { aforce.tilt = 5*(key_pgup - key_pgdn + mouse_right*mouse_force.y); if (key_alt==0) { aforce.pan = -5*(key_force.x + mouse_right*mouse_force.x + joy_force.x); aforce.roll = 0; } else { aforce.pan = 0; aforce.roll = 5*(key_force.x + mouse_right*mouse_force.x + joy_force.x); } vec_add(&camera->pan,vec_accelerate(&dist,&aspeed,&aforce,0.8)); force.x = 7*(key_force.y + key_w - key_s + joy_force.y); force.y = 3*(key_comma - key_period + key_a - key_d); force.z = 3*(key_home - key_end); vec_accelerate(&dist,&speed,&force,0.5); if (NULL != player && 1 == def_camera) { c_move(player,&dist,nullvector,IGNORE_PASSABLE|IGNORE_PASSENTS|GLIDE); camera->genius = player; vec_set(&player->pan,&camera->pan); vec_set(&camera->x,nullvector); vec_rotate(&camera->x,&camera->pan); vec_add(&camera->x,&player->x); } else { camera->genius = NULL; vec_add(&camera->x,vec_rotate(&dist,&camera->pan)); } wait(1); } }
int BSpline::getParamPoint(float t,float outp[3]) const { if (t<V[0] || t>V[1]) { vec_zero(outp); return 0; } float *b; b=new float[K+1]; int i,j; float *btemp; btemp=new float[1+K+M]; for (i=0; i<=K+M; i++) { if (t>=T[i] && t<=T[i+1]) btemp[i]=1; else if (T[i]==T[i+1] && t==T[i]) btemp[i]=1; else btemp[i]=0; } for (j=1; j<=M; j++) { for (i=0; i<=K+M-j; i++) { float c=0; if (T[i+j]!=T[i]) { c=btemp[i]*(t-T[i])/(T[i+j]-T[i]); } if (T[i+j+1]!=T[i+1]) { c+= btemp[i+1]*(T[i+j+1]-t)/(T[i+j+1]-T[i+1]); } btemp[i]=c; } } for (i=0; i<=K; i++) { b[i]=btemp[i]; } delete []btemp; float denom=0; outp[0]=0; outp[1]=0; outp[2]=0; for (i=0; i<=K; i++) { float r=W[i]*b[i]; outp[0]+= P[i][0]*r; outp[1]+= P[i][1]*r; outp[2]+= P[i][2]*r; denom+=r; } outp[0]/=denom; outp[1]/=denom; outp[2]/=denom; delete []b; return 1; }
/*
 * One-time initialisation of the module state: on the first call only,
 * clears cstatus, cart and the n_endeffs entries of ctarget starting at
 * index 1.  Later calls do nothing.
 */
static void init_vars(void)
{
  if (!firsttime)
    return;

  firsttime = FALSE;

  ivec_zero(cstatus);
  vec_zero(cart);
  bzero((char *)&(ctarget[1]),n_endeffs*sizeof(ctarget[1]));
}
//----------------------------------------------------------------------------------------------------------- // View <-> Sphere BUGGY //----------------------------------------------------------------------------------------------------------- int sc_physics_intersectViewSphere(VIEW* inView,VECTOR* inPos,var inRadius) { VECTOR vcTemp1,vcTemp2,vcTemp3; vec_zero(vcTemp1); vec_zero(vcTemp2); vec_zero(vcTemp3); var vTemp1=0; var vTemp2=0; var vTemp3=0; vec_for_angle(vcTemp1,inView.pan); vec_diff(vcTemp2,inPos,inView.x); vTemp1=vec_dot(vcTemp1,vcTemp2); vec_set(vcTemp3,inView.x); vec_scale(vcTemp1,vTemp1); vec_add(vcTemp3,vcTemp1); vec_diff(vcTemp1,inPos,vcTemp3); vTemp2=vec_length(vcTemp1); vTemp1=tanv(atanv( sqrt(2) * tanv(inView.arc/2) ))*vTemp1; return (vTemp2-vTemp1>inRadius); /* VECTOR viewDir; vec_for_angle(viewDir, inView.pan); //vec_scale(viewDir, 5); return sc_physics_intersectConeSphere (inView.x, viewDir, inView.arc, inPos, inRadius ); */ }
/* Compute the first and (optionally) second derivatives with respect
   to the scale parameters for the single-feature log likelihood
   function (feat_compute_log_likelihood).  This version assumes scale
   parameters for the whole tree and for the subtree.

   The per-column derivatives returned by col_scale_derivs_subtree are
   summed over the columns of the feature, from d->feat->start-1 up to
   (but not including) d->feat->end.

   Return value is the log likelihood, which is computed as a by-product.
   Derivs are stored in *gradient and *hessian; both are zeroed first.
   If hessian == NULL, it will not be computed (saves some time).

   The temporary per-column gradient and Hessian are released before
   returning, so the function no longer leaks them on every call. */
double ff_scale_derivs_subtree(FeatFitData *d, Vector *gradient,
                               Matrix *hessian, double ***scratch) {
  double retval = 0;
  Vector *d1 = vec_new(2);                                  /* per-column gradient */
  Matrix *d2 = (hessian == NULL ? NULL : mat_new(2, 2));    /* per-column Hessian */
  int i;

  vec_zero(gradient);
  if (hessian != NULL) mat_zero(hessian);

  for (i = d->feat->start-1; i < d->feat->end; i++) { /* offset of one */
    d->cdata->tupleidx = d->cdata->msa->ss->tuple_idx[i];
    retval += col_scale_derivs_subtree(d->cdata, d1, d2, scratch);
    vec_plus_eq(gradient, d1);
    if (hessian != NULL) mat_plus_eq(hessian, d2);
  }

  /* release the per-column temporaries */
  vec_free(d1);
  if (d2 != NULL) mat_free(d2);

  return retval;
}
/*
 * Return the centre of the atoms in selection `sel` of system `sys`:
 * the atom coordinates are summed and the sum is scaled by
 * 1 / sel_get_count(sel).  An empty selection yields the zero vector.
 *
 * The original text contained the mis-decoded sequence "¢er" where
 * "&center" was meant (an HTML "&cent;" entity artifact); the address-of
 * expressions are restored here.
 *
 * NOTE(review): the return value of vec_scale() is not used, so this relies
 * on vec_scale() modifying its first argument in place - confirm against its
 * declaration.
 */
vec_t sys_get_sel_center(struct sys *sys, struct sel *sel)
{
	vec_t center, xyz;
	int idx;

	center = vec_zero();

	/* avoid dividing by zero below */
	if (sel_get_count(sel) == 0)
		return (center);

	sel_iter_start(sel);

	while (sel_iter_next(sel, &idx)) {
		xyz = sys_get_atom_xyz(sys, idx);
		center = vec_add(&center, &xyz);
	}

	vec_scale(&center, 1.0 / sel_get_count(sel));

	return (center);
}
/*
 * Advance every moving particle of the world by one step and resolve its
 * collisions against solid particles.
 *
 * For each moving particle p: its move() callback is invoked; if
 * particle_collides(p) holds, every solid particle q returned by
 * world_next_solid() is tested.  When the boxes overlap (non-zero
 * displacement d from box_oriented_collision), the velocity is adjusted per
 * axis, the optional collide() callback is invoked and p's box is moved
 * by d.
 *
 * Per axis: a non-zero displacement component reverses and halves the
 * velocity on that axis; a zero component damps the velocity on that axis
 * by 0.3.  The original damped v.x in both else-branches (copy-paste slip);
 * the y branch now damps v.y.
 */
void do_physics(world_t *w){
	int i = 0;
	particle_t *p = NULL;
	int j = 0;
	particle_t *q = NULL;
	box_t b = box_new(vec_new(0,0),10,10);
	vec_t d = vec_new(0,0);

	while((p = world_next_moving(w,&i))){
		j = 0;	/* restart the solid iteration for every moving particle */
		p->move(p);
		if(particle_collides(p)){
			while((q = world_next_solid(w,b,&j))){
				if(	q != p
					&& particle_collide_with(q,p)
					&& !vec_zero(d = box_oriented_collision(p->box,q->box)) ){
					if(d.x != 0.0){
						p->v.x = - p->v.x * 0.5 ;
					}else{
						p->v.x *= 0.3;
					}
					if(d.y != 0.0){
						p->v.y = - p->v.y * 0.5 ;
					}else{
						p->v.y *= 0.3;
					}
					if(p->collide){
						p->collide(p,q);
					}
					/* push p out of q */
					p->box.pos = vec_add(p->box.pos,d);
				}
			}
		}
	}
	/*TODO collisions and damage*/
}
int main(int argc, char **argv) { printf("N=%d, CL_DEVICE=%d\n", N, CL_DEVICE); steps = 0; totalTime = 0; // distribute bodies in space (randomly) for(int i=0; i<N; i++) { B[i].m = rand_val(8,20); // (8,20) B[i].pos = vec_rand(); B[i].v = vec_zero(); } // ocl initialization init_ocl(); // ogl stuff init_gl(&argc, argv); glutDisplayFunc(&display); atexit(&exit_cb); glutMainLoop(); return EXIT_SUCCESS; }
void nb_kernel310_ppc_altivec (int * p_nri, int iinr[], int jindex[], int jjnr[], int shift[], float shiftvec[], float fshift[], int gid[], float pos[], float faction[], float charge[], float * p_facel, float * p_krf, float * p_crf, float Vc[], int type[], int * p_ntype, float vdwparam[], float Vvdw[], float * p_tabscale, float VFtab[], float invsqrta[], float dvda[], float * p_gbtabscale, float GBtab[], int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, float * work) { vector float ix,iy,iz,shvec; vector float vfacel,tsc,fs,fs2,nul; vector float dx,dy,dz; vector float Vvdwtot,vctot,qq,iq,c6,c12,VVc,FFc; vector float fix,fiy,fiz; vector float tmp1,tmp2,tmp3,tmp4; vector float rinv,r,rinvsq,rsq,rinvsix,Vvdw6,Vvdw12; int n,k,ii,is3,ii3,ntiA,nj0,nj1; int jnra,jnrb,jnrc,jnrd; int j3a,j3b,j3c,j3d; int nri, ntype, nouter, ninner; int tja,tjb,tjc,tjd; #ifdef GMX_THREADS int nn0, nn1; #endif nouter = 0; ninner = 0; nri = *p_nri; ntype = *p_ntype; nul=vec_zero(); vfacel=load_float_and_splat(p_facel); tsc=load_float_and_splat(p_tabscale); #ifdef GMX_THREADS nthreads = *p_nthreads; do { gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx); nn0 = *count; nn1 = nn0+(nri-nn0)/(2*nthreads)+3; *count = nn1; gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx); if(nn1>nri) nn1=nri; for(n=nn0; (n<nn1); n++) { #if 0 } /* maintain correct indentation even with conditional left braces */ #endif #else /* without gmx_threads */ for(n=0;n<nri;n++) { #endif is3 = 3*shift[n]; shvec = load_xyz(shiftvec+is3); ii = iinr[n]; ii3 = 3*ii; ix = load_xyz(pos+ii3); Vvdwtot = nul; vctot = nul; fix = nul; fiy = nul; fiz = nul; ix = vec_add(ix,shvec); nj0 = jindex[n]; nj1 = jindex[n+1]; splat_xyz_to_vectors(ix,&ix,&iy,&iz); ntiA = 2*ntype*type[ii]; iq = vec_madd(load_float_and_splat(charge+ii),vfacel,nul); for(k=nj0; k<(nj1-3); k+=4) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = jjnr[k+2]; jnrd = jjnr[k+3]; j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; j3d = 3*jnrd; 
transpose_4_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b), load_xyz(pos+j3c), load_xyz(pos+j3d),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); rinv = do_invsqrt(rsq); rinvsq = vec_madd(rinv,rinv,nul); r = vec_madd(rinv,rsq,nul); rinvsix = vec_madd(rinvsq,rinvsq,nul); rinvsix = vec_madd(rinvsix,rinvsq,nul); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; tjc = ntiA+2*type[jnrc]; tjd = ntiA+2*type[jnrd]; qq = vec_madd(load_4_float(charge+jnra,charge+jnrb, charge+jnrc,charge+jnrd),iq,nul); load_4_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,vdwparam+tjd,&c6,&c12); do_4_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc); fs2 = vec_madd(qq,FFc,nul); /* fijC */ vctot = vec_madd(qq,VVc,vctot); Vvdw6 = vec_madd(c6,rinvsix,nul); Vvdw12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul), nul); fs = vec_madd(vec_twelve(),Vvdw12,nul); fs = vec_nmsub(vec_six(),Vvdw6,fs); fs = vec_madd(fs,rinv,nul); Vvdwtot = vec_add(Vvdwtot,Vvdw12); fs = vec_nmsub(fs2,tsc,fs); fs = vec_madd(fs,rinv,nul); Vvdwtot = vec_sub(Vvdwtot,Vvdw6); fix = vec_madd(fs,dx,fix); /* +=fx */ fiy = vec_madd(fs,dy,fiy); /* +=fy */ fiz = vec_madd(fs,dz,fiz); /* +=fz */ dx = vec_nmsub(dx,fs,nul); /* -fx */ dy = vec_nmsub(dy,fs,nul); /* -fy */ dz = vec_nmsub(dz,fs,nul); /* -fz */ transpose_3_to_4(dx,dy,dz,&tmp1,&tmp2,&tmp3,&tmp4); add_xyz_to_mem(faction+j3a,tmp1); add_xyz_to_mem(faction+j3b,tmp2); add_xyz_to_mem(faction+j3c,tmp3); add_xyz_to_mem(faction+j3d,tmp4); } if(k<(nj1-1)) { jnra = jjnr[k]; jnrb = jjnr[k+1]; j3a = 3*jnra; j3b = 3*jnrb; transpose_2_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); zero_highest_2_elements_in_vector(&rsq); rinv = do_invsqrt(rsq); zero_highest_2_elements_in_vector(&rinv); rinvsq = vec_madd(rinv,rinv,nul); r = vec_madd(rinv,rsq,nul); 
rinvsix = vec_madd(rinvsq,rinvsq,nul); rinvsix = vec_madd(rinvsix,rinvsq,nul); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul); load_2_pair(vdwparam+tja,vdwparam+tjb,&c6,&c12); do_2_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc); fs2 = vec_madd(qq,FFc,nul); /* fijC */ vctot = vec_madd(qq,VVc,vctot); Vvdw6 = vec_madd(c6,rinvsix,nul); Vvdw12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul), nul); fs = vec_madd(vec_twelve(),Vvdw12,nul); fs = vec_nmsub(vec_six(),Vvdw6,fs); Vvdwtot = vec_add(Vvdwtot,Vvdw12); fs = vec_madd(fs,rinv,nul); fs = vec_nmsub(fs2,tsc,fs); fs = vec_madd(fs,rinv,nul); Vvdwtot = vec_sub(Vvdwtot,Vvdw6); fix = vec_madd(fs,dx,fix); /* +=fx */ fiy = vec_madd(fs,dy,fiy); /* +=fy */ fiz = vec_madd(fs,dz,fiz); /* +=fz */ dx = vec_nmsub(dx,fs,nul); /* -fx */ dy = vec_nmsub(dy,fs,nul); /* -fy */ dz = vec_nmsub(dz,fs,nul); /* -fz */ transpose_3_to_2(dx,dy,dz,&tmp1,&tmp2); add_xyz_to_mem(faction+j3a,tmp1); add_xyz_to_mem(faction+j3b,tmp2); k += 2; } if((nj1-nj0) & 0x1) { jnra = jjnr[k]; j3a = 3*jnra; transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); zero_highest_3_elements_in_vector(&rsq); rinv = do_invsqrt(rsq); zero_highest_3_elements_in_vector(&rinv); rinvsq = vec_madd(rinv,rinv,nul); r = vec_madd(rinv,rsq,nul); rinvsix = vec_madd(rinvsq,rinvsq,nul); rinvsix = vec_madd(rinvsix,rinvsq,nul); tja = ntiA+2*type[jnra]; qq = vec_madd(load_1_float(charge+jnra),iq,nul); load_1_pair(vdwparam+tja,&c6,&c12); do_1_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc); fs2 = vec_madd(qq,FFc,nul); /* fijC */ vctot = vec_madd(qq,VVc,vctot); Vvdw6 = vec_madd(c6,rinvsix,nul); Vvdw12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul), nul); fs = vec_madd(vec_twelve(),Vvdw12,nul); fs = vec_nmsub(vec_six(),Vvdw6,fs); fs = vec_madd(fs,rinv,nul); Vvdwtot = vec_add(Vvdwtot,Vvdw12); fs = 
vec_nmsub(fs2,tsc,fs); fs = vec_madd(fs,rinv,nul); Vvdwtot = vec_sub(Vvdwtot,Vvdw6); fix = vec_madd(fs,dx,fix); /* +=fx */ fiy = vec_madd(fs,dy,fiy); /* +=fy */ fiz = vec_madd(fs,dz,fiz); /* +=fz */ dx = vec_nmsub(dx,fs,nul); /* -fx */ dy = vec_nmsub(dy,fs,nul); /* -fy */ dz = vec_nmsub(dz,fs,nul); /* -fz */ transpose_3_to_1(dx,dy,dz,&tmp1); add_xyz_to_mem(faction+j3a,tmp1); } /* update outer data */ transpose_3_to_4(fix,fiy,fiz,&tmp1,&tmp2,&tmp3,&tmp4); tmp1 = vec_add(tmp1,tmp3); tmp2 = vec_add(tmp2,tmp4); tmp1 = vec_add(tmp1,tmp2); add_xyz_to_mem(faction+ii3,tmp1); add_xyz_to_mem(fshift+is3,tmp1); add_vector_to_float(Vc+gid[n],vctot); add_vector_to_float(Vvdw+gid[n],Vvdwtot); ninner += nj1 - nj0; } #ifdef GMX_THREADS nouter += nn1 - nn0; } while (nn1<nri); #else nouter = nri; #endif *outeriter = nouter; *inneriter = ninner; }
/*
 * Block matrix-multiply kernel written with register-level vec_* macros.
 *
 * Each element of C is a dot product of length KB between a row of A (rows
 * are KB floats apart) and a column of B (columns are KB floats apart).
 * C is handled two columns at a time through pC0 and pC1, which are
 * (ldc SHIFT) apart; the column loop runs until pB0 reaches B + NB*KB.
 *
 *   - if M >= 2, rows of A are processed in pairs, giving a 2x2 tile of C
 *     per inner pass (reg0..reg3 are the four accumulators)
 *   - if M is odd, the last row is processed separately, giving a 1x2 tile
 *     (reg0 and reg1 are the accumulators)
 *
 * The initial accumulator value depends on the compile-time beta mode:
 *   BETA0 - zero; BETA1 - the existing value of C; otherwise beta * C.
 *
 * The k loop is unrolled by 16 elements and followed by a fixed tail of 4
 * elements; the pointer increments after it (e.g. incBm = -KB+4) assume the
 * loop advanced pA0/pB0 by exactly KB-4, i.e. that KB-4 is a multiple of 16.
 *
 * N, K, alpha, lda and ldb are not referenced in the body, and neither are
 * the locals i, j and zerovec.
 *
 * NOTE(review): the offsets step by 2 floats, so a vector register
 * presumably holds two floats - confirm against the vec_* macro definitions.
 * SHIFT, KB, NB, reg0..reg7 and all vec_* operations are defined elsewhere.
 */
void ATL_USERMM
(const int M, const int N, const int K, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc)

{
  /*--- architecture specific declarations ---*/

  /*--- program specific declarations ---*/
  int i, j, k;
  vector betavec;
  vector zerovec = {0.0,0.0};
  const float *pA0 = A;
  const float *pB0 = B;
  float *pC0 = C;
  float *pC1 = C+(ldc SHIFT);
  /* end markers and pointer increments for the paired-row pass */
  const float *stM = A + (M-M%2)*KB;
  const float *stN = B + NB*KB;
  const int incAm = 2*KB-KB+4;
  const int incBm = -KB+4;
  const int incCm = (2 SHIFT);
  const int incAn = -(M-M%2)*KB;
  const int incBn = 2*KB;
  const int incCn = ((ldc*2-(M-M%2)) SHIFT);
  /* the same for the single leftover row when M is odd */
  const int incAm_m = KB-KB+4;
  const int incAn_m = -(M%2)*KB;
  const int incCn_m = (ldc*2-(M%2))SHIFT;
  const float *stM_m = A + M*KB;

  /*--- initial architecture specific statements ---*/
  vec_enter();

  /*--- main program statements ---*/
  /* keep beta in betavec so it can be reloaded into a register later */
  vec_mov_mr_1(&beta,reg0);
  vec_mov_rm(reg0,betavec);

  if (M>=2) {
    do /* N-loop */
    {
      do /* M-loop */
      {
        /* initialise the four accumulators according to the beta mode */
#ifdef BETA0
        vec_zero(reg0);
        vec_zero(reg1);
        vec_zero(reg2);
        vec_zero(reg3);
#elif defined(BETA1)
        vec_mov_mr_1(pC0,reg0);
        vec_mov_mr_1(pC0+(1 SHIFT),reg1);
        vec_mov_mr_1(pC1,reg2);
        vec_mov_mr_1(pC1+(1 SHIFT),reg3);
#else
        vec_mov_mr(betavec,reg7);
        vec_mov_mr_1(pC0,reg0);
        vec_mul_rr(reg7,reg0);
        vec_mov_mr_1(pC0+(1 SHIFT),reg1);
        vec_mul_rr(reg7,reg1);
        vec_mov_mr_1(pC1,reg2);
        vec_mul_rr(reg7,reg2);
        vec_mov_mr_1(pC1+(1 SHIFT),reg3);
        vec_mul_rr(reg7,reg3);
#endif
        /* prologue: first products for column 0, raw A values for column 1 */
        vec_mov_mr(pA0,reg4);
        vec_mul_mr(pB0,reg4);
        vec_mov_mr(pA0+KB,reg5);
        vec_mul_mr(pB0,reg5);
        vec_mov_mr(pA0,reg6);
        vec_mov_mr(pA0+KB,reg7);
        align();
        /* steady state: each add is interleaved with the loads and
           multiplies for the following step */
        for (k=0; k<KB-4; k+=16) {
          vec_add_rr(reg4,reg0);
          vec_mov_mr(pA0+2,reg4);
          vec_mul_mr(pB0+KB,reg6);
          vec_add_rr(reg5,reg1);
          vec_mov_mr(pA0+2+KB,reg5);
          vec_mul_mr(pB0+KB,reg7);
          vec_add_rr(reg6,reg2);
          vec_mov_mr(pA0+2,reg6);
          vec_mul_mr(pB0+2,reg4);
          vec_add_rr(reg7,reg3);
          vec_mov_mr(pA0+2+KB,reg7);
          vec_mul_mr(pB0+2,reg5);
          vec_add_rr(reg4,reg0);
          vec_mov_mr(pA0+4,reg4);
          vec_mul_mr(pB0+2+KB,reg6);
          vec_add_rr(reg5,reg1);
          vec_mov_mr(pA0+4+KB,reg5);
          vec_mul_mr(pB0+2+KB,reg7);
          vec_add_rr(reg6,reg2);
          vec_mov_mr(pA0+4,reg6);
          vec_mul_mr(pB0+4,reg4);
          vec_add_rr(reg7,reg3);
          vec_mov_mr(pA0+4+KB,reg7);
          vec_mul_mr(pB0+4,reg5);
          vec_add_rr(reg4,reg0);
          vec_mov_mr(pA0+6,reg4);
          vec_mul_mr(pB0+4+KB,reg6);
          vec_add_rr(reg5,reg1);
          vec_mov_mr(pA0+6+KB,reg5);
          vec_mul_mr(pB0+4+KB,reg7);
          vec_add_rr(reg6,reg2);
          vec_mov_mr(pA0+6,reg6);
          vec_mul_mr(pB0+6,reg4);
          vec_add_rr(reg7,reg3);
          vec_mov_mr(pA0+6+KB,reg7);
          vec_mul_mr(pB0+6,reg5);
          vec_add_rr(reg4,reg0);
          vec_mov_mr(pA0+8,reg4);
          vec_mul_mr(pB0+6+KB,reg6);
          vec_add_rr(reg5,reg1);
          vec_mov_mr(pA0+8+KB,reg5);
          vec_mul_mr(pB0+6+KB,reg7);
          vec_add_rr(reg6,reg2);
          vec_mov_mr(pA0+8,reg6);
          vec_mul_mr(pB0+8,reg4);
          vec_add_rr(reg7,reg3);
          vec_mov_mr(pA0+8+KB,reg7);
          vec_mul_mr(pB0+8,reg5);
          vec_add_rr(reg4,reg0);
          vec_mov_mr(pA0+10,reg4);
          vec_mul_mr(pB0+8+KB,reg6);
          vec_add_rr(reg5,reg1);
          vec_mov_mr(pA0+10+KB,reg5);
          vec_mul_mr(pB0+8+KB,reg7);
          vec_add_rr(reg6,reg2);
          vec_mov_mr(pA0+10,reg6);
          vec_mul_mr(pB0+10,reg4);
          vec_add_rr(reg7,reg3);
          vec_mov_mr(pA0+10+KB,reg7);
          vec_mul_mr(pB0+10,reg5);
          vec_add_rr(reg4,reg0);
          vec_mov_mr(pA0+12,reg4);
          vec_mul_mr(pB0+10+KB,reg6);
          vec_add_rr(reg5,reg1);
          vec_mov_mr(pA0+12+KB,reg5);
          vec_mul_mr(pB0+10+KB,reg7);
          vec_add_rr(reg6,reg2);
          vec_mov_mr(pA0+12,reg6);
          vec_mul_mr(pB0+12,reg4);
          vec_add_rr(reg7,reg3);
          vec_mov_mr(pA0+12+KB,reg7);
          vec_mul_mr(pB0+12,reg5);
          vec_add_rr(reg4,reg0);
          vec_mov_mr(pA0+14,reg4);
          vec_mul_mr(pB0+12+KB,reg6);
          vec_add_rr(reg5,reg1);
          vec_mov_mr(pA0+14+KB,reg5);
          vec_mul_mr(pB0+12+KB,reg7);
          vec_add_rr(reg6,reg2);
          vec_mov_mr(pA0+14,reg6);
          vec_mul_mr(pB0+14,reg4);
          vec_add_rr(reg7,reg3);
          vec_mov_mr(pA0+14+KB,reg7);
          vec_mul_mr(pB0+14,reg5);
          vec_add_rr(reg4,reg0);
          vec_mov_mr(pA0+16,reg4);
          vec_mul_mr(pB0+14+KB,reg6);
          vec_add_rr(reg5,reg1);
          vec_mov_mr(pA0+16+KB,reg5);
          vec_mul_mr(pB0+14+KB,reg7);
          vec_add_rr(reg6,reg2);
          vec_mov_mr(pA0+16,reg6);
          vec_mul_mr(pB0+16,reg4);
          vec_add_rr(reg7,reg3);
          vec_mov_mr(pA0+16+KB,reg7);
          vec_mul_mr(pB0+16,reg5);

          pA0 += 16;
          pB0 += 16;
        }
        /* epilogue: the last 4 elements, with no further look-ahead loads */
        vec_add_rr(reg4,reg0);
        vec_mov_mr(pA0+2,reg4);
        vec_mul_mr(pB0+KB,reg6);
        vec_add_rr(reg5,reg1);
        vec_mov_mr(pA0+2+KB,reg5);
        vec_mul_mr(pB0+KB,reg7);
        vec_add_rr(reg6,reg2);
        vec_mov_mr(pA0+2,reg6);
        vec_mul_mr(pB0+2,reg4);
        vec_add_rr(reg7,reg3);
        vec_mov_mr(pA0+2+KB,reg7);
        vec_mul_mr(pB0+2,reg5);
        vec_add_rr(reg4,reg0);
        vec_add_rr(reg5,reg1);
        vec_mul_mr(pB0+2+KB,reg6);
        vec_add_rr(reg6,reg2);
        vec_mul_mr(pB0+2+KB,reg7);
        vec_add_rr(reg7,reg3);
        /* reduce each accumulator and store the 2x2 tile of C */
        vec_sum(reg0);
        vec_sum(reg1);
        vec_sum(reg2);
        vec_sum(reg3);
        vec_mov_rm_1(reg0,pC0);
        vec_mov_rm_1(reg1,pC0+(1 SHIFT));
        vec_mov_rm_1(reg2,pC1);
        vec_mov_rm_1(reg3,pC1+(1 SHIFT));

        /* next pair of A rows, same pair of B columns */
        pA0 += incAm;
        pB0 += incBm;
        pC0 += incCm;
        pC1 += incCm;
      }
      while(pA0 != stM);

      /* back to the first A row, next pair of B columns */
      pA0 += incAn;
      pB0 += incBn;
      pC0 += incCn;
      pC1 += incCn;
    }
    while(pB0 != stN);
  }

  /* leftover row when M is odd: one row of A against pairs of B columns */
  if (M%2>0) {
    pC0 = C+((M-M%2)SHIFT);
    pC1 = C+(ldc SHIFT)+((M-M%2)SHIFT);
    pA0 = A+(M-M%2)*KB;
    pB0 = B;
    do /* N-loop */
    {
      do /* M-loop */
      {
        /* initialise the two accumulators according to the beta mode */
#ifdef BETA0
        vec_zero(reg0);
        vec_zero(reg1);
#elif defined(BETA1)
        vec_mov_mr_1(pC0,reg0);
        vec_mov_mr_1(pC1,reg1);
#else
        vec_mov_mr(betavec,reg7);
        vec_mov_mr_1(pC0,reg0);
        vec_mul_rr(reg7,reg0);
        vec_mov_mr_1(pC1,reg1);
        vec_mul_rr(reg7,reg1);
#endif
        /* reg6 holds the current A values; reg2/reg3 are per-column copies */
        vec_mov_mr(pA0,reg6);
        align();
        for (k=0; k<KB-4; k+=16) {
          vec_mov_rr(reg6,reg2);
          vec_mul_mr(pB0,reg2);
          vec_mov_rr(reg6,reg3);
          vec_mov_mr(pA0+2,reg6);
          vec_mul_mr(pB0+KB,reg3);
          vec_add_rr(reg2,reg0);
          vec_add_rr(reg3,reg1);
          vec_mov_rr(reg6,reg2);
          vec_mul_mr(pB0+2,reg2);
          vec_mov_rr(reg6,reg3);
          vec_mov_mr(pA0+4,reg6);
          vec_mul_mr(pB0+2+KB,reg3);
          vec_add_rr(reg2,reg0);
          vec_add_rr(reg3,reg1);
          vec_mov_rr(reg6,reg2);
          vec_mul_mr(pB0+4,reg2);
          vec_mov_rr(reg6,reg3);
          vec_mov_mr(pA0+6,reg6);
          vec_mul_mr(pB0+4+KB,reg3);
          vec_add_rr(reg2,reg0);
          vec_add_rr(reg3,reg1);
          vec_mov_rr(reg6,reg2);
          vec_mul_mr(pB0+6,reg2);
          vec_mov_rr(reg6,reg3);
          vec_mov_mr(pA0+8,reg6);
          vec_mul_mr(pB0+6+KB,reg3);
          vec_add_rr(reg2,reg0);
          vec_add_rr(reg3,reg1);
          vec_mov_rr(reg6,reg2);
          vec_mul_mr(pB0+8,reg2);
          vec_mov_rr(reg6,reg3);
          vec_mov_mr(pA0+10,reg6);
          vec_mul_mr(pB0+8+KB,reg3);
          vec_add_rr(reg2,reg0);
          vec_add_rr(reg3,reg1);
          vec_mov_rr(reg6,reg2);
          vec_mul_mr(pB0+10,reg2);
          vec_mov_rr(reg6,reg3);
          vec_mov_mr(pA0+12,reg6);
          vec_mul_mr(pB0+10+KB,reg3);
          vec_add_rr(reg2,reg0);
          vec_add_rr(reg3,reg1);
          vec_mov_rr(reg6,reg2);
          vec_mul_mr(pB0+12,reg2);
          vec_mov_rr(reg6,reg3);
          vec_mov_mr(pA0+14,reg6);
          vec_mul_mr(pB0+12+KB,reg3);
          vec_add_rr(reg2,reg0);
          vec_add_rr(reg3,reg1);
          vec_mov_rr(reg6,reg2);
          vec_mul_mr(pB0+14,reg2);
          vec_mov_rr(reg6,reg3);
          vec_mov_mr(pA0+16,reg6);
          vec_mul_mr(pB0+14+KB,reg3);
          vec_add_rr(reg2,reg0);
          vec_add_rr(reg3,reg1);

          pA0 += 16;
          pB0 += 16;
        }
        /* epilogue: the last 4 elements */
        vec_mov_rr(reg6,reg2);
        vec_mul_mr(pB0,reg2);
        vec_mov_rr(reg6,reg3);
        vec_mov_mr(pA0+2,reg6);
        vec_mul_mr(pB0+KB,reg3);
        vec_add_rr(reg2,reg0);
        vec_add_rr(reg3,reg1);
        vec_mov_rr(reg6,reg2);
        vec_mul_mr(pB0+2,reg2);
        vec_add_rr(reg2,reg0);
        vec_mov_rr(reg6,reg3);
        vec_mul_mr(pB0+2+KB,reg3);
        vec_add_rr(reg3,reg1);
        /* reduce and store the 1x2 tile of C */
        vec_sum(reg0);
        vec_sum(reg1);
        vec_mov_rm_1(reg0,pC0);
        vec_mov_rm_1(reg1,pC1);

        pA0 += incAm_m;
        pB0 += incBm;
        pC0 += (1 SHIFT);
        pC1 += (1 SHIFT);
      }
      while(pA0 != stM_m);

      /* back to the leftover A row, next pair of B columns */
      pA0 += incAn_m;
      pB0 += incBn;
      pC0 += incCn_m;
      pC1 += incCn_m;
    }
    while(pB0 != stN);
  }
  vec_exit();
}
vector float rinv,rinvsq,rsq,rinvsix,Vvdw6,Vvdw12; int n,k,ii,is3,ii3,ntiA,nj0,nj1; int jnra,jnrb,jnrc,jnrd; int j3a,j3b,j3c,j3d; int nri, ntype, nouter, ninner; int tja,tjb,tjc,tjd; #ifdef GMX_THREADS int nn0, nn1; #endif nouter = 0; ninner = 0; nri = *p_nri; ntype = *p_ntype; nul=vec_zero(); vfacel=load_float_and_splat(p_facel); #ifdef GMX_THREADS nthreads = *p_nthreads; do { gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx); nn0 = *count; nn1 = nn0+(nri-nn0)/(2*nthreads)+3; *count = nn1; gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx); if(nn1>nri) nn1=nri; for(n=nn0; (n<nn1); n++) { #if 0 } /* maintain correct indentation even with conditional left braces */ #endif
void nb_kernel010nf_ppc_altivec(int * p_nri, int iinr[], int jindex[], int jjnr[], int shift[], float shiftvec[], float fshift[], int gid[], float pos[], float faction[], float charge[], float * p_facel, float * p_krf, float * p_crf, float Vc[], int type[], int * p_ntype, float vdwparam[], float Vvdw[], float * p_tabscale, float VFtab[], float invsqrta[], float dvda[], float * p_gbtabscale, float GBtab[], int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, float * work) { vector float ix,iy,iz,shvec; vector float nul; vector float dx,dy,dz; vector float Vvdwtot,c6,c12; vector float rinvsq,rsq,rinvsix; int n,k,ii,is3,ii3,nj0,nj1; int jnra,jnrb,jnrc,jnrd; int j3a,j3b,j3c,j3d; int nri, ntype, nouter, ninner; int ntiA,tja,tjb,tjc,tjd; #ifdef GMX_THREAD_SHM_FDECOMP int nn0, nn1; #endif nouter = 0; ninner = 0; nri = *p_nri; ntype = *p_ntype; nul=vec_zero(); #ifdef GMX_THREAD_SHM_FDECOMP nthreads = *p_nthreads; do { tMPI_Thread_mutex_lock((tMPI_Thread_mutex_t *)mtx); nn0 = *count; nn1 = nn0+(nri-nn0)/(2*nthreads)+3; *count = nn1; tMPI_Thread_mutex_unlock((tMPI_Thread_mutex_t *)mtx); if(nn1>nri) nn1=nri; for(n=nn0; (n<nn1); n++) { #if 0 } /* maintain correct indentation even with conditional left braces */ #endif #else /* without tMPI_Threads */ for(n=0;n<nri;n++) { #endif is3 = 3*shift[n]; shvec = load_xyz(shiftvec+is3); ii = iinr[n]; ii3 = 3*ii; ix = load_xyz(pos+ii3); Vvdwtot = nul; ix = vec_add(ix,shvec); nj0 = jindex[n]; nj1 = jindex[n+1]; splat_xyz_to_vectors(ix,&ix,&iy,&iz); ntiA = 2*ntype*type[ii]; for(k=nj0; k<(nj1-3); k+=4) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = jjnr[k+2]; jnrd = jjnr[k+3]; j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; j3d = 3*jnrd; transpose_4_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b), load_xyz(pos+j3c), load_xyz(pos+j3d),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); rinvsq = do_recip(rsq); rinvsix = 
vec_madd(rinvsq,rinvsq,nul); rinvsix = vec_madd(rinvsix,rinvsq,nul); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; tjc = ntiA+2*type[jnrc]; tjd = ntiA+2*type[jnrd]; load_4_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,vdwparam+tjd,&c6,&c12); Vvdwtot = vec_nmsub(c6,rinvsix,Vvdwtot); Vvdwtot = vec_madd(c12, vec_madd(rinvsix,rinvsix,nul), Vvdwtot); } if(k<(nj1-1)) { jnra = jjnr[k]; jnrb = jjnr[k+1]; j3a = 3*jnra; j3b = 3*jnrb; transpose_2_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); rinvsq = do_recip(rsq); zero_highest_2_elements_in_vector(&rinvsq); rinvsix = vec_madd(rinvsq,rinvsq,nul); rinvsix = vec_madd(rinvsix,rinvsq,nul); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; load_2_pair(vdwparam+tja,vdwparam+tjb,&c6,&c12); Vvdwtot = vec_nmsub(c6,rinvsix,Vvdwtot); Vvdwtot = vec_madd(c12, vec_madd(rinvsix,rinvsix,nul), Vvdwtot); k += 2; } if((nj1-nj0) & 0x1) { jnra = jjnr[k]; j3a = 3*jnra; transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); rinvsq = do_recip(rsq); zero_highest_3_elements_in_vector(&rinvsq); rinvsix = vec_madd(rinvsq,rinvsq,nul); rinvsix = vec_madd(rinvsix,rinvsq,nul); tja = ntiA+2*type[jnra]; load_1_pair(vdwparam+tja,&c6,&c12); Vvdwtot = vec_nmsub(c6,rinvsix,Vvdwtot); Vvdwtot = vec_madd(c12, vec_madd(rinvsix,rinvsix,nul), Vvdwtot); } /* update outer data */ add_vector_to_float(Vvdw+gid[n],Vvdwtot); ninner += nj1 - nj0; } #ifdef GMX_THREAD_SHM_FDECOMP nouter += nn1 - nn0; } while (nn1<nri); #else nouter = nri; #endif *outeriter = nouter; *inneriter = ninner; }
void nb_kernel231_ppc_altivec (int * p_nri, int iinr[], int jindex[], int jjnr[], int shift[], float shiftvec[], float fshift[], int gid[], float pos[], float faction[], float charge[], float * p_facel, float * p_krf, float * p_crf, float Vc[], int type[], int * p_ntype, float vdwparam[], float Vvdw[], float * p_tabscale, float VFtab[], float invsqrta[], float dvda[], float * p_gbtabscale, float GBtab[], int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, float * work) { vector float vkrf,vcrf; vector float iOx,iOy,iOz,iH1x,iH1y,iH1z,iH2x,iH2y,iH2z; vector float dOx,dOy,dOz,dH1x,dH1y,dH1z,dH2x,dH2y,dH2z; vector float vfacel,vcoulO,vcoulH1,vcoulH2,nul; vector float Vvdwtot,c6,c12,VVd,VVr,FFd,FFr,tsc,r; vector float fsO,fsH1,fsH2,krsqO,krsqH1,krsqH2; vector float vctot,qqO,qqH,iqO,iqH,jq; vector float fiOx,fiOy,fiOz,fiH1x,fiH1y,fiH1z,fiH2x,fiH2y,fiH2z; vector float tmp1,tmp2,tmp3,tmp4; vector float rinvO,rinvH1,rinvH2,rinvsqH1,rinvsqH2; vector float rsqO,rsqH1,rsqH2; int n,k,ii,is3,ii3,ntiA,nj0,nj1; int jnra,jnrb,jnrc,jnrd; int j3a,j3b,j3c,j3d; int nri, ntype, nouter, ninner; int tja,tjb,tjc,tjd; #ifdef GMX_THREAD_SHM_FDECOMP int nn0, nn1; #endif nouter = 0; ninner = 0; nri = *p_nri; ntype = *p_ntype; nul=vec_zero(); tsc=load_float_and_splat(p_tabscale); vfacel=load_float_and_splat(p_facel); vkrf=load_float_and_splat(p_krf); vcrf=load_float_and_splat(p_crf); ii = iinr[0]; iqO = vec_madd(load_float_and_splat(charge+ii),vfacel,nul); iqH = vec_madd(load_float_and_splat(charge+ii+1),vfacel,nul); ntiA = 2*ntype*type[ii]; #ifdef GMX_THREAD_SHM_FDECOMP nthreads = *p_nthreads; do { tMPI_Thread_mutex_lock((tMPI_Thread_mutex_t *)mtx); nn0 = *count; nn1 = nn0+(nri-nn0)/(2*nthreads)+3; *count = nn1; tMPI_Thread_mutex_unlock((tMPI_Thread_mutex_t *)mtx); if(nn1>nri) nn1=nri; for(n=nn0; (n<nn1); n++) { #if 0 } /* maintain correct indentation even with conditional left braces */ #endif #else /* without tMPI_Threads */ for(n=0;n<nri;n++) { #endif is3 = 
3*shift[n]; ii = iinr[n]; ii3 = 3*ii; load_1_3atoms_shift_and_splat(pos+ii3,shiftvec+is3,&iOx,&iOy,&iOz, &iH1x,&iH1y,&iH1z,&iH2x,&iH2y,&iH2z); vctot = nul; Vvdwtot = nul; fiOx = nul; fiOy = nul; fiOz = nul; fiH1x = nul; fiH1y = nul; fiH1z = nul; fiH2x = nul; fiH2y = nul; fiH2z = nul; nj0 = jindex[n]; nj1 = jindex[n+1]; for(k=nj0; k<(nj1-3); k+=4) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = jjnr[k+2]; jnrd = jjnr[k+3]; j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; j3d = 3*jnrd; transpose_4_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b), load_xyz(pos+j3c), load_xyz(pos+j3d),&dH2x,&dH2y,&dH2z); dOx = vec_sub(iOx,dH2x); dOy = vec_sub(iOy,dH2y); dOz = vec_sub(iOz,dH2z); dH1x = vec_sub(iH1x,dH2x); dH1y = vec_sub(iH1y,dH2y); dH1z = vec_sub(iH1z,dH2z); dH2x = vec_sub(iH2x,dH2x); dH2y = vec_sub(iH2y,dH2y); dH2z = vec_sub(iH2z,dH2z); rsqO = vec_madd(dOx,dOx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqO = vec_madd(dOy,dOy,rsqO); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqO = vec_madd(dOz,dOz,rsqO); rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2); rinvsqH1 = vec_madd(rinvH1,rinvH1,nul); rinvsqH2 = vec_madd(rinvH2,rinvH2,nul); r = vec_madd(rinvO,rsqO,nul); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; tjc = ntiA+2*type[jnrc]; tjd = ntiA+2*type[jnrd]; /* load 4 j charges and multiply by iq */ jq=load_4_float(charge+jnra,charge+jnrb, charge+jnrc,charge+jnrd); load_4_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,vdwparam+tjd,&c6,&c12); do_4_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&FFd,&VVr,&FFr); Vvdwtot = vec_madd(c6,VVd,Vvdwtot); tmp1 = vec_nmsub(c6,FFd,nul); Vvdwtot = vec_madd(c12,VVr,Vvdwtot); tmp1 = vec_nmsub(c12,FFr,tmp1); tmp1 = vec_madd(tmp1,tsc,nul); qqO = vec_madd(iqO,jq,nul); qqH = vec_madd(iqH,jq,nul); krsqO = vec_madd(vkrf,rsqO,nul); krsqH1 = vec_madd(vkrf,rsqH1,nul); krsqH2 = vec_madd(vkrf,rsqH2,nul); fsO = 
vec_nmsub(vec_two(),krsqO,rinvO); vcoulO = vec_add(rinvO,krsqO); vcoulH1 = vec_add(rinvH1,krsqH1); fsO = vec_madd(qqO,fsO,nul); vcoulH2 = vec_add(rinvH2,krsqH2); vcoulO = vec_sub(vcoulO,vcrf); vcoulH1 = vec_sub(vcoulH1,vcrf); vcoulH2 = vec_sub(vcoulH2,vcrf); vctot = vec_madd(qqO,vcoulO,vctot); fsH1 = vec_nmsub(vec_two(),krsqH1,rinvH1); fsH2 = vec_nmsub(vec_two(),krsqH2,rinvH2); vctot = vec_madd(qqH,vcoulH1,vctot); fsO = vec_madd(fsO,rinvO,tmp1); fsO = vec_madd(fsO,rinvO,nul); fsH1 = vec_madd(fsH1,qqH,nul); fsH2 = vec_madd(fsH2,qqH,nul); vctot = vec_madd(qqH,vcoulH2,vctot); fsH1 = vec_madd(fsH1,rinvsqH1,nul); fsH2 = vec_madd(fsH2,rinvsqH2,nul); fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */ dOx = vec_nmsub(fsO,dOx,nul); /* -fx */ fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */ dOy = vec_nmsub(fsO,dOy,nul); /* -fy */ fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */ dOz = vec_nmsub(fsO,dOz,nul); /* -fz */ fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */ dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */ fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */ dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */ fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */ dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */ fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */ dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */ fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */ dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */ fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */ dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */ transpose_3_to_4(dOx,dOy,dOz,&tmp1,&tmp2,&tmp3,&tmp4); add_xyz_to_mem(faction+j3a,tmp1); add_xyz_to_mem(faction+j3b,tmp2); add_xyz_to_mem(faction+j3c,tmp3); add_xyz_to_mem(faction+j3d,tmp4); } if(k<(nj1-2)) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = jjnr[k+2]; j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; transpose_4_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b), load_xyz(pos+j3c),nul,&dH2x,&dH2y,&dH2z); dOx = vec_sub(iOx,dH2x); dOy = vec_sub(iOy,dH2y); dOz = vec_sub(iOz,dH2z); dH1x = vec_sub(iH1x,dH2x); dH1y = vec_sub(iH1y,dH2y); dH1z = vec_sub(iH1z,dH2z); dH2x = 
vec_sub(iH2x,dH2x); dH2y = vec_sub(iH2y,dH2y); dH2z = vec_sub(iH2z,dH2z); rsqO = vec_madd(dOx,dOx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqO = vec_madd(dOy,dOy,rsqO); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqO = vec_madd(dOz,dOz,rsqO); rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); zero_highest_element_in_3_vectors(&rsqO,&rsqH1,&rsqH2); do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2); zero_highest_element_in_3_vectors(&rinvO,&rinvH1,&rinvH2); r = vec_madd(rinvO,rsqO,nul); rinvsqH1 = vec_madd(rinvH1,rinvH1,nul); rinvsqH2 = vec_madd(rinvH2,rinvH2,nul); jq=load_3_float(charge+jnra,charge+jnrb,charge+jnrc); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; tjc = ntiA+2*type[jnrc]; load_3_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,&c6,&c12); do_3_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&FFd,&VVr,&FFr); Vvdwtot = vec_madd(c6,VVd,Vvdwtot); tmp1 = vec_nmsub(c6,FFd,nul); Vvdwtot = vec_madd(c12,VVr,Vvdwtot); tmp1 = vec_nmsub(c12,FFr,tmp1); tmp1 = vec_madd(tmp1,tsc,nul); qqO = vec_madd(iqO,jq,nul); qqH = vec_madd(iqH,jq,nul); krsqO = vec_madd(vkrf,rsqO,nul); krsqH1 = vec_madd(vkrf,rsqH1,nul); krsqH2 = vec_madd(vkrf,rsqH2,nul); fsO = vec_nmsub(vec_two(),krsqO,rinvO); vcoulO = vec_add(rinvO,krsqO); vcoulH1 = vec_add(rinvH1,krsqH1); fsO = vec_madd(qqO,fsO,nul); vcoulH2 = vec_add(rinvH2,krsqH2); vcoulO = vec_sub(vcoulO,vcrf); vcoulH1 = vec_sub(vcoulH1,vcrf); vcoulH2 = vec_sub(vcoulH2,vcrf); vctot = vec_madd(qqO,vcoulO,vctot); fsH1 = vec_nmsub(vec_two(),krsqH1,rinvH1); fsH2 = vec_nmsub(vec_two(),krsqH2,rinvH2); vctot = vec_madd(qqH,vcoulH1,vctot); fsO = vec_madd(fsO,rinvO,tmp1); fsO = vec_madd(fsO,rinvO,nul); fsH1 = vec_madd(fsH1,qqH,nul); fsH2 = vec_madd(fsH2,qqH,nul); vctot = vec_madd(qqH,vcoulH2,vctot); fsH1 = vec_madd(fsH1,rinvsqH1,nul); fsH2 = vec_madd(fsH2,rinvsqH2,nul); fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */ dOx = vec_nmsub(fsO,dOx,nul); /* -fx */ fiOy = 
vec_madd(fsO,dOy,fiOy); /* +=fy */ dOy = vec_nmsub(fsO,dOy,nul); /* -fy */ fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */ dOz = vec_nmsub(fsO,dOz,nul); /* -fz */ fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */ dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */ fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */ dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */ fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */ dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */ fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */ dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */ fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */ dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */ fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */ dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */ transpose_4_to_3(dOx,dOy,dOz,nul,&tmp1,&tmp2,&tmp3); add_xyz_to_mem(faction+j3a,tmp1); add_xyz_to_mem(faction+j3b,tmp2); add_xyz_to_mem(faction+j3c,tmp3); } else if(k<(nj1-1)) { jnra = jjnr[k]; jnrb = jjnr[k+1]; j3a = 3*jnra; j3b = 3*jnrb; transpose_2_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b),&dH2x,&dH2y,&dH2z); dOx = vec_sub(iOx,dH2x); dOy = vec_sub(iOy,dH2y); dOz = vec_sub(iOz,dH2z); dH1x = vec_sub(iH1x,dH2x); dH1y = vec_sub(iH1y,dH2y); dH1z = vec_sub(iH1z,dH2z); dH2x = vec_sub(iH2x,dH2x); dH2y = vec_sub(iH2y,dH2y); dH2z = vec_sub(iH2z,dH2z); rsqO = vec_madd(dOx,dOx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqO = vec_madd(dOy,dOy,rsqO); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqO = vec_madd(dOz,dOz,rsqO); rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); zero_highest_2_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2); do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2); zero_highest_2_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2); r = vec_madd(rinvO,rsqO,nul); rinvsqH1 = vec_madd(rinvH1,rinvH1,nul); rinvsqH2 = vec_madd(rinvH2,rinvH2,nul); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; /* load 2 j charges and multiply by iq */ jq=load_2_float(charge+jnra,charge+jnrb); 
load_2_pair(vdwparam+tja,vdwparam+tjb,&c6,&c12); do_2_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&FFd,&VVr,&FFr); Vvdwtot = vec_madd(c6,VVd,Vvdwtot); tmp1 = vec_nmsub(c6,FFd,nul); Vvdwtot = vec_madd(c12,VVr,Vvdwtot); tmp1 = vec_nmsub(c12,FFr,tmp1); tmp1 = vec_madd(tmp1,tsc,nul); qqO = vec_madd(iqO,jq,nul); qqH = vec_madd(iqH,jq,nul); krsqO = vec_madd(vkrf,rsqO,nul); krsqH1 = vec_madd(vkrf,rsqH1,nul); krsqH2 = vec_madd(vkrf,rsqH2,nul); fsO = vec_nmsub(vec_two(),krsqO,rinvO); vcoulO = vec_add(rinvO,krsqO); vcoulH1 = vec_add(rinvH1,krsqH1); fsO = vec_madd(qqO,fsO,nul); vcoulH2 = vec_add(rinvH2,krsqH2); vcoulO = vec_sub(vcoulO,vcrf); vcoulH1 = vec_sub(vcoulH1,vcrf); vcoulH2 = vec_sub(vcoulH2,vcrf); vctot = vec_madd(qqO,vcoulO,vctot); fsH1 = vec_nmsub(vec_two(),krsqH1,rinvH1); fsH2 = vec_nmsub(vec_two(),krsqH2,rinvH2); vctot = vec_madd(qqH,vcoulH1,vctot); fsO = vec_madd(fsO,rinvO,tmp1); fsO = vec_madd(fsO,rinvO,nul); fsH1 = vec_madd(fsH1,qqH,nul); fsH2 = vec_madd(fsH2,qqH,nul); vctot = vec_madd(qqH,vcoulH2,vctot); fsH1 = vec_madd(fsH1,rinvsqH1,nul); fsH2 = vec_madd(fsH2,rinvsqH2,nul); fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */ dOx = vec_nmsub(fsO,dOx,nul); /* -fx */ fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */ dOy = vec_nmsub(fsO,dOy,nul); /* -fy */ fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */ dOz = vec_nmsub(fsO,dOz,nul); /* -fz */ fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */ dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */ fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */ dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */ fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */ dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */ fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */ dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */ fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */ dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */ fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */ dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */ transpose_3_to_2(dOx,dOy,dOz,&tmp1,&tmp2); add_xyz_to_mem(faction+j3a,tmp1); add_xyz_to_mem(faction+j3b,tmp2); } else if(k<nj1) { 
jnra = jjnr[k]; j3a = 3*jnra; transpose_1_to_3(load_xyz(pos+j3a),&dH2x,&dH2y,&dH2z); dOx = vec_sub(iOx,dH2x); dOy = vec_sub(iOy,dH2y); dOz = vec_sub(iOz,dH2z); dH1x = vec_sub(iH1x,dH2x); dH1y = vec_sub(iH1y,dH2y); dH1z = vec_sub(iH1z,dH2z); dH2x = vec_sub(iH2x,dH2x); dH2y = vec_sub(iH2y,dH2y); dH2z = vec_sub(iH2z,dH2z); rsqO = vec_madd(dOx,dOx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqO = vec_madd(dOy,dOy,rsqO); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqO = vec_madd(dOz,dOz,rsqO); rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); zero_highest_3_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2); do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2); zero_highest_3_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2); r = vec_madd(rinvO,rsqO,nul); rinvsqH1 = vec_madd(rinvH1,rinvH1,nul); rinvsqH2 = vec_madd(rinvH2,rinvH2,nul); tja = ntiA+2*type[jnra]; /* load 1 j charges and multiply by iq */ jq=load_1_float(charge+jnra); load_1_pair(vdwparam+tja,&c6,&c12); do_1_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&FFd,&VVr,&FFr); Vvdwtot = vec_madd(c6,VVd,Vvdwtot); tmp1 = vec_nmsub(c6,FFd,nul); Vvdwtot = vec_madd(c12,VVr,Vvdwtot); tmp1 = vec_nmsub(c12,FFr,tmp1); tmp1 = vec_madd(tmp1,tsc,nul); qqO = vec_madd(iqO,jq,nul); qqH = vec_madd(iqH,jq,nul); krsqO = vec_madd(vkrf,rsqO,nul); krsqH1 = vec_madd(vkrf,rsqH1,nul); krsqH2 = vec_madd(vkrf,rsqH2,nul); fsO = vec_nmsub(vec_two(),krsqO,rinvO); vcoulO = vec_add(rinvO,krsqO); vcoulH1 = vec_add(rinvH1,krsqH1); fsO = vec_madd(qqO,fsO,nul); vcoulH2 = vec_add(rinvH2,krsqH2); vcoulO = vec_sub(vcoulO,vcrf); vcoulH1 = vec_sub(vcoulH1,vcrf); vcoulH2 = vec_sub(vcoulH2,vcrf); vctot = vec_madd(qqO,vcoulO,vctot); fsH1 = vec_nmsub(vec_two(),krsqH1,rinvH1); fsH2 = vec_nmsub(vec_two(),krsqH2,rinvH2); vctot = vec_madd(qqH,vcoulH1,vctot); fsO = vec_madd(fsO,rinvO,tmp1); fsO = vec_madd(fsO,rinvO,nul); fsH1 = vec_madd(fsH1,qqH,nul); fsH2 = vec_madd(fsH2,qqH,nul); vctot = 
vec_madd(qqH,vcoulH2,vctot); fsH1 = vec_madd(fsH1,rinvsqH1,nul); fsH2 = vec_madd(fsH2,rinvsqH2,nul); fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */ dOx = vec_nmsub(fsO,dOx,nul); /* -fx */ fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */ dOy = vec_nmsub(fsO,dOy,nul); /* -fy */ fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */ dOz = vec_nmsub(fsO,dOz,nul); /* -fz */ fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */ dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */ fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */ dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */ fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */ dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */ fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */ dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */ fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */ dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */ fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */ dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */ transpose_3_to_1(dOx,dOy,dOz,&tmp1); add_xyz_to_mem(faction+j3a,tmp1); } /* update outer data */ update_i_3atoms_forces(faction+ii3,fshift+is3, fiOx,fiOy,fiOz,fiH1x,fiH1y,fiH1z, fiH2x,fiH2y,fiH2z); add_vector_to_float(Vc+gid[n],vctot); add_vector_to_float(Vvdw+gid[n],Vvdwtot); ninner += nj1 - nj0; } #ifdef GMX_THREAD_SHM_FDECOMP nouter += nn1 - nn0; } while (nn1<nri); #else nouter = nri; #endif *outeriter = nouter; *inneriter = ninner; }
/*
 * Run a diffusion-limited-aggregation simulation and write every stuck
 * particle to p.output_filename.
 *
 * A seed point at the origin is inserted into a fresh tree.  Then, up to
 * p.n times (or until p.max_secs seconds have elapsed), a walker is released
 * on a sphere of radius starting_rad and moved in random steps of length
 * p.step_size until kdt_collision_detect() reports a hit and
 * sticks(p.stickiness) accepts it; the walker is then placed at the
 * collision point, added to the tree and printed.  Walkers that reach
 * death_rad are re-released at starting_rad.  Both radii are recomputed
 * whenever the aggregate's maximum radius grows.
 *
 * The process exits with a failure status if the output file cannot be
 * opened, if KDT_DIM is neither 2 nor 3, or if memory runs out.
 *
 * NOTE(review): points passed to kdt_add_point() are presumably owned by
 * the tree afterwards; `buffer` and `root` are not released here because no
 * matching free routine is visible from this function.
 */
void run_dla(params p) {
    time_t start, now;
    FILE *fp;

    fp = fopen(p.output_filename, "w");
    if (fp == NULL) {
        fprintf(stderr,"ERROR: could not open output file '%s'\n", p.output_filename);
        exit(EXIT_FAILURE);
    }

    if(KDT_DIM == 2) {
        fprintf(fp,"n x y\n");
    } else if(KDT_DIM == 3) {
        fprintf(fp,"n x y z\n");
    } else {
        fprintf(stderr, "DLA not defined for KDT_DIM=%d\n", KDT_DIM);
        exit(1);
    }

    setuprandom(p.seed);

    // setup initial world conditions
    point_list *buffer = kdt_new_point_list(p.n);
    tree_node *root = kdt_new_tree();
    point *zero = (point *)malloc(sizeof *zero);
    if (zero == NULL) {
        fprintf(stderr, "ERROR: out of memory\n");
        exit(EXIT_FAILURE);
    }
    vec_zero(zero);
    kdt_add_point(root, zero);

    double curr_max_radius = 0.0f;
    double starting_rad = starting_radius(p.min_inner_radius, curr_max_radius, p.inner_mult, p.step_size);
    double death_rad = death_radius(p.min_inner_radius, curr_max_radius, p.inner_mult, p.step_size, p.outer_mult);

    // Progress is reported roughly every 10% of the run.  For fewer than ten
    // particles p.n / 10 is zero, which used to be a modulo by zero; clamp
    // the interval to one iteration instead.
    int progress_step = p.n / 10;
    if (progress_step < 1) {
        progress_step = 1;
    }

    time(&start);
    time(&now);

    int i;
    for(i=0; i < p.n && (now - start) < p.max_secs; i++) {
        if(p.log_progress == 1 && i % progress_step == 0) {
            log_progress((double)i / (double)p.n * 100.0f, curr_max_radius, starting_rad, kdt_max_depth(root));
        }

        point dir, end, col;
        point *pt = (point *)malloc(sizeof *pt);
        if (pt == NULL) {
            fprintf(stderr, "ERROR: out of memory\n");
            exit(EXIT_FAILURE);
        }
        vec_rand_unit(pt);
        vec_scalar_mult(pt, pt, starting_rad);

        int walking = 1;
        while(walking) {
            vec_rand_unit(&dir);
            vec_scalar_mult(&dir, &dir, p.step_size);
            vec_add(&end, pt, &dir);

            if(kdt_collision_detect(root, pt, &end, &col, p.epsilon, buffer) && sticks(p.stickiness)) {
                // the walker sticks at the collision point
                vec_copy(pt, &col);
                kdt_add_point(root, pt);

                double pt_radius = vec_length(pt);
                if(pt_radius > curr_max_radius) {
                    curr_max_radius = pt_radius;
                    starting_rad = starting_radius(p.min_inner_radius, curr_max_radius, p.inner_mult, p.step_size);
                    death_rad = death_radius(p.min_inner_radius, curr_max_radius, p.inner_mult, p.step_size, p.outer_mult);
                }
                walking = 0;
            } else {
                vec_copy(pt, &end);
                if(vec_length(pt) >= death_rad) {
                    // if we have moved outside the max radius, start over
                    vec_rand_unit(pt);
                    vec_scalar_mult(pt, pt, starting_rad);
                }
            }
        }

        print_point(fp, i, pt);
        time(&now);
    }

    if (p.log_progress == 1) {
        log_progress((double)i / (double)p.n * 100.0f, curr_max_radius, starting_rad, kdt_max_depth(root));
    }

    // fclose flushes buffered output; a failure here means lost results
    if (fclose(fp) != 0) {
        fprintf(stderr, "ERROR: could not finish writing output file '%s'\n", p.output_filename);
    }
}
/* Subtree version of score test.

   For each feature in 'gff', fits a single scale parameter under the null
   model (data 'd', built on a private copy of 'mod'), then evaluates the
   gradient with respect to (scale, subtree scale) at that point with the
   subtree scale fixed at 1 (data 'd2', built on 'mod' itself).  The test
   statistic is grad[1]^2 divided by the Fisher information for the subtree
   scale, adjusted for the scale parameter; it is truncated at zero when
   negative or when the derivative points toward the boundary for modes
   ACC and CON.  Features without informative substitutions get a test
   statistic of 0 and a zero gradient without any likelihood computation.

   Each of the output arrays feat_pvals, feat_null_scales, feat_derivs,
   feat_sub_derivs and feat_teststats may be NULL; non-NULL arrays are
   filled at index i for feature i.  In CONACC mode, p-values are negated
   when the subtree derivative is positive (acceleration). */
void ff_score_tests_sub(TreeModel *mod, MSA *msa, GFF_Set *gff, mode_type mode,
                        double *feat_pvals, double *feat_null_scales,
                        double *feat_derivs, double *feat_sub_derivs,
                        double *feat_teststats, FILE *logf) {
  int i;
  FeatFitData *d, *d2;
  Vector *grad = vec_new(2);
  Matrix *fim = NULL;           /* obtained per feature from the grid and
                                   freed per feature; allocating a matrix
                                   up front only leaked it */
  double lnl, teststat;
  FimGrid *grid;
  List *inside=NULL, *outside=NULL;
  TreeModel *modcpy = tm_create_copy(mod); /* need separate copy of tree model
                                              with different internal scaling
                                              data for supertree/subtree case */

  /* init FeatFitData -- one for null model, one for alt */
  d = ff_init_fit_data(modcpy, msa, ALL, NNEUT, FALSE);
  d2 = ff_init_fit_data(mod, msa, SUBTREE, NNEUT, FALSE);
                                /* mod has the subtree info, modcpy
                                   does not */

  /* precompute Fisher information matrices for a grid of scale values */
  grid = col_fim_grid_sub(mod);

  /* prepare lists of leaves inside and outside root, for use in
     checking for informative substitutions */
  if (mod->subtree_root != NULL) {
    inside = lst_new_ptr(mod->tree->nnodes);
    outside = lst_new_ptr(mod->tree->nnodes);
    tr_partition_leaves(mod->tree, mod->subtree_root, inside, outside);
  }

  /* iterate through features */
  for (i = 0; i < lst_size(gff->features); i++) {
    checkInterrupt();
    d->feat = lst_get_ptr(gff->features, i);

    /* first check for informative substitution data in feature; if none,
       don't waste time computing likelihoods */
    if (!ff_has_data_sub(mod, msa, d->feat, inside, outside)) {
      teststat = 0;
      vec_zero(grad);
    }
    else {
      vec_set(d->cdata->params, 0, d->cdata->init_scale);
      opt_newton_1d(ff_likelihood_wrapper_1d, &d->cdata->params->data[0], d,
                    &lnl, SIGFIGS, d->cdata->lb->data[0],
                    d->cdata->ub->data[0], logf, NULL, NULL);
      /* turns out to be faster to use numerical rather than exact
         derivatives (judging by col case) */

      d2->feat = d->feat;
      d2->cdata->mod->scale = d->cdata->params->data[0];
      d2->cdata->mod->scale_sub = 1;
      tm_set_subst_matrices(d2->cdata->mod);
      ff_scale_derivs_subtree(d2, grad, NULL, d2->cdata->fels_scratch);

      fim = col_get_fim_sub(grid, d2->cdata->mod->scale);
      mat_scale(fim, d->feat->end - d->feat->start + 1);
      /* scale column-by-column FIM by length of feature (expected
         values are additive) */

      teststat = grad->data[1]*grad->data[1] /
        (fim->data[1][1] - fim->data[0][1]*fim->data[1][0]/fim->data[0][0]);

      if (teststat < 0) {
        fprintf(stderr, "WARNING: teststat < 0 (%f)\n", teststat);
        teststat = 0;
      }

      if ((mode == ACC && grad->data[1] < 0) ||
          (mode == CON && grad->data[1] > 0))
        teststat = 0;           /* derivative points toward boundary;
                                   truncate at 0 */

      mat_free(fim);
      fim = NULL;
    }

    if (feat_pvals != NULL) {
      if (mode == NNEUT || mode == CONACC)
        feat_pvals[i] = chisq_cdf(teststat, 1, FALSE);
      else
        feat_pvals[i] = half_chisq_cdf(teststat, 1, FALSE);
      /* assumes 50:50 mix of chisq and point mass at zero */

      if (feat_pvals[i] < 1e-20)
        feat_pvals[i] = 1e-20;
      /* approx limit of eval of tail prob; pvals of 0 cause problems */

      if (mode == CONACC && grad->data[1] > 0)
        feat_pvals[i] *= -1;    /* mark as acceleration */
    }

    /* store scales and log likelihood ratios if necessary */
    if (feat_null_scales != NULL)
      feat_null_scales[i] = d->cdata->params->data[0];
    if (feat_derivs != NULL)
      feat_derivs[i] = grad->data[0];
    if (feat_sub_derivs != NULL)
      feat_sub_derivs[i] = grad->data[1];
    if (feat_teststats != NULL)
      feat_teststats[i] = teststat;
  }

  ff_free_fit_data(d);
  ff_free_fit_data(d2);
  vec_free(grad);
  modcpy->estimate_branchlens = TM_BRANCHLENS_ALL;
  /* have to revert for tm_free to work correctly */
  tm_free(modcpy);
  col_free_fim_grid(grid);
  if (inside != NULL) lst_free(inside);
  if (outside != NULL) lst_free(outside);
}
void VolumeTexture::calculateTexgenPlanes(float v0[3], float v1[3], float v2[3], float v3[3]) const { int i; if (!texmap || !v) { // do something sensible vec_zero(v0); vec_zero(v1); vec_zero(v2); vec_zero(v3); v1[0] = v2[1] = v3[2] = 1; return; } // rescale texture coordinates by the portion of the // entire texture volume they reference // XXX added an additional scale factor to keep "nearest" texture modes // rounding into the populated area rather than catching black // texels in the empty part of the texture volume float tscale[3]; tscale[0] = (v->xsize / (float)size[0]) * 0.99999f; tscale[1] = (v->ysize / (float)size[1]) * 0.99999f; tscale[2] = (v->zsize / (float)size[2]) * 0.99999f; // calculate length squared of volume axes float lensq[3]; vec_zero(lensq); for (i=0; i<3; i++) { lensq[0] += float(v->xaxis[i] * v->xaxis[i]); lensq[1] += float(v->yaxis[i] * v->yaxis[i]); lensq[2] += float(v->zaxis[i] * v->zaxis[i]); } // Calculate reciprocal space lattice vectors, which are used // in the OpenGL texgen eye space plane equations in order to transform // incoming world coordinates to the correct texture coordinates. // This code should work for both orthogonal and non-orthogonal volumes. // The last step adds in the NPOT texture scaling where necessary. 
// Reference: Introductory Solid State Physics, H.P.Myers, page 43 float xaxdir[3], yaxdir[3], zaxdir[3]; float nxaxdir[3], nyaxdir[3], nzaxdir[3]; float bxc[3], cxa[3], axb[3]; float tmp; // copy axis direction vectors for (i=0; i<3; i++) { xaxdir[i] = float(v->xaxis[i]); yaxdir[i] = float(v->yaxis[i]); zaxdir[i] = float(v->zaxis[i]); } // calculate reciprocal lattice vector for X texture coordiante cross_prod(bxc, yaxdir, zaxdir); tmp = dot_prod(xaxdir, bxc); for (i=0; i<3; i++) { nxaxdir[i] = bxc[i] / tmp; } // calculate reciprocal lattice vector for Y texture coordiante cross_prod(cxa, zaxdir, xaxdir); tmp = dot_prod(yaxdir, cxa); for (i=0; i<3; i++) { nyaxdir[i] = cxa[i] / tmp; } // calculate reciprocal lattice vector for Z texture coordiante cross_prod(axb, xaxdir, yaxdir); tmp = dot_prod(zaxdir, axb); for (i=0; i<3; i++) { nzaxdir[i] = axb[i] / tmp; } // negate and transform the volume origin to reciprocal space // for use in the OpenGL texgen plane equation float norigin[3]; for (i=0; i<3; i++) norigin[i] = float(v->origin[i]); v0[0] = -dot_prod(norigin, nxaxdir) * tscale[0]; v0[1] = -dot_prod(norigin, nyaxdir) * tscale[1]; v0[2] = -dot_prod(norigin, nzaxdir) * tscale[2]; // scale the volume axes for the OpenGL texgen plane equation for (i=0; i<3; i++) { v1[i] = nxaxdir[i] * tscale[0]; v2[i] = nyaxdir[i] * tscale[1]; v3[i] = nzaxdir[i] * tscale[2]; } }
int BSplineSurf::getParamPoint(float s,float t,float outp[3]) const { if (s<U[0] || s>U[1]) { vec_zero(outp); return 0; } if (t<V[0] || t>V[1]) { vec_zero(outp); return 0; } float *b1,*b2; b1=new float[K1+1]; b2=new float[K2+1]; int i,j; float *btemp1,*btemp2; btemp1=new float[1+K1+M1]; btemp2=new float[1+K2+M2]; for (i=0; i<=K1+M1; i++) { if (s>=S[i] && s<=S[i+1]) btemp1[i]=1; else if (S[i]==S[i+1] && s==S[i]) btemp1[i]=1; else btemp1[i]=0; } for (j=1; j<=M1; j++) { for (i=0; i<=K1+M1-j; i++) { float c=0; if (S[i+j]!=S[i]) { c=btemp1[i]*(s-S[i])/(S[i+j]-S[i]); } if (S[i+j+1]!=S[i+1]) { c+= btemp1[i+1]*(S[i+j+1]-s)/(S[i+j+1]-S[i+1]); } btemp1[i]=c; } } for (i=0; i<=K1; i++) { b1[i]=btemp1[i]; } for (i=0; i<=K2+M2; i++) { if (t>=T[i] && t<=T[i+1]) btemp2[i]=1; else if (T[i]==T[i+1] && t==T[i]) btemp2[i]=1; else btemp2[i]=0; } for (j=1; j<=M2; j++) { for (i=0; i<=K2+M2-j; i++) { float c=0; if (T[i+j]!=T[i]) { c=btemp2[i]*(t-T[i])/(T[i+j]-T[i]); } if (T[i+j+1]!=T[i+1]) { c+= btemp2[i+1]*(T[i+j+1]-t)/(T[i+j+1]-T[i+1]); } btemp2[i]=c; } } for (i=0; i<=K2; i++) { b2[i]=btemp2[i]; } delete []btemp1; delete []btemp2; float denom=0; outp[0]=0; outp[1]=0; outp[2]=0; for (j=0; j<=K2; j++) { for (i=0; i<=K1; i++) { int ij=i+j*(K1+1); float r=W[ij]*b1[i]*b2[j]; outp[0]+= P[ij][0]*r; outp[1]+= P[ij][1]*r; outp[2]+= P[ij][2]*r; denom+=r; } } outp[0]/=denom; outp[1]/=denom; outp[2]/=denom; delete []b1; delete []b2; return 1; }
void nb_kernel133nf_ppc_altivec(int * p_nri, int iinr[], int jindex[], int jjnr[], int shift[], float shiftvec[], float fshift[], int gid[], float pos[], float faction[], float charge[], float * p_facel, float * p_krf, float * p_crf, float Vc[], int type[], int * p_ntype, float vdwparam[], float Vvdw[], float * p_tabscale, float VFtab[], float invsqrta[], float dvda[], float * p_gbtabscale, float GBtab[], int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, float * work) { vector float iOx,iOy,iOz,iH1x,iH1y,iH1z,iH2x,iH2y,iH2z,iMx,iMy,iMz; vector float dOx,dOy,dOz,dH1x,dH1y,dH1z,dH2x,dH2y,dH2z,dMx,dMy,dMz; vector float Vvdwtot,c6,c12,VVd,VVr,tsc,r; vector float vfacel,nul; vector float vctot,qqM,qqH,iqM,iqH,jq; vector float rinvO,rinvH1,rinvH2,rinvM,rsqO,rsqH1,rsqH2,rsqM; int n,k,ii,is3,ii3,ntiA,nj0,nj1; int jnra,jnrb,jnrc,jnrd; int j3a,j3b,j3c,j3d; int nri, ntype, nouter, ninner; int tja,tjb,tjc,tjd; #ifdef GMX_THREADS int nn0, nn1; #endif nouter = 0; ninner = 0; nri = *p_nri; ntype = *p_ntype; nul=vec_zero(); tsc=load_float_and_splat(p_tabscale); vfacel=load_float_and_splat(p_facel); ii = iinr[0]; iqH = vec_madd(load_float_and_splat(charge+ii+1),vfacel,nul); iqM = vec_madd(load_float_and_splat(charge+ii+3),vfacel,nul); ntiA = 2*ntype*type[ii]; #ifdef GMX_THREADS nthreads = *p_nthreads; do { gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx); nn0 = *count; nn1 = nn0+(nri-nn0)/(2*nthreads)+3; *count = nn1; gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx); if(nn1>nri) nn1=nri; for(n=nn0; (n<nn1); n++) { #if 0 } /* maintain correct indentation even with conditional left braces */ #endif #else /* without gmx_threads */ for(n=0;n<nri;n++) { #endif is3 = 3*shift[n]; ii = iinr[n]; ii3 = 3*ii; load_1_4atoms_shift_and_splat(pos+ii3,shiftvec+is3,&iOx,&iOy,&iOz, &iH1x,&iH1y,&iH1z,&iH2x,&iH2y,&iH2z, &iMx,&iMy,&iMz); vctot = nul; Vvdwtot = nul; nj0 = jindex[n]; nj1 = jindex[n+1]; for(k=nj0; k<(nj1-3); k+=4) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = 
jjnr[k+2]; jnrd = jjnr[k+3]; j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; j3d = 3*jnrd; transpose_4_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b), load_xyz(pos+j3c), load_xyz(pos+j3d),&dMx,&dMy,&dMz); dOx = vec_sub(iOx,dMx); dOy = vec_sub(iOy,dMy); dOz = vec_sub(iOz,dMz); dH1x = vec_sub(iH1x,dMx); dH1y = vec_sub(iH1y,dMy); dH1z = vec_sub(iH1z,dMz); dH2x = vec_sub(iH2x,dMx); dH2y = vec_sub(iH2y,dMy); dH2z = vec_sub(iH2z,dMz); dMx = vec_sub(iMx,dMx); dMy = vec_sub(iMy,dMy); dMz = vec_sub(iMz,dMz); rsqO = vec_madd(dOx,dOx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqM = vec_madd(dMx,dMx,nul); rsqO = vec_madd(dOy,dOy,rsqO); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqM = vec_madd(dMy,dMy,rsqM); rsqO = vec_madd(dOz,dOz,rsqO); rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); rsqM = vec_madd(dMz,dMz,rsqM); rinvO = do_invsqrt(rsqO); do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2); r = vec_madd(rsqO,rinvO,nul); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; tjc = ntiA+2*type[jnrc]; tjd = ntiA+2*type[jnrd]; /* load 4 j charges and multiply by iq */ jq=load_4_float(charge+jnra,charge+jnrb, charge+jnrc,charge+jnrd); load_4_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,vdwparam+tjd,&c6,&c12); do_vonly_4_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr); Vvdwtot = vec_madd(c6,VVd,Vvdwtot); Vvdwtot = vec_madd(c12,VVr,Vvdwtot); qqH = vec_madd(iqH,jq,nul); qqM = vec_madd(iqM,jq,nul); vctot = vec_madd(qqM,rinvM,vctot); vctot = vec_madd(qqH,rinvH1,vctot); vctot = vec_madd(qqH,rinvH2,vctot); } if(k<(nj1-2)) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = jjnr[k+2]; j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; transpose_4_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b), load_xyz(pos+j3c),nul,&dMx,&dMy,&dMz); dOx = vec_sub(iOx,dMx); dOy = vec_sub(iOy,dMy); dOz = vec_sub(iOz,dMz); dH1x = vec_sub(iH1x,dMx); dH1y = vec_sub(iH1y,dMy); dH1z = vec_sub(iH1z,dMz); dH2x = vec_sub(iH2x,dMx); dH2y = vec_sub(iH2y,dMy); dH2z = 
vec_sub(iH2z,dMz); dMx = vec_sub(iMx,dMx); dMy = vec_sub(iMy,dMy); dMz = vec_sub(iMz,dMz); rsqO = vec_madd(dOx,dOx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqM = vec_madd(dMx,dMx,nul); rsqO = vec_madd(dOy,dOy,rsqO); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqM = vec_madd(dMy,dMy,rsqM); rsqO = vec_madd(dOz,dOz,rsqO); rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); rsqM = vec_madd(dMz,dMz,rsqM); rinvO = do_invsqrt(rsqO); do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2); zero_highest_element_in_vector(&rinvO); zero_highest_element_in_vector(&rsqO); zero_highest_element_in_3_vectors(&rinvH1,&rinvH2,&rinvM); r = vec_madd(rsqO,rinvO,nul); jq=load_3_float(charge+jnra,charge+jnrb,charge+jnrc); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; tjc = ntiA+2*type[jnrc]; /* load 3 j charges and multiply by iq */ load_3_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,&c6,&c12); do_vonly_3_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr); Vvdwtot = vec_madd(c6,VVd,Vvdwtot); Vvdwtot = vec_madd(c12,VVr,Vvdwtot); qqH = vec_madd(iqH,jq,nul); qqM = vec_madd(iqM,jq,nul); vctot = vec_madd(qqM,rinvM,vctot); vctot = vec_madd(qqH,rinvH1,vctot); vctot = vec_madd(qqH,rinvH2,vctot); } else if(k<(nj1-1)) { jnra = jjnr[k]; jnrb = jjnr[k+1]; j3a = 3*jnra; j3b = 3*jnrb; transpose_2_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b),&dMx,&dMy,&dMz); dOx = vec_sub(iOx,dMx); dOy = vec_sub(iOy,dMy); dOz = vec_sub(iOz,dMz); dH1x = vec_sub(iH1x,dMx); dH1y = vec_sub(iH1y,dMy); dH1z = vec_sub(iH1z,dMz); dH2x = vec_sub(iH2x,dMx); dH2y = vec_sub(iH2y,dMy); dH2z = vec_sub(iH2z,dMz); dMx = vec_sub(iMx,dMx); dMy = vec_sub(iMy,dMy); dMz = vec_sub(iMz,dMz); rsqO = vec_madd(dOx,dOx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqM = vec_madd(dMx,dMx,nul); rsqO = vec_madd(dOy,dOy,rsqO); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqM = vec_madd(dMy,dMy,rsqM); rsqO = 
vec_madd(dOz,dOz,rsqO); rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); rsqM = vec_madd(dMz,dMz,rsqM); rinvO = do_invsqrt(rsqO); do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2); zero_highest_2_elements_in_vector(&rinvO); zero_highest_2_elements_in_vector(&rsqO); zero_highest_2_elements_in_3_vectors(&rinvM,&rinvH1,&rinvH2); r = vec_madd(rsqO,rinvO,nul); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; /* load 2 j charges and multiply by iq */ jq=load_2_float(charge+jnra,charge+jnrb); load_2_pair(vdwparam+tja,vdwparam+tjb,&c6,&c12); do_vonly_2_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr); Vvdwtot = vec_madd(c6,VVd,Vvdwtot); Vvdwtot = vec_madd(c12,VVr,Vvdwtot); qqH = vec_madd(iqH,jq,nul); qqM = vec_madd(iqM,jq,nul); vctot = vec_madd(qqM,rinvM,vctot); vctot = vec_madd(qqH,rinvH1,vctot); vctot = vec_madd(qqH,rinvH2,vctot); } else if(k<nj1) { jnra = jjnr[k]; j3a = 3*jnra; transpose_1_to_3(load_xyz(pos+j3a),&dMx,&dMy,&dMz); dOx = vec_sub(iOx,dMx); dOy = vec_sub(iOy,dMy); dOz = vec_sub(iOz,dMz); dH1x = vec_sub(iH1x,dMx); dH1y = vec_sub(iH1y,dMy); dH1z = vec_sub(iH1z,dMz); dH2x = vec_sub(iH2x,dMx); dH2y = vec_sub(iH2y,dMy); dH2z = vec_sub(iH2z,dMz); dMx = vec_sub(iMx,dMx); dMy = vec_sub(iMy,dMy); dMz = vec_sub(iMz,dMz); rsqO = vec_madd(dOx,dOx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqM = vec_madd(dMx,dMx,nul); rsqO = vec_madd(dOy,dOy,rsqO); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqM = vec_madd(dMy,dMy,rsqM); rsqO = vec_madd(dOz,dOz,rsqO); rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); rsqM = vec_madd(dMz,dMz,rsqM); rinvO = do_invsqrt(rsqO); do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2); zero_highest_3_elements_in_vector(&rinvO); zero_highest_3_elements_in_vector(&rsqO); zero_highest_3_elements_in_3_vectors(&rinvH1,&rinvH2,&rinvM); r = vec_madd(rsqO,rinvO,nul); jq=load_1_float(charge+jnra); tja = ntiA+2*type[jnra]; /* load 1 j charge and multiply 
by iq */ load_1_pair(vdwparam+tja,&c6,&c12); do_vonly_1_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr); Vvdwtot = vec_madd(c6,VVd,Vvdwtot); Vvdwtot = vec_madd(c12,VVr,Vvdwtot); qqH = vec_madd(iqH,jq,nul); qqM = vec_madd(iqM,jq,nul); vctot = vec_madd(qqM,rinvM,vctot); vctot = vec_madd(qqH,rinvH1,vctot); vctot = vec_madd(qqH,rinvH2,vctot); } /* update outer data */ add_vector_to_float(Vc+gid[n],vctot); add_vector_to_float(Vvdw+gid[n],Vvdwtot); ninner += nj1 - nj0; } #ifdef GMX_THREADS nouter += nn1 - nn0; } while (nn1<nri); #else nouter = nri; #endif *outeriter = nouter; *inneriter = ninner; }
void get_position_xyz(struct vector *p, const struct position *pos) { int i, j, br, bc, r, c; struct position bp; vec_zero(p); r = pos->square/BCOLS; c = pos->square%BCOLS; for (i = NUM_MAIN_BOARDS - 1; i >= 0; i--) { get_main_board_position(&bp, i); if (pos->level == bp.level) { br = bp.square/BCOLS; bc = bp.square%BCOLS; if (r >= br && r - br < MAIN_BOARD_SIZE && c >= bc && c - bc < MAIN_BOARD_SIZE) { struct vector board_center; get_main_board_xyz(&board_center, i); p->x = board_center.x - .5f*MAIN_BOARD_SIZE*SQUARE_SIZE + (r - br + .5f)*SQUARE_SIZE; p->y = board_center.y - .5f*MAIN_BOARD_SIZE*SQUARE_SIZE + (c - bc + .5f)*SQUARE_SIZE; p->z = board_center.z; break; } } for (j = 0; j < 8; j++) { get_attack_board_position(&bp, i, j); if (pos->level == bp.level) { br = bp.square/BCOLS; bc = bp.square%BCOLS; if (r >= br && r - br < ATTACK_BOARD_SIZE && c >= bc && c - bc < ATTACK_BOARD_SIZE) { struct vector board_center; get_attack_board_xyz(&board_center, i, j); p->x = board_center.x - .5f*ATTACK_BOARD_SIZE*SQUARE_SIZE + (r - br + .5f)*SQUARE_SIZE; p->y = board_center.y - .5f*ATTACK_BOARD_SIZE*SQUARE_SIZE + (c - bc + .5f)*SQUARE_SIZE; p->z = board_center.z; goto done; } } } } done: ; }
int main(int argc, char* argv[]) { FILE* F; MSA *msa; int *msa_gap_patterns = NULL; HMM *hmm = NULL; TreeNode *tree = NULL; int i, input_format = SS, msa_idx, quiet_mode = FALSE, ncats, nmsas, ncats_unspooled, indel_nseqs = -1; String *msa_fname, *gff_fname; List *gff_fname_list = NULL, *msa_fname_list = NULL, *msa_length_list = NULL, *model_indels_str = NULL; Matrix *traincounts = NULL; Vector *begcounts = NULL, *statecounts = NULL; CategoryMap *cm = NULL; char c; GapPatternMap *gpm = NULL; GFF_Set *gff; char *reverse_groups_tag = NULL; while ((c = getopt(argc, argv, "i:g:c:m:M:R:I:n:t:P:G:qh")) != -1) { switch(c) { case 'i': input_format = msa_str_to_format(optarg); if (input_format == -1) die("ERROR: bad alignment format.\n"); break; case 'g': gff_fname_list = get_arg_list(optarg); break; case 'c': cm = cm_new_string_or_file(optarg); break; case 'm': msa_fname_list = get_arg_list(optarg); break; case 'M': msa_length_list = str_list_as_int(get_arg_list(optarg)); break; case 'R': reverse_groups_tag = optarg; break; case 'I': model_indels_str = get_arg_list(optarg); break; case 'n': indel_nseqs = get_arg_int(optarg); break; case 't': if (optarg[0] == '(') /* in this case, assume topology given at command line */ tree = tr_new_from_string(optarg); else tree = tr_new_from_file(phast_fopen(optarg, "r")); break; case 'q': quiet_mode = TRUE; break; case 'h': print_usage(); exit(0); case '?': die("ERROR: unrecognized option.\n\nType 'hmm_train -h' for usage.\n"); } } if (msa_fname_list == NULL) die("ERROR: -m required. Type 'hmm_train -h' for usage.\n"); if (gff_fname_list == NULL) die("ERROR: -g required in training mode. Type 'hmm_train -h' for usage.\n"); if (msa_length_list != NULL && msa_fname_list != NULL) die("ERROR: -m and -M are mutually exclusive. Type 'hmm_train -h' for usage.\n"); if (model_indels_str != NULL && tree == NULL) die("ERROR: -I requires -t. 
Type 'hmm_train -h' for usage.\n"); if (cm == NULL) die("ERROR: category map required.\n"); set_seed(-1); ncats = cm->ncats + 1; ncats_unspooled = cm->unspooler != NULL ? cm->unspooler->nstates_unspooled : ncats; nmsas = (msa_length_list != NULL ? lst_size(msa_length_list) : lst_size(msa_fname_list)); if (model_indels_str != NULL) { if (tree == NULL) die("ERROR: tree is NULL\n"); /*FIXME: indel_ncats broken */ gpm = gp_create_gapcats(cm, model_indels_str, tree, FALSE); ncats = cm->ncats + 1; /* numbers will change */ ncats_unspooled = cm->unspooler == NULL ? ncats : cm->unspooler->nstates_unspooled; } /* allocate memory for storage of "training paths" */ traincounts = mat_new(ncats_unspooled, ncats_unspooled); statecounts = vec_new(ncats_unspooled); begcounts = vec_new(ncats_unspooled); mat_zero(traincounts); vec_zero(statecounts); vec_zero(begcounts); /* create skeleton of new HMM. */ hmm = hmm_new_nstates(ncats_unspooled, 0, 0); /* Main loop: consider each MSA in turn */ for (msa_idx = 0; msa_idx < nmsas; msa_idx++) { if (msa_fname_list != NULL) { msa_fname = (String*)lst_get_ptr(msa_fname_list, msa_idx); F = phast_fopen(msa_fname->chars, "r"); if (!quiet_mode) fprintf(stderr, "Reading alignment from %s ...\n", F == stdin ? 
"stdin" : msa_fname->chars); msa = msa_new_from_file(F, NULL); phast_fclose(F); } else { /* only lengths of alignments specified */ msa = msa_new(NULL, NULL, 0, lst_get_int(msa_length_list, msa_idx), NULL); /* just a shell in this case */ } gff_fname = (String*)lst_get_ptr(gff_fname_list, msa_idx); if (!quiet_mode) fprintf(stderr, "Reading annotations from %s ...\n", gff_fname->chars); gff = gff_read_set(phast_fopen(gff_fname->chars, "r")); /* convert GFF to coordinate frame of alignment */ if (msa_length_list == NULL) { if (!quiet_mode) fprintf(stderr, "Mapping annotations to alignment ...\n"); msa_map_gff_coords(msa, gff, 1, 0, 0); /* assume seq 1 is ref */ } if (model_indels_str != NULL) { if (!quiet_mode) fprintf(stderr, "Obtaining gap patterns ...\n"); msa_gap_patterns = smalloc(msa->length * sizeof(int)); gp_set_phylo_patterns(gpm, msa_gap_patterns, msa); } /* at this point, we don't actually need the alignment anymore; if using ordered suff stats (likely with large data sets), can free them now, to avoid running out of memory */ if (msa->ss != NULL) { ss_free(msa->ss); msa->ss = NULL; } if (reverse_groups_tag != NULL) { if (!quiet_mode) fprintf(stderr, "Reverse complementing features on negative strand (group by '%s') ...\n", reverse_groups_tag); /* we don't need to reverse complement the whole alignment -- just the gff and possibly the gap pattern array (pass a NULL msa) */ gff_group(gff, reverse_groups_tag); msa_reverse_compl_feats(NULL, gff, msa_gap_patterns); } if (!quiet_mode) fprintf(stderr, "Labeling sites by category ...\n"); msa_label_categories(msa, gff, cm); gff_free_set(gff); if (model_indels_str != NULL) { if (!quiet_mode) fprintf(stderr, "Remapping categories according to gap patterns ...\n"); if (indel_nseqs > 0 && indel_nseqs != msa->nseqs) { /* in this case, we'll simply reassign non-trivial gap patterns randomly. 
This will achieve the desired effect with minimal coding, as long as the number of sites is not too small (the indel model is probably useless anyway if the number is small) */ int pat, newpat; int npatterns = 4 * indel_nseqs - 5; int complex_allowed[cm->ncats+1]; List *no_complex_names, *no_complex_nums; if (!quiet_mode) fprintf(stderr, "(target number of sequences: %d)\n", indel_nseqs); /* set up index indicating by cat no. whether complex gaps are allowed */ for (i = 0; i < ncats; i++) complex_allowed[i] = 1; no_complex_names = lst_new_ptr(10); str_split(str_new_charstr(NO_COMPLEX), ",", no_complex_names); no_complex_nums = cm_get_category_list(cm, no_complex_names, 1); for (i = 0; i < lst_size(no_complex_nums); i++) complex_allowed[lst_get_int(no_complex_nums, i)] = 0; lst_free(no_complex_nums); lst_free_strings(no_complex_names); lst_free(no_complex_names); /* now reassign all non-null numbers */ for (i = 0; i < msa->length; ) { if ((pat = msa_gap_patterns[i]) != 0) { if (complex_allowed[msa->categories[i]]) newpat = 1 + ((double)npatterns * unif_rand()); /* random number in interval [1, npatterns] */ else newpat = 1 + ((double)(npatterns-1) * unif_rand()); /* random number in interval [1,npatterns-1] (excludes complex gap pattern) */ for (; i < msa->length && msa_gap_patterns[i] == pat; i++) msa_gap_patterns[i] = newpat; /* change for whole sequence */ } else i++; } } /* obtain gapped category number for each site */ for (i = 0; i < msa->length; i++) if (gpm->cat_x_pattern_to_gapcat[msa->categories[i]] != NULL) msa->categories[i] = gpm->cat_x_pattern_to_gapcat[msa->categories[i]][msa_gap_patterns[i]]; } if (!quiet_mode) fprintf(stderr, "Unspooling categories ...\n"); cm_spooled_to_unspooled(cm, msa->categories, msa->length); if (!quiet_mode) fprintf(stderr, "Collecting training data ...\n"); hmm_train_update_counts(traincounts, statecounts, begcounts, msa->categories, msa->length, ncats_unspooled); if (msa_gap_patterns != NULL) sfree(msa_gap_patterns); 
msa_free(msa); } /* now train HMM, using cumulative data */ hmm_train_from_counts(hmm, traincounts, NULL, statecounts, NULL, begcounts, NULL); /* if modeling indels, adjust begin transitions so probability is distributed among different "gap pattern" states that all correspond to the same ungapped state (category); this helps avoid problems that occur when training on a few large sequences (e.g., whole chromosomes) and then testing on many shorter ones */ if (model_indels_str != NULL) { double tprob[gpm->ncats]; int nst[gpm->ncats]; /* total prob and number of states per spooled, ungapped category */ for (i = 0; i < gpm->ncats; i++) tprob[i] = nst[i] = 0; for (i = 0; i < hmm->nstates; i++) { if (vec_get(hmm->begin_transitions, i) > 0) /* have to go from unspooled space to spooled space, then to ungapped space (HMM states correspond to unspooled, gapped categories). Note that states with nonzero begin probs shouldn't be conditioned on other states. */ tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] += vec_get(hmm->begin_transitions, i); nst[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]]++; } for (i = 0; i < hmm->nstates; i++) if (tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] > 0) vec_set(hmm->begin_transitions, i, tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] / nst[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]]); /* (uniform prior) */ } /* write trained HMM */ hmm_print(stdout, hmm); if (!quiet_mode) fprintf(stderr, "Done.\n"); return 0; }
void nb_kernel100nf_ppc_altivec(int * p_nri, int iinr[], int jindex[], int jjnr[], int shift[], float shiftvec[], float fshift[], int gid[], float pos[], float faction[], float charge[], float * p_facel, float * p_krf, float * p_crf, float Vc[], int type[], int * p_ntype, float vdwparam[], float Vvdw[], float * p_tabscale, float VFtab[], float invsqrta[], float dvda[], float * p_gbtabscale, float GBtab[], int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, float * work) { vector float ix,iy,iz,shvec; vector float vfacel,nul; vector float dx,dy,dz; vector float vctot,qq,iq; vector float rinv,rsq; int n,k,ii,is3,ii3,nj0,nj1; int jnra,jnrb,jnrc,jnrd; int j3a,j3b,j3c,j3d; int nri, ntype, nouter, ninner; #ifdef GMX_THREADS int nn0, nn1; #endif nouter = 0; ninner = 0; nri = *p_nri; ntype = *p_ntype; nul=vec_zero(); vfacel=load_float_and_splat(p_facel); #ifdef GMX_THREADS nthreads = *p_nthreads; do { gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx); nn0 = *count; nn1 = nn0+(nri-nn0)/(2*nthreads)+3; *count = nn1; gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx); if(nn1>nri) nn1=nri; for(n=nn0; (n<nn1); n++) { #if 0 } /* maintain correct indentation even with conditional left braces */ #endif #else /* without gmx_threads */ for(n=0;n<nri;n++) { #endif is3 = 3*shift[n]; shvec = load_xyz(shiftvec+is3); ii = iinr[n]; ii3 = 3*ii; ix = load_xyz(pos+ii3); vctot = nul; ix = vec_add(ix,shvec); nj0 = jindex[n]; nj1 = jindex[n+1]; splat_xyz_to_vectors(ix,&ix,&iy,&iz); iq = vec_madd(load_float_and_splat(charge+ii),vfacel,nul); for(k=nj0; k<(nj1-3); k+=4) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = jjnr[k+2]; jnrd = jjnr[k+3]; j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; j3d = 3*jnrd; transpose_4_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b), load_xyz(pos+j3c), load_xyz(pos+j3d),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); rinv = do_invsqrt(rsq); /* load 4 
j charges and multiply by iq */ qq = vec_madd(load_4_float(charge+jnra,charge+jnrb, charge+jnrc,charge+jnrd),iq,nul); vctot = vec_madd(qq,rinv,vctot); } if(k<(nj1-1)) { jnra = jjnr[k]; jnrb = jjnr[k+1]; j3a = 3*jnra; j3b = 3*jnrb; transpose_2_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); rinv = do_invsqrt(rsq); zero_highest_2_elements_in_vector(&rinv); /* load 2 j charges and multiply by iq */ qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul); vctot = vec_madd(qq,rinv,vctot); k += 2; } if((nj1-nj0) & 0x1) { jnra = jjnr[k]; j3a = 3*jnra; transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); rinv = do_invsqrt(rsq); zero_highest_3_elements_in_vector(&rinv); /* load 1 j charge and multiply by iq */ qq = vec_madd(load_1_float(charge+jnra),iq,nul); vctot = vec_madd(qq,rinv,vctot); } /* update outer data */ add_vector_to_float(Vc+gid[n],vctot); ninner += nj1 - nj0; } #ifdef GMX_THREADS nouter += nn1 - nn0; } while (nn1<nri); #else nouter = nri; #endif *outeriter = nouter; *inneriter = ninner; }