Ejemplo n.º 1
0
void Timestep::get_transform_vectors(float A[3], float B[3], float C[3]) const
{
  // notes: a, b, c are side lengths of the unit cell
  // alpha = angle between b and c
  //  beta = angle between a and c
  // gamma = angle between a and b

  // convert from degrees to radians
  double cosBC = cos(DEGTORAD(alpha));
  double cosAC = cos(DEGTORAD(beta));
  double cosAB = cos(DEGTORAD(gamma));
  double sinAB = sin(DEGTORAD(gamma));

  // A will lie along the positive x axis.
  // B will lie in the x-y plane
  // The origin will be (0,0,0).
  float Ax = (float) (a_length);
  float Bx = (float) (b_length * cosAB);
  float By = (float) (b_length * sinAB);

  float Cx=0, Cy=0, Cz=0;
  // If sinAB is zero, then we can't determine C uniquely since it's defined
  // in terms of the angle between A and B.
  if (sinAB > 0) {
    Cx = (float) cosAC;
    Cy = (float) ((cosBC - cosAC * cosAB) / sinAB);
    Cz = sqrtf(1.0f - Cx*Cx - Cy*Cy);
  }
  Cx *= c_length;
  Cy *= c_length;
  Cz *= c_length;
  vec_zero(A); A[0] = Ax;
  vec_zero(B); B[0] = Bx; B[1] = By;
  vec_zero(C); C[0] = Cx; C[1] = Cy; C[2] = Cz;
}
Ejemplo n.º 2
0
vec_t box_oriented_collision(box_t a, box_t b){
	vec_t d1;
	vec_t d2;
	d1 = box_o_o_collision(a,b);
	if(vec_zero(d1)){
		return d1;
	}else{
		d2 = vec_inv(box_o_o_collision(b,a));
		if(vec_zero(d2)){
			return d2;
		}else{
			return vec_smallest(d1,d2);
		}
	}
}
Ejemplo n.º 3
0
void
mesh_init_vertex_normals(struct mesh *mesh)
{
	struct polygon *p, *poly_end;
	struct vertex *v, *vtx_end;
	struct vector *poly_normal;
	int *idx;
	int nvtx;

	vtx_end = &mesh->vtx[mesh->nvtx];

	for (v = mesh->vtx; v != vtx_end; v++)
		vec_zero(&v->normal);

	poly_end = &mesh->poly[mesh->npoly];

	for (p = mesh->poly; p != poly_end; p++) {
		poly_normal = &p->normal;
		idx = p->vtx_index;
		nvtx = p->nvtx;

		while (nvtx--)
			vec_add_to(&mesh->vtx[*idx++].normal, poly_normal);
	}

	for (v = mesh->vtx; v != vtx_end; v++)
		vec_normalize(&v->normal);
}
Ejemplo n.º 4
0
void
ui_set_modelview_matrix(struct matrix *mv)
{
	struct matrix ry, rx, rm;
	struct vector p, o;
	struct matrix c, t, r, v;

	mat_make_rotation_around_y(&ry, ui.rotation);
	mat_make_rotation_around_x(&rx, ui.tilt);
	mat_mul_copy(&rm, &ry, &rx);

	vec_zero(&o); /* look at */

	vec_set(&p, 0.f, 0.f, -ui.distance); /* camera position */
	mat_rotate(&p, &rm);

	mat_make_look_at(&c, &o, &p);

	vec_neg(&p);
	mat_make_translation_from_vec(&t, &p);
	mat_mul_copy(&v, &c, &t);

	mat_make_rotation_around_x(&r, -M_PI/2.);

	mat_mul_copy(mv, &v, &r);
}
Ejemplo n.º 5
0
vec_t vec_normalize(vec_t a){
	if (vec_zero(a)){
		return a;
	}else{
		return vec_scale(a,1.0/vec_len(a));
	}
}
Ejemplo n.º 6
0
void hotcold_gradient_lerp(float pucker_sum, float *rgb) {
  vec_zero(rgb); // set default color to black

  // hot to cold color map
  // Red (1, 0, 0) -> Yellow (1, 1, 0) -> Green (0, 1, 0) -> Cyan (0, 1, 1) -> blue (0, 0, 1)
  float     red[3] = {1.0f, 0.0f, 0.0f};
  float  yellow[3] = {1.0f, 1.0f, 0.0f};
  float yellow2[3] = {0.8f, 1.0f, 0.0f};
  float   green[3] = {0.0f, 1.0f, 0.0f};
  float  green2[3] = {0.6f, 1.0f, 0.0f};
  float    cyan[3] = {0.0f, 1.0f, 1.0f};
  float   cyan2[3] = {0.0f, 1.0f, 0.8f};
  float    blue[3] = {0.0f, 0.0f, 1.0f};

  if (pucker_sum < 0.25f) {
    lerp_color_range(rgb, pucker_sum, 0.00f, 0.25f, red, yellow);
  } else if (pucker_sum < 0.45f) {
    vec_copy(rgb, yellow);
  } else if (pucker_sum < 0.55f) {
    lerp_color_range(rgb, pucker_sum, 0.45f, 0.55f, yellow, green2);
  } else if (pucker_sum < 0.75f) {
    lerp_color_range(rgb, pucker_sum, 0.55f, 0.75f, green, cyan2);
  } else {
    lerp_color_range(rgb, pucker_sum, 0.75f, 1.00f, cyan, blue);
  }

  clamp_color(rgb); // clamp color values to legal range
}
Ejemplo n.º 7
0
void hotcold_gradient(float pucker_sum, float *rgb) {
  vec_zero(rgb); // set default color to black

  // hot to cold color map
  // Red (1, 0, 0) -> Yellow (1, 1, 0) -> Green (0, 1, 0) -> Cyan (0, 1, 1) -> blue (0, 0, 1) -> magenta (1, 0, 1)
  if (pucker_sum < 0.40f) {  //MK - envelopes here
    rgb[0] = 1.0f;  // red
    rgb[1] = pucker_sum * 2.5f;  // MK from red increasing green -> yellow -  adjusted multiplier for large range
    rgb[2] = 0.0f;
  } else if (pucker_sum < 0.56f) {
    rgb[0] = 1.0f - (pucker_sum - 0.40f) * 6.25f; // from Yellow, decrease red -> green adjusted multiplier for small range
    rgb[1] = 1.0f;
    rgb[2] = 0.0f;
  } else if (pucker_sum < 0.64f) {
    rgb[0] = 0.0f;
    rgb[1] = 1.0f; //green
    rgb[2] = (pucker_sum - 0.56f) * 12.5f; // from green, increasing blue ->  cyan,  adjusted multiplier for small range
  } else if (pucker_sum < 0.76f) {
    rgb[0] = 0.0f;
    rgb[1] = 1.0f - (pucker_sum - 0.64f) * 5.0f; // from cyan, decrease green -> blue, adjusted multiplier for small range
    rgb[2] = 1.0f;
  } else {
    rgb[0] = (pucker_sum - 0.76f) * 0.8f; // from blue, increase red to get magenta, adjusted multiplier for very large range
    rgb[1] = 0.0f;
    rgb[2] = 1.0f;
  }

  clamp_color(rgb); // clamp color values to legal range
}
Ejemplo n.º 8
0
/* Transpose matrix B to both:
 *
 * - increase cache hits
 * - simd GCC vector extensions which is made possible.
 *   by the transposition, to increase likelyhood of SIMDs.
 *
 * Note that GCC 6 O=3 is smart enough to use SIMD
 * even for the naive CPU method. However this was still way faster.
 * */
void mat_mul_cpu_trans_vec(const F *A, const F *B, F *C, size_t n, Cache *cache) {
    F tmpf;
    size_t i, j, k, k_max, ai, bi;
    Vec tmp, a, b;
    UNUSED(cache);

    mat_trans((F*)B, n);
    k_max = (n / VECTOR_NELEMS) * VECTOR_NELEMS;
    for (i = 0; i < n; ++i) {
        for (j = 0; j < n; ++j) {
            vec_zero(&tmp, VECTOR_NELEMS);
            for (k = 0; k < k_max; k += VECTOR_NELEMS) {
                ai = i * n + k;
                bi = j * n + k;
                vec_load(&a, VECTOR_NELEMS, A, ai);
                vec_load(&b, VECTOR_NELEMS, B, bi);
                tmp += a * b;
            }
            tmpf = 0.0;
            for (; k < n; ++k) {
                tmpf += A[i*n+k] * B[j*n+k];
            }
            C[i*n+j] = vec_sum(tmp, VECTOR_NELEMS) + tmpf;
        }
    }
    mat_trans((F*)B, n);
}
Ejemplo n.º 9
0
// Calculate Cremer-Pople Pucker Parameters and convert these to a ring colour
void cremer_pople_ring_color(SmallRing &ring, float *framepos, float *rgb) {
  int N = ring.num(); //the number of atoms in the current ring
  float *xring = new float[N];
  float *yring = new float[N];
  float *zring = new float[N];
  float *displ = new float[N];
  float *q = new float[N];
  float *phi = new float[N];
  float Q;
  int m;
  float *atompos;
  int curatomid;

  vec_zero(rgb); // set default color to black

  for (int i=0; i<N; i++) {
    curatomid = ring[i];
    atompos = framepos + 3*curatomid; // pointer arithmetic is evil :)
    xring[i] = atompos[0];
    yring[i] = atompos[1];
    zring[i] = atompos[2];
  }     

  atom_displ_from_mean_plane(xring, yring, zring, displ, N);
         
  if (N==6) { //special case - pyranose rings
    if (cremer_pople_params(N, displ, q, phi, m, Q)) {
      float cosTheta = q[2]/Q;
      float theta = acosf(cosTheta); 
      float sinTheta = sinf(theta);

      // Q is the puckering amplitude - i.e. the intensity of the pucker.
      // multiply by Q to show intensity, particularly for rings with 
      // little pucker (black)
      // NOTE -using abs - polar positions therefore equivalent
      float intensity = Q;
      
      rgb[0] = fabsf(sinTheta)*intensity;
      rgb[1] = fabsf(cosTheta)*intensity;
      rgb[2] = fabsf(sinf(3.0f*phi[1])*sinTheta)*intensity;
    }
  } else if (N==5) { //special case - furanose rings
    if (cremer_pople_params(N, displ, q, phi, m, Q)) {
      rgb[0] = 0;
      rgb[1] = 0;
      rgb[2] = Q;
    }
  }

  // clamp color values to legal range
  clamp_color(rgb);
 
  delete [] xring;
  delete [] yring;
  delete [] zring;
  delete [] displ;
  delete [] q;
  delete [] phi;
}
Ejemplo n.º 10
0
// call this function from a level to enable the free camera movement
void def_move()
{
	VECTOR force,speed,dist;
	ANGLE aforce,aspeed; 

	// initialize speed and distance
	vec_zero(speed);
	vec_zero(aspeed);
	vec_zero(dist);

	if (1 > def_camera)
	def_camera = 1;
	if (1 < run_mode && run_mode < 5) 
	def_camera = 2;	// prevent player movement in entity viewer mode

	while (def_camera) 
	{
		aforce.tilt = 5*(key_pgup - key_pgdn + mouse_right*mouse_force.y);
		if (key_alt==0) {
			aforce.pan = -5*(key_force.x + mouse_right*mouse_force.x + joy_force.x);
			aforce.roll = 0;
			} else {
			aforce.pan = 0;
			aforce.roll = 5*(key_force.x + mouse_right*mouse_force.x + joy_force.x);
		}
		vec_add(&camera->pan,vec_accelerate(&dist,&aspeed,&aforce,0.8));

		force.x = 7*(key_force.y + key_w - key_s + joy_force.y);
		force.y = 3*(key_comma - key_period + key_a - key_d);
		force.z = 3*(key_home - key_end);
		vec_accelerate(&dist,&speed,&force,0.5);
		
		if (NULL != player && 1 == def_camera) {
			c_move(player,&dist,nullvector,IGNORE_PASSABLE|IGNORE_PASSENTS|GLIDE);
			camera->genius = player;
			vec_set(&player->pan,&camera->pan);
			vec_set(&camera->x,nullvector);
			vec_rotate(&camera->x,&camera->pan);
			vec_add(&camera->x,&player->x);
			} else {
			camera->genius = NULL;
			vec_add(&camera->x,vec_rotate(&dist,&camera->pan));
		}
		wait(1);
	}
}
Ejemplo n.º 11
0
int BSpline::getParamPoint(float t,float outp[3]) const
{
	if (t<V[0] || t>V[1]) {
		vec_zero(outp);
		return 0;
	}

	float *b;
	b=new float[K+1];
	int i,j;

	float *btemp;

	btemp=new float[1+K+M];
	for (i=0; i<=K+M; i++) {
		if (t>=T[i] && t<=T[i+1]) btemp[i]=1;
		else if (T[i]==T[i+1] && t==T[i]) btemp[i]=1;
		else btemp[i]=0;
	}

	for (j=1; j<=M; j++) {
		for (i=0; i<=K+M-j; i++) {
			float c=0;
			if (T[i+j]!=T[i]) {
				c=btemp[i]*(t-T[i])/(T[i+j]-T[i]);
			}
			if (T[i+j+1]!=T[i+1]) {
				c+= btemp[i+1]*(T[i+j+1]-t)/(T[i+j+1]-T[i+1]);
			}
			btemp[i]=c;
		}
	}

	for (i=0; i<=K; i++) {
		b[i]=btemp[i];
	}
	

	delete []btemp;



	float denom=0;
	outp[0]=0; outp[1]=0; outp[2]=0;
	for (i=0; i<=K; i++) {
		float r=W[i]*b[i];
		outp[0]+= P[i][0]*r;
		outp[1]+= P[i][1]*r;
		outp[2]+= P[i][2]*r;
		denom+=r;
	}
	outp[0]/=denom;
	outp[1]/=denom;
	outp[2]/=denom;
	delete []b;

	return 1;
}
Ejemplo n.º 12
0
static void
init_vars(void) 
{
  if (firsttime) {
    firsttime = FALSE;
    ivec_zero(cstatus);
    vec_zero(cart);
    bzero((char *)&(ctarget[1]),n_endeffs*sizeof(ctarget[1]));
  }
}
Ejemplo n.º 13
0
//-----------------------------------------------------------------------------------------------------------
// View <-> Sphere BUGGY
//-----------------------------------------------------------------------------------------------------------
int sc_physics_intersectViewSphere(VIEW* inView,VECTOR* inPos,var inRadius)
{
	
	VECTOR vcTemp1,vcTemp2,vcTemp3;
	vec_zero(vcTemp1);
	vec_zero(vcTemp2);
	vec_zero(vcTemp3);
	
	var vTemp1=0;
	var vTemp2=0;
	var vTemp3=0;
	
	vec_for_angle(vcTemp1,inView.pan);
	vec_diff(vcTemp2,inPos,inView.x);
	
	vTemp1=vec_dot(vcTemp1,vcTemp2);
	
	vec_set(vcTemp3,inView.x);
	vec_scale(vcTemp1,vTemp1);
	vec_add(vcTemp3,vcTemp1);
	
	vec_diff(vcTemp1,inPos,vcTemp3);
	vTemp2=vec_length(vcTemp1);
	
	vTemp1=tanv(atanv( sqrt(2) * tanv(inView.arc/2) ))*vTemp1;
	
	return (vTemp2-vTemp1>inRadius);
	
	
	/*
	VECTOR viewDir;
	vec_for_angle(viewDir, inView.pan);
	//vec_scale(viewDir, 5);
	return sc_physics_intersectConeSphere (inView.x, viewDir, inView.arc, inPos, inRadius );
	*/
}
Ejemplo n.º 14
0
/* Compute the first and (optionally) second derivatives with respect
   to the scale parameters for the single-feature log likelihood
   function (feat_compute_log_likelihood).  This version assumes scale
   parameters for the whole tree and for the subtree.  Return value is
   log likelihood, which is computed as a by-product.  Derivs will be
   stored in *gradient and *hessian.  If hessian == NULL,
   it will not be computed (saves some time).  */
double ff_scale_derivs_subtree(FeatFitData *d, Vector *gradient,
                               Matrix *hessian, double ***scratch) {
    double retval = 0;
    Vector *d1 = vec_new(2);
    Matrix *d2 = (hessian == NULL ? NULL : mat_new(2, 2));
    int i;
    vec_zero(gradient);
    if (hessian != NULL) mat_zero(hessian);
    for (i = d->feat->start-1; i < d->feat->end; i++) { /* offset of one */
        d->cdata->tupleidx = d->cdata->msa->ss->tuple_idx[i];
        retval += col_scale_derivs_subtree(d->cdata, d1, d2, scratch);
        vec_plus_eq(gradient, d1);
        if (hessian != NULL) mat_plus_eq(hessian, d2);
    }
    return retval;
}
Ejemplo n.º 15
0
Archivo: sys.c Proyecto: ilyak/vimol
vec_t
sys_get_sel_center(struct sys *sys, struct sel *sel)
{
	vec_t center, xyz;
	int idx;

	center = vec_zero();

	if (sel_get_count(sel) == 0)
		return (center);

	sel_iter_start(sel);

	while (sel_iter_next(sel, &idx)) {
		xyz = sys_get_atom_xyz(sys, idx);
		center = vec_add(&center, &xyz);
	}

	vec_scale(&center, 1.0 / sel_get_count(sel));

	return (center);
}
Ejemplo n.º 16
0
void do_physics(world_t *w){
	int i = 0;
	particle_t *p = NULL;
	int j = 0;
	particle_t *q = NULL;
	box_t b = box_new(vec_new(0,0),10,10);
	vec_t d = vec_new(0,0);
	while((p = world_next_moving(w,&i))){
		j = 0;
		p->move(p);
		if(particle_collides(p)){
			while((q = world_next_solid(w,b,&j))){
				if(	   q != p 
					&& particle_collide_with(q,p) 
					&& !vec_zero(d = box_oriented_collision(p->box,q->box)) ){
					/*printf("collide\n");*/
					/*d = box_oriented_collision(p->box,q->box);*/
					if(d.x != 0.0){
						p->v.x = - p->v.x * 0.5 ;
					}else{
						p->v.x *= 0.3;
					}
					if(d.y != 0.0){
						p->v.y = - p->v.y * 0.5 ;
					}else{
						p->v.x *= 0.3;
					}
					if(p->collide){
						p->collide(p,q);
					}
					p->box.pos = vec_add(p->box.pos,d);
				}
			}
		}
	}
	/*TODO collisions and damage*/
}
Ejemplo n.º 17
0
int main(int argc, char **argv)
{	
	printf("N=%d, CL_DEVICE=%d\n", N, CL_DEVICE);
	steps = 0;
	totalTime = 0;

	// distribute bodies in space (randomly)
	for(int i=0; i<N; i++) {
		B[i].m = rand_val(8,20); // (8,20)
		B[i].pos = vec_rand();
		B[i].v   = vec_zero();
	}

	// ocl initialization
	init_ocl();

	// ogl stuff
	init_gl(&argc, argv);
	glutDisplayFunc(&display);
	atexit(&exit_cb);
	glutMainLoop();
	
	return EXIT_SUCCESS;
}
void 
nb_kernel310_ppc_altivec  (int *             p_nri,
                       int               iinr[],
                       int               jindex[],
                       int               jjnr[],
                       int               shift[],
                       float             shiftvec[],
                       float             fshift[],
                       int               gid[],
                       float             pos[],
                       float             faction[],
                       float             charge[],
                       float *           p_facel,
                       float *           p_krf,
                       float *           p_crf,
                       float             Vc[],
                       int               type[],
                       int *             p_ntype,
                       float             vdwparam[],
                       float             Vvdw[],
                       float *           p_tabscale,
                       float             VFtab[],
                       float             invsqrta[],
                       float             dvda[],
                       float *           p_gbtabscale,
                       float             GBtab[],
                       int *             p_nthreads,
                       int *             count,
                       void *            mtx,
                       int *             outeriter,
                       int *             inneriter,
					   float *           work)
{
	vector float ix,iy,iz,shvec;
	vector float vfacel,tsc,fs,fs2,nul;
	vector float dx,dy,dz;
	vector float Vvdwtot,vctot,qq,iq,c6,c12,VVc,FFc;
	vector float fix,fiy,fiz;
	vector float tmp1,tmp2,tmp3,tmp4;
	vector float rinv,r,rinvsq,rsq,rinvsix,Vvdw6,Vvdw12;

	int n,k,ii,is3,ii3,ntiA,nj0,nj1;
	int jnra,jnrb,jnrc,jnrd;
	int j3a,j3b,j3c,j3d;
	int nri, ntype, nouter, ninner;
	int tja,tjb,tjc,tjd;
#ifdef GMX_THREADS
	int nn0, nn1;
#endif

    nouter   = 0;
    ninner   = 0;
    nri      = *p_nri;
    ntype    = *p_ntype;
	nul=vec_zero();
	vfacel=load_float_and_splat(p_facel);
	tsc=load_float_and_splat(p_tabscale);

#ifdef GMX_THREADS
    nthreads = *p_nthreads;
	do {
		gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx);
		nn0              = *count;
		nn1              = nn0+(nri-nn0)/(2*nthreads)+3;
		*count           = nn1;
		gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx);
		if(nn1>nri) nn1=nri;
		for(n=nn0; (n<nn1); n++) {
#if 0
		} /* maintain correct indentation even with conditional left braces */
#endif
#else /* without gmx_threads */
		for(n=0;n<nri;n++) {
#endif  
			is3        = 3*shift[n];
			shvec      = load_xyz(shiftvec+is3);
			ii         = iinr[n];
			ii3        = 3*ii;
			ix         = load_xyz(pos+ii3);
			Vvdwtot     = nul;
			vctot      = nul;
			fix        = nul;
			fiy        = nul;
			fiz        = nul;
			ix         = vec_add(ix,shvec);
			nj0        = jindex[n];
			nj1        = jindex[n+1];
			splat_xyz_to_vectors(ix,&ix,&iy,&iz);
			ntiA       = 2*ntype*type[ii];
			iq        = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);

			for(k=nj0; k<(nj1-3); k+=4) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				jnrc            = jjnr[k+2];
				jnrd            = jjnr[k+3];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				j3c             = 3*jnrc;
				j3d             = 3*jnrd;
				transpose_4_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),
								 load_xyz(pos+j3c),
								 load_xyz(pos+j3d),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				rinv            = do_invsqrt(rsq);
				rinvsq          = vec_madd(rinv,rinv,nul);
				r               = vec_madd(rinv,rsq,nul);
				rinvsix         = vec_madd(rinvsq,rinvsq,nul);
				rinvsix         = vec_madd(rinvsix,rinvsq,nul);
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				tjc             = ntiA+2*type[jnrc];
				tjd             = ntiA+2*type[jnrd];
				qq = vec_madd(load_4_float(charge+jnra,charge+jnrb,
										   charge+jnrc,charge+jnrd),iq,nul);
				load_4_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,vdwparam+tjd,&c6,&c12);
				do_4_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc);
				fs2             = vec_madd(qq,FFc,nul);   /* fijC */
				vctot           = vec_madd(qq,VVc,vctot);
				Vvdw6            = vec_madd(c6,rinvsix,nul);
				Vvdw12           = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),
										   nul);
				fs              = vec_madd(vec_twelve(),Vvdw12,nul);
				fs              = vec_nmsub(vec_six(),Vvdw6,fs);
				fs              = vec_madd(fs,rinv,nul);
				Vvdwtot          = vec_add(Vvdwtot,Vvdw12);
				fs              = vec_nmsub(fs2,tsc,fs);
				fs              = vec_madd(fs,rinv,nul);
				Vvdwtot          = vec_sub(Vvdwtot,Vvdw6);
				fix             = vec_madd(fs,dx,fix); /* +=fx */
				fiy             = vec_madd(fs,dy,fiy); /* +=fy */
				fiz             = vec_madd(fs,dz,fiz); /* +=fz */
				dx              = vec_nmsub(dx,fs,nul); /* -fx */
				dy              = vec_nmsub(dy,fs,nul); /* -fy */
				dz              = vec_nmsub(dz,fs,nul); /* -fz */
				transpose_3_to_4(dx,dy,dz,&tmp1,&tmp2,&tmp3,&tmp4);
				add_xyz_to_mem(faction+j3a,tmp1);
				add_xyz_to_mem(faction+j3b,tmp2);
				add_xyz_to_mem(faction+j3c,tmp3);
				add_xyz_to_mem(faction+j3d,tmp4);
			}
			if(k<(nj1-1)) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				transpose_2_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				zero_highest_2_elements_in_vector(&rsq);
				rinv            = do_invsqrt(rsq);
				zero_highest_2_elements_in_vector(&rinv);
				rinvsq          = vec_madd(rinv,rinv,nul);
				r               = vec_madd(rinv,rsq,nul);
				rinvsix         = vec_madd(rinvsq,rinvsq,nul);
				rinvsix         = vec_madd(rinvsix,rinvsq,nul);
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul);
				load_2_pair(vdwparam+tja,vdwparam+tjb,&c6,&c12);
				do_2_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc);
				fs2             = vec_madd(qq,FFc,nul);   /* fijC */
				vctot           = vec_madd(qq,VVc,vctot);
				Vvdw6            = vec_madd(c6,rinvsix,nul);
				Vvdw12           = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),
										   nul);
				fs              = vec_madd(vec_twelve(),Vvdw12,nul);
				fs              = vec_nmsub(vec_six(),Vvdw6,fs);
				Vvdwtot          = vec_add(Vvdwtot,Vvdw12);
				fs              = vec_madd(fs,rinv,nul);
				fs              = vec_nmsub(fs2,tsc,fs);
				fs              = vec_madd(fs,rinv,nul);
				Vvdwtot          = vec_sub(Vvdwtot,Vvdw6);
				fix             = vec_madd(fs,dx,fix); /* +=fx */
				fiy             = vec_madd(fs,dy,fiy); /* +=fy */
				fiz             = vec_madd(fs,dz,fiz); /* +=fz */
				dx              = vec_nmsub(dx,fs,nul); /* -fx */
				dy              = vec_nmsub(dy,fs,nul); /* -fy */
				dz              = vec_nmsub(dz,fs,nul); /* -fz */
				transpose_3_to_2(dx,dy,dz,&tmp1,&tmp2);
				add_xyz_to_mem(faction+j3a,tmp1);
				add_xyz_to_mem(faction+j3b,tmp2);
				k              += 2;
			}
			if((nj1-nj0) & 0x1) {
				jnra            = jjnr[k];
				j3a             = 3*jnra;
				transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				zero_highest_3_elements_in_vector(&rsq);
				rinv            = do_invsqrt(rsq);
				zero_highest_3_elements_in_vector(&rinv);
				rinvsq          = vec_madd(rinv,rinv,nul);
				r               = vec_madd(rinv,rsq,nul);
				rinvsix         = vec_madd(rinvsq,rinvsq,nul);
				rinvsix         = vec_madd(rinvsix,rinvsq,nul);
				tja             = ntiA+2*type[jnra];
				qq = vec_madd(load_1_float(charge+jnra),iq,nul);
				load_1_pair(vdwparam+tja,&c6,&c12);
				do_1_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc);
				fs2             = vec_madd(qq,FFc,nul);   /* fijC */
				vctot           = vec_madd(qq,VVc,vctot);
				Vvdw6            = vec_madd(c6,rinvsix,nul);
				Vvdw12           = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),
										   nul);
				fs              = vec_madd(vec_twelve(),Vvdw12,nul);
				fs              = vec_nmsub(vec_six(),Vvdw6,fs);
				fs              = vec_madd(fs,rinv,nul);
				Vvdwtot          = vec_add(Vvdwtot,Vvdw12);
				fs              = vec_nmsub(fs2,tsc,fs);
				fs              = vec_madd(fs,rinv,nul);
				Vvdwtot          = vec_sub(Vvdwtot,Vvdw6);
				fix             = vec_madd(fs,dx,fix); /* +=fx */
				fiy             = vec_madd(fs,dy,fiy); /* +=fy */
				fiz             = vec_madd(fs,dz,fiz); /* +=fz */
				dx              = vec_nmsub(dx,fs,nul); /* -fx */
				dy              = vec_nmsub(dy,fs,nul); /* -fy */
				dz              = vec_nmsub(dz,fs,nul); /* -fz */
				transpose_3_to_1(dx,dy,dz,&tmp1);
				add_xyz_to_mem(faction+j3a,tmp1);
			}
			/* update outer data */
			transpose_3_to_4(fix,fiy,fiz,&tmp1,&tmp2,&tmp3,&tmp4);
			tmp1 = vec_add(tmp1,tmp3);
			tmp2 = vec_add(tmp2,tmp4);
			tmp1 = vec_add(tmp1,tmp2);

			add_xyz_to_mem(faction+ii3,tmp1);
			add_xyz_to_mem(fshift+is3,tmp1);

			add_vector_to_float(Vc+gid[n],vctot);
			add_vector_to_float(Vvdw+gid[n],Vvdwtot);
			ninner += nj1 - nj0;
		}
#ifdef GMX_THREADS
		nouter += nn1 - nn0;
	} while (nn1<nri);
#else
	nouter = nri;
#endif
	*outeriter = nouter;
	*inneriter = ninner;
}
void ATL_USERMM
(const int M, const int N, const int K, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc)

{
   /*--- achitecture specific declarations ---*/

   /*--- program specific declarations ---*/
   int i, j, k;
   vector betavec;
   vector zerovec = {0.0,0.0};
   const float *pA0 = A;
   const float *pB0 = B;
   float *pC0 = C;
   float *pC1 = C+(ldc SHIFT);
   const float *stM = A + (M-M%2)*KB;
   const float *stN = B + NB*KB;
   const int incAm = 2*KB-KB+4;
   const int incBm = -KB+4;
   const int incCm = (2 SHIFT);
   const int incAn = -(M-M%2)*KB;
   const int incBn = 2*KB;
   const int incCn = ((ldc*2-(M-M%2)) SHIFT);
   const int incAm_m = KB-KB+4;
   const int incAn_m = -(M%2)*KB;
   const int incCn_m = (ldc*2-(M%2))SHIFT;
   const float *stM_m = A + M*KB;

   /*--- initial arhitecture specific statements ---*/
   vec_enter();

   /*--- main program statements ---*/
   vec_mov_mr_1(&beta,reg0);
   vec_mov_rm(reg0,betavec);
   if (M>=2)
   {
      do /* N-loop */
      {
         do /* M-loop */
         {
#ifdef BETA0
            vec_zero(reg0);
            vec_zero(reg1);
            vec_zero(reg2);
            vec_zero(reg3);
#elif defined(BETA1)
            vec_mov_mr_1(pC0,reg0);
            vec_mov_mr_1(pC0+(1 SHIFT),reg1);
            vec_mov_mr_1(pC1,reg2);
            vec_mov_mr_1(pC1+(1 SHIFT),reg3);
#else
            vec_mov_mr(betavec,reg7);
            vec_mov_mr_1(pC0,reg0);
            vec_mul_rr(reg7,reg0);
            vec_mov_mr_1(pC0+(1 SHIFT),reg1);
            vec_mul_rr(reg7,reg1);
            vec_mov_mr_1(pC1,reg2);
            vec_mul_rr(reg7,reg2);
            vec_mov_mr_1(pC1+(1 SHIFT),reg3);
            vec_mul_rr(reg7,reg3);
#endif
            vec_mov_mr(pA0,reg4);
            vec_mul_mr(pB0,reg4);
            vec_mov_mr(pA0+KB,reg5);
            vec_mul_mr(pB0,reg5);
            vec_mov_mr(pA0,reg6);
            vec_mov_mr(pA0+KB,reg7);
            align();
            for (k=0; k<KB-4; k+=16)
            {
               vec_add_rr(reg4,reg0);
               vec_mov_mr(pA0+2,reg4);
               vec_mul_mr(pB0+KB,reg6);
               vec_add_rr(reg5,reg1);
               vec_mov_mr(pA0+2+KB,reg5);
               vec_mul_mr(pB0+KB,reg7);
               vec_add_rr(reg6,reg2);
               vec_mov_mr(pA0+2,reg6);
               vec_mul_mr(pB0+2,reg4);
               vec_add_rr(reg7,reg3);
               vec_mov_mr(pA0+2+KB,reg7);
               vec_mul_mr(pB0+2,reg5);
               vec_add_rr(reg4,reg0);
               vec_mov_mr(pA0+4,reg4);
               vec_mul_mr(pB0+2+KB,reg6);
               vec_add_rr(reg5,reg1);
               vec_mov_mr(pA0+4+KB,reg5);
               vec_mul_mr(pB0+2+KB,reg7);
               vec_add_rr(reg6,reg2);
               vec_mov_mr(pA0+4,reg6);
               vec_mul_mr(pB0+4,reg4);
               vec_add_rr(reg7,reg3);
               vec_mov_mr(pA0+4+KB,reg7);
               vec_mul_mr(pB0+4,reg5);
               vec_add_rr(reg4,reg0);
               vec_mov_mr(pA0+6,reg4);
               vec_mul_mr(pB0+4+KB,reg6);
               vec_add_rr(reg5,reg1);
               vec_mov_mr(pA0+6+KB,reg5);
               vec_mul_mr(pB0+4+KB,reg7);
               vec_add_rr(reg6,reg2);
               vec_mov_mr(pA0+6,reg6);
               vec_mul_mr(pB0+6,reg4);
               vec_add_rr(reg7,reg3);
               vec_mov_mr(pA0+6+KB,reg7);
               vec_mul_mr(pB0+6,reg5);
               vec_add_rr(reg4,reg0);
               vec_mov_mr(pA0+8,reg4);
               vec_mul_mr(pB0+6+KB,reg6);
               vec_add_rr(reg5,reg1);
               vec_mov_mr(pA0+8+KB,reg5);
               vec_mul_mr(pB0+6+KB,reg7);
               vec_add_rr(reg6,reg2);
               vec_mov_mr(pA0+8,reg6);
               vec_mul_mr(pB0+8,reg4);
               vec_add_rr(reg7,reg3);
               vec_mov_mr(pA0+8+KB,reg7);
               vec_mul_mr(pB0+8,reg5);
               vec_add_rr(reg4,reg0);
               vec_mov_mr(pA0+10,reg4);
               vec_mul_mr(pB0+8+KB,reg6);
               vec_add_rr(reg5,reg1);
               vec_mov_mr(pA0+10+KB,reg5);
               vec_mul_mr(pB0+8+KB,reg7);
               vec_add_rr(reg6,reg2);
               vec_mov_mr(pA0+10,reg6);
               vec_mul_mr(pB0+10,reg4);
               vec_add_rr(reg7,reg3);
               vec_mov_mr(pA0+10+KB,reg7);
               vec_mul_mr(pB0+10,reg5);
               vec_add_rr(reg4,reg0);
               vec_mov_mr(pA0+12,reg4);
               vec_mul_mr(pB0+10+KB,reg6);
               vec_add_rr(reg5,reg1);
               vec_mov_mr(pA0+12+KB,reg5);
               vec_mul_mr(pB0+10+KB,reg7);
               vec_add_rr(reg6,reg2);
               vec_mov_mr(pA0+12,reg6);
               vec_mul_mr(pB0+12,reg4);
               vec_add_rr(reg7,reg3);
               vec_mov_mr(pA0+12+KB,reg7);
               vec_mul_mr(pB0+12,reg5);
               vec_add_rr(reg4,reg0);
               vec_mov_mr(pA0+14,reg4);
               vec_mul_mr(pB0+12+KB,reg6);
               vec_add_rr(reg5,reg1);
               vec_mov_mr(pA0+14+KB,reg5);
               vec_mul_mr(pB0+12+KB,reg7);
               vec_add_rr(reg6,reg2);
               vec_mov_mr(pA0+14,reg6);
               vec_mul_mr(pB0+14,reg4);
               vec_add_rr(reg7,reg3);
               vec_mov_mr(pA0+14+KB,reg7);
               vec_mul_mr(pB0+14,reg5);
               vec_add_rr(reg4,reg0);
               vec_mov_mr(pA0+16,reg4);
               vec_mul_mr(pB0+14+KB,reg6);
               vec_add_rr(reg5,reg1);
               vec_mov_mr(pA0+16+KB,reg5);
               vec_mul_mr(pB0+14+KB,reg7);
               vec_add_rr(reg6,reg2);
               vec_mov_mr(pA0+16,reg6);
               vec_mul_mr(pB0+16,reg4);
               vec_add_rr(reg7,reg3);
               vec_mov_mr(pA0+16+KB,reg7);
               vec_mul_mr(pB0+16,reg5);

               pA0 += 16;
               pB0 += 16;
            }
            vec_add_rr(reg4,reg0);
            vec_mov_mr(pA0+2,reg4);
            vec_mul_mr(pB0+KB,reg6);
            vec_add_rr(reg5,reg1);
            vec_mov_mr(pA0+2+KB,reg5);
            vec_mul_mr(pB0+KB,reg7);
            vec_add_rr(reg6,reg2);
            vec_mov_mr(pA0+2,reg6);
            vec_mul_mr(pB0+2,reg4);
            vec_add_rr(reg7,reg3);
            vec_mov_mr(pA0+2+KB,reg7);
            vec_mul_mr(pB0+2,reg5);
            vec_add_rr(reg4,reg0);
            vec_add_rr(reg5,reg1);
            vec_mul_mr(pB0+2+KB,reg6);
            vec_add_rr(reg6,reg2);
            vec_mul_mr(pB0+2+KB,reg7);
            vec_add_rr(reg7,reg3);
            vec_sum(reg0);
            vec_sum(reg1);
            vec_sum(reg2);
            vec_sum(reg3);
            vec_mov_rm_1(reg0,pC0);
            vec_mov_rm_1(reg1,pC0+(1 SHIFT));
            vec_mov_rm_1(reg2,pC1);
            vec_mov_rm_1(reg3,pC1+(1 SHIFT));
            pA0 += incAm;
            pB0 += incBm;
            pC0 += incCm;
            pC1 += incCm;
         }
         while(pA0 != stM);

         pA0 += incAn;
         pB0 += incBn;
         pC0 += incCn;
         pC1 += incCn;
      }
      while(pB0 != stN);

   }
   if (M%2>0)
   {
      pC0 = C+((M-M%2)SHIFT);
      pC1 = C+(ldc SHIFT)+((M-M%2)SHIFT);
      pA0 = A+(M-M%2)*KB;
      pB0 = B;
      do /* N-loop */
      {
         do /* M-loop */
         {
#ifdef BETA0
            vec_zero(reg0);
            vec_zero(reg1);
#elif defined(BETA1)
            vec_mov_mr_1(pC0,reg0);
            vec_mov_mr_1(pC1,reg1);
#else
            vec_mov_mr(betavec,reg7);
            vec_mov_mr_1(pC0,reg0);
            vec_mul_rr(reg7,reg0);
            vec_mov_mr_1(pC1,reg1);
            vec_mul_rr(reg7,reg1);
#endif
            vec_mov_mr(pA0,reg6);
            align();
            for (k=0; k<KB-4; k+=16)
            {
               vec_mov_rr(reg6,reg2);
               vec_mul_mr(pB0,reg2);
               vec_mov_rr(reg6,reg3);
               vec_mov_mr(pA0+2,reg6);
               vec_mul_mr(pB0+KB,reg3);
               vec_add_rr(reg2,reg0);
               vec_add_rr(reg3,reg1);
               vec_mov_rr(reg6,reg2);
               vec_mul_mr(pB0+2,reg2);
               vec_mov_rr(reg6,reg3);
               vec_mov_mr(pA0+4,reg6);
               vec_mul_mr(pB0+2+KB,reg3);
               vec_add_rr(reg2,reg0);
               vec_add_rr(reg3,reg1);
               vec_mov_rr(reg6,reg2);
               vec_mul_mr(pB0+4,reg2);
               vec_mov_rr(reg6,reg3);
               vec_mov_mr(pA0+6,reg6);
               vec_mul_mr(pB0+4+KB,reg3);
               vec_add_rr(reg2,reg0);
               vec_add_rr(reg3,reg1);
               vec_mov_rr(reg6,reg2);
               vec_mul_mr(pB0+6,reg2);
               vec_mov_rr(reg6,reg3);
               vec_mov_mr(pA0+8,reg6);
               vec_mul_mr(pB0+6+KB,reg3);
               vec_add_rr(reg2,reg0);
               vec_add_rr(reg3,reg1);
               vec_mov_rr(reg6,reg2);
               vec_mul_mr(pB0+8,reg2);
               vec_mov_rr(reg6,reg3);
               vec_mov_mr(pA0+10,reg6);
               vec_mul_mr(pB0+8+KB,reg3);
               vec_add_rr(reg2,reg0);
               vec_add_rr(reg3,reg1);
               vec_mov_rr(reg6,reg2);
               vec_mul_mr(pB0+10,reg2);
               vec_mov_rr(reg6,reg3);
               vec_mov_mr(pA0+12,reg6);
               vec_mul_mr(pB0+10+KB,reg3);
               vec_add_rr(reg2,reg0);
               vec_add_rr(reg3,reg1);
               vec_mov_rr(reg6,reg2);
               vec_mul_mr(pB0+12,reg2);
               vec_mov_rr(reg6,reg3);
               vec_mov_mr(pA0+14,reg6);
               vec_mul_mr(pB0+12+KB,reg3);
               vec_add_rr(reg2,reg0);
               vec_add_rr(reg3,reg1);
               vec_mov_rr(reg6,reg2);
               vec_mul_mr(pB0+14,reg2);
               vec_mov_rr(reg6,reg3);
               vec_mov_mr(pA0+16,reg6);
               vec_mul_mr(pB0+14+KB,reg3);
               vec_add_rr(reg2,reg0);
               vec_add_rr(reg3,reg1);

               pA0 += 16;
               pB0 += 16;
            }
            vec_mov_rr(reg6,reg2);
            vec_mul_mr(pB0,reg2);
            vec_mov_rr(reg6,reg3);
            vec_mov_mr(pA0+2,reg6);
            vec_mul_mr(pB0+KB,reg3);
            vec_add_rr(reg2,reg0);
            vec_add_rr(reg3,reg1);
            vec_mov_rr(reg6,reg2);
            vec_mul_mr(pB0+2,reg2);
            vec_add_rr(reg2,reg0);
            vec_mov_rr(reg6,reg3);
            vec_mul_mr(pB0+2+KB,reg3);
            vec_add_rr(reg3,reg1);
            vec_sum(reg0);
            vec_sum(reg1);
            vec_mov_rm_1(reg0,pC0);
            vec_mov_rm_1(reg1,pC1);
            pA0 += incAm_m;
            pB0 += incBm;
            pC0 += (1 SHIFT);
            pC1 += (1 SHIFT);
         }
         while(pA0 != stM_m);

         pA0 += incAn_m;
         pB0 += incBn;
         pC0 += incCn_m;
         pC1 += incCn_m;
      }
      while(pB0 != stN);

   }
   vec_exit();
}
Ejemplo n.º 20
0
    vector float rinv,rinvsq,rsq,rinvsix,Vvdw6,Vvdw12;

    int n,k,ii,is3,ii3,ntiA,nj0,nj1;
    int jnra,jnrb,jnrc,jnrd;
    int j3a,j3b,j3c,j3d;
    int nri, ntype, nouter, ninner;
    int tja,tjb,tjc,tjd;
#ifdef GMX_THREADS
    int nn0, nn1;
#endif

    nouter   = 0;
    ninner   = 0;
    nri      = *p_nri;
    ntype    = *p_ntype;
    nul=vec_zero();
    vfacel=load_float_and_splat(p_facel);

#ifdef GMX_THREADS
    nthreads = *p_nthreads;
    do {
        gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx);
        nn0              = *count;
        nn1              = nn0+(nri-nn0)/(2*nthreads)+3;
        *count           = nn1;
        gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx);
        if(nn1>nri) nn1=nri;
        for(n=nn0; (n<nn1); n++) {
#if 0
        } /* maintain correct indentation even with conditional left braces */
#endif
void 
nb_kernel010nf_ppc_altivec(int *             p_nri,
                       int               iinr[],
                       int               jindex[],
                       int               jjnr[],
                       int               shift[],
                       float             shiftvec[],
                       float             fshift[],
                       int               gid[],
                       float             pos[],
                       float             faction[],
                       float             charge[],
                       float *           p_facel,
                       float *           p_krf,
                       float *           p_crf,
                       float             Vc[],
                       int               type[],
                       int *             p_ntype,
                       float             vdwparam[],
                       float             Vvdw[],
                       float *           p_tabscale,
                       float             VFtab[],
                       float             invsqrta[],
                       float             dvda[],
                       float *           p_gbtabscale,
                       float             GBtab[],
                       int *             p_nthreads,
                       int *             count,
                       void *            mtx,
                       int *             outeriter,
                       int *             inneriter,
					   float *           work)
{
	vector float ix,iy,iz,shvec;
	vector float nul;
	vector float dx,dy,dz;
	vector float Vvdwtot,c6,c12;
	vector float rinvsq,rsq,rinvsix;
  
	int n,k,ii,is3,ii3,nj0,nj1;
	int jnra,jnrb,jnrc,jnrd;
	int j3a,j3b,j3c,j3d;
	int nri, ntype, nouter, ninner;
	int ntiA,tja,tjb,tjc,tjd;
#ifdef GMX_THREAD_SHM_FDECOMP
	int nn0, nn1;
#endif
  
    nouter   = 0;
    ninner   = 0;
    nri      = *p_nri;
    ntype    = *p_ntype;
	nul=vec_zero();

#ifdef GMX_THREAD_SHM_FDECOMP
    nthreads = *p_nthreads;
	do {
		tMPI_Thread_mutex_lock((tMPI_Thread_mutex_t *)mtx);
		nn0              = *count;
		nn1              = nn0+(nri-nn0)/(2*nthreads)+3;
		*count           = nn1;
		tMPI_Thread_mutex_unlock((tMPI_Thread_mutex_t *)mtx);
		if(nn1>nri) nn1=nri;
		for(n=nn0; (n<nn1); n++) {
#if 0
		} /* maintain correct indentation even with conditional left braces */
#endif
#else /* without tMPI_Threads */
		for(n=0;n<nri;n++) {
#endif
			is3        = 3*shift[n];
			shvec      = load_xyz(shiftvec+is3);
			ii         = iinr[n];
			ii3        = 3*ii;
			ix         = load_xyz(pos+ii3);
			Vvdwtot     = nul;
			ix         = vec_add(ix,shvec);    
			nj0        = jindex[n];
			nj1        = jindex[n+1];
			splat_xyz_to_vectors(ix,&ix,&iy,&iz);
			ntiA       = 2*ntype*type[ii];
			for(k=nj0; k<(nj1-3); k+=4) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				jnrc            = jjnr[k+2];
				jnrd            = jjnr[k+3];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				j3c             = 3*jnrc;
				j3d             = 3*jnrd;
				transpose_4_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),
								 load_xyz(pos+j3c),
								 load_xyz(pos+j3d),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				rinvsq          = do_recip(rsq);
				rinvsix         = vec_madd(rinvsq,rinvsq,nul);
				rinvsix         = vec_madd(rinvsix,rinvsq,nul);
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				tjc             = ntiA+2*type[jnrc];
				tjd             = ntiA+2*type[jnrd];
				load_4_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,vdwparam+tjd,&c6,&c12);
				Vvdwtot          = vec_nmsub(c6,rinvsix,Vvdwtot);
				Vvdwtot          = vec_madd(c12,
										   vec_madd(rinvsix,rinvsix,nul),
										   Vvdwtot);
			}
			if(k<(nj1-1)) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				transpose_2_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				rinvsq          = do_recip(rsq);
				zero_highest_2_elements_in_vector(&rinvsq);
				rinvsix         = vec_madd(rinvsq,rinvsq,nul);
				rinvsix         = vec_madd(rinvsix,rinvsq,nul);
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				load_2_pair(vdwparam+tja,vdwparam+tjb,&c6,&c12);
				Vvdwtot          = vec_nmsub(c6,rinvsix,Vvdwtot);
				Vvdwtot          = vec_madd(c12,
										   vec_madd(rinvsix,rinvsix,nul),
										   Vvdwtot);
				k              += 2;
			}
			if((nj1-nj0) & 0x1) {
				jnra            = jjnr[k];
				j3a             = 3*jnra;
				transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				rinvsq          = do_recip(rsq);
				zero_highest_3_elements_in_vector(&rinvsq);
				rinvsix         = vec_madd(rinvsq,rinvsq,nul);
				rinvsix         = vec_madd(rinvsix,rinvsq,nul);
				tja             = ntiA+2*type[jnra];
				load_1_pair(vdwparam+tja,&c6,&c12);
				Vvdwtot          = vec_nmsub(c6,rinvsix,Vvdwtot);
				Vvdwtot          = vec_madd(c12,
										   vec_madd(rinvsix,rinvsix,nul),
										   Vvdwtot);
			}
			/* update outer data */
			add_vector_to_float(Vvdw+gid[n],Vvdwtot);
			ninner += nj1 - nj0;
		}
#ifdef GMX_THREAD_SHM_FDECOMP
		nouter += nn1 - nn0;
	} while (nn1<nri);
#else
	nouter = nri;
#endif
	*outeriter = nouter;
	*inneriter = ninner;
}
void 
nb_kernel231_ppc_altivec  (int *             p_nri,
                       int               iinr[],
                       int               jindex[],
                       int               jjnr[],
                       int               shift[],
                       float             shiftvec[],
                       float             fshift[],
                       int               gid[],
                       float             pos[],
                       float             faction[],
                       float             charge[],
                       float *           p_facel,
                       float *           p_krf,
                       float *           p_crf,
                       float             Vc[],
                       int               type[],
                       int *             p_ntype,
                       float             vdwparam[],
                       float             Vvdw[],
                       float *           p_tabscale,
                       float             VFtab[],
                       float             invsqrta[],
                       float             dvda[],
                       float *           p_gbtabscale,
                       float             GBtab[],
                       int *             p_nthreads,
                       int *             count,
                       void *            mtx,
                       int *             outeriter,
                       int *             inneriter,
					   float *           work)
{
	vector float vkrf,vcrf;
	vector float iOx,iOy,iOz,iH1x,iH1y,iH1z,iH2x,iH2y,iH2z;
	vector float dOx,dOy,dOz,dH1x,dH1y,dH1z,dH2x,dH2y,dH2z;
	vector float vfacel,vcoulO,vcoulH1,vcoulH2,nul;
	vector float Vvdwtot,c6,c12,VVd,VVr,FFd,FFr,tsc,r;
	vector float fsO,fsH1,fsH2,krsqO,krsqH1,krsqH2;
	vector float vctot,qqO,qqH,iqO,iqH,jq;
	vector float fiOx,fiOy,fiOz,fiH1x,fiH1y,fiH1z,fiH2x,fiH2y,fiH2z;
	vector float tmp1,tmp2,tmp3,tmp4;
	vector float rinvO,rinvH1,rinvH2,rinvsqH1,rinvsqH2;
	vector float rsqO,rsqH1,rsqH2;
  
	int n,k,ii,is3,ii3,ntiA,nj0,nj1;
	int jnra,jnrb,jnrc,jnrd;
	int j3a,j3b,j3c,j3d;
	int nri, ntype, nouter, ninner;
	int tja,tjb,tjc,tjd;
#ifdef GMX_THREAD_SHM_FDECOMP
	int nn0, nn1;
#endif

    nouter   = 0;
    ninner   = 0;
    nri      = *p_nri;
    ntype    = *p_ntype;
	nul=vec_zero();
	tsc=load_float_and_splat(p_tabscale);
	vfacel=load_float_and_splat(p_facel);
	vkrf=load_float_and_splat(p_krf);
	vcrf=load_float_and_splat(p_crf);
	ii         = iinr[0];
	iqO        = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);
	iqH        = vec_madd(load_float_and_splat(charge+ii+1),vfacel,nul);
	ntiA       = 2*ntype*type[ii];
  
#ifdef GMX_THREAD_SHM_FDECOMP
    nthreads = *p_nthreads;
	do {
		tMPI_Thread_mutex_lock((tMPI_Thread_mutex_t *)mtx);
		nn0              = *count;
		nn1              = nn0+(nri-nn0)/(2*nthreads)+3;
		*count           = nn1;
		tMPI_Thread_mutex_unlock((tMPI_Thread_mutex_t *)mtx);
		if(nn1>nri) nn1=nri;
		for(n=nn0; (n<nn1); n++) {
#if 0
		} /* maintain correct indentation even with conditional left braces */
#endif
#else /* without tMPI_Threads */
		for(n=0;n<nri;n++) {
#endif  
			is3        = 3*shift[n];
			ii         = iinr[n];
			ii3        = 3*ii;
			load_1_3atoms_shift_and_splat(pos+ii3,shiftvec+is3,&iOx,&iOy,&iOz,
										  &iH1x,&iH1y,&iH1z,&iH2x,&iH2y,&iH2z);
			vctot      = nul;
			Vvdwtot     = nul;
			fiOx       = nul;
			fiOy       = nul;
			fiOz       = nul;
			fiH1x      = nul;
			fiH1y      = nul;
			fiH1z      = nul;
			fiH2x      = nul;
			fiH2y      = nul;
			fiH2z      = nul;
			nj0        = jindex[n];
			nj1        = jindex[n+1];
    
			for(k=nj0; k<(nj1-3); k+=4) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				jnrc            = jjnr[k+2];
				jnrd            = jjnr[k+3];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				j3c             = 3*jnrc;
				j3d             = 3*jnrd;
				transpose_4_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),
								 load_xyz(pos+j3c),
								 load_xyz(pos+j3d),&dH2x,&dH2y,&dH2z);
				dOx             = vec_sub(iOx,dH2x);
				dOy             = vec_sub(iOy,dH2y);
				dOz             = vec_sub(iOz,dH2z);
				dH1x            = vec_sub(iH1x,dH2x);
				dH1y            = vec_sub(iH1y,dH2y);
				dH1z            = vec_sub(iH1z,dH2z);
				dH2x            = vec_sub(iH2x,dH2x);
				dH2y            = vec_sub(iH2y,dH2y);
				dH2z            = vec_sub(iH2z,dH2z);
      
				rsqO            = vec_madd(dOx,dOx,nul);
				rsqH1           = vec_madd(dH1x,dH1x,nul);
				rsqH2           = vec_madd(dH2x,dH2x,nul);
				rsqO            = vec_madd(dOy,dOy,rsqO);
				rsqH1           = vec_madd(dH1y,dH1y,rsqH1);
				rsqH2           = vec_madd(dH2y,dH2y,rsqH2);
				rsqO            = vec_madd(dOz,dOz,rsqO);
				rsqH1           = vec_madd(dH1z,dH1z,rsqH1);
				rsqH2           = vec_madd(dH2z,dH2z,rsqH2);
				do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
				rinvsqH1        = vec_madd(rinvH1,rinvH1,nul);
				rinvsqH2        = vec_madd(rinvH2,rinvH2,nul);
				r               = vec_madd(rinvO,rsqO,nul);
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				tjc             = ntiA+2*type[jnrc];
				tjd             = ntiA+2*type[jnrd];
				/* load 4 j charges and multiply by iq */
				jq=load_4_float(charge+jnra,charge+jnrb,
								charge+jnrc,charge+jnrd);
				load_4_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,vdwparam+tjd,&c6,&c12);
				do_4_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&FFd,&VVr,&FFr);
				Vvdwtot         = vec_madd(c6,VVd,Vvdwtot);
				tmp1            = vec_nmsub(c6,FFd,nul);
				Vvdwtot         = vec_madd(c12,VVr,Vvdwtot);
				tmp1            = vec_nmsub(c12,FFr,tmp1);
				tmp1            = vec_madd(tmp1,tsc,nul);

				qqO             = vec_madd(iqO,jq,nul);
				qqH             = vec_madd(iqH,jq,nul);
				krsqO           = vec_madd(vkrf,rsqO,nul);
				krsqH1          = vec_madd(vkrf,rsqH1,nul);
				krsqH2          = vec_madd(vkrf,rsqH2,nul);

				fsO             = vec_nmsub(vec_two(),krsqO,rinvO);
				vcoulO          = vec_add(rinvO,krsqO);
				vcoulH1         = vec_add(rinvH1,krsqH1);
				fsO             = vec_madd(qqO,fsO,nul);          
				vcoulH2         = vec_add(rinvH2,krsqH2);
				vcoulO          = vec_sub(vcoulO,vcrf);
				vcoulH1         = vec_sub(vcoulH1,vcrf);
				vcoulH2         = vec_sub(vcoulH2,vcrf);
				vctot           = vec_madd(qqO,vcoulO,vctot);
				fsH1            = vec_nmsub(vec_two(),krsqH1,rinvH1);
				fsH2            = vec_nmsub(vec_two(),krsqH2,rinvH2);
				vctot           = vec_madd(qqH,vcoulH1,vctot);
				fsO             = vec_madd(fsO,rinvO,tmp1);
				fsO             = vec_madd(fsO,rinvO,nul);
				fsH1            = vec_madd(fsH1,qqH,nul);
				fsH2            = vec_madd(fsH2,qqH,nul);
				vctot           = vec_madd(qqH,vcoulH2,vctot);
				fsH1            = vec_madd(fsH1,rinvsqH1,nul);
				fsH2            = vec_madd(fsH2,rinvsqH2,nul);

				fiOx            = vec_madd(fsO,dOx,fiOx); /* +=fx */
				dOx             = vec_nmsub(fsO,dOx,nul); /* -fx */
				fiOy            = vec_madd(fsO,dOy,fiOy); /* +=fy */
				dOy             = vec_nmsub(fsO,dOy,nul); /* -fy */
				fiOz            = vec_madd(fsO,dOz,fiOz); /* +=fz */
				dOz             = vec_nmsub(fsO,dOz,nul); /* -fz */
				fiH1x           = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
				dOx             = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
				fiH1y           = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
				dOy             = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
				fiH1z           = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
				dOz             = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
				fiH2x           = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
				dOx             = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
				fiH2y           = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
				dOy             = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
				fiH2z           = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
				dOz             = vec_nmsub(fsH2,dH2z,dOz); /* -fz */

				transpose_3_to_4(dOx,dOy,dOz,&tmp1,&tmp2,&tmp3,&tmp4);
				add_xyz_to_mem(faction+j3a,tmp1);
				add_xyz_to_mem(faction+j3b,tmp2);
				add_xyz_to_mem(faction+j3c,tmp3);
				add_xyz_to_mem(faction+j3d,tmp4);
			} 
			if(k<(nj1-2)) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				jnrc            = jjnr[k+2];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				j3c             = 3*jnrc;
				transpose_4_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),
								 load_xyz(pos+j3c),nul,&dH2x,&dH2y,&dH2z);
				dOx             = vec_sub(iOx,dH2x);
				dOy             = vec_sub(iOy,dH2y);
				dOz             = vec_sub(iOz,dH2z);
				dH1x            = vec_sub(iH1x,dH2x);
				dH1y            = vec_sub(iH1y,dH2y);
				dH1z            = vec_sub(iH1z,dH2z);
				dH2x            = vec_sub(iH2x,dH2x);
				dH2y            = vec_sub(iH2y,dH2y);
				dH2z            = vec_sub(iH2z,dH2z);
      
				rsqO            = vec_madd(dOx,dOx,nul);
				rsqH1           = vec_madd(dH1x,dH1x,nul);
				rsqH2           = vec_madd(dH2x,dH2x,nul);
				rsqO            = vec_madd(dOy,dOy,rsqO);
				rsqH1           = vec_madd(dH1y,dH1y,rsqH1);
				rsqH2           = vec_madd(dH2y,dH2y,rsqH2);
				rsqO            = vec_madd(dOz,dOz,rsqO);
				rsqH1           = vec_madd(dH1z,dH1z,rsqH1);
				rsqH2           = vec_madd(dH2z,dH2z,rsqH2);

				zero_highest_element_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
				do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
				zero_highest_element_in_3_vectors(&rinvO,&rinvH1,&rinvH2);

				r               = vec_madd(rinvO,rsqO,nul);
				rinvsqH1        = vec_madd(rinvH1,rinvH1,nul);
				rinvsqH2        = vec_madd(rinvH2,rinvH2,nul);

				jq=load_3_float(charge+jnra,charge+jnrb,charge+jnrc);      
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				tjc             = ntiA+2*type[jnrc];
				load_3_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,&c6,&c12);
				do_3_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&FFd,&VVr,&FFr);
				Vvdwtot         = vec_madd(c6,VVd,Vvdwtot);
				tmp1            = vec_nmsub(c6,FFd,nul);
				Vvdwtot         = vec_madd(c12,VVr,Vvdwtot);
				tmp1            = vec_nmsub(c12,FFr,tmp1);
				tmp1            = vec_madd(tmp1,tsc,nul);

				qqO             = vec_madd(iqO,jq,nul);
				qqH             = vec_madd(iqH,jq,nul);
				krsqO           = vec_madd(vkrf,rsqO,nul);
				krsqH1          = vec_madd(vkrf,rsqH1,nul);
				krsqH2          = vec_madd(vkrf,rsqH2,nul);
				fsO             = vec_nmsub(vec_two(),krsqO,rinvO);
				vcoulO          = vec_add(rinvO,krsqO);
				vcoulH1         = vec_add(rinvH1,krsqH1);
				fsO             = vec_madd(qqO,fsO,nul);          
				vcoulH2         = vec_add(rinvH2,krsqH2);
				vcoulO          = vec_sub(vcoulO,vcrf);
				vcoulH1         = vec_sub(vcoulH1,vcrf);
				vcoulH2         = vec_sub(vcoulH2,vcrf);
				vctot           = vec_madd(qqO,vcoulO,vctot);
				fsH1            = vec_nmsub(vec_two(),krsqH1,rinvH1);
				fsH2            = vec_nmsub(vec_two(),krsqH2,rinvH2);
				vctot           = vec_madd(qqH,vcoulH1,vctot);
				fsO             = vec_madd(fsO,rinvO,tmp1);
				fsO             = vec_madd(fsO,rinvO,nul);
				fsH1            = vec_madd(fsH1,qqH,nul);
				fsH2            = vec_madd(fsH2,qqH,nul);
				vctot           = vec_madd(qqH,vcoulH2,vctot);
				fsH1            = vec_madd(fsH1,rinvsqH1,nul);
				fsH2            = vec_madd(fsH2,rinvsqH2,nul);

				fiOx            = vec_madd(fsO,dOx,fiOx); /* +=fx */
				dOx             = vec_nmsub(fsO,dOx,nul); /* -fx */
				fiOy            = vec_madd(fsO,dOy,fiOy); /* +=fy */
				dOy             = vec_nmsub(fsO,dOy,nul); /* -fy */
				fiOz            = vec_madd(fsO,dOz,fiOz); /* +=fz */
				dOz             = vec_nmsub(fsO,dOz,nul); /* -fz */
				fiH1x           = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
				dOx             = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
				fiH1y           = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
				dOy             = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
				fiH1z           = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
				dOz             = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
				fiH2x           = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
				dOx             = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
				fiH2y           = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
				dOy             = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
				fiH2z           = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
				dOz             = vec_nmsub(fsH2,dH2z,dOz); /* -fz */

				transpose_4_to_3(dOx,dOy,dOz,nul,&tmp1,&tmp2,&tmp3);
				add_xyz_to_mem(faction+j3a,tmp1);
				add_xyz_to_mem(faction+j3b,tmp2);
				add_xyz_to_mem(faction+j3c,tmp3);
			} else if(k<(nj1-1)) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				transpose_2_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),&dH2x,&dH2y,&dH2z);
				dOx             = vec_sub(iOx,dH2x);
				dOy             = vec_sub(iOy,dH2y);
				dOz             = vec_sub(iOz,dH2z);
				dH1x            = vec_sub(iH1x,dH2x);
				dH1y            = vec_sub(iH1y,dH2y);
				dH1z            = vec_sub(iH1z,dH2z);
				dH2x            = vec_sub(iH2x,dH2x);
				dH2y            = vec_sub(iH2y,dH2y);
				dH2z            = vec_sub(iH2z,dH2z);
      
				rsqO            = vec_madd(dOx,dOx,nul);
				rsqH1           = vec_madd(dH1x,dH1x,nul);
				rsqH2           = vec_madd(dH2x,dH2x,nul);
				rsqO            = vec_madd(dOy,dOy,rsqO);
				rsqH1           = vec_madd(dH1y,dH1y,rsqH1);
				rsqH2           = vec_madd(dH2y,dH2y,rsqH2);
				rsqO            = vec_madd(dOz,dOz,rsqO);
				rsqH1           = vec_madd(dH1z,dH1z,rsqH1);
				rsqH2           = vec_madd(dH2z,dH2z,rsqH2);

				zero_highest_2_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
				do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
				zero_highest_2_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);

				r               = vec_madd(rinvO,rsqO,nul);
				rinvsqH1        = vec_madd(rinvH1,rinvH1,nul);
				rinvsqH2        = vec_madd(rinvH2,rinvH2,nul);
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				/* load 2 j charges and multiply by iq */
				jq=load_2_float(charge+jnra,charge+jnrb);
				load_2_pair(vdwparam+tja,vdwparam+tjb,&c6,&c12);
				do_2_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&FFd,&VVr,&FFr);
				Vvdwtot         = vec_madd(c6,VVd,Vvdwtot);
				tmp1            = vec_nmsub(c6,FFd,nul);
				Vvdwtot         = vec_madd(c12,VVr,Vvdwtot);
				tmp1            = vec_nmsub(c12,FFr,tmp1);
				tmp1            = vec_madd(tmp1,tsc,nul);

				qqO             = vec_madd(iqO,jq,nul);
				qqH             = vec_madd(iqH,jq,nul);
				krsqO           = vec_madd(vkrf,rsqO,nul);
				krsqH1          = vec_madd(vkrf,rsqH1,nul);
				krsqH2          = vec_madd(vkrf,rsqH2,nul);
				fsO             = vec_nmsub(vec_two(),krsqO,rinvO);
				vcoulO          = vec_add(rinvO,krsqO);
				vcoulH1         = vec_add(rinvH1,krsqH1);
				fsO             = vec_madd(qqO,fsO,nul);          
				vcoulH2         = vec_add(rinvH2,krsqH2);
				vcoulO          = vec_sub(vcoulO,vcrf);
				vcoulH1         = vec_sub(vcoulH1,vcrf);
				vcoulH2         = vec_sub(vcoulH2,vcrf);
				vctot           = vec_madd(qqO,vcoulO,vctot);
				fsH1            = vec_nmsub(vec_two(),krsqH1,rinvH1);
				fsH2            = vec_nmsub(vec_two(),krsqH2,rinvH2);
				vctot           = vec_madd(qqH,vcoulH1,vctot);
				fsO             = vec_madd(fsO,rinvO,tmp1);
				fsO             = vec_madd(fsO,rinvO,nul);
				fsH1            = vec_madd(fsH1,qqH,nul);
				fsH2            = vec_madd(fsH2,qqH,nul);
				vctot           = vec_madd(qqH,vcoulH2,vctot);
				fsH1            = vec_madd(fsH1,rinvsqH1,nul);
				fsH2            = vec_madd(fsH2,rinvsqH2,nul);

				fiOx            = vec_madd(fsO,dOx,fiOx); /* +=fx */
				dOx             = vec_nmsub(fsO,dOx,nul); /* -fx */
				fiOy            = vec_madd(fsO,dOy,fiOy); /* +=fy */
				dOy             = vec_nmsub(fsO,dOy,nul); /* -fy */
				fiOz            = vec_madd(fsO,dOz,fiOz); /* +=fz */
				dOz             = vec_nmsub(fsO,dOz,nul); /* -fz */
				fiH1x           = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
				dOx             = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
				fiH1y           = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
				dOy             = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
				fiH1z           = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
				dOz             = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
				fiH2x           = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
				dOx             = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
				fiH2y           = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
				dOy             = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
				fiH2z           = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
				dOz             = vec_nmsub(fsH2,dH2z,dOz); /* -fz */

				transpose_3_to_2(dOx,dOy,dOz,&tmp1,&tmp2);
				add_xyz_to_mem(faction+j3a,tmp1);
				add_xyz_to_mem(faction+j3b,tmp2);
			} else if(k<nj1) {
				jnra            = jjnr[k];
				j3a             = 3*jnra;
				transpose_1_to_3(load_xyz(pos+j3a),&dH2x,&dH2y,&dH2z);
				dOx             = vec_sub(iOx,dH2x);
				dOy             = vec_sub(iOy,dH2y);
				dOz             = vec_sub(iOz,dH2z);
				dH1x            = vec_sub(iH1x,dH2x);
				dH1y            = vec_sub(iH1y,dH2y);
				dH1z            = vec_sub(iH1z,dH2z);
				dH2x            = vec_sub(iH2x,dH2x);
				dH2y            = vec_sub(iH2y,dH2y);
				dH2z            = vec_sub(iH2z,dH2z);
      
				rsqO            = vec_madd(dOx,dOx,nul);
				rsqH1           = vec_madd(dH1x,dH1x,nul);
				rsqH2           = vec_madd(dH2x,dH2x,nul);
				rsqO            = vec_madd(dOy,dOy,rsqO);
				rsqH1           = vec_madd(dH1y,dH1y,rsqH1);
				rsqH2           = vec_madd(dH2y,dH2y,rsqH2);
				rsqO            = vec_madd(dOz,dOz,rsqO);
				rsqH1           = vec_madd(dH1z,dH1z,rsqH1);
				rsqH2           = vec_madd(dH2z,dH2z,rsqH2);

				zero_highest_3_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
				do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
				zero_highest_3_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);

				r               = vec_madd(rinvO,rsqO,nul);
				rinvsqH1        = vec_madd(rinvH1,rinvH1,nul);
				rinvsqH2        = vec_madd(rinvH2,rinvH2,nul);
				tja             = ntiA+2*type[jnra];
				/* load 1 j charges and multiply by iq */
				jq=load_1_float(charge+jnra);
				load_1_pair(vdwparam+tja,&c6,&c12);
				do_1_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&FFd,&VVr,&FFr);
				Vvdwtot         = vec_madd(c6,VVd,Vvdwtot);
				tmp1            = vec_nmsub(c6,FFd,nul);
				Vvdwtot         = vec_madd(c12,VVr,Vvdwtot);
				tmp1            = vec_nmsub(c12,FFr,tmp1);
				tmp1            = vec_madd(tmp1,tsc,nul);

				qqO             = vec_madd(iqO,jq,nul);
				qqH             = vec_madd(iqH,jq,nul);
				krsqO           = vec_madd(vkrf,rsqO,nul);
				krsqH1          = vec_madd(vkrf,rsqH1,nul);
				krsqH2          = vec_madd(vkrf,rsqH2,nul);
				fsO             = vec_nmsub(vec_two(),krsqO,rinvO);
				vcoulO          = vec_add(rinvO,krsqO);
				vcoulH1         = vec_add(rinvH1,krsqH1);
				fsO             = vec_madd(qqO,fsO,nul);          
				vcoulH2         = vec_add(rinvH2,krsqH2);
				vcoulO          = vec_sub(vcoulO,vcrf);
				vcoulH1         = vec_sub(vcoulH1,vcrf);
				vcoulH2         = vec_sub(vcoulH2,vcrf);
				vctot           = vec_madd(qqO,vcoulO,vctot);
				fsH1            = vec_nmsub(vec_two(),krsqH1,rinvH1);
				fsH2            = vec_nmsub(vec_two(),krsqH2,rinvH2);
				vctot           = vec_madd(qqH,vcoulH1,vctot);
				fsO             = vec_madd(fsO,rinvO,tmp1);
				fsO             = vec_madd(fsO,rinvO,nul);
				fsH1            = vec_madd(fsH1,qqH,nul);
				fsH2            = vec_madd(fsH2,qqH,nul);
				vctot           = vec_madd(qqH,vcoulH2,vctot);
				fsH1            = vec_madd(fsH1,rinvsqH1,nul);
				fsH2            = vec_madd(fsH2,rinvsqH2,nul);

				fiOx            = vec_madd(fsO,dOx,fiOx); /* +=fx */
				dOx             = vec_nmsub(fsO,dOx,nul); /* -fx */
				fiOy            = vec_madd(fsO,dOy,fiOy); /* +=fy */
				dOy             = vec_nmsub(fsO,dOy,nul); /* -fy */
				fiOz            = vec_madd(fsO,dOz,fiOz); /* +=fz */
				dOz             = vec_nmsub(fsO,dOz,nul); /* -fz */
				fiH1x           = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
				dOx             = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
				fiH1y           = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
				dOy             = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
				fiH1z           = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
				dOz             = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
				fiH2x           = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
				dOx             = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
				fiH2y           = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
				dOy             = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
				fiH2z           = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
				dOz             = vec_nmsub(fsH2,dH2z,dOz); /* -fz */

				transpose_3_to_1(dOx,dOy,dOz,&tmp1);
				add_xyz_to_mem(faction+j3a,tmp1);
			}
			/* update outer data */
			update_i_3atoms_forces(faction+ii3,fshift+is3,
								   fiOx,fiOy,fiOz,fiH1x,fiH1y,fiH1z,
								   fiH2x,fiH2y,fiH2z);

			add_vector_to_float(Vc+gid[n],vctot);
			add_vector_to_float(Vvdw+gid[n],Vvdwtot);
			ninner += nj1 - nj0;
		}
#ifdef GMX_THREAD_SHM_FDECOMP
		nouter += nn1 - nn0;
	} while (nn1<nri);
#else
	nouter = nri;
#endif
	*outeriter = nouter;
	*inneriter = ninner;
}
Ejemplo n.º 23
0
void run_dla(params p) {
	time_t start, now;
	FILE *fp;

	fp = fopen(p.output_filename, "w");
	if (fp == NULL) {
		fprintf(stderr,"ERROR: could not open output file '%s'\n",
			p.output_filename);
		exit(EXIT_FAILURE);
	}

	if(KDT_DIM == 2) {
		fprintf(fp,"n x y\n");
	}
	else if(KDT_DIM == 3) {
		fprintf(fp,"n x y z\n");
	}
	else {
		fprintf(stderr, "DLA not defined for KDT_DIM=%d\n", KDT_DIM);
		exit(1);
	}

	setuprandom(p.seed);

	// setup initial world conditions
	point_list *buffer = kdt_new_point_list(p.n);
	tree_node *root = kdt_new_tree();
	point *zero = (point *)malloc(sizeof(point));
	vec_zero(zero);
	kdt_add_point(root, zero);
	double curr_max_radius = 0.0f;	
	double starting_rad = starting_radius(p.min_inner_radius, curr_max_radius, p.inner_mult, p.step_size);
	double death_rad = death_radius(p.min_inner_radius, curr_max_radius, p.inner_mult, p.step_size, p.outer_mult);

	time(&start);
	time(&now);
	int i;
	for(i=0; i < p.n && (now - start) < p.max_secs; i++) {
		if(i % (p.n / 10) == 0 && p.log_progress == 1) {
			log_progress((double)i / (double)p.n * 100.0f, curr_max_radius, starting_rad, kdt_max_depth(root));
		}
		point dir, end, col;
		point *pt = (point *)malloc(sizeof(point));
		vec_rand_unit(pt);
		vec_scalar_mult(pt, pt, starting_rad);
		int walking = 1;
		while(walking) {
			vec_rand_unit(&dir);
			vec_scalar_mult(&dir, &dir, p.step_size);
			vec_add(&end, pt, &dir);
			if(kdt_collision_detect(root, pt, &end, &col, p.epsilon, buffer) && sticks(p.stickiness)) {
				vec_copy(pt, &col);
				kdt_add_point(root, pt);
				double pt_radius = vec_length(pt);
				if(pt_radius > curr_max_radius) {
					curr_max_radius = pt_radius;
					starting_rad = starting_radius(p.min_inner_radius, curr_max_radius, p.inner_mult, p.step_size);
					death_rad = death_radius(p.min_inner_radius, curr_max_radius, p.inner_mult, p.step_size, p.outer_mult);
				}
				walking = 0;
			} 
			else {
				vec_copy(pt, &end);
				if(vec_length(pt) >= death_rad) {
					// if we have moved outside the max radius, start over
					vec_rand_unit(pt);
					vec_scalar_mult(pt, pt, starting_rad);
				}
			}
		}
		print_point(fp, i, pt);
		time(&now);
	}
	if (p.log_progress == 1) {
		log_progress((double)i / (double)p.n * 100.0f, curr_max_radius, 
			starting_rad, kdt_max_depth(root));
	}
	fclose(fp);
}
Ejemplo n.º 24
0
/* Subtree version of score test */
void ff_score_tests_sub(TreeModel *mod, MSA *msa, GFF_Set *gff, mode_type mode,
                        double *feat_pvals, double *feat_null_scales,
                        double *feat_derivs, double *feat_sub_derivs,
                        double *feat_teststats, FILE *logf) {
    int i;
    FeatFitData *d, *d2;
    Vector *grad = vec_new(2);
    Matrix *fim = mat_new(2, 2);
    double lnl, teststat;
    FimGrid *grid;
    List *inside=NULL, *outside=NULL;
    TreeModel *modcpy = tm_create_copy(mod); /* need separate copy of tree model
                                              with different internal scaling
                                              data for supertree/subtree case */

    /* init FeatFitData -- one for null model, one for alt */
    d = ff_init_fit_data(modcpy, msa, ALL, NNEUT, FALSE);
    d2 = ff_init_fit_data(mod, msa, SUBTREE, NNEUT, FALSE);
    /* mod has the subtree info, modcpy
       does not */

    /* precompute Fisher information matrices for a grid of scale values */
    grid = col_fim_grid_sub(mod);

    /* prepare lists of leaves inside and outside root, for use in
       checking for informative substitutions */
    if (mod->subtree_root != NULL) {
        inside = lst_new_ptr(mod->tree->nnodes);
        outside = lst_new_ptr(mod->tree->nnodes);
        tr_partition_leaves(mod->tree, mod->subtree_root, inside, outside);
    }

    /* iterate through features  */
    for (i = 0; i < lst_size(gff->features); i++) {
        checkInterrupt();
        d->feat = lst_get_ptr(gff->features, i);

        /* first check for informative substitution data in feature; if none,
           don't waste time computing likelihoods */
        if (!ff_has_data_sub(mod, msa, d->feat, inside, outside)) {
            teststat = 0;
            vec_zero(grad);
        }

        else {
            vec_set(d->cdata->params, 0, d->cdata->init_scale);
            opt_newton_1d(ff_likelihood_wrapper_1d, &d->cdata->params->data[0], d,
                          &lnl, SIGFIGS, d->cdata->lb->data[0], d->cdata->ub->data[0],
                          logf, NULL, NULL);
            /* turns out to be faster to use numerical rather than exact
               derivatives (judging by col case) */

            d2->feat = d->feat;
            d2->cdata->mod->scale = d->cdata->params->data[0];
            d2->cdata->mod->scale_sub = 1;
            tm_set_subst_matrices(d2->cdata->mod);
            ff_scale_derivs_subtree(d2, grad, NULL, d2->cdata->fels_scratch);

            fim = col_get_fim_sub(grid, d2->cdata->mod->scale);
            mat_scale(fim, d->feat->end - d->feat->start + 1);
            /* scale column-by-column FIM by length of feature (expected
               values are additive) */

            teststat = grad->data[1]*grad->data[1] /
                       (fim->data[1][1] - fim->data[0][1]*fim->data[1][0]/fim->data[0][0]);

            if (teststat < 0) {
                fprintf(stderr, "WARNING: teststat < 0 (%f)\n", teststat);
                teststat = 0;
            }

            if ((mode == ACC && grad->data[1] < 0) ||
                    (mode == CON && grad->data[1] > 0))
                teststat = 0;             /* derivative points toward boundary;
                                     truncate at 0 */

            mat_free(fim);
        }

        if (feat_pvals != NULL) {
            if (mode == NNEUT || mode == CONACC)
                feat_pvals[i] = chisq_cdf(teststat, 1, FALSE);
            else
                feat_pvals[i] = half_chisq_cdf(teststat, 1, FALSE);
            /* assumes 50:50 mix of chisq and point mass at zero */

            if (feat_pvals[i] < 1e-20)
                feat_pvals[i] = 1e-20;
            /* approx limit of eval of tail prob; pvals of 0 cause problems */

            if (mode == CONACC && grad->data[1] > 0)
                feat_pvals[i] *= -1; /* mark as acceleration */
        }

        /* store scales and log likelihood ratios if necessary */
        if (feat_null_scales != NULL) feat_null_scales[i] = d->cdata->params->data[0];
        if (feat_derivs != NULL) feat_derivs[i] = grad->data[0];
        if (feat_sub_derivs != NULL) feat_sub_derivs[i] = grad->data[1];
        if (feat_teststats != NULL) feat_teststats[i] = teststat;
    }

    ff_free_fit_data(d);
    ff_free_fit_data(d2);
    vec_free(grad);
    modcpy->estimate_branchlens = TM_BRANCHLENS_ALL;
    /* have to revert for tm_free to work
       correctly */
    tm_free(modcpy);
    col_free_fim_grid(grid);
    if (inside != NULL) lst_free(inside);
    if (outside != NULL) lst_free(outside);
}
Ejemplo n.º 25
0
void VolumeTexture::calculateTexgenPlanes(float v0[3], float v1[3], float v2[3], float v3[3]) const {

  int i;
  if (!texmap || !v) {
    // do something sensible
    vec_zero(v0);
    vec_zero(v1);
    vec_zero(v2);
    vec_zero(v3);
    v1[0] = v2[1] = v3[2] = 1;
    return;
  }

  // rescale texture coordinates by the portion of the 
  // entire texture volume they reference
  // XXX added an additional scale factor to keep "nearest" texture modes 
  //     rounding into the populated area rather than catching black
  //     texels in the empty part of the texture volume
  float tscale[3];
  tscale[0] = (v->xsize / (float)size[0]) * 0.99999f;
  tscale[1] = (v->ysize / (float)size[1]) * 0.99999f;
  tscale[2] = (v->zsize / (float)size[2]) * 0.99999f;

  // calculate length squared of volume axes
  float lensq[3];
  vec_zero(lensq);
  for (i=0; i<3; i++) {
    lensq[0] += float(v->xaxis[i] * v->xaxis[i]);
    lensq[1] += float(v->yaxis[i] * v->yaxis[i]);
    lensq[2] += float(v->zaxis[i] * v->zaxis[i]);
  }

  // Calculate reciprocal space lattice vectors, which are used
  // in the OpenGL texgen eye space plane equations in order to transform
  // incoming world coordinates to the correct texture coordinates.
  // This code should work for both orthogonal and non-orthogonal volumes.
  // The last step adds in the NPOT texture scaling where necessary.
  // Reference: Introductory Solid State Physics, H.P.Myers, page 43
  float xaxdir[3], yaxdir[3], zaxdir[3];
  float nxaxdir[3], nyaxdir[3], nzaxdir[3];
  float bxc[3], cxa[3], axb[3];
  float tmp;

  // copy axis direction vectors
  for (i=0; i<3; i++) {
    xaxdir[i] = float(v->xaxis[i]);
    yaxdir[i] = float(v->yaxis[i]);
    zaxdir[i] = float(v->zaxis[i]);
  }

  // calculate reciprocal lattice vector for X texture coordiante
  cross_prod(bxc, yaxdir, zaxdir);
  tmp = dot_prod(xaxdir, bxc); 
  for (i=0; i<3; i++) {
    nxaxdir[i] = bxc[i] / tmp;
  }

  // calculate reciprocal lattice vector for Y texture coordiante
  cross_prod(cxa, zaxdir, xaxdir);
  tmp = dot_prod(yaxdir, cxa); 
  for (i=0; i<3; i++) {
    nyaxdir[i] = cxa[i] / tmp;
  }

  // calculate reciprocal lattice vector for Z texture coordiante
  cross_prod(axb, xaxdir, yaxdir);
  tmp = dot_prod(zaxdir, axb); 
  for (i=0; i<3; i++) {
    nzaxdir[i] = axb[i] / tmp;
  }

  // negate and transform the volume origin to reciprocal space
  // for use in the OpenGL texgen plane equation
  float norigin[3];
  for (i=0; i<3; i++)
    norigin[i] = float(v->origin[i]);

  v0[0] = -dot_prod(norigin, nxaxdir) * tscale[0];
  v0[1] = -dot_prod(norigin, nyaxdir) * tscale[1];
  v0[2] = -dot_prod(norigin, nzaxdir) * tscale[2];

  // scale the volume axes for the OpenGL texgen plane equation
  for (i=0; i<3; i++) {
    v1[i] = nxaxdir[i] * tscale[0];
    v2[i] = nyaxdir[i] * tscale[1];
    v3[i] = nzaxdir[i] * tscale[2];
  }
}
Ejemplo n.º 26
0
int BSplineSurf::getParamPoint(float s,float t,float outp[3]) const
{
	if (s<U[0] || s>U[1]) {
		vec_zero(outp);
		return 0;
	}
	if (t<V[0] || t>V[1]) {
		vec_zero(outp);
		return 0;
	}

	float *b1,*b2;
	b1=new float[K1+1];
	b2=new float[K2+1];

	int i,j;

	float *btemp1,*btemp2;

	btemp1=new float[1+K1+M1];
	btemp2=new float[1+K2+M2];

	for (i=0; i<=K1+M1; i++) {
		if (s>=S[i] && s<=S[i+1]) btemp1[i]=1;
		else if (S[i]==S[i+1] && s==S[i]) btemp1[i]=1;
		else btemp1[i]=0;
	}

	for (j=1; j<=M1; j++) {
		for (i=0; i<=K1+M1-j; i++) {
			float c=0;
			if (S[i+j]!=S[i]) {
				c=btemp1[i]*(s-S[i])/(S[i+j]-S[i]);
			}
			if (S[i+j+1]!=S[i+1]) {
				c+= btemp1[i+1]*(S[i+j+1]-s)/(S[i+j+1]-S[i+1]);
			}
			btemp1[i]=c;
		}
	}

	for (i=0; i<=K1; i++) {
		b1[i]=btemp1[i];
	}


	for (i=0; i<=K2+M2; i++) {
		if (t>=T[i] && t<=T[i+1]) btemp2[i]=1;
		else if (T[i]==T[i+1] && t==T[i]) btemp2[i]=1;
		else btemp2[i]=0;
	}


	for (j=1; j<=M2; j++) {
		for (i=0; i<=K2+M2-j; i++) {
			float c=0;
			if (T[i+j]!=T[i]) {
				c=btemp2[i]*(t-T[i])/(T[i+j]-T[i]);
			}
			if (T[i+j+1]!=T[i+1]) {
				c+= btemp2[i+1]*(T[i+j+1]-t)/(T[i+j+1]-T[i+1]);
			}
			btemp2[i]=c;
		}
	}

	for (i=0; i<=K2; i++) {
		b2[i]=btemp2[i];
	}
	

	delete []btemp1;
	delete []btemp2;



	float denom=0;
	outp[0]=0; outp[1]=0; outp[2]=0;
	for (j=0; j<=K2; j++) {
		for (i=0; i<=K1; i++) {
			int ij=i+j*(K1+1);
			float r=W[ij]*b1[i]*b2[j];
			outp[0]+= P[ij][0]*r;
			outp[1]+= P[ij][1]*r;
			outp[2]+= P[ij][2]*r;
			denom+=r;
		}
	}
	outp[0]/=denom;
	outp[1]/=denom;
	outp[2]/=denom;
	delete []b1;
	delete []b2;

	return 1;
}
void 
nb_kernel133nf_ppc_altivec(int *             p_nri,
                       int               iinr[],
                       int               jindex[],
                       int               jjnr[],
                       int               shift[],
                       float             shiftvec[],
                       float             fshift[],
                       int               gid[],
                       float             pos[],
                       float             faction[],
                       float             charge[],
                       float *           p_facel,
                       float *           p_krf,
                       float *           p_crf,
                       float             Vc[],
                       int               type[],
                       int *             p_ntype,
                       float             vdwparam[],
                       float             Vvdw[],
                       float *           p_tabscale,
                       float             VFtab[],
                       float             invsqrta[],
                       float             dvda[],
                       float *           p_gbtabscale,
                       float             GBtab[],
                       int *             p_nthreads,
                       int *             count,
                       void *            mtx,
                       int *             outeriter,
                       int *             inneriter,
					   float *           work)
{
	vector float iOx,iOy,iOz,iH1x,iH1y,iH1z,iH2x,iH2y,iH2z,iMx,iMy,iMz;
	vector float dOx,dOy,dOz,dH1x,dH1y,dH1z,dH2x,dH2y,dH2z,dMx,dMy,dMz;
	vector float Vvdwtot,c6,c12,VVd,VVr,tsc,r;
	vector float vfacel,nul;
	vector float vctot,qqM,qqH,iqM,iqH,jq;
	vector float rinvO,rinvH1,rinvH2,rinvM,rsqO,rsqH1,rsqH2,rsqM;  

	int n,k,ii,is3,ii3,ntiA,nj0,nj1;
	int jnra,jnrb,jnrc,jnrd;
	int j3a,j3b,j3c,j3d;
	int nri, ntype, nouter, ninner;
	int tja,tjb,tjc,tjd;
#ifdef GMX_THREADS
	int nn0, nn1;
#endif
 
    nouter   = 0;
    ninner   = 0;
    nri      = *p_nri;
    ntype    = *p_ntype;
	nul=vec_zero();
	tsc=load_float_and_splat(p_tabscale);
	vfacel=load_float_and_splat(p_facel);
	ii         = iinr[0];
	iqH        = vec_madd(load_float_and_splat(charge+ii+1),vfacel,nul);
	iqM        = vec_madd(load_float_and_splat(charge+ii+3),vfacel,nul);
	ntiA       = 2*ntype*type[ii];
  
#ifdef GMX_THREADS
    nthreads = *p_nthreads;
	do {
		gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx);
		nn0              = *count;
		nn1              = nn0+(nri-nn0)/(2*nthreads)+3;
		*count           = nn1;
		gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx);
		if(nn1>nri) nn1=nri;
		for(n=nn0; (n<nn1); n++) {
#if 0
		} /* maintain correct indentation even with conditional left braces */
#endif
#else /* without gmx_threads */
		for(n=0;n<nri;n++) {
#endif  
			is3        = 3*shift[n];
			ii         = iinr[n];
			ii3        = 3*ii;
			load_1_4atoms_shift_and_splat(pos+ii3,shiftvec+is3,&iOx,&iOy,&iOz,
										  &iH1x,&iH1y,&iH1z,&iH2x,&iH2y,&iH2z,
										  &iMx,&iMy,&iMz);
			vctot      = nul;
			Vvdwtot     = nul;
			nj0        = jindex[n];
			nj1        = jindex[n+1];
    
			for(k=nj0; k<(nj1-3); k+=4) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				jnrc            = jjnr[k+2];
				jnrd            = jjnr[k+3];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				j3c             = 3*jnrc;
				j3d             = 3*jnrd;
				transpose_4_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),
								 load_xyz(pos+j3c),
								 load_xyz(pos+j3d),&dMx,&dMy,&dMz);
				dOx             = vec_sub(iOx,dMx);
				dOy             = vec_sub(iOy,dMy);
				dOz             = vec_sub(iOz,dMz);
				dH1x            = vec_sub(iH1x,dMx);
				dH1y            = vec_sub(iH1y,dMy);
				dH1z            = vec_sub(iH1z,dMz);
				dH2x            = vec_sub(iH2x,dMx);
				dH2y            = vec_sub(iH2y,dMy);
				dH2z            = vec_sub(iH2z,dMz);
				dMx             = vec_sub(iMx,dMx);
				dMy             = vec_sub(iMy,dMy);
				dMz             = vec_sub(iMz,dMz);

				rsqO            = vec_madd(dOx,dOx,nul);
				rsqH1           = vec_madd(dH1x,dH1x,nul);
				rsqH2           = vec_madd(dH2x,dH2x,nul);
				rsqM            = vec_madd(dMx,dMx,nul);
				rsqO            = vec_madd(dOy,dOy,rsqO);
				rsqH1           = vec_madd(dH1y,dH1y,rsqH1);
				rsqH2           = vec_madd(dH2y,dH2y,rsqH2);
				rsqM            = vec_madd(dMy,dMy,rsqM);
				rsqO            = vec_madd(dOz,dOz,rsqO);
				rsqH1           = vec_madd(dH1z,dH1z,rsqH1);
				rsqH2           = vec_madd(dH2z,dH2z,rsqH2);
				rsqM            = vec_madd(dMz,dMz,rsqM);

				rinvO           = do_invsqrt(rsqO);
				do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2);
				r               = vec_madd(rsqO,rinvO,nul);
				
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				tjc             = ntiA+2*type[jnrc];
				tjd             = ntiA+2*type[jnrd];
				/* load 4 j charges and multiply by iq */
				jq=load_4_float(charge+jnra,charge+jnrb,
								charge+jnrc,charge+jnrd);
				load_4_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,vdwparam+tjd,&c6,&c12);
				do_vonly_4_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr);
				Vvdwtot         = vec_madd(c6,VVd,Vvdwtot);
				Vvdwtot         = vec_madd(c12,VVr,Vvdwtot);

				qqH             = vec_madd(iqH,jq,nul);
				qqM             = vec_madd(iqM,jq,nul);
				vctot           = vec_madd(qqM,rinvM,vctot);
				vctot           = vec_madd(qqH,rinvH1,vctot);
				vctot           = vec_madd(qqH,rinvH2,vctot);
			} 
			if(k<(nj1-2)) 
            {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				jnrc            = jjnr[k+2];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				j3c             = 3*jnrc;
				transpose_4_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),
								 load_xyz(pos+j3c),nul,&dMx,&dMy,&dMz);
				dOx             = vec_sub(iOx,dMx);
				dOy             = vec_sub(iOy,dMy);
				dOz             = vec_sub(iOz,dMz);
				dH1x            = vec_sub(iH1x,dMx);
				dH1y            = vec_sub(iH1y,dMy);
				dH1z            = vec_sub(iH1z,dMz);
				dH2x            = vec_sub(iH2x,dMx);
				dH2y            = vec_sub(iH2y,dMy);
				dH2z            = vec_sub(iH2z,dMz);
				dMx             = vec_sub(iMx,dMx);
				dMy             = vec_sub(iMy,dMy);
				dMz             = vec_sub(iMz,dMz);

				rsqO            = vec_madd(dOx,dOx,nul);
				rsqH1           = vec_madd(dH1x,dH1x,nul);
				rsqH2           = vec_madd(dH2x,dH2x,nul);
				rsqM            = vec_madd(dMx,dMx,nul);
				rsqO            = vec_madd(dOy,dOy,rsqO);
				rsqH1           = vec_madd(dH1y,dH1y,rsqH1);
				rsqH2           = vec_madd(dH2y,dH2y,rsqH2);
				rsqM            = vec_madd(dMy,dMy,rsqM);
				rsqO            = vec_madd(dOz,dOz,rsqO);
				rsqH1           = vec_madd(dH1z,dH1z,rsqH1);
				rsqH2           = vec_madd(dH2z,dH2z,rsqH2);
				rsqM            = vec_madd(dMz,dMz,rsqM);
				rinvO           = do_invsqrt(rsqO);
				do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2);
				zero_highest_element_in_vector(&rinvO);
				zero_highest_element_in_vector(&rsqO);				
				zero_highest_element_in_3_vectors(&rinvH1,&rinvH2,&rinvM);

				r               = vec_madd(rsqO,rinvO,nul);
				
				jq=load_3_float(charge+jnra,charge+jnrb,charge+jnrc);
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				tjc             = ntiA+2*type[jnrc];
				/* load 3 j charges and multiply by iq */
				load_3_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,&c6,&c12);
				do_vonly_3_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr);
				Vvdwtot         = vec_madd(c6,VVd,Vvdwtot);
				Vvdwtot         = vec_madd(c12,VVr,Vvdwtot);

				qqH             = vec_madd(iqH,jq,nul);
				qqM             = vec_madd(iqM,jq,nul);
				vctot           = vec_madd(qqM,rinvM,vctot);
				vctot           = vec_madd(qqH,rinvH1,vctot);
				vctot           = vec_madd(qqH,rinvH2,vctot);
			}
            else if(k<(nj1-1))
            {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				transpose_2_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),&dMx,&dMy,&dMz);
				dOx             = vec_sub(iOx,dMx);
				dOy             = vec_sub(iOy,dMy);
				dOz             = vec_sub(iOz,dMz);
				dH1x            = vec_sub(iH1x,dMx);
				dH1y            = vec_sub(iH1y,dMy);
				dH1z            = vec_sub(iH1z,dMz);
				dH2x            = vec_sub(iH2x,dMx);
				dH2y            = vec_sub(iH2y,dMy);
				dH2z            = vec_sub(iH2z,dMz);
				dMx             = vec_sub(iMx,dMx);
				dMy             = vec_sub(iMy,dMy);
				dMz             = vec_sub(iMz,dMz);

				rsqO            = vec_madd(dOx,dOx,nul);
				rsqH1           = vec_madd(dH1x,dH1x,nul);
				rsqH2           = vec_madd(dH2x,dH2x,nul);
				rsqM            = vec_madd(dMx,dMx,nul);
				rsqO            = vec_madd(dOy,dOy,rsqO);
				rsqH1           = vec_madd(dH1y,dH1y,rsqH1);
				rsqH2           = vec_madd(dH2y,dH2y,rsqH2);
				rsqM            = vec_madd(dMy,dMy,rsqM);
				rsqO            = vec_madd(dOz,dOz,rsqO);
				rsqH1           = vec_madd(dH1z,dH1z,rsqH1);
				rsqH2           = vec_madd(dH2z,dH2z,rsqH2);
				rsqM            = vec_madd(dMz,dMz,rsqM);

				rinvO           = do_invsqrt(rsqO);
				do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2);

				zero_highest_2_elements_in_vector(&rinvO);
				zero_highest_2_elements_in_vector(&rsqO);				
				zero_highest_2_elements_in_3_vectors(&rinvM,&rinvH1,&rinvH2);

				r               = vec_madd(rsqO,rinvO,nul);
				
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				/* load 2 j charges and multiply by iq */
				jq=load_2_float(charge+jnra,charge+jnrb);
				load_2_pair(vdwparam+tja,vdwparam+tjb,&c6,&c12);
				do_vonly_2_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr);
				Vvdwtot         = vec_madd(c6,VVd,Vvdwtot);
				Vvdwtot         = vec_madd(c12,VVr,Vvdwtot);

				qqH             = vec_madd(iqH,jq,nul);
				qqM             = vec_madd(iqM,jq,nul);
				vctot           = vec_madd(qqM,rinvM,vctot);
				vctot           = vec_madd(qqH,rinvH1,vctot);
				vctot           = vec_madd(qqH,rinvH2,vctot);
			} 
            else if(k<nj1) 
            {
				jnra            = jjnr[k];
				j3a             = 3*jnra;
				transpose_1_to_3(load_xyz(pos+j3a),&dMx,&dMy,&dMz);
				dOx             = vec_sub(iOx,dMx);
				dOy             = vec_sub(iOy,dMy);
				dOz             = vec_sub(iOz,dMz);
				dH1x            = vec_sub(iH1x,dMx);
				dH1y            = vec_sub(iH1y,dMy);
				dH1z            = vec_sub(iH1z,dMz);
				dH2x            = vec_sub(iH2x,dMx);
				dH2y            = vec_sub(iH2y,dMy);
				dH2z            = vec_sub(iH2z,dMz);
				dMx             = vec_sub(iMx,dMx);
				dMy             = vec_sub(iMy,dMy);
				dMz             = vec_sub(iMz,dMz);

				rsqO            = vec_madd(dOx,dOx,nul);
				rsqH1           = vec_madd(dH1x,dH1x,nul);
				rsqH2           = vec_madd(dH2x,dH2x,nul);
				rsqM            = vec_madd(dMx,dMx,nul);
				rsqO            = vec_madd(dOy,dOy,rsqO);
				rsqH1           = vec_madd(dH1y,dH1y,rsqH1);
				rsqH2           = vec_madd(dH2y,dH2y,rsqH2);
				rsqM            = vec_madd(dMy,dMy,rsqM);
				rsqO            = vec_madd(dOz,dOz,rsqO);
				rsqH1           = vec_madd(dH1z,dH1z,rsqH1);
				rsqH2           = vec_madd(dH2z,dH2z,rsqH2);
				rsqM            = vec_madd(dMz,dMz,rsqM);
				rinvO           = do_invsqrt(rsqO);
				do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2);
				zero_highest_3_elements_in_vector(&rinvO);
				zero_highest_3_elements_in_vector(&rsqO);				
				zero_highest_3_elements_in_3_vectors(&rinvH1,&rinvH2,&rinvM);

				r               = vec_madd(rsqO,rinvO,nul);
				
				jq=load_1_float(charge+jnra);
				tja             = ntiA+2*type[jnra];
				/* load 1 j charge and multiply by iq */
				load_1_pair(vdwparam+tja,&c6,&c12);
				do_vonly_1_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr);
				Vvdwtot         = vec_madd(c6,VVd,Vvdwtot);
				Vvdwtot         = vec_madd(c12,VVr,Vvdwtot);
				qqH             = vec_madd(iqH,jq,nul);
				qqM             = vec_madd(iqM,jq,nul);
				vctot           = vec_madd(qqM,rinvM,vctot);
				vctot           = vec_madd(qqH,rinvH1,vctot);
				vctot           = vec_madd(qqH,rinvH2,vctot);
			}
			/* update outer data */
			add_vector_to_float(Vc+gid[n],vctot);
			add_vector_to_float(Vvdw+gid[n],Vvdwtot);
			ninner += nj1 - nj0;
		}
#ifdef GMX_THREADS
		nouter += nn1 - nn0;
	} while (nn1<nri);
#else
	nouter = nri;
#endif
	*outeriter = nouter;
	*inneriter = ninner;
}
Ejemplo n.º 28
0
void
get_position_xyz(struct vector *p, const struct position *pos)
{
	int i, j, br, bc, r, c;
	struct position bp;

	vec_zero(p);

	r = pos->square/BCOLS;
	c = pos->square%BCOLS;

	for (i = NUM_MAIN_BOARDS - 1; i >= 0; i--) {
		get_main_board_position(&bp, i);

		if (pos->level == bp.level) {
			br = bp.square/BCOLS;
			bc = bp.square%BCOLS;

			if (r >= br && r - br < MAIN_BOARD_SIZE &&
			  c >= bc && c - bc < MAIN_BOARD_SIZE) {
				struct vector board_center;

				get_main_board_xyz(&board_center, i);

				p->x = board_center.x
				  - .5f*MAIN_BOARD_SIZE*SQUARE_SIZE
				  + (r - br + .5f)*SQUARE_SIZE;
				p->y = board_center.y
				  - .5f*MAIN_BOARD_SIZE*SQUARE_SIZE
				  + (c - bc + .5f)*SQUARE_SIZE;
				p->z = board_center.z;

				break;
			}
		}

		for (j = 0; j < 8; j++) {
			get_attack_board_position(&bp, i, j);

			if (pos->level == bp.level) {
				br = bp.square/BCOLS;
				bc = bp.square%BCOLS;

				if (r >= br
				  && r - br < ATTACK_BOARD_SIZE
				  && c >= bc
				  && c - bc < ATTACK_BOARD_SIZE) {
					struct vector board_center;

					get_attack_board_xyz(&board_center, i,
					  j);

					p->x = board_center.x
					  - .5f*ATTACK_BOARD_SIZE*SQUARE_SIZE
					  + (r - br + .5f)*SQUARE_SIZE;

					p->y = board_center.y
					  - .5f*ATTACK_BOARD_SIZE*SQUARE_SIZE
					  + (c - bc + .5f)*SQUARE_SIZE;

					p->z = board_center.z;

					goto done;
				}
			}
		}
	}
  done:
  	;
}
Ejemplo n.º 29
0
int main(int argc, char* argv[]) {
    FILE* F;
    MSA *msa;
    int *msa_gap_patterns = NULL;
    HMM *hmm = NULL;
    TreeNode *tree = NULL;
    int i, input_format = SS, msa_idx, quiet_mode = FALSE,
           ncats, nmsas, ncats_unspooled, indel_nseqs = -1;
    String *msa_fname, *gff_fname;
    List *gff_fname_list = NULL, *msa_fname_list = NULL,
          *msa_length_list = NULL, *model_indels_str = NULL;
    Matrix *traincounts = NULL;
    Vector *begcounts = NULL, *statecounts = NULL;
    CategoryMap *cm = NULL;
    char c;
    GapPatternMap *gpm = NULL;
    GFF_Set *gff;
    char *reverse_groups_tag = NULL;

    while ((c = getopt(argc, argv, "i:g:c:m:M:R:I:n:t:P:G:qh")) != -1) {
        switch(c) {
        case 'i':
            input_format = msa_str_to_format(optarg);
            if (input_format == -1)
                die("ERROR: bad alignment format.\n");
            break;
        case 'g':
            gff_fname_list = get_arg_list(optarg);
            break;
        case 'c':
            cm = cm_new_string_or_file(optarg);
            break;
        case 'm':
            msa_fname_list = get_arg_list(optarg);
            break;
        case 'M':
            msa_length_list = str_list_as_int(get_arg_list(optarg));
            break;
        case 'R':
            reverse_groups_tag = optarg;
            break;
        case 'I':
            model_indels_str = get_arg_list(optarg);
            break;
        case 'n':
            indel_nseqs = get_arg_int(optarg);
            break;
        case 't':
            if (optarg[0] == '(')     /* in this case, assume topology given
                                   at command line */
                tree = tr_new_from_string(optarg);
            else
                tree = tr_new_from_file(phast_fopen(optarg, "r"));
            break;
        case 'q':
            quiet_mode = TRUE;
            break;
        case 'h':
            print_usage();
            exit(0);
        case '?':
            die("ERROR: unrecognized option.\n\nType 'hmm_train -h' for usage.\n");
        }
    }

    if (msa_fname_list == NULL)
        die("ERROR: -m required.  Type 'hmm_train -h' for usage.\n");
    if (gff_fname_list == NULL)
        die("ERROR: -g required in training mode.  Type 'hmm_train -h' for usage.\n");
    if (msa_length_list != NULL && msa_fname_list != NULL)
        die("ERROR: -m and -M are mutually exclusive.  Type 'hmm_train -h' for usage.\n");
    if (model_indels_str != NULL && tree == NULL)
        die("ERROR: -I requires -t.  Type 'hmm_train -h' for usage.\n");
    if (cm == NULL)
        die("ERROR: category map required.\n");

    set_seed(-1);

    ncats = cm->ncats + 1;
    ncats_unspooled = cm->unspooler != NULL ? cm->unspooler->nstates_unspooled :
                      ncats;
    nmsas = (msa_length_list != NULL ? lst_size(msa_length_list) :
             lst_size(msa_fname_list));

    if (model_indels_str != NULL) {
        if (tree == NULL)
            die("ERROR: tree is NULL\n");  /*FIXME: indel_ncats broken */
        gpm = gp_create_gapcats(cm, model_indels_str, tree, FALSE);
        ncats = cm->ncats + 1;    /* numbers will change */
        ncats_unspooled = cm->unspooler == NULL ? ncats :
                          cm->unspooler->nstates_unspooled;
    }

    /* allocate memory for storage of "training paths" */
    traincounts = mat_new(ncats_unspooled, ncats_unspooled);
    statecounts = vec_new(ncats_unspooled);
    begcounts = vec_new(ncats_unspooled);
    mat_zero(traincounts);
    vec_zero(statecounts);
    vec_zero(begcounts);


    /* create skeleton of new HMM. */
    hmm = hmm_new_nstates(ncats_unspooled, 0, 0);

    /* Main loop: consider each MSA in turn */
    for (msa_idx = 0; msa_idx < nmsas; msa_idx++) {
        if (msa_fname_list != NULL) {
            msa_fname = (String*)lst_get_ptr(msa_fname_list, msa_idx);
            F = phast_fopen(msa_fname->chars, "r");
            if (!quiet_mode)
                fprintf(stderr, "Reading alignment from %s ...\n",
                        F == stdin ? "stdin" : msa_fname->chars);
            msa = msa_new_from_file(F, NULL);
            phast_fclose(F);

        }
        else {                      /* only lengths of alignments specified */
            msa = msa_new(NULL, NULL, 0, lst_get_int(msa_length_list, msa_idx), NULL);
            /* just a shell in this case */
        }

        gff_fname = (String*)lst_get_ptr(gff_fname_list, msa_idx);
        if (!quiet_mode)
            fprintf(stderr, "Reading annotations from %s ...\n", gff_fname->chars);
        gff = gff_read_set(phast_fopen(gff_fname->chars, "r"));

        /* convert GFF to coordinate frame of alignment */
        if (msa_length_list == NULL) {
            if (!quiet_mode)
                fprintf(stderr, "Mapping annotations to alignment ...\n");
            msa_map_gff_coords(msa, gff, 1, 0, 0); /* assume seq 1 is ref */
        }

        if (model_indels_str != NULL) {
            if (!quiet_mode)
                fprintf(stderr, "Obtaining gap patterns ...\n");
            msa_gap_patterns = smalloc(msa->length * sizeof(int));
            gp_set_phylo_patterns(gpm, msa_gap_patterns, msa);
        }

        /* at this point, we don't actually need the alignment anymore;
           if using ordered suff stats (likely with large data sets),
           can free them now, to avoid running out of memory */
        if (msa->ss != NULL) {
            ss_free(msa->ss);
            msa->ss = NULL;
        }

        if (reverse_groups_tag != NULL) {
            if (!quiet_mode)
                fprintf(stderr, "Reverse complementing features on negative strand (group by '%s') ...\n",
                        reverse_groups_tag);
            /* we don't need to reverse complement the whole alignment --
               just the gff and possibly the gap pattern array (pass a
               NULL msa) */
            gff_group(gff, reverse_groups_tag);
            msa_reverse_compl_feats(NULL, gff, msa_gap_patterns);
        }

        if (!quiet_mode)
            fprintf(stderr, "Labeling sites by category ...\n");
        msa_label_categories(msa, gff, cm);

        gff_free_set(gff);

        if (model_indels_str != NULL) {
            if (!quiet_mode)
                fprintf(stderr, "Remapping categories according to gap patterns ...\n");

            if (indel_nseqs > 0 && indel_nseqs != msa->nseqs) {
                /* in this case, we'll simply reassign non-trivial gap
                   patterns randomly.  This will achieve the desired
                   effect with minimal coding, as long as the number of
                   sites is not too small (the indel model is probably
                   useless anyway if the number is small) */
                int pat, newpat;
                int npatterns = 4 * indel_nseqs - 5;
                int complex_allowed[cm->ncats+1];
                List *no_complex_names, *no_complex_nums;

                if (!quiet_mode)
                    fprintf(stderr, "(target number of sequences: %d)\n", indel_nseqs);

                /* set up index indicating by cat no. whether complex gaps
                   are allowed */
                for (i = 0; i < ncats; i++) complex_allowed[i] = 1;
                no_complex_names = lst_new_ptr(10);
                str_split(str_new_charstr(NO_COMPLEX), ",", no_complex_names);
                no_complex_nums = cm_get_category_list(cm, no_complex_names, 1);
                for (i = 0; i < lst_size(no_complex_nums); i++)
                    complex_allowed[lst_get_int(no_complex_nums, i)] = 0;
                lst_free(no_complex_nums);
                lst_free_strings(no_complex_names);
                lst_free(no_complex_names);

                /* now reassign all non-null numbers */
                for (i = 0; i < msa->length; ) {
                    if ((pat = msa_gap_patterns[i]) != 0) {
                        if (complex_allowed[msa->categories[i]])
                            newpat = 1 + ((double)npatterns * unif_rand());
                        /* random number in interval [1, npatterns] */
                        else
                            newpat = 1 + ((double)(npatterns-1) * unif_rand());
                        /* random number in interval [1,npatterns-1]
                           (excludes complex gap pattern) */
                        for (; i < msa->length && msa_gap_patterns[i] == pat; i++)
                            msa_gap_patterns[i] = newpat; /* change for whole sequence */
                    }
                    else i++;
                }
            }

            /* obtain gapped category number for each site */
            for (i = 0; i < msa->length; i++)
                if (gpm->cat_x_pattern_to_gapcat[msa->categories[i]] != NULL)
                    msa->categories[i] = gpm->cat_x_pattern_to_gapcat[msa->categories[i]][msa_gap_patterns[i]];
        }

        if (!quiet_mode)
            fprintf(stderr, "Unspooling categories ...\n");
        cm_spooled_to_unspooled(cm, msa->categories, msa->length);

        if (!quiet_mode)
            fprintf(stderr, "Collecting training data ...\n");
        hmm_train_update_counts(traincounts, statecounts, begcounts,
                                msa->categories, msa->length,
                                ncats_unspooled);

        if (msa_gap_patterns != NULL) sfree(msa_gap_patterns);
        msa_free(msa);
    }

    /* now train HMM, using cumulative data */
    hmm_train_from_counts(hmm, traincounts, NULL, statecounts, NULL,
                          begcounts, NULL);

    /* if modeling indels, adjust begin transitions so probability is
       distributed among different "gap pattern" states that all
       correspond to the same ungapped state (category); this helps
       avoid problems that occur when training on a few large sequences
       (e.g., whole chromosomes) and then testing on many shorter ones */
    if (model_indels_str != NULL) {
        double tprob[gpm->ncats];
        int nst[gpm->ncats];  /* total prob and number of states per
                             spooled, ungapped category */
        for (i = 0; i < gpm->ncats; i++) tprob[i] = nst[i] = 0;
        for (i = 0; i < hmm->nstates; i++) {
            if (vec_get(hmm->begin_transitions, i) > 0)
                /* have to go from unspooled space to spooled space, then to
                   ungapped space (HMM states correspond to unspooled,
                   gapped categories).  Note that states with nonzero begin
                   probs shouldn't be conditioned on other states. */
                tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] +=
                    vec_get(hmm->begin_transitions, i);
            nst[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]]++;
        }
        for (i = 0; i < hmm->nstates; i++)
            if (tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] > 0)
                vec_set(hmm->begin_transitions, i,
                        tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] /
                        nst[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]]);
        /* (uniform prior) */
    }

    /* write trained HMM */
    hmm_print(stdout, hmm);

    if (!quiet_mode) fprintf(stderr, "Done.\n");

    return 0;
}
void 
nb_kernel100nf_ppc_altivec(int *             p_nri,
                       int               iinr[],
                       int               jindex[],
                       int               jjnr[],
                       int               shift[],
                       float             shiftvec[],
                       float             fshift[],
                       int               gid[],
                       float             pos[],
                       float             faction[],
                       float             charge[],
                       float *           p_facel,
                       float *           p_krf,
                       float *           p_crf,
                       float             Vc[],
                       int               type[],
                       int *             p_ntype,
                       float             vdwparam[],
                       float             Vvdw[],
                       float *           p_tabscale,
                       float             VFtab[],
                       float             invsqrta[],
                       float             dvda[],
                       float *           p_gbtabscale,
                       float             GBtab[],
                       int *             p_nthreads,
                       int *             count,
                       void *            mtx,
                       int *             outeriter,
                       int *             inneriter,
					   float *           work)
{
	vector float ix,iy,iz,shvec;
	vector float vfacel,nul;
	vector float dx,dy,dz;
	vector float vctot,qq,iq;
	vector float rinv,rsq;

	int n,k,ii,is3,ii3,nj0,nj1;
	int jnra,jnrb,jnrc,jnrd;
	int j3a,j3b,j3c,j3d;
	int nri, ntype, nouter, ninner;
#ifdef GMX_THREADS
	int nn0, nn1;
#endif
  
    nouter   = 0;
    ninner   = 0;
    nri      = *p_nri;
    ntype    = *p_ntype;
	nul=vec_zero();
	vfacel=load_float_and_splat(p_facel);

#ifdef GMX_THREADS
    nthreads = *p_nthreads;
	do {
		gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx);
		nn0              = *count;
		nn1              = nn0+(nri-nn0)/(2*nthreads)+3;
		*count           = nn1;
		gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx);
		if(nn1>nri) nn1=nri;
		for(n=nn0; (n<nn1); n++) {
#if 0
		} /* maintain correct indentation even with conditional left braces */
#endif
#else /* without gmx_threads */
		for(n=0;n<nri;n++) {
#endif  
			is3        = 3*shift[n];
			shvec      = load_xyz(shiftvec+is3);
			ii         = iinr[n];
			ii3        = 3*ii;
			ix         = load_xyz(pos+ii3);
			vctot      = nul;
			ix         = vec_add(ix,shvec);    
			nj0        = jindex[n];
			nj1        = jindex[n+1];
			splat_xyz_to_vectors(ix,&ix,&iy,&iz);
			iq         = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);

			for(k=nj0; k<(nj1-3); k+=4) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				jnrc            = jjnr[k+2];
				jnrd            = jjnr[k+3];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				j3c             = 3*jnrc;
				j3d             = 3*jnrd;
				transpose_4_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),
								 load_xyz(pos+j3c),
								 load_xyz(pos+j3d),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				rinv            = do_invsqrt(rsq);
				/* load 4 j charges and multiply by iq */
				qq = vec_madd(load_4_float(charge+jnra,charge+jnrb,
										   charge+jnrc,charge+jnrd),iq,nul);
				vctot           = vec_madd(qq,rinv,vctot);
			}
			if(k<(nj1-1)) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				transpose_2_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				rinv            = do_invsqrt(rsq);
				zero_highest_2_elements_in_vector(&rinv);
				/* load 2 j charges and multiply by iq */
				qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul);
				vctot           = vec_madd(qq,rinv,vctot);
				k              += 2;
			}
			if((nj1-nj0) & 0x1) {
				jnra            = jjnr[k];
				j3a             = 3*jnra;
				transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				rinv            = do_invsqrt(rsq);
				zero_highest_3_elements_in_vector(&rinv);
				/* load 1 j charge and multiply by iq */
				qq = vec_madd(load_1_float(charge+jnra),iq,nul);
				vctot           = vec_madd(qq,rinv,vctot);
			}
			/* update outer data */
			add_vector_to_float(Vc+gid[n],vctot);
			ninner += nj1 - nj0;
		}
#ifdef GMX_THREADS
		nouter += nn1 - nn0;
	} while (nn1<nri);
#else
	nouter = nri;
#endif
	*outeriter = nouter;
	*inneriter = ninner;
}