void 
nb_kernel310_ppc_altivec  (int *             p_nri,
                       int               iinr[],
                       int               jindex[],
                       int               jjnr[],
                       int               shift[],
                       float             shiftvec[],
                       float             fshift[],
                       int               gid[],
                       float             pos[],
                       float             faction[],
                       float             charge[],
                       float *           p_facel,
                       float *           p_krf,
                       float *           p_crf,
                       float             Vc[],
                       int               type[],
                       int *             p_ntype,
                       float             vdwparam[],
                       float             Vvdw[],
                       float *           p_tabscale,
                       float             VFtab[],
                       float             invsqrta[],
                       float             dvda[],
                       float *           p_gbtabscale,
                       float             GBtab[],
                       int *             p_nthreads,
                       int *             count,
                       void *            mtx,
                       int *             outeriter,
                       int *             inneriter,
					   float *           work)
{
	vector float ix,iy,iz,shvec;
	vector float vfacel,tsc,fs,fs2,nul;
	vector float dx,dy,dz;
	vector float Vvdwtot,vctot,qq,iq,c6,c12,VVc,FFc;
	vector float fix,fiy,fiz;
	vector float tmp1,tmp2,tmp3,tmp4;
	vector float rinv,r,rinvsq,rsq,rinvsix,Vvdw6,Vvdw12;

	int n,k,ii,is3,ii3,ntiA,nj0,nj1;
	int jnra,jnrb,jnrc,jnrd;
	int j3a,j3b,j3c,j3d;
	int nri, ntype, nouter, ninner;
	int tja,tjb,tjc,tjd;
#ifdef GMX_THREADS
	int nn0, nn1;
#endif

    nouter   = 0;
    ninner   = 0;
    nri      = *p_nri;
    ntype    = *p_ntype;
	nul=vec_zero();
	vfacel=load_float_and_splat(p_facel);
	tsc=load_float_and_splat(p_tabscale);

#ifdef GMX_THREADS
    nthreads = *p_nthreads;
	do {
		gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx);
		nn0              = *count;
		nn1              = nn0+(nri-nn0)/(2*nthreads)+3;
		*count           = nn1;
		gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx);
		if(nn1>nri) nn1=nri;
		for(n=nn0; (n<nn1); n++) {
#if 0
		} /* maintain correct indentation even with conditional left braces */
#endif
#else /* without gmx_threads */
		for(n=0;n<nri;n++) {
#endif  
			is3        = 3*shift[n];
			shvec      = load_xyz(shiftvec+is3);
			ii         = iinr[n];
			ii3        = 3*ii;
			ix         = load_xyz(pos+ii3);
			Vvdwtot     = nul;
			vctot      = nul;
			fix        = nul;
			fiy        = nul;
			fiz        = nul;
			ix         = vec_add(ix,shvec);
			nj0        = jindex[n];
			nj1        = jindex[n+1];
			splat_xyz_to_vectors(ix,&ix,&iy,&iz);
			ntiA       = 2*ntype*type[ii];
			iq        = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);

			for(k=nj0; k<(nj1-3); k+=4) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				jnrc            = jjnr[k+2];
				jnrd            = jjnr[k+3];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				j3c             = 3*jnrc;
				j3d             = 3*jnrd;
				transpose_4_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),
								 load_xyz(pos+j3c),
								 load_xyz(pos+j3d),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				rinv            = do_invsqrt(rsq);
				rinvsq          = vec_madd(rinv,rinv,nul);
				r               = vec_madd(rinv,rsq,nul);
				rinvsix         = vec_madd(rinvsq,rinvsq,nul);
				rinvsix         = vec_madd(rinvsix,rinvsq,nul);
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				tjc             = ntiA+2*type[jnrc];
				tjd             = ntiA+2*type[jnrd];
				qq = vec_madd(load_4_float(charge+jnra,charge+jnrb,
										   charge+jnrc,charge+jnrd),iq,nul);
				load_4_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,vdwparam+tjd,&c6,&c12);
				do_4_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc);
				fs2             = vec_madd(qq,FFc,nul);   /* fijC */
				vctot           = vec_madd(qq,VVc,vctot);
				Vvdw6            = vec_madd(c6,rinvsix,nul);
				Vvdw12           = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),
										   nul);
				fs              = vec_madd(vec_twelve(),Vvdw12,nul);
				fs              = vec_nmsub(vec_six(),Vvdw6,fs);
				fs              = vec_madd(fs,rinv,nul);
				Vvdwtot          = vec_add(Vvdwtot,Vvdw12);
				fs              = vec_nmsub(fs2,tsc,fs);
				fs              = vec_madd(fs,rinv,nul);
				Vvdwtot          = vec_sub(Vvdwtot,Vvdw6);
				fix             = vec_madd(fs,dx,fix); /* +=fx */
				fiy             = vec_madd(fs,dy,fiy); /* +=fy */
				fiz             = vec_madd(fs,dz,fiz); /* +=fz */
				dx              = vec_nmsub(dx,fs,nul); /* -fx */
				dy              = vec_nmsub(dy,fs,nul); /* -fy */
				dz              = vec_nmsub(dz,fs,nul); /* -fz */
				transpose_3_to_4(dx,dy,dz,&tmp1,&tmp2,&tmp3,&tmp4);
				add_xyz_to_mem(faction+j3a,tmp1);
				add_xyz_to_mem(faction+j3b,tmp2);
				add_xyz_to_mem(faction+j3c,tmp3);
				add_xyz_to_mem(faction+j3d,tmp4);
			}
			if(k<(nj1-1)) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				transpose_2_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				zero_highest_2_elements_in_vector(&rsq);
				rinv            = do_invsqrt(rsq);
				zero_highest_2_elements_in_vector(&rinv);
				rinvsq          = vec_madd(rinv,rinv,nul);
				r               = vec_madd(rinv,rsq,nul);
				rinvsix         = vec_madd(rinvsq,rinvsq,nul);
				rinvsix         = vec_madd(rinvsix,rinvsq,nul);
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul);
				load_2_pair(vdwparam+tja,vdwparam+tjb,&c6,&c12);
				do_2_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc);
				fs2             = vec_madd(qq,FFc,nul);   /* fijC */
				vctot           = vec_madd(qq,VVc,vctot);
				Vvdw6            = vec_madd(c6,rinvsix,nul);
				Vvdw12           = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),
										   nul);
				fs              = vec_madd(vec_twelve(),Vvdw12,nul);
				fs              = vec_nmsub(vec_six(),Vvdw6,fs);
				Vvdwtot          = vec_add(Vvdwtot,Vvdw12);
				fs              = vec_madd(fs,rinv,nul);
				fs              = vec_nmsub(fs2,tsc,fs);
				fs              = vec_madd(fs,rinv,nul);
				Vvdwtot          = vec_sub(Vvdwtot,Vvdw6);
				fix             = vec_madd(fs,dx,fix); /* +=fx */
				fiy             = vec_madd(fs,dy,fiy); /* +=fy */
				fiz             = vec_madd(fs,dz,fiz); /* +=fz */
				dx              = vec_nmsub(dx,fs,nul); /* -fx */
				dy              = vec_nmsub(dy,fs,nul); /* -fy */
				dz              = vec_nmsub(dz,fs,nul); /* -fz */
				transpose_3_to_2(dx,dy,dz,&tmp1,&tmp2);
				add_xyz_to_mem(faction+j3a,tmp1);
				add_xyz_to_mem(faction+j3b,tmp2);
				k              += 2;
			}
			if((nj1-nj0) & 0x1) {
				jnra            = jjnr[k];
				j3a             = 3*jnra;
				transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				zero_highest_3_elements_in_vector(&rsq);
				rinv            = do_invsqrt(rsq);
				zero_highest_3_elements_in_vector(&rinv);
				rinvsq          = vec_madd(rinv,rinv,nul);
				r               = vec_madd(rinv,rsq,nul);
				rinvsix         = vec_madd(rinvsq,rinvsq,nul);
				rinvsix         = vec_madd(rinvsix,rinvsq,nul);
				tja             = ntiA+2*type[jnra];
				qq = vec_madd(load_1_float(charge+jnra),iq,nul);
				load_1_pair(vdwparam+tja,&c6,&c12);
				do_1_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc);
				fs2             = vec_madd(qq,FFc,nul);   /* fijC */
				vctot           = vec_madd(qq,VVc,vctot);
				Vvdw6            = vec_madd(c6,rinvsix,nul);
				Vvdw12           = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),
										   nul);
				fs              = vec_madd(vec_twelve(),Vvdw12,nul);
				fs              = vec_nmsub(vec_six(),Vvdw6,fs);
				fs              = vec_madd(fs,rinv,nul);
				Vvdwtot          = vec_add(Vvdwtot,Vvdw12);
				fs              = vec_nmsub(fs2,tsc,fs);
				fs              = vec_madd(fs,rinv,nul);
				Vvdwtot          = vec_sub(Vvdwtot,Vvdw6);
				fix             = vec_madd(fs,dx,fix); /* +=fx */
				fiy             = vec_madd(fs,dy,fiy); /* +=fy */
				fiz             = vec_madd(fs,dz,fiz); /* +=fz */
				dx              = vec_nmsub(dx,fs,nul); /* -fx */
				dy              = vec_nmsub(dy,fs,nul); /* -fy */
				dz              = vec_nmsub(dz,fs,nul); /* -fz */
				transpose_3_to_1(dx,dy,dz,&tmp1);
				add_xyz_to_mem(faction+j3a,tmp1);
			}
			/* update outer data */
			transpose_3_to_4(fix,fiy,fiz,&tmp1,&tmp2,&tmp3,&tmp4);
			tmp1 = vec_add(tmp1,tmp3);
			tmp2 = vec_add(tmp2,tmp4);
			tmp1 = vec_add(tmp1,tmp2);

			add_xyz_to_mem(faction+ii3,tmp1);
			add_xyz_to_mem(fshift+is3,tmp1);

			add_vector_to_float(Vc+gid[n],vctot);
			add_vector_to_float(Vvdw+gid[n],Vvdwtot);
			ninner += nj1 - nj0;
		}
#ifdef GMX_THREADS
		nouter += nn1 - nn0;
	} while (nn1<nri);
#else
	nouter = nri;
#endif
	*outeriter = nouter;
	*inneriter = ninner;
}
void 
nb_kernel133nf_ppc_altivec(int *             p_nri,
                       int               iinr[],
                       int               jindex[],
                       int               jjnr[],
                       int               shift[],
                       float             shiftvec[],
                       float             fshift[],
                       int               gid[],
                       float             pos[],
                       float             faction[],
                       float             charge[],
                       float *           p_facel,
                       float *           p_krf,
                       float *           p_crf,
                       float             Vc[],
                       int               type[],
                       int *             p_ntype,
                       float             vdwparam[],
                       float             Vvdw[],
                       float *           p_tabscale,
                       float             VFtab[],
                       float             invsqrta[],
                       float             dvda[],
                       float *           p_gbtabscale,
                       float             GBtab[],
                       int *             p_nthreads,
                       int *             count,
                       void *            mtx,
                       int *             outeriter,
                       int *             inneriter,
					   float *           work)
{
	vector float iOx,iOy,iOz,iH1x,iH1y,iH1z,iH2x,iH2y,iH2z,iMx,iMy,iMz;
	vector float dOx,dOy,dOz,dH1x,dH1y,dH1z,dH2x,dH2y,dH2z,dMx,dMy,dMz;
	vector float Vvdwtot,c6,c12,VVd,VVr,tsc,r;
	vector float vfacel,nul;
	vector float vctot,qqM,qqH,iqM,iqH,jq;
	vector float rinvO,rinvH1,rinvH2,rinvM,rsqO,rsqH1,rsqH2,rsqM;  

	int n,k,ii,is3,ii3,ntiA,nj0,nj1;
	int jnra,jnrb,jnrc,jnrd;
	int j3a,j3b,j3c,j3d;
	int nri, ntype, nouter, ninner;
	int tja,tjb,tjc,tjd;
#ifdef GMX_THREADS
	int nn0, nn1;
#endif
 
    nouter   = 0;
    ninner   = 0;
    nri      = *p_nri;
    ntype    = *p_ntype;
	nul=vec_zero();
	tsc=load_float_and_splat(p_tabscale);
	vfacel=load_float_and_splat(p_facel);
	ii         = iinr[0];
	iqH        = vec_madd(load_float_and_splat(charge+ii+1),vfacel,nul);
	iqM        = vec_madd(load_float_and_splat(charge+ii+3),vfacel,nul);
	ntiA       = 2*ntype*type[ii];
  
#ifdef GMX_THREADS
    nthreads = *p_nthreads;
	do {
		gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx);
		nn0              = *count;
		nn1              = nn0+(nri-nn0)/(2*nthreads)+3;
		*count           = nn1;
		gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx);
		if(nn1>nri) nn1=nri;
		for(n=nn0; (n<nn1); n++) {
#if 0
		} /* maintain correct indentation even with conditional left braces */
#endif
#else /* without gmx_threads */
		for(n=0;n<nri;n++) {
#endif  
			is3        = 3*shift[n];
			ii         = iinr[n];
			ii3        = 3*ii;
			load_1_4atoms_shift_and_splat(pos+ii3,shiftvec+is3,&iOx,&iOy,&iOz,
										  &iH1x,&iH1y,&iH1z,&iH2x,&iH2y,&iH2z,
										  &iMx,&iMy,&iMz);
			vctot      = nul;
			Vvdwtot     = nul;
			nj0        = jindex[n];
			nj1        = jindex[n+1];
    
			for(k=nj0; k<(nj1-3); k+=4) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				jnrc            = jjnr[k+2];
				jnrd            = jjnr[k+3];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				j3c             = 3*jnrc;
				j3d             = 3*jnrd;
				transpose_4_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),
								 load_xyz(pos+j3c),
								 load_xyz(pos+j3d),&dMx,&dMy,&dMz);
				dOx             = vec_sub(iOx,dMx);
				dOy             = vec_sub(iOy,dMy);
				dOz             = vec_sub(iOz,dMz);
				dH1x            = vec_sub(iH1x,dMx);
				dH1y            = vec_sub(iH1y,dMy);
				dH1z            = vec_sub(iH1z,dMz);
				dH2x            = vec_sub(iH2x,dMx);
				dH2y            = vec_sub(iH2y,dMy);
				dH2z            = vec_sub(iH2z,dMz);
				dMx             = vec_sub(iMx,dMx);
				dMy             = vec_sub(iMy,dMy);
				dMz             = vec_sub(iMz,dMz);

				rsqO            = vec_madd(dOx,dOx,nul);
				rsqH1           = vec_madd(dH1x,dH1x,nul);
				rsqH2           = vec_madd(dH2x,dH2x,nul);
				rsqM            = vec_madd(dMx,dMx,nul);
				rsqO            = vec_madd(dOy,dOy,rsqO);
				rsqH1           = vec_madd(dH1y,dH1y,rsqH1);
				rsqH2           = vec_madd(dH2y,dH2y,rsqH2);
				rsqM            = vec_madd(dMy,dMy,rsqM);
				rsqO            = vec_madd(dOz,dOz,rsqO);
				rsqH1           = vec_madd(dH1z,dH1z,rsqH1);
				rsqH2           = vec_madd(dH2z,dH2z,rsqH2);
				rsqM            = vec_madd(dMz,dMz,rsqM);

				rinvO           = do_invsqrt(rsqO);
				do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2);
				r               = vec_madd(rsqO,rinvO,nul);
				
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				tjc             = ntiA+2*type[jnrc];
				tjd             = ntiA+2*type[jnrd];
				/* load 4 j charges and multiply by iq */
				jq=load_4_float(charge+jnra,charge+jnrb,
								charge+jnrc,charge+jnrd);
				load_4_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,vdwparam+tjd,&c6,&c12);
				do_vonly_4_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr);
				Vvdwtot         = vec_madd(c6,VVd,Vvdwtot);
				Vvdwtot         = vec_madd(c12,VVr,Vvdwtot);

				qqH             = vec_madd(iqH,jq,nul);
				qqM             = vec_madd(iqM,jq,nul);
				vctot           = vec_madd(qqM,rinvM,vctot);
				vctot           = vec_madd(qqH,rinvH1,vctot);
				vctot           = vec_madd(qqH,rinvH2,vctot);
			} 
			if(k<(nj1-2)) 
            {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				jnrc            = jjnr[k+2];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				j3c             = 3*jnrc;
				transpose_4_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),
								 load_xyz(pos+j3c),nul,&dMx,&dMy,&dMz);
				dOx             = vec_sub(iOx,dMx);
				dOy             = vec_sub(iOy,dMy);
				dOz             = vec_sub(iOz,dMz);
				dH1x            = vec_sub(iH1x,dMx);
				dH1y            = vec_sub(iH1y,dMy);
				dH1z            = vec_sub(iH1z,dMz);
				dH2x            = vec_sub(iH2x,dMx);
				dH2y            = vec_sub(iH2y,dMy);
				dH2z            = vec_sub(iH2z,dMz);
				dMx             = vec_sub(iMx,dMx);
				dMy             = vec_sub(iMy,dMy);
				dMz             = vec_sub(iMz,dMz);

				rsqO            = vec_madd(dOx,dOx,nul);
				rsqH1           = vec_madd(dH1x,dH1x,nul);
				rsqH2           = vec_madd(dH2x,dH2x,nul);
				rsqM            = vec_madd(dMx,dMx,nul);
				rsqO            = vec_madd(dOy,dOy,rsqO);
				rsqH1           = vec_madd(dH1y,dH1y,rsqH1);
				rsqH2           = vec_madd(dH2y,dH2y,rsqH2);
				rsqM            = vec_madd(dMy,dMy,rsqM);
				rsqO            = vec_madd(dOz,dOz,rsqO);
				rsqH1           = vec_madd(dH1z,dH1z,rsqH1);
				rsqH2           = vec_madd(dH2z,dH2z,rsqH2);
				rsqM            = vec_madd(dMz,dMz,rsqM);
				rinvO           = do_invsqrt(rsqO);
				do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2);
				zero_highest_element_in_vector(&rinvO);
				zero_highest_element_in_vector(&rsqO);				
				zero_highest_element_in_3_vectors(&rinvH1,&rinvH2,&rinvM);

				r               = vec_madd(rsqO,rinvO,nul);
				
				jq=load_3_float(charge+jnra,charge+jnrb,charge+jnrc);
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				tjc             = ntiA+2*type[jnrc];
				/* load 3 j charges and multiply by iq */
				load_3_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,&c6,&c12);
				do_vonly_3_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr);
				Vvdwtot         = vec_madd(c6,VVd,Vvdwtot);
				Vvdwtot         = vec_madd(c12,VVr,Vvdwtot);

				qqH             = vec_madd(iqH,jq,nul);
				qqM             = vec_madd(iqM,jq,nul);
				vctot           = vec_madd(qqM,rinvM,vctot);
				vctot           = vec_madd(qqH,rinvH1,vctot);
				vctot           = vec_madd(qqH,rinvH2,vctot);
			}
            else if(k<(nj1-1))
            {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				transpose_2_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),&dMx,&dMy,&dMz);
				dOx             = vec_sub(iOx,dMx);
				dOy             = vec_sub(iOy,dMy);
				dOz             = vec_sub(iOz,dMz);
				dH1x            = vec_sub(iH1x,dMx);
				dH1y            = vec_sub(iH1y,dMy);
				dH1z            = vec_sub(iH1z,dMz);
				dH2x            = vec_sub(iH2x,dMx);
				dH2y            = vec_sub(iH2y,dMy);
				dH2z            = vec_sub(iH2z,dMz);
				dMx             = vec_sub(iMx,dMx);
				dMy             = vec_sub(iMy,dMy);
				dMz             = vec_sub(iMz,dMz);

				rsqO            = vec_madd(dOx,dOx,nul);
				rsqH1           = vec_madd(dH1x,dH1x,nul);
				rsqH2           = vec_madd(dH2x,dH2x,nul);
				rsqM            = vec_madd(dMx,dMx,nul);
				rsqO            = vec_madd(dOy,dOy,rsqO);
				rsqH1           = vec_madd(dH1y,dH1y,rsqH1);
				rsqH2           = vec_madd(dH2y,dH2y,rsqH2);
				rsqM            = vec_madd(dMy,dMy,rsqM);
				rsqO            = vec_madd(dOz,dOz,rsqO);
				rsqH1           = vec_madd(dH1z,dH1z,rsqH1);
				rsqH2           = vec_madd(dH2z,dH2z,rsqH2);
				rsqM            = vec_madd(dMz,dMz,rsqM);

				rinvO           = do_invsqrt(rsqO);
				do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2);

				zero_highest_2_elements_in_vector(&rinvO);
				zero_highest_2_elements_in_vector(&rsqO);				
				zero_highest_2_elements_in_3_vectors(&rinvM,&rinvH1,&rinvH2);

				r               = vec_madd(rsqO,rinvO,nul);
				
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				/* load 2 j charges and multiply by iq */
				jq=load_2_float(charge+jnra,charge+jnrb);
				load_2_pair(vdwparam+tja,vdwparam+tjb,&c6,&c12);
				do_vonly_2_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr);
				Vvdwtot         = vec_madd(c6,VVd,Vvdwtot);
				Vvdwtot         = vec_madd(c12,VVr,Vvdwtot);

				qqH             = vec_madd(iqH,jq,nul);
				qqM             = vec_madd(iqM,jq,nul);
				vctot           = vec_madd(qqM,rinvM,vctot);
				vctot           = vec_madd(qqH,rinvH1,vctot);
				vctot           = vec_madd(qqH,rinvH2,vctot);
			} 
            else if(k<nj1) 
            {
				jnra            = jjnr[k];
				j3a             = 3*jnra;
				transpose_1_to_3(load_xyz(pos+j3a),&dMx,&dMy,&dMz);
				dOx             = vec_sub(iOx,dMx);
				dOy             = vec_sub(iOy,dMy);
				dOz             = vec_sub(iOz,dMz);
				dH1x            = vec_sub(iH1x,dMx);
				dH1y            = vec_sub(iH1y,dMy);
				dH1z            = vec_sub(iH1z,dMz);
				dH2x            = vec_sub(iH2x,dMx);
				dH2y            = vec_sub(iH2y,dMy);
				dH2z            = vec_sub(iH2z,dMz);
				dMx             = vec_sub(iMx,dMx);
				dMy             = vec_sub(iMy,dMy);
				dMz             = vec_sub(iMz,dMz);

				rsqO            = vec_madd(dOx,dOx,nul);
				rsqH1           = vec_madd(dH1x,dH1x,nul);
				rsqH2           = vec_madd(dH2x,dH2x,nul);
				rsqM            = vec_madd(dMx,dMx,nul);
				rsqO            = vec_madd(dOy,dOy,rsqO);
				rsqH1           = vec_madd(dH1y,dH1y,rsqH1);
				rsqH2           = vec_madd(dH2y,dH2y,rsqH2);
				rsqM            = vec_madd(dMy,dMy,rsqM);
				rsqO            = vec_madd(dOz,dOz,rsqO);
				rsqH1           = vec_madd(dH1z,dH1z,rsqH1);
				rsqH2           = vec_madd(dH2z,dH2z,rsqH2);
				rsqM            = vec_madd(dMz,dMz,rsqM);
				rinvO           = do_invsqrt(rsqO);
				do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2);
				zero_highest_3_elements_in_vector(&rinvO);
				zero_highest_3_elements_in_vector(&rsqO);				
				zero_highest_3_elements_in_3_vectors(&rinvH1,&rinvH2,&rinvM);

				r               = vec_madd(rsqO,rinvO,nul);
				
				jq=load_1_float(charge+jnra);
				tja             = ntiA+2*type[jnra];
				/* load 1 j charge and multiply by iq */
				load_1_pair(vdwparam+tja,&c6,&c12);
				do_vonly_1_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr);
				Vvdwtot         = vec_madd(c6,VVd,Vvdwtot);
				Vvdwtot         = vec_madd(c12,VVr,Vvdwtot);
				qqH             = vec_madd(iqH,jq,nul);
				qqM             = vec_madd(iqM,jq,nul);
				vctot           = vec_madd(qqM,rinvM,vctot);
				vctot           = vec_madd(qqH,rinvH1,vctot);
				vctot           = vec_madd(qqH,rinvH2,vctot);
			}
			/* update outer data */
			add_vector_to_float(Vc+gid[n],vctot);
			add_vector_to_float(Vvdw+gid[n],Vvdwtot);
			ninner += nj1 - nj0;
		}
#ifdef GMX_THREADS
		nouter += nn1 - nn0;
	} while (nn1<nri);
#else
	nouter = nri;
#endif
	*outeriter = nouter;
	*inneriter = ninner;
}
void 
nb_kernel010nf_ppc_altivec(int *             p_nri,
                       int               iinr[],
                       int               jindex[],
                       int               jjnr[],
                       int               shift[],
                       float             shiftvec[],
                       float             fshift[],
                       int               gid[],
                       float             pos[],
                       float             faction[],
                       float             charge[],
                       float *           p_facel,
                       float *           p_krf,
                       float *           p_crf,
                       float             Vc[],
                       int               type[],
                       int *             p_ntype,
                       float             vdwparam[],
                       float             Vvdw[],
                       float *           p_tabscale,
                       float             VFtab[],
                       float             invsqrta[],
                       float             dvda[],
                       float *           p_gbtabscale,
                       float             GBtab[],
                       int *             p_nthreads,
                       int *             count,
                       void *            mtx,
                       int *             outeriter,
                       int *             inneriter,
					   float *           work)
{
	vector float ix,iy,iz,shvec;
	vector float nul;
	vector float dx,dy,dz;
	vector float Vvdwtot,c6,c12;
	vector float rinvsq,rsq,rinvsix;
  
	int n,k,ii,is3,ii3,nj0,nj1;
	int jnra,jnrb,jnrc,jnrd;
	int j3a,j3b,j3c,j3d;
	int nri, ntype, nouter, ninner;
	int ntiA,tja,tjb,tjc,tjd;
#ifdef GMX_THREAD_SHM_FDECOMP
	int nn0, nn1;
#endif
  
    nouter   = 0;
    ninner   = 0;
    nri      = *p_nri;
    ntype    = *p_ntype;
	nul=vec_zero();

#ifdef GMX_THREAD_SHM_FDECOMP
    nthreads = *p_nthreads;
	do {
		tMPI_Thread_mutex_lock((tMPI_Thread_mutex_t *)mtx);
		nn0              = *count;
		nn1              = nn0+(nri-nn0)/(2*nthreads)+3;
		*count           = nn1;
		tMPI_Thread_mutex_unlock((tMPI_Thread_mutex_t *)mtx);
		if(nn1>nri) nn1=nri;
		for(n=nn0; (n<nn1); n++) {
#if 0
		} /* maintain correct indentation even with conditional left braces */
#endif
#else /* without tMPI_Threads */
		for(n=0;n<nri;n++) {
#endif
			is3        = 3*shift[n];
			shvec      = load_xyz(shiftvec+is3);
			ii         = iinr[n];
			ii3        = 3*ii;
			ix         = load_xyz(pos+ii3);
			Vvdwtot     = nul;
			ix         = vec_add(ix,shvec);    
			nj0        = jindex[n];
			nj1        = jindex[n+1];
			splat_xyz_to_vectors(ix,&ix,&iy,&iz);
			ntiA       = 2*ntype*type[ii];
			for(k=nj0; k<(nj1-3); k+=4) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				jnrc            = jjnr[k+2];
				jnrd            = jjnr[k+3];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				j3c             = 3*jnrc;
				j3d             = 3*jnrd;
				transpose_4_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),
								 load_xyz(pos+j3c),
								 load_xyz(pos+j3d),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				rinvsq          = do_recip(rsq);
				rinvsix         = vec_madd(rinvsq,rinvsq,nul);
				rinvsix         = vec_madd(rinvsix,rinvsq,nul);
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				tjc             = ntiA+2*type[jnrc];
				tjd             = ntiA+2*type[jnrd];
				load_4_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,vdwparam+tjd,&c6,&c12);
				Vvdwtot          = vec_nmsub(c6,rinvsix,Vvdwtot);
				Vvdwtot          = vec_madd(c12,
										   vec_madd(rinvsix,rinvsix,nul),
										   Vvdwtot);
			}
			if(k<(nj1-1)) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				transpose_2_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				rinvsq          = do_recip(rsq);
				zero_highest_2_elements_in_vector(&rinvsq);
				rinvsix         = vec_madd(rinvsq,rinvsq,nul);
				rinvsix         = vec_madd(rinvsix,rinvsq,nul);
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				load_2_pair(vdwparam+tja,vdwparam+tjb,&c6,&c12);
				Vvdwtot          = vec_nmsub(c6,rinvsix,Vvdwtot);
				Vvdwtot          = vec_madd(c12,
										   vec_madd(rinvsix,rinvsix,nul),
										   Vvdwtot);
				k              += 2;
			}
			if((nj1-nj0) & 0x1) {
				jnra            = jjnr[k];
				j3a             = 3*jnra;
				transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				rinvsq          = do_recip(rsq);
				zero_highest_3_elements_in_vector(&rinvsq);
				rinvsix         = vec_madd(rinvsq,rinvsq,nul);
				rinvsix         = vec_madd(rinvsix,rinvsq,nul);
				tja             = ntiA+2*type[jnra];
				load_1_pair(vdwparam+tja,&c6,&c12);
				Vvdwtot          = vec_nmsub(c6,rinvsix,Vvdwtot);
				Vvdwtot          = vec_madd(c12,
										   vec_madd(rinvsix,rinvsix,nul),
										   Vvdwtot);
			}
			/* update outer data */
			add_vector_to_float(Vvdw+gid[n],Vvdwtot);
			ninner += nj1 - nj0;
		}
#ifdef GMX_THREAD_SHM_FDECOMP
		nouter += nn1 - nn0;
	} while (nn1<nri);
#else
	nouter = nri;
#endif
	*outeriter = nouter;
	*inneriter = ninner;
}
void 
nb_kernel100nf_ppc_altivec(int *             p_nri,
                       int               iinr[],
                       int               jindex[],
                       int               jjnr[],
                       int               shift[],
                       float             shiftvec[],
                       float             fshift[],
                       int               gid[],
                       float             pos[],
                       float             faction[],
                       float             charge[],
                       float *           p_facel,
                       float *           p_krf,
                       float *           p_crf,
                       float             Vc[],
                       int               type[],
                       int *             p_ntype,
                       float             vdwparam[],
                       float             Vvdw[],
                       float *           p_tabscale,
                       float             VFtab[],
                       float             invsqrta[],
                       float             dvda[],
                       float *           p_gbtabscale,
                       float             GBtab[],
                       int *             p_nthreads,
                       int *             count,
                       void *            mtx,
                       int *             outeriter,
                       int *             inneriter,
					   float *           work)
{
	vector float ix,iy,iz,shvec;
	vector float vfacel,nul;
	vector float dx,dy,dz;
	vector float vctot,qq,iq;
	vector float rinv,rsq;

	int n,k,ii,is3,ii3,nj0,nj1;
	int jnra,jnrb,jnrc,jnrd;
	int j3a,j3b,j3c,j3d;
	int nri, ntype, nouter, ninner;
#ifdef GMX_THREADS
	int nn0, nn1;
#endif
  
    nouter   = 0;
    ninner   = 0;
    nri      = *p_nri;
    ntype    = *p_ntype;
	nul=vec_zero();
	vfacel=load_float_and_splat(p_facel);

#ifdef GMX_THREADS
    nthreads = *p_nthreads;
	do {
		gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx);
		nn0              = *count;
		nn1              = nn0+(nri-nn0)/(2*nthreads)+3;
		*count           = nn1;
		gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx);
		if(nn1>nri) nn1=nri;
		for(n=nn0; (n<nn1); n++) {
#if 0
		} /* maintain correct indentation even with conditional left braces */
#endif
#else /* without gmx_threads */
		for(n=0;n<nri;n++) {
#endif  
			is3        = 3*shift[n];
			shvec      = load_xyz(shiftvec+is3);
			ii         = iinr[n];
			ii3        = 3*ii;
			ix         = load_xyz(pos+ii3);
			vctot      = nul;
			ix         = vec_add(ix,shvec);    
			nj0        = jindex[n];
			nj1        = jindex[n+1];
			splat_xyz_to_vectors(ix,&ix,&iy,&iz);
			iq         = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);

			for(k=nj0; k<(nj1-3); k+=4) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				jnrc            = jjnr[k+2];
				jnrd            = jjnr[k+3];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				j3c             = 3*jnrc;
				j3d             = 3*jnrd;
				transpose_4_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),
								 load_xyz(pos+j3c),
								 load_xyz(pos+j3d),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				rinv            = do_invsqrt(rsq);
				/* load 4 j charges and multiply by iq */
				qq = vec_madd(load_4_float(charge+jnra,charge+jnrb,
										   charge+jnrc,charge+jnrd),iq,nul);
				vctot           = vec_madd(qq,rinv,vctot);
			}
			if(k<(nj1-1)) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				transpose_2_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				rinv            = do_invsqrt(rsq);
				zero_highest_2_elements_in_vector(&rinv);
				/* load 2 j charges and multiply by iq */
				qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul);
				vctot           = vec_madd(qq,rinv,vctot);
				k              += 2;
			}
			if((nj1-nj0) & 0x1) {
				jnra            = jjnr[k];
				j3a             = 3*jnra;
				transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				rinv            = do_invsqrt(rsq);
				zero_highest_3_elements_in_vector(&rinv);
				/* load 1 j charge and multiply by iq */
				qq = vec_madd(load_1_float(charge+jnra),iq,nul);
				vctot           = vec_madd(qq,rinv,vctot);
			}
			/* update outer data */
			add_vector_to_float(Vc+gid[n],vctot);
			ninner += nj1 - nj0;
		}
#ifdef GMX_THREADS
		nouter += nn1 - nn0;
	} while (nn1<nri);
#else
	nouter = nri;
#endif
	*outeriter = nouter;
	*inneriter = ninner;
}
Пример #5
0
int main(int argc, char **argv) {
  printf("ERKALE - Basis set tools from Hel.\n");
  print_copyright();
  print_license();
#ifdef SVNRELEASE
  printf("At svn revision %s.\n\n",SVNREVISION);
#endif
  print_hostname();

  if(argc<3) {
    printf("Usage: %s input.gbs command\n\n",argv[0]);
    help();
    return 0;
  }

  // Get filename
  std::string filein(argv[1]);
  // Load input
  BasisSetLibrary bas;
  bas.load_gaussian94(filein);

  // Get command
  std::string cmd(argv[2]);
  // and determine what to do.
  if(stricmp(cmd,"cholesky")==0) {
    // Print completeness profile.

    if(argc!=7) {
      printf("\nUsage: %s input.gbs cholesky thr maxam ovlthr output.gbs\n",argv[0]);
      return 1;
    }

    double thr(atof(argv[3]));
    int maxam(atoi(argv[4]));
    double ovlthr(atof(argv[5]));
    std::string outfile(argv[6]);

    if(maxam>=LIBINT_MAX_AM) {
      printf("Setting maxam = %i because limitations in used version of LIBINT.\n",LIBINT_MAX_AM-1);
      maxam=LIBINT_MAX_AM-1;
    }
			       
    init_libint_base();
    BasisSetLibrary ret=bas.cholesky_set(thr,maxam,ovlthr);
    ret.save_gaussian94(outfile);
    
  } else if(stricmp(cmd,"completeness")==0) {
    // Print completeness profile.

    if(argc!=5 && argc!=6) {
      printf("\nUsage: %s input.gbs completeness element output.dat (coulomb)\n",argv[0]);
      return 1;
    }

    std::string el(argv[3]);
    std::string fileout(argv[4]);
    bool coulomb=false;
    if(argc==6)
      coulomb=atoi(argv[5]);

    // Get wanted element from basis
    ElementBasisSet elbas=bas.get_element(el);

    // Compute completeness profile
    compprof_t prof=compute_completeness(elbas,-10.0,10.0,2001,coulomb);

    // Print profile in output file
    FILE *out=fopen(fileout.c_str(),"w");
    for(size_t i=0;i<prof.lga.size();i++) {
      // Value of scanning exponent
      fprintf(out,"%13e",prof.lga[i]);
      // Print completeness of shells
      for(size_t j=0;j<prof.shells.size();j++)
	fprintf(out,"\t%13e",prof.shells[j].Y[i]);
      fprintf(out,"\n");
    }
    fclose(out);

  } else if(stricmp(cmd,"composition")==0) {
    // Determine composition of basis set.

    if(argc!=3 && argc!=4) {
      printf("\nUsage: %s input.gbs composition (El)\n",argv[0]);
      return 1;
    }

    // Elemental basis sets
    std::vector<ElementBasisSet> elbases;

    if(argc==4)
      elbases.push_back(bas.get_element(argv[3]));
    else
      elbases=bas.get_elements();

    printf("\n");
    printf("el at#  [npr|nbf] [primitive|contracted(?)]\n");
    printf("-------------------------------------------\n");

    // Loop over elements
    for(size_t iel=0;iel<elbases.size();iel++) {
      // Get the basis set
      ElementBasisSet elbas=elbases[iel];

      // Decontracted basis
      ElementBasisSet eldec(elbas);
      eldec.decontract();

      // Get the shells
      std::vector<FunctionShell> sh=elbas.get_shells();
      std::vector<FunctionShell> decsh=eldec.get_shells();

      // Count the shells
      arma::imat Nsh(max_am,2);
      Nsh.zeros();
      for(size_t ish=0;ish<decsh.size();ish++)
	Nsh(decsh[ish].get_am(),0)++;
      for(size_t ish=0;ish<sh.size();ish++)
	Nsh(sh[ish].get_am(),1)++;

      // Determine if basis set is contracted and the amount of
      // functions
      bool contr=false;
      size_t nbf=0;
      size_t nprim=0;
      for(int am=0;am<max_am;am++) {
	// Number of primitives
	nprim+=Nsh(am,0)*(2*am+1);
	// Number of contracted functions
	nbf+=Nsh(am,1)*(2*am+1);
      }
      if(nbf!=nprim)
	contr=true;

      // Print composition
      printf("%-2s %3i ",elbas.get_symbol().c_str(),(int) elbas.get_number());
      if(contr) {
	// Print amount of functions
	char cmp[20];
	sprintf(cmp,"[%i|%i]",(int) nprim,(int) nbf);
	printf("%10s [",cmp);

	// Print primitives
	for(int am=0;am<max_am;am++)
	  if(Nsh(am,0)>0)
	    printf("%i%c",Nsh(am,0),tolower(shell_types[am]));
	// Print contractions
	printf("|");
	for(int am=0;am<max_am;am++)
	  if(Nsh(am,0)!=Nsh(am,1))
	    printf("%i%c",Nsh(am,1),tolower(shell_types[am]));
	printf("]\n");
      } else {
	printf("%10i  ",(int) nbf);
	for(int am=0;am<max_am;am++)
	  if(Nsh(am,0)>0)
	    printf("%i%c",Nsh(am,0),tolower(shell_types[am]));
	printf("\n");
      }
    }
  } else if(stricmp(cmd,"daug")==0 || stricmp(cmd,"taug")==0) {
    // Augment basis set

    if(argc!=4) {
      printf("\nUsage: %s input.gbs %s output.gbs\n",argv[0],tolower(cmd).c_str());
      return 1;
    }

    int naug;
    if(stricmp(cmd,"daug")==0)
      naug=1;
    else
      naug=2;

    std::string fileout(argv[3]);
    bas.augment(naug);
    bas.save_gaussian94(fileout);
  } else if(stricmp(cmd,"decontract")==0) {
  // Decontract basis set.

    if(argc!=4) {
      printf("\nUsage: %s input.gbs decontract output.gbs\n",argv[0]);
      return 1;
    }

    std::string fileout(argv[3]);
    bas.decontract();
    bas.save_gaussian94(fileout);

  } else if(stricmp(cmd,"densityfit")==0) {
  // Generate density fitted set

    if(argc!=6) {
      printf("\nUsage: %s input.gbs densityfit lval fsam output.gbs\n",argv[0]);
      return 1;
    }

    int lval(atoi(argv[3]));
    double fsam(atof(argv[4]));
    std::string fileout(argv[5]);
    BasisSetLibrary dfit(bas.density_fitting(lval,fsam));
    dfit.save_gaussian94(fileout);

  } else if(stricmp(cmd,"dump")==0) {
    // Dump wanted element.

    if(argc!=5 && argc!=6) {
      printf("\nUsage: %s input.gbs dump element output.gbs (number)\n",argv[0]);
      return 1;
    }

    std::string el(argv[3]);
    std::string fileout(argv[4]);

    int no=0;
    if(argc==6)
      no=atoi(argv[5]);

    // Save output
    BasisSetLibrary elbas;
    elbas.add_element(bas.get_element(el,no));
    elbas.save_gaussian94(fileout);

  } else if(stricmp(cmd,"dumpdec")==0) {
    // Dump wanted element in decontracted form.

    if(argc!=5 && argc!=6) {
      printf("\nUsage: %s input.gbs dumpdec element output.gbs (number)\n",argv[0]);
      return 1;
    }

    std::string el(argv[3]);
    std::string fileout(argv[4]);

    int no=0;
    if(argc==6)
      no=atoi(argv[5]);

    // Save output
    BasisSetLibrary elbas;
    bas.decontract();
    elbas.add_element(bas.get_element(el,no));
    elbas.save_gaussian94(fileout);

  } else if(stricmp(cmd,"genbas")==0) {
    // Generate basis set for xyz file

    if(argc!=5) {
      printf("\nUsage: %s input.gbs genbas system.xyz output.gbs\n",argv[0]);
      return 1;
    }

    // Load atoms from xyz file
    std::vector<atom_t> atoms=load_xyz(argv[3]);
    // Output file
    std::string fileout(argv[4]);
    // Save output
    BasisSetLibrary elbas;

    // Collect elements
    std::vector<ElementBasisSet> els=bas.get_elements();
    // Loop over atoms in system
    for(size_t iat=0;iat<atoms.size();iat++) {
      bool found=false;

      // First, check if there is a special basis for the atom.
      for(size_t iel=0;iel<els.size();iel++)
	if(stricmp(atoms[iat].el,els[iel].get_symbol())==0 && atoms[iat].num == els[iel].get_number()) {
	  // Yes, add it.
	  elbas.add_element(els[iel]);
	  found=true;
	  break;
	}

      // Otherwise, check if a general basis is already in the basis
      if(!found) {
	std::vector<ElementBasisSet> added=elbas.get_elements();
	for(size_t j=0;j<added.size();j++)
	  if(added[j].get_number()==0 && stricmp(atoms[iat].el,added[j].get_symbol())==0)
	    found=true;
      }

      // If general basis not found, add it.
      if(!found) {
	for(size_t iel=0;iel<els.size();iel++)
	  if(stricmp(atoms[iat].el,els[iel].get_symbol())==0 && els[iel].get_number()==0) {
	    // Yes, add it.
	    elbas.add_element(els[iel]);
	    found=true;
	    break;
	  }
      }

      if(!found) {
	std::ostringstream oss;
	oss << "Basis set for element " << atoms[iat].el << " does not exist in " << filein << "!\n";
	throw std::runtime_error(oss.str());
      }
    }
    elbas.save_gaussian94(fileout);

  } else if(stricmp(cmd,"merge")==0) {
    // Merge functions with too big overlap

    if(argc!=5) {
      printf("\nUsage: %s input.gbs merge cutoff output.gbs\n",argv[0]);
      return 1;
    }

    // Cutoff value
    double cutoff=atof(argv[3]);
    bas.merge(cutoff);
    bas.save_gaussian94(argv[4]);

  } else if(stricmp(cmd,"norm")==0) {
    // Normalize basis

    if(argc!=4) {
      printf("\nUsage: %s input.gbs norm output.gbs\n",argv[0]);
      return 1;
    }

    std::string fileout=argv[3];
    bas.normalize();
    bas.save_gaussian94(fileout);

  } else if(stricmp(cmd,"orth")==0) {
    // Orthogonalize basis

    if(argc!=4) {
      printf("\nUsage: %s input.gbs orth output.gbs\n",argv[0]);
      return 1;
    }

    std::string fileout=argv[3];
    bas.orthonormalize();
    bas.save_gaussian94(fileout);

  } else if(stricmp(cmd,"overlap")==0) {
    // Primitive overlap

    if(argc!=4) {
      printf("\nUsage: %s input.gbs overlap element\n",argv[0]);
      return 1;
    }

    // Get element basis set
    ElementBasisSet elbas=bas.get_element(argv[3]);
    elbas.decontract();

    // Loop over angular momentum
    for(int am=0;am<=elbas.get_max_am();am++) {
      // Get primitives
      arma::vec exps;
      arma::mat contr;
      elbas.get_primitives(exps,contr,am);

      // Compute overlap matrix
      arma::mat S=overlap(exps,exps,am);

      // Print out overlap
      printf("*** %c shell ***\n",shell_types[am]);
      exps.t().print("Exponents");
      printf("\n");

      S.print("Overlap");
      printf("\n");
    }

  } else if(stricmp(cmd,"Porth")==0) {
    // P-orthogonalize basis

    if(argc!=6) {
      printf("\nUsage: %s input.gbs Porth cutoff Cortho output.gbs\n",argv[0]);
      return 1;
    }

    double cutoff=atof(argv[3]);
    double Cortho=atof(argv[4]);
    std::string fileout=argv[5];
    bas.P_orthogonalize(cutoff,Cortho);
    bas.save_gaussian94(fileout);

  } else if(stricmp(cmd,"prodset")==0) {
    // Generate product set
    
    if(argc!=6) {
      printf("\nUsage: %s input.gbs prodset lval fsam output.gbs\n",argv[0]);
      return 1;
    }

    int lval(atoi(argv[3]));
    double fsam(atof(argv[4]));
    std::string fileout(argv[5]);
    BasisSetLibrary dfit(bas.product_set(lval,fsam));
    dfit.save_gaussian94(fileout);
    
  } else if(stricmp(cmd,"save")==0) {
    // Save basis

    if(argc!=4) {
      printf("\nUsage: %s input.gbs save output.gbs\n",argv[0]);
      return 1;
    }

    std::string fileout=argv[3];
    bas.save_gaussian94(fileout);

  } else if(stricmp(cmd,"savecfour")==0) {
    // Save basis in CFOUR format

    if(argc!=5) {
      printf("\nUsage: %s input.gbs savecfour name basis.cfour\n",argv[0]);
      return 1;
    }

    std::string fileout=argv[3];
    std::string name=argv[4];
    bas.save_cfour(name,fileout);

  } else if(stricmp(cmd,"savedalton")==0) {
    // Save basis in Dalton format

    if(argc!=4) {
      printf("\nUsage: %s input.gbs savedalton output.dal\n",argv[0]);
      return 1;
    }

    std::string fileout=argv[3];
    bas.save_dalton(fileout);

  } else if(stricmp(cmd,"savemolpro")==0) {
    // Save basis in Molpro format

    if(argc!=4) {
      printf("\nUsage: %s input.gbs savemolpro output.mol\n",argv[0]);
      return 1;
    }

    std::string fileout=argv[3];
    bas.save_molpro(fileout);

  } else if(stricmp(cmd,"sort")==0) {
    // Sort basis set

    if(argc!=4) {
      printf("\nUsage: %s input.gbs sort output.gbs\n",argv[0]);
      return 1;
    }

    std::string fileout=argv[3];
    bas.sort();
    bas.save_gaussian94(fileout);
  } else {
    printf("\nInvalid command.\n");

    help();
  }

  return 0;
}
Пример #6
0
int main(int argc, char **argv) {

#ifdef _OPENMP
  printf("ERKALE - Geometry optimization from Hel, OpenMP version, running on %i cores.\n",omp_get_max_threads());
#else
  printf("ERKALE - Geometry optimization from Hel, serial version.\n");
#endif
  print_copyright();
  print_license();
#ifdef SVNRELEASE
  printf("At svn revision %s.\n\n",SVNREVISION);
#endif
  print_hostname();

  if(argc!=2) {
    printf("Usage: $ %s runfile\n",argv[0]);
    return 0;
  }

  // Initialize libint
  init_libint_base();
  // Initialize libderiv
  init_libderiv_base();

  Timer tprog;
  tprog.print_time();

  // Parse settings
  Settings set;
  set.add_scf_settings();
  set.add_string("SaveChk","File to use as checkpoint","erkale.chk");
  set.add_string("LoadChk","File to load old results from","");
  set.add_bool("ForcePol","Force polarized calculation",false);
  set.add_bool("FreezeCore","Freeze the atomic cores?",false);
  set.add_string("Optimizer","Optimizer to use: CGFR, CGPR, BFGS, BFGS2 (default), SD","BFGS2");
  set.add_int("MaxSteps","Maximum amount of geometry steps",256);
  set.add_string("Criterion","Convergence criterion to use: LOOSE, NORMAL, TIGHT, VERYTIGHT","NORMAL");
  set.add_string("OptMovie","xyz movie to store progress in","optimize.xyz");
  set.add_string("Result","File to save optimized geometry in","optimized.xyz");
  set.set_string("Logfile","erkale_geom.log");
  set.parse(std::string(argv[1]),true);
  set.print();

  bool verbose=set.get_bool("Verbose");
  int maxiter=set.get_int("MaxSteps");
  std::string optmovie=set.get_string("OptMovie");
  std::string result=set.get_string("Result");

  // Interpret optimizer
  enum minimizer alg;
  std::string method=set.get_string("Optimizer");
  if(stricmp(method,"CGFR")==0)
    alg=gCGFR;
  else if(stricmp(method,"CGPR")==0)
    alg=gCGPR;
  else if(stricmp(method,"BFGS")==0)
    alg=gBFGS;
  else if(stricmp(method,"BFGS2")==0)
    alg=gBFGS2;
  else if(stricmp(method,"SD")==0)
    alg=gSD;
  else {
    ERROR_INFO();
    throw std::runtime_error("Unknown optimization method.\n");
  }

  // Interpret optimizer
  enum convergence crit;
  method=set.get_string("Criterion");
  if(stricmp(method,"LOOSE")==0)
    crit=LOOSE;
  else if(stricmp(method,"NORMAL")==0)
    crit=NORMAL;
  else if(stricmp(method,"TIGHT")==0)
    crit=TIGHT;
  else if(stricmp(method,"VERYTIGHT")==0)
    crit=VERYTIGHT;
  else {
    ERROR_INFO();
    throw std::runtime_error("Unknown optimization method.\n");
  }

  // Redirect output?
  std::string logfile=set.get_string("Logfile");
  if(stricmp(logfile,"stdout")!=0) {
    // Redirect stdout to file
    FILE *outstream=freopen(logfile.c_str(),"w",stdout);
    if(outstream==NULL) {
      ERROR_INFO();
      throw std::runtime_error("Unable to redirect output!\n");
    } else
      fprintf(stderr,"\n");
  }

  // Read in atoms.
  std::string atomfile=set.get_string("System");
  const std::vector<atom_t> origgeom=load_xyz(atomfile);
  std::vector<atom_t> atoms(origgeom);

  // Are any atoms fixed?
  std::vector<size_t> dofidx;
  for(size_t i=0;i<atoms.size();i++) {
    bool fixed=false;

    if(atoms[i].el.size()>3)
      if(stricmp(atoms[i].el.substr(atoms[i].el.size()-3),"-Fx")==0) {
	fixed=true;
	atoms[i].el=atoms[i].el.substr(0,atoms[i].el.size()-3);
      }

    // Add to degrees of freedom
    if(!fixed)
      dofidx.push_back(i);
  }

  // Read in basis set
  BasisSetLibrary baslib;
  std::string basfile=set.get_string("Basis");
  baslib.load_gaussian94(basfile);
  printf("\n");

  // Save to output
  save_xyz(atoms,"Initial configuration",optmovie,false);

  // Minimizer options
  opthelper_t pars;
  pars.atoms=atoms;
  pars.baslib=baslib;
  pars.set=set;
  pars.dofidx=dofidx;

  /* Starting point */
  gsl_vector *x = gsl_vector_alloc (3*dofidx.size());
  for(size_t i=0;i<dofidx.size();i++) {
    gsl_vector_set(x,3*i,atoms[dofidx[i]].x);
    gsl_vector_set(x,3*i+1,atoms[dofidx[i]].y);
    gsl_vector_set(x,3*i+2,atoms[dofidx[i]].z);
  }

  // GSL status
  int status;

  const gsl_multimin_fdfminimizer_type *T;
  gsl_multimin_fdfminimizer *s;

  gsl_multimin_function_fdf minimizer;

  minimizer.n = x->size;
  minimizer.f = calc_E;
  minimizer.df = calc_f;
  minimizer.fdf = calc_Ef;
  minimizer.params = (void *) &pars;

  if(alg==gCGFR) {
    T = gsl_multimin_fdfminimizer_conjugate_fr;
    if(verbose) printf("Using Fletcher-Reeves conjugate gradients.\n");
  } else if(alg==gCGPR) {
    T = gsl_multimin_fdfminimizer_conjugate_pr;
    if(verbose) printf("Using Polak-Ribière conjugate gradients.\n");
  } else if(alg==gBFGS) {
    T = gsl_multimin_fdfminimizer_vector_bfgs;
    if(verbose) printf("Using the BFGS minimizer.\n");
  } else if(alg==gBFGS2) {
    T = gsl_multimin_fdfminimizer_vector_bfgs2;
    if(verbose) printf("Using the BFGS2 minimizer.\n");
  } else if(alg==gSD) {
    T = gsl_multimin_fdfminimizer_steepest_descent;
    if(verbose) printf("Using the steepest descent minimizer.\n");
  } else {
    ERROR_INFO();
    throw std::runtime_error("Unsupported minimizer\n");
  }

  // Run an initial calculation
  double oldE=calc_E(x,minimizer.params);

  // Turn off verbose setting
  pars.set.set_bool("Verbose",false);
  // and load from old checkpoint
  pars.set.set_string("LoadChk",pars.set.get_string("SaveChk"));

  // Initialize minimizer
  s = gsl_multimin_fdfminimizer_alloc (T, minimizer.n);

  // Use initial step length of 0.02 bohr, and a line search accuracy
  // 1e-1 (recommended in the GSL manual for BFGS)
  gsl_multimin_fdfminimizer_set (s, &minimizer, x, 0.02, 1e-1);

  // Store old force
  arma::mat oldf=interpret_force(s->gradient);

  fprintf(stderr,"Geometry optimizer initialized in %s.\n",tprog.elapsed().c_str());
  fprintf(stderr,"Entering minimization loop with %s optimizer.\n",set.get_string("Optimizer").c_str());

  fprintf(stderr,"%4s %16s %10s %10s %9s %9s %9s %9s %s\n","iter","E","dE","dE/dEproj","disp max","disp rms","f max","f rms", "titer");

  std::vector<atom_t> oldgeom(atoms);

  bool convd=false;
  int iter;

  for(iter=1;iter<=maxiter;iter++) {
    printf("\nGeometry iteration %i\n",(int) iter);
    fflush(stdout);

    Timer titer;

    status = gsl_multimin_fdfminimizer_iterate (s);

    if (status) {
      fprintf(stderr,"GSL encountered error: \"%s\".\n",gsl_strerror(status));
      break;
    }

    // New geometry is
    std::vector<atom_t> geom=get_atoms(s->x,pars);

    // Calculate displacements
    double dmax, drms;
    get_displacement(geom, oldgeom, dmax, drms);

    // Calculate projected change of energy
    double dEproj=calculate_projection(geom,oldgeom,oldf,pars.dofidx);
    // Actual change of energy is
    double dE=s->f - oldE;

    // Switch geometries
    oldgeom=geom;
    // Save old force

    // Get forces
    double fmax, frms;
    get_forces(s->gradient, fmax, frms);

    // Save geometry step
    char comment[80];
    sprintf(comment,"Step %i",(int) iter);
    save_xyz(get_atoms(s->x,pars),comment,optmovie,true);

    // Check convergence
    bool fmaxconv=false, frmsconv=false;
    bool dmaxconv=false, drmsconv=false;

    switch(crit) {

    case(LOOSE):
      if(fmax < 2.5e-3)
	fmaxconv=true;
      if(frms < 1.7e-3)
	frmsconv=true;
      if(dmax < 1.0e-2)
	dmaxconv=true;
      if(drms < 6.7e-3)
	drmsconv=true;
      break;

    case(NORMAL):
      if(fmax < 4.5e-4)
	fmaxconv=true;
      if(frms < 3.0e-4)
	frmsconv=true;
      if(dmax < 1.8e-3)
	dmaxconv=true;
      if(drms < 1.2e-3)
	drmsconv=true;
      break;

    case(TIGHT):
      if(fmax < 1.5e-5)
	fmaxconv=true;
      if(frms < 1.0e-5)
	frmsconv=true;
      if(dmax < 6.0e-5)
	dmaxconv=true;
      if(drms < 4.0e-5)
	drmsconv=true;
      break;

    case(VERYTIGHT):
      if(fmax < 2.0e-6)
	fmaxconv=true;
      if(frms < 1.0e-6)
	frmsconv=true;
      if(dmax < 6.0e-6)
	dmaxconv=true;
      if(drms < 4.0e-6)
	drmsconv=true;
      break;

    default:
      ERROR_INFO();
      throw std::runtime_error("Not implemented!\n");
    }

    // Converged?
    const static char cconv[]=" *";

    double dEfrac;
    if(dEproj!=0.0)
      dEfrac=dE/dEproj;
    else
      dEfrac=0.0;

    fprintf(stderr,"%4d % 16.8f % .3e % .3e %.3e%c %.3e%c %.3e%c %.3e%c %s\n", (int) iter, s->f, dE, dEfrac, dmax, cconv[dmaxconv], drms, cconv[drmsconv], fmax, cconv[fmaxconv], frms, cconv[frmsconv], titer.elapsed().c_str());
    fflush(stderr);

    convd=dmaxconv && drmsconv && fmaxconv && frmsconv;

    if(convd) {
      fprintf(stderr,"Converged.\n");
      break;
    }

    // Store old energy
    oldE=s->f;
    // Store old force
    oldf=interpret_force(s->gradient);
  }

  if(convd)
    save_xyz(get_atoms(s->x,pars),"Optimized geometry",result);

  gsl_multimin_fdfminimizer_free (s);

  gsl_vector_free (x);

  if(iter==maxiter && !convd) {
    printf("Geometry convergence was not achieved!\n");
  }


  printf("Running program took %s.\n",tprog.elapsed().c_str());

  return 0;
}
void 
nb_kernel231_ppc_altivec  (int *             p_nri,
                       int               iinr[],
                       int               jindex[],
                       int               jjnr[],
                       int               shift[],
                       float             shiftvec[],
                       float             fshift[],
                       int               gid[],
                       float             pos[],
                       float             faction[],
                       float             charge[],
                       float *           p_facel,
                       float *           p_krf,
                       float *           p_crf,
                       float             Vc[],
                       int               type[],
                       int *             p_ntype,
                       float             vdwparam[],
                       float             Vvdw[],
                       float *           p_tabscale,
                       float             VFtab[],
                       float             invsqrta[],
                       float             dvda[],
                       float *           p_gbtabscale,
                       float             GBtab[],
                       int *             p_nthreads,
                       int *             count,
                       void *            mtx,
                       int *             outeriter,
                       int *             inneriter,
					   float *           work)
{
	vector float vkrf,vcrf;
	vector float iOx,iOy,iOz,iH1x,iH1y,iH1z,iH2x,iH2y,iH2z;
	vector float dOx,dOy,dOz,dH1x,dH1y,dH1z,dH2x,dH2y,dH2z;
	vector float vfacel,vcoulO,vcoulH1,vcoulH2,nul;
	vector float Vvdwtot,c6,c12,VVd,VVr,FFd,FFr,tsc,r;
	vector float fsO,fsH1,fsH2,krsqO,krsqH1,krsqH2;
	vector float vctot,qqO,qqH,iqO,iqH,jq;
	vector float fiOx,fiOy,fiOz,fiH1x,fiH1y,fiH1z,fiH2x,fiH2y,fiH2z;
	vector float tmp1,tmp2,tmp3,tmp4;
	vector float rinvO,rinvH1,rinvH2,rinvsqH1,rinvsqH2;
	vector float rsqO,rsqH1,rsqH2;
  
	int n,k,ii,is3,ii3,ntiA,nj0,nj1;
	int jnra,jnrb,jnrc,jnrd;
	int j3a,j3b,j3c,j3d;
	int nri, ntype, nouter, ninner;
	int tja,tjb,tjc,tjd;
#ifdef GMX_THREAD_SHM_FDECOMP
	int nn0, nn1;
#endif

    nouter   = 0;
    ninner   = 0;
    nri      = *p_nri;
    ntype    = *p_ntype;
	nul=vec_zero();
	tsc=load_float_and_splat(p_tabscale);
	vfacel=load_float_and_splat(p_facel);
	vkrf=load_float_and_splat(p_krf);
	vcrf=load_float_and_splat(p_crf);
	ii         = iinr[0];
	iqO        = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);
	iqH        = vec_madd(load_float_and_splat(charge+ii+1),vfacel,nul);
	ntiA       = 2*ntype*type[ii];
  
#ifdef GMX_THREAD_SHM_FDECOMP
    nthreads = *p_nthreads;
	do {
		tMPI_Thread_mutex_lock((tMPI_Thread_mutex_t *)mtx);
		nn0              = *count;
		nn1              = nn0+(nri-nn0)/(2*nthreads)+3;
		*count           = nn1;
		tMPI_Thread_mutex_unlock((tMPI_Thread_mutex_t *)mtx);
		if(nn1>nri) nn1=nri;
		for(n=nn0; (n<nn1); n++) {
#if 0
		} /* maintain correct indentation even with conditional left braces */
#endif
#else /* without tMPI_Threads */
		for(n=0;n<nri;n++) {
#endif  
			is3        = 3*shift[n];
			ii         = iinr[n];
			ii3        = 3*ii;
			load_1_3atoms_shift_and_splat(pos+ii3,shiftvec+is3,&iOx,&iOy,&iOz,
										  &iH1x,&iH1y,&iH1z,&iH2x,&iH2y,&iH2z);
			vctot      = nul;
			Vvdwtot     = nul;
			fiOx       = nul;
			fiOy       = nul;
			fiOz       = nul;
			fiH1x      = nul;
			fiH1y      = nul;
			fiH1z      = nul;
			fiH2x      = nul;
			fiH2y      = nul;
			fiH2z      = nul;
			nj0        = jindex[n];
			nj1        = jindex[n+1];
    
			for(k=nj0; k<(nj1-3); k+=4) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				jnrc            = jjnr[k+2];
				jnrd            = jjnr[k+3];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				j3c             = 3*jnrc;
				j3d             = 3*jnrd;
				transpose_4_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),
								 load_xyz(pos+j3c),
								 load_xyz(pos+j3d),&dH2x,&dH2y,&dH2z);
				dOx             = vec_sub(iOx,dH2x);
				dOy             = vec_sub(iOy,dH2y);
				dOz             = vec_sub(iOz,dH2z);
				dH1x            = vec_sub(iH1x,dH2x);
				dH1y            = vec_sub(iH1y,dH2y);
				dH1z            = vec_sub(iH1z,dH2z);
				dH2x            = vec_sub(iH2x,dH2x);
				dH2y            = vec_sub(iH2y,dH2y);
				dH2z            = vec_sub(iH2z,dH2z);
      
				rsqO            = vec_madd(dOx,dOx,nul);
				rsqH1           = vec_madd(dH1x,dH1x,nul);
				rsqH2           = vec_madd(dH2x,dH2x,nul);
				rsqO            = vec_madd(dOy,dOy,rsqO);
				rsqH1           = vec_madd(dH1y,dH1y,rsqH1);
				rsqH2           = vec_madd(dH2y,dH2y,rsqH2);
				rsqO            = vec_madd(dOz,dOz,rsqO);
				rsqH1           = vec_madd(dH1z,dH1z,rsqH1);
				rsqH2           = vec_madd(dH2z,dH2z,rsqH2);
				do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
				rinvsqH1        = vec_madd(rinvH1,rinvH1,nul);
				rinvsqH2        = vec_madd(rinvH2,rinvH2,nul);
				r               = vec_madd(rinvO,rsqO,nul);
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				tjc             = ntiA+2*type[jnrc];
				tjd             = ntiA+2*type[jnrd];
				/* load 4 j charges and multiply by iq */
				jq=load_4_float(charge+jnra,charge+jnrb,
								charge+jnrc,charge+jnrd);
				load_4_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,vdwparam+tjd,&c6,&c12);
				do_4_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&FFd,&VVr,&FFr);
				Vvdwtot         = vec_madd(c6,VVd,Vvdwtot);
				tmp1            = vec_nmsub(c6,FFd,nul);
				Vvdwtot         = vec_madd(c12,VVr,Vvdwtot);
				tmp1            = vec_nmsub(c12,FFr,tmp1);
				tmp1            = vec_madd(tmp1,tsc,nul);

				qqO             = vec_madd(iqO,jq,nul);
				qqH             = vec_madd(iqH,jq,nul);
				krsqO           = vec_madd(vkrf,rsqO,nul);
				krsqH1          = vec_madd(vkrf,rsqH1,nul);
				krsqH2          = vec_madd(vkrf,rsqH2,nul);

				fsO             = vec_nmsub(vec_two(),krsqO,rinvO);
				vcoulO          = vec_add(rinvO,krsqO);
				vcoulH1         = vec_add(rinvH1,krsqH1);
				fsO             = vec_madd(qqO,fsO,nul);          
				vcoulH2         = vec_add(rinvH2,krsqH2);
				vcoulO          = vec_sub(vcoulO,vcrf);
				vcoulH1         = vec_sub(vcoulH1,vcrf);
				vcoulH2         = vec_sub(vcoulH2,vcrf);
				vctot           = vec_madd(qqO,vcoulO,vctot);
				fsH1            = vec_nmsub(vec_two(),krsqH1,rinvH1);
				fsH2            = vec_nmsub(vec_two(),krsqH2,rinvH2);
				vctot           = vec_madd(qqH,vcoulH1,vctot);
				fsO             = vec_madd(fsO,rinvO,tmp1);
				fsO             = vec_madd(fsO,rinvO,nul);
				fsH1            = vec_madd(fsH1,qqH,nul);
				fsH2            = vec_madd(fsH2,qqH,nul);
				vctot           = vec_madd(qqH,vcoulH2,vctot);
				fsH1            = vec_madd(fsH1,rinvsqH1,nul);
				fsH2            = vec_madd(fsH2,rinvsqH2,nul);

				fiOx            = vec_madd(fsO,dOx,fiOx); /* +=fx */
				dOx             = vec_nmsub(fsO,dOx,nul); /* -fx */
				fiOy            = vec_madd(fsO,dOy,fiOy); /* +=fy */
				dOy             = vec_nmsub(fsO,dOy,nul); /* -fy */
				fiOz            = vec_madd(fsO,dOz,fiOz); /* +=fz */
				dOz             = vec_nmsub(fsO,dOz,nul); /* -fz */
				fiH1x           = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
				dOx             = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
				fiH1y           = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
				dOy             = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
				fiH1z           = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
				dOz             = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
				fiH2x           = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
				dOx             = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
				fiH2y           = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
				dOy             = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
				fiH2z           = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
				dOz             = vec_nmsub(fsH2,dH2z,dOz); /* -fz */

				transpose_3_to_4(dOx,dOy,dOz,&tmp1,&tmp2,&tmp3,&tmp4);
				add_xyz_to_mem(faction+j3a,tmp1);
				add_xyz_to_mem(faction+j3b,tmp2);
				add_xyz_to_mem(faction+j3c,tmp3);
				add_xyz_to_mem(faction+j3d,tmp4);
			} 
			if(k<(nj1-2)) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				jnrc            = jjnr[k+2];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				j3c             = 3*jnrc;
				transpose_4_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),
								 load_xyz(pos+j3c),nul,&dH2x,&dH2y,&dH2z);
				dOx             = vec_sub(iOx,dH2x);
				dOy             = vec_sub(iOy,dH2y);
				dOz             = vec_sub(iOz,dH2z);
				dH1x            = vec_sub(iH1x,dH2x);
				dH1y            = vec_sub(iH1y,dH2y);
				dH1z            = vec_sub(iH1z,dH2z);
				dH2x            = vec_sub(iH2x,dH2x);
				dH2y            = vec_sub(iH2y,dH2y);
				dH2z            = vec_sub(iH2z,dH2z);
      
				rsqO            = vec_madd(dOx,dOx,nul);
				rsqH1           = vec_madd(dH1x,dH1x,nul);
				rsqH2           = vec_madd(dH2x,dH2x,nul);
				rsqO            = vec_madd(dOy,dOy,rsqO);
				rsqH1           = vec_madd(dH1y,dH1y,rsqH1);
				rsqH2           = vec_madd(dH2y,dH2y,rsqH2);
				rsqO            = vec_madd(dOz,dOz,rsqO);
				rsqH1           = vec_madd(dH1z,dH1z,rsqH1);
				rsqH2           = vec_madd(dH2z,dH2z,rsqH2);

				zero_highest_element_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
				do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
				zero_highest_element_in_3_vectors(&rinvO,&rinvH1,&rinvH2);

				r               = vec_madd(rinvO,rsqO,nul);
				rinvsqH1        = vec_madd(rinvH1,rinvH1,nul);
				rinvsqH2        = vec_madd(rinvH2,rinvH2,nul);

				jq=load_3_float(charge+jnra,charge+jnrb,charge+jnrc);      
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				tjc             = ntiA+2*type[jnrc];
				load_3_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,&c6,&c12);
				do_3_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&FFd,&VVr,&FFr);
				Vvdwtot         = vec_madd(c6,VVd,Vvdwtot);
				tmp1            = vec_nmsub(c6,FFd,nul);
				Vvdwtot         = vec_madd(c12,VVr,Vvdwtot);
				tmp1            = vec_nmsub(c12,FFr,tmp1);
				tmp1            = vec_madd(tmp1,tsc,nul);

				qqO             = vec_madd(iqO,jq,nul);
				qqH             = vec_madd(iqH,jq,nul);
				krsqO           = vec_madd(vkrf,rsqO,nul);
				krsqH1          = vec_madd(vkrf,rsqH1,nul);
				krsqH2          = vec_madd(vkrf,rsqH2,nul);
				fsO             = vec_nmsub(vec_two(),krsqO,rinvO);
				vcoulO          = vec_add(rinvO,krsqO);
				vcoulH1         = vec_add(rinvH1,krsqH1);
				fsO             = vec_madd(qqO,fsO,nul);          
				vcoulH2         = vec_add(rinvH2,krsqH2);
				vcoulO          = vec_sub(vcoulO,vcrf);
				vcoulH1         = vec_sub(vcoulH1,vcrf);
				vcoulH2         = vec_sub(vcoulH2,vcrf);
				vctot           = vec_madd(qqO,vcoulO,vctot);
				fsH1            = vec_nmsub(vec_two(),krsqH1,rinvH1);
				fsH2            = vec_nmsub(vec_two(),krsqH2,rinvH2);
				vctot           = vec_madd(qqH,vcoulH1,vctot);
				fsO             = vec_madd(fsO,rinvO,tmp1);
				fsO             = vec_madd(fsO,rinvO,nul);
				fsH1            = vec_madd(fsH1,qqH,nul);
				fsH2            = vec_madd(fsH2,qqH,nul);
				vctot           = vec_madd(qqH,vcoulH2,vctot);
				fsH1            = vec_madd(fsH1,rinvsqH1,nul);
				fsH2            = vec_madd(fsH2,rinvsqH2,nul);

				fiOx            = vec_madd(fsO,dOx,fiOx); /* +=fx */
				dOx             = vec_nmsub(fsO,dOx,nul); /* -fx */
				fiOy            = vec_madd(fsO,dOy,fiOy); /* +=fy */
				dOy             = vec_nmsub(fsO,dOy,nul); /* -fy */
				fiOz            = vec_madd(fsO,dOz,fiOz); /* +=fz */
				dOz             = vec_nmsub(fsO,dOz,nul); /* -fz */
				fiH1x           = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
				dOx             = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
				fiH1y           = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
				dOy             = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
				fiH1z           = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
				dOz             = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
				fiH2x           = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
				dOx             = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
				fiH2y           = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
				dOy             = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
				fiH2z           = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
				dOz             = vec_nmsub(fsH2,dH2z,dOz); /* -fz */

				transpose_4_to_3(dOx,dOy,dOz,nul,&tmp1,&tmp2,&tmp3);
				add_xyz_to_mem(faction+j3a,tmp1);
				add_xyz_to_mem(faction+j3b,tmp2);
				add_xyz_to_mem(faction+j3c,tmp3);
			} else if(k<(nj1-1)) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				transpose_2_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),&dH2x,&dH2y,&dH2z);
				dOx             = vec_sub(iOx,dH2x);
				dOy             = vec_sub(iOy,dH2y);
				dOz             = vec_sub(iOz,dH2z);
				dH1x            = vec_sub(iH1x,dH2x);
				dH1y            = vec_sub(iH1y,dH2y);
				dH1z            = vec_sub(iH1z,dH2z);
				dH2x            = vec_sub(iH2x,dH2x);
				dH2y            = vec_sub(iH2y,dH2y);
				dH2z            = vec_sub(iH2z,dH2z);
      
				rsqO            = vec_madd(dOx,dOx,nul);
				rsqH1           = vec_madd(dH1x,dH1x,nul);
				rsqH2           = vec_madd(dH2x,dH2x,nul);
				rsqO            = vec_madd(dOy,dOy,rsqO);
				rsqH1           = vec_madd(dH1y,dH1y,rsqH1);
				rsqH2           = vec_madd(dH2y,dH2y,rsqH2);
				rsqO            = vec_madd(dOz,dOz,rsqO);
				rsqH1           = vec_madd(dH1z,dH1z,rsqH1);
				rsqH2           = vec_madd(dH2z,dH2z,rsqH2);

				zero_highest_2_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
				do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
				zero_highest_2_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);

				r               = vec_madd(rinvO,rsqO,nul);
				rinvsqH1        = vec_madd(rinvH1,rinvH1,nul);
				rinvsqH2        = vec_madd(rinvH2,rinvH2,nul);
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				/* load 2 j charges and multiply by iq */
				jq=load_2_float(charge+jnra,charge+jnrb);
				load_2_pair(vdwparam+tja,vdwparam+tjb,&c6,&c12);
				do_2_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&FFd,&VVr,&FFr);
				Vvdwtot         = vec_madd(c6,VVd,Vvdwtot);
				tmp1            = vec_nmsub(c6,FFd,nul);
				Vvdwtot         = vec_madd(c12,VVr,Vvdwtot);
				tmp1            = vec_nmsub(c12,FFr,tmp1);
				tmp1            = vec_madd(tmp1,tsc,nul);

				qqO             = vec_madd(iqO,jq,nul);
				qqH             = vec_madd(iqH,jq,nul);
				krsqO           = vec_madd(vkrf,rsqO,nul);
				krsqH1          = vec_madd(vkrf,rsqH1,nul);
				krsqH2          = vec_madd(vkrf,rsqH2,nul);
				fsO             = vec_nmsub(vec_two(),krsqO,rinvO);
				vcoulO          = vec_add(rinvO,krsqO);
				vcoulH1         = vec_add(rinvH1,krsqH1);
				fsO             = vec_madd(qqO,fsO,nul);          
				vcoulH2         = vec_add(rinvH2,krsqH2);
				vcoulO          = vec_sub(vcoulO,vcrf);
				vcoulH1         = vec_sub(vcoulH1,vcrf);
				vcoulH2         = vec_sub(vcoulH2,vcrf);
				vctot           = vec_madd(qqO,vcoulO,vctot);
				fsH1            = vec_nmsub(vec_two(),krsqH1,rinvH1);
				fsH2            = vec_nmsub(vec_two(),krsqH2,rinvH2);
				vctot           = vec_madd(qqH,vcoulH1,vctot);
				fsO             = vec_madd(fsO,rinvO,tmp1);
				fsO             = vec_madd(fsO,rinvO,nul);
				fsH1            = vec_madd(fsH1,qqH,nul);
				fsH2            = vec_madd(fsH2,qqH,nul);
				vctot           = vec_madd(qqH,vcoulH2,vctot);
				fsH1            = vec_madd(fsH1,rinvsqH1,nul);
				fsH2            = vec_madd(fsH2,rinvsqH2,nul);

				fiOx            = vec_madd(fsO,dOx,fiOx); /* +=fx */
				dOx             = vec_nmsub(fsO,dOx,nul); /* -fx */
				fiOy            = vec_madd(fsO,dOy,fiOy); /* +=fy */
				dOy             = vec_nmsub(fsO,dOy,nul); /* -fy */
				fiOz            = vec_madd(fsO,dOz,fiOz); /* +=fz */
				dOz             = vec_nmsub(fsO,dOz,nul); /* -fz */
				fiH1x           = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
				dOx             = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
				fiH1y           = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
				dOy             = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
				fiH1z           = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
				dOz             = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
				fiH2x           = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
				dOx             = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
				fiH2y           = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
				dOy             = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
				fiH2z           = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
				dOz             = vec_nmsub(fsH2,dH2z,dOz); /* -fz */

				transpose_3_to_2(dOx,dOy,dOz,&tmp1,&tmp2);
				add_xyz_to_mem(faction+j3a,tmp1);
				add_xyz_to_mem(faction+j3b,tmp2);
			} else if(k<nj1) {
				jnra            = jjnr[k];
				j3a             = 3*jnra;
				transpose_1_to_3(load_xyz(pos+j3a),&dH2x,&dH2y,&dH2z);
				dOx             = vec_sub(iOx,dH2x);
				dOy             = vec_sub(iOy,dH2y);
				dOz             = vec_sub(iOz,dH2z);
				dH1x            = vec_sub(iH1x,dH2x);
				dH1y            = vec_sub(iH1y,dH2y);
				dH1z            = vec_sub(iH1z,dH2z);
				dH2x            = vec_sub(iH2x,dH2x);
				dH2y            = vec_sub(iH2y,dH2y);
				dH2z            = vec_sub(iH2z,dH2z);
      
				rsqO            = vec_madd(dOx,dOx,nul);
				rsqH1           = vec_madd(dH1x,dH1x,nul);
				rsqH2           = vec_madd(dH2x,dH2x,nul);
				rsqO            = vec_madd(dOy,dOy,rsqO);
				rsqH1           = vec_madd(dH1y,dH1y,rsqH1);
				rsqH2           = vec_madd(dH2y,dH2y,rsqH2);
				rsqO            = vec_madd(dOz,dOz,rsqO);
				rsqH1           = vec_madd(dH1z,dH1z,rsqH1);
				rsqH2           = vec_madd(dH2z,dH2z,rsqH2);

				zero_highest_3_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2);
				do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2);
				zero_highest_3_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2);

				r               = vec_madd(rinvO,rsqO,nul);
				rinvsqH1        = vec_madd(rinvH1,rinvH1,nul);
				rinvsqH2        = vec_madd(rinvH2,rinvH2,nul);
				tja             = ntiA+2*type[jnra];
				/* load 1 j charges and multiply by iq */
				jq=load_1_float(charge+jnra);
				load_1_pair(vdwparam+tja,&c6,&c12);
				do_1_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&FFd,&VVr,&FFr);
				Vvdwtot         = vec_madd(c6,VVd,Vvdwtot);
				tmp1            = vec_nmsub(c6,FFd,nul);
				Vvdwtot         = vec_madd(c12,VVr,Vvdwtot);
				tmp1            = vec_nmsub(c12,FFr,tmp1);
				tmp1            = vec_madd(tmp1,tsc,nul);

				qqO             = vec_madd(iqO,jq,nul);
				qqH             = vec_madd(iqH,jq,nul);
				krsqO           = vec_madd(vkrf,rsqO,nul);
				krsqH1          = vec_madd(vkrf,rsqH1,nul);
				krsqH2          = vec_madd(vkrf,rsqH2,nul);
				fsO             = vec_nmsub(vec_two(),krsqO,rinvO);
				vcoulO          = vec_add(rinvO,krsqO);
				vcoulH1         = vec_add(rinvH1,krsqH1);
				fsO             = vec_madd(qqO,fsO,nul);          
				vcoulH2         = vec_add(rinvH2,krsqH2);
				vcoulO          = vec_sub(vcoulO,vcrf);
				vcoulH1         = vec_sub(vcoulH1,vcrf);
				vcoulH2         = vec_sub(vcoulH2,vcrf);
				vctot           = vec_madd(qqO,vcoulO,vctot);
				fsH1            = vec_nmsub(vec_two(),krsqH1,rinvH1);
				fsH2            = vec_nmsub(vec_two(),krsqH2,rinvH2);
				vctot           = vec_madd(qqH,vcoulH1,vctot);
				fsO             = vec_madd(fsO,rinvO,tmp1);
				fsO             = vec_madd(fsO,rinvO,nul);
				fsH1            = vec_madd(fsH1,qqH,nul);
				fsH2            = vec_madd(fsH2,qqH,nul);
				vctot           = vec_madd(qqH,vcoulH2,vctot);
				fsH1            = vec_madd(fsH1,rinvsqH1,nul);
				fsH2            = vec_madd(fsH2,rinvsqH2,nul);

				fiOx            = vec_madd(fsO,dOx,fiOx); /* +=fx */
				dOx             = vec_nmsub(fsO,dOx,nul); /* -fx */
				fiOy            = vec_madd(fsO,dOy,fiOy); /* +=fy */
				dOy             = vec_nmsub(fsO,dOy,nul); /* -fy */
				fiOz            = vec_madd(fsO,dOz,fiOz); /* +=fz */
				dOz             = vec_nmsub(fsO,dOz,nul); /* -fz */
				fiH1x           = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */
				dOx             = vec_nmsub(fsH1,dH1x,dOx); /* -fx */
				fiH1y           = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */
				dOy             = vec_nmsub(fsH1,dH1y,dOy); /* -fy */
				fiH1z           = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */
				dOz             = vec_nmsub(fsH1,dH1z,dOz); /* -fz */
				fiH2x           = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */
				dOx             = vec_nmsub(fsH2,dH2x,dOx); /* -fx */
				fiH2y           = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */
				dOy             = vec_nmsub(fsH2,dH2y,dOy); /* -fy */
				fiH2z           = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */
				dOz             = vec_nmsub(fsH2,dH2z,dOz); /* -fz */

				transpose_3_to_1(dOx,dOy,dOz,&tmp1);
				add_xyz_to_mem(faction+j3a,tmp1);
			}
			/* update outer data */
			update_i_3atoms_forces(faction+ii3,fshift+is3,
								   fiOx,fiOy,fiOz,fiH1x,fiH1y,fiH1z,
								   fiH2x,fiH2y,fiH2z);

			add_vector_to_float(Vc+gid[n],vctot);
			add_vector_to_float(Vvdw+gid[n],Vvdwtot);
			ninner += nj1 - nj0;
		}
#ifdef GMX_THREAD_SHM_FDECOMP
		nouter += nn1 - nn0;
	} while (nn1<nri);
#else
	nouter = nri;
#endif
	*outeriter = nouter;
	*inneriter = ninner;
}
void 
nb_kernel400_ppc_altivec  (int *             p_nri,
                       int               iinr[],
                       int               jindex[],
                       int               jjnr[],
                       int               shift[],
                       float             shiftvec[],
                       float             fshift[],
                       int               gid[],
                       float             pos[],
                       float             faction[],
                       float             charge[],
                       float *           p_facel,
                       float *           p_krf,
                       float *           p_crf,
                       float             Vc[],
                       int               type[],
                       int *             p_ntype,
                       float             vdwparam[],
                       float             Vvdw[],
                       float *           p_tabscale,
                       float             VFtab[],
                       float             invsqrta[],
                       float             dvda[],
                       float *           p_gbtabscale,
                       float             GBtab[],
                       int *             p_nthreads,
                       int *             count,
                       void *            mtx,
                       int *             outeriter,
                       int *             inneriter,
					   float *           work)
{
	vector float ix,iy,iz,shvec;
	vector float vfacel,fs,nul;
	vector float dx,dy,dz;
	vector float vctot,qq,iq;
	vector float fix,fiy,fiz;
	vector float tmp1,tmp2,tmp3,tmp4;
	vector float rinv,r,rsq,VVc,FFc;
	vector float isai,isaj,isaprod,gbtsc,dvdasum,dvdaj,dvdatmp,gbscale,half;

	int n,k,ii,is3,ii3,nj0,nj1;
	int jnra,jnrb,jnrc,jnrd;
	int j3a,j3b,j3c,j3d;
	int nri, ntype, nouter, ninner;
#ifdef GMX_THREADS
	int nn0, nn1;
#endif

    nouter   = 0;
    ninner   = 0;
    nri      = *p_nri;
    ntype    = *p_ntype;
	nul=vec_zero();
	half=vec_half();
	vfacel=load_float_and_splat(p_facel);
	gbtsc=load_float_and_splat(p_gbtabscale);

#ifdef GMX_THREADS
    nthreads = *p_nthreads;
	do {
		gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx);
		nn0              = *count;
		nn1              = nn0+(nri-nn0)/(2*nthreads)+3;
		*count           = nn1;
		gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx);
		if(nn1>nri) nn1=nri;
		for(n=nn0; (n<nn1); n++) {
#if 0
		} /* maintain correct indentation even with conditional left braces */
#endif
#else /* without gmx_threads */
		for(n=0;n<nri;n++) {
#endif  
			is3        = 3*shift[n];
			shvec      = load_xyz(shiftvec+is3);
			ii         = iinr[n];
			ii3        = 3*ii;
			ix         = load_xyz(pos+ii3);
			vctot      = nul;
			dvdasum    = nul;
			fix        = nul;
			fiy        = nul;
			fiz        = nul;
			ix         = vec_add(ix,shvec);    
			nj0        = jindex[n];
			nj1        = jindex[n+1];
			splat_xyz_to_vectors(ix,&ix,&iy,&iz);
			iq         = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);
			isai       = load_float_and_splat(invsqrta+ii);

			for(k=nj0; k<(nj1-3); k+=4) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				jnrc            = jjnr[k+2];
				jnrd            = jjnr[k+3];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				j3c             = 3*jnrc;
				j3d             = 3*jnrd;
				transpose_4_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),
								 load_xyz(pos+j3c),
								 load_xyz(pos+j3d),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				rinv            = do_invsqrt(rsq);
				r               = vec_madd(rinv,rsq,nul);
				/* load 1/sqrt(a2) and multiply with 1/sqrt(a1) */
				isaj    = load_4_float(invsqrta+jnra,invsqrta+jnrb,
									   invsqrta+jnrc,invsqrta+jnrd);
				isaprod = vec_madd(isai,isaj,nul);
				/* load 4 j charges and multiply by iq and 1/sqrt(a1*a2) */
				qq = vec_madd(load_4_float(charge+jnra,charge+jnrb,
										   charge+jnrc,charge+jnrd),iq,nul);
				qq = vec_madd(isaprod,qq,nul);
				gbscale = vec_madd(isaprod,gbtsc,nul);
				do_4_ctable_coul(GBtab,vec_madd(r,gbscale,nul),&VVc,&FFc);
				dvdaj           = load_4_float(dvda+jnra,dvda+jnrb,
											   dvda+jnrc,dvda+jnrd);
				fs              = vec_madd(qq,FFc,nul);
				fs              = vec_madd(fs,gbscale,nul);
				vctot           = vec_madd(qq,VVc,vctot);
				dvdatmp         = vec_madd(fs,r,nul);
				dvdatmp         = vec_madd(qq,VVc,dvdatmp);
				dvdasum         = vec_sub(dvdasum,dvdatmp);
				dvdaj		      = vec_sub(dvdaj,dvdatmp);
				store_4_float(dvdaj,dvda+jnra,dvda+jnrb,dvda+jnrc,dvda+jnrd);
				fs              = vec_nmsub(fs,rinv,nul);
				fix             = vec_madd(fs,dx,fix); /* +=fx */
				fiy             = vec_madd(fs,dy,fiy); /* +=fy */
				fiz             = vec_madd(fs,dz,fiz); /* +=fz */
				dx              = vec_nmsub(dx,fs,nul); /* -fx */
				dy              = vec_nmsub(dy,fs,nul); /* -fy */
				dz              = vec_nmsub(dz,fs,nul); /* -fz */
				transpose_3_to_4(dx,dy,dz,&tmp1,&tmp2,&tmp3,&tmp4);
				add_xyz_to_mem(faction+j3a,tmp1);
				add_xyz_to_mem(faction+j3b,tmp2);
				add_xyz_to_mem(faction+j3c,tmp3);
				add_xyz_to_mem(faction+j3d,tmp4);
			}
			if(k<(nj1-1)) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				transpose_2_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				zero_highest_2_elements_in_vector(&rsq);
				rinv            = do_invsqrt(rsq);
				zero_highest_2_elements_in_vector(&rinv);
				r               = vec_madd(rinv,rsq,nul);
				/* load 1/sqrt(a2) and multiply with 1/sqrt(a1) */
				isaj    = load_2_float(invsqrta+jnra,invsqrta+jnrb);
				isaprod = vec_madd(isai,isaj,nul);
				/* load 2 j charges and multiply by iq and 1/sqrt(a1*a2) */
				qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul);
				qq = vec_madd(isaprod,qq,nul);
				gbscale = vec_madd(isaprod,gbtsc,nul);
				do_2_ctable_coul(GBtab,vec_madd(r,gbscale,nul),&VVc,&FFc);
				dvdaj           = load_2_float(dvda+jnra,dvda+jnrb);
				fs              = vec_madd(qq,FFc,nul);
				fs              = vec_madd(fs,gbscale,nul);
				vctot           = vec_madd(qq,VVc,vctot);
				dvdatmp         = vec_madd(fs,r,nul);
				dvdatmp         = vec_madd(qq,VVc,dvdatmp);
				dvdasum         = vec_sub(dvdasum,dvdatmp);
				dvdaj           = vec_sub(dvdaj,dvdatmp);
				store_2_float(dvdaj,dvda+jnra,dvda+jnrb);
				fs              = vec_nmsub(fs,rinv,nul);
				fix             = vec_madd(fs,dx,fix); /* +=fx */
				fiy             = vec_madd(fs,dy,fiy); /* +=fy */
				fiz             = vec_madd(fs,dz,fiz); /* +=fz */
				dx              = vec_nmsub(dx,fs,nul); /* -fx */
				dy              = vec_nmsub(dy,fs,nul); /* -fy */
				dz              = vec_nmsub(dz,fs,nul); /* -fz */
				transpose_3_to_2(dx,dy,dz,&tmp1,&tmp2);
				add_xyz_to_mem(faction+j3a,tmp1);
				add_xyz_to_mem(faction+j3b,tmp2);     
				k              += 2;
			}
			if((nj1-nj0) & 0x1) {
				jnra            = jjnr[k];
				j3a             = 3*jnra;
				transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				zero_highest_3_elements_in_vector(&rsq);
				rinv            = do_invsqrt(rsq);
				zero_highest_3_elements_in_vector(&rinv);
				r               = vec_madd(rinv,rsq,nul);
				/* load 1/sqrt(a2) and multiply with 1/sqrt(a1) */
				isaj    = load_1_float(invsqrta+jnra);
				isaprod = vec_madd(isai,isaj,nul);
				/* load 1 j charge and multiply by iq and 1/sqrt(a1*a2) */
				qq = vec_madd(load_1_float(charge+jnra),iq,nul);
				qq = vec_madd(isaprod,qq,nul);
				gbscale = vec_madd(isaprod,gbtsc,nul);
				do_1_ctable_coul(GBtab,vec_madd(r,gbscale,nul),&VVc,&FFc);
				dvdaj           = load_1_float(dvda+jnra);
				fs              = vec_madd(qq,FFc,nul);
				fs              = vec_madd(fs,gbscale,nul);
				vctot           = vec_madd(qq,VVc,vctot);
				dvdatmp         = vec_madd(fs,r,nul);
				dvdatmp         = vec_madd(qq,VVc,dvdatmp);
				dvdasum         = vec_sub(dvdasum,dvdatmp);
				dvdaj           = vec_sub(dvdaj,dvdatmp);
				store_1_float(dvdaj,dvda+jnra);
				fs              = vec_nmsub(fs,rinv,nul);
				fix             = vec_madd(fs,dx,fix); /* +=fx */
				fiy             = vec_madd(fs,dy,fiy); /* +=fy */
				fiz             = vec_madd(fs,dz,fiz); /* +=fz */
				dx              = vec_nmsub(dx,fs,nul); /* -fx */
				dy              = vec_nmsub(dy,fs,nul); /* -fy */
				dz              = vec_nmsub(dz,fs,nul); /* -fz */
				transpose_3_to_1(dx,dy,dz,&tmp1);
				add_xyz_to_mem(faction+j3a,tmp1);
			}
			/* update outer data */
			transpose_3_to_4(fix,fiy,fiz,&tmp1,&tmp2,&tmp3,&tmp4);
			tmp1 = vec_add(tmp1,tmp3);
			tmp2 = vec_add(tmp2,tmp4);
			tmp1 = vec_add(tmp1,tmp2);    
			add_xyz_to_mem(faction+ii3,tmp1);
			add_xyz_to_mem(fshift+is3,tmp1);

			add_vector_to_float(Vc+gid[n],vctot);

			add_vector_to_float(dvda+ii,dvdasum);
			ninner += nj1 - nj0;
		}
#ifdef GMX_THREADS
		nouter += nn1 - nn0;
	} while (nn1<nri);
#else
	nouter = nri;
#endif
	*outeriter = nouter;
	*inneriter = ninner;
}