void nb_kernel310_ppc_altivec (int * p_nri, int iinr[], int jindex[], int jjnr[], int shift[], float shiftvec[], float fshift[], int gid[], float pos[], float faction[], float charge[], float * p_facel, float * p_krf, float * p_crf, float Vc[], int type[], int * p_ntype, float vdwparam[], float Vvdw[], float * p_tabscale, float VFtab[], float invsqrta[], float dvda[], float * p_gbtabscale, float GBtab[], int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, float * work) { vector float ix,iy,iz,shvec; vector float vfacel,tsc,fs,fs2,nul; vector float dx,dy,dz; vector float Vvdwtot,vctot,qq,iq,c6,c12,VVc,FFc; vector float fix,fiy,fiz; vector float tmp1,tmp2,tmp3,tmp4; vector float rinv,r,rinvsq,rsq,rinvsix,Vvdw6,Vvdw12; int n,k,ii,is3,ii3,ntiA,nj0,nj1; int jnra,jnrb,jnrc,jnrd; int j3a,j3b,j3c,j3d; int nri, ntype, nouter, ninner; int tja,tjb,tjc,tjd; #ifdef GMX_THREADS int nn0, nn1; #endif nouter = 0; ninner = 0; nri = *p_nri; ntype = *p_ntype; nul=vec_zero(); vfacel=load_float_and_splat(p_facel); tsc=load_float_and_splat(p_tabscale); #ifdef GMX_THREADS nthreads = *p_nthreads; do { gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx); nn0 = *count; nn1 = nn0+(nri-nn0)/(2*nthreads)+3; *count = nn1; gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx); if(nn1>nri) nn1=nri; for(n=nn0; (n<nn1); n++) { #if 0 } /* maintain correct indentation even with conditional left braces */ #endif #else /* without gmx_threads */ for(n=0;n<nri;n++) { #endif is3 = 3*shift[n]; shvec = load_xyz(shiftvec+is3); ii = iinr[n]; ii3 = 3*ii; ix = load_xyz(pos+ii3); Vvdwtot = nul; vctot = nul; fix = nul; fiy = nul; fiz = nul; ix = vec_add(ix,shvec); nj0 = jindex[n]; nj1 = jindex[n+1]; splat_xyz_to_vectors(ix,&ix,&iy,&iz); ntiA = 2*ntype*type[ii]; iq = vec_madd(load_float_and_splat(charge+ii),vfacel,nul); for(k=nj0; k<(nj1-3); k+=4) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = jjnr[k+2]; jnrd = jjnr[k+3]; j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; j3d = 3*jnrd; transpose_4_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b), load_xyz(pos+j3c), load_xyz(pos+j3d),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); rinv = do_invsqrt(rsq); rinvsq = vec_madd(rinv,rinv,nul); r = vec_madd(rinv,rsq,nul); rinvsix = vec_madd(rinvsq,rinvsq,nul); rinvsix = vec_madd(rinvsix,rinvsq,nul); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; tjc = ntiA+2*type[jnrc]; tjd = ntiA+2*type[jnrd]; qq = vec_madd(load_4_float(charge+jnra,charge+jnrb, charge+jnrc,charge+jnrd),iq,nul); load_4_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,vdwparam+tjd,&c6,&c12); do_4_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc); fs2 = vec_madd(qq,FFc,nul); /* fijC */ vctot = vec_madd(qq,VVc,vctot); Vvdw6 = vec_madd(c6,rinvsix,nul); Vvdw12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul), nul); fs = vec_madd(vec_twelve(),Vvdw12,nul); fs = vec_nmsub(vec_six(),Vvdw6,fs); fs = vec_madd(fs,rinv,nul); Vvdwtot = vec_add(Vvdwtot,Vvdw12); fs = vec_nmsub(fs2,tsc,fs); fs = vec_madd(fs,rinv,nul); Vvdwtot = vec_sub(Vvdwtot,Vvdw6); fix = vec_madd(fs,dx,fix); /* +=fx */ fiy = vec_madd(fs,dy,fiy); /* +=fy */ fiz = vec_madd(fs,dz,fiz); /* +=fz */ dx = vec_nmsub(dx,fs,nul); /* -fx */ dy = vec_nmsub(dy,fs,nul); /* -fy */ dz = vec_nmsub(dz,fs,nul); /* -fz */ transpose_3_to_4(dx,dy,dz,&tmp1,&tmp2,&tmp3,&tmp4); add_xyz_to_mem(faction+j3a,tmp1); add_xyz_to_mem(faction+j3b,tmp2); add_xyz_to_mem(faction+j3c,tmp3); add_xyz_to_mem(faction+j3d,tmp4); } if(k<(nj1-1)) { jnra = jjnr[k]; jnrb = jjnr[k+1]; j3a = 3*jnra; j3b = 3*jnrb; transpose_2_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); zero_highest_2_elements_in_vector(&rsq); rinv = do_invsqrt(rsq); zero_highest_2_elements_in_vector(&rinv); rinvsq = vec_madd(rinv,rinv,nul); r = vec_madd(rinv,rsq,nul); rinvsix = vec_madd(rinvsq,rinvsq,nul); rinvsix = vec_madd(rinvsix,rinvsq,nul); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul); load_2_pair(vdwparam+tja,vdwparam+tjb,&c6,&c12); do_2_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc); fs2 = vec_madd(qq,FFc,nul); /* fijC */ vctot = vec_madd(qq,VVc,vctot); Vvdw6 = vec_madd(c6,rinvsix,nul); Vvdw12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul), nul); fs = vec_madd(vec_twelve(),Vvdw12,nul); fs = vec_nmsub(vec_six(),Vvdw6,fs); Vvdwtot = vec_add(Vvdwtot,Vvdw12); fs = vec_madd(fs,rinv,nul); fs = vec_nmsub(fs2,tsc,fs); fs = vec_madd(fs,rinv,nul); Vvdwtot = vec_sub(Vvdwtot,Vvdw6); fix = vec_madd(fs,dx,fix); /* +=fx */ fiy = vec_madd(fs,dy,fiy); /* +=fy */ fiz = vec_madd(fs,dz,fiz); /* +=fz */ dx = vec_nmsub(dx,fs,nul); /* -fx */ dy = vec_nmsub(dy,fs,nul); /* -fy */ dz = vec_nmsub(dz,fs,nul); /* -fz */ transpose_3_to_2(dx,dy,dz,&tmp1,&tmp2); add_xyz_to_mem(faction+j3a,tmp1); add_xyz_to_mem(faction+j3b,tmp2); k += 2; } if((nj1-nj0) & 0x1) { jnra = jjnr[k]; j3a = 3*jnra; transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); zero_highest_3_elements_in_vector(&rsq); rinv = do_invsqrt(rsq); zero_highest_3_elements_in_vector(&rinv); rinvsq = vec_madd(rinv,rinv,nul); r = vec_madd(rinv,rsq,nul); rinvsix = vec_madd(rinvsq,rinvsq,nul); rinvsix = vec_madd(rinvsix,rinvsq,nul); tja = ntiA+2*type[jnra]; qq = vec_madd(load_1_float(charge+jnra),iq,nul); load_1_pair(vdwparam+tja,&c6,&c12); do_1_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc); fs2 = vec_madd(qq,FFc,nul); /* fijC */ vctot = vec_madd(qq,VVc,vctot); Vvdw6 = vec_madd(c6,rinvsix,nul); Vvdw12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul), nul); fs = vec_madd(vec_twelve(),Vvdw12,nul); fs = vec_nmsub(vec_six(),Vvdw6,fs); fs = vec_madd(fs,rinv,nul); Vvdwtot = vec_add(Vvdwtot,Vvdw12); fs = vec_nmsub(fs2,tsc,fs); fs = vec_madd(fs,rinv,nul); Vvdwtot = vec_sub(Vvdwtot,Vvdw6); fix = vec_madd(fs,dx,fix); /* +=fx */ fiy = vec_madd(fs,dy,fiy); /* +=fy */ fiz = vec_madd(fs,dz,fiz); /* +=fz */ dx = vec_nmsub(dx,fs,nul); /* -fx */ dy = vec_nmsub(dy,fs,nul); /* -fy */ dz = vec_nmsub(dz,fs,nul); /* -fz */ transpose_3_to_1(dx,dy,dz,&tmp1); add_xyz_to_mem(faction+j3a,tmp1); } /* update outer data */ transpose_3_to_4(fix,fiy,fiz,&tmp1,&tmp2,&tmp3,&tmp4); tmp1 = vec_add(tmp1,tmp3); tmp2 = vec_add(tmp2,tmp4); tmp1 = vec_add(tmp1,tmp2); add_xyz_to_mem(faction+ii3,tmp1); add_xyz_to_mem(fshift+is3,tmp1); add_vector_to_float(Vc+gid[n],vctot); add_vector_to_float(Vvdw+gid[n],Vvdwtot); ninner += nj1 - nj0; } #ifdef GMX_THREADS nouter += nn1 - nn0; } while (nn1<nri); #else nouter = nri; #endif *outeriter = nouter; *inneriter = ninner; }
void nb_kernel133nf_ppc_altivec(int * p_nri, int iinr[], int jindex[], int jjnr[], int shift[], float shiftvec[], float fshift[], int gid[], float pos[], float faction[], float charge[], float * p_facel, float * p_krf, float * p_crf, float Vc[], int type[], int * p_ntype, float vdwparam[], float Vvdw[], float * p_tabscale, float VFtab[], float invsqrta[], float dvda[], float * p_gbtabscale, float GBtab[], int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, float * work) { vector float iOx,iOy,iOz,iH1x,iH1y,iH1z,iH2x,iH2y,iH2z,iMx,iMy,iMz; vector float dOx,dOy,dOz,dH1x,dH1y,dH1z,dH2x,dH2y,dH2z,dMx,dMy,dMz; vector float Vvdwtot,c6,c12,VVd,VVr,tsc,r; vector float vfacel,nul; vector float vctot,qqM,qqH,iqM,iqH,jq; vector float rinvO,rinvH1,rinvH2,rinvM,rsqO,rsqH1,rsqH2,rsqM; int n,k,ii,is3,ii3,ntiA,nj0,nj1; int jnra,jnrb,jnrc,jnrd; int j3a,j3b,j3c,j3d; int nri, ntype, nouter, ninner; int tja,tjb,tjc,tjd; #ifdef GMX_THREADS int nn0, nn1; #endif nouter = 0; ninner = 0; nri = *p_nri; ntype = *p_ntype; nul=vec_zero(); tsc=load_float_and_splat(p_tabscale); vfacel=load_float_and_splat(p_facel); ii = iinr[0]; iqH = vec_madd(load_float_and_splat(charge+ii+1),vfacel,nul); iqM = vec_madd(load_float_and_splat(charge+ii+3),vfacel,nul); ntiA = 2*ntype*type[ii]; #ifdef GMX_THREADS nthreads = *p_nthreads; do { gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx); nn0 = *count; nn1 = nn0+(nri-nn0)/(2*nthreads)+3; *count = nn1; gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx); if(nn1>nri) nn1=nri; for(n=nn0; (n<nn1); n++) { #if 0 } /* maintain correct indentation even with conditional left braces */ #endif #else /* without gmx_threads */ for(n=0;n<nri;n++) { #endif is3 = 3*shift[n]; ii = iinr[n]; ii3 = 3*ii; load_1_4atoms_shift_and_splat(pos+ii3,shiftvec+is3,&iOx,&iOy,&iOz, &iH1x,&iH1y,&iH1z,&iH2x,&iH2y,&iH2z, &iMx,&iMy,&iMz); vctot = nul; Vvdwtot = nul; nj0 = jindex[n]; nj1 = jindex[n+1]; for(k=nj0; k<(nj1-3); k+=4) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = jjnr[k+2]; jnrd = jjnr[k+3]; j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; j3d = 3*jnrd; transpose_4_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b), load_xyz(pos+j3c), load_xyz(pos+j3d),&dMx,&dMy,&dMz); dOx = vec_sub(iOx,dMx); dOy = vec_sub(iOy,dMy); dOz = vec_sub(iOz,dMz); dH1x = vec_sub(iH1x,dMx); dH1y = vec_sub(iH1y,dMy); dH1z = vec_sub(iH1z,dMz); dH2x = vec_sub(iH2x,dMx); dH2y = vec_sub(iH2y,dMy); dH2z = vec_sub(iH2z,dMz); dMx = vec_sub(iMx,dMx); dMy = vec_sub(iMy,dMy); dMz = vec_sub(iMz,dMz); rsqO = vec_madd(dOx,dOx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqM = vec_madd(dMx,dMx,nul); rsqO = vec_madd(dOy,dOy,rsqO); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqM = vec_madd(dMy,dMy,rsqM); rsqO = vec_madd(dOz,dOz,rsqO); rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); rsqM = vec_madd(dMz,dMz,rsqM); rinvO = do_invsqrt(rsqO); do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2); r = vec_madd(rsqO,rinvO,nul); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; tjc = ntiA+2*type[jnrc]; tjd = ntiA+2*type[jnrd]; /* load 4 j charges and multiply by iq */ jq=load_4_float(charge+jnra,charge+jnrb, charge+jnrc,charge+jnrd); load_4_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,vdwparam+tjd,&c6,&c12); do_vonly_4_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr); Vvdwtot = vec_madd(c6,VVd,Vvdwtot); Vvdwtot = vec_madd(c12,VVr,Vvdwtot); qqH = vec_madd(iqH,jq,nul); qqM = vec_madd(iqM,jq,nul); vctot = vec_madd(qqM,rinvM,vctot); vctot = vec_madd(qqH,rinvH1,vctot); vctot = vec_madd(qqH,rinvH2,vctot); } if(k<(nj1-2)) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = jjnr[k+2]; j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; transpose_4_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b), load_xyz(pos+j3c),nul,&dMx,&dMy,&dMz); dOx = vec_sub(iOx,dMx); dOy = vec_sub(iOy,dMy); dOz = vec_sub(iOz,dMz); dH1x = vec_sub(iH1x,dMx); dH1y = vec_sub(iH1y,dMy); dH1z = vec_sub(iH1z,dMz); dH2x = vec_sub(iH2x,dMx); dH2y = vec_sub(iH2y,dMy); dH2z = vec_sub(iH2z,dMz); dMx = vec_sub(iMx,dMx); dMy = vec_sub(iMy,dMy); dMz = vec_sub(iMz,dMz); rsqO = vec_madd(dOx,dOx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqM = vec_madd(dMx,dMx,nul); rsqO = vec_madd(dOy,dOy,rsqO); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqM = vec_madd(dMy,dMy,rsqM); rsqO = vec_madd(dOz,dOz,rsqO); rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); rsqM = vec_madd(dMz,dMz,rsqM); rinvO = do_invsqrt(rsqO); do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2); zero_highest_element_in_vector(&rinvO); zero_highest_element_in_vector(&rsqO); zero_highest_element_in_3_vectors(&rinvH1,&rinvH2,&rinvM); r = vec_madd(rsqO,rinvO,nul); jq=load_3_float(charge+jnra,charge+jnrb,charge+jnrc); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; tjc = ntiA+2*type[jnrc]; /* load 3 j charges and multiply by iq */ load_3_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,&c6,&c12); do_vonly_3_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr); Vvdwtot = vec_madd(c6,VVd,Vvdwtot); Vvdwtot = vec_madd(c12,VVr,Vvdwtot); qqH = vec_madd(iqH,jq,nul); qqM = vec_madd(iqM,jq,nul); vctot = vec_madd(qqM,rinvM,vctot); vctot = vec_madd(qqH,rinvH1,vctot); vctot = vec_madd(qqH,rinvH2,vctot); } else if(k<(nj1-1)) { jnra = jjnr[k]; jnrb = jjnr[k+1]; j3a = 3*jnra; j3b = 3*jnrb; transpose_2_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b),&dMx,&dMy,&dMz); dOx = vec_sub(iOx,dMx); dOy = vec_sub(iOy,dMy); dOz = vec_sub(iOz,dMz); dH1x = vec_sub(iH1x,dMx); dH1y = vec_sub(iH1y,dMy); dH1z = vec_sub(iH1z,dMz); dH2x = vec_sub(iH2x,dMx); dH2y = vec_sub(iH2y,dMy); dH2z = vec_sub(iH2z,dMz); dMx = vec_sub(iMx,dMx); dMy = vec_sub(iMy,dMy); dMz = vec_sub(iMz,dMz); rsqO = vec_madd(dOx,dOx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqM = vec_madd(dMx,dMx,nul); rsqO = vec_madd(dOy,dOy,rsqO); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqM = vec_madd(dMy,dMy,rsqM); rsqO = vec_madd(dOz,dOz,rsqO); rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); rsqM = vec_madd(dMz,dMz,rsqM); rinvO = do_invsqrt(rsqO); do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2); zero_highest_2_elements_in_vector(&rinvO); zero_highest_2_elements_in_vector(&rsqO); zero_highest_2_elements_in_3_vectors(&rinvM,&rinvH1,&rinvH2); r = vec_madd(rsqO,rinvO,nul); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; /* load 2 j charges and multiply by iq */ jq=load_2_float(charge+jnra,charge+jnrb); load_2_pair(vdwparam+tja,vdwparam+tjb,&c6,&c12); do_vonly_2_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr); Vvdwtot = vec_madd(c6,VVd,Vvdwtot); Vvdwtot = vec_madd(c12,VVr,Vvdwtot); qqH = vec_madd(iqH,jq,nul); qqM = vec_madd(iqM,jq,nul); vctot = vec_madd(qqM,rinvM,vctot); vctot = vec_madd(qqH,rinvH1,vctot); vctot = vec_madd(qqH,rinvH2,vctot); } else if(k<nj1) { jnra = jjnr[k]; j3a = 3*jnra; transpose_1_to_3(load_xyz(pos+j3a),&dMx,&dMy,&dMz); dOx = vec_sub(iOx,dMx); dOy = vec_sub(iOy,dMy); dOz = vec_sub(iOz,dMz); dH1x = vec_sub(iH1x,dMx); dH1y = vec_sub(iH1y,dMy); dH1z = vec_sub(iH1z,dMz); dH2x = vec_sub(iH2x,dMx); dH2y = vec_sub(iH2y,dMy); dH2z = vec_sub(iH2z,dMz); dMx = vec_sub(iMx,dMx); dMy = vec_sub(iMy,dMy); dMz = vec_sub(iMz,dMz); rsqO = vec_madd(dOx,dOx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqM = vec_madd(dMx,dMx,nul); rsqO = vec_madd(dOy,dOy,rsqO); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqM = vec_madd(dMy,dMy,rsqM); rsqO = vec_madd(dOz,dOz,rsqO); rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); rsqM = vec_madd(dMz,dMz,rsqM); rinvO = do_invsqrt(rsqO); do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2); zero_highest_3_elements_in_vector(&rinvO); zero_highest_3_elements_in_vector(&rsqO); zero_highest_3_elements_in_3_vectors(&rinvH1,&rinvH2,&rinvM); r = vec_madd(rsqO,rinvO,nul); jq=load_1_float(charge+jnra); tja = ntiA+2*type[jnra]; /* load 1 j charge and multiply by iq */ load_1_pair(vdwparam+tja,&c6,&c12); do_vonly_1_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr); Vvdwtot = vec_madd(c6,VVd,Vvdwtot); Vvdwtot = vec_madd(c12,VVr,Vvdwtot); qqH = vec_madd(iqH,jq,nul); qqM = vec_madd(iqM,jq,nul); vctot = vec_madd(qqM,rinvM,vctot); vctot = vec_madd(qqH,rinvH1,vctot); vctot = vec_madd(qqH,rinvH2,vctot); } /* update outer data */ add_vector_to_float(Vc+gid[n],vctot); add_vector_to_float(Vvdw+gid[n],Vvdwtot); ninner += nj1 - nj0; } #ifdef GMX_THREADS nouter += nn1 - nn0; } while (nn1<nri); #else nouter = nri; #endif *outeriter = nouter; *inneriter = ninner; }
void nb_kernel010nf_ppc_altivec(int * p_nri, int iinr[], int jindex[], int jjnr[], int shift[], float shiftvec[], float fshift[], int gid[], float pos[], float faction[], float charge[], float * p_facel, float * p_krf, float * p_crf, float Vc[], int type[], int * p_ntype, float vdwparam[], float Vvdw[], float * p_tabscale, float VFtab[], float invsqrta[], float dvda[], float * p_gbtabscale, float GBtab[], int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, float * work) { vector float ix,iy,iz,shvec; vector float nul; vector float dx,dy,dz; vector float Vvdwtot,c6,c12; vector float rinvsq,rsq,rinvsix; int n,k,ii,is3,ii3,nj0,nj1; int jnra,jnrb,jnrc,jnrd; int j3a,j3b,j3c,j3d; int nri, ntype, nouter, ninner; int ntiA,tja,tjb,tjc,tjd; #ifdef GMX_THREAD_SHM_FDECOMP int nn0, nn1; #endif nouter = 0; ninner = 0; nri = *p_nri; ntype = *p_ntype; nul=vec_zero(); #ifdef GMX_THREAD_SHM_FDECOMP nthreads = *p_nthreads; do { tMPI_Thread_mutex_lock((tMPI_Thread_mutex_t *)mtx); nn0 = *count; nn1 = nn0+(nri-nn0)/(2*nthreads)+3; *count = nn1; tMPI_Thread_mutex_unlock((tMPI_Thread_mutex_t *)mtx); if(nn1>nri) nn1=nri; for(n=nn0; (n<nn1); n++) { #if 0 } /* maintain correct indentation even with conditional left braces */ #endif #else /* without tMPI_Threads */ for(n=0;n<nri;n++) { #endif is3 = 3*shift[n]; shvec = load_xyz(shiftvec+is3); ii = iinr[n]; ii3 = 3*ii; ix = load_xyz(pos+ii3); Vvdwtot = nul; ix = vec_add(ix,shvec); nj0 = jindex[n]; nj1 = jindex[n+1]; splat_xyz_to_vectors(ix,&ix,&iy,&iz); ntiA = 2*ntype*type[ii]; for(k=nj0; k<(nj1-3); k+=4) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = jjnr[k+2]; jnrd = jjnr[k+3]; j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; j3d = 3*jnrd; transpose_4_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b), load_xyz(pos+j3c), load_xyz(pos+j3d),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); rinvsq = do_recip(rsq); rinvsix = vec_madd(rinvsq,rinvsq,nul); rinvsix = vec_madd(rinvsix,rinvsq,nul); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; tjc = ntiA+2*type[jnrc]; tjd = ntiA+2*type[jnrd]; load_4_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,vdwparam+tjd,&c6,&c12); Vvdwtot = vec_nmsub(c6,rinvsix,Vvdwtot); Vvdwtot = vec_madd(c12, vec_madd(rinvsix,rinvsix,nul), Vvdwtot); } if(k<(nj1-1)) { jnra = jjnr[k]; jnrb = jjnr[k+1]; j3a = 3*jnra; j3b = 3*jnrb; transpose_2_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); rinvsq = do_recip(rsq); zero_highest_2_elements_in_vector(&rinvsq); rinvsix = vec_madd(rinvsq,rinvsq,nul); rinvsix = vec_madd(rinvsix,rinvsq,nul); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; load_2_pair(vdwparam+tja,vdwparam+tjb,&c6,&c12); Vvdwtot = vec_nmsub(c6,rinvsix,Vvdwtot); Vvdwtot = vec_madd(c12, vec_madd(rinvsix,rinvsix,nul), Vvdwtot); k += 2; } if((nj1-nj0) & 0x1) { jnra = jjnr[k]; j3a = 3*jnra; transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); rinvsq = do_recip(rsq); zero_highest_3_elements_in_vector(&rinvsq); rinvsix = vec_madd(rinvsq,rinvsq,nul); rinvsix = vec_madd(rinvsix,rinvsq,nul); tja = ntiA+2*type[jnra]; load_1_pair(vdwparam+tja,&c6,&c12); Vvdwtot = vec_nmsub(c6,rinvsix,Vvdwtot); Vvdwtot = vec_madd(c12, vec_madd(rinvsix,rinvsix,nul), Vvdwtot); } /* update outer data */ add_vector_to_float(Vvdw+gid[n],Vvdwtot); ninner += nj1 - nj0; } #ifdef GMX_THREAD_SHM_FDECOMP nouter += nn1 - nn0; } while (nn1<nri); #else nouter = nri; #endif *outeriter = nouter; *inneriter = ninner; }
void nb_kernel100nf_ppc_altivec(int * p_nri, int iinr[], int jindex[], int jjnr[], int shift[], float shiftvec[], float fshift[], int gid[], float pos[], float faction[], float charge[], float * p_facel, float * p_krf, float * p_crf, float Vc[], int type[], int * p_ntype, float vdwparam[], float Vvdw[], float * p_tabscale, float VFtab[], float invsqrta[], float dvda[], float * p_gbtabscale, float GBtab[], int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, float * work) { vector float ix,iy,iz,shvec; vector float vfacel,nul; vector float dx,dy,dz; vector float vctot,qq,iq; vector float rinv,rsq; int n,k,ii,is3,ii3,nj0,nj1; int jnra,jnrb,jnrc,jnrd; int j3a,j3b,j3c,j3d; int nri, ntype, nouter, ninner; #ifdef GMX_THREADS int nn0, nn1; #endif nouter = 0; ninner = 0; nri = *p_nri; ntype = *p_ntype; nul=vec_zero(); vfacel=load_float_and_splat(p_facel); #ifdef GMX_THREADS nthreads = *p_nthreads; do { gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx); nn0 = *count; nn1 = nn0+(nri-nn0)/(2*nthreads)+3; *count = nn1; gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx); if(nn1>nri) nn1=nri; for(n=nn0; (n<nn1); n++) { #if 0 } /* maintain correct indentation even with conditional left braces */ #endif #else /* without gmx_threads */ for(n=0;n<nri;n++) { #endif is3 = 3*shift[n]; shvec = load_xyz(shiftvec+is3); ii = iinr[n]; ii3 = 3*ii; ix = load_xyz(pos+ii3); vctot = nul; ix = vec_add(ix,shvec); nj0 = jindex[n]; nj1 = jindex[n+1]; splat_xyz_to_vectors(ix,&ix,&iy,&iz); iq = vec_madd(load_float_and_splat(charge+ii),vfacel,nul); for(k=nj0; k<(nj1-3); k+=4) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = jjnr[k+2]; jnrd = jjnr[k+3]; j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; j3d = 3*jnrd; transpose_4_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b), load_xyz(pos+j3c), load_xyz(pos+j3d),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); rinv = do_invsqrt(rsq); /* load 4 j charges and multiply by iq */ qq = vec_madd(load_4_float(charge+jnra,charge+jnrb, charge+jnrc,charge+jnrd),iq,nul); vctot = vec_madd(qq,rinv,vctot); } if(k<(nj1-1)) { jnra = jjnr[k]; jnrb = jjnr[k+1]; j3a = 3*jnra; j3b = 3*jnrb; transpose_2_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); rinv = do_invsqrt(rsq); zero_highest_2_elements_in_vector(&rinv); /* load 2 j charges and multiply by iq */ qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul); vctot = vec_madd(qq,rinv,vctot); k += 2; } if((nj1-nj0) & 0x1) { jnra = jjnr[k]; j3a = 3*jnra; transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); rinv = do_invsqrt(rsq); zero_highest_3_elements_in_vector(&rinv); /* load 1 j charge and multiply by iq */ qq = vec_madd(load_1_float(charge+jnra),iq,nul); vctot = vec_madd(qq,rinv,vctot); } /* update outer data */ add_vector_to_float(Vc+gid[n],vctot); ninner += nj1 - nj0; } #ifdef GMX_THREADS nouter += nn1 - nn0; } while (nn1<nri); #else nouter = nri; #endif *outeriter = nouter; *inneriter = ninner; }
int main(int argc, char **argv) { printf("ERKALE - Basis set tools from Hel.\n"); print_copyright(); print_license(); #ifdef SVNRELEASE printf("At svn revision %s.\n\n",SVNREVISION); #endif print_hostname(); if(argc<3) { printf("Usage: %s input.gbs command\n\n",argv[0]); help(); return 0; } // Get filename std::string filein(argv[1]); // Load input BasisSetLibrary bas; bas.load_gaussian94(filein); // Get command std::string cmd(argv[2]); // and determine what to do. if(stricmp(cmd,"cholesky")==0) { // Print completeness profile. if(argc!=7) { printf("\nUsage: %s input.gbs cholesky thr maxam ovlthr output.gbs\n",argv[0]); return 1; } double thr(atof(argv[3])); int maxam(atoi(argv[4])); double ovlthr(atof(argv[5])); std::string outfile(argv[6]); if(maxam>=LIBINT_MAX_AM) { printf("Setting maxam = %i because limitations in used version of LIBINT.\n",LIBINT_MAX_AM-1); maxam=LIBINT_MAX_AM-1; } init_libint_base(); BasisSetLibrary ret=bas.cholesky_set(thr,maxam,ovlthr); ret.save_gaussian94(outfile); } else if(stricmp(cmd,"completeness")==0) { // Print completeness profile. if(argc!=5 && argc!=6) { printf("\nUsage: %s input.gbs completeness element output.dat (coulomb)\n",argv[0]); return 1; } std::string el(argv[3]); std::string fileout(argv[4]); bool coulomb=false; if(argc==6) coulomb=atoi(argv[5]); // Get wanted element from basis ElementBasisSet elbas=bas.get_element(el); // Compute completeness profile compprof_t prof=compute_completeness(elbas,-10.0,10.0,2001,coulomb); // Print profile in output file FILE *out=fopen(fileout.c_str(),"w"); for(size_t i=0;i<prof.lga.size();i++) { // Value of scanning exponent fprintf(out,"%13e",prof.lga[i]); // Print completeness of shells for(size_t j=0;j<prof.shells.size();j++) fprintf(out,"\t%13e",prof.shells[j].Y[i]); fprintf(out,"\n"); } fclose(out); } else if(stricmp(cmd,"composition")==0) { // Determine composition of basis set. if(argc!=3 && argc!=4) { printf("\nUsage: %s input.gbs composition (El)\n",argv[0]); return 1; } // Elemental basis sets std::vector<ElementBasisSet> elbases; if(argc==4) elbases.push_back(bas.get_element(argv[3])); else elbases=bas.get_elements(); printf("\n"); printf("el at# [npr|nbf] [primitive|contracted(?)]\n"); printf("-------------------------------------------\n"); // Loop over elements for(size_t iel=0;iel<elbases.size();iel++) { // Get the basis set ElementBasisSet elbas=elbases[iel]; // Decontracted basis ElementBasisSet eldec(elbas); eldec.decontract(); // Get the shells std::vector<FunctionShell> sh=elbas.get_shells(); std::vector<FunctionShell> decsh=eldec.get_shells(); // Count the shells arma::imat Nsh(max_am,2); Nsh.zeros(); for(size_t ish=0;ish<decsh.size();ish++) Nsh(decsh[ish].get_am(),0)++; for(size_t ish=0;ish<sh.size();ish++) Nsh(sh[ish].get_am(),1)++; // Determine if basis set is contracted and the amount of // functions bool contr=false; size_t nbf=0; size_t nprim=0; for(int am=0;am<max_am;am++) { // Number of primitives nprim+=Nsh(am,0)*(2*am+1); // Number of contracted functions nbf+=Nsh(am,1)*(2*am+1); } if(nbf!=nprim) contr=true; // Print composition printf("%-2s %3i ",elbas.get_symbol().c_str(),(int) elbas.get_number()); if(contr) { // Print amount of functions char cmp[20]; sprintf(cmp,"[%i|%i]",(int) nprim,(int) nbf); printf("%10s [",cmp); // Print primitives for(int am=0;am<max_am;am++) if(Nsh(am,0)>0) printf("%i%c",Nsh(am,0),tolower(shell_types[am])); // Print contractions printf("|"); for(int am=0;am<max_am;am++) if(Nsh(am,0)!=Nsh(am,1)) printf("%i%c",Nsh(am,1),tolower(shell_types[am])); printf("]\n"); } else { printf("%10i ",(int) nbf); for(int am=0;am<max_am;am++) if(Nsh(am,0)>0) printf("%i%c",Nsh(am,0),tolower(shell_types[am])); printf("\n"); } } } else if(stricmp(cmd,"daug")==0 || stricmp(cmd,"taug")==0) { // Augment basis set if(argc!=4) { printf("\nUsage: %s input.gbs %s output.gbs\n",argv[0],tolower(cmd).c_str()); return 1; } int naug; if(stricmp(cmd,"daug")==0) naug=1; else naug=2; std::string fileout(argv[3]); bas.augment(naug); bas.save_gaussian94(fileout); } else if(stricmp(cmd,"decontract")==0) { // Decontract basis set. if(argc!=4) { printf("\nUsage: %s input.gbs decontract output.gbs\n",argv[0]); return 1; } std::string fileout(argv[3]); bas.decontract(); bas.save_gaussian94(fileout); } else if(stricmp(cmd,"densityfit")==0) { // Generate density fitted set if(argc!=6) { printf("\nUsage: %s input.gbs densityfit lval fsam output.gbs\n",argv[0]); return 1; } int lval(atoi(argv[3])); double fsam(atof(argv[4])); std::string fileout(argv[5]); BasisSetLibrary dfit(bas.density_fitting(lval,fsam)); dfit.save_gaussian94(fileout); } else if(stricmp(cmd,"dump")==0) { // Dump wanted element. if(argc!=5 && argc!=6) { printf("\nUsage: %s input.gbs dump element output.gbs (number)\n",argv[0]); return 1; } std::string el(argv[3]); std::string fileout(argv[4]); int no=0; if(argc==6) no=atoi(argv[5]); // Save output BasisSetLibrary elbas; elbas.add_element(bas.get_element(el,no)); elbas.save_gaussian94(fileout); } else if(stricmp(cmd,"dumpdec")==0) { // Dump wanted element in decontracted form. if(argc!=5 && argc!=6) { printf("\nUsage: %s input.gbs dumpdec element output.gbs (number)\n",argv[0]); return 1; } std::string el(argv[3]); std::string fileout(argv[4]); int no=0; if(argc==6) no=atoi(argv[5]); // Save output BasisSetLibrary elbas; bas.decontract(); elbas.add_element(bas.get_element(el,no)); elbas.save_gaussian94(fileout); } else if(stricmp(cmd,"genbas")==0) { // Generate basis set for xyz file if(argc!=5) { printf("\nUsage: %s input.gbs genbas system.xyz output.gbs\n",argv[0]); return 1; } // Load atoms from xyz file std::vector<atom_t> atoms=load_xyz(argv[3]); // Output file std::string fileout(argv[4]); // Save output BasisSetLibrary elbas; // Collect elements std::vector<ElementBasisSet> els=bas.get_elements(); // Loop over atoms in system for(size_t iat=0;iat<atoms.size();iat++) { bool found=false; // First, check if there is a special basis for the atom. for(size_t iel=0;iel<els.size();iel++) if(stricmp(atoms[iat].el,els[iel].get_symbol())==0 && atoms[iat].num == els[iel].get_number()) { // Yes, add it. elbas.add_element(els[iel]); found=true; break; } // Otherwise, check if a general basis is already in the basis if(!found) { std::vector<ElementBasisSet> added=elbas.get_elements(); for(size_t j=0;j<added.size();j++) if(added[j].get_number()==0 && stricmp(atoms[iat].el,added[j].get_symbol())==0) found=true; } // If general basis not found, add it. if(!found) { for(size_t iel=0;iel<els.size();iel++) if(stricmp(atoms[iat].el,els[iel].get_symbol())==0 && els[iel].get_number()==0) { // Yes, add it. elbas.add_element(els[iel]); found=true; break; } } if(!found) { std::ostringstream oss; oss << "Basis set for element " << atoms[iat].el << " does not exist in " << filein << "!\n"; throw std::runtime_error(oss.str()); } } elbas.save_gaussian94(fileout); } else if(stricmp(cmd,"merge")==0) { // Merge functions with too big overlap if(argc!=5) { printf("\nUsage: %s input.gbs merge cutoff output.gbs\n",argv[0]); return 1; } // Cutoff value double cutoff=atof(argv[3]); bas.merge(cutoff); bas.save_gaussian94(argv[4]); } else if(stricmp(cmd,"norm")==0) { // Normalize basis if(argc!=4) { printf("\nUsage: %s input.gbs norm output.gbs\n",argv[0]); return 1; } std::string fileout=argv[3]; bas.normalize(); bas.save_gaussian94(fileout); } else if(stricmp(cmd,"orth")==0) { // Orthogonalize basis if(argc!=4) { printf("\nUsage: %s input.gbs orth output.gbs\n",argv[0]); return 1; } std::string fileout=argv[3]; bas.orthonormalize(); bas.save_gaussian94(fileout); } else if(stricmp(cmd,"overlap")==0) { // Primitive overlap if(argc!=4) { printf("\nUsage: %s input.gbs overlap element\n",argv[0]); return 1; } // Get element basis set ElementBasisSet elbas=bas.get_element(argv[3]); elbas.decontract(); // Loop over angular momentum for(int am=0;am<=elbas.get_max_am();am++) { // Get primitives arma::vec exps; arma::mat contr; elbas.get_primitives(exps,contr,am); // Compute overlap matrix arma::mat S=overlap(exps,exps,am); // Print out overlap printf("*** %c shell ***\n",shell_types[am]); exps.t().print("Exponents"); printf("\n"); S.print("Overlap"); printf("\n"); } } else if(stricmp(cmd,"Porth")==0) { // P-orthogonalize basis if(argc!=6) { printf("\nUsage: %s input.gbs Porth cutoff Cortho output.gbs\n",argv[0]); return 1; } double cutoff=atof(argv[3]); double Cortho=atof(argv[4]); std::string fileout=argv[5]; bas.P_orthogonalize(cutoff,Cortho); bas.save_gaussian94(fileout); } else if(stricmp(cmd,"prodset")==0) { // Generate product set if(argc!=6) { printf("\nUsage: %s input.gbs prodset lval fsam output.gbs\n",argv[0]); return 1; } int lval(atoi(argv[3])); double fsam(atof(argv[4])); std::string fileout(argv[5]); BasisSetLibrary dfit(bas.product_set(lval,fsam)); dfit.save_gaussian94(fileout); } else if(stricmp(cmd,"save")==0) { // Save basis if(argc!=4) { printf("\nUsage: %s input.gbs save output.gbs\n",argv[0]); return 1; } std::string fileout=argv[3]; bas.save_gaussian94(fileout); } else if(stricmp(cmd,"savecfour")==0) { // Save basis in CFOUR format if(argc!=5) { printf("\nUsage: %s input.gbs savecfour name basis.cfour\n",argv[0]); return 1; } std::string fileout=argv[3]; std::string name=argv[4]; bas.save_cfour(name,fileout); } else if(stricmp(cmd,"savedalton")==0) { // Save basis in Dalton format if(argc!=4) { printf("\nUsage: %s input.gbs savedalton output.dal\n",argv[0]); return 1; } std::string fileout=argv[3]; bas.save_dalton(fileout); } else if(stricmp(cmd,"savemolpro")==0) { // Save basis in Molpro format if(argc!=4) { printf("\nUsage: %s input.gbs savemolpro output.mol\n",argv[0]); return 1; } std::string fileout=argv[3]; bas.save_molpro(fileout); } else if(stricmp(cmd,"sort")==0) { // Sort basis set if(argc!=4) { printf("\nUsage: %s input.gbs sort output.gbs\n",argv[0]); return 1; } std::string fileout=argv[3]; bas.sort(); bas.save_gaussian94(fileout); } else { printf("\nInvalid command.\n"); help(); } return 0; }
int main(int argc, char **argv) { #ifdef _OPENMP printf("ERKALE - Geometry optimization from Hel, OpenMP version, running on %i cores.\n",omp_get_max_threads()); #else printf("ERKALE - Geometry optimization from Hel, serial version.\n"); #endif print_copyright(); print_license(); #ifdef SVNRELEASE printf("At svn revision %s.\n\n",SVNREVISION); #endif print_hostname(); if(argc!=2) { printf("Usage: $ %s runfile\n",argv[0]); return 0; } // Initialize libint init_libint_base(); // Initialize libderiv init_libderiv_base(); Timer tprog; tprog.print_time(); // Parse settings Settings set; set.add_scf_settings(); set.add_string("SaveChk","File to use as checkpoint","erkale.chk"); set.add_string("LoadChk","File to load old results from",""); set.add_bool("ForcePol","Force polarized calculation",false); set.add_bool("FreezeCore","Freeze the atomic cores?",false); set.add_string("Optimizer","Optimizer to use: CGFR, CGPR, BFGS, BFGS2 (default), SD","BFGS2"); set.add_int("MaxSteps","Maximum amount of geometry steps",256); set.add_string("Criterion","Convergence criterion to use: LOOSE, NORMAL, TIGHT, VERYTIGHT","NORMAL"); set.add_string("OptMovie","xyz movie to store progress in","optimize.xyz"); set.add_string("Result","File to save optimized geometry in","optimized.xyz"); set.set_string("Logfile","erkale_geom.log"); set.parse(std::string(argv[1]),true); set.print(); bool verbose=set.get_bool("Verbose"); int maxiter=set.get_int("MaxSteps"); std::string optmovie=set.get_string("OptMovie"); std::string result=set.get_string("Result"); // Interpret optimizer enum minimizer alg; std::string method=set.get_string("Optimizer"); if(stricmp(method,"CGFR")==0) alg=gCGFR; else if(stricmp(method,"CGPR")==0) alg=gCGPR; else if(stricmp(method,"BFGS")==0) alg=gBFGS; else if(stricmp(method,"BFGS2")==0) alg=gBFGS2; else if(stricmp(method,"SD")==0) alg=gSD; else { ERROR_INFO(); throw std::runtime_error("Unknown optimization method.\n"); } // Interpret optimizer enum convergence crit; method=set.get_string("Criterion"); if(stricmp(method,"LOOSE")==0) crit=LOOSE; else if(stricmp(method,"NORMAL")==0) crit=NORMAL; else if(stricmp(method,"TIGHT")==0) crit=TIGHT; else if(stricmp(method,"VERYTIGHT")==0) crit=VERYTIGHT; else { ERROR_INFO(); throw std::runtime_error("Unknown optimization method.\n"); } // Redirect output? std::string logfile=set.get_string("Logfile"); if(stricmp(logfile,"stdout")!=0) { // Redirect stdout to file FILE *outstream=freopen(logfile.c_str(),"w",stdout); if(outstream==NULL) { ERROR_INFO(); throw std::runtime_error("Unable to redirect output!\n"); } else fprintf(stderr,"\n"); } // Read in atoms. std::string atomfile=set.get_string("System"); const std::vector<atom_t> origgeom=load_xyz(atomfile); std::vector<atom_t> atoms(origgeom); // Are any atoms fixed? std::vector<size_t> dofidx; for(size_t i=0;i<atoms.size();i++) { bool fixed=false; if(atoms[i].el.size()>3) if(stricmp(atoms[i].el.substr(atoms[i].el.size()-3),"-Fx")==0) { fixed=true; atoms[i].el=atoms[i].el.substr(0,atoms[i].el.size()-3); } // Add to degrees of freedom if(!fixed) dofidx.push_back(i); } // Read in basis set BasisSetLibrary baslib; std::string basfile=set.get_string("Basis"); baslib.load_gaussian94(basfile); printf("\n"); // Save to output save_xyz(atoms,"Initial configuration",optmovie,false); // Minimizer options opthelper_t pars; pars.atoms=atoms; pars.baslib=baslib; pars.set=set; pars.dofidx=dofidx; /* Starting point */ gsl_vector *x = gsl_vector_alloc (3*dofidx.size()); for(size_t i=0;i<dofidx.size();i++) { gsl_vector_set(x,3*i,atoms[dofidx[i]].x); gsl_vector_set(x,3*i+1,atoms[dofidx[i]].y); gsl_vector_set(x,3*i+2,atoms[dofidx[i]].z); } // GSL status int status; const gsl_multimin_fdfminimizer_type *T; gsl_multimin_fdfminimizer *s; gsl_multimin_function_fdf minimizer; minimizer.n = x->size; minimizer.f = calc_E; minimizer.df = calc_f; minimizer.fdf = calc_Ef; minimizer.params = (void *) &pars; if(alg==gCGFR) { T = gsl_multimin_fdfminimizer_conjugate_fr; if(verbose) printf("Using Fletcher-Reeves conjugate gradients.\n"); } else if(alg==gCGPR) { T = gsl_multimin_fdfminimizer_conjugate_pr; if(verbose) printf("Using Polak-Ribière conjugate gradients.\n"); } else if(alg==gBFGS) { T = gsl_multimin_fdfminimizer_vector_bfgs; if(verbose) printf("Using the BFGS minimizer.\n"); } else if(alg==gBFGS2) { T = gsl_multimin_fdfminimizer_vector_bfgs2; if(verbose) printf("Using the BFGS2 minimizer.\n"); } else if(alg==gSD) { T = gsl_multimin_fdfminimizer_steepest_descent; if(verbose) printf("Using the steepest descent minimizer.\n"); } else { ERROR_INFO(); throw std::runtime_error("Unsupported minimizer\n"); } // Run an initial calculation double oldE=calc_E(x,minimizer.params); // Turn off verbose setting pars.set.set_bool("Verbose",false); // and load from old checkpoint pars.set.set_string("LoadChk",pars.set.get_string("SaveChk")); // Initialize minimizer s = gsl_multimin_fdfminimizer_alloc (T, minimizer.n); // Use initial step length of 0.02 bohr, and a line search accuracy // 1e-1 (recommended in the GSL manual for BFGS) gsl_multimin_fdfminimizer_set (s, &minimizer, x, 0.02, 1e-1); // Store old force arma::mat oldf=interpret_force(s->gradient); fprintf(stderr,"Geometry optimizer initialized in %s.\n",tprog.elapsed().c_str()); fprintf(stderr,"Entering minimization loop with %s optimizer.\n",set.get_string("Optimizer").c_str()); fprintf(stderr,"%4s %16s %10s %10s %9s %9s %9s %9s %s\n","iter","E","dE","dE/dEproj","disp max","disp rms","f max","f rms", "titer"); std::vector<atom_t> oldgeom(atoms); bool convd=false; int iter; for(iter=1;iter<=maxiter;iter++) { printf("\nGeometry iteration %i\n",(int) iter); fflush(stdout); Timer titer; status = gsl_multimin_fdfminimizer_iterate (s); if (status) { fprintf(stderr,"GSL encountered error: \"%s\".\n",gsl_strerror(status)); break; } // New geometry is std::vector<atom_t> geom=get_atoms(s->x,pars); // Calculate displacements double dmax, drms; get_displacement(geom, oldgeom, dmax, drms); // Calculate projected change of energy double dEproj=calculate_projection(geom,oldgeom,oldf,pars.dofidx); // Actual change of energy is double dE=s->f - oldE; // Switch geometries oldgeom=geom; // Save old force // Get forces double fmax, frms; get_forces(s->gradient, fmax, frms); // Save geometry step char comment[80]; sprintf(comment,"Step %i",(int) iter); save_xyz(get_atoms(s->x,pars),comment,optmovie,true); // Check convergence bool fmaxconv=false, frmsconv=false; bool dmaxconv=false, drmsconv=false; switch(crit) { case(LOOSE): if(fmax < 2.5e-3) fmaxconv=true; if(frms < 1.7e-3) frmsconv=true; if(dmax < 1.0e-2) dmaxconv=true; if(drms < 6.7e-3) drmsconv=true; break; case(NORMAL): if(fmax < 4.5e-4) fmaxconv=true; if(frms < 3.0e-4) frmsconv=true; if(dmax < 1.8e-3) dmaxconv=true; if(drms < 1.2e-3) drmsconv=true; break; case(TIGHT): if(fmax < 1.5e-5) fmaxconv=true; if(frms < 1.0e-5) frmsconv=true; if(dmax < 6.0e-5) dmaxconv=true; if(drms < 4.0e-5) drmsconv=true; break; case(VERYTIGHT): if(fmax < 2.0e-6) fmaxconv=true; if(frms < 1.0e-6) frmsconv=true; if(dmax < 6.0e-6) dmaxconv=true; if(drms < 4.0e-6) drmsconv=true; break; default: ERROR_INFO(); throw std::runtime_error("Not implemented!\n"); } // Converged? const static char cconv[]=" *"; double dEfrac; if(dEproj!=0.0) dEfrac=dE/dEproj; else dEfrac=0.0; fprintf(stderr,"%4d % 16.8f % .3e % .3e %.3e%c %.3e%c %.3e%c %.3e%c %s\n", (int) iter, s->f, dE, dEfrac, dmax, cconv[dmaxconv], drms, cconv[drmsconv], fmax, cconv[fmaxconv], frms, cconv[frmsconv], titer.elapsed().c_str()); fflush(stderr); convd=dmaxconv && drmsconv && fmaxconv && frmsconv; if(convd) { fprintf(stderr,"Converged.\n"); break; } // Store old energy oldE=s->f; // Store old force oldf=interpret_force(s->gradient); } if(convd) save_xyz(get_atoms(s->x,pars),"Optimized geometry",result); gsl_multimin_fdfminimizer_free (s); gsl_vector_free (x); if(iter==maxiter && !convd) { printf("Geometry convergence was not achieved!\n"); } printf("Running program took %s.\n",tprog.elapsed().c_str()); return 0; }
void nb_kernel231_ppc_altivec (int * p_nri, int iinr[], int jindex[], int jjnr[], int shift[], float shiftvec[], float fshift[], int gid[], float pos[], float faction[], float charge[], float * p_facel, float * p_krf, float * p_crf, float Vc[], int type[], int * p_ntype, float vdwparam[], float Vvdw[], float * p_tabscale, float VFtab[], float invsqrta[], float dvda[], float * p_gbtabscale, float GBtab[], int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, float * work) { vector float vkrf,vcrf; vector float iOx,iOy,iOz,iH1x,iH1y,iH1z,iH2x,iH2y,iH2z; vector float dOx,dOy,dOz,dH1x,dH1y,dH1z,dH2x,dH2y,dH2z; vector float vfacel,vcoulO,vcoulH1,vcoulH2,nul; vector float Vvdwtot,c6,c12,VVd,VVr,FFd,FFr,tsc,r; vector float fsO,fsH1,fsH2,krsqO,krsqH1,krsqH2; vector float vctot,qqO,qqH,iqO,iqH,jq; vector float fiOx,fiOy,fiOz,fiH1x,fiH1y,fiH1z,fiH2x,fiH2y,fiH2z; vector float tmp1,tmp2,tmp3,tmp4; vector float rinvO,rinvH1,rinvH2,rinvsqH1,rinvsqH2; vector float rsqO,rsqH1,rsqH2; int n,k,ii,is3,ii3,ntiA,nj0,nj1; int jnra,jnrb,jnrc,jnrd; int j3a,j3b,j3c,j3d; int nri, ntype, nouter, ninner; int tja,tjb,tjc,tjd; #ifdef GMX_THREAD_SHM_FDECOMP int nn0, nn1; #endif nouter = 0; ninner = 0; nri = *p_nri; ntype = *p_ntype; nul=vec_zero(); tsc=load_float_and_splat(p_tabscale); vfacel=load_float_and_splat(p_facel); vkrf=load_float_and_splat(p_krf); vcrf=load_float_and_splat(p_crf); ii = iinr[0]; iqO = vec_madd(load_float_and_splat(charge+ii),vfacel,nul); iqH = vec_madd(load_float_and_splat(charge+ii+1),vfacel,nul); ntiA = 2*ntype*type[ii]; #ifdef GMX_THREAD_SHM_FDECOMP nthreads = *p_nthreads; do { tMPI_Thread_mutex_lock((tMPI_Thread_mutex_t *)mtx); nn0 = *count; nn1 = nn0+(nri-nn0)/(2*nthreads)+3; *count = nn1; tMPI_Thread_mutex_unlock((tMPI_Thread_mutex_t *)mtx); if(nn1>nri) nn1=nri; for(n=nn0; (n<nn1); n++) { #if 0 } /* maintain correct indentation even with conditional left braces */ #endif #else /* without tMPI_Threads */ for(n=0;n<nri;n++) { #endif is3 = 3*shift[n]; ii = iinr[n]; ii3 = 3*ii; load_1_3atoms_shift_and_splat(pos+ii3,shiftvec+is3,&iOx,&iOy,&iOz, &iH1x,&iH1y,&iH1z,&iH2x,&iH2y,&iH2z); vctot = nul; Vvdwtot = nul; fiOx = nul; fiOy = nul; fiOz = nul; fiH1x = nul; fiH1y = nul; fiH1z = nul; fiH2x = nul; fiH2y = nul; fiH2z = nul; nj0 = jindex[n]; nj1 = jindex[n+1]; for(k=nj0; k<(nj1-3); k+=4) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = jjnr[k+2]; jnrd = jjnr[k+3]; j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; j3d = 3*jnrd; transpose_4_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b), load_xyz(pos+j3c), load_xyz(pos+j3d),&dH2x,&dH2y,&dH2z); dOx = vec_sub(iOx,dH2x); dOy = vec_sub(iOy,dH2y); dOz = vec_sub(iOz,dH2z); dH1x = vec_sub(iH1x,dH2x); dH1y = vec_sub(iH1y,dH2y); dH1z = vec_sub(iH1z,dH2z); dH2x = vec_sub(iH2x,dH2x); dH2y = vec_sub(iH2y,dH2y); dH2z = vec_sub(iH2z,dH2z); rsqO = vec_madd(dOx,dOx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqO = vec_madd(dOy,dOy,rsqO); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqO = vec_madd(dOz,dOz,rsqO); rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2); rinvsqH1 = vec_madd(rinvH1,rinvH1,nul); rinvsqH2 = vec_madd(rinvH2,rinvH2,nul); r = vec_madd(rinvO,rsqO,nul); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; tjc = ntiA+2*type[jnrc]; tjd = ntiA+2*type[jnrd]; /* load 4 j charges and multiply by iq */ jq=load_4_float(charge+jnra,charge+jnrb, charge+jnrc,charge+jnrd); load_4_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,vdwparam+tjd,&c6,&c12); do_4_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&FFd,&VVr,&FFr); Vvdwtot = vec_madd(c6,VVd,Vvdwtot); tmp1 = vec_nmsub(c6,FFd,nul); Vvdwtot = vec_madd(c12,VVr,Vvdwtot); tmp1 = vec_nmsub(c12,FFr,tmp1); tmp1 = vec_madd(tmp1,tsc,nul); qqO = vec_madd(iqO,jq,nul); qqH = vec_madd(iqH,jq,nul); krsqO = vec_madd(vkrf,rsqO,nul); krsqH1 = vec_madd(vkrf,rsqH1,nul); krsqH2 = vec_madd(vkrf,rsqH2,nul); fsO = vec_nmsub(vec_two(),krsqO,rinvO); vcoulO = vec_add(rinvO,krsqO); vcoulH1 = vec_add(rinvH1,krsqH1); fsO = vec_madd(qqO,fsO,nul); vcoulH2 = vec_add(rinvH2,krsqH2); vcoulO = vec_sub(vcoulO,vcrf); vcoulH1 = vec_sub(vcoulH1,vcrf); vcoulH2 = vec_sub(vcoulH2,vcrf); vctot = vec_madd(qqO,vcoulO,vctot); fsH1 = vec_nmsub(vec_two(),krsqH1,rinvH1); fsH2 = vec_nmsub(vec_two(),krsqH2,rinvH2); vctot = vec_madd(qqH,vcoulH1,vctot); fsO = vec_madd(fsO,rinvO,tmp1); fsO = vec_madd(fsO,rinvO,nul); fsH1 = vec_madd(fsH1,qqH,nul); fsH2 = vec_madd(fsH2,qqH,nul); vctot = vec_madd(qqH,vcoulH2,vctot); fsH1 = vec_madd(fsH1,rinvsqH1,nul); fsH2 = vec_madd(fsH2,rinvsqH2,nul); fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */ dOx = vec_nmsub(fsO,dOx,nul); /* -fx */ fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */ dOy = vec_nmsub(fsO,dOy,nul); /* -fy */ fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */ dOz = vec_nmsub(fsO,dOz,nul); /* -fz */ fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */ dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */ fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */ dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */ fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */ dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */ fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */ dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */ fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */ dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */ fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */ dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */ transpose_3_to_4(dOx,dOy,dOz,&tmp1,&tmp2,&tmp3,&tmp4); add_xyz_to_mem(faction+j3a,tmp1); add_xyz_to_mem(faction+j3b,tmp2); add_xyz_to_mem(faction+j3c,tmp3); add_xyz_to_mem(faction+j3d,tmp4); } if(k<(nj1-2)) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = jjnr[k+2]; j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; transpose_4_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b), load_xyz(pos+j3c),nul,&dH2x,&dH2y,&dH2z); dOx = vec_sub(iOx,dH2x); dOy = vec_sub(iOy,dH2y); dOz = vec_sub(iOz,dH2z); dH1x = vec_sub(iH1x,dH2x); dH1y = vec_sub(iH1y,dH2y); dH1z = vec_sub(iH1z,dH2z); dH2x = vec_sub(iH2x,dH2x); dH2y = vec_sub(iH2y,dH2y); dH2z = vec_sub(iH2z,dH2z); rsqO = vec_madd(dOx,dOx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqO = vec_madd(dOy,dOy,rsqO); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqO = vec_madd(dOz,dOz,rsqO); rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); zero_highest_element_in_3_vectors(&rsqO,&rsqH1,&rsqH2); do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2); zero_highest_element_in_3_vectors(&rinvO,&rinvH1,&rinvH2); r = vec_madd(rinvO,rsqO,nul); rinvsqH1 = vec_madd(rinvH1,rinvH1,nul); rinvsqH2 = vec_madd(rinvH2,rinvH2,nul); jq=load_3_float(charge+jnra,charge+jnrb,charge+jnrc); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; tjc = ntiA+2*type[jnrc]; load_3_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,&c6,&c12); do_3_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&FFd,&VVr,&FFr); Vvdwtot = vec_madd(c6,VVd,Vvdwtot); tmp1 = vec_nmsub(c6,FFd,nul); Vvdwtot = vec_madd(c12,VVr,Vvdwtot); tmp1 = vec_nmsub(c12,FFr,tmp1); tmp1 = vec_madd(tmp1,tsc,nul); qqO = vec_madd(iqO,jq,nul); qqH = vec_madd(iqH,jq,nul); krsqO = vec_madd(vkrf,rsqO,nul); krsqH1 = vec_madd(vkrf,rsqH1,nul); krsqH2 = vec_madd(vkrf,rsqH2,nul); fsO = vec_nmsub(vec_two(),krsqO,rinvO); vcoulO = vec_add(rinvO,krsqO); vcoulH1 = vec_add(rinvH1,krsqH1); fsO = vec_madd(qqO,fsO,nul); vcoulH2 = vec_add(rinvH2,krsqH2); vcoulO = vec_sub(vcoulO,vcrf); vcoulH1 = vec_sub(vcoulH1,vcrf); vcoulH2 = vec_sub(vcoulH2,vcrf); vctot = vec_madd(qqO,vcoulO,vctot); fsH1 = vec_nmsub(vec_two(),krsqH1,rinvH1); fsH2 = vec_nmsub(vec_two(),krsqH2,rinvH2); vctot = vec_madd(qqH,vcoulH1,vctot); fsO = vec_madd(fsO,rinvO,tmp1); fsO = vec_madd(fsO,rinvO,nul); fsH1 = vec_madd(fsH1,qqH,nul); fsH2 = vec_madd(fsH2,qqH,nul); vctot = vec_madd(qqH,vcoulH2,vctot); fsH1 = vec_madd(fsH1,rinvsqH1,nul); fsH2 = vec_madd(fsH2,rinvsqH2,nul); fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */ dOx = vec_nmsub(fsO,dOx,nul); /* -fx */ fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */ dOy = vec_nmsub(fsO,dOy,nul); /* -fy */ fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */ dOz = vec_nmsub(fsO,dOz,nul); /* -fz */ fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */ dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */ fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */ dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */ fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */ dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */ fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */ dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */ fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */ dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */ fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */ dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */ transpose_4_to_3(dOx,dOy,dOz,nul,&tmp1,&tmp2,&tmp3); add_xyz_to_mem(faction+j3a,tmp1); add_xyz_to_mem(faction+j3b,tmp2); add_xyz_to_mem(faction+j3c,tmp3); } else if(k<(nj1-1)) { jnra = jjnr[k]; jnrb = jjnr[k+1]; j3a = 3*jnra; j3b = 3*jnrb; transpose_2_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b),&dH2x,&dH2y,&dH2z); dOx = vec_sub(iOx,dH2x); dOy = vec_sub(iOy,dH2y); dOz = vec_sub(iOz,dH2z); dH1x = vec_sub(iH1x,dH2x); dH1y = vec_sub(iH1y,dH2y); dH1z = vec_sub(iH1z,dH2z); dH2x = vec_sub(iH2x,dH2x); dH2y = vec_sub(iH2y,dH2y); dH2z = vec_sub(iH2z,dH2z); rsqO = vec_madd(dOx,dOx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqO = vec_madd(dOy,dOy,rsqO); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqO = vec_madd(dOz,dOz,rsqO); rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); zero_highest_2_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2); do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2); zero_highest_2_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2); r = vec_madd(rinvO,rsqO,nul); rinvsqH1 = vec_madd(rinvH1,rinvH1,nul); rinvsqH2 = vec_madd(rinvH2,rinvH2,nul); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; /* load 2 j charges and multiply by iq */ jq=load_2_float(charge+jnra,charge+jnrb); load_2_pair(vdwparam+tja,vdwparam+tjb,&c6,&c12); do_2_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&FFd,&VVr,&FFr); Vvdwtot = vec_madd(c6,VVd,Vvdwtot); tmp1 = vec_nmsub(c6,FFd,nul); Vvdwtot = vec_madd(c12,VVr,Vvdwtot); tmp1 = vec_nmsub(c12,FFr,tmp1); tmp1 = vec_madd(tmp1,tsc,nul); qqO = vec_madd(iqO,jq,nul); qqH = vec_madd(iqH,jq,nul); krsqO = vec_madd(vkrf,rsqO,nul); krsqH1 = vec_madd(vkrf,rsqH1,nul); krsqH2 = vec_madd(vkrf,rsqH2,nul); fsO = vec_nmsub(vec_two(),krsqO,rinvO); vcoulO = vec_add(rinvO,krsqO); vcoulH1 = vec_add(rinvH1,krsqH1); fsO = vec_madd(qqO,fsO,nul); vcoulH2 = vec_add(rinvH2,krsqH2); vcoulO = vec_sub(vcoulO,vcrf); vcoulH1 = vec_sub(vcoulH1,vcrf); vcoulH2 = vec_sub(vcoulH2,vcrf); vctot = vec_madd(qqO,vcoulO,vctot); fsH1 = vec_nmsub(vec_two(),krsqH1,rinvH1); fsH2 = vec_nmsub(vec_two(),krsqH2,rinvH2); vctot = vec_madd(qqH,vcoulH1,vctot); fsO = vec_madd(fsO,rinvO,tmp1); fsO = vec_madd(fsO,rinvO,nul); fsH1 = vec_madd(fsH1,qqH,nul); fsH2 = vec_madd(fsH2,qqH,nul); vctot = vec_madd(qqH,vcoulH2,vctot); fsH1 = vec_madd(fsH1,rinvsqH1,nul); fsH2 = vec_madd(fsH2,rinvsqH2,nul); fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */ dOx = vec_nmsub(fsO,dOx,nul); /* -fx */ fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */ dOy = vec_nmsub(fsO,dOy,nul); /* -fy */ fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */ dOz = vec_nmsub(fsO,dOz,nul); /* -fz */ fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */ dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */ fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */ dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */ fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */ dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */ fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */ dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */ fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */ dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */ fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */ dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */ transpose_3_to_2(dOx,dOy,dOz,&tmp1,&tmp2); add_xyz_to_mem(faction+j3a,tmp1); add_xyz_to_mem(faction+j3b,tmp2); } else if(k<nj1) { jnra = jjnr[k]; j3a = 3*jnra; transpose_1_to_3(load_xyz(pos+j3a),&dH2x,&dH2y,&dH2z); dOx = vec_sub(iOx,dH2x); dOy = vec_sub(iOy,dH2y); dOz = vec_sub(iOz,dH2z); dH1x = vec_sub(iH1x,dH2x); dH1y = vec_sub(iH1y,dH2y); dH1z = vec_sub(iH1z,dH2z); dH2x = vec_sub(iH2x,dH2x); dH2y = vec_sub(iH2y,dH2y); dH2z = vec_sub(iH2z,dH2z); rsqO = vec_madd(dOx,dOx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqO = vec_madd(dOy,dOy,rsqO); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqO = vec_madd(dOz,dOz,rsqO); rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); zero_highest_3_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2); do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2); zero_highest_3_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2); r = vec_madd(rinvO,rsqO,nul); rinvsqH1 = vec_madd(rinvH1,rinvH1,nul); rinvsqH2 = vec_madd(rinvH2,rinvH2,nul); tja = ntiA+2*type[jnra]; /* load 1 j charges and multiply by iq */ jq=load_1_float(charge+jnra); load_1_pair(vdwparam+tja,&c6,&c12); do_1_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&FFd,&VVr,&FFr); Vvdwtot = vec_madd(c6,VVd,Vvdwtot); tmp1 = vec_nmsub(c6,FFd,nul); Vvdwtot = vec_madd(c12,VVr,Vvdwtot); tmp1 = vec_nmsub(c12,FFr,tmp1); tmp1 = vec_madd(tmp1,tsc,nul); qqO = vec_madd(iqO,jq,nul); qqH = vec_madd(iqH,jq,nul); krsqO = vec_madd(vkrf,rsqO,nul); krsqH1 = vec_madd(vkrf,rsqH1,nul); krsqH2 = vec_madd(vkrf,rsqH2,nul); fsO = vec_nmsub(vec_two(),krsqO,rinvO); vcoulO = vec_add(rinvO,krsqO); vcoulH1 = vec_add(rinvH1,krsqH1); fsO = vec_madd(qqO,fsO,nul); vcoulH2 = vec_add(rinvH2,krsqH2); vcoulO = vec_sub(vcoulO,vcrf); vcoulH1 = vec_sub(vcoulH1,vcrf); vcoulH2 = vec_sub(vcoulH2,vcrf); vctot = vec_madd(qqO,vcoulO,vctot); fsH1 = vec_nmsub(vec_two(),krsqH1,rinvH1); fsH2 = vec_nmsub(vec_two(),krsqH2,rinvH2); vctot = vec_madd(qqH,vcoulH1,vctot); fsO = vec_madd(fsO,rinvO,tmp1); fsO = vec_madd(fsO,rinvO,nul); fsH1 = vec_madd(fsH1,qqH,nul); fsH2 = vec_madd(fsH2,qqH,nul); vctot = vec_madd(qqH,vcoulH2,vctot); fsH1 = vec_madd(fsH1,rinvsqH1,nul); fsH2 = vec_madd(fsH2,rinvsqH2,nul); fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */ dOx = vec_nmsub(fsO,dOx,nul); /* -fx */ fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */ dOy = vec_nmsub(fsO,dOy,nul); /* -fy */ fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */ dOz = vec_nmsub(fsO,dOz,nul); /* -fz */ fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */ dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */ fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */ dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */ fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */ dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */ fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */ dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */ fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */ dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */ fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */ dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */ transpose_3_to_1(dOx,dOy,dOz,&tmp1); add_xyz_to_mem(faction+j3a,tmp1); } /* update outer data */ update_i_3atoms_forces(faction+ii3,fshift+is3, fiOx,fiOy,fiOz,fiH1x,fiH1y,fiH1z, fiH2x,fiH2y,fiH2z); add_vector_to_float(Vc+gid[n],vctot); add_vector_to_float(Vvdw+gid[n],Vvdwtot); ninner += nj1 - nj0; } #ifdef GMX_THREAD_SHM_FDECOMP nouter += nn1 - nn0; } while (nn1<nri); #else nouter = nri; #endif *outeriter = nouter; *inneriter = ninner; }
void nb_kernel400_ppc_altivec (int * p_nri, int iinr[], int jindex[], int jjnr[], int shift[], float shiftvec[], float fshift[], int gid[], float pos[], float faction[], float charge[], float * p_facel, float * p_krf, float * p_crf, float Vc[], int type[], int * p_ntype, float vdwparam[], float Vvdw[], float * p_tabscale, float VFtab[], float invsqrta[], float dvda[], float * p_gbtabscale, float GBtab[], int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, float * work) { vector float ix,iy,iz,shvec; vector float vfacel,fs,nul; vector float dx,dy,dz; vector float vctot,qq,iq; vector float fix,fiy,fiz; vector float tmp1,tmp2,tmp3,tmp4; vector float rinv,r,rsq,VVc,FFc; vector float isai,isaj,isaprod,gbtsc,dvdasum,dvdaj,dvdatmp,gbscale,half; int n,k,ii,is3,ii3,nj0,nj1; int jnra,jnrb,jnrc,jnrd; int j3a,j3b,j3c,j3d; int nri, ntype, nouter, ninner; #ifdef GMX_THREADS int nn0, nn1; #endif nouter = 0; ninner = 0; nri = *p_nri; ntype = *p_ntype; nul=vec_zero(); half=vec_half(); vfacel=load_float_and_splat(p_facel); gbtsc=load_float_and_splat(p_gbtabscale); #ifdef GMX_THREADS nthreads = *p_nthreads; do { gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx); nn0 = *count; nn1 = nn0+(nri-nn0)/(2*nthreads)+3; *count = nn1; gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx); if(nn1>nri) nn1=nri; for(n=nn0; (n<nn1); n++) { #if 0 } /* maintain correct indentation even with conditional left braces */ #endif #else /* without gmx_threads */ for(n=0;n<nri;n++) { #endif is3 = 3*shift[n]; shvec = load_xyz(shiftvec+is3); ii = iinr[n]; ii3 = 3*ii; ix = load_xyz(pos+ii3); vctot = nul; dvdasum = nul; fix = nul; fiy = nul; fiz = nul; ix = vec_add(ix,shvec); nj0 = jindex[n]; nj1 = jindex[n+1]; splat_xyz_to_vectors(ix,&ix,&iy,&iz); iq = vec_madd(load_float_and_splat(charge+ii),vfacel,nul); isai = load_float_and_splat(invsqrta+ii); for(k=nj0; k<(nj1-3); k+=4) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = jjnr[k+2]; jnrd = jjnr[k+3]; j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; j3d = 3*jnrd; transpose_4_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b), load_xyz(pos+j3c), load_xyz(pos+j3d),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); rinv = do_invsqrt(rsq); r = vec_madd(rinv,rsq,nul); /* load 1/sqrt(a2) and multiply with 1/sqrt(a1) */ isaj = load_4_float(invsqrta+jnra,invsqrta+jnrb, invsqrta+jnrc,invsqrta+jnrd); isaprod = vec_madd(isai,isaj,nul); /* load 4 j charges and multiply by iq and 1/sqrt(a1*a2) */ qq = vec_madd(load_4_float(charge+jnra,charge+jnrb, charge+jnrc,charge+jnrd),iq,nul); qq = vec_madd(isaprod,qq,nul); gbscale = vec_madd(isaprod,gbtsc,nul); do_4_ctable_coul(GBtab,vec_madd(r,gbscale,nul),&VVc,&FFc); dvdaj = load_4_float(dvda+jnra,dvda+jnrb, dvda+jnrc,dvda+jnrd); fs = vec_madd(qq,FFc,nul); fs = vec_madd(fs,gbscale,nul); vctot = vec_madd(qq,VVc,vctot); dvdatmp = vec_madd(fs,r,nul); dvdatmp = vec_madd(qq,VVc,dvdatmp); dvdasum = vec_sub(dvdasum,dvdatmp); dvdaj = vec_sub(dvdaj,dvdatmp); store_4_float(dvdaj,dvda+jnra,dvda+jnrb,dvda+jnrc,dvda+jnrd); fs = vec_nmsub(fs,rinv,nul); fix = vec_madd(fs,dx,fix); /* +=fx */ fiy = vec_madd(fs,dy,fiy); /* +=fy */ fiz = vec_madd(fs,dz,fiz); /* +=fz */ dx = vec_nmsub(dx,fs,nul); /* -fx */ dy = vec_nmsub(dy,fs,nul); /* -fy */ dz = vec_nmsub(dz,fs,nul); /* -fz */ transpose_3_to_4(dx,dy,dz,&tmp1,&tmp2,&tmp3,&tmp4); add_xyz_to_mem(faction+j3a,tmp1); add_xyz_to_mem(faction+j3b,tmp2); add_xyz_to_mem(faction+j3c,tmp3); add_xyz_to_mem(faction+j3d,tmp4); } if(k<(nj1-1)) { jnra = jjnr[k]; jnrb = jjnr[k+1]; j3a = 3*jnra; j3b = 3*jnrb; transpose_2_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); zero_highest_2_elements_in_vector(&rsq); rinv = do_invsqrt(rsq); zero_highest_2_elements_in_vector(&rinv); r = vec_madd(rinv,rsq,nul); /* load 1/sqrt(a2) and multiply with 1/sqrt(a1) */ isaj = load_2_float(invsqrta+jnra,invsqrta+jnrb); isaprod = vec_madd(isai,isaj,nul); /* load 2 j charges and multiply by iq and 1/sqrt(a1*a2) */ qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul); qq = vec_madd(isaprod,qq,nul); gbscale = vec_madd(isaprod,gbtsc,nul); do_2_ctable_coul(GBtab,vec_madd(r,gbscale,nul),&VVc,&FFc); dvdaj = load_2_float(dvda+jnra,dvda+jnrb); fs = vec_madd(qq,FFc,nul); fs = vec_madd(fs,gbscale,nul); vctot = vec_madd(qq,VVc,vctot); dvdatmp = vec_madd(fs,r,nul); dvdatmp = vec_madd(qq,VVc,dvdatmp); dvdasum = vec_sub(dvdasum,dvdatmp); dvdaj = vec_sub(dvdaj,dvdatmp); store_2_float(dvdaj,dvda+jnra,dvda+jnrb); fs = vec_nmsub(fs,rinv,nul); fix = vec_madd(fs,dx,fix); /* +=fx */ fiy = vec_madd(fs,dy,fiy); /* +=fy */ fiz = vec_madd(fs,dz,fiz); /* +=fz */ dx = vec_nmsub(dx,fs,nul); /* -fx */ dy = vec_nmsub(dy,fs,nul); /* -fy */ dz = vec_nmsub(dz,fs,nul); /* -fz */ transpose_3_to_2(dx,dy,dz,&tmp1,&tmp2); add_xyz_to_mem(faction+j3a,tmp1); add_xyz_to_mem(faction+j3b,tmp2); k += 2; } if((nj1-nj0) & 0x1) { jnra = jjnr[k]; j3a = 3*jnra; transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); zero_highest_3_elements_in_vector(&rsq); rinv = do_invsqrt(rsq); zero_highest_3_elements_in_vector(&rinv); r = vec_madd(rinv,rsq,nul); /* load 1/sqrt(a2) and multiply with 1/sqrt(a1) */ isaj = load_1_float(invsqrta+jnra); isaprod = vec_madd(isai,isaj,nul); /* load 1 j charge and multiply by iq and 1/sqrt(a1*a2) */ qq = vec_madd(load_1_float(charge+jnra),iq,nul); qq = vec_madd(isaprod,qq,nul); gbscale = vec_madd(isaprod,gbtsc,nul); do_1_ctable_coul(GBtab,vec_madd(r,gbscale,nul),&VVc,&FFc); dvdaj = load_1_float(dvda+jnra); fs = vec_madd(qq,FFc,nul); fs = vec_madd(fs,gbscale,nul); vctot = vec_madd(qq,VVc,vctot); dvdatmp = vec_madd(fs,r,nul); dvdatmp = vec_madd(qq,VVc,dvdatmp); dvdasum = vec_sub(dvdasum,dvdatmp); dvdaj = vec_sub(dvdaj,dvdatmp); store_1_float(dvdaj,dvda+jnra); fs = vec_nmsub(fs,rinv,nul); fix = vec_madd(fs,dx,fix); /* +=fx */ fiy = vec_madd(fs,dy,fiy); /* +=fy */ fiz = vec_madd(fs,dz,fiz); /* +=fz */ dx = vec_nmsub(dx,fs,nul); /* -fx */ dy = vec_nmsub(dy,fs,nul); /* -fy */ dz = vec_nmsub(dz,fs,nul); /* -fz */ transpose_3_to_1(dx,dy,dz,&tmp1); add_xyz_to_mem(faction+j3a,tmp1); } /* update outer data */ transpose_3_to_4(fix,fiy,fiz,&tmp1,&tmp2,&tmp3,&tmp4); tmp1 = vec_add(tmp1,tmp3); tmp2 = vec_add(tmp2,tmp4); tmp1 = vec_add(tmp1,tmp2); add_xyz_to_mem(faction+ii3,tmp1); add_xyz_to_mem(fshift+is3,tmp1); add_vector_to_float(Vc+gid[n],vctot); add_vector_to_float(dvda+ii,dvdasum); ninner += nj1 - nj0; } #ifdef GMX_THREADS nouter += nn1 - nn0; } while (nn1<nri); #else nouter = nri; #endif *outeriter = nouter; *inneriter = ninner; }