/*
 * Plain-C all-vs-all kernel: Coulomb (qq/r) plus Lennard-Jones 12-6.
 *
 * For every i atom in [mdatoms->start, mdatoms->start+mdatoms->homenr) the
 * kernel walks the j range stored in the all-vs-all work data, adds the pair
 * forces to f and adds the energies to Vc[0] and Vvdw[0].
 *
 * fr        - supplies epsfac, ntype and nbfp
 * mdatoms   - supplies chargeA, typeA, nr, start and homenr
 * excl      - only forwarded to setup_aadata() when the work data is built
 * x, f      - flat arrays indexed as 3*atom+dim; f is updated in place
 * Vc, Vvdw  - energy accumulators; only element 0 is touched
 * outeriter - receives the number of i atoms processed
 * inneriter - receives (number of i atoms)*natoms/2
 * work      - address of a gmx_allvsall_data_t pointer; when it holds NULL
 *             the data is created with setup_aadata() and stored back
 */
void
nb_kernel_allvsall(t_forcerec *           fr,
                   t_mdatoms *            mdatoms,
                   t_blocka *             excl,
                   real *                 x,
                   real *                 f,
                   real *                 Vc,
                   real *                 Vvdw,
                   int *                  outeriter,
                   int *                  inneriter,
                   void *                 work)
{
    gmx_allvsall_data_t **cache  = (gmx_allvsall_data_t **)work;
    gmx_allvsall_data_t  *aa     = *cache;
    real                 *q      = mdatoms->chargeA;
    int                  *atype  = mdatoms->typeA;
    real                  epsfac = fr->epsfac;
    int                   nat    = mdatoms->nr;
    int                   ibeg   = mdatoms->start;
    int                   iend   = mdatoms->start+mdatoms->homenr;
    int                   i,j,k;
    int                   jbeg,jmid,jend;
    int                  *skip;
    real                 *ljpar;
    real                 *xj;
    real                 *fj;
    real                  xi,yi,zi,qi;
    real                  fxi,fyi,fzi;
    real                  dx,dy,dz;
    real                  tx,ty,tz;
    real                  r2,rinv,rinv2,rinv6;
    real                  qq,c6,c12;
    real                  ecoul,e6,e12;
    real                  ecoulsum,eljsum;
    real                  fs;

    /* Build the all-vs-all work data the first time through and cache it */
    if (aa == NULL)
    {
        setup_aadata(&aa,excl,nat,atype,fr->ntype,fr->nbfp);
        *cache = aa;
    }

    /* No shift vectors are applied anywhere in this kernel */
    for (i = ibeg; i < iend; i++)
    {
        /* Position and (prefactor-scaled) charge of the i atom */
        xi       = x[3*i];
        yi       = x[3*i+1];
        zi       = x[3*i+2];
        qi       = epsfac*q[i];

        /* Per-type table holding a (c6,c12) pair for every j atom */
        ljpar    = aa->pvdwparam[atype[i]];

        /* Per-i accumulators for energies and force */
        eljsum   = 0.0;
        ecoulsum = 0.0;
        fxi      = 0.0;
        fyi      = 0.0;
        fzi      = 0.0;

        /* Three consecutive jindex entries delimit the two j ranges */
        jbeg     = aa->jindex[3*i];
        jmid     = aa->jindex[3*i+1];
        jend     = aa->jindex[3*i+2];

        skip     = aa->exclusion_mask[i];

        /* First range [jbeg,jmid): entries whose mask value is zero are skipped */
        for (j = jbeg; j < jmid; j++)
        {
            if (skip[j-jbeg] == 0)
            {
                continue;
            }

            /* j indices may run past natoms; wrap them back */
            k        = j%nat;
            xj       = x+3*k;
            fj       = f+3*k;

            /* Distance vector and squared distance */
            dx       = xi - xj[0];
            dy       = yi - xj[1];
            dz       = zi - xj[2];
            r2       = dx*dx+dy*dy+dz*dz;

            rinv     = gmx_invsqrt(r2);
            rinv2    = rinv*rinv;

            /* Pair parameters */
            qq       = qi*q[k];
            c6       = ljpar[2*k];
            c12      = ljpar[2*k+1];

            /* Coulomb energy */
            ecoul    = qq*rinv;
            ecoulsum += ecoul;

            /* Lennard-Jones energy and the combined scalar force */
            rinv6    = rinv2*rinv2*rinv2;
            e6       = c6*rinv6;
            e12      = c12*rinv6*rinv6;
            eljsum   = eljsum+e12-e6;
            fs       = (ecoul+12.0*e12-6.0*e6)*rinv2;

            tx       = fs*dx;
            ty       = fs*dy;
            tz       = fs*dz;

            /* Action on i is accumulated, reaction on j is applied at once */
            fxi     += tx;
            fyi     += ty;
            fzi     += tz;

            fj[0]   -= tx;
            fj[1]   -= ty;
            fj[2]   -= tz;
        }

        /* Second range [jmid,jend): every entry interacts, no mask lookup */
        for (j = jmid; j < jend; j++)
        {
            k        = j%nat;
            xj       = x+3*k;
            fj       = f+3*k;

            dx       = xi - xj[0];
            dy       = yi - xj[1];
            dz       = zi - xj[2];
            r2       = dx*dx+dy*dy+dz*dz;

            rinv     = gmx_invsqrt(r2);
            rinv2    = rinv*rinv;

            qq       = qi*q[k];
            c6       = ljpar[2*k];
            c12      = ljpar[2*k+1];

            ecoul    = qq*rinv;
            ecoulsum += ecoul;

            rinv6    = rinv2*rinv2*rinv2;
            e6       = c6*rinv6;
            e12      = c12*rinv6*rinv6;
            eljsum   = eljsum+e12-e6;
            fs       = (ecoul+12.0*e12-6.0*e6)*rinv2;

            tx       = fs*dx;
            ty       = fs*dy;
            tz       = fs*dz;

            fxi     += tx;
            fyi     += ty;
            fzi     += tz;

            fj[0]   -= tx;
            fj[1]   -= ty;
            fj[2]   -= tz;
        }

        /* Flush the accumulated i force */
        f[3*i]   += fxi;
        f[3*i+1] += fyi;
        f[3*i+2] += fzi;

        /* All energies go into group 0 */
        Vc[0]    += ecoulsum;
        Vvdw[0]  += eljsum;
    }

    /* Report iteration counts to the caller */
    *outeriter = iend-ibeg;
    *inneriter = (iend-ibeg)*nat/2;
}
/*
 * Plain-C all-vs-all kernel: Coulomb (qq/r), tabulated Generalized-Born
 * polarisation and Lennard-Jones 12-6.
 *
 * For every i atom in [mdatoms->start, mdatoms->start+mdatoms->homenr) the
 * kernel walks the j range stored in the all-vs-all work data, adds the pair
 * forces to f, adds the energies to Vc[0], Vvdw[0] and vpol[0], and
 * accumulates into fr->dvda for both atoms of each pair.
 *
 * fr        - supplies epsfac, epsilon_r, gb_epsilon_solvent, gbtab (tab and
 *             scale), invsqrta, dvda, ntype and nbfp
 * mdatoms   - supplies chargeA, typeA, nr, start and homenr
 * excl      - only forwarded to setup_aadata() when the work data is built
 * x, f      - flat arrays indexed as 3*atom+dim; f is updated in place
 * Vc, Vvdw, vpol - energy accumulators; only element 0 is touched
 * outeriter - receives the number of i atoms processed
 * inneriter - receives (number of i atoms)*natoms/2
 * work      - address of a gmx_allvsall_data_t pointer; when it holds NULL
 *             the data is created with setup_aadata() and stored back
 */
void
nb_kernel_allvsallgb(t_forcerec *           fr,
                     t_mdatoms *            mdatoms,
                     t_blocka *             excl,
                     real *                 x,
                     real *                 f,
                     real *                 Vc,
                     real *                 Vvdw,
                     real *                 vpol,
                     int *                  outeriter,
                     int *                  inneriter,
                     void *                 work)
{
    gmx_allvsall_data_t **cache      = (gmx_allvsall_data_t **)work;
    gmx_allvsall_data_t  *aa         = *cache;
    real                 *q          = mdatoms->chargeA;
    int                  *atype      = mdatoms->typeA;
    real                  gbfactor   = ((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
    real                  epsfac     = fr->epsfac;
    real                 *gbtab      = fr->gbtab.tab;
    real                  gbtabscale = fr->gbtab.scale;
    real                 *invsqrta   = fr->invsqrta;
    real                 *dvda       = fr->dvda;
    int                   nat        = mdatoms->nr;
    int                   ibeg       = mdatoms->start;
    int                   iend       = mdatoms->start+mdatoms->homenr;
    int                   i,j,k;
    int                   jbeg,jmid,jend;
    int                   n0;
    int                  *skip;
    real                 *ljpar;
    real                 *xj;
    real                 *fj;
    real                 *tab;
    real                  xi,yi,zi,qi,isai;
    real                  fxi,fyi,fzi;
    real                  dx,dy,dz;
    real                  tx,ty,tz;
    real                  r2,r,rinv,rinv2,rinv6;
    real                  qq,qgb,c6,c12;
    real                  isaj,isaprod,gbscale;
    real                  rt,eps,eps2;
    real                  Y,F,Geps,Heps2,Fp,VV,FF;
    real                  ecoul,egb,e6,e12;
    real                  ecoulsum,egbsum,eljsum,dvdasum;
    real                  fcoul,fgb,dvdatmp;
    real                  fs;

    /* Build the all-vs-all work data the first time through and cache it */
    if (aa == NULL)
    {
        setup_aadata(&aa,excl,nat,atype,fr->ntype,fr->nbfp);
        *cache = aa;
    }

    /* No shift vectors are applied anywhere in this kernel */
    for (i = ibeg; i < iend; i++)
    {
        /* Position, (prefactor-scaled) charge and invsqrta value of the i atom */
        xi       = x[3*i];
        yi       = x[3*i+1];
        zi       = x[3*i+2];
        qi       = epsfac*q[i];
        isai     = invsqrta[i];

        /* Per-type table holding a (c6,c12) pair for every j atom */
        ljpar    = aa->pvdwparam[atype[i]];

        /* Per-i accumulators */
        eljsum   = 0.0;
        ecoulsum = 0.0;
        egbsum   = 0.0;
        dvdasum  = 0.0;
        fxi      = 0.0;
        fyi      = 0.0;
        fzi      = 0.0;

        /* Three consecutive jindex entries delimit the two j ranges */
        jbeg     = aa->jindex[3*i];
        jmid     = aa->jindex[3*i+1];
        jend     = aa->jindex[3*i+2];

        skip     = aa->exclusion_mask[i];

        /* First range [jbeg,jmid): entries whose mask value is zero are skipped */
        for (j = jbeg; j < jmid; j++)
        {
            if (skip[j-jbeg] == 0)
            {
                continue;
            }

            /* j indices may run past natoms; wrap them back */
            k        = j%nat;
            xj       = x+3*k;
            fj       = f+3*k;

            /* Distance vector, squared distance, 1/r and r */
            dx       = xi - xj[0];
            dy       = yi - xj[1];
            dz       = zi - xj[2];
            r2       = dx*dx+dy*dy+dz*dz;
            rinv     = gmx_invsqrt(r2);
            rinv2    = rinv*rinv;
            r        = r2*rinv;

            /* Pair parameters */
            isaj     = invsqrta[k];
            isaprod  = isai*isaj;
            qq       = qi*q[k];
            c6       = ljpar[2*k];
            c12      = ljpar[2*k+1];

            /* Plain Coulomb energy and its scalar force term */
            ecoul    = qq*rinv;
            fcoul    = ecoul*rinv;

            /* Charge product and table scale for the GB term */
            qgb      = isaprod*(-qq)*gbfactor;
            gbscale  = isaprod*gbtabscale;

            /* Table lookup: four coefficients per table point */
            rt       = r*gbscale;
            n0       = (int)rt;
            eps      = rt-n0;
            eps2     = eps*eps;
            tab      = gbtab+4*n0;
            Y        = tab[0];
            F        = tab[1];
            Geps     = eps*tab[2];
            Heps2    = eps2*tab[3];
            Fp       = F+Geps+Heps2;
            VV       = Y+eps*Fp;
            FF       = Fp+Geps+2.0*Heps2;
            egb      = qgb*VV;
            fgb      = qgb*FF*gbscale;

            /* dvda contributions: j side applied now, i side summed up */
            dvdatmp  = -0.5*(egb+fgb*r);
            dvdasum += dvdatmp;
            dvda[k] += dvdatmp*isaj*isaj;

            ecoulsum += ecoul;
            egbsum  += egb;

            /* Lennard-Jones energy and the combined scalar force */
            rinv6    = rinv2*rinv2*rinv2;
            e6       = c6*rinv6;
            e12      = c12*rinv6*rinv6;
            eljsum   = eljsum+e12-e6;
            fs       = (12.0*e12-6.0*e6)*rinv2-(fgb-fcoul)*rinv;

            tx       = fs*dx;
            ty       = fs*dy;
            tz       = fs*dz;

            /* Action on i is accumulated, reaction on j is applied at once */
            fxi     += tx;
            fyi     += ty;
            fzi     += tz;

            fj[0]   -= tx;
            fj[1]   -= ty;
            fj[2]   -= tz;
        }

        /* Second range [jmid,jend): every entry interacts, no mask lookup */
        for (j = jmid; j < jend; j++)
        {
            k        = j%nat;
            xj       = x+3*k;
            fj       = f+3*k;

            dx       = xi - xj[0];
            dy       = yi - xj[1];
            dz       = zi - xj[2];
            r2       = dx*dx+dy*dy+dz*dz;
            rinv     = gmx_invsqrt(r2);
            rinv2    = rinv*rinv;
            r        = r2*rinv;

            isaj     = invsqrta[k];
            isaprod  = isai*isaj;
            qq       = qi*q[k];
            c6       = ljpar[2*k];
            c12      = ljpar[2*k+1];

            ecoul    = qq*rinv;
            fcoul    = ecoul*rinv;

            qgb      = isaprod*(-qq)*gbfactor;
            gbscale  = isaprod*gbtabscale;

            rt       = r*gbscale;
            n0       = (int)rt;
            eps      = rt-n0;
            eps2     = eps*eps;
            tab      = gbtab+4*n0;
            Y        = tab[0];
            F        = tab[1];
            Geps     = eps*tab[2];
            Heps2    = eps2*tab[3];
            Fp       = F+Geps+Heps2;
            VV       = Y+eps*Fp;
            FF       = Fp+Geps+2.0*Heps2;
            egb      = qgb*VV;
            fgb      = qgb*FF*gbscale;

            dvdatmp  = -0.5*(egb+fgb*r);
            dvdasum += dvdatmp;
            dvda[k] += dvdatmp*isaj*isaj;

            ecoulsum += ecoul;
            egbsum  += egb;

            rinv6    = rinv2*rinv2*rinv2;
            e6       = c6*rinv6;
            e12      = c12*rinv6*rinv6;
            eljsum   = eljsum+e12-e6;
            fs       = (12.0*e12-6.0*e6)*rinv2-(fgb-fcoul)*rinv;

            tx       = fs*dx;
            ty       = fs*dy;
            tz       = fs*dz;

            fxi     += tx;
            fyi     += ty;
            fzi     += tz;

            fj[0]   -= tx;
            fj[1]   -= ty;
            fj[2]   -= tz;
        }

        /* Flush the accumulated i force */
        f[3*i]   += fxi;
        f[3*i+1] += fyi;
        f[3*i+2] += fzi;

        /* All energies go into group 0; the i-side dvda sum is applied here */
        Vc[0]    += ecoulsum;
        Vvdw[0]  += eljsum;
        vpol[0]  += egbsum;
        dvda[i]  += dvdasum*isai*isai;
    }

    /* Report iteration counts to the caller */
    *outeriter = iend-ibeg;
    *inneriter = (iend-ibeg)*nat/2;
}
/*
 * All-vs-all kernel: Coulomb (qq/r) plus Lennard-Jones 12-6.
 *
 * For every i atom in [0, mdatoms->homenr) the kernel walks the j range
 * stored in the all-vs-all work data, adds the pair forces to ff and adds
 * the energies to element 0 of kernel_data->energygrp_elec and
 * kernel_data->energygrp_vdw. The work data is taken from fr->AllvsAll_work
 * and, when that is null, created with setup_aadata() and stored back.
 *
 * nlist       - not used
 * xx, ff      - coordinates (read) and forces (updated in place)
 * fr          - supplies ic->epsfac, ntype, nbfp and AllvsAll_work
 * mdatoms     - supplies chargeA, typeA, nr and homenr
 * kernel_data - supplies exclusions and the two energy arrays
 * nrnb        - flop accounting, incremented once at the end
 */
void
nb_kernel_allvsall(t_nblist gmx_unused *     nlist,
                   rvec *                    xx,
                   rvec *                    ff,
                   struct t_forcerec *       fr,
                   t_mdatoms *               mdatoms,
                   nb_kernel_data_t *        kernel_data,
                   t_nrnb *                  nrnb)
{
    /* View the rvec arrays as flat arrays indexed by 3*atom+dim */
    real                *x      = xx[0];
    real                *f      = ff[0];
    real                *charge = mdatoms->chargeA;
    int                 *type   = mdatoms->typeA;
    const t_blocka      *excl   = kernel_data->exclusions;
    real                *Vc     = kernel_data->energygrp_elec;
    real                *Vvdw   = kernel_data->energygrp_vdw;
    const real           facel  = fr->ic->epsfac;
    const int            natoms = mdatoms->nr;
    const int            numI   = mdatoms->homenr;

    /* Build the all-vs-all work data the first time through and cache it */
    gmx_allvsall_data_t *aadata = reinterpret_cast<gmx_allvsall_data_t *>(fr->AllvsAll_work);
    if (aadata == nullptr)
    {
        setup_aadata(&aadata, excl, natoms, type, fr->ntype, fr->nbfp);
        fr->AllvsAll_work = aadata;
    }

    /* No shift vectors are applied anywhere in this kernel */
    for (int i = 0; i < numI; i++)
    {
        /* Position and (prefactor-scaled) charge of the i atom */
        const real  ix = x[3*i];
        const real  iy = x[3*i+1];
        const real  iz = x[3*i+2];
        const real  iq = facel*charge[i];

        /* Per-type table holding a (c6,c12) pair for every j atom */
        const real *pvdw = aadata->pvdwparam[type[i]];

        /* Per-i accumulators for energies and force */
        real        Vvdwtot = 0.0;
        real        vctot   = 0.0;
        real        fix     = 0.0;
        real        fiy     = 0.0;
        real        fiz     = 0.0;

        /* One i-k pair: energies and i force are accumulated,
         * the reaction force on k is applied immediately.
         */
        auto addPair = [&](const int k)
            {
                const real dx      = ix - x[3*k];
                const real dy      = iy - x[3*k+1];
                const real dz      = iz - x[3*k+2];
                const real rsq     = dx*dx+dy*dy+dz*dz;

                const real rinv    = 1.0/sqrt(rsq);
                const real rinvsq  = rinv*rinv;

                const real qq      = iq*charge[k];
                const real c6      = pvdw[2*k];
                const real c12     = pvdw[2*k+1];

                const real vcoul   = qq*rinv;
                vctot             += vcoul;

                const real rinvsix = rinvsq*rinvsq*rinvsq;
                const real Vvdw6   = c6*rinvsix;
                const real Vvdw12  = c12*rinvsix*rinvsix;
                Vvdwtot            = Vvdwtot+Vvdw12-Vvdw6;
                const real fscal   = (vcoul+12.0*Vvdw12-6.0*Vvdw6)*rinvsq;

                const real tx      = fscal*dx;
                const real ty      = fscal*dy;
                const real tz      = fscal*dz;

                fix               += tx;
                fiy               += ty;
                fiz               += tz;

                f[3*k]            -= tx;
                f[3*k+1]          -= ty;
                f[3*k+2]          -= tz;
            };

        /* Three consecutive jindex entries delimit the two j ranges */
        const int  nj0  = aadata->jindex[3*i];
        const int  nj1  = aadata->jindex[3*i+1];
        const int  nj2  = aadata->jindex[3*i+2];
        const int *mask = aadata->exclusion_mask[i];

        /* First range [nj0,nj1): only entries with a nonzero mask interact.
         * j indices may run past natoms, hence the modulo.
         */
        for (int j = nj0; j < nj1; j++)
        {
            if (mask[j-nj0] != 0)
            {
                addPair(j%natoms);
            }
        }

        /* Second range [nj1,nj2): every entry interacts, no mask lookup */
        for (int j = nj1; j < nj2; j++)
        {
            addPair(j%natoms);
        }

        /* Flush the accumulated i force */
        f[3*i]   += fix;
        f[3*i+1] += fiy;
        f[3*i+2] += fiz;

        /* All energies go into group 0 */
        Vc[0]    += vctot;
        Vvdw[0]  += Vvdwtot;
    }

    /* The tally charges 12 flops per i atom and 19 flops for each of the
     * natoms*(natoms-1)/2 pairs.
     * NOTE(review): natoms*(natoms-1) is evaluated in int - confirm natoms
     * stays small enough for that product.
     */
    inc_nrnb(nrnb, eNR_NBKERNEL_ELEC_VDW_VF, numI*12 + gmx::exactDiv(natoms*(natoms-1), 2)*19);
}