/*! This function computes the gravitational forces for all active * particles. If needed, a new tree is constructed, otherwise the * dynamically updated tree is used. Particles are only exported to other * processors when really needed, thereby allowing a good use of the * communication buffer. */ void gravity_tree(void) { long long ntot; int numnodes, nexportsum = 0; int i, j, iter = 0; int *numnodeslist, maxnumnodes, nexport, *numlist, *nrecv, *ndonelist; double tstart, tend, timetree = 0, timecommsumm = 0, timeimbalance = 0, sumimbalance; double ewaldcount; double costtotal, ewaldtot, *costtreelist, *ewaldlist; double maxt, sumt, *timetreelist, *timecommlist; double fac, plb, plb_max, sumcomm; #ifndef NOGRAVITY int *noffset, *nbuffer, *nsend, *nsend_local; long long ntotleft; int ndone, maxfill, ngrp; int k, place; int level, sendTask, recvTask; double ax, ay, az; MPI_Status status; #endif #ifdef ADD_CENTRAL_GRAVITY int numsinks,root,globalroot,liveStar,liveStarGlobal; double starData[4],r,h,h_inv,h3_inv,u,starGrav[3],starGravGlobal[3]; #endif /* set new softening lengths */ if(All.ComovingIntegrationOn) set_softenings(); /* contruct tree if needed */ tstart = second(); if(TreeReconstructFlag) { if(ThisTask == 0) printf("Tree construction.\n"); force_treebuild(NumPart); TreeReconstructFlag = 0; if(ThisTask == 0) printf("Tree construction done.\n"); } tend = second(); All.CPU_TreeConstruction += timediff(tstart, tend); costtotal = ewaldcount = 0; /* Note: 'NumForceUpdate' has already been determined in find_next_sync_point_and_drift() */ numlist = malloc(NTask * sizeof(int) * NTask); MPI_Allgather(&NumForceUpdate, 1, MPI_INT, numlist, 1, MPI_INT, MPI_COMM_WORLD); for(i = 0, ntot = 0; i < NTask; i++) ntot += numlist[i]; free(numlist); #ifndef NOGRAVITY if(ThisTask == 0) printf("Begin tree force.\n"); #ifdef SELECTIVE_NO_GRAVITY for(i = 0; i < NumPart; i++) if(((1 << P[i].Type) & (SELECTIVE_NO_GRAVITY))) P[i].Ti_endstep = -P[i].Ti_endstep - 1; #endif noffset = malloc(sizeof(int) * NTask); /* offsets of bunches in common list */ nbuffer = malloc(sizeof(int) * NTask); nsend_local = malloc(sizeof(int) * NTask); nsend = malloc(sizeof(int) * NTask * NTask); ndonelist = malloc(sizeof(int) * NTask); i = 0; /* beginn with this index */ ntotleft = ntot; /* particles left for all tasks together */ while(ntotleft > 0) { iter++; for(j = 0; j < NTask; j++) nsend_local[j] = 0; /* do local particles and prepare export list */ tstart = second(); for(nexport = 0, ndone = 0; i < NumPart && nexport < All.BunchSizeForce - NTask; i++) { if(P[i].Ti_endstep == All.Ti_Current) { ndone++; for(j = 0; j < NTask; j++) Exportflag[j] = 0; #ifndef PMGRID costtotal += force_treeevaluate(i, 0, &ewaldcount); #else costtotal += force_treeevaluate_shortrange(i, 0); #endif for(j = 0; j < NTask; j++) { if(Exportflag[j]) { for(k = 0; k < 3; k++) GravDataGet[nexport].u.Pos[k] = P[i].Pos[k]; #ifdef UNEQUALSOFTENINGS GravDataGet[nexport].Type = P[i].Type; #ifdef ADAPTIVE_GRAVSOFT_FORGAS if(P[i].Type == 0) GravDataGet[nexport].Soft = SphP[i].Hsml; #endif #endif GravDataGet[nexport].w.OldAcc = P[i].OldAcc; GravDataIndexTable[nexport].Task = j; GravDataIndexTable[nexport].Index = i; GravDataIndexTable[nexport].SortIndex = nexport; nexport++; nexportsum++; nsend_local[j]++; } } } } tend = second(); timetree += timediff(tstart, tend); qsort(GravDataIndexTable, nexport, sizeof(struct gravdata_index), grav_tree_compare_key); for(j = 0; j < nexport; j++) GravDataIn[j] = GravDataGet[GravDataIndexTable[j].SortIndex]; for(j = 1, noffset[0] 
= 0; j < NTask; j++) noffset[j] = noffset[j - 1] + nsend_local[j - 1]; tstart = second(); MPI_Allgather(nsend_local, NTask, MPI_INT, nsend, NTask, MPI_INT, MPI_COMM_WORLD); tend = second(); timeimbalance += timediff(tstart, tend); /* now do the particles that need to be exported */ for(level = 1; level < (1 << PTask); level++) { tstart = second(); for(j = 0; j < NTask; j++) nbuffer[j] = 0; for(ngrp = level; ngrp < (1 << PTask); ngrp++) { maxfill = 0; for(j = 0; j < NTask; j++) { if((j ^ ngrp) < NTask) if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]) maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]; } if(maxfill >= All.BunchSizeForce) break; sendTask = ThisTask; recvTask = ThisTask ^ ngrp; if(recvTask < NTask) { if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0) { /* get the particles */ MPI_Sendrecv(&GravDataIn[noffset[recvTask]], nsend_local[recvTask] * sizeof(struct gravdata_in), MPI_BYTE, recvTask, TAG_GRAV_A, &GravDataGet[nbuffer[ThisTask]], nsend[recvTask * NTask + ThisTask] * sizeof(struct gravdata_in), MPI_BYTE, recvTask, TAG_GRAV_A, MPI_COMM_WORLD, &status); } } for(j = 0; j < NTask; j++) if((j ^ ngrp) < NTask) nbuffer[j] += nsend[(j ^ ngrp) * NTask + j]; } tend = second(); timecommsumm += timediff(tstart, tend); tstart = second(); for(j = 0; j < nbuffer[ThisTask]; j++) { #ifndef PMGRID costtotal += force_treeevaluate(j, 1, &ewaldcount); #else costtotal += force_treeevaluate_shortrange(j, 1); #endif } tend = second(); timetree += timediff(tstart, tend); tstart = second(); MPI_Barrier(MPI_COMM_WORLD); tend = second(); timeimbalance += timediff(tstart, tend); /* get the result */ tstart = second(); for(j = 0; j < NTask; j++) nbuffer[j] = 0; for(ngrp = level; ngrp < (1 << PTask); ngrp++) { maxfill = 0; for(j = 0; j < NTask; j++) { if((j ^ ngrp) < NTask) if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]) maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]; } if(maxfill >= All.BunchSizeForce) break; sendTask = ThisTask; recvTask = ThisTask ^ ngrp; if(recvTask < NTask) { if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0) { /* send the results */ MPI_Sendrecv(&GravDataResult[nbuffer[ThisTask]], nsend[recvTask * NTask + ThisTask] * sizeof(struct gravdata_in), MPI_BYTE, recvTask, TAG_GRAV_B, &GravDataOut[noffset[recvTask]], nsend_local[recvTask] * sizeof(struct gravdata_in), MPI_BYTE, recvTask, TAG_GRAV_B, MPI_COMM_WORLD, &status); /* add the result to the particles */ for(j = 0; j < nsend_local[recvTask]; j++) { place = GravDataIndexTable[noffset[recvTask] + j].Index; for(k = 0; k < 3; k++) P[place].GravAccel[k] += GravDataOut[j + noffset[recvTask]].u.Acc[k]; P[place].GravCost += GravDataOut[j + noffset[recvTask]].w.Ninteractions; } } } for(j = 0; j < NTask; j++) if((j ^ ngrp) < NTask) nbuffer[j] += nsend[(j ^ ngrp) * NTask + j]; } tend = second(); timecommsumm += timediff(tstart, tend); level = ngrp - 1; } MPI_Allgather(&ndone, 1, MPI_INT, ndonelist, 1, MPI_INT, MPI_COMM_WORLD); for(j = 0; j < NTask; j++) ntotleft -= ndonelist[j]; } free(ndonelist); free(nsend); free(nsend_local); free(nbuffer); free(noffset); /* now add things for comoving integration */ #ifndef PERIODIC #ifndef PMGRID if(All.ComovingIntegrationOn) { fac = 0.5 * All.Hubble * All.Hubble * All.Omega0 / All.G; for(i = 0; i < NumPart; i++) if(P[i].Ti_endstep == All.Ti_Current) for(j = 0; j < 3; j++) P[i].GravAccel[j] += fac * P[i].Pos[j]; } #endif #endif for(i = 0; i < NumPart; i++) if(P[i].Ti_endstep == All.Ti_Current) { #ifdef PMGRID ax 
= P[i].GravAccel[0] + P[i].GravPM[0] / All.G; ay = P[i].GravAccel[1] + P[i].GravPM[1] / All.G; az = P[i].GravAccel[2] + P[i].GravPM[2] / All.G; #else ax = P[i].GravAccel[0]; ay = P[i].GravAccel[1]; az = P[i].GravAccel[2]; #endif P[i].OldAcc = sqrt(ax * ax + ay * ay + az * az); } if(All.TypeOfOpeningCriterion == 1) All.ErrTolTheta = 0; /* This will switch to the relative opening criterion for the following force computations */ /* muliply by G */ for(i = 0; i < NumPart; i++) if(P[i].Ti_endstep == All.Ti_Current) for(j = 0; j < 3; j++) { P[i].GravAccel[j] *= All.G; //printf("Gravity! %g\n",P[i].GravAccel[j]); } /* Finally, the following factor allows a computation of a cosmological simulation with vacuum energy in physical coordinates */ #ifndef PERIODIC #ifndef PMGRID if(All.ComovingIntegrationOn == 0) { fac = All.OmegaLambda * All.Hubble * All.Hubble; for(i = 0; i < NumPart; i++) if(P[i].Ti_endstep == All.Ti_Current) for(j = 0; j < 3; j++) P[i].GravAccel[j] += fac * P[i].Pos[j]; } #endif #endif #ifdef SELECTIVE_NO_GRAVITY for(i = 0; i < NumPart; i++) if(P[i].Ti_endstep < 0) P[i].Ti_endstep = -P[i].Ti_endstep - 1; #endif if(ThisTask == 0) printf("tree is done.\n"); #else /* gravity is switched off */ for(i = 0; i < NumPart; i++) if(P[i].Ti_endstep == All.Ti_Current) for(j = 0; j < 3; j++) P[i].GravAccel[j] = 0; #endif #ifdef SINK_GRAV_ONLY sink_grav(); #endif #ifdef ADD_CENTRAL_GRAVITY /* Get the position and mass of the central object and send it to everyone */ numsinks=NumPart - N_gas; starData[0]=starData[1]=starData[2]=starData[3]= -1.0; root=-1; liveStar=0; for(i=0; i<3; i++) { starGrav[i]=0.0; starGravGlobal[i]=0.0; } for(i=0; i<numsinks;i++) { if(P[i+N_gas].ID==All.StarID) { starData[0] = P[i+N_gas].Pos[0]; starData[1] = P[i+N_gas].Pos[1]; starData[2] = P[i+N_gas].Pos[2]; starData[3] = P[i+N_gas].Mass; root = ThisTask; //Do we need to update the star's gravity? if(P[i+N_gas].Ti_endstep == All.Ti_Current) { liveStar=1; } } } /* Get the node that has the data */ MPI_Barrier(MPI_COMM_WORLD); MPI_Allreduce(&root,&globalroot,1,MPI_INT,MPI_MAX,MPI_COMM_WORLD); MPI_Allreduce(&liveStar,&liveStarGlobal,1,MPI_INT,MPI_MAX,MPI_COMM_WORLD); /* Broadcast it. */ MPI_Bcast(&starData,4,MPI_DOUBLE,globalroot,MPI_COMM_WORLD); //We have the central object mass and position, add its gravity, it is softened by the type 1 softening... h = All.ForceSoftening[1]; h_inv = 1.0 / h; h3_inv = h_inv * h_inv * h_inv; for(i = 0; i < NumPart; i++) { if(P[i].ID != All.StarID) { //If we need to update the star's gravity we need to calculate this for all particles... 
if(liveStarGlobal) { r=sqrt((starData[0]-P[i].Pos[0])*(starData[0]-P[i].Pos[0])+(starData[1]-P[i].Pos[1])*(starData[1]-P[i].Pos[1])+(starData[2]-P[i].Pos[2])*(starData[2]-P[i].Pos[2])); if(r >= h) { fac = 1 / (r*r*r); } else { u = r * h_inv; if(u < 0.5) { fac = h3_inv * (10.666666666667 + u * u * (32.0 * u - 38.4)); } else { fac = h3_inv * (21.333333333333 - 48.0 * u + 38.4 * u * u - 10.666666666667 * u * u * u - 0.066666666667 / (u * u * u)); } } for(j=0;j<3;j++) { starGrav[j]+=(P[i].Pos[j]-starData[j])*All.G*P[i].Mass*fac; } } //Otherwise, just give the star's gravity to those that need it if(P[i].Ti_endstep == All.Ti_Current) { r=sqrt((starData[0]-P[i].Pos[0])*(starData[0]-P[i].Pos[0])+(starData[1]-P[i].Pos[1])*(starData[1]-P[i].Pos[1])+(starData[2]-P[i].Pos[2])*(starData[2]-P[i].Pos[2])); if(r >= h) { fac = 1 / (r*r*r); } else { h_inv = 1.0 / h; h3_inv = h_inv * h_inv * h_inv; u = r * h_inv; if(u < 0.5) { fac = h3_inv * (10.666666666667 + u * u * (32.0 * u - 38.4)); } else { fac = h3_inv * (21.333333333333 - 48.0 * u + 38.4 * u * u - 10.666666666667 * u * u * u - 0.066666666667 / (u * u * u)); } } for(j=0;j<3;j++) { P[i].GravAccel[j]+=(starData[j]-P[i].Pos[j])*All.G*starData[3]*fac; } } } } //Gather the forces of the pcles on the star together and add them to the star if(liveStarGlobal) { //Finally we need to combine all the starGrav values for the star... MPI_Barrier(MPI_COMM_WORLD); MPI_Allreduce(&starGrav[0],&starGravGlobal[0],1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); MPI_Allreduce(&starGrav[1],&starGravGlobal[1],1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); MPI_Allreduce(&starGrav[2],&starGravGlobal[2],1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); //Finally, find the actual star and add it if(globalroot==ThisTask) { for(i=0; i<numsinks;i++) { if(P[i+N_gas].ID==All.StarID) { for(j=0;j<3;j++) { P[i+N_gas].GravAccel[j]+=starGravGlobal[j]; } } } } } MPI_Barrier(MPI_COMM_WORLD); #endif /* Now the force computation is finished */ /* gather some diagnostic information */ timetreelist = malloc(sizeof(double) * NTask); timecommlist = malloc(sizeof(double) * NTask); costtreelist = malloc(sizeof(double) * NTask); numnodeslist = malloc(sizeof(int) * NTask); ewaldlist = malloc(sizeof(double) * NTask); nrecv = malloc(sizeof(int) * NTask); numnodes = Numnodestree; MPI_Gather(&costtotal, 1, MPI_DOUBLE, costtreelist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Gather(&numnodes, 1, MPI_INT, numnodeslist, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Gather(&timetree, 1, MPI_DOUBLE, timetreelist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Gather(&timecommsumm, 1, MPI_DOUBLE, timecommlist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Gather(&NumPart, 1, MPI_INT, nrecv, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Gather(&ewaldcount, 1, MPI_DOUBLE, ewaldlist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Reduce(&nexportsum, &nexport, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(&timeimbalance, &sumimbalance, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); if(ThisTask == 0) { All.TotNumOfForces += ntot; fprintf(FdTimings, "Step= %d t= %g dt= %g \n", All.NumCurrentTiStep, All.Time, All.TimeStep); fprintf(FdTimings, "Nf= %d%09d total-Nf= %d%09d ex-frac= %g iter= %d\n", (int) (ntot / 1000000000), (int) (ntot % 1000000000), (int) (All.TotNumOfForces / 1000000000), (int) (All.TotNumOfForces % 1000000000), nexport / ((double) ntot), iter); /* note: on Linux, the 8-byte integer could be printed with the format identifier "%qd", but doesn't work on AIX */ fac = NTask / ((double) All.TotNumPart); for(i = 0, maxt = timetreelist[0], sumt = 0, plb_max = 0, maxnumnodes = 
0, costtotal = 0, sumcomm = 0, ewaldtot = 0; i < NTask; i++) { costtotal += costtreelist[i]; sumcomm += timecommlist[i]; if(maxt < timetreelist[i]) maxt = timetreelist[i]; sumt += timetreelist[i]; plb = nrecv[i] * fac; if(plb > plb_max) plb_max = plb; if(numnodeslist[i] > maxnumnodes) maxnumnodes = numnodeslist[i]; ewaldtot += ewaldlist[i]; } fprintf(FdTimings, "work-load balance: %g max=%g avg=%g PE0=%g\n", maxt / (sumt / NTask), maxt, sumt / NTask, timetreelist[0]); fprintf(FdTimings, "particle-load balance: %g\n", plb_max); fprintf(FdTimings, "max. nodes: %d, filled: %g\n", maxnumnodes, maxnumnodes / (All.TreeAllocFactor * All.MaxPart)); fprintf(FdTimings, "part/sec=%g | %g ia/part=%g (%g)\n", ntot / (sumt + 1.0e-20), ntot / (maxt * NTask), ((double) (costtotal)) / ntot, ((double) ewaldtot) / ntot); fprintf(FdTimings, "\n"); fflush(FdTimings); All.CPU_TreeWalk += sumt / NTask; All.CPU_Imbalance += sumimbalance / NTask; All.CPU_CommSum += sumcomm / NTask; } free(nrecv); free(ewaldlist); free(numnodeslist); free(costtreelist); free(timecommlist); free(timetreelist); }
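
/* A minimal sketch (illustrative only, not compiled in) of the cubic-spline
 * softened force factor used twice in the ADD_CENTRAL_GRAVITY block above.
 * The acceleration a particle receives from the central star is
 * G * M_star * fac(r, h) * (pos_star - pos_particle), with h taken from
 * All.ForceSoftening[1]; the star receives the mirror term weighted by the
 * particle mass. The function name is not part of GADGET. */
#if 0
static double spline_force_fac_demo(double r, double h)
{
  double h_inv, h3_inv, u;

  if(r >= h)
    return 1.0 / (r * r * r);	/* exact Newtonian factor outside the kernel */

  h_inv = 1.0 / h;
  h3_inv = h_inv * h_inv * h_inv;
  u = r * h_inv;

  if(u < 0.5)
    return h3_inv * (10.666666666667 + u * u * (32.0 * u - 38.4));

  return h3_inv * (21.333333333333 - 48.0 * u + 38.4 * u * u
		   - 10.666666666667 * u * u * u
		   - 0.066666666667 / (u * u * u));
}
#endif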
/*! This function computes the gravitational forces for all active * particles. If needed, a new tree is constructed, otherwise the * dynamically updated tree is used. Particles are only exported to other * processors when really needed, thereby allowing a good use of the * communication buffer. */ void gravity_tree(void) { long long ntot; int numnodes, nexportsum = 0; int i, j, iter = 0; int *numnodeslist, maxnumnodes, nexport, *numlist, *nrecv, *ndonelist; double tstart, tend, timetree = 0, timecommsumm = 0, timeimbalance = 0, sumimbalance; double ewaldcount; double costtotal, ewaldtot, *costtreelist, *ewaldlist; double maxt, sumt, *timetreelist, *timecommlist; double fac, plb, plb_max, sumcomm; #ifndef NOGRAVITY int *noffset, *nbuffer, *nsend, *nsend_local; long long ntotleft; int ndone, maxfill, ngrp; int k, place; int level, sendTask, recvTask; double ax, ay, az; MPI_Status status; #endif /* set new softening lengths */ if(All.ComovingIntegrationOn) set_softenings(); /* contruct tree if needed */ tstart = second(); if(TreeReconstructFlag) { if(ThisTask == 0) printf("Tree construction.\n"); force_treebuild(NumPart); TreeReconstructFlag = 0; if(ThisTask == 0) printf("Tree construction done.\n"); } tend = second(); All.CPU_TreeConstruction += timediff(tstart, tend); costtotal = ewaldcount = 0; /* Note: 'NumForceUpdate' has already been determined in find_next_sync_point_and_drift() */ numlist = malloc(NTask * sizeof(int) * NTask); MPI_Allgather(&NumForceUpdate, 1, MPI_INT, numlist, 1, MPI_INT, MPI_COMM_WORLD); for(i = 0, ntot = 0; i < NTask; i++) ntot += numlist[i]; free(numlist); #ifndef NOGRAVITY if(ThisTask == 0) printf("Begin tree force.\n"); #ifdef SELECTIVE_NO_GRAVITY for(i = 0; i < NumPart; i++) if(((1 << P[i].Type) & (SELECTIVE_NO_GRAVITY))) P[i].Ti_endstep = -P[i].Ti_endstep - 1; #endif noffset = malloc(sizeof(int) * NTask); /* offsets of bunches in common list */ nbuffer = malloc(sizeof(int) * NTask); nsend_local = malloc(sizeof(int) * NTask); nsend = malloc(sizeof(int) * NTask * NTask); ndonelist = malloc(sizeof(int) * NTask); i = 0; /* beginn with this index */ ntotleft = ntot; /* particles left for all tasks together */ while(ntotleft > 0) { //printf("nontotleft %d, iter %d\n", ntotleft, iter); iter++; for(j = 0; j < NTask; j++) nsend_local[j] = 0; /* do local particles and prepare export list */ tstart = second(); for(nexport = 0, ndone = 0; i < NumPart && nexport < All.BunchSizeForce - NTask; i++) if(P[i].Ti_endstep == All.Ti_Current) { ndone++; for(j = 0; j < NTask; j++) Exportflag[j] = 0; #ifndef PMGRID costtotal += force_treeevaluate(i, 0, &ewaldcount); #else costtotal += force_treeevaluate_shortrange(i, 0); #endif for(j = 0; j < NTask; j++) { if(Exportflag[j]) { for(k = 0; k < 3; k++) GravDataGet[nexport].u.Pos[k] = P[i].Pos[k]; // KC 8/11/14 Need to export single particle masses now GravDataGet[nexport].Mass = P[i].Mass; #ifdef UNEQUALSOFTENINGS GravDataGet[nexport].Type = P[i].Type; #ifdef ADAPTIVE_GRAVSOFT_FORGAS if(P[i].Type == 0) GravDataGet[nexport].Soft = SphP[i].Hsml; #endif #endif GravDataGet[nexport].w.OldAcc = P[i].OldAcc; GravDataIndexTable[nexport].Task = j; GravDataIndexTable[nexport].Index = i; GravDataIndexTable[nexport].SortIndex = nexport; nexport++; nexportsum++; nsend_local[j]++; } } } tend = second(); timetree += timediff(tstart, tend); qsort(GravDataIndexTable, nexport, sizeof(struct gravdata_index), grav_tree_compare_key); for(j = 0; j < nexport; j++) GravDataIn[j] = GravDataGet[GravDataIndexTable[j].SortIndex]; for(j = 1, noffset[0] = 0; j < 
NTask; j++) noffset[j] = noffset[j - 1] + nsend_local[j - 1]; tstart = second(); MPI_Allgather(nsend_local, NTask, MPI_INT, nsend, NTask, MPI_INT, MPI_COMM_WORLD); tend = second(); timeimbalance += timediff(tstart, tend); /* now do the particles that need to be exported */ for(level = 1; level < (1 << PTask); level++) { tstart = second(); for(j = 0; j < NTask; j++) nbuffer[j] = 0; for(ngrp = level; ngrp < (1 << PTask); ngrp++) { maxfill = 0; for(j = 0; j < NTask; j++) { if((j ^ ngrp) < NTask) if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]) maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]; } if(maxfill >= All.BunchSizeForce) break; sendTask = ThisTask; recvTask = ThisTask ^ ngrp; if(recvTask < NTask) { if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0) { /* get the particles */ MPI_Sendrecv(&GravDataIn[noffset[recvTask]], nsend_local[recvTask] * sizeof(struct gravdata_in), MPI_BYTE, recvTask, TAG_GRAV_A, &GravDataGet[nbuffer[ThisTask]], nsend[recvTask * NTask + ThisTask] * sizeof(struct gravdata_in), MPI_BYTE, recvTask, TAG_GRAV_A, MPI_COMM_WORLD, &status); } } for(j = 0; j < NTask; j++) if((j ^ ngrp) < NTask) nbuffer[j] += nsend[(j ^ ngrp) * NTask + j]; } tend = second(); timecommsumm += timediff(tstart, tend); tstart = second(); for(j = 0; j < nbuffer[ThisTask]; j++) { #ifndef PMGRID costtotal += force_treeevaluate(j, 1, &ewaldcount); #else costtotal += force_treeevaluate_shortrange(j, 1); #endif } tend = second(); timetree += timediff(tstart, tend); tstart = second(); MPI_Barrier(MPI_COMM_WORLD); tend = second(); timeimbalance += timediff(tstart, tend); /* get the result */ tstart = second(); for(j = 0; j < NTask; j++) nbuffer[j] = 0; for(ngrp = level; ngrp < (1 << PTask); ngrp++) { maxfill = 0; for(j = 0; j < NTask; j++) { if((j ^ ngrp) < NTask) if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]) maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]; } if(maxfill >= All.BunchSizeForce) break; sendTask = ThisTask; recvTask = ThisTask ^ ngrp; if(recvTask < NTask) { if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0) { /* send the results */ MPI_Sendrecv(&GravDataResult[nbuffer[ThisTask]], nsend[recvTask * NTask + ThisTask] * sizeof(struct gravdata_in), MPI_BYTE, recvTask, TAG_GRAV_B, &GravDataOut[noffset[recvTask]], nsend_local[recvTask] * sizeof(struct gravdata_in), MPI_BYTE, recvTask, TAG_GRAV_B, MPI_COMM_WORLD, &status); /* add the result to the particles */ for(j = 0; j < nsend_local[recvTask]; j++) { place = GravDataIndexTable[noffset[recvTask] + j].Index; for(k = 0; k < 3; k++) P[place].GravAccel[k] += GravDataOut[j + noffset[recvTask]].u.Acc[k]; P[place].GravCost += GravDataOut[j + noffset[recvTask]].w.Ninteractions; } } } for(j = 0; j < NTask; j++) if((j ^ ngrp) < NTask) nbuffer[j] += nsend[(j ^ ngrp) * NTask + j]; } tend = second(); timecommsumm += timediff(tstart, tend); level = ngrp - 1; } MPI_Allgather(&ndone, 1, MPI_INT, ndonelist, 1, MPI_INT, MPI_COMM_WORLD); for(j = 0; j < NTask; j++) ntotleft -= ndonelist[j]; } free(ndonelist); free(nsend); free(nsend_local); free(nbuffer); free(noffset); // KC 10/22/14 // At this point, GravAcce[j] will contain the tree-walked force. 
If PMGRID is on, then this will // be the shortrange stuff #if defined PMGRID && defined DEBUG_NGRAVS_SHORTTREE for(i = 0; i < NumPart; ++i) fprintf(stderr, "%d\t%e\t%e\t%e\t%e\t%e\t%e\t%d\n", P[i].ID, P[i].Pos[0], P[i].Pos[1], P[i].Pos[2], P[i].GravAccel[0], P[i].GravAccel[1], P[i].GravAccel[2], P[i].Type); endrun(5555); #endif /* now add things for comoving integration */ #ifndef PERIODIC #ifndef PMGRID if(All.ComovingIntegrationOn) { fac = 0.5 * All.Hubble * All.Hubble * All.Omega0 / All.G; for(i = 0; i < NumPart; i++) if(P[i].Ti_endstep == All.Ti_Current) for(j = 0; j < 3; j++) P[i].GravAccel[j] += fac * P[i].Pos[j]; } #endif #endif for(i = 0; i < NumPart; i++) if(P[i].Ti_endstep == All.Ti_Current) { #ifdef PMGRID ax = P[i].GravAccel[0] + P[i].GravPM[0] / All.G; ay = P[i].GravAccel[1] + P[i].GravPM[1] / All.G; az = P[i].GravAccel[2] + P[i].GravPM[2] / All.G; #else ax = P[i].GravAccel[0]; ay = P[i].GravAccel[1]; az = P[i].GravAccel[2]; #endif P[i].OldAcc = sqrt(ax * ax + ay * ay + az * az); } if(All.TypeOfOpeningCriterion == 1) All.ErrTolTheta = 0; /* This will switch to the relative opening criterion for the following force computations */ /* muliply by G */ for(i = 0; i < NumPart; i++) if(P[i].Ti_endstep == All.Ti_Current) for(j = 0; j < 3; j++) P[i].GravAccel[j] *= All.G; /* Finally, the following factor allows a computation of a cosmological simulation with vacuum energy in physical coordinates */ #ifndef PERIODIC #ifndef PMGRID if(All.ComovingIntegrationOn == 0) { fac = All.OmegaLambda * All.Hubble * All.Hubble; for(i = 0; i < NumPart; i++) if(P[i].Ti_endstep == All.Ti_Current) for(j = 0; j < 3; j++) P[i].GravAccel[j] += fac * P[i].Pos[j]; } #endif #endif #ifdef SELECTIVE_NO_GRAVITY for(i = 0; i < NumPart; i++) if(P[i].Ti_endstep < 0) P[i].Ti_endstep = -P[i].Ti_endstep - 1; #endif if(ThisTask == 0) printf("tree is done.\n"); #else /* gravity is switched off */ for(i = 0; i < NumPart; i++) if(P[i].Ti_endstep == All.Ti_Current) for(j = 0; j < 3; j++) P[i].GravAccel[j] = 0; #endif /* Now the force computation is finished */ //printf("Tree force computation done."); /* gather some diagnostic information */ timetreelist = malloc(sizeof(double) * NTask); timecommlist = malloc(sizeof(double) * NTask); costtreelist = malloc(sizeof(double) * NTask); numnodeslist = malloc(sizeof(int) * NTask); ewaldlist = malloc(sizeof(double) * NTask); nrecv = malloc(sizeof(int) * NTask); numnodes = Numnodestree; MPI_Gather(&costtotal, 1, MPI_DOUBLE, costtreelist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Gather(&numnodes, 1, MPI_INT, numnodeslist, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Gather(&timetree, 1, MPI_DOUBLE, timetreelist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Gather(&timecommsumm, 1, MPI_DOUBLE, timecommlist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Gather(&NumPart, 1, MPI_INT, nrecv, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Gather(&ewaldcount, 1, MPI_DOUBLE, ewaldlist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Reduce(&nexportsum, &nexport, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(&timeimbalance, &sumimbalance, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); if(ThisTask == 0) { All.TotNumOfForces += ntot; fprintf(FdTimings, "Step= %d t= %g dt= %g \n", All.NumCurrentTiStep, All.Time, All.TimeStep); fprintf(FdTimings, "Nf= %d%09d total-Nf= %d%09d ex-frac= %g iter= %d\n", (int) (ntot / 1000000000), (int) (ntot % 1000000000), (int) (All.TotNumOfForces / 1000000000), (int) (All.TotNumOfForces % 1000000000), nexport / ((double) ntot), iter); /* note: on Linux, the 8-byte integer could be printed with the 
format identifier "%qd", but doesn't work on AIX */ fac = NTask / ((double) All.TotNumPart); for(i = 0, maxt = timetreelist[0], sumt = 0, plb_max = 0, maxnumnodes = 0, costtotal = 0, sumcomm = 0, ewaldtot = 0; i < NTask; i++) { costtotal += costtreelist[i]; sumcomm += timecommlist[i]; if(maxt < timetreelist[i]) maxt = timetreelist[i]; sumt += timetreelist[i]; plb = nrecv[i] * fac; if(plb > plb_max) plb_max = plb; if(numnodeslist[i] > maxnumnodes) maxnumnodes = numnodeslist[i]; ewaldtot += ewaldlist[i]; } fprintf(FdTimings, "work-load balance: %g max=%g avg=%g PE0=%g\n", maxt / (sumt / NTask), maxt, sumt / NTask, timetreelist[0]); fprintf(FdTimings, "particle-load balance: %g\n", plb_max); fprintf(FdTimings, "max. nodes: %d, filled: %g\n", maxnumnodes, maxnumnodes / (All.TreeAllocFactor * All.MaxPart)); fprintf(FdTimings, "part/sec=%g | %g ia/part=%g (%g)\n", ntot / (sumt + 1.0e-20), ntot / (maxt * NTask), ((double) (costtotal)) / ntot, ((double) ewaldtot) / ntot); fprintf(FdTimings, "\n"); fflush(FdTimings); All.CPU_TreeWalk += sumt / NTask; All.CPU_Imbalance += sumimbalance / NTask; All.CPU_CommSum += sumcomm / NTask; } free(nrecv); free(ewaldlist); free(numnodeslist); free(costtreelist); free(timecommlist); free(timetreelist); }
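
/* A minimal sketch (illustrative only, not compiled in) of the pairwise
 * exchange pattern used in the export loops above: for each value of ngrp,
 * task t is paired with task t ^ ngrp in an MPI_Sendrecv, and partners with
 * index >= NTask are skipped. Because the XOR pairing is symmetric, every
 * unordered pair of tasks meets exactly once over ngrp = 1 .. 2^PTask - 1.
 * The function name is not part of GADGET. */
#if 0
#include <stdio.h>

static void show_xor_pairing(int NTask, int PTask)
{
  int ngrp, task, partner;

  /* e.g. show_xor_pairing(6, 3), since PTask is chosen with 2^PTask >= NTask */
  for(ngrp = 1; ngrp < (1 << PTask); ngrp++)
    for(task = 0; task < NTask; task++)
      {
	partner = task ^ ngrp;

	if(partner < NTask && task < partner)	/* print each pair once */
	  printf("ngrp=%d: task %d <-> task %d\n", ngrp, task, partner);
      }
}
#endif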
/*! This function computes the gravitational potential for ALL the particles. * First, the (short-range) tree potential is computed, and then, if needed, * the long range PM potential is added. */ void compute_potential(void) { int i; #ifndef NOGRAVITY int j, k, ret, sendTask, recvTask; int ndone, ndone_flag, dummy; int ngrp, place, nexport, nimport; double fac; MPI_Status status; double r2; if(All.ComovingIntegrationOn) set_softenings(); if(ThisTask == 0) { printf("Start computation of potential for all particles...\n"); fflush(stdout); } CPU_Step[CPU_MISC] += measure_time(); if(TreeReconstructFlag) { if(ThisTask == 0) printf("Tree construction.\n"); CPU_Step[CPU_MISC] += measure_time(); #if defined(SFR) || defined(BLACK_HOLES) rearrange_particle_sequence(); #endif force_treebuild(NumPart, NULL); CPU_Step[CPU_TREEBUILD] += measure_time(); TreeReconstructFlag = 0; if(ThisTask == 0) printf("Tree construction done.\n"); } /* allocate buffers to arrange communication */ All.BunchSize = (int) ((All.BufferSize * 1024 * 1024) / (sizeof(struct data_index) + sizeof(struct data_nodelist) + sizeof(struct gravdata_in) + sizeof(struct potdata_out) + sizemax(sizeof(struct gravdata_in), sizeof(struct potdata_out)))); DataIndexTable = (struct data_index *) mymalloc(All.BunchSize * sizeof(struct data_index)); DataNodeList = (struct data_nodelist *) mymalloc(All.BunchSize * sizeof(struct data_nodelist)); for(i = 0; i < NumPart; i++) if(P[i].Ti_current != All.Ti_Current) drift_particle(i, All.Ti_Current); i = 0; /* beginn with this index */ do { for(j = 0; j < NTask; j++) { Send_count[j] = 0; Exportflag[j] = -1; } /* do local particles and prepare export list */ for(nexport = 0; i < NumPart; i++) { #ifndef PMGRID ret = force_treeevaluate_potential(i, 0, &nexport, Send_count); #else ret = force_treeevaluate_potential_shortrange(i, 0, &nexport, Send_count); #endif if(ret < 0) break; /* export buffer has filled up */ } #ifdef MYSORT mysort_dataindex(DataIndexTable, nexport, sizeof(struct data_index), data_index_compare); #else qsort(DataIndexTable, nexport, sizeof(struct data_index), data_index_compare); #endif MPI_Allgather(Send_count, NTask, MPI_INT, Sendcount_matrix, NTask, MPI_INT, MPI_COMM_WORLD); for(j = 0, nimport = 0, Recv_offset[0] = 0, Send_offset[0] = 0; j < NTask; j++) { Recv_count[j] = Sendcount_matrix[j * NTask + ThisTask]; nimport += Recv_count[j]; if(j > 0) { Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1]; Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1]; } } GravDataGet = (struct gravdata_in *) mymalloc(nimport * sizeof(struct gravdata_in)); GravDataIn = (struct gravdata_in *) mymalloc(nexport * sizeof(struct gravdata_in)); /* prepare particle data for export */ for(j = 0; j < nexport; j++) { place = DataIndexTable[j].Index; for(k = 0; k < 3; k++) GravDataIn[j].Pos[k] = P[place].Pos[k]; #ifdef UNEQUALSOFTENINGS GravDataIn[j].Type = P[place].Type; #ifdef ADAPTIVE_GRAVSOFT_FORGAS if(P[place].Type == 0) GravDataIn[j].Soft = SphP[place].Hsml; #endif #endif GravDataIn[j].OldAcc = P[place].OldAcc; for(k = 0; k < NODELISTLENGTH; k++) GravDataIn[j].NodeList[k] = DataNodeList[DataIndexTable[j].IndexGet].NodeList[k]; } /* exchange particle data */ for(ngrp = 1; ngrp < (1 << PTask); ngrp++) { sendTask = ThisTask; recvTask = ThisTask ^ ngrp; if(recvTask < NTask) { if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0) { /* get the particles */ MPI_Sendrecv(&GravDataIn[Send_offset[recvTask]], Send_count[recvTask] * sizeof(struct gravdata_in), MPI_BYTE, recvTask, TAG_POTENTIAL_A, 
&GravDataGet[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(struct gravdata_in), MPI_BYTE, recvTask, TAG_POTENTIAL_A, MPI_COMM_WORLD, &status); } } } myfree(GravDataIn); PotDataResult = (struct potdata_out *) mymalloc(nimport * sizeof(struct potdata_out)); PotDataOut = (struct potdata_out *) mymalloc(nexport * sizeof(struct potdata_out)); /* now do the particles that were sent to us */ for(j = 0; j < nimport; j++) { #ifndef PMGRID force_treeevaluate_potential(j, 1, &dummy, &dummy); #else force_treeevaluate_potential_shortrange(j, 1, &dummy, &dummy); #endif } if(i >= NumPart) ndone_flag = 1; else ndone_flag = 0; MPI_Allreduce(&ndone_flag, &ndone, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); /* get the result */ for(ngrp = 1; ngrp < (1 << PTask); ngrp++) { sendTask = ThisTask; recvTask = ThisTask ^ ngrp; if(recvTask < NTask) { if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0) { /* send the results */ MPI_Sendrecv(&PotDataResult[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(struct potdata_out), MPI_BYTE, recvTask, TAG_POTENTIAL_B, &PotDataOut[Send_offset[recvTask]], Send_count[recvTask] * sizeof(struct potdata_out), MPI_BYTE, recvTask, TAG_POTENTIAL_B, MPI_COMM_WORLD, &status); } } } /* add the results to the local particles */ for(j = 0; j < nexport; j++) { place = DataIndexTable[j].Index; P[place].p.dPotential += PotDataOut[j].Potential; } myfree(PotDataOut); myfree(PotDataResult); myfree(GravDataGet); } while(ndone < NTask); myfree(DataNodeList); myfree(DataIndexTable); /* add correction to exclude self-potential */ for(i = 0; i < NumPart; i++) { #ifdef FLTROUNDOFFREDUCTION P[i].p.Potential = FLT(P[i].p.dPotential); #endif /* remove self-potential */ P[i].p.Potential += P[i].Mass / All.SofteningTable[P[i].Type]; if(All.ComovingIntegrationOn) if(All.PeriodicBoundariesOn) P[i].p.Potential -= 2.8372975 * pow(P[i].Mass, 2.0 / 3) * pow(All.Omega0 * 3 * All.Hubble * All.Hubble / (8 * M_PI * All.G), 1.0 / 3); } /* multiply with the gravitational constant */ for(i = 0; i < NumPart; i++) P[i].p.Potential *= All.G; #ifdef PMGRID #ifdef PERIODIC pmpotential_periodic(); #ifdef PLACEHIGHRESREGION i = pmpotential_nonperiodic(1); if(i == 1) /* this is returned if a particle lied outside allowed range */ { pm_init_regionsize(); pm_setup_nonperiodic_kernel(); i = pmpotential_nonperiodic(1); /* try again */ } if(i == 1) endrun(88686); #endif #else i = pmpotential_nonperiodic(0); if(i == 1) /* this is returned if a particle lied outside allowed range */ { pm_init_regionsize(); pm_setup_nonperiodic_kernel(); i = pmpotential_nonperiodic(0); /* try again */ } if(i == 1) endrun(88687); #ifdef PLACEHIGHRESREGION i = pmpotential_nonperiodic(1); if(i == 1) /* this is returned if a particle lied outside allowed range */ { pm_init_regionsize(); i = pmpotential_nonperiodic(1); } if(i != 0) endrun(88688); #endif #endif #endif if(All.ComovingIntegrationOn) { #ifndef PERIODIC fac = -0.5 * All.Omega0 * All.Hubble * All.Hubble; for(i = 0; i < NumPart; i++) { for(k = 0, r2 = 0; k < 3; k++) r2 += P[i].Pos[k] * P[i].Pos[k]; P[i].p.Potential += fac * r2; } #endif } else { fac = -0.5 * All.OmegaLambda * All.Hubble * All.Hubble; if(fac != 0) { for(i = 0; i < NumPart; i++) { for(k = 0, r2 = 0; k < 3; k++) r2 += P[i].Pos[k] * P[i].Pos[k]; P[i].p.Potential += fac * r2; } } } if(ThisTask == 0) { printf("potential done.\n"); fflush(stdout); } #else for(i = 0; i < NumPart; i++) P[i].Potential = 0; #endif CPU_Step[CPU_POTENTIAL] += measure_time(); }
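
/* A minimal sketch (illustrative only, not compiled in) of how the export
 * buffer capacity All.BunchSize is derived above: the BufferSize parameter
 * (in MB) is divided by the per-particle cost of one index entry, one
 * node-list entry, one input record, one output record, plus max(in, out)
 * of scratch space. The struct sizes passed in here are placeholders, not
 * the real GADGET layouts, and the function names are illustrative. */
#if 0
#include <stddef.h>

static size_t sizemax_demo(size_t a, size_t b)
{
  return a > b ? a : b;
}

static int bunch_size_demo(int buffer_size_mb,
			   size_t sz_index, size_t sz_nodelist,
			   size_t sz_in, size_t sz_out)
{
  size_t per_particle = sz_index + sz_nodelist + sz_in + sz_out +
			sizemax_demo(sz_in, sz_out);

  return (int) (((size_t) buffer_size_mb * 1024 * 1024) / per_particle);
}
#endif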
/*! This function computes the gravitational potential for ALL the particles. * First, the (short-range) tree potential is computed, and then, if needed, * the long range PM potential is added. */ void compute_potential(void) { int i; #ifndef NOGRAVITY long long ntot, ntotleft; int j, k, level, sendTask, recvTask; int ndone; int maxfill, ngrp, place, nexport; int *nsend, *noffset, *nsend_local, *nbuffer, *ndonelist, *numlist; double fac; double t0, t1, tstart, tend; #ifndef NOMPI MPI_Status status; #endif double r2; t0 = second(); if(All.ComovingIntegrationOn) set_softenings(); if(ThisTask == 0) { printf("Start computation of potential for all particles...\n"); fflush(stdout); } tstart = second(); if(TreeReconstructFlag) { if(ThisTask == 0) printf("Tree construction.\n"); force_treebuild(NumPart); TreeReconstructFlag = 0; if(ThisTask == 0) printf("Tree construction done.\n"); } tend = second(); All.CPU_TreeConstruction += timediff(tstart, tend); numlist = malloc(NTask * sizeof(int) * NTask); #ifndef NOMPI MPI_Allgather(&NumPart, 1, MPI_INT, numlist, 1, MPI_INT, GADGET_WORLD); #else numlist[0] = NumPart; #endif for(i = 0, ntot = 0; i < NTask; i++) ntot += numlist[i]; free(numlist); noffset = malloc(sizeof(int) * NTask); /* offsets of bunches in common list */ nbuffer = malloc(sizeof(int) * NTask); nsend_local = malloc(sizeof(int) * NTask); nsend = malloc(sizeof(int) * NTask * NTask); ndonelist = malloc(sizeof(int) * NTask); i = 0; /* beginn with this index */ ntotleft = ntot; /* particles left for all tasks together */ while(ntotleft > 0) { for(j = 0; j < NTask; j++) nsend_local[j] = 0; /* do local particles and prepare export list */ for(nexport = 0, ndone = 0; i < NumPart && nexport < All.BunchSizeForce - NTask; i++) { ndone++; for(j = 0; j < NTask; j++) Exportflag[j] = 0; #ifndef PMGRID force_treeevaluate_potential(i, 0); #else force_treeevaluate_potential_shortrange(i, 0); #endif for(j = 0; j < NTask; j++) { if(Exportflag[j]) { for(k = 0; k < 3; k++) GravDataGet[nexport].u.Pos[k] = P[i].Pos[k]; #ifdef UNEQUALSOFTENINGS GravDataGet[nexport].Type = P[i].Type; #ifdef ADAPTIVE_GRAVSOFT_FORGAS if(P[i].Type == 0) GravDataGet[nexport].Soft = SphP[i].Hsml; #endif #endif GravDataGet[nexport].w.OldAcc = P[i].OldAcc; GravDataIndexTable[nexport].Task = j; GravDataIndexTable[nexport].Index = i; GravDataIndexTable[nexport].SortIndex = nexport; nexport++; nsend_local[j]++; } } } qsort(GravDataIndexTable, nexport, sizeof(struct gravdata_index), grav_tree_compare_key); for(j = 0; j < nexport; j++) GravDataIn[j] = GravDataGet[GravDataIndexTable[j].SortIndex]; for(j = 1, noffset[0] = 0; j < NTask; j++) noffset[j] = noffset[j - 1] + nsend_local[j - 1]; #ifndef NOMPI MPI_Allgather(nsend_local, NTask, MPI_INT, nsend, NTask, MPI_INT, GADGET_WORLD); #else nsend[0] = nsend_local[0]; #endif /* now do the particles that need to be exported */ for(level = 1; level < (1 << PTask); level++) { for(j = 0; j < NTask; j++) nbuffer[j] = 0; for(ngrp = level; ngrp < (1 << PTask); ngrp++) { maxfill = 0; for(j = 0; j < NTask; j++) { if((j ^ ngrp) < NTask) if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]) maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]; } if(maxfill >= All.BunchSizeForce) break; sendTask = ThisTask; recvTask = ThisTask ^ ngrp; #ifndef NOMPI if(recvTask < NTask) { if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0) { /* get the particles */ MPI_Sendrecv(&GravDataIn[noffset[recvTask]], nsend_local[recvTask] * sizeof(struct gravdata_in), MPI_BYTE, recvTask, 
TAG_POTENTIAL_A, &GravDataGet[nbuffer[ThisTask]], nsend[recvTask * NTask + ThisTask] * sizeof(struct gravdata_in), MPI_BYTE, recvTask, TAG_POTENTIAL_A, GADGET_WORLD, &status); } } #endif for(j = 0; j < NTask; j++) if((j ^ ngrp) < NTask) nbuffer[j] += nsend[(j ^ ngrp) * NTask + j]; } for(j = 0; j < nbuffer[ThisTask]; j++) { #ifndef PMGRID force_treeevaluate_potential(j, 1); #else force_treeevaluate_potential_shortrange(j, 1); #endif } /* get the result */ for(j = 0; j < NTask; j++) nbuffer[j] = 0; for(ngrp = level; ngrp < (1 << PTask); ngrp++) { maxfill = 0; for(j = 0; j < NTask; j++) { if((j ^ ngrp) < NTask) if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]) maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]; } if(maxfill >= All.BunchSizeForce) break; sendTask = ThisTask; recvTask = ThisTask ^ ngrp; #ifndef NOMPI if(recvTask < NTask) { if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0) { /* send the results */ MPI_Sendrecv(&GravDataResult[nbuffer[ThisTask]], nsend[recvTask * NTask + ThisTask] * sizeof(struct gravdata_in), MPI_BYTE, recvTask, TAG_POTENTIAL_B, &GravDataOut[noffset[recvTask]], nsend_local[recvTask] * sizeof(struct gravdata_in), MPI_BYTE, recvTask, TAG_POTENTIAL_B, GADGET_WORLD, &status); /* add the result to the particles */ for(j = 0; j < nsend_local[recvTask]; j++) { place = GravDataIndexTable[noffset[recvTask] + j].Index; P[place].Potential += GravDataOut[j + noffset[recvTask]].u.Potential; } } } #endif for(j = 0; j < NTask; j++) if((j ^ ngrp) < NTask) nbuffer[j] += nsend[(j ^ ngrp) * NTask + j]; } level = ngrp - 1; } #ifndef NOMPI MPI_Allgather(&ndone, 1, MPI_INT, ndonelist, 1, MPI_INT, GADGET_WORLD); #else ndonelist[0] = ndone; #endif // NOMPI for(j = 0; j < NTask; j++) ntotleft -= ndonelist[j]; } free(ndonelist); free(nsend); free(nsend_local); free(nbuffer); free(noffset); /* add correction to exclude self-potential */ for(i = 0; i < NumPart; i++) { /* remove self-potential */ P[i].Potential += P[i].Mass / All.SofteningTable[P[i].Type]; if(All.ComovingIntegrationOn) if(All.PeriodicBoundariesOn) P[i].Potential -= 2.8372975 * pow(P[i].Mass, 2.0 / 3) * pow(All.Omega0 * 3 * All.Hubble * All.Hubble / (8 * M_PI * All.G), 1.0 / 3); } /* multiply with the gravitational constant */ for(i = 0; i < NumPart; i++) P[i].Potential *= All.G; #ifdef PMGRID #ifdef PERIODIC pmpotential_periodic(); #ifdef PLACEHIGHRESREGION i = pmpotential_nonperiodic(1); if(i == 1) /* this is returned if a particle lied outside allowed range */ { pm_init_regionsize(); pm_setup_nonperiodic_kernel(); i = pmpotential_nonperiodic(1); /* try again */ } if(i == 1) endrun(88686); #endif #else i = pmpotential_nonperiodic(0); if(i == 1) /* this is returned if a particle lied outside allowed range */ { pm_init_regionsize(); pm_setup_nonperiodic_kernel(); i = pmpotential_nonperiodic(0); /* try again */ } if(i == 1) endrun(88687); #ifdef PLACEHIGHRESREGION i = pmpotential_nonperiodic(1); if(i == 1) /* this is returned if a particle lied outside allowed range */ { pm_init_regionsize(); i = pmpotential_nonperiodic(1); } if(i != 0) endrun(88688); #endif #endif #endif if(All.ComovingIntegrationOn) { #ifndef PERIODIC fac = -0.5 * All.Omega0 * All.Hubble * All.Hubble; for(i = 0; i < NumPart; i++) { for(k = 0, r2 = 0; k < 3; k++) r2 += P[i].Pos[k] * P[i].Pos[k]; P[i].Potential += fac * r2; } #endif } else { fac = -0.5 * All.OmegaLambda * All.Hubble * All.Hubble; if(fac != 0) { for(i = 0; i < NumPart; i++) { for(k = 0, r2 = 0; k < 3; k++) r2 += P[i].Pos[k] * P[i].Pos[k]; 
P[i].Potential += fac * r2; } } } if(ThisTask == 0) { printf("potential done.\n"); fflush(stdout); } t1 = second(); All.CPU_Potential += timediff(t0, t1); #else for(i = 0; i < NumPart; i++) P[i].Potential = 0; #endif }
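
/* A minimal sketch (illustrative only, not compiled in) of the quadratic
 * correction added to the potential at the end of compute_potential():
 * comoving, non-periodic runs use Omega0, while runs in physical
 * coordinates with vacuum energy use OmegaLambda; in both cases the term
 * is -0.5 * Omega * H^2 * r^2. Here 'hubble' stands for All.Hubble in
 * internal units, and the function name is not part of GADGET. */
#if 0
static double potential_correction_demo(const double pos[3],
					double omega, double hubble)
{
  double r2 = pos[0] * pos[0] + pos[1] * pos[1] + pos[2] * pos[2];

  return -0.5 * omega * hubble * hubble * r2;	/* added to P[i].Potential */
}
#endif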
/* This function computes the gravitational potential for ALL the particles.
 * It expects that the particles are predicted to the current time.
 * The routine constructs a new force-tree.
 */
void compute_potential()
{
#ifdef GRAPE
  compute_potential_grape();
  return;
#else
  double t0, t1;
  int i, j;
  double r2, fac;

  t0 = second();

  if(All.ComovingIntegrationOn)
    set_softenings();

  printf("Start potential computation...");
  fflush(stdout);

  force_treebuild();
  All.NumForcesSinceLastTreeConstruction = All.TreeUpdateFrequency * All.TotNumPart;	/* ensures that a new tree will be constructed next time */

  for(i = 1; i <= NumPart; i++)
    {
      force_treeevaluate_potential(i - 1);
      P[i].Potential += P[i].Mass / All.SofteningTable[P[i].Type];	/* removes self energy */
    }

  if(All.ComovingIntegrationOn)
    {
      fac = 0.5 * All.Omega0 * All.Hubble * All.Hubble;

      for(i = 1; i <= NumPart; i++)
	{
#ifdef PERIODIC
	  P[i].Potential = All.G * P[i].Potential;
#else
	  for(j = 0, r2 = 0; j < 3; j++)
	    r2 += P[i].PosPred[j] * P[i].PosPred[j];

	  P[i].Potential = All.G * P[i].Potential - fac * r2;
#endif
	}
    }
  else
    {
      fac = -0.5 * All.OmegaLambda * All.Hubble * All.Hubble;

      for(i = 1; i <= NumPart; i++)
	{
	  P[i].Potential *= All.G;

	  if(fac != 0)
	    {
	      for(j = 0, r2 = 0; j < 3; j++)
		r2 += P[i].PosPred[j] * P[i].PosPred[j];

	      P[i].Potential += fac * r2;
	    }
	}
    }

  printf("done.\n");
  fflush(stdout);

  t1 = second();
  All.CPU_Potential += timediff(t0, t1);
#endif
}
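
/* A minimal sketch (illustrative only, not compiled in) of the self-energy
 * removal done in the loop above, assuming the usual GADGET convention that
 * SofteningTable[] holds the Plummer-equivalent softening eps, with spline
 * support h = 2.8 * eps: the tree sum then contains the particle's own
 * softened contribution -m/eps, which the added term m/eps cancels. The
 * function name is not part of GADGET. */
#if 0
static double remove_self_potential_demo(double pot_from_tree,
					 double mass, double eps)
{
  return pot_from_tree + mass / eps;	/* cancels the -mass/eps self-term */
}
#endif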
/*! This function computes the gravitational forces for all active * particles. If needed, a new tree is constructed, otherwise the * dynamically updated tree is used. Particles are only exported to other * processors when really needed, thereby allowing a good use of the * communication buffer. */ void gravity_tree(void) { int tim=20; // GX mod, timer to profile calls TimerBeg(29); TimerBeg(tim); long long ntot; int numnodes, nexportsum = 0; int i, j, iter = 0; int *numnodeslist, maxnumnodes, nexport, *numlist, *nrecv, *ndonelist; double tstart, tend, timetree = 0, timecommsumm = 0, timeimbalance = 0, sumimbalance; double ewaldcount; double costtotal, ewaldtot, *costtreelist, *ewaldlist; double maxt, sumt, *timetreelist, *timecommlist; double fac, plb, plb_max, sumcomm; #ifndef NOGRAVITY int *noffset, *nbuffer, *nsend, *nsend_local; long long ntotleft; int ndone,maxfill, ngrp; int k, place; int level, sendTask, recvTask; double ax, ay, az; MPI_Status status; #endif ///////////////// GX ////////////////////// int totdone=0; #if CUDA_DEBUG_GX>0 int not_timestepped_gx=0; int exporthash_gx=0; int count_exported_gx=0; #endif ///////////////// GX ////////////////////// /* set new softening lengths */ if(All.ComovingIntegrationOn) set_softenings(); /* contruct tree if needed */ tstart = second(); if(TreeReconstructFlag) { if(ThisTask == 0) printf("Tree construction.\n"); force_treebuild(NumPart); TreeReconstructFlag = 0; if(ThisTask == 0) printf("Tree construction done.\n"); } tend = second(); All.CPU_TreeConstruction += timediff(tstart, tend); costtotal = ewaldcount = 0; /* Note: 'NumForceUpdate' has already been determined in find_next_sync_point_and_drift() */ numlist = malloc(NTask * sizeof(int) * NTask); MPI_Allgather(&NumForceUpdate, 1, MPI_INT, numlist, 1, MPI_INT, MPI_COMM_WORLD); for(i = 0, ntot = 0; i < NTask; i++) ntot += numlist[i]; free(numlist); #ifndef NOGRAVITY if(ThisTask == 0) printf("Begin tree force.\n"); #ifdef SELECTIVE_NO_GRAVITY for(i = 0; i < NumPart; i++) if(((1 << P[i].Type) & (SELECTIVE_NO_GRAVITY))) P[i].Ti_endstep = -P[i].Ti_endstep - 1; #endif noffset = malloc(sizeof(int) * NTask); /* offsets of bunches in common list */ nbuffer = malloc(sizeof(int) * NTask); nsend_local = malloc(sizeof(int) * NTask); nsend = malloc(sizeof(int) * NTask * NTask); ndonelist = malloc(sizeof(int) * NTask); i = 0; /* begin with this index */ ntotleft = ntot; /* particles left for all tasks together */ TimerEnd(tim++); ///////////////// GX ////////////////////// // if (s_gx.cudamode>0 && All.MaxPart>1400000) TimersSleep(10); // GPU card runs hot on large sims, this is around N_p=1404928 // if (s_gx.cudamode>0) TimersSleep(10); TimerBeg(tim); double starttime,subtime=-1,cpytime=-1; int Np=-1; int buffered=0; if(s_gx.cudamode>0) { FUN_MESSAGE(2,"gravity_tree()"); TimerBeg(50); cpytime=GetTime(); Np=InitializeProlog_gx(NumPart); TimerEnd(50); cpytime=GetTime()-cpytime; } ///////////////// GX ////////////////////// while(ntotleft > 0) { TimerBeg(31); starttime=GetTime(); iter++; for(j = 0; j < NTask; j++) nsend_local[j] = 0; /* do local particles and prepare export list */ tstart = second(); if (s_gx.cudamode==0 || Np<MIN_FORCE_PARTICLES_FOR_GPU_GX) { ASSERT_GX( !buffered ); ReLaunchChunkManager(); for(nexport = 0, ndone = 0; i < NumPart && nexport < All.BunchSizeForce - NTask; i++) { if(P[i].Ti_endstep == All.Ti_Current) { ndone++; for(j = 0; j < NTask; j++) Exportflag[j] = 0; TimerUpdateCounter(31,1); #ifndef PMGRID costtotal += force_treeevaluate(i, 0, &ewaldcount); #else costtotal += 
force_treeevaluate_shortrange(i, 0 ); #endif #if CUDA_DEBUG_GX>0 int flagexported_gx=0; #endif for(j = 0; j < NTask; j++) { if(Exportflag[j]) { ASSERT_GX( NTask>1 ); #if CUDA_DEBUG_GX>0 flagexported_gx=1; exporthash_gx += (i-j)*(j+ThisTask+1); #endif for(k = 0; k < 3; k++) GravDataGet[nexport].u.Pos[k] = P[i].Pos[k]; #ifdef UNEQUALSOFTENINGS GravDataGet[nexport].Type = P[i].Type; #ifdef ADAPTIVE_GRAVSOFT_FORGAS if(P[i].Type == 0) GravDataGet[nexport].Soft = SphP[i].Hsml; #endif #endif GravDataGet[nexport].w.OldAcc = P[i].OldAcc; GravDataIndexTable[nexport].Task = j; GravDataIndexTable[nexport].Index = i; GravDataIndexTable[nexport].SortIndex = nexport; nexport++; nexportsum++; nsend_local[j]++; } } #if CUDA_DEBUG_GX>0 if (flagexported_gx) ++count_exported_gx; #endif } #if CUDA_DEBUG_GX>0 else ++not_timestepped_gx; #endif } ManageChuncks(0); } else { ///////////////// GX ////////////////////// // cudamode>0 ///////////////// GX ////////////////////// #ifndef PMGRID // WARNING Attemping to run in tree-only mode, examine results carefully // ERROR cannot run in non PMGRID mode #endif if (iter==1){ const double tx=GetTime(); TimerBeg(51); ASSERT_GX(NumPart>=i); ASSERT_GX(!buffered); if (iter!=1) ERROR("cuda mode does not support iterations in gravtree calc, try to increasing the 'BufferSize' in the parameter file to surcomevent this problem"); const int Np2=InitializeCalculation_gx(NumPart,P,0); ASSERT_GX( Np2==Np ); if (Np2==0) WARNING("no particles participate in this timestep"); TimerEnd(51); cpytime += GetTime() - tx; subtime=GetTime(); TimerBeg(52); force_treeevaluate_shortrange_range_gx(0, Np); buffered=1; TimerUpdateCounter(31,NumPart-i); TimerEnd(52); subtime = GetTime() - subtime; } else { cpytime=-1; subtime=-1; ASSERT_GX(buffered); } for(nexport = 0, ndone = 0; i < NumPart && nexport < All.BunchSizeForce - NTask; i++) { if(P[i].Ti_endstep == All.Ti_Current) { ndone++; ASSERT_GX( i<NumPart ); ASSERT_GX( buffered ); const struct result_gx r=GetTarget(totdone++,i); // s_gx.result[target]; P[i].GravAccel[0] = r.acc_x; P[i].GravAccel[1] = r.acc_y; P[i].GravAccel[2] = r.acc_z; P[i].GravCost = r.ninteractions; costtotal += r.ninteractions; if (s_gx.NTask>1) { #if CUDA_DEBUG_GX>0 int flagexported_gx=0; #endif for(j = 0; j < NTask; j++) { if (GetExportflag_gx(&s_gx,i,NTask,j)){ ASSERT_GX( NTask>1 ); #if CUDA_DEBUG_GX>0 flagexported_gx=1; exporthash_gx += (i-j)*(j+ThisTask+1); #endif for(k = 0; k < 3; k++) GravDataGet[nexport].u.Pos[k] = P[i].Pos[k]; #ifdef UNEQUALSOFTENINGS GravDataGet[nexport].Type = P[i].Type; #ifdef ADAPTIVE_GRAVSOFT_FORGAS if(P[i].Type == 0) GravDataGet[nexport].Soft = SphP[i].Hsml; #endif #endif GravDataGet[nexport].w.OldAcc = P[i].OldAcc; GravDataIndexTable[nexport].Task = j; GravDataIndexTable[nexport].Index = i; GravDataIndexTable[nexport].SortIndex = nexport; nexport++; nexportsum++; nsend_local[j]++; } } #if CUDA_DEBUG_GX>0 if (flagexported_gx) ++count_exported_gx; #endif } } #if CUDA_DEBUG_GX>0 else ++not_timestepped_gx; #endif } AssertsOnhasGadgetDataBeenModified_gx(0,1,0); } TimerEnd(31); ///////////////// GX ////////////////////// if (iter==1 || !buffered){ PrintInfoFinalize(s_gx,ndone,Np,starttime,cpytime,subtime,0,iter,-1 #if CUDA_DEBUG_GX>0 ,not_timestepped_gx,count_exported_gx,nexport,nexportsum,exporthash_gx,costtotal #else ,0,0,0,0,0,0 #endif ); subtime=-1; } TimerBeg(39); ///////////////// GX ////////////////////// tend = second(); timetree += timediff(tstart, tend); qsort(GravDataIndexTable, nexport, sizeof(struct gravdata_index), 
grav_tree_compare_key); for(j = 0; j < nexport; j++) GravDataIn[j] = GravDataGet[GravDataIndexTable[j].SortIndex]; for(j = 1, noffset[0] = 0; j < NTask; j++) noffset[j] = noffset[j - 1] + nsend_local[j - 1]; tstart = second(); MPI_Allgather(nsend_local, NTask, MPI_INT, nsend, NTask, MPI_INT, MPI_COMM_WORLD); tend = second(); timeimbalance += timediff(tstart, tend); /* now do the particles that need to be exported */ for(level = 1; level < (1 << PTask); level++) { tstart = second(); for(j = 0; j < NTask; j++) nbuffer[j] = 0; for(ngrp = level; ngrp < (1 << PTask); ngrp++) { maxfill = 0; for(j = 0; j < NTask; j++) { if((j ^ ngrp) < NTask) if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]) maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]; } if(maxfill >= All.BunchSizeForce) break; sendTask = ThisTask; recvTask = ThisTask ^ ngrp; if(recvTask < NTask) { if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0) { /* get the particles */ MPI_Sendrecv(&GravDataIn[noffset[recvTask]], nsend_local[recvTask] * sizeof(struct gravdata_in), MPI_BYTE, recvTask, TAG_GRAV_A, &GravDataGet[nbuffer[ThisTask]], nsend[recvTask * NTask + ThisTask] * sizeof(struct gravdata_in), MPI_BYTE, recvTask, TAG_GRAV_A, MPI_COMM_WORLD, &status); } } for(j = 0; j < NTask; j++) if((j ^ ngrp) < NTask) nbuffer[j] += nsend[(j ^ ngrp) * NTask + j]; } tend = second(); timecommsumm += timediff(tstart, tend); TimerBeg(30); TimerUpdateCounter(30,nbuffer[ThisTask]); tstart = second(); ///////////////// GX ////////////////////// // Do exported particles on the CPU/GPU { AssertsOnhasGadgetDataBeenModified_gx(1,1,0); #if CUDA_DEBUG_GX>1 MESSAGE("INFO: DistRMSGrav=%g",DistRMSGravdata(nbuffer[ThisTask],GravDataGet)); #endif starttime=GetTime(); const int N=nbuffer[ThisTask]; if (N>0){ if (s_gx.cudamode==0 || N<MIN_FORCE_PARTICLES_FOR_GPU_GX || Np<MIN_FORCE_PARTICLES_FOR_GPU_GX) { ReLaunchChunkManager(); for(j = 0; j<N ; j++) { #ifndef PMGRID costtotal += force_treeevaluate(j, 1, &ewaldcount); #else costtotal += force_treeevaluate_shortrange(j, 1); #endif } ManageChuncks(0); } else { ASSERT_GX( buffered ); cpytime=GetTime(); InitializeExportCalculation_gx(N,P[0].Type); ASSERT_GX( N==s_gx.Np ); subtime=GetTime(); force_treeevaluate_shortrange_range_gx(1, N); subtime=GetTime()-subtime; costtotal += FinalizeExportCalculation_gx(N); cpytime=GetTime()-cpytime-subtime; ASSERT_GX( N==s_gx.Np ); } PrintInfoFinalize(s_gx,0,N,starttime,cpytime,subtime,2,iter,level,0,0,nexport,0,0,0); subtime=-1; } else { ReLaunchChunkManager(); ManageChuncks(0); } } ///////////////// GX ////////////////////// if (nbuffer[ThisTask]>0) TimerUpdateCounter(30,-1); TimerEnd(30); tend = second(); timetree += timediff(tstart, tend); TimerBeg(33); tstart = second(); MPI_Barrier(MPI_COMM_WORLD); tend = second(); timeimbalance += timediff(tstart, tend); TimerEnd(33); /* get the result */ tstart = second(); for(j = 0; j < NTask; j++) nbuffer[j] = 0; for(ngrp = level; ngrp < (1 << PTask); ngrp++) { maxfill = 0; for(j = 0; j < NTask; j++) { if((j ^ ngrp) < NTask) if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]) maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]; } if(maxfill >= All.BunchSizeForce) break; sendTask = ThisTask; recvTask = ThisTask ^ ngrp; if(recvTask < NTask) { if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0) { /* send the results */ MPI_Sendrecv(&GravDataResult[nbuffer[ThisTask]], nsend[recvTask * NTask + ThisTask] * sizeof(struct gravdata_in), MPI_BYTE, recvTask, TAG_GRAV_B, 
&GravDataOut[noffset[recvTask]], nsend_local[recvTask] * sizeof(struct gravdata_in), MPI_BYTE, recvTask, TAG_GRAV_B, MPI_COMM_WORLD, &status); /* add the result to the particles */ for(j = 0; j < nsend_local[recvTask]; j++) { place = GravDataIndexTable[noffset[recvTask] + j].Index; // comment out in order to disable export forces for debugging for(k = 0; k < 3; k++) P[place].GravAccel[k] += GravDataOut[j + noffset[recvTask]].u.Acc[k]; P[place].GravCost += GravDataOut[j + noffset[recvTask]].w.Ninteractions; } } } for(j = 0; j < NTask; j++) if((j ^ ngrp) < NTask) nbuffer[j] += nsend[(j ^ ngrp) * NTask + j]; } tend = second(); timecommsumm += timediff(tstart, tend); level = ngrp - 1; } MPI_Allgather(&ndone, 1, MPI_INT, ndonelist, 1, MPI_INT, MPI_COMM_WORLD); for(j = 0; j < NTask; j++) ntotleft -= ndonelist[j]; TimerEnd(39); } TimerEnd(tim++); TimerBeg(tim); free(ndonelist); free(nsend); free(nsend_local); free(nbuffer); free(noffset); /* now add things for comoving integration */ #ifndef PERIODIC #ifndef PMGRID if(All.ComovingIntegrationOn) { fac = 0.5 * All.Hubble * All.Hubble * All.Omega0 / All.G; for(i = 0; i < NumPart; i++) if(P[i].Ti_endstep == All.Ti_Current) for(j = 0; j < 3; j++) P[i].GravAccel[j] += fac * P[i].Pos[j]; } #endif #endif for(i = 0; i < NumPart; i++) if(P[i].Ti_endstep == All.Ti_Current) { #ifdef PMGRID ax = P[i].GravAccel[0] + P[i].GravPM[0] / All.G; ay = P[i].GravAccel[1] + P[i].GravPM[1] / All.G; az = P[i].GravAccel[2] + P[i].GravPM[2] / All.G; #else ax = P[i].GravAccel[0]; ay = P[i].GravAccel[1]; az = P[i].GravAccel[2]; #endif P[i].OldAcc = sqrt(ax * ax + ay * ay + az * az); } if(All.TypeOfOpeningCriterion == 1) All.ErrTolTheta = 0; /* This will switch to the relative opening criterion for the following force computations */ /* muliply by G */ for(i = 0; i < NumPart; i++) if(P[i].Ti_endstep == All.Ti_Current) for(j = 0; j < 3; j++) P[i].GravAccel[j] *= All.G; /* Finally, the following factor allows a computation of a cosmological simulation with vacuum energy in physical coordinates */ #ifndef PERIODIC #ifndef PMGRID if(All.ComovingIntegrationOn == 0) { fac = All.OmegaLambda * All.Hubble * All.Hubble; for(i = 0; i < NumPart; i++) if(P[i].Ti_endstep == All.Ti_Current) for(j = 0; j < 3; j++) P[i].GravAccel[j] += fac * P[i].Pos[j]; } #endif #endif #ifdef SELECTIVE_NO_GRAVITY for(i = 0; i < NumPart; i++) if(P[i].Ti_endstep < 0) P[i].Ti_endstep = -P[i].Ti_endstep - 1; #endif if(ThisTask == 0) printf("tree is done.\n"); #else /* gravity is switched off */ for(i = 0; i < NumPart; i++) if(P[i].Ti_endstep == All.Ti_Current) for(j = 0; j < 3; j++) P[i].GravAccel[j] = 0; #endif /* Now the force computation is finished */ /* gather some diagnostic information */ timetreelist = malloc(sizeof(double) * NTask); timecommlist = malloc(sizeof(double) * NTask); costtreelist = malloc(sizeof(double) * NTask); numnodeslist = malloc(sizeof(int) * NTask); ewaldlist = malloc(sizeof(double) * NTask); nrecv = malloc(sizeof(int) * NTask); numnodes = Numnodestree; MPI_Gather(&costtotal, 1, MPI_DOUBLE, costtreelist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Gather(&numnodes, 1, MPI_INT, numnodeslist, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Gather(&timetree, 1, MPI_DOUBLE, timetreelist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Gather(&timecommsumm, 1, MPI_DOUBLE, timecommlist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Gather(&NumPart, 1, MPI_INT, nrecv, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Gather(&ewaldcount, 1, MPI_DOUBLE, ewaldlist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Reduce(&nexportsum, &nexport, 1, 
MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(&timeimbalance, &sumimbalance, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); if(ThisTask == 0) { All.TotNumOfForces += ntot; fprintf(FdTimings, "Step= %d t= %g dt= %g \n", All.NumCurrentTiStep, All.Time, All.TimeStep); fprintf(FdTimings, "Nf= %d%09d total-Nf= %d%09d ex-frac= %g iter= %d\n", (int) (ntot / 1000000000), (int) (ntot % 1000000000), (int) (All.TotNumOfForces / 1000000000), (int) (All.TotNumOfForces % 1000000000), nexport / ((double) ntot), iter); /* note: on Linux, the 8-byte integer could be printed with the format identifier "%qd", but doesn't work on AIX */ fac = NTask / ((double) All.TotNumPart); for(i = 0, maxt = timetreelist[0], sumt = 0, plb_max = 0, maxnumnodes = 0, costtotal = 0, sumcomm = 0, ewaldtot = 0; i < NTask; i++) { costtotal += costtreelist[i]; sumcomm += timecommlist[i]; if(maxt < timetreelist[i]) maxt = timetreelist[i]; sumt += timetreelist[i]; plb = nrecv[i] * fac; if(plb > plb_max) plb_max = plb; if(numnodeslist[i] > maxnumnodes) maxnumnodes = numnodeslist[i]; ewaldtot += ewaldlist[i]; } fprintf(FdTimings, "work-load balance: %g max=%g avg=%g PE0=%g\n", maxt / (sumt / NTask), maxt, sumt / NTask, timetreelist[0]); fprintf(FdTimings, "particle-load balance: %g\n", plb_max); fprintf(FdTimings, "max. nodes: %d, filled: %g\n", maxnumnodes, maxnumnodes / (All.TreeAllocFactor * All.MaxPart)); fprintf(FdTimings, "part/sec=%g | %g ia/part=%g (%g)\n", ntot / (sumt + 1.0e-20), ntot / (maxt * NTask), ((double) (costtotal)) / ntot, ((double) ewaldtot) / ntot); fprintf(FdTimings, "\n"); fflush(FdTimings); All.CPU_TreeWalk += sumt / NTask; All.CPU_Imbalance += sumimbalance / NTask; All.CPU_CommSum += sumcomm / NTask; } free(nrecv); free(ewaldlist); free(numnodeslist); free(costtreelist); free(timecommlist); free(timetreelist); ASSERT_GX( tim==22 ); TimerEnd(tim++); TimerEnd(29); //MESSAGE("%6.2f, %6.2f, %6.2f, %6.2f, %6.2f - %5.1f, %5.1f, %5.1f, %5.1f %c force timers d 29,31,30,33,net",TimerGet(29),TimerGet(31),TimerGet(30),TimerGet(33),TimerGet(29)-TimerGet(31)-TimerGet(30),100.0*TimerGet(31)/TimerGet(29),100.0*TimerGet(30)/TimerGet(29),100.0*TimerGet(33)/TimerGet(29),100.0*(TimerGet(29)-TimerGet(31)-TimerGet(30))/TimerGet(29),'%'); //MESSAGE("%6.2f, %6.2f, %6.2f, %6.2f, %6.2f - %5.1f, %5.1f, %5.1f, %5.1f %c force timers a 29,31,30,33,net",TimerGetAccumulated(29),TimerGetAccumulated(31),TimerGetAccumulated(30),TimerGetAccumulated(33),TimerGetAccumulated(29)-TimerGetAccumulated(31)-TimerGetAccumulated(30),100.0*TimerGetAccumulated(31)/TimerGetAccumulated(29),100.0*TimerGetAccumulated(30)/TimerGetAccumulated(29),100.0*TimerGetAccumulated(33)/TimerGetAccumulated(29),100.0*(TimerGetAccumulated(29)-TimerGetAccumulated(31)-TimerGetAccumulated(30))/TimerGetAccumulated(29),'%'); }
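
/* A minimal sketch (illustrative only, not compiled in) of the two balance
 * figures written to FdTimings above: the work-load balance is the slowest
 * task's tree-walk time divided by the mean, and the particle-load balance
 * is max_i(NumPart_i) * NTask / TotNumPart; a value of 1.0 means perfect
 * balance. The function names are not part of GADGET. */
#if 0
static double work_load_balance_demo(const double *timetree, int ntask)
{
  double sum = 0, max = timetree[0];
  int i;

  for(i = 0; i < ntask; i++)
    {
      sum += timetree[i];
      if(timetree[i] > max)
	max = timetree[i];
    }

  return max / (sum / ntask);
}

static double particle_load_balance_demo(const int *numpart, int ntask,
					 long long totnumpart)
{
  double plb, plb_max = 0;
  int i;

  for(i = 0; i < ntask; i++)
    {
      plb = numpart[i] * ((double) ntask) / ((double) totnumpart);
      if(plb > plb_max)
	plb_max = plb;
    }

  return plb_max;
}
#endif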
/*! This routine tests the gravitational tree force by computing the force
 *  for a random subset of particles with direct summation.
 */
void gravity_forcetest(void)
{
  int ntot, iter = 0, ntotleft, nthis;
  double tstart, tend, timetree = 0;
  int i, j, ndone, ngrp, maxfill, place, ndonetot;

#ifndef NOGRAVITY
  int *noffset, *nbuffer, *nsend, *nsend_local;
  int k, nexport;
  int level, sendTask, recvTask;
  double fac1;
#ifndef NOMPI
  MPI_Status status;
#endif
#endif
  double costtotal, *costtreelist;
  double maxt, sumt, *timetreelist;
  double fac;
  char buf[200];

#ifdef PMGRID
  if(All.PM_Ti_endstep != All.Ti_Current)
    return;
#endif

  if(All.ComovingIntegrationOn)
    set_softenings();           /* set new softening lengths */

  /* flag a random subset of the active particles for the direct-summation test */
  for(i = 0, NumForceUpdate = 0; i < NumPart; i++)
    {
      if(P[i].Ti_endstep == All.Ti_Current)
        {
          if(get_random_number(P[i].ID) < FORCETEST)
            {
              P[i].Ti_endstep = -P[i].Ti_endstep - 1;
              NumForceUpdate++;
            }
        }
    }

  /* NumForceUpdate is the number of particles on this processor that want a force update */
#ifndef NOMPI
  MPI_Allreduce(&NumForceUpdate, &ntot, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
#else
  ntot = NumForceUpdate;
#endif

  costtotal = 0;

  noffset = malloc(sizeof(int) * NTask);        /* offsets of bunches in common list */
  nbuffer = malloc(sizeof(int) * NTask);
  nsend_local = malloc(sizeof(int) * NTask);
  nsend = malloc(sizeof(int) * NTask * NTask);

  i = 0;                        /* begin with this index */
  ntotleft = ntot;              /* particles left for all tasks together */

  while(ntotleft > 0)
    {
      iter++;

      for(j = 0; j < NTask; j++)
        nsend_local[j] = 0;

      /* do local particles and prepare export list */
      tstart = second();
      for(nexport = 0, ndone = 0; i < NumPart && nexport < All.BunchSizeForce - NTask; i++)
        if(P[i].Ti_endstep < 0)
          {
            ndone++;

            for(j = 0; j < NTask; j++)
              Exportflag[j] = 1;
            Exportflag[ThisTask] = 0;

            costtotal += force_treeevaluate_direct(i, 0);

            for(j = 0; j < NTask; j++)
              {
                if(Exportflag[j])
                  {
                    for(k = 0; k < 3; k++)
                      GravDataGet[nexport].u.Pos[k] = P[i].Pos[k];

#ifdef UNEQUALSOFTENINGS
                    GravDataGet[nexport].Type = P[i].Type;
#endif
                    GravDataGet[nexport].w.OldAcc = P[i].OldAcc;

                    GravDataIndexTable[nexport].Task = j;
                    GravDataIndexTable[nexport].Index = i;
                    GravDataIndexTable[nexport].SortIndex = nexport;

                    nexport++;
                    nsend_local[j]++;
                  }
              }
          }
      tend = second();
      timetree += timediff(tstart, tend);

      qsort(GravDataIndexTable, nexport, sizeof(struct gravdata_index), grav_tree_compare_key);

      for(j = 0; j < nexport; j++)
        GravDataIn[j] = GravDataGet[GravDataIndexTable[j].SortIndex];

      for(j = 1, noffset[0] = 0; j < NTask; j++)
        noffset[j] = noffset[j - 1] + nsend_local[j - 1];

#ifndef NOMPI
      MPI_Allgather(nsend_local, NTask, MPI_INT, nsend, NTask, MPI_INT, MPI_COMM_WORLD);
#else
      nsend[0] = nsend_local[0];        /* serial case: only task 0 exists */
#endif

      /* now do the particles that need to be exported */

      for(level = 1; level < (1 << PTask); level++)
        {
          for(j = 0; j < NTask; j++)
            nbuffer[j] = 0;
          for(ngrp = level; ngrp < (1 << PTask); ngrp++)
            {
              maxfill = 0;
              for(j = 0; j < NTask; j++)
                {
                  if((j ^ ngrp) < NTask)
                    if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j])
                      maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j];
                }
              if(maxfill >= All.BunchSizeForce)
                break;

              sendTask = ThisTask;
              recvTask = ThisTask ^ ngrp;

              if(recvTask < NTask)
                {
                  if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0)
                    {
                      /* get the particles */
                      MPI_Sendrecv(&GravDataIn[noffset[recvTask]],
                                   nsend_local[recvTask] * sizeof(struct gravdata_in), MPI_BYTE,
                                   recvTask, TAG_DIRECT_A,
                                   &GravDataGet[nbuffer[ThisTask]],
                                   nsend[recvTask * NTask + ThisTask] * sizeof(struct gravdata_in),
                                   MPI_BYTE, recvTask, TAG_DIRECT_A,
                                   MPI_COMM_WORLD, &status);
                    }
                }

              for(j = 0; j < NTask; j++)
                if((j ^ ngrp) < NTask)
                  nbuffer[j] += nsend[(j ^ ngrp) * NTask + j];
            }

          /* evaluate the imported particles with direct summation */
          tstart = second();
          for(j = 0; j < nbuffer[ThisTask]; j++)
            {
              costtotal += force_treeevaluate_direct(j, 1);
            }
          tend = second();
          timetree += timediff(tstart, tend);

          /* get the result */
          for(j = 0; j < NTask; j++)
            nbuffer[j] = 0;
          for(ngrp = level; ngrp < (1 << PTask); ngrp++)
            {
              maxfill = 0;
              for(j = 0; j < NTask; j++)
                {
                  if((j ^ ngrp) < NTask)
                    if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j])
                      maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j];
                }
              if(maxfill >= All.BunchSizeForce)
                break;

              sendTask = ThisTask;
              recvTask = ThisTask ^ ngrp;

              if(recvTask < NTask)
                {
                  if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0)
                    {
                      /* send the results */
                      MPI_Sendrecv(&GravDataResult[nbuffer[ThisTask]],
                                   nsend[recvTask * NTask + ThisTask] * sizeof(struct gravdata_in),
                                   MPI_BYTE, recvTask, TAG_DIRECT_B,
                                   &GravDataOut[noffset[recvTask]],
                                   nsend_local[recvTask] * sizeof(struct gravdata_in),
                                   MPI_BYTE, recvTask, TAG_DIRECT_B, MPI_COMM_WORLD, &status);

                      /* add the result to the particles */
                      for(j = 0; j < nsend_local[recvTask]; j++)
                        {
                          place = GravDataIndexTable[noffset[recvTask] + j].Index;

                          for(k = 0; k < 3; k++)
                            P[place].GravAccelDirect[k] += GravDataOut[j + noffset[recvTask]].u.Acc[k];
                        }
                    }
                }

              for(j = 0; j < NTask; j++)
                if((j ^ ngrp) < NTask)
                  nbuffer[j] += nsend[(j ^ ngrp) * NTask + j];
            }

          level = ngrp - 1;
        }

      MPI_Allreduce(&ndone, &ndonetot, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

      ntotleft -= ndonetot;
    }

  free(nsend);
  free(nsend_local);
  free(nbuffer);
  free(noffset);

  /* now add things for comoving integration */

  if(All.ComovingIntegrationOn)
    {
#ifndef PERIODIC
      fac1 = 0.5 * All.Hubble * All.Hubble * All.Omega0 / All.G;

      for(i = 0; i < NumPart; i++)
        if(P[i].Ti_endstep < 0)
          for(j = 0; j < 3; j++)
            P[i].GravAccelDirect[j] += fac1 * P[i].Pos[j];
#endif
    }

  /* multiply by G */
  for(i = 0; i < NumPart; i++)
    if(P[i].Ti_endstep < 0)
      for(j = 0; j < 3; j++)
        P[i].GravAccelDirect[j] *= All.G;

  /* Finally, the following factor allows a computation of a cosmological simulation
     with vacuum energy in physical coordinates */
  if(All.ComovingIntegrationOn == 0)
    {
      fac1 = All.OmegaLambda * All.Hubble * All.Hubble;

      for(i = 0; i < NumPart; i++)
        if(P[i].Ti_endstep < 0)
          for(j = 0; j < 3; j++)
            P[i].GravAccelDirect[j] += fac1 * P[i].Pos[j];
    }

  /* now output the forces to a file */

  for(nthis = 0; nthis < NTask; nthis++)
    {
      if(nthis == ThisTask)
        {
          sprintf(buf, "%s%s", All.OutputDir, "forcetest.txt");
          if(!(FdForceTest = fopen(buf, "a")))
            {
              printf("error in opening file '%s'\n", buf);
              endrun(17);
            }
          for(i = 0; i < NumPart; i++)
            if(P[i].Ti_endstep < 0)
              {
#ifndef PMGRID
                fprintf(FdForceTest, "%d %g %g %g %g %g %g %g %g %g %g %g\n",
                        P[i].Type, All.Time, All.Time - TimeOfLastTreeConstruction,
                        P[i].Pos[0], P[i].Pos[1], P[i].Pos[2],
                        P[i].GravAccelDirect[0], P[i].GravAccelDirect[1], P[i].GravAccelDirect[2],
                        P[i].GravAccel[0], P[i].GravAccel[1], P[i].GravAccel[2]);
#else
                fprintf(FdForceTest, "%d %g %g %g %g %g %g %g %g %g %g %g %g %g %g\n",
                        P[i].Type, All.Time, All.Time - TimeOfLastTreeConstruction,
                        P[i].Pos[0], P[i].Pos[1], P[i].Pos[2],
                        P[i].GravAccelDirect[0], P[i].GravAccelDirect[1], P[i].GravAccelDirect[2],
                        P[i].GravAccel[0], P[i].GravAccel[1], P[i].GravAccel[2],
                        P[i].GravPM[0] + P[i].GravAccel[0],
                        P[i].GravPM[1] + P[i].GravAccel[1],
                        P[i].GravPM[2] + P[i].GravAccel[2]);
#endif
              }
          fclose(FdForceTest);
        }
#ifndef NOMPI
      MPI_Barrier(MPI_COMM_WORLD);
#endif
    }

  /* undo the temporary flagging of the test particles */
  for(i = 0; i < NumPart; i++)
    if(P[i].Ti_endstep < 0)
      P[i].Ti_endstep = -P[i].Ti_endstep - 1;

  /* Now the force computation is finished */

  timetreelist = malloc(sizeof(double) * NTask);
  costtreelist = malloc(sizeof(double) * NTask);

  MPI_Gather(&costtotal, 1, MPI_DOUBLE, costtreelist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Gather(&timetree, 1, MPI_DOUBLE, timetreelist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);

  if(ThisTask == 0)
    {
      fac = NTask / ((double) All.TotNumPart);

      for(i = 0, maxt = timetreelist[0], sumt = 0, costtotal = 0; i < NTask; i++)
        {
          costtotal += costtreelist[i];

          if(maxt < timetreelist[i])
            maxt = timetreelist[i];
          sumt += timetreelist[i];
        }

      fprintf(FdTimings, "DIRECT Nf= %d part/sec=%g | %g ia/part=%g \n", ntot,
              ntot / (sumt + 1.0e-20), ntot / (maxt * NTask), ((double) (costtotal)) / ntot);
      fprintf(FdTimings, "\n");

      fflush(FdTimings);
    }

  free(costtreelist);
  free(timetreelist);
}
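
/* A minimal post-processing sketch, not called anywhere in the code: the forcetest.txt
 * columns written above give, for each test particle, the direct-summation acceleration
 * followed by the tree (and, with PMGRID, the combined tree+PM) acceleration.  The
 * figure of merit usually derived from them is the relative force error
 * |a_tree - a_direct| / |a_direct|.  The guard macro and the function name are ours
 * (hypothetical); <math.h> is assumed to be included, as in the stock file. */
#ifdef FORCETEST_ERROR_SKETCH   /* hypothetical flag, never defined by the code */
static double relative_force_error(const double a_tree[3], const double a_direct[3])
{
  double diff2 = 0, ref2 = 0;
  int k;

  for(k = 0; k < 3; k++)
    {
      diff2 += (a_tree[k] - a_direct[k]) * (a_tree[k] - a_direct[k]);
      ref2 += a_direct[k] * a_direct[k];
    }

  return sqrt(diff2 / (ref2 + 1.0e-30));        /* tiny offset avoids division by zero */
}
#endif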