VOID StartRayTrace() { INT pid; /* Our internal process id number. */ UINT begin; UINT end; THREAD_INIT_FREE(); LOCK(gm->pidlock) pid = gm->pid++; UNLOCK(gm->pidlock) BARINCLUDE(gm->start); if ((pid == 0) || (dostats)) CLOCK(begin); /* POSSIBLE ENHANCEMENT: Here's where one might lock processes down to processors if need be */ InitWorkPool(pid); InitRayTreeStack(Display.maxlevel, pid); /* * Wait for all processes to be created, initialize their work * pools, and arrive at this point; then proceed. This BARRIER * is absolutely required. Read comments in PutJob before * moving this barrier. */ BARRIER(gm->start, gm->nprocs) /* POSSIBLE ENHANCEMENT: Here's where one would RESET STATISTICS and TIMING if one wanted to measure only the parallel part */ // Reset Models CarbonEnableModels(); RayTrace(pid); if ((pid == 0) || (dostats)) { CLOCK(end); gm->partime[pid] = (end - begin) & 0x7FFFFFFF; if (pid == 0) gm->par_start_time = begin; } }
void slave() { long i; long j; long nstep; long iindex; long iday; double ysca1; double y; double factor; double sintemp; double curlt; double ressqr; long istart; long iend; long jstart; long jend; long ist; long ien; long jst; long jen; double fac; long dayflag=0; long dhourflag=0; long endflag=0; double ttime; double dhour; double day; long firstrow; long lastrow; long numrows; long firstcol; long lastcol; long numcols; long psiindex; double psibipriv; long psinum; long procid; unsigned long t1; ressqr = lev_res[numlev-1] * lev_res[numlev-1]; LOCK(locks->idlock) procid = global->id; global->id = global->id+1; UNLOCK(locks->idlock) /* POSSIBLE ENHANCEMENT: Here is where one might pin processes to processors to avoid migration. */ /* POSSIBLE ENHANCEMENT: Here is where one might distribute data structures across physically distributed memories in a round-robin fashion. */ firstcol = gp[procid].rel_start_x[numlev-1]; lastcol = firstcol + gp[procid].rel_num_x[numlev-1] - 1; firstrow = gp[procid].rel_start_y[numlev-1]; lastrow = firstrow + gp[procid].rel_num_y[numlev-1] - 1; numcols = gp[procid].rel_num_x[numlev-1]; numrows = gp[procid].rel_num_y[numlev-1]; if (procid > nprocs/2) { psinum = 2; } else { psinum = 1; } /* every process gets its own copy of the timing variables to avoid contention at shared memory locations. here, these variables are initialized. 
*/ ttime = 0.0; dhour = 0.0; nstep = 0 ; day = 0.0; ysca1 = 0.5*ysca; if (procid == MASTER) { for(iindex = 0;iindex<=jm-1;iindex++) { y = ((double) iindex)*res; wrk2->f[iindex] = f0+beta*(y-ysca1); } } if (procid == MASTER) { fields2->psium[0][0]=0.0; } if (procid == nprocs-xprocs) { fields2->psium[im-1][0]=0.0; } if (procid == xprocs-1) { fields2->psium[0][jm-1]=0.0; } if (procid == nprocs-1) { fields2->psium[im-1][jm-1]=0.0; } if (firstrow == 1) { for(j=firstcol;j<=lastcol;j++) { fields2->psium[0][j] = 0.0; } } if ((firstrow+numrows) == im-1) { for(j=firstcol;j<=lastcol;j++) { fields2->psium[im-1][j] = 0.0; } } if (firstcol == 1) { for(j=firstrow;j<=lastrow;j++) { fields2->psium[j][0] = 0.0; } } if ((firstcol+numcols) == jm-1) { for(j=firstrow;j<=lastrow;j++) { fields2->psium[j][jm-1] = 0.0; } } for(i=firstrow;i<=lastrow;i++) { for(iindex=firstcol;iindex<=lastcol;iindex++) { fields2->psium[i][iindex] = 0.0; } } if (procid == MASTER) { fields2->psilm[0][0]=0.0; } if (procid == nprocs-xprocs) { fields2->psilm[im-1][0]=0.0; } if (procid == xprocs-1) { fields2->psilm[0][jm-1]=0.0; } if (procid == nprocs-1) { fields2->psilm[im-1][jm-1]=0.0; } if (firstrow == 1) { for(j=firstcol;j<=lastcol;j++) { fields2->psilm[0][j] = 0.0; } } if ((firstrow+numrows) == im-1) { for(j=firstcol;j<=lastcol;j++) { fields2->psilm[im-1][j] = 0.0; } } if (firstcol == 1) { for(j=firstrow;j<=lastrow;j++) { fields2->psilm[j][0] = 0.0; } } if ((firstcol+numcols) == jm-1) { for(j=firstrow;j<=lastrow;j++) { fields2->psilm[j][jm-1] = 0.0; } } for(i=firstrow;i<=lastrow;i++) { for(iindex=firstcol;iindex<=lastcol;iindex++) { fields2->psilm[i][iindex] = 0.0; } } if (procid == MASTER) { wrk1->psib[0][0]=1.0; } if (procid == xprocs-1) { wrk1->psib[0][jm-1]=1.0; } if (procid == nprocs-xprocs) { wrk1->psib[im-1][0]=1.0; } if (procid == nprocs-1) { wrk1->psib[im-1][jm-1]=1.0; } if (firstrow == 1) { for(j=firstcol;j<=lastcol;j++) { wrk1->psib[0][j] = 1.0; } } if ((firstrow+numrows) == im-1) { 
for(j=firstcol;j<=lastcol;j++) { wrk1->psib[im-1][j] = 1.0; } } if (firstcol == 1) { for(j=firstrow;j<=lastrow;j++) { wrk1->psib[j][0] = 1.0; } } if ((firstcol+numcols) == jm-1) { for(j=firstrow;j<=lastrow;j++) { wrk1->psib[j][jm-1] = 1.0; } } for(i=firstrow;i<=lastrow;i++) { for(iindex=firstcol;iindex<=lastcol;iindex++) { wrk1->psib[i][iindex] = 0.0; } } /* wait until all processes have completed the above initialization */ #if defined(MULTIPLE_BARRIERS) BARRIER(bars->sl_prini,nprocs) #else BARRIER(bars->barrier,nprocs) #endif istart = gp[procid].rel_start_y[numlev-1]; iend = istart + gp[procid].rel_num_y[numlev-1] - 1; jstart = gp[procid].rel_start_x[numlev-1]; jend = jstart + gp[procid].rel_num_x[numlev-1] - 1; ist = istart; ien = iend; jst = jstart; jen = jend; if (istart == 1) { istart = 0; } if (jstart == 1) { jstart = 0; } if (iend == im-2) { iend = im-1; } if (jend == jm-2) { jend = jm-1; } for(i=istart;i<=iend;i++) { for(j=jstart;j<=jend;j++) { multi->rhs_multi[numlev-1][i][j] = wrk1->psib[i][j] * ressqr; } } if (istart == 0) { for(j=jstart;j<=jend;j++) { multi->q_multi[numlev-1][0][j] = wrk1->psib[0][j]; } } if (iend == im-1) { for(j=jstart;j<=jend;j++) { multi->q_multi[numlev-1][im-1][j] = wrk1->psib[im-1][j]; } } if (jstart == 0) { for(i=istart;i<=iend;i++) { multi->q_multi[numlev-1][i][0] = wrk1->psib[i][0]; } } if (jend == jm-1) { for(i=istart;i<=iend;i++) { multi->q_multi[numlev-1][i][jm-1] = wrk1->psib[i][jm-1]; } } fac = 1.0 / (4.0 - ressqr*eig2); for(i=ist;i<=ien;i++) { for(j=jst;j<=jen;j++) { multi->q_multi[numlev-1][i][j] = fac * (wrk1->psib[i+1][j] + wrk1->psib[i-1][j] + wrk1->psib[i][j+1] + wrk1->psib[i][j-1] - ressqr*wrk1->psib[i][j]); } } #if defined(MULTIPLE_BARRIERS) BARRIER(bars->sl_prini,nprocs) #else BARRIER(bars->barrier,nprocs) #endif multig(procid); for(i=istart;i<=iend;i++) { for(j=jstart;j<=jend;j++) { wrk1->psib[i][j] = multi->q_multi[numlev-1][i][j]; } } #if defined(MULTIPLE_BARRIERS) BARRIER(bars->sl_psini,nprocs) #else 
BARRIER(bars->barrier,nprocs) #endif /* update the local running sum psibipriv by summing all the resulting values in that process's share of the psib matrix */ psibipriv=0.0; if (procid == MASTER) { psibipriv = psibipriv + 0.25*(wrk1->psib[0][0]); } if (procid == xprocs-1){ psibipriv = psibipriv + 0.25*(wrk1->psib[0][jm-1]); } if (procid == nprocs - xprocs) { psibipriv=psibipriv+0.25*(wrk1->psib[im-1][0]); } if (procid == nprocs-1) { psibipriv=psibipriv+0.25*(wrk1->psib[im-1][jm-1]); } if (firstrow == 1) { for(j=firstcol;j<=lastcol;j++) { psibipriv = psibipriv + 0.5*wrk1->psib[0][j]; } } if ((firstrow+numrows) == im-1) { for(j=firstcol;j<=lastcol;j++) { psibipriv = psibipriv + 0.5*wrk1->psib[im-1][j]; } } if (firstcol == 1) { for(j=firstrow;j<=lastrow;j++) { psibipriv = psibipriv + 0.5*wrk1->psib[j][0]; } } if ((firstcol+numcols) == jm-1) { for(j=firstrow;j<=lastrow;j++) { psibipriv = psibipriv + 0.5*wrk1->psib[j][jm-1]; } } for(iindex=firstcol;iindex<=lastcol;iindex++) { for(i=firstrow;i<=lastrow;i++) { psibipriv = psibipriv + wrk1->psib[i][iindex]; } } /* update the shared variable psibi by summing all the psibiprivs of the individual processes into it. note that this combined private and shared sum method avoids accessing the shared variable psibi once for every element of the matrix. 
*/ LOCK(locks->psibilock) global->psibi = global->psibi + psibipriv; UNLOCK(locks->psibilock) for(psiindex=0;psiindex<=1;psiindex++) { if (procid == MASTER) { fields->psim[psiindex][0][0] = 0.0; } if (procid == nprocs-xprocs) { fields->psim[psiindex][im-1][0] = 0.0; } if (procid == xprocs-1) { fields->psim[psiindex][0][jm-1] = 0.0; } if (procid == nprocs-1) { fields->psim[psiindex][im-1][jm-1] = 0.0; } if (firstrow == 1) { for(j=firstcol;j<=lastcol;j++) { fields->psim[psiindex][0][j] = 0.0; } } if ((firstrow+numrows) == im-1) { for(j=firstcol;j<=lastcol;j++) { fields->psim[psiindex][im-1][j] = 0.0; } } if (firstcol == 1) { for(j=firstrow;j<=lastrow;j++) { fields->psim[psiindex][j][0] = 0.0; } } if ((firstcol+numcols) == jm-1) { for(j=firstrow;j<=lastrow;j++) { fields->psim[psiindex][j][jm-1] = 0.0; } } for(i=firstrow;i<=lastrow;i++) { for(iindex=firstcol;iindex<=lastcol;iindex++) { fields->psim[psiindex][i][iindex] = 0.0; } } } /* initialize psi matrices the same way */ for(psiindex=0;psiindex<=1;psiindex++) { if (procid == MASTER) { fields->psi[psiindex][0][0] = 0.0; } if (procid == xprocs-1) { fields->psi[psiindex][0][jm-1] = 0.0; } if (procid == nprocs-xprocs) { fields->psi[psiindex][im-1][0] = 0.0; } if (procid == nprocs-1) { fields->psi[psiindex][im-1][jm-1] = 0.0; } if (firstrow == 1) { for(j=firstcol;j<=lastcol;j++) { fields->psi[psiindex][0][j] = 0.0; } } if ((firstrow+numrows) == im-1) { for(j=firstcol;j<=lastcol;j++) { fields->psi[psiindex][im-1][j] = 0.0; } } if (firstcol == 1) { for(j=firstrow;j<=lastrow;j++) { fields->psi[psiindex][j][0] = 0.0; } } if ((firstcol+numcols) == jm-1) { for(j=firstrow;j<=lastrow;j++) { fields->psi[psiindex][j][jm-1] = 0.0; } } for(i=firstrow;i<=lastrow;i++) { for(iindex=firstcol;iindex<=lastcol;iindex++) { fields->psi[psiindex][i][iindex] = 0.0; } } } /* compute input curl of wind stress */ ysca1 = .5*ysca; factor= -t0*pi/ysca1; if (procid == MASTER) { frcng->tauz[0][0] = 0.0; } if (procid == nprocs-xprocs) { 
frcng->tauz[im-1][0] = 0.0; } if (procid == xprocs-1) { sintemp = pi*((double) jmm1)*res/ysca1; sintemp = sin(sintemp); frcng->tauz[0][jm-1] = factor*sintemp; } if (procid == nprocs-1) { sintemp = pi*((double) jmm1)*res/ysca1; sintemp = sin(sintemp); frcng->tauz[im-1][jm-1] = frcng->tauz[0][jm-1]; } if (firstrow == 1) { for(j=firstcol;j<=lastcol;j++) { sintemp = pi*((double) j)*res/ysca1; sintemp = sin(sintemp); curlt = factor*sintemp; frcng->tauz[0][j] = curlt; } } if ((firstrow+numrows) == im-1) { for(j=firstcol;j<=lastcol;j++) { sintemp = pi*((double) j)*res/ysca1; sintemp = sin(sintemp); curlt = factor*sintemp; frcng->tauz[im-1][j] = curlt; } } if (firstcol == 1) { for(j=firstrow;j<=lastrow;j++) { frcng->tauz[j][0] = 0.0; } } if ((firstcol+numcols) == jm-1) { sintemp = pi*((double) jmm1)*res/ysca1; sintemp = sin(sintemp); curlt = factor*sintemp; for(j=firstrow;j<=lastrow;j++) { frcng->tauz[j][jm-1] = curlt; } } for(iindex=firstcol;iindex<=lastcol;iindex++) { sintemp = pi*((double) iindex)*res/ysca1; sintemp = sin(sintemp); curlt = factor*sintemp; for(i=firstrow;i<=lastrow;i++) { frcng->tauz[i][iindex] = curlt; } } #if defined(MULTIPLE_BARRIERS) BARRIER(bars->sl_onetime,nprocs) #else BARRIER(bars->barrier,nprocs) #endif /*************************************************************** one-time stuff over at this point ***************************************************************/ while (!endflag) { while ((!dayflag) || (!dhourflag)) { dayflag = 0; dhourflag = 0; if (nstep == 1) { if (procid == MASTER) { CLOCK(global->trackstart) } if ((procid == MASTER) || (do_stats)) { CLOCK(t1); gp[procid].total_time = t1; gp[procid].multi_time = 0; } /* POSSIBLE ENHANCEMENT: Here is where one might reset the statistics that one is measuring about the parallel execution */ // Reset Models CarbonEnableModels(); } slave2(procid,firstrow,lastrow,numrows,firstcol,lastcol,numcols); /* update time and step number note that these time and step variables are private i.e. 
every process has its own copy and keeps track of its own time */ ttime = ttime + dtau; nstep = nstep + 1; day = ttime/86400.0; if (day > ((double) outday0)) { dayflag = 1; iday = (long) day; dhour = dhour+dtau; if (dhour >= 86400.0) { dhourflag = 1; } } } dhour = 0.0; /* update values of psium array to psium + psim{1} */ if (procid == MASTER) { fields2->psium[0][0] = fields2->psium[0][0]+fields->psim[0][0][0]; } if (procid == nprocs-xprocs) { fields2->psium[im-1][0] = fields2->psium[im-1][0]+fields->psim[0][im-1][0]; } if (procid == xprocs-1) { fields2->psium[0][jm-1] = fields2->psium[0][jm-1]+fields->psim[0][0][jm-1]; } if (procid == nprocs-1) { fields2->psium[im-1][jm-1] = fields2->psium[im-1][jm-1]+fields->psim[0][im-1][jm-1]; } if (firstrow == 1) { for(j=firstcol;j<=lastcol;j++) { fields2->psium[0][j] = fields2->psium[0][j]+fields->psim[0][0][j]; } } if ((firstrow+numrows) == im-1) { for(j=firstcol;j<=lastcol;j++) { fields2->psium[im-1][j] = fields2->psium[im-1][j]+fields->psim[0][im-1][j]; } } if (firstcol == 1) { for(j=firstrow;j<=lastrow;j++) { fields2->psium[j][0] = fields2->psium[j][0]+fields->psim[0][j][0]; } } if ((firstcol+numcols) == jm-1) { for(j=firstrow;j<=lastrow;j++) { fields2->psium[j][jm-1] = fields2->psium[j][jm-1]+fields->psim[0][j][jm-1]; } } for(i=firstrow;i<=lastrow;i++) { for(iindex=firstcol;iindex<=lastcol;iindex++) { fields2->psium[i][iindex] = fields2->psium[i][iindex]+fields->psim[0][i][iindex]; } } /* update values of psilm array to psilm + psim[2] */ if (procid == MASTER) { fields2->psilm[0][0] = fields2->psilm[0][0]+fields->psim[1][0][0]; } if (procid == nprocs-xprocs) { fields2->psilm[im-1][0] = fields2->psilm[im-1][0]+fields->psim[1][im-1][0]; } if (procid == xprocs-1) { fields2->psilm[0][jm-1] = fields2->psilm[0][jm-1]+fields->psim[1][0][jm-1]; } if (procid == nprocs-1) { fields2->psilm[im-1][jm-1] = fields2->psilm[im-1][jm-1]+fields->psim[1][im-1][jm-1]; } if (firstrow == 1) { for(j=firstcol;j<=lastcol;j++) { 
fields2->psilm[0][j] = fields2->psilm[0][j]+fields->psim[1][0][j]; } } if ((firstrow+numrows) == im-1) { for(j=firstcol;j<=lastcol;j++) { fields2->psilm[im-1][j] = fields2->psilm[im-1][j]+fields->psim[1][im-1][j]; } } if (firstcol == 1) { for(j=firstrow;j<=lastrow;j++) { fields2->psilm[j][0] = fields2->psilm[j][0]+fields->psim[1][j][0]; } } if ((firstcol+numcols) == jm-1) { for(j=firstrow;j<=lastrow;j++) { fields2->psilm[j][jm-1] = fields2->psilm[j][jm-1]+fields->psim[1][j][jm-1]; } } for(i=firstrow;i<=lastrow;i++) { for(iindex=firstcol;iindex<=lastcol;iindex++) { fields2->psilm[i][iindex] = fields2->psilm[i][iindex]+fields->psim[1][i][iindex]; } } if (iday >= (long) outday3) { endflag = 1; } } if ((procid == MASTER) || (do_stats)) { CLOCK(t1); gp[procid].total_time = t1-gp[procid].total_time; } }
/*
 * main
 *
 * Driver: parses the command line (only "-h" is accepted), runs the
 * set-up routines, launches NPROC processes running SlaveStart, waits
 * for them, and prints the timing counters stored in Global.
 *
 * argc, argv: standard command-line arguments.
 * Exits with -1 after printing help or on any other option.
 */
int main (int argc, string argv[])
{
   long c;

   while ((c = getopt(argc, argv, "h")) != -1) {
      switch(c) {
         case 'h':
            Help();
            exit(-1);
            break;
         default:
            fprintf(stderr, "Only valid option is \"-h\".\n");
            exit(-1);
            break;
      }
   }

   Global = NULL;
   initparam(defv);
   startrun();
   initoutput();
   tab_init();

   Global->tracktime = 0;
   Global->partitiontime = 0;
   Global->treebuildtime = 0;
   Global->forcecalctime = 0;
   Global->current_id = 0;

   CLOCK(Global->computestart);
   printf("COMPUTESTART = %12lu\n",Global->computestart);

   /* Enable Models at the start of parallel execution.  Written as a
      block comment so it can never comment out the calls below. */
   CarbonEnableModels();

   CREATE(SlaveStart, NPROC);

   WAIT_FOR_END(NPROC);

   /* Disable Models at the end of parallel execution */
   CarbonDisableModels();

   CLOCK(Global->computeend);

   printf("COMPUTEEND = %12lu\n",Global->computeend);
   printf("COMPUTETIME = %12lu\n",Global->computeend - Global->computestart);
   printf("TRACKTIME = %12lu\n",Global->tracktime);
   /* Each phase is printed with its share of tracktime. */
   printf("PARTITIONTIME = %12lu\t%5.2f\n",Global->partitiontime,
          ((float)Global->partitiontime)/Global->tracktime);
   printf("TREEBUILDTIME = %12lu\t%5.2f\n",Global->treebuildtime,
          ((float)Global->treebuildtime)/Global->tracktime);
   printf("FORCECALCTIME = %12lu\t%5.2f\n",Global->forcecalctime,
          ((float)Global->forcecalctime)/Global->tracktime);
   printf("RESTTIME = %12lu\t%5.2f\n",
          Global->tracktime - Global->partitiontime -
          Global->treebuildtime - Global->forcecalctime,
          ((float)(Global->tracktime-Global->partitiontime-
                   Global->treebuildtime-Global->forcecalctime))/
          Global->tracktime);
   MAIN_END;
}
void radiosity() { long process_id; long rad_start, refine_done, vertex_start, vertex_done; THREAD_INIT_FREE(); LOCK(global->index_lock); process_id = global->index++; UNLOCK(global->index_lock); process_id = process_id % n_processors; BARINCLUDE(global->barrier); if ((process_id == 0) || (dostats)) CLOCK(rad_start); /* POSSIBLE ENHANCEMENT: Here is where one might pin processes to processors to avoid migration */ /* POSSIBLE ENHANCEMENT: Here is where one might reset the statistics that one is measuring about the parallel execution */ // Enable Modeling CarbonEnableModels(); /* Decompose model objects into patches and build the BSP tree */ /* Create the initial tasks */ init_modeling_tasks(process_id) ; process_tasks(process_id) ; /* Gather rays & do BF refinement */ while( init_ray_tasks(process_id) ) { /* Wait till tasks are put in the queue */ BARRIER(global->barrier, n_processors); /* Then perform ray-gathering and BF-refinement till the solution converges */ process_tasks(process_id) ; } if ((process_id == 0) || (dostats)) CLOCK(refine_done); BARRIER(global->barrier, n_processors); if ((process_id == 0) || (dostats)) CLOCK(vertex_start); /* Compute area-weighted radiosity value at each vertex */ init_radavg_tasks( RAD_AVERAGING_MODE, process_id ) ; process_tasks(process_id) ; /* Then normalize the radiosity at vertices */ init_radavg_tasks( RAD_NORMALIZING_MODE, process_id ) ; process_tasks(process_id) ; if ((process_id == 0) || (dostats)) CLOCK(vertex_done); if ((process_id == 0) || (dostats)) { timing[process_id]->rad_start = rad_start; timing[process_id]->rad_time = vertex_done - rad_start; timing[process_id]->refine_time = refine_done - rad_start; timing[process_id]->vertex_time = vertex_done - vertex_start; timing[process_id]->wait_time = vertex_start - refine_done; } // Disable Models CarbonDisableModels(); }
EXTERN_ENV #include <stdio.h> #include "carbon_user.h" #include "parameters.h" #include "mdvar.h" #include "water.h" #include "wwpot.h" #include "cnst.h" #include "mddata.h" #include "fileio.h" #include "split.h" #include "global.h" /************************************************************************/ /* routine that implements the time-steps. Called by main routine and calls others */ double MDMAIN(long NSTEP, long NPRINT, long NSAVE, long NORD1, long ProcID) { double XTT; long i; double POTA,POTR,POTRF; double XVIR,AVGT,TEN; double TTMV = 0.0, TKIN = 0.0, TVIR = 0.0; /*.......ESTIMATE ACCELERATION FROM F/M */ INTRAF(&gl->VIR,ProcID); BARRIER(gl->start, NumProcs); INTERF(ACC,&gl->VIR,ProcID); BARRIER(gl->start, NumProcs); /* MOLECULAR DYNAMICS LOOP OVER ALL TIME-STEPS */ for (i=1;i <= NSTEP; i++) { TTMV=TTMV+1.00; /* reset simulator stats at beginning of second time-step */ /* POSSIBLE ENHANCEMENT: Here's where one start measurements to avoid cold-start effects. Recommended to do this at the beginning of the second timestep; i.e. if (i == 2). 
*/ if (i == 2) { // Reset Models CarbonEnableModels(); } /* initialize various shared sums */ if (ProcID == 0) { long dir; if (i >= 2) { CLOCK(gl->trackstart); } gl->VIR = 0.0; gl->POTA = 0.0; gl->POTR = 0.0; gl->POTRF = 0.0; for (dir = XDIR; dir <= ZDIR; dir++) gl->SUM[dir] = 0.0; } if ((ProcID == 0) && (i >= 2)) { CLOCK(gl->intrastart); } BARRIER(gl->start, NumProcs); PREDIC(TLC,NORD1,ProcID); INTRAF(&gl->VIR,ProcID); BARRIER(gl->start, NumProcs); if ((ProcID == 0) && (i >= 2)) { CLOCK(gl->intraend); gl->intratime += gl->intraend - gl->intrastart; } if ((ProcID == 0) && (i >= 2)) { CLOCK(gl->interstart); } INTERF(FORCES,&gl->VIR,ProcID); if ((ProcID == 0) && (i >= 2)) { CLOCK(gl->interend); gl->intertime += gl->interend - gl->interstart; } if ((ProcID == 0) && (i >= 2)) { CLOCK(gl->intrastart); } CORREC(PCC,NORD1,ProcID); BNDRY(ProcID); KINETI(gl->SUM,HMAS,OMAS,ProcID); BARRIER(gl->start, NumProcs); if ((ProcID == 0) && (i >= 2)) { CLOCK(gl->intraend); gl->intratime += gl->intraend - gl->intrastart; } TKIN=TKIN+gl->SUM[0]+gl->SUM[1]+gl->SUM[2]; TVIR=TVIR-gl->VIR; /* check if potential energy is to be computed, and if printing and/or saving is to be done, this time step. 
Note that potential energy is computed once every NPRINT time-steps */ if (((i % NPRINT) == 0) || ( (NSAVE > 0) && ((i % NSAVE) == 0))){ if ((ProcID == 0) && (i >= 2)) { CLOCK(gl->interstart); } /* call potential energy computing routine */ POTENG(&gl->POTA,&gl->POTR,&gl->POTRF,ProcID); BARRIER(gl->start, NumProcs); if ((ProcID == 0) && (i >= 2)) { CLOCK(gl->interend); gl->intertime += gl->interend - gl->interstart; } POTA=gl->POTA*FPOT; POTR=gl->POTR*FPOT; POTRF=gl->POTRF*FPOT; /* compute some values to print */ XVIR=TVIR*FPOT*0.50/TTMV; AVGT=TKIN*FKIN*TEMP*2.00/(3.00*TTMV); TEN=(gl->SUM[0]+gl->SUM[1]+gl->SUM[2])*FKIN; XTT=POTA+POTR+POTRF+TEN; if ((i % NPRINT) == 0 && ProcID == 0) { fprintf(six," %5ld %14.5lf %12.5lf %12.5lf \ %12.5lf\n %16.3lf %16.5lf %16.5lf\n", i,TEN,POTA,POTR,POTRF,XTT,AVGT,XVIR); } } /* wait for everyone to finish time-step */ BARRIER(gl->start, NumProcs); if ((ProcID == 0) && (i >= 2)) { CLOCK(gl->trackend); gl->tracktime += gl->trackend - gl->trackstart; } } /* for i */
int main(int argc, char *argv[]) { long i; long c; extern char *optarg; long m1; long factor; long pages; unsigned long start; CLOCK(start); while ((c = getopt(argc, argv, "p:m:n:l:stoh")) != -1) { switch(c) { case 'p': P = atoi(optarg); if (P < 1) { printerr("P must be >= 1\n"); exit(-1); } if (log_2(P) == -1) { printerr("P must be a power of 2\n"); exit(-1); } break; case 'm': M = atoi(optarg); m1 = M/2; if (2*m1 != M) { printerr("M must be even\n"); exit(-1); } break; case 'n': num_cache_lines = atoi(optarg); orig_num_lines = num_cache_lines; if (num_cache_lines < 1) { printerr("Number of cache lines must be >= 1\n"); exit(-1); } break; case 'l': log2_line_size = atoi(optarg); if (log2_line_size < 0) { printerr("Log base 2 of cache line length in bytes must be >= 0\n"); exit(-1); } break; case 's': dostats = !dostats; break; case 't': test_result = !test_result; break; case 'o': doprint = !doprint; break; case 'h': printf("Usage: FFT <options>\n\n"); printf("options:\n"); printf(" -mM : M = even integer; 2**M total complex data points transformed.\n"); printf(" -pP : P = number of processors; Must be a power of 2.\n"); printf(" -nN : N = number of cache lines.\n"); printf(" -lL : L = Log base 2 of cache line length in bytes.\n"); printf(" -s : Print individual processor timing statistics.\n"); printf(" -t : Perform FFT and inverse FFT. Test output by comparing the\n"); printf(" integral of the original data to the integral of the data that\n"); printf(" results from performing the FFT and inverse FFT.\n"); printf(" -o : Print out complex data points.\n"); printf(" -h : Print out command line options.\n\n"); printf("Default: FFT -m%1d -p%1d -n%1d -l%1d\n", DEFAULT_M,DEFAULT_P,NUM_CACHE_LINES,LOG2_LINE_SIZE); exit(0); break; } } MAIN_INITENV(,80000000); N = 1<<M; rootN = 1<<(M/2); rowsperproc = rootN/P; if (rowsperproc == 0) { printerr("Matrix not large enough. 
2**(M/2) must be >= P\n"); exit(-1); } line_size = 1 << log2_line_size; if (line_size < 2*sizeof(double)) { printf("WARNING: Each element is a complex double (%ld bytes)\n",2*sizeof(double)); printf(" => Less than one element per cache line\n"); printf(" Computing transpose blocking factor\n"); factor = (2*sizeof(double)) / line_size; num_cache_lines = orig_num_lines / factor; } if (line_size <= 2*sizeof(double)) { pad_length = 1; } else { pad_length = line_size / (2*sizeof(double)); } if (rowsperproc * rootN * 2 * sizeof(double) >= PAGE_SIZE) { pages = (2 * pad_length * sizeof(double) * rowsperproc) / PAGE_SIZE; if (pages * PAGE_SIZE != 2 * pad_length * sizeof(double) * rowsperproc) { pages ++; } pad_length = (pages * PAGE_SIZE) / (2 * sizeof(double) * rowsperproc); } else { pad_length = (PAGE_SIZE - (rowsperproc * rootN * 2 * sizeof(double))) / (2 * sizeof(double) * rowsperproc); if (pad_length * (2 * sizeof(double) * rowsperproc) != (PAGE_SIZE - (rowsperproc * rootN * 2 * sizeof(double)))) { printerr("Padding algorithm unsuccessful\n"); exit(-1); } } Global = (struct GlobalMemory *) G_MALLOC(sizeof(struct GlobalMemory)); x = (double *) G_MALLOC(2*(N+rootN*pad_length)*sizeof(double)+PAGE_SIZE); trans = (double *) G_MALLOC(2*(N+rootN*pad_length)*sizeof(double)+PAGE_SIZE); umain = (double *) G_MALLOC(2*rootN*sizeof(double)); umain2 = (double *) G_MALLOC(2*(N+rootN*pad_length)*sizeof(double)+PAGE_SIZE); Global->transtimes = (long *) G_MALLOC(P*sizeof(long)); Global->totaltimes = (long *) G_MALLOC(P*sizeof(long)); if (Global == NULL) { printerr("Could not malloc memory for Global\n"); exit(-1); } else if (x == NULL) { printerr("Could not malloc memory for x\n"); exit(-1); } else if (trans == NULL) { printerr("Could not malloc memory for trans\n"); exit(-1); } else if (umain == NULL) { printerr("Could not malloc memory for umain\n"); exit(-1); } else if (umain2 == NULL) { printerr("Could not malloc memory for umain2\n"); exit(-1); } x = (double *) (((unsigned long) x) 
+ PAGE_SIZE - ((unsigned long) x) % PAGE_SIZE); trans = (double *) (((unsigned long) trans) + PAGE_SIZE - ((unsigned long) trans) % PAGE_SIZE); umain2 = (double *) (((unsigned long) umain2) + PAGE_SIZE - ((unsigned long) umain2) % PAGE_SIZE); /* In order to optimize data distribution, the data structures x, trans, and umain2 have been aligned so that each begins on a page boundary. This ensures that the amount of padding calculated by the program is such that each processor's partition ends on a page boundary, thus ensuring that all data from these structures that are needed by a processor can be allocated to its local memory */ /* POSSIBLE ENHANCEMENT: Here is where one might distribute the x, trans, and umain2 data structures across physically distributed memories as desired. One way to place data is as follows: double *base; long i; i = ((N/P)+(rootN/P)*pad_length)*2; base = &(x[0]); for (j=0;j<P;j++) { Place all addresses x such that (base <= x < base+i) on node j base += i; } The trans and umain2 data structures can be placed in a similar manner. 
*/ printf("\n"); printf("FFT with Blocking Transpose\n"); printf(" %ld Complex Doubles\n",N); printf(" %ld Processors\n",P); if (num_cache_lines != orig_num_lines) { printf(" %ld Cache lines\n",orig_num_lines); printf(" %ld Cache lines for blocking transpose\n",num_cache_lines); } else { printf(" %ld Cache lines\n",num_cache_lines); } printf(" %d Byte line size\n",(1 << log2_line_size)); printf(" %d Bytes per page\n",PAGE_SIZE); printf("\n"); BARINIT(Global->start, P); LOCKINIT(Global->idlock); Global->id = 0; InitX(x); /* place random values in x */ if (test_result) { ck1 = CheckSum(x); } if (doprint) { printf("Original data values:\n"); PrintArray(N, x); } InitU(N,umain); /* initialize u arrays*/ InitU2(N,umain2,rootN); /* fire off P processes */ // Enable Models CarbonEnableModels(); CREATE(SlaveStart, P); WAIT_FOR_END(P); // Disable Models CarbonDisableModels(); if (doprint) { if (test_result) { printf("Data values after inverse FFT:\n"); } else { printf("Data values after FFT:\n"); } PrintArray(N, x); } transtime = Global->transtimes[0]; printf("\n"); printf(" PROCESS STATISTICS\n"); printf(" Computation Transpose Transpose\n"); printf(" Proc Time Time Fraction\n"); printf(" 0 %10ld %10ld %8.5f\n", Global->totaltimes[0],Global->transtimes[0], ((double)Global->transtimes[0])/Global->totaltimes[0]); if (dostats) { transtime2 = Global->transtimes[0]; avgtranstime = Global->transtimes[0]; avgcomptime = Global->totaltimes[0]; maxtotal = Global->totaltimes[0]; mintotal = Global->totaltimes[0]; maxfrac = ((double)Global->transtimes[0])/Global->totaltimes[0]; minfrac = ((double)Global->transtimes[0])/Global->totaltimes[0]; avgfractime = ((double)Global->transtimes[0])/Global->totaltimes[0]; for (i=1;i<P;i++) { if (Global->transtimes[i] > transtime) { transtime = Global->transtimes[i]; } if (Global->transtimes[i] < transtime2) { transtime2 = Global->transtimes[i]; } if (Global->totaltimes[i] > maxtotal) { maxtotal = Global->totaltimes[i]; } if (Global->totaltimes[i] < 
mintotal) { mintotal = Global->totaltimes[i]; } if (((double)Global->transtimes[i])/Global->totaltimes[i] > maxfrac) { maxfrac = ((double)Global->transtimes[i])/Global->totaltimes[i]; } if (((double)Global->transtimes[i])/Global->totaltimes[i] < minfrac) { minfrac = ((double)Global->transtimes[i])/Global->totaltimes[i]; } printf(" %3ld %10ld %10ld %8.5f\n", i,Global->totaltimes[i],Global->transtimes[i], ((double)Global->transtimes[i])/Global->totaltimes[i]); avgtranstime += Global->transtimes[i]; avgcomptime += Global->totaltimes[i]; avgfractime += ((double)Global->transtimes[i])/Global->totaltimes[i]; } printf(" Avg %10.0f %10.0f %8.5f\n", ((double) avgcomptime)/P,((double) avgtranstime)/P,avgfractime/P); printf(" Max %10ld %10ld %8.5f\n", maxtotal,transtime,maxfrac); printf(" Min %10ld %10ld %8.5f\n", mintotal,transtime2,minfrac); } Global->starttime = start; printf("\n"); printf(" TIMING INFORMATION\n"); printf("Start time : %16lu\n", Global->starttime); printf("Initialization finish time : %16lu\n", Global->initdonetime); printf("Overall finish time : %16lu\n", Global->finishtime); printf("Total time with initialization : %16lu\n", Global->finishtime-Global->starttime); printf("Total time without initialization : %16lu\n", Global->finishtime-Global->initdonetime); printf("Overall transpose time : %16ld\n", transtime); printf("Overall transpose fraction : %16.5f\n", ((double) transtime)/(Global->finishtime-Global->initdonetime)); printf("\n"); if (test_result) { ck3 = CheckSum(x); printf(" INVERSE FFT TEST RESULTS\n"); printf("Checksum difference is %.3f (%.3f, %.3f)\n", ck1-ck3, ck1, ck3); if (fabs(ck1-ck3) < 0.001) { printf("TEST PASSED\n"); } else { printf("TEST FAILED\n"); } } MAIN_END; }
void Render_Loop() { long step,i; PIXEL *local_image_address; MPIXEL *local_mask_image_address; char outfile[FILENAME_STRING_SIZE]; long image_partition,mask_image_partition; float inv_num_nodes; long my_node; THREAD_INIT_FREE(); LOCK(Global->IndexLock); my_node = Global->Index++; UNLOCK(Global->IndexLock); my_node = my_node%num_nodes; BARINCLUDE(Global->TimeBarrier); BARINCLUDE(Global->SlaveBarrier); /* POSSIBLE ENHANCEMENT: Here's where one might bind the process to a processor, if one wanted to. */ // Reset Models Here CarbonEnableModels(); inv_num_nodes = 1.0/(float)num_nodes; image_partition = ROUNDUP(image_length*inv_num_nodes); mask_image_partition = ROUNDUP(mask_image_length*inv_num_nodes); #ifdef DIM for (dim=0; dim<NM; dim++) { #endif for (step=0; step<ROTATE_STEPS; step++) { /* do rotation sequence */ /* POSSIBLE ENHANCEMENT: Here is where one might reset statistics, if one wanted to. */ frame = step; /* initialize images here */ local_image_address = image_address + image_partition * my_node; local_mask_image_address = mask_image_address + mask_image_partition * my_node; BARRIER(Global->SlaveBarrier,num_nodes); if (my_node == num_nodes-1) { for (i=image_partition*my_node; i<image_length; i++) *local_image_address++ = background; if (adaptive) for (i=mask_image_partition*my_node; i<mask_image_length; i++) *local_mask_image_address++ = NULL_PIXEL; } else { for (i=0; i<image_partition; i++) *local_image_address++ = background; if (adaptive) for (i=0; i<mask_image_partition; i++) *local_mask_image_address++ = NULL_PIXEL; } if (my_node == ROOT) { #ifdef DIM Select_View((float)STEP_SIZE, dim); #else Select_View((float)STEP_SIZE, Y); #endif } BARRIER(Global->SlaveBarrier,num_nodes); Global->Counter = num_nodes; Global->Queue[num_nodes][0] = num_nodes; Global->Queue[my_node][0] = 0; Render(my_node); if (my_node == ROOT) { if (ROTATE_STEPS > 1) { #ifdef DIM sprintf(outfile, "%s_%ld",filename, 1000+dim*ROTATE_STEPS+step); #else sprintf(outfile, 
"%s_%ld.tiff",filename, 1000+step); #endif /* Store_Image(outfile); p = image_address; for (zz = 0;zz < image_length;zz++) { tiff_image[zz] = (long) ((*p)*256*256*256 + (*p)*256*256 + (*p)*256 + (*p)); p++; } tiff_save_rgba(outfile,tiff_image,image_len[X],image_len[Y]); */ WriteGrayscaleTIFF(outfile, image_len[X],image_len[Y],image_len[X], image_address); } else { /* Store_Image(filename); p = image_address; for (zz = 0;zz < image_length;zz++) { tiff_image[zz] = (long) ((*p)*256*256*256 + (*p)*256*256 + (*p)*256 + (*p)); p++; } tiff_save_rgba(filename,tiff_image,image_len[X],image_len[Y]); */ strcat(filename,".tiff"); WriteGrayscaleTIFF(filename, image_len[X],image_len[Y],image_len[X], image_address); } } } #ifdef DIM } #endif }
int main(int argc, char **argv) { /* default values for the control parameters of the driver */ /* are in parameters.h */ if ((argc == 2) && ((strncmp(argv[1],"-h",strlen("-h")) == 0) || (strncmp(argv[1],"-H",strlen("-H")) == 0))) { printf("Usage: WATER-SPATIAL < infile, where the contents of infile can be\nobtained from the comments at the top of water.C and the first scanf \nin main() in water.C\n\n"); exit(0); } /* POSSIBLE ENHANCEMENT: One might bind the first process to a processor here, even before the other (child) processes are bound later in mdmain(). */ six = stdout; TEMP =298.0; RHO =0.9980; /* read input */ if (scanf("%lf%ld%ld%ld%ld%ld%ld%ld%ld%lf",&TSTEP, &NMOL, &NSTEP, &NORDER, &NSAVE, &NRST, &NPRINT, &NFMC,&NumProcs, &CUTOFF) != 10) fprintf(stderr,"ERROR: Usage: water < infile, which must have 10 fields, see SPLASH documentation or comment at top of water.C\n"); printf("Using %ld procs on %ld steps of %ld mols\n", NumProcs, NSTEP, NMOL); printf("Other parameters:\n\tTSTEP = %8.2e\n\tNORDER = %ld\n\tNSAVE = %ld\n",TSTEP,NORDER,NSAVE); printf("\tNRST = %ld\n\tNPRINT = %ld\n\tNFMC = %ld\n\tCUTOFF = %lf\n\n",NRST,NPRINT,NFMC,CUTOFF); /* set up scaling factors and constants */ NORD1=NORDER+1; CNSTNT(NORD1,TLC); /* sub. call to set up constants */ SYSCNS(); /* sub. call to initialize system constants */ printf("%ld boxes with %ld processors\n\n", BOX_PER_SIDE * BOX_PER_SIDE * BOX_PER_SIDE, NumProcs); if (NumProcs > (BOX_PER_SIDE * BOX_PER_SIDE * BOX_PER_SIDE)) { fprintf(stderr,"ERROR: less boxes (%ld) than processors (%ld)\n", BOX_PER_SIDE * BOX_PER_SIDE * BOX_PER_SIDE, NumProcs); fflush(stderr); exit(-1); } fprintf(six,"\nTEMPERATURE = %8.2f K\n",TEMP); fprintf(six,"DENSITY = %8.5f G/C.C.\n",RHO); fprintf(six,"NUMBER OF MOLECULES = %8ld\n",NMOL); fprintf(six,"NUMBER OF PROCESSORS = %8ld\n",NumProcs); fprintf(six,"TIME STEP = %8.2e SEC\n",TSTEP); fprintf(six,"ORDER USED TO SOLVE F=MA = %8ld \n",NORDER); fprintf(six,"NO. 
OF TIME STEPS = %8ld \n",NSTEP); fprintf(six,"FREQUENCY OF DATA SAVING = %8ld \n",NSAVE); fprintf(six,"FREQUENCY TO WRITE RST FILE= %8ld \n",NRST); fflush(six); { /* do memory initializations */ long procnum, i, j, k, l; struct list_of_boxes *temp_box; long xprocs, yprocs, zprocs; long x_inc, y_inc, z_inc; long x_ct, y_ct, z_ct; long x_left, y_left, z_left; long x_first, y_first, z_first; long x_last, y_last, z_last; double proccbrt; long gmem_size = sizeof(struct GlobalMemory); MAIN_INITENV(,40000000,); /* macro call to initialize shared memory etc. */ THREAD_INIT_FREE(); /* Allocate space for main (BOX) data structure as well as * synchronization variables */ start_end = (first_last_array **) G_MALLOC(sizeof(first_last_array *) * NumProcs); for (i=0; i < NumProcs; i++) { start_end[i] = (first_last_array *) G_MALLOC(sizeof(first_last_array)); } /* Calculate start and finish box numbers for processors */ xprocs = 0; yprocs = 0; proccbrt = (double) pow((double) NumProcs, 1.0/3.0) + 0.00000000000001; j = (long) proccbrt; if (j<1) j = 1; while ((xprocs == 0) && (j>0)) { k = (long) sqrt((double) (NumProcs / j)); if (k<1) k=1; while ((yprocs == 0) && (k>0)) { l = NumProcs/(j*k); if ((j*k*l) == NumProcs) { xprocs = j; yprocs = k; zprocs = l; } /* if */ k--; } /* while yprocs && k */ j--; } /* while xprocs && j */ printf("xprocs = %ld\typrocs = %ld\tzprocs = %ld\n", xprocs, yprocs, zprocs); fflush(stdout); /* Fill in start_end array values */ procnum = 0; x_inc = BOX_PER_SIDE/xprocs; y_inc = BOX_PER_SIDE/yprocs; z_inc = BOX_PER_SIDE/zprocs; x_left = BOX_PER_SIDE - (xprocs*x_inc); y_left = BOX_PER_SIDE - (yprocs*y_inc); z_left = BOX_PER_SIDE - (zprocs*z_inc); printf("x_inc = %ld\t y_inc = %ld\t z_inc = %ld\n",x_inc,y_inc,z_inc); printf("x_left = %ld\t y_left = %ld\t z_left = %ld\n",x_left,y_left,z_left); fflush(stdout); x_first = 0; x_ct = x_left; x_last = -1; x_inc++; for (i=0; i<xprocs; i++) { y_ct = y_left; if (x_ct == 0) x_inc--; x_last += x_inc; y_first = 0; y_last = 
-1; y_inc++; for (j=0; j<yprocs; j++) { z_ct = z_left; if (y_ct == 0) y_inc--; y_last += y_inc; z_first = 0; z_last = -1; z_inc++; for (k=0; k<zprocs; k++) { if (z_ct == 0) z_inc--; z_last += z_inc; start_end[procnum]->box[XDIR][FIRST] = x_first; start_end[procnum]->box[XDIR][LAST] = min(x_last, BOX_PER_SIDE - 1); start_end[procnum]->box[YDIR][FIRST] = y_first; start_end[procnum]->box[YDIR][LAST] = min(y_last, BOX_PER_SIDE - 1); start_end[procnum]->box[ZDIR][FIRST] = z_first; start_end[procnum]->box[ZDIR][LAST] = min(z_last, BOX_PER_SIDE - 1); z_first = z_last + 1; z_ct--; procnum++; } y_first = y_last + 1; y_ct--; } x_first = x_last + 1; x_ct--; } /* Allocate space for my_boxes array */ my_boxes = (box_list **) G_MALLOC(NumProcs * sizeof(box_list *)); /* Set all box ptrs to null */ for (i=0; i<NumProcs; i++) my_boxes[i] = NULL; /* Set up links for all boxes for initial interf and intraf */ temp_box = my_boxes[0]; while (temp_box) { temp_box = temp_box->next_box; } /* Allocate space for BOX array */ BOX = (box_type ***) G_MALLOC(BOX_PER_SIDE * sizeof(box_type **)); for (i=0; i < BOX_PER_SIDE; i++) { BOX[i] = (box_type **) G_MALLOC( BOX_PER_SIDE * sizeof(box_type *)); for (j=0; j < BOX_PER_SIDE; j++) { BOX[i][j] = (box_type *) G_MALLOC(BOX_PER_SIDE * sizeof(box_type)); for (k=0; k < BOX_PER_SIDE; k++) { BOX[i][j][k].list = NULL; LOCKINIT(BOX[i][j][k].boxlock); } } } /* for i */ gl = (struct GlobalMemory *) G_MALLOC(gmem_size); /* macro calls to initialize synch variables */ BARINIT(gl->start, NumProcs); BARINIT(gl->InterfBar, NumProcs); BARINIT(gl->PotengBar, NumProcs); LOCKINIT(gl->IOLock); LOCKINIT(gl->IndexLock); LOCKINIT(gl->IntrafVirLock); LOCKINIT(gl->InterfVirLock); LOCKINIT(gl->KinetiSumLock); LOCKINIT(gl->PotengSumLock); } fprintf(six,"SPHERICAL CUTOFF RADIUS = %8.4f ANGSTROM\n",CUTOFF); fflush(six); IRST=0; /* call initialization routine */ INITIA(); gl->tracktime = 0; gl->intratime = 0; gl->intertime = 0; /* initialize Index to 1 so that the first created 
child gets id 1, not 0 */ gl->Index = 1; if (NSAVE > 0) { /* not true for input decks provided */ fprintf(six,"COLLECTING X AND V DATA AT EVERY %4ld TIME STEPS \n",NSAVE); } /* spawn helper processes */ CLOCK(gl->computestart); // Enable Models at the start of parallel execution CarbonEnableModels(); CREATE(WorkStart, NumProcs); /* macro to make main process wait for all others to finish */ WAIT_FOR_END(NumProcs); // Disable Models at the end of parallel execution CarbonDisableModels(); CLOCK(gl->computeend); printf("COMPUTESTART (after initialization) = %lu\n",gl->computestart); printf("COMPUTEEND = %lu\n",gl->computeend); printf("COMPUTETIME (after initialization) = %lu\n",gl->computeend-gl->computestart); printf("Measured Time (2nd timestep onward) = %lu\n",gl->tracktime); printf("Intramolecular time only (2nd timestep onward) = %lu\n",gl->intratime); printf("Intermolecular time only (2nd timestep onward) = %lu\n",gl->intertime); printf("Other time (2nd timestep onward) = %lu\n",gl->tracktime - gl->intratime - gl->intertime); printf("\nExited Happily with XTT = %g (note: XTT value is garbage if NPRINT > NSTEP)\n", XTT); MAIN_END; } /* main.c */
void ParallelExecute () { long my_id; long num_boxes; unsigned long start, finish = 0; time_info *local_time; long time_all = 0; time_info *timing; unsigned long local_init_done = 0; THREAD_INIT_FREE(); BARINCLUDE(G_Memory->synch); local_time = (time_info *) malloc(sizeof(struct _Time_Info) * MAX_TIME_STEPS); BARRIER(G_Memory->synch, Number_Of_Processors); LOCK(G_Memory->count_lock); my_id = G_Memory->id; G_Memory->id++; UNLOCK(G_Memory->count_lock); /* POSSIBLE ENHANCEMENT: Here is where one might pin processes to processors to avoid migration */ if (my_id == 0) { time_all = 1; } else if (do_stats) { time_all = 1; } if (my_id == 0) { /* have to allocate extra space since it will construct the grid by * itself for the first time step */ CreateParticleList(my_id, Total_Particles); InitParticleList(my_id, Total_Particles, 0); } else { CreateParticleList(my_id, ((Total_Particles * PDF) / Number_Of_Processors)); InitParticleList(my_id, 0, 0); } num_boxes = 1.333 * (Total_Particles / (OCCUPANCY * MAX_PARTICLES_PER_BOX)); if (my_id == 0) CreateBoxes(my_id, TOLERANCE * num_boxes); else CreateBoxes(my_id, TOLERANCE * num_boxes * BDF / Number_Of_Processors); if (my_id == 0) { LockedPrint("Starting FMM with %d processor%s\n", Number_Of_Processors, (Number_Of_Processors == 1) ? 
"" : "s"); } BARRIER(G_Memory->synch, Number_Of_Processors); Local[my_id].Time = 0.0; for (MY_TIME_STEP = 0; MY_TIME_STEP < Time_Steps; MY_TIME_STEP++) { if (MY_TIME_STEP == 2) { /* POSSIBLE ENHANCEMENT: Here is where one might reset the statistics that one is measuring about the parallel execution */ // Reset Models CarbonEnableModels(); } if (MY_TIME_STEP == 2) { if (do_stats || my_id == 0) { CLOCK(local_init_done); } } if (MY_TIME_STEP == 0) { CLOCK(start); } else start = finish; ConstructGrid(my_id,local_time,time_all); ConstructLists(my_id,local_time,time_all); PartitionGrid(my_id,local_time,time_all); StepSimulation(my_id,local_time,time_all); DestroyGrid(my_id,local_time,time_all); CLOCK(finish); Local[my_id].Time += Timestep_Dur; MY_TIMING[MY_TIME_STEP].total_time = finish - start; } if (my_id == 0) { CLOCK(endtime); } BARRIER(G_Memory->synch, Number_Of_Processors); for (MY_TIME_STEP = 0; MY_TIME_STEP < Time_Steps; MY_TIME_STEP++) { timing = &(MY_TIMING[MY_TIME_STEP]); timing->other_time = local_time[MY_TIME_STEP].other_time; timing->construct_time = local_time[MY_TIME_STEP].construct_time; timing->list_time = local_time[MY_TIME_STEP].list_time; timing->partition_time = local_time[MY_TIME_STEP].partition_time; timing->pass_time = local_time[MY_TIME_STEP].pass_time; timing->inter_time = local_time[MY_TIME_STEP].inter_time; timing->barrier_time = local_time[MY_TIME_STEP].barrier_time; timing->intra_time = local_time[MY_TIME_STEP].intra_time; } Local[my_id].init_done_times = local_init_done; BARRIER(G_Memory->synch, Number_Of_Processors); }