/* * InitGlobalMemory () * * Args : none. * * Returns : nothing. * * Side Effects : Allocates all the global storage for G_Memory. * */ void InitGlobalMemory () { int i; G_Memory = (g_mem *) G_MALLOC(sizeof(g_mem)); G_Memory->i_array = (int *) G_MALLOC(Number_Of_Processors * sizeof(int)); G_Memory->d_array = (double *) G_MALLOC(Number_Of_Processors * sizeof(double)); if (G_Memory == NULL) { printf("Ran out of global memory in InitGlobalMemory\n"); exit(-1); } G_Memory->count = 0; G_Memory->id = 0; LOCKINIT(G_Memory->io_lock); LOCKINIT(G_Memory->mal_lock); LOCKINIT(G_Memory->single_lock); LOCKINIT(G_Memory->count_lock); ALOCKINIT(G_Memory->lock_array, MAX_LOCKS); BARINIT(G_Memory->synch); BARINIT(G_Memory->done_barrier); G_Memory->max_x = -MAX_REAL; G_Memory->min_x = MAX_REAL; G_Memory->max_y = -MAX_REAL; G_Memory->min_y = MAX_REAL; }
/*
 * init_taskq()
 *
 * Splits global->task_buf into consecutive slices, one per task queue, and
 * links each slice into that queue's free list.  Every queue starts with an
 * empty task list (top/tail/n_tasks cleared) and freshly initialized locks.
 * The calling process's task-queue index and local free list are reset too.
 *
 * process_id : index of the caller's entry in task_struct[].
 *
 * Each queue is offered ceil(MAX_TASKS / n_taskqueues) entries.  Because of
 * the rounding up, the buffer can run out before the last queue is reached
 * (e.g. 10 tasks over 8 queues gives 2 each, leaving queues 5..7 with
 * nothing).  Such a queue gets an empty free list (free == 0, n_free == 0).
 * Without this guard the code would write task_buf[MAX_TASKS].next, which is
 * presumably one past the end of the buffer, and publish that address as a
 * usable free task.
 */
void init_taskq(long process_id)
{
    long i ;
    long qid ;
    long task_index = 0 ;
    long task_per_queue ;
    long n_tasks ;

    /* Reset task assignment index */
    task_struct[process_id].crnt_taskq_id = 0 ;

    /* Initialize task queues */
    task_per_queue = (MAX_TASKS + n_taskqueues - 1) / n_taskqueues ;

    for( qid = 0 ; qid < n_taskqueues ; qid++ )
        {
            /* Initialize free list */
            if (task_index + task_per_queue > MAX_TASKS )
                n_tasks = MAX_TASKS - task_index ;
            else
                n_tasks = task_per_queue ;

            if( n_tasks > 0 )
                {
                    /* Chain the slice; its last entry terminates the list */
                    for( i = task_index ; i < task_index + n_tasks - 1 ; i++ )
                        global->task_buf[i].next = &global->task_buf[i+1] ;
                    global->task_buf[ i ].next = 0 ;

                    global->task_queue[ qid ].free = &global->task_buf[ task_index ] ;
                }
            else
                {
                    /* Buffer exhausted: this queue has no free tasks */
                    global->task_queue[ qid ].free = 0 ;
                }
            global->task_queue[ qid ].n_free = n_tasks ;

            /* Initialize task queue */
            global->task_queue[ qid ].top     = 0 ;
            global->task_queue[ qid ].tail    = 0 ;
            global->task_queue[ qid ].n_tasks = 0 ;

            /* Initialize locks */
            LOCKINIT(global->task_queue[ qid ].q_lock);
            LOCKINIT(global->task_queue[ qid ].f_lock);

            /* Update index for next queue */
            task_index += n_tasks ;
        }

    /* Initialize local free lists */
    task_struct[process_id].n_local_free_task = 0 ;
    task_struct[process_id].local_free_task = 0 ;
}
/*
 * init_interactionlist()
 *
 * Threads every entry of global->interaction_buf onto a single free list,
 * records the list head and length in global, and initializes the lock that
 * guards the list.  process_id is accepted for symmetry with the other
 * init_* routines and is not used here.
 */
void init_interactionlist(long process_id)
{
    long idx ;
    long last = MAX_INTERACTIONS-1 ;

    /* Every entry points at its successor; the final one ends the chain */
    for( idx = 0 ; idx <= last ; idx++ )
        {
            if( idx < last )
                global->interaction_buf[idx].next = &global->interaction_buf[idx+1] ;
            else
                global->interaction_buf[idx].next = 0 ;
        }

    /* Publish the list: head, element count, and its lock */
    global->free_interaction = global->interaction_buf ;
    global->n_free_interactions = MAX_INTERACTIONS ;
    LOCKINIT(global->free_interaction_lock);
}
/*
 * init_elemlist()
 *
 * Builds the free list of elements out of global->element_buf.  The list is
 * threaded through each element's `center` field, and every element is given
 * a lock obtained from get_sharedlock(SHARED_LOCK_SEG1, process_id).  The
 * list head, its length and its guarding lock are then set up in global.
 *
 * process_id : forwarded to get_sharedlock().
 */
void init_elemlist(long process_id)
{
    long idx ;
    long last = MAX_ELEMENTS-1 ;

    /* Visit the elements in ascending order so the locks are handed out in
       the same sequence as before; the last element terminates the list. */
    for( idx = 0 ; idx <= last ; idx++ )
        {
            if( idx < last )
                global->element_buf[idx].center = &global->element_buf[idx+1] ;
            else
                global->element_buf[idx].center = 0 ;

            /* Initialize lock variable */
            global->element_buf[idx].elem_lock
                = get_sharedlock( SHARED_LOCK_SEG1, process_id ) ;
        }

    /* Publish the list: head, element count, and its lock */
    global->free_element = global->element_buf ;
    global->n_free_elements = MAX_ELEMENTS ;
    LOCKINIT(global->free_element_lock);
}
/*
 * init_global()
 *
 * Brings the shared `global` structure to its start-up state: BSP tree
 * bookkeeping, radiosity statistics, cost sums, the barrier and its helper
 * counter, the task counter, and finally the task queues, free lists and
 * statistics set up by the init_* helpers.
 *
 * process_id : passed through unchanged to every helper.
 */
void init_global(long process_id)
{
    /* BSP tree: index starts at 1 and there is no root yet */
    global->index = 1;
    global->bsp_root = 0 ;
    LOCKINIT(global->index_lock);
    LOCKINIT(global->bsp_tree_lock);

    /* Radiosity statistics: previous energy 0, current energy and patch
       area 1, not yet converged */
    LOCKINIT(global->avg_radiosity_lock);
    global->converged = 0 ;
    global->prev_total_energy.r =
        global->prev_total_energy.g =
        global->prev_total_energy.b = 0.0 ;
    global->total_energy.r =
        global->total_energy.g =
        global->total_energy.b = 1.0 ;
    global->total_patch_area = 1.0 ;
    global->iteration_count = -1 ;   /* init_ray_task() increments to 0 */

    /* Cost sums */
    LOCKINIT(global->cost_sum_lock);
    global->cost_sum = 0 ;
    global->cost_estimate_sum = 0 ;

    /* Barrier over all processors, plus its lock-protected counter */
    BARINIT(global->barrier, n_processors);
    LOCKINIT(global->pbar_lock);
    global->pbar_count = 0 ;

    /* Task counter */
    global->task_counter = 0 ;
    LOCKINIT(global->task_counter_lock);

    /* Task queues */
    init_taskq(process_id) ;

    /* Patch, element, interaction, element-vertex and edge storage */
    init_patchlist(process_id) ;
    init_elemlist(process_id) ;
    init_interactionlist(process_id) ;
    init_elemvertex(process_id) ;
    init_edge(process_id) ;

    /* Statistical info */
    init_stat_info(process_id) ;
}
/*
 * main() -- driver of the OCEAN program.
 *
 * Parses the command line, allocates and initializes the shared structures,
 * derives the grid size of every multigrid level and the index ranges each
 * process works on, hands control to `slave` through CREATE()/WAIT_FOR_END(),
 * and finally prints process and timing statistics.
 *
 * Options (per the getopt string and the -h text): -n grid size,
 * -p processors, -e tolerance, -r resolution, -t timestep, -s statistics,
 * -o residual output, -h help.
 *
 * NOTE(review): CLOCK, MAIN_INITENV, G_MALLOC, LOCKINIT, BARINIT, CREATE,
 * WAIT_FOR_END and MAIN_END are macros defined outside this block.  Several
 * are used without a trailing semicolon, so they presumably expand to
 * complete statements -- confirm against the macro set in use.  The function
 * has no explicit return; presumably MAIN_END supplies it.
 */
int main(int argc, char *argv[])
{
   long i;
   long j;
   long xextra;
   long xportion;
   long yextra;
   long yportion;
   long lower;
   double procsqrt;
   long k;
   long logtest;
   long my_num;
   unsigned long computeend;
   double min_total;
   double max_total;
   double avg_total;
   double min_multi;
   double max_multi;
   double avg_multi;
   double min_frac;
   double max_frac;
   double avg_frac;
   extern char *optarg;
   long ch;
   unsigned long start;

   /* Timestamp taken before option parsing; stored in global->starttime
      only after the workers have finished (see the timing report below). */
   CLOCK(start)

   while ((ch = getopt(argc, argv, "n:p:e:r:t:soh")) != -1) {
     switch(ch) {
     case 'n': im = atoi(optarg);
               if (im > IMAX) {
                 printerr("Max grid size exceeded\n");
                 exit(-1);
               }
               if (log_2(im-2) == -1) {
                 printerr("Grid must be ((power of 2)+2) in each dimension\n");
                 exit(-1);
               }
               break;
     case 'p': nprocs = atoi(optarg);
               if (nprocs < 1) {
                 printerr("P must be >= 1\n");
                 exit(-1);
               }
               if (log_2(nprocs) == -1) {
                 printerr("P must be a power of 2\n");
                 exit(-1);
               }
               break;
     case 'e': tolerance = atof(optarg); break;
     case 'r': res = atof(optarg); break;
     case 't': dtau = atof(optarg); break;
     /* -s and -o toggle their flags rather than setting them */
     case 's': do_stats = !do_stats; break;
     case 'o': do_output = !do_output; break;
     case 'h': printf("Usage: OCEAN <options>\n\n");
               printf("options:\n");
               printf(" -nN : Simulate NxN ocean. N must be (power of 2)+2.\n");
               printf(" -pP : P = number of processors. P must be power of 2.\n");
               printf(" -eE : E = error tolerance for iterative relaxation.\n");
               printf(" -rR : R = distance between grid points in meters.\n");
               printf(" -tT : T = timestep in seconds.\n");
               printf(" -s : Print timing statistics.\n");
               printf(" -o : Print out relaxation residual values.\n");
               printf(" -h : Print out command line options.\n\n");
               printf("Default: OCEAN -n%1d -p%1d -e%1g -r%1g -t%1g\n",
                      DEFAULT_N,DEFAULT_P,DEFAULT_E,DEFAULT_R,DEFAULT_T);
               exit(0);
               break;
     }
   }

   MAIN_INITENV(,60000000)

   /* Count the multigrid levels: halve im-2 until it reaches 1, adding one
      level per halving; an odd intermediate value is an error. */
   logtest = im-2;
   numlev = 1;
   while (logtest != 1) {
     if (logtest%2 != 0) {
       printerr("Cannot determine number of multigrid levels\n");
       exit(-1);
     }
     logtest = logtest / 2;
     numlev++;
   }

   if (numlev > MAX_LEVELS) {
     printerr("Max grid levels exceeded for multigrid\n");
     exit(-1);
   }

   /* The grid is square: jm always equals im */
   jm = im;
   printf("\n");
   printf("Ocean simulation with W-cycle multigrid solver\n");
   printf(" Processors : %1ld\n",nprocs);
   printf(" Grid size : %1ld x %1ld\n",im,jm);
   printf(" Grid resolution (meters) : %0.2f\n",res);
   printf(" Time between relaxations (seconds) : %0.0f\n",dtau);
   printf(" Error tolerance : %0.7g\n",tolerance);
   printf("\n");

   /* Per-process records; nprocs+1 entries are allocated but only the
      first nprocs are initialized here.  None of the allocations below is
      checked for failure. */
   gp = (struct Global_Private *) G_MALLOC((nprocs+1)*sizeof(struct Global_Private));
   for (i=0;i<nprocs;i++) {
     gp[i].multi_time = 0;
     gp[i].total_time = 0;
   }
   global = (struct global_struct *) G_MALLOC(sizeof(struct global_struct));
   fields = (struct fields_struct *) G_MALLOC(sizeof(struct fields_struct));
   fields2 = (struct fields2_struct *) G_MALLOC(sizeof(struct fields2_struct));
   wrk1 = (struct wrk1_struct *) G_MALLOC(sizeof(struct wrk1_struct));
   wrk3 = (struct wrk3_struct *) G_MALLOC(sizeof(struct wrk3_struct));
   wrk2 = (struct wrk2_struct *) G_MALLOC(sizeof(struct wrk2_struct));
   wrk4 = (struct wrk4_struct *) G_MALLOC(sizeof(struct wrk4_struct));
   wrk6 = (struct wrk6_struct *) G_MALLOC(sizeof(struct wrk6_struct));
   wrk5 = (struct wrk5_struct *) G_MALLOC(sizeof(struct wrk5_struct));
   frcng = (struct frcng_struct *) G_MALLOC(sizeof(struct frcng_struct));
   iter = (struct iter_struct *) G_MALLOC(sizeof(struct iter_struct));
   guess = (struct guess_struct *) G_MALLOC(sizeof(struct guess_struct));
   multi = (struct multi_struct *) G_MALLOC(sizeof(struct multi_struct));
   locks = (struct locks_struct *) G_MALLOC(sizeof(struct locks_struct));
   bars = (struct bars_struct *) G_MALLOC(sizeof(struct bars_struct));

   /* Locks and barriers used by the worker processes */
   LOCKINIT(locks->idlock)
   LOCKINIT(locks->psiailock)
   LOCKINIT(locks->psibilock)
   LOCKINIT(locks->donelock)
   LOCKINIT(locks->error_lock)
   LOCKINIT(locks->bar_lock)

   BARINIT(bars->iteration)
   BARINIT(bars->gsudn)
   BARINIT(bars->p_setup)
   BARINIT(bars->p_redph)
   BARINIT(bars->p_soln)
   BARINIT(bars->p_subph)
   BARINIT(bars->sl_prini)
   BARINIT(bars->sl_psini)
   BARINIT(bars->sl_onetime)
   BARINIT(bars->sl_phase_1)
   BARINIT(bars->sl_phase_2)
   BARINIT(bars->sl_phase_3)
   BARINIT(bars->sl_phase_4)
   BARINIT(bars->sl_phase_5)
   BARINIT(bars->sl_phase_6)
   BARINIT(bars->sl_phase_7)
   BARINIT(bars->sl_phase_8)
   BARINIT(bars->sl_phase_9)
   BARINIT(bars->sl_phase_10)
   BARINIT(bars->error_barrier)

   /* Level numlev-1 is the full-size grid.  Only that level gets a
      tolerance here. */
   imx[numlev-1] = im;
   jmx[numlev-1] = jm;
   lev_res[numlev-1] = res;
   lev_tol[numlev-1] = tolerance;
   multi->err_multi = 0.0;
   multi->numspin = 0;
   for (i=0;i<nprocs;i++) {
     multi->spinflag[i] = 0;
   }

   /* Each lower level halves the interior size (the +2 is kept) and
      doubles the grid spacing. */
   for (i=numlev-2;i>=0;i--) {
     imx[i] = ((imx[i+1] - 2) / 2) + 2;
     jmx[i] = ((jmx[i+1] - 2) / 2) + 2;
     lev_res[i] = lev_res[i+1] * 2;
   }

   /* Factor nprocs into xprocs * yprocs with xprocs <= yprocs, trying
      divisors downward from floor(sqrt(nprocs)) and keeping the first
      exact one. */
   xprocs = 0;
   yprocs = 0;
   procsqrt = sqrt((double) nprocs);
   j = (long) procsqrt;
   while ((xprocs == 0) && (j > 0)) {
     k = nprocs / j;
     if (k * j == nprocs) {
       if (k > j) {
         xprocs = j;
         yprocs = k;
       } else {
         xprocs = k;
         yprocs = j;
       }
     }
     j--;
   }
   if (xprocs == 0) {
     printerr("Could not find factors for subblocking\n");
     exit(-1);
   }

/* Determine starting coord and number of points to process in */
/* each direction */

   /* Process number is k*xprocs+j (x loop) or j*xprocs+k (y loop), i.e.
      row-major over the xprocs x yprocs process grid.  When the interior
      size does not divide evenly, the first `extra` positions along that
      direction get one additional point. */
   for (i=0;i<numlev;i++) {
     xportion = (jmx[i] - 2) / xprocs;
     xextra = (jmx[i] - 2) % xprocs;
     for (j=0;j<xprocs;j++) {
       if (xextra == 0) {
         for (k=0;k<yprocs;k++) {
           gp[k*xprocs+j].rel_start_x[i] = j * xportion + 1;
           gp[k*xprocs+j].rel_num_x[i] = xportion;
         }
       } else {
         if (j + 1 > xextra) {
           for (k=0;k<yprocs;k++) {
             lower = xextra * (xportion + 1);
             gp[k*xprocs+j].rel_start_x[i] = lower + (j - xextra) * xportion + 1;
             gp[k*xprocs+j].rel_num_x[i] = xportion;
           }
         } else {
           for (k=0;k<yprocs;k++) {
             gp[k*xprocs+j].rel_start_x[i] = j * (xportion + 1) + 1;
             gp[k*xprocs+j].rel_num_x[i] = xportion + 1;
           }
         }
       }
     }
     yportion = (imx[i] - 2) / yprocs;
     yextra = (imx[i] - 2) % yprocs;
     for (j=0;j<yprocs;j++) {
       if (yextra == 0) {
         for (k=0;k<xprocs;k++) {
           gp[j*xprocs+k].rel_start_y[i] = j * yportion + 1;
           gp[j*xprocs+k].rel_num_y[i] = yportion;
         }
       } else {
         if (j + 1 > yextra) {
           for (k=0;k<xprocs;k++) {
             lower = yextra * (yportion + 1);
             gp[j*xprocs+k].rel_start_y[i] = lower + (j - yextra) * yportion + 1;
             gp[j*xprocs+k].rel_num_y[i] = yportion;
           }
         } else {
           for (k=0;k<xprocs;k++) {
             gp[j*xprocs+k].rel_start_y[i] = j * (yportion + 1) + 1;
             gp[j*xprocs+k].rel_num_y[i] = yportion + 1;
           }
         }
       }
     }
   }

   /* The two [0] assignments are overwritten by the first pass of the
      loop that follows. */
   i_int_coeff[0] = 0.0;
   j_int_coeff[0] = 0.0;
   for (i=0;i<numlev;i++) {
     i_int_coeff[i] = 1.0/(imx[i]-1);
     j_int_coeff[i] = 1.0/(jmx[i]-1);
   }

   /* Derive the inclusive start/end index sets of every process on every
      level from rel_start_* and rel_num_*.  The rl*, i* and p* sets start
      out identical. */
   for (my_num=0;my_num<nprocs;my_num++) {
     for (i=0;i<numlev;i++) {
       gp[my_num].rlist[i] = gp[my_num].rel_start_y[i];
       gp[my_num].rljst[i] = gp[my_num].rel_start_x[i];
       gp[my_num].rlien[i] = gp[my_num].rlist[i] + gp[my_num].rel_num_y[i] - 1;
       gp[my_num].rljen[i] = gp[my_num].rljst[i] + gp[my_num].rel_num_x[i] - 1;
       gp[my_num].iist[i] = gp[my_num].rel_start_y[i];
       gp[my_num].ijst[i] = gp[my_num].rel_start_x[i];
       gp[my_num].iien[i] = gp[my_num].iist[i] + gp[my_num].rel_num_y[i] - 1;
       gp[my_num].ijen[i] = gp[my_num].ijst[i] + gp[my_num].rel_num_x[i] - 1;
       gp[my_num].pist[i] = gp[my_num].rel_start_y[i];
       gp[my_num].pjst[i] = gp[my_num].rel_start_x[i];
       gp[my_num].pien[i] = gp[my_num].pist[i] + gp[my_num].rel_num_y[i] - 1;
       gp[my_num].pjen[i] = gp[my_num].pjst[i] + gp[my_num].rel_num_x[i] - 1;

       /* A p* range that touches the first or last interior index is
          widened by one to take in index 0 or the last index. */
       if (gp[my_num].pist[i] == 1) {
         gp[my_num].pist[i] = 0;
       }
       if (gp[my_num].pjst[i] == 1) {
         gp[my_num].pjst[i] = 0;
       }
       if (gp[my_num].pien[i] == imx[i] - 2) {
         gp[my_num].pien[i] = imx[i]-1;
       }
       if (gp[my_num].pjen[i] == jmx[i] - 2) {
         gp[my_num].pjen[i] = jmx[i]-1;
       }

       /* eist/ejst: first even index at or after the rl* start;
          oist/ojst: first odd index at or after it. */
       if (gp[my_num].rlist[i] % 2 == 0) {
         gp[my_num].eist[i] = gp[my_num].rlist[i];
         gp[my_num].oist[i] = gp[my_num].rlist[i] + 1;
       } else {
         gp[my_num].eist[i] = gp[my_num].rlist[i] + 1;
         gp[my_num].oist[i] = gp[my_num].rlist[i];
       }
       if (gp[my_num].rljst[i] % 2 == 0) {
         gp[my_num].ejst[i] = gp[my_num].rljst[i];
         gp[my_num].ojst[i] = gp[my_num].rljst[i] + 1;
       } else {
         gp[my_num].ejst[i] = gp[my_num].rljst[i] + 1;
         gp[my_num].ojst[i] = gp[my_num].rljst[i];
       }

       /* An rl* range ending on the last interior index is shortened by
          one.  The *est values are then picked by the parity of the
          shortened end; they are only assigned inside these branches. */
       if (gp[my_num].rlien[i] == imx[i]-2) {
         gp[my_num].rlien[i] = gp[my_num].rlien[i] - 1;
         if (gp[my_num].rlien[i] % 2 == 0) {
           gp[my_num].ojest[i] = gp[my_num].ojst[i];
           gp[my_num].ejest[i] = gp[my_num].ejst[i];
         } else {
           gp[my_num].ojest[i] = gp[my_num].ejst[i];
           gp[my_num].ejest[i] = gp[my_num].ojst[i];
         }
       }
       if (gp[my_num].rljen[i] == jmx[i]-2) {
         gp[my_num].rljen[i] = gp[my_num].rljen[i] - 1;
         if (gp[my_num].rljen[i] % 2 == 0) {
           gp[my_num].oiest[i] = gp[my_num].oist[i];
           gp[my_num].eiest[i] = gp[my_num].eist[i];
         } else {
           gp[my_num].oiest[i] = gp[my_num].eist[i];
           gp[my_num].eiest[i] = gp[my_num].oist[i];
         }
       }
     }
   }

/* initialize constants and variables

   id is a global shared variable that has fetch-and-add operations
   performed on it by processes to obtain their pids. */

   global->id = 0;
   global->psibi = 0.0;
   /* pi = 4 * atan(1) */
   pi = atan(1.0);
   pi = 4.*pi;

   factjacob = -1./(12.*res*res);
   factlap = 1./(res*res);
   eig2 = -h*f0*f0/(h1*h3*gpr);

   jmm1 = jm-1 ;
   ysca = ((double) jmm1)*res ;

   for (i=0;i<im;i++) {
     for (j=0;j<jm;j++) {
       guess->oldga[i][j] = 0.0;
       guess->oldgb[i][j] = 0.0;
     }
   }

   if (do_output) {
     printf(" MULTIGRID OUTPUTS\n");
   }

   /* Run `slave` on nprocs processes and wait for all of them
      (NOTE(review): exact semantics depend on the CREATE/WAIT_FOR_END
      macro definitions). */
   CREATE(slave, nprocs);
   WAIT_FOR_END(nprocs);
   CLOCK(computeend)

   /* Process 0's line is always printed; the remaining processes and the
      avg/min/max summary only with do_stats.  The fractions divide by
      total_time without checking it for zero. */
   printf("\n");
   printf(" PROCESS STATISTICS\n");
   printf(" Total Multigrid Multigrid\n");
   printf(" Proc Time Time Fraction\n");
   printf(" 0 %15.0f %15.0f %10.3f\n",
          gp[0].total_time,gp[0].multi_time,
          gp[0].multi_time/gp[0].total_time);

   if (do_stats) {
     min_total = max_total = avg_total = gp[0].total_time;
     min_multi = max_multi = avg_multi = gp[0].multi_time;
     min_frac = max_frac = avg_frac = gp[0].multi_time/gp[0].total_time;
     for (i=1;i<nprocs;i++) {
       if (gp[i].total_time > max_total) {
         max_total = gp[i].total_time;
       }
       if (gp[i].total_time < min_total) {
         min_total = gp[i].total_time;
       }
       if (gp[i].multi_time > max_multi) {
         max_multi = gp[i].multi_time;
       }
       if (gp[i].multi_time < min_multi) {
         min_multi = gp[i].multi_time;
       }
       if (gp[i].multi_time/gp[i].total_time > max_frac) {
         max_frac = gp[i].multi_time/gp[i].total_time;
       }
       if (gp[i].multi_time/gp[i].total_time < min_frac) {
         min_frac = gp[i].multi_time/gp[i].total_time;
       }
       avg_total += gp[i].total_time;
       avg_multi += gp[i].multi_time;
       avg_frac += gp[i].multi_time/gp[i].total_time;
     }
     avg_total = avg_total / nprocs;
     avg_multi = avg_multi / nprocs;
     avg_frac = avg_frac / nprocs;
     for (i=1;i<nprocs;i++) {
       printf(" %3ld %15.0f %15.0f %10.3f\n",
              i, gp[i].total_time, gp[i].multi_time,
              gp[i].multi_time/gp[i].total_time);
     }
     printf(" Avg %15.0f %15.0f %10.3f\n",
            avg_total,avg_multi,avg_frac);
     printf(" Min %15.0f %15.0f %10.3f\n",
            min_total,min_multi,min_frac);
     printf(" Max %15.0f %15.0f %10.3f\n",
            max_total,max_multi,max_frac);
   }
   printf("\n");

   /* global->trackstart is not written in this function; presumably the
      workers set it -- TODO confirm. */
   global->starttime = start;
   printf(" TIMING INFORMATION\n");
   printf("Start time : %16lu\n",
          global->starttime);
   printf("Initialization finish time : %16lu\n",
          global->trackstart);
   printf("Overall finish time : %16lu\n",
          computeend);
   printf("Total time with initialization : %16lu\n",
          computeend-global->starttime);
   printf("Total time without initialization : %16lu\n",
          computeend-global->trackstart);
   printf(" (excludes first timestep)\n");
   printf("\n");

   MAIN_END
}
int main(int argc, CHAR *argv[]) { INT i; UINT begin; UINT end; UINT lapsed; MATRIX vtrans, Vinv; /* View transformation and inverse. */ /* * First, process command line arguments. */ i = 1; while ((i < argc) && (argv[i][0] == '-')) { switch (argv[i][1]) { case '?': case 'h': case 'H': Usage(); exit(1); case 'a': case 'A': AntiAlias = TRUE; if (argv[i][2] != '\0') { NumSubRays = atoi(&argv[i][2]); } else { NumSubRays = atoi(&argv[++i][0]); } break; case 'm': if (argv[i][2] != '\0') { MaxGlobMem = atoi(&argv[i][2]); } else { MaxGlobMem = atoi(&argv[++i][0]); } break; case 'p': if (argv[i][2] != '\0') { nprocs = atoi(&argv[i][2]); } else { nprocs = atoi(&argv[++i][0]); } break; case 's': case 'S': dostats = TRUE; break; default: fprintf(stderr, "%s: Invalid option \'%c\'.\n", ProgName, argv[i][0]); exit(1); } i++; } if (i == argc) { Usage(); exit(1); } /* * Make sure nprocs is within valid range. */ if (nprocs < 1 || nprocs > MAX_PROCS) { fprintf(stderr, "%s: Valid range for #processors is [1, %d].\n", ProgName, MAX_PROCS); exit(1); } /* * Print command line parameters. */ printf("\n"); printf("Number of processors: \t%ld\n", nprocs); printf("Global shared memory size:\t%ld MB\n", MaxGlobMem); printf("Samples per pixel: \t%ld\n", NumSubRays); printf("\n"); /* * Initialize the shared memory environment and request the total * amount of amount of shared memory we might need. This * includes memory for the database, grid, and framebuffer. */ MaxGlobMem <<= 20; /* Convert MB to bytes. */ MAIN_INITENV(,MaxGlobMem + 512*1024) THREAD_INIT_FREE(); gm = (GMEM *)G_MALLOC(sizeof(GMEM)); /* * Perform shared environment initializations. */ gm->nprocs = nprocs; gm->pid = 0; gm->rid = 1; BARINIT(gm->start, nprocs) LOCKINIT(gm->pidlock) LOCKINIT(gm->ridlock) LOCKINIT(gm->memlock) ALOCKINIT(gm->wplock, nprocs) /* POSSIBLE ENHANCEMENT: Here is where one might distribute the raystruct data structure across physically distributed memories as desired. 
*/ if (!GlobalHeapInit(MaxGlobMem)) { fprintf(stderr, "%s: Cannot initialize global heap.\n", ProgName); exit(1); } /* * Initialize HUG parameters, read environment and geometry files. */ Huniform_defaults(); ReadEnvFile(/* *argv*/argv[i]); ReadGeoFile(GeoFileName); OpenFrameBuffer(); /* * Compute view transform and its inverse. */ CreateViewMatrix(); MatrixCopy(vtrans, View.vtrans); MatrixInverse(Vinv, vtrans); MatrixCopy(View.vtransInv, Vinv); /* * Print out what we have so far. */ printf("Number of primitive objects: \t%ld\n", prim_obj_cnt); printf("Number of primitive elements:\t%ld\n", prim_elem_cnt); /* * Preprocess database into hierarchical uniform grid. */ if (TraversalType == TT_HUG) BuildHierarchy_Uniform(); /* * Now create slave processes. */ CLOCK(begin) CREATE(StartRayTrace, gm->nprocs); WAIT_FOR_END(gm->nprocs); CLOCK(end) /* * We are finished. Clean up, print statistics and run time. */ CloseFrameBuffer(PicFileName); PrintStatistics(); lapsed = (end - begin) & 0x7FFFFFFF; printf("TIMING STATISTICS MEASURED BY MAIN PROCESS:\n"); printf(" Overall start time %20lu\n", begin); printf(" Overall end time %20lu\n", end); printf(" Total time with initialization %20lu\n", lapsed); printf(" Total time without initialization %20lu\n", end - gm->par_start_time); if (dostats) { unsigned totalproctime, maxproctime, minproctime; printf("\n\n\nPER-PROCESS STATISTICS:\n"); printf("%20s%20s\n","Proc","Time"); printf("%20s%20s\n\n","","Tracing Rays"); for (i = 0; i < gm->nprocs; i++) printf("%20ld%20ld\n",i,gm->partime[i]); totalproctime = gm->partime[0]; minproctime = gm->partime[0]; maxproctime = gm->partime[0]; for (i = 1; i < gm->nprocs; i++) { totalproctime += gm->partime[i]; if (gm->partime[i] > maxproctime) maxproctime = gm->partime[i]; if (gm->partime[i] < minproctime) minproctime = gm->partime[i]; } printf("\n\n%20s%20d\n","Max = ",maxproctime); printf("%20s%20d\n","Min = ",minproctime); printf("%20s%20d\n","Avg = ",(int) (((double) totalproctime) / ((double) 
(1.0 * gm->nprocs)))); } MAIN_END }
int main(int argc, char *argv[]) { long i; long c; extern char *optarg; long m1; long factor; long pages; unsigned long start; CLOCK(start); while ((c = getopt(argc, argv, "p:m:n:l:stoh")) != -1) { switch(c) { case 'p': P = atoi(optarg); if (P < 1) { printerr("P must be >= 1\n"); exit(-1); } if (log_2(P) == -1) { printerr("P must be a power of 2\n"); exit(-1); } break; case 'm': M = atoi(optarg); m1 = M/2; if (2*m1 != M) { printerr("M must be even\n"); exit(-1); } break; case 'n': num_cache_lines = atoi(optarg); orig_num_lines = num_cache_lines; if (num_cache_lines < 1) { printerr("Number of cache lines must be >= 1\n"); exit(-1); } break; case 'l': log2_line_size = atoi(optarg); if (log2_line_size < 0) { printerr("Log base 2 of cache line length in bytes must be >= 0\n"); exit(-1); } break; case 's': dostats = !dostats; break; case 't': test_result = !test_result; break; case 'o': doprint = !doprint; break; case 'h': printf("Usage: FFT <options>\n\n"); printf("options:\n"); printf(" -mM : M = even integer; 2**M total complex data points transformed.\n"); printf(" -pP : P = number of processors; Must be a power of 2.\n"); printf(" -nN : N = number of cache lines.\n"); printf(" -lL : L = Log base 2 of cache line length in bytes.\n"); printf(" -s : Print individual processor timing statistics.\n"); printf(" -t : Perform FFT and inverse FFT. Test output by comparing the\n"); printf(" integral of the original data to the integral of the data that\n"); printf(" results from performing the FFT and inverse FFT.\n"); printf(" -o : Print out complex data points.\n"); printf(" -h : Print out command line options.\n\n"); printf("Default: FFT -m%1d -p%1d -n%1d -l%1d\n", DEFAULT_M,DEFAULT_P,NUM_CACHE_LINES,LOG2_LINE_SIZE); exit(0); break; } } MAIN_INITENV(,80000000); N = 1<<M; rootN = 1<<(M/2); rowsperproc = rootN/P; if (rowsperproc == 0) { printerr("Matrix not large enough. 
2**(M/2) must be >= P\n"); exit(-1); } line_size = 1 << log2_line_size; if (line_size < 2*sizeof(double)) { printf("WARNING: Each element is a complex double (%ld bytes)\n",2*sizeof(double)); printf(" => Less than one element per cache line\n"); printf(" Computing transpose blocking factor\n"); factor = (2*sizeof(double)) / line_size; num_cache_lines = orig_num_lines / factor; } if (line_size <= 2*sizeof(double)) { pad_length = 1; } else { pad_length = line_size / (2*sizeof(double)); } if (rowsperproc * rootN * 2 * sizeof(double) >= PAGE_SIZE) { pages = (2 * pad_length * sizeof(double) * rowsperproc) / PAGE_SIZE; if (pages * PAGE_SIZE != 2 * pad_length * sizeof(double) * rowsperproc) { pages ++; } pad_length = (pages * PAGE_SIZE) / (2 * sizeof(double) * rowsperproc); } else { pad_length = (PAGE_SIZE - (rowsperproc * rootN * 2 * sizeof(double))) / (2 * sizeof(double) * rowsperproc); if (pad_length * (2 * sizeof(double) * rowsperproc) != (PAGE_SIZE - (rowsperproc * rootN * 2 * sizeof(double)))) { printerr("Padding algorithm unsuccessful\n"); exit(-1); } } Global = (struct GlobalMemory *) G_MALLOC(sizeof(struct GlobalMemory)); x = (double *) G_MALLOC(2*(N+rootN*pad_length)*sizeof(double)+PAGE_SIZE); trans = (double *) G_MALLOC(2*(N+rootN*pad_length)*sizeof(double)+PAGE_SIZE); umain = (double *) G_MALLOC(2*rootN*sizeof(double)); umain2 = (double *) G_MALLOC(2*(N+rootN*pad_length)*sizeof(double)+PAGE_SIZE); Global->transtimes = (long *) G_MALLOC(P*sizeof(long)); Global->totaltimes = (long *) G_MALLOC(P*sizeof(long)); if (Global == NULL) { printerr("Could not malloc memory for Global\n"); exit(-1); } else if (x == NULL) { printerr("Could not malloc memory for x\n"); exit(-1); } else if (trans == NULL) { printerr("Could not malloc memory for trans\n"); exit(-1); } else if (umain == NULL) { printerr("Could not malloc memory for umain\n"); exit(-1); } else if (umain2 == NULL) { printerr("Could not malloc memory for umain2\n"); exit(-1); } x = (double *) (((unsigned long) x) 
+ PAGE_SIZE - ((unsigned long) x) % PAGE_SIZE); trans = (double *) (((unsigned long) trans) + PAGE_SIZE - ((unsigned long) trans) % PAGE_SIZE); umain2 = (double *) (((unsigned long) umain2) + PAGE_SIZE - ((unsigned long) umain2) % PAGE_SIZE); /* In order to optimize data distribution, the data structures x, trans, and umain2 have been aligned so that each begins on a page boundary. This ensures that the amount of padding calculated by the program is such that each processor's partition ends on a page boundary, thus ensuring that all data from these structures that are needed by a processor can be allocated to its local memory */ /* POSSIBLE ENHANCEMENT: Here is where one might distribute the x, trans, and umain2 data structures across physically distributed memories as desired. One way to place data is as follows: double *base; long i; i = ((N/P)+(rootN/P)*pad_length)*2; base = &(x[0]); for (j=0;j<P;j++) { Place all addresses x such that (base <= x < base+i) on node j base += i; } The trans and umain2 data structures can be placed in a similar manner. 
*/ printf("\n"); printf("FFT with Blocking Transpose\n"); printf(" %ld Complex Doubles\n",N); printf(" %ld Processors\n",P); if (num_cache_lines != orig_num_lines) { printf(" %ld Cache lines\n",orig_num_lines); printf(" %ld Cache lines for blocking transpose\n",num_cache_lines); } else { printf(" %ld Cache lines\n",num_cache_lines); } printf(" %d Byte line size\n",(1 << log2_line_size)); printf(" %d Bytes per page\n",PAGE_SIZE); printf("\n"); BARINIT(Global->start, P); LOCKINIT(Global->idlock); Global->id = 0; InitX(x); /* place random values in x */ if (test_result) { ck1 = CheckSum(x); } if (doprint) { printf("Original data values:\n"); PrintArray(N, x); } InitU(N,umain); /* initialize u arrays*/ InitU2(N,umain2,rootN); /* fire off P processes */ CREATE(SlaveStart, P); WAIT_FOR_END(P); if (doprint) { if (test_result) { printf("Data values after inverse FFT:\n"); } else { printf("Data values after FFT:\n"); } PrintArray(N, x); } transtime = Global->transtimes[0]; printf("\n"); printf(" PROCESS STATISTICS\n"); printf(" Computation Transpose Transpose\n"); printf(" Proc Time Time Fraction\n"); printf(" 0 %10ld %10ld %8.5f\n", Global->totaltimes[0],Global->transtimes[0], ((double)Global->transtimes[0])/Global->totaltimes[0]); if (dostats) { transtime2 = Global->transtimes[0]; avgtranstime = Global->transtimes[0]; avgcomptime = Global->totaltimes[0]; maxtotal = Global->totaltimes[0]; mintotal = Global->totaltimes[0]; maxfrac = ((double)Global->transtimes[0])/Global->totaltimes[0]; minfrac = ((double)Global->transtimes[0])/Global->totaltimes[0]; avgfractime = ((double)Global->transtimes[0])/Global->totaltimes[0]; for (i=1;i<P;i++) { if (Global->transtimes[i] > transtime) { transtime = Global->transtimes[i]; } if (Global->transtimes[i] < transtime2) { transtime2 = Global->transtimes[i]; } if (Global->totaltimes[i] > maxtotal) { maxtotal = Global->totaltimes[i]; } if (Global->totaltimes[i] < mintotal) { mintotal = Global->totaltimes[i]; } if 
(((double)Global->transtimes[i])/Global->totaltimes[i] > maxfrac) { maxfrac = ((double)Global->transtimes[i])/Global->totaltimes[i]; } if (((double)Global->transtimes[i])/Global->totaltimes[i] < minfrac) { minfrac = ((double)Global->transtimes[i])/Global->totaltimes[i]; } printf(" %3ld %10ld %10ld %8.5f\n", i,Global->totaltimes[i],Global->transtimes[i], ((double)Global->transtimes[i])/Global->totaltimes[i]); avgtranstime += Global->transtimes[i]; avgcomptime += Global->totaltimes[i]; avgfractime += ((double)Global->transtimes[i])/Global->totaltimes[i]; } printf(" Avg %10.0f %10.0f %8.5f\n", ((double) avgcomptime)/P,((double) avgtranstime)/P,avgfractime/P); printf(" Max %10ld %10ld %8.5f\n", maxtotal,transtime,maxfrac); printf(" Min %10ld %10ld %8.5f\n", mintotal,transtime2,minfrac); } Global->starttime = start; printf("\n"); printf(" TIMING INFORMATION\n"); printf("Start time : %16lu\n", Global->starttime); printf("Initialization finish time : %16lu\n", Global->initdonetime); printf("Overall finish time : %16lu\n", Global->finishtime); printf("Total time with initialization : %16lu\n", Global->finishtime-Global->starttime); printf("Total time without initialization : %16lu\n", Global->finishtime-Global->initdonetime); printf("Overall transpose time : %16ld\n", transtime); printf("Overall transpose fraction : %16.5f\n", ((double) transtime)/(Global->finishtime-Global->initdonetime)); printf("\n"); if (test_result) { ck3 = CheckSum(x); printf(" INVERSE FFT TEST RESULTS\n"); printf("Checksum difference is %.3f (%.3f, %.3f)\n", ck1-ck3, ck1, ck3); if (fabs(ck1-ck3) < 0.001) { printf("TEST PASSED\n"); } else { printf("TEST FAILED\n"); } } MAIN_END; }
int main(int argc, char **argv) { /* default values for the control parameters of the driver */ /* are in parameters.h */ if ((argc == 2) && ((strncmp(argv[1],"-h",strlen("-h")) == 0) || (strncmp(argv[1],"-H",strlen("-H")) == 0))) { printf("Usage: WATER-SPATIAL < infile, where the contents of infile can be\nobtained from the comments at the top of water.C and the first scanf \nin main() in water.C\n\n"); exit(0); } #else int main(void) { #endif /* POSSIBLE ENHANCEMENT: One might bind the first process to a processor here, even before the other (child) processes are bound later in mdmain(). */ six = stdout; TEMP =298.0; RHO =0.9980; /* read input */ #ifndef SIM_SOCLIB if (scanf("%lf%ld%ld%ld%ld%ld%ld%ld%ld%lf",&TSTEP, &NMOL, &NSTEP, &NORDER, &NSAVE, &NRST, &NPRINT, &NFMC,&NumProcs, &CUTOFF) != 10) fprintf(stderr,"ERROR: Usage: water < infile, which must have 10 fields, see SPLASH documentation or comment at top of water.C\n"); #else TSTEP = 1.5e-16; NMOL = NMOLS; NSTEP = 3; NORDER = 6; NSAVE = -1 ; NRST = 3000 ; NPRINT = 3 ; NFMC = 0; NumProcs = NB_P; CUTOFF = 6.212752; #endif printf("Using %ld procs on %ld steps of %ld mols\n", NumProcs, NSTEP, NMOL); printf("Other parameters:\n\tTSTEP = %8.2e\n\tNORDER = %ld\n\tNSAVE = %ld\n",TSTEP,NORDER,NSAVE); printf("\tNRST = %ld\n\tNPRINT = %ld\n\tNFMC = %ld\n\tCUTOFF = %lf\n\n",NRST,NPRINT,NFMC,CUTOFF); /* set up scaling factors and constants */ NORD1=NORDER+1; CNSTNT(NORD1,TLC); /* sub. call to set up constants */ SYSCNS(); /* sub. 
call to initialize system constants */ printf("%ld boxes with %ld processors\n\n", BOX_PER_SIDE * BOX_PER_SIDE * BOX_PER_SIDE, NumProcs); if (NumProcs > (BOX_PER_SIDE * BOX_PER_SIDE * BOX_PER_SIDE)) { fprintf(stderr,"ERROR: less boxes (%ld) than processors (%ld)\n", BOX_PER_SIDE * BOX_PER_SIDE * BOX_PER_SIDE, NumProcs); fflush(stderr); exit(-1); } fprintf(six,"\nTEMPERATURE = %8.2f K\n",TEMP); fprintf(six,"DENSITY = %8.5f G/C.C.\n",RHO); fprintf(six,"NUMBER OF MOLECULES = %8ld\n",NMOL); fprintf(six,"NUMBER OF PROCESSORS = %8ld\n",NumProcs); fprintf(six,"TIME STEP = %8.2e SEC\n",TSTEP); fprintf(six,"ORDER USED TO SOLVE F=MA = %8ld \n",NORDER); fprintf(six,"NO. OF TIME STEPS = %8ld \n",NSTEP); fprintf(six,"FREQUENCY OF DATA SAVING = %8ld \n",NSAVE); fprintf(six,"FREQUENCY TO WRITE RST FILE= %8ld \n",NRST); fflush(six); { /* do memory initializations */ long procnum, i, j, k, l; struct list_of_boxes *temp_box; long xprocs, yprocs, zprocs; long x_inc, y_inc, z_inc; long x_ct, y_ct, z_ct; long x_left, y_left, z_left; long x_first, y_first, z_first; long x_last, y_last, z_last; double proccbrt; long gmem_size = sizeof(struct GlobalMemory); MAIN_INITENV((NumProcs),40000000,); /* macro call to initialize shared memory etc. 
*/ /* Allocate space for main (BOX) data structure as well as * synchronization variables */ start_end = (first_last_array **) G_MALLOC(sizeof(first_last_array *) * NumProcs); for (i=0; i < NumProcs; i++) { start_end[i] = (first_last_array *) G_MALLOC(sizeof(first_last_array)); } /* Calculate start and finish box numbers for processors */ xprocs = 0; yprocs = 0; proccbrt = (double) pow((double) NumProcs, 1.0/3.0) + 0.00000000000001; j = (long) proccbrt; if (j<1) j = 1; while ((xprocs == 0) && (j>0)) { k = (long) sqrt((double) (NumProcs / j)); if (k<1) k=1; while ((yprocs == 0) && (k>0)) { l = NumProcs/(j*k); if ((j*k*l) == NumProcs) { xprocs = j; yprocs = k; zprocs = l; } /* if */ k--; } /* while yprocs && k */ j--; } /* while xprocs && j */ printf("xprocs = %ld\typrocs = %ld\tzprocs = %ld\n", xprocs, yprocs, zprocs); fflush(stdout); /* Fill in start_end array values */ procnum = 0; x_inc = BOX_PER_SIDE/xprocs; y_inc = BOX_PER_SIDE/yprocs; z_inc = BOX_PER_SIDE/zprocs; x_left = BOX_PER_SIDE - (xprocs*x_inc); y_left = BOX_PER_SIDE - (yprocs*y_inc); z_left = BOX_PER_SIDE - (zprocs*z_inc); printf("x_inc = %ld\t y_inc = %ld\t z_inc = %ld\n",x_inc,y_inc,z_inc); printf("x_left = %ld\t y_left = %ld\t z_left = %ld\n",x_left,y_left,z_left); fflush(stdout); x_first = 0; x_ct = x_left; x_last = -1; x_inc++; for (i=0; i<xprocs; i++) { y_ct = y_left; if (x_ct == 0) x_inc--; x_last += x_inc; y_first = 0; y_last = -1; y_inc++; for (j=0; j<yprocs; j++) { z_ct = z_left; if (y_ct == 0) y_inc--; y_last += y_inc; z_first = 0; z_last = -1; z_inc++; for (k=0; k<zprocs; k++) { if (z_ct == 0) z_inc--; z_last += z_inc; start_end[procnum]->box[XDIR][FIRST] = x_first; start_end[procnum]->box[XDIR][LAST] = min(x_last, BOX_PER_SIDE - 1); start_end[procnum]->box[YDIR][FIRST] = y_first; start_end[procnum]->box[YDIR][LAST] = min(y_last, BOX_PER_SIDE - 1); start_end[procnum]->box[ZDIR][FIRST] = z_first; start_end[procnum]->box[ZDIR][LAST] = min(z_last, BOX_PER_SIDE - 1); z_first = z_last + 1; 
z_ct--; procnum++; } y_first = y_last + 1; y_ct--; } x_first = x_last + 1; x_ct--; } /* Allocate space for my_boxes array */ my_boxes = (box_list **) G_MALLOC(NumProcs * sizeof(box_list *)); /* Set all box ptrs to null */ for (i=0; i<NumProcs; i++) my_boxes[i] = NULL; /* Set up links for all boxes for initial interf and intraf */ temp_box = my_boxes[0]; while (temp_box) { temp_box = temp_box->next_box; } /* Allocate space for BOX array */ BOX = (box_type ***) G_MALLOC(BOX_PER_SIDE * sizeof(box_type **)); for (i=0; i < BOX_PER_SIDE; i++) { BOX[i] = (box_type **) G_MALLOC( BOX_PER_SIDE * sizeof(box_type *)); for (j=0; j < BOX_PER_SIDE; j++) { BOX[i][j] = (box_type *) G_MALLOC(BOX_PER_SIDE * sizeof(box_type)); for (k=0; k < BOX_PER_SIDE; k++) { BOX[i][j][k].list = NULL; LOCKINIT(BOX[i][j][k].boxlock); } } } /* for i */ gl = (struct GlobalMemory *) G_MALLOC(gmem_size); /* macro calls to initialize synch variables */ BARINIT(gl->start, NumProcs); BARINIT(gl->InterfBar, NumProcs); BARINIT(gl->PotengBar, NumProcs); LOCKINIT(gl->IOLock); LOCKINIT(gl->IndexLock); LOCKINIT(gl->IntrafVirLock); LOCKINIT(gl->InterfVirLock); LOCKINIT(gl->KinetiSumLock); LOCKINIT(gl->PotengSumLock); } fprintf(six,"SPHERICAL CUTOFF RADIUS = %8.4f ANGSTROM\n",CUTOFF); fflush(six); IRST=0; /* call initialization routine */ INITIA(); gl->tracktime = 0; gl->intratime = 0; gl->intertime = 0; /* initialize Index to 1 so that the first created child gets id 1, not 0 */ gl->Index = 1; if (NSAVE > 0) { /* not true for input decks provided */ fprintf(six,"COLLECTING X AND V DATA AT EVERY %4ld TIME STEPS \n",NSAVE); } /* spawn helper processes */ CLOCK(gl->computestart); CREATE(WorkStart, NumProcs); /* macro to make main process wait for all others to finish */ WAIT_FOR_END(NumProcs); CLOCK(gl->computeend); printf("COMPUTESTART (after initialization) = %lu\n",gl->computestart); printf("COMPUTEEND = %lu\n",gl->computeend); printf("COMPUTETIME (after initialization) = %lu\n",gl->computeend-gl->computestart); 
printf("Measured Time (2nd timestep onward) = %lu\n",gl->tracktime); printf("Intramolecular time only (2nd timestep onward) = %lu\n",gl->intratime); printf("Intermolecular time only (2nd timestep onward) = %lu\n",gl->intertime); printf("Other time (2nd timestep onward) = %lu\n",gl->tracktime - gl->intratime - gl->intertime); printf("\nExited Happily with XTT = %g (note: XTT value is garbage if NPRINT > NSTEP)\n", XTT); MAIN_END; } /* main.c */
void Frame() { long starttime,stoptime,exectime,i; Init_Options(); printf("*****Entering init_decomposition with num_nodes = %ld\n",num_nodes); fflush(stdout); Init_Decomposition(); printf("*****Exited init_decomposition with num_nodes = %ld\n",num_nodes); fflush(stdout); Global = (struct GlobalMemory *)NU_MALLOC(sizeof(struct GlobalMemory),0); BARINIT(Global->SlaveBarrier, num_nodes); BARINIT(Global->TimeBarrier, num_nodes); LOCKINIT(Global->IndexLock); LOCKINIT(Global->CountLock); ALOCKINIT(Global->QLock,MAX_NUMPROC+1); /* load dataset from file to each node */ #ifndef RENDER_ONLY CLOCK(starttime); Load_Map(filename); CLOCK(stoptime); mclock(stoptime,starttime,&exectime); printf("wall clock execution time to load map: %lu ms\n", exectime); #endif CLOCK(starttime); #ifndef RENDER_ONLY Compute_Normal(); #ifdef PREPROCESS Store_Normal(filename); #endif #else Load_Normal(filename); #endif CLOCK(stoptime); mclock(stoptime,starttime,&exectime); printf("wall clock execution time to compute normal: %lu ms\n", exectime); CLOCK(starttime); #ifndef RENDER_ONLY Compute_Opacity(); #ifdef PREPROCESS Store_Opacity(filename); #endif #else Load_Opacity(filename); #endif CLOCK(stoptime); mclock(stoptime,starttime,&exectime); printf("wall clock execution time to compute opacity: %lu ms\n", exectime); Compute_Pre_View(); shd_length = LOOKUP_SIZE; Allocate_Shading_Table(&shd_address,shd_length); /* allocate space for image */ image_len[X] = frust_len; image_len[Y] = frust_len; image_length = image_len[X] * image_len[Y]; Allocate_Image(&image_address,image_length); if (num_nodes == 1) { block_xlen = image_len[X]; block_ylen = image_len[Y]; num_blocks = 1; num_xblocks = 1; num_yblocks = 1; image_block = image_address; } else { num_xblocks = ROUNDUP((float)image_len[X]/(float)block_xlen); num_yblocks = ROUNDUP((float)image_len[Y]/(float)block_ylen); num_blocks = num_xblocks * num_yblocks; Lallocate_Image(&image_block,block_xlen*block_ylen); } CLOCK(starttime); #ifndef RENDER_ONLY 
Compute_Octree(); #ifdef PREPROCESS Store_Octree(filename); #endif #else Load_Octree(filename); #endif CLOCK(stoptime); mclock(stoptime,starttime,&exectime); printf("wall clock execution time to compute octree: %lu ms\n", exectime); #ifdef PREPROCESS return; #endif if (adaptive) { printf("1.\n"); for (i=0; i<NI; i++) { mask_image_len[i] = image_len[i]; } mask_image_length = image_length; Allocate_MImage(&mask_image_address, mask_image_length); if (num_nodes == 1) mask_image_block = (PIXEL *)mask_image_address; else Lallocate_Image(&mask_image_block, block_xlen*block_ylen); printf("2.\n"); } #ifndef RENDER_ONLY Deallocate_Map(&map_address); #endif Global->Index = NODE0; printf("\nRendering...\n"); printf("node\tframe\ttime\titime\trays\thrays\tsamples trilirped\n"); CREATE(Render_Loop, num_nodes); }
int main(int argc, char **argv) { /* default values for the control parameters of the driver */ /* are in parameters.h */ if ((argc == 2) &&((strncmp(argv[1],"-h",strlen("-h")) == 0) || (strncmp(argv[1],"-H",strlen("-H")) == 0))) { printf("Usage: WATER-NSQUARED < infile, where the contents of infile can be\nobtained from the comments at the top of water.C and the first scanf \nin main() in water.C\n\n"); exit(0); } /* POSSIBLE ENHANCEMENT: Here's where one might bind the main process (process 0) to a processor if one wanted to. Others can be bound in the WorkStart routine. */ six = stdout; /* output file */ TEMP =298.0; RHO =0.9980; CUTOFF=0.0; /* read input */ if (scanf("%lf%ld%ld%ld%ld%ld%ld%ld%ld%lf",&TSTEP, &NMOL, &NSTEP, &NORDER, &NSAVE, &NRST, &NPRINT, &NFMC,&NumProcs, &CUTOFF) != 10) fprintf(stderr,"ERROR: Usage: water < infile, which must have 10 fields, see SPLASH documentation or comment at top of water.C\n"); if (NMOL > MAXLCKS) { fprintf(stderr, "Just so you know ... Lock array in global.H has size %ld < %ld (NMOL)\n code will still run correctly but there may be lock contention\n\n", MAXLCKS, NMOL); } printf("Using %ld procs on %ld steps of %ld mols\n", NumProcs, NSTEP, NMOL); printf("Other parameters:\n\tTSTEP = %8.2e\n\tNORDER = %ld\n\tNSAVE = %ld\n",TSTEP,NORDER,NSAVE); printf("\tNRST = %ld\n\tNPRINT = %ld\n\tNFMC = %ld\n\tCUTOFF = %lf\n\n",NRST,NPRINT,NFMC,CUTOFF); /* SET UP SCALING FACTORS AND CONSTANTS */ NORD1=NORDER+1; CNSTNT(NORD1,TLC); /* sub. call to set up constants */ { /* Do memory initializations */ long pid; long mol_size = sizeof(molecule_type) * NMOL; long gmem_size = sizeof(struct GlobalMemory); /* POSSIBLE ENHANCEMENT: One might bind the first process to a processor here, even before the other (child) processes are bound later in mdmain(). */ MAIN_INITENV(,70000000,); /* macro call to initialize shared memory etc. 
*/ THREAD_INIT_FREE(); /* allocate space for main (VAR) data structure as well as synchronization variables */ /* POSSIBLE ENHANCEMENT: One might want to allocate a process's portion of the VAR array and what it points to in its local memory */ VAR = (molecule_type *) G_MALLOC(mol_size); gl = (struct GlobalMemory *) G_MALLOC(gmem_size); /* POSSIBLE ENHANCEMENT: One might want to allocate process i's PFORCES[i] array in its local memory */ PFORCES = (double ****) G_MALLOC(NumProcs * sizeof (double ***)); { long i,j,k; for (i = 0; i < NumProcs; i++) { PFORCES[i] = (double ***) G_MALLOC(NMOL * sizeof (double **)); for (j = 0; j < NMOL; j++) { PFORCES[i][j] = (double **) G_MALLOC(NDIR * sizeof (double *)); for (k = 0; k < NDIR; k++) { PFORCES[i][j][k] = (double *) G_MALLOC(NATOM * sizeof (double)); } } } } /* macro calls to initialize synch varibles */ BARINIT(gl->start, NumProcs); BARINIT(gl->InterfBar, NumProcs); BARINIT(gl->PotengBar, NumProcs); LOCKINIT(gl->IOLock); LOCKINIT(gl->IndexLock); LOCKINIT(gl->IntrafVirLock); LOCKINIT(gl->InterfVirLock); LOCKINIT(gl->FXLock); LOCKINIT(gl->FYLock); LOCKINIT(gl->FZLock); if (NMOL < MAXLCKS) { ALOCKINIT(gl->MolLock, NMOL); } else { ALOCKINIT(gl->MolLock, MAXLCKS); } LOCKINIT(gl->KinetiSumLock); LOCKINIT(gl->PotengSumLock); /* set up control for static scheduling */ MolsPerProc = NMOL/NumProcs; StartMol[0] = 0; for (pid = 1; pid < NumProcs; pid += 1) { StartMol[pid] = StartMol[pid-1] + MolsPerProc; } StartMol[NumProcs] = NMOL; } SYSCNS(); /* sub. call to initialize system constants */ fprintf(six,"\nTEMPERATURE = %8.2f K\n",TEMP); fprintf(six,"DENSITY = %8.5f G/C.C.\n",RHO); fprintf(six,"NUMBER OF MOLECULES = %8ld\n",NMOL); fprintf(six,"NUMBER OF PROCESSORS = %8ld\n",NumProcs); fprintf(six,"TIME STEP = %8.2e SEC\n",TSTEP); fprintf(six,"ORDER USED TO SOLVE F=MA = %8ld \n",NORDER); fprintf(six,"NO. 
OF TIME STEPS = %8ld \n",NSTEP); fprintf(six,"FREQUENCY OF DATA SAVING = %8ld \n",NSAVE); fprintf(six,"FREQUENCY TO WRITE RST FILE= %8ld \n",NRST); fprintf(six,"SPHERICAL CUTOFF RADIUS = %8.4f ANGSTROM\n",CUTOFF); fflush(six); /* initialization routine; also reads displacements and sets up random velocities*/ INITIA(); /*.....start molecular dynamic loop */ gl->tracktime = 0; gl->intratime = 0; gl->intertime = 0; /* initialize Index to 1 so that the first created child gets id 1, not 0 */ gl->Index = 1; if (NSAVE > 0) /* not true for input decks provided */ fprintf(six,"COLLECTING X AND V DATA AT EVERY %4ld TIME STEPS \n",NSAVE); /* spawn helper processes, each getting its unique process id */ CLOCK(gl->computestart); CREATE(WorkStart, NumProcs); /* macro to make main process wait for all others to finish */ WAIT_FOR_END(NumProcs); CLOCK(gl->computeend); printf("COMPUTESTART (after initialization) = %lu\n",gl->computestart); printf("COMPUTEEND = %lu\n",gl->computeend); printf("COMPUTETIME (after initialization) = %lu\n",gl->computeend-gl->computestart); printf("Measured Time (2nd timestep onward) = %lu\n",gl->tracktime); printf("Intramolecular time only (2nd timestep onward) = %lu\n",gl->intratime); printf("Intermolecular time only (2nd timestep onward) = %lu\n",gl->intertime); printf("Other time (2nd timestep onward) = %lu\n",gl->tracktime - gl->intratime - gl->intertime); printf("\nExited Happily with XTT = %g (note: XTT value is garbage if NPRINT > NSTEP)\n", XTT); MAIN_END; } /* main.c */
int main(int argc, char *argv[]) { long i; long j; long k; long x_part; long y_part; long d_size; long itemp; long jtemp; double procsqrt; long temp = 0; double min_total; double max_total; double avg_total; double min_multi; double max_multi; double avg_multi; double min_frac; double max_frac; double avg_frac; long ch; extern char *optarg; unsigned long computeend; unsigned long start; CLOCK(start) while ((ch = getopt(argc, argv, "n:p:e:r:t:soh")) != -1) { switch(ch) { case 'n': im = atoi(optarg); if (log_2(im-2) == -1) { printerr("Grid must be ((power of 2)+2) in each dimension\n"); exit(-1); } break; case 'p': nprocs = atoi(optarg); if (nprocs < 1) { printerr("P must be >= 1\n"); exit(-1); } if (log_2(nprocs) == -1) { printerr("P must be a power of 2\n"); exit(-1); } break; case 'e': tolerance = atof(optarg); break; case 'r': res = atof(optarg); break; case 't': dtau = atof(optarg); break; case 's': do_stats = !do_stats; break; case 'o': do_output = !do_output; break; case 'h': printf("Usage: OCEAN <options>\n\n"); printf("options:\n"); printf(" -nN : Simulate NxN ocean. N must be (power of 2)+2.\n"); printf(" -pP : P = number of processors. 
P must be power of 2.\n"); printf(" -eE : E = error tolerance for iterative relaxation.\n"); printf(" -rR : R = distance between grid points in meters.\n"); printf(" -tT : T = timestep in seconds.\n"); printf(" -s : Print timing statistics.\n"); printf(" -o : Print out relaxation residual values.\n"); printf(" -h : Print out command line options.\n\n"); printf("Default: OCEAN -n%1d -p%1d -e%1g -r%1g -t%1g\n", DEFAULT_N,DEFAULT_P,DEFAULT_E,DEFAULT_R,DEFAULT_T); exit(0); break; } } MAIN_INITENV(,60000000) THREAD_INIT_FREE(); jm = im; printf("\n"); printf("Ocean simulation with W-cycle multigrid solver\n"); printf(" Processors : %1ld\n",nprocs); printf(" Grid size : %1ld x %1ld\n",im,jm); printf(" Grid resolution (meters) : %0.2f\n",res); printf(" Time between relaxations (seconds) : %0.0f\n",dtau); printf(" Error tolerance : %0.7g\n",tolerance); printf("\n"); xprocs = 0; yprocs = 0; procsqrt = sqrt((double) nprocs); j = (long) procsqrt; while ((xprocs == 0) && (j > 0)) { k = nprocs / j; if (k * j == nprocs) { if (k > j) { xprocs = j; yprocs = k; } else { xprocs = k; yprocs = j; } } j--; } if (xprocs == 0) { printerr("Could not find factors for subblocking\n"); exit(-1); } minlevel = 0; itemp = 1; jtemp = 1; numlev = 0; minlevel = 0; while (itemp < (im-2)) { itemp = itemp*2; jtemp = jtemp*2; if ((itemp/yprocs > 1) && (jtemp/xprocs > 1)) { numlev++; } } if (numlev == 0) { printerr("Must have at least 2 grid points per processor in each dimension\n"); exit(-1); } imx = (long *) G_MALLOC(numlev*sizeof(long)); jmx = (long *) G_MALLOC(numlev*sizeof(long)); lev_res = (double *) G_MALLOC(numlev*sizeof(double)); lev_tol = (double *) G_MALLOC(numlev*sizeof(double)); i_int_coeff = (double *) G_MALLOC(numlev*sizeof(double)); j_int_coeff = (double *) G_MALLOC(numlev*sizeof(double)); xpts_per_proc = (long *) G_MALLOC(numlev*sizeof(long)); ypts_per_proc = (long *) G_MALLOC(numlev*sizeof(long)); imx[numlev-1] = im; jmx[numlev-1] = jm; lev_res[numlev-1] = res; lev_tol[numlev-1] = 
tolerance; for (i=numlev-2;i>=0;i--) { imx[i] = ((imx[i+1] - 2) / 2) + 2; jmx[i] = ((jmx[i+1] - 2) / 2) + 2; lev_res[i] = lev_res[i+1] * 2; } for (i=0;i<numlev;i++) { xpts_per_proc[i] = (jmx[i]-2) / xprocs; ypts_per_proc[i] = (imx[i]-2) / yprocs; } for (i=numlev-1;i>=0;i--) { if ((xpts_per_proc[i] < 2) || (ypts_per_proc[i] < 2)) { minlevel = i+1; break; } } for (i=0;i<numlev;i++) { temp += imx[i]; } temp = 0; j = 0; for (k=0;k<numlev;k++) { for (i=0;i<imx[k];i++) { j++; temp += jmx[k]; } } d_size = nprocs*sizeof(double ***); psi = (double ****) G_MALLOC(d_size); psim = (double ****) G_MALLOC(d_size); work1 = (double ****) G_MALLOC(d_size); work4 = (double ****) G_MALLOC(d_size); work5 = (double ****) G_MALLOC(d_size); work7 = (double ****) G_MALLOC(d_size); temparray = (double ****) G_MALLOC(d_size); d_size = 2*sizeof(double **); for (i=0;i<nprocs;i++) { psi[i] = (double ***) G_MALLOC(d_size); psim[i] = (double ***) G_MALLOC(d_size); work1[i] = (double ***) G_MALLOC(d_size); work4[i] = (double ***) G_MALLOC(d_size); work5[i] = (double ***) G_MALLOC(d_size); work7[i] = (double ***) G_MALLOC(d_size); temparray[i] = (double ***) G_MALLOC(d_size); } d_size = nprocs*sizeof(double **); psium = (double ***) G_MALLOC(d_size); psilm = (double ***) G_MALLOC(d_size); psib = (double ***) G_MALLOC(d_size); ga = (double ***) G_MALLOC(d_size); gb = (double ***) G_MALLOC(d_size); work2 = (double ***) G_MALLOC(d_size); work3 = (double ***) G_MALLOC(d_size); work6 = (double ***) G_MALLOC(d_size); tauz = (double ***) G_MALLOC(d_size); oldga = (double ***) G_MALLOC(d_size); oldgb = (double ***) G_MALLOC(d_size); gp = (struct Global_Private *) G_MALLOC((nprocs+1)*sizeof(struct Global_Private)); for (i=0;i<nprocs;i++) { gp[i].rel_num_x = (long *) G_MALLOC(numlev*sizeof(long)); gp[i].rel_num_y = (long *) G_MALLOC(numlev*sizeof(long)); gp[i].eist = (long *) G_MALLOC(numlev*sizeof(long)); gp[i].ejst = (long *) G_MALLOC(numlev*sizeof(long)); gp[i].oist = (long *) 
G_MALLOC(numlev*sizeof(long)); gp[i].ojst = (long *) G_MALLOC(numlev*sizeof(long)); gp[i].rlist = (long *) G_MALLOC(numlev*sizeof(long)); gp[i].rljst = (long *) G_MALLOC(numlev*sizeof(long)); gp[i].rlien = (long *) G_MALLOC(numlev*sizeof(long)); gp[i].rljen = (long *) G_MALLOC(numlev*sizeof(long)); gp[i].multi_time = 0; gp[i].total_time = 0; } subblock(); x_part = (jm - 2)/xprocs + 2; y_part = (im - 2)/yprocs + 2; d_size = x_part*y_part*sizeof(double) + y_part*sizeof(double *); global = (struct global_struct *) G_MALLOC(sizeof(struct global_struct)); for (i=0;i<nprocs;i++) { psi[i][0] = (double **) G_MALLOC(d_size); psi[i][1] = (double **) G_MALLOC(d_size); psim[i][0] = (double **) G_MALLOC(d_size); psim[i][1] = (double **) G_MALLOC(d_size); psium[i] = (double **) G_MALLOC(d_size); psilm[i] = (double **) G_MALLOC(d_size); psib[i] = (double **) G_MALLOC(d_size); ga[i] = (double **) G_MALLOC(d_size); gb[i] = (double **) G_MALLOC(d_size); work1[i][0] = (double **) G_MALLOC(d_size); work1[i][1] = (double **) G_MALLOC(d_size); work2[i] = (double **) G_MALLOC(d_size); work3[i] = (double **) G_MALLOC(d_size); work4[i][0] = (double **) G_MALLOC(d_size); work4[i][1] = (double **) G_MALLOC(d_size); work5[i][0] = (double **) G_MALLOC(d_size); work5[i][1] = (double **) G_MALLOC(d_size); work6[i] = (double **) G_MALLOC(d_size); work7[i][0] = (double **) G_MALLOC(d_size); work7[i][1] = (double **) G_MALLOC(d_size); temparray[i][0] = (double **) G_MALLOC(d_size); temparray[i][1] = (double **) G_MALLOC(d_size); tauz[i] = (double **) G_MALLOC(d_size); oldga[i] = (double **) G_MALLOC(d_size); oldgb[i] = (double **) G_MALLOC(d_size); } f = (double *) G_MALLOC(im*sizeof(double)); multi = (struct multi_struct *) G_MALLOC(sizeof(struct multi_struct)); d_size = numlev*sizeof(double **); if (numlev%2 == 1) { /* To make sure that the actual data starts double word aligned, add an extra pointer */ d_size += sizeof(double **); } for (i=0;i<numlev;i++) { d_size += 
((imx[i]-2)/yprocs+2)*((jmx[i]-2)/xprocs+2)*sizeof(double)+ ((imx[i]-2)/yprocs+2)*sizeof(double *); } d_size *= nprocs; if (nprocs%2 == 1) { /* To make sure that the actual data starts double word aligned, add an extra pointer */ d_size += sizeof(double ***); } d_size += nprocs*sizeof(double ***); q_multi = (double ****) G_MALLOC(d_size); rhs_multi = (double ****) G_MALLOC(d_size); locks = (struct locks_struct *) G_MALLOC(sizeof(struct locks_struct)); bars = (struct bars_struct *) G_MALLOC(sizeof(struct bars_struct)); LOCKINIT(locks->idlock) LOCKINIT(locks->psiailock) LOCKINIT(locks->psibilock) LOCKINIT(locks->donelock) LOCKINIT(locks->error_lock) LOCKINIT(locks->bar_lock) #if defined(MULTIPLE_BARRIERS) BARINIT(bars->iteration, nprocs) BARINIT(bars->gsudn, nprocs) BARINIT(bars->p_setup, nprocs) BARINIT(bars->p_redph, nprocs) BARINIT(bars->p_soln, nprocs) BARINIT(bars->p_subph, nprocs) BARINIT(bars->sl_prini, nprocs) BARINIT(bars->sl_psini, nprocs) BARINIT(bars->sl_onetime, nprocs) BARINIT(bars->sl_phase_1, nprocs) BARINIT(bars->sl_phase_2, nprocs) BARINIT(bars->sl_phase_3, nprocs) BARINIT(bars->sl_phase_4, nprocs) BARINIT(bars->sl_phase_5, nprocs) BARINIT(bars->sl_phase_6, nprocs) BARINIT(bars->sl_phase_7, nprocs) BARINIT(bars->sl_phase_8, nprocs) BARINIT(bars->sl_phase_9, nprocs) BARINIT(bars->sl_phase_10, nprocs) BARINIT(bars->error_barrier, nprocs) #else BARINIT(bars->barrier, nprocs) #endif link_all(); multi->err_multi = 0.0; i_int_coeff[0] = 0.0; j_int_coeff[0] = 0.0; for (i=0;i<numlev;i++) { i_int_coeff[i] = 1.0/(imx[i]-1); j_int_coeff[i] = 1.0/(jmx[i]-1); } /* initialize constants and variables id is a global shared variable that has fetch-and-add operations performed on it by processes to obtain their pids. 
*/ global->id = 0; global->psibi = 0.0; pi = atan(1.0); pi = 4.*pi; factjacob = -1./(12.*res*res); factlap = 1./(res*res); eig2 = -h*f0*f0/(h1*h3*gpr); jmm1 = jm-1 ; ysca = ((double) jmm1)*res ; im = (imx[numlev-1]-2)/yprocs + 2; jm = (jmx[numlev-1]-2)/xprocs + 2; if (do_output) { printf(" MULTIGRID OUTPUTS\n"); } CREATE(slave, nprocs); WAIT_FOR_END(nprocs); CLOCK(computeend) printf("\n"); printf(" PROCESS STATISTICS\n"); printf(" Total Multigrid Multigrid\n"); printf(" Proc Time Time Fraction\n"); printf(" 0 %15.0f %15.0f %10.3f\n", gp[0].total_time,gp[0].multi_time, gp[0].multi_time/gp[0].total_time); if (do_stats) { min_total = max_total = avg_total = gp[0].total_time; min_multi = max_multi = avg_multi = gp[0].multi_time; min_frac = max_frac = avg_frac = gp[0].multi_time/gp[0].total_time; for (i=1;i<nprocs;i++) { if (gp[i].total_time > max_total) { max_total = gp[i].total_time; } if (gp[i].total_time < min_total) { min_total = gp[i].total_time; } if (gp[i].multi_time > max_multi) { max_multi = gp[i].multi_time; } if (gp[i].multi_time < min_multi) { min_multi = gp[i].multi_time; } if (gp[i].multi_time/gp[i].total_time > max_frac) { max_frac = gp[i].multi_time/gp[i].total_time; } if (gp[i].multi_time/gp[i].total_time < min_frac) { min_frac = gp[i].multi_time/gp[i].total_time; } avg_total += gp[i].total_time; avg_multi += gp[i].multi_time; avg_frac += gp[i].multi_time/gp[i].total_time; } avg_total = avg_total / nprocs; avg_multi = avg_multi / nprocs; avg_frac = avg_frac / nprocs; for (i=1;i<nprocs;i++) { printf(" %3ld %15.0f %15.0f %10.3f\n", i,gp[i].total_time,gp[i].multi_time, gp[i].multi_time/gp[i].total_time); } printf(" Avg %15.0f %15.0f %10.3f\n", avg_total,avg_multi,avg_frac); printf(" Min %15.0f %15.0f %10.3f\n", min_total,min_multi,min_frac); printf(" Max %15.0f %15.0f %10.3f\n", max_total,max_multi,max_frac); } printf("\n"); global->starttime = start; printf(" TIMING INFORMATION\n"); printf("Start time : %16lu\n", global->starttime); 
printf("Initialization finish time : %16lu\n", global->trackstart); printf("Overall finish time : %16lu\n", computeend); printf("Total time with initialization : %16lu\n", computeend-global->starttime); printf("Total time without initialization : %16lu\n", computeend-global->trackstart); printf(" (excludes first timestep)\n"); printf("\n"); MAIN_END }