extern "C" void do_calc(void) { double g = 9.80; double sigma = 0.95; int icount, jcount; struct timeval tstart_cpu; // Initialize state variables for GPU calculation. int &mype = mesh->mype; int &numpe = mesh->numpe; //int levmx = mesh->levmx; size_t &ncells_global = mesh->ncells_global; size_t &ncells = mesh->ncells; size_t &ncells_ghost = mesh->ncells_ghost; vector<int> mpot; vector<int> mpot_global; size_t old_ncells = ncells; size_t old_ncells_global = ncells_global; size_t new_ncells = 0; double deltaT = 0.0; // Main loop. for (int nburst = 0; nburst < outputInterval && ncycle < niter; nburst++, ncycle++) { // Define basic domain decomposition parameters for GPU. old_ncells = ncells; old_ncells_global = ncells_global; MPI_Barrier(MPI_COMM_WORLD); cpu_timer_start(&tstart_cpu); // Calculate the real time step for the current discrete time step. deltaT = state->set_timestep(g, sigma); simTime += deltaT; cpu_timer_start(&tstart_cpu); mesh->calc_neighbors_local(); mesh->partition_measure(); // Currently not working -- may need to be earlier? //if (mesh->have_boundary) { // state->add_boundary_cells(); //} // Apply BCs is currently done as first part of gpu_finite_difference and so comparison won't work here // Execute main kernel cpu_timer_start(&tstart_cpu); state->calc_finite_difference(deltaT); // Size of arrays gets reduced to just the real cells in this call for have_boundary = 0 state->remove_boundary_cells(); cpu_timer_start(&tstart_cpu); mpot.resize(ncells_ghost); new_ncells = state->calc_refine_potential(mpot, icount, jcount); cpu_timer_start(&tstart_cpu); //int add_ncells = new_ncells - old_ncells; state->rezone_all(icount, jcount, mpot); // Clear does not delete mpot, so have to swap with an empty vector to get // it to delete the mpot memory. This is all to avoid valgrind from showing // it as a reachable memory leak //mpot.clear(); vector<int>().swap(mpot); ncells = new_ncells; mesh->ncells = new_ncells; cpu_timer_start(&tstart_cpu); state->do_load_balance_local(new_ncells); // XXX // mesh->proc.resize(ncells); // if (icount) { // vector<int> index(ncells); // mesh->partition_cells(numpe, index, cycle_reorder); // } } // End burst loop double H_sum = state->mass_sum(enhanced_precision_sum); if (isnan(H_sum)) { printf("Got a NAN on cycle %d\n",ncycle); exit(-1); } if (mype == 0){ printf("Iteration %3d timestep %lf Sim Time %lf cells %ld Mass Sum %14.12lg Mass Change %12.6lg\n", ncycle, deltaT, simTime, ncells_global, H_sum, H_sum - H_sum_initial); } #ifdef HAVE_GRAPHICS mesh->x.resize(ncells); mesh->dx.resize(ncells); mesh->y.resize(ncells); mesh->dy.resize(ncells); mesh->calc_spatial_coordinates(0); cpu_timer_start(&tstart_cpu); #ifdef HAVE_MPE set_mysize(ncells); set_cell_coordinates(&mesh->x[0], &mesh->dx[0], &mesh->y[0], &mesh->dy[0]); set_cell_data(&state->H[0]); set_cell_proc(&mesh->proc[0]); #endif #ifdef HAVE_OPENGL vector<int> &nsizes = mesh->nsizes; vector<int> &ndispl = mesh->ndispl; set_mysize(ncells_global); //vector<spatial_t> x_global; //vector<spatial_t> dx_global; //vector<spatial_t> y_global; //vector<spatial_t> dy_global; //vector<state_t> H_global; //vector<int> proc_global; if (mype == 0) { x_global.resize(ncells_global); dx_global.resize(ncells_global); y_global.resize(ncells_global); dy_global.resize(ncells_global); H_global.resize(ncells_global); proc_global.resize(ncells_global); } MPI_Gatherv(&mesh->x[0], nsizes[mype], MPI_SPATIAL_T, &x_global[0], &nsizes[0], &ndispl[0], MPI_SPATIAL_T, 0, MPI_COMM_WORLD); MPI_Gatherv(&mesh->dx[0], nsizes[mype], MPI_SPATIAL_T, &dx_global[0], &nsizes[0], &ndispl[0], MPI_SPATIAL_T, 0, MPI_COMM_WORLD); MPI_Gatherv(&mesh->y[0], nsizes[mype], MPI_SPATIAL_T, &y_global[0], &nsizes[0], &ndispl[0], MPI_SPATIAL_T, 0, MPI_COMM_WORLD); MPI_Gatherv(&mesh->dy[0], nsizes[mype], MPI_SPATIAL_T, &dy_global[0], &nsizes[0], &ndispl[0], MPI_SPATIAL_T, 0, MPI_COMM_WORLD); MPI_Gatherv(&state->H[0], nsizes[mype], MPI_STATE_T, &H_global[0], &nsizes[0], &ndispl[0], MPI_STATE_T, 0, MPI_COMM_WORLD); if (view_mode == 0) { mesh->proc.resize(ncells); for (size_t ii = 0; ii<ncells; ii++){ mesh->proc[ii] = mesh->mype; } MPI_Gatherv(&mesh->proc[0], nsizes[mype], MPI_INT, &proc_global[0], &nsizes[0], &ndispl[0], MPI_INT, 0, MPI_COMM_WORLD); } set_cell_coordinates(&x_global[0], &dx_global[0], &y_global[0], &dy_global[0]); set_cell_data(&H_global[0]); set_cell_proc(&proc_global[0]); #endif set_viewmode(view_mode); set_circle_radius(circle_radius); draw_scene(); MPI_Barrier(MPI_COMM_WORLD); cpu_time_graphics += cpu_timer_stop(tstart_cpu); #endif // Output final results and timing information. if (ncycle >= niter) { //free_display(); // Get overall program timing. double elapsed_time = cpu_timer_stop(tstart); long long mem_used = memstats_memused(); if (mem_used > 0) { mesh->parallel_memory_output("Memory used ",mem_used, 0); mesh->parallel_memory_output("Memory peak ",memstats_mempeak(), 0); mesh->parallel_memory_output("Memory free ",memstats_memfree(), 0); mesh->parallel_memory_output("Memory available ",memstats_memtotal(), 0); } state->output_timing_info(do_cpu_calc, do_gpu_calc, elapsed_time); mesh->parallel_timer_output("CPU: graphics time was",cpu_time_graphics, 0); mesh->print_partition_measure(); mesh->print_calc_neighbor_type(); mesh->print_partition_type(); if (mype ==0) { printf("CPU: rezone frequency \t %8.4f\tpercent\n", (double)mesh->get_cpu_rezone_count()/(double)ncycle*100.0 ); printf("CPU: calc neigh frequency \t %8.4f\tpercent\n", (double)mesh->get_cpu_calc_neigh_count()/(double)ncycle*100.0 ); printf("CPU: load balance frequency \t %8.4f\tpercent\n", (double)mesh->get_cpu_load_balance_count()/(double)ncycle*100.0 ); printf("CPU: refine_smooth_iter per rezone \t %8.4f\t\n", (double)mesh->get_cpu_refine_smooth_count()/(double)mesh->get_cpu_rezone_count() ); } mesh->terminate(); state->terminate(); delete mesh; delete state; L7_Terminate(); exit(0); } // Complete final output. }
int main(int argc, char **argv) { // Process command-line arguments, if any. int mype=0; int numpe=0; parseInput(argc, argv); L7_Init(&mype, &numpe, &argc, argv); #if 1 // SKG make things sane for debugging signal(SIGSEGV, SIG_DFL); #endif struct timeval tstart_setup; cpu_timer_start(&tstart_setup); real_t circ_radius = 6.0; // Scale the circle appropriately for the mesh size. circ_radius = circ_radius * (real_t) nx / 128.0; int boundary = 1; int parallel_in = 1; // figure out the max number of threads that can be spawned if (0 == mype) { int nt = omp_get_max_threads(); printf("--- num openmp threads: %d\n", nt); fflush(stdout); } mesh = new Mesh(nx, ny, levmx, ndim, boundary, parallel_in, do_gpu_calc); if (DEBUG) { //if (mype == 0) mesh->print(); char filename[10]; sprintf(filename,"out%1d",mype); mesh->fp=fopen(filename,"w"); //mesh->print_local(); } mesh->init(nx, ny, circ_radius, initial_order, do_gpu_calc); size_t &ncells = mesh->ncells; size_t &ncells_global = mesh->ncells_global; int &noffset = mesh->noffset; state = new State(mesh); state->init(do_gpu_calc); vector<int> &nsizes = mesh->nsizes; vector<int> &ndispl = mesh->ndispl; vector<spatial_t> &x = mesh->x; vector<spatial_t> &dx = mesh->dx; vector<spatial_t> &y = mesh->y; vector<spatial_t> &dy = mesh->dy; nsizes.resize(numpe); ndispl.resize(numpe); int ncells_int = ncells; MPI_Allgather(&ncells_int, 1, MPI_INT, &nsizes[0], 1, MPI_INT, MPI_COMM_WORLD); ndispl[0]=0; for (int ip=1; ip<numpe; ip++){ ndispl[ip] = ndispl[ip-1] + nsizes[ip-1]; } noffset = ndispl[mype]; state->resize(ncells); state->fill_circle(circ_radius, 100.0, 7.0); x.clear(); dx.clear(); y.clear(); dy.clear(); // Kahan-type enhanced precision sum implementation. double H_sum = state->mass_sum(enhanced_precision_sum); if (mype == 0) printf ("Mass of initialized cells equal to %14.12lg\n", H_sum); H_sum_initial = H_sum; double cpu_time_main_setup = cpu_timer_stop(tstart_setup); mesh->parallel_timer_output("CPU: setup time time was",cpu_time_main_setup, 0); long long mem_used = memstats_memused(); if (mem_used > 0) { mesh->parallel_memory_output("Memory used in startup ",mem_used, 0); mesh->parallel_memory_output("Memory peak in startup ",memstats_mempeak(), 0); mesh->parallel_memory_output("Memory free at startup ",memstats_memfree(), 0); mesh->parallel_memory_output("Memory available at startup ",memstats_memtotal(), 0); } if (mype == 0) { printf("Iteration 0 timestep n/a Sim Time 0.0 cells %ld Mass Sum %14.12lg\n", ncells_global, H_sum); } for (int i = 0; i < MESH_COUNTER_SIZE; i++){ mesh->cpu_counters[i]=0; } for (int i = 0; i < MESH_TIMER_SIZE; i++){ mesh->cpu_timers[i]=0.0; } #ifdef HAVE_GRAPHICS #ifdef HAVE_OPENGL set_mysize(ncells_global); //vector<state_t> H_global; //vector<spatial_t> x_global; //vector<spatial_t> dx_global; //vector<spatial_t> y_global; //vector<spatial_t> dy_global; //vector<int> proc_global; if (mype == 0){ H_global.resize(ncells_global); x_global.resize(ncells_global); dx_global.resize(ncells_global); y_global.resize(ncells_global); dy_global.resize(ncells_global); proc_global.resize(ncells_global); } MPI_Gatherv(&x[0], nsizes[mype], MPI_SPATIAL_T, &x_global[0], &nsizes[0], &ndispl[0], MPI_SPATIAL_T, 0, MPI_COMM_WORLD); MPI_Gatherv(&dx[0], nsizes[mype], MPI_SPATIAL_T, &dx_global[0], &nsizes[0], &ndispl[0], MPI_SPATIAL_T, 0, MPI_COMM_WORLD); MPI_Gatherv(&y[0], nsizes[mype], MPI_SPATIAL_T, &y_global[0], &nsizes[0], &ndispl[0], MPI_SPATIAL_T, 0, MPI_COMM_WORLD); MPI_Gatherv(&dy[0], nsizes[mype], MPI_SPATIAL_T, &dy_global[0], &nsizes[0], &ndispl[0], MPI_SPATIAL_T, 0, MPI_COMM_WORLD); MPI_Gatherv(&state->H[0], nsizes[mype], MPI_STATE_T, &H_global[0], &nsizes[0], &ndispl[0], MPI_STATE_T, 0, MPI_COMM_WORLD); set_cell_data(&H_global[0]); set_cell_coordinates(&x_global[0], &dx_global[0], &y_global[0], &dy_global[0]); if (view_mode == 0) { mesh->proc.resize(ncells); for (size_t ii = 0; ii<ncells; ii++){ mesh->proc[ii] = mesh->mype; } MPI_Gatherv(&mesh->proc[0], nsizes[mype], MPI_INT, &proc_global[0], &nsizes[0], &ndispl[0], MPI_C_REAL, 0, MPI_COMM_WORLD); } set_cell_proc(&proc_global[0]); #endif #ifdef HAVE_MPE set_mysize(ncells); set_cell_data(&state->H[0]); set_cell_coordinates(&mesh->x[0], &mesh->dx[0], &mesh->y[0], &mesh->dy[0]); set_cell_proc(&mesh->proc[0]); #endif set_window((float)mesh->xmin, (float)mesh->xmax, (float)mesh->ymin, (float)mesh->ymax); set_viewmode(view_mode); set_outline((int)outline); init_display(&argc, argv, "Shallow Water"); set_circle_radius(circle_radius); draw_scene(); if (verbose) sleep(5); sleep(2); // Set flag to show mesh results rather than domain decomposition. view_mode = 1; // Clear superposition of circle on grid output. circle_radius = -1.0; MPI_Barrier(MPI_COMM_WORLD); cpu_timer_start(&tstart); set_idle_function(&do_calc); start_main_loop(); #else MPI_Barrier(MPI_COMM_WORLD); cpu_timer_start(&tstart); for (int it = 0; it < 10000000; it++) { do_calc(); } #endif return 0; }
extern "C" void do_calc(void) { double g = 9.80; double sigma = 0.95; int icount, jcount; // Initialize state variables for GPU calculation. size_t &ncells = mesh->ncells; vector<int> mpot; size_t old_ncells = ncells; size_t new_ncells = 0; double H_sum = -1.0; double deltaT = 0.0; // Main loop. for (int nburst = 0; nburst < outputInterval && ncycle < niter; nburst++, ncycle++) { old_ncells = ncells; // Calculate the real time step for the current discrete time step. deltaT = state->set_timestep(g, sigma); simTime += deltaT; if (mesh->nlft == NULL) mesh->calc_neighbors(); mesh->partition_measure(); // Currently not working -- may need to be earlier? //if (do_cpu_calc && ! mesh->have_boundary) { // state->add_boundary_cells(mesh); //} // Apply BCs is currently done as first part of gpu_finite_difference and so comparison won't work here // Execute main kernel state->calc_finite_difference(deltaT); // Size of arrays gets reduced to just the real cells in this call for have_boundary = 0 state->remove_boundary_cells(); mpot.resize(ncells); new_ncells = state->calc_refine_potential(mpot, icount, jcount); // Resize the mesh, inserting cells where refinement is necessary. state->rezone_all(icount, jcount, mpot); mpot.clear(); mesh->ncells = new_ncells; ncells = new_ncells; mesh->proc.resize(ncells); if (icount) { vector<int> index(ncells); mesh->partition_cells(numpe, index, cycle_reorder); state->state_reorder(index); state->memory_reset_ptrs(); } mesh->ncells = ncells; } H_sum = state->mass_sum(enhanced_precision_sum); if (isnan(H_sum)) { printf("Got a NAN on cycle %d\n",ncycle); exit(-1); } printf("Iteration %3d timestep %lf Sim Time %lf cells %ld Mass Sum %14.12lg Mass Change %12.6lg\n", ncycle, deltaT, simTime, ncells, H_sum, H_sum - H_sum_initial); struct timeval tstart_cpu; cpu_timer_start(&tstart_cpu); #ifdef HAVE_GRAPHICS mesh->calc_spatial_coordinates(0); set_mysize(ncells); set_viewmode(view_mode); set_cell_coordinates(&mesh->x[0], &mesh->dx[0], &mesh->y[0], &mesh->dy[0]); set_cell_data(&state->H[0]); set_cell_proc(&mesh->proc[0]); set_circle_radius(circle_radius); draw_scene(); #endif cpu_time_graphics += cpu_timer_stop(tstart_cpu); // Output final results and timing information. if (ncycle >= niter) { //free_display(); // Get overall program timing. double elapsed_time = cpu_timer_stop(tstart); long long mem_used = memstats_memused(); //if (mem_used > 0) { printf("Memory used %lld kB\n",mem_used); printf("Memory peak %lld kB\n",memstats_mempeak()); printf("Memory free %lld kB\n",memstats_memfree()); printf("Memory available %lld kB\n",memstats_memtotal()); //} state->output_timing_info(do_cpu_calc, do_gpu_calc, elapsed_time); printf("CPU: graphics time was\t %8.4f\ts\n", cpu_time_graphics ); mesh->print_partition_measure(); mesh->print_calc_neighbor_type(); mesh->print_partition_type(); printf("CPU: rezone frequency \t %8.4f\tpercent\n", (double)mesh->get_cpu_rezone_count()/(double)ncycle*100.0 ); printf("CPU: calc neigh frequency \t %8.4f\tpercent\n", (double)mesh->get_cpu_calc_neigh_count()/(double)ncycle*100.0 ); printf("CPU: refine_smooth_iter per rezone \t %8.4f\t\n", (double)mesh->get_cpu_refine_smooth_count()/(double)mesh->get_cpu_rezone_count() ); mesh->terminate(); state->terminate(); delete mesh; delete state; exit(0); } // Complete final output. }
int main(int argc, char **argv) { // Needed for code to compile correctly on the Mac int mype=0; int numpe=-1; // Process command-line arguments, if any. parseInput(argc, argv); struct timeval tstart_setup; cpu_timer_start(&tstart_setup); numpe = 16; double circ_radius = 6.0; // Scale the circle appropriately for the mesh size. circ_radius = circ_radius * (double) nx / 128.0; int boundary = 1; int parallel_in = 0; mesh = new Mesh(nx, ny, levmx, ndim, boundary, parallel_in, do_gpu_calc); if (DEBUG) { //if (mype == 0) mesh->print(); char filename[10]; sprintf(filename,"out%1d",mype); mesh->fp=fopen(filename,"w"); //mesh->print_local(); } mesh->init(nx, ny, circ_radius, initial_order, do_gpu_calc); size_t &ncells = mesh->ncells; state = new State(mesh); state->init(do_gpu_calc); mesh->proc.resize(ncells); mesh->calc_distribution(numpe); state->fill_circle(circ_radius, 100.0, 5.0); mesh->nlft = NULL; mesh->nrht = NULL; mesh->nbot = NULL; mesh->ntop = NULL; // Kahan-type enhanced precision sum implementation. double H_sum = state->mass_sum(enhanced_precision_sum); printf ("Mass of initialized cells equal to %14.12lg\n", H_sum); H_sum_initial = H_sum; double cpu_time_main_setup = cpu_timer_stop(tstart_setup); state->parallel_timer_output(numpe,mype,"CPU: setup time time was",cpu_time_main_setup); long long mem_used = memstats_memused(); if (mem_used > 0) { printf("Memory used in startup %lld kB\n",mem_used); printf("Memory peak in startup %lld kB\n",memstats_mempeak()); printf("Memory free at startup %lld kB\n",memstats_memfree()); printf("Memory available at startup %lld kB\n",memstats_memtotal()); } printf("Iteration 0 timestep n/a Sim Time 0.0 cells %ld Mass Sum %14.12lg\n", ncells, H_sum); mesh->cpu_calc_neigh_counter=0; mesh->cpu_time_calc_neighbors=0.0; mesh->cpu_rezone_counter=0; mesh->cpu_time_rezone_all=0.0; mesh->cpu_refine_smooth_counter=0; // Set up grid. #ifdef GRAPHICS_OUTPUT mesh->write_grid(n); #endif #ifdef HAVE_GRAPHICS set_mysize(ncells); set_viewmode(view_mode); set_window(mesh->xmin, mesh->xmax, mesh->ymin, mesh->ymax); set_outline((int)outline); init_display(&argc, argv, "Shallow Water", mype); set_cell_coordinates(&mesh->x[0], &mesh->dx[0], &mesh->y[0], &mesh->dy[0]); set_cell_data(&state->H[0]); set_cell_proc(&mesh->proc[0]); set_circle_radius(circle_radius); draw_scene(); //if (verbose) sleep(5); sleep(2); // Set flag to show mesh results rather than domain decomposition. view_mode = 1; // Clear superposition of circle on grid output. circle_radius = -1.0; cpu_timer_start(&tstart); set_idle_function(&do_calc); start_main_loop(); #else cpu_timer_start(&tstart); for (int it = 0; it < 10000000; it++) { do_calc(); } #endif return 0; }
extern "C" void do_calc(void) { double g = 9.80; double sigma = 0.95; int icount, jcount; if (cycle_reorder == ZORDER || cycle_reorder == HILBERT_SORT) { do_comparison_calc = 1; do_sync = 0; do_gpu_sync = 1; } size_t ncells = mesh->ncells; cl_mem &dev_H = state->dev_H; cl_mem &dev_U = state->dev_U; cl_mem &dev_V = state->dev_V; cl_mem &dev_celltype = mesh->dev_celltype; cl_mem &dev_i = mesh->dev_i; cl_mem &dev_j = mesh->dev_j; cl_mem &dev_level = mesh->dev_level; cl_mem &dev_mpot = state->dev_mpot; vector<int> mpot; size_t old_ncells = ncells; size_t new_ncells = 0; size_t new_ncells_gpu = 0; double H_sum = -1.0; double deltaT = 0.0; cl_command_queue command_queue = ezcl_get_command_queue(); // Main loop. for (int nburst = 0; nburst < outputInterval && ncycle < niter; nburst++, ncycle++) { // To reduce drift in solution if (do_sync) { ezcl_enqueue_read_buffer(command_queue, dev_H, CL_FALSE, 0, ncells*sizeof(cl_state_t), (void *)&state->H[0], NULL); ezcl_enqueue_read_buffer(command_queue, dev_U, CL_FALSE, 0, ncells*sizeof(cl_state_t), (void *)&state->U[0], NULL); ezcl_enqueue_read_buffer(command_queue, dev_V, CL_TRUE, 0, ncells*sizeof(cl_state_t), (void *)&state->V[0], NULL); } // Define basic domain decomposition parameters for GPU. old_ncells = ncells; // Calculate the real time step for the current discrete time step. double deltaT_cpu = state->set_timestep(g, sigma); double deltaT_gpu = state->gpu_set_timestep(sigma); #ifdef XXX // Compare time step values and pass deltaT in to the kernel. if (do_comparison_calc) { if (fabs(deltaT_gpu - deltaT_cpu) > .000001) { printf("Error with deltaT calc --- cpu %lf gpu %lf\n",deltaT_cpu,deltaT_gpu); } } #endif deltaT = (do_gpu_calc) ? deltaT_gpu : deltaT_cpu; simTime += deltaT; if (mesh->nlft == NULL) mesh->calc_neighbors(); if (mesh->dev_nlft == NULL) mesh->gpu_calc_neighbors(); if (do_comparison_calc) { mesh->compare_neighbors_gpu_global_to_cpu_global(); } mesh->partition_measure(); // Currently not working -- may need to be earlier? //if (do_cpu_calc && ! mesh->have_boundary) { // state->add_boundary_cells(mesh); //} // Apply BCs is currently done as first part of gpu_finite_difference and so comparison won't work here // Execute main kernel state->calc_finite_difference(deltaT); state->gpu_calc_finite_difference(deltaT); if (do_comparison_calc) { // Need to compare dev_H to H, etc state->compare_state_gpu_global_to_cpu_global("finite difference",ncycle,ncells); } if (ezcl_get_compute_device() == COMPUTE_DEVICE_ATI) { fflush(stdout); exit(0); } // Size of arrays gets reduced to just the real cells in this call for have_boundary = 0 state->remove_boundary_cells(); mpot.resize(ncells); new_ncells = state->calc_refine_potential(mpot, icount, jcount); //printf("DEBUG cpu icount %d jcount %d new_ncells %d\n",icount,jcount,new_ncells); new_ncells_gpu = state->gpu_calc_refine_potential(icount, jcount); if (do_comparison_calc) { if (new_ncells != new_ncells_gpu) { printf("ERROR -- new_ncells cpu %lu not equal to new_ncells gpu %lu\n",new_ncells,new_ncells_gpu); exit(0); } // Need to compare dev_mpot to mpot if (dev_mpot) { mesh->compare_mpot_gpu_global_to_cpu_global(&mpot[0], dev_mpot); } } // Sync up cpu array with gpu version to reduce differences due to minor numerical differences // otherwise cell count will diverge causing code problems and crashes if (dev_mpot) { if (do_sync) { ezcl_enqueue_read_buffer(command_queue, dev_mpot, CL_TRUE, 0, ncells*sizeof(cl_int), &mpot[0], NULL); } if (do_gpu_sync) { ezcl_enqueue_write_buffer(command_queue, dev_mpot, CL_TRUE, 0, ncells*sizeof(cl_int), &mpot[0], NULL); } } if (do_comparison_calc) { // This compares ioffset for each block in the calculation if (dev_mpot) { mesh->compare_ioffset_gpu_global_to_cpu_global(old_ncells, &mpot[0]); } } if (do_gpu_sync) { if (dev_mpot) { size_t local_work_size = MIN(old_ncells, TILE_SIZE); size_t global_work_size = ((old_ncells+local_work_size - 1) /local_work_size) * local_work_size; //size_t block_size = (ncells + TILE_SIZE - 1) / TILE_SIZE; // For on-device global reduction kernel. size_t block_size = global_work_size/local_work_size; vector<int> ioffset(block_size); int mtotal = 0; for (int ig=0; ig<(int)(old_ncells+TILE_SIZE-1)/TILE_SIZE; ig++){ int mcount = 0; for (int ic=ig*TILE_SIZE; ic<(ig+1)*TILE_SIZE; ic++){ if (ic >= (int)old_ncells) break; if (mesh->celltype[ic] == REAL_CELL) { mcount += mpot[ic] ? 4 : 1; } else { mcount += mpot[ic] ? 2 : 1; } } ioffset[ig] = mtotal; mtotal += mcount; } ezcl_enqueue_write_buffer(command_queue, mesh->dev_ioffset, CL_TRUE, 0, block_size*sizeof(cl_int), &ioffset[0], NULL); } } if (do_comparison_calc) { new_ncells = new_ncells_gpu; } //int add_ncells = new_ncells - old_ncells; state->rezone_all(icount, jcount, mpot); // Clear does not delete mpot, so have to swap with an empty vector to get // it to delete the mpot memory. This is all to avoid valgrind from showing // it as a reachable memory leak //mpot.clear(); vector<int>().swap(mpot); // Resize the mesh, inserting cells where refinement is necessary. if (dev_mpot) state->gpu_rezone_all(icount, jcount, localStencil); ncells = new_ncells; mesh->ncells = new_ncells; //ezcl_device_memory_remove(dev_ioffset); if (do_comparison_calc) { state->compare_state_gpu_global_to_cpu_global("rezone all",ncycle,ncells); mesh->compare_indices_gpu_global_to_cpu_global(); } //if (do_gpu_calc) { // int bcount = mesh->gpu_count_BCs(); //} mesh->proc.resize(ncells); if (icount || jcount) { if (cycle_reorder == ZORDER || cycle_reorder == HILBERT_SORT) { mesh->calc_spatial_coordinates(0); } vector<int> index(ncells); mesh->partition_cells(numpe, index, cycle_reorder); //state->state_reorder(index); if (do_gpu_sync) { ezcl_enqueue_write_buffer(command_queue, dev_celltype, CL_FALSE, 0, ncells*sizeof(cl_int), (void *)&mesh->celltype[0], NULL); ezcl_enqueue_write_buffer(command_queue, dev_i, CL_FALSE, 0, ncells*sizeof(cl_int), (void *)&mesh->i[0], NULL); ezcl_enqueue_write_buffer(command_queue, dev_j, CL_FALSE, 0, ncells*sizeof(cl_int), (void *)&mesh->j[0], NULL); ezcl_enqueue_write_buffer(command_queue, dev_level, CL_TRUE, 0, ncells*sizeof(cl_int), (void *)&mesh->level[0], NULL); } } if (do_gpu_sync) { ezcl_enqueue_write_buffer(command_queue, dev_H, CL_FALSE, 0, ncells*sizeof(cl_state_t), (void *)&state->H[0], NULL); ezcl_enqueue_write_buffer(command_queue, dev_U, CL_FALSE, 0, ncells*sizeof(cl_state_t), (void *)&state->U[0], NULL); ezcl_enqueue_write_buffer(command_queue, dev_V, CL_TRUE, 0, ncells*sizeof(cl_state_t), (void *)&state->V[0], NULL); } } // End burst loop H_sum = state->mass_sum(enhanced_precision_sum); if (isnan(H_sum)) { printf("Got a NAN on cycle %d\n",ncycle); exit(-1); } if (do_comparison_calc) { double total_mass = state->gpu_mass_sum(enhanced_precision_sum); if (fabs(total_mass - H_sum) > CONSERVATION_EPS) printf("Error: mass sum gpu %f cpu %f\n", total_mass, H_sum);/***/ } printf("Iteration %3d timestep %lf Sim Time %lf cells %ld Mass Sum %14.12lg Mass Change %12.6lg\n", ncycle, deltaT, simTime, ncells, H_sum, H_sum - H_sum_initial); #ifdef HAVE_GRAPHICS if (do_cpu_calc){ mesh->calc_spatial_coordinates(0); } if (do_gpu_calc){ cl_mem dev_x = ezcl_malloc(NULL, const_cast<char *>("dev_x"), &ncells, sizeof(cl_spatial_t), CL_MEM_READ_WRITE, 0); cl_mem dev_dx = ezcl_malloc(NULL, const_cast<char *>("dev_dx"), &ncells, sizeof(cl_spatial_t), CL_MEM_READ_WRITE, 0); cl_mem dev_y = ezcl_malloc(NULL, const_cast<char *>("dev_y"), &ncells, sizeof(cl_spatial_t), CL_MEM_READ_WRITE, 0); cl_mem dev_dy = ezcl_malloc(NULL, const_cast<char *>("dev_dy"), &ncells, sizeof(cl_spatial_t), CL_MEM_READ_WRITE, 0); mesh->gpu_calc_spatial_coordinates(dev_x, dev_dx, dev_y, dev_dy); if (do_comparison_calc){ #ifdef FULL_PRECISION mesh->compare_coordinates_gpu_global_to_cpu_global_double(dev_x, dev_dx, dev_y, dev_dy, dev_H, &state->H[0]); #else mesh->compare_coordinates_gpu_global_to_cpu_global_float(dev_x, dev_dx, dev_y, dev_dy, dev_H, &state->H[0]); #endif } ezcl_device_memory_remove(dev_x); ezcl_device_memory_remove(dev_dx); ezcl_device_memory_remove(dev_y); ezcl_device_memory_remove(dev_dy); } set_mysize(ncells); set_viewmode(view_mode); set_cell_coordinates(&mesh->x[0], &mesh->dx[0], &mesh->y[0], &mesh->dy[0]); set_cell_data(&state->H[0]); set_cell_proc(&mesh->proc[0]); set_circle_radius(circle_radius); draw_scene(); #endif // Output final results and timing information. if (ncycle >= niter) { //free_display(); // Get overall program timing. double elapsed_time = cpu_timer_stop(tstart); state->output_timing_info(do_cpu_calc, do_gpu_calc, elapsed_time); mesh->print_partition_measure(); mesh->print_calc_neighbor_type(); mesh->print_partition_type(); printf("CPU: rezone frequency \t %8.4f\tpercent\n", (double)mesh->get_cpu_rezone_count()/(double)ncycle*100.0 ); printf("CPU: calc neigh frequency \t %8.4f\tpercent\n", (double)mesh->get_cpu_calc_neigh_count()/(double)ncycle*100.0 ); if (mesh->get_cpu_rezone_count() > 0) { printf("CPU: refine_smooth_iter per rezone \t %8.4f\t\n", (double)mesh->get_cpu_refine_smooth_count()/(double)mesh->get_cpu_rezone_count() ); } printf("GPU: rezone frequency \t %8.4f\tpercent\n", (double)mesh->get_gpu_rezone_count()/(double)ncycle*100.0 ); printf("GPU: calc neigh frequency \t %8.4f\tpercent\n", (double)mesh->get_gpu_calc_neigh_count()/(double)ncycle*100.0 ); if (mesh->get_gpu_rezone_count() > 0) { printf("GPU: refine_smooth_iter per rezone \t %8.4f\t\n", (double)mesh->get_gpu_refine_smooth_count()/(double)mesh->get_gpu_rezone_count() ); } if (mesh->dev_nlft != NULL){ ezcl_device_memory_remove(mesh->dev_nlft); ezcl_device_memory_remove(mesh->dev_nrht); ezcl_device_memory_remove(mesh->dev_nbot); ezcl_device_memory_remove(mesh->dev_ntop); } mesh->terminate(); state->terminate(); ezcl_terminate(); ezcl_mem_walk_all(); exit(0); } // Complete final output. }
int main(int argc, char **argv) { int ierr; // Needed for code to compile correctly on the Mac int mype=0; int numpe=-1; // Process command-line arguments, if any. parseInput(argc, argv); numpe = 16; ierr = ezcl_devtype_init(CL_DEVICE_TYPE_GPU, 0); if (ierr == EZCL_NODEVICE) { ierr = ezcl_devtype_init(CL_DEVICE_TYPE_CPU, 0); } if (ierr != EZCL_SUCCESS) { printf("No opencl device available -- aborting\n"); exit(-1); } real_t circ_radius = 6.0; // Scale the circle appropriately for the mesh size. circ_radius = circ_radius * (real_t) nx / 128.0; int boundary = 1; int parallel_in = 0; mesh = new Mesh(nx, ny, levmx, ndim, boundary, parallel_in, do_gpu_calc); if (DEBUG) { //if (mype == 0) mesh->print(); char filename[10]; sprintf(filename,"out%1d",mype); mesh->fp=fopen(filename,"w"); //mesh->print_local(); } mesh->init(nx, ny, circ_radius, initial_order, do_gpu_calc); size_t &ncells = mesh->ncells; state = new State(mesh); state->init(do_gpu_calc); mesh->proc.resize(ncells); mesh->calc_distribution(numpe); state->fill_circle(circ_radius, 100.0, 7.0); cl_mem &dev_celltype = mesh->dev_celltype; cl_mem &dev_i = mesh->dev_i; cl_mem &dev_j = mesh->dev_j; cl_mem &dev_level = mesh->dev_level; cl_mem &dev_H = state->dev_H; cl_mem &dev_U = state->dev_U; cl_mem &dev_V = state->dev_V; state_t *H = state->H; state_t *U = state->U; state_t *V = state->V; state->allocate_device_memory(ncells); size_t one = 1; state->dev_deltaT = ezcl_malloc(NULL, const_cast<char *>("dev_deltaT"), &one, sizeof(cl_real_t), CL_MEM_READ_WRITE, 0); size_t mem_request = (int)((float)ncells*mesh->mem_factor); dev_celltype = ezcl_malloc(NULL, const_cast<char *>("dev_celltype"), &mem_request, sizeof(cl_int), CL_MEM_READ_ONLY, 0); dev_i = ezcl_malloc(NULL, const_cast<char *>("dev_i"), &mem_request, sizeof(cl_int), CL_MEM_READ_ONLY, 0); dev_j = ezcl_malloc(NULL, const_cast<char *>("dev_j"), &mem_request, sizeof(cl_int), CL_MEM_READ_ONLY, 0); dev_level = ezcl_malloc(NULL, const_cast<char *>("dev_level"), &mem_request, sizeof(cl_int), CL_MEM_READ_ONLY, 0); cl_command_queue command_queue = ezcl_get_command_queue(); ezcl_enqueue_write_buffer(command_queue, dev_celltype, CL_FALSE, 0, ncells*sizeof(cl_int), (void *)&mesh->celltype[0], &start_write_event); ezcl_enqueue_write_buffer(command_queue, dev_i, CL_FALSE, 0, ncells*sizeof(cl_int), (void *)&mesh->i[0], NULL ); ezcl_enqueue_write_buffer(command_queue, dev_j, CL_FALSE, 0, ncells*sizeof(cl_int), (void *)&mesh->j[0], NULL ); ezcl_enqueue_write_buffer(command_queue, dev_level, CL_FALSE, 0, ncells*sizeof(cl_int), (void *)&mesh->level[0], NULL ); ezcl_enqueue_write_buffer(command_queue, dev_H, CL_FALSE, 0, ncells*sizeof(cl_state_t), (void *)&H[0], NULL ); ezcl_enqueue_write_buffer(command_queue, dev_U, CL_FALSE, 0, ncells*sizeof(cl_state_t), (void *)&U[0], NULL ); ezcl_enqueue_write_buffer(command_queue, dev_V, CL_TRUE, 0, ncells*sizeof(cl_state_t), (void *)&V[0], &end_write_event ); state->gpu_time_write += ezcl_timer_calc(&start_write_event, &end_write_event); mesh->nlft = NULL; mesh->nrht = NULL; mesh->nbot = NULL; mesh->ntop = NULL; mesh->dev_nlft = NULL; mesh->dev_nrht = NULL; mesh->dev_nbot = NULL; mesh->dev_ntop = NULL; if (ezcl_get_compute_device() == COMPUTE_DEVICE_ATI) enhanced_precision_sum = false; // Kahan-type enhanced precision sum implementation. double H_sum = state->mass_sum(enhanced_precision_sum); printf ("Mass of initialized cells equal to %14.12lg\n", H_sum); H_sum_initial = H_sum; printf("Iteration 0 timestep n/a Sim Time 0.0 cells %ld Mass Sum %14.12lg\n", ncells, H_sum); mesh->cpu_calc_neigh_counter=0; mesh->cpu_time_calc_neighbors=0.0; mesh->cpu_rezone_counter=0; mesh->cpu_time_rezone_all=0.0; mesh->cpu_refine_smooth_counter=0; // Set up grid. #ifdef GRAPHICS_OUTPUT mesh->write_grid(n); #endif #ifdef HAVE_GRAPHICS set_mysize(ncells); set_viewmode(view_mode); set_window((float)mesh->xmin, (float)mesh->xmax, (float)mesh->ymin, (float)mesh->ymax); set_outline((int)outline); init_display(&argc, argv, "Shallow Water", mype); set_cell_coordinates(&mesh->x[0], &mesh->dx[0], &mesh->y[0], &mesh->dy[0]); set_cell_data(&H[0]); set_cell_proc(&mesh->proc[0]); set_circle_radius(circle_radius); draw_scene(); //if (verbose) sleep(5); sleep(2); // Set flag to show mesh results rather than domain decomposition. view_mode = 1; // Clear superposition of circle on grid output. circle_radius = -1.0; cpu_timer_start(&tstart); set_idle_function(&do_calc); start_main_loop(); #else cpu_timer_start(&tstart); for (int it = 0; it < 10000000; it++) { do_calc(); } #endif return 0; }