/*
 * Create the plan for a problem accepted by this solver.
 *
 * ego_  solver instance (viewed as S); supplies the codelet `k`, its
 *       descriptor `desc`, and the `bufferedp` mode flag.
 * p_    problem to plan (viewed as problem_rdft once accepted).
 * plnr  planner; not used here.
 *
 * Returns (plan *)0 when the applicability test for the selected mode
 * (applicable_buf if bufferedp, applicable otherwise) rejects the problem;
 * otherwise returns the embedded generic plan of a newly made P.
 */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(rdft_solve), X(null_awake), print, destroy
     };
     const S *slv = (const S *) ego_;
     const problem_rdft *prob;
     const iodim *dim0;
     P *pln;
     INT stride_r, stride_c, batch, len;

     UNUSED(plnr);

     /* Buffered and unbuffered solvers use different applicability tests. */
     if (slv->bufferedp ? !applicable_buf(ego_, p_) : !applicable(ego_, p_))
          return (plan *)0;

     prob = (const problem_rdft *) p_;
     dim0 = prob->sz->dims;

     /*
      * For an R2HC kind stride_r is the input stride and stride_c the
      * output stride; for any other kind the two roles are swapped.
      * The apply routine is chosen by direction and by buffering mode.
      */
     if (R2HC_KINDP(prob->kind[0])) {
          stride_r = dim0[0].is;
          stride_c = dim0[0].os;
          pln = MKPLAN_RDFT(P, &padt,
                            slv->bufferedp ? apply_buf_r2hc : apply_r2hc);
     } else {
          stride_r = dim0[0].os;
          stride_c = dim0[0].is;
          pln = MKPLAN_RDFT(P, &padt,
                            slv->bufferedp ? apply_buf_hc2r : apply_hc2r);
     }

     len = dim0[0].n;

     pln->k = slv->k;
     pln->n = len;

     /* Stride tables built from the problem's own strides. */
     pln->rs0 = stride_r;
     pln->rs = X(mkstride)(len, 2 * stride_r);
     pln->csr = X(mkstride)(len, stride_c);
     pln->csi = X(mkstride)(len, -stride_c);
     pln->ioffset = ioffset(prob->kind[0], len, stride_c);

     /* Same tables again, with the batch size standing in for the stride. */
     batch = compute_batchsize(len);
     pln->brs = X(mkstride)(len, 2 * batch);
     pln->bcsr = X(mkstride)(len, batch);
     pln->bcsi = X(mkstride)(len, -batch);
     pln->bioffset = ioffset(prob->kind[0], len, batch);

     X(tensor_tornk1)(prob->vecsz, &pln->vl, &pln->ivs, &pln->ovs);

     pln->slv = slv;

     /* Operation count: codelet ops scaled by vl / genus->vl ... */
     X(ops_zero)(&pln->super.super.ops);
     X(ops_madd2)(pln->vl / slv->desc->genus->vl,
                  &slv->desc->ops,
                  &pln->super.super.ops);

     /* ... plus 2 * n * vl "other" operations in buffered mode. */
     if (slv->bufferedp)
          pln->super.super.ops.other += 2 * len * pln->vl;

     pln->super.super.could_prune_now_p = !slv->bufferedp;

     return &(pln->super.super);
}
extern "C" void do_calc(void) { double g = 9.80; double sigma = 0.95; int icount, jcount; if (cycle_reorder == ZORDER || cycle_reorder == HILBERT_SORT) { do_comparison_calc = 1; do_sync = 0; do_gpu_sync = 1; } size_t ncells = mesh->ncells; cl_mem &dev_H = state->dev_H; cl_mem &dev_U = state->dev_U; cl_mem &dev_V = state->dev_V; cl_mem &dev_celltype = mesh->dev_celltype; cl_mem &dev_i = mesh->dev_i; cl_mem &dev_j = mesh->dev_j; cl_mem &dev_level = mesh->dev_level; cl_mem &dev_mpot = state->dev_mpot; vector<int> mpot; size_t old_ncells = ncells; size_t new_ncells = 0; size_t new_ncells_gpu = 0; double H_sum = -1.0; double deltaT = 0.0; cl_command_queue command_queue = ezcl_get_command_queue(); // Main loop. for (int nburst = 0; nburst < outputInterval && ncycle < niter; nburst++, ncycle++) { // To reduce drift in solution if (do_sync) { ezcl_enqueue_read_buffer(command_queue, dev_H, CL_FALSE, 0, ncells*sizeof(cl_state_t), (void *)&state->H[0], NULL); ezcl_enqueue_read_buffer(command_queue, dev_U, CL_FALSE, 0, ncells*sizeof(cl_state_t), (void *)&state->U[0], NULL); ezcl_enqueue_read_buffer(command_queue, dev_V, CL_TRUE, 0, ncells*sizeof(cl_state_t), (void *)&state->V[0], NULL); } // Define basic domain decomposition parameters for GPU. old_ncells = ncells; // Calculate the real time step for the current discrete time step. double deltaT_cpu = state->set_timestep(g, sigma); double deltaT_gpu = state->gpu_set_timestep(sigma); #ifdef XXX // Compare time step values and pass deltaT in to the kernel. if (do_comparison_calc) { if (fabs(deltaT_gpu - deltaT_cpu) > .000001) { printf("Error with deltaT calc --- cpu %lf gpu %lf\n",deltaT_cpu,deltaT_gpu); } } #endif deltaT = (do_gpu_calc) ? 
deltaT_gpu : deltaT_cpu; simTime += deltaT; if (mesh->nlft == NULL) mesh->calc_neighbors(); if (mesh->dev_nlft == NULL) mesh->gpu_calc_neighbors(); if (do_comparison_calc) { mesh->compare_neighbors_gpu_global_to_cpu_global(); } mesh->partition_measure(); // Currently not working -- may need to be earlier? //if (do_cpu_calc && ! mesh->have_boundary) { // state->add_boundary_cells(mesh); //} // Apply BCs is currently done as first part of gpu_finite_difference and so comparison won't work here // Execute main kernel state->calc_finite_difference(deltaT); state->gpu_calc_finite_difference(deltaT); if (do_comparison_calc) { // Need to compare dev_H to H, etc state->compare_state_gpu_global_to_cpu_global("finite difference",ncycle,ncells); } if (ezcl_get_compute_device() == COMPUTE_DEVICE_ATI) { fflush(stdout); exit(0); } // Size of arrays gets reduced to just the real cells in this call for have_boundary = 0 state->remove_boundary_cells(); mpot.resize(ncells); new_ncells = state->calc_refine_potential(mpot, icount, jcount); //printf("DEBUG cpu icount %d jcount %d new_ncells %d\n",icount,jcount,new_ncells); new_ncells_gpu = state->gpu_calc_refine_potential(icount, jcount); if (do_comparison_calc) { if (new_ncells != new_ncells_gpu) { printf("ERROR -- new_ncells cpu %lu not equal to new_ncells gpu %lu\n",new_ncells,new_ncells_gpu); exit(0); } // Need to compare dev_mpot to mpot if (dev_mpot) { mesh->compare_mpot_gpu_global_to_cpu_global(&mpot[0], dev_mpot); } } // Sync up cpu array with gpu version to reduce differences due to minor numerical differences // otherwise cell count will diverge causing code problems and crashes if (dev_mpot) { if (do_sync) { ezcl_enqueue_read_buffer(command_queue, dev_mpot, CL_TRUE, 0, ncells*sizeof(cl_int), &mpot[0], NULL); } if (do_gpu_sync) { ezcl_enqueue_write_buffer(command_queue, dev_mpot, CL_TRUE, 0, ncells*sizeof(cl_int), &mpot[0], NULL); } } if (do_comparison_calc) { // This compares ioffset for each block in the calculation if 
(dev_mpot) { mesh->compare_ioffset_gpu_global_to_cpu_global(old_ncells, &mpot[0]); } } if (do_gpu_sync) { if (dev_mpot) { size_t local_work_size = MIN(old_ncells, TILE_SIZE); size_t global_work_size = ((old_ncells+local_work_size - 1) /local_work_size) * local_work_size; //size_t block_size = (ncells + TILE_SIZE - 1) / TILE_SIZE; // For on-device global reduction kernel. size_t block_size = global_work_size/local_work_size; vector<int> ioffset(block_size); int mtotal = 0; for (int ig=0; ig<(int)(old_ncells+TILE_SIZE-1)/TILE_SIZE; ig++){ int mcount = 0; for (int ic=ig*TILE_SIZE; ic<(ig+1)*TILE_SIZE; ic++){ if (ic >= (int)old_ncells) break; if (mesh->celltype[ic] == REAL_CELL) { mcount += mpot[ic] ? 4 : 1; } else { mcount += mpot[ic] ? 2 : 1; } } ioffset[ig] = mtotal; mtotal += mcount; } ezcl_enqueue_write_buffer(command_queue, mesh->dev_ioffset, CL_TRUE, 0, block_size*sizeof(cl_int), &ioffset[0], NULL); } } if (do_comparison_calc) { new_ncells = new_ncells_gpu; } //int add_ncells = new_ncells - old_ncells; state->rezone_all(icount, jcount, mpot); // Clear does not delete mpot, so have to swap with an empty vector to get // it to delete the mpot memory. This is all to avoid valgrind from showing // it as a reachable memory leak //mpot.clear(); vector<int>().swap(mpot); // Resize the mesh, inserting cells where refinement is necessary. 
if (dev_mpot) state->gpu_rezone_all(icount, jcount, localStencil); ncells = new_ncells; mesh->ncells = new_ncells; //ezcl_device_memory_remove(dev_ioffset); if (do_comparison_calc) { state->compare_state_gpu_global_to_cpu_global("rezone all",ncycle,ncells); mesh->compare_indices_gpu_global_to_cpu_global(); } //if (do_gpu_calc) { // int bcount = mesh->gpu_count_BCs(); //} mesh->proc.resize(ncells); if (icount || jcount) { if (cycle_reorder == ZORDER || cycle_reorder == HILBERT_SORT) { mesh->calc_spatial_coordinates(0); } vector<int> index(ncells); mesh->partition_cells(numpe, index, cycle_reorder); //state->state_reorder(index); if (do_gpu_sync) { ezcl_enqueue_write_buffer(command_queue, dev_celltype, CL_FALSE, 0, ncells*sizeof(cl_int), (void *)&mesh->celltype[0], NULL); ezcl_enqueue_write_buffer(command_queue, dev_i, CL_FALSE, 0, ncells*sizeof(cl_int), (void *)&mesh->i[0], NULL); ezcl_enqueue_write_buffer(command_queue, dev_j, CL_FALSE, 0, ncells*sizeof(cl_int), (void *)&mesh->j[0], NULL); ezcl_enqueue_write_buffer(command_queue, dev_level, CL_TRUE, 0, ncells*sizeof(cl_int), (void *)&mesh->level[0], NULL); } } if (do_gpu_sync) { ezcl_enqueue_write_buffer(command_queue, dev_H, CL_FALSE, 0, ncells*sizeof(cl_state_t), (void *)&state->H[0], NULL); ezcl_enqueue_write_buffer(command_queue, dev_U, CL_FALSE, 0, ncells*sizeof(cl_state_t), (void *)&state->U[0], NULL); ezcl_enqueue_write_buffer(command_queue, dev_V, CL_TRUE, 0, ncells*sizeof(cl_state_t), (void *)&state->V[0], NULL); } } // End burst loop H_sum = state->mass_sum(enhanced_precision_sum); if (isnan(H_sum)) { printf("Got a NAN on cycle %d\n",ncycle); exit(-1); } if (do_comparison_calc) { double total_mass = state->gpu_mass_sum(enhanced_precision_sum); if (fabs(total_mass - H_sum) > CONSERVATION_EPS) printf("Error: mass sum gpu %f cpu %f\n", total_mass, H_sum);/***/ } printf("Iteration %3d timestep %lf Sim Time %lf cells %ld Mass Sum %14.12lg Mass Change %12.6lg\n", ncycle, deltaT, simTime, ncells, H_sum, H_sum - 
H_sum_initial); #ifdef HAVE_GRAPHICS if (do_cpu_calc){ mesh->calc_spatial_coordinates(0); } if (do_gpu_calc){ cl_mem dev_x = ezcl_malloc(NULL, const_cast<char *>("dev_x"), &ncells, sizeof(cl_spatial_t), CL_MEM_READ_WRITE, 0); cl_mem dev_dx = ezcl_malloc(NULL, const_cast<char *>("dev_dx"), &ncells, sizeof(cl_spatial_t), CL_MEM_READ_WRITE, 0); cl_mem dev_y = ezcl_malloc(NULL, const_cast<char *>("dev_y"), &ncells, sizeof(cl_spatial_t), CL_MEM_READ_WRITE, 0); cl_mem dev_dy = ezcl_malloc(NULL, const_cast<char *>("dev_dy"), &ncells, sizeof(cl_spatial_t), CL_MEM_READ_WRITE, 0); mesh->gpu_calc_spatial_coordinates(dev_x, dev_dx, dev_y, dev_dy); if (do_comparison_calc){ #ifdef FULL_PRECISION mesh->compare_coordinates_gpu_global_to_cpu_global_double(dev_x, dev_dx, dev_y, dev_dy, dev_H, &state->H[0]); #else mesh->compare_coordinates_gpu_global_to_cpu_global_float(dev_x, dev_dx, dev_y, dev_dy, dev_H, &state->H[0]); #endif } ezcl_device_memory_remove(dev_x); ezcl_device_memory_remove(dev_dx); ezcl_device_memory_remove(dev_y); ezcl_device_memory_remove(dev_dy); } set_mysize(ncells); set_viewmode(view_mode); set_cell_coordinates(&mesh->x[0], &mesh->dx[0], &mesh->y[0], &mesh->dy[0]); set_cell_data(&state->H[0]); set_cell_proc(&mesh->proc[0]); set_circle_radius(circle_radius); draw_scene(); #endif // Output final results and timing information. if (ncycle >= niter) { //free_display(); // Get overall program timing. 
double elapsed_time = cpu_timer_stop(tstart); state->output_timing_info(do_cpu_calc, do_gpu_calc, elapsed_time); mesh->print_partition_measure(); mesh->print_calc_neighbor_type(); mesh->print_partition_type(); printf("CPU: rezone frequency \t %8.4f\tpercent\n", (double)mesh->get_cpu_rezone_count()/(double)ncycle*100.0 ); printf("CPU: calc neigh frequency \t %8.4f\tpercent\n", (double)mesh->get_cpu_calc_neigh_count()/(double)ncycle*100.0 ); if (mesh->get_cpu_rezone_count() > 0) { printf("CPU: refine_smooth_iter per rezone \t %8.4f\t\n", (double)mesh->get_cpu_refine_smooth_count()/(double)mesh->get_cpu_rezone_count() ); } printf("GPU: rezone frequency \t %8.4f\tpercent\n", (double)mesh->get_gpu_rezone_count()/(double)ncycle*100.0 ); printf("GPU: calc neigh frequency \t %8.4f\tpercent\n", (double)mesh->get_gpu_calc_neigh_count()/(double)ncycle*100.0 ); if (mesh->get_gpu_rezone_count() > 0) { printf("GPU: refine_smooth_iter per rezone \t %8.4f\t\n", (double)mesh->get_gpu_refine_smooth_count()/(double)mesh->get_gpu_rezone_count() ); } if (mesh->dev_nlft != NULL){ ezcl_device_memory_remove(mesh->dev_nlft); ezcl_device_memory_remove(mesh->dev_nrht); ezcl_device_memory_remove(mesh->dev_nbot); ezcl_device_memory_remove(mesh->dev_ntop); } mesh->terminate(); state->terminate(); ezcl_terminate(); ezcl_mem_walk_all(); exit(0); } // Complete final output. }