/* * This function advances the state of the system by nsteps timesteps. The * curr is the current state of the system. * next is the output matrix to store the next time step into. */ static void fwd(TYPE *next, TYPE *curr, TYPE *vsq, TYPE *c_coeff, int nx, int ny, int dimx, int dimy, int radius) { #pragma acc parallel copy(next[0:dimx * dimy], curr[0:dimx * dimy], \ vsq[0:dimx * dimy], c_coeff[0:NUM_COEFF]) #pragma acc loop gang worker vector collapse(2) for (int y = 0; y < ny; y++) { for (int x = 0; x < nx; x++) { int this_offset = POINT_OFFSET(x, y, dimx, radius); TYPE temp = 2.0f * curr[this_offset] - next[this_offset]; TYPE div = c_coeff[0] * curr[this_offset]; #pragma acc loop seq for (int d = 1; d <= radius; d++) { int y_pos_offset = POINT_OFFSET(x, y + d, dimx, radius); int y_neg_offset = POINT_OFFSET(x, y - d, dimx, radius); int x_pos_offset = POINT_OFFSET(x + d, y, dimx, radius); int x_neg_offset = POINT_OFFSET(x - d, y, dimx, radius); div += c_coeff[d] * (curr[y_pos_offset] + curr[y_neg_offset] + curr[x_pos_offset] + curr[x_neg_offset]); } next[this_offset] = temp + div * vsq[this_offset]; } } }
/* * This function advances the state of the system by nsteps timesteps. The * curr is the current state of the system. * next is the output matrix to store the next time step into. */ static void fwd(TYPE *next, TYPE *curr, TYPE *vsq, TYPE *c_coeff, int nx, int ny, int nz, int dimx, int dimy, int dimz, int radius) { for (int z = 0; z < nz; z++) { for (int y = 0; y < ny; y++) { for (int x = 0; x < nx; x++) { int this_offset = POINT_OFFSET(x, y, z, dimy, dimx, radius); TYPE temp = 2.0f * curr[this_offset] - next[this_offset]; TYPE div = c_coeff[0] * curr[this_offset]; for (int d = 1; d <= radius; d++) { int z_pos_offset = POINT_OFFSET(x, y, z + d, dimy, dimx, radius); int z_neg_offset = POINT_OFFSET(x, y, z - d, dimy, dimx, radius); int y_pos_offset = POINT_OFFSET(x, y + d, z, dimy, dimx, radius); int y_neg_offset = POINT_OFFSET(x, y - d, z, dimy, dimx, radius); int x_pos_offset = POINT_OFFSET(x + d, y, z, dimy, dimx, radius); int x_neg_offset = POINT_OFFSET(x - d, y, z, dimy, dimx, radius); div += c_coeff[d] * (curr[z_pos_offset] + curr[z_neg_offset] + curr[y_pos_offset] + curr[y_neg_offset] + curr[x_pos_offset] + curr[x_neg_offset]); } next[this_offset] = temp + div * vsq[this_offset]; } } } }
/* * This function advances the state of the system by nsteps timesteps. The * curr is the current state of the system. * next is the output matrix to store the next time step into. */ static void fwd(TYPE *next, TYPE *curr, TYPE *vsq, TYPE *c_coeff, int nx, int ny, int dimx, int dimy, int radius) { #pragma omp parallel for collapse(2) for (int y = 0; y < ny; y++) { for (int x = 0; x < nx; x++) { int this_offset = POINT_OFFSET(x, y, dimx, radius); TYPE temp = 2.0f * curr[this_offset] - next[this_offset]; TYPE div = c_coeff[0] * curr[this_offset]; for (int d = 1; d <= radius; d++) { int y_pos_offset = POINT_OFFSET(x, y + d, dimx, radius); int y_neg_offset = POINT_OFFSET(x, y - d, dimx, radius); int x_pos_offset = POINT_OFFSET(x + d, y, dimx, radius); int x_neg_offset = POINT_OFFSET(x - d, y, dimx, radius); div += c_coeff[d] * (curr[y_pos_offset] + curr[y_neg_offset] + curr[x_pos_offset] + curr[x_neg_offset]); } next[this_offset] = temp + div * vsq[this_offset]; } } }
int main( int argc, char *argv[] ) { config conf; setup_config(&conf, argc, argv); init_progress(conf.progress_width, conf.nsteps, conf.progress_disabled); TYPE dx = 20.f; TYPE dt = 0.002f; // compute the pitch for perfect coalescing size_t dimx = conf.nx + 2*conf.radius; size_t dimy = conf.ny + 2*conf.radius; size_t nbytes = dimx * dimy * sizeof(TYPE); if (conf.verbose) { printf("x = %zu, y = %zu\n", dimx, dimy); printf("nsteps = %d\n", conf.nsteps); printf("radius = %d\n", conf.radius); } TYPE c_coeff[NUM_COEFF]; TYPE *curr = (TYPE *)malloc(nbytes); TYPE *next = (TYPE *)malloc(nbytes); TYPE *vsq = (TYPE *)malloc(nbytes); if (curr == NULL || next == NULL || vsq == NULL) { fprintf(stderr, "Allocations failed\n"); return 1; } config_sources(&conf.srcs, &conf.nsrcs, conf.nx, conf.ny, conf.nsteps); TYPE **srcs = sample_sources(conf.srcs, conf.nsrcs, conf.nsteps, dt); init_data(curr, next, vsq, c_coeff, dimx, dimy, dx, dt); double start = seconds(); for (int step = 0; step < conf.nsteps; step++) { for (int src = 0; src < conf.nsrcs; src++) { if (conf.srcs[src].t > step) continue; int src_offset = POINT_OFFSET(conf.srcs[src].x, conf.srcs[src].y, dimx, conf.radius); curr[src_offset] = srcs[src][step]; } fwd(next, curr, vsq, c_coeff, conf.nx, conf.ny, dimx, dimy, conf.radius); TYPE *tmp = next; next = curr; curr = tmp; update_progress(step + 1); } double elapsed_s = seconds() - start; finish_progress(); float point_rate = (float)conf.nx * conf.ny / (elapsed_s / conf.nsteps); fprintf(stderr, "iso_r4_2x: %8.10f s total, %8.10f s/step, %8.2f Mcells/s/step\n", elapsed_s, elapsed_s / conf.nsteps, point_rate / 1000000.f); if (conf.save_text) { save_text(curr, dimx, dimy, conf.ny, conf.nx, "snap.text", conf.radius); } free(curr); free(next); free(vsq); for (int i = 0; i < conf.nsrcs; i++) { free(srcs[i]); } free(srcs); return 0; }