int main(int argc, char** argv)
{
    sim_param_t params;
    if (get_params(argc, argv, &params) != 0)
        exit(-1);
    sim_state_t* state = init_particles(&params);
    FILE* fp    = fopen(params.fname, "w");
    int nframes = params.nframes;
    int npframe = params.npframe;
    float dt    = params.dt;
    int n       = state->n;

    double t_start = omp_get_wtime();
    //write_header(fp, n);
    write_header(fp, n, nframes, params.h);
    write_frame_data(fp, n, state, NULL);
    compute_accel(state, &params);
    leapfrog_start(state, dt);
    check_state(state);
    for (int frame = 1; frame < nframes; ++frame) {
        for (int i = 0; i < npframe; ++i) {
            compute_accel(state, &params);
            leapfrog_step(state, dt);
            check_state(state);
        }
        printf("Frame: %d of %d - %2.1f%%\n", frame, nframes,
               100*(float)frame/nframes);
        write_frame_data(fp, n, state, NULL);
    }
    double t_end = omp_get_wtime();
    printf("Ran in %g seconds\n", t_end - t_start);
    fclose(fp);
    free_state(state);
}
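/* For reference, a minimal sketch of the leapfrog scheme the loop above
 * relies on. This is NOT the project's implementation: the particle layout
 * (part, x, v, vh, a fields) and the 3D assumption are illustrative guesses.
 * Leapfrog keeps a half-step velocity vh between calls; leapfrog_start
 * bootstraps it once from the initial full-step velocity (vh = v + a*dt/2),
 * and leapfrog_step then advances vh, v, and x by dt on each call. */
static void leapfrog_step_sketch(sim_state_t* s, float dt)
{
    for (int i = 0; i < s->n; ++i) {
        particle_t* p = s->part + i;             /* assumed layout */
        for (int d = 0; d < 3; ++d) {
            p->vh[d] += p->a[d]*dt;              /* v(t+dt/2) */
            p->v[d]   = p->vh[d] + p->a[d]*dt/2; /* v(t+dt), for check_state */
            p->x[d]  += p->vh[d]*dt;             /* x(t+dt) */
        }
    }
}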
int main(int argc, char** argv)
{
    sim_param_t params;
    if (get_params(argc, argv, &params) != 0)
        exit(-1);

    // Create global state
    sim_state_t* globalState = init_particles(&params);

    #pragma omp parallel shared(globalState, params)
    {
        int proc  = omp_get_thread_num();
        int nproc = omp_get_num_threads();

        // Only rank 0 writes output, so only it opens the file;
        // a per-thread fopen("w") would truncate the file racily.
        FILE* fp = NULL;
        if (proc == 0)
            fp = fopen(params.fname, "w");

        int nframes = params.nframes;
        int npframe = params.npframe;
        float dt    = params.dt;
        int n       = globalState->n;

        // Per-thread information: this thread's particle range [beg, end)
        proc_info* pInfo = malloc(sizeof(proc_info));
        pInfo->proc  = proc;
        pInfo->nproc = nproc;
        pInfo->beg   = round((proc/(double)nproc)*n);
        pInfo->end   = round(((proc+1)/(double)nproc)*n);
        pInfo->forceAccu = calloc(3*n, sizeof(float)); // Never used this...

        if (proc == 0) {
            printf("Running in parallel with %d processors\n", nproc);
        }

        normalize_mass(globalState, pInfo, &params);

        double t_start = omp_get_wtime();

        if (proc == 0) { // We only write from one processor
            write_header(fp, n, nframes, params.h);
            write_frame_data(fp, n, globalState, NULL);
        }

        if (proc == 0) {
            hash_particles(globalState, params.h);
        }
        //hash_particles_parallel(globalState, pInfo, params.h);

        #pragma omp barrier // Need the hashing to be done

        compute_accel(globalState, pInfo, &params);
        #pragma omp barrier
        leapfrog_start(globalState, pInfo, dt);
        check_state(globalState, pInfo);

        for (int frame = 1; frame < nframes; ++frame) {
            // We sort according to Z-Morton order to preserve locality;
            // a parallel qsort is still to be implemented.
            if (frame % 5 == 0) {
                // Divide into chunks and sort each chunk.
                // This alone turned out to be better than sorting the
                // entire array.
                qsort(globalState->part + pInfo->beg, pInfo->end - pInfo->beg,
                      sizeof(particle_t), compPart);

                // Merging the sorted chunks turned out to actually lower
                // performance, which is why it is commented out:
                // #pragma omp barrier
                // if (pInfo->nproc > 1)
                //     arraymerge(globalState->part, globalState->n, pInfo);
                // #pragma omp barrier

                // Serial version:
                /*
                #pragma omp single // Implied barrier
                qsort(globalState->part, n, sizeof(particle_t), compPart);
                */
            }

            #pragma omp barrier // Need sort to finish

            for (int i = 0; i < npframe; ++i) {
                if (proc == 0 && i % 4 == 0) { // Amortize hashing cost
                    hash_particles(globalState, params.h);
                }
                #pragma omp barrier
                compute_accel(globalState, pInfo, &params);
                leapfrog_step(globalState, pInfo, dt);
                check_state(globalState, pInfo);
                #pragma omp barrier
            }

            if (proc == 0) {
                printf("Frame: %d of %d - %2.1f%%\n", frame, nframes,
                       100*(float)frame/nframes);
                write_frame_data(fp, n, globalState, NULL);
            }
        }

        double t_end = omp_get_wtime();
        if (proc == 0) {
            printf("Ran in %g seconds\n", t_end - t_start);
            fclose(fp);
        }

        free(pInfo->forceAccu);
        free(pInfo);
    }
    free_state(globalState);
}
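/* The qsort calls above rely on a comparator compPart that orders particles
 * along a Z-Morton curve. A hypothetical sketch follows; the field name zidx
 * (a precomputed Morton/bin index, presumably set during hashing) is an
 * assumption for illustration, not taken from the listing. */
static int compPart_sketch(const void* a, const void* b)
{
    const particle_t* pa = (const particle_t*) a;
    const particle_t* pb = (const particle_t*) b;
    /* Return <0, 0, >0 as qsort expects; explicit comparisons avoid the
     * overflow a plain subtraction could hit on large indices. */
    if (pa->zidx < pb->zidx) return -1;
    if (pa->zidx > pb->zidx) return  1;
    return 0;
}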