/*@T
 *
 * The [[place_particle]] routine determines the initial particle
 * placement, but not the desired mass.  We want the fluid in the
 * initial configuration to exist roughly at the reference density.
 * One way to do this is to take the volume in the indicated body of
 * fluid, multiply by the mass density, and divide by the number of
 * particles; but that requires that we be able to compute the volume
 * of the fluid region.  Alternatively, we can simply compute the
 * average mass density assuming each particle has mass one, then use
 * that to compute the particle mass necessary in order to achieve the
 * desired reference density.  We do this with [[normalize_mass]].
 *
 * @c*/
void normalize_mass(sim_state_t* s, proc_info* pInfo, sim_param_t* param)
{
    if (pInfo->proc == 0) {
        s->mass = 1;                 // Only one thread initializes the mass
        //hash_particles_parallel(s, pInfo, param->h);
        hash_particles(s, param->h); // Hashing done by one thread only
    }
    float rho0  = param->rho0;
    float rho2s = 0;
    float rhos  = 0;
    #pragma omp barrier // Hashing must finish before densities are computed
    compute_density(s, pInfo, param);
    #pragma omp barrier // Wait for all threads to finish updating their densities
    //printf("Starting: %d\n", pInfo->proc);
    #pragma omp parallel for reduction(+:rhos, rho2s)
    for (int i = 0; i < s->n; ++i) {
        rho2s += (s->part[i].rho)*(s->part[i].rho);
        rhos  += s->part[i].rho;
    }
    #pragma omp single // Only one thread updates the mass
    {
        s->mass *= ( rho0*rhos / rho2s );
    }
}
/*@T
 *
 * The serial version of [[normalize_mass]]:
 *
 * @c*/
void normalize_mass(sim_state_t* s, sim_param_t* param)
{
    s->mass = 1;
    hash_particles(s, param->h);
    compute_density(s, param);
    float rho0  = param->rho0;
    float rho2s = 0;
    float rhos  = 0;
    for (int i = 0; i < s->n; ++i) {
        rho2s += (s->part[i].rho)*(s->part[i].rho);
        rhos  += s->part[i].rho;
    }
    s->mass *= ( rho0*rhos / rho2s );
}
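/*@T
 *
 * The scaling applied to [[s->mass]] in both versions above can be read as
 * a least-squares fit.  If every particle has mass $m$, the computed
 * densities scale linearly as $m \hat\rho_i$, where $\hat\rho_i$ is the
 * density computed with unit mass.  Choosing $m$ to minimize
 * \[
 *   \sum_i \left( m \hat\rho_i - \rho_0 \right)^2
 * \]
 * and setting the derivative with respect to $m$ to zero gives
 * \[
 *   m = \frac{\rho_0 \sum_i \hat\rho_i}{\sum_i \hat\rho_i^2}
 *     = \frac{\texttt{rho0} \cdot \texttt{rhos}}{\texttt{rho2s}},
 * \]
 * which is exactly the factor applied to [[s->mass]].
 */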
int main(int argc, char** argv)
{
    sim_param_t params;
    if (get_params(argc, argv, &params) != 0)
        exit(-1);

    // Create global state
    sim_state_t* globalState = init_particles(&params);

    #pragma omp parallel shared(globalState, params)
    {
        int proc  = omp_get_thread_num();
        int nproc = omp_get_num_threads();

        FILE* fp    = fopen(params.fname, "w");
        int nframes = params.nframes;
        int npframe = params.npframe;
        float dt    = params.dt;
        int n       = globalState->n;

        // Per-thread processor information
        proc_info* pInfo = malloc(sizeof(proc_info));
        pInfo->proc  = proc;
        pInfo->nproc = nproc;
        pInfo->beg   = round((proc/(double)nproc)*n);
        pInfo->end   = round(((proc+1)/(double)nproc)*n);
        pInfo->forceAccu = calloc(3*n, sizeof(float)); // Never used this...

        if (proc == 0) {
            printf("Running in parallel with %d processors\n", nproc);
        }

        normalize_mass(globalState, pInfo, &params);

        double t_start = omp_get_wtime();

        if (proc == 0) { // Only one thread writes output
            write_header(fp, n, nframes, params.h);
            write_frame_data(fp, n, globalState, NULL);
        }

        if (proc == 0) {
            hash_particles(globalState, params.h);
        }
        //hash_particles_parallel(globalState, pInfo, params.h);

        #pragma omp barrier // Hashing must be done before computing accelerations
        compute_accel(globalState, pInfo, &params);
        #pragma omp barrier
        leapfrog_start(globalState, pInfo, dt);
        check_state(globalState, pInfo);

        for (int frame = 1; frame < nframes; ++frame) {

            // Sort according to a Z-Morton ordering to improve locality;
            // a parallel qsort still needs to be implemented.
            if (frame % 5 == 0) {
                // Divide the array into chunks and sort each chunk.
                // This alone turned out to be better than sorting the entire array.
                qsort(globalState->part+pInfo->beg, pInfo->end-pInfo->beg,
                      sizeof(particle_t), compPart);

                // Merging the sorted chunks actually lowered performance,
                // which is why it is commented out.
                // #pragma omp barrier
                // if (pInfo->nproc > 1) arraymerge(globalState->part, globalState->n, pInfo);
                // #pragma omp barrier

                // Serial version:
                /*#pragma omp single // Implied barrier
                qsort(globalState->part, n, sizeof(particle_t), compPart);*/
            }

            #pragma omp barrier // The sort must finish before the time steps

            for (int i = 0; i < npframe; ++i) {
                if (proc == 0 && npframe % 4 == 0) { // Amortize hashing cost
                    hash_particles(globalState, params.h);
                }
                #pragma omp barrier
                compute_accel(globalState, pInfo, &params);
                leapfrog_step(globalState, pInfo, dt);
                check_state(globalState, pInfo);
                #pragma omp barrier
            }

            if (proc == 0) {
                printf("Frame: %d of %d - %2.1f%%\n", frame, nframes,
                       100*(float)frame/nframes);
                write_frame_data(fp, n, globalState, NULL);
            }
        }

        double t_end = omp_get_wtime();
        if (proc == 0) {
            printf("Ran in %g seconds\n", t_end-t_start);
        }

        free(pInfo);
        fclose(fp);
    }

    free_state(globalState);
}
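/*@T
 *
 * The comparator [[compPart]] passed to [[qsort]] above is defined
 * elsewhere in the project; the sketch below is only an assumption about
 * its shape, not the actual implementation.  It orders particles by a
 * Z-Morton key obtained by interleaving the bits of the integer bin
 * indices [[ix]], [[iy]], [[iz]] (the same fields tested in
 * [[compute_accel]]), which keeps particles from nearby bins close
 * together in memory.  It assumes [[<stdint.h>]] is available and that
 * the bin indices are non-negative.
 *
 * @c*/
#include <stdint.h>

// Interleave the low 21 bits of x, y, z into a 63-bit Z-Morton key
static uint64_t morton3(unsigned x, unsigned y, unsigned z)
{
    uint64_t key = 0;
    for (int b = 0; b < 21; ++b) {
        key |= ((uint64_t)((x >> b) & 1)) << (3*b + 0);
        key |= ((uint64_t)((y >> b) & 1)) << (3*b + 1);
        key |= ((uint64_t)((z >> b) & 1)) << (3*b + 2);
    }
    return key;
}

// qsort comparator: order particles by their Z-Morton key (sketch of compPart)
int compPart(const void* a, const void* b)
{
    const particle_t* pa = (const particle_t*)a;
    const particle_t* pb = (const particle_t*)b;
    uint64_t ka = morton3(pa->ix, pa->iy, pa->iz);
    uint64_t kb = morton3(pb->ix, pb->iy, pb->iz);
    return (ka > kb) - (ka < kb);
}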
void compute_accel(sim_state_t* state, sim_param_t* params)
{
    // Unpack basic parameters
    const float h    = params->h;
    const float rho0 = params->rho0;
    // const float k    = params->k;
    // const float mu   = params->mu;
    const float g    = params->g;
    // const float mass = state->mass;
    const float h2   = params->h2;

    // Unpack system state
    particle_t* p     = state->part;
    particle_t** hash = state->hash;
    const int n       = state->n;

    // Rehash the particles
    hash_particles(state, h);

    // Compute density and color
    compute_density(state, params);

    // Constants for interaction term
    const float C0 = params->C0;
    const float Cp = params->Cp;
    const float Cv = params->Cv;

    // Start with gravity and surface forces
    for (int i = 0; i < n; ++i) {
        vec3_set(p[i].a, 0, -g, 0);
    }

    // Accumulate forces
#ifdef USE_BUCKETING
    /* BEGIN TASK */
    // Multi-threaded version: iterate over each bucket, and within each
    // bucket over each particle
    #pragma omp parallel
    {
        // Get the thread ID and total number of threads
        int thread_id     = omp_get_thread_num();
        int total_threads = omp_get_num_threads();

        // Get this thread's force accumulation buffer and zero it
        float* forces = forces_all + thread_id * (state->n * 3);
        memset(forces, 0, sizeof(float) * state->n * 3);

        // Get the bin-deduplication buffer for this thread
        char* usedBinID = used_bin_id_flags + (thread_id * HASH_SIZE);

        // Storage for the neighbor bin set
        unsigned buckets[MAX_NBR_BINS];
        unsigned numbins;

        // Process all buckets that this thread is responsible for
        for (int iter_bucket = thread_id; iter_bucket < HASH_SIZE;
             iter_bucket += total_threads) {
            for (particle_t* pi = hash[iter_bucket]; pi != NULL; pi = pi->next) {

                // Locate the force accumulator for pi
                unsigned diffPosI = pi - state->part;
                float* pia = forces + 3 * diffPosI;

                // Compute equal and opposite forces for the particle;
                // first collect the neighboring bins
                numbins = particle_neighborhood(buckets, pi, h, usedBinID);
                for (int j = 0; j < numbins; ++j) {

                    // Walk the particles in the neighbor bin
                    unsigned bucketid = buckets[j];
                    for (particle_t* pj = hash[bucketid]; pj != NULL; pj = pj->next) {

                        // Process each pair once (pi < pj), and only if the
                        // bins are truly adjacent
                        if (pi < pj &&
                            abs(pi->ix - pj->ix) <= 1 &&
                            abs(pi->iy - pj->iy) <= 1 &&
                            abs(pi->iz - pj->iz) <= 1) {

                            // Locate the force accumulator for pj
                            unsigned diffPosJ = pj - state->part;
                            float* pja = forces + 3 * diffPosJ;

                            // Accumulate forces into the thread-local buffer
                            update_forces(pi, pj, h2, rho0, C0, Cp, Cv, pia, pja);
                        }
                    }
                }
            }
        }

        // Merge the thread-local accumulators into the particle accelerations
        for (int iter_particle = 0; iter_particle < state->n; ++iter_particle) {
            particle_t* cur_particle = state->part + iter_particle;
            float* pia = forces + 3 * iter_particle;
            if (pia[0] != 0 || pia[1] != 0 || pia[2] != 0) {
                omp_set_lock(&cur_particle->lock);
                vec3_saxpy(cur_particle->a, 1, pia);
                omp_unset_lock(&cur_particle->lock);
            }
        }
    }
    /* END TASK */
#else
    for (int i = 0; i < n; ++i) {
        particle_t* pi = p+i;
        for (int j = i+1; j < n; ++j) {
            particle_t* pj = p+j;
            update_forces(pi, pj, h2, rho0, C0, Cp, Cv, pi->a, pj->a);
        }
    }
#endif
}
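/*@T
 *
 * The per-thread scratch arrays [[forces_all]] and [[used_bin_id_flags]]
 * are referenced in [[compute_accel]] but allocated elsewhere in the
 * project.  The sketch below is an assumption about that setup, inferred
 * only from the indexing used above (one block of [[3*n]] floats and one
 * block of [[HASH_SIZE]] flags per thread); the helper names are
 * hypothetical, and [[<stdlib.h>]] and [[<omp.h>]] are assumed to be
 * included as in the rest of the project.
 *
 * @c*/
static float* forces_all;        // one 3*n force accumulator per thread
static char*  used_bin_id_flags; // one HASH_SIZE dedupe buffer per thread

// Hypothetical setup/teardown helpers for the scratch buffers
void alloc_accel_scratch(sim_state_t* state)
{
    int nthreads = omp_get_max_threads();
    forces_all        = calloc((size_t)nthreads * 3 * state->n, sizeof(float));
    used_bin_id_flags = calloc((size_t)nthreads * HASH_SIZE,    sizeof(char));
}

void free_accel_scratch(void)
{
    free(forces_all);
    free(used_bin_id_flags);
}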