Beispiel #1
/*@T
 * The [[place_particle]] routine determines the initial particle
 * placement, but not the desired mass.  We want the fluid in the
 * initial configuration to exist roughly at the reference density.
 * One way to do this is to take the volume in the indicated body of
 * fluid, multiply by the mass density, and divide by the number of
 * particles; but that requires that we be able to compute the volume
 * of the fluid region.  Alternatively, we can simply compute the
 * average mass density assuming each particle has mass one, then use
 * that to compute the particle mass necessary in order to achieve the
 * desired reference density.  We do this with [[normalize_mass]].
 * @c*/
/*
 * Scale the common particle mass so that the initial configuration sits at
 * the reference density param->rho0.
 *
 * The routine contains orphaned `barrier` and `single` directives, so it is
 * meant to be called by every thread of an enclosing OpenMP parallel region.
 *
 *   s      simulation state shared by all threads; s->mass is overwritten
 *   pInfo  descriptor of the calling thread; the thread with proc == 0
 *          performs the serial setup (unit mass + hashing)
 *   param  simulation parameters; h and rho0 are read here
 *
 * If the sum of squared densities is zero (e.g. s->n == 0) the mass is left
 * at 1 instead of being multiplied by NaN/inf.
 */
void normalize_mass(sim_state_t* s, proc_info* pInfo, sim_param_t* param)
{
  if (pInfo->proc == 0) {
    s->mass = 1; // Unit mass first; it is rescaled at the end

    // Hashing is done by a single thread
    hash_particles(s, param->h);
  }

  float rho0  = param->rho0;
  float rho2s = 0;
  float rhos  = 0;

  // The hash built by thread 0 must be complete before densities are computed
#pragma omp barrier

  compute_density(s, pInfo, param);

  // Every thread must have finished updating densities before they are summed
#pragma omp barrier

  // rhos/rho2s are locals, hence private to each calling thread; the loop
  // covers all s->n particles for every caller, so each thread ends up with
  // the full sums.
#pragma omp parallel for reduction(+:rhos, rho2s)
  for (int i = 0; i < s->n; ++i) {
    rho2s += (s->part[i].rho)*(s->part[i].rho);
    rhos  += s->part[i].rho;
  }

  // Exactly one thread rescales the shared mass.  The guard sits inside the
  // single block so that all threads still encounter the construct.
#pragma omp single
  {
    if (rho2s > 0) {
      s->mass *= ( rho0*rhos / rho2s );
    }
  }
}
Beispiel #2
/*@T
 * The [[place_particle]] routine determines the initial particle
 * placement, but not the desired mass.  We want the fluid in the
 * initial configuration to exist roughly at the reference density.
 * One way to do this is to take the volume in the indicated body of
 * fluid, multiply by the mass density, and divide by the number of
 * particles; but that requires that we be able to compute the volume
 * of the fluid region.  Alternatively, we can simply compute the
 * average mass density assuming each particle has mass one, then use
 * that to compute the particle mass necessary in order to achieve the
 * desired reference density.  We do this with [[normalize_mass]].
 * @c*/
/*
 * Scale the common particle mass so that the initial configuration sits at
 * the reference density param->rho0.
 *
 *   s      simulation state; s->mass is overwritten, and the particles are
 *          re-hashed and their densities recomputed as a side effect
 *   param  simulation parameters; h and rho0 are read here
 *
 * Densities are first computed with unit mass; the mass is then multiplied
 * by rho0 * sum(rho) / sum(rho^2).  If sum(rho^2) is zero (e.g. s->n == 0)
 * the mass is left at 1 instead of becoming NaN/inf.
 */
void normalize_mass(sim_state_t* s, sim_param_t* param)
{
    s->mass = 1;
    hash_particles(s, param->h);
    compute_density(s, param);

    float rho0  = param->rho0;
    float rho2s = 0;
    float rhos  = 0;
    for (int i = 0; i < s->n; ++i) {
        rho2s += (s->part[i].rho)*(s->part[i].rho);
        rhos  += s->part[i].rho;
    }

    // Guard the division: an empty or zero-density state must not poison mass
    if (rho2s > 0) {
        s->mass *= ( rho0*rhos / rho2s );
    }
}
Beispiel #3
int main(int argc, char** argv)
  sim_param_t params;
  if (get_params(argc, argv, &params) != 0)

  // Create global
  sim_state_t* globalState = init_particles(&params);

#pragma omp parallel shared(globalState, params) 
    int proc = omp_get_thread_num();
    int nproc = omp_get_num_threads();

    FILE* fp    = fopen(params.fname, "w");
    int nframes = params.nframes;
    int npframe = params.npframe;
    float dt    = params.dt;
    int n       = globalState->n;

    // Processor information and holder
    proc_info* pInfo = malloc(sizeof(proc_info)); 
    pInfo->proc = proc;
    pInfo->nproc = nproc;
    pInfo->beg = round((proc/(double)nproc)*n);
    pInfo->end = round(((proc+1)/(double)nproc)*n);
    pInfo->forceAccu = calloc(3*n, sizeof(float)); // Never used this...

    if (proc == 0) {
      printf("Running in parallel with %d processor\n", nproc);

    normalize_mass(globalState, pInfo, &params);

    double t_start = omp_get_wtime();

    if (proc == 0) { // We only write for one processor
      write_header(fp, n, nframes, params.h);
      write_frame_data(fp, n, globalState, NULL);

    if (proc == 0) {
      hash_particles(globalState, params.h);
    //hash_particles_parallel(globalState, pInfo, params.h);

#pragma omp barrier // Need the hashing to be done

    compute_accel(globalState, pInfo, &params);

#pragma omp barrier
    leapfrog_start(globalState, pInfo, dt);
    check_state(globalState, pInfo);
    for (int frame = 1; frame < nframes; ++frame) {

      // We sort according to Z-Morton to ensure locality, need to implement paralle qsort
      if (frame % 5 == 0) {

        // Dividing into chunks of sorting each chunk
        // This alone turned out to better than sorting the entire array
        qsort(globalState->part+pInfo->beg, pInfo->end-pInfo->beg ,sizeof(particle_t),compPart);
        // Sorting the array consisting of sorted chunks
        // This turned out to actually lower the performance. That's why
        // I commented it.
        // #pragma omp barrier
        //   if( pInfo->nproc >1 ) arraymerge(globalState->part, globalState->n, pInfo);
//#pragma omp barrier*/

        // Serial version
        /*#pragma omp single // Implied barrier
          qsort(globalState->part, n, sizeof(particle_t), compPart);*/
      /*else if (frame % 49) {*/
        /*if (proc == 0) {*/

#pragma omp barrier // Need sort to finish

    for (int i = 0; i < npframe; ++i) {
      if (proc == 0 && npframe % 4 == 0) { // Ammortize hashing cost
        hash_particles(globalState, params.h);        

#pragma omp barrier
      compute_accel(globalState, pInfo, &params);
      leapfrog_step(globalState, pInfo, dt);
      check_state(globalState, pInfo);
#pragma omp barrier

    if (proc == 0) {
      printf("Frame: %d of %d - %2.1f%%\n",frame, nframes, 
      write_frame_data(fp, n, globalState, NULL);

  double t_end = omp_get_wtime();

  if (proc == 0) {
    printf("Ran in %g seconds\n", t_end-t_start);


/*
 * Compute the acceleration of every particle in `state`.
 *
 * Steps visible here: re-hash the particles, recompute densities, reset each
 * particle's acceleration to (0, -g, 0), then add pairwise interaction terms
 * through update_forces().
 *
 *   state   simulation state; part, hash and n are read, part[i].a is written
 *   params  simulation parameters; h, rho0, g, h2, C0, Cp and Cv are read
 *
 * NOTE(review): lines that delimit blocks appear to be missing from this
 * listing (the body of `#pragma omp parallel`, several loop bodies, and the
 * closing of the function), so the code does not parse as shown.
 * NOTE(review): forces_all, used_bin_id_flags, HASH_SIZE and MAX_NBR_BINS are
 * not declared in this block; presumably file-scope definitions -- confirm.
 */
void compute_accel(sim_state_t* state, sim_param_t* params) {
	// Unpack basic parameters
	const float h = params->h;
	const float rho0 = params->rho0;
//	const float k = params->k;
//	const float mu = params->mu;
	const float g = params->g;
//	const float mass = state->mass;
	const float h2 = params->h2;

	// Unpack system state
	particle_t* p = state->part;
	particle_t** hash = state->hash;
	const int n = state->n;

	// Rehash the particles
	hash_particles(state, h);

	// Compute density and color
	compute_density(state, params);

	// Constants for interaction term
	const float C0 = params->C0;
	const float Cp = params->Cp;
	const float Cv = params->Cv;

	// Start with gravity and surface forces
	for (int i = 0; i < n; ++i)
		vec3_set(p[i].a, 0, -g, 0);

	// Accumulate forces

	// Start multi-threaded

	// Iterate over each bucket, and within each bucket each particle
#pragma omp parallel
		// Get the thread ID and total threads
		int thread_id = omp_get_thread_num();
		int total_threads = omp_get_num_threads();

		// Get the appropriate forces vector: each thread uses its own slice
		// of 3*n floats and zeroes it before accumulating
		float* forces = forces_all + thread_id * (state->n * 3);
		memset(forces, 0, sizeof(float) * state->n * 3);

		// Get the dedupe buffer for this thread (one HASH_SIZE slice each)
		char* usedBinID = used_bin_id_flags + (thread_id * HASH_SIZE);

		// Create storage for neighbor set
		unsigned buckets[MAX_NBR_BINS];
		unsigned numbins;

		// Process all buckets that the thread is responsible for:
		// buckets thread_id, thread_id + total_threads, ...
		for (int iter_bucket = thread_id; iter_bucket < HASH_SIZE; iter_bucket += total_threads)
			for (particle_t* pi = hash[iter_bucket]; pi != NULL ; pi = pi->next)
				// Get the position of the pi force accumulator
				// (index of pi within state->part, times 3 components)
				unsigned diffPosI = pi - state->part;
				float* pia = forces + 3 * diffPosI;

				// Compute equal and opposite forces for the particle,
				// first get neighbors
				numbins = particle_neighborhood(buckets, pi, h, usedBinID);
				for (int j = 0; j < numbins; ++j)
					// Get the neighbor particles
					unsigned bucketid = buckets[j];
					for (particle_t* pj = hash[bucketid]; pj != NULL ; pj = pj->next) {
						// Compute forces only if appropriate: `pi < pj` visits each
						// pair once, and the bin coordinates must differ by at most 1
						// on every axis.
						// NOTE(review): abs() is the integer form; assumes ix/iy/iz
						// are integers -- confirm.
						if (pi < pj && abs(pi->ix - pj->ix) <= 1
								&& abs(pi->iy - pj->iy) <= 1
								&& abs(pi->iz - pj->iz) <= 1)
							// Get the position of the pj force accumulator
							unsigned diffPosJ = pj - state->part;
							float* pja = forces + 3 * diffPosJ;

							// Accumulate forces into this thread's buffer for both particles
							update_forces(pi, pj, h2, rho0, C0, Cp, Cv, pia, pja);

		// Accumulate values in vector: fold this thread's non-zero entries
		// into the particles' accelerations.
		// NOTE(review): every thread walks all n particles and writes
		// cur_particle->a with no synchronisation visible here -- looks like
		// a data race unless a lost line serialised this loop; confirm.
		for (int iter_particle = 0; iter_particle < state->n; ++iter_particle)
			particle_t* cur_particle = state->part + iter_particle;
			float* pia = forces + 3 * iter_particle;
			if(pia[0] != 0 || pia[1] != 0 || pia[2] != 0)
				vec3_saxpy(cur_particle->a, 1, pia);

	/* END TASK */

	// All-pairs accumulation over every (i, j) with i < j.
	// NOTE(review): presumably an alternative to the bucketed path above,
	// selected by a preprocessor conditional that is not visible here; if
	// both paths ran, each pair force would be applied twice -- confirm.
	for (int i = 0; i < n; ++i) {
		particle_t* pi = p+i;
		for (int j = i+1; j < n; ++j) {
			particle_t* pj = p+j;
			update_forces(pi, pj, h2, rho0, C0, Cp, Cv, pi->a, pj->a);