void compute_step_factor(int nelr, double* variables, double* areas, double* step_factors)
{
 { const unsigned long long parallel_for_start = current_time_ns();
#pragma omp parallel for default(shared) schedule(static)
for(int i = 0; i < nelr; i++)
	{
		double density = variables[NVAR*i + VAR_DENSITY];

		cfd_double3 momentum;
		momentum.x = variables[NVAR*i + (VAR_MOMENTUM+0)];
		momentum.y = variables[NVAR*i + (VAR_MOMENTUM+1)];
		momentum.z = variables[NVAR*i + (VAR_MOMENTUM+2)];

		double density_energy = variables[NVAR*i + VAR_DENSITY_ENERGY];
		cfd_double3 velocity;	   compute_velocity(density, momentum, velocity);
		double speed_sqd      = compute_speed_sqd(velocity);
		double pressure       = compute_pressure(density, density_energy, speed_sqd);
		double speed_of_sound = compute_speed_of_sound(density, pressure);

		// dt = double(0.5) * std::sqrt(areas[i]) /  (||v|| + c).... but when we do time stepping, this later would need to be divided by the area, so we just do it all at once
		step_factors[i] = double(0.5) / (std::sqrt(areas[i]) * (std::sqrt(speed_sqd) + speed_of_sound));
	} ; 
const unsigned long long parallel_for_end = current_time_ns();
printf("pragma155_omp_parallel %llu ns\n", parallel_for_end - parallel_for_start); } 

}
/**
 * OpenACC variant: precomputes the per-element flux contribution vectors.
 *
 * For each element i the state is read from variables[NVAR*i + ...], the
 * velocity and pressure are derived, and compute_flux_contribution() is
 * called. Its four vector results are stored with NDIM entries per element:
 * component k of element i goes to index i*NDIM + k of the matching array.
 *
 * @param nelr               number of elements to process
 * @param variables          per-element state, NVAR entries per element (read)
 * @param fc_momentum_x      output, NDIM entries per element
 * @param fc_momentum_y      output, NDIM entries per element
 * @param fc_momentum_z      output, NDIM entries per element
 * @param fc_density_energy  output, NDIM entries per element
 */
void compute_flux_contributions(int nelr, double* variables, double* fc_momentum_x, double* fc_momentum_y, double* fc_momentum_z, double* fc_density_energy)
{
	#pragma acc kernels
	for(int i = 0; i < nelr; i++)
	{
		double density_i = variables[NVAR*i + VAR_DENSITY];
		double3 momentum_i;
		momentum_i.x = variables[NVAR*i + (VAR_MOMENTUM+0)];
		momentum_i.y = variables[NVAR*i + (VAR_MOMENTUM+1)];
		momentum_i.z = variables[NVAR*i + (VAR_MOMENTUM+2)];
		double density_energy_i = variables[NVAR*i + VAR_DENSITY_ENERGY];

		// Only velocity and pressure feed compute_flux_contribution(). The
		// speed (previously taken with single-precision sqrtf) and the speed
		// of sound were computed here but never read, so they are dropped.
		// NOTE(review): assumes compute_speed_of_sound() has no side effects - confirm.
		double3 velocity_i;
		compute_velocity(density_i, momentum_i, velocity_i);
		double speed_sqd_i = compute_speed_sqd(velocity_i);
		double pressure_i  = compute_pressure(density_i, density_energy_i, speed_sqd_i);

		double3 fc_i_momentum_x, fc_i_momentum_y, fc_i_momentum_z;
		double3 fc_i_density_energy;
		compute_flux_contribution(density_i, momentum_i, density_energy_i, pressure_i, velocity_i, fc_i_momentum_x, fc_i_momentum_y, fc_i_momentum_z, fc_i_density_energy);

		// Scatter the results into the flat output arrays.
		fc_momentum_x[i*NDIM + 0] = fc_i_momentum_x.x;
		fc_momentum_x[i*NDIM + 1] = fc_i_momentum_x.y;
		fc_momentum_x[i*NDIM + 2] = fc_i_momentum_x.z;

		fc_momentum_y[i*NDIM + 0] = fc_i_momentum_y.x;
		fc_momentum_y[i*NDIM + 1] = fc_i_momentum_y.y;
		fc_momentum_y[i*NDIM + 2] = fc_i_momentum_y.z;

		fc_momentum_z[i*NDIM + 0] = fc_i_momentum_z.x;
		fc_momentum_z[i*NDIM + 1] = fc_i_momentum_z.y;
		fc_momentum_z[i*NDIM + 2] = fc_i_momentum_z.z;

		fc_density_energy[i*NDIM + 0] = fc_i_density_energy.x;
		fc_density_energy[i*NDIM + 1] = fc_i_density_energy.y;
		fc_density_energy[i*NDIM + 2] = fc_i_density_energy.z;
	}
}
void compute_step_factor(int nelr, double* variables, double* areas, double* step_factors)
{
	#pragma acc kernels
	for(int i = 0; i < nelr; i++)
	{
		double density = variables[NVAR*i + VAR_DENSITY];

		double3 momentum;
		momentum.x = variables[NVAR*i + (VAR_MOMENTUM+0)];
		momentum.y = variables[NVAR*i + (VAR_MOMENTUM+1)];
		momentum.z = variables[NVAR*i + (VAR_MOMENTUM+2)];

		double density_energy = variables[NVAR*i + VAR_DENSITY_ENERGY];
		double3 velocity;	   compute_velocity(density, momentum, velocity);
		double speed_sqd      = compute_speed_sqd(velocity);
		double pressure       = compute_pressure(density, density_energy, speed_sqd);
		double speed_of_sound = compute_speed_of_sound(density, pressure);

		// dt = double(0.5) * std::sqrt(areas[i]) /  (||v|| + c).... but when we do time stepping, this later would need to be divided by the area, so we just do it all at once
		step_factors[i] = double(0.5) / (std::sqrt(areas[i]) * (std::sqrt(speed_sqd) + speed_of_sound));
	}
}
/**
 * OpenMP variant: accumulates the flux of every element, recomputing the
 * flux contributions of the element and of each neighbor on the fly.
 *
 * For element i and each of its NNB faces j:
 *   - nb = elements_surrounding_elements[i*NNB + j] selects the face type;
 *   - the face normal is normals[(i*NNB + j)*NDIM + {0,1,2}].
 * Face handling:
 *   - nb >= 0  : a legitimate neighbor; adds an artificial-viscosity term
 *                scaled by smoothing_coefficient and the normal length, plus
 *                the averaged flux contributions of i and nb along the normal;
 *   - nb == -1 : a wing boundary; only pressure_i times the normal is added
 *                to the momentum flux;
 *   - nb == -2 : a far field boundary; like a neighbor, but using the
 *                file-level ff_variable / ff_flux_contribution_* values and
 *                without the viscosity term;
 *   - any other negative value contributes nothing.
 * The five accumulated values are written to fluxes[i*NVAR + ...].
 *
 * The elapsed time of the parallel loop, measured with current_time_ns(),
 * is printed to stdout.
 *
 * @param nelr                           number of elements to process
 * @param elements_surrounding_elements  NNB neighbor ids per element (read)
 * @param normals                        NNB*NDIM entries per element (read)
 * @param variables                      per-element state, NVAR entries per element (read)
 * @param fluxes                         NVAR entries per element (written)
 */
void compute_flux(int nelr, int* elements_surrounding_elements, double* normals, double* variables, double* fluxes)
{
	// Written as a double literal: double(0.2f) widened the float to roughly
	// 0.2000000030 and disagreed with the other compute_flux variant.
	const double smoothing_coefficient = double(0.2);

 { const unsigned long long parallel_for_start = current_time_ns();
#pragma omp parallel for default(shared) schedule(static)
for(int i = 0; i < nelr; i++)
	{
		int j, nb;
		cfd_double3 normal; double normal_len;
		double factor;

		// State of element i.
		double density_i = variables[NVAR*i + VAR_DENSITY];
		cfd_double3 momentum_i;
		momentum_i.x = variables[NVAR*i + (VAR_MOMENTUM+0)];
		momentum_i.y = variables[NVAR*i + (VAR_MOMENTUM+1)];
		momentum_i.z = variables[NVAR*i + (VAR_MOMENTUM+2)];

		double density_energy_i = variables[NVAR*i + VAR_DENSITY_ENERGY];

		cfd_double3 velocity_i;             				 compute_velocity(density_i, momentum_i, velocity_i);
		double speed_sqd_i                          = compute_speed_sqd(velocity_i);
		double speed_i                              = std::sqrt(speed_sqd_i);
		double pressure_i                           = compute_pressure(density_i, density_energy_i, speed_sqd_i);
		double speed_of_sound_i                     = compute_speed_of_sound(density_i, pressure_i);
		cfd_double3 flux_contribution_i_momentum_x, flux_contribution_i_momentum_y, flux_contribution_i_momentum_z;
		cfd_double3 flux_contribution_i_density_energy;
		compute_flux_contribution(density_i, momentum_i, density_energy_i, pressure_i, velocity_i, flux_contribution_i_momentum_x, flux_contribution_i_momentum_y, flux_contribution_i_momentum_z, flux_contribution_i_density_energy);

		// Accumulators for the flux of element i.
		double flux_i_density = double(0.0);
		cfd_double3 flux_i_momentum;
		flux_i_momentum.x = double(0.0);
		flux_i_momentum.y = double(0.0);
		flux_i_momentum.z = double(0.0);
		double flux_i_density_energy = double(0.0);

		// Scratch values for the current neighbor.
		cfd_double3 velocity_nb;
		double density_nb, density_energy_nb;
		cfd_double3 momentum_nb;
		cfd_double3 flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y, flux_contribution_nb_momentum_z;
		cfd_double3 flux_contribution_nb_density_energy;
		double speed_sqd_nb, speed_of_sound_nb, pressure_nb;

		for(j = 0; j < NNB; j++)
		{
			nb = elements_surrounding_elements[i*NNB + j];
			normal.x = normals[(i*NNB + j)*NDIM + 0];
			normal.y = normals[(i*NNB + j)*NDIM + 1];
			normal.z = normals[(i*NNB + j)*NDIM + 2];
			normal_len = std::sqrt(normal.x*normal.x + normal.y*normal.y + normal.z*normal.z);

			if(nb >= 0) 	// a legitimate neighbor
			{
				density_nb =        variables[nb*NVAR + VAR_DENSITY];
				momentum_nb.x =     variables[nb*NVAR + (VAR_MOMENTUM+0)];
				momentum_nb.y =     variables[nb*NVAR + (VAR_MOMENTUM+1)];
				momentum_nb.z =     variables[nb*NVAR + (VAR_MOMENTUM+2)];
				density_energy_nb = variables[nb*NVAR + VAR_DENSITY_ENERGY];
													compute_velocity(density_nb, momentum_nb, velocity_nb);
				speed_sqd_nb                      = compute_speed_sqd(velocity_nb);
				pressure_nb                       = compute_pressure(density_nb, density_energy_nb, speed_sqd_nb);
				speed_of_sound_nb                 = compute_speed_of_sound(density_nb, pressure_nb);
													compute_flux_contribution(density_nb, momentum_nb, density_energy_nb, pressure_nb, velocity_nb, flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y, flux_contribution_nb_momentum_z, flux_contribution_nb_density_energy);

				// artificial viscosity
				factor = -normal_len*smoothing_coefficient*double(0.5)*(speed_i + std::sqrt(speed_sqd_nb) + speed_of_sound_i + speed_of_sound_nb);
				flux_i_density += factor*(density_i-density_nb);
				flux_i_density_energy += factor*(density_energy_i-density_energy_nb);
				flux_i_momentum.x += factor*(momentum_i.x-momentum_nb.x);
				flux_i_momentum.y += factor*(momentum_i.y-momentum_nb.y);
				flux_i_momentum.z += factor*(momentum_i.z-momentum_nb.z);

				// accumulate cell-centered fluxes
				factor = double(0.5)*normal.x;
				flux_i_density += factor*(momentum_nb.x+momentum_i.x);
				flux_i_density_energy += factor*(flux_contribution_nb_density_energy.x+flux_contribution_i_density_energy.x);
				flux_i_momentum.x += factor*(flux_contribution_nb_momentum_x.x+flux_contribution_i_momentum_x.x);
				flux_i_momentum.y += factor*(flux_contribution_nb_momentum_y.x+flux_contribution_i_momentum_y.x);
				flux_i_momentum.z += factor*(flux_contribution_nb_momentum_z.x+flux_contribution_i_momentum_z.x);

				factor = double(0.5)*normal.y;
				flux_i_density += factor*(momentum_nb.y+momentum_i.y);
				flux_i_density_energy += factor*(flux_contribution_nb_density_energy.y+flux_contribution_i_density_energy.y);
				flux_i_momentum.x += factor*(flux_contribution_nb_momentum_x.y+flux_contribution_i_momentum_x.y);
				flux_i_momentum.y += factor*(flux_contribution_nb_momentum_y.y+flux_contribution_i_momentum_y.y);
				flux_i_momentum.z += factor*(flux_contribution_nb_momentum_z.y+flux_contribution_i_momentum_z.y);

				factor = double(0.5)*normal.z;
				flux_i_density += factor*(momentum_nb.z+momentum_i.z);
				flux_i_density_energy += factor*(flux_contribution_nb_density_energy.z+flux_contribution_i_density_energy.z);
				flux_i_momentum.x += factor*(flux_contribution_nb_momentum_x.z+flux_contribution_i_momentum_x.z);
				flux_i_momentum.y += factor*(flux_contribution_nb_momentum_y.z+flux_contribution_i_momentum_y.z);
				flux_i_momentum.z += factor*(flux_contribution_nb_momentum_z.z+flux_contribution_i_momentum_z.z);
			}
			else if(nb == -1)	// a wing boundary
			{
				flux_i_momentum.x += normal.x*pressure_i;
				flux_i_momentum.y += normal.y*pressure_i;
				flux_i_momentum.z += normal.z*pressure_i;
			}
			else if(nb == -2) // a far field boundary
			{
				factor = double(0.5)*normal.x;
				flux_i_density += factor*(ff_variable[VAR_MOMENTUM+0]+momentum_i.x);
				flux_i_density_energy += factor*(ff_flux_contribution_density_energy.x+flux_contribution_i_density_energy.x);
				flux_i_momentum.x += factor*(ff_flux_contribution_momentum_x.x + flux_contribution_i_momentum_x.x);
				flux_i_momentum.y += factor*(ff_flux_contribution_momentum_y.x + flux_contribution_i_momentum_y.x);
				flux_i_momentum.z += factor*(ff_flux_contribution_momentum_z.x + flux_contribution_i_momentum_z.x);

				factor = double(0.5)*normal.y;
				flux_i_density += factor*(ff_variable[VAR_MOMENTUM+1]+momentum_i.y);
				flux_i_density_energy += factor*(ff_flux_contribution_density_energy.y+flux_contribution_i_density_energy.y);
				flux_i_momentum.x += factor*(ff_flux_contribution_momentum_x.y + flux_contribution_i_momentum_x.y);
				flux_i_momentum.y += factor*(ff_flux_contribution_momentum_y.y + flux_contribution_i_momentum_y.y);
				flux_i_momentum.z += factor*(ff_flux_contribution_momentum_z.y + flux_contribution_i_momentum_z.y);

				factor = double(0.5)*normal.z;
				flux_i_density += factor*(ff_variable[VAR_MOMENTUM+2]+momentum_i.z);
				flux_i_density_energy += factor*(ff_flux_contribution_density_energy.z+flux_contribution_i_density_energy.z);
				flux_i_momentum.x += factor*(ff_flux_contribution_momentum_x.z + flux_contribution_i_momentum_x.z);
				flux_i_momentum.y += factor*(ff_flux_contribution_momentum_y.z + flux_contribution_i_momentum_y.z);
				flux_i_momentum.z += factor*(ff_flux_contribution_momentum_z.z + flux_contribution_i_momentum_z.z);

			}
		}

		// Store the accumulated flux of element i.
		fluxes[i*NVAR + VAR_DENSITY] = flux_i_density;
		fluxes[i*NVAR + (VAR_MOMENTUM+0)] = flux_i_momentum.x;
		fluxes[i*NVAR + (VAR_MOMENTUM+1)] = flux_i_momentum.y;
		fluxes[i*NVAR + (VAR_MOMENTUM+2)] = flux_i_momentum.z;
		fluxes[i*NVAR + VAR_DENSITY_ENERGY] = flux_i_density_energy;
	} ; 
const unsigned long long parallel_for_end = current_time_ns();
printf("pragma186_omp_parallel %llu ns\n", parallel_for_end - parallel_for_start); } 

}
/**
 * OpenACC variant: accumulates the flux of every element using flux
 * contributions that were precomputed into the fc_* arrays.
 *
 * For element i and each of its NNB faces j:
 *   - nb = elements_surrounding_elements[i*NNB + j] selects the face type;
 *   - the face normal is normals[(i*NNB + j)*NDIM + {0,1,2}].
 * Face handling:
 *   - nb >= 0  : a legitimate neighbor; adds an artificial-viscosity term
 *                scaled by smoothing_coefficient and the normal length, plus
 *                the averaged flux contributions of i and nb along the normal;
 *   - nb == -1 : a wing boundary; only pressure_i times the normal is added
 *                to the momentum flux;
 *   - nb == -2 : a far field boundary; like a neighbor, but using the
 *                file-level ff_variable / ff_fc_* values and without the
 *                viscosity term;
 *   - any other negative value contributes nothing.
 * The five accumulated values are written to fluxes[i*NVAR + ...].
 *
 * @param nelr                           number of elements to process
 * @param elements_surrounding_elements  NNB neighbor ids per element (read)
 * @param normals                        NNB*NDIM entries per element (read)
 * @param variables                      per-element state, NVAR entries per element (read)
 * @param fc_momentum_x                  precomputed, NDIM entries per element (read)
 * @param fc_momentum_y                  precomputed, NDIM entries per element (read)
 * @param fc_momentum_z                  precomputed, NDIM entries per element (read)
 * @param fc_density_energy              precomputed, NDIM entries per element (read)
 * @param fluxes                         NVAR entries per element (written)
 */
void compute_flux(int nelr, int* elements_surrounding_elements, double* normals, double* variables, double* fc_momentum_x, double* fc_momentum_y, double* fc_momentum_z, double* fc_density_energy, double* fluxes)
{
	const double smoothing_coefficient = double(0.2);

	#pragma acc kernels
	for(int i = 0; i < nelr; i++)
	{
		int j, nb;
		double3 normal; double normal_len;
		double factor;

		// State of element i.
		double density_i = variables[NVAR*i + VAR_DENSITY];
		double3 momentum_i;
		momentum_i.x = variables[NVAR*i + (VAR_MOMENTUM+0)];
		momentum_i.y = variables[NVAR*i + (VAR_MOMENTUM+1)];
		momentum_i.z = variables[NVAR*i + (VAR_MOMENTUM+2)];

		double density_energy_i = variables[NVAR*i + VAR_DENSITY_ENERGY];

		double3 velocity_i;             				 compute_velocity(density_i, momentum_i, velocity_i);
		double speed_sqd_i                          = compute_speed_sqd(velocity_i);
		double speed_i                              = std::sqrt(speed_sqd_i);
		double pressure_i                           = compute_pressure(density_i, density_energy_i, speed_sqd_i);
		double speed_of_sound_i                     = compute_speed_of_sound(density_i, pressure_i);
		double3 fc_i_momentum_x, fc_i_momentum_y, fc_i_momentum_z;
		double3 fc_i_density_energy;

		// Precomputed flux contributions of element i.
		fc_i_momentum_x.x = fc_momentum_x[i*NDIM + 0];
		fc_i_momentum_x.y = fc_momentum_x[i*NDIM + 1];
		fc_i_momentum_x.z = fc_momentum_x[i*NDIM + 2];

		fc_i_momentum_y.x = fc_momentum_y[i*NDIM + 0];
		fc_i_momentum_y.y = fc_momentum_y[i*NDIM + 1];
		fc_i_momentum_y.z = fc_momentum_y[i*NDIM + 2];

		fc_i_momentum_z.x = fc_momentum_z[i*NDIM + 0];
		fc_i_momentum_z.y = fc_momentum_z[i*NDIM + 1];
		fc_i_momentum_z.z = fc_momentum_z[i*NDIM + 2];

		fc_i_density_energy.x = fc_density_energy[i*NDIM + 0];
		fc_i_density_energy.y = fc_density_energy[i*NDIM + 1];
		fc_i_density_energy.z = fc_density_energy[i*NDIM + 2];

		// Accumulators for the flux of element i.
		double flux_i_density = double(0.0);
		double3 flux_i_momentum;
		flux_i_momentum.x = double(0.0);
		flux_i_momentum.y = double(0.0);
		flux_i_momentum.z = double(0.0);
		double flux_i_density_energy = double(0.0);

		// Scratch values for the current neighbor.
		double3 velocity_nb;
		double density_nb, density_energy_nb;
		double3 momentum_nb;
		double3 fc_nb_momentum_x, fc_nb_momentum_y, fc_nb_momentum_z;
		double3 fc_nb_density_energy;
		double speed_sqd_nb, speed_of_sound_nb, pressure_nb;

		for(j = 0; j < NNB; j++)
		{
			nb = elements_surrounding_elements[i*NNB + j];
			normal.x = normals[(i*NNB + j)*NDIM + 0];
			normal.y = normals[(i*NNB + j)*NDIM + 1];
			normal.z = normals[(i*NNB + j)*NDIM + 2];
			normal_len = std::sqrt(normal.x*normal.x + normal.y*normal.y + normal.z*normal.z);

			if(nb >= 0) 	// a legitimate neighbor
			{
				density_nb =        variables[nb*NVAR + VAR_DENSITY];
				momentum_nb.x =     variables[nb*NVAR + (VAR_MOMENTUM+0)];
				momentum_nb.y =     variables[nb*NVAR + (VAR_MOMENTUM+1)];
				momentum_nb.z =     variables[nb*NVAR + (VAR_MOMENTUM+2)];
				density_energy_nb = variables[nb*NVAR + VAR_DENSITY_ENERGY];
													compute_velocity(density_nb, momentum_nb, velocity_nb);
				speed_sqd_nb                      = compute_speed_sqd(velocity_nb);
				pressure_nb                       = compute_pressure(density_nb, density_energy_nb, speed_sqd_nb);
				speed_of_sound_nb                 = compute_speed_of_sound(density_nb, pressure_nb);

				// Precomputed flux contributions of the neighbor.
				fc_nb_momentum_x.x = fc_momentum_x[nb*NDIM + 0];
				fc_nb_momentum_x.y = fc_momentum_x[nb*NDIM + 1];
				fc_nb_momentum_x.z = fc_momentum_x[nb*NDIM + 2];

				fc_nb_momentum_y.x = fc_momentum_y[nb*NDIM + 0];
				fc_nb_momentum_y.y = fc_momentum_y[nb*NDIM + 1];
				fc_nb_momentum_y.z = fc_momentum_y[nb*NDIM + 2];

				fc_nb_momentum_z.x = fc_momentum_z[nb*NDIM + 0];
				fc_nb_momentum_z.y = fc_momentum_z[nb*NDIM + 1];
				fc_nb_momentum_z.z = fc_momentum_z[nb*NDIM + 2];

				fc_nb_density_energy.x = fc_density_energy[nb*NDIM + 0];
				fc_nb_density_energy.y = fc_density_energy[nb*NDIM + 1];
				// The z component must be loaded as well: it is read in the
				// normal.z accumulation below and would otherwise be unset
				// (first neighbor) or left over from a previous neighbor.
				fc_nb_density_energy.z = fc_density_energy[nb*NDIM + 2];

				// artificial viscosity
				factor = -normal_len*smoothing_coefficient*double(0.5)*(speed_i + std::sqrt(speed_sqd_nb) + speed_of_sound_i + speed_of_sound_nb);
				flux_i_density += factor*(density_i-density_nb);
				flux_i_density_energy += factor*(density_energy_i-density_energy_nb);
				flux_i_momentum.x += factor*(momentum_i.x-momentum_nb.x);
				flux_i_momentum.y += factor*(momentum_i.y-momentum_nb.y);
				flux_i_momentum.z += factor*(momentum_i.z-momentum_nb.z);

				// accumulate cell-centered fluxes
				factor = double(0.5)*normal.x;
				flux_i_density += factor*(momentum_nb.x+momentum_i.x);
				flux_i_density_energy += factor*(fc_nb_density_energy.x+fc_i_density_energy.x);
				flux_i_momentum.x += factor*(fc_nb_momentum_x.x+fc_i_momentum_x.x);
				flux_i_momentum.y += factor*(fc_nb_momentum_y.x+fc_i_momentum_y.x);
				flux_i_momentum.z += factor*(fc_nb_momentum_z.x+fc_i_momentum_z.x);

				factor = double(0.5)*normal.y;
				flux_i_density += factor*(momentum_nb.y+momentum_i.y);
				flux_i_density_energy += factor*(fc_nb_density_energy.y+fc_i_density_energy.y);
				flux_i_momentum.x += factor*(fc_nb_momentum_x.y+fc_i_momentum_x.y);
				flux_i_momentum.y += factor*(fc_nb_momentum_y.y+fc_i_momentum_y.y);
				flux_i_momentum.z += factor*(fc_nb_momentum_z.y+fc_i_momentum_z.y);

				factor = double(0.5)*normal.z;
				flux_i_density += factor*(momentum_nb.z+momentum_i.z);
				flux_i_density_energy += factor*(fc_nb_density_energy.z+fc_i_density_energy.z);
				flux_i_momentum.x += factor*(fc_nb_momentum_x.z+fc_i_momentum_x.z);
				flux_i_momentum.y += factor*(fc_nb_momentum_y.z+fc_i_momentum_y.z);
				flux_i_momentum.z += factor*(fc_nb_momentum_z.z+fc_i_momentum_z.z);
			}
			else if(nb == -1)	// a wing boundary
			{
				flux_i_momentum.x += normal.x*pressure_i;
				flux_i_momentum.y += normal.y*pressure_i;
				flux_i_momentum.z += normal.z*pressure_i;
			}
			else if(nb == -2) // a far field boundary
			{
				factor = double(0.5)*normal.x;
				flux_i_density += factor*(ff_variable[VAR_MOMENTUM+0]+momentum_i.x);
				flux_i_density_energy += factor*(ff_fc_density_energy.x+fc_i_density_energy.x);
				flux_i_momentum.x += factor*(ff_fc_momentum_x.x + fc_i_momentum_x.x);
				flux_i_momentum.y += factor*(ff_fc_momentum_y.x + fc_i_momentum_y.x);
				flux_i_momentum.z += factor*(ff_fc_momentum_z.x + fc_i_momentum_z.x);

				factor = double(0.5)*normal.y;
				flux_i_density += factor*(ff_variable[VAR_MOMENTUM+1]+momentum_i.y);
				flux_i_density_energy += factor*(ff_fc_density_energy.y+fc_i_density_energy.y);
				flux_i_momentum.x += factor*(ff_fc_momentum_x.y + fc_i_momentum_x.y);
				flux_i_momentum.y += factor*(ff_fc_momentum_y.y + fc_i_momentum_y.y);
				flux_i_momentum.z += factor*(ff_fc_momentum_z.y + fc_i_momentum_z.y);

				factor = double(0.5)*normal.z;
				flux_i_density += factor*(ff_variable[VAR_MOMENTUM+2]+momentum_i.z);
				flux_i_density_energy += factor*(ff_fc_density_energy.z+fc_i_density_energy.z);
				flux_i_momentum.x += factor*(ff_fc_momentum_x.z + fc_i_momentum_x.z);
				flux_i_momentum.y += factor*(ff_fc_momentum_y.z + fc_i_momentum_y.z);
				flux_i_momentum.z += factor*(ff_fc_momentum_z.z + fc_i_momentum_z.z);

			}
		}

		// Store the accumulated flux of element i.
		fluxes[i*NVAR + VAR_DENSITY] = flux_i_density;
		fluxes[i*NVAR + (VAR_MOMENTUM+0)] = flux_i_momentum.x;
		fluxes[i*NVAR + (VAR_MOMENTUM+1)] = flux_i_momentum.y;
		fluxes[i*NVAR + (VAR_MOMENTUM+2)] = flux_i_momentum.z;
		fluxes[i*NVAR + VAR_DENSITY_ENERGY] = flux_i_density_energy;
	}
}