void compute_flux_contributions(int nelr, double* variables, double* fc_momentum_x, double* fc_momentum_y, double* fc_momentum_z, double* fc_density_energy)
{
	#pragma acc kernels
	for(int i = 0; i < nelr; i++)
	{
		double density_i = variables[NVAR*i + VAR_DENSITY];
		double3 momentum_i;
		momentum_i.x = variables[NVAR*i + (VAR_MOMENTUM+0)];
		momentum_i.y = variables[NVAR*i + (VAR_MOMENTUM+1)];
		momentum_i.z = variables[NVAR*i + (VAR_MOMENTUM+2)];
		double density_energy_i = variables[NVAR*i + VAR_DENSITY_ENERGY];

		double3 velocity_i;             				compute_velocity(density_i, momentum_i, velocity_i);
		double speed_sqd_i                          = compute_speed_sqd(velocity_i);
		double speed_i                              = sqrtf(speed_sqd_i);
		double pressure_i                           = compute_pressure(density_i, density_energy_i, speed_sqd_i);
		double speed_of_sound_i                     = compute_speed_of_sound(density_i, pressure_i);
		double3 fc_i_momentum_x, fc_i_momentum_y, fc_i_momentum_z;
		double3 fc_i_density_energy;	
		compute_flux_contribution(density_i, momentum_i, density_energy_i, pressure_i, velocity_i, fc_i_momentum_x, fc_i_momentum_y, fc_i_momentum_z, fc_i_density_energy);

		fc_momentum_x[i*NDIM + 0] = fc_i_momentum_x.x;
		fc_momentum_x[i*NDIM + 1] = fc_i_momentum_x.y;
		fc_momentum_x[i*NDIM+  2] = fc_i_momentum_x.z;

		fc_momentum_y[i*NDIM+ 0] = fc_i_momentum_y.x;
		fc_momentum_y[i*NDIM+ 1] = fc_i_momentum_y.y;
		fc_momentum_y[i*NDIM+ 2] = fc_i_momentum_y.z;


		fc_momentum_z[i*NDIM+ 0] = fc_i_momentum_z.x;
		fc_momentum_z[i*NDIM+ 1] = fc_i_momentum_z.y;
		fc_momentum_z[i*NDIM+ 2] = fc_i_momentum_z.z;

		fc_density_energy[i*NDIM+ 0] = fc_i_density_energy.x;
		fc_density_energy[i*NDIM+ 1] = fc_i_density_energy.y;
		fc_density_energy[i*NDIM+ 2] = fc_i_density_energy.z;
	}

}
Example #2
0
int main(int argc, char** argv)
{
	ocd_init(&argc, &argv, NULL);
	ocd_initCL();

	cl_int err;

	size_t global_size;
	size_t local_size;

	cl_program program;
	cl_kernel kernel_compute_flux;
	cl_kernel kernel_compute_flux_contributions;
	cl_kernel kernel_compute_step_factor;
	cl_kernel kernel_time_step;
	cl_kernel kernel_initialize_variables;

	cl_mem ff_variable;
	cl_mem ff_fc_momentum_x;
	cl_mem ff_fc_momentum_y;
	cl_mem ff_fc_momentum_z;
	cl_mem ff_fc_density_energy;

	if (argc < 2)
	{
		printf("Usage ./cfd <data input file>\n");
		return 0;
	}


	const char* data_file_name = argv[1];


	// set far field conditions and load them into constant memory on the gpu
	{
		float h_ff_variable[NVAR];
		const float angle_of_attack = (float)(3.1415926535897931 / 180.0) * (float)(deg_angle_of_attack);

		h_ff_variable[VAR_DENSITY] = (float)(1.4);

		float ff_pressure = (float)(1.0);
		float ff_speed_of_sound = sqrt(GAMMA*ff_pressure / h_ff_variable[VAR_DENSITY]);
		float ff_speed = (float)(ff_mach)*ff_speed_of_sound;

		float3 ff_velocity;
		ff_velocity.x = ff_speed*(float)(cos((float)angle_of_attack));
		ff_velocity.y = ff_speed*(float)(sin((float)angle_of_attack));
		ff_velocity.z = 0.0;

		h_ff_variable[VAR_MOMENTUM+0] = h_ff_variable[VAR_DENSITY] * ff_velocity.x;
		h_ff_variable[VAR_MOMENTUM+1] = h_ff_variable[VAR_DENSITY] * ff_velocity.y;
		h_ff_variable[VAR_MOMENTUM+2] = h_ff_variable[VAR_DENSITY] * ff_velocity.z;

		h_ff_variable[VAR_DENSITY_ENERGY] = h_ff_variable[VAR_DENSITY]*((float)(0.5)*(ff_speed*ff_speed)) + (ff_pressure / (float)(GAMMA-1.0));

		float3 h_ff_momentum;
		h_ff_momentum.x = *(h_ff_variable+VAR_MOMENTUM+0);
		h_ff_momentum.y = *(h_ff_variable+VAR_MOMENTUM+1);
		h_ff_momentum.z = *(h_ff_variable+VAR_MOMENTUM+2);
		float3 h_ff_fc_momentum_x;
		float3 h_ff_fc_momentum_y;
		float3 h_ff_fc_momentum_z;
		float3 h_ff_fc_density_energy;
		compute_flux_contribution(&h_ff_variable[VAR_DENSITY], &h_ff_momentum,
				&h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure, &ff_velocity,
				&h_ff_fc_momentum_x, &h_ff_fc_momentum_y, &h_ff_fc_momentum_z,
				&h_ff_fc_density_energy);

		// copy far field conditions to the gpu
		ff_variable = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * NVAR, h_ff_variable, &err);
		CHKERR(err, "Unable to allocate ff data");
		ff_fc_momentum_x = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_x, &err);
		CHKERR(err, "Unable to allocate ff data");
		ff_fc_momentum_y = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_y, &err);
		CHKERR(err, "Unable to allocate ff data");
		ff_fc_momentum_z = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_z, &err);
		CHKERR(err, "Unable to allocate ff data");
		ff_fc_density_energy = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_density_energy, &err);
		CHKERR(err, "Unable to allocate ff data");
	}
	int nel;
	int nelr;

	// read in domain geometry
	cl_mem areas;
	cl_mem elements_surrounding_elements;
	cl_mem normals;
	{
		std::ifstream file(data_file_name);

		file >> nel;

		nelr = block_length*((nel / block_length )+ std::min(1, nel % block_length));

		//float* h_areas = new float[nelr];
		//int* h_elements_surrounding_elements = new int[nelr*NNB];
		//float* h_normals = new float[nelr*NDIM*NNB];

		float* h_areas ;
		int* h_elements_surrounding_elements ;
		float* h_normals ;

        h_areas                         = (float*)  memalign(AOCL_ALIGNMENT,nelr*sizeof(float));
        h_elements_surrounding_elements = (int*)    memalign(AOCL_ALIGNMENT,nelr*NNB*sizeof(int));
        h_normals                       = (float *) memalign(AOCL_ALIGNMENT,nelr*NDIM*NNB*sizeof(float));


        //posix_memalign(&h_areas                         , AOCL_ALIGNMENT, nelr);
        //posix_memalign(&h_elements_surrounding_elements , AOCL_ALIGNMENT, nelr*NNB);
        //posix_memalign(&h_normals                       , AOCL_ALIGNMENT, nelr*NDIM*NNB);


		// read in data
		for(int i = 0; i < nel; i++)
		{
			file >> h_areas[i];
			for(int j = 0; j < NNB; j++)
			{
				file >> h_elements_surrounding_elements[i + j*nelr];
				if(h_elements_surrounding_elements[i+j*nelr] < 0) h_elements_surrounding_elements[i+j*nelr] = -1;
				h_elements_surrounding_elements[i + j*nelr]--; //it's coming in with Fortran numbering

				for(int k = 0; k < NDIM; k++)
				{
					file >> h_normals[i + (j + k*NNB)*nelr];
					h_normals[i + (j + k*NNB)*nelr] = -h_normals[i + (j + k*NNB)*nelr];
				}
			}
		}

		// fill in remaining data
		int last = nel-1;
		for(int i = nel; i < nelr; i++)
		{
			h_areas[i] = h_areas[last];
			for(int j = 0; j < NNB; j++)
			{
				// duplicate the last element
				h_elements_surrounding_elements[i + j*nelr] = h_elements_surrounding_elements[last + j*nelr];
				for(int k = 0; k < NDIM; k++) h_normals[last + (j + k*NNB)*nelr] = h_normals[last + (j + k*NNB)*nelr];
			}
		}

		areas = alloc<float>(context, nelr);
		upload<float>(commands, areas, h_areas, nelr);

		elements_surrounding_elements = alloc<int>(context, nelr*NNB);
		upload<int>(commands, elements_surrounding_elements, h_elements_surrounding_elements, nelr*NNB);

		normals = alloc<float>(context, nelr*NDIM*NNB);
		upload<float>(commands, normals, h_normals, nelr*NDIM*NNB);

		delete[] h_areas;
		delete[] h_elements_surrounding_elements;
		delete[] h_normals;
	}

	char* kernel_files;
	int num_kernels = 20;
	kernel_files = (char*) malloc(sizeof(char*)*num_kernels);

	strcpy(kernel_files,"cfd_kernel");
      
    program = ocdBuildProgramFromFile(context,device_id,kernel_files, NULL);

	// Create the compute kernel in the program we wish to run
	kernel_compute_flux = clCreateKernel(program, "compute_flux", &err);
	CHKERR(err, "Failed to create a compute kernel!");

	// Create the reduce kernel in the program we wish to run
	kernel_compute_flux_contributions = clCreateKernel(program, "compute_flux_contributions", &err);
	CHKERR(err, "Failed to create a compute_flux_contributions kernel!");
	// Create the reduce kernel in the program we wish to run
	kernel_compute_step_factor = clCreateKernel(program, "compute_step_factor", &err);
	CHKERR(err, "Failed to create a compute_step_factor kernel!");
	// Create the reduce kernel in the program we wish to run
	kernel_time_step = clCreateKernel(program, "time_step", &err);
	CHKERR(err, "Failed to create a time_step kernel!");
	// Create the reduce kernel in the program we wish to run
	kernel_initialize_variables = clCreateKernel(program, "initialize_variables", &err);
	CHKERR(err, "Failed to create a initialize_variables kernel!");

	// Create arrays and set initial conditions
	cl_mem variables = alloc<cl_float>(context, nelr*NVAR);

	err = 0;
	err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr);
	err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&variables);
	err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable);
	CHKERR(err, "Failed to set kernel arguments!");
	// Get the maximum work group size for executing the kernel on the device
	//err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
	CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!");
	local_size = 1;//std::min(local_size, (size_t)nelr);
	global_size = nelr;
	err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
	err = clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 0");


	cl_mem old_variables = alloc<float>(context, nelr*NVAR);
	cl_mem fluxes = alloc<float>(context, nelr*NVAR);
	cl_mem step_factors = alloc<float>(context, nelr);
	clFinish(commands);
	cl_mem fc_momentum_x = alloc<float>(context, nelr*NDIM);
	cl_mem fc_momentum_y = alloc<float>(context, nelr*NDIM);
	cl_mem fc_momentum_z = alloc<float>(context, nelr*NDIM);
	cl_mem fc_density_energy = alloc<float>(context, nelr*NDIM);
	clFinish(commands);

	// make sure all memory is floatly allocated before we start timing
	err = 0;
	err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr);
	err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&old_variables);
	err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable);
	CHKERR(err, "Failed to set kernel arguments!");
	// Get the maximum work group size for executing the kernel on the device
	err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
	CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!");
	err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 1");
	err = 0;
	err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr);
	err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&fluxes);
	err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable);
	CHKERR(err, "Failed to set kernel arguments!");
	// Get the maximum work group size for executing the kernel on the device
	err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
	CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!");

	err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 2");
	std::cout << "About to memcopy" << std::endl;
	err = clReleaseMemObject(step_factors);
	float temp[nelr];
	for(int i = 0; i < nelr; i++)
		temp[i] = 0;
	step_factors = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * nelr, temp, &err);
	CHKERR(err, "Unable to memset step_factors");
	// make sure CUDA isn't still doing something before we start timing

	clFinish(commands);

	// these need to be computed the first time in order to compute time step
	std::cout << "Starting..." << std::endl;


	// Begin iterations
	for(int i = 0; i < iterations; i++)
	{
		copy<float>(commands, old_variables, variables, nelr*NVAR);

		// for the first iteration we compute the time step
		err = 0;
		err = clSetKernelArg(kernel_compute_step_factor, 0, sizeof(int), &nelr);
		err |= clSetKernelArg(kernel_compute_step_factor, 1, sizeof(cl_mem),&variables);
		err |= clSetKernelArg(kernel_compute_step_factor, 2, sizeof(cl_mem), &areas);
		err |= clSetKernelArg(kernel_compute_step_factor, 3, sizeof(cl_mem), &step_factors);
		CHKERR(err, "Failed to set kernel arguments!");
		// Get the maximum work group size for executing the kernel on the device
		err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
		CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!");
		err = clEnqueueNDRangeKernel(commands, kernel_compute_step_factor, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
		clFinish(commands);
		START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Step Factor Kernel", ocdTempTimer)
		END_TIMER(ocdTempTimer)
		CHKERR(err, "Failed to execute kernel[kernel_compute_step_factor]!");
		for(int j = 0; j < RK; j++)
		{
			err = 0;
			err = clSetKernelArg(kernel_compute_flux_contributions, 0, sizeof(int), &nelr);
			err |= clSetKernelArg(kernel_compute_flux_contributions, 1, sizeof(cl_mem),&variables);
			err |= clSetKernelArg(kernel_compute_flux_contributions, 2, sizeof(cl_mem), &fc_momentum_x);
			err |= clSetKernelArg(kernel_compute_flux_contributions, 3, sizeof(cl_mem), &fc_momentum_y);
			err |= clSetKernelArg(kernel_compute_flux_contributions, 4, sizeof(cl_mem), &fc_momentum_z);
			err |= clSetKernelArg(kernel_compute_flux_contributions, 5, sizeof(cl_mem), &fc_density_energy);
			CHKERR(err, "Failed to set kernel arguments!");
			// Get the maximum work group size for executing the kernel on the device
			err = clGetKernelWorkGroupInfo(kernel_compute_flux_contributions, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
			CHKERR(err, "Failed to retrieve kernel_compute_flux_contributions work group info!");
			err = clEnqueueNDRangeKernel(commands, kernel_compute_flux_contributions, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
			clFinish(commands);
			START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Contribution Kernel", ocdTempTimer)
			//compute_flux_contributions(nelr, variables, fc_momentum_x, fc_momentum_y, fc_momentum_z, fc_density_energy);
			END_TIMER(ocdTempTimer)
			CHKERR(err, "Failed to execute kernel [kernel_compute_flux_contributions]!");
			err = 0;
			err = clSetKernelArg(kernel_compute_flux, 0, sizeof(int), &nelr);
			err |= clSetKernelArg(kernel_compute_flux, 1, sizeof(cl_mem), &elements_surrounding_elements);
			err |= clSetKernelArg(kernel_compute_flux, 2, sizeof(cl_mem), &normals);
			err |= clSetKernelArg(kernel_compute_flux, 3, sizeof(cl_mem), &variables);
			err |= clSetKernelArg(kernel_compute_flux, 4, sizeof(cl_mem), &fc_momentum_x);
			err |= clSetKernelArg(kernel_compute_flux, 5, sizeof(cl_mem), &fc_momentum_y);
			err |= clSetKernelArg(kernel_compute_flux, 6, sizeof(cl_mem), &fc_momentum_z);
			err |= clSetKernelArg(kernel_compute_flux, 7, sizeof(cl_mem), &fc_density_energy);
			err |= clSetKernelArg(kernel_compute_flux, 8, sizeof(cl_mem), &fluxes);
			err |= clSetKernelArg(kernel_compute_flux, 9, sizeof(cl_mem), &ff_variable);
			err |= clSetKernelArg(kernel_compute_flux, 10, sizeof(cl_mem), &ff_fc_momentum_x);
			err |= clSetKernelArg(kernel_compute_flux, 11, sizeof(cl_mem), &ff_fc_momentum_y);
			err |= clSetKernelArg(kernel_compute_flux, 12, sizeof(cl_mem), &ff_fc_momentum_z);
			err |= clSetKernelArg(kernel_compute_flux, 13, sizeof(cl_mem), &ff_fc_density_energy);
			CHKERR(err, "Failed to set kernel arguments!");
			// Get the maximum work group size for executing the kernel on the device
			err = clGetKernelWorkGroupInfo(kernel_compute_flux, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
			CHKERR(err, "Failed to retrieve kernel_compute_flux work group info!");
			err = clEnqueueNDRangeKernel(commands, kernel_compute_flux, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
			clFinish(commands);
			START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Kernel", ocdTempTimer)
			END_TIMER(ocdTempTimer)
			CHKERR(err, "Failed to execute kernel [kernel_compute_flux]!");
			err = 0;
			err = clSetKernelArg(kernel_time_step, 0, sizeof(int), &j);
			err |= clSetKernelArg(kernel_time_step, 1, sizeof(int), &nelr);
			err |= clSetKernelArg(kernel_time_step, 2, sizeof(cl_mem), &old_variables);
			err |= clSetKernelArg(kernel_time_step, 3, sizeof(cl_mem), &variables);
			err |= clSetKernelArg(kernel_time_step, 4, sizeof(cl_mem), &step_factors);
			err |= clSetKernelArg(kernel_time_step, 5, sizeof(cl_mem), &fluxes);
			CHKERR(err, "Failed to set kernel arguments!");
			// Get the maximum work group size for executing the kernel on the device
			err = clGetKernelWorkGroupInfo(kernel_time_step, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
			CHKERR(err, "Failed to retrieve kernel_time_step work group info!");
			err = clEnqueueNDRangeKernel(commands, kernel_time_step, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
			clFinish(commands);
			START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Time Step Kernel", ocdTempTimer)
			END_TIMER(ocdTempTimer)
			CHKERR(err, "Failed to execute kernel [kernel_time_step]!");
		}
	}

	clFinish(commands);
	std::cout << "Finished" << std::endl;
	std::cout << "Saving solution..." << std::endl;
	dump(commands, variables, nel, nelr);
	std::cout << "Saved solution..." << std::endl;
	std::cout << "Cleaning up..." << std::endl;

	clReleaseProgram(program);
	clReleaseKernel(kernel_compute_flux);
	clReleaseKernel(kernel_compute_flux_contributions);
	clReleaseKernel(kernel_compute_step_factor);
	clReleaseKernel(kernel_time_step);
	clReleaseKernel(kernel_initialize_variables);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);

	dealloc<float>(areas);
	dealloc<int>(elements_surrounding_elements);
	dealloc<float>(normals);

	dealloc<float>(variables);
	dealloc<float>(old_variables);
	dealloc<float>(fluxes);
	dealloc<float>(step_factors);
	dealloc<float>(fc_momentum_x);
	dealloc<float>(fc_momentum_y);
	dealloc<float>(fc_momentum_z);
	dealloc<float>(fc_density_energy);

	std::cout << "Done..." << std::endl;
	ocd_finalize();
	return 0;
}
/*
 * Main function
 */
int main(int argc, char** argv)
{
	if (argc < 2)
	{
		std::cout << "specify data file name" << std::endl;
		return 0;
	}
	const char* data_file_name = argv[1];

const unsigned long long full_program_start = current_time_ns();
{
	// set far field conditions
	{
		const double angle_of_attack = double(3.1415926535897931 / 180.0) * double(deg_angle_of_attack);

		ff_variable[VAR_DENSITY] = double(1.4);

		double ff_pressure = double(1.0);
		double ff_speed_of_sound = sqrt(GAMMA*ff_pressure / ff_variable[VAR_DENSITY]);
		double ff_speed = double(ff_mach)*ff_speed_of_sound;

		cfd_double3 ff_velocity;
		ff_velocity.x = ff_speed*double(cos((double)angle_of_attack));
		ff_velocity.y = ff_speed*double(sin((double)angle_of_attack));
		ff_velocity.z = 0.0;

		ff_variable[VAR_MOMENTUM+0] = ff_variable[VAR_DENSITY] * ff_velocity.x;
		ff_variable[VAR_MOMENTUM+1] = ff_variable[VAR_DENSITY] * ff_velocity.y;
		ff_variable[VAR_MOMENTUM+2] = ff_variable[VAR_DENSITY] * ff_velocity.z;

		ff_variable[VAR_DENSITY_ENERGY] = ff_variable[VAR_DENSITY]*(double(0.5)*(ff_speed*ff_speed)) + (ff_pressure / double(GAMMA-1.0));

		cfd_double3 ff_momentum;
		ff_momentum.x = *(ff_variable+VAR_MOMENTUM+0);
		ff_momentum.y = *(ff_variable+VAR_MOMENTUM+1);
		ff_momentum.z = *(ff_variable+VAR_MOMENTUM+2);
		compute_flux_contribution(ff_variable[VAR_DENSITY], ff_momentum, ff_variable[VAR_DENSITY_ENERGY], ff_pressure, ff_velocity, ff_flux_contribution_momentum_x, ff_flux_contribution_momentum_y, ff_flux_contribution_momentum_z, ff_flux_contribution_density_energy);
	}
	int nel;
	int nelr;

	// read in domain geometry
	double* areas;
	int* elements_surrounding_elements;
	double* normals;
	{
		std::ifstream file(data_file_name);

		file >> nel;
		nelr = block_length*((nel / block_length )+ std::min(1, nel % block_length));

		areas = new double[nelr];
		elements_surrounding_elements = new int[nelr*NNB];
		normals = new double[NDIM*NNB*nelr];

		// read in data
		for(int i = 0; i < nel; i++)
		{
			file >> areas[i];
			for(int j = 0; j < NNB; j++)
			{
				file >> elements_surrounding_elements[i*NNB + j];
				if(elements_surrounding_elements[i*NNB+j] < 0) elements_surrounding_elements[i*NNB+j] = -1;
				elements_surrounding_elements[i*NNB + j]--; //it's coming in with Fortran numbering

				for(int k = 0; k < NDIM; k++)
				{
					file >>  normals[(i*NNB + j)*NDIM + k];
					normals[(i*NNB + j)*NDIM + k] = -normals[(i*NNB + j)*NDIM + k];
				}
			}
		}

		// fill in remaining data
		int last = nel-1;
		for(int i = nel; i < nelr; i++)
		{
			areas[i] = areas[last];
			for(int j = 0; j < NNB; j++)
			{
				// duplicate the last element
				elements_surrounding_elements[i*NNB + j] = elements_surrounding_elements[last*NNB + j];
				for(int k = 0; k < NDIM; k++) normals[(i*NNB + j)*NDIM + k] = normals[(last*NNB + j)*NDIM + k];
			}
		}
	}

	// Create arrays and set initial conditions
	double* variables = alloc<double>(nelr*NVAR);
	initialize_variables(nelr, variables);

	double* old_variables = alloc<double>(nelr*NVAR);
	double* fluxes = alloc<double>(nelr*NVAR);
	double* step_factors = alloc<double>(nelr);

	// these need to be computed the first time in order to compute time step
	std::cout << "Starting..." << std::endl;

	// Begin iterations
	for(int i = 0; i < iterations; i++)
	{
		copy(old_variables, variables, nelr*NVAR);

		// for the first iteration we compute the time step
		compute_step_factor(nelr, variables, areas, step_factors);

		for(int j = 0; j < RK; j++)
		{
			compute_flux(nelr, elements_surrounding_elements, normals, variables, fluxes);
			time_step(j, nelr, old_variables, variables, step_factors, fluxes);
		}
	}

	std::cout << "Saving solution..." << std::endl;
	dump(variables, nel, nelr);
	std::cout << "Saved solution..." << std::endl;


	std::cout << "Cleaning up..." << std::endl;
	dealloc<double>(areas);
	dealloc<int>(elements_surrounding_elements);
	dealloc<double>(normals);

	dealloc<double>(variables);
	dealloc<double>(old_variables);
	dealloc<double>(fluxes);
	dealloc<double>(step_factors);
    } ; 
const unsigned long long full_program_end = current_time_ns();
printf("full_program %llu ns\n", full_program_end - full_program_start);


	std::cout << "Done..." << std::endl;

	return 0;
}
void compute_flux(int nelr, int* elements_surrounding_elements, double* normals, double* variables, double* fluxes)
{
	double smoothing_coefficient = double(0.2f);

 { const unsigned long long parallel_for_start = current_time_ns();
#pragma omp parallel for default(shared) schedule(static)
for(int i = 0; i < nelr; i++)
	{
		int j, nb;
		cfd_double3 normal; double normal_len;
		double factor;

		double density_i = variables[NVAR*i + VAR_DENSITY];
		cfd_double3 momentum_i;
		momentum_i.x = variables[NVAR*i + (VAR_MOMENTUM+0)];
		momentum_i.y = variables[NVAR*i + (VAR_MOMENTUM+1)];
		momentum_i.z = variables[NVAR*i + (VAR_MOMENTUM+2)];

		double density_energy_i = variables[NVAR*i + VAR_DENSITY_ENERGY];

		cfd_double3 velocity_i;             				 compute_velocity(density_i, momentum_i, velocity_i);
		double speed_sqd_i                          = compute_speed_sqd(velocity_i);
		double speed_i                              = std::sqrt(speed_sqd_i);
		double pressure_i                           = compute_pressure(density_i, density_energy_i, speed_sqd_i);
		double speed_of_sound_i                     = compute_speed_of_sound(density_i, pressure_i);
		cfd_double3 flux_contribution_i_momentum_x, flux_contribution_i_momentum_y, flux_contribution_i_momentum_z;
		cfd_double3 flux_contribution_i_density_energy;
		compute_flux_contribution(density_i, momentum_i, density_energy_i, pressure_i, velocity_i, flux_contribution_i_momentum_x, flux_contribution_i_momentum_y, flux_contribution_i_momentum_z, flux_contribution_i_density_energy);

		double flux_i_density = double(0.0);
		cfd_double3 flux_i_momentum;
		flux_i_momentum.x = double(0.0);
		flux_i_momentum.y = double(0.0);
		flux_i_momentum.z = double(0.0);
		double flux_i_density_energy = double(0.0);

		cfd_double3 velocity_nb;
		double density_nb, density_energy_nb;
		cfd_double3 momentum_nb;
		cfd_double3 flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y, flux_contribution_nb_momentum_z;
		cfd_double3 flux_contribution_nb_density_energy;
		double speed_sqd_nb, speed_of_sound_nb, pressure_nb;

		for(j = 0; j < NNB; j++)
		{
			nb = elements_surrounding_elements[i*NNB + j];
			normal.x = normals[(i*NNB + j)*NDIM + 0];
			normal.y = normals[(i*NNB + j)*NDIM + 1];
			normal.z = normals[(i*NNB + j)*NDIM + 2];
			normal_len = std::sqrt(normal.x*normal.x + normal.y*normal.y + normal.z*normal.z);

			if(nb >= 0) 	// a legitimate neighbor
			{
				density_nb =        variables[nb*NVAR + VAR_DENSITY];
				momentum_nb.x =     variables[nb*NVAR + (VAR_MOMENTUM+0)];
				momentum_nb.y =     variables[nb*NVAR + (VAR_MOMENTUM+1)];
				momentum_nb.z =     variables[nb*NVAR + (VAR_MOMENTUM+2)];
				density_energy_nb = variables[nb*NVAR + VAR_DENSITY_ENERGY];
													compute_velocity(density_nb, momentum_nb, velocity_nb);
				speed_sqd_nb                      = compute_speed_sqd(velocity_nb);
				pressure_nb                       = compute_pressure(density_nb, density_energy_nb, speed_sqd_nb);
				speed_of_sound_nb                 = compute_speed_of_sound(density_nb, pressure_nb);
													compute_flux_contribution(density_nb, momentum_nb, density_energy_nb, pressure_nb, velocity_nb, flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y, flux_contribution_nb_momentum_z, flux_contribution_nb_density_energy);

				// artificial viscosity
				factor = -normal_len*smoothing_coefficient*double(0.5)*(speed_i + std::sqrt(speed_sqd_nb) + speed_of_sound_i + speed_of_sound_nb);
				flux_i_density += factor*(density_i-density_nb);
				flux_i_density_energy += factor*(density_energy_i-density_energy_nb);
				flux_i_momentum.x += factor*(momentum_i.x-momentum_nb.x);
				flux_i_momentum.y += factor*(momentum_i.y-momentum_nb.y);
				flux_i_momentum.z += factor*(momentum_i.z-momentum_nb.z);

				// accumulate cell-centered fluxes
				factor = double(0.5)*normal.x;
				flux_i_density += factor*(momentum_nb.x+momentum_i.x);
				flux_i_density_energy += factor*(flux_contribution_nb_density_energy.x+flux_contribution_i_density_energy.x);
				flux_i_momentum.x += factor*(flux_contribution_nb_momentum_x.x+flux_contribution_i_momentum_x.x);
				flux_i_momentum.y += factor*(flux_contribution_nb_momentum_y.x+flux_contribution_i_momentum_y.x);
				flux_i_momentum.z += factor*(flux_contribution_nb_momentum_z.x+flux_contribution_i_momentum_z.x);

				factor = double(0.5)*normal.y;
				flux_i_density += factor*(momentum_nb.y+momentum_i.y);
				flux_i_density_energy += factor*(flux_contribution_nb_density_energy.y+flux_contribution_i_density_energy.y);
				flux_i_momentum.x += factor*(flux_contribution_nb_momentum_x.y+flux_contribution_i_momentum_x.y);
				flux_i_momentum.y += factor*(flux_contribution_nb_momentum_y.y+flux_contribution_i_momentum_y.y);
				flux_i_momentum.z += factor*(flux_contribution_nb_momentum_z.y+flux_contribution_i_momentum_z.y);

				factor = double(0.5)*normal.z;
				flux_i_density += factor*(momentum_nb.z+momentum_i.z);
				flux_i_density_energy += factor*(flux_contribution_nb_density_energy.z+flux_contribution_i_density_energy.z);
				flux_i_momentum.x += factor*(flux_contribution_nb_momentum_x.z+flux_contribution_i_momentum_x.z);
				flux_i_momentum.y += factor*(flux_contribution_nb_momentum_y.z+flux_contribution_i_momentum_y.z);
				flux_i_momentum.z += factor*(flux_contribution_nb_momentum_z.z+flux_contribution_i_momentum_z.z);
			}
			else if(nb == -1)	// a wing boundary
			{
				flux_i_momentum.x += normal.x*pressure_i;
				flux_i_momentum.y += normal.y*pressure_i;
				flux_i_momentum.z += normal.z*pressure_i;
			}
			else if(nb == -2) // a far field boundary
			{
				factor = double(0.5)*normal.x;
				flux_i_density += factor*(ff_variable[VAR_MOMENTUM+0]+momentum_i.x);
				flux_i_density_energy += factor*(ff_flux_contribution_density_energy.x+flux_contribution_i_density_energy.x);
				flux_i_momentum.x += factor*(ff_flux_contribution_momentum_x.x + flux_contribution_i_momentum_x.x);
				flux_i_momentum.y += factor*(ff_flux_contribution_momentum_y.x + flux_contribution_i_momentum_y.x);
				flux_i_momentum.z += factor*(ff_flux_contribution_momentum_z.x + flux_contribution_i_momentum_z.x);

				factor = double(0.5)*normal.y;
				flux_i_density += factor*(ff_variable[VAR_MOMENTUM+1]+momentum_i.y);
				flux_i_density_energy += factor*(ff_flux_contribution_density_energy.y+flux_contribution_i_density_energy.y);
				flux_i_momentum.x += factor*(ff_flux_contribution_momentum_x.y + flux_contribution_i_momentum_x.y);
				flux_i_momentum.y += factor*(ff_flux_contribution_momentum_y.y + flux_contribution_i_momentum_y.y);
				flux_i_momentum.z += factor*(ff_flux_contribution_momentum_z.y + flux_contribution_i_momentum_z.y);

				factor = double(0.5)*normal.z;
				flux_i_density += factor*(ff_variable[VAR_MOMENTUM+2]+momentum_i.z);
				flux_i_density_energy += factor*(ff_flux_contribution_density_energy.z+flux_contribution_i_density_energy.z);
				flux_i_momentum.x += factor*(ff_flux_contribution_momentum_x.z + flux_contribution_i_momentum_x.z);
				flux_i_momentum.y += factor*(ff_flux_contribution_momentum_y.z + flux_contribution_i_momentum_y.z);
				flux_i_momentum.z += factor*(ff_flux_contribution_momentum_z.z + flux_contribution_i_momentum_z.z);

			}
		}

		fluxes[i*NVAR + VAR_DENSITY] = flux_i_density;
		fluxes[i*NVAR + (VAR_MOMENTUM+0)] = flux_i_momentum.x;
		fluxes[i*NVAR + (VAR_MOMENTUM+1)] = flux_i_momentum.y;
		fluxes[i*NVAR + (VAR_MOMENTUM+2)] = flux_i_momentum.z;
		fluxes[i*NVAR + VAR_DENSITY_ENERGY] = flux_i_density_energy;
	} ; 
const unsigned long long parallel_for_end = current_time_ns();
printf("pragma186_omp_parallel %llu ns\n", parallel_for_end - parallel_for_start); } 

}