void compute_flux_contributions(int nelr, double* variables, double* fc_momentum_x, double* fc_momentum_y, double* fc_momentum_z, double* fc_density_energy) { #pragma acc kernels for(int i = 0; i < nelr; i++) { double density_i = variables[NVAR*i + VAR_DENSITY]; double3 momentum_i; momentum_i.x = variables[NVAR*i + (VAR_MOMENTUM+0)]; momentum_i.y = variables[NVAR*i + (VAR_MOMENTUM+1)]; momentum_i.z = variables[NVAR*i + (VAR_MOMENTUM+2)]; double density_energy_i = variables[NVAR*i + VAR_DENSITY_ENERGY]; double3 velocity_i; compute_velocity(density_i, momentum_i, velocity_i); double speed_sqd_i = compute_speed_sqd(velocity_i); double speed_i = sqrtf(speed_sqd_i); double pressure_i = compute_pressure(density_i, density_energy_i, speed_sqd_i); double speed_of_sound_i = compute_speed_of_sound(density_i, pressure_i); double3 fc_i_momentum_x, fc_i_momentum_y, fc_i_momentum_z; double3 fc_i_density_energy; compute_flux_contribution(density_i, momentum_i, density_energy_i, pressure_i, velocity_i, fc_i_momentum_x, fc_i_momentum_y, fc_i_momentum_z, fc_i_density_energy); fc_momentum_x[i*NDIM + 0] = fc_i_momentum_x.x; fc_momentum_x[i*NDIM + 1] = fc_i_momentum_x.y; fc_momentum_x[i*NDIM+ 2] = fc_i_momentum_x.z; fc_momentum_y[i*NDIM+ 0] = fc_i_momentum_y.x; fc_momentum_y[i*NDIM+ 1] = fc_i_momentum_y.y; fc_momentum_y[i*NDIM+ 2] = fc_i_momentum_y.z; fc_momentum_z[i*NDIM+ 0] = fc_i_momentum_z.x; fc_momentum_z[i*NDIM+ 1] = fc_i_momentum_z.y; fc_momentum_z[i*NDIM+ 2] = fc_i_momentum_z.z; fc_density_energy[i*NDIM+ 0] = fc_i_density_energy.x; fc_density_energy[i*NDIM+ 1] = fc_i_density_energy.y; fc_density_energy[i*NDIM+ 2] = fc_i_density_energy.z; } }
int main(int argc, char** argv) { ocd_init(&argc, &argv, NULL); ocd_initCL(); cl_int err; size_t global_size; size_t local_size; cl_program program; cl_kernel kernel_compute_flux; cl_kernel kernel_compute_flux_contributions; cl_kernel kernel_compute_step_factor; cl_kernel kernel_time_step; cl_kernel kernel_initialize_variables; cl_mem ff_variable; cl_mem ff_fc_momentum_x; cl_mem ff_fc_momentum_y; cl_mem ff_fc_momentum_z; cl_mem ff_fc_density_energy; if (argc < 2) { printf("Usage ./cfd <data input file>\n"); return 0; } const char* data_file_name = argv[1]; // set far field conditions and load them into constant memory on the gpu { float h_ff_variable[NVAR]; const float angle_of_attack = (float)(3.1415926535897931 / 180.0) * (float)(deg_angle_of_attack); h_ff_variable[VAR_DENSITY] = (float)(1.4); float ff_pressure = (float)(1.0); float ff_speed_of_sound = sqrt(GAMMA*ff_pressure / h_ff_variable[VAR_DENSITY]); float ff_speed = (float)(ff_mach)*ff_speed_of_sound; float3 ff_velocity; ff_velocity.x = ff_speed*(float)(cos((float)angle_of_attack)); ff_velocity.y = ff_speed*(float)(sin((float)angle_of_attack)); ff_velocity.z = 0.0; h_ff_variable[VAR_MOMENTUM+0] = h_ff_variable[VAR_DENSITY] * ff_velocity.x; h_ff_variable[VAR_MOMENTUM+1] = h_ff_variable[VAR_DENSITY] * ff_velocity.y; h_ff_variable[VAR_MOMENTUM+2] = h_ff_variable[VAR_DENSITY] * ff_velocity.z; h_ff_variable[VAR_DENSITY_ENERGY] = h_ff_variable[VAR_DENSITY]*((float)(0.5)*(ff_speed*ff_speed)) + (ff_pressure / (float)(GAMMA-1.0)); float3 h_ff_momentum; h_ff_momentum.x = *(h_ff_variable+VAR_MOMENTUM+0); h_ff_momentum.y = *(h_ff_variable+VAR_MOMENTUM+1); h_ff_momentum.z = *(h_ff_variable+VAR_MOMENTUM+2); float3 h_ff_fc_momentum_x; float3 h_ff_fc_momentum_y; float3 h_ff_fc_momentum_z; float3 h_ff_fc_density_energy; compute_flux_contribution(&h_ff_variable[VAR_DENSITY], &h_ff_momentum, &h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure, &ff_velocity, &h_ff_fc_momentum_x, &h_ff_fc_momentum_y, &h_ff_fc_momentum_z, &h_ff_fc_density_energy); // copy far field conditions to the gpu ff_variable = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * NVAR, h_ff_variable, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_x = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_x, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_y = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_y, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_z = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_z, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_density_energy = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_density_energy, &err); CHKERR(err, "Unable to allocate ff data"); } int nel; int nelr; // read in domain geometry cl_mem areas; cl_mem elements_surrounding_elements; cl_mem normals; { std::ifstream file(data_file_name); file >> nel; nelr = block_length*((nel / block_length )+ std::min(1, nel % block_length)); //float* h_areas = new float[nelr]; //int* h_elements_surrounding_elements = new int[nelr*NNB]; //float* h_normals = new float[nelr*NDIM*NNB]; float* h_areas ; int* h_elements_surrounding_elements ; float* h_normals ; h_areas = (float*) memalign(AOCL_ALIGNMENT,nelr*sizeof(float)); h_elements_surrounding_elements = (int*) memalign(AOCL_ALIGNMENT,nelr*NNB*sizeof(int)); h_normals = (float *) memalign(AOCL_ALIGNMENT,nelr*NDIM*NNB*sizeof(float)); //posix_memalign(&h_areas , AOCL_ALIGNMENT, nelr); //posix_memalign(&h_elements_surrounding_elements , AOCL_ALIGNMENT, nelr*NNB); //posix_memalign(&h_normals , AOCL_ALIGNMENT, nelr*NDIM*NNB); // read in data for(int i = 0; i < nel; i++) { file >> h_areas[i]; for(int j = 0; j < NNB; j++) { file >> h_elements_surrounding_elements[i + j*nelr]; if(h_elements_surrounding_elements[i+j*nelr] < 0) h_elements_surrounding_elements[i+j*nelr] = -1; h_elements_surrounding_elements[i + j*nelr]--; //it's coming in with Fortran numbering for(int k = 0; k < NDIM; k++) { file >> h_normals[i + (j + k*NNB)*nelr]; h_normals[i + (j + k*NNB)*nelr] = -h_normals[i + (j + k*NNB)*nelr]; } } } // fill in remaining data int last = nel-1; for(int i = nel; i < nelr; i++) { h_areas[i] = h_areas[last]; for(int j = 0; j < NNB; j++) { // duplicate the last element h_elements_surrounding_elements[i + j*nelr] = h_elements_surrounding_elements[last + j*nelr]; for(int k = 0; k < NDIM; k++) h_normals[last + (j + k*NNB)*nelr] = h_normals[last + (j + k*NNB)*nelr]; } } areas = alloc<float>(context, nelr); upload<float>(commands, areas, h_areas, nelr); elements_surrounding_elements = alloc<int>(context, nelr*NNB); upload<int>(commands, elements_surrounding_elements, h_elements_surrounding_elements, nelr*NNB); normals = alloc<float>(context, nelr*NDIM*NNB); upload<float>(commands, normals, h_normals, nelr*NDIM*NNB); delete[] h_areas; delete[] h_elements_surrounding_elements; delete[] h_normals; } char* kernel_files; int num_kernels = 20; kernel_files = (char*) malloc(sizeof(char*)*num_kernels); strcpy(kernel_files,"cfd_kernel"); program = ocdBuildProgramFromFile(context,device_id,kernel_files, NULL); // Create the compute kernel in the program we wish to run kernel_compute_flux = clCreateKernel(program, "compute_flux", &err); CHKERR(err, "Failed to create a compute kernel!"); // Create the reduce kernel in the program we wish to run kernel_compute_flux_contributions = clCreateKernel(program, "compute_flux_contributions", &err); CHKERR(err, "Failed to create a compute_flux_contributions kernel!"); // Create the reduce kernel in the program we wish to run kernel_compute_step_factor = clCreateKernel(program, "compute_step_factor", &err); CHKERR(err, "Failed to create a compute_step_factor kernel!"); // Create the reduce kernel in the program we wish to run kernel_time_step = clCreateKernel(program, "time_step", &err); CHKERR(err, "Failed to create a time_step kernel!"); // Create the reduce kernel in the program we wish to run kernel_initialize_variables = clCreateKernel(program, "initialize_variables", &err); CHKERR(err, "Failed to create a initialize_variables kernel!"); // Create arrays and set initial conditions cl_mem variables = alloc<cl_float>(context, nelr*NVAR); err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device //err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!"); local_size = 1;//std::min(local_size, (size_t)nelr); global_size = nelr; err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); err = clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 0"); cl_mem old_variables = alloc<float>(context, nelr*NVAR); cl_mem fluxes = alloc<float>(context, nelr*NVAR); cl_mem step_factors = alloc<float>(context, nelr); clFinish(commands); cl_mem fc_momentum_x = alloc<float>(context, nelr*NDIM); cl_mem fc_momentum_y = alloc<float>(context, nelr*NDIM); cl_mem fc_momentum_z = alloc<float>(context, nelr*NDIM); cl_mem fc_density_energy = alloc<float>(context, nelr*NDIM); clFinish(commands); // make sure all memory is floatly allocated before we start timing err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&old_variables); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 1"); err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&fluxes); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 2"); std::cout << "About to memcopy" << std::endl; err = clReleaseMemObject(step_factors); float temp[nelr]; for(int i = 0; i < nelr; i++) temp[i] = 0; step_factors = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * nelr, temp, &err); CHKERR(err, "Unable to memset step_factors"); // make sure CUDA isn't still doing something before we start timing clFinish(commands); // these need to be computed the first time in order to compute time step std::cout << "Starting..." << std::endl; // Begin iterations for(int i = 0; i < iterations; i++) { copy<float>(commands, old_variables, variables, nelr*NVAR); // for the first iteration we compute the time step err = 0; err = clSetKernelArg(kernel_compute_step_factor, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_step_factor, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_compute_step_factor, 2, sizeof(cl_mem), &areas); err |= clSetKernelArg(kernel_compute_step_factor, 3, sizeof(cl_mem), &step_factors); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_step_factor, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Step Factor Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel[kernel_compute_step_factor]!"); for(int j = 0; j < RK; j++) { err = 0; err = clSetKernelArg(kernel_compute_flux_contributions, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_flux_contributions, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_compute_flux_contributions, 2, sizeof(cl_mem), &fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux_contributions, 3, sizeof(cl_mem), &fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux_contributions, 4, sizeof(cl_mem), &fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux_contributions, 5, sizeof(cl_mem), &fc_density_energy); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_flux_contributions, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_flux_contributions work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_flux_contributions, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Contribution Kernel", ocdTempTimer) //compute_flux_contributions(nelr, variables, fc_momentum_x, fc_momentum_y, fc_momentum_z, fc_density_energy); END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_compute_flux_contributions]!"); err = 0; err = clSetKernelArg(kernel_compute_flux, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_flux, 1, sizeof(cl_mem), &elements_surrounding_elements); err |= clSetKernelArg(kernel_compute_flux, 2, sizeof(cl_mem), &normals); err |= clSetKernelArg(kernel_compute_flux, 3, sizeof(cl_mem), &variables); err |= clSetKernelArg(kernel_compute_flux, 4, sizeof(cl_mem), &fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux, 5, sizeof(cl_mem), &fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux, 6, sizeof(cl_mem), &fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux, 7, sizeof(cl_mem), &fc_density_energy); err |= clSetKernelArg(kernel_compute_flux, 8, sizeof(cl_mem), &fluxes); err |= clSetKernelArg(kernel_compute_flux, 9, sizeof(cl_mem), &ff_variable); err |= clSetKernelArg(kernel_compute_flux, 10, sizeof(cl_mem), &ff_fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux, 11, sizeof(cl_mem), &ff_fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux, 12, sizeof(cl_mem), &ff_fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux, 13, sizeof(cl_mem), &ff_fc_density_energy); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_flux, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_flux work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_flux, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_compute_flux]!"); err = 0; err = clSetKernelArg(kernel_time_step, 0, sizeof(int), &j); err |= clSetKernelArg(kernel_time_step, 1, sizeof(int), &nelr); err |= clSetKernelArg(kernel_time_step, 2, sizeof(cl_mem), &old_variables); err |= clSetKernelArg(kernel_time_step, 3, sizeof(cl_mem), &variables); err |= clSetKernelArg(kernel_time_step, 4, sizeof(cl_mem), &step_factors); err |= clSetKernelArg(kernel_time_step, 5, sizeof(cl_mem), &fluxes); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_time_step, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_time_step work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_time_step, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Time Step Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_time_step]!"); } } clFinish(commands); std::cout << "Finished" << std::endl; std::cout << "Saving solution..." << std::endl; dump(commands, variables, nel, nelr); std::cout << "Saved solution..." << std::endl; std::cout << "Cleaning up..." << std::endl; clReleaseProgram(program); clReleaseKernel(kernel_compute_flux); clReleaseKernel(kernel_compute_flux_contributions); clReleaseKernel(kernel_compute_step_factor); clReleaseKernel(kernel_time_step); clReleaseKernel(kernel_initialize_variables); clReleaseCommandQueue(commands); clReleaseContext(context); dealloc<float>(areas); dealloc<int>(elements_surrounding_elements); dealloc<float>(normals); dealloc<float>(variables); dealloc<float>(old_variables); dealloc<float>(fluxes); dealloc<float>(step_factors); dealloc<float>(fc_momentum_x); dealloc<float>(fc_momentum_y); dealloc<float>(fc_momentum_z); dealloc<float>(fc_density_energy); std::cout << "Done..." << std::endl; ocd_finalize(); return 0; }
/* * Main function */ int main(int argc, char** argv) { if (argc < 2) { std::cout << "specify data file name" << std::endl; return 0; } const char* data_file_name = argv[1]; const unsigned long long full_program_start = current_time_ns(); { // set far field conditions { const double angle_of_attack = double(3.1415926535897931 / 180.0) * double(deg_angle_of_attack); ff_variable[VAR_DENSITY] = double(1.4); double ff_pressure = double(1.0); double ff_speed_of_sound = sqrt(GAMMA*ff_pressure / ff_variable[VAR_DENSITY]); double ff_speed = double(ff_mach)*ff_speed_of_sound; cfd_double3 ff_velocity; ff_velocity.x = ff_speed*double(cos((double)angle_of_attack)); ff_velocity.y = ff_speed*double(sin((double)angle_of_attack)); ff_velocity.z = 0.0; ff_variable[VAR_MOMENTUM+0] = ff_variable[VAR_DENSITY] * ff_velocity.x; ff_variable[VAR_MOMENTUM+1] = ff_variable[VAR_DENSITY] * ff_velocity.y; ff_variable[VAR_MOMENTUM+2] = ff_variable[VAR_DENSITY] * ff_velocity.z; ff_variable[VAR_DENSITY_ENERGY] = ff_variable[VAR_DENSITY]*(double(0.5)*(ff_speed*ff_speed)) + (ff_pressure / double(GAMMA-1.0)); cfd_double3 ff_momentum; ff_momentum.x = *(ff_variable+VAR_MOMENTUM+0); ff_momentum.y = *(ff_variable+VAR_MOMENTUM+1); ff_momentum.z = *(ff_variable+VAR_MOMENTUM+2); compute_flux_contribution(ff_variable[VAR_DENSITY], ff_momentum, ff_variable[VAR_DENSITY_ENERGY], ff_pressure, ff_velocity, ff_flux_contribution_momentum_x, ff_flux_contribution_momentum_y, ff_flux_contribution_momentum_z, ff_flux_contribution_density_energy); } int nel; int nelr; // read in domain geometry double* areas; int* elements_surrounding_elements; double* normals; { std::ifstream file(data_file_name); file >> nel; nelr = block_length*((nel / block_length )+ std::min(1, nel % block_length)); areas = new double[nelr]; elements_surrounding_elements = new int[nelr*NNB]; normals = new double[NDIM*NNB*nelr]; // read in data for(int i = 0; i < nel; i++) { file >> areas[i]; for(int j = 0; j < NNB; j++) { file >> elements_surrounding_elements[i*NNB + j]; if(elements_surrounding_elements[i*NNB+j] < 0) elements_surrounding_elements[i*NNB+j] = -1; elements_surrounding_elements[i*NNB + j]--; //it's coming in with Fortran numbering for(int k = 0; k < NDIM; k++) { file >> normals[(i*NNB + j)*NDIM + k]; normals[(i*NNB + j)*NDIM + k] = -normals[(i*NNB + j)*NDIM + k]; } } } // fill in remaining data int last = nel-1; for(int i = nel; i < nelr; i++) { areas[i] = areas[last]; for(int j = 0; j < NNB; j++) { // duplicate the last element elements_surrounding_elements[i*NNB + j] = elements_surrounding_elements[last*NNB + j]; for(int k = 0; k < NDIM; k++) normals[(i*NNB + j)*NDIM + k] = normals[(last*NNB + j)*NDIM + k]; } } } // Create arrays and set initial conditions double* variables = alloc<double>(nelr*NVAR); initialize_variables(nelr, variables); double* old_variables = alloc<double>(nelr*NVAR); double* fluxes = alloc<double>(nelr*NVAR); double* step_factors = alloc<double>(nelr); // these need to be computed the first time in order to compute time step std::cout << "Starting..." << std::endl; // Begin iterations for(int i = 0; i < iterations; i++) { copy(old_variables, variables, nelr*NVAR); // for the first iteration we compute the time step compute_step_factor(nelr, variables, areas, step_factors); for(int j = 0; j < RK; j++) { compute_flux(nelr, elements_surrounding_elements, normals, variables, fluxes); time_step(j, nelr, old_variables, variables, step_factors, fluxes); } } std::cout << "Saving solution..." << std::endl; dump(variables, nel, nelr); std::cout << "Saved solution..." << std::endl; std::cout << "Cleaning up..." << std::endl; dealloc<double>(areas); dealloc<int>(elements_surrounding_elements); dealloc<double>(normals); dealloc<double>(variables); dealloc<double>(old_variables); dealloc<double>(fluxes); dealloc<double>(step_factors); } ; const unsigned long long full_program_end = current_time_ns(); printf("full_program %llu ns\n", full_program_end - full_program_start); std::cout << "Done..." << std::endl; return 0; }
void compute_flux(int nelr, int* elements_surrounding_elements, double* normals, double* variables, double* fluxes) { double smoothing_coefficient = double(0.2f); { const unsigned long long parallel_for_start = current_time_ns(); #pragma omp parallel for default(shared) schedule(static) for(int i = 0; i < nelr; i++) { int j, nb; cfd_double3 normal; double normal_len; double factor; double density_i = variables[NVAR*i + VAR_DENSITY]; cfd_double3 momentum_i; momentum_i.x = variables[NVAR*i + (VAR_MOMENTUM+0)]; momentum_i.y = variables[NVAR*i + (VAR_MOMENTUM+1)]; momentum_i.z = variables[NVAR*i + (VAR_MOMENTUM+2)]; double density_energy_i = variables[NVAR*i + VAR_DENSITY_ENERGY]; cfd_double3 velocity_i; compute_velocity(density_i, momentum_i, velocity_i); double speed_sqd_i = compute_speed_sqd(velocity_i); double speed_i = std::sqrt(speed_sqd_i); double pressure_i = compute_pressure(density_i, density_energy_i, speed_sqd_i); double speed_of_sound_i = compute_speed_of_sound(density_i, pressure_i); cfd_double3 flux_contribution_i_momentum_x, flux_contribution_i_momentum_y, flux_contribution_i_momentum_z; cfd_double3 flux_contribution_i_density_energy; compute_flux_contribution(density_i, momentum_i, density_energy_i, pressure_i, velocity_i, flux_contribution_i_momentum_x, flux_contribution_i_momentum_y, flux_contribution_i_momentum_z, flux_contribution_i_density_energy); double flux_i_density = double(0.0); cfd_double3 flux_i_momentum; flux_i_momentum.x = double(0.0); flux_i_momentum.y = double(0.0); flux_i_momentum.z = double(0.0); double flux_i_density_energy = double(0.0); cfd_double3 velocity_nb; double density_nb, density_energy_nb; cfd_double3 momentum_nb; cfd_double3 flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y, flux_contribution_nb_momentum_z; cfd_double3 flux_contribution_nb_density_energy; double speed_sqd_nb, speed_of_sound_nb, pressure_nb; for(j = 0; j < NNB; j++) { nb = elements_surrounding_elements[i*NNB + j]; normal.x = normals[(i*NNB + j)*NDIM + 0]; normal.y = normals[(i*NNB + j)*NDIM + 1]; normal.z = normals[(i*NNB + j)*NDIM + 2]; normal_len = std::sqrt(normal.x*normal.x + normal.y*normal.y + normal.z*normal.z); if(nb >= 0) // a legitimate neighbor { density_nb = variables[nb*NVAR + VAR_DENSITY]; momentum_nb.x = variables[nb*NVAR + (VAR_MOMENTUM+0)]; momentum_nb.y = variables[nb*NVAR + (VAR_MOMENTUM+1)]; momentum_nb.z = variables[nb*NVAR + (VAR_MOMENTUM+2)]; density_energy_nb = variables[nb*NVAR + VAR_DENSITY_ENERGY]; compute_velocity(density_nb, momentum_nb, velocity_nb); speed_sqd_nb = compute_speed_sqd(velocity_nb); pressure_nb = compute_pressure(density_nb, density_energy_nb, speed_sqd_nb); speed_of_sound_nb = compute_speed_of_sound(density_nb, pressure_nb); compute_flux_contribution(density_nb, momentum_nb, density_energy_nb, pressure_nb, velocity_nb, flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y, flux_contribution_nb_momentum_z, flux_contribution_nb_density_energy); // artificial viscosity factor = -normal_len*smoothing_coefficient*double(0.5)*(speed_i + std::sqrt(speed_sqd_nb) + speed_of_sound_i + speed_of_sound_nb); flux_i_density += factor*(density_i-density_nb); flux_i_density_energy += factor*(density_energy_i-density_energy_nb); flux_i_momentum.x += factor*(momentum_i.x-momentum_nb.x); flux_i_momentum.y += factor*(momentum_i.y-momentum_nb.y); flux_i_momentum.z += factor*(momentum_i.z-momentum_nb.z); // accumulate cell-centered fluxes factor = double(0.5)*normal.x; flux_i_density += factor*(momentum_nb.x+momentum_i.x); flux_i_density_energy += factor*(flux_contribution_nb_density_energy.x+flux_contribution_i_density_energy.x); flux_i_momentum.x += factor*(flux_contribution_nb_momentum_x.x+flux_contribution_i_momentum_x.x); flux_i_momentum.y += factor*(flux_contribution_nb_momentum_y.x+flux_contribution_i_momentum_y.x); flux_i_momentum.z += factor*(flux_contribution_nb_momentum_z.x+flux_contribution_i_momentum_z.x); factor = double(0.5)*normal.y; flux_i_density += factor*(momentum_nb.y+momentum_i.y); flux_i_density_energy += factor*(flux_contribution_nb_density_energy.y+flux_contribution_i_density_energy.y); flux_i_momentum.x += factor*(flux_contribution_nb_momentum_x.y+flux_contribution_i_momentum_x.y); flux_i_momentum.y += factor*(flux_contribution_nb_momentum_y.y+flux_contribution_i_momentum_y.y); flux_i_momentum.z += factor*(flux_contribution_nb_momentum_z.y+flux_contribution_i_momentum_z.y); factor = double(0.5)*normal.z; flux_i_density += factor*(momentum_nb.z+momentum_i.z); flux_i_density_energy += factor*(flux_contribution_nb_density_energy.z+flux_contribution_i_density_energy.z); flux_i_momentum.x += factor*(flux_contribution_nb_momentum_x.z+flux_contribution_i_momentum_x.z); flux_i_momentum.y += factor*(flux_contribution_nb_momentum_y.z+flux_contribution_i_momentum_y.z); flux_i_momentum.z += factor*(flux_contribution_nb_momentum_z.z+flux_contribution_i_momentum_z.z); } else if(nb == -1) // a wing boundary { flux_i_momentum.x += normal.x*pressure_i; flux_i_momentum.y += normal.y*pressure_i; flux_i_momentum.z += normal.z*pressure_i; } else if(nb == -2) // a far field boundary { factor = double(0.5)*normal.x; flux_i_density += factor*(ff_variable[VAR_MOMENTUM+0]+momentum_i.x); flux_i_density_energy += factor*(ff_flux_contribution_density_energy.x+flux_contribution_i_density_energy.x); flux_i_momentum.x += factor*(ff_flux_contribution_momentum_x.x + flux_contribution_i_momentum_x.x); flux_i_momentum.y += factor*(ff_flux_contribution_momentum_y.x + flux_contribution_i_momentum_y.x); flux_i_momentum.z += factor*(ff_flux_contribution_momentum_z.x + flux_contribution_i_momentum_z.x); factor = double(0.5)*normal.y; flux_i_density += factor*(ff_variable[VAR_MOMENTUM+1]+momentum_i.y); flux_i_density_energy += factor*(ff_flux_contribution_density_energy.y+flux_contribution_i_density_energy.y); flux_i_momentum.x += factor*(ff_flux_contribution_momentum_x.y + flux_contribution_i_momentum_x.y); flux_i_momentum.y += factor*(ff_flux_contribution_momentum_y.y + flux_contribution_i_momentum_y.y); flux_i_momentum.z += factor*(ff_flux_contribution_momentum_z.y + flux_contribution_i_momentum_z.y); factor = double(0.5)*normal.z; flux_i_density += factor*(ff_variable[VAR_MOMENTUM+2]+momentum_i.z); flux_i_density_energy += factor*(ff_flux_contribution_density_energy.z+flux_contribution_i_density_energy.z); flux_i_momentum.x += factor*(ff_flux_contribution_momentum_x.z + flux_contribution_i_momentum_x.z); flux_i_momentum.y += factor*(ff_flux_contribution_momentum_y.z + flux_contribution_i_momentum_y.z); flux_i_momentum.z += factor*(ff_flux_contribution_momentum_z.z + flux_contribution_i_momentum_z.z); } } fluxes[i*NVAR + VAR_DENSITY] = flux_i_density; fluxes[i*NVAR + (VAR_MOMENTUM+0)] = flux_i_momentum.x; fluxes[i*NVAR + (VAR_MOMENTUM+1)] = flux_i_momentum.y; fluxes[i*NVAR + (VAR_MOMENTUM+2)] = flux_i_momentum.z; fluxes[i*NVAR + VAR_DENSITY_ENERGY] = flux_i_density_energy; } ; const unsigned long long parallel_for_end = current_time_ns(); printf("pragma186_omp_parallel %llu ns\n", parallel_for_end - parallel_for_start); } }