CCudaTimeMeasure::CCudaTimeMeasure(cudaStream_t csStreamID/* = 0*/): m_ceStartEvent(NULL), m_ceStopEvent(NULL), m_csStreamID(csStreamID) { cudaCheckError(cudaEventCreate(&m_ceStartEvent)); cudaCheckError(cudaEventCreate(&m_ceStopEvent)); cudaCheckError(cudaEventRecord(m_ceStartEvent, m_csStreamID)); }
float CCudaTimeMeasure::GetTimeout(bool bResetStart/* = false*/) { cudaCheckError(cudaEventRecord(m_ceStopEvent, m_csStreamID)); cudaCheckError(cudaEventSynchronize(m_ceStopEvent)); float fElapsedTime = 0.0f; cudaCheckError(cudaEventElapsedTime(&fElapsedTime, m_ceStartEvent, m_ceStopEvent)); if (bResetStart) { cudaCheckError(cudaEventRecord(m_ceStartEvent, m_csStreamID)); } return fElapsedTime; }
void XZHydrostatic_GeoPotential<EvalT, Traits>:: evaluateFields(typename Traits::EvalData workset) { #ifndef ALBANY_KOKKOS_UNDER_DEVELOPMENT for (int cell=0; cell < workset.numCells; ++cell) { for (int node=0; node < numNodes; ++node) { for (int level=0; level < numLevels; ++level) { ScalarT sum = PhiSurf(cell,node) + 0.5 * Pi(cell,node,level) * E.delta(level) / density(cell,node,level); for (int j=level+1; j < numLevels; ++j) sum += Pi(cell,node,j) * E.delta(j) / density(cell,node,j); Phi(cell,node,level) = sum; //std::cout <<"Inside GeoP, cell, node, PhiSurf(cell,node)="<<cell<< //", "<<node<<", "<<PhiSurf(cell,node) <<std::endl; } } } /* OG Debugging statements std::cout << "Printing PHI at level 0 ----------------------------------------- \n"; //for(int level=0; level < numLevels; ++level){ for (int node=0; node < numNodes; ++node) { //std::cout << "lev= " << level << ", phi = " << Phi(23,0,level) <<"\n"; std::cout << "node = " << node << ", phi = " << Phi(23,node,0) <<"\n"; } //}*/ #else Kokkos::parallel_for(XZHydrostatic_GeoPotential_Policy(0,workset.numCells),*this); cudaCheckError(); #endif }
void CCudaDeviceProperties::Init() { const int nDeviceIndex = GetDeviceIndex(); int nDevicesCount; cudaCheckError(cudaGetDeviceCount(&nDevicesCount)); if (nDeviceIndex < nDevicesCount) { cudaCheckError(cudaGetDeviceProperties(&m_dpProperties, nDeviceIndex)); } else { cout << "Incorrect device index " << nDeviceIndex << ". Proper index in range 0 .. " << nDevicesCount << endl; } }
void XZHydrostatic_Omega<EvalT, Traits>:: evaluateFields(typename Traits::EvalData workset) { #ifndef ALBANY_KOKKOS_UNDER_DEVELOPMENT for (int cell=0; cell < workset.numCells; ++cell) { for (int qp=0; qp < numQPs; ++qp) { for (int level=0; level < numLevels; ++level) { ScalarT sum = -0.5*divpivelx(cell,qp,level) * E.delta(level); for (int j=0; j<level; ++j) sum -= divpivelx(cell,qp,j) * E.delta(j); for (int dim=0; dim < numDims; ++dim) sum += Velocity(cell,qp,level,dim)*gradp(cell,qp,level,dim); omega(cell,qp,level) = sum/(Cpstar(cell,qp,level)*density(cell,qp,level)); } } } #else Kokkos::parallel_for(XZHydrostatic_Omega_Policy(0,workset.numCells),*this); cudaCheckError(); #endif }
void Hydrostatic_Velocity<EvalT, Traits>:: evaluateFields(typename Traits::EvalData workset) { time = workset.current_time; #ifndef ALBANY_KOKKOS_UNDER_DEVELOPMENT //*out << "Aeras::Hydrostatic_Velocity time = " << time << std::endl; switch (adv_type) { case UNKNOWN: //velocity is an unknown that we solve for (not prescribed) { for (int cell=0; cell < workset.numCells; ++cell) for (int node=0; node < numNodes; ++node) for (int level=0; level < numLevels; ++level) for (int dim=0; dim < numDims; ++dim) Velocity(cell,node,level,dim) = Velx(cell,node,level,dim); } break; case PRESCRIBED_1_1: //velocity is prescribed to that of 1-1 test { for (int cell=0; cell < workset.numCells; ++cell) { for (int node=0; node < numNodes; ++node) { const MeshScalarT lambda = sphere_coord(cell, node, 0); const MeshScalarT theta = sphere_coord(cell, node, 1); ScalarT lambdap = lambda - 2.0*PI*time/tau; for (int level=0; level < numLevels; ++level) { ScalarT Ua = k*sin(lambdap)*sin(lambdap)*sin(2.0*theta)*cos(PI*time/tau) + (2.0*PI*earthRadius/tau)*cos(theta); ScalarT Va = k*sin(2.0*lambdap)*cos(theta)*cos(PI*time/tau); ScalarT B = E.B(level); ScalarT p = pressure(cell,node,level); ScalarT taper = - exp( (p - p0)/(B*ptop) ) + exp( (ptop - p )/(B*ptop) ); ScalarT Ud = (omega0*earthRadius)/(B*ptop) *cos(lambdap)*cos(theta)*cos(theta)*cos(2.0*PI*time/tau)*taper; Velocity(cell,node,level,0) = Ua + Ud; Velocity(cell,node,level,1) = Va; } } } } break; case PRESCRIBED_1_2: //velocity is prescribed to that of 1-2 test { //FIXME: Pete, Tom - please fill in for (int cell=0; cell < workset.numCells; ++cell) { for (int node=0; node < numNodes; ++node) { const MeshScalarT lambda = sphere_coord(cell, node, 0); const MeshScalarT theta = sphere_coord(cell, node, 1); for (int level=0; level < numLevels; ++level) { for (int dim=0; dim < numDims; ++dim) { Velocity(cell,node,level,dim) = 0.0; //FIXME } } } } } break; } #else switch (adv_type) { case UNKNOWN: //velocity is an unknown that we solve for (not prescribed) { Kokkos::parallel_for(Hydrostatic_Velocity_Policy(0,workset.numCells),*this); cudaCheckError(); break; } case PRESCRIBED_1_1: //velocity is prescribed to that of 1-1 test { Kokkos::parallel_for(Hydrostatic_Velocity_PRESCRIBED_1_1_Policy(0,workset.numCells),*this); cudaCheckError(); break; } case PRESCRIBED_1_2: //velocity is prescribed to that of 1-2 test { Kokkos::parallel_for(Hydrostatic_Velocity_PRESCRIBED_1_2_Policy(0,workset.numCells),*this); cudaCheckError(); break; } } #endif }
int main() { //SPECIFY PARMETERS, SEE CONFIG.H FILE FOR ENUM OPTIONS Configurations config; //Algorithm for solving for h config.algorithm_solve_h = NEWTON_BRUTE_FORCE; //Total time in years config.total_time = 20; // Initial time step in seconds config.initial_time_step = 30000;//12592000; // Injection stop config.injection_time = 100; // Pressure update interval in years config.pressure_update_injection = 2; config.pressure_update_migration = 5; // Permeability type config.perm_type = PERM_CONSTANT; // Name of formation config.formation_name = UTSIRA; // Parameter beta for the Corey permeability function config.beta = 0.4; ////////////////////////////////////////////////////////////////////////// if (config.formation_name == UTSIRA){ config.formation = "utsira"; config.formation_dir = "Utsira"; } else if(config.formation_name == JOHANSEN){ config.formation = "johansen"; config.formation_dir = "Johansen"; } else { printf("ERROR: No data for this formation"); } //Initialize some variables int nx, ny, nz; float dt, dz; float t, tf; int year = 60*60*24*365; // Device properties cudaSetDevice(0); int device; cudaGetDevice(&device); cudaDeviceProp p; cudaGetDeviceProperties(&p, device); printf("Device name: %s\n", p.name); // Set directory path of output and input files size_t buff_size= 100; char buffer[buff_size]; const char* path; readlink("/proc/self/exe", buffer, buff_size); printf("PATH %s", buffer); char* output_dir_path = "FullyIntegratedVESimulatorMATLAB/SimulationData/ResultData/"; char* input_dir_path = "FullyIntegratedVESimulatorMATLAB/SimulationData/FormationData/"; std::cout << "Trying to open " << input_dir_path << std::endl; std::cout << "Output will end up in " << output_dir_path << std::endl; // Filename strings char dir_input[300]; char filename_input[300]; char dir_output[300]; strcpy(dir_input, input_dir_path); strcat(dir_input, config.formation_dir); strcat(dir_input, "/"); strcpy(dir_output, output_dir_path); strcat(dir_output, config.formation_dir); strcat(dir_output, "/"); strcpy(filename_input, dir_input); strcat (filename_input, "dimensions.mat"); // Output txt files with results FILE* matlab_file_h; FILE* matlab_file_coarse_satu; FILE* matlab_file_volume; // Create output files for coarse saturation, interface height and volume // Files are stored in the directory createOutputFiles(matlab_file_h, matlab_file_coarse_satu, matlab_file_volume, dir_output); readDimensionsFromMATLABFile(filename_input, nx, ny, nz); InitialConditions IC(nx, ny, 5); printf("nx: %i, ny: %i nz: %i dt: %.10f", nx, ny, nz, dt); // Cpu pointers to store formation data from MATLAB CpuPtr_2D H(nx, ny, 0, true); CpuPtr_2D top_surface(nx, ny, 0, true); CpuPtr_2D h(nx, ny, 0, true); CpuPtr_2D normal_z(nx, ny, 0, true); CpuPtr_3D perm3D(nx, ny, nz + 1, 0, true); CpuPtr_3D poro3D(nx, ny, nz + 1, 0, true); CpuPtr_2D pv(nx, ny, 0, true); CpuPtr_2D flux_north(nx, ny, IC.border, true); CpuPtr_2D flux_east(nx, ny, IC.border, true); CpuPtr_2D source(nx, ny, 0, true); CpuPtr_2D grav_north(nx, ny, 0, true); CpuPtr_2D grav_east(nx, ny, 0, true); CpuPtr_2D K_face_north(nx, ny, 0, true); CpuPtr_2D K_face_east(nx, ny, 0, true); CpuPtr_2D active_east(nx, ny, 0, true); CpuPtr_2D active_north(nx, ny, 0, true); CpuPtr_2D volume(nx, ny, 0,true); strcpy(filename_input, dir_input); strcat (filename_input, "data.mat"); readFormationDataFromMATLABFile(filename_input, H.getPtr(), top_surface.getPtr(), h.getPtr(), normal_z.getPtr(), perm3D.getPtr(), poro3D.getPtr(), pv.getPtr(), flux_north.getPtr(), flux_east.getPtr(), grav_north.getPtr(), grav_east.getPtr(), K_face_north.getPtr(), K_face_east.getPtr(), dz); strcpy(filename_input, dir_input); strcat (filename_input, "active_cells.mat"); readActiveCellsFromMATLABFile(filename_input, active_east.getPtr(), active_north.getPtr()); //readDtTableFromMATLABFile(filename, dt_table, size_dt_table); Engine *ep; if (!(ep = engOpen(""))) { fprintf(stderr, "\nCan't start MATLAB engine\n"); return EXIT_FAILURE; } startMatlabEngine(ep, config.formation_dir); // Create double precision array for the data exchange between the GPU program and the MATLAB program mxArray *h_matrix = NULL, *flux_east_matrix = NULL, *flux_north_matrix=NULL; mxArray *source_matrix = NULL, *open_well = NULL; open_well = mxCreateLogicalScalar(true); engPutVariable(ep, "open_well", open_well); double * h_matlab_matrix; h_matlab_matrix = new double[nx*ny]; h_matrix = mxCreateDoubleMatrix(nx, ny, mxREAL); flux_east_matrix = mxCreateDoubleMatrix(nx+2*IC.border,ny+2*IC.border,mxREAL); flux_north_matrix = mxCreateDoubleMatrix(nx+2*IC.border,ny+2*IC.border,mxREAL); source_matrix = mxCreateDoubleMatrix(nx, ny, mxREAL); // Cpu Pointer to store the results CpuPtr_2D zeros(nx, ny, 0, true); //Initial Conditions IC.dz = dz; IC.createnIntervalsTable(H); IC.createScalingParameterTable(H, config.beta); IC.createInitialCoarseSatu(H, h); IC.computeAllGridBlocks(); IC.createDtVec(); // Create mask for sparse grid on GPU std::vector<int> active_block_indexes; std::vector<int> active_block_indexes_flux; int n_active_blocks = 0; int n_active_blocks_flux = 0; createGridMask(H, IC.grid, IC.block, nx, ny, active_block_indexes, n_active_blocks); createGridMaskFlux(H, IC.grid_flux, IC.block_flux, nx, ny, active_block_indexes_flux, n_active_blocks_flux); // Print grid mask properties printf("\n nBlocks: %i nActiveBlocks: %i fraction: %.5f\n", IC.grid.x * IC.grid.y, n_active_blocks, (float) n_active_blocks / (IC.grid.x * IC.grid.y)); printf("nBlocks: %i nActiveBlocks: %i fraction: %.5f\n", IC.grid_flux.x * IC.grid_flux.y, n_active_blocks_flux, (float) n_active_blocks_flux / (IC.grid_flux.x * IC.grid_flux.y)); printf("dz: %.3f\n", IC.dz); dim3 new_sparse_grid(n_active_blocks, 1, 1); dim3 new_sparse_grid_flux(n_active_blocks_flux, 1, 1); CommonArgs common_args; CoarseMobIntegrationKernelArgs coarse_mob_int_args; CoarsePermIntegrationKernelArgs coarse_perm_int_args; FluxKernelArgs flux_kernel_args; TimeIntegrationKernelArgs time_int_kernel_args; TimestepReductionKernelArgs time_red_kernel_args; SolveForhProblemCellsKernelArgs solve_problem_cells_args; printf("Cuda error 0.5: %s\n", cudaGetErrorString(cudaGetLastError())); initAllocate(&common_args, &coarse_perm_int_args, &coarse_mob_int_args, &flux_kernel_args, &time_int_kernel_args, &time_red_kernel_args, &solve_problem_cells_args); h.convertToDoublePointer(h_matlab_matrix); memcpy((void *)mxGetPr(h_matrix), (void *)h_matlab_matrix, sizeof(double)*nx*ny); engPutVariable(ep, "h_matrix", h_matrix); printf("Cuda error 1: %s\n", cudaGetErrorString(cudaGetLastError())); // Allocate and set data on the GPU GpuPtr_3D perm3D_device(nx, ny, nz + 1, 0, perm3D.getPtr()); GpuPtr_2D Lambda_c_device(nx, ny, 0, zeros.getPtr()); GpuPtr_2D Lambda_b_device(nx, ny, 0, zeros.getPtr()); GpuPtr_2D dLambda_c_device(nx, ny, 0, zeros.getPtr()); GpuPtr_2D dLambda_b_device(nx, ny, 0, zeros.getPtr()); GpuPtr_2D scaling_parameter_C_device(nx, ny, 0, IC.scaling_parameter.getPtr()); GpuPtr_2D K_device(nx, ny, 0, zeros.getPtr()); GpuPtr_2D H_device(nx, ny, 0, H.getPtr()); GpuPtr_2D h_device(nx, ny, 0, h.getPtr()); GpuPtr_2D top_surface_device(nx, ny, 0, top_surface.getPtr()); GpuPtr_2D z_diff_east_device(nx, ny, 0, zeros.getPtr()); GpuPtr_2D z_diff_north_device(nx, ny, 0, zeros.getPtr()); GpuPtr_2D nInterval_device(nx, ny, 0, IC.nIntervals.getPtr()); GpuPtr_2D U_x_device(nx, ny, IC.border, flux_east.getPtr()); GpuPtr_2D U_y_device(nx, ny, IC.border, flux_north.getPtr()); GpuPtr_2D source_device(nx, ny, 0, source.getPtr()); GpuPtr_2D K_face_east_device(nx, ny, 0, K_face_east.getPtr()); GpuPtr_2D K_face_north_device(nx, ny, 0, K_face_north.getPtr()); GpuPtr_2D grav_east_device(nx, ny, 0, grav_east.getPtr()); GpuPtr_2D grav_north_device(nx, ny, 0, grav_north.getPtr()); GpuPtr_2D normal_z_device(nx, ny, 0, normal_z.getPtr()); GpuPtr_2D R_device(nx, ny, 0, zeros.getPtr()); GpuPtr_2D pv_device(nx, ny, 0, pv.getPtr()); GpuPtr_2D output_test_device(nx, ny, 0, zeros.getPtr()); GpuPtr_2D active_east_device(nx, ny, 0, active_east.getPtr()); GpuPtr_2D active_north_device(nx, ny, 0, active_north.getPtr()); GpuPtr_2D vol_old_device(nx, ny, 0, zeros.getPtr()); GpuPtr_2D vol_new_device(nx, ny, 0, zeros.getPtr()); GpuPtr_2D coarse_satu_device(nx, ny, 0, IC.initial_coarse_satu_c.getPtr()); GpuPtr_1D global_dt_device(3, IC.global_time_data); GpuPtr_1D dt_vector_device(IC.nElements, IC.dt_vector); GpuPtrInt_1D active_block_indexes_device(n_active_blocks, &active_block_indexes[0]); GpuPtrInt_1D active_block_indexes_flux_device(n_active_blocks_flux, &active_block_indexes_flux[0]); cudaCheckError(); setCommonArgs(&common_args, IC.p_ci, IC.delta_rho, IC.g, IC.mu_c, IC.mu_b, IC.s_c_res, IC.s_b_res, IC.lambda_end_point_c, IC.lambda_end_point_b, active_east_device.getRawPtr(), active_north_device.getRawPtr(), H_device.getRawPtr(), pv_device.getRawPtr(), nx, ny, IC.border); setupGPU(&common_args); setCoarsePermIntegrationKernelArgs(&coarse_perm_int_args, K_device.getRawPtr(), perm3D_device.getRawPtr(), nInterval_device.getRawPtr(), IC.dz); callCoarsePermIntegrationKernel(IC.grid, IC.block, &coarse_perm_int_args); setCoarseMobIntegrationKernelArgs(&coarse_mob_int_args, Lambda_c_device.getRawPtr(), Lambda_b_device.getRawPtr(), dLambda_c_device.getRawPtr(), dLambda_b_device.getRawPtr(), h_device.getRawPtr(), perm3D_device.getRawPtr(), K_device.getRawPtr(), nInterval_device.getRawPtr(), scaling_parameter_C_device.getRawPtr(), active_block_indexes_device.getRawPtr(), IC.p_ci, IC.dz, config.perm_type); setFluxKernelArgs(&flux_kernel_args, Lambda_c_device.getRawPtr(),Lambda_b_device.getRawPtr(), dLambda_c_device.getRawPtr(), dLambda_b_device.getRawPtr(), U_x_device.getRawPtr(), U_y_device.getRawPtr(), source_device.getRawPtr(), h_device.getRawPtr(),top_surface_device.getRawPtr(), normal_z_device.getRawPtr(), K_face_east_device.getRawPtr(), K_face_north_device.getRawPtr(), grav_east_device.getRawPtr(), grav_north_device.getRawPtr(), R_device.getRawPtr(), dt_vector_device.getRawPtr(), active_block_indexes_flux_device.getRawPtr(), output_test_device.getRawPtr()); setTimestepReductionKernelArgs(&time_red_kernel_args, TIME_THREADS, IC.nElements, global_dt_device.getRawPtr(), IC.cfl_scale, dt_vector_device.getRawPtr()); //CUDPP CUDPPHandle plan; unsigned int* d_isValid; int* d_in; int* d_out; size_t* d_numValid = NULL; size_t numValidElements; unsigned int numElements=nx*ny; cudaCheckError(); cudaMalloc((void**) &d_isValid, sizeof(unsigned int)*numElements); cudaMalloc((void**) &d_in, sizeof(int)*numElements); cudaMalloc((void**) &d_out, sizeof(int)*numElements); cudaMalloc((void**) &d_numValid, sizeof(size_t)); cudaCheckError(); CUDPPHandle theCudpp; cudppCreate(&theCudpp); setUpCUDPP(theCudpp, plan, nx, ny, d_isValid, d_in, d_out, numElements); printf("\nCudaMemcpy error: %s", cudaGetErrorString(cudaGetLastError())); cudaCheckError(); setTimeIntegrationKernelArgs(&time_int_kernel_args, global_dt_device.getRawPtr(), IC.integral_res,pv_device.getRawPtr(), h_device.getRawPtr(), R_device.getRawPtr(),coarse_satu_device.getRawPtr(), scaling_parameter_C_device.getRawPtr(), vol_old_device.getRawPtr(), vol_new_device.getRawPtr(), d_isValid, d_in); cudaCheckError(); setSolveForhProblemCellsKernelArgs(&solve_problem_cells_args, h_device.getRawPtr(), coarse_satu_device.getRawPtr(), scaling_parameter_C_device.getRawPtr(), d_out, IC.integral_res, d_numValid); cudaCheckError(); //Compute start volume callCoarseMobIntegrationKernel(new_sparse_grid, IC.block, IC.grid.x, &coarse_mob_int_args); callFluxKernel(new_sparse_grid_flux, IC.block_flux, IC.grid_flux.x, &flux_kernel_args); callTimeIntegration(new_sparse_grid, IC.block, IC.grid.x, &time_int_kernel_args); cudaCheckError(); vol_old_device.download(volume.getPtr(), 0, 0, nx, ny); float total_volume_old = computeTotalVolume(volume, nx, ny); t = 0; double t2 = 0; tf = IC.global_time_data[2]; int iter_outer_loop = 0; int iter_inner_loop = 0; int iter_total = 0; int iter_total_lim = 5000; float time = 0; float injected = 0; int table_index = 1; double time_start = getWallTime(); double time_start_iter; double total_time_gpu = 0; while (time < config.total_time && iter_total < iter_total_lim){ t = 0; iter_inner_loop = 0; h_device.download(h.getPtr(), 0, 0, nx, ny); h.convertToDoublePointer(h_matlab_matrix); memcpy((void *)mxGetPr(h_matrix), (void *)h_matlab_matrix, sizeof(double)*nx*ny); engPutVariable(ep, "h_matrix", h_matrix); if (time >= config.injection_time){ open_well = mxCreateLogicalScalar(false); engPutVariable(ep, "open_well", open_well); IC.global_time_data[2] = 31536000*config.pressure_update_migration; tf = IC.global_time_data[2]; cudaMemcpy(global_dt_device.getRawPtr(), IC.global_time_data, sizeof(float)*3, cudaMemcpyHostToDevice); } // MATLAB call engEvalString(ep, "[source, east_flux, north_flux] = pressureFunctionToRunfromCpp(h_matrix, variables, open_well);"); // Get variables from MATLABs pressure solver flux_east_matrix = engGetVariable(ep, "east_flux"); flux_north_matrix = engGetVariable(ep, "north_flux"); source_matrix = engGetVariable(ep, "source"); memcpy((void *)flux_east.getPtr(), (void *)mxGetPr(flux_east_matrix), sizeof(float)*(nx+2*IC.border)*(ny+2*IC.border)); memcpy((void *)flux_north.getPtr(), (void *)mxGetPr(flux_north_matrix), sizeof(float)*(nx+2*IC.border)*(ny+2*IC.border)); memcpy((void *)source.getPtr(), (void *)mxGetPr(source_matrix), sizeof(float)*nx*ny); source_device.upload(source.getPtr(), 0, 0, nx, ny); U_x_device.upload(flux_east.getPtr(), 0, 0, nx+2*IC.border, ny+2*IC.border); U_y_device.upload(flux_north.getPtr(), 0, 0,nx+2*IC.border, ny+2*IC.border); time_start_iter = getWallTime(); while (t < tf && iter_total < iter_total_lim){ cudaCheckError(); callCoarseMobIntegrationKernel(new_sparse_grid, IC.block, IC.grid.x, &coarse_mob_int_args); cudaCheckError(); callFluxKernel(new_sparse_grid_flux, IC.block_flux, IC.grid_flux.x, &flux_kernel_args); cudaCheckError(); callTimestepReductionKernel(TIME_THREADS, &time_red_kernel_args); cudaCheckError(); // Set the initial time step if (iter_total < 1 && iter_inner_loop == 0){ IC.global_time_data[0] = config.initial_time_step; IC.global_time_data[1] += config.initial_time_step; cudaMemcpy(global_dt_device.getRawPtr(), IC.global_time_data, sizeof(float)*3, cudaMemcpyHostToDevice); } //For precomputed time step insertion /* IC.global_time_data[0] = (float)dt_table[table_index]; IC.global_time_data[1] += (float)dt_table[table_index]; cudaMemcpy(global_dt_device.getRawPtr(), IC.global_time_data, sizeof(float)*3, cudaMemcpyHostToDevice); */ cudaCheckError(); if (config.algorithm_solve_h == BRUTE_FORCE) callTimeIntegration(new_sparse_grid, IC.block, IC.grid.x, &time_int_kernel_args); else if (config.algorithm_solve_h == NEWTON_BRUTE_FORCE){ callTimeIntegrationNewton(new_sparse_grid, IC.block, IC.grid.x, &time_int_kernel_args); cudppCompact(plan, d_out, d_numValid, d_in, d_isValid, numElements); callSolveForhProblemCells(IC.grid_pc, IC.block_pc, &solve_problem_cells_args); } else if (config.algorithm_solve_h == NEWTON_BISECTION){ callTimeIntegrationNewton(new_sparse_grid, IC.block, IC.grid.x, &time_int_kernel_args); cudppCompact(plan, d_out, d_numValid, d_in, d_isValid, numElements); callSolveForhProblemCellsBisection(IC.grid_pc, IC.block_pc, &solve_problem_cells_args); } cudaCheckError(); cudaMemcpy(IC.global_time_data, global_dt_device.getRawPtr(), sizeof(float)*3, cudaMemcpyDeviceToHost); cudaCheckError(); // Keep track of injected volume, insert injection coordinate and rate injected += IC.global_time_data[0]*source(50,50); t += IC.global_time_data[0]; //table_index++; iter_inner_loop++; iter_total++; } total_time_gpu += getWallTime() - time_start_iter; printf("Total time in years: %.3f time in this round %.3f timestep %.3f GPU time %.3f time per iter %.8f \n", time, t, IC.global_time_data[0], getWallTime() - time_start_iter, (getWallTime() - time_start_iter)/iter_inner_loop); time += t/(year); iter_outer_loop++; } printf("Elapsed time program: %.5f gpu part: %.5f", getWallTime() - time_start, total_time_gpu); engClose(ep); h_device.download(zeros.getPtr(), 0, 0, nx, ny); zeros.printToFile(matlab_file_h); coarse_satu_device.download(zeros.getPtr(), 0, 0, nx, ny); // Divide by the formation thickness H to get the actual coarse satu for (int i = 0; i < nx; i++){ for (int j = 0; j < ny; j++){ if (H(i,j) != 0) zeros(i,j) = zeros(i,j)/H(i,j); } } zeros.printToFile(matlab_file_coarse_satu); vol_new_device.download(zeros.getPtr(), 0, 0, nx, ny); zeros.printToFile(matlab_file_volume); float total_volume_new = computeTotalVolume(zeros, nx, ny); printf("total volume new %.2f total volume old %.2f injected %.1f injected fraction %.10f", total_volume_new, total_volume_old, injected, (total_volume_new-injected)/(injected)); printf("volume fraction %.10f",(total_volume_new-total_volume_old)/(total_volume_old)); printf("\nCudaMemcpy error: %s", cudaGetErrorString(cudaGetLastError())); printf("FINITO precise time %.6f iter_total %i", time, iter_total); }
void cg_solve(OperatorType& A, const VectorType& b, VectorType& x, Matvec matvec, typename OperatorType::LocalOrdinalType max_iter, typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& tolerance, typename OperatorType::LocalOrdinalType& num_iters, typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& normr, timer_type* my_cg_times) { typedef typename OperatorType::ScalarType ScalarType; typedef typename OperatorType::GlobalOrdinalType GlobalOrdinalType; typedef typename OperatorType::LocalOrdinalType LocalOrdinalType; typedef typename TypeTraits<ScalarType>::magnitude_type magnitude_type; timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0, tMATVECDOT = 0; timer_type total_time = mytimer(); int myproc = 0; #ifdef HAVE_MPI MPI_Comm_rank(MPI_COMM_WORLD, &myproc); #endif if (!A.has_local_indices) { std::cerr << "miniFE::cg_solve ERROR, A.has_local_indices is false, needs to be true. This probably means " << "miniFE::make_local_matrix(A) was not called prior to calling miniFE::cg_solve." << std::endl; return; } size_t nrows = A.rows.size(); LocalOrdinalType ncols = A.num_cols; nvtxRangeId_t r1=nvtxRangeStartA("Allocation of Temporary Vectors"); VectorType r(b.startIndex, nrows); VectorType p(0, ncols); VectorType Ap(b.startIndex, nrows); nvtxRangeEnd(r1); #ifdef HAVE_MPI #ifndef GPUDIRECT //TODO move outside? cudaHostRegister(&p.coefs[0],ncols*sizeof(typename VectorType::ScalarType),0); cudaCheckError(); if(A.send_buffer.size()>0) cudaHostRegister(&A.send_buffer[0],A.send_buffer.size()*sizeof(typename VectorType::ScalarType),0); cudaCheckError(); #endif #endif normr = 0; magnitude_type rtrans = 0; magnitude_type oldrtrans = 0; LocalOrdinalType print_freq = max_iter/10; if (print_freq>50) print_freq = 50; if (print_freq<1) print_freq = 1; ScalarType one = 1.0; ScalarType zero = 0.0; TICK(); waxpby(one, x, zero, x, p); TOCK(tWAXPY); TICK(); matvec(A, p, Ap); TOCK(tMATVEC); TICK(); waxpby(one, b, -one, Ap, r); TOCK(tWAXPY); TICK(); rtrans = dot(r, r); TOCK(tDOT); normr = std::sqrt(rtrans); if (myproc == 0) { std::cout << "Initial Residual = "<< normr << std::endl; } magnitude_type brkdown_tol = std::numeric_limits<magnitude_type>::epsilon(); #ifdef MINIFE_DEBUG std::ostream& os = outstream(); os << "brkdown_tol = " << brkdown_tol << std::endl; #endif for(LocalOrdinalType k=1; k <= max_iter && normr > tolerance; ++k) { if (k == 1) { TICK(); waxpby(one, r, zero, r, p); TOCK(tWAXPY); } else { oldrtrans = rtrans; TICK(); rtrans = dot(r, r); TOCK(tDOT); magnitude_type beta = rtrans/oldrtrans; TICK(); waxpby(one, r, beta, p, p); TOCK(tWAXPY); } normr = std::sqrt(rtrans); if (myproc == 0 && (k%print_freq==0 || k==max_iter)) { std::cout << "Iteration = "<<k<<" Residual = "<<normr<<std::endl; } magnitude_type alpha = 0; magnitude_type p_ap_dot = 0; TICK(); matvec(A, p, Ap); TOCK(tMATVEC); TICK(); p_ap_dot = dot(Ap, p); TOCK(tDOT); #ifdef MINIFE_DEBUG os << "iter " << k << ", p_ap_dot = " << p_ap_dot; os.flush(); #endif //TODO remove false below if (false && p_ap_dot < brkdown_tol) { if (p_ap_dot < 0 || breakdown(p_ap_dot, Ap, p)) { std::cerr << "miniFE::cg_solve ERROR, numerical breakdown!"<<std::endl; #ifdef MINIFE_DEBUG os << "ERROR, numerical breakdown!"<<std::endl; #endif //update the timers before jumping out. my_cg_times[WAXPY] = tWAXPY; my_cg_times[DOT] = tDOT; my_cg_times[MATVEC] = tMATVEC; my_cg_times[TOTAL] = mytimer() - total_time; return; } else brkdown_tol = 0.1 * p_ap_dot; } alpha = rtrans/p_ap_dot; #ifdef MINIFE_DEBUG os << ", rtrans = " << rtrans << ", alpha = " << alpha << std::endl; #endif TICK(); waxpby(one, x, alpha, p, x); waxpby(one, r, -alpha, Ap, r); TOCK(tWAXPY); num_iters = k; } #ifdef HAVE_MPI #ifndef GPUDIRECT //TODO move outside? cudaHostUnregister(&p.coefs[0]); cudaCheckError(); if(A.send_buffer.size()>0) cudaHostUnregister(&A.send_buffer[0]); cudaCheckError(); #endif #endif my_cg_times[WAXPY] = tWAXPY; my_cg_times[DOT] = tDOT; my_cg_times[MATVEC] = tMATVEC; my_cg_times[MATVECDOT] = tMATVECDOT; my_cg_times[TOTAL] = mytimer() - total_time; }
CCudaTimeMeasure::~CCudaTimeMeasure() { cudaCheckError(cudaEventDestroy(m_ceStartEvent)); cudaCheckError(cudaEventDestroy(m_ceStopEvent)); }