PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
                     const int , // global_max_x
                     const int , // global_max_y
                     const int global_max_z ,
                     const bool print_sample )
{
  typedef Scalar       scalar_type ;
  typedef FixtureType  fixture_type ;
  typedef typename fixture_type::execution_space execution_space;
  //typedef typename execution_space::size_type size_type ; // unused

  typedef typename fixture_type::FEMeshType             mesh_type ;
  typedef typename fixture_type::coordinate_scalar_type coordinate_scalar_type ;

  enum { ElementNodeCount = fixture_type::element_node_count };

  const comm::Machine machine = mesh.parallel_data_map.machine ;

  const size_t element_count = mesh.elem_node_ids.dimension_0();

  const size_t iteration_limit    = 200 ;
  const double residual_tolerance = 1e-14 ;

  size_t iteration_count = 0 ;
  double residual_norm   = 0 ;

  PerformanceData perf_data ;

  //------------------------------------
  // Sparse linear system types:

  typedef Kokkos::View< scalar_type* , execution_space >      vector_type ;
  typedef Kokkos::CrsMatrix< scalar_type , execution_space >  matrix_type ;
  typedef typename matrix_type::graph_type                     matrix_graph_type ;
  typedef typename matrix_type::coefficients_type              matrix_coefficients_type ;

  typedef GraphFactory< matrix_graph_type , mesh_type > graph_factory ;

  //------------------------------------
  // Problem setup types:

  typedef ElementComputation< scalar_type , scalar_type , execution_space > ElementFunctor ;
  typedef DirichletBoundary < scalar_type , scalar_type , execution_space > BoundaryFunctor ;

  typedef typename ElementFunctor::elem_matrices_type elem_matrices_type ;
  typedef typename ElementFunctor::elem_vectors_type  elem_vectors_type ;

  typedef GatherFill< matrix_type ,
                      mesh_type ,
                      elem_matrices_type ,
                      elem_vectors_type > GatherFillFunctor ;

  //------------------------------------

  const scalar_type elem_coeff_K = 2 ;
  const scalar_type elem_load_Q  = 1 ;

  matrix_type linsys_matrix ;
  vector_type linsys_rhs ;
  vector_type linsys_solution ;

  typename graph_factory::element_map_type element_map ;

  Kokkos::Impl::Timer wall_clock ;

  //------------------------------------
  // Generate sparse matrix graph and element->graph map.

  graph_factory::create( mesh , linsys_matrix.graph , element_map );

  execution_space::fence();
  perf_data.graph_time = comm::max( machine , wall_clock.seconds() );

  //------------------------------------
  // Allocate linear system coefficients and rhs:

  const size_t local_owned_length = linsys_matrix.graph.row_map.dimension_0() - 1 ;

  linsys_matrix.coefficients =
    matrix_coefficients_type( "coeff" , linsys_matrix.graph.entries.dimension_0() );

  linsys_rhs      = vector_type( "rhs" , local_owned_length );
  linsys_solution = vector_type( "solution" , local_owned_length );

  //------------------------------------
  // Fill linear system
  {
    elem_matrices_type elem_matrices ;
    elem_vectors_type  elem_vectors ;

    if ( element_count ) {
      elem_matrices = elem_matrices_type( std::string("elem_matrices"), element_count );
      elem_vectors  = elem_vectors_type ( std::string("elem_vectors"), element_count );
    }

    //------------------------------------
    // Compute element matrices and vectors:

    wall_clock.reset();

    ElementFunctor::apply( mesh , elem_matrices , elem_vectors ,
                           elem_coeff_K , elem_load_Q );

    execution_space::fence();
    perf_data.elem_time = comm::max( machine , wall_clock.seconds() );

    //------------------------------------
    // Fill linear system coefficients:

    wall_clock.reset();

    GatherFillFunctor::apply( linsys_matrix , linsys_rhs , mesh ,
                              element_map , elem_matrices , elem_vectors );

    execution_space::fence();
    perf_data.matrix_gather_fill_time = comm::max( machine , wall_clock.seconds() );

    // Apply boundary conditions:

    wall_clock.reset();

    BoundaryFunctor::apply( linsys_matrix , linsys_rhs , mesh ,
                            0 , global_max_z , 0 , global_max_z );

    execution_space::fence();
    perf_data.matrix_boundary_condition_time = comm::max( machine , wall_clock.seconds() );
  }

  //------------------------------------
  // Solve linear system

  cgsolve( mesh.parallel_data_map ,
           linsys_matrix , linsys_rhs , linsys_solution ,
           iteration_count , residual_norm ,
           perf_data.cg_iteration_time ,
           iteration_limit , residual_tolerance );

  //------------------------------------

  if ( print_sample ) {

    typename mesh_type::node_coords_type::HostMirror coords_h =
      Kokkos::create_mirror( mesh.node_coords );

    typename vector_type::HostMirror X_h =
      Kokkos::create_mirror( linsys_solution );

    Kokkos::deep_copy( coords_h , mesh.node_coords );
    Kokkos::deep_copy( X_h , linsys_solution );

    for ( size_t i = 0 ; i < mesh.parallel_data_map.count_owned ; ++i ) {
      const coordinate_scalar_type x = coords_h(i,0);
      const coordinate_scalar_type y = coords_h(i,1);
      const coordinate_scalar_type z = coords_h(i,2);

      if ( x <= 0 && y <= 0 ) {
        std::cout << " node( " << x << " " << y << " " << z << " ) = "
                  << X_h(i) << std::endl ;
      }
    }
  }

  return perf_data ;
}
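/* Aside: a minimal, self-contained sketch of the mirror-and-copy idiom used above to
   print the sample solution.  The view label "sample", the fill value, and the helper
   name are illustrative assumptions; Kokkos::create_mirror and Kokkos::deep_copy are
   the same calls the function above relies on. */

#include <Kokkos_Core.hpp>
#include <iostream>

template< class Device >
void print_device_vector_sample( const size_t length )
{
  typedef Kokkos::View< double* , Device > vector_type ;

  vector_type v( "sample" , length );   // device-resident vector
  Kokkos::deep_copy( v , 1.0 );         // fill on the device

  // Allocate a host mirror and copy the device data into it:
  typename vector_type::HostMirror v_host = Kokkos::create_mirror( v );
  Kokkos::deep_copy( v_host , v );

  for ( size_t i = 0 ; i < length ; ++i ) {
    std::cout << "v(" << i << ") = " << v_host(i) << std::endl ;
  }
}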
PerfCGSolve test_cgsolve_array( comm::Machine machine ,
                                const int nGrid ,
                                const int iterMax ,
                                const char * const /* verify_label */ )
{
  typedef Kokkos::Array<Scalar,N>                               value_type ;
  typedef Kokkos::CrsArray<int,Device,void,int>                 crsarray_type ;
  typedef Kokkos::CrsMatrix<value_type,Device>                  matrix_type ;
  typedef Kokkos::View<value_type*,Kokkos::LayoutRight,Device>  vector_type ;

  //------------------------------
  // Generate FEM graph:

  std::vector< std::vector<size_t> > fem_graph ;

  const unsigned fem_length = nGrid * nGrid * nGrid ;

  Test::generate_fem_graph( nGrid , fem_graph );

  //------------------------------

  vector_type x = vector_type( "x" , fem_length );
  vector_type y = vector_type( "y" , fem_length );

  typename vector_type::HostMirror hx        = Kokkos::create_mirror( x );
  typename vector_type::HostMirror hy_result = Kokkos::create_mirror( y );

  for ( unsigned i = 0 ; i < fem_length ; ++i ) {
    for ( unsigned j = 0 ; j < N ; ++j ) {
      hx(i)[j] = Test::generate_vector_coefficient( fem_length , N , i , j );
    }
  }

  Kokkos::deep_copy( x , hx );

  //------------------------------

  matrix_type matrix ;

  matrix.graph = Kokkos::create_crsarray<crsarray_type>( std::string("testing") , fem_graph );

  const unsigned fem_graph_length = matrix.graph.entries.dimension_0();

  matrix.values = vector_type( "matrix" , fem_graph_length );

  {
    typename vector_type::HostMirror hM = Kokkos::create_mirror( matrix.values );

    for ( size_t iRow = 0 , iEntry = 0 ; iRow < fem_length ; ++iRow ) {

      for ( unsigned k = 0 ; k < N ; ++k ) { hy_result(iRow)[k] = 0 ; }

      for ( size_t iRowEntry = 0 ; iRowEntry < fem_graph[ iRow ].size() ; ++iRowEntry , ++iEntry ) {

        const size_t iCol = fem_graph[ iRow ][ iRowEntry ];

        for ( unsigned k = 0 ; k < N ; ++k ) {
          hM(iEntry)[k] = Test::generate_matrix_coefficient( fem_length , N , iRow , iCol , k );
          hy_result(iRow)[k] += hM(iEntry)[k] * hx(iCol)[k];
        }
      }
    }

    Kokkos::deep_copy( matrix.values , hM );
  }

  size_t iter_count = 0 ;
  double iter_time  = 0 ;
  double norm_resid = 0 ;

  cgsolve( matrix , x , y ,
           iter_count , norm_resid , iter_time ,
           iterMax , 1e-14 );

  PerfCGSolve perf ;

  perf.seconds_per_iter = iter_time ;
  perf.row_count        = fem_length ;
  perf.entry_count      = fem_graph_length ;

  return perf ;
}
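/* Aside: the nested loop above both fills the block coefficients and accumulates the
   expected result hy_result = A*x on the host.  Below is a minimal scalar version of
   that same CRS traversal (row -> list of column indices, one coefficient per graph
   entry), useful as a host-side reference when checking cgsolve output; the function
   name and argument names are illustrative, not part of the test harness. */

#include <cstddef>
#include <vector>

std::vector<double> crs_multiply( const std::vector< std::vector<size_t> > & graph ,
                                  const std::vector<double> & values ,  // one value per graph entry
                                  const std::vector<double> & x )
{
  std::vector<double> y( graph.size() , 0.0 );

  size_t entry = 0 ;
  for ( size_t row = 0 ; row < graph.size() ; ++row ) {
    for ( size_t j = 0 ; j < graph[row].size() ; ++j , ++entry ) {
      y[row] += values[entry] * x[ graph[row][j] ];
    }
  }
  return y ;
}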
PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
                     const int global_max_x ,
                     const int global_max_y ,
                     const int global_max_z ,
                     const bool print_error )
{
  typedef Scalar       scalar_type ;
  typedef FixtureType  fixture_type ;
  typedef typename fixture_type::device_type device_type;
  typedef typename device_type::size_type    size_type ;

  typedef typename fixture_type::FEMeshType             mesh_type ;
  typedef typename fixture_type::coordinate_scalar_type coordinate_scalar_type ;

  enum { ElementNodeCount = fixture_type::element_node_count };

  const comm::Machine machine = mesh.parallel_data_map.machine ;

  const size_t element_count = mesh.elem_node_ids.dimension(0);

  //------------------------------------
  // The amount of nonlinearity is proportional to the ratio
  // between T(zmax) and T(zmin). For the manufactured solution
  // 0 < T(zmin) and 0 < T(zmax)

  const ManufacturedSolution exact_solution( /* zmin */ 0 ,
                                             /* zmax */ global_max_z ,
                                             /* T(zmin) */ 1 ,
                                             /* T(zmax) */ 20 );

  //-----------------------------------
  // Convergence criteria and perf data:

  const size_t cg_iteration_limit = 200 ;
  const double cg_tolerance = 1e-14 ;

  const size_t newton_iteration_limit = 150 ;
  const double newton_tolerance = 1e-14 ;

  size_t cg_iteration_count_total = 0 ;
  double cg_iteration_time = 0 ;

  size_t newton_iteration_count = 0 ;
  double residual_norm_init = 0 ;
  double residual_norm = 0 ;

  PerformanceData perf_data ;

  //------------------------------------
  // Sparse linear system types:

  typedef KokkosArray::View< Scalar[] , device_type >     vector_type ;
  typedef KokkosArray::CrsMatrix< Scalar , device_type >  matrix_type ;
  typedef typename matrix_type::graph_type                matrix_graph_type ;
  typedef typename matrix_type::coefficients_type         matrix_coefficients_type ;

  typedef GraphFactory< matrix_graph_type , mesh_type > graph_factory ;

  //------------------------------------
  // Problem setup types:

  typedef ElementComputation < mesh_type , Scalar > ElementFunctor ;
  typedef DirichletSolution  < mesh_type , Scalar > DirichletSolutionFunctor ;
  typedef DirichletResidual  < mesh_type , Scalar > DirichletResidualFunctor ;

  typedef typename ElementFunctor::elem_matrices_type elem_matrices_type ;
  typedef typename ElementFunctor::elem_vectors_type  elem_vectors_type ;

  typedef GatherFill< matrix_type ,
                      mesh_type ,
                      elem_matrices_type ,
                      elem_vectors_type > GatherFillFunctor ;

  //------------------------------------

  matrix_type jacobian ;
  vector_type residual ;
  vector_type delta ;
  vector_type nodal_solution ;

  typename graph_factory::element_map_type element_map ;

  //------------------------------------
  // Generate mesh and corresponding sparse matrix graph

  KokkosArray::Impl::Timer wall_clock ;

  //------------------------------------
  // Generate sparse matrix graph and element->graph map.

  wall_clock.reset();

  graph_factory::create( mesh , jacobian.graph , element_map );

  device_type::fence();
  perf_data.graph_time = comm::max( machine , wall_clock.seconds() );

  //------------------------------------
  // Allocate linear system coefficients and rhs:

  const size_t local_owned_length = jacobian.graph.row_map.dimension(0) - 1 ;
  const size_t local_total_length = mesh.node_coords.dimension(0);

  jacobian.coefficients =
    matrix_coefficients_type( "jacobian_coeff" , jacobian.graph.entries.dimension(0) );

  // Nonlinear residual for owned nodes:
  residual = vector_type( "residual" , local_owned_length );

  // Nonlinear solution for owned and ghosted nodes:
  nodal_solution = vector_type( "solution" , local_total_length );

  // Nonlinear solution update for owned nodes:
  delta = vector_type( "delta" , local_owned_length );

  //------------------------------------
  // Allocation of arrays to fill the linear system

  elem_matrices_type elem_matrices ; // Jacobian matrices
  elem_vectors_type  elem_vectors ;  // Residual vectors

  if ( element_count ) {
    elem_matrices = elem_matrices_type( std::string("elem_matrices"), element_count );
    elem_vectors  = elem_vectors_type( std::string("elem_vectors"), element_count );
  }

  //------------------------------------
  // For boundary condition set the correct values in the solution vector.
  //   The 'zmin' face is assigned to 'T_zmin'.
  //   The 'zmax' face is assigned to 'T_zmax'.
  //   The resulting solution is one dimensional along the 'Z' axis.

  DirichletSolutionFunctor::apply( nodal_solution , mesh ,
                                   exact_solution.zmin ,
                                   exact_solution.zmax ,
                                   exact_solution.T_zmin ,
                                   exact_solution.T_zmax );

  for (;;) { // Nonlinear loop

#if defined( HAVE_MPI )
    { //------------------------------------
      // Import off-processor nodal solution values
      // for residual and jacobian computations

      KokkosArray::AsyncExchange< typename vector_type::value_type , device_type ,
                                  KokkosArray::ParallelDataMap >
        exchange( mesh.parallel_data_map , 1 );

      KokkosArray::PackArray< vector_type >
        ::pack( exchange.buffer() ,
                mesh.parallel_data_map.count_interior ,
                mesh.parallel_data_map.count_send ,
                nodal_solution );

      exchange.setup();

      exchange.send_receive();

      KokkosArray::UnpackArray< vector_type >
        ::unpack( nodal_solution , exchange.buffer() ,
                  mesh.parallel_data_map.count_owned ,
                  mesh.parallel_data_map.count_receive );
    }
#endif

    //------------------------------------
    // Compute element matrices and vectors:

    wall_clock.reset();

    ElementFunctor( mesh ,
                    elem_matrices ,
                    elem_vectors ,
                    nodal_solution ,
                    exact_solution.K );

    device_type::fence();
    perf_data.elem_time += comm::max( machine , wall_clock.seconds() );

    //------------------------------------
    // Fill linear system coefficients:

    wall_clock.reset();

    fill( 0 , jacobian.coefficients );
    fill( 0 , residual );

    GatherFillFunctor::apply( jacobian ,
                              residual ,
                              mesh ,
                              element_map ,
                              elem_matrices ,
                              elem_vectors );

    device_type::fence();
    perf_data.matrix_gather_fill_time += comm::max( machine , wall_clock.seconds() );

    // Apply boundary conditions:

    wall_clock.reset();

    // Updates jacobian matrix to 1 on the diagonal, zero elsewhere,
    // and 0 in the residual due to the solution vector having the correct value.

    DirichletResidualFunctor::apply( jacobian , residual , mesh ,
                                     exact_solution.zmin ,
                                     exact_solution.zmax );

    device_type::fence();
    perf_data.matrix_boundary_condition_time += comm::max( machine , wall_clock.seconds() );

    //------------------------------------
    // Has the residual converged?

    residual_norm = sqrt( dot( mesh.parallel_data_map , residual ) );

    if ( 0 == newton_iteration_count ) {
      residual_norm_init = residual_norm ;
    }

    if ( residual_norm / residual_norm_init < newton_tolerance ) {
      break ;
    }

    //------------------------------------
    // Solve linear system

    size_t cg_iteration_count = 0 ;
    double cg_residual_norm = 0 ;

    cgsolve( mesh.parallel_data_map ,
             jacobian , residual , delta ,
             cg_iteration_count , cg_residual_norm ,
             cg_iteration_time ,
             cg_iteration_limit , cg_tolerance ) ;

    perf_data.cg_iteration_time += cg_iteration_time ;
    cg_iteration_count_total += cg_iteration_count ;

    // Update nonlinear solution with delta...
    // delta is : -Dx = [Jacobian]^{-1} * Residual , which is the negative update
    // LaTeX:
    //   \vec{x}_{n+1} = \vec{x}_{n} - ( - \Delta \vec{x}_{n} )
    // text:
    //   x[n+1] = x[n] + Dx

    waxpby( mesh.parallel_data_map ,
            1.0 , nodal_solution ,
            -1.0 , delta , nodal_solution );

    ++newton_iteration_count ;

    if ( newton_iteration_limit < newton_iteration_count ) {
      break ;
    }
  }

  if ( newton_iteration_count ) {
    perf_data.elem_time /= newton_iteration_count ;
    perf_data.matrix_gather_fill_time /= newton_iteration_count ;
    perf_data.matrix_boundary_condition_time /= newton_iteration_count ;
  }

  if ( cg_iteration_count_total ) {
    perf_data.cg_iteration_time /= cg_iteration_count_total ;
  }

  perf_data.newton_iteration_count = newton_iteration_count ;
  perf_data.cg_iteration_count     = cg_iteration_count_total ;

  //------------------------------------

  {
    // For extracting the nodal solution and its coordinates:

    typename mesh_type::node_coords_type::HostMirror node_coords_host =
      KokkosArray::create_mirror( mesh.node_coords );

    typename vector_type::HostMirror nodal_solution_host =
      KokkosArray::create_mirror( nodal_solution );

    KokkosArray::deep_copy( node_coords_host , mesh.node_coords );
    KokkosArray::deep_copy( nodal_solution_host , nodal_solution );

    double tmp = 0 ;

    for ( size_t i = 0 ; i < mesh.parallel_data_map.count_owned ; ++i ) {
      const coordinate_scalar_type x = node_coords_host(i,0);
      const coordinate_scalar_type y = node_coords_host(i,1);
      const coordinate_scalar_type z = node_coords_host(i,2);

      const double Tx = exact_solution(z);
      const double Ts = nodal_solution_host(i);
      const double Te = std::abs( Tx - Ts ) / std::abs( Tx );

      tmp = std::max( tmp , Te );

      if ( print_error && 0.02 < Te ) {
        std::cout << " node( " << x << " " << y << " " << z << " ) = "
                  << Ts << " != exact_solution " << Tx << std::endl ;
      }
    }
    perf_data.error_max = comm::max( machine , tmp );
  }

  return perf_data ;
}
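/* Aside: restating the nonlinear loop above in equation form (symbols follow the
   code: J = jacobian, R = residual, delta = the inner CG solution):

     solve    J(x_n) * delta_n = R(x_n)                  (inner cgsolve)
     update   x_{n+1} = x_n - delta_n                    (waxpby with coefficients 1.0 and -1.0)
     stop     residual_norm / residual_norm_init < newton_tolerance ,
              or newton_iteration_count exceeds newton_iteration_limit

   Per-iteration timings (element computation, gather/fill, boundary condition) are
   averaged over the Newton count, and cg_iteration_time over the total CG iteration
   count, exactly as in the normalization block above. */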
int main( int argc , char ** argv )
{
  const int ghost = 1 ;
  const int max_cube = 20 ;
  int ncube[20] = { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
                    0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };

  FILE * print_file = stdout ;

  int print_iter = 500 ;
  int max_iter = 50 ;

  VECTOR_SCALAR tolerance = 0.0 ; /* Force max iterations */

  int gbox[3][2] = { { 0 , 16 } , { 0 , 16 } , { 0 , 16 } };
  int nt = 0 ;
  int trials = 5 ;
  int ntest ;
  int np = 1;
  int my_p = 0 ;

#ifdef HAVE_MPI
  MPI_Init( & argc , & argv );
  MPI_Comm_size( MPI_COMM_WORLD , & np );
  MPI_Comm_rank( MPI_COMM_WORLD , & my_p );
#endif

  if ( ! my_p ) {
    const char arg_threads[] = "threads=" ;
    const char arg_cube[]    = "cube=" ;
    const char arg_box[]     = "box=" ;
    const char arg_max[]     = "max_iter=" ;
    const char arg_trials[]  = "trials=" ;
    const char arg_print[]   = "print_iter=" ;
    const char arg_file[]    = "print_file=" ;
    int i ;
    for ( i = 1 ; i < argc ; ++i ) {
      if ( ! strncmp(argv[i],arg_threads,strlen(arg_threads)) ) {
        sscanf(argv[i]+strlen(arg_threads),"%d",&nt);
      }
      else if ( ! strncmp(argv[i],arg_box,strlen(arg_box)) ) {
        sscanf(argv[i]+strlen(arg_box),"%d%*[x]%d%*[x]%d",
               & gbox[0][1] , & gbox[1][1] , & gbox[2][1] );
      }
      else if ( ! strncmp(argv[i],arg_cube,strlen(arg_cube)) ) {
        sscanf(argv[i]+strlen(arg_cube),
               "%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d",
               ncube+0, ncube+1, ncube+2, ncube+3, ncube+4,
               ncube+5, ncube+6, ncube+7, ncube+8, ncube+9,
               ncube+10, ncube+11, ncube+12, ncube+13, ncube+14,
               ncube+15, ncube+16, ncube+17, ncube+18, ncube+19);
      }
      else if ( ! strncmp(argv[i],arg_max,strlen(arg_max)) ) {
        sscanf(argv[i]+strlen(arg_max),"%d",&max_iter);
      }
      else if ( ! strncmp(argv[i],arg_trials,strlen(arg_trials)) ) {
        sscanf(argv[i]+strlen(arg_trials),"%d",&trials);
      }
      else if ( ! strncmp(argv[i],arg_print,strlen(arg_print)) ) {
        sscanf(argv[i]+strlen(arg_print),"%d",&print_iter);
      }
      else if ( ! strncmp(argv[i],arg_file,strlen(arg_file)) ) {
        char buffer[256] ;
        sscanf(argv[i]+strlen(arg_file),"%s",buffer);
        print_file = fopen(buffer,"a");
      }
    }
  }

#ifdef HAVE_MPI
  {
    MPI_Bcast( & nt , 1 , MPI_INT , 0 , MPI_COMM_WORLD );
    MPI_Bcast( & gbox[0][0] , 6 , MPI_INT , 0 , MPI_COMM_WORLD );
    MPI_Bcast( ncube , max_cube , MPI_INT , 0 , MPI_COMM_WORLD );
    MPI_Bcast( & max_iter , 1 , MPI_INT , 0 , MPI_COMM_WORLD );
    MPI_Bcast( & print_iter , 1 , MPI_INT , 0 , MPI_COMM_WORLD );
    MPI_Bcast( & trials , 1 , MPI_INT , 0 , MPI_COMM_WORLD );
  }
#endif

  if ( nt ) {
    TPI_Init( nt );
    TPI_Block();
    TPI_Unblock();
  }

  if ( ! my_p ) {
    fprintf(print_file,"\"PROC\" , \"THREAD\" , \"EQUATION\" , \"NON-ZERO\" , \"MXV\" , \"AXPBY\" , \"DOT\" , \"Xerror\" , \"Iter\"\n");
    fprintf(print_file,"\"COUNT\" , \"COUNT\" , \"COUNT\" , \"COUNT\" , \"Mflops\" , \"Mflops\" , \"Mflops\" , \"L2norm\" , \"COUNT\"\n");
  }

  for ( ntest = 0 ; ! ntest || ( ntest < max_cube && ncube[ntest] ) ; ++ntest ) {
    struct cgsolve_data cgdata ;

    if ( ncube[ntest] ) {
      gbox[0][1] = gbox[1][1] = gbox[2][1] = ncube[ntest] ;
    }

    hpccg_alloc_and_fill( np, my_p, (const int (*)[2]) gbox, ghost, &cgdata);

    cgdata.max_iter   = max_iter ;
    cgdata.print_iter = print_iter ;
    cgdata.tolerance  = tolerance ;

    {
      double dt_mxv[2]   = { 0 , 0 };
      double dt_axpby[2] = { 0 , 0 };
      double dt_dot[2]   = { 0 , 0 };
      VECTOR_SCALAR norm_resid = 0.0 ;
      int iter_count = 0 ;
      int iter_total = 0 ;
      int k ;

      VECTOR_SCALAR * const b      = (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * cgdata.nRow );
      VECTOR_SCALAR * const x      = (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * cgdata.nRow );
      VECTOR_SCALAR * const xexact = (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * cgdata.nRow );

      {
        const VECTOR_SCALAR value = 1.0 /* 1.0 / 3.0 */ ;
        int i ;
        for ( i = 0 ; i < cgdata.nRow ; ++i ) xexact[i] = value ;
      }

      for ( k = 0 ; k < trials ; ++k ) {
        int i ;

        for ( i = 0 ; i < cgdata.nRow ; ++i ) { x[i] = 0.0 ; }

        cgsolve_set_lhs( & cgdata , xexact , b );

        cgsolve( & cgdata, b, x, & iter_count, & norm_resid,
                 dt_mxv , dt_axpby , dt_dot );

        iter_total += iter_count ;
      }

      {
        int nnzGlobal = cgdata.A_pc[ cgdata.nRow ];
        double error[2] = { 0 , 0 };

        for ( k = 0 ; k < cgdata.nRow ; ++k ) {
          error[0] += ( x[k] - xexact[k] ) * ( x[k] - xexact[k] );
          error[1] += xexact[k] * xexact[k] ;
        }

#ifdef HAVE_MPI
        {
          double error_global[2] = { 0.0 , 0.0 };
          int nnz = nnzGlobal ;

          MPI_Allreduce( & nnz , & nnzGlobal , 1 , MPI_INT , MPI_SUM , MPI_COMM_WORLD );
          MPI_Allreduce( error , error_global , 2 , MPI_DOUBLE , MPI_SUM , MPI_COMM_WORLD );

          error[0] = error_global[0];
          error[1] = error_global[1];
        }
#endif

        error[0] = sqrt( error[0] );
        error[1] = sqrt( error[1] );

        if ( ! my_p ) {
          const int nRowGlobal = ( gbox[0][1] - gbox[0][0] ) *
                                 ( gbox[1][1] - gbox[1][0] ) *
                                 ( gbox[2][1] - gbox[2][0] );

          const double mflop_mxv   = 1.0e-6 * ( iter_total ) * 2 * nnzGlobal / dt_mxv[0] ;
          const double mflop_axpby = 1.0e-6 * ( iter_total * 3 ) * 3 * nRowGlobal / dt_axpby[0] ;
          const double mflop_dot   = 1.0e-6 * ( iter_total * 2 ) * 2 * nRowGlobal / dt_dot[0] ;

          fprintf(print_file,"%8d , %8d , %8d , %8d , %10g , %10g , %10g , %g , %d\n",
                  np , nt , nRowGlobal , nnzGlobal ,
                  mflop_mxv , mflop_axpby , mflop_dot ,
                  error[0] / error[1] , iter_total );
          fflush(print_file);
        }
      }

      free( xexact );
      free( x );
      free( b );
    }
    free( cgdata.A_a );
    free( cgdata.A_ia );
    free( cgdata.A_pc );
    free( cgdata.recv_pc );
    free( cgdata.send_pc );
    free( cgdata.send_id );
  }

  if ( nt ) { TPI_Finalize(); }

#ifdef HAVE_MPI
  MPI_Finalize();
#endif

  return 0 ;
}
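/* Aside: flop accounting behind the MFLOP columns printed above.  The constants in
   the expressions imply, per CG iteration, one matrix-vector product (2 flops per
   stored nonzero), three AXPBY-style vector updates (3 flops per row each), and two
   dot products (2 flops per row each); this reading is inferred from the code itself,
   not from separate documentation.

     MXV   Mflops = 1.0e-6 *  iter_total      * 2 * nnzGlobal  / dt_mxv[0]
     AXPBY Mflops = 1.0e-6 * (iter_total * 3) * 3 * nRowGlobal / dt_axpby[0]
     DOT   Mflops = 1.0e-6 * (iter_total * 2) * 2 * nRowGlobal / dt_dot[0]
*/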
int main( int argc, char* argv[] )
{
  int writeOutX = 0;
  int n, k;
  int maxiterations = 1000;
  int niters = 0;
  double norm;
  double* b;
  double* x;
  double time;
  double t1, t2;
  int rank, size;   // MPI rank and communicator size

  MPI_Init( &argc, &argv );
  MPI_Comm_rank( MPI_COMM_WORLD, &rank );
  MPI_Comm_size( MPI_COMM_WORLD, &size );

  // Read command line args.
  // 1st case runs the model problem, 2nd case allows you to specify your own b vector.
  // (Check argc before touching argv[1] so the usage message prints when no args are given.)
  if ( argc == 3 ) {
    k = atoi( argv[1] );
    n = k*k;
    // each processor calls cs240_getB to build its own part of the b vector!
  }
  else if ( argc == 4 && !strcmp( argv[1], "-i" ) ) {
    b = load_vec( argv[2], &k );
  }
  else {
    printf( "\nCGSOLVE Usage: \n\t"
            "Model Problem:\tmpirun -np [number_procs] cgsolve [k] [output_1=y_0=n]\n\t"
            "Custom Input:\tmpirun -np [number_procs] cgsolve -i [input_filename] [output_1=y_0=n]\n\n");
    exit(0);
  }

  writeOutX = atoi( argv[argc-1] ); // Write X to file if true, do not write if unspecified.

  // Start timer
  t1 = MPI_Wtime();

  // CG solve here!
  x = cgsolve( k );

  // End timer
  t2 = MPI_Wtime();

  printf( "TEST: %s\n", cs240_verify( x, k, 0.0 ) ? "PASSED" : "FAILED" );

  if ( writeOutX ) {
    save_vec( k, x );
  }

  // Output
  printf( "Problem size (k): %d\n", k );
  if ( niters > 0 ) {
    printf( "Norm of the residual after %d iterations: %lf\n", niters, norm );
  }
  printf( "Elapsed time during CGSOLVE: %lf\n", t2-t1 );

  // Deallocate
  if ( niters > 0 ) { free(b); }
  if ( niters > 0 ) { free(x); }

  MPI_Finalize();
  return 0;
}
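/* Aside: a textbook sketch of the unpreconditioned CG recurrence that a cgsolve()
   like the one called above typically implements, specialized to the 5-point
   Laplacian on a k-by-k grid (the usual model problem that the n = k*k sizing
   suggests).  This is an illustrative serial C++ sketch under those assumptions;
   the function names are hypothetical and it is not the distributed implementation
   invoked by this driver. */

#include <cmath>
#include <vector>

// Matrix-free y = A*x for the 2D 5-point Laplacian on a k-by-k grid.
static void model_matvec( const int k , const std::vector<double> & x , std::vector<double> & y )
{
  for ( int i = 0 ; i < k ; ++i ) {
    for ( int j = 0 ; j < k ; ++j ) {
      const int row = i * k + j ;
      double v = 4.0 * x[row];
      if ( i > 0     ) v -= x[row - k];
      if ( i < k - 1 ) v -= x[row + k];
      if ( j > 0     ) v -= x[row - 1];
      if ( j < k - 1 ) v -= x[row + 1];
      y[row] = v ;
    }
  }
}

// Solve A*x = b with plain CG; returns the iteration count and the final residual norm.
static int cg_sketch( const int k , const std::vector<double> & b , std::vector<double> & x ,
                      const int max_iter , const double tolerance , double & norm_resid )
{
  const int n = k * k ;
  std::vector<double> r( b ) , p( b ) , Ap( n , 0.0 );
  x.assign( n , 0.0 );                                  // x0 = 0  =>  r0 = b

  double rtr = 0.0 ;
  for ( int i = 0 ; i < n ; ++i ) rtr += r[i] * r[i] ;

  int iter = 0 ;
  while ( iter < max_iter && std::sqrt( rtr ) > tolerance ) {
    model_matvec( k , p , Ap );                         // Ap = A * p

    double pAp = 0.0 ;
    for ( int i = 0 ; i < n ; ++i ) pAp += p[i] * Ap[i] ;
    const double alpha = rtr / pAp ;                    // step length

    for ( int i = 0 ; i < n ; ++i ) { x[i] += alpha * p[i] ; r[i] -= alpha * Ap[i] ; }

    const double rtr_old = rtr ;
    rtr = 0.0 ;
    for ( int i = 0 ; i < n ; ++i ) rtr += r[i] * r[i] ;
    const double beta = rtr / rtr_old ;                 // direction update coefficient

    for ( int i = 0 ; i < n ; ++i ) p[i] = r[i] + beta * p[i] ;
    ++iter ;
  }

  norm_resid = std::sqrt( rtr );
  return iter ;
}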