PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
                     const int , // global_max_x ,
                     const int , // global_max_y ,
                     const int global_max_z ,
                     const bool print_sample )
{
  typedef Scalar                              scalar_type ;
  typedef FixtureType                         fixture_type ;
  typedef typename fixture_type::execution_space  execution_space;
  //typedef typename execution_space::size_type     size_type ; // unused

  typedef typename fixture_type::FEMeshType mesh_type ;
  typedef typename fixture_type::coordinate_scalar_type coordinate_scalar_type ;

  enum { ElementNodeCount = fixture_type::element_node_count };

  const comm::Machine machine = mesh.parallel_data_map.machine ;

  const size_t element_count = mesh.elem_node_ids.dimension_0();

  const size_t iteration_limit = 200 ;
  const double residual_tolerance = 1e-14 ;

  size_t iteration_count = 0 ;
  double residual_norm = 0 ;

  PerformanceData perf_data ;

  //------------------------------------
  // Sparse linear system types:

  typedef Kokkos::View< scalar_type* , execution_space >   vector_type ;
  typedef Kokkos::CrsMatrix< scalar_type , execution_space >     matrix_type ;
  typedef typename matrix_type::graph_type         matrix_graph_type ;
  typedef typename matrix_type::coefficients_type  matrix_coefficients_type ;

  typedef GraphFactory< matrix_graph_type , mesh_type > graph_factory ;

  //------------------------------------
  // Problem setup types:

  typedef ElementComputation< scalar_type , scalar_type , execution_space > ElementFunctor ;
  typedef DirichletBoundary< scalar_type , scalar_type , execution_space > BoundaryFunctor ;

  typedef typename ElementFunctor::elem_matrices_type elem_matrices_type ;
  typedef typename ElementFunctor::elem_vectors_type  elem_vectors_type ;

  typedef GatherFill< matrix_type ,
                      mesh_type ,
                      elem_matrices_type ,
                      elem_vectors_type > GatherFillFunctor ;

  //------------------------------------

  const scalar_type elem_coeff_K = 2 ;
  const scalar_type elem_load_Q  = 1 ;

  matrix_type linsys_matrix ;
  vector_type linsys_rhs ;
  vector_type linsys_solution ;

  typename graph_factory::element_map_type element_map ;

  Kokkos::Impl::Timer wall_clock ;

  //------------------------------------
  // Generate sparse matrix graph and element->graph map.

  graph_factory::create( mesh , linsys_matrix.graph , element_map );

  execution_space::fence();
  perf_data.graph_time = comm::max( machine , wall_clock.seconds() );

  //------------------------------------
  // Allocate linear system coefficients and rhs:

  const size_t local_owned_length =
    linsys_matrix.graph.row_map.dimension_0() - 1 ;

  linsys_matrix.coefficients =
    matrix_coefficients_type( "coeff" , linsys_matrix.graph.entries.dimension_0() );

  linsys_rhs      = vector_type( "rhs" , local_owned_length );
  linsys_solution = vector_type( "solution" , local_owned_length );

  //------------------------------------
  // Fill linear system
  {
    elem_matrices_type elem_matrices ;
    elem_vectors_type  elem_vectors ;

    if ( element_count ) {
      elem_matrices = elem_matrices_type( std::string("elem_matrices"), element_count );
      elem_vectors  = elem_vectors_type ( std::string("elem_vectors"), element_count );
    }

    //------------------------------------
    // Compute element matrices and vectors:

    wall_clock.reset();

    ElementFunctor::apply( mesh ,
                           elem_matrices , elem_vectors ,
                           elem_coeff_K , elem_load_Q );

    execution_space::fence();
    perf_data.elem_time = comm::max( machine , wall_clock.seconds() );

    //------------------------------------
    // Fill linear system coefficients:

    wall_clock.reset();

    GatherFillFunctor::apply( linsys_matrix , linsys_rhs ,
               mesh , element_map , elem_matrices , elem_vectors );

    execution_space::fence();
    perf_data.matrix_gather_fill_time = comm::max( machine , wall_clock.seconds() );

    // Apply boundary conditions:

    wall_clock.reset();

    BoundaryFunctor::apply( linsys_matrix , linsys_rhs , mesh ,
                            0 , global_max_z , 0 , global_max_z );

    execution_space::fence();
    perf_data.matrix_boundary_condition_time = comm::max( machine , wall_clock.seconds() );
  }

  //------------------------------------
  // Solve linear system

  cgsolve( mesh.parallel_data_map ,
           linsys_matrix , linsys_rhs , linsys_solution ,
           iteration_count , residual_norm ,
           perf_data.cg_iteration_time ,
           iteration_limit , residual_tolerance );

  //------------------------------------

  if ( print_sample ) {

    typename mesh_type::node_coords_type::HostMirror coords_h =
      Kokkos::create_mirror( mesh.node_coords );

    typename vector_type::HostMirror X_h =
      Kokkos::create_mirror( linsys_solution );

    Kokkos::deep_copy( coords_h , mesh.node_coords );
    Kokkos::deep_copy( X_h , linsys_solution );

    for ( size_t i = 0 ; i < mesh.parallel_data_map.count_owned ; ++i ) {
      const coordinate_scalar_type x = coords_h(i,0);
      const coordinate_scalar_type y = coords_h(i,1);
      const coordinate_scalar_type z = coords_h(i,2);

      if ( x <= 0 && y <= 0 ) {
        std::cout << "  node( " << x << " " << y << " " << z << " ) = "
                  << X_h(i) << std::endl ;
      }
    }
  }

  return perf_data ;
}
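
The cgsolve() call above runs a conjugate-gradient iteration bounded by iteration_limit and residual_tolerance. For reference, here is a minimal serial sketch of that loop, assuming a generic matvec callback; cg_solve_sketch and matvec are illustrative names, not the repository's cgsolve, which additionally distributes the vectors across MPI ranks via mesh.parallel_data_map.

#include <cmath>
#include <cstddef>
#include <functional>
#include <vector>

// Minimal serial conjugate-gradient loop with the same stopping logic as the
// cgsolve() calls in these examples (iteration limit + residual tolerance).
// Generic illustration only; matvec applies the sparse matrix to a vector.
inline void cg_solve_sketch(
    const std::function<void(const std::vector<double>&, std::vector<double>&)>& matvec ,
    const std::vector<double>& b ,
    std::vector<double>& x ,
    size_t& iteration_count ,
    double& residual_norm ,
    const size_t iteration_limit = 200 ,
    const double tolerance = 1e-14 )
{
  const size_t n = b.size();
  std::vector<double> r(n), p(n), Ap(n, 0.0);

  auto dot = []( const std::vector<double>& u , const std::vector<double>& v ) {
    double s = 0 ; for ( size_t i = 0 ; i < u.size() ; ++i ) s += u[i] * v[i] ; return s ;
  };

  matvec( x , Ap );                                          // r = b - A * x ; p = r
  for ( size_t i = 0 ; i < n ; ++i ) { r[i] = b[i] - Ap[i] ; p[i] = r[i] ; }
  double rr = dot( r , r );
  residual_norm = std::sqrt( rr );

  for ( iteration_count = 0 ;
        iteration_count < iteration_limit && tolerance < residual_norm ;
        ++iteration_count ) {
    matvec( p , Ap );
    const double alpha = rr / dot( p , Ap );
    for ( size_t i = 0 ; i < n ; ++i ) { x[i] += alpha * p[i] ; r[i] -= alpha * Ap[i] ; }
    const double rr_new = dot( r , r );
    const double beta = rr_new / rr ;
    for ( size_t i = 0 ; i < n ; ++i ) { p[i] = r[i] + beta * p[i] ; }
    rr = rr_new ;
    residual_norm = std::sqrt( rr );
  }
}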
Example #2
PerfCGSolve test_cgsolve_array( comm::Machine machine ,
                                const int nGrid ,
                                const int iterMax ,
                                const char * const /* verify_label */ )
{
  typedef Kokkos::Array<Scalar,N> value_type ;

  typedef Kokkos::CrsArray<int,Device,void,int>      crsarray_type ;
  typedef Kokkos::CrsMatrix<value_type,Device>         matrix_type ;
  typedef Kokkos::View<value_type*,Kokkos::LayoutRight,Device> vector_type ;

  //------------------------------
  // Generate FEM graph:

  std::vector< std::vector<size_t> > fem_graph ;

  const unsigned fem_length = nGrid * nGrid * nGrid ;

  Test::generate_fem_graph( nGrid , fem_graph );

  //------------------------------

  vector_type x = vector_type( "x" , fem_length );
  vector_type y = vector_type( "y" , fem_length );

  typename vector_type::HostMirror hx        = Kokkos::create_mirror( x );
  typename vector_type::HostMirror hy_result = Kokkos::create_mirror( y );

  for ( unsigned i = 0 ; i < fem_length ; ++i ) {
    for ( unsigned j = 0 ; j < N ; ++j ) {
      hx(i)[j] = Test::generate_vector_coefficient( fem_length , N , i , j );
    }
  }

  Kokkos::deep_copy( x , hx );

  //------------------------------

  matrix_type matrix ;

  matrix.graph = Kokkos::create_crsarray<crsarray_type>( std::string("testing") , fem_graph );

  const unsigned fem_graph_length = matrix.graph.entries.dimension_0();

  matrix.values = vector_type( "matrix" , fem_graph_length );

  {
    typename vector_type::HostMirror hM =
      Kokkos::create_mirror( matrix.values );

    for ( size_t iRow = 0 , iEntry = 0 ; iRow < fem_length ; ++iRow ) {

      for ( unsigned k = 0 ; k < N ; ++k ) { hy_result(iRow)[k] = 0 ; }

      for ( size_t iRowEntry = 0 ; iRowEntry < fem_graph[ iRow ].size() ; ++iRowEntry , ++iEntry ) {

        const size_t iCol = fem_graph[ iRow ][ iRowEntry ];

        for ( unsigned k = 0 ; k < N ; ++k ) {
          hM(iEntry)[k] = Test::generate_matrix_coefficient( fem_length , N , iRow, iCol, k );
          hy_result(iRow)[k] += hM(iEntry)[k] * hx(iCol)[k];
        }
      }
    }

    Kokkos::deep_copy( matrix.values , hM );
  }

  size_t iter_count = 0 ;
  double iter_time  = 0 ;
  double norm_resid = 0 ;

  cgsolve( matrix , x , y , iter_count , norm_resid , iter_time , iterMax , 1e-14 );

  PerfCGSolve perf ;

  perf.seconds_per_iter = iter_time ;
  perf.row_count   = fem_length ;
  perf.entry_count = fem_graph_length ;

  return perf ;
}
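
Example #2 flattens the vector-of-vectors fem_graph into compressed-row storage on the device and verifies y = A * x against the hand-computed hy_result. A plain C++ sketch of that row-offsets-plus-entries layout and the reference multiply, using scalar doubles instead of the Kokkos::Array<Scalar,N> value_type to keep the layout visible; crs_from_graph and crs_spmv are illustrative names, not part of this test.

#include <cstddef>
#include <vector>

// Compressed-row storage built from a vector-of-vectors graph, plus a reference
// y = A * x loop mirroring the hand-rolled hy_result computation in Example #2.
struct CrsSketch {
  std::vector<size_t> row_map ;   // length nrow+1 ; row r owns entries [row_map[r], row_map[r+1])
  std::vector<size_t> entries ;   // flat column indices
  std::vector<double> values ;    // flat coefficients, same length as entries
};

inline CrsSketch crs_from_graph( const std::vector< std::vector<size_t> >& graph )
{
  CrsSketch A ;
  A.row_map.push_back( 0 );
  for ( const std::vector<size_t>& row : graph ) {
    A.entries.insert( A.entries.end() , row.begin() , row.end() );
    A.row_map.push_back( A.entries.size() );
  }
  A.values.assign( A.entries.size() , 0.0 );
  return A ;
}

inline void crs_spmv( const CrsSketch& A ,
                      const std::vector<double>& x ,
                      std::vector<double>& y )
{
  for ( size_t row = 0 ; row + 1 < A.row_map.size() ; ++row ) {
    double sum = 0.0 ;
    for ( size_t k = A.row_map[row] ; k < A.row_map[row+1] ; ++k ) {
      sum += A.values[k] * x[ A.entries[k] ];
    }
    y[row] = sum ;
  }
}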
Example #3
PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
                     const int global_max_x ,
                     const int global_max_y ,
                     const int global_max_z ,
                     const bool print_error )
{
    typedef Scalar                              scalar_type ;
    typedef FixtureType                         fixture_type ;
    typedef typename fixture_type::device_type  device_type;
    typedef typename device_type::size_type     size_type ;

    typedef typename fixture_type::FEMeshType mesh_type ;
    typedef typename fixture_type::coordinate_scalar_type coordinate_scalar_type ;

    enum { ElementNodeCount = fixture_type::element_node_count };

    const comm::Machine machine = mesh.parallel_data_map.machine ;

    const size_t element_count = mesh.elem_node_ids.dimension(0);

    //------------------------------------
    // The amount of nonlinearity is proportional to the ratio
    // between T(zmax) and T(zmin).  The manufactured solution
    // requires 0 < T(zmin) and 0 < T(zmax).

    const ManufacturedSolution
    exact_solution( /* zmin */ 0 ,
                               /* zmax */ global_max_z ,
                               /* T(zmin) */ 1 ,
                               /* T(zmax) */ 20 );

    //-----------------------------------
    // Convergence Criteria and perf data:

    const size_t cg_iteration_limit = 200 ;
    const double cg_tolerance = 1e-14 ;

    const size_t newton_iteration_limit = 150 ;
    const double newton_tolerance = 1e-14 ;

    size_t cg_iteration_count_total = 0 ;
    double cg_iteration_time = 0 ;

    size_t newton_iteration_count = 0 ;
    double residual_norm_init = 0 ;
    double residual_norm = 0 ;

    PerformanceData perf_data ;

    //------------------------------------
    // Sparse linear system types:

    typedef KokkosArray::View< Scalar[] , device_type >     vector_type ;
    typedef KokkosArray::CrsMatrix< Scalar , device_type >  matrix_type ;
    typedef typename matrix_type::graph_type                matrix_graph_type ;
    typedef typename matrix_type::coefficients_type         matrix_coefficients_type ;

    typedef GraphFactory< matrix_graph_type , mesh_type > graph_factory ;

    //------------------------------------
    // Problem setup types:

    typedef ElementComputation < mesh_type , Scalar > ElementFunctor ;
    typedef DirichletSolution  < mesh_type , Scalar > DirichletSolutionFunctor ;
    typedef DirichletResidual  < mesh_type , Scalar > DirichletResidualFunctor ;

    typedef typename ElementFunctor::elem_matrices_type elem_matrices_type ;
    typedef typename ElementFunctor::elem_vectors_type  elem_vectors_type ;

    typedef GatherFill< matrix_type ,
            mesh_type ,
            elem_matrices_type ,
            elem_vectors_type > GatherFillFunctor ;

    //------------------------------------

    matrix_type jacobian ;
    vector_type residual ;
    vector_type delta ;
    vector_type nodal_solution ;

    typename graph_factory::element_map_type element_map ;

    //------------------------------------
    // Generate mesh and corresponding sparse matrix graph

    KokkosArray::Impl::Timer wall_clock ;

    //------------------------------------
    // Generate sparse matrix graph and element->graph map.

    wall_clock.reset();

    graph_factory::create( mesh , jacobian.graph , element_map );

    device_type::fence();

    perf_data.graph_time = comm::max( machine , wall_clock.seconds() );

    //------------------------------------
    // Allocate linear system coefficients and rhs:

    const size_t local_owned_length = jacobian.graph.row_map.dimension(0) - 1 ;
    const size_t local_total_length = mesh.node_coords.dimension(0);

    jacobian.coefficients =
        matrix_coefficients_type( "jacobian_coeff" , jacobian.graph.entries.dimension(0) );

    // Nonlinear residual for owned nodes:
    residual = vector_type( "residual" , local_owned_length );

    // Nonlinear solution for owned and ghosted nodes:
    nodal_solution = vector_type( "solution" , local_total_length );

    // Nonlinear solution update for owned nodes:
    delta = vector_type( "delta" , local_owned_length );

    //------------------------------------
    // Allocation of arrays to fill the linear system

    elem_matrices_type elem_matrices ; // Jacobian matrices
    elem_vectors_type  elem_vectors ;  // Residual vectors

    if ( element_count ) {
        elem_matrices = elem_matrices_type( std::string("elem_matrices"), element_count );
        elem_vectors = elem_vectors_type( std::string("elem_vectors"), element_count );
    }

    //------------------------------------
    // For boundary condition set the correct values in the solution vector
    //   The 'zmin' face is assigned to 'T_zmin'.
    //   The 'zmax' face is assigned to 'T_zmax'.
    //   The resulting solution is one dimensional along the 'Z' axis.

    DirichletSolutionFunctor::apply( nodal_solution , mesh ,
                                     exact_solution.zmin ,
                                     exact_solution.zmax ,
                                     exact_solution.T_zmin ,
                                     exact_solution.T_zmax );

    for(;;) { // Nonlinear loop

#if defined( HAVE_MPI )

        {   //------------------------------------
            // Import off-processor nodal solution values
            // for residual and jacobian computations

            KokkosArray::AsyncExchange< typename vector_type::value_type , device_type ,
                        KokkosArray::ParallelDataMap >
                        exchange( mesh.parallel_data_map , 1 );

            KokkosArray::PackArray< vector_type >
            ::pack( exchange.buffer() ,
                    mesh.parallel_data_map.count_interior ,
                    mesh.parallel_data_map.count_send ,
                    nodal_solution );

            exchange.setup();

            exchange.send_receive();

            KokkosArray::UnpackArray< vector_type >
            ::unpack( nodal_solution , exchange.buffer() ,
                      mesh.parallel_data_map.count_owned ,
                      mesh.parallel_data_map.count_receive );
        }

#endif

        //------------------------------------
        // Compute element matrices and vectors:

        wall_clock.reset();

        ElementFunctor( mesh ,
                        elem_matrices ,
                        elem_vectors ,
                        nodal_solution ,
                        exact_solution.K );

        device_type::fence();
        perf_data.elem_time += comm::max( machine , wall_clock.seconds() );

        //------------------------------------
        // Fill linear system coefficients:

        wall_clock.reset();

        fill( 0 , jacobian.coefficients );
        fill( 0 , residual );

        GatherFillFunctor::apply( jacobian ,
                                  residual ,
                                  mesh ,
                                  element_map ,
                                  elem_matrices ,
                                  elem_vectors );

        device_type::fence();
        perf_data.matrix_gather_fill_time += comm::max( machine , wall_clock.seconds() );

        // Apply boundary conditions:

        wall_clock.reset();

        // Set boundary-condition rows of the jacobian to 1 on the diagonal and
        // 0 elsewhere, and zero the corresponding residual entries, since the
        // solution vector already holds the prescribed boundary values.
        DirichletResidualFunctor::apply( jacobian, residual, mesh ,
                                         exact_solution.zmin ,
                                         exact_solution.zmax );

        device_type::fence();
        perf_data.matrix_boundary_condition_time +=
            comm::max( machine , wall_clock.seconds() );

        //------------------------------------
        // Has the residual converged?

        residual_norm = sqrt( dot(mesh.parallel_data_map, residual) );

        if ( 0 == newton_iteration_count ) {
            residual_norm_init = residual_norm ;
        }

        if ( residual_norm / residual_norm_init < newton_tolerance ) {
            break ;
        }

        //------------------------------------
        // Solve linear system

        size_t cg_iteration_count = 0 ;
        double cg_residual_norm = 0 ;

        cgsolve( mesh.parallel_data_map ,
                 jacobian , residual , delta ,
                 cg_iteration_count ,
                 cg_residual_norm ,
                 cg_iteration_time ,
                 cg_iteration_limit , cg_tolerance ) ;

        perf_data.cg_iteration_time += cg_iteration_time ;
        cg_iteration_count_total += cg_iteration_count ;

        // Update the nonlinear solution with delta.
        // delta is the negative update: delta = [Jacobian]^{-1} * Residual = -Dx
        // LaTeX:
        // \vec{x}_{n+1} = \vec{x}_{n} - ( - \Delta \vec{x}_{n} )
        // text:
        // x[n+1] = x[n] + Dx

        waxpby( mesh.parallel_data_map,
                1.0, nodal_solution,
                -1.0, delta, nodal_solution);

        ++newton_iteration_count ;

        if ( newton_iteration_limit < newton_iteration_count ) {
            break ;
        }
    };

    if ( newton_iteration_count ) {
        perf_data.elem_time /= newton_iteration_count ;
        perf_data.matrix_gather_fill_time /= newton_iteration_count ;
        perf_data.matrix_boundary_condition_time /= newton_iteration_count ;
    }

    if ( cg_iteration_count_total ) {
        perf_data.cg_iteration_time /= cg_iteration_count_total ;
    }

    perf_data.newton_iteration_count = newton_iteration_count ;
    perf_data.cg_iteration_count = cg_iteration_count_total ;

    //------------------------------------

    {
        // For extracting the nodal solution and its coordinates:

        typename mesh_type::node_coords_type::HostMirror node_coords_host =
            KokkosArray::create_mirror( mesh.node_coords );

        typename vector_type::HostMirror nodal_solution_host =
            KokkosArray::create_mirror( nodal_solution );

        KokkosArray::deep_copy( node_coords_host , mesh.node_coords );
        KokkosArray::deep_copy( nodal_solution_host , nodal_solution );

        double tmp = 0 ;

        for ( size_t i = 0 ; i < mesh.parallel_data_map.count_owned ; ++i ) {
            const coordinate_scalar_type x = node_coords_host(i,0);
            const coordinate_scalar_type y = node_coords_host(i,1);
            const coordinate_scalar_type z = node_coords_host(i,2);

            const double Tx = exact_solution(z);
            const double Ts = nodal_solution_host(i);
            const double Te = std::abs( Tx - Ts ) / std::abs( Tx );

            tmp = std::max( tmp , Te );

            if ( print_error && 0.02 < Te ) {
                std::cout << "  node( " << x << " " << y << " " << z << " ) = "
                          << Ts << " != exact_solution " << Tx
                          << std::endl ;
            }
        }
        perf_data.error_max = comm::max( machine , tmp );
    }

    return perf_data ;
}
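
Stripped of the FEM assembly, the MPI exchange, and the CG solve, the nonlinear driver above is a plain Newton iteration: evaluate the residual, stop once it has shrunk relative to the initial residual norm or the iteration limit is reached, otherwise solve for delta and subtract it. A scalar sketch of the same control flow; newton_sketch is an illustrative name, not part of the example.

#include <cmath>
#include <cstddef>
#include <functional>

// Scalar Newton iteration with the same stopping logic as the nonlinear loop in
// Example #3: relative residual tolerance plus an iteration limit, and the
// update x[n+1] = x[n] - delta where J(x[n]) * delta = f(x[n]).
inline double newton_sketch( const std::function<double(double)>& f ,
                             const std::function<double(double)>& df ,
                             double x ,
                             const size_t iteration_limit = 150 ,
                             const double tolerance = 1e-14 )
{
  double residual_norm_init = 0 ;

  for ( size_t newton_iteration_count = 0 ; ; ++newton_iteration_count ) {

    const double residual      = f( x );
    const double residual_norm = std::fabs( residual );

    if ( 0 == newton_iteration_count ) { residual_norm_init = residual_norm ; }

    if ( residual_norm <= tolerance * residual_norm_init ) break ;   // converged

    const double delta = residual / df( x );   // the "linear solve" step

    x -= delta ;                               // x[n+1] = x[n] - delta

    if ( iteration_limit < newton_iteration_count + 1 ) break ;
  }
  return x ;
}

// Usage: newton_sketch( [](double t){ return t*t - 2.0; },
//                       [](double t){ return 2.0*t; }, 1.0 )
// converges to sqrt(2).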
Example #4
int main( int argc , char ** argv )
{
  const int ghost = 1 ;
  const int max_cube = 20 ;
  int ncube[20] = { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
                    0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };

  FILE * print_file = stdout ;
  int print_iter = 500 ;
  int max_iter = 50 ;

  VECTOR_SCALAR tolerance = 0.0 ; /* Force max iterations */

  int gbox[3][2] = { { 0 , 16 } , { 0 , 16 } , { 0 , 16 } };
  int nt = 0 ;
  int trials = 5 ;
  int ntest ;
  int np = 1;
  int my_p = 0 ;

#ifdef HAVE_MPI
  MPI_Init( & argc , & argv );
  MPI_Comm_size( MPI_COMM_WORLD , & np );
  MPI_Comm_rank( MPI_COMM_WORLD , & my_p );
#endif

  if ( ! my_p ) {
    const char arg_threads[] = "threads=" ;
    const char arg_cube[] = "cube=" ;
    const char arg_box[] = "box=" ;
    const char arg_max[] = "max_iter=" ;
    const char arg_trials[] = "trials=" ;
    const char arg_print[] = "print_iter=" ;
    const char arg_file[] = "print_file=" ;
    int i ;
    for ( i = 1 ; i < argc ; ++i ) {
      if ( ! strncmp(argv[i],arg_threads,strlen(arg_threads)) ) {
        sscanf(argv[i]+strlen(arg_threads),"%d",&nt);
      }
      else if ( ! strncmp(argv[i],arg_box,strlen(arg_box)) ) {
        sscanf(argv[i]+strlen(arg_box),"%d%*[x]%d%*[x]%d",
               & gbox[0][1] , & gbox[1][1] , & gbox[2][1] );
      }
      else if ( ! strncmp(argv[i],arg_cube,strlen(arg_cube)) ) {
        sscanf(argv[i]+strlen(arg_cube),
               "%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d",
               ncube+0, ncube+1, ncube+2, ncube+3, ncube+4,
               ncube+5, ncube+6, ncube+7, ncube+8, ncube+9,
               ncube+10, ncube+11, ncube+12, ncube+13, ncube+14,
               ncube+15, ncube+16, ncube+17, ncube+18, ncube+19);
      }
      else if ( ! strncmp(argv[i],arg_max,strlen(arg_max)) ) {
        sscanf(argv[i]+strlen(arg_max),"%d",&max_iter);
      }
      else if ( ! strncmp(argv[i],arg_trials,strlen(arg_trials)) ) {
        sscanf(argv[i]+strlen(arg_trials),"%d",&trials);
      }
      else if ( ! strncmp(argv[i],arg_print,strlen(arg_print)) ) {
        sscanf(argv[i]+strlen(arg_print),"%d",&print_iter);
      }
      else if ( ! strncmp(argv[i],arg_file,strlen(arg_file)) ) {
        char buffer[256] ;
        sscanf(argv[i]+strlen(arg_file),"%s",buffer);
        print_file = fopen(buffer,"a");
      }
    }
  }

#ifdef HAVE_MPI
  {
    MPI_Bcast( & nt , 1 , MPI_INT , 0 , MPI_COMM_WORLD );
    MPI_Bcast( & gbox[0][0] , 6 , MPI_INT , 0 , MPI_COMM_WORLD );
    MPI_Bcast( ncube , max_cube , MPI_INT , 0 , MPI_COMM_WORLD );
    MPI_Bcast( & max_iter , 1 , MPI_INT , 0 , MPI_COMM_WORLD );
    MPI_Bcast( & print_iter , 1 , MPI_INT , 0 , MPI_COMM_WORLD );
    MPI_Bcast( & trials , 1 , MPI_INT , 0 , MPI_COMM_WORLD );
  }
#endif

  if ( nt ) {
    TPI_Init( nt );
    TPI_Block();
    TPI_Unblock();
  }

  if ( ! my_p ) {
    fprintf(print_file,"\"PROC\" , \"THREAD\" , \"EQUATION\" , \"NON-ZERO\" , \"MXV\"    , \"AXPBY\"  , \"DOT\" , \"Xerror\" , \"Iter\"\n");
    fprintf(print_file,"\"COUNT\" , \"COUNT\"  , \"COUNT\"    , \"COUNT\"    , \"Mflops\" , \"Mflops\" , \"Mflops\" , \"L2norm\" , \"COUNT\"\n");
  }

  for ( ntest = 0 ; ! ntest || ( ntest < max_cube && ncube[ntest] ) ; ++ntest ) {
    struct cgsolve_data cgdata ;

    if ( ncube[ntest] ) {
      gbox[0][1] = gbox[1][1] = gbox[2][1] = ncube[ntest] ;
    }

    hpccg_alloc_and_fill( np, my_p, (const int (*)[2]) gbox, ghost, &cgdata);

    cgdata.max_iter   = max_iter ;
    cgdata.print_iter = print_iter ;
    cgdata.tolerance  = tolerance ;

    {
      double dt_mxv[2] = { 0 , 0 };
      double dt_axpby[2] = { 0 , 0 };
      double dt_dot[2] = { 0 , 0 };
      VECTOR_SCALAR norm_resid = 0.0 ;
      int iter_count = 0 ;
      int iter_total = 0 ;
      int k ;

      VECTOR_SCALAR * const b      = (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * cgdata.nRow );
      VECTOR_SCALAR * const x      = (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * cgdata.nRow );
      VECTOR_SCALAR * const xexact = (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * cgdata.nRow );

      {
        const VECTOR_SCALAR value = 1.0 /* 1.0 / 3.0 */ ;
        int i ;
        for ( i = 0 ; i < cgdata.nRow ; ++i ) xexact[i] = value ;
      }

      for ( k = 0 ; k < trials ; ++k ) {
        int i ;

        for ( i = 0 ; i < cgdata.nRow ; ++i ) { x[i] = 0.0 ; }

        cgsolve_set_lhs( & cgdata , xexact , b );

        cgsolve( & cgdata, b, x,
                 & iter_count, & norm_resid,
                 dt_mxv , dt_axpby , dt_dot );

        iter_total += iter_count ;
      }

      {
        int nnzGlobal = cgdata.A_pc[ cgdata.nRow ];
        double error[2] = { 0 , 0 };

        for ( k = 0 ; k < cgdata.nRow ; ++k ) {
          error[0] += ( x[k] - xexact[k] ) * ( x[k] - xexact[k] );
          error[1] += xexact[k] * xexact[k] ;
        }

#ifdef HAVE_MPI
        {
          double error_global[2] = { 0.0 , 0.0 };
          int nnz = nnzGlobal ;

          MPI_Allreduce( & nnz , & nnzGlobal , 1 , MPI_INT , MPI_SUM ,
                         MPI_COMM_WORLD );

          MPI_Allreduce( error , error_global , 2 , MPI_DOUBLE , MPI_SUM ,
                         MPI_COMM_WORLD );

          error[0] = error_global[0];
          error[1] = error_global[1];
        }
#endif

        error[0] = sqrt( error[0] );
        error[1] = sqrt( error[1] );

        if ( ! my_p ) {
          const int nRowGlobal = ( gbox[0][1] - gbox[0][0] ) *
                                 ( gbox[1][1] - gbox[1][0] ) *
                                 ( gbox[2][1] - gbox[2][0] );

          const double mflop_mxv =
             1.0e-6 * ( iter_total ) * 2 * nnzGlobal / dt_mxv[0] ;

          const double mflop_axpby =
             1.0e-6 * ( iter_total * 3 ) * 3 * nRowGlobal / dt_axpby[0] ;

          const double mflop_dot =
             1.0e-6 * ( iter_total * 2 ) * 2 * nRowGlobal / dt_dot[0] ;

          fprintf(print_file,"%8d , %8d , %8d , %8d , %10g , %10g , %10g , %g , %d\n",
                  np , nt , nRowGlobal , nnzGlobal ,
                  mflop_mxv , mflop_axpby , mflop_dot ,
                  error[0] / error[1] , iter_total );
          fflush(print_file);
        }
      }

      free( xexact );
      free( x );
      free( b );
    }
    free( cgdata.A_a );
    free( cgdata.A_ia );
    free( cgdata.A_pc );
    free( cgdata.recv_pc );
    free( cgdata.send_pc );
    free( cgdata.send_id );
  }

  if ( nt ) { TPI_Finalize(); }

#ifdef HAVE_MPI
  MPI_Finalize();
#endif

  return 0 ;
}
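
The MFLOP columns above follow from standard flop counts: about 2 flops per stored nonzero for the matrix-vector product, 3 flops per row for each AXPBY, and 2 flops per row for each dot product, with one matvec, three AXPBYs, and two dot products per CG iteration, which is where the factors in the mflop_* expressions come from. A small C++ helper showing that arithmetic; mflop_rate and report_cg_rates are illustrative names, not the benchmark's code.

#include <cstdio>

// Flop bookkeeping behind the MFLOP columns printed above (a sketch):
// per CG iteration, one matrix-vector product at ~2 flops per stored nonzero,
// three AXPBY updates at 3 flops per row each, two dot products at 2 flops per row each.
inline double mflop_rate( const double flops , const double seconds )
{
  return 1.0e-6 * flops / seconds ;
}

inline void report_cg_rates( const int iterations ,
                             const double nnz_global , const double nrow_global ,
                             const double dt_mxv , const double dt_axpby , const double dt_dot )
{
  const double mflop_mxv   = mflop_rate( iterations * 2.0 * nnz_global        , dt_mxv );
  const double mflop_axpby = mflop_rate( iterations * 3.0 * 3.0 * nrow_global , dt_axpby );
  const double mflop_dot   = mflop_rate( iterations * 2.0 * 2.0 * nrow_global , dt_dot );

  std::printf( "MXV %g , AXPBY %g , DOT %g  (MFLOP/s)\n" ,
               mflop_mxv , mflop_axpby , mflop_dot );
}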
Example #5
int main( int argc, char* argv[] ) {
	int writeOutX = 0;
	int n, k;
	int maxiterations = 1000;
	int niters=0;
	double norm = 0.0;
	double* b = NULL;
	double* x = NULL;
	double t1, t2;
	
	MPI_Init( &argc, &argv );
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &size);
	
	// Read command line args.
	// 1st case runs the model problem; 2nd case lets you supply your own b vector.
	if ( argc == 3 ) {
		k = atoi( argv[1] );
		n = k*k;
		// each processor calls cs240_getB to build its own part of the b vector!
	} else if ( argc == 4 && !strcmp( argv[1], "-i" ) ) {	// check argc before touching argv[1]
		b = load_vec( argv[2], &k );
	} else {
		printf( "\nCGSOLVE Usage: \n\t"
			"Model Problem:\tmpirun -np [number_procs] cgsolve [k] [output_1=y_0=n]\n\t"
			"Custom Input:\tmpirun -np [number_procs] cgsolve -i [input_filename] [output_1=y_0=n]\n\n");
		exit(0);
	}
	writeOutX = atoi( argv[argc-1] ); // Write X to file if true, do not write if unspecified.

	
	// Start Timer
	t1 = MPI_Wtime();
	
	// CG Solve here!
	x = cgsolve(k);
 	// End Timer
	t2 = MPI_Wtime();
	
	printf("TEST: %s\n", cs240_verify(x, k, 0.0) ? "PASSED" : "FAILED");

	if ( writeOutX ) {
		save_vec( k, x );
	}
		
	// Output
	printf( "Problem size (k): %d\n", k );
	if ( niters > 0 ) {
		printf( "Norm of the residual after %d iterations: %lf\n", niters, norm );
	}
	printf( "Elapsed time during CGSOLVE: %lf\n", t2 - t1 );

	// Deallocate: x is returned by cgsolve; b is only allocated by load_vec.
	free( x );
	if ( b ) {
		free( b );
	}
	
	MPI_Finalize();
	
	return 0;
}
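
Example #5 measures the solve with MPI_Wtime on each rank and prints from every rank. A minimal sketch of the same timing pattern with a max-over-ranks reduction added, which is the role the comm::max( machine , wall_clock.seconds() ) calls play in Examples #1 to #3; the timed region is a placeholder for the solver call.

#include <mpi.h>
#include <cstdio>

// Time a region with MPI_Wtime and report the slowest rank's elapsed time.
int main( int argc , char ** argv )
{
  MPI_Init( &argc , &argv );

  int rank = 0 ;
  MPI_Comm_rank( MPI_COMM_WORLD , &rank );

  const double t1 = MPI_Wtime();

  /* ... solver call would go here ... */

  const double t_local = MPI_Wtime() - t1 ;

  double t_max = 0 ;
  MPI_Allreduce( &t_local , &t_max , 1 , MPI_DOUBLE , MPI_MAX , MPI_COMM_WORLD );

  if ( 0 == rank ) { std::printf( "Elapsed (max over ranks): %lf\n" , t_max ); }

  MPI_Finalize();
  return 0 ;
}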