Multiply( const matrix_type & A ,
            const size_type nrow ,
            const size_type ncol ,
            const vector_type & x ,
            const vector_type & y )
  {
    CudaSparseSingleton & s = CudaSparseSingleton::singleton();

    // Compute y = alpha * A * x + beta * y ; with alpha = 1 and beta = 0
    // this is simply y = A * x .
    const scalar_type alpha = 1 , beta = 0 ;

    cusparseStatus_t status =
      cusparseScsrmv( s.handle ,
                      CUSPARSE_OPERATION_NON_TRANSPOSE ,
                      nrow , ncol , A.coefficients.dimension_0() ,
                      &alpha ,
                      s.descra ,
                      A.coefficients.ptr_on_device() ,
                      A.graph.row_map.ptr_on_device() ,
                      A.graph.entries.ptr_on_device() ,
                      x.ptr_on_device() ,
                      &beta ,
                      y.ptr_on_device() );

    if ( CUSPARSE_STATUS_SUCCESS != status ) {
      throw std::runtime_error( std::string("ERROR - cusparseScsrmv " ) );
    }
  }
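
  // A minimal sketch, written here for illustration only, of the cuSPARSE
  // state that CudaSparseSingleton is assumed to provide to the routines in
  // this listing: a library handle and a general, zero-based CSR matrix
  // descriptor. The real singleton is defined elsewhere; this struct uses a
  // different name so it cannot be mistaken for that definition.
  struct ExampleCudaSparseState {
    cusparseHandle_t   handle ;
    cusparseMatDescr_t descra ;

    ExampleCudaSparseState()
    {
      cusparseCreate( & handle );
      cusparseCreateMatDescr( & descra );
      cusparseSetMatType( descra , CUSPARSE_MATRIX_TYPE_GENERAL );
      cusparseSetMatIndexBase( descra , CUSPARSE_INDEX_BASE_ZERO );
    }

    ~ExampleCudaSparseState()
    {
      cusparseDestroyMatDescr( descra );
      cusparseDestroy( handle );
    }
  };
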
  static void apply( const matrix_type & A ,
                     const vector_type & x ,
                     const vector_type & y )
  {
    CudaSparseSingleton & s = CudaSparseSingleton::singleton();
    const double alpha = 1 , beta = 0 ;

    // CSR dimensions: row_map holds ( number of rows + 1 ) offsets and
    // entries holds one column index per stored nonzero.
    const int n  = A.graph.row_map.dimension_0() - 1 ;
    const int nz = A.graph.entries.dimension_0();

    cusparseStatus_t status =
      cusparseDcsrmv( s.handle ,
                      CUSPARSE_OPERATION_NON_TRANSPOSE ,
                      n , n , nz ,
                      &alpha ,
                      s.descra ,
                      A.values.ptr_on_device() ,
                      A.graph.row_map.ptr_on_device() ,
                      A.graph.entries.ptr_on_device() ,
                      x.ptr_on_device() ,
                      &beta ,
                      y.ptr_on_device() );

    if ( CUSPARSE_STATUS_SUCCESS != status ) {
      throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
    }
  }
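
  // Illustration only (not part of the original interface): the CSR layout
  // that the ( row_map , entries , values/coefficients ) arguments above
  // describe, shown for a hypothetical 3x3 matrix
  //   [ 10  0  2 ]
  //   [  0  5  0 ]
  //   [  1  0  7 ]
  // with zero-based column indices to match CUSPARSE_INDEX_BASE_ZERO.
  // Here n == 3 rows and nz == 5 stored entries.
  static void csr_layout_example()
  {
    const int    row_map[4] = { 0 , 2 , 3 , 5 };      // n + 1 row offsets
    const int    entries[5] = { 0 , 2 , 1 , 0 , 2 };  // column index of each nonzero
    const double values[5]  = { 10 , 2 , 5 , 1 , 7 }; // nonzero coefficients
    (void) row_map ; (void) entries ; (void) values ;
  }
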
  void recv( const vector_type & v )
  {
    // Wait for all posted receives, verify each message's source and size,
    // then copy the received data into the tail of the vector on the device.
    const size_t recv_msg_count = m_recv_request.size();

    // Received (off-process) entries follow the locally owned entries:
    const std::pair<unsigned,unsigned> recv_range( m_map.count_owned , m_map.count_owned + m_map.count_receive );

    const vector_type vrecv = subview<vector_type>( v , recv_range );

    // Wait for receives and verify:

    for ( size_t i = 0 ; i < recv_msg_count ; ++i ) {
      MPI_Status recv_status ;
      int recv_which = 0 ;
      int recv_size  = 0 ;

      MPI_Waitany( recv_msg_count , & m_recv_request[0] , & recv_which , & recv_status );

      const int recv_proc = recv_status.MPI_SOURCE ;

      MPI_Get_count( & recv_status , MPI_BYTE , & recv_size );

      // Verify message properly received:

      const int  expected_proc = m_map.host_recv(recv_which,0);
      const int  expected_size = m_map.host_recv(recv_which,1) *
                                 m_chunk * sizeof(scalar_type);

      if ( ( expected_proc != recv_proc ) ||
           ( expected_size != recv_size ) ) {
        std::ostringstream msg ;
        msg << "MatrixMultiply communication error:"
            << " P" << comm::rank( m_map.machine )
            << " received from P" << recv_proc
            << " size "     << recv_size
            << " expected " << expected_size
            << " from P"    << expected_proc ;
        throw std::runtime_error( msg.str() );
      }
    }

    // Copy received data to device memory.

    Impl::DeepCopy<typename Device::memory_space,HostSpace>( vrecv.ptr_on_device() ,
                                                             m_host_recv_buffer.ptr_on_device() ,
                                                             m_map.count_receive * m_chunk * sizeof(scalar_type) );
  }
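
  // A minimal sketch, assuming only members already used above
  // (m_recv_request, m_host_recv_buffer, m_chunk, m_map.host_recv), of how
  // the receives that recv() waits on could have been posted. The posting
  // side is not shown in this listing; the communicator argument and the
  // zero tag are placeholders for whatever m_map.machine actually wraps.
  void post_receives_sketch( MPI_Comm mpi_comm )
  {
    const size_t recv_msg_count = m_recv_request.size();

    scalar_type * ptr = m_host_recv_buffer.ptr_on_device();

    for ( size_t i = 0 ; i < recv_msg_count ; ++i ) {
      const int proc  = m_map.host_recv(i,0);                                 // source rank
      const int bytes = m_map.host_recv(i,1) * m_chunk * sizeof(scalar_type); // message size

      MPI_Irecv( ptr , bytes , MPI_BYTE , proc , 0 /* tag: placeholder */ ,
                 mpi_comm , & m_recv_request[i] );

      ptr += m_map.host_recv(i,1) * m_chunk ; // advance past this message's entries
    }
  }
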