KOKKOS_INLINE_FUNCTION
  int
  Chol<Uplo::Upper,
       AlgoChol::ExternalLapack,Variant::One>
  ::invoke(PolicyType &policy,
           const MemberType &member,
           DenseExecViewTypeA &A) {
    // static_assert( Kokkos::Impl::is_same<
    //                typename DenseMatrixTypeA::space_type,
    //                Kokkos::Cuda
    //                >::value,
    //                "Cuda space is not available for calling external BLAS" );

    //typedef typename DenseExecViewTypeA::space_type   space_type;
    typedef typename DenseExecViewTypeA::ordinal_type ordinal_type;
    typedef typename DenseExecViewTypeA::value_type   value_type;

    int r_val = 0;      
    if (member.team_rank() == 0) {
#ifdef HAVE_SHYLUTACHO_TEUCHOS
      Teuchos::LAPACK<ordinal_type,value_type> lapack;

      lapack.POTRF('U',
                   A.NumRows(),
                   A.ValuePtr(), A.BaseObject().ColStride(),
                   &r_val);
#else
    TACHO_TEST_FOR_ABORT( true, MSG_NOT_HAVE_PACKAGE("Teuchos") );
#endif
    }

    return r_val;
  }
    void setGraph(const ordinal_type m,
                  const size_type_array rptr,
                  const ordinal_type_array cidx) {
        _is_ordered = false;
        _cblk  = 0;

        /// Scotch graph spec
        /// - no diagonals, symmetric

        _base  = 0;
        _m     = m;
        _nnz   = rptr[m];

        _rptr  = rptr;
        _cidx  = cidx;

        _perm  = ordinal_type_array("Scotch::PermutationArray", _m);
        _peri  = ordinal_type_array("Scotch::InvPermutationArray", _m);
        _range = ordinal_type_array("Scotch::RangeArray", _m);
        _tree  = ordinal_type_array("Scotch::TreeArray", _m);

        _strat = 0;
        _level = 0;

        int ierr = 0;
        ordinal_type *rptr_ptr = reinterpret_cast<ordinal_type*>(_rptr.ptr_on_device());
        ordinal_type *cidx_ptr = reinterpret_cast<ordinal_type*>(_cidx.ptr_on_device());

        ierr = SCOTCH_graphInit(&_graph);
        TACHO_TEST_FOR_ABORT(ierr, "Failed in SCOTCH_graphInit");
        ierr = SCOTCH_graphBuild(&_graph,             // scotch graph
                                 _base,               // base value
                                 _m,                  // # of vertices
                                 rptr_ptr,            // column index array pointer begin
                                 rptr_ptr+1,          // column index array pointer end
                                 NULL,                // weights on vertices (optional)
                                 NULL,                // label array on vertices (optional)
                                 _nnz,                // # of nonzeros
                                 cidx_ptr,            // column index array
                                 NULL);               // edge load array (optional)
        TACHO_TEST_FOR_ABORT(ierr, "Failed in SCOTCH_graphBuild");
        ierr = SCOTCH_graphCheck(&_graph);
        TACHO_TEST_FOR_ABORT(ierr, "Failed in SCOTCH_graphCheck");
    }
  KOKKOS_INLINE_FUNCTION
  int
  Gemm<Trans::ConjTranspose,Trans::NoTranspose,
       AlgoGemm::ExternalBlas,Variant::One>
  ::invoke(PolicyType &policy,
           MemberType &member,
           const ScalarType alpha,
           DenseExecViewTypeA &A,
           DenseExecViewTypeB &B,
           const ScalarType beta,
           DenseExecViewTypeC &C) {
    // static_assert( Kokkos::Impl::is_same<
    //                typename DenseMatrixTypeA::space_type,
    //                typename DenseMatrixTypeB::space_type
    //                >::value && 
    //                Kokkos::Impl::is_same<
    //                typename DenseMatrixTypeB::space_type,
    //                typename DenseMatrixTypeC::space_type
    //                >::value,
    //                "Space type of input matrices does not match" );
    
    if (member.team_rank() == 0) {
#if                                                     \
  defined( HAVE_SHYLUTACHO_TEUCHOS ) &&                 \
  defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
      typedef typename DenseExecViewTypeA::ordinal_type ordinal_type;
      typedef typename DenseExecViewTypeA::value_type   value_type;

      Teuchos::BLAS<ordinal_type,value_type> blas;
      
      const ordinal_type m = C.NumRows();
      const ordinal_type n = C.NumCols();
      const ordinal_type k = B.NumRows();

      if (m > 0 && n > 0 && k > 0)
        blas.GEMM(Teuchos::CONJ_TRANS, Teuchos::NO_TRANS,
                  m, n, k,
                  alpha,
                  A.ValuePtr(), A.BaseObject().ColStride(),
                  B.ValuePtr(), B.BaseObject().ColStride(),
                  beta,
                  C.ValuePtr(), C.BaseObject().ColStride());
#else
    TACHO_TEST_FOR_ABORT( true, MSG_NOT_HAVE_PACKAGE("Teuchos") );        
#endif
    } 

    return 0;
  }
  KOKKOS_INLINE_FUNCTION
  int
  Trsm<Side::Left,Uplo::Upper,Trans::NoTranspose,
       AlgoTrsm::ExternalBlas,Variant::One>
  ::invoke(PolicyType &policy,
           const MemberType &member,
           const int diagA,
           const ScalarType alpha,
           DenseExecViewTypeA &A,
           DenseExecViewTypeB &B) {
    // static_assert( Kokkos::Impl::is_same<
    //                typename DenseMatrixTypeA::space_type,
    //                Kokkos::Cuda
    //                >::value,
    //                "Cuda space is not available for calling external BLAS" );

    // static_assert( Kokkos::Impl::is_same<
    //                typename DenseMatrixTypeA::space_type,
    //                typename DenseMatrixTypeB::space_type
    //                >::value,
    //                "Space type of input matrices does not match" );

    //typedef typename DenseExecViewTypeA::space_type   space_type;
    typedef typename DenseExecViewTypeA::ordinal_type ordinal_type;
    typedef typename DenseExecViewTypeA::value_type   value_type;

    if (member.team_rank() == 0) {
#ifdef HAVE_SHYLUTACHO_TEUCHOS
      Teuchos::BLAS<ordinal_type,value_type> blas;

      const ordinal_type m = A.NumRows();
      const ordinal_type n = B.NumCols();

      blas.TRSM(Teuchos::LEFT_SIDE, Teuchos::UPPER_TRI, Teuchos::NO_TRANS,
                (diagA == Diag::Unit ? Teuchos::UNIT_DIAG : Teuchos::NON_UNIT_DIAG),
                m, n,
                alpha,
                A.ValuePtr(), A.BaseObject().ColStride(),
                B.ValuePtr(), B.BaseObject().ColStride());
#else
    TACHO_TEST_FOR_ABORT( true, MSG_NOT_HAVE_PACKAGE("Teuchos") );
#endif
    }

    return 0;
  }
  KOKKOS_INLINE_FUNCTION
  int
  Chol<Uplo::Upper,
       AlgoChol::Unblocked,Variant::One>
  ::invoke(PolicyType &policy,
           const MemberType &member,
           CrsExecViewTypeA &A) {

    typedef typename CrsExecViewTypeA::value_type    value_type;
    typedef typename CrsExecViewTypeA::ordinal_type  ordinal_type;
    typedef typename CrsExecViewTypeA::row_view_type row_view_type;

    // row_view_type r1t, r2t;

    for (ordinal_type k=0;k<A.NumRows();++k) {
      //r1t.setView(A, k);
      row_view_type &r1t = A.RowView(k);

      // extract diagonal from alpha11
      value_type &alpha = r1t.Value(0);

      if (member.team_rank() == 0) {
        // if encounter null diag or wrong index, return -(row + 1)
        TACHO_TEST_FOR_ABORT( r1t.Col(0) != k, "Chol::Unblocked:: Diagonal does not exist");        
        if (Util::real(alpha) <= 0.0) {
          // warning message
          fprintf(stderr, "   diagonal = %f, local col = %d, global col = %d\n", 
                  Util::real(alpha), k, r1t.OffsetCols() + k);
          // proceed with epsilon; for incomplete factorization, Cholesky factor may not exit
          alpha = 1.0e-8;

          //TACHO_TEST_FOR_ABORT( true, "Chol::Unblocked:: Diagonal is negative");
          //return -(k + 1);
        }

        // error handling should be more carefully designed

        // sqrt on diag
        alpha = sqrt(Util::real(alpha));
      }
      member.team_barrier();

      const ordinal_type nnz_r1t = r1t.NumNonZeros();

      if (nnz_r1t) {
        // inverse scale
        Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 1, nnz_r1t),
                             [&](const ordinal_type j) {
                               r1t.Value(j) /= alpha;
                             });

        member.team_barrier();

        // hermitian rank update
        Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 1, nnz_r1t),
                             [&](const ordinal_type i) {
                               const ordinal_type row_at_i = r1t.Col(i);
                               const value_type   val_at_i = Util::conj(r1t.Value(i));

                               //r2t.setView(A, row_at_i);
                               row_view_type &r2t = A.RowView(row_at_i);
                               ordinal_type idx = 0;

                               for (ordinal_type j=i;j<nnz_r1t && (idx > -2);++j) {
                                 const ordinal_type col_at_j = r1t.Col(j);
                                 idx = r2t.Index(col_at_j, idx);

                                 if (idx >= 0) {
                                   const value_type val_at_j = r1t.Value(j);
                                   r2t.Value(idx) -= val_at_i*val_at_j;
                                 }
                               }
                             });
      }
    }
    return 0;
  }
    void computeOrdering(const ordinal_type treecut = 0) {
        int ierr = 0;

        // pointers for global graph ordering
        ordinal_type *perm  = _perm.ptr_on_device();
        ordinal_type *peri  = _peri.ptr_on_device();
        ordinal_type *range = _range.ptr_on_device();
        ordinal_type *tree  = _tree.ptr_on_device();

        {
            // set desired tree level
            if (_strat & SCOTCH_STRATLEVELMAX ||
                    _strat & SCOTCH_STRATLEVELMIN) {
                TACHO_TEST_FOR_ABORT(_level == 0, "SCOTCH_STRATLEVEL(MIN/MAX) is used but level is not specified");
            }
            const int level = Util::max(1, _level-treecut);

            SCOTCH_Strat stradat;
            SCOTCH_Num straval = _strat;

            ierr = SCOTCH_stratInit(&stradat);
            TACHO_TEST_FOR_ABORT(ierr, "Failed in SCOTCH_stratInit");


            // if both are zero, do not build strategy
            if (_strat || _level) {
                if (_verbose)
                    std::cout << "GraphTools_Scotch:: User provide a strategy and/or level" << std::endl
                              << "                    strategy = " << _strat << ", level =  " << _level << ", treecut = " << treecut << std::endl
                              << "                    strategy & SCOTCH_STRATLEVELMAX   = " << (_strat & SCOTCH_STRATLEVELMAX) << std::endl
                              << "                    strategy & SCOTCH_STRATLEVELMIN   = " << (_strat & SCOTCH_STRATLEVELMIN) << std::endl
                              << "                    strategy & SCOTCH_STRATLEAFSIMPLE = " << (_strat & SCOTCH_STRATLEAFSIMPLE) << std::endl
                              << "                    strategy & SCOTCH_STRATSEPASIMPLE = " << (_strat & SCOTCH_STRATSEPASIMPLE) << std::endl
                              << std::endl;
                ierr = SCOTCH_stratGraphOrderBuild(&stradat, straval, level, 0.2);
                TACHO_TEST_FOR_ABORT(ierr, "Failed in SCOTCH_stratGraphOrderBuild");
            }
            ierr = SCOTCH_graphOrder(&_graph,
                                     &stradat,
                                     perm,
                                     peri,
                                     &_cblk,
                                     range,
                                     tree);
            TACHO_TEST_FOR_ABORT(ierr, "Failed in SCOTCH_graphOrder");
            SCOTCH_stratExit(&stradat);
        }


        {
            ordinal_type nroot = 0;
            for (ordinal_type i=0; i<_cblk; ++i)
                nroot += (_tree[i] == -1);

            if (nroot > 1) {
                if (_verbose)
                    std::cout << "GraphTools_Scotch:: # of roots " << nroot << std::endl
                              << "                    a fake root is created to complete the tree" << std::endl
                              << std::endl;
                _tree [_cblk]   = -1;          // dummy root
                _range[_cblk+1] = _range[_cblk]; // zero range for the dummy root

                for (ordinal_type i=0; i<_cblk; ++i)
                    if (_tree[i] == -1)           // multiple roots becomes children of the dummy root
                        _tree[i] = (_cblk+1);
                ++_cblk;                       // include the dummy root
            }
        }
        _is_ordered = true;

        //std::cout << "SCOTCH level = " << level << std::endl;
        //std::cout << "Range   Tree " << std::endl;
        //for (int i=0;i<_cblk;++i)
        //  std::cout << _range[i] << " :: " << i << " " << _tree[i] << std::endl;
    }
  int exampleCholByBlocks(const std::string file_input,
                          const int treecut,
                          const int prunecut,
                          const int fill_level,
                          const int rows_per_team,
                          const int max_concurrency,
                          const int max_task_dependence,
                          const int team_size,
                          const bool check,
                          const bool verbose) {
    typedef typename
      Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ;

    const bool detail = false;
    std::cout << "DeviceSpace::  "; DeviceSpaceType::print_configuration(std::cout, detail);
    std::cout << "HostSpace::    ";   HostSpaceType::print_configuration(std::cout, detail);

    // for simple test, let's use host space only here, for device it needs mirroring.

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> CrsMatrixBaseHostType;
    typedef CrsMatrixView<CrsMatrixBaseHostType> CrsMatrixViewHostType;

    typedef GraphTools<ordinal_type,size_type,HostSpaceType> GraphToolsHostType;

    typedef GraphTools_Scotch<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_Scotch;
    typedef GraphTools_CAMD<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_CAMD;

    typedef IncompleteSymbolicFactorization<CrsMatrixBaseHostType> IncompleteSymbolicFactorizationType;

    typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType;

    typedef TaskView<CrsMatrixViewHostType> CrsTaskViewHostType;
    typedef CrsMatrixBase<CrsTaskViewHostType,ordinal_type,size_type,HostSpaceType> CrsHierBaseHostType;
    typedef CrsMatrixView<CrsHierBaseHostType> CrsHierViewHostType;
    typedef TaskView<CrsHierViewHostType> CrsTaskHierViewHostType;

    int r_val = 0;
    
    Kokkos::Impl::Timer timer;

    ///
    /// Read from matrix market
    ///
    ///     input  - file
    ///     output - AA_host
    ///
    CrsMatrixBaseHostType AA_host("AA_host");
    timer.reset();
    {
      std::ifstream in;
      in.open(file_input);
      if (!in.good()) {
        std::cout << "Failed in open the file: " << file_input << std::endl;
        return -1;
      }
      MatrixMarket::read(AA_host, in);
    }
    double t_read = timer.seconds();

    if (verbose)
      AA_host.showMe(std::cout) << std::endl;

    ///
    /// Create a graph structure for Scotch and CAMD (rptr, cidx)
    ///     
    ///     rptr and cidx are need to be set up for Scotch and CAMD
    ///
    typename GraphToolsHostType::size_type_array rptr("Graph::RowPtrArray", AA_host.NumRows() + 1);
    typename GraphToolsHostType::ordinal_type_array cidx("Graph::ColIndexArray", AA_host.NumNonZeros());

    ///
    /// Run Scotch
    ///
    ///     input  - rptr, cidx, A_host
    ///     output - S (perm, iperm, nblks, range, tree), AA_scotch_host (permuted)
    ///
    timer.reset();
    GraphToolsHostType::getGraph(rptr, cidx, AA_host);
    double t_graph = timer.seconds();

    GraphToolsHostType_Scotch S;
    S.setGraph(AA_host.NumRows(), rptr, cidx);
    S.setSeed(0);
    S.setTreeLevel();
    S.setStrategy( SCOTCH_STRATSPEED 
                   | SCOTCH_STRATLEVELMAX   
                   | SCOTCH_STRATLEVELMIN   
                   | SCOTCH_STRATLEAFSIMPLE 
                   | SCOTCH_STRATSEPASIMPLE 
                   );
    
    timer.reset();
    S.computeOrdering(treecut);
    double t_scotch = timer.seconds();
    
    if (verbose)
      S.showMe(std::cout) << std::endl;

    CrsMatrixBaseHostType AA_scotch_host("AA_scotch_host");
    AA_scotch_host.createConfTo(AA_host);
    
    CrsMatrixTools::copy(AA_scotch_host,
                         S.PermVector(),
                         S.InvPermVector(),
                         AA_host);

    if (verbose)
      AA_scotch_host.showMe(std::cout) << std::endl;

    ///
    /// Run CAMD
    ///
    ///     input  - rptr, cidx, A_host
    ///     output - S (perm, iperm, nblks, range, tree), AA_scotch_host (permuted)
    ///
    timer.reset();
    GraphToolsHostType::getGraph(rptr, cidx, AA_scotch_host);
    t_graph += timer.seconds();

    GraphToolsHostType_CAMD C;
    C.setGraph(AA_scotch_host.NumRows(),
               rptr, cidx,
               S.NumBlocks(),
               S.RangeVector());

    timer.reset();
    C.computeOrdering();
    double t_camd = timer.seconds();

    if (verbose)
      C.showMe(std::cout) << std::endl;
    
    CrsMatrixBaseHostType AA_camd_host("AA_camd_host");
    AA_camd_host.createConfTo(AA_scotch_host);
    
    CrsMatrixTools::copy(AA_camd_host,
                         C.PermVector(),
                         C.InvPermVector(),
                         AA_scotch_host);
    
    if (verbose)
      AA_camd_host.showMe(std::cout) << std::endl;

    ///
    /// Symbolic factorization
    ///
    ///     input  - 
    ///     output - S (perm, iperm, nblks, range, tree), AA_scotch_host (permuted)
    ///
    CrsMatrixBaseHostType AA_factor_host("AA_factor_host");

    timer.reset();
    IncompleteSymbolicFactorizationType::createNonZeroPattern(AA_factor_host,
                                                    fill_level,
                                                    Uplo::Upper,
                                                    AA_camd_host,
                                                    rows_per_team);
    double t_symbolic = timer.seconds();

    if (verbose)
      AA_factor_host.showMe(std::cout) << std::endl;

    ///
    /// Clean tempoerary matrices
    ///
    ///     input  - AA_scotch_host, AA_camd_host, C, rptr, cidx
    ///     output - none
    ///
    AA_scotch_host = CrsMatrixBaseHostType();
    AA_camd_host   = CrsMatrixBaseHostType();

    C = GraphToolsHostType_CAMD();
    rptr = typename GraphToolsHostType::size_type_array();
    cidx = typename GraphToolsHostType::ordinal_type_array();

    ///
    /// Create task policy
    ///
    ///     input  - max_task_size
    ///     output - policy
    ///
    const size_type max_task_size = (3*sizeof(CrsTaskViewHostType)+sizeof(PolicyType)+128);

    timer.reset();
    PolicyType policy(max_concurrency,
                      max_task_size,
                      max_task_dependence,
                      team_size);
    double t_policy = timer.seconds();

    ///
    /// Sequential execution
    ///
    ///     input  - AA_factor_host (matrix to be compared), rowviews
    ///     output - BB_factor_host, B_factor_host
    ///
    double t_chol_serial = 0;
    CrsMatrixBaseHostType BB_factor_host("BB_factor_host");
    if (check) {
      BB_factor_host.createConfTo(AA_factor_host);
      CrsMatrixTools::copy(BB_factor_host, AA_factor_host);
      
      CrsTaskViewHostType B_factor_host(BB_factor_host);
      Kokkos::View<typename CrsTaskViewHostType::row_view_type*,HostSpaceType>
        rowviews("RowViewInMatView", B_factor_host.NumRows());
      B_factor_host.setRowViewArray(rowviews);
      
      timer.reset();
      {
        auto future = policy.proc_create_team(Chol<Uplo::Upper,AlgoChol::Unblocked,Variant::One>
                                              ::createTaskFunctor(policy, B_factor_host));
        policy.spawn(future);
        Kokkos::Experimental::wait(policy);
        TACHO_TEST_FOR_ABORT( future.get(), "Fail to perform Cholesky (serial)");
      }
      t_chol_serial = timer.seconds();

      if (verbose)
        BB_factor_host.showMe(std::cout) << std::endl;
    }

    ///
    /// Task parallel execution
    ///
    ///    input  - AA_factor_host, rowviews
    ///    output - HA_factor_host, AA_factor_host, B_factor_host
    ///
    double t_hier = 0, t_blocks = 0, t_chol_parallel = 0;
    CrsHierBaseHostType HA_factor_host("HA_factor_host");
    {
      timer.reset();
      S.pruneTree(prunecut);
      CrsMatrixTools::createHierMatrix(HA_factor_host, 
                                       AA_factor_host, 
                                       S.NumBlocks(),
                                       S.RangeVector(),
                                       S.TreeVector());
      t_hier = timer.seconds();    
      
      timer.reset();
      size_type nblocks = HA_factor_host.NumNonZeros();
      
      Kokkos::View<ordinal_type*,HostSpaceType> 
        ap_rowview_blocks("NumRowViewInBlocks", nblocks + 1);
      
      ap_rowview_blocks(0) = 0;
      for (ordinal_type k=0;k<nblocks;++k) 
        ap_rowview_blocks(k+1) = ap_rowview_blocks(k) + HA_factor_host.Value(k).NumRows();
      
      Kokkos::View<typename CrsMatrixViewHostType::row_view_type*,HostSpaceType>
        rowview_blocks("RowViewInBlocks", ap_rowview_blocks(nblocks));
      
      Kokkos::parallel_for(Kokkos::RangePolicy<HostSpaceType>(0, nblocks),
                           [&](const ordinal_type k) {
                             const ordinal_type begin = ap_rowview_blocks(k);
                             const ordinal_type end   = ap_rowview_blocks(k+1);
                             HA_factor_host.Value(k).setRowViewArray
                               (Kokkos::subview(rowview_blocks, Kokkos::pair<ordinal_type,ordinal_type>(begin, end)));
                           } );
      CrsMatrixTools::filterEmptyBlocks(HA_factor_host);
      t_blocks = timer.seconds();    

      {
        size_type nblocks_filtered = HA_factor_host.NumNonZeros(), nnz_blocks = 0;
        for (size_type k=0;k<nblocks_filtered; ++k) 
          nnz_blocks += HA_factor_host.Value(k).NumNonZeros();
        
        TACHO_TEST_FOR_ABORT( nnz_blocks != AA_factor_host.NumNonZeros(),
                              "nnz counted in blocks is different from nnz in the base matrix.");
      }
      
      CrsTaskHierViewHostType H_factor_host(HA_factor_host);
      timer.reset();
      {
        auto future = policy.proc_create_team(Chol<Uplo::Upper,AlgoChol::ByBlocks,Variant::One>
                                              ::createTaskFunctor(policy, H_factor_host));
        policy.spawn(future);
        Kokkos::Experimental::wait(policy);
        TACHO_TEST_FOR_ABORT( future.get(), "Fail to perform Cholesky (serial)");
      }
      t_chol_parallel = timer.seconds();
      
      if (verbose)
        AA_factor_host.showMe(std::cout) << std::endl;
    }

    if (check) {
      double diff = 0, norm = 0;    
      TACHO_TEST_FOR_ABORT( BB_factor_host.NumNonZeros() != AA_factor_host.NumNonZeros(),
                            "nnz used in serial is not same as nnz used in parallel");

      const size_type nnz = AA_factor_host.NumNonZeros();
      for (size_type k=0;k<nnz;++k) {
        norm += Util::abs(BB_factor_host.Value(k));
        diff += Util::abs(AA_factor_host.Value(k) - BB_factor_host.Value(k));
      }
      std::cout << std::scientific;
      std::cout << "CholByBlocks:: check with serial execution " << std::endl
                << "               diff = " << diff << ", norm = " << norm << ", rel err = " << (diff/norm) << std::endl;
      std::cout.unsetf(std::ios::scientific);      
    }

    {
      const auto prec = std::cout.precision();
      std::cout.precision(4);

      std::cout << std::scientific;
      std::cout << "CholByBlocks:: Given    matrix = " << AA_host.NumRows() << " x " << AA_host.NumCols() << ", nnz = " << AA_host.NumNonZeros() << std::endl;
      std::cout << "CholByBlocks:: Factored matrix = " << AA_factor_host.NumRows() << " x " << AA_factor_host.NumCols() << ", nnz = " << AA_factor_host.NumNonZeros() << std::endl;
      std::cout << "CholByBlocks:: Hier     matrix = " << HA_factor_host.NumRows() << " x " << HA_factor_host.NumCols() << ", nnz = " << HA_factor_host.NumNonZeros() << std::endl;

      std::cout << "CholByBlocks:: "
                << "read = " << t_read << " [sec], "
                << "graph generation = " << (t_graph/2.0) << " [sec] "
                << "scotch reordering = " << t_scotch << " [sec] "
                << "camd reordering = " << t_camd << " [sec] " 
                << std::endl
                << "CholByBlocks:: "
                << "symbolic factorization = " << t_symbolic << " [sec] "
                << std::endl
                << "CholByBlocks:: "
                << "policy creation = " << t_policy << " [sec] "
                << "hier creation = " << t_hier << " [sec] "
                << "block specification = " << t_blocks << " [sec] "
                << std::endl
                << "CholByBlocks:: "
                << "Chol Parallel = " << t_chol_parallel << " [sec] ";
      if (check) 
        std::cout << "Chol Serial = " << (check ? t_chol_serial : -1) << " [sec] "
                  << "speed-up = " << (t_chol_serial/t_chol_parallel) << " [sec] ";
      
      std::cout << std::endl;
      
      std::cout.unsetf(std::ios::scientific);
      std::cout.precision(prec);
    }

    return r_val;
  }
  int exampleDenseCholByBlocks(const ordinal_type mmin,
                               const ordinal_type mmax,
                               const ordinal_type minc,
                               const ordinal_type mb,
                               const int max_concurrency,
                               const int max_task_dependence,
                               const int team_size,
                               const int mkl_nthreads,
                               const bool check,
                               const bool verbose) {
    typedef typename
      Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ;

    const bool detail = false;
    std::cout << "DeviceSpace::  "; DeviceSpaceType::print_configuration(std::cout, detail);
    std::cout << "HostSpace::    ";   HostSpaceType::print_configuration(std::cout, detail);

    typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> DenseMatrixBaseHostType;
    typedef DenseMatrixView<DenseMatrixBaseHostType> DenseMatrixViewHostType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> DenseMatrixBaseDeviceType;
    typedef DenseMatrixView<DenseMatrixBaseDeviceType> DenseMatrixViewDeviceType;
    typedef TaskView<DenseMatrixViewDeviceType> DenseTaskViewDeviceType;

    typedef DenseMatrixBase<DenseTaskViewDeviceType,ordinal_type,size_type,DeviceSpaceType> DenseHierMatrixBaseDeviceType;

    typedef DenseMatrixView<DenseHierMatrixBaseDeviceType> DenseHierMatrixViewDeviceType;
    typedef TaskView<DenseHierMatrixViewDeviceType> DenseHierTaskViewDeviceType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    std::cout << "DenseCholByBlocks:: test matrices "
              <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc 
              << " , mb = " << mb << std::endl;

    const size_t max_task_size = (3*sizeof(DenseTaskViewDeviceType)+sizeof(PolicyType)+128); 
    PolicyType policy(max_concurrency,
                      max_task_size,
                      max_task_dependence,
                      team_size);

    std::ostringstream os;
    os.precision(3);
    os << std::scientific;

    for (ordinal_type m=mmin;m<=mmax;m+=minc) {
      os.str("");
      
      // host matrices
      DenseMatrixBaseHostType AA_host("AA_host", m, m), AB_host("AB_host"), TT_host("TT_host");

      // random T matrix
      {
        TT_host.createConfTo(AA_host);
        for (ordinal_type j=0;j<TT_host.NumCols();++j) {
          for (ordinal_type i=0;i<TT_host.NumRows();++i)
            TT_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
          TT_host.Value(j,j) = std::fabs(TT_host.Value(j,j));
        }
      }
      // create SPD matrix
      {
        Teuchos::BLAS<ordinal_type,value_type> blas;

        blas.HERK(ArgUplo == Uplo::Upper ? Teuchos::UPPER_TRI : Teuchos::LOWER_TRI, 
                  Teuchos::CONJ_TRANS,
                  m, m,
                  1.0,
                  TT_host.ValuePtr(), TT_host.ColStride(),
                  0.0,
                  AA_host.ValuePtr(), AA_host.ColStride());

        // preserve a copy of A
        AB_host.createConfTo(AA_host);        
        DenseMatrixTools::copy(AB_host, AA_host);
      }

      const double flop = DenseFlopCount<value_type>::Chol(m);

#ifdef HAVE_SHYLUTACHO_MKL
      mkl_set_num_threads(mkl_nthreads);
#endif
      os << "DenseCholByBlocks:: m = " << m << "  ";
      int ierr = 0;
      if (check) {
        timer.reset();
        DenseMatrixViewHostType A_host(AB_host); 
        ierr = Chol<ArgUplo,AlgoChol::ExternalLapack,Variant::One>::invoke
          (policy, policy.member_single(),
           A_host);
        t = timer.seconds();
        TACHO_TEST_FOR_ABORT( ierr, "Fail to perform Cholesky (serial)");
        os << ":: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      }

      DenseMatrixBaseDeviceType AA_device("AA_device");
      {
        timer.reset();
        AA_device.mirror(AA_host);
        t = timer.seconds();
        os << ":: Mirror = " << t << " [sec]  ";
      }

      {
        DenseHierMatrixBaseDeviceType HA_device("HA_device");
        
        DenseMatrixTools::createHierMatrix(HA_device, AA_device, mb, mb);

        DenseHierTaskViewDeviceType TA_device(HA_device);

        timer.reset();
        auto future = policy.proc_create_team
          (Chol<ArgUplo,AlgoChol::DenseByBlocks,ArgVariant>
           ::createTaskFunctor(policy, TA_device), 
           0);
        policy.spawn(future);
        Kokkos::Experimental::wait(policy);
        t = timer.seconds();
        os << ":: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      }

      AA_host.mirror(AA_device);
      if (!ierr && check) {
        double err = 0.0, norm = 0.0;
        for (ordinal_type j=0;j<AA_host.NumCols();++j)
          for (ordinal_type i=0;i<=j;++i) {
            const double diff = abs(AA_host.Value(i,j) - AB_host.Value(i,j));
            const double val  = AB_host.Value(i,j);
            err  += diff*diff;
            norm += val*val;
          }
        os << ":: Check result ::norm = " << sqrt(norm) << ", error = " << sqrt(err);
      }
      std::cout << os.str() << std::endl;
    }
    
    return r_val;
  }
  int exampleCholUnblocked(const std::string file_input,
                           const int treecut,
                           const int prunecut,
                           const int fill_level,
                           const int rows_per_team,
                           const bool verbose) {
    typedef typename
      Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ;

    const bool detail = false;
    std::cout << "DeviceSpace::  "; DeviceSpaceType::print_configuration(std::cout, detail);
    std::cout << "HostSpace::    ";   HostSpaceType::print_configuration(std::cout, detail);

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> CrsMatrixBaseHostType;
    typedef GraphTools<ordinal_type,size_type,HostSpaceType> GraphToolsHostType;

    typedef GraphTools_Scotch<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_Scotch;
    typedef GraphTools_CAMD<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_CAMD;

    typedef IncompleteSymbolicFactorization<CrsMatrixBaseHostType> IncompleteSymbolicFactorizationType;

    typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> CrsMatrixBaseDeviceType;

    typedef CrsMatrixView<CrsMatrixBaseDeviceType> CrsMatrixViewDeviceType;
    typedef TaskView<CrsMatrixViewDeviceType> CrsTaskViewDeviceType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;

    CrsMatrixBaseHostType AA_host("AA_host");
    timer.reset();
    {
      std::ifstream in;
      in.open(file_input);
      if (!in.good()) {
        std::cout << "Failed in open the file: " << file_input << std::endl;
        return -1;
      }
      MatrixMarket::read(AA_host, in);
    }
    double t_read = timer.seconds();

    if (verbose)
      AA_host.showMe(std::cout) << std::endl;

    typename GraphToolsHostType::size_type_array rptr("Graph::RowPtrArray", AA_host.NumRows() + 1);
    typename GraphToolsHostType::ordinal_type_array cidx("Graph::ColIndexArray", AA_host.NumNonZeros());

    timer.reset();
    GraphToolsHostType::getGraph(rptr, cidx, AA_host);
    double t_graph = timer.seconds();

    GraphToolsHostType_Scotch S;
    S.setGraph(AA_host.NumRows(), rptr, cidx);
    S.setSeed(0);
    S.setTreeLevel();
    S.setStrategy( SCOTCH_STRATSPEED
                   | SCOTCH_STRATLEVELMAX  
                   | SCOTCH_STRATLEVELMIN  
                   | SCOTCH_STRATLEAFSIMPLE
                   | SCOTCH_STRATSEPASIMPLE
                   );

    timer.reset();
    S.computeOrdering(treecut);
    double t_scotch = timer.seconds();

    S.pruneTree(prunecut);
    if (verbose)
      S.showMe(std::cout) << std::endl;

    CrsMatrixBaseHostType BB_host("BB_host");
    BB_host.createConfTo(AA_host);

    CrsMatrixTools::copy(BB_host,
                         S.PermVector(),
                         S.InvPermVector(),
                         AA_host);

    if (verbose)
      BB_host.showMe(std::cout) << std::endl;

    timer.reset();
    GraphToolsHostType::getGraph(rptr, cidx, BB_host);
    t_graph += timer.seconds();

    GraphToolsHostType_CAMD C;
    C.setGraph(BB_host.NumRows(),
               rptr, cidx,
               S.NumBlocks(),
               S.RangeVector());

    timer.reset();
    C.computeOrdering();
    double t_camd = timer.seconds();

    if (verbose)
      C.showMe(std::cout) << std::endl;

    CrsMatrixBaseHostType CC_host("CC_host");
    CC_host.createConfTo(BB_host);

    CrsMatrixTools::copy(CC_host,
                         C.PermVector(),
                         C.InvPermVector(),
                         BB_host);

    if (verbose)
      CC_host.showMe(std::cout) << std::endl;

    CrsMatrixBaseHostType DD_host("DD_host");

    timer.reset();
    IncompleteSymbolicFactorizationType::createNonZeroPattern(DD_host,
                                                    fill_level,
                                                    Uplo::Upper,
                                                    CC_host,
                                                    rows_per_team);
    double t_symbolic = timer.seconds();

    if (verbose)
      DD_host.showMe(std::cout) << std::endl;

    // ==================================================================================

    CrsMatrixBaseDeviceType AA_device("AA_device");
    AA_device.mirror(DD_host);

    const size_type max_concurrency = 10;
    const size_type max_task_size = (3*sizeof(CrsTaskViewDeviceType)+sizeof(PolicyType)+128);
    const size_type max_task_dependence = 0;
    const size_type team_size = 1;

    PolicyType policy(max_concurrency,
                      max_task_size,
                      max_task_dependence,
                      team_size);

    CrsMatrixViewDeviceType A_device(AA_device);
    Kokkos::View<typename CrsMatrixViewDeviceType::row_view_type*,DeviceSpaceType> 
      rowviews("RowViewInMatView", A_device.NumRows());
    A_device.setRowViewArray(rowviews);
    
    timer.reset();
    int ierr = Chol<Uplo::Upper,AlgoChol::Unblocked,Variant::One>::invoke
      (policy, policy.member_single(),
       A_device);
    double t_chol = timer.seconds();
    TACHO_TEST_FOR_ABORT( ierr, "Fail to perform Cholesky (serial)");

    if (verbose) {
      DD_host.mirror(AA_device);
      DD_host.showMe(std::cout) << std::endl;
    }

    {
      const auto prec = std::cout.precision();
      std::cout.precision(4);

      std::cout << std::scientific;
      std::cout << "SymbolicFactorization:: Given matrix  dimension = " << AA_host.NumRows() << " x " << AA_host.NumCols()
                << ", " << " nnz = " << AA_host.NumNonZeros() << std::endl;
      std::cout << "SymbolicFactorization:: Upper factors dimension = " << DD_host.NumRows() << " x " << DD_host.NumCols()
                << ", " << " nnz = " << DD_host.NumNonZeros() << std::endl;

      std::cout << "SymbolicFactorization:: "
                << "read = " << t_read << " [sec], "
                << "graph generation = " << (t_graph/2.0) << " [sec] "
                << "scotch reordering = " << t_scotch << " [sec] "
                << "camd reordering = " << t_camd << " [sec] "
                << "symbolic factorization = " << t_symbolic << " [sec] "
                << "Cholesky factorization = " << t_chol << " [sec] "
                << std::endl;

      std::cout.unsetf(std::ios::scientific);
      std::cout.precision(prec);
    }

    return r_val;
  }
Esempio n. 10
0
    static void
    read(CrsMatrixType &A, std::ifstream &file) {
      // static_assert( Kokkos::Impl::is_same<
      //                typename CrsMatrixType::space_type,
      //                Kokkos::HostSpace
      //                >::value,
      //                "Space type of the input should be HostSpace" );

      typedef typename CrsMatrixType::value_type   value_type;
      typedef typename CrsMatrixType::ordinal_type ordinal_type;
      typedef typename CrsMatrixType::size_type    size_type;

      // coordinate format
      typedef Coo<ordinal_type,value_type> ijv_type;

      //typedef typename CrsMatrixType::space_type space_type;
      //typedef Kokkos::RangePolicy<space_type,Kokkos::Schedule<Kokkos::Static> > range_policy_type;
      ordinal_type m, n;
      size_type nnz;

      std::vector<ijv_type> mm;
      const ordinal_type mm_base = 1;
      {
        std::string header;
        if (file.is_open()) {
          std::getline(file, header);
          while (file.good()) {
            char c = file.peek();
            if (c == '%' || c == '\n') {
              file.ignore(256, '\n');
              continue;
            }
            break;
          }
        } else {
          TACHO_TEST_FOR_ABORT(true, MSG_INVALID_INPUT(file));
        }

        // check the header
        bool symmetry = (header.find("symmetric") != std::string::npos);

        // read matrix specification
        file >> m >> n >> nnz;

        mm.reserve(nnz*(symmetry ? 2 : 1));
        for (size_type i=0;i<nnz;++i) {
          ordinal_type row, col;
          value_type val;
          file >> row >> col >> val;

          row -= mm_base;
          col -= mm_base;

          mm.push_back(ijv_type(row, col, val));
          if (symmetry && row != col)
            mm.push_back(ijv_type(col, row, val));
        }

        // sort by row major order
        std::sort(mm.begin(), mm.end(), std::less<ijv_type>());
      }

      // change mm to crs
      std::vector<size_type> ap;
      std::vector<ordinal_type> aj;
      std::vector<value_type> ax;

      ap.reserve(m+1);
      aj.reserve(nnz);
      ax.reserve(nnz);

      {
        ordinal_type ii = 0;
        size_type jj = 0;
        ijv_type prev = mm[0];

        ap.push_back(0);
        aj.push_back(prev.Col());
        ax.push_back(prev.Val());

        for (auto //typename std::vector<ijv_type>::const_iterator 
               it=(mm.begin()+1);it<mm.end();++it) {
          ijv_type aij = (*it);
          
          // row index
          if (aij.Row() != prev.Row()) 
            ap.push_back(aj.size());
          
          if (aij == prev) {
            aj.back()  = aij.Col();
            ax.back() += aij.Val();
          } else {
            aj.push_back(aij.Col());
            ax.push_back(aij.Val());
          }

          prev = aij;
        }
        
        // add the last index to terminate the storage
        ap.push_back(aj.size());
        nnz = aj.size();
      }

      // create crs matrix view
      A.create(m, n, nnz);
      for (ordinal_type i=0;i<m;++i) {
        A.RowPtrBegin(i) = ap.at(i);
        A.RowPtrEnd(i) = ap.at(i+1);
      }
      for (ordinal_type k=0;k<nnz;++k) {
        A.Col(k) = aj.at(k);
        A.Value(k) = ax.at(k);
      }
    }