/// Dense Cholesky (upper variant) delegated to LAPACK POTRF through Teuchos.
///
/// Only the team member with rank 0 issues the LAPACK call; every other
/// member skips it and reports 0.
///
/// \param policy  task policy (not referenced by this variant)
/// \param member  team member handle; only its rank is inspected
/// \param A       dense matrix view handed to POTRF
/// \return the info code written by POTRF on rank 0, 0 on other ranks
KOKKOS_INLINE_FUNCTION
int
Chol<Uplo::Upper,
     AlgoChol::ExternalLapack,Variant::One>
::invoke(PolicyType &policy,
         const MemberType &member,
         DenseExecViewTypeA &A) {
  // static_assert( Kokkos::Impl::is_same<
  //                typename DenseMatrixTypeA::space_type,
  //                Kokkos::Cuda
  //                >::value,
  //                "Cuda space is not available for calling external BLAS" );

  int info = 0;

  // non-leader members have nothing to do
  if (member.team_rank() != 0)
    return info;

#ifdef HAVE_SHYLUTACHO_TEUCHOS
  typedef typename DenseExecViewTypeA::ordinal_type ordinal_type;
  typedef typename DenseExecViewTypeA::value_type   value_type;

  Teuchos::LAPACK<ordinal_type,value_type> lapack;
  lapack.POTRF('U',
               A.NumRows(),
               A.ValuePtr(), A.BaseObject().ColStride(),
               &info);
#else
  TACHO_TEST_FOR_ABORT( true, MSG_NOT_HAVE_PACKAGE("Teuchos") );
#endif

  return info;
}
void setGraph(const ordinal_type m, const size_type_array rptr, const ordinal_type_array cidx) { _is_ordered = false; _cblk = 0; /// Scotch graph spec /// - no diagonals, symmetric _base = 0; _m = m; _nnz = rptr[m]; _rptr = rptr; _cidx = cidx; _perm = ordinal_type_array("Scotch::PermutationArray", _m); _peri = ordinal_type_array("Scotch::InvPermutationArray", _m); _range = ordinal_type_array("Scotch::RangeArray", _m); _tree = ordinal_type_array("Scotch::TreeArray", _m); _strat = 0; _level = 0; int ierr = 0; ordinal_type *rptr_ptr = reinterpret_cast<ordinal_type*>(_rptr.ptr_on_device()); ordinal_type *cidx_ptr = reinterpret_cast<ordinal_type*>(_cidx.ptr_on_device()); ierr = SCOTCH_graphInit(&_graph); TACHO_TEST_FOR_ABORT(ierr, "Failed in SCOTCH_graphInit"); ierr = SCOTCH_graphBuild(&_graph, // scotch graph _base, // base value _m, // # of vertices rptr_ptr, // column index array pointer begin rptr_ptr+1, // column index array pointer end NULL, // weights on vertices (optional) NULL, // label array on vertices (optional) _nnz, // # of nonzeros cidx_ptr, // column index array NULL); // edge load array (optional) TACHO_TEST_FOR_ABORT(ierr, "Failed in SCOTCH_graphBuild"); ierr = SCOTCH_graphCheck(&_graph); TACHO_TEST_FOR_ABORT(ierr, "Failed in SCOTCH_graphCheck"); }
/// C = beta*C + alpha*A^H*B, delegated to BLAS GEMM through Teuchos.
///
/// Only the team member with rank 0 issues the BLAS call, and the call is
/// skipped when any of the three problem dimensions is not positive.
///
/// \param policy  task policy (not referenced by this variant)
/// \param member  team member handle; only its rank is inspected
/// \param alpha   scalar applied to the product
/// \param A       left operand, passed to GEMM with CONJ_TRANS
/// \param B       right operand, passed to GEMM with NO_TRANS
/// \param beta    scalar applied to C
/// \param C       matrix passed to GEMM as the output operand
/// \return always 0
KOKKOS_INLINE_FUNCTION
int
Gemm<Trans::ConjTranspose,Trans::NoTranspose,
     AlgoGemm::ExternalBlas,Variant::One>
::invoke(PolicyType &policy,
         MemberType &member,
         const ScalarType alpha,
         DenseExecViewTypeA &A,
         DenseExecViewTypeB &B,
         const ScalarType beta,
         DenseExecViewTypeC &C) {
  // static_assert( Kokkos::Impl::is_same<
  //                typename DenseMatrixTypeA::space_type,
  //                typename DenseMatrixTypeB::space_type
  //                >::value &&
  //                Kokkos::Impl::is_same<
  //                typename DenseMatrixTypeB::space_type,
  //                typename DenseMatrixTypeC::space_type
  //                >::value,
  //                "Space type of input matrices does not match" );

  const bool is_team_leader = (member.team_rank() == 0);

  if (is_team_leader) {
#if defined( HAVE_SHYLUTACHO_TEUCHOS ) && defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
    typedef typename DenseExecViewTypeA::ordinal_type ordinal_type;
    typedef typename DenseExecViewTypeA::value_type   value_type;

    Teuchos::BLAS<ordinal_type,value_type> blas;

    // problem dimensions: C is rows_c x cols_c, inner dimension from B
    const ordinal_type rows_c = C.NumRows();
    const ordinal_type cols_c = C.NumCols();
    const ordinal_type inner  = B.NumRows();

    const bool has_work = (rows_c > 0 && cols_c > 0 && inner > 0);
    if (has_work) {
      blas.GEMM(Teuchos::CONJ_TRANS, Teuchos::NO_TRANS,
                rows_c, cols_c, inner,
                alpha,
                A.ValuePtr(), A.BaseObject().ColStride(),
                B.ValuePtr(), B.BaseObject().ColStride(),
                beta,
                C.ValuePtr(), C.BaseObject().ColStride());
    }
#else
    TACHO_TEST_FOR_ABORT( true, MSG_NOT_HAVE_PACKAGE("Teuchos") );
#endif
  }

  return 0;
}
/// Triangular solve (left side, upper, no transpose) delegated to BLAS TRSM
/// through Teuchos.
///
/// Only the team member with rank 0 issues the BLAS call. The call is skipped
/// when either problem dimension is not positive, matching the guard used by
/// the external-BLAS Gemm variant, so empty views never reach BLAS.
///
/// \param policy  task policy (not referenced by this variant)
/// \param member  team member handle; only its rank is inspected
/// \param diagA   Diag::Unit selects UNIT_DIAG, anything else NON_UNIT_DIAG
/// \param alpha   scalar passed to TRSM
/// \param A       triangular matrix view
/// \param B       right-hand-side view passed to TRSM
/// \return always 0
KOKKOS_INLINE_FUNCTION
int
Trsm<Side::Left,Uplo::Upper,Trans::NoTranspose,
     AlgoTrsm::ExternalBlas,Variant::One>
::invoke(PolicyType &policy,
         const MemberType &member,
         const int diagA,
         const ScalarType alpha,
         DenseExecViewTypeA &A,
         DenseExecViewTypeB &B) {
  // static_assert( Kokkos::Impl::is_same<
  //                typename DenseMatrixTypeA::space_type,
  //                Kokkos::Cuda
  //                >::value,
  //                "Cuda space is not available for calling external BLAS" );

  // static_assert( Kokkos::Impl::is_same<
  //                typename DenseMatrixTypeA::space_type,
  //                typename DenseMatrixTypeB::space_type
  //                >::value,
  //                "Space type of input matrices does not match" );

  //typedef typename DenseExecViewTypeA::space_type space_type;
  typedef typename DenseExecViewTypeA::ordinal_type ordinal_type;
  typedef typename DenseExecViewTypeA::value_type   value_type;

  if (member.team_rank() == 0) {
#ifdef HAVE_SHYLUTACHO_TEUCHOS
    Teuchos::BLAS<ordinal_type,value_type> blas;

    const ordinal_type m = A.NumRows();
    const ordinal_type n = B.NumCols();

    // nothing to solve for an empty operand; do not hand it to BLAS
    if (m > 0 && n > 0)
      blas.TRSM(Teuchos::LEFT_SIDE, Teuchos::UPPER_TRI, Teuchos::NO_TRANS,
                (diagA == Diag::Unit ? Teuchos::UNIT_DIAG : Teuchos::NON_UNIT_DIAG),
                m, n,
                alpha,
                A.ValuePtr(), A.BaseObject().ColStride(),
                B.ValuePtr(), B.BaseObject().ColStride());
#else
    TACHO_TEST_FOR_ABORT( true, MSG_NOT_HAVE_PACKAGE("Teuchos") );
#endif
  }

  return 0;
}
/// Unblocked sparse Cholesky (upper variant) on a CRS view, row by row.
///
/// For every row k:
///  1. rank 0 checks that the first stored entry is the diagonal (aborts
///     otherwise), replaces a non-positive diagonal by 1.0e-8 after printing
///     a warning, and takes its square root;
///  2. the remaining entries of the row are divided by the diagonal;
///  3. for each off-diagonal entry i of row k, the row indexed by its column
///     is updated with the conjugated product of entries i and j >= i.
///
/// \param policy  task policy (not referenced by this variant)
/// \param member  team member handle used for barriers and team loops
/// \param A       CRS matrix view, modified in place
/// \return always 0
KOKKOS_INLINE_FUNCTION
int
Chol<Uplo::Upper,
     AlgoChol::Unblocked,Variant::One>
::invoke(PolicyType &policy,
         const MemberType &member,
         CrsExecViewTypeA &A) {

  typedef typename CrsExecViewTypeA::value_type    value_type;
  typedef typename CrsExecViewTypeA::ordinal_type  ordinal_type;
  typedef typename CrsExecViewTypeA::row_view_type row_view_type;

  for (ordinal_type k=0;k<A.NumRows();++k) {
    row_view_type &r1t = A.RowView(k);

    // extract diagonal from alpha11
    value_type &alpha = r1t.Value(0);

    if (member.team_rank() == 0) {
      // a missing diagonal (first stored column is not k) is fatal
      TACHO_TEST_FOR_ABORT( r1t.Col(0) != k, "Chol::Unblocked:: Diagonal does not exist");
      if (Util::real(alpha) <= 0.0) {
        // warning message; the arguments are cast explicitly because
        // value_type / ordinal_type are template dependent and must match
        // the %f / %d conversions exactly
        fprintf(stderr, " diagonal = %f, local col = %d, global col = %d\n",
                static_cast<double>(Util::real(alpha)),
                static_cast<int>(k),
                static_cast<int>(r1t.OffsetCols() + k));
        // proceed with epsilon; for incomplete factorization, Cholesky factor may not exist
        alpha = 1.0e-8;
      }

      // error handling should be more carefully designed

      // sqrt on diag
      alpha = sqrt(Util::real(alpha));
    }
    member.team_barrier();

    const ordinal_type nnz_r1t = r1t.NumNonZeros();

    if (nnz_r1t) {
      // inverse scale of the off-diagonal entries
      Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 1, nnz_r1t),
                           [&](const ordinal_type j) {
                             r1t.Value(j) /= alpha;
                           });

      member.team_barrier();

      // hermitian rank update
      Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 1, nnz_r1t),
                           [&](const ordinal_type i) {
                             const ordinal_type row_at_i = r1t.Col(i);
                             const value_type   val_at_i = Util::conj(r1t.Value(i));

                             row_view_type &r2t = A.RowView(row_at_i);
                             // idx carries the last search position; the
                             // loop stops once Index reports a value <= -2
                             ordinal_type idx = 0;

                             for (ordinal_type j=i;j<nnz_r1t && (idx > -2);++j) {
                               const ordinal_type col_at_j = r1t.Col(j);
                               idx = r2t.Index(col_at_j, idx);

                               if (idx >= 0) {
                                 const value_type val_at_j = r1t.Value(j);
                                 r2t.Value(idx) -= val_at_i*val_at_j;
                               }
                             }
                           });
    }
  }
  return 0;
}
void computeOrdering(const ordinal_type treecut = 0) { int ierr = 0; // pointers for global graph ordering ordinal_type *perm = _perm.ptr_on_device(); ordinal_type *peri = _peri.ptr_on_device(); ordinal_type *range = _range.ptr_on_device(); ordinal_type *tree = _tree.ptr_on_device(); { // set desired tree level if (_strat & SCOTCH_STRATLEVELMAX || _strat & SCOTCH_STRATLEVELMIN) { TACHO_TEST_FOR_ABORT(_level == 0, "SCOTCH_STRATLEVEL(MIN/MAX) is used but level is not specified"); } const int level = Util::max(1, _level-treecut); SCOTCH_Strat stradat; SCOTCH_Num straval = _strat; ierr = SCOTCH_stratInit(&stradat); TACHO_TEST_FOR_ABORT(ierr, "Failed in SCOTCH_stratInit"); // if both are zero, do not build strategy if (_strat || _level) { if (_verbose) std::cout << "GraphTools_Scotch:: User provide a strategy and/or level" << std::endl << " strategy = " << _strat << ", level = " << _level << ", treecut = " << treecut << std::endl << " strategy & SCOTCH_STRATLEVELMAX = " << (_strat & SCOTCH_STRATLEVELMAX) << std::endl << " strategy & SCOTCH_STRATLEVELMIN = " << (_strat & SCOTCH_STRATLEVELMIN) << std::endl << " strategy & SCOTCH_STRATLEAFSIMPLE = " << (_strat & SCOTCH_STRATLEAFSIMPLE) << std::endl << " strategy & SCOTCH_STRATSEPASIMPLE = " << (_strat & SCOTCH_STRATSEPASIMPLE) << std::endl << std::endl; ierr = SCOTCH_stratGraphOrderBuild(&stradat, straval, level, 0.2); TACHO_TEST_FOR_ABORT(ierr, "Failed in SCOTCH_stratGraphOrderBuild"); } ierr = SCOTCH_graphOrder(&_graph, &stradat, perm, peri, &_cblk, range, tree); TACHO_TEST_FOR_ABORT(ierr, "Failed in SCOTCH_graphOrder"); SCOTCH_stratExit(&stradat); } { ordinal_type nroot = 0; for (ordinal_type i=0; i<_cblk; ++i) nroot += (_tree[i] == -1); if (nroot > 1) { if (_verbose) std::cout << "GraphTools_Scotch:: # of roots " << nroot << std::endl << " a fake root is created to complete the tree" << std::endl << std::endl; _tree [_cblk] = -1; // dummy root _range[_cblk+1] = _range[_cblk]; // zero range for the dummy root for 
(ordinal_type i=0; i<_cblk; ++i) if (_tree[i] == -1) // multiple roots becomes children of the dummy root _tree[i] = (_cblk+1); ++_cblk; // include the dummy root } } _is_ordered = true; //std::cout << "SCOTCH level = " << level << std::endl; //std::cout << "Range Tree " << std::endl; //for (int i=0;i<_cblk;++i) // std::cout << _range[i] << " :: " << i << " " << _tree[i] << std::endl; }
int exampleCholByBlocks(const std::string file_input, const int treecut, const int prunecut, const int fill_level, const int rows_per_team, const int max_concurrency, const int max_task_dependence, const int team_size, const bool check, const bool verbose) { typedef typename Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ; const bool detail = false; std::cout << "DeviceSpace:: "; DeviceSpaceType::print_configuration(std::cout, detail); std::cout << "HostSpace:: "; HostSpaceType::print_configuration(std::cout, detail); // for simple test, let's use host space only here, for device it needs mirroring. typedef CrsMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> CrsMatrixBaseHostType; typedef CrsMatrixView<CrsMatrixBaseHostType> CrsMatrixViewHostType; typedef GraphTools<ordinal_type,size_type,HostSpaceType> GraphToolsHostType; typedef GraphTools_Scotch<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_Scotch; typedef GraphTools_CAMD<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_CAMD; typedef IncompleteSymbolicFactorization<CrsMatrixBaseHostType> IncompleteSymbolicFactorizationType; typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType; typedef TaskView<CrsMatrixViewHostType> CrsTaskViewHostType; typedef CrsMatrixBase<CrsTaskViewHostType,ordinal_type,size_type,HostSpaceType> CrsHierBaseHostType; typedef CrsMatrixView<CrsHierBaseHostType> CrsHierViewHostType; typedef TaskView<CrsHierViewHostType> CrsTaskHierViewHostType; int r_val = 0; Kokkos::Impl::Timer timer; /// /// Read from matrix market /// /// input - file /// output - AA_host /// CrsMatrixBaseHostType AA_host("AA_host"); timer.reset(); { std::ifstream in; in.open(file_input); if (!in.good()) { std::cout << "Failed in open the file: " << file_input << std::endl; return -1; } MatrixMarket::read(AA_host, in); } double t_read = timer.seconds(); if (verbose) AA_host.showMe(std::cout) << std::endl; /// /// Create a graph structure 
for Scotch and CAMD (rptr, cidx) /// /// rptr and cidx are need to be set up for Scotch and CAMD /// typename GraphToolsHostType::size_type_array rptr("Graph::RowPtrArray", AA_host.NumRows() + 1); typename GraphToolsHostType::ordinal_type_array cidx("Graph::ColIndexArray", AA_host.NumNonZeros()); /// /// Run Scotch /// /// input - rptr, cidx, A_host /// output - S (perm, iperm, nblks, range, tree), AA_scotch_host (permuted) /// timer.reset(); GraphToolsHostType::getGraph(rptr, cidx, AA_host); double t_graph = timer.seconds(); GraphToolsHostType_Scotch S; S.setGraph(AA_host.NumRows(), rptr, cidx); S.setSeed(0); S.setTreeLevel(); S.setStrategy( SCOTCH_STRATSPEED | SCOTCH_STRATLEVELMAX | SCOTCH_STRATLEVELMIN | SCOTCH_STRATLEAFSIMPLE | SCOTCH_STRATSEPASIMPLE ); timer.reset(); S.computeOrdering(treecut); double t_scotch = timer.seconds(); if (verbose) S.showMe(std::cout) << std::endl; CrsMatrixBaseHostType AA_scotch_host("AA_scotch_host"); AA_scotch_host.createConfTo(AA_host); CrsMatrixTools::copy(AA_scotch_host, S.PermVector(), S.InvPermVector(), AA_host); if (verbose) AA_scotch_host.showMe(std::cout) << std::endl; /// /// Run CAMD /// /// input - rptr, cidx, A_host /// output - S (perm, iperm, nblks, range, tree), AA_scotch_host (permuted) /// timer.reset(); GraphToolsHostType::getGraph(rptr, cidx, AA_scotch_host); t_graph += timer.seconds(); GraphToolsHostType_CAMD C; C.setGraph(AA_scotch_host.NumRows(), rptr, cidx, S.NumBlocks(), S.RangeVector()); timer.reset(); C.computeOrdering(); double t_camd = timer.seconds(); if (verbose) C.showMe(std::cout) << std::endl; CrsMatrixBaseHostType AA_camd_host("AA_camd_host"); AA_camd_host.createConfTo(AA_scotch_host); CrsMatrixTools::copy(AA_camd_host, C.PermVector(), C.InvPermVector(), AA_scotch_host); if (verbose) AA_camd_host.showMe(std::cout) << std::endl; /// /// Symbolic factorization /// /// input - /// output - S (perm, iperm, nblks, range, tree), AA_scotch_host (permuted) /// CrsMatrixBaseHostType 
AA_factor_host("AA_factor_host"); timer.reset(); IncompleteSymbolicFactorizationType::createNonZeroPattern(AA_factor_host, fill_level, Uplo::Upper, AA_camd_host, rows_per_team); double t_symbolic = timer.seconds(); if (verbose) AA_factor_host.showMe(std::cout) << std::endl; /// /// Clean tempoerary matrices /// /// input - AA_scotch_host, AA_camd_host, C, rptr, cidx /// output - none /// AA_scotch_host = CrsMatrixBaseHostType(); AA_camd_host = CrsMatrixBaseHostType(); C = GraphToolsHostType_CAMD(); rptr = typename GraphToolsHostType::size_type_array(); cidx = typename GraphToolsHostType::ordinal_type_array(); /// /// Create task policy /// /// input - max_task_size /// output - policy /// const size_type max_task_size = (3*sizeof(CrsTaskViewHostType)+sizeof(PolicyType)+128); timer.reset(); PolicyType policy(max_concurrency, max_task_size, max_task_dependence, team_size); double t_policy = timer.seconds(); /// /// Sequential execution /// /// input - AA_factor_host (matrix to be compared), rowviews /// output - BB_factor_host, B_factor_host /// double t_chol_serial = 0; CrsMatrixBaseHostType BB_factor_host("BB_factor_host"); if (check) { BB_factor_host.createConfTo(AA_factor_host); CrsMatrixTools::copy(BB_factor_host, AA_factor_host); CrsTaskViewHostType B_factor_host(BB_factor_host); Kokkos::View<typename CrsTaskViewHostType::row_view_type*,HostSpaceType> rowviews("RowViewInMatView", B_factor_host.NumRows()); B_factor_host.setRowViewArray(rowviews); timer.reset(); { auto future = policy.proc_create_team(Chol<Uplo::Upper,AlgoChol::Unblocked,Variant::One> ::createTaskFunctor(policy, B_factor_host)); policy.spawn(future); Kokkos::Experimental::wait(policy); TACHO_TEST_FOR_ABORT( future.get(), "Fail to perform Cholesky (serial)"); } t_chol_serial = timer.seconds(); if (verbose) BB_factor_host.showMe(std::cout) << std::endl; } /// /// Task parallel execution /// /// input - AA_factor_host, rowviews /// output - HA_factor_host, AA_factor_host, B_factor_host /// double 
t_hier = 0, t_blocks = 0, t_chol_parallel = 0; CrsHierBaseHostType HA_factor_host("HA_factor_host"); { timer.reset(); S.pruneTree(prunecut); CrsMatrixTools::createHierMatrix(HA_factor_host, AA_factor_host, S.NumBlocks(), S.RangeVector(), S.TreeVector()); t_hier = timer.seconds(); timer.reset(); size_type nblocks = HA_factor_host.NumNonZeros(); Kokkos::View<ordinal_type*,HostSpaceType> ap_rowview_blocks("NumRowViewInBlocks", nblocks + 1); ap_rowview_blocks(0) = 0; for (ordinal_type k=0;k<nblocks;++k) ap_rowview_blocks(k+1) = ap_rowview_blocks(k) + HA_factor_host.Value(k).NumRows(); Kokkos::View<typename CrsMatrixViewHostType::row_view_type*,HostSpaceType> rowview_blocks("RowViewInBlocks", ap_rowview_blocks(nblocks)); Kokkos::parallel_for(Kokkos::RangePolicy<HostSpaceType>(0, nblocks), [&](const ordinal_type k) { const ordinal_type begin = ap_rowview_blocks(k); const ordinal_type end = ap_rowview_blocks(k+1); HA_factor_host.Value(k).setRowViewArray (Kokkos::subview(rowview_blocks, Kokkos::pair<ordinal_type,ordinal_type>(begin, end))); } ); CrsMatrixTools::filterEmptyBlocks(HA_factor_host); t_blocks = timer.seconds(); { size_type nblocks_filtered = HA_factor_host.NumNonZeros(), nnz_blocks = 0; for (size_type k=0;k<nblocks_filtered; ++k) nnz_blocks += HA_factor_host.Value(k).NumNonZeros(); TACHO_TEST_FOR_ABORT( nnz_blocks != AA_factor_host.NumNonZeros(), "nnz counted in blocks is different from nnz in the base matrix."); } CrsTaskHierViewHostType H_factor_host(HA_factor_host); timer.reset(); { auto future = policy.proc_create_team(Chol<Uplo::Upper,AlgoChol::ByBlocks,Variant::One> ::createTaskFunctor(policy, H_factor_host)); policy.spawn(future); Kokkos::Experimental::wait(policy); TACHO_TEST_FOR_ABORT( future.get(), "Fail to perform Cholesky (serial)"); } t_chol_parallel = timer.seconds(); if (verbose) AA_factor_host.showMe(std::cout) << std::endl; } if (check) { double diff = 0, norm = 0; TACHO_TEST_FOR_ABORT( BB_factor_host.NumNonZeros() != 
AA_factor_host.NumNonZeros(), "nnz used in serial is not same as nnz used in parallel"); const size_type nnz = AA_factor_host.NumNonZeros(); for (size_type k=0;k<nnz;++k) { norm += Util::abs(BB_factor_host.Value(k)); diff += Util::abs(AA_factor_host.Value(k) - BB_factor_host.Value(k)); } std::cout << std::scientific; std::cout << "CholByBlocks:: check with serial execution " << std::endl << " diff = " << diff << ", norm = " << norm << ", rel err = " << (diff/norm) << std::endl; std::cout.unsetf(std::ios::scientific); } { const auto prec = std::cout.precision(); std::cout.precision(4); std::cout << std::scientific; std::cout << "CholByBlocks:: Given matrix = " << AA_host.NumRows() << " x " << AA_host.NumCols() << ", nnz = " << AA_host.NumNonZeros() << std::endl; std::cout << "CholByBlocks:: Factored matrix = " << AA_factor_host.NumRows() << " x " << AA_factor_host.NumCols() << ", nnz = " << AA_factor_host.NumNonZeros() << std::endl; std::cout << "CholByBlocks:: Hier matrix = " << HA_factor_host.NumRows() << " x " << HA_factor_host.NumCols() << ", nnz = " << HA_factor_host.NumNonZeros() << std::endl; std::cout << "CholByBlocks:: " << "read = " << t_read << " [sec], " << "graph generation = " << (t_graph/2.0) << " [sec] " << "scotch reordering = " << t_scotch << " [sec] " << "camd reordering = " << t_camd << " [sec] " << std::endl << "CholByBlocks:: " << "symbolic factorization = " << t_symbolic << " [sec] " << std::endl << "CholByBlocks:: " << "policy creation = " << t_policy << " [sec] " << "hier creation = " << t_hier << " [sec] " << "block specification = " << t_blocks << " [sec] " << std::endl << "CholByBlocks:: " << "Chol Parallel = " << t_chol_parallel << " [sec] "; if (check) std::cout << "Chol Serial = " << (check ? t_chol_serial : -1) << " [sec] " << "speed-up = " << (t_chol_serial/t_chol_parallel) << " [sec] "; std::cout << std::endl; std::cout.unsetf(std::ios::scientific); std::cout.precision(prec); } return r_val; }
int exampleDenseCholByBlocks(const ordinal_type mmin, const ordinal_type mmax, const ordinal_type minc, const ordinal_type mb, const int max_concurrency, const int max_task_dependence, const int team_size, const int mkl_nthreads, const bool check, const bool verbose) { typedef typename Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ; const bool detail = false; std::cout << "DeviceSpace:: "; DeviceSpaceType::print_configuration(std::cout, detail); std::cout << "HostSpace:: "; HostSpaceType::print_configuration(std::cout, detail); typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType; typedef DenseMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> DenseMatrixBaseHostType; typedef DenseMatrixView<DenseMatrixBaseHostType> DenseMatrixViewHostType; typedef DenseMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> DenseMatrixBaseDeviceType; typedef DenseMatrixView<DenseMatrixBaseDeviceType> DenseMatrixViewDeviceType; typedef TaskView<DenseMatrixViewDeviceType> DenseTaskViewDeviceType; typedef DenseMatrixBase<DenseTaskViewDeviceType,ordinal_type,size_type,DeviceSpaceType> DenseHierMatrixBaseDeviceType; typedef DenseMatrixView<DenseHierMatrixBaseDeviceType> DenseHierMatrixViewDeviceType; typedef TaskView<DenseHierMatrixViewDeviceType> DenseHierTaskViewDeviceType; int r_val = 0; Kokkos::Impl::Timer timer; double t = 0.0; std::cout << "DenseCholByBlocks:: test matrices " <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << " , mb = " << mb << std::endl; const size_t max_task_size = (3*sizeof(DenseTaskViewDeviceType)+sizeof(PolicyType)+128); PolicyType policy(max_concurrency, max_task_size, max_task_dependence, team_size); std::ostringstream os; os.precision(3); os << std::scientific; for (ordinal_type m=mmin;m<=mmax;m+=minc) { os.str(""); // host matrices DenseMatrixBaseHostType AA_host("AA_host", m, m), AB_host("AB_host"), TT_host("TT_host"); // random T matrix { 
TT_host.createConfTo(AA_host); for (ordinal_type j=0;j<TT_host.NumCols();++j) { for (ordinal_type i=0;i<TT_host.NumRows();++i) TT_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0; TT_host.Value(j,j) = std::fabs(TT_host.Value(j,j)); } } // create SPD matrix { Teuchos::BLAS<ordinal_type,value_type> blas; blas.HERK(ArgUplo == Uplo::Upper ? Teuchos::UPPER_TRI : Teuchos::LOWER_TRI, Teuchos::CONJ_TRANS, m, m, 1.0, TT_host.ValuePtr(), TT_host.ColStride(), 0.0, AA_host.ValuePtr(), AA_host.ColStride()); // preserve a copy of A AB_host.createConfTo(AA_host); DenseMatrixTools::copy(AB_host, AA_host); } const double flop = DenseFlopCount<value_type>::Chol(m); #ifdef HAVE_SHYLUTACHO_MKL mkl_set_num_threads(mkl_nthreads); #endif os << "DenseCholByBlocks:: m = " << m << " "; int ierr = 0; if (check) { timer.reset(); DenseMatrixViewHostType A_host(AB_host); ierr = Chol<ArgUplo,AlgoChol::ExternalLapack,Variant::One>::invoke (policy, policy.member_single(), A_host); t = timer.seconds(); TACHO_TEST_FOR_ABORT( ierr, "Fail to perform Cholesky (serial)"); os << ":: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs] "; } DenseMatrixBaseDeviceType AA_device("AA_device"); { timer.reset(); AA_device.mirror(AA_host); t = timer.seconds(); os << ":: Mirror = " << t << " [sec] "; } { DenseHierMatrixBaseDeviceType HA_device("HA_device"); DenseMatrixTools::createHierMatrix(HA_device, AA_device, mb, mb); DenseHierTaskViewDeviceType TA_device(HA_device); timer.reset(); auto future = policy.proc_create_team (Chol<ArgUplo,AlgoChol::DenseByBlocks,ArgVariant> ::createTaskFunctor(policy, TA_device), 0); policy.spawn(future); Kokkos::Experimental::wait(policy); t = timer.seconds(); os << ":: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs] "; } AA_host.mirror(AA_device); if (!ierr && check) { double err = 0.0, norm = 0.0; for (ordinal_type j=0;j<AA_host.NumCols();++j) for (ordinal_type i=0;i<=j;++i) { const double diff = abs(AA_host.Value(i,j) - AB_host.Value(i,j)); const double 
val = AB_host.Value(i,j); err += diff*diff; norm += val*val; } os << ":: Check result ::norm = " << sqrt(norm) << ", error = " << sqrt(err); } std::cout << os.str() << std::endl; } return r_val; }
int exampleCholUnblocked(const std::string file_input, const int treecut, const int prunecut, const int fill_level, const int rows_per_team, const bool verbose) { typedef typename Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ; const bool detail = false; std::cout << "DeviceSpace:: "; DeviceSpaceType::print_configuration(std::cout, detail); std::cout << "HostSpace:: "; HostSpaceType::print_configuration(std::cout, detail); typedef CrsMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> CrsMatrixBaseHostType; typedef GraphTools<ordinal_type,size_type,HostSpaceType> GraphToolsHostType; typedef GraphTools_Scotch<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_Scotch; typedef GraphTools_CAMD<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_CAMD; typedef IncompleteSymbolicFactorization<CrsMatrixBaseHostType> IncompleteSymbolicFactorizationType; typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType; typedef CrsMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> CrsMatrixBaseDeviceType; typedef CrsMatrixView<CrsMatrixBaseDeviceType> CrsMatrixViewDeviceType; typedef TaskView<CrsMatrixViewDeviceType> CrsTaskViewDeviceType; int r_val = 0; Kokkos::Impl::Timer timer; CrsMatrixBaseHostType AA_host("AA_host"); timer.reset(); { std::ifstream in; in.open(file_input); if (!in.good()) { std::cout << "Failed in open the file: " << file_input << std::endl; return -1; } MatrixMarket::read(AA_host, in); } double t_read = timer.seconds(); if (verbose) AA_host.showMe(std::cout) << std::endl; typename GraphToolsHostType::size_type_array rptr("Graph::RowPtrArray", AA_host.NumRows() + 1); typename GraphToolsHostType::ordinal_type_array cidx("Graph::ColIndexArray", AA_host.NumNonZeros()); timer.reset(); GraphToolsHostType::getGraph(rptr, cidx, AA_host); double t_graph = timer.seconds(); GraphToolsHostType_Scotch S; S.setGraph(AA_host.NumRows(), rptr, cidx); S.setSeed(0); S.setTreeLevel(); 
S.setStrategy( SCOTCH_STRATSPEED | SCOTCH_STRATLEVELMAX | SCOTCH_STRATLEVELMIN | SCOTCH_STRATLEAFSIMPLE | SCOTCH_STRATSEPASIMPLE ); timer.reset(); S.computeOrdering(treecut); double t_scotch = timer.seconds(); S.pruneTree(prunecut); if (verbose) S.showMe(std::cout) << std::endl; CrsMatrixBaseHostType BB_host("BB_host"); BB_host.createConfTo(AA_host); CrsMatrixTools::copy(BB_host, S.PermVector(), S.InvPermVector(), AA_host); if (verbose) BB_host.showMe(std::cout) << std::endl; timer.reset(); GraphToolsHostType::getGraph(rptr, cidx, BB_host); t_graph += timer.seconds(); GraphToolsHostType_CAMD C; C.setGraph(BB_host.NumRows(), rptr, cidx, S.NumBlocks(), S.RangeVector()); timer.reset(); C.computeOrdering(); double t_camd = timer.seconds(); if (verbose) C.showMe(std::cout) << std::endl; CrsMatrixBaseHostType CC_host("CC_host"); CC_host.createConfTo(BB_host); CrsMatrixTools::copy(CC_host, C.PermVector(), C.InvPermVector(), BB_host); if (verbose) CC_host.showMe(std::cout) << std::endl; CrsMatrixBaseHostType DD_host("DD_host"); timer.reset(); IncompleteSymbolicFactorizationType::createNonZeroPattern(DD_host, fill_level, Uplo::Upper, CC_host, rows_per_team); double t_symbolic = timer.seconds(); if (verbose) DD_host.showMe(std::cout) << std::endl; // ================================================================================== CrsMatrixBaseDeviceType AA_device("AA_device"); AA_device.mirror(DD_host); const size_type max_concurrency = 10; const size_type max_task_size = (3*sizeof(CrsTaskViewDeviceType)+sizeof(PolicyType)+128); const size_type max_task_dependence = 0; const size_type team_size = 1; PolicyType policy(max_concurrency, max_task_size, max_task_dependence, team_size); CrsMatrixViewDeviceType A_device(AA_device); Kokkos::View<typename CrsMatrixViewDeviceType::row_view_type*,DeviceSpaceType> rowviews("RowViewInMatView", A_device.NumRows()); A_device.setRowViewArray(rowviews); timer.reset(); int ierr = Chol<Uplo::Upper,AlgoChol::Unblocked,Variant::One>::invoke 
(policy, policy.member_single(), A_device); double t_chol = timer.seconds(); TACHO_TEST_FOR_ABORT( ierr, "Fail to perform Cholesky (serial)"); if (verbose) { DD_host.mirror(AA_device); DD_host.showMe(std::cout) << std::endl; } { const auto prec = std::cout.precision(); std::cout.precision(4); std::cout << std::scientific; std::cout << "SymbolicFactorization:: Given matrix dimension = " << AA_host.NumRows() << " x " << AA_host.NumCols() << ", " << " nnz = " << AA_host.NumNonZeros() << std::endl; std::cout << "SymbolicFactorization:: Upper factors dimension = " << DD_host.NumRows() << " x " << DD_host.NumCols() << ", " << " nnz = " << DD_host.NumNonZeros() << std::endl; std::cout << "SymbolicFactorization:: " << "read = " << t_read << " [sec], " << "graph generation = " << (t_graph/2.0) << " [sec] " << "scotch reordering = " << t_scotch << " [sec] " << "camd reordering = " << t_camd << " [sec] " << "symbolic factorization = " << t_symbolic << " [sec] " << "Cholesky factorization = " << t_chol << " [sec] " << std::endl; std::cout.unsetf(std::ios::scientific); std::cout.precision(prec); } return r_val; }
/// Read a Matrix Market coordinate file into a CRS matrix.
///
/// The first line is the banner; following lines starting with '%' and blank
/// lines are skipped. The size line "m n nnz" is then read, followed by nnz
/// "row col value" triples with 1-based indices. When the banner contains
/// "symmetric", every off-diagonal entry is mirrored. Entries are sorted,
/// consecutive entries that compare equal are summed, and the result is
/// stored in A. Rows without any entry get an empty range, and a file
/// without entries yields a matrix with zero nonzeros.
///
/// Aborts when the stream is not open or the size line cannot be parsed.
///
/// \param A     matrix to create and fill
/// \param file  opened input stream positioned at the banner line
static void read(CrsMatrixType &A, std::ifstream &file) {
  // static_assert( Kokkos::Impl::is_same<
  //                typename CrsMatrixType::space_type,
  //                Kokkos::HostSpace
  //                >::value,
  //                "Space type of the input should be HostSpace" );

  typedef typename CrsMatrixType::value_type   value_type;
  typedef typename CrsMatrixType::ordinal_type ordinal_type;
  typedef typename CrsMatrixType::size_type    size_type;

  // coordinate format
  typedef Coo<ordinal_type,value_type> ijv_type;

  ordinal_type m = 0, n = 0;
  size_type nnz = 0;

  std::vector<ijv_type> mm;
  const ordinal_type mm_base = 1;
  {
    std::string header;
    if (file.is_open()) {
      std::getline(file, header);
      // skip comment and blank lines of any length
      std::string skipped;
      while (file.good()) {
        char c = file.peek();
        if (c == '%' || c == '\n') {
          std::getline(file, skipped);
          continue;
        }
        break;
      }
    } else {
      TACHO_TEST_FOR_ABORT(true, MSG_INVALID_INPUT(file));
    }

    // check the header
    bool symmetry = (header.find("symmetric") != std::string::npos);

    // read matrix specification
    file >> m >> n >> nnz;
    TACHO_TEST_FOR_ABORT(file.fail(), "MatrixMarket::read: failed to read matrix dimensions");

    mm.reserve(nnz*(symmetry ? 2 : 1));
    for (size_type i=0;i<nnz;++i) {
      ordinal_type row, col;
      value_type val;
      file >> row >> col >> val;

      row -= mm_base;
      col -= mm_base;

      mm.push_back(ijv_type(row, col, val));
      if (symmetry && row != col)
        mm.push_back(ijv_type(col, row, val));
    }

    // sort by row major order
    std::sort(mm.begin(), mm.end(), std::less<ijv_type>());
  }

  // change mm to crs
  std::vector<size_type> ap;
  std::vector<ordinal_type> aj;
  std::vector<value_type> ax;

  ap.reserve(m+1);
  aj.reserve(mm.size());
  ax.reserve(mm.size());
  {
    // ap[r] is the offset of the first entry of row r; cur_row is the last
    // row whose begin offset has been emitted
    ordinal_type cur_row = 0;
    ap.push_back(0);

    bool has_prev = false;
    ijv_type prev;
    for (auto it=mm.begin();it<mm.end();++it) {
      ijv_type aij = (*it);

      // open every row up to the one of this entry, so that rows without
      // entries still get a (zero-length) range
      for ( ;cur_row<aij.Row();++cur_row)
        ap.push_back(aj.size());

      if (has_prev && aij == prev) {
        // duplicate coordinate: accumulate into the stored entry
        aj.back()  = aij.Col();
        ax.back() += aij.Val();
      } else {
        aj.push_back(aij.Col());
        ax.push_back(aij.Val());
      }

      prev = aij;
      has_prev = true;
    }

    // terminate the storage: close the last row and any trailing empty rows
    for ( ;cur_row<m;++cur_row)
      ap.push_back(aj.size());

    nnz = aj.size();
  }

  // create crs matrix view
  A.create(m, n, nnz);
  for (ordinal_type i=0;i<m;++i) {
    A.RowPtrBegin(i) = ap.at(i);
    A.RowPtrEnd(i) = ap.at(i+1);
  }
  for (size_type k=0;k<nnz;++k) {
    A.Col(k) = aj.at(k);
    A.Value(k) = ax.at(k);
  }
}