int exampleDenseGemmByBlocks(const ordinal_type mmin, const ordinal_type mmax, const ordinal_type minc, const ordinal_type k, const ordinal_type mb, const int max_concurrency, const int max_task_dependence, const int team_size, const int mkl_nthreads, const bool check, const bool verbose) { typedef typename Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ; const bool detail = false; std::cout << "DeviceSpace:: "; DeviceSpaceType::print_configuration(std::cout, detail); std::cout << "HostSpace:: "; HostSpaceType::print_configuration(std::cout, detail); typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType; typedef DenseMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> DenseMatrixBaseHostType; typedef DenseMatrixView<DenseMatrixBaseHostType> DenseMatrixViewHostType; typedef DenseMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> DenseMatrixBaseDeviceType; typedef DenseMatrixView<DenseMatrixBaseDeviceType> DenseMatrixViewDeviceType; typedef TaskView<DenseMatrixViewDeviceType> DenseTaskViewDeviceType; typedef DenseMatrixBase<DenseTaskViewDeviceType,ordinal_type,size_type,DeviceSpaceType> DenseHierMatrixBaseDeviceType; typedef DenseMatrixView<DenseHierMatrixBaseDeviceType> DenseHierMatrixViewDeviceType; typedef TaskView<DenseHierMatrixViewDeviceType> DenseHierTaskViewDeviceType; int r_val = 0; Kokkos::Impl::Timer timer; double t = 0.0; std::cout << "DenseGemmByBlocks:: test matrices " <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << " , k = "<< k << " , mb = " << mb << std::endl; const size_t max_task_size = (3*sizeof(DenseTaskViewDeviceType)+sizeof(PolicyType)+128); PolicyType policy(max_concurrency, max_task_size, max_task_dependence, team_size); std::ostringstream os; os.precision(3); os << std::scientific; for (ordinal_type m=mmin;m<=mmax;m+=minc) { os.str(""); // host matrices DenseMatrixBaseHostType AA_host, BB_host, CC_host("CC_host", m, m), CB_host("CB_host", m, m); { if (ArgTransA == Trans::NoTranspose) AA_host = DenseMatrixBaseHostType("AA_host", m, k); else AA_host = DenseMatrixBaseHostType("AA_host", k, m); if (ArgTransB == Trans::NoTranspose) BB_host = DenseMatrixBaseHostType("BB_host", k, m); else BB_host = DenseMatrixBaseHostType("BB_host", m, k); for (ordinal_type j=0;j<AA_host.NumCols();++j) for (ordinal_type i=0;i<AA_host.NumRows();++i) AA_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0; for (ordinal_type j=0;j<BB_host.NumCols();++j) for (ordinal_type i=0;i<BB_host.NumRows();++i) BB_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0; for (ordinal_type j=0;j<CC_host.NumCols();++j) for (ordinal_type i=0;i<CC_host.NumRows();++i) CC_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0; DenseMatrixTools::copy(CB_host, CC_host); } const double flop = DenseFlopCount<value_type>::Gemm(m, m, k); #ifdef HAVE_SHYLUTACHO_MKL mkl_set_num_threads(mkl_nthreads); #endif os << "DenseGemmByBlocks:: m = " << m << " n = " << m << " k = " << k << " "; if (check) { timer.reset(); DenseMatrixViewHostType A_host(AA_host), B_host(BB_host), C_host(CB_host); Gemm<ArgTransA,ArgTransB,AlgoGemm::ExternalBlas,Variant::One>::invoke (policy, policy.member_single(), 1.0, A_host, B_host, 1.0, C_host); t = timer.seconds(); os << ":: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs] "; } DenseMatrixBaseDeviceType AA_device("AA_device"), BB_device("BB_device"), CC_device("CC_device"); { timer.reset(); AA_device.mirror(AA_host); BB_device.mirror(BB_host); CC_device.mirror(CC_host); t = timer.seconds(); os << ":: Mirror = " << t << " [sec] "; } { DenseHierMatrixBaseDeviceType HA_device("HA_device"), HB_device("HB_device"), HC_device("HC_device"); DenseMatrixTools::createHierMatrix(HA_device, AA_device, mb, mb); DenseMatrixTools::createHierMatrix(HB_device, BB_device, mb, mb); DenseMatrixTools::createHierMatrix(HC_device, CC_device, mb, mb); DenseHierTaskViewDeviceType TA_device(HA_device), TB_device(HB_device), TC_device(HC_device); timer.reset(); auto future = policy.proc_create_team (Gemm<ArgTransA,ArgTransB,AlgoGemm::DenseByBlocks,ArgVariant> ::createTaskFunctor(policy, 1.0, TA_device, TB_device, 1.0, TC_device), 0); policy.spawn(future); Kokkos::Experimental::wait(policy); t = timer.seconds(); os << ":: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs] "; } CC_host.mirror(CC_device); if (check) { double err = 0.0, norm = 0.0; for (ordinal_type j=0;j<CC_host.NumCols();++j) for (ordinal_type i=0;i<CC_host.NumRows();++i) { const double diff = abs(CC_host.Value(i,j) - CB_host.Value(i,j)); const double val = CB_host.Value(i,j); err += diff*diff; norm += val*val; } os << ":: Check result ::norm = " << sqrt(norm) << ", error = " << sqrt(err); } std::cout << os.str() << std::endl; } return r_val; }
int exampleCholUnblocked(const std::string file_input, const int treecut, const int prunecut, const int fill_level, const int rows_per_team, const bool verbose) { typedef typename Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ; const bool detail = false; std::cout << "DeviceSpace:: "; DeviceSpaceType::print_configuration(std::cout, detail); std::cout << "HostSpace:: "; HostSpaceType::print_configuration(std::cout, detail); typedef CrsMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> CrsMatrixBaseHostType; typedef GraphTools<ordinal_type,size_type,HostSpaceType> GraphToolsHostType; typedef GraphTools_Scotch<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_Scotch; typedef GraphTools_CAMD<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_CAMD; typedef IncompleteSymbolicFactorization<CrsMatrixBaseHostType> IncompleteSymbolicFactorizationType; typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType; typedef CrsMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> CrsMatrixBaseDeviceType; typedef CrsMatrixView<CrsMatrixBaseDeviceType> CrsMatrixViewDeviceType; typedef TaskView<CrsMatrixViewDeviceType> CrsTaskViewDeviceType; int r_val = 0; Kokkos::Impl::Timer timer; CrsMatrixBaseHostType AA_host("AA_host"); timer.reset(); { std::ifstream in; in.open(file_input); if (!in.good()) { std::cout << "Failed in open the file: " << file_input << std::endl; return -1; } MatrixMarket::read(AA_host, in); } double t_read = timer.seconds(); if (verbose) AA_host.showMe(std::cout) << std::endl; typename GraphToolsHostType::size_type_array rptr("Graph::RowPtrArray", AA_host.NumRows() + 1); typename GraphToolsHostType::ordinal_type_array cidx("Graph::ColIndexArray", AA_host.NumNonZeros()); timer.reset(); GraphToolsHostType::getGraph(rptr, cidx, AA_host); double t_graph = timer.seconds(); GraphToolsHostType_Scotch S; S.setGraph(AA_host.NumRows(), rptr, cidx); S.setSeed(0); S.setTreeLevel(); S.setStrategy( SCOTCH_STRATSPEED | SCOTCH_STRATLEVELMAX | SCOTCH_STRATLEVELMIN | SCOTCH_STRATLEAFSIMPLE | SCOTCH_STRATSEPASIMPLE ); timer.reset(); S.computeOrdering(treecut); double t_scotch = timer.seconds(); S.pruneTree(prunecut); if (verbose) S.showMe(std::cout) << std::endl; CrsMatrixBaseHostType BB_host("BB_host"); BB_host.createConfTo(AA_host); CrsMatrixTools::copy(BB_host, S.PermVector(), S.InvPermVector(), AA_host); if (verbose) BB_host.showMe(std::cout) << std::endl; timer.reset(); GraphToolsHostType::getGraph(rptr, cidx, BB_host); t_graph += timer.seconds(); GraphToolsHostType_CAMD C; C.setGraph(BB_host.NumRows(), rptr, cidx, S.NumBlocks(), S.RangeVector()); timer.reset(); C.computeOrdering(); double t_camd = timer.seconds(); if (verbose) C.showMe(std::cout) << std::endl; CrsMatrixBaseHostType CC_host("CC_host"); CC_host.createConfTo(BB_host); CrsMatrixTools::copy(CC_host, C.PermVector(), C.InvPermVector(), BB_host); if (verbose) CC_host.showMe(std::cout) << std::endl; CrsMatrixBaseHostType DD_host("DD_host"); timer.reset(); IncompleteSymbolicFactorizationType::createNonZeroPattern(DD_host, fill_level, Uplo::Upper, CC_host, rows_per_team); double t_symbolic = timer.seconds(); if (verbose) DD_host.showMe(std::cout) << std::endl; // ================================================================================== CrsMatrixBaseDeviceType AA_device("AA_device"); AA_device.mirror(DD_host); const size_type max_concurrency = 10; const size_type max_task_size = (3*sizeof(CrsTaskViewDeviceType)+sizeof(PolicyType)+128); const size_type max_task_dependence = 0; const size_type team_size = 1; PolicyType policy(max_concurrency, max_task_size, max_task_dependence, team_size); CrsMatrixViewDeviceType A_device(AA_device); Kokkos::View<typename CrsMatrixViewDeviceType::row_view_type*,DeviceSpaceType> rowviews("RowViewInMatView", A_device.NumRows()); A_device.setRowViewArray(rowviews); timer.reset(); int ierr = Chol<Uplo::Upper,AlgoChol::Unblocked,Variant::One>::invoke (policy, policy.member_single(), A_device); double t_chol = timer.seconds(); TACHO_TEST_FOR_ABORT( ierr, "Fail to perform Cholesky (serial)"); if (verbose) { DD_host.mirror(AA_device); DD_host.showMe(std::cout) << std::endl; } { const auto prec = std::cout.precision(); std::cout.precision(4); std::cout << std::scientific; std::cout << "SymbolicFactorization:: Given matrix dimension = " << AA_host.NumRows() << " x " << AA_host.NumCols() << ", " << " nnz = " << AA_host.NumNonZeros() << std::endl; std::cout << "SymbolicFactorization:: Upper factors dimension = " << DD_host.NumRows() << " x " << DD_host.NumCols() << ", " << " nnz = " << DD_host.NumNonZeros() << std::endl; std::cout << "SymbolicFactorization:: " << "read = " << t_read << " [sec], " << "graph generation = " << (t_graph/2.0) << " [sec] " << "scotch reordering = " << t_scotch << " [sec] " << "camd reordering = " << t_camd << " [sec] " << "symbolic factorization = " << t_symbolic << " [sec] " << "Cholesky factorization = " << t_chol << " [sec] " << std::endl; std::cout.unsetf(std::ios::scientific); std::cout.precision(prec); } return r_val; }