/**
 * Accumulates a log-density value for the rows of y, given a matrix L and a
 * vector of per-row weights w.
 *
 * The result is built from up to four terms, each included only when the
 * corresponding include_summand<...>::value is true:
 *   - NEG_LOG_SQRT_TWO_PI * y.rows() * y.cols()
 *   - minus y.rows() times the sum of log of the diagonal of L
 *   - plus 0.5 * y.cols() * sum(log(w))
 *   - minus 0.5 * sum over rows i of w(i) * dot_self(mdivide_left_tri_low(L, y.row(i)))
 *
 * NOTE(review): `propto` and the T_* type parameters come from a template
 * header that is not visible here. L is presumably a lower-triangular
 * Cholesky factor and mdivide_left_tri_low presumably solves L * x = y_row;
 * confirm against the helper definitions.
 *
 * @param y Matrix whose rows are processed one at a time; its row count must
 *          match w.size() and its column count must match L.rows().
 * @param L Matrix passed to mdivide_left_tri_low; only its diagonal is read
 *          directly here.
 * @param w Per-row weights; checked to be finite and positive.
 * @return The accumulated value, or 0 when y has no rows.
 *
 * The check_* helpers are presumably responsible for reporting violations
 * (their behavior is defined elsewhere).
 */
typename boost::math::tools::promote_args<T_y, T_covar, T_w>::type
multi_gp_cholesky_log(
    const Eigen::Matrix<T_y, Eigen::Dynamic, Eigen::Dynamic>& y,
    const Eigen::Matrix<T_covar, Eigen::Dynamic, Eigen::Dynamic>& L,
    const Eigen::Matrix<T_w, Eigen::Dynamic, 1>& w) {
  typedef typename boost::math::tools::promote_args<T_y, T_covar, T_w>::type
      T_lp;
  typedef Eigen::Matrix<T_y, Eigen::Dynamic, 1> observation_vector_t;
  typedef Eigen::Matrix<
      typename boost::math::tools::promote_args<T_y, T_covar>::type,
      Eigen::Dynamic, 1>
      solved_vector_t;

  static const char* function = "multi_gp_cholesky_log";

  T_lp log_prob(0.0);

  // Argument validation: dimensions first, then values.
  check_size_match(function,
                   "Size of random variable (rows y)", y.rows(),
                   "Size of kernel scales (w)", w.size());
  check_size_match(function,
                   "Size of random variable", y.cols(),
                   "rows of covariance parameter", L.rows());
  check_finite(function, "Kernel scales", w);
  check_positive(function, "Kernel scales", w);
  check_finite(function, "Random variable", y);

  // Nothing to accumulate for an empty set of rows.
  if (y.rows() == 0) {
    return log_prob;
  }

  // Constant term.
  if (include_summand<propto>::value) {
    log_prob += NEG_LOG_SQRT_TWO_PI * y.rows() * y.cols();
  }

  // Term depending only on the diagonal of L.
  if (include_summand<propto, T_covar>::value) {
    log_prob -= L.diagonal().array().log().sum() * y.rows();
  }

  // Term depending only on the weights.
  if (include_summand<propto, T_w>::value) {
    log_prob += 0.5 * y.cols() * sum(log(w));
  }

  // Weighted sum of squared norms of the per-row triangular solves.
  if (include_summand<propto, T_y, T_w, T_covar>::value) {
    T_lp weighted_quad_sum(0.0);
    for (int row = 0; row < y.rows(); ++row) {
      observation_vector_t current_row(y.row(row));
      solved_vector_t solved(mdivide_left_tri_low(L, current_row));
      weighted_quad_sum += w(row) * dot_self(solved);
    }
    log_prob -= 0.5*weighted_quad_sum;
  }

  return log_prob;
}
int main(int argc, char *argv[]) { typedef int IndexType; typedef double ValueType; typedef cusp::device_memory MemorySpace; //typedef cusp::row_major Orientation; bool success = true; bool verbose = false; try { // Setup command line options Teuchos::CommandLineProcessor CLP; CLP.setDocString("This test performance of block multiply routines.\n"); IndexType n = 32; CLP.setOption("n", &n, "Number of mesh points in the each direction"); IndexType nrhs_begin = 32; CLP.setOption("begin", &nrhs_begin, "Staring number of right-hand-sides"); IndexType nrhs_end = 512; CLP.setOption("end", &nrhs_end, "Ending number of right-hand-sides"); IndexType nrhs_step = 32; CLP.setOption("step", &nrhs_step, "Increment in number of right-hand-sides"); IndexType nits = 10; CLP.setOption("nits", &nits, "Number of multiply iterations"); int device_id = 0; CLP.setOption("device", &device_id, "CUDA device ID"); CLP.parse( argc, argv ); // Set CUDA device cudaSetDevice(device_id); cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); // create 3D Poisson problem cusp::csr_matrix<IndexType, ValueType, MemorySpace> A; cusp::gallery::poisson27pt(A, n, n, n); std::cout << "nrhs , num_rows , num_entries , row_time , row_gflops , " << "col_time , col_gflops" << std::endl; for (IndexType nrhs = nrhs_begin; nrhs <= nrhs_end; nrhs += nrhs_step) { double flops = 2.0 * static_cast<double>(A.num_entries) * static_cast<double>(nrhs); // test row-major storage cusp::array2d<ValueType, MemorySpace, cusp::row_major> x_row( A.num_rows, nrhs, 1); cusp::array2d<ValueType, MemorySpace, cusp::row_major> y_row( A.num_rows, nrhs, 0); cusp::detail::timer row_timer; row_timer.start(); for (IndexType iter=0; iter<nits; ++iter) { cusp::MVmultiply(A, x_row, y_row); } cudaDeviceSynchronize(); double row_time = row_timer.seconds_elapsed() / nits; double row_gflops = 1.0e-9 * flops / row_time; // test column-major storage cusp::array2d<ValueType, MemorySpace, cusp::column_major> x_col( A.num_rows, nrhs, 1); 
cusp::array2d<ValueType, MemorySpace, cusp::column_major> y_col( A.num_rows, nrhs, 0); cusp::detail::timer col_timer; col_timer.start(); for (IndexType iter=0; iter<nits; ++iter) { cusp::MVmultiply(A, x_col, y_col); } cudaDeviceSynchronize(); double col_time = col_timer.seconds_elapsed() / nits; double col_gflops = 1.0e-9 * flops / col_time; std::cout << nrhs << " , " << A.num_rows << " , " << A.num_entries << " , " << row_time << " , " << row_gflops << " , " << col_time << " , " << col_gflops << std::endl; } } TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success); if (success) return 0; return -1; }