typename boost::math::tools::promote_args<T_y, T_covar, T_w>::type
    multi_gp_cholesky_log(const Eigen::Matrix
                          <T_y, Eigen::Dynamic, Eigen::Dynamic>& y,
                          const Eigen::Matrix
                          <T_covar, Eigen::Dynamic, Eigen::Dynamic>& L,
                          const Eigen::Matrix<T_w, Eigen::Dynamic, 1>& w) {
      // Accumulates a log density over the rows of y.  Row i of y is paired
      // with the scale w(i); L is used both through its diagonal and as the
      // left operand of mdivide_left_tri_low (presumably a lower-triangular
      // Cholesky factor -- confirm against the callers).
      //
      // The returned value is the sum of up to four terms, each guarded by
      // include_summand<propto, ...>::value (propto is declared in the
      // template header of this function):
      //   1. NEG_LOG_SQRT_TWO_PI * y.rows() * y.cols()
      //   2. -y.rows() * sum(log(diag(L)))
      //   3. +0.5 * y.cols() * sum(log(w))
      //   4. -0.5 * sum_i w(i) * dot_self(mdivide_left_tri_low(L, y.row(i)))
      //
      // The check_* calls validate sizes and values before any term is
      // computed; an input with zero rows yields 0.
      typedef typename boost::math::tools::promote_args<T_y, T_covar, T_w>::type
        T_lp;
      typedef typename boost::math::tools::promote_args<T_y, T_covar>::type
        T_solved;
      typedef Eigen::Matrix<T_y, Eigen::Dynamic, 1> y_vector_t;
      typedef Eigen::Matrix<T_solved, Eigen::Dynamic, 1> solved_vector_t;

      static const char* function("multi_gp_cholesky_log");
      T_lp log_density(0.0);

      // One scale per row of y, and L must have as many rows as y has columns.
      check_size_match(function,
                       "Size of random variable (rows y)", y.rows(),
                       "Size of kernel scales (w)", w.size());
      check_size_match(function,
                       "Size of random variable", y.cols(),
                       "rows of covariance parameter", L.rows());
      check_finite(function, "Kernel scales", w);
      check_positive(function, "Kernel scales", w);
      check_finite(function, "Random variable", y);

      // Nothing to accumulate for an empty set of rows.
      if (y.rows() == 0) {
        return log_density;
      }

      // Term 1: constant contribution, one NEG_LOG_SQRT_TWO_PI per entry of y.
      if (include_summand<propto>::value) {
        log_density += NEG_LOG_SQRT_TWO_PI * y.rows() * y.cols();
      }

      // Term 2: sum of the logs of L's diagonal, counted once per row of y.
      if (include_summand<propto, T_covar>::value) {
        log_density -= L.diagonal().array().log().sum() * y.rows();
      }

      // Term 3: half the sum of log-scales, counted once per column of y.
      if (include_summand<propto, T_w>::value) {
        log_density += 0.5 * y.cols() * sum(log(w));
      }

      // Term 4: scale-weighted squared norms of the per-row results of
      // mdivide_left_tri_low(L, row).
      if (include_summand<propto, T_y, T_w, T_covar>::value) {
        T_lp weighted_sq_norms(0.0);
        for (int row = 0; row < y.rows(); ++row) {
          // Copy the row into a column vector before handing it to the solver.
          y_vector_t y_as_column(y.row(row));
          solved_vector_t solved(mdivide_left_tri_low(L, y_as_column));
          weighted_sq_norms += w(row) * dot_self(solved);
        }
        log_density -= 0.5*weighted_sq_norms;
      }

      return log_density;
    }
// Example #2
// 0
// Benchmark driver for cusp::MVmultiply.
//
// Builds a sparse matrix with cusp::gallery::poisson27pt on an n x n x n grid
// and, for each number of right-hand sides from "begin" to "end" in steps of
// "step", times "nits" multiplies against a dense block stored row-major and
// then column-major.  One comma-separated line of timings is printed per
// block width.
//
// Options (registered with Teuchos::CommandLineProcessor): n, begin, end,
// step, nits, device.
//
// Returns 0 on success or after only printing the help text, and -1 when an
// option value is not usable or an exception was caught.
int main(int argc, char *argv[])
{
  typedef int                 IndexType;
  typedef double              ValueType;
  typedef cusp::device_memory MemorySpace;

  bool success = true;
  bool verbose = false;
  try {

    // Setup command line options
    Teuchos::CommandLineProcessor CLP;
    CLP.setDocString("This test performance of block multiply routines.\n");
    IndexType n = 32;
    CLP.setOption("n", &n, "Number of mesh points in the each direction");
    IndexType nrhs_begin = 32;
    CLP.setOption("begin", &nrhs_begin,
                  "Staring number of right-hand-sides");
    IndexType nrhs_end = 512;
    CLP.setOption("end", &nrhs_end,
                  "Ending number of right-hand-sides");
    IndexType nrhs_step = 32;
    CLP.setOption("step", &nrhs_step,
                  "Increment in number of right-hand-sides");
    IndexType nits = 10;
    CLP.setOption("nits", &nits,
                  "Number of multiply iterations");
    int device_id = 0;
    CLP.setOption("device", &device_id, "CUDA device ID");

    // A help request is reported only through parse()'s return value; stop
    // here instead of running the whole benchmark after printing the usage.
    if (CLP.parse( argc, argv ) ==
        Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED)
      return 0;

    // Reject values that would break the sweep below: a non-positive step
    // never reaches nrhs_end, a non-positive iteration count makes the
    // per-iteration time a division by zero (or negative), and non-positive
    // sizes describe an empty problem.
    if (n <= 0 || nrhs_begin <= 0 || nrhs_step <= 0 || nits <= 0) {
      std::cerr << "Error: options n, begin, step and nits must be positive"
                << std::endl;
      return -1;
    }

    // Set CUDA device
    cudaSetDevice(device_id);
    // Request 8-byte shared-memory banks (ValueType is double).
    cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);

    // create 3D Poisson problem
    cusp::csr_matrix<IndexType, ValueType, MemorySpace> A;
    cusp::gallery::poisson27pt(A, n, n, n);

    std::cout << "nrhs , num_rows , num_entries , row_time , row_gflops , "
              << "col_time , col_gflops" << std::endl;

    for (IndexType nrhs = nrhs_begin; nrhs <= nrhs_end; nrhs += nrhs_step) {

      // Work per multiply is counted as two floating-point operations per
      // stored matrix entry per right-hand side.
      double flops =
        2.0 * static_cast<double>(A.num_entries) * static_cast<double>(nrhs);

      // test row-major storage
      // (third constructor argument is presumably the initial fill value)
      cusp::array2d<ValueType, MemorySpace, cusp::row_major> x_row(
        A.num_rows, nrhs, 1);
      cusp::array2d<ValueType, MemorySpace, cusp::row_major> y_row(
        A.num_rows, nrhs, 0);

      cusp::detail::timer row_timer;
      row_timer.start();
      for (IndexType iter=0; iter<nits; ++iter) {
        cusp::MVmultiply(A, x_row, y_row);
      }
      // Wait for all queued device work before reading the timer.
      cudaDeviceSynchronize();
      double row_time = row_timer.seconds_elapsed() / nits;
      double row_gflops = 1.0e-9 * flops / row_time;

      // test column-major storage
      cusp::array2d<ValueType, MemorySpace, cusp::column_major> x_col(
        A.num_rows, nrhs, 1);
      cusp::array2d<ValueType, MemorySpace, cusp::column_major> y_col(
        A.num_rows, nrhs, 0);

      cusp::detail::timer col_timer;
      col_timer.start();
      for (IndexType iter=0; iter<nits; ++iter) {
        cusp::MVmultiply(A, x_col, y_col);
      }
      // Wait for all queued device work before reading the timer.
      cudaDeviceSynchronize();
      double col_time = col_timer.seconds_elapsed() / nits;
      double col_gflops = 1.0e-9 * flops / col_time;

      std::cout << nrhs << " , "
                << A.num_rows << " , "  << A.num_entries << " , "
                << row_time << " , " << row_gflops << " , "
                << col_time << " , " << col_gflops
                << std::endl;

    }

  }
  // Reports any caught exception on std::cerr and clears `success`.
  TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);

  if (success)
    return 0;
  return -1;
}