Example #1
///////////////////////////////////////////////////////////////////////////////
//* performs a matrix-vector multiply, with an optional transpose of A, via BLAS dgemm
void MultMv(const Matrix<double> &A, const Vector<double> &v, DenseVector<double> &c,
            const bool At, double a, double b)
{
  static char t[2] = {'N','T'};
  char *ta=t+At;
  int sA[2] = {A.nRows(), A.nCols()};  // sizes of A
  int sV[2] = {v.size(), 1};           // sizes of v
  
  GCK(A, v, sA[!At]!=sV[0], "MultMv<double>: matrix-vector multiply");
  if (c.size() != sA[At])
  {
    c.resize(sA[At]); // set size of C to final size
    c.zero();
  }
  // get pointers to the matrix sizes needed by BLAS
  int *M = sA+At;  // # of rows in op(A)  (op(A) = A' if At is true, else A)
  int *N = sV+1;   // # of cols in v, treated as a one-column matrix (always 1)
  int *K = sA+!At; // # of cols in op(A), which must equal # of rows in v

  double *pa=A.ptr(), *pv=v.ptr(), *pc=c.ptr();

#ifdef COL_STORAGE
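  // Column-major storage matches dgemm's native layout: pass op(A) (M x K)
  // and v (K x 1) straight through, with sA and sV doubling as the leading
  // dimensions.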
  dgemm_(ta, t, M, N, K, &a, pa, sA, pv, sV, &b, pc, M); 
#else
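  // Row-major storage: dgemm assumes column-major, so the row-major A is seen
  // as A'. Compute the transposed product c' = v' * op(A)' by swapping the
  // operands and using the row length (sA[1]) as A's leading dimension.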
  dgemm_(t, ta, N, M, K, &a, pv, sV+1, pa, sA+1, &b, pc, N); 
#endif
}
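
//============================================================
//* Usage sketch (illustration only): how the At/a/b arguments of MultMv map
//* onto c = a*op(A)*v + b*c. The DenseMatrix/DenseVector constructors and the
//* element access used below are assumptions about this library's interface,
//* not something shown in the excerpt above.
void exampleMultMvUsage()
{
  DenseMatrix<double> A(3, 2);               // assumed (rows, cols) constructor
  DenseVector<double> x(2), y(3), z(2);      // assumed (size) constructor
  A(0,0) = 1.0; A(1,1) = 2.0; A(2,0) = 3.0;  // assumed operator()(i,j) access
  x(0) = 1.0; x(1) = -1.0;

  MultMv(A, x, y, false, 1.0, 0.0);  // y = A  * x
  MultMv(A, y, z, true,  1.0, 0.0);  // z = A' * y   (transpose path, At = true)
  MultMv(A, x, y, false, 2.0, 1.0);  // y = 2*A*x + y (a scales the product, b scales the previous c)
}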
//============================================================
void ParSparseMatrix<double>::MultMv(const Vector<double>& v,
    DenseVector<double>& c) const
{
  int numProcs = MPI_Wrappers::size(_comm);
#ifdef DISABLE_PAR_HEURISTICS
  // Use much more lenient heuristics to exercise parallel code
  if (numProcs == 1 ||  _size < 300) {
#else  
  // These are simple heuristics to perform multiplication in serial if
  //   parallel will be slower. They were determined experimentally.
  if ( numProcs == 1 ||
      (_size < 50000  || _size > 10000000) ||
     ((_size < 150000 || _size > 5000000) && numProcs > 8) ||
     ((_size < 500000 || _size > 2500000) && numProcs > 16 ) ||
     (numProcs > 32)) {
#endif
    SparseMatrix<double>::MultMv(v, c);
    return;
  }
 

  SparseMatrix<double>::compress(*this);
  GCK(*this, v, this->nCols() != v.size(), "ParSparseMatrix * Vector");

  SparseMatrix<double> A_local;

  // Split the sparse matrix across processors. partition() expects a ParSparseMatrix, so we cast.
  partition(*static_cast<ParSparseMatrix<double>*>(&A_local));

  // Do the local multiplication; each processor ends up with a partial
  // result vector.
#ifdef TIMING_ON
  timespec before, after;
  clock_gettime(CLOCK_MONOTONIC, &before);
#endif
  DenseVector<double> c_local = A_local * v;
#ifdef TIMING_ON
  clock_gettime(CLOCK_MONOTONIC, &after);
  cout << "P" << MPI_Wrappers::rank(MPI_COMM_WORLD) << " " << time_diff(before,after) << " mat.vec time\n";
  barrier(MPI_COMM_WORLD);
#endif

  // release A_local's resources explicitly via finalize() before it goes out of scope
  static_cast<ParSparseMatrix<double>*>(&A_local)->finalize();

  // Sum the partial result vectors across processors so every rank holds the full product.
#ifdef TIMING_ON
  barrier(MPI_COMM_WORLD);
  clock_gettime(CLOCK_MONOTONIC, &before);
#endif
  allsum(_comm, c_local.ptr(), c.ptr(), c_local.size());
#ifdef TIMING_ON
  clock_gettime(CLOCK_MONOTONIC, &after);
  cout << "P" << MPI_Wrappers::rank(MPI_COMM_WORLD) << " " << time_diff(before,after) << " allsum time\n";
#endif
}

DenseVector<double> ParSparseMatrix<double>::transMat(
    const Vector<double>& v) const {
  SparseMatrix<double>::compress(*this);
  GCK(*this, v, this->nRows() != v.size(), "ParSparseMatrix transpose * Vector");

  DenseVector<double> c(nCols(), true);

  SparseMatrix<double> A_local;
  partition(*static_cast<ParSparseMatrix<double>*>(&A_local));

  // Do the local multiplication; each processor ends up with a partial
  // result vector.
  DenseVector<double> c_local = A_local.transMat(v);

  static_cast<ParSparseMatrix<double>*>(&A_local)->finalize();

  // Sum the partial result vectors across processors so every rank holds the full product.
  allsum(_comm, c_local.ptr(), c.ptr(), c_local.size());

  return c;
}
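
//============================================================
//* Usage sketch (illustration only): the intended collective call pattern for
//* the parallel multiplies above. The DenseVector (size) constructor and the
//* pre-sizing of the output vector (allsum() writes into c in place) are
//* assumptions; the other members used here (nRows, nCols, MultMv, transMat)
//* appear in the code above.
void exampleParSparseMatrixUsage(const ParSparseMatrix<double> &A)
{
  // Every rank participates; after MultMv each rank holds the complete y = A*x.
  DenseVector<double> x(A.nCols()), y(A.nRows());
  // ... fill x identically on every rank ...
  A.MultMv(x, y);

  // Transpose product z = A'*w; w must have nRows() entries.
  DenseVector<double> w(A.nRows());
  DenseVector<double> z = A.transMat(w);
}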