/////////////////////////////////////////////////////////////////////////////// //* performs a matrix-vector multiply with optional transposes BLAS version void MultMv(const Matrix<double> &A, const Vector<double> &v, DenseVector<double> &c, const bool At, double a, double b) { static char t[2] = {'N','T'}; char *ta=t+At; int sA[2] = {A.nRows(), A.nCols()}; // sizes of A int sV[2] = {v.size(), 1}; // sizes of v GCK(A, v, sA[!At]!=sV[0], "MultAB<double>: matrix-vector multiply"); if (c.size() != sA[At]) { c.resize(sA[At]); // set size of C to final size c.zero(); } // get pointers to the matrix sizes needed by BLAS int *M = sA+At; // # of rows in op[A] (op[A] = A' if At='T' else A) int *N = sV+1; // # of cols in op[B] int *K = sA+!At; // # of cols in op[A] or # of rows in op[B] double *pa=A.ptr(), *pv=v.ptr(), *pc=c.ptr(); #ifdef COL_STORAGE dgemm_(ta, t, M, N, K, &a, pa, sA, pv, sV, &b, pc, M); #else dgemm_(t, ta, N, M, K, &a, pv, sV+1, pa, sA+1, &b, pc, N); #endif }
//============================================================
//* Parallel matrix-vector multiply: c = (*this) * v.
//* Falls back to the serial SparseMatrix::MultMv when the size/process-count
//* heuristics below predict that parallel execution would be slower.
//* Otherwise: partition() splits the matrix among ranks into A_local
//* (semantics of the split are defined elsewhere — not visible here),
//* each rank multiplies its local piece, and allsum() combines the partial
//* result vectors — presumably an MPI allreduce wrapper over _comm; confirm
//* against MPI_Wrappers.
//* NOTE(review): c is assumed to be pre-sized to nRows() by the caller —
//* only the serial fallback path could resize it; verify callers.
void ParSparseMatrix<double>::MultMv(const Vector<double>& v,
    DenseVector<double>& c) const
{
  int numProcs = MPI_Wrappers::size(_comm);
#ifdef DISABLE_PAR_HEURISTICS
  // Use much more lenient heuristics to exercise parallel code
  if (numProcs == 1 || _size < 300) {
#else
  // These are simple heuristics to perform multiplication in serial if
  // parallel will be slower. They were determined experimentally.
  if ( numProcs == 1 ||
      (_size < 50000 || _size > 10000000) ||
      ((_size < 150000 || _size > 5000000) && numProcs > 8) ||
      ((_size < 500000 || _size > 2500000) && numProcs > 16 ) ||
      (numProcs > 32))
  {
#endif
    // serial path: delegate to the base-class implementation
    SparseMatrix<double>::MultMv(v, c);
    return;
  }

  // ensure the sparse storage is in compressed form before partitioning
  SparseMatrix<double>::compress(*this);
  // dimension check: cols of this must match length of v
  GCK(*this, v, this->nCols() != v.size(), "ParSparseMatrix * Vector")

  SparseMatrix<double> A_local;
  // Split the sparse matrix. partition() takes a ParSparMat, so we cast.
  partition(*static_cast<ParSparseMatrix<double>*>(&A_local));

  // actually do multiplication - end up with partial result vector
  // on each processor
#ifdef TIMING_ON
  timespec before, after;
  // barrier(MPI_COMM_WORLD);
  clock_gettime(CLOCK_MONOTONIC, &before);
#endif
  DenseVector<double> c_local = A_local * v;
#ifdef TIMING_ON
  // barrier(MPI_COMM_WORLD);
  clock_gettime(CLOCK_MONOTONIC, &after);
  cout << "P" << MPI_Wrappers::rank(MPI_COMM_WORLD) << " "
       << time_diff(before,after) << " mat.vec time\n";
  //LammpsInterface::instance()->all_print((after-before),"mat.vec time");
  barrier(MPI_COMM_WORLD);
#endif

  // destroy A_local intelligently
  // (finalize() presumably releases shared/partitioned storage before the
  //  SparseMatrix destructor runs — confirm against ParSparseMatrix)
  static_cast<ParSparseMatrix<double>*>(&A_local)->finalize();

  // Add all the result vectors together on each processor.
#ifdef TIMING_ON
  barrier(MPI_COMM_WORLD);
  //barrier(MPI_COMM_WORLD);
  clock_gettime(CLOCK_MONOTONIC, &before);
#endif
  allsum(_comm, c_local.ptr(), c.ptr(), c_local.size());
#ifdef TIMING_ON
  //barrier(MPI_COMM_WORLD);
  clock_gettime(CLOCK_MONOTONIC, &after);
  cout << "P" << MPI_Wrappers::rank(MPI_COMM_WORLD) << " "
       << time_diff(before,after) << " allsum time\n";
  //LammpsInterface::instance()->print_msg_once((after-before),"allsum time");
#endif
}

//* Parallel transpose multiply: returns c = (*this)' * v.
//* Same partition/local-multiply/allsum scheme as MultMv above, but with no
//* serial-fallback heuristics and the result allocated locally.
//* @param v vector of length nRows() (checked by GCK)
//* @return  dense vector of length nCols() holding the reduced result
DenseVector<double> ParSparseMatrix<double>::transMat(
    const Vector<double>& v) const
{
  // ensure compressed storage before partitioning
  SparseMatrix<double>::compress(*this);
  // dimension check: transpose multiply needs v.size() == nRows()
  GCK(*this, v, this->nRows() != v.size(), "ParSparseMatrix transpose * Vector")
  // result vector; second argument presumably requests zero-initialization —
  // confirm against DenseVector's constructor
  DenseVector<double> c(nCols(), true);

  SparseMatrix<double> A_local;
  // split the matrix among ranks (cast needed: partition() takes ParSparMat)
  partition(*static_cast<ParSparseMatrix<double>*>(&A_local));

  // actually do multiplication - end up with partial result vector
  // on each processor
  DenseVector<double> c_local = A_local.transMat(v);
  // release the partitioned local matrix before it goes out of scope
  static_cast<ParSparseMatrix<double>*>(&A_local)->finalize();

  // Add all the result vectors together on each processor.
  allsum(_comm, c_local.ptr(), c.ptr(), c_local.size());
  return c;
}