// Transpose-times-dense-matrix product evaluated in parallel.
//
// Each process multiplies the transpose of its own partition of this matrix
// by B, and the partial products are then summed over _comm with allsum.
//
// B       dense right-hand operand; GCK is handed the condition
//         nRows() != B.nRows() as the size-mismatch test.
// returns an nCols() x B.nCols() dense matrix holding the summed result.
DenseMatrix<double> ParSparseMatrix<double>::transMat(
    const DenseMatrix<double>& B) const {
  SparseMatrix<double>::compress(*this);
  GCK(*this, B, this->nRows() != B.nRows(), "ParSparseMatrix transpose * Matrix")

  // The local piece is stored as a plain SparseMatrix but is handed to
  // partition()/finalize() through a ParSparseMatrix pointer.
  SparseMatrix<double> localPart;
  ParSparseMatrix<double>* const localView =
      static_cast<ParSparseMatrix<double>*>(&localPart);
  partition(*localView);

  // Perform the multiplication on the local piece only; this yields a
  // partial result matrix on each processor.
  DenseMatrix<double> partialProduct = localPart.transMat(B);

  // NOTE(review): finalize() presumably releases or detaches whatever
  // partition() attached to localPart - confirm against its definition.
  localView->finalize();

  // Sum the partial results from all processors into the full-size output.
  DenseMatrix<double> total(nCols(), B.nCols(), true);
  allsum(_comm, partialProduct.ptr(), total.ptr(), partialProduct.size());

  return total;
}