template <typename T>
void DistributedVector<T>::localize (NumericVector<T>& v_local_in) const
{
  libmesh_assert (this->initialized());
  libmesh_assert_equal_to (_values.size(), _local_size);
  libmesh_assert_equal_to ((_last_local_index - _first_local_index), _local_size);

  DistributedVector<T>* v_local = libmesh_cast_ptr<DistributedVector<T>*>(&v_local_in);

  v_local->_first_local_index = 0;

  v_local->_global_size =
    v_local->_local_size =
    v_local->_last_local_index = size();

  v_local->_is_initialized =
    v_local->_is_closed = true;

  // Call localize on the vector's values.  This will help
  // prevent code duplication
  localize (v_local->_values);

#ifndef LIBMESH_HAVE_MPI
  libmesh_assert_equal_to (local_size(), size());
#endif
}
template <typename T>
void DistributedVector<T>::conjugate()
{
  for (numeric_index_type i=0; i<local_size(); i++)
    {
      // Replace values by complex conjugate
      _values[i] = libmesh_conj(_values[i]);
    }
}
template <typename T>
void DistributedVector<T>::abs()
{
  libmesh_assert (this->initialized());
  libmesh_assert_equal_to ((_last_local_index - _first_local_index), _local_size);

  for (std::size_t i=0; i<local_size(); i++)
    this->set(i,std::abs(_values[i]));
}
template <typename T>
void DistributedVector<T>::scale (const T factor)
{
  libmesh_assert (this->initialized());
  libmesh_assert_equal_to (_values.size(), _local_size);
  libmesh_assert_equal_to ((_last_local_index - _first_local_index), _local_size);

  for (std::size_t i=0; i<local_size(); i++)
    _values[i] *= factor;
}
template <typename T>
void DistributedVector<T>::add (const T v)
{
  libmesh_assert (this->initialized());
  libmesh_assert_equal_to (_values.size(), _local_size);
  libmesh_assert_equal_to ((_last_local_index - _first_local_index), _local_size);

  for (numeric_index_type i=0; i<local_size(); i++)
    _values[i] += v;
}
template <typename T>
void DistributedVector<T>::reciprocal()
{
  for (numeric_index_type i=0; i<local_size(); i++)
    {
      // Don't divide by zero
      libmesh_assert_not_equal_to (_values[i], T(0));

      _values[i] = 1. / _values[i];
    }
}
template <typename T>
NumericVector<T>& DistributedVector<T>::operator = (const T s)
{
  libmesh_assert (this->initialized());
  libmesh_assert_equal_to (_values.size(), _local_size);
  libmesh_assert_equal_to ((_last_local_index - _first_local_index), _local_size);

  for (std::size_t i=0; i<local_size(); i++)
    _values[i] = s;

  return *this;
}
std::vector<Node*>::const_iterator AbstractLayer::local_end(int_t depth) const
{
  if (depth >= depth_)
    throw BadProperty("Selected depth out of range");

  index min_nodes_per_layer = local_size()/depth_;
  index last_gid_at_depth = gids_[(depth+1)*(global_size()/depth_)-1];

  std::vector<Node*>::const_iterator iter = local_begin();
  for (iter += (depth+1)*min_nodes_per_layer; iter != local_end(); ++iter)
    {
      if ((*iter)->get_gid() > last_gid_at_depth)
        break;
    }

  return iter;
}
template <>
void PetscVector<Real>::localize_to_one (std::vector<Real>& v_local,
                                         const processor_id_type pid) const
{
  this->_restore_array();

  PetscErrorCode ierr=0;
  const PetscInt n  = size();
  const PetscInt nl = local_size();
  PetscScalar *values;

  v_local.resize(n);

  // only one processor
  if (n == nl)
    {
      ierr = VecGetArray (_vec, &values);
      CHKERRABORT(libMesh::COMM_WORLD,ierr);

      for (PetscInt i=0; i<n; i++)
        v_local[i] = static_cast<Real>(values[i]);

      ierr = VecRestoreArray (_vec, &values);
      CHKERRABORT(libMesh::COMM_WORLD,ierr);
    }

  // otherwise multiple processors
  else
    {
      numeric_index_type ioff = this->first_local_index();
      std::vector<Real> local_values (n, 0.);

      {
        ierr = VecGetArray (_vec, &values);
        CHKERRABORT(libMesh::COMM_WORLD,ierr);

        for (PetscInt i=0; i<nl; i++)
          local_values[i+ioff] = static_cast<Real>(values[i]);

        ierr = VecRestoreArray (_vec, &values);
        CHKERRABORT(libMesh::COMM_WORLD,ierr);
      }

      MPI_Reduce (&local_values[0], &v_local[0], n, MPI_REAL, MPI_SUM,
                  pid, libMesh::COMM_WORLD);
    }
}
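// ------------------------------------------------------------------
// The multi-processor branch above gathers the whole vector on
// processor `pid` by zero-padding each local slice to global length
// and element-wise summing with MPI_Reduce, instead of MPI_Gatherv.
// A minimal standalone sketch of the same pattern in plain MPI
// (independent of libMesh; the names and the contiguous-slice layout
// are illustrative assumptions):

#include <mpi.h>
#include <vector>

std::vector<double> localize_to_one_sketch(const std::vector<double>& local,
                                           int offset, int n_global,
                                           int root, MPI_Comm comm)
{
    // Zero-padded send buffer: only our slice is nonzero, so the
    // element-wise MPI_SUM reconstructs the full vector on `root`.
    std::vector<double> send(n_global, 0.0), recv(n_global, 0.0);
    for (std::size_t i = 0; i < local.size(); ++i)
        send[offset + i] = local[i];

    MPI_Reduce(send.data(), recv.data(), n_global,
               MPI_DOUBLE, MPI_SUM, root, comm);
    return recv;   // meaningful only on rank `root`
}

// Note the trade-off: this costs O(n_global) buffer space per rank;
// the Complex specialization further below applies the same trick
// twice, once for the real parts and once for the imaginary parts.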
template <typename T>
void DistributedVector<T>::localize (std::vector<T>& v_local) const
{
  // This function must be run on all processors at once
  parallel_object_only();

  libmesh_assert (this->initialized());
  libmesh_assert_equal_to (_values.size(), _local_size);
  libmesh_assert_equal_to ((_last_local_index - _first_local_index), _local_size);

  v_local = this->_values;

  this->comm().allgather (v_local);

#ifndef LIBMESH_HAVE_MPI
  libmesh_assert_equal_to (local_size(), size());
#endif
}
template <typename T>
Real DistributedVector<T>::l2_norm () const
{
  // This function must be run on all processors at once
  parallel_object_only();

  libmesh_assert (this->initialized());
  libmesh_assert_equal_to (_values.size(), _local_size);
  libmesh_assert_equal_to ((_last_local_index - _first_local_index), _local_size);

  double local_l2 = 0.;

  for (numeric_index_type i=0; i<local_size(); i++)
    local_l2 += TensorTools::norm_sq(_values[i]);

  this->comm().sum(local_l2);

  return std::sqrt(local_l2);
}
template <typename T>
T DistributedVector<T>::sum () const
{
  // This function must be run on all processors at once
  parallel_object_only();

  libmesh_assert (this->initialized());
  libmesh_assert_equal_to (_values.size(), _local_size);
  libmesh_assert_equal_to ((_last_local_index - _first_local_index), _local_size);

  T local_sum = 0.;

  for (numeric_index_type i=0; i<local_size(); i++)
    local_sum += _values[i];

  this->comm().sum(local_sum);

  return local_sum;
}
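// ------------------------------------------------------------------
// The reductions above (sum, and likewise l2_norm and linfty_norm)
// are collective: each processor accumulates over its own slice and
// then calls comm().sum()/max().  A hypothetical usage sketch --
// assuming the (communicator, global size) constructor and a valid
// Parallel::Communicator named comm; the sizes are illustrative:

void distributed_vector_usage_sketch(const Parallel::Communicator& comm)
{
  DistributedVector<Number> v(comm, /*n=*/100);

  // Each processor writes only the entries it owns.
  for (numeric_index_type i = v.first_local_index();
       i < v.last_local_index(); ++i)
    v.set(i, 1.);

  v.close();

  const Number s  = v.sum();      // == 100 on every processor
  const Real   l2 = v.l2_norm();  // == std::sqrt(100.) == 10
}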
/*
 * MPID_Pack_size
 *
 * NOTE: MPID_Msg_pack_t msgact ignored for reasons stated at top of file
 *
 * NOTE: there's no way for me to report an error condition.
 *       where's the *error_code arg?
 *       in case of an error, I will pass back 0 and print an error
 *       message to stderr.
 */
void MPID_Pack_size(int count,
                    struct MPIR_DATATYPE *datatype,
                    MPID_Msg_pack_t msgact, /* ignored */
                    int *size)
{
    int tmp_size;

    tmp_size = local_size(count, datatype);
    if (tmp_size < 0)
    {
        globus_libc_fprintf(stderr,
            "ERROR: MPID_Pack_size could not calculate pack size, returning 0\n");
        *size = 0;
        return; /* bug fix: without this return, *size was overwritten below */
    } /* endif */

    *size = tmp_size + sizeof(unsigned char);
}
template <typename T>
NumericVector<T>& DistributedVector<T>::operator = (const std::vector<T>& v)
{
  libmesh_assert (this->initialized());
  libmesh_assert_equal_to (_values.size(), _local_size);
  libmesh_assert_equal_to ((_last_local_index - _first_local_index), _local_size);

  if (v.size() == local_size())
    _values = v;

  else if (v.size() == size())
    for (std::size_t i=first_local_index(); i<last_local_index(); i++)
      _values[i-first_local_index()] = v[i];

  else
    libmesh_error_msg("Incompatible sizes in DistributedVector::operator=");

  return *this;
}
template <typename T>
Real DistributedVector<T>::linfty_norm () const
{
  // This function must be run on all processors at once
  parallel_object_only();

  libmesh_assert (this->initialized());
  libmesh_assert_equal_to (_values.size(), _local_size);
  libmesh_assert_equal_to ((_last_local_index - _first_local_index), _local_size);

  Real local_linfty = 0.;

  for (numeric_index_type i=0; i<local_size(); i++)
    local_linfty = std::max(local_linfty,
                            static_cast<Real>(std::abs(_values[i]))
                            ); // Note we static_cast so that both
                               // types are the same, as required
                               // by std::max

  this->comm().max(local_linfty);

  return local_linfty;
}
template <>
void PetscVector<Complex>::localize_to_one (std::vector<Complex>& v_local,
                                            const processor_id_type pid) const
{
  this->_restore_array();

  PetscErrorCode ierr=0;
  const PetscInt n  = size();
  const PetscInt nl = local_size();
  PetscScalar *values;

  v_local.resize(n);

  for (PetscInt i=0; i<n; i++)
    v_local[i] = 0.;

  // only one processor
  if (n == nl)
    {
      ierr = VecGetArray (_vec, &values);
      CHKERRABORT(libMesh::COMM_WORLD,ierr);

      for (PetscInt i=0; i<n; i++)
        v_local[i] = static_cast<Complex>(values[i]);

      ierr = VecRestoreArray (_vec, &values);
      CHKERRABORT(libMesh::COMM_WORLD,ierr);
    }

  // otherwise multiple processors
  else
    {
      numeric_index_type ioff = this->first_local_index();

      /* in here the local values are stored, acting as send buffer for MPI
       * initialize to zero, since we collect using MPI_SUM
       */
      std::vector<Real> real_local_values(n, 0.);
      std::vector<Real> imag_local_values(n, 0.);

      {
        ierr = VecGetArray (_vec, &values);
        CHKERRABORT(libMesh::COMM_WORLD,ierr);

        // provide my local share to the real and imag buffers
        for (PetscInt i=0; i<nl; i++)
          {
            real_local_values[i+ioff] = static_cast<Complex>(values[i]).real();
            imag_local_values[i+ioff] = static_cast<Complex>(values[i]).imag();
          }

        ierr = VecRestoreArray (_vec, &values);
        CHKERRABORT(libMesh::COMM_WORLD,ierr);
      }

      /* have buffers of the real and imaginary part of v_local.
       * Once MPI_Reduce() has collected all the real and imaginary
       * parts in these std::vector<Real>, the values can be
       * copied to v_local
       */
      std::vector<Real> real_v_local(n);
      std::vector<Real> imag_v_local(n);

      // collect entries from other procs in real_v_local, imag_v_local
      MPI_Reduce (&real_local_values[0], &real_v_local[0], n,
                  MPI_REAL, MPI_SUM, pid, libMesh::COMM_WORLD);
      MPI_Reduce (&imag_local_values[0], &imag_v_local[0], n,
                  MPI_REAL, MPI_SUM, pid, libMesh::COMM_WORLD);

      // copy real_v_local and imag_v_local to v_local
      for (PetscInt i=0; i<n; i++)
        v_local[i] = Complex(real_v_local[i], imag_v_local[i]);
    }
}
/*
 * local_size
 *
 * returns -1 when there's a problem
 *
 * NOTE: there is one more datatype found in datatype.h ... MPIR_FORT_INT
 *       it has been explained to me by bill that we do not have to
 *       support an explicit case for that type because it is a
 *       synonym for one of the other types we already have a case
 *       statement for (which type it is a synonym for is architecture
 *       dependent and determined during mpich configuration).
 */
int local_size(int count, struct MPIR_DATATYPE *datatype)
{
    int rc;

    if (count < 0)
    {
        globus_libc_fprintf(stderr,
            "ERROR: local_size: passed count %d .... must be >= 0\n", count);
        return -1;
    } /* endif */

    switch (datatype->dte_type)
    {
      case MPIR_CHAR:   rc = globus_dc_sizeof_char(count);    break;
      case MPIR_UCHAR:  rc = globus_dc_sizeof_u_char(count);  break;
      /* MPIR_PACKED are always raw bytes and are never converted */
      case MPIR_PACKED: rc = count;                           break;
      case MPIR_BYTE:   rc = count;                           break;
      case MPIR_SHORT:  rc = globus_dc_sizeof_short(count);   break;
      case MPIR_USHORT: rc = globus_dc_sizeof_u_short(count); break;
      case MPIR_LOGICAL: /* 'logical' in FORTRAN is always same as 'int' */
      case MPIR_INT:    rc = globus_dc_sizeof_int(count);     break;
      case MPIR_UINT:   rc = globus_dc_sizeof_u_int(count);   break;
      case MPIR_LONG:   rc = globus_dc_sizeof_long(count);    break;
      case MPIR_LONGLONGINT: rc = globus_dc_sizeof_long_long(count); break;
      case MPIR_ULONG:  rc = globus_dc_sizeof_u_long(count);  break;
      case MPIR_FLOAT:  rc = globus_dc_sizeof_float(count);   break;
      case MPIR_DOUBLE: rc = globus_dc_sizeof_double(count);  break;
      case MPIR_LONGDOUBLE: /* not supported by Globus */
        rc = 0;
        break;
      case MPIR_UB:
      case MPIR_LB:
        rc = 0;
        break;
      case MPIR_COMPLEX:        rc = globus_dc_sizeof_float(2*count);  break;
      case MPIR_DOUBLE_COMPLEX: rc = globus_dc_sizeof_double(2*count); break;
      case MPIR_CONTIG:
        rc = local_size(count*datatype->count, datatype->old_type);
        break;
      case MPIR_VECTOR:
      case MPIR_HVECTOR:
        {
            int tmp = local_size(datatype->blocklen, datatype->old_type);
            rc = (tmp == -1 ? -1 : tmp*count*datatype->count);
        }
        break;
      case MPIR_INDEXED:
      case MPIR_HINDEXED:
        {
            int i, tmp, tmp2;
            for (rc = tmp = tmp2 = i = 0; tmp2 != -1 && i < datatype->count; i++)
            {
                tmp2 = local_size(datatype->blocklens[i], datatype->old_type);
                if (tmp2 == -1)
                    tmp = -1;
                else
                    tmp += tmp2;
            } /* endfor */
            if (tmp != -1)
                rc = tmp*count;
            else
                rc = -1;
        }
        break;
      case MPIR_STRUCT:
        {
            int i, tmp, tmp2;
            for (rc = tmp = tmp2 = i = 0; tmp2 != -1 && i < datatype->count; i++)
            {
                tmp2 = local_size(datatype->blocklens[i], datatype->old_types[i]);
                if (tmp2 == -1)
                    tmp = -1;
                else
                    tmp += tmp2;
            } /* endfor */
            if (tmp != -1)
                rc = tmp*count;
            else
                rc = -1;
        }
        break;
      default:
        globus_libc_fprintf(stderr,
            "ERROR: local_size: encountered unrecognizable MPIR type %d\n",
            datatype->dte_type);
        rc = -1;
        break;
    } /* end switch */

    return rc;
} /* end local_size() */
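/* For illustration, the recursive cases above compose like this.  This is a
 * schematic sketch only: it fills in just the MPIR_DATATYPE fields that
 * local_size() actually reads (dte_type, count, old_type); real code would
 * build these datatypes through the MPI type-constructor routines. */
void local_size_sketch(void)
{
    struct MPIR_DATATYPE int_dte  = {};   /* schematic zero-init */
    struct MPIR_DATATYPE vec3_dte = {};

    int_dte.dte_type  = MPIR_INT;

    vec3_dte.dte_type = MPIR_CONTIG;      /* 3 contiguous ints */
    vec3_dte.count    = 3;
    vec3_dte.old_type = &int_dte;

    /* MPIR_CONTIG recurses: local_size(4*3, &int_dte),
     * i.e. globus_dc_sizeof_int(12) */
    int nbytes = local_size(4, &vec3_dte);
    (void) nbytes;
}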
/// The computed result is stored in the matrix a
/// n_global: the order of the matrix
static void inv_driver(blas_idx_t n_global)
{
    auto grid = std::make_shared<blacs_grid_t>();

    // Create a NxN random matrix A
    auto a = block_cyclic_mat_t::random(grid, n_global, n_global);

    // Create a NxN matrix to hold A^{-1}
    auto ai = block_cyclic_mat_t::constant(grid, n_global, n_global);

    // Copy A to A^{-1} since it will be overwritten during factorization
    std::copy_n(a->local_data(), a->local_size(), ai->local_data());

    MPI_Barrier(MPI_COMM_WORLD);
    double t0 = MPI_Wtime();

    // Factorize A.
    // PDGETRF: D = double precision, GE = general matrix,
    // TRF = triangular factorization, i.e. the usual LU decomposition.
    blas_idx_t ia = 1, ja = 1;
    std::vector<blas_idx_t> ipiv(a->local_rows() + a->row_block_size() + 100);
    blas_idx_t info;

    pdgetrf_(n_global, n_global, ai->local_data(), ia, ja, ai->descriptor(),
             ipiv.data(), info);
    assert(info == 0);
    double t_factor = MPI_Wtime() - t0;

    // Compute A^{-1} based on the LU factorization

    // Compute workspace for double and integer work arrays on each process
    blas_idx_t lwork  = 10;
    blas_idx_t liwork = 10;
    std::vector<double>     work (lwork);
    std::vector<blas_idx_t> iwork(liwork);

    // Workspace query: lwork = liwork = -1 asks PDGETRI to return the
    // required sizes in work[0] and iwork[0]
    lwork = liwork = -1;
    pdgetri_(n_global, ai->local_data(), ia, ja, ai->descriptor(),
             ipiv.data(), work.data(), lwork, iwork.data(), liwork, info);
    assert(info == 0);

    lwork  = static_cast<blas_idx_t>(work[0]);
    liwork = static_cast<blas_idx_t>(iwork[0]);
    work.resize(lwork);
    iwork.resize(liwork);

    // Now compute the inverse
    t0 = MPI_Wtime();
    pdgetri_(n_global, ai->local_data(), ia, ja, ai->descriptor(),
             ipiv.data(), work.data(), lwork, iwork.data(), liwork, info);
    assert(info == 0);
    double t_solve = MPI_Wtime() - t0;

    // Verify that the inverse is correct using A*A^{-1} = I
    auto identity = block_cyclic_mat_t::diagonal(grid, n_global, n_global);

    // Compute I = A * A^{-1} - I and verify that ||I|| is small
    char nein = 'N';
    double alpha = 1.0, beta = -1.0;
    pdgemm_(nein, nein, n_global, n_global, n_global, alpha,
            a->local_data(),  ia, ja, a->descriptor(),
            ai->local_data(), ia, ja, ai->descriptor(), beta,
            identity->local_data(), ia, ja, identity->descriptor());

    // Compute the 1-norm of the result
    char norm = '1';
    work.resize(identity->local_cols());
    double err = pdlange_(norm, n_global, n_global,
                          identity->local_data(), ia, ja,
                          identity->descriptor(), work.data());

    double t_total = t_factor + t_solve;
    double t_glob;
    MPI_Reduce(&t_total, &t_glob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (grid->iam() == 0)
    {
        double gflops = getri_flops(n_global)/t_glob/grid->nprocs();
        printf("\n"
               "MATRIX INVERSE BENCHMARK SUMMARY\n"
               "================================\n"
               "N = %d\tNP = %d\tNP_ROW = %d\tNP_COL = %d\n"
               "Time for PxGETRF + PxGETRI = %10.7f seconds\tGflops/Proc = %10.7f, Error = %f\n",
               n_global, grid->nprocs(), grid->nprows(), grid->npcols(),
               t_glob, gflops, err);
        fflush(stdout);
    }
}
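// ------------------------------------------------------------------
// The two pdgetri_ calls above use the standard (Sca)LAPACK workspace
// query: a first call with lwork = liwork = -1 only reports the needed
// sizes in work[0]/iwork[0]; the second call does the actual work.
// The same idiom with serial LAPACK dgetri_, as a hedged sketch (the
// extern "C" prototype follows the conventional Fortran interface):

#include <cassert>
#include <vector>

extern "C" void dgetri_(const int* n, double* a, const int* lda,
                        const int* ipiv, double* work,
                        const int* lwork, int* info);

// Invert a column-major n x n matrix in place, given its LU
// factorization and pivots from dgetrf_.
void invert_from_lu(int n, double* a, const int* ipiv)
{
    int info  = 0;
    int lwork = -1;            // -1 == workspace query
    double wkopt = 0.0;
    dgetri_(&n, a, &n, ipiv, &wkopt, &lwork, &info);
    assert(info == 0);

    lwork = static_cast<int>(wkopt);   // optimal size from work[0]
    std::vector<double> work(lwork);
    dgetri_(&n, a, &n, ipiv, work.data(), &lwork, &info);
    assert(info == 0);
}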