STKUNIT_UNIT_TEST(UnitTestLinsysFunctions, test1)
{
  static const size_t spatial_dimension = 3;

  MPI_Barrier( MPI_COMM_WORLD );
  MPI_Comm comm = MPI_COMM_WORLD;

  //First create and fill MetaData and BulkData objects:

  const unsigned bucket_size = 100; //for a real application mesh, bucket_size would be much bigger...

  stk::mesh::fem::FEMMetaData fem_meta;
  stk::mesh::fem::FEMMetaData fem_meta2;
  fem_meta.FEM_initialize(spatial_dimension);
  fem_meta2.FEM_initialize(spatial_dimension);

  stk::mesh::MetaData & meta_data  = stk::mesh::fem::FEMMetaData::get_meta_data(fem_meta);
  stk::mesh::MetaData & meta_data2 = stk::mesh::fem::FEMMetaData::get_meta_data(fem_meta2);

  const stk::mesh::EntityRank element_rank = fem_meta.element_rank();

  stk::mesh::BulkData bulk_data( meta_data, comm, bucket_size );
  stk::mesh::BulkData bulk_data2( meta_data2, comm, bucket_size );

  //create a boundary-condition part for testing later:
  stk::mesh::Part& bcpart = fem_meta.declare_part("bcpart");

  fill_utest_mesh_meta_data( fem_meta );

  bool use_temperature = false;
  fill_utest_mesh_meta_data( fem_meta2, use_temperature );

  fill_utest_mesh_bulk_data( bulk_data );
  fill_utest_mesh_bulk_data( bulk_data2 );

  //set owner-processors to lowest-sharing (stk::mesh defaults to
  //highest-sharing). If highest-sharing owns, then it isn't correct for the
  //way the fei library sets ownership of shared nodes for vectors etc.
  stk::mesh::set_owners<stk::mesh::LowestRankSharingProcOwns>( bulk_data );

  //put a node in our boundary-condition part. arbitrarily choose the
  //first locally-owned node:

  bulk_data.modification_begin();

  std::vector<stk::mesh::Entity*> local_nodes;
  stk::mesh::Selector select_owned(meta_data.locally_owned_part());

  stk::mesh::get_selected_entities(select_owned,
                                   bulk_data.buckets(NODE_RANK),
                                   local_nodes);

  stk::mesh::EntityId bc_node_id = 0;

  if (local_nodes.size() > 0) {
    stk::mesh::PartVector partvector;
    partvector.push_back(&bcpart);
    bulk_data.change_entity_parts(*local_nodes[0], partvector);
    bc_node_id = stk::linsys::impl::entityid_to_int(local_nodes[0]->identifier());
  }

  bulk_data.modification_end();

  stk::mesh::Selector selector = ( meta_data.locally_owned_part() | meta_data.globally_shared_part() )
                                 & *meta_data.get_part("block_1");
  std::vector<unsigned> count;
  stk::mesh::count_entities(selector, bulk_data, count);

  STKUNIT_ASSERT_EQUAL( count[element_rank], (unsigned)4 );
  STKUNIT_ASSERT_EQUAL( count[NODE_RANK],    (unsigned)20 );

  ScalarField* temperature_field = meta_data.get_field<ScalarField>("temperature");

  //Create a fei Factory and stk::linsys::LinearSystem object:

  fei::SharedPtr<fei::Factory> factory(new Factory_Trilinos(comm));

  stk::linsys::LinearSystem ls(comm, factory);

  stk::linsys::add_connectivities(ls, element_rank, NODE_RANK,
                                  *temperature_field, selector, bulk_data);

  fei::SharedPtr<fei::MatrixGraph> matgraph = ls.get_fei_MatrixGraph();
  int num_blocks = matgraph->getNumConnectivityBlocks();

  STKUNIT_ASSERT_EQUAL( num_blocks, (int)1 );

  ls.synchronize_mappings_and_structure();
  ls.create_fei_LinearSystem();

  //put 0 throughout the matrix and 3 throughout the rhs:
  fei::SharedPtr<fei::Matrix> mat = ls.get_fei_LinearSystem()->getMatrix();
  ls.get_fei_LinearSystem()->getMatrix()->putScalar(0);
  ls.get_fei_LinearSystem()->getRHS()->putScalar(3.0);

  //put 10 on the matrix diagonal to ensure it will be easy to solve later.
  fei::SharedPtr<fei::VectorSpace> vspace = ls.get_fei_LinearSystem()->getRHS()->getVectorSpace();
  int numLocalRows = vspace->getNumIndices_Owned();
  std::vector<int> local_rows(numLocalRows);
  vspace->getIndices_Owned(numLocalRows, &local_rows[0], numLocalRows);

  for(size_t i=0; i<local_rows.size(); ++i) {
    int col = local_rows[i];
    double coef = 10;
    double* coefPtr = &coef;
    mat->sumIn(1, &local_rows[i], 1, &col, &coefPtr);
  }

  //now we'll impose a dirichlet bc on our one-node bcpart:
  stk::linsys::dirichlet_bc(ls, bulk_data, bcpart, NODE_RANK,
                            *temperature_field, 0, 9.0);

  ls.finalize_assembly();

  //now confirm that the rhs value for the equation corresponding to our
  //bc node is 9.0:

  fei::SharedPtr<fei::Vector> rhsvec = ls.get_fei_LinearSystem()->getRHS();
  double rhs_bc_val = 0;

  int bc_eqn_index = ls.get_DofMapper().get_global_index(NODE_RANK,
                                                         bc_node_id,
                                                         *temperature_field);

  rhsvec->copyOut(1, &bc_eqn_index, &rhs_bc_val);

  bool bc_val_is_correct = std::abs(rhs_bc_val - 9.0) < 1.e-13;
  STKUNIT_ASSERT( bc_val_is_correct );

  stk::linsys::copy_vector_to_mesh( *rhsvec, ls.get_DofMapper(), bulk_data);

  stk::mesh::Entity* bc_node = bulk_data.get_entity(NODE_RANK, local_nodes[0]->identifier());

  stk::mesh::FieldTraits<ScalarField>::data_type* bc_node_data =
      stk::mesh::field_data(*temperature_field, *bc_node);

  bool bc_node_data_is_correct = std::abs(bc_node_data[0] - 9.0) < 1.e-13;
  STKUNIT_ASSERT( bc_node_data_is_correct );

  //now make sure we get a throw if we use the wrong bulk-data (that doesn't have the
  //temperature field defined)
  STKUNIT_ASSERT_THROW(stk::linsys::copy_vector_to_mesh( *rhsvec, ls.get_DofMapper(), bulk_data2),
                       std::runtime_error);

  //obtain and zero the solution vector
  fei::SharedPtr<fei::Vector> solnvec = ls.get_fei_LinearSystem()->getSolutionVector();
  solnvec->putScalar(0);

  //copy the vector of zeros into the mesh:
  stk::linsys::copy_vector_to_mesh( *solnvec, ls.get_DofMapper(), bulk_data);

  //assert that our bc node's data is now zero.
  bc_node_data_is_correct = std::abs(bc_node_data[0] - 0) < 1.e-13;
  STKUNIT_ASSERT( bc_node_data_is_correct );

  //call the linear-system solve function.
  //(note that when we add options to the solve method, we'll need to enhance this
  //testing to exercise various specific solves.)
  Teuchos::ParameterList params;

  int status = 0;
  ls.solve(status, params);

  //copy the solution-vector into the mesh:
  stk::linsys::copy_vector_to_mesh( *solnvec, ls.get_DofMapper(), bulk_data);

  //now assert that the value 9 (bc value) produced by the solve is in this
  //node's data.
  //note that we use a loose tolerance, because the default solver tolerance
  //is (I think) only 1.e-6.
  bc_node_data_is_correct = std::abs(bc_node_data[0] - 9.0) < 1.e-6;
  STKUNIT_ASSERT( bc_node_data_is_correct );
}
/// Computes the inverse of a random N x N matrix; the computed inverse is stored in the matrix ai.
/// n_global: the order of the matrix
static void inv_driver(blas_idx_t n_global)
{
    auto grid = std::make_shared<blacs_grid_t>();

    // Create a NxN random matrix A
    auto a = block_cyclic_mat_t::random(grid, n_global, n_global);

    // Create a NxN matrix to hold A^{-1}
    auto ai = block_cyclic_mat_t::constant(grid, n_global, n_global);

    // Copy A to A^{-1} since it will be overwritten during factorization
    std::copy_n(a->local_data(), a->local_size(), ai->local_data());

    MPI_Barrier(MPI_COMM_WORLD);

    double t0 = MPI_Wtime();

    // Factorize A
    blas_idx_t ia = 1, ja = 1;
    std::vector<blas_idx_t> ipiv(a->local_rows() + a->row_block_size() + 100);
    blas_idx_t info;

    // The routine name decodes as D-GE-TRF:
    //   D   - the matrix is double precision
    //   GE  - the matrix is a general (dense) matrix
    //   TRF - triangular factorization, i.e. the usual LU decomposition
    pdgetrf_(n_global, n_global, ai->local_data(), ia, ja, ai->descriptor(),
             ipiv.data(), info);
    assert(info == 0);
    double t_factor = MPI_Wtime() - t0;

    // Compute A^{-1} based on the LU factorization

    // Compute workspace for double and integer work arrays on each process
    blas_idx_t lwork  = 10;
    blas_idx_t liwork = 10;
    std::vector<double>     work(lwork);
    std::vector<blas_idx_t> iwork(liwork);

    // Workspace query: with lwork = liwork = -1, the call only computes the
    // required lwork and liwork values and returns them in work[0] and iwork[0].
    lwork = liwork = -1;
    pdgetri_(n_global, ai->local_data(), ia, ja, ai->descriptor(),
             ipiv.data(), work.data(), lwork, iwork.data(), liwork, info);
    assert(info == 0);

    lwork  = static_cast<blas_idx_t>(work[0]);
    liwork = static_cast<blas_idx_t>(iwork[0]);
    work.resize(lwork);
    iwork.resize(liwork);

    // Now compute the inverse
    t0 = MPI_Wtime();
    pdgetri_(n_global, ai->local_data(), ia, ja, ai->descriptor(),
             ipiv.data(), work.data(), lwork, iwork.data(), liwork, info);
    assert(info == 0);
    double t_solve = MPI_Wtime() - t0;

    // Verify that the inverse is correct using A*A^{-1} = I
    auto identity = block_cyclic_mat_t::diagonal(grid, n_global, n_global);

    // Compute I = A * A^{-1} - I and verify that the ||I|| is small
    char nein = 'N';
    double alpha = 1.0, beta = -1.0;
    pdgemm_(nein, nein, n_global, n_global, n_global, alpha,
            a->local_data(),  ia, ja, a->descriptor(),
            ai->local_data(), ia, ja, ai->descriptor(), beta,
            identity->local_data(), ia, ja, identity->descriptor());

    // Compute 1-norm of the result
    char norm = '1';
    work.resize(identity->local_cols());
    double err = pdlange_(norm, n_global, n_global, identity->local_data(),
                          ia, ja, identity->descriptor(), work.data());

    double t_total = t_factor + t_solve;
    double t_glob;
    MPI_Reduce(&t_total, &t_glob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (grid->iam() == 0) {
        double gflops = getri_flops(n_global)/t_glob/grid->nprocs();
        printf("\n"
               "MATRIX INVERSE BENCHMARK SUMMARY\n"
               "================================\n"
               "N = %d\tNP = %d\tNP_ROW = %d\tNP_COL = %d\n"
               "Time for PxGETRF + PxGETRI = %10.7f seconds\tGflops/Proc = %10.7f, Error = %f\n",
               n_global, grid->nprocs(), grid->nprows(), grid->npcols(),
               t_glob, gflops, err);
        fflush(stdout);
    }
}
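// The sketch below is a minimal, hypothetical entry point showing how inv_driver
// might be invoked; it is not part of the original source. It assumes the program
// is launched under mpirun/mpiexec, that the matrix order is passed as the first
// command-line argument (the default of 1000 is arbitrary), and that the
// blacs_grid_t constructor used inside inv_driver performs all BLACS grid setup,
// so main() only needs to initialize and finalize MPI around the call.
#include <cstdlib>
#include <mpi.h>

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);

    // Matrix order: taken from argv[1] if given, otherwise an arbitrary default.
    blas_idx_t n_global = (argc > 1) ? static_cast<blas_idx_t>(std::atol(argv[1]))
                                     : 1000;

    inv_driver(n_global);

    MPI_Finalize();
    return 0;
}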