template <typename BasisFunctionType, typename KernelType, typename ResultType>
std::pair<
    shared_ptr<typename HypersingularIntegralOperator<
        BasisFunctionType, KernelType, ResultType>::LocalAssembler>,
    shared_ptr<typename HypersingularIntegralOperator<
        BasisFunctionType, KernelType, ResultType>::LocalAssembler> >
HypersingularIntegralOperator<BasisFunctionType, KernelType, ResultType>::
makeAssemblers(const QuadratureStrategy& quadStrategy,
               const AssemblyOptions& options) const
{
    typedef Fiber::RawGridGeometry<CoordinateType> RawGridGeometry;
    typedef std::vector<const Fiber::Shapeset<BasisFunctionType>*>
        ShapesetPtrVector;

    const bool verbose = (options.verbosityLevel() >= VerbosityLevel::DEFAULT);

    shared_ptr<RawGridGeometry> testRawGeometry, trialRawGeometry;
    shared_ptr<GeometryFactory> testGeometryFactory, trialGeometryFactory;
    shared_ptr<Fiber::OpenClHandler> openClHandler;
    shared_ptr<ShapesetPtrVector> testShapesets, trialShapesets;
    bool cacheSingularIntegrals;

    if (verbose)
        std::cout << "Collecting data for assembler construction..." << std::endl;
    this->collectDataForAssemblerConstruction(options,
                                              testRawGeometry, trialRawGeometry,
                                              testGeometryFactory, trialGeometryFactory,
                                              testShapesets, trialShapesets,
                                              openClHandler, cacheSingularIntegrals);
    if (verbose)
        std::cout << "Data collection finished." << std::endl;

    // A separate off-diagonal assembler is needed only for ACA in hybrid mode
    bool makeSeparateOffDiagonalAssembler =
        options.assemblyMode() == AssemblyOptions::ACA &&
        options.acaOptions().mode == AcaOptions::HYBRID_ASSEMBLY;

    return reallyMakeAssemblers(quadStrategy,
                                testGeometryFactory, trialGeometryFactory,
                                testRawGeometry, trialRawGeometry,
                                testShapesets, trialShapesets,
                                openClHandler,
                                options.parallelizationOptions(),
                                options.verbosityLevel(),
                                cacheSingularIntegrals,
                                makeSeparateOffDiagonalAssembler);
}
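// -----------------------------------------------------------------------------
// Illustrative aside (not part of the library): makeAssemblers() returns a pair
// whose second element -- presumably the off-diagonal assembler -- is populated
// only when hybrid ACA assembly is in effect, so callers must treat it as
// possibly null. A minimal, self-contained sketch of that calling convention;
// the Assembler type and makeAssemblers() below are stand-ins, not BEM++ types.
// Compile separately; this is not meant to live in this translation unit.
// -----------------------------------------------------------------------------
#include <iostream>
#include <memory>
#include <utility>

// Stand-in for the operator's LocalAssembler type.
struct Assembler { const char* name; };

// Mirrors the convention above: the second ("off-diagonal") assembler is
// created only when hybrid ACA assembly requests it.
std::pair<std::shared_ptr<Assembler>, std::shared_ptr<Assembler> >
makeAssemblers(bool hybridAcaMode)
{
    std::shared_ptr<Assembler> standard(new Assembler{"standard"});
    std::shared_ptr<Assembler> offDiagonal; // stays null by default
    if (hybridAcaMode)
        offDiagonal.reset(new Assembler{"off-diagonal"});
    return std::make_pair(standard, offDiagonal);
}

int main()
{
    std::pair<std::shared_ptr<Assembler>, std::shared_ptr<Assembler> > p =
        makeAssemblers(true);
    std::cout << p.first->name << std::endl;
    if (p.second)                          // guard: null in non-hybrid modes
        std::cout << p.second->name << std::endl;
}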
template <typename BasisFunctionType, typename ResultType>
std::unique_ptr<DiscreteBoundaryOperator<ResultType>>
ElementaryLocalOperator<BasisFunctionType, ResultType>::
assembleWeakFormInSparseMode(LocalAssembler &assembler,
                             const AssemblyOptions &options) const
{
#ifdef WITH_TRILINOS
    if (boost::is_complex<BasisFunctionType>::value)
        throw std::runtime_error(
            "ElementaryLocalOperator::assembleWeakFormInSparseMode(): "
            "sparse-mode assembly of identity operators for "
            "complex-valued basis functions is not supported yet");

    const Space<BasisFunctionType> &testSpace = *this->dualToRange();
    const Space<BasisFunctionType> &trialSpace = *this->domain();

    // Fill local submatrices
    const GridView &view = testSpace.gridView();
    const size_t elementCount = view.entityCount(0);
    std::vector<int> elementIndices(elementCount);
    for (size_t i = 0; i < elementCount; ++i)
        elementIndices[i] = i;
    std::vector<arma::Mat<ResultType>> localResult;
    assembler.evaluateLocalWeakForms(elementIndices, localResult);

    // Global DOF indices corresponding to local DOFs on elements
    std::vector<std::vector<GlobalDofIndex>> testGdofs(elementCount);
    std::vector<std::vector<GlobalDofIndex>> trialGdofs(elementCount);
    std::vector<std::vector<BasisFunctionType>> testLdofWeights(elementCount);
    std::vector<std::vector<BasisFunctionType>> trialLdofWeights(elementCount);
    gatherGlobalDofs(testSpace, trialSpace, testGdofs, trialGdofs,
                     testLdofWeights, trialLdofWeights);

    // Multiply matrix entries by DOF weights
    for (size_t e = 0; e < elementCount; ++e)
        for (size_t trialDof = 0; trialDof < trialGdofs[e].size(); ++trialDof)
            for (size_t testDof = 0; testDof < testGdofs[e].size(); ++testDof)
                localResult[e](testDof, trialDof) *=
                    conj(testLdofWeights[e][testDof]) *
                    trialLdofWeights[e][trialDof];

    // Estimate the number of entries in each row.
    // This will be useful when we begin to use MPI:
    //
    // // Get global DOF indices for which this process is responsible
    // const int testGlobalDofCount = testSpace.globalDofCount();
    // Epetra_Map rowMap(testGlobalDofCount, 0 /* index-base */, comm);
    // std::vector<int> myTestGlobalDofs(rowMap.MyGlobalElements(),
    //                                   rowMap.MyGlobalElements() +
    //                                   rowMap.NumMyElements());
    // const int myTestGlobalDofCount = myTestGlobalDofs.size();

    const int testGlobalDofCount = testSpace.globalDofCount();
    const int trialGlobalDofCount = trialSpace.globalDofCount();
    arma::Col<int> nonzeroEntryCountEstimates(testGlobalDofCount);
    nonzeroEntryCountEstimates.fill(0);

    // Upper estimate for the number of global trial DOFs coupled to a given
    // global test DOF: sum of the local trial DOF counts for each element that
    // contributes to the global test DOF in question
    for (size_t e = 0; e < elementCount; ++e)
        for (size_t testLdof = 0; testLdof < testGdofs[e].size(); ++testLdof) {
            int testGdof = testGdofs[e][testLdof];
            if (testGdof >= 0)
                nonzeroEntryCountEstimates(testGdof) += trialGdofs[e].size();
        }

    Epetra_SerialComm comm; // To be replaced once we begin to use MPI
    Epetra_LocalMap rowMap(testGlobalDofCount, 0 /* index_base */, comm);
    Epetra_LocalMap colMap(trialGlobalDofCount, 0 /* index_base */, comm);
    shared_ptr<Epetra_FECrsMatrix> result =
        boost::make_shared<Epetra_FECrsMatrix>(
            Copy, rowMap, colMap, nonzeroEntryCountEstimates.memptr());

    // TODO: make each process responsible for a subset of elements

    // Find maximum number of local dofs per element
    size_t maxLdofCount = 0;
    for (size_t e = 0; e < elementCount; ++e)
        maxLdofCount = std::max(maxLdofCount,
                                testGdofs[e].size() * trialGdofs[e].size());

    // Initialise sparse matrix with zeros at required positions
    arma::Col<double> zeros(maxLdofCount);
    zeros.fill(0.);
    for (size_t e = 0; e < elementCount; ++e)
        result->InsertGlobalValues(testGdofs[e].size(), &testGdofs[e][0],
                                   trialGdofs[e].size(), &trialGdofs[e][0],
                                   zeros.memptr());

    // Add contributions from individual elements
    for (size_t e = 0; e < elementCount; ++e)
        epetraSumIntoGlobalValues(*result, testGdofs[e], trialGdofs[e],
                                  localResult[e]);
    result->GlobalAssemble();

    // If assembly mode is equal to ACA and we have AHMED,
    // construct the block cluster tree. Otherwise leave it uninitialized.
    typedef ClusterConstructionHelper<BasisFunctionType> CCH;
    typedef AhmedDofWrapper<CoordinateType> AhmedDofType;
    typedef ExtendedBemCluster<AhmedDofType> AhmedBemCluster;
    typedef bbxbemblcluster<AhmedDofType, AhmedDofType> AhmedBemBlcluster;

    shared_ptr<AhmedBemBlcluster> blockCluster;
    shared_ptr<IndexPermutation> test_o2pPermutation, test_p2oPermutation;
    shared_ptr<IndexPermutation> trial_o2pPermutation, trial_p2oPermutation;
#ifdef WITH_AHMED
    if (options.assemblyMode() == AssemblyOptions::ACA) {
        const AcaOptions &acaOptions = options.acaOptions();
        bool indexWithGlobalDofs = acaOptions.mode != AcaOptions::HYBRID_ASSEMBLY;

        typedef ClusterConstructionHelper<BasisFunctionType> CCH;
        shared_ptr<AhmedBemCluster> testClusterTree;
        CCH::constructBemCluster(testSpace, indexWithGlobalDofs, acaOptions,
                                 testClusterTree,
                                 test_o2pPermutation, test_p2oPermutation);
        // TODO: construct a hermitian H-matrix if possible
        shared_ptr<AhmedBemCluster> trialClusterTree;
        CCH::constructBemCluster(trialSpace, indexWithGlobalDofs, acaOptions,
                                 trialClusterTree,
                                 trial_o2pPermutation, trial_p2oPermutation);
        unsigned int blockCount = 0;
        bool useStrongAdmissibilityCondition = !indexWithGlobalDofs;
        blockCluster.reset(CCH::constructBemBlockCluster(
                               acaOptions, false /* hermitian */,
                               *testClusterTree, *trialClusterTree,
                               useStrongAdmissibilityCondition,
                               blockCount).release());
    }
#endif

    // Create and return a discrete operator represented by the matrix that
    // has just been calculated
    return std::unique_ptr<DiscreteBoundaryOperator<ResultType>>(
        new DiscreteSparseBoundaryOperator<ResultType>(
            result, this->symmetry(), NO_TRANSPOSE,
            blockCluster, trial_o2pPermutation, test_o2pPermutation));
#else // WITH_TRILINOS
    throw std::runtime_error(
        "ElementaryLocalOperator::assembleWeakFormInSparseMode(): "
        "To enable assembly in sparse mode, recompile BEM++ "
        "with the symbol WITH_TRILINOS defined.");
#endif
}
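// -----------------------------------------------------------------------------
// Illustrative aside (not part of the library): the Epetra matrix above is
// filled in two phases -- InsertGlobalValues first stamps zeros at every
// coupled (test, trial) DOF pair to establish the sparsity pattern, and
// epetraSumIntoGlobalValues then accumulates the element contributions, so
// DOF pairs shared by adjacent elements sum correctly. Below is a
// dependency-free sketch of the same accumulate-by-global-index idea;
// std::map stands in for the Epetra CRS structure and the element data is
// made up. Compile separately from this translation unit.
// -----------------------------------------------------------------------------
#include <cstdio>
#include <map>
#include <utility>
#include <vector>

int main()
{
    // Two 1-D elements sharing global DOF 1; 2x2 local matrices, row-major.
    std::vector<std::vector<int> > gdofs = { {0, 1}, {1, 2} };
    std::vector<std::vector<double> > local = {
        {2.0, -1.0, -1.0, 2.0},   // element 0
        {2.0, -1.0, -1.0, 2.0}    // element 1
    };

    // Phase 1: create every coupled (row, col) position with value 0
    // (the analogue of InsertGlobalValues with a zero buffer).
    std::map<std::pair<int, int>, double> matrix;
    for (size_t e = 0; e < gdofs.size(); ++e)
        for (int r : gdofs[e])
            for (int c : gdofs[e])
                matrix[std::make_pair(r, c)] = 0.0;

    // Phase 2: sum element contributions into the existing positions
    // (the analogue of epetraSumIntoGlobalValues).
    for (size_t e = 0; e < gdofs.size(); ++e)
        for (size_t i = 0; i < gdofs[e].size(); ++i)
            for (size_t j = 0; j < gdofs[e].size(); ++j)
                matrix[std::make_pair(gdofs[e][i], gdofs[e][j])] +=
                    local[e][i * gdofs[e].size() + j];

    // The shared DOF receives contributions from both elements: A(1,1) = 4.
    std::printf("A(1,1) = %g\n", matrix[std::make_pair(1, 1)]);
}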
template <typename BasisFunctionType, typename ResultType>
std::auto_ptr<DiscreteBoundaryOperator<ResultType> >
AcaGlobalAssembler<BasisFunctionType, ResultType>::assembleDetachedWeakForm(
        const Space<BasisFunctionType>& testSpace,
        const Space<BasisFunctionType>& trialSpace,
        const std::vector<LocalAssembler*>& localAssemblers,
        const std::vector<const DiscreteBndOp*>& sparseTermsToAdd,
        const std::vector<ResultType>& denseTermsMultipliers,
        const std::vector<ResultType>& sparseTermsMultipliers,
        const AssemblyOptions& options,
        int symmetry)
{
#ifdef WITH_AHMED
    typedef AhmedDofWrapper<CoordinateType> AhmedDofType;
    typedef ExtendedBemCluster<AhmedDofType> AhmedBemCluster;
    typedef bemblcluster<AhmedDofType, AhmedDofType> AhmedBemBlcluster;
    typedef DiscreteAcaBoundaryOperator<ResultType> DiscreteAcaLinOp;

    const AcaOptions& acaOptions = options.acaOptions();
    const bool indexWithGlobalDofs = acaOptions.globalAssemblyBeforeCompression;
    const bool verbosityAtLeastDefault =
        (options.verbosityLevel() >= VerbosityLevel::DEFAULT);
    const bool verbosityAtLeastHigh =
        (options.verbosityLevel() >= VerbosityLevel::HIGH);

    // Currently we don't support Hermitian ACA operators. This is because we
    // don't have the means to really test them -- we would need complex-valued
    // basis functions for that. (Assembly of such a matrix would be very easy
    // -- just change complex_sym from true to false in the call to apprx_sym()
    // in AcaWeakFormAssemblerLoopBody::operator() -- but operations on
    // symmetric/Hermitian matrices are not always trivial and we do need to be
    // able to test them properly.)
    bool symmetric = symmetry & SYMMETRIC;
    if (symmetry & HERMITIAN && !(symmetry & SYMMETRIC) &&
            verbosityAtLeastDefault)
        std::cout << "Warning: assembly of non-symmetric Hermitian H-matrices "
                     "is not supported yet. A general H-matrix will be assembled"
                  << std::endl;

#ifndef WITH_TRILINOS
    if (!indexWithGlobalDofs)
        throw std::runtime_error("AcaGlobalAssembler::assembleDetachedWeakForm(): "
                                 "ACA assembly with globalAssemblyBeforeCompression "
                                 "set to false requires BEM++ to be linked with "
                                 "Trilinos");
#endif // WITH_TRILINOS

    const size_t testDofCount = indexWithGlobalDofs ?
        testSpace.globalDofCount() : testSpace.flatLocalDofCount();
    const size_t trialDofCount = indexWithGlobalDofs ?
        trialSpace.globalDofCount() : trialSpace.flatLocalDofCount();

    if (symmetric && testDofCount != trialDofCount)
        throw std::invalid_argument("AcaGlobalAssembler::assembleDetachedWeakForm(): "
                                    "you cannot generate a symmetric weak form "
                                    "using test and trial spaces with different "
                                    "numbers of DOFs");

    // o2p: map of original indices to permuted indices
    // p2o: map of permuted indices to original indices
    typedef ClusterConstructionHelper<BasisFunctionType> CCH;
    shared_ptr<AhmedBemCluster> testClusterTree;
    shared_ptr<IndexPermutation> test_o2pPermutation, test_p2oPermutation;
    CCH::constructBemCluster(testSpace, indexWithGlobalDofs, acaOptions,
                             testClusterTree,
                             test_o2pPermutation, test_p2oPermutation);
    shared_ptr<AhmedBemCluster> trialClusterTree;
    shared_ptr<IndexPermutation> trial_o2pPermutation, trial_p2oPermutation;
    if (symmetric || &testSpace == &trialSpace) {
        trialClusterTree = testClusterTree;
        trial_o2pPermutation = test_o2pPermutation;
        trial_p2oPermutation = test_p2oPermutation;
    } else
        CCH::constructBemCluster(trialSpace, indexWithGlobalDofs, acaOptions,
                                 trialClusterTree,
                                 trial_o2pPermutation, trial_p2oPermutation);

//    // Export VTK plots showing the distribution of leaf cluster ids
//    std::vector<unsigned int> testClusterIds;
//    getClusterIds(*testClusterTree, test_p2oPermutation->permutedIndices(),
//                  testClusterIds);
//    testSpace.dumpClusterIds("testClusterIds", testClusterIds,
//                             indexWithGlobalDofs ? GLOBAL_DOFS : FLAT_LOCAL_DOFS);
//    std::vector<unsigned int> trialClusterIds;
//    getClusterIds(*trialClusterTree, trial_p2oPermutation->permutedIndices(),
//                  trialClusterIds);
//    trialSpace.dumpClusterIds("trialClusterIds", trialClusterIds,
//                              indexWithGlobalDofs ? GLOBAL_DOFS : FLAT_LOCAL_DOFS);

    if (verbosityAtLeastHigh)
        std::cout << "Test cluster count: " << testClusterTree->getncl()
                  << "\nTrial cluster count: " << trialClusterTree->getncl()
                  << std::endl;

    unsigned int blockCount = 0;
    shared_ptr<AhmedBemBlcluster> bemBlclusterTree(
        CCH::constructBemBlockCluster(acaOptions, symmetric,
                                      *testClusterTree, *trialClusterTree,
                                      blockCount).release());
    if (verbosityAtLeastHigh)
        std::cout << "Mblock count: " << blockCount << std::endl;

    std::vector<unsigned int> p2oTestDofs =
        test_p2oPermutation->permutedIndices();
    std::vector<unsigned int> p2oTrialDofs =
        trial_p2oPermutation->permutedIndices();
    WeakFormAcaAssemblyHelper<BasisFunctionType, ResultType>
        helper(testSpace, trialSpace, p2oTestDofs, p2oTrialDofs,
               localAssemblers, sparseTermsToAdd,
               denseTermsMultipliers, sparseTermsMultipliers, options);

    typedef mblock<typename AhmedTypeTraits<ResultType>::Type> AhmedMblock;
    boost::shared_array<AhmedMblock*> blocks =
        allocateAhmedMblockArray<ResultType>(bemBlclusterTree.get());

//    matgen_sqntl(helper, AhmedBemBlclusterTree.get(), AhmedBemBlclusterTree.get(),
//                 acaOptions.recompress, acaOptions.eps,
//                 acaOptions.maximumRank, blocks.get());

//    matgen_omp(helper, blockCount, AhmedBemBlclusterTree.get(),
//               acaOptions.eps, acaOptions.maximumRank, blocks.get());

//    // Dump mblocks
//    const int mblockCount = AhmedBemBlclusterTree->nleaves();
//    for (int i = 0; i < mblockCount; ++i)
//        if (blocks[i]->isdns())
//        {
//            char buffer[1024];
//            sprintf(buffer, "mblock-dns-%d-%d.txt",
//                    blocks[i]->getn1(), blocks[i]->getn2());
//            arma::Col<ResultType> block((ResultType*)blocks[i]->getdata(),
//                                        blocks[i]->nvals());
//            arma::diskio::save_raw_ascii(block, buffer);
//        }
//        else
//        {
//            char buffer[1024];
//            sprintf(buffer, "mblock-lwr-%d-%d.txt",
//                    blocks[i]->getn1(), blocks[i]->getn2());
//            arma::Col<ResultType>
//                block((ResultType*)blocks[i]->getdata(), blocks[i]->nvals());
//            arma::diskio::save_raw_ascii(block, buffer);
//        }

    AhmedLeafClusterArray leafClusters(bemBlclusterTree.get());
    leafClusters.sortAccordingToClusterSize();
    const size_t leafClusterCount = leafClusters.size();

    const ParallelizationOptions& parallelOptions =
        options.parallelizationOptions();
    int maxThreadCount = 1;
    if (!parallelOptions.isOpenClEnabled()) {
        if (parallelOptions.maxThreadCount() == ParallelizationOptions::AUTO)
            maxThreadCount = tbb::task_scheduler_init::automatic;
        else
            maxThreadCount = parallelOptions.maxThreadCount();
    }
    tbb::task_scheduler_init scheduler(maxThreadCount);
    tbb::atomic<size_t> done;
    done = 0;

    std::vector<ChunkStatistics> chunkStats(leafClusterCount);

//    typedef AcaWeakFormAssemblerLoopBody<BasisFunctionType, ResultType> Body;
//    // std::cout << "Loop start" << std::endl;
//    tbb::tick_count loopStart = tbb::tick_count::now();
//    // tbb::parallel_for(tbb::blocked_range<size_t>(0, leafClusterCount),
//    //                   Body(helper, leafClusters, blocks, acaOptions, done
//    //                        , chunkStats));
//    tbb::parallel_for(ScatteredRange(0, leafClusterCount),
//                      Body(helper, leafClusters, blocks, acaOptions, done
//                           , chunkStats));
//    tbb::tick_count loopEnd = tbb::tick_count::now();
//    // std::cout << "Loop end" << std::endl;

    typedef AcaWeakFormAssemblerLoopBody<BasisFunctionType, ResultType> Body;
    typename Body::LeafClusterIndexQueue leafClusterIndexQueue;
    for (size_t i = 0; i < leafClusterCount; ++i)
        leafClusterIndexQueue.push(i);

    if (verbosityAtLeastDefault)
        std::cout << "About to start the ACA assembly loop" << std::endl;
    tbb::tick_count loopStart = tbb::tick_count::now();
    {
        Fiber::SerialBlasRegion region; // if possible, ensure that BLAS is single-threaded
        tbb::parallel_for(tbb::blocked_range<size_t>(0, leafClusterCount),
                          Body(helper, leafClusters, blocks, acaOptions, done,
                               verbosityAtLeastDefault, leafClusterIndexQueue,
                               symmetric, chunkStats));
    }
    tbb::tick_count loopEnd = tbb::tick_count::now();
    if (verbosityAtLeastDefault) {
        std::cout << "\n"; // the progress bar doesn't print the final \n
        std::cout << "ACA loop took " << (loopEnd - loopStart).seconds()
                  << " s" << std::endl;
    }

    // TODO: parallelise!
    if (acaOptions.recompress) {
        if (verbosityAtLeastDefault)
            std::cout << "About to start ACA agglomeration" << std::endl;
        agglH(bemBlclusterTree.get(), blocks.get(),
              acaOptions.eps, acaOptions.maximumRank);
        if (verbosityAtLeastDefault)
            std::cout << "Agglomeration finished" << std::endl;
    }

//    // Dump timing data of individual chunks
//    std::cout << "\nChunks:\n";
//    for (int i = 0; i < leafClusterCount; ++i)
//        if (chunkStats[i].valid) {
//            int blockIndex = leafClusters[i]->getidx();
//            std::cout << chunkStats[i].chunkStart << "\t"
//                      << chunkStats[i].chunkSize << "\t"
//                      << (chunkStats[i].startTime - loopStart).seconds() << "\t"
//                      << (chunkStats[i].endTime - loopStart).seconds() << "\t"
//                      << (chunkStats[i].endTime - chunkStats[i].startTime).seconds() << "\t"
//                      << blocks[blockIndex]->getn1() << "\t"
//                      << blocks[blockIndex]->getn2() << "\t"
//                      << blocks[blockIndex]->islwr() << "\t"
//                      << (blocks[blockIndex]->islwr() ? blocks[blockIndex]->rank() : 0) << "\n";
//        }

    {
        size_t origMemory = sizeof(ResultType) * testDofCount * trialDofCount;
        size_t ahmedMemory = sizeH(bemBlclusterTree.get(), blocks.get());
        int maximumRank = Hmax_rank(bemBlclusterTree.get(), blocks.get());
        if (verbosityAtLeastDefault)
            std::cout << "\nNeeded storage: "
                      << ahmedMemory / 1024. / 1024. << " MB.\n"
                      << "Without approximation: " << origMemory / 1024.
/ 1024. << " MB.\n" << "Compressed to " << (100. * ahmedMemory) / origMemory << "%.\n" << "Maximum rank: " << maximumRank << ".\n" << std::endl; if (acaOptions.outputPostscript) { if (verbosityAtLeastDefault) std::cout << "Writing matrix partition ..." << std::flush; std::ofstream os(acaOptions.outputFname.c_str()); if (symmetric) // seems valid also for Hermitian matrices psoutputHeH(os, bemBlclusterTree.get(), testDofCount, blocks.get()); else psoutputGeH(os, bemBlclusterTree.get(), testDofCount, blocks.get()); os.close(); if (verbosityAtLeastDefault) std::cout << " done." << std::endl; } } int outSymmetry = NO_SYMMETRY; if (symmetric) { outSymmetry = SYMMETRIC; if (!boost::is_complex<ResultType>()) outSymmetry |= HERMITIAN; } std::auto_ptr<DiscreteAcaLinOp> acaOp( new DiscreteAcaLinOp(testDofCount, trialDofCount, acaOptions.eps, acaOptions.maximumRank, outSymmetry, bemBlclusterTree, blocks, *trial_o2pPermutation, *test_o2pPermutation, parallelOptions)); std::auto_ptr<DiscreteBndOp> result; if (indexWithGlobalDofs) result = acaOp; else { #ifdef WITH_TRILINOS // without Trilinos, this code will never be reached -- an exception // will be thrown earlier in this function typedef DiscreteBoundaryOperatorComposition<ResultType> DiscreteBndOpComp; shared_ptr<DiscreteBndOp> acaOpShared(acaOp.release()); shared_ptr<DiscreteBndOp> trialGlobalToLocal = constructOperatorMappingGlobalToFlatLocalDofs< BasisFunctionType, ResultType>(trialSpace); shared_ptr<DiscreteBndOp> testLocalToGlobal = constructOperatorMappingFlatLocalToGlobalDofs< BasisFunctionType, ResultType>(testSpace); shared_ptr<DiscreteBndOp> tmp( new DiscreteBndOpComp(acaOpShared, trialGlobalToLocal)); result.reset(new DiscreteBndOpComp(testLocalToGlobal, tmp)); #endif // WITH_TRILINOS } return result; #else // without Ahmed throw std::runtime_error("AcaGlobalAssembler::assembleDetachedWeakForm(): " "To enable assembly in ACA mode, recompile BEM++ " "with the symbol WITH_AHMED defined."); #endif // WITH_AHMED }