template <typename BasisFunctionType, typename ResultType>
shared_ptr<const DiscreteBoundaryOperator<ResultType>> assembleDenseBlock(
    int rowStart, int rowEnd, int colStart, int colEnd,
    const Space<BasisFunctionType> &testSpace,
    const Space<BasisFunctionType> &trialSpace,
    Fiber::LocalAssemblerForIntegralOperators<ResultType> &assembler,
    const ParameterList &parameterList) {
  int numberOfRows = rowEnd - rowStart;
  int numberOfColumns = colEnd - colStart;

  if (colEnd > static_cast<int>(trialSpace.globalDofCount()) ||
      rowEnd > static_cast<int>(testSpace.globalDofCount()) ||
      colStart < 0 || rowStart < 0)
    throw std::runtime_error("DenseGlobalBlockAssembler::assembleDenseBlock(): "
                             "Indices out of bounds");

  Context<BasisFunctionType, ResultType> context(parameterList);
  const AssemblyOptions &options = context.assemblyOptions();

  // Create the operator's matrix
  Matrix<ResultType> result(numberOfRows, numberOfColumns);
  result.setZero();

  // For each element overlapping the block, the global DOF indices and the
  // corresponding local DOF weights
  std::unordered_map<int, std::vector<GlobalDofIndex>> trialIndexMap,
      testIndexMap;
  std::unordered_map<int, std::vector<BasisFunctionType>> trialDofWeights,
      testDofWeights;

  gatherElementInformation(colStart, colEnd, trialSpace, trialIndexMap,
                           trialDofWeights);
  gatherElementInformation(rowStart, rowEnd, testSpace, testIndexMap,
                           testDofWeights);

  std::vector<int> testIndices;
  testIndices.reserve(testIndexMap.size());
  for (const auto &p : testIndexMap)
    testIndices.push_back(p.first);

  typedef DenseWeakFormAssemblerLoopBody<BasisFunctionType, ResultType> Body;
  typename Body::MutexType mutex;

  {
    Fiber::SerialBlasRegion region;
    tbb::parallel_for_each(trialIndexMap.begin(), trialIndexMap.end(),
                           Body(rowStart, colStart, testIndices, testIndexMap,
                                trialIndexMap, testDofWeights, trialDofWeights,
                                assembler, result, mutex));
  }

  // Create and return a discrete operator represented by the matrix that
  // has just been calculated
  return shared_ptr<const DiscreteBoundaryOperator<ResultType>>(
      new DiscreteDenseBoundaryOperator<ResultType>(result));
}
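// Illustrative sketch (not part of the original source): the scatter
// operation that each DenseWeakFormAssemblerLoopBody task is assumed to
// perform for one trial element. Global DOF indices are shifted by the
// block offsets (rowStart, colStart), and contributions falling outside
// the requested block are skipped. All names below are hypothetical;
// local(testDof, trialDof) stands for the local weak-form matrix returned
// by the assembler.
//
//   for (size_t trialDof = 0; trialDof < trialDofs.size(); ++trialDof) {
//     int col = trialDofs[trialDof] - colStart;
//     if (col < 0 || col >= numberOfColumns)
//       continue;
//     for (size_t testDof = 0; testDof < testDofs.size(); ++testDof) {
//       int row = testDofs[testDof] - rowStart;
//       if (row < 0 || row >= numberOfRows)
//         continue;
//       result(row, col) += conj(testWeights[testDof]) *
//                           trialWeights[trialDof] *
//                           local(testDof, trialDof);
//     }
//   }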
template <typename BasisFunctionType, typename ResultType>
Vector<ResultType> reallyCalculateProjections(
    const Space<BasisFunctionType> &dualSpace,
    Fiber::LocalAssemblerForGridFunctions<ResultType> &assembler,
    const AssemblyOptions &options) {
  // TODO: parallelise using TBB (the parameter 'options' will then start
  // to be used)

  // Get the grid's leaf view so that we can iterate over elements
  const GridView &view = dualSpace.gridView();
  const size_t elementCount = view.entityCount(0);

  // Global DOF indices corresponding to local DOFs on elements
  std::vector<std::vector<GlobalDofIndex>> testGlobalDofs(elementCount);
  std::vector<std::vector<BasisFunctionType>> testLocalDofWeights(elementCount);

  // Gather global DOF lists
  const Mapper &mapper = view.elementMapper();
  std::unique_ptr<EntityIterator<0>> it = view.entityIterator<0>();
  while (!it->finished()) {
    const Entity<0> &element = it->entity();
    const int elementIndex = mapper.entityIndex(element);
    dualSpace.getGlobalDofs(element, testGlobalDofs[elementIndex],
                            testLocalDofWeights[elementIndex]);
    it->next();
  }

  // Make a vector of all element indices
  std::vector<int> testIndices(elementCount);
  for (size_t i = 0; i < elementCount; ++i)
    testIndices[i] = i;

  // Create the weak form's column vector
  Vector<ResultType> result(dualSpace.globalDofCount());
  result.setZero();

  std::vector<Vector<ResultType>> localResult;
  // Evaluate local weak forms
  assembler.evaluateLocalWeakForms(testIndices, localResult);

  // Loop over test indices
  for (size_t testIndex = 0; testIndex < elementCount; ++testIndex)
    // Add the integrals to appropriate entries in the global weak form
    for (size_t testDof = 0; testDof < testGlobalDofs[testIndex].size();
         ++testDof) {
      int testGlobalDof = testGlobalDofs[testIndex][testDof];
      if (testGlobalDof >= 0) // a negative index means that this local DOF
                              // is constrained (not used)
        result(testGlobalDof) +=
            conj(testLocalDofWeights[testIndex][testDof]) *
            localResult[testIndex](testDof);
    }

  // Return the vector of projections <phi_i, f>
  return result;
}
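// Minimal sketch (not from the original source) of the scatter step above,
// using plain standard-library types: per-element local contributions are
// accumulated into a global vector, each weighted by the conjugated local
// DOF weight; negative global indices mark constrained DOFs and are
// skipped. All names are hypothetical.
//
//   std::vector<std::complex<double>> projections(globalDofCount, 0.);
//   for (size_t e = 0; e < elementCount; ++e)
//     for (size_t d = 0; d < testGlobalDofs[e].size(); ++d) {
//       int g = testGlobalDofs[e][d];
//       if (g >= 0)
//         projections[g] +=
//             std::conj(testLocalDofWeights[e][d]) * localResult[e][d];
//     }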
template <typename BasisFunctionType, typename ResultType>
std::auto_ptr<DiscreteBoundaryOperator<ResultType> >
DenseGlobalAssembler<BasisFunctionType, ResultType>::assembleDetachedWeakForm(
    const Space<BasisFunctionType>& testSpace,
    const Space<BasisFunctionType>& trialSpace,
    LocalAssemblerForIntegralOperators& assembler,
    const Context<BasisFunctionType, ResultType>& context) {
  const AssemblyOptions& options = context.assemblyOptions();

  // Global DOF indices corresponding to local DOFs on elements
  std::vector<std::vector<GlobalDofIndex> > testGlobalDofs, trialGlobalDofs;
  std::vector<std::vector<BasisFunctionType> > testLocalDofWeights,
      trialLocalDofWeights;
  gatherGlobalDofs(testSpace, testGlobalDofs, testLocalDofWeights);
  if (&testSpace == &trialSpace) {
    trialGlobalDofs = testGlobalDofs;
    trialLocalDofWeights = testLocalDofWeights;
  } else
    gatherGlobalDofs(trialSpace, trialGlobalDofs, trialLocalDofWeights);
  const size_t testElementCount = testGlobalDofs.size();
  const size_t trialElementCount = trialGlobalDofs.size();

  // Make a vector of all element indices
  std::vector<int> testIndices(testElementCount);
  for (size_t i = 0; i < testElementCount; ++i)
    testIndices[i] = i;

  // Create the operator's matrix
  arma::Mat<ResultType> result(testSpace.globalDofCount(),
                               trialSpace.globalDofCount());
  result.fill(0.);

  typedef DenseWeakFormAssemblerLoopBody<BasisFunctionType, ResultType> Body;
  typename Body::MutexType mutex;

  const ParallelizationOptions& parallelOptions =
      options.parallelizationOptions();
  int maxThreadCount = 1;
  if (!parallelOptions.isOpenClEnabled()) {
    if (parallelOptions.maxThreadCount() == ParallelizationOptions::AUTO)
      maxThreadCount = tbb::task_scheduler_init::automatic;
    else
      maxThreadCount = parallelOptions.maxThreadCount();
  }
  tbb::task_scheduler_init scheduler(maxThreadCount);
  {
    Fiber::SerialBlasRegion region;
    tbb::parallel_for(tbb::blocked_range<size_t>(0, trialElementCount),
                      Body(testIndices, testGlobalDofs, trialGlobalDofs,
                           testLocalDofWeights, trialLocalDofWeights,
                           assembler, result, mutex));
  }

  //// Old serial code (TODO: decide whether to keep it behind e.g. #ifndef PARALLEL)
  //  std::vector<arma::Mat<ValueType> > localResult;
  //  // Loop over trial elements
  //  for (int trialIndex = 0; trialIndex < trialElementCount; ++trialIndex) {
  //    // Evaluate integrals over pairs of the current trial element and
  //    // all the test elements
  //    assembler.evaluateLocalWeakForms(TEST_TRIAL, testIndices, trialIndex,
  //                                     ALL_DOFS, localResult);
  //    // Loop over test indices
  //    for (int testIndex = 0; testIndex < testElementCount; ++testIndex)
  //      // Add the integrals to appropriate entries in the operator's matrix
  //      for (int trialDof = 0; trialDof < trialGlobalDofs[trialIndex].size(); ++trialDof)
  //        for (int testDof = 0; testDof < testGlobalDofs[testIndex].size(); ++testDof)
  //          result(testGlobalDofs[testIndex][testDof],
  //                 trialGlobalDofs[trialIndex][trialDof]) +=
  //              localResult[testIndex](testDof, trialDof);
  //  }

  // Create and return a discrete operator represented by the matrix that
  // has just been calculated
  return std::auto_ptr<DiscreteBoundaryOperator<ResultType> >(
      new DiscreteDenseBoundaryOperator<ResultType>(result));
}
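// Sketch (hypothetical, for illustration) of the parallelisation pattern
// realised by DenseWeakFormAssemblerLoopBody: each TBB task evaluates the
// local weak forms for its range of trial elements into a task-local
// buffer, then scatters them into the shared matrix under the mutex, so
// that concurrent writes to 'result' cannot race.
//
//   tbb::parallel_for(tbb::blocked_range<size_t>(0, trialElementCount),
//                     [&](const tbb::blocked_range<size_t>& r) {
//     std::vector<arma::Mat<ResultType> > localResult; // task-local buffer
//     for (size_t trialIndex = r.begin(); trialIndex != r.end(); ++trialIndex) {
//       assembler.evaluateLocalWeakForms(TEST_TRIAL, testIndices, trialIndex,
//                                        ALL_DOFS, localResult);
//       typename Body::MutexType::scoped_lock lock(mutex); // guard 'result'
//       // ... scatter localResult into 'result' as in the retained serial
//       // code above
//     }
//   });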
template <typename BasisFunctionType, typename ResultType>
std::auto_ptr<DiscreteBoundaryOperator<ResultType> >
AcaGlobalAssembler<BasisFunctionType, ResultType>::assembleDetachedWeakForm(
    const Space<BasisFunctionType>& testSpace,
    const Space<BasisFunctionType>& trialSpace,
    const std::vector<LocalAssembler*>& localAssemblers,
    const std::vector<const DiscreteBndOp*>& sparseTermsToAdd,
    const std::vector<ResultType>& denseTermsMultipliers,
    const std::vector<ResultType>& sparseTermsMultipliers,
    const AssemblyOptions& options,
    int symmetry) {
#ifdef WITH_AHMED
  typedef AhmedDofWrapper<CoordinateType> AhmedDofType;
  typedef ExtendedBemCluster<AhmedDofType> AhmedBemCluster;
  typedef bemblcluster<AhmedDofType, AhmedDofType> AhmedBemBlcluster;
  typedef DiscreteAcaBoundaryOperator<ResultType> DiscreteAcaLinOp;

  const AcaOptions& acaOptions = options.acaOptions();
  const bool indexWithGlobalDofs = acaOptions.globalAssemblyBeforeCompression;
  const bool verbosityAtLeastDefault =
      (options.verbosityLevel() >= VerbosityLevel::DEFAULT);
  const bool verbosityAtLeastHigh =
      (options.verbosityLevel() >= VerbosityLevel::HIGH);

  // Currently we don't support Hermitian ACA operators. This is because we
  // don't have the means to really test them -- we would need complex-valued
  // basis functions for that. (Assembly of such a matrix would be very easy
  // -- just change complex_sym from true to false in the call to apprx_sym()
  // in AcaWeakFormAssemblerLoopBody::operator() -- but operations on
  // symmetric/Hermitian matrices are not always trivial and we do need to be
  // able to test them properly.)
  bool symmetric = symmetry & SYMMETRIC;
  if ((symmetry & HERMITIAN) && !(symmetry & SYMMETRIC) &&
      verbosityAtLeastDefault)
    std::cout << "Warning: assembly of non-symmetric Hermitian H-matrices "
                 "is not supported yet. A general H-matrix will be assembled"
              << std::endl;

#ifndef WITH_TRILINOS
  if (!indexWithGlobalDofs)
    throw std::runtime_error("AcaGlobalAssembler::assembleDetachedWeakForm(): "
                             "ACA assembly with globalAssemblyBeforeCompression "
                             "set to false requires BEM++ to be linked with "
                             "Trilinos");
#endif // WITH_TRILINOS

  const size_t testDofCount = indexWithGlobalDofs
      ? testSpace.globalDofCount() : testSpace.flatLocalDofCount();
  const size_t trialDofCount = indexWithGlobalDofs
      ? trialSpace.globalDofCount() : trialSpace.flatLocalDofCount();

  if (symmetric && testDofCount != trialDofCount)
    throw std::invalid_argument("AcaGlobalAssembler::assembleDetachedWeakForm(): "
                                "you cannot generate a symmetric weak form "
                                "using test and trial spaces with different "
                                "numbers of DOFs");

  // o2p: map of original indices to permuted indices
  // p2o: map of permuted indices to original indices
  typedef ClusterConstructionHelper<BasisFunctionType> CCH;
  shared_ptr<AhmedBemCluster> testClusterTree;
  shared_ptr<IndexPermutation> test_o2pPermutation, test_p2oPermutation;
  CCH::constructBemCluster(testSpace, indexWithGlobalDofs, acaOptions,
                           testClusterTree, test_o2pPermutation,
                           test_p2oPermutation);
  shared_ptr<AhmedBemCluster> trialClusterTree;
  shared_ptr<IndexPermutation> trial_o2pPermutation, trial_p2oPermutation;
  if (symmetric || &testSpace == &trialSpace) {
    trialClusterTree = testClusterTree;
    trial_o2pPermutation = test_o2pPermutation;
    trial_p2oPermutation = test_p2oPermutation;
  } else
    CCH::constructBemCluster(trialSpace, indexWithGlobalDofs, acaOptions,
                             trialClusterTree, trial_o2pPermutation,
                             trial_p2oPermutation);

  // // Export VTK plots showing the distribution of leaf cluster ids
  // std::vector<unsigned int> testClusterIds;
  // getClusterIds(*testClusterTree, test_p2oPermutation->permutedIndices(),
  //               testClusterIds);
  // testSpace.dumpClusterIds("testClusterIds", testClusterIds,
  //                          indexWithGlobalDofs ? GLOBAL_DOFS : FLAT_LOCAL_DOFS);
  // std::vector<unsigned int> trialClusterIds;
  // getClusterIds(*trialClusterTree, trial_p2oPermutation->permutedIndices(),
  //               trialClusterIds);
  // trialSpace.dumpClusterIds("trialClusterIds", trialClusterIds,
  //                           indexWithGlobalDofs ? GLOBAL_DOFS : FLAT_LOCAL_DOFS);

  if (verbosityAtLeastHigh)
    std::cout << "Test cluster count: " << testClusterTree->getncl()
              << "\nTrial cluster count: " << trialClusterTree->getncl()
              << std::endl;

  unsigned int blockCount = 0;
  shared_ptr<AhmedBemBlcluster> bemBlclusterTree(
      CCH::constructBemBlockCluster(acaOptions, symmetric, *testClusterTree,
                                    *trialClusterTree, blockCount).release());

  if (verbosityAtLeastHigh)
    std::cout << "Mblock count: " << blockCount << std::endl;

  std::vector<unsigned int> p2oTestDofs =
      test_p2oPermutation->permutedIndices();
  std::vector<unsigned int> p2oTrialDofs =
      trial_p2oPermutation->permutedIndices();
  WeakFormAcaAssemblyHelper<BasisFunctionType, ResultType>
      helper(testSpace, trialSpace, p2oTestDofs, p2oTrialDofs,
             localAssemblers, sparseTermsToAdd, denseTermsMultipliers,
             sparseTermsMultipliers, options);

  typedef mblock<typename AhmedTypeTraits<ResultType>::Type> AhmedMblock;
  boost::shared_array<AhmedMblock*> blocks =
      allocateAhmedMblockArray<ResultType>(bemBlclusterTree.get());

  // matgen_sqntl(helper, bemBlclusterTree.get(), bemBlclusterTree.get(),
  //              acaOptions.recompress, acaOptions.eps,
  //              acaOptions.maximumRank, blocks.get());

  // matgen_omp(helper, blockCount, bemBlclusterTree.get(),
  //            acaOptions.eps, acaOptions.maximumRank, blocks.get());

  // // Dump mblocks
  // const int mblockCount = bemBlclusterTree->nleaves();
  // for (int i = 0; i < mblockCount; ++i)
  //   if (blocks[i]->isdns()) {
  //     char buffer[1024];
  //     sprintf(buffer, "mblock-dns-%d-%d.txt",
  //             blocks[i]->getn1(), blocks[i]->getn2());
  //     arma::Col<ResultType> block((ResultType*)blocks[i]->getdata(),
  //                                 blocks[i]->nvals());
  //     arma::diskio::save_raw_ascii(block, buffer);
  //   } else {
  //     char buffer[1024];
  //     sprintf(buffer, "mblock-lwr-%d-%d.txt",
  //             blocks[i]->getn1(), blocks[i]->getn2());
  //     arma::Col<ResultType> block((ResultType*)blocks[i]->getdata(),
  //                                 blocks[i]->nvals());
  //     arma::diskio::save_raw_ascii(block, buffer);
  //   }

  AhmedLeafClusterArray leafClusters(bemBlclusterTree.get());
  leafClusters.sortAccordingToClusterSize();
  const size_t leafClusterCount = leafClusters.size();

  const ParallelizationOptions& parallelOptions =
      options.parallelizationOptions();
  int maxThreadCount = 1;
  if (!parallelOptions.isOpenClEnabled()) {
    if (parallelOptions.maxThreadCount() == ParallelizationOptions::AUTO)
      maxThreadCount = tbb::task_scheduler_init::automatic;
    else
      maxThreadCount = parallelOptions.maxThreadCount();
  }
  tbb::task_scheduler_init scheduler(maxThreadCount);
  tbb::atomic<size_t> done;
  done = 0;

  std::vector<ChunkStatistics> chunkStats(leafClusterCount);

  // typedef AcaWeakFormAssemblerLoopBody<BasisFunctionType, ResultType> Body;
  // // std::cout << "Loop start" << std::endl;
  // tbb::tick_count loopStart = tbb::tick_count::now();
  // // tbb::parallel_for(tbb::blocked_range<size_t>(0, leafClusterCount),
  // //                   Body(helper, leafClusters, blocks, acaOptions, done,
  // //                        chunkStats));
  // tbb::parallel_for(ScatteredRange(0, leafClusterCount),
  //                   Body(helper, leafClusters, blocks, acaOptions, done,
  //                        chunkStats));
  // tbb::tick_count loopEnd = tbb::tick_count::now();
  // // std::cout << "Loop end" << std::endl;

  typedef AcaWeakFormAssemblerLoopBody<BasisFunctionType, ResultType> Body;
  typename Body::LeafClusterIndexQueue leafClusterIndexQueue;
  for (size_t i = 0; i < leafClusterCount; ++i)
    leafClusterIndexQueue.push(i);

  if (verbosityAtLeastDefault)
    std::cout << "About to start the ACA assembly loop" << std::endl;
  tbb::tick_count loopStart = tbb::tick_count::now();
  {
    Fiber::SerialBlasRegion region; // if possible, ensure that BLAS is
                                    // single-threaded
    tbb::parallel_for(tbb::blocked_range<size_t>(0, leafClusterCount),
                      Body(helper, leafClusters, blocks, acaOptions, done,
                           verbosityAtLeastDefault, leafClusterIndexQueue,
                           symmetric, chunkStats));
  }
  tbb::tick_count loopEnd = tbb::tick_count::now();
  if (verbosityAtLeastDefault) {
    std::cout << "\n"; // the progress bar doesn't print the final \n
    std::cout << "ACA loop took " << (loopEnd - loopStart).seconds() << " s"
              << std::endl;
  }

  // TODO: parallelise!
  if (acaOptions.recompress) {
    if (verbosityAtLeastDefault)
      std::cout << "About to start ACA agglomeration" << std::endl;
    agglH(bemBlclusterTree.get(), blocks.get(), acaOptions.eps,
          acaOptions.maximumRank);
    if (verbosityAtLeastDefault)
      std::cout << "Agglomeration finished" << std::endl;
  }

  // // Dump timing data of individual chunks
  // std::cout << "\nChunks:\n";
  // for (int i = 0; i < leafClusterCount; ++i)
  //   if (chunkStats[i].valid) {
  //     int blockIndex = leafClusters[i]->getidx();
  //     std::cout << chunkStats[i].chunkStart << "\t"
  //               << chunkStats[i].chunkSize << "\t"
  //               << (chunkStats[i].startTime - loopStart).seconds() << "\t"
  //               << (chunkStats[i].endTime - loopStart).seconds() << "\t"
  //               << (chunkStats[i].endTime - chunkStats[i].startTime).seconds() << "\t"
  //               << blocks[blockIndex]->getn1() << "\t"
  //               << blocks[blockIndex]->getn2() << "\t"
  //               << blocks[blockIndex]->islwr() << "\t"
  //               << (blocks[blockIndex]->islwr() ? blocks[blockIndex]->rank() : 0) << "\n";
  //   }

  {
    size_t origMemory = sizeof(ResultType) * testDofCount * trialDofCount;
    size_t ahmedMemory = sizeH(bemBlclusterTree.get(), blocks.get());
    int maximumRank = Hmax_rank(bemBlclusterTree.get(), blocks.get());
    if (verbosityAtLeastDefault)
      std::cout << "\nNeeded storage: " << ahmedMemory / 1024. / 1024.
                << " MB.\n"
                << "Without approximation: " << origMemory / 1024. / 1024.
                << " MB.\n"
                << "Compressed to " << (100. * ahmedMemory) / origMemory
                << "%.\n"
                << "Maximum rank: " << maximumRank << ".\n" << std::endl;

    if (acaOptions.outputPostscript) {
      if (verbosityAtLeastDefault)
        std::cout << "Writing matrix partition ..." << std::flush;
      std::ofstream os(acaOptions.outputFname.c_str());
      if (symmetric) // seems valid also for Hermitian matrices
        psoutputHeH(os, bemBlclusterTree.get(), testDofCount, blocks.get());
      else
        psoutputGeH(os, bemBlclusterTree.get(), testDofCount, blocks.get());
      os.close();
      if (verbosityAtLeastDefault)
        std::cout << " done." << std::endl;
    }
  }

  int outSymmetry = NO_SYMMETRY;
  if (symmetric) {
    outSymmetry = SYMMETRIC;
    if (!boost::is_complex<ResultType>())
      outSymmetry |= HERMITIAN;
  }
  std::auto_ptr<DiscreteAcaLinOp> acaOp(
      new DiscreteAcaLinOp(testDofCount, trialDofCount,
                           acaOptions.eps, acaOptions.maximumRank,
                           outSymmetry, bemBlclusterTree, blocks,
                           *trial_o2pPermutation, *test_o2pPermutation,
                           parallelOptions));
  std::auto_ptr<DiscreteBndOp> result;
  if (indexWithGlobalDofs)
    result = acaOp;
  else {
#ifdef WITH_TRILINOS
    // Without Trilinos, this code will never be reached -- an exception
    // will have been thrown earlier in this function
    typedef DiscreteBoundaryOperatorComposition<ResultType> DiscreteBndOpComp;
    shared_ptr<DiscreteBndOp> acaOpShared(acaOp.release());
    shared_ptr<DiscreteBndOp> trialGlobalToLocal =
        constructOperatorMappingGlobalToFlatLocalDofs<
            BasisFunctionType, ResultType>(trialSpace);
    shared_ptr<DiscreteBndOp> testLocalToGlobal =
        constructOperatorMappingFlatLocalToGlobalDofs<
            BasisFunctionType, ResultType>(testSpace);
    shared_ptr<DiscreteBndOp> tmp(
        new DiscreteBndOpComp(acaOpShared, trialGlobalToLocal));
    result.reset(new DiscreteBndOpComp(testLocalToGlobal, tmp));
#endif // WITH_TRILINOS
  }
  return result;

#else // without Ahmed
  throw std::runtime_error("AcaGlobalAssembler::assembleDetachedWeakForm(): "
                           "To enable assembly in ACA mode, recompile BEM++ "
                           "with the symbol WITH_AHMED defined.");
#endif // WITH_AHMED
}
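// Illustrative note (not part of the original source) on the storage
// statistics reported above: ACA keeps each admissible m x n block in
// low-rank outer-product form U * V^H of rank k, costing k * (m + n)
// entries instead of m * n; the "Compressed to ... %" line measures this
// ratio (via sizeH) over the whole matrix. For a single hypothetical block:
//
//   size_t m = 1000, n = 1000, k = 10;
//   size_t denseEntries = m * n;                        // 1000000
//   size_t acaEntries = k * (m + n);                    // 20000
//   double percent = 100. * acaEntries / denseEntries;  // 2%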