shared_ptr<const DiscreteBoundaryOperator<ResultType>>
assembleDenseBlock(
    int rowStart, int rowEnd, int colStart, int colEnd,
    const Space<BasisFunctionType> &testSpace,
    const Space<BasisFunctionType> &trialSpace,
    Fiber::LocalAssemblerForIntegralOperators<ResultType>& assembler,
    const ParameterList &parameterList) {

  int numberOfRows = rowEnd - rowStart;
  int numberOfColumns = colEnd - colStart;

  if (colEnd > trialSpace.globalDofCount() ||
      rowEnd > testSpace.globalDofCount() || colStart < 0 || rowStart < 0)
    throw std::runtime_error("DenseGlobalBlockAssember::assembleWeakForm(): "
                             "Indices out of bounds");

  Context<BasisFunctionType, ResultType> context(parameterList);
  const AssemblyOptions &options = context.assemblyOptions();

  // Create the operator's matrix
  Matrix<ResultType> result(numberOfRows, numberOfColumns);
  result.setZero();

  std::unordered_map<int, std::vector<GlobalDofIndex>> trialIndexMap,
      testIndexMap;
  std::unordered_map<int, std::vector<BasisFunctionType>> trialDofWeights,
      testDofWeights;

  gatherElementInformation(colStart, colEnd, trialSpace, trialIndexMap,
                           trialDofWeights);
  gatherElementInformation(rowStart, rowEnd, testSpace, testIndexMap,
                           testDofWeights);

  std::vector<int> testIndices;
  testIndices.reserve(testIndexMap.size());
  for (const auto &p : testIndexMap)
    testIndices.push_back(p.first);

  typedef DenseWeakFormAssemblerLoopBody<BasisFunctionType, ResultType> Body;
  typename Body::MutexType mutex;

  {
    Fiber::SerialBlasRegion region;
    tbb::parallel_for_each(trialIndexMap.begin(), trialIndexMap.end(),
                           Body(rowStart, colStart, testIndices, testIndexMap,
                                trialIndexMap, testDofWeights, trialDofWeights,
                                assembler, result, mutex));
  }

  // Create and return a discrete operator represented by the matrix that
  // has just been calculated
  return shared_ptr<DiscreteBoundaryOperator<ResultType>>(
      new DiscreteDenseBoundaryOperator<ResultType>(result));
}
Beispiel #2
0
Vector<ResultType> reallyCalculateProjections(
    const Space<BasisFunctionType> &dualSpace,
    Fiber::LocalAssemblerForGridFunctions<ResultType> &assembler,
    const AssemblyOptions &options) {
  // TODO: parallelise using TBB (the parameter options will then start be used)

  // Get the grid's leaf view so that we can iterate over elements
  const GridView &view = dualSpace.gridView();
  const size_t elementCount = view.entityCount(0);

  // Global DOF indices corresponding to local DOFs on elements
  std::vector<std::vector<GlobalDofIndex>> testGlobalDofs(elementCount);
  std::vector<std::vector<BasisFunctionType>> testLocalDofWeights(elementCount);

  // Gather global DOF lists
  const Mapper &mapper = view.elementMapper();
  std::unique_ptr<EntityIterator<0>> it = view.entityIterator<0>();
  while (!it->finished()) {
    const Entity<0> &element = it->entity();
    const int elementIndex = mapper.entityIndex(element);
    dualSpace.getGlobalDofs(element, testGlobalDofs[elementIndex],
                            testLocalDofWeights[elementIndex]);
    it->next();
  }

  // Make a vector of all element indices
  std::vector<int> testIndices(elementCount);
  for (size_t i = 0; i < elementCount; ++i)
    testIndices[i] = i;

  // Create the weak form's column vector
  Vector<ResultType> result(dualSpace.globalDofCount());
  result.setZero();

  std::vector<Vector<ResultType>> localResult;
  // Evaluate local weak forms
  assembler.evaluateLocalWeakForms(testIndices, localResult);

  // Loop over test indices
  for (size_t testIndex = 0; testIndex < elementCount; ++testIndex)
    // Add the integrals to appropriate entries in the global weak form
    for (size_t testDof = 0; testDof < testGlobalDofs[testIndex].size();
         ++testDof) {
      int testGlobalDof = testGlobalDofs[testIndex][testDof];
      if (testGlobalDof >= 0) // if it's negative, it means that this
                              // local dof is constrained (not used)
        result(testGlobalDof) +=
            conj(testLocalDofWeights[testIndex][testDof]) *
            localResult[testIndex](testDof);
    }

  // Return the vector of projections <phi_i, f>
  return result;
}
std::auto_ptr<DiscreteBoundaryOperator<ResultType> >
DenseGlobalAssembler<BasisFunctionType, ResultType>::
assembleDetachedWeakForm(
        const Space<BasisFunctionType>& testSpace,
        const Space<BasisFunctionType>& trialSpace,
        LocalAssemblerForIntegralOperators& assembler,
        const Context<BasisFunctionType, ResultType>& context)
{
    const AssemblyOptions& options = context.assemblyOptions();

    // Global DOF indices corresponding to local DOFs on elements
    std::vector<std::vector<GlobalDofIndex> > testGlobalDofs, trialGlobalDofs;
    std::vector<std::vector<BasisFunctionType> > testLocalDofWeights,
        trialLocalDofWeights;
    gatherGlobalDofs(testSpace, testGlobalDofs, testLocalDofWeights);
    if (&testSpace == &trialSpace) {
        trialGlobalDofs = testGlobalDofs;
        trialLocalDofWeights = testLocalDofWeights;
    } else
        gatherGlobalDofs(trialSpace, trialGlobalDofs, trialLocalDofWeights);
    const size_t testElementCount = testGlobalDofs.size();
    const size_t trialElementCount = trialGlobalDofs.size();

    // Make a vector of all element indices
    std::vector<int> testIndices(testElementCount);
    for (int i = 0; i < testElementCount; ++i)
        testIndices[i] = i;

    // Create the operator's matrix
    arma::Mat<ResultType> result(testSpace.globalDofCount(),
                                 trialSpace.globalDofCount());
    result.fill(0.);

    typedef DenseWeakFormAssemblerLoopBody<BasisFunctionType, ResultType> Body;
    typename Body::MutexType mutex;

    const ParallelizationOptions& parallelOptions =
            options.parallelizationOptions();
    int maxThreadCount = 1;
    if (!parallelOptions.isOpenClEnabled()) {
        if (parallelOptions.maxThreadCount() == ParallelizationOptions::AUTO)
            maxThreadCount = tbb::task_scheduler_init::automatic;
        else
            maxThreadCount = parallelOptions.maxThreadCount();
    }
    tbb::task_scheduler_init scheduler(maxThreadCount);
    {
        Fiber::SerialBlasRegion region;
        tbb::parallel_for(tbb::blocked_range<size_t>(0, trialElementCount),
                          Body(testIndices, testGlobalDofs, trialGlobalDofs,
                               testLocalDofWeights, trialLocalDofWeights,
                               assembler, result, mutex));
    }

    //// Old serial code (TODO: decide whether to keep it behind e.g. #ifndef PARALLEL)
    //    std::vector<arma::Mat<ValueType> > localResult;
    //    // Loop over trial elements
    //    for (int trialIndex = 0; trialIndex < trialElementCount; ++trialIndex)
    //    {
    //        // Evaluate integrals over pairs of the current trial element and
    //        // all the test elements
    //        assembler.evaluateLocalWeakForms(TEST_TRIAL, testIndices, trialIndex,
    //                                         ALL_DOFS, localResult);

    //        // Loop over test indices
    //        for (int testIndex = 0; testIndex < testElementCount; ++testIndex)
    //            // Add the integrals to appropriate entries in the operator's matrix
    //            for (int trialDof = 0; trialDof < trialGlobalDofs[trialIndex].size(); ++trialDof)
    //                for (int testDof = 0; testDof < testGlobalDofs[testIndex].size(); ++testDof)
    //                result(testGlobalDofs[testIndex][testDof],
    //                       trialGlobalDofs[trialIndex][trialDof]) +=
    //                        localResult[testIndex](testDof, trialDof);
    //    }

    // Create and return a discrete operator represented by the matrix that
    // has just been calculated
    return std::auto_ptr<DiscreteBoundaryOperator<ResultType> >(
                new DiscreteDenseBoundaryOperator<ResultType>(result));
}
std::auto_ptr<DiscreteBoundaryOperator<ResultType> >
AcaGlobalAssembler<BasisFunctionType, ResultType>::assembleDetachedWeakForm(
        const Space<BasisFunctionType>& testSpace,
        const Space<BasisFunctionType>& trialSpace,
        const std::vector<LocalAssembler*>& localAssemblers,
        const std::vector<const DiscreteBndOp*>& sparseTermsToAdd,
        const std::vector<ResultType>& denseTermsMultipliers,
        const std::vector<ResultType>& sparseTermsMultipliers,
        const AssemblyOptions& options,
        int symmetry)
{
#ifdef WITH_AHMED
    typedef AhmedDofWrapper<CoordinateType> AhmedDofType;
    typedef ExtendedBemCluster<AhmedDofType> AhmedBemCluster;
    typedef bemblcluster<AhmedDofType, AhmedDofType> AhmedBemBlcluster;
    typedef DiscreteAcaBoundaryOperator<ResultType> DiscreteAcaLinOp;

    const AcaOptions& acaOptions = options.acaOptions();
    const bool indexWithGlobalDofs = acaOptions.globalAssemblyBeforeCompression;
    const bool verbosityAtLeastDefault =
            (options.verbosityLevel() >= VerbosityLevel::DEFAULT);
    const bool verbosityAtLeastHigh =
            (options.verbosityLevel() >= VerbosityLevel::HIGH);

    // Currently we don't support Hermitian ACA operators. This is because we
    // don't have the means to really test them -- we would need complex-valued
    // basis functions for that. (Assembly of such a matrix would be very easy
    // -- just change complex_sym from true to false in the call to apprx_sym()
    // in AcaWeakFormAssemblerLoopBody::operator() -- but operations on
    // symmetric/Hermitian matrices are not always trivial and we do need to be
    // able to test them properly.)
    bool symmetric = symmetry & SYMMETRIC;
    if (symmetry & HERMITIAN && !(symmetry & SYMMETRIC) &&
            verbosityAtLeastDefault)
        std::cout << "Warning: assembly of non-symmetric Hermitian H-matrices "
                     "is not supported yet. A general H-matrix will be assembled"
                  << std::endl;

#ifndef WITH_TRILINOS
    if (!indexWithGlobalDofs)
        throw std::runtime_error("AcaGlobalAssembler::assembleDetachedWeakForm(): "
                                 "ACA assembly with globalAssemblyBeforeCompression "
                                 "set to false requires BEM++ to be linked with "
                                 "Trilinos");
#endif // WITH_TRILINOS

    const size_t testDofCount = indexWithGlobalDofs ?
                testSpace.globalDofCount() : testSpace.flatLocalDofCount();
    const size_t trialDofCount = indexWithGlobalDofs ?
                trialSpace.globalDofCount() : trialSpace.flatLocalDofCount();

    if (symmetric && testDofCount != trialDofCount)
        throw std::invalid_argument("AcaGlobalAssembler::assembleDetachedWeakForm(): "
                                    "you cannot generate a symmetric weak form "
                                    "using test and trial spaces with different "
                                    "numbers of DOFs");

    // o2p: map of original indices to permuted indices
    // p2o: map of permuted indices to original indices
    typedef ClusterConstructionHelper<BasisFunctionType> CCH;
    shared_ptr<AhmedBemCluster> testClusterTree;
    shared_ptr<IndexPermutation> test_o2pPermutation, test_p2oPermutation;
    CCH::constructBemCluster(testSpace, indexWithGlobalDofs, acaOptions,
                             testClusterTree,
                             test_o2pPermutation, test_p2oPermutation);
    shared_ptr<AhmedBemCluster> trialClusterTree;
    shared_ptr<IndexPermutation> trial_o2pPermutation, trial_p2oPermutation;
    if (symmetric || &testSpace == &trialSpace) {
        trialClusterTree = testClusterTree;
        trial_o2pPermutation = test_o2pPermutation;
        trial_p2oPermutation = test_p2oPermutation;
    } else
        CCH::constructBemCluster(trialSpace, indexWithGlobalDofs, acaOptions,
                                 trialClusterTree,
                                 trial_o2pPermutation, trial_p2oPermutation);

//    // Export VTK plots showing the disctribution of leaf cluster ids
//    std::vector<unsigned int> testClusterIds;
//    getClusterIds(*testClusterTree, test_p2oPermutation->permutedIndices(), testClusterIds);
//    testSpace.dumpClusterIds("testClusterIds", testClusterIds,
//                             indexWithGlobalDofs ? GLOBAL_DOFS : FLAT_LOCAL_DOFS);
//    std::vector<unsigned int> trialClusterIds;
//    getClusterIds(*trialClusterTree, trial_p2oPermutation->permutedIndices(), trialClusterIds);
//    trialSpace.dumpClusterIds("trialClusterIds", trialClusterIds,
//                              indexWithGlobalDofs ? GLOBAL_DOFS : FLAT_LOCAL_DOFS);

    if (verbosityAtLeastHigh)
        std::cout << "Test cluster count: " << testClusterTree->getncl()
                  << "\nTrial cluster count: " << trialClusterTree->getncl()
                  << std::endl;

    unsigned int blockCount = 0;
    shared_ptr<AhmedBemBlcluster> bemBlclusterTree(
                CCH::constructBemBlockCluster(acaOptions, symmetric,
                                              *testClusterTree, *trialClusterTree,
                                              blockCount).release());

    if (verbosityAtLeastHigh)
        std::cout << "Mblock count: " << blockCount << std::endl;

    std::vector<unsigned int> p2oTestDofs =
        test_p2oPermutation->permutedIndices();
    std::vector<unsigned int> p2oTrialDofs =
        trial_p2oPermutation->permutedIndices();
    WeakFormAcaAssemblyHelper<BasisFunctionType, ResultType>
        helper(testSpace, trialSpace, p2oTestDofs, p2oTrialDofs,
               localAssemblers, sparseTermsToAdd,
               denseTermsMultipliers, sparseTermsMultipliers, options);

    typedef mblock<typename AhmedTypeTraits<ResultType>::Type> AhmedMblock;
    boost::shared_array<AhmedMblock*> blocks =
            allocateAhmedMblockArray<ResultType>(bemBlclusterTree.get());

    // matgen_sqntl(helper, AhmedBemBlclusterTree.get(), AhmedBemBlclusterTree.get(),
    //              acaOptions.recompress, acaOptions.eps,
    //              acaOptions.maximumRank, blocks.get());

    // matgen_omp(helper, blockCount, AhmedBemBlclusterTree.get(),
    //            acaOptions.eps, acaOptions.maximumRank, blocks.get());

    // // Dump mblocks
    // const int mblockCount = AhmedBemBlclusterTree->nleaves();
    // for (int i = 0; i < mblockCount; ++i)
    //     if (blocks[i]->isdns())
    //     {
    //         char  buffer[1024];
    //         sprintf(buffer, "mblock-dns-%d-%d.txt",
    //                 blocks[i]->getn1(), blocks[i]->getn2());
    //         arma::Col<ResultType> block((ResultType*)blocks[i]->getdata(),
    //                                     blocks[i]->nvals());
    //         arma::diskio::save_raw_ascii(block, buffer);
    //     }
    //     else
    //     {
    //         char buffer[1024];
    //         sprintf(buffer, "mblock-lwr-%d-%d.txt",
    //                 blocks[i]->getn1(), blocks[i]->getn2());
    //         arma::Col<ResultType> block((ResultType*)blocks[i]->getdata(),
    //                                     blocks[i]->nvals());
    //         arma::diskio::save_raw_ascii(block, buffer);
    //     }

    AhmedLeafClusterArray leafClusters(bemBlclusterTree.get());
    leafClusters.sortAccordingToClusterSize();
    const size_t leafClusterCount = leafClusters.size();

    const ParallelizationOptions& parallelOptions =
            options.parallelizationOptions();
    int maxThreadCount = 1;
    if (!parallelOptions.isOpenClEnabled())
    {
        if (parallelOptions.maxThreadCount() == ParallelizationOptions::AUTO)
            maxThreadCount = tbb::task_scheduler_init::automatic;
        else
            maxThreadCount = parallelOptions.maxThreadCount();
    }
    tbb::task_scheduler_init scheduler(maxThreadCount);
    tbb::atomic<size_t> done;
    done = 0;

    std::vector<ChunkStatistics> chunkStats(leafClusterCount);

    //    typedef AcaWeakFormAssemblerLoopBody<BasisFunctionType, ResultType> Body;
    //    // std::cout << "Loop start" << std::endl;
    //    tbb::tick_count loopStart = tbb::tick_count::now();
    // //    tbb::parallel_for(tbb::blocked_range<size_t>(0, leafClusterCount),
    // //                      Body(helper, leafClusters, blocks, acaOptions, done
    // //                           , chunkStats));
    //    tbb::parallel_for(ScatteredRange(0, leafClusterCount),
    //                      Body(helper, leafClusters, blocks, acaOptions, done
    //                           , chunkStats));
    //    tbb::tick_count loopEnd = tbb::tick_count::now();
    //    // std::cout << "Loop end" << std::endl;

    typedef AcaWeakFormAssemblerLoopBody<BasisFunctionType, ResultType> Body;
    typename Body::LeafClusterIndexQueue leafClusterIndexQueue;
    for (size_t i = 0; i < leafClusterCount; ++i)
        leafClusterIndexQueue.push(i);

    if (verbosityAtLeastDefault)
        std::cout << "About to start the ACA assembly loop" << std::endl;
    tbb::tick_count loopStart = tbb::tick_count::now();
    {
        Fiber::SerialBlasRegion region; // if possible, ensure that BLAS is single-threaded
        tbb::parallel_for(tbb::blocked_range<size_t>(0, leafClusterCount),
                          Body(helper, leafClusters, blocks, acaOptions, done,
                               verbosityAtLeastDefault,
                               leafClusterIndexQueue, symmetric, chunkStats));
    }
    tbb::tick_count loopEnd = tbb::tick_count::now();
    if (verbosityAtLeastDefault) {
        std::cout << "\n"; // the progress bar doesn't print the final \n
        std::cout << "ACA loop took " << (loopEnd - loopStart).seconds() << " s"
                  << std::endl;
    }

    // TODO: parallelise!
    if (acaOptions.recompress) {
        if (verbosityAtLeastDefault)
            std::cout << "About to start ACA agglomeration" << std::endl;
        agglH(bemBlclusterTree.get(), blocks.get(),
              acaOptions.eps, acaOptions.maximumRank);
        if (verbosityAtLeastDefault)
            std::cout << "Agglomeration finished" << std::endl;
    }

    // // Dump timing data of individual chunks
    //    std::cout << "\nChunks:\n";
    //    for (int i = 0; i < leafClusterCount; ++i)
    //        if (chunkStats[i].valid) {
    //            int blockIndex = leafClusters[i]->getidx();
    //            std::cout << chunkStats[i].chunkStart << "\t"
    //                      << chunkStats[i].chunkSize << "\t"
    //                      << (chunkStats[i].startTime - loopStart).seconds() << "\t"
    //                      << (chunkStats[i].endTime - loopStart).seconds() << "\t"
    //                      << (chunkStats[i].endTime - chunkStats[i].startTime).seconds() << "\t"
    //                      << blocks[blockIndex]->getn1() << "\t"
    //                      << blocks[blockIndex]->getn2() << "\t"
    //                      << blocks[blockIndex]->islwr() << "\t"
    //                      << (blocks[blockIndex]->islwr() ? blocks[blockIndex]->rank() : 0) << "\n";
    //        }

    {
        size_t origMemory = sizeof(ResultType) * testDofCount * trialDofCount;
        size_t ahmedMemory = sizeH(bemBlclusterTree.get(), blocks.get());
        int maximumRank = Hmax_rank(bemBlclusterTree.get(), blocks.get());
        if (verbosityAtLeastDefault)
            std::cout << "\nNeeded storage: "
                      << ahmedMemory / 1024. / 1024. << " MB.\n"
                      << "Without approximation: "
                      << origMemory / 1024. / 1024. << " MB.\n"
                      << "Compressed to "
                      << (100. * ahmedMemory) / origMemory << "%.\n"
                      << "Maximum rank: " << maximumRank << ".\n"
                      << std::endl;

        if (acaOptions.outputPostscript) {
            if (verbosityAtLeastDefault)
                std::cout << "Writing matrix partition ..." << std::flush;
            std::ofstream os(acaOptions.outputFname.c_str());
            if (symmetric) // seems valid also for Hermitian matrices
                psoutputHeH(os, bemBlclusterTree.get(), testDofCount, blocks.get());
            else
                psoutputGeH(os, bemBlclusterTree.get(), testDofCount, blocks.get());
            os.close();
            if (verbosityAtLeastDefault)
                std::cout << " done." << std::endl;
        }
    }

    int outSymmetry = NO_SYMMETRY;
    if (symmetric) {
        outSymmetry = SYMMETRIC;
        if (!boost::is_complex<ResultType>())
            outSymmetry |= HERMITIAN;
    }
    std::auto_ptr<DiscreteAcaLinOp> acaOp(
                new DiscreteAcaLinOp(testDofCount, trialDofCount,
                                     acaOptions.eps,
                                     acaOptions.maximumRank,
                                     outSymmetry,
                                     bemBlclusterTree, blocks,
                                     *trial_o2pPermutation,
                                     *test_o2pPermutation,
                                     parallelOptions));

    std::auto_ptr<DiscreteBndOp> result;
    if (indexWithGlobalDofs)
        result = acaOp;
    else {
#ifdef WITH_TRILINOS
        // without Trilinos, this code will never be reached -- an exception
        // will be thrown earlier in this function
        typedef DiscreteBoundaryOperatorComposition<ResultType> DiscreteBndOpComp;
        shared_ptr<DiscreteBndOp> acaOpShared(acaOp.release());
        shared_ptr<DiscreteBndOp> trialGlobalToLocal =
                constructOperatorMappingGlobalToFlatLocalDofs<
                BasisFunctionType, ResultType>(trialSpace);
        shared_ptr<DiscreteBndOp> testLocalToGlobal =
                constructOperatorMappingFlatLocalToGlobalDofs<
                BasisFunctionType, ResultType>(testSpace);
        shared_ptr<DiscreteBndOp> tmp(
                    new DiscreteBndOpComp(acaOpShared, trialGlobalToLocal));
        result.reset(new DiscreteBndOpComp(testLocalToGlobal, tmp));
#endif // WITH_TRILINOS
    }
    return result;

#else // without Ahmed
    throw std::runtime_error("AcaGlobalAssembler::assembleDetachedWeakForm(): "
                             "To enable assembly in ACA mode, recompile BEM++ "
                             "with the symbol WITH_AHMED defined.");
#endif // WITH_AHMED
}