std::unique_ptr<DiscreteBoundaryOperator<ResultType>>
ElementaryLocalOperator<BasisFunctionType, ResultType>::
    assembleWeakFormInSparseMode(LocalAssembler &assembler,
                                 const AssemblyOptions &options) const {
  if (boost::is_complex<BasisFunctionType>::value)
    throw std::runtime_error(
        "ElementaryLocalOperator::assembleWeakFormInSparseMode(): "
        "sparse-mode assembly of identity operators for "
        "complex-valued basis functions is not supported yet");

  const Space<BasisFunctionType> &testSpace = *this->dualToRange();
  const Space<BasisFunctionType> &trialSpace = *this->domain();

  // Fill local submatrices
  const GridView &view = testSpace.gridView();
  const size_t elementCount = view.entityCount(0);
  std::vector<int> elementIndices(elementCount);
  for (size_t i = 0; i < elementCount; ++i)
    elementIndices[i] = i;
  std::vector<Matrix<ResultType>> localResult;
  assembler.evaluateLocalWeakForms(elementIndices, localResult);

  // Global DOF indices corresponding to local DOFs on elements
  std::vector<std::vector<GlobalDofIndex>> testGdofs(elementCount);
  std::vector<std::vector<GlobalDofIndex>> trialGdofs(elementCount);
  std::vector<std::vector<BasisFunctionType>> testLdofWeights(elementCount);
  std::vector<std::vector<BasisFunctionType>> trialLdofWeights(elementCount);
  gatherGlobalDofs(testSpace, trialSpace, testGdofs, trialGdofs,
                   testLdofWeights, trialLdofWeights);

  // Multiply matrix entries by DOF weights
  for (size_t e = 0; e < elementCount; ++e)
    for (size_t trialDof = 0; trialDof < trialGdofs[e].size(); ++trialDof)
      for (size_t testDof = 0; testDof < testGdofs[e].size(); ++testDof)
        localResult[e](testDof, trialDof) *=
            conj(testLdofWeights[e][testDof]) * trialLdofWeights[e][trialDof];

  const int testGlobalDofCount = testSpace.globalDofCount();
  const int trialGlobalDofCount = trialSpace.globalDofCount();

  shared_ptr<RealSparseMatrix> result = boost::make_shared<RealSparseMatrix>(
      testGlobalDofCount, trialGlobalDofCount);

  std::vector<Eigen::Triplet<double>> triplets;
  generateTriplets<ResultType>(*result, testGdofs, trialGdofs, localResult,
                               elementCount, triplets);
  result->setFromTriplets(triplets.begin(), triplets.end());

  // Create and return a discrete operator represented by the matrix that
  // has just been calculated
  return std::unique_ptr<DiscreteBoundaryOperator<ResultType>>(
      new DiscreteSparseBoundaryOperator<ResultType>(result, this->symmetry(),
                                                     NO_TRANSPOSE));
}
std::unique_ptr<DiscreteBoundaryOperator<ResultType>>
ElementaryLocalOperator<BasisFunctionType, ResultType>::
    assembleWeakFormInDenseMode(LocalAssembler &assembler,
                                const AssemblyOptions &options) const {
  const Space<BasisFunctionType> &testSpace = *this->dualToRange();
  const Space<BasisFunctionType> &trialSpace = *this->domain();

  // Fill local submatrices
  const GridView &view = testSpace.gridView();
  const size_t elementCount = view.entityCount(0);
  std::vector<int> elementIndices(elementCount);
  for (size_t i = 0; i < elementCount; ++i)
    elementIndices[i] = i;
  std::vector<Matrix<ResultType>> localResult;
  assembler.evaluateLocalWeakForms(elementIndices, localResult);

  // Create the operator's matrix
  Matrix<ResultType> result(testSpace.globalDofCount(),
                            trialSpace.globalDofCount());
  result.setZero();

  // Retrieve global DOFs corresponding to local DOFs on all elements
  std::vector<std::vector<GlobalDofIndex>> testGdofs(elementCount);
  std::vector<std::vector<GlobalDofIndex>> trialGdofs(elementCount);
  std::vector<std::vector<BasisFunctionType>> testLdofWeights(elementCount);
  std::vector<std::vector<BasisFunctionType>> trialLdofWeights(elementCount);
  gatherGlobalDofs(testSpace, trialSpace, testGdofs, trialGdofs,
                   testLdofWeights, trialLdofWeights);

  // Distribute local matrices into the global matrix
  for (size_t e = 0; e < elementCount; ++e)
    for (size_t trialIndex = 0; trialIndex < trialGdofs[e].size();
         ++trialIndex) {
      int trialGdof = trialGdofs[e][trialIndex];
      if (trialGdof < 0)
        continue;
      for (size_t testIndex = 0; testIndex < testGdofs[e].size(); ++testIndex) {
        int testGdof = testGdofs[e][testIndex];
        if (testGdof < 0)
          continue;
        result(testGdof, trialGdof) += conj(testLdofWeights[e][testIndex]) *
                                       trialLdofWeights[e][trialIndex] *
                                       localResult[e](testIndex, trialIndex);
      }
    }

  return std::unique_ptr<DiscreteBoundaryOperator<ResultType>>(
      new DiscreteDenseBoundaryOperator<ResultType>(result));
}
std::unique_ptr<DiscreteBoundaryOperator<ResultType>>
ElementaryLocalOperator<BasisFunctionType, ResultType>::
    assembleWeakFormInSparseMode(LocalAssembler &assembler,
                                 const AssemblyOptions &options) const {
#ifdef WITH_TRILINOS
  if (boost::is_complex<BasisFunctionType>::value)
    throw std::runtime_error(
        "ElementaryLocalOperator::assembleWeakFormInSparseMode(): "
        "sparse-mode assembly of identity operators for "
        "complex-valued basis functions is not supported yet");

  const Space<BasisFunctionType> &testSpace = *this->dualToRange();
  const Space<BasisFunctionType> &trialSpace = *this->domain();

  // Fill local submatrices
  const GridView &view = testSpace.gridView();
  const size_t elementCount = view.entityCount(0);
  std::vector<int> elementIndices(elementCount);
  for (size_t i = 0; i < elementCount; ++i)
    elementIndices[i] = i;
  std::vector<arma::Mat<ResultType>> localResult;
  assembler.evaluateLocalWeakForms(elementIndices, localResult);

  // Global DOF indices corresponding to local DOFs on elements
  std::vector<std::vector<GlobalDofIndex>> testGdofs(elementCount);
  std::vector<std::vector<GlobalDofIndex>> trialGdofs(elementCount);
  std::vector<std::vector<BasisFunctionType>> testLdofWeights(elementCount);
  std::vector<std::vector<BasisFunctionType>> trialLdofWeights(elementCount);
  gatherGlobalDofs(testSpace, trialSpace, testGdofs, trialGdofs,
                   testLdofWeights, trialLdofWeights);

  // Multiply matrix entries by DOF weights
  for (size_t e = 0; e < elementCount; ++e)
    for (size_t trialDof = 0; trialDof < trialGdofs[e].size(); ++trialDof)
      for (size_t testDof = 0; testDof < testGdofs[e].size(); ++testDof)
        localResult[e](testDof, trialDof) *=
            conj(testLdofWeights[e][testDof]) * trialLdofWeights[e][trialDof];

  // Estimate number of entries in each row

  //    This will be useful when we begin to use MPI
  //    // Get global DOF indices for which this process is responsible
  //    const int testGlobalDofCount = testSpace.globalDofCount();
  //    Epetra_Map rowMap(testGlobalDofCount, 0 /* index-base */, comm);
  //    std::vector<int> myTestGlobalDofs(rowMap.MyGlobalElements(),
  //                                      rowMap.MyGlobalElements() +
  //                                      rowMap.NumMyElements());
  //    const int myTestGlobalDofCount = myTestGlobalDofs.size();

  const int testGlobalDofCount = testSpace.globalDofCount();
  const int trialGlobalDofCount = trialSpace.globalDofCount();
  arma::Col<int> nonzeroEntryCountEstimates(testGlobalDofCount);
  nonzeroEntryCountEstimates.fill(0);

  // Upper estimate for the number of global trial DOFs coupled to a given
  // global test DOF: sum of the local trial DOF counts for each element that
  // contributes to the global test DOF in question
  for (size_t e = 0; e < elementCount; ++e)
    for (size_t testLdof = 0; testLdof < testGdofs[e].size(); ++testLdof) {
      int testGdof = testGdofs[e][testLdof];
      if (testGdof >= 0)
        nonzeroEntryCountEstimates(testGdof) += trialGdofs[e].size();
    }

  Epetra_SerialComm comm; // To be replaced once we begin to use MPI
  Epetra_LocalMap rowMap(testGlobalDofCount, 0 /* index_base */, comm);
  Epetra_LocalMap colMap(trialGlobalDofCount, 0 /* index_base */, comm);
  shared_ptr<Epetra_FECrsMatrix> result =
      boost::make_shared<Epetra_FECrsMatrix>(
          Copy, rowMap, colMap, nonzeroEntryCountEstimates.memptr());

  // TODO: make each process responsible for a subset of elements
  // Find maximum number of local dofs per element
  size_t maxLdofCount = 0;
  for (size_t e = 0; e < elementCount; ++e)
    maxLdofCount =
        std::max(maxLdofCount, testGdofs[e].size() * trialGdofs[e].size());

  // Initialise sparse matrix with zeros at required positions
  arma::Col<double> zeros(maxLdofCount);
  zeros.fill(0.);
  for (size_t e = 0; e < elementCount; ++e)
    result->InsertGlobalValues(testGdofs[e].size(), &testGdofs[e][0],
                               trialGdofs[e].size(), &trialGdofs[e][0],
                               zeros.memptr());
  // Add contributions from individual elements
  for (size_t e = 0; e < elementCount; ++e)
    epetraSumIntoGlobalValues(*result, testGdofs[e], trialGdofs[e],
                              localResult[e]);
  result->GlobalAssemble();

  // If assembly mode is equal to ACA and we have AHMED,
  // construct the block cluster tree. Otherwise leave it uninitialized.
  typedef ClusterConstructionHelper<BasisFunctionType> CCH;
  typedef AhmedDofWrapper<CoordinateType> AhmedDofType;
  typedef ExtendedBemCluster<AhmedDofType> AhmedBemCluster;
  typedef bbxbemblcluster<AhmedDofType, AhmedDofType> AhmedBemBlcluster;

  shared_ptr<AhmedBemBlcluster> blockCluster;
  shared_ptr<IndexPermutation> test_o2pPermutation, test_p2oPermutation;
  shared_ptr<IndexPermutation> trial_o2pPermutation, trial_p2oPermutation;
#ifdef WITH_AHMED
  if (options.assemblyMode() == AssemblyOptions::ACA) {
    const AcaOptions &acaOptions = options.acaOptions();
    bool indexWithGlobalDofs = acaOptions.mode != AcaOptions::HYBRID_ASSEMBLY;

    typedef ClusterConstructionHelper<BasisFunctionType> CCH;
    shared_ptr<AhmedBemCluster> testClusterTree;
    CCH::constructBemCluster(testSpace, indexWithGlobalDofs, acaOptions,
                             testClusterTree, test_o2pPermutation,
                             test_p2oPermutation);
    // TODO: construct a hermitian H-matrix if possible
    shared_ptr<AhmedBemCluster> trialClusterTree;
    CCH::constructBemCluster(trialSpace, indexWithGlobalDofs, acaOptions,
                             trialClusterTree, trial_o2pPermutation,
                             trial_p2oPermutation);
    unsigned int blockCount = 0;
    bool useStrongAdmissibilityCondition = !indexWithGlobalDofs;
    blockCluster.reset(CCH::constructBemBlockCluster(
        acaOptions, false /* hermitian */, *testClusterTree, *trialClusterTree,
        useStrongAdmissibilityCondition, blockCount).release());
  }
#endif

  // Create and return a discrete operator represented by the matrix that
  // has just been calculated
  return std::unique_ptr<DiscreteBoundaryOperator<ResultType>>(
      new DiscreteSparseBoundaryOperator<ResultType>(
          result, this->symmetry(), NO_TRANSPOSE, blockCluster,
          trial_o2pPermutation, test_o2pPermutation));
#else // WITH_TRILINOS
  throw std::runtime_error(
      "ElementaryLocalOperator::assembleWeakFormInSparseMode(): "
      "To enable assembly in sparse mode, recompile BEM++ "
      "with the symbol WITH_TRILINOS defined.");
#endif
}
std::auto_ptr<DiscreteBoundaryOperator<ResultType> >
DenseGlobalAssembler<BasisFunctionType, ResultType>::
assembleDetachedWeakForm(
        const Space<BasisFunctionType>& testSpace,
        const Space<BasisFunctionType>& trialSpace,
        LocalAssemblerForIntegralOperators& assembler,
        const Context<BasisFunctionType, ResultType>& context)
{
    const AssemblyOptions& options = context.assemblyOptions();

    // Global DOF indices corresponding to local DOFs on elements
    std::vector<std::vector<GlobalDofIndex> > testGlobalDofs, trialGlobalDofs;
    std::vector<std::vector<BasisFunctionType> > testLocalDofWeights,
        trialLocalDofWeights;
    gatherGlobalDofs(testSpace, testGlobalDofs, testLocalDofWeights);
    if (&testSpace == &trialSpace) {
        trialGlobalDofs = testGlobalDofs;
        trialLocalDofWeights = testLocalDofWeights;
    } else
        gatherGlobalDofs(trialSpace, trialGlobalDofs, trialLocalDofWeights);
    const size_t testElementCount = testGlobalDofs.size();
    const size_t trialElementCount = trialGlobalDofs.size();

    // Make a vector of all element indices
    std::vector<int> testIndices(testElementCount);
    for (int i = 0; i < testElementCount; ++i)
        testIndices[i] = i;

    // Create the operator's matrix
    arma::Mat<ResultType> result(testSpace.globalDofCount(),
                                 trialSpace.globalDofCount());
    result.fill(0.);

    typedef DenseWeakFormAssemblerLoopBody<BasisFunctionType, ResultType> Body;
    typename Body::MutexType mutex;

    const ParallelizationOptions& parallelOptions =
            options.parallelizationOptions();
    int maxThreadCount = 1;
    if (!parallelOptions.isOpenClEnabled()) {
        if (parallelOptions.maxThreadCount() == ParallelizationOptions::AUTO)
            maxThreadCount = tbb::task_scheduler_init::automatic;
        else
            maxThreadCount = parallelOptions.maxThreadCount();
    }
    tbb::task_scheduler_init scheduler(maxThreadCount);
    {
        Fiber::SerialBlasRegion region;
        tbb::parallel_for(tbb::blocked_range<size_t>(0, trialElementCount),
                          Body(testIndices, testGlobalDofs, trialGlobalDofs,
                               testLocalDofWeights, trialLocalDofWeights,
                               assembler, result, mutex));
    }

    //// Old serial code (TODO: decide whether to keep it behind e.g. #ifndef PARALLEL)
    //    std::vector<arma::Mat<ValueType> > localResult;
    //    // Loop over trial elements
    //    for (int trialIndex = 0; trialIndex < trialElementCount; ++trialIndex)
    //    {
    //        // Evaluate integrals over pairs of the current trial element and
    //        // all the test elements
    //        assembler.evaluateLocalWeakForms(TEST_TRIAL, testIndices, trialIndex,
    //                                         ALL_DOFS, localResult);

    //        // Loop over test indices
    //        for (int testIndex = 0; testIndex < testElementCount; ++testIndex)
    //            // Add the integrals to appropriate entries in the operator's matrix
    //            for (int trialDof = 0; trialDof < trialGlobalDofs[trialIndex].size(); ++trialDof)
    //                for (int testDof = 0; testDof < testGlobalDofs[testIndex].size(); ++testDof)
    //                result(testGlobalDofs[testIndex][testDof],
    //                       trialGlobalDofs[trialIndex][trialDof]) +=
    //                        localResult[testIndex](testDof, trialDof);
    //    }

    // Create and return a discrete operator represented by the matrix that
    // has just been calculated
    return std::auto_ptr<DiscreteBoundaryOperator<ResultType> >(
                new DiscreteDenseBoundaryOperator<ResultType>(result));
}