Solution<BasisFunctionType, ResultType>
DefaultIterativeSolver<BasisFunctionType, ResultType>::solveImplNonblocked(
        const GridFunction<BasisFunctionType, ResultType>& rhs) const
{
    // Solve the system defined by the single (non-blocked) boundary operator
    // held in m_impl->op for the right-hand side `rhs`, and return the
    // computed grid function together with the status reported by the
    // underlying solver wrapper.
    //
    // Throws std::logic_error if m_impl->op does not hold a BoundaryOperator
    // (i.e. the solver was set up for a blocked operator).

    typedef BoundaryOperator<BasisFunctionType, ResultType> Operator;
    typedef GridFunction<BasisFunctionType, ResultType> GridFunctionType;
    typedef Solution<BasisFunctionType, ResultType> SolutionType;
    typedef Solver<BasisFunctionType, ResultType> SolverType;
    typedef typename ScalarTraits<ResultType>::RealType Magnitude;
    typedef Thyra::MultiVectorBase<ResultType> ThyraVector;

    // The pointer form of boost::get yields a null pointer (instead of
    // throwing) when the variant holds a different alternative.
    const Operator* op = boost::get<Operator>(&m_impl->op);
    if (!op)
        throw std::logic_error(
            "DefaultIterativeSolver::solve(): for solvers constructed "
            "from a BlockedBoundaryOperator the other solve() overload "
            "must be used");
    SolverType::checkConsistency(*op, rhs, m_impl->mode);

    // --- Right-hand side ---------------------------------------------------
    // Start from the projections of rhs on the operator's dual-to-range
    // space.
    Vector<ResultType> rhsProjections(rhs.projections(*op->dualToRange()));
    Teuchos::RCP<ThyraVector> rhsVector;
    if (m_impl->mode !=
            ConvergenceTestMode::TEST_CONVERGENCE_IN_DUAL_TO_RANGE) {
        // Map the projections through the weak form of m_impl->pinvId into
        // a freshly allocated vector with one entry per global DOF of the
        // range space (alpha = 1, beta = 0: plain application, no
        // accumulation into the target).
        const size_t rangeDofCount = op->range()->globalDofCount();
        rhsVector.reset(new Vector<ResultType>(rangeDofCount));
        boost::get<Operator>(m_impl->pinvId).weakForm()->apply(
            Thyra::NOTRANS, rhsProjections, rhsVector.ptr(), 1., 0.);
    } else {
        // Use the projections directly. The RCP only refers to the local
        // vector, which stays alive until the end of this function.
        rhsVector = Teuchos::rcpFromRef(rhsProjections);
    }

    // --- Initial guess / solution storage ----------------------------------
    // Zero-initialised coefficient vector of the same dimension as the rhs.
    // NOTE(review): wrapInTrilinosVector presumably exposes this Armadillo
    // storage to Trilinos without copying, so that the solver writes its
    // result into solutionCoefficients -- confirm against its definition.
    arma::Col<ResultType> solutionCoefficients(rhsVector->range()->dim());
    solutionCoefficients.fill(static_cast<ResultType>(0.));
    Teuchos::RCP<ThyraVector> solutionVector =
        wrapInTrilinosVector(solutionCoefficients);

    // --- Thread count ------------------------------------------------------
    // One thread when OpenCL is enabled; otherwise honour the user setting,
    // translating AUTO into TBB's "automatic" constant.
    Fiber::ParallelizationOptions parallelOptions =
        op->context()->assemblyOptions().parallelizationOptions();
    int threadCount = 1;
    if (!parallelOptions.isOpenClEnabled()) {
        if (parallelOptions.maxThreadCount() != ParallelizationOptions::AUTO)
            threadCount = parallelOptions.maxThreadCount();
        else
            threadCount = tbb::task_scheduler_init::automatic;
    }

    // --- Solve -------------------------------------------------------------
    Thyra::SolveStatus<Magnitude> solveStatus;
    {
        // Keep one TBB scheduler alive for the whole solve so that threads
        // are not created and destroyed on every matrix-vector product.
        tbb::task_scheduler_init tbbScheduler(threadCount);
        solveStatus = m_impl->solverWrapper->solve(
            Thyra::NOTRANS, *rhsVector, solutionVector.ptr());
    }

    // --- Package the result ------------------------------------------------
    // The solution lives in the operator's domain space.
    return SolutionType(
        GridFunctionType(op->context(), op->domain(), solutionCoefficients),
        solveStatus);
}