Пример #1
0
// Build a StringGraph from the records of a sequence file.
// Every record the reader yields becomes one vertex (record id + sequence
// string); no edges are added. The graph is allocated with new and handed
// back as a raw pointer - nothing in this function frees it.
StringGraph* SGUtil::loadFASTA(const std::string& filename)
{
    StringGraph* pResult = new StringGraph;
    SeqReader seqSource(filename);

    // Keep pulling records until the reader reports that none are left
    for(SeqRecord current; seqSource.get(current); )
    {
        // Vertices are placement-constructed in the graph's own allocator
        Vertex* pNode = new(pResult->getVertexAllocator()) Vertex(current.id, current.seq.toString());
        pResult->addVertex(pNode);
    }

    return pResult;
}
Пример #2
0
// Load a string graph from an ASQG file.
//
// The file is read line by line; each line is classified with
// ASQG::getRecordType and must respect the section order
// header -> vertex -> edge. A record that shows up after the parser has
// moved past its section is reported on stderr (with its 1-based line
// number) and the process exits with EXIT_FAILURE.
//
//   filename          - passed to createReader() to open the input
//   minOverlap        - edge records whose overlap reports a
//                       getMinOverlapLength() below this value are skipped
//   allowContainments - forwarded unchanged to
//                       SGAlgorithms::createEdgesFromOverlap
//
// Returns the graph (allocated with new, not freed here) after the
// duplicate-edge visitor and the stats visitor have been run over it.
StringGraph* SGUtil::loadASQG(const std::string& filename, const unsigned int minOverlap,
                              bool allowContainments)
{
    // Initialize graph
    StringGraph* pGraph = new StringGraph;

    std::istream* pReader = createReader(filename);

    // stage tracks which section we are in: 0 = header, 1 = vertices, 2 = edges
    int stage = 0;

    // Number of the record currently being processed. It is incremented
    // before the record is handled so diagnostics report 1-based line
    // numbers, matching what a text editor displays.
    int line = 0;
    std::string recordLine;
    while(getline(*pReader, recordLine))
    {
        ++line;

        ASQG::RecordType rt = ASQG::getRecordType(recordLine);
        switch(rt)
        {
            case ASQG::RT_HEADER:
            {
                // A header is only legal before any vertex or edge record
                if(stage != 0)
                {
                    std::cerr << "Error: Unexpected header record found at line " << line << "\n";
                    exit(EXIT_FAILURE);
                }

                ASQG::HeaderRecord headerRecord(recordLine);

                // Minimum overlap: fall back to 0 when the tag is absent
                const SQG::IntTag& overlapTag = headerRecord.getOverlapTag();
                if(overlapTag.isInitialized())
                    pGraph->setMinOverlap(overlapTag.get());
                else
                    pGraph->setMinOverlap(0);

                // Error rate: only set when the tag is present
                const SQG::FloatTag& errorRateTag = headerRecord.getErrorRateTag();
                if(errorRateTag.isInitialized())
                    pGraph->setErrorRate(errorRateTag.get());

                const SQG::IntTag& containmentTag = headerRecord.getContainmentTag();
                if(containmentTag.isInitialized())
                    pGraph->setContainmentFlag(containmentTag.get());
                else
                    pGraph->setContainmentFlag(true); // conservatively assume containments are present

                // A missing transitive tag is tolerated: warn and set the flag to true
                const SQG::IntTag& transitiveTag = headerRecord.getTransitiveTag();
                if(!transitiveTag.isInitialized())
                {
                    std::cerr << "Warning: ASQG does not have transitive tag\n";
                    pGraph->setTransitiveFlag(true);
                }
                else
                {
                    pGraph->setTransitiveFlag(transitiveTag.get());
                }

                break;
            }
            case ASQG::RT_VERTEX:
            {
                // progress the stage if we are done the header
                if(stage == 0)
                    stage = 1;

                // Vertex records are rejected once edge records have started
                if(stage != 1)
                {
                    std::cerr << "Error: Unexpected vertex record found at line " << line << "\n";
                    exit(EXIT_FAILURE);
                }

                ASQG::VertexRecord vertexRecord(recordLine);
                const SQG::IntTag& ssTag = vertexRecord.getSubstringTag();

                Vertex* pVertex = new(pGraph->getVertexAllocator()) Vertex(vertexRecord.getID(), vertexRecord.getSeq());
                if(ssTag.isInitialized() && ssTag.get() == 1)
                {
                    // Vertex is a substring of some other vertex, mark it as contained
                    pVertex->setContained(true);
                    pGraph->setContainmentFlag(true);
                }
                pGraph->addVertex(pVertex);
                break;
            }
            case ASQG::RT_EDGE:
            {
                // The first edge record closes the vertex section
                if(stage == 1)
                    stage = 2;

                // An edge seen while still in the header stage (no vertex
                // record yet) is also rejected here
                if(stage != 2)
                {
                    std::cerr << "Error: Unexpected edge record found at line " << line << "\n";
                    exit(EXIT_FAILURE);
                }

                ASQG::EdgeRecord edgeRecord(recordLine);
                const Overlap& ovr = edgeRecord.getOverlap();

                // Add the edge to the graph, unless the overlap is shorter than requested
                if(ovr.match.getMinOverlapLength() >= static_cast<int>(minOverlap))
                    SGAlgorithms::createEdgesFromOverlap(pGraph, ovr, allowContainments);
                break;
            }
        }
    }

    // Parsing is finished, the reader is no longer needed
    delete pReader;

    // Remove any duplicate edges
    SGDuplicateVisitor dupVisit;
    pGraph->visit(dupVisit);

    SGGraphStatsVisitor statsVisit;
    pGraph->visit(statsVisit);

    // The following clean-up passes are currently disabled:
    // Remove identical vertices
    // This is much cheaper to do than remove via
    // SGContainRemove as no remodelling needs to occur
    /*
    SGIdenticalRemoveVisitor irv;
    pGraph->visit(irv);

    // Remove substring vertices
    while(pGraph->hasContainment())
    {
        SGContainRemoveVisitor crv;
        pGraph->visit(crv);
    }
    */
    return pGraph;
}
// Run the bubble construction process.
//
// A de Bruijn graph is grown outwards from m_startingKmer in both the sense
// and antisense directions, using extension counts taken from the variant
// index. A newly created vertex whose sequence occurs at least
// MIN_TARGET_COUNT times in the base index is recorded as a join vertex for
// its direction and is not extended further. Afterwards, walks are searched
// between every (antisense join, sense join) pair and the string of each
// walk found is appended to out_haplotypes.
//
//   out_haplotypes - receives the haplotype strings; existing entries are kept
//
// Returns HBRC_OK (the only return path in this function).
HaplotypeBuilderReturnCode DeBruijnHaplotypeBuilder::run(StringVector& out_haplotypes)
{
    // NOTE(review): this profiling label names a different function than the
    // one it is placed in. It is left unchanged so existing profile keys do
    // not shift - confirm whether it should be updated.
    PROFILE_FUNC("GraphCompare::buildVariantStringGraph")
    assert(!m_startingKmer.empty());

    // We search until we find the first common vertex in each direction
    const size_t MIN_TARGET_COUNT = m_parameters.bReferenceMode ? 1 : 2;
    const size_t MAX_ITERATIONS = 2000;
    const size_t MAX_SIMULTANEOUS_BRANCHES = 40;
    const size_t MAX_TOTAL_BRANCHES = 50;

    // Search bookkeeping used by the loop's stopping conditions
    size_t total_branches = 0;
    size_t iterations = 0;

    // Initialize the graph
    StringGraph* pGraph = new StringGraph;
    BuilderExtensionQueue queue;

    Vertex* pStartVertex = new(pGraph->getVertexAllocator()) Vertex(m_startingKmer, m_startingKmer);
    pStartVertex->setColor(GC_BLACK);
    pGraph->addVertex(pStartVertex);

    // Add the vertex to the extension queue, once per direction
    queue.push(BuilderExtensionNode(pStartVertex, ED_SENSE));
    queue.push(BuilderExtensionNode(pStartVertex, ED_ANTISENSE));

    std::vector<Vertex*> sense_join_vector;
    std::vector<Vertex*> antisense_join_vector;

    // Perform the extension. The while conditions are heuristics to avoid searching
    // the graph too much: stop when the queue is empty, the iteration limit is hit,
    // too many branches are pending at once, or too many branches were taken overall
    while(!queue.empty() && iterations++ < MAX_ITERATIONS && queue.size() < MAX_SIMULTANEOUS_BRANCHES && total_branches < MAX_TOTAL_BRANCHES)
    {
        BuilderExtensionNode curr = queue.front();
        queue.pop();

        // Calculate de Bruijn extensions for this node
        std::string vertStr = curr.pVertex->getSeq().toString();
        AlphaCount64 extensionCounts = BWTAlgorithms::calculateDeBruijnExtensionsSingleIndex(vertStr, m_parameters.variantIndex.pBWT, curr.direction);

        // Number of bases accepted as extensions of this node
        size_t numExtensionsUsed = 0;
        for(size_t i = 0; i < DNA_ALPHABET::size; ++i)
        {
            char b = DNA_ALPHABET::getBase(i);
            size_t count = extensionCounts.get(b);
            bool acceptExt = count >= m_parameters.minDBGCount;
            if(!acceptExt)
                continue;

            ++numExtensionsUsed;
            std::string newStr = VariationBuilderCommon::makeDeBruijnVertex(vertStr, b, curr.direction);

            // Create the new vertex and edge in the graph
            // Skip if the vertex already exists (the extension still counts as used)
            if(pGraph->getVertex(newStr) != NULL)
                continue;

            // Allocate the new vertex and add it to the graph
            Vertex* pNewVertex = new(pGraph->getVertexAllocator()) Vertex(newStr, newStr);
            pNewVertex->setColor(GC_BLACK);
            pGraph->addVertex(pNewVertex);

            // Add edges
            VariationBuilderCommon::addSameStrandDeBruijnEdges(pGraph, curr.pVertex, pNewVertex, curr.direction);

            // Check if this sequence is present in the FM-index of the target
            // If so, it is the join point of the de Bruijn graph and we extend no further.
            size_t targetCount = BWTAlgorithms::countSequenceOccurrences(newStr, m_parameters.baseIndex);

            if(targetCount >= MIN_TARGET_COUNT)
            {
                if(curr.direction == ED_SENSE)
                    sense_join_vector.push_back(pNewVertex);
                else
                    antisense_join_vector.push_back(pNewVertex);
            }
            else
            {
                // Add the vertex to the extension queue
                queue.push(BuilderExtensionNode(pNewVertex, curr.direction));
            }
        }

        // Update the total number of times the search branched: a node with
        // n accepted extensions contributes n - 1 branches (guarded so the
        // unsigned subtraction cannot wrap when n is 0)
        if(numExtensionsUsed > 0)
            total_branches += numExtensionsUsed - 1;
    }

    // If the graph construction was successful, walk the graph
    // between the endpoints to make a string
    // Generate haplotypes between every pair of antisense/sense join vertices
    for(size_t i = 0; i < antisense_join_vector.size(); ++i) {
        for(size_t j = 0; j < sense_join_vector.size(); ++j) {
            SGWalkVector outWalks;
            SGSearch::findWalks(antisense_join_vector[i],
                                sense_join_vector[j],
                                ED_SENSE,
                                100000, // max distance to search
                                10000, // max nodes to search
                                true, // exhaustive search
                                outWalks);

            for(size_t k = 0; k < outWalks.size(); ++k)
                out_haplotypes.push_back(outWalks[k].getString(SGWT_START_TO_END));
        }
    }

    // The graph is only needed to produce the walk strings
    delete pGraph;
    return HBRC_OK;
}