// Visitor callback that dumps a vertex sequence as a FASTA record.
// Vertices colored GC_BLACK are ignored. Any other vertex whose sequence
// length is at least m_minLength is written through m_pWriter under the
// name "unplaced-<k>", where <k> is the running m_numUnplaced counter.
// Always returns false (presumably signalling "graph not modified" to the
// caller -- confirm against the visitor interface).
bool visit(StringGraph*, Vertex* pVertex)
{
    const bool eligible = pVertex->getColor() != GC_BLACK &&
                          static_cast<int>(pVertex->getSeqLen()) >= m_minLength;
    if(eligible)
    {
        // Build the record name from the current counter, then advance the
        // counter before the record is emitted.
        std::ostringstream recordName;
        recordName << "unplaced-" << m_numUnplaced;
        ++m_numUnplaced;

        writeFastaRecord(m_pWriter, recordName.str(), pVertex->getSeq().toString());
    }
    return false;
}
// write the unplaced sequences of length at least minLength using pWriter
void MapSequenceCollection::writeUnplaced(std::ostream* pWriter, int minLength)
{
    int numUnplaced = 0;
    for(SMPMap::iterator iter = m_map.begin(); iter != m_map.end(); ++iter)
    {
        if(!iter->second.isPlaced && (int)iter->second.sequence.size() >= minLength)
        {
            std::stringstream idss;
            idss << "unplaced-" << numUnplaced++;
            writeFastaRecord(pWriter, idss.str(), iter->second.sequence);
        }
    }
}
예제 #3
0
bool SGSmoothingVisitor::visit(StringGraph* pGraph, Vertex* pVertex)
{
    (void)pGraph;
    if(pVertex->getColor() == GC_RED)
        return false;

    bool found = false;
    for(size_t idx = 0; idx < ED_COUNT; idx++)
    {
        EdgeDir dir = EDGE_DIRECTIONS[idx];
        EdgePtrVec edges = pVertex->getEdges(dir);
        if(edges.size() <= 1)
            continue;

        for(size_t i = 0; i < edges.size(); ++i)
        {
            if(edges[i]->getEnd()->getColor() == GC_RED)
                return false;
        }

        //std::cout << "Smoothing " << pVertex->getID() << "\n";

        const int MAX_WALKS = 10;
        const int MAX_DISTANCE = 5000;
        bool bIsDegenerate = false;
        bool bFailGapCheck = false;
        bool bFailDivergenceCheck = false;
        bool bFailIndelSizeCheck = false;

        SGWalkVector variantWalks;
        SGSearch::findVariantWalks(pVertex, dir, MAX_DISTANCE, MAX_WALKS, variantWalks);

        if(variantWalks.size() > 0)
        {
            found = true;
            size_t selectedIdx = -1;
            size_t selectedCoverage = 0;

            // Calculate the minimum amount overlapped on the start/end vertex.
            // This is used to properly extract the sequences from walks that represent the variation.
            int minOverlapX = std::numeric_limits<int>::max();
            int minOverlapY = std::numeric_limits<int>::max();

            for(size_t i = 0; i < variantWalks.size(); ++i)
            {
                if(variantWalks[i].getNumEdges() <= 1)
                    bIsDegenerate = true;

                // Calculate the walk coverage using the internal vertices of the walk. 
                // The walk with the highest coverage will be retained
                size_t walkCoverage = 0;
                for(size_t j = 1; j < variantWalks[i].getNumVertices() - 1; ++j)
                    walkCoverage += variantWalks[i].getVertex(j)->getCoverage();

                if(walkCoverage > selectedCoverage || selectedCoverage == 0)
                {
                    selectedIdx = i;
                    selectedCoverage = walkCoverage;
                }
                
                Edge* pFirstEdge = variantWalks[i].getFirstEdge();
                Edge* pLastEdge = variantWalks[i].getLastEdge();

                if((int)pFirstEdge->getMatchLength() < minOverlapX)
                    minOverlapX = pFirstEdge->getMatchLength();

                if((int)pLastEdge->getTwin()->getMatchLength() < minOverlapY)
                    minOverlapY = pLastEdge->getTwin()->getMatchLength();
            }

            // Calculate the strings for each walk that represent the region of variation
            StringVector walkStrings;
            for(size_t i = 0; i < variantWalks.size(); ++i)
            {
                Vertex* pStartVertex = variantWalks[i].getStartVertex();
                Vertex* pLastVertex = variantWalks[i].getLastVertex();
                assert(pStartVertex != NULL && pLastVertex != NULL);
                
                std::string full = variantWalks[i].getString(SGWT_START_TO_END);
                int posStart = 0;
                int posEnd = 0;

                if(dir == ED_ANTISENSE)
                {
                    // pLast   -----------
                    // pStart          ------------
                    // full    --------------------
                    // out             ----
                    posStart = pLastVertex->getSeqLen() - minOverlapY;
                    posEnd = full.size() - (pStartVertex->getSeqLen() - minOverlapX);
                }
                else
                {
                    // pStart         --------------
                    // pLast   -----------
                    // full    ---------------------
                    // out            ----
                    posStart = pStartVertex->getSeqLen() - minOverlapX; // match start position
                    posEnd = full.size() - (pLastVertex->getSeqLen() - minOverlapY); // match end position
                }
                
                std::string out;
                if(posEnd > posStart)
                    out = full.substr(posStart, posEnd - posStart);
                walkStrings.push_back(out);
            }

            assert(selectedIdx != (size_t)-1);
            SGWalk& selectedWalk = variantWalks[selectedIdx];
            assert(selectedWalk.isIndexed());

            // Check the divergence of the other walks to this walk
            StringVector cigarStrings;
            std::vector<int> maxIndel;
            std::vector<double> gapPercent; // percentage of matching that is gaps
            std::vector<double> totalPercent; // percent of total alignment that is mismatch or gap

            cigarStrings.resize(variantWalks.size());
            gapPercent.resize(variantWalks.size());
            totalPercent.resize(variantWalks.size());
            maxIndel.resize(variantWalks.size());

            for(size_t i = 0; i < variantWalks.size(); ++i)
            {
                if(i == selectedIdx)
                    continue;

                // We want to compute the total gap length, total mismatches and percent
                // divergence between the two paths.
                int matchLen = 0;
                int totalDiff = 0;
                int gapLength = 0;
                int maxGapLength = 0;
                // We have to handle the degenerate case where one internal string has zero length
                // this can happen when there is an isolated insertion/deletion and the walks are like:
                // x -> y -> z
                // x -> z
                if(walkStrings[selectedIdx].empty() || walkStrings[i].empty())
                {
                    matchLen = std::max(walkStrings[selectedIdx].size(), walkStrings[i].size());
                    totalDiff = matchLen;
                    gapLength = matchLen;
                }
                else
                {
                    AlnAln *aln_global;
                    aln_global = aln_stdaln(walkStrings[selectedIdx].c_str(), walkStrings[i].c_str(), &aln_param_blast, 1, 1);

                    // Calculate the alignment parameters
                    while(aln_global->outm[matchLen] != '\0')
                    {
                        if(aln_global->outm[matchLen] == ' ')
                            totalDiff += 1;
                        matchLen += 1;
                    }

                    std::stringstream cigarSS;
                    for (int j = 0; j != aln_global->n_cigar; ++j)
                    {
                        char cigarOp = "MID"[aln_global->cigar32[j]&0xf];
                        int cigarLen = aln_global->cigar32[j]>>4;
                        if(cigarOp == 'I' || cigarOp == 'D')
                        {
                            gapLength += cigarLen;
                            if(gapLength > maxGapLength)
                                maxGapLength = gapLength;
                        }

                        cigarSS << cigarLen;
                        cigarSS << cigarOp;
                    }
                    cigarStrings[i] = cigarSS.str();
                    aln_free_AlnAln(aln_global);
                }

                double percentDiff = (double)totalDiff / matchLen;
                double percentGap = (double)gapLength / matchLen;

                if(percentDiff > m_maxTotalDivergence)
                    bFailDivergenceCheck = true;
                
                if(percentGap > m_maxGapDivergence)
                    bFailGapCheck = true;

                if(maxGapLength > m_maxIndelLength)
                    bFailIndelSizeCheck = true;

                gapPercent[i] = percentGap;
                totalPercent[i] = percentDiff;
                maxIndel[i] = maxGapLength;
            }

            if(bIsDegenerate || bFailGapCheck || bFailDivergenceCheck || bFailIndelSizeCheck)
                continue;

            // Write the selected path to the variants file as variant 0
            int variantIdx = 0;
            std::string selectedSequence = selectedWalk.getString(SGWT_START_TO_END);
            std::stringstream ss;
            ss << "variant-" << m_numRemovedTotal << "/" << variantIdx++;
            writeFastaRecord(&m_outFile, ss.str(), selectedSequence);


            // The vertex set for each walk is not necessarily disjoint,
            // the selected walk may contain vertices that are part
            // of other paths. We handle this be initially marking all
            // vertices of the 
            for(size_t i = 0; i < variantWalks.size(); ++i)
            {
                if(i == selectedIdx)
                    continue;

                SGWalk& currWalk = variantWalks[i];
                for(size_t j = 0; j < currWalk.getNumEdges() - 1; ++j)
                {
                    Edge* currEdge = currWalk.getEdge(j);
                    
                    // If the vertex is also on the selected path, do not mark it
                    Vertex* currVertex = currEdge->getEnd();
                    if(!selectedWalk.containsVertex(currVertex->getID()))
                    {
                        currEdge->getEnd()->setColor(GC_RED);
                    }
                }

                // Write the variant to a file
                std::string variantSequence = currWalk.getString(SGWT_START_TO_END);
                std::stringstream variantID;
                std::stringstream ss;
                ss << "variant-" << m_numRemovedTotal << "/" << variantIdx++;
                ss << " IGD:" << (double)gapPercent[i] << " ITD:" << totalPercent[i] << " MID: " << maxIndel[i] << " InternalCigar:" << cigarStrings[i];
                writeFastaRecord(&m_outFile, ss.str(), variantSequence);
            }

            if(variantWalks.size() == 2)
                m_simpleBubblesRemoved += 1;
            else
                m_complexBubblesRemoved += 1;
            ++m_numRemovedTotal;
        }
    }