// Visitor callback. When the vertex is not colored GC_BLACK and its sequence
// length is at least m_minLength, the sequence is passed to writeFastaRecord
// together with m_pWriter and the record name "unplaced-<n>", where <n> is the
// value of m_numUnplaced before it is incremented for this record.
// The StringGraph argument is unused. Always returns false.
bool visit(StringGraph*, Vertex* pVertex)
{
    // Short-circuit keeps the length query from running on GC_BLACK vertices.
    const bool shouldWrite = pVertex->getColor() != GC_BLACK
                             && static_cast<int>(pVertex->getSeqLen()) >= m_minLength;

    if(shouldWrite)
    {
        std::stringstream recordName;
        recordName << "unplaced-" << m_numUnplaced;
        ++m_numUnplaced;
        writeFastaRecord(m_pWriter, recordName.str(), pVertex->getSeq().toString());
    }

    return false;
}
// write the unplaced sequences of length at least minLength using pWriter void MapSequenceCollection::writeUnplaced(std::ostream* pWriter, int minLength) { int numUnplaced = 0; for(SMPMap::iterator iter = m_map.begin(); iter != m_map.end(); ++iter) { if(!iter->second.isPlaced && (int)iter->second.sequence.size() >= minLength) { std::stringstream idss; idss << "unplaced-" << numUnplaced++; writeFastaRecord(pWriter, idss.str(), iter->second.sequence); } } }
bool SGSmoothingVisitor::visit(StringGraph* pGraph, Vertex* pVertex) { (void)pGraph; if(pVertex->getColor() == GC_RED) return false; bool found = false; for(size_t idx = 0; idx < ED_COUNT; idx++) { EdgeDir dir = EDGE_DIRECTIONS[idx]; EdgePtrVec edges = pVertex->getEdges(dir); if(edges.size() <= 1) continue; for(size_t i = 0; i < edges.size(); ++i) { if(edges[i]->getEnd()->getColor() == GC_RED) return false; } //std::cout << "Smoothing " << pVertex->getID() << "\n"; const int MAX_WALKS = 10; const int MAX_DISTANCE = 5000; bool bIsDegenerate = false; bool bFailGapCheck = false; bool bFailDivergenceCheck = false; bool bFailIndelSizeCheck = false; SGWalkVector variantWalks; SGSearch::findVariantWalks(pVertex, dir, MAX_DISTANCE, MAX_WALKS, variantWalks); if(variantWalks.size() > 0) { found = true; size_t selectedIdx = -1; size_t selectedCoverage = 0; // Calculate the minimum amount overlapped on the start/end vertex. // This is used to properly extract the sequences from walks that represent the variation. int minOverlapX = std::numeric_limits<int>::max(); int minOverlapY = std::numeric_limits<int>::max(); for(size_t i = 0; i < variantWalks.size(); ++i) { if(variantWalks[i].getNumEdges() <= 1) bIsDegenerate = true; // Calculate the walk coverage using the internal vertices of the walk. 
// The walk with the highest coverage will be retained size_t walkCoverage = 0; for(size_t j = 1; j < variantWalks[i].getNumVertices() - 1; ++j) walkCoverage += variantWalks[i].getVertex(j)->getCoverage(); if(walkCoverage > selectedCoverage || selectedCoverage == 0) { selectedIdx = i; selectedCoverage = walkCoverage; } Edge* pFirstEdge = variantWalks[i].getFirstEdge(); Edge* pLastEdge = variantWalks[i].getLastEdge(); if((int)pFirstEdge->getMatchLength() < minOverlapX) minOverlapX = pFirstEdge->getMatchLength(); if((int)pLastEdge->getTwin()->getMatchLength() < minOverlapY) minOverlapY = pLastEdge->getTwin()->getMatchLength(); } // Calculate the strings for each walk that represent the region of variation StringVector walkStrings; for(size_t i = 0; i < variantWalks.size(); ++i) { Vertex* pStartVertex = variantWalks[i].getStartVertex(); Vertex* pLastVertex = variantWalks[i].getLastVertex(); assert(pStartVertex != NULL && pLastVertex != NULL); std::string full = variantWalks[i].getString(SGWT_START_TO_END); int posStart = 0; int posEnd = 0; if(dir == ED_ANTISENSE) { // pLast ----------- // pStart ------------ // full -------------------- // out ---- posStart = pLastVertex->getSeqLen() - minOverlapY; posEnd = full.size() - (pStartVertex->getSeqLen() - minOverlapX); } else { // pStart -------------- // pLast ----------- // full --------------------- // out ---- posStart = pStartVertex->getSeqLen() - minOverlapX; // match start position posEnd = full.size() - (pLastVertex->getSeqLen() - minOverlapY); // match end position } std::string out; if(posEnd > posStart) out = full.substr(posStart, posEnd - posStart); walkStrings.push_back(out); } assert(selectedIdx != (size_t)-1); SGWalk& selectedWalk = variantWalks[selectedIdx]; assert(selectedWalk.isIndexed()); // Check the divergence of the other walks to this walk StringVector cigarStrings; std::vector<int> maxIndel; std::vector<double> gapPercent; // percentage of matching that is gaps std::vector<double> totalPercent; // 
percent of total alignment that is mismatch or gap cigarStrings.resize(variantWalks.size()); gapPercent.resize(variantWalks.size()); totalPercent.resize(variantWalks.size()); maxIndel.resize(variantWalks.size()); for(size_t i = 0; i < variantWalks.size(); ++i) { if(i == selectedIdx) continue; // We want to compute the total gap length, total mismatches and percent // divergence between the two paths. int matchLen = 0; int totalDiff = 0; int gapLength = 0; int maxGapLength = 0; // We have to handle the degenerate case where one internal string has zero length // this can happen when there is an isolated insertion/deletion and the walks are like: // x -> y -> z // x -> z if(walkStrings[selectedIdx].empty() || walkStrings[i].empty()) { matchLen = std::max(walkStrings[selectedIdx].size(), walkStrings[i].size()); totalDiff = matchLen; gapLength = matchLen; } else { AlnAln *aln_global; aln_global = aln_stdaln(walkStrings[selectedIdx].c_str(), walkStrings[i].c_str(), &aln_param_blast, 1, 1); // Calculate the alignment parameters while(aln_global->outm[matchLen] != '\0') { if(aln_global->outm[matchLen] == ' ') totalDiff += 1; matchLen += 1; } std::stringstream cigarSS; for (int j = 0; j != aln_global->n_cigar; ++j) { char cigarOp = "MID"[aln_global->cigar32[j]&0xf]; int cigarLen = aln_global->cigar32[j]>>4; if(cigarOp == 'I' || cigarOp == 'D') { gapLength += cigarLen; if(gapLength > maxGapLength) maxGapLength = gapLength; } cigarSS << cigarLen; cigarSS << cigarOp; } cigarStrings[i] = cigarSS.str(); aln_free_AlnAln(aln_global); } double percentDiff = (double)totalDiff / matchLen; double percentGap = (double)gapLength / matchLen; if(percentDiff > m_maxTotalDivergence) bFailDivergenceCheck = true; if(percentGap > m_maxGapDivergence) bFailGapCheck = true; if(maxGapLength > m_maxIndelLength) bFailIndelSizeCheck = true; gapPercent[i] = percentGap; totalPercent[i] = percentDiff; maxIndel[i] = maxGapLength; } if(bIsDegenerate || bFailGapCheck || bFailDivergenceCheck || 
bFailIndelSizeCheck) continue; // Write the selected path to the variants file as variant 0 int variantIdx = 0; std::string selectedSequence = selectedWalk.getString(SGWT_START_TO_END); std::stringstream ss; ss << "variant-" << m_numRemovedTotal << "/" << variantIdx++; writeFastaRecord(&m_outFile, ss.str(), selectedSequence); // The vertex set for each walk is not necessarily disjoint, // the selected walk may contain vertices that are part // of other paths. We handle this be initially marking all // vertices of the for(size_t i = 0; i < variantWalks.size(); ++i) { if(i == selectedIdx) continue; SGWalk& currWalk = variantWalks[i]; for(size_t j = 0; j < currWalk.getNumEdges() - 1; ++j) { Edge* currEdge = currWalk.getEdge(j); // If the vertex is also on the selected path, do not mark it Vertex* currVertex = currEdge->getEnd(); if(!selectedWalk.containsVertex(currVertex->getID())) { currEdge->getEnd()->setColor(GC_RED); } } // Write the variant to a file std::string variantSequence = currWalk.getString(SGWT_START_TO_END); std::stringstream variantID; std::stringstream ss; ss << "variant-" << m_numRemovedTotal << "/" << variantIdx++; ss << " IGD:" << (double)gapPercent[i] << " ITD:" << totalPercent[i] << " MID: " << maxIndel[i] << " InternalCigar:" << cigarStrings[i]; writeFastaRecord(&m_outFile, ss.str(), variantSequence); } if(variantWalks.size() == 2) m_simpleBubblesRemoved += 1; else m_complexBubblesRemoved += 1; ++m_numRemovedTotal; } }