// Explore the neighborhood around a vertex looking for missing overlaps SGEdgeStatsVisitor::CandidateVector SGEdgeStatsVisitor::getMissingCandidates(StringGraph* /*pGraph*/, Vertex* pVertex, int minOverlap) const { CandidateVector out; // Mark the vertices that are reached from this vertex as black to indicate // they already are overlapping EdgePtrVec edges = pVertex->getEdges(); for(size_t i = 0; i < edges.size(); ++i) { edges[i]->getEnd()->setColor(GC_BLACK); } pVertex->setColor(GC_BLACK); for(size_t i = 0; i < edges.size(); ++i) { Edge* pXY = edges[i]; EdgePtrVec neighborEdges = pXY->getEnd()->getEdges(); for(size_t j = 0; j < neighborEdges.size(); ++j) { Edge* pYZ = neighborEdges[j]; if(pYZ->getEnd()->getColor() != GC_BLACK) { // Infer the overlap object from the edges Overlap ovrXY = pXY->getOverlap(); Overlap ovrYZ = pYZ->getOverlap(); if(SGAlgorithms::hasTransitiveOverlap(ovrXY, ovrYZ)) { Overlap ovr_xz = SGAlgorithms::inferTransitiveOverlap(ovrXY, ovrYZ); if(ovr_xz.match.getMinOverlapLength() >= minOverlap) { out.push_back(Candidate(pYZ->getEnd(), ovr_xz)); pYZ->getEnd()->setColor(GC_BLACK); } } } } } // Reset colors for(size_t i = 0; i < edges.size(); ++i) edges[i]->getEnd()->setColor(GC_WHITE); pVertex->setColor(GC_WHITE); for(size_t i = 0; i < out.size(); ++i) out[i].pEndpoint->setColor(GC_WHITE); return out; }
// Collect overlap statistics for a single vertex.
//
// Two tallies are updated through addOverlapToCount:
//   - foundCounts:   one entry per edge of pVertex, keyed by the overlap
//                    length and the number of differences in the overlap.
//   - missingCounts: one entry per overlap that can be inferred from the
//                    two-hop neighborhood (see getMissingCandidates) but has
//                    no edge, provided its error rate is below the graph's
//                    error rate.
//
// pGraph supplies the minimum overlap length and the error rate threshold.
// Always returns false.
bool SGEdgeStatsVisitor::visit(StringGraph* pGraph, Vertex* pVertex)
{
    const int MIN_OVERLAP = pGraph->getMinOverlap();
    const double MAX_ERROR = pGraph->getErrorRate();

    // Progress report. The counter is function-static, so it counts calls
    // across every visitor instance.
    // NOTE(review): the counter is unsynchronized - confirm this visitor is
    // only ever run from a single thread.
    static int visited = 0;
    ++visited;
    if(visited % 50000 == 0)
        std::cout << "visited: " << visited << "\n";

    // Add stats for the found overlaps
    EdgePtrVec edges = pVertex->getEdges();
    for(size_t i = 0; i < edges.size(); ++i)
    {
        Overlap ovr = edges[i]->getOverlap();
        int numDiff = ovr.match.countDifferences(pVertex->getStr(), edges[i]->getEnd()->getStr());
        int overlapLen = ovr.match.getMinOverlapLength();
        addOverlapToCount(overlapLen, numDiff, foundCounts);
    }

    // Explore the neighborhood around this vertex for potentially missing overlaps
    CandidateVector candidates = getMissingCandidates(pGraph, pVertex, MIN_OVERLAP);
    for(size_t i = 0; i < candidates.size(); ++i)
    {
        Candidate& c = candidates[i];
        int numDiff = c.ovr.match.countDifferences(pVertex->getStr(), c.pEndpoint->getStr());

        // The overlap length serves both as the error rate denominator and
        // as the key of the tally, so compute it once.
        int overlapLen = c.ovr.match.getMinOverlapLength();
        double error_rate = double(numDiff) / double(overlapLen);

        // Only overlaps accurate enough to pass the graph's error threshold
        // are counted as missing.
        if(error_rate < MAX_ERROR)
            addOverlapToCount(overlapLen, numDiff, missingCounts);
    }

    return false;
}
// Align the haplotype to the reference genome represented by the BWT/SSA pair void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k, const std::string& haplotype, const BWTIndexSet& referenceIndex, const ReadTable* pReferenceTable, HapgenAlignmentVector& outAlignments) { PROFILE_FUNC("HapgenUtil::alignHaplotypesToReferenceKmer") int64_t max_interval_size = 4; if(haplotype.size() < k) return; std::vector<int> event_count_vector; std::vector<HapgenAlignment> tmp_alignments; int min_events = std::numeric_limits<int>::max(); // Align forward and reverse haplotype to reference for(size_t i = 0; i <= 1; ++i) { bool is_reverse = i == 1; std::string query = is_reverse ? reverseComplement(haplotype) : haplotype; // Find shared kmers between the haplotype and the reference CandidateVector candidates; size_t nqk = query.size() - k + 1; for(size_t j = 0; j < nqk; ++j) { std::string kmer = query.substr(j, k); // Find the interval of this kmer in the reference BWTInterval interval = BWTAlgorithms::findInterval(referenceIndex, kmer); if(!interval.isValid() || interval.size() >= max_interval_size) continue; // not found or too repetitive // Extract the reference location of these hits for(int64_t k = interval.lower; k <= interval.upper; ++k) { SAElem elem = referenceIndex.pSSA->calcSA(k, referenceIndex.pBWT); // Make a candidate alignment CandidateKmerAlignment candidate; candidate.query_index = j; candidate.target_index = elem.getPos(); candidate.target_extrapolated_start = candidate.target_index - candidate.query_index; candidate.target_extrapolated_end = candidate.target_extrapolated_start + query.size(); candidate.target_sequence_id = elem.getID(); candidates.push_back(candidate); } } // Remove duplicate candidates std::sort(candidates.begin(), candidates.end(), CandidateKmerAlignment::sortByStart); CandidateVector::iterator new_end = std::unique(candidates.begin(), candidates.end(), CandidateKmerAlignment::equalByStart); candidates.resize(new_end - candidates.begin()); 
for(size_t j = 0; j < candidates.size(); ++j) { // Extract window around reference size_t window_size = 200; int ref_start = candidates[j].target_extrapolated_start - window_size; int ref_end = candidates[j].target_extrapolated_end + window_size; const SeqItem& ref_record = pReferenceTable->getRead(candidates[j].target_sequence_id); const DNAString& ref_sequence = ref_record.seq; if(ref_start < 0) ref_start = 0; if(ref_end > (int)ref_sequence.length()) ref_end = ref_sequence.length(); std::string ref_substring = ref_sequence.substr(ref_start, ref_end - ref_start); // Align haplotype to the reference SequenceOverlap overlap = alignHaplotypeToReference(ref_substring, query); if(overlap.score < 0 || !overlap.isValid()) continue; int alignment_start = ref_start + overlap.match[0].start; int alignment_end = ref_start + overlap.match[0].end; // inclusive int alignment_length = alignment_end - alignment_start + 1; // Crude count of the number of distinct variation events bool has_indel = false; int num_events = overlap.edit_distance; std::stringstream c_parser(overlap.cigar); int len; char t; while(c_parser >> len >> t) { assert(len > 0); // Only count one event per insertion/deletion if(t == 'D' || t == 'I') { num_events -= (len - 1); has_indel = true; } } // Skip poor alignments double mismatch_rate = 1.0f - (overlap.getPercentIdentity() / 100.f); if(mismatch_rate > 0.05f || overlap.total_columns < 50) { if(Verbosity::Instance().getPrintLevel() > 4) { printf("Haplotype Alignment - Ignoring low quality alignment (%.3lf, %dbp, %d events) to %s:%d\n", 1.0f - mismatch_rate, overlap.total_columns, num_events, ref_record.id.c_str(), ref_start); } continue; } bool is_snp = !has_indel && overlap.edit_distance == 1; HapgenAlignment aln(candidates[j].target_sequence_id, alignment_start, alignment_length, overlap.score, num_events, is_reverse, is_snp); tmp_alignments.push_back(aln); event_count_vector.push_back(num_events); if(Verbosity::Instance().getPrintLevel() > 4) { 
printf("Haplotype Alignment - Accepting alignment (%.3lf, %dbp, %d events) to %s:%d\n", 1.0f - mismatch_rate, overlap.total_columns, num_events, ref_record.id.c_str(), ref_start); } // Record the best edit distance if(num_events < min_events) min_events = num_events; } } // Copy the best alignments into the output int MAX_DIFF_TO_BEST = 10; int MAX_EVENTS = 8; assert(event_count_vector.size() == tmp_alignments.size()); for(size_t i = 0; i < event_count_vector.size(); ++i) { if(event_count_vector[i] <= MAX_EVENTS && event_count_vector[i] - min_events <= MAX_DIFF_TO_BEST) outAlignments.push_back(tmp_alignments[i]); else if(Verbosity::Instance().getPrintLevel() > 3) printf("Haplotype Alignment - Ignoring alignment with too many events (%d)\n", event_count_vector[i]); } }
// Align the haplotype to the reference genome represented by the BWT/SSA pair void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k, const std::string& haplotype, const BWTIndexSet& referenceIndex, const ReadTable* pReferenceTable, HapgenAlignmentVector& outAlignments) { PROFILE_FUNC("HapgenUtil::alignHaplotypesToReferenceKmer") int64_t max_interval_size = 4; if(haplotype.size() < k) return; std::vector<int> event_count_vector; std::vector<HapgenAlignment> tmp_alignments; int min_events = std::numeric_limits<int>::max(); // Align forward and reverse haplotype to reference for(size_t i = 0; i <= 1; ++i) { bool is_reverse = i == 1; std::string query = is_reverse ? reverseComplement(haplotype) : haplotype; // Find shared kmers between the haplotype and the reference CandidateVector candidates; size_t nqk = query.size() - k + 1; for(size_t j = 0; j < nqk; ++j) { std::string kmer = query.substr(j, k); // Find the interval of this kmer in the reference BWTInterval interval = BWTAlgorithms::findInterval(referenceIndex, kmer); if(!interval.isValid() || interval.size() >= max_interval_size) continue; // not found or too repetitive // Extract the reference location of these hits for(int64_t k = interval.lower; k <= interval.upper; ++k) { SAElem elem = referenceIndex.pSSA->calcSA(k, referenceIndex.pBWT); // Make a candidate alignment CandidateKmerAlignment candidate; candidate.query_index = j; candidate.target_index = elem.getPos(); candidate.target_extrapolated_start = candidate.target_index - candidate.query_index; candidate.target_extrapolated_end = candidate.target_extrapolated_start + query.size(); candidate.target_sequence_id = elem.getID(); candidates.push_back(candidate); } } // Remove duplicate candidates std::sort(candidates.begin(), candidates.end(), CandidateKmerAlignment::sortByStart); CandidateVector::iterator new_end = std::unique(candidates.begin(), candidates.end(), CandidateKmerAlignment::equalByStart); candidates.resize(new_end - candidates.begin()); 
for(size_t j = 0; j < candidates.size(); ++j) { // Extract window around reference size_t window_size = 200; int ref_start = candidates[j].target_extrapolated_start - window_size; int ref_end = candidates[j].target_extrapolated_end + window_size; const DNAString& ref_sequence = pReferenceTable->getRead(candidates[j].target_sequence_id).seq; if(ref_start < 0) ref_start = 0; if(ref_end > (int)ref_sequence.length()) ref_end = ref_sequence.length(); std::string ref_substring = ref_sequence.substr(ref_start, ref_end - ref_start); // Align haplotype to the reference SequenceOverlap overlap = Overlapper::computeOverlap(query, ref_substring); // Skip terrible alignments double percent_aligned = (double)overlap.getOverlapLength() / query.size(); if(percent_aligned < 0.95f) continue; /* // Skip alignments that are not full-length matches of the haplotype if(overlap.match[0].start != 0 || overlap.match[0].end != (int)haplotype.size() - 1) continue; */ int alignment_start = ref_start + overlap.match[1].start; int alignment_end = ref_start + overlap.match[1].end; // inclusive int alignment_length = alignment_end - alignment_start + 1; // Crude count of the number of distinct variation events int num_events = overlap.edit_distance; std::stringstream c_parser(overlap.cigar); int len; char t; while(c_parser >> len >> t) { assert(len > 0); // Only count one event per insertion/deletion if(t == 'D' || t == 'I') num_events -= (len - 1); } HapgenAlignment aln(candidates[j].target_sequence_id, alignment_start, alignment_length, overlap.score, is_reverse); tmp_alignments.push_back(aln); event_count_vector.push_back(num_events); // Record the best edit distance if(num_events < min_events) min_events = num_events; } } // Copy the best alignments into the output int MAX_DIFF_TO_BEST = 10; int MAX_EVENTS = 8; assert(event_count_vector.size() == tmp_alignments.size()); for(size_t i = 0; i < event_count_vector.size(); ++i) { if(event_count_vector[i] <= MAX_EVENTS && event_count_vector[i] - 
min_events <= MAX_DIFF_TO_BEST) outAlignments.push_back(tmp_alignments[i]); } }