// Align the haplotype to the reference genome represented by the BWT/SSA pair void HapgenUtil::alignHaplotypeToReferenceBWASW(const std::string& haplotype, const BWTIndexSet& referenceIndex, HapgenAlignmentVector& outAlignments) { PROFILE_FUNC("HapgenUtil::alignHaplotypesToReferenceBWASW") LRAlignment::LRParams params; params.zBest = 20; for(size_t i = 0; i <= 1; ++i) { LRAlignment::LRHitVector hits; std::string query = (i == 0) ? haplotype : reverseComplement(haplotype); LRAlignment::bwaswAlignment(query, referenceIndex.pBWT, referenceIndex.pSSA, params, hits); // Convert the hits into alignments for(size_t j = 0; j < hits.size(); ++j) { int q_alignment_length = hits[j].q_end - hits[j].q_start; // Skip non-complete alignments if((int)haplotype.length() == q_alignment_length) { HapgenAlignment aln(hits[j].targetID, hits[j].t_start, hits[j].length, hits[j].G, i == 1); outAlignments.push_back(aln); } } } }
// Coalesce a set of alignments into distinct locations void HapgenUtil::coalesceAlignments(HapgenAlignmentVector& alignments) { if(alignments.empty()) return; // Sort the alignments by reference id, then position std::sort(alignments.begin(), alignments.end()); HapgenAlignmentVector outAlignments; // Iterate over the alignments in sorted order // If an alignment is distinct (=does not overlap) from the // previous alignment, add it to the output collection. // First alignment is always ok outAlignments.push_back(alignments[alignments.size()-1]); // Kees: start from back because alignments are sorted in order of increasing score for(size_t i = alignments.size()-1; i-- > 0;) { // Check this alignment against the last alignment added to the output set HapgenAlignment& prevAlign = outAlignments.back(); const HapgenAlignment& currAlign = alignments[i]; int s1 = prevAlign.position; int e1 = s1 + prevAlign.length; int s2 = currAlign.position; int e2 = s2 + currAlign.length; bool intersecting = Interval::isIntersecting(s1, e1, s2, e2); if(prevAlign.referenceID != currAlign.referenceID || !intersecting) { outAlignments.push_back(currAlign); } else { // merge the intersecting alignment into a window that covers both prevAlign.position = std::min(s1, s2); prevAlign.length = std::max(e1, e2) - prevAlign.position; } } alignments = outAlignments; }
// Compute the best alignment of the haplotype collection to the reference DindelReturnCode DindelUtil::computeBestAlignment(const StringVector& inHaplotypes, const SeqItemVector& variantMates, const SeqItemVector& variantRCMates, const GraphCompareParameters& parameters, HapgenAlignment& bestAlignment) { size_t MAX_DEPTH = 2000; if(variantMates.size() + variantRCMates.size() > MAX_DEPTH) return DRC_OVER_DEPTH; // // Align the haplotypes to the reference genome to generate candidate alignments // HapgenAlignmentVector candidateAlignments; for(size_t i = 0; i < inHaplotypes.size(); ++i) HapgenUtil::alignHaplotypeToReferenceBWASW(inHaplotypes[i], parameters.referenceIndex, candidateAlignments); // Remove duplicate or bad alignment pairs HapgenUtil::coalesceAlignments(candidateAlignments); if(candidateAlignments.empty()) return DRC_NO_ALIGNMENT; // // Score each candidate alignment against the mates of all the variant reads // int bestCandidate = -1; double bestAverageScoreFrac = 0.0f; double secondBest = 0.0f; for(size_t i = 0; i < candidateAlignments.size(); ++i) { // Compute the average score of the reads' mates to the flanking sequence StringVector referenceFlanking; StringVector referenceHaplotypes; HapgenUtil::makeFlankingHaplotypes(candidateAlignments[i], parameters.pRefTable, 1000, inHaplotypes, referenceFlanking, referenceHaplotypes); // If valid flanking haplotypes could not be made, skip this alignment if(referenceFlanking.empty()) continue; // Realign the mates LocalAlignmentResultVector localAlignments = HapgenUtil::alignReadsLocally(referenceFlanking[0], variantMates); LocalAlignmentResultVector localAlignmentsRC = HapgenUtil::alignReadsLocally(referenceFlanking[0], variantRCMates); // Merge alignments localAlignments.insert(localAlignments.end(), localAlignmentsRC.begin(), localAlignmentsRC.end()); double sum = 0.0f; double count = 0.0f; for(size_t j = 0; j < localAlignments.size(); ++j) { double max_score = localAlignments[j].queryEndIndex - localAlignments[j].queryStartIndex + 1; double frac = (double)localAlignments[j].score / max_score; //printf("Score: %d frac: %lf\n", localAlignments[j].score, frac); sum += frac; count += 1; } double score = sum / count; if(score > bestAverageScoreFrac) { secondBest = bestAverageScoreFrac; bestAverageScoreFrac = score; bestCandidate = i; } else if(score > secondBest) { secondBest = score; } //printf("Alignment %zu mate-score: %lf\n", i, score); } if(bestCandidate == -1) return DRC_NO_ALIGNMENT; /* if(bestAverageScoreFrac < 0.9f) return DRC_POOR_ALIGNMENT; if(bestAverageScoreFrac - secondBest < 0.05f) return DRC_AMBIGUOUS_ALIGNMENT; */ bestAlignment = candidateAlignments[bestCandidate]; return DRC_OK; }
// Run dindel on a pair of samples DindelReturnCode DindelUtil::runDindelPairMatePair(const std::string& id, const StringVector& base_haplotypes, const StringVector& variant_haplotypes, const GraphCompareParameters& parameters, std::ostream& baseOut, std::ostream& variantOut, std::ostream& callsOut, DindelReadReferenceAlignmentVector* pReadAlignments) { PROFILE_FUNC("runDindelPairMatePair") StringVector inHaplotypes; inHaplotypes.insert(inHaplotypes.end(), base_haplotypes.begin(), base_haplotypes.end()); inHaplotypes.insert(inHaplotypes.end(), variant_haplotypes.begin(), variant_haplotypes.end()); // // First, extract the reads from the normal and variant data sets that match each haplotype // assert(inHaplotypes.size() > 0); // Get canidate alignments for the input haplotypes HapgenAlignmentVector candidateAlignments; // Choose the kmer size for alignment size_t align_kmer = 31; for(size_t i = 0; i < inHaplotypes.size(); ++i) { HapgenAlignmentVector thisCandidateAlignments; HapgenUtil::alignHaplotypeToReferenceKmer(align_kmer, inHaplotypes[i], parameters.referenceIndex, parameters.pRefTable, thisCandidateAlignments); candidateAlignments.insert(candidateAlignments.end(), thisCandidateAlignments.begin(), thisCandidateAlignments.end()); } // Remove duplicate or bad alignment pairs HapgenUtil::coalesceAlignments(candidateAlignments); if(Verbosity::Instance().getPrintLevel() > 3) printf("runDindel -- %zu candidate alignments found\n", candidateAlignments.size()); size_t MAX_ALIGNMENTS = 10; if(candidateAlignments.size() > MAX_ALIGNMENTS) return DRC_AMBIGUOUS_ALIGNMENT; // Join each haplotype with flanking sequence from the reference genome for each alignment // This function also adds a haplotype (with flanking sequence) for the piece of the reference int FLANKING_SIZE = 0; if (parameters.dindelRealignParameters.realignMatePairs) FLANKING_SIZE = 1000; StringVector flankingHaplotypes; // This vector contains the internal portion of the haplotypes, without the flanking sequence // It is used to extract reads StringVector candidateHaplotypes; for(size_t i = 0; i < candidateAlignments.size(); ++i) { HapgenUtil::makeFlankingHaplotypes(candidateAlignments[i], parameters.pRefTable, FLANKING_SIZE, inHaplotypes, flankingHaplotypes, candidateHaplotypes); } if(Verbosity::Instance().getPrintLevel() > 3) printf("runDindel -- made %zu flanking haplotypes\n", candidateHaplotypes.size()); // Normal reads SeqRecordVector normalReads; SeqRecordVector normalRCReads; // Remove non-unique candidate haplotypes std::sort(candidateHaplotypes.begin(), candidateHaplotypes.end()); StringVector::iterator haplotype_iterator = std::unique(candidateHaplotypes.begin(), candidateHaplotypes.end()); candidateHaplotypes.resize(haplotype_iterator - candidateHaplotypes.begin()); // Set the value to use for extracting reads that potentially match the haplotype // Do not use a kmer for extraction greater than this value size_t KMER_CEILING = 31; size_t extractionKmer = parameters.kmer < KMER_CEILING ? parameters.kmer : KMER_CEILING; bool extractOK = true; if(!parameters.bReferenceMode) { // Reads on the same strand as the haplotype extractOK = HapgenUtil::extractHaplotypeReads(candidateHaplotypes, parameters.baseIndex, extractionKmer, false, parameters.maxReads, parameters.maxExtractionIntervalSize, &normalReads, NULL); if(!extractOK) return DRC_OVER_DEPTH; // Reads on the reverse strand extractOK = HapgenUtil::extractHaplotypeReads(candidateHaplotypes, parameters.baseIndex, extractionKmer, true, parameters.maxReads, parameters.maxExtractionIntervalSize, &normalRCReads, NULL); if(!extractOK) return DRC_OVER_DEPTH; } // Variant reads SeqRecordVector variantReads; SeqRecordVector variantRCReads; extractOK = HapgenUtil::extractHaplotypeReads(candidateHaplotypes, parameters.variantIndex, extractionKmer, false, parameters.maxReads, parameters.maxExtractionIntervalSize, &variantReads, NULL); if(!extractOK) return DRC_OVER_DEPTH; extractOK = HapgenUtil::extractHaplotypeReads(candidateHaplotypes, parameters.variantIndex, extractionKmer, true, parameters.maxReads, parameters.maxExtractionIntervalSize, &variantRCReads, NULL); if(!extractOK) return DRC_OVER_DEPTH; size_t normal_reads = normalReads.size() + normalRCReads.size(); size_t variant_reads = variantReads.size() + variantRCReads.size(); size_t total_reads = normal_reads + variant_reads; if(Verbosity::Instance().getPrintLevel() > 3) printf("Extracted %zu normal reads, %zu variant reads\n", normal_reads, variant_reads); if(total_reads > parameters.maxReads) return DRC_OVER_DEPTH; if (total_reads == 0) return DRC_UNDER_DEPTH; // Generate the input haplotypes for dindel // We need at least 2 haplotypes (one is the reference) size_t totFlankingHaplotypes = flankingHaplotypes.size(); if(totFlankingHaplotypes < 2) return DRC_NO_ALIGNMENT; // Ensure the reference haplotype is a non-empty string if(flankingHaplotypes[0].size() == 0) return DRC_NO_ALIGNMENT; // Make Dindel referenceMappings StringVector dindelHaplotypes; std::set<DindelReferenceMapping> refMappings; // for(size_t i = 0; i < candidateAlignments.size(); ++i) { std::string upstream, defined, downstream; std::string refName = parameters.pRefTable->getRead(candidateAlignments[i].referenceID).id; HapgenUtil::extractReferenceSubstrings(candidateAlignments[i],parameters.pRefTable, FLANKING_SIZE, upstream, defined, downstream); std::string refSeq = upstream + defined + downstream; int refStart = candidateAlignments[i].position - int(upstream.size()) + 1; // Here the score is used as an estimate of how unique "defined" is in the reference sequence. // "defined" is not the reference sequence but a candidate haplotype. // It is conservative because the flanking sequence is not used in this estimation. DindelReferenceMapping rm(refName, refSeq, refStart, candidateAlignments[i].score+2*FLANKING_SIZE, candidateAlignments[i].isRC); std::set<DindelReferenceMapping>::iterator rmit = refMappings.find(rm); if(rmit == refMappings.end()) { refMappings.insert(rm); } else { if(rm.referenceAlignmentScore > rmit->referenceAlignmentScore) rmit->referenceAlignmentScore = rm.referenceAlignmentScore; } } // RESET MAPPING SCORES for(std::set<DindelReferenceMapping>::iterator it = refMappings.begin(); it != refMappings.end(); it++) it->referenceAlignmentScore = 1000; // make flankingHaplotypes unique std::set< std::string > setFlanking(flankingHaplotypes.begin(), flankingHaplotypes.end()); for(std::set< std::string >::const_iterator it = setFlanking.begin(); it != setFlanking.end(); it++) { dindelHaplotypes.push_back(*it); //dindelRefMappings[i] = std::vector<DindelReferenceMapping>(refMappings.begin(),refMappings.end()); } std::vector<DindelReferenceMapping> dRefMappings(refMappings.begin(),refMappings.end()); DindelWindow dWindow(dindelHaplotypes, dRefMappings); // // Run Dindel // // Initialize VCF collections VCFCollection vcfCollections[2]; // If in multisample mode, load the sample names into the VCFCollection if(parameters.variantIndex.pPopIdx != NULL) { for(size_t i = 0; i <= 1; ++i) vcfCollections[i].samples = parameters.variantIndex.pPopIdx->getSamples(); } size_t start_i = parameters.bReferenceMode ? 1 : 0; DindelRealignWindowResult *pThisResult = NULL; DindelRealignWindowResult *pPreviousResult = NULL; for(size_t i = start_i; i <= 1; ++i) { SeqRecordVector& fwdReads = (i == 0) ? normalReads : variantReads; SeqRecordVector& rcReads = (i == 0) ? normalRCReads : variantRCReads; const BWTIndexSet* indices = ¶meters.variantIndex; // Create dindel reads // Mates must be at the end of the array. std::vector<DindelRead> dReads; for(size_t j = 0; j < fwdReads.size(); ++j) dReads.push_back(convertToDindelRead(indices, fwdReads[j], true)); for(size_t j = 0; j < rcReads.size(); ++j) { rcReads[j].seq.reverseComplement(); std::reverse(rcReads[j].qual.begin(), rcReads[j].qual.end()); dReads.push_back(convertToDindelRead(indices, rcReads[j], false)); } pThisResult = new DindelRealignWindowResult(); std::stringstream out_ss; try { DindelRealignWindow dRealignWindow(&dWindow, dReads, parameters.dindelRealignParameters); dRealignWindow.run("hmm", vcfCollections[i], pReadAlignments, id, pThisResult, pPreviousResult, parameters.pRefTable); } catch(std::string e) { std::cerr << "Dindel Exception: " << e << "\n"; exit(DRC_EXCEPTION); } if(i == 0) pPreviousResult = pThisResult; } // Copy raw VCFRecords to output for(size_t i = 0; i <= 1; ++i) { std::ostream& curr_out = i == 0 ? baseOut : variantOut; for(size_t j = 0; j < vcfCollections[i].records.size(); ++j) curr_out << vcfCollections[i].records[j] << "\n"; } // Make comparative calls size_t VARIANT_IDX = 1; size_t BASE_IDX = 0; bool has_base_calls = !vcfCollections[BASE_IDX].records.empty(); for(size_t i = 0; i < vcfCollections[1].records.size(); ++i) { bool not_called_in_base = true; if(has_base_calls) not_called_in_base = vcfCollections[BASE_IDX].records[i].passStr == "NoCall" || vcfCollections[BASE_IDX].records[i].passStr == "NoSupp"; bool called_in_variant = vcfCollections[VARIANT_IDX].records[i].passStr == "PASS"; if(called_in_variant && not_called_in_base) callsOut << vcfCollections[VARIANT_IDX].records[i] << "\n"; } baseOut.flush(); variantOut.flush(); delete pThisResult; delete pPreviousResult; return DRC_OK; }
// Align the haplotype to the reference genome represented by the BWT/SSA pair void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k, const std::string& haplotype, const BWTIndexSet& referenceIndex, const ReadTable* pReferenceTable, HapgenAlignmentVector& outAlignments) { PROFILE_FUNC("HapgenUtil::alignHaplotypesToReferenceKmer") int64_t max_interval_size = 4; if(haplotype.size() < k) return; std::vector<int> event_count_vector; std::vector<HapgenAlignment> tmp_alignments; int min_events = std::numeric_limits<int>::max(); // Align forward and reverse haplotype to reference for(size_t i = 0; i <= 1; ++i) { bool is_reverse = i == 1; std::string query = is_reverse ? reverseComplement(haplotype) : haplotype; // Find shared kmers between the haplotype and the reference CandidateVector candidates; size_t nqk = query.size() - k + 1; for(size_t j = 0; j < nqk; ++j) { std::string kmer = query.substr(j, k); // Find the interval of this kmer in the reference BWTInterval interval = BWTAlgorithms::findInterval(referenceIndex, kmer); if(!interval.isValid() || interval.size() >= max_interval_size) continue; // not found or too repetitive // Extract the reference location of these hits for(int64_t k = interval.lower; k <= interval.upper; ++k) { SAElem elem = referenceIndex.pSSA->calcSA(k, referenceIndex.pBWT); // Make a candidate alignment CandidateKmerAlignment candidate; candidate.query_index = j; candidate.target_index = elem.getPos(); candidate.target_extrapolated_start = candidate.target_index - candidate.query_index; candidate.target_extrapolated_end = candidate.target_extrapolated_start + query.size(); candidate.target_sequence_id = elem.getID(); candidates.push_back(candidate); } } // Remove duplicate candidates std::sort(candidates.begin(), candidates.end(), CandidateKmerAlignment::sortByStart); CandidateVector::iterator new_end = std::unique(candidates.begin(), candidates.end(), CandidateKmerAlignment::equalByStart); candidates.resize(new_end - candidates.begin()); for(size_t j = 0; j < candidates.size(); ++j) { // Extract window around reference size_t window_size = 200; int ref_start = candidates[j].target_extrapolated_start - window_size; int ref_end = candidates[j].target_extrapolated_end + window_size; const SeqItem& ref_record = pReferenceTable->getRead(candidates[j].target_sequence_id); const DNAString& ref_sequence = ref_record.seq; if(ref_start < 0) ref_start = 0; if(ref_end > (int)ref_sequence.length()) ref_end = ref_sequence.length(); std::string ref_substring = ref_sequence.substr(ref_start, ref_end - ref_start); // Align haplotype to the reference SequenceOverlap overlap = alignHaplotypeToReference(ref_substring, query); if(overlap.score < 0 || !overlap.isValid()) continue; int alignment_start = ref_start + overlap.match[0].start; int alignment_end = ref_start + overlap.match[0].end; // inclusive int alignment_length = alignment_end - alignment_start + 1; // Crude count of the number of distinct variation events bool has_indel = false; int num_events = overlap.edit_distance; std::stringstream c_parser(overlap.cigar); int len; char t; while(c_parser >> len >> t) { assert(len > 0); // Only count one event per insertion/deletion if(t == 'D' || t == 'I') { num_events -= (len - 1); has_indel = true; } } // Skip poor alignments double mismatch_rate = 1.0f - (overlap.getPercentIdentity() / 100.f); if(mismatch_rate > 0.05f || overlap.total_columns < 50) { if(Verbosity::Instance().getPrintLevel() > 4) { printf("Haplotype Alignment - Ignoring low quality alignment (%.3lf, %dbp, %d events) to %s:%d\n", 1.0f - mismatch_rate, overlap.total_columns, num_events, ref_record.id.c_str(), ref_start); } continue; } bool is_snp = !has_indel && overlap.edit_distance == 1; HapgenAlignment aln(candidates[j].target_sequence_id, alignment_start, alignment_length, overlap.score, num_events, is_reverse, is_snp); tmp_alignments.push_back(aln); event_count_vector.push_back(num_events); if(Verbosity::Instance().getPrintLevel() > 4) { printf("Haplotype Alignment - Accepting alignment (%.3lf, %dbp, %d events) to %s:%d\n", 1.0f - mismatch_rate, overlap.total_columns, num_events, ref_record.id.c_str(), ref_start); } // Record the best edit distance if(num_events < min_events) min_events = num_events; } } // Copy the best alignments into the output int MAX_DIFF_TO_BEST = 10; int MAX_EVENTS = 8; assert(event_count_vector.size() == tmp_alignments.size()); for(size_t i = 0; i < event_count_vector.size(); ++i) { if(event_count_vector[i] <= MAX_EVENTS && event_count_vector[i] - min_events <= MAX_DIFF_TO_BEST) outAlignments.push_back(tmp_alignments[i]); else if(Verbosity::Instance().getPrintLevel() > 3) printf("Haplotype Alignment - Ignoring alignment with too many events (%d)\n", event_count_vector[i]); } }
// Align the haplotype to the reference genome represented by the BWT/SSA pair void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k, const std::string& haplotype, const BWTIndexSet& referenceIndex, const ReadTable* pReferenceTable, HapgenAlignmentVector& outAlignments) { PROFILE_FUNC("HapgenUtil::alignHaplotypesToReferenceKmer") int64_t max_interval_size = 4; if(haplotype.size() < k) return; std::vector<int> event_count_vector; std::vector<HapgenAlignment> tmp_alignments; int min_events = std::numeric_limits<int>::max(); // Align forward and reverse haplotype to reference for(size_t i = 0; i <= 1; ++i) { bool is_reverse = i == 1; std::string query = is_reverse ? reverseComplement(haplotype) : haplotype; // Find shared kmers between the haplotype and the reference CandidateVector candidates; size_t nqk = query.size() - k + 1; for(size_t j = 0; j < nqk; ++j) { std::string kmer = query.substr(j, k); // Find the interval of this kmer in the reference BWTInterval interval = BWTAlgorithms::findInterval(referenceIndex, kmer); if(!interval.isValid() || interval.size() >= max_interval_size) continue; // not found or too repetitive // Extract the reference location of these hits for(int64_t k = interval.lower; k <= interval.upper; ++k) { SAElem elem = referenceIndex.pSSA->calcSA(k, referenceIndex.pBWT); // Make a candidate alignment CandidateKmerAlignment candidate; candidate.query_index = j; candidate.target_index = elem.getPos(); candidate.target_extrapolated_start = candidate.target_index - candidate.query_index; candidate.target_extrapolated_end = candidate.target_extrapolated_start + query.size(); candidate.target_sequence_id = elem.getID(); candidates.push_back(candidate); } } // Remove duplicate candidates std::sort(candidates.begin(), candidates.end(), CandidateKmerAlignment::sortByStart); CandidateVector::iterator new_end = std::unique(candidates.begin(), candidates.end(), CandidateKmerAlignment::equalByStart); candidates.resize(new_end - candidates.begin()); for(size_t j = 0; j < candidates.size(); ++j) { // Extract window around reference size_t window_size = 200; int ref_start = candidates[j].target_extrapolated_start - window_size; int ref_end = candidates[j].target_extrapolated_end + window_size; const DNAString& ref_sequence = pReferenceTable->getRead(candidates[j].target_sequence_id).seq; if(ref_start < 0) ref_start = 0; if(ref_end > (int)ref_sequence.length()) ref_end = ref_sequence.length(); std::string ref_substring = ref_sequence.substr(ref_start, ref_end - ref_start); // Align haplotype to the reference SequenceOverlap overlap = Overlapper::computeOverlap(query, ref_substring); // Skip terrible alignments double percent_aligned = (double)overlap.getOverlapLength() / query.size(); if(percent_aligned < 0.95f) continue; /* // Skip alignments that are not full-length matches of the haplotype if(overlap.match[0].start != 0 || overlap.match[0].end != (int)haplotype.size() - 1) continue; */ int alignment_start = ref_start + overlap.match[1].start; int alignment_end = ref_start + overlap.match[1].end; // inclusive int alignment_length = alignment_end - alignment_start + 1; // Crude count of the number of distinct variation events int num_events = overlap.edit_distance; std::stringstream c_parser(overlap.cigar); int len; char t; while(c_parser >> len >> t) { assert(len > 0); // Only count one event per insertion/deletion if(t == 'D' || t == 'I') num_events -= (len - 1); } HapgenAlignment aln(candidates[j].target_sequence_id, alignment_start, alignment_length, overlap.score, is_reverse); tmp_alignments.push_back(aln); event_count_vector.push_back(num_events); // Record the best edit distance if(num_events < min_events) min_events = num_events; } } // Copy the best alignments into the output int MAX_DIFF_TO_BEST = 10; int MAX_EVENTS = 8; assert(event_count_vector.size() == tmp_alignments.size()); for(size_t i = 0; i < event_count_vector.size(); ++i) { if(event_count_vector[i] <= MAX_EVENTS && event_count_vector[i] - min_events <= MAX_DIFF_TO_BEST) outAlignments.push_back(tmp_alignments[i]); } }