// Write an ascii pictogram of an overlap // O = overhang of length block_size (region of no overlap) // P = perfect match of length block_size // I = matched block with mismatches std::string ascii_overlap(const std::string& s0, const std::string& s1, SequenceOverlap& ovr, int block_size = 50) { std::string out; int pre_overhang = ovr.match[0].start / block_size; int post_overhang = (ovr.length[0] - ovr.match[0].end - 1) / block_size; std::string p0; std::string p1; ovr.makePaddedMatches(s0, s1, &p0, &p1); out.append(pre_overhang, '-'); for(size_t i = 0; i < p0.size(); i += block_size) { if(p0.substr(i, block_size) == p1.substr(i, block_size)) out.append(1, '='); else out.append(1, 'x'); } out.append(post_overhang, '-'); return out; }
// Find sequences in the FM-index that share at least one k-mer with the query
// (on either strand) and return the ones whose overlap with the query passes
// the length and identity thresholds.
//
// query        - the sequence to find overlaps for
// k            - length of the k-mers used as seeds
// min_overlap  - minimum overlap length of an accepted match
// min_identity - minimum identity of an accepted match, as a fraction
//                (it is compared against percent identity / 100)
// bandwidth    - band width handed to the banded extension overlapper
// indices      - index set to search; pBWT and pSSA must not be NULL
//
// Returns an empty vector when the query is shorter than k.
SequenceOverlapPairVector KmerOverlaps::retrieveMatches(const std::string& query, size_t k,
                                                        int min_overlap, double min_identity,
                                                        int bandwidth, const BWTIndexSet& indices)
{
    PROFILE_FUNC("OverlapHaplotypeBuilder::retrieveMatches")
    assert(indices.pBWT != NULL);
    assert(indices.pSSA != NULL);

    // K-mers whose interval is this large or larger are skipped as too repetitive
    int64_t max_interval_size = 200;
    SequenceOverlapPairVector overlap_vector;

    // A query shorter than k contains no k-mers. Without this check the
    // unsigned expression query.size() - k + 1 below would wrap around.
    if(query.size() < k)
        return overlap_vector;

    // Use the FM-index to look up intervals for each kmer of the read. Each index
    // in the interval is stored individually in the KmerMatchMap. We then
    // backtrack to map these kmer indices to read IDs. As reads can share
    // multiple kmers, we use the map to avoid redundant lookups.
    // There is likely a faster algorithm which performs direct decompression
    // of the read sequences without having to expand the intervals to individual
    // indices. The current algorithm suffices for now.
    KmerMatchMap prematchMap;
    size_t num_kmers = query.size() - k + 1;
    for(size_t i = 0; i < num_kmers; ++i)
    {
        // Forward strand
        std::string kmer = query.substr(i, k);
        BWTInterval interval = BWTAlgorithms::findInterval(indices, kmer);
        if(interval.isValid() && interval.size() < max_interval_size)
        {
            for(int64_t j = interval.lower; j <= interval.upper; ++j)
            {
                KmerMatch match = { i, static_cast<size_t>(j), false };
                prematchMap.insert(std::make_pair(match, false));
            }
        }

        // Reverse strand
        kmer = reverseComplement(kmer);
        interval = BWTAlgorithms::findInterval(indices, kmer);
        if(interval.isValid() && interval.size() < max_interval_size)
        {
            for(int64_t j = interval.lower; j <= interval.upper; ++j)
            {
                KmerMatch match = { i, static_cast<size_t>(j), true };
                prematchMap.insert(std::make_pair(match, false));
            }
        }
    }

    // Backtrack through the kmer indices to turn them into read indices.
    // This mirrors the calcSA function in SampledSuffixArray except we mark each entry
    // as visited once it is processed.
    KmerMatchSet matches;
    for(KmerMatchMap::iterator iter = prematchMap.begin(); iter != prematchMap.end(); ++iter)
    {
        // This index has been visited
        if(iter->second)
            continue;

        // Mark this as visited
        iter->second = true;

        // Backtrack the index until we hit the starting symbol
        KmerMatch out_match = iter->first;
        while(1)
        {
            char b = indices.pBWT->getChar(out_match.index);
            out_match.index = indices.pBWT->getPC(b) + indices.pBWT->getOcc(b, out_match.index - 1);

            // Check if the hash indicates we have visited this index. If so, stop the backtrack
            KmerMatchMap::iterator find_iter = prematchMap.find(out_match);
            if(find_iter != prematchMap.end())
            {
                // We have processed this index already
                if(find_iter->second)
                    break;
                else
                    find_iter->second = true;
            }

            if(b == '$')
            {
                // We've found the lexicographic index for this read. Turn it into a proper ID
                out_match.index = indices.pSSA->lookupLexoRank(out_match.index);
                matches.insert(out_match);
                break;
            }
        }
    }

    // Refine the matches by computing proper overlaps between the sequences
    // Use the overlaps that meet the thresholds to build a multiple alignment
    for(KmerMatchSet::iterator iter = matches.begin(); iter != matches.end(); ++iter)
    {
        std::string match_sequence = BWTAlgorithms::extractString(indices.pBWT, iter->index);
        if(iter->is_reverse)
            match_sequence = reverseComplement(match_sequence);

        // Ignore identical matches
        if(match_sequence == query)
            continue;

        // Compute the overlap. If the kmer match occurs a single time in each sequence we use
        // the banded extension overlap strategy. Otherwise we use the slow O(M*N) overlapper.
        SequenceOverlap overlap;
        std::string match_kmer = query.substr(iter->position, k);
        size_t pos_0 = query.find(match_kmer);
        size_t pos_1 = match_sequence.find(match_kmer);
        assert(pos_0 != std::string::npos && pos_1 != std::string::npos);

        // Check for secondary occurrences
        if(query.find(match_kmer, pos_0 + 1) != std::string::npos ||
           match_sequence.find(match_kmer, pos_1 + 1) != std::string::npos)
        {
            // One of the reads has a second occurrence of the kmer. Use
            // the slow overlapper.
            overlap = Overlapper::computeOverlap(query, match_sequence);
        }
        else
        {
            overlap = Overlapper::extendMatch(query, match_sequence, pos_0, pos_1, bandwidth);
        }

        bool bPassedOverlap = overlap.getOverlapLength() >= min_overlap;
        bool bPassedIdentity = overlap.getPercentIdentity() / 100 >= min_identity;

        if(bPassedOverlap && bPassedIdentity)
        {
            SequenceOverlapPair op;
            op.sequence[0] = query;
            op.sequence[1] = match_sequence;
            op.overlap = overlap;
            op.is_reversed = iter->is_reverse;
            overlap_vector.push_back(op);
        }
    }

    return overlap_vector;
}
// Align the haplotype to the reference genome represented by the BWT/SSA pair void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k, const std::string& haplotype, const BWTIndexSet& referenceIndex, const ReadTable* pReferenceTable, HapgenAlignmentVector& outAlignments) { PROFILE_FUNC("HapgenUtil::alignHaplotypesToReferenceKmer") int64_t max_interval_size = 4; if(haplotype.size() < k) return; std::vector<int> event_count_vector; std::vector<HapgenAlignment> tmp_alignments; int min_events = std::numeric_limits<int>::max(); // Align forward and reverse haplotype to reference for(size_t i = 0; i <= 1; ++i) { bool is_reverse = i == 1; std::string query = is_reverse ? reverseComplement(haplotype) : haplotype; // Find shared kmers between the haplotype and the reference CandidateVector candidates; size_t nqk = query.size() - k + 1; for(size_t j = 0; j < nqk; ++j) { std::string kmer = query.substr(j, k); // Find the interval of this kmer in the reference BWTInterval interval = BWTAlgorithms::findInterval(referenceIndex, kmer); if(!interval.isValid() || interval.size() >= max_interval_size) continue; // not found or too repetitive // Extract the reference location of these hits for(int64_t k = interval.lower; k <= interval.upper; ++k) { SAElem elem = referenceIndex.pSSA->calcSA(k, referenceIndex.pBWT); // Make a candidate alignment CandidateKmerAlignment candidate; candidate.query_index = j; candidate.target_index = elem.getPos(); candidate.target_extrapolated_start = candidate.target_index - candidate.query_index; candidate.target_extrapolated_end = candidate.target_extrapolated_start + query.size(); candidate.target_sequence_id = elem.getID(); candidates.push_back(candidate); } } // Remove duplicate candidates std::sort(candidates.begin(), candidates.end(), CandidateKmerAlignment::sortByStart); CandidateVector::iterator new_end = std::unique(candidates.begin(), candidates.end(), CandidateKmerAlignment::equalByStart); candidates.resize(new_end - candidates.begin()); 
for(size_t j = 0; j < candidates.size(); ++j) { // Extract window around reference size_t window_size = 200; int ref_start = candidates[j].target_extrapolated_start - window_size; int ref_end = candidates[j].target_extrapolated_end + window_size; const SeqItem& ref_record = pReferenceTable->getRead(candidates[j].target_sequence_id); const DNAString& ref_sequence = ref_record.seq; if(ref_start < 0) ref_start = 0; if(ref_end > (int)ref_sequence.length()) ref_end = ref_sequence.length(); std::string ref_substring = ref_sequence.substr(ref_start, ref_end - ref_start); // Align haplotype to the reference SequenceOverlap overlap = alignHaplotypeToReference(ref_substring, query); if(overlap.score < 0 || !overlap.isValid()) continue; int alignment_start = ref_start + overlap.match[0].start; int alignment_end = ref_start + overlap.match[0].end; // inclusive int alignment_length = alignment_end - alignment_start + 1; // Crude count of the number of distinct variation events bool has_indel = false; int num_events = overlap.edit_distance; std::stringstream c_parser(overlap.cigar); int len; char t; while(c_parser >> len >> t) { assert(len > 0); // Only count one event per insertion/deletion if(t == 'D' || t == 'I') { num_events -= (len - 1); has_indel = true; } } // Skip poor alignments double mismatch_rate = 1.0f - (overlap.getPercentIdentity() / 100.f); if(mismatch_rate > 0.05f || overlap.total_columns < 50) { if(Verbosity::Instance().getPrintLevel() > 4) { printf("Haplotype Alignment - Ignoring low quality alignment (%.3lf, %dbp, %d events) to %s:%d\n", 1.0f - mismatch_rate, overlap.total_columns, num_events, ref_record.id.c_str(), ref_start); } continue; } bool is_snp = !has_indel && overlap.edit_distance == 1; HapgenAlignment aln(candidates[j].target_sequence_id, alignment_start, alignment_length, overlap.score, num_events, is_reverse, is_snp); tmp_alignments.push_back(aln); event_count_vector.push_back(num_events); if(Verbosity::Instance().getPrintLevel() > 4) { 
printf("Haplotype Alignment - Accepting alignment (%.3lf, %dbp, %d events) to %s:%d\n", 1.0f - mismatch_rate, overlap.total_columns, num_events, ref_record.id.c_str(), ref_start); } // Record the best edit distance if(num_events < min_events) min_events = num_events; } } // Copy the best alignments into the output int MAX_DIFF_TO_BEST = 10; int MAX_EVENTS = 8; assert(event_count_vector.size() == tmp_alignments.size()); for(size_t i = 0; i < event_count_vector.size(); ++i) { if(event_count_vector[i] <= MAX_EVENTS && event_count_vector[i] - min_events <= MAX_DIFF_TO_BEST) outAlignments.push_back(tmp_alignments[i]); else if(Verbosity::Instance().getPrintLevel() > 3) printf("Haplotype Alignment - Ignoring alignment with too many events (%d)\n", event_count_vector[i]); } }
// Align the haplotype to the reference genome represented by the BWT/SSA pair void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k, const std::string& haplotype, const BWTIndexSet& referenceIndex, const ReadTable* pReferenceTable, HapgenAlignmentVector& outAlignments) { PROFILE_FUNC("HapgenUtil::alignHaplotypesToReferenceKmer") int64_t max_interval_size = 4; if(haplotype.size() < k) return; std::vector<int> event_count_vector; std::vector<HapgenAlignment> tmp_alignments; int min_events = std::numeric_limits<int>::max(); // Align forward and reverse haplotype to reference for(size_t i = 0; i <= 1; ++i) { bool is_reverse = i == 1; std::string query = is_reverse ? reverseComplement(haplotype) : haplotype; // Find shared kmers between the haplotype and the reference CandidateVector candidates; size_t nqk = query.size() - k + 1; for(size_t j = 0; j < nqk; ++j) { std::string kmer = query.substr(j, k); // Find the interval of this kmer in the reference BWTInterval interval = BWTAlgorithms::findInterval(referenceIndex, kmer); if(!interval.isValid() || interval.size() >= max_interval_size) continue; // not found or too repetitive // Extract the reference location of these hits for(int64_t k = interval.lower; k <= interval.upper; ++k) { SAElem elem = referenceIndex.pSSA->calcSA(k, referenceIndex.pBWT); // Make a candidate alignment CandidateKmerAlignment candidate; candidate.query_index = j; candidate.target_index = elem.getPos(); candidate.target_extrapolated_start = candidate.target_index - candidate.query_index; candidate.target_extrapolated_end = candidate.target_extrapolated_start + query.size(); candidate.target_sequence_id = elem.getID(); candidates.push_back(candidate); } } // Remove duplicate candidates std::sort(candidates.begin(), candidates.end(), CandidateKmerAlignment::sortByStart); CandidateVector::iterator new_end = std::unique(candidates.begin(), candidates.end(), CandidateKmerAlignment::equalByStart); candidates.resize(new_end - candidates.begin()); 
for(size_t j = 0; j < candidates.size(); ++j) { // Extract window around reference size_t window_size = 200; int ref_start = candidates[j].target_extrapolated_start - window_size; int ref_end = candidates[j].target_extrapolated_end + window_size; const DNAString& ref_sequence = pReferenceTable->getRead(candidates[j].target_sequence_id).seq; if(ref_start < 0) ref_start = 0; if(ref_end > (int)ref_sequence.length()) ref_end = ref_sequence.length(); std::string ref_substring = ref_sequence.substr(ref_start, ref_end - ref_start); // Align haplotype to the reference SequenceOverlap overlap = Overlapper::computeOverlap(query, ref_substring); // Skip terrible alignments double percent_aligned = (double)overlap.getOverlapLength() / query.size(); if(percent_aligned < 0.95f) continue; /* // Skip alignments that are not full-length matches of the haplotype if(overlap.match[0].start != 0 || overlap.match[0].end != (int)haplotype.size() - 1) continue; */ int alignment_start = ref_start + overlap.match[1].start; int alignment_end = ref_start + overlap.match[1].end; // inclusive int alignment_length = alignment_end - alignment_start + 1; // Crude count of the number of distinct variation events int num_events = overlap.edit_distance; std::stringstream c_parser(overlap.cigar); int len; char t; while(c_parser >> len >> t) { assert(len > 0); // Only count one event per insertion/deletion if(t == 'D' || t == 'I') num_events -= (len - 1); } HapgenAlignment aln(candidates[j].target_sequence_id, alignment_start, alignment_length, overlap.score, is_reverse); tmp_alignments.push_back(aln); event_count_vector.push_back(num_events); // Record the best edit distance if(num_events < min_events) min_events = num_events; } } // Copy the best alignments into the output int MAX_DIFF_TO_BEST = 10; int MAX_EVENTS = 8; assert(event_count_vector.size() == tmp_alignments.size()); for(size_t i = 0; i < event_count_vector.size(); ++i) { if(event_count_vector[i] <= MAX_EVENTS && event_count_vector[i] - 
min_events <= MAX_DIFF_TO_BEST) outAlignments.push_back(tmp_alignments[i]); } }
void OverlapExtractorWithCorrection::getRawOverlapsCached(const std::string& query, bool is_reverse, SequenceOverlapPairVector* out_vector) { PROFILE_FUNC("OverlapExtractorWithCorrection::getRawOverlaps") size_t nk = query.size() - m_k + 1; std::set<std::string> sequences_seen; for(size_t i = 0; i < nk; ++i) { std::string kmer = query.substr(i, m_k); std::string q_kmer = is_reverse ? reverseComplement(kmer) : kmer; const std::vector<size_t>& indices = m_cache_map.find(q_kmer)->second; for(size_t j = 0; j < indices.size(); ++j) { size_t index = indices[j]; std::string match_sequence = m_strings[index]; if(is_reverse) match_sequence = reverseComplement(match_sequence); if(sequences_seen.find(match_sequence) != sequences_seen.end()) continue; sequences_seen.insert(match_sequence); // Ignore identical matches if(match_sequence.empty() || match_sequence == query) continue; // Compute the overlap. If the kmer match occurs a single time in each sequence we use // the banded extension overlap strategy. Otherwise we use the slow O(M*N) overlapper. SequenceOverlap overlap; size_t pos_0 = query.find(kmer); size_t pos_1 = match_sequence.find(kmer); // If there is a single occurrence of the kmer in each read, // use that position to seed the overlap calculation if(pos_0 == std::string::npos || pos_1 == std::string::npos || query.find(kmer, pos_0 + 1) != std::string::npos || match_sequence.find(kmer, pos_1 + 1) != std::string::npos) { // One of the reads has a second occurrence of the kmer. Use // the slow overlapper. 
overlap = Overlapper::computeOverlap(query, match_sequence); } else { overlap = Overlapper::extendMatch(query, match_sequence, pos_0, pos_1, 2); } bool bPassedOverlap = overlap.getOverlapLength() >= m_minOverlap; bool bPassedIdentity = overlap.getPercentIdentity() / 100 >= m_minIdentity; if(bPassedOverlap && bPassedIdentity) { SequenceOverlapPair op; op.sequence[0] = query; op.sequence[1] = match_sequence; op.overlap = overlap; op.is_reversed = is_reverse; out_vector->push_back(op); } } } }
void _approximateSeededMatch(const std::string& in_query, int min_overlap, double min_identity, int bandwidth, int max_interval, bool do_reverse, const BWTIndexSet& indices, SequenceOverlapPairVector& out_vector) { Timer timer("test", true); assert(indices.pBWT != NULL); assert(indices.pSSA != NULL); assert(indices.pCache != NULL); static size_t n_calls = 0; static size_t n_candidates = 0; static size_t n_output = 0; static double t_time = 0; n_calls++; int target_seed_length = 41; int seed_stride = target_seed_length / 2; size_t d = 1; SequenceOverlapPairVector overlap_vector; std::string strand_query = do_reverse ? reverseComplement(in_query) : in_query; // Initialize seeds int seed_end = strand_query.size(); int q = indices.pCache->getCachedLength(); std::queue<ApproxSeed> seeds; while(seed_end > target_seed_length) { // For the last q bases of the seed, create all strings within edit distance d std::string qmer = strand_query.substr(seed_end - q, q); assert((int)qmer.size() == q); // 0-distance seed ApproxSeed seed; seed.query_index = seed_end - q; seed.interval = indices.pCache->lookup(qmer.c_str()); seed.length = q; seeds.push(seed); for(int i = 0; i < q; ++i) { // Switch base to other 3 symbols char o = qmer[i]; for(size_t j = 0; j < 4; ++j) { char b = "ACGT"[j]; if(b != o) { qmer[i] = b; ApproxSeed seed; seed.query_index = seed_end - q; seed.interval = indices.pCache->lookup(qmer.c_str()); seed.length = q; seed.edits.push_back(SeedEdit(i + seed.query_index, b)); seeds.push(seed); } } qmer[i] = o; } seed_end -= seed_stride; } // Extend seeds std::vector<ApproxSeed> finished_seeds; while(!seeds.empty()) { ApproxSeed& seed = seeds.front(); // query_index is the index of the last base // in the seed. 
get the next base char qb = strand_query[seed.query_index - 1]; // Branch to inexact match if(seed.edits.size() < d) { for(size_t j = 0; j < 4; ++j) { char b = "ACGT"[j]; if(b != qb) { ApproxSeed new_seed; new_seed.query_index = seed.query_index - 1; new_seed.interval = seed.interval; BWTAlgorithms::updateInterval(new_seed.interval, b, indices.pBWT); if(new_seed.interval.isValid()) { new_seed.length = seed.length + 1; new_seed.edits = seed.edits; new_seed.edits.push_back(SeedEdit(new_seed.query_index, b)); if(new_seed.length < target_seed_length && new_seed.query_index > 0) seeds.push(new_seed); else finished_seeds.push_back(new_seed); } } } } // Extend with the actual query base without branching seed.query_index = seed.query_index - 1; seed.length += 1; BWTAlgorithms::updateInterval(seed.interval, qb, indices.pBWT); if(!seed.interval.isValid() || seed.length >= target_seed_length || seed.query_index == 0) { if(seed.interval.isValid()) finished_seeds.push_back(seed); seeds.pop(); } } std::set<size_t> rank_set; for(size_t i = 0; i < finished_seeds.size(); ++i) { if(finished_seeds[i].interval.size() > max_interval) continue; //std::cout << finished_seeds[i] << "\n"; std::string query_seed = strand_query.substr(finished_seeds[i].query_index, target_seed_length); std::string match_seed = query_seed; // Apply edits to the new sequence for(size_t j = 0; j < finished_seeds[i].edits.size(); ++j) match_seed[finished_seeds[i].edits[j].index - finished_seeds[i].query_index] = finished_seeds[i].edits[j].base; // Flip the seeds to match the strand of the query if(do_reverse) { query_seed = reverseComplement(query_seed); match_seed = reverseComplement(match_seed); } // Extract the prefix of every occurrence of this seed RankedPrefixVector extensions = BWTAlgorithms::extractRankedPrefixes(indices.pBWT, finished_seeds[i].interval); // Extend the seeds to the full-length string for(size_t j = 0; j < extensions.size(); ++j) { size_t rank = extensions[j].rank; // The second element 
of the returned pair is // false if the set already contains this rank if(!rank_set.insert(rank).second) continue; // Extract the reminder of the read std::string& prefix = extensions[j].prefix; int64_t start_index_of_read = indices.pSSA->lookupLexoRank(rank); std::string suffix = BWTAlgorithms::extractUntilInterval(indices.pBWT, start_index_of_read, finished_seeds[i].interval); std::string match_sequence = prefix + suffix; // Ignore identical matches if(match_sequence == strand_query) continue; // Change strands if(do_reverse) match_sequence = reverseComplement(match_sequence); // Compute the overlap SequenceOverlap overlap; size_t pos_0 = in_query.find(query_seed); size_t pos_1 = match_sequence.find(match_seed); assert(pos_0 != std::string::npos); assert(pos_1 != std::string::npos); if(in_query.find(query_seed, pos_0 + 1) != std::string::npos || match_sequence.find(match_seed, pos_1 + 1) != std::string::npos) { // One of the reads has a second occurrence of the kmer. Use // the slow overlapper. overlap = Overlapper::computeOverlap(in_query, match_sequence); } else { overlap = Overlapper::extendMatch(in_query, match_sequence, pos_0, pos_1, bandwidth); } n_candidates += 1; bool bPassedOverlap = overlap.getOverlapLength() >= min_overlap; bool bPassedIdentity = overlap.getPercentIdentity() / 100 >= min_identity; if(bPassedOverlap && bPassedIdentity) { //printf("Rank\t%zu\t%zu\t%s\t%.2lf\t%d\n", n_calls, // rank, match_sequence.c_str(), overlap.getPercentIdentity(), overlap.getOverlapLength()); SequenceOverlapPair op; op.sequence[0] = in_query; op.sequence[1] = match_sequence; op.overlap = overlap; op.is_reversed = do_reverse; out_vector.push_back(op); n_output += 1; } } } t_time += timer.getElapsedCPUTime(); if(Verbosity::Instance().getPrintLevel() > 6 && n_calls % 100 == 0) printf("[approx seeds] n: %zu candidates: %zu valid: %zu (%.2lf) time: %.2lfs\n", n_calls, n_candidates, n_output, (double)n_output / n_candidates, t_time); }
// Find sequences in the FM-index that share at least one k-mer with the query
// (on either strand) and return the ones whose overlap with the query passes the
// length and identity thresholds. Progress and timing figures are written to
// standard output.
//
// query        - the sequence to find overlaps for
// k            - length of the k-mers used as seeds
// min_overlap  - minimum overlap length of an accepted match
// min_identity - minimum identity of an accepted match. NOTE(review): unlike
//                retrieveMatches this is compared against getPercentIdentity()
//                without dividing by 100, so it appears to be expected in
//                percent - confirm against the callers.
// bandwidth    - band width handed to the banded extension overlapper
// indices      - index set to search; pBWT and pSSA must not be NULL. Match sequences
//                are only read from pReadTable; when it is NULL they stay empty and the
//                assert on the seed position fires for the first match.
// kd           - k-mer distribution supplying the repeat and error cutoffs
// round        - not used by the active code
//
// Returns an empty vector when the query is shorter than k.
SequenceOverlapPairVector KmerOverlaps::PacBioRetrieveMatches(const std::string& query, size_t k,
                                                              int min_overlap, double min_identity,
                                                              int bandwidth, const BWTIndexSet& indices,
                                                              KmerDistribution& kd, int round)
{
    PROFILE_FUNC("OverlapHaplotypeBuilder::PacBioRetrieveMatches")
    assert(indices.pBWT != NULL);
    assert(indices.pSSA != NULL);
    (void)round; // only referenced by the disabled identity report

    static size_t n_calls = 0;
    static size_t n_candidates = 0;
    static size_t n_output = 0;
    static double t_time = 0;

    size_t numKmer = 0;
    size_t numRepeatKmer = 0; // stays 0 while the repeat filter below is disabled
    size_t totalKmer = 0;
    size_t numNoSeedRead = 0;
    size_t repeatCutoff = kd.getRepeatKmerCutoff();
    size_t errorCutoff = kd.getMedian() - kd.getSdv();
    Timer timer("test", true);
    n_calls++;

    std::cout<<"\tk :\t"<<k<<"\n";
    SequenceOverlapPairVector overlap_vector;

    // Histogram of truncated percent identity. A perfect overlap has identity 100,
    // so 101 bins are needed to cover 0..100 inclusive. It is only consumed by an
    // identity report (mean / standard deviation per round) that is currently disabled.
    std::vector<long> identityVector(101, 0);

    // A query shorter than k contains no k-mers. Without this check the
    // unsigned expression query.size() - k + 1 below would wrap around.
    if(query.size() < k)
        return overlap_vector;

    // Use the FM-index to look up intervals for each kmer of the read. Each index
    // in the interval is stored individually in the KmerMatchMap. We then
    // backtrack to map these kmer indices to read IDs. As reads can share
    // multiple kmers, we use the map to avoid redundant lookups.
    // There is likely a faster algorithm which performs direct decompression
    // of the read sequences without having to expand the intervals to individual
    // indices. The current algorithm suffices for now.
    KmerMatchMap prematchMap;
    size_t num_kmers = query.size() - k + 1;
    clock_t search_seeds_s = clock(), search_seeds_e;
    for(size_t i = 0; i < num_kmers; i++)
    {
        std::string kmer = query.substr(i, k);
        BWTInterval interval = BWTAlgorithms::findInterval(indices, kmer);

        // Count kmers whose interval width is below the error cutoff
        if(interval.upper - interval.lower < errorCutoff)
            numNoSeedRead++;

        if((interval.upper - interval.lower) > 20 && (interval.upper - interval.lower) < repeatCutoff)
        {
            numKmer++;
            totalKmer++;
        }

        // Skipping of kmers above repeatCutoff and capping of the forward
        // interval at 20 entries are currently disabled.
        if(interval.isValid() && interval.size())
        {
            for(int64_t j = interval.lower; j <= interval.upper; ++j)
            {
                KmerMatch match = { i, static_cast<size_t>(j), false };
                prematchMap.insert(std::make_pair(match, false));
            }
        }

        // Reverse strand: the interval is capped at lower + 20 before it is expanded
        kmer = reverseComplement(kmer);
        interval = BWTAlgorithms::findInterval(indices, kmer);
        interval.upper = ((interval.upper - interval.lower)>20)?interval.lower + 20 : interval.upper;
        if(interval.isValid() && interval.size())
        {
            for(int64_t j = interval.lower; j <= interval.upper; ++j)
            {
                KmerMatch match = { i, static_cast<size_t>(j), true };
                prematchMap.insert(std::make_pair(match, false));
            }
        }
    }

    if(numNoSeedRead == num_kmers)
        std::cout<<"\tnoSeedRead : 1"<<std::endl;
    std::cout<<"\tnumber of kmer : "<<numKmer<<std::endl;
    std::cout<<"\tnumber of RepeatKmer : "<<numRepeatKmer<<std::endl;
    std::cout<<"\tnumber of totalkmer : "<<totalKmer<<std::endl;

    // Backtrack through the kmer indices to turn them into read indices.
    // This mirrors the calcSA function in SampledSuffixArray except we mark each entry
    // as visited once it is processed.
    std::cout<<"\tprematchMap :\t"<<prematchMap.size()<<std::endl;
    KmerMatchSet matches;
    for(KmerMatchMap::iterator iter = prematchMap.begin(); iter != prematchMap.end(); ++iter)
    {
        // This index has been visited
        if(iter->second)
            continue;

        // Mark this as visited
        iter->second = true;

        // Backtrack the index until we hit the starting symbol
        KmerMatch out_match = iter->first;
        while(1)
        {
            char b = indices.pBWT->getChar(out_match.index);
            out_match.index = indices.pBWT->getPC(b) + indices.pBWT->getOcc(b, out_match.index - 1);

            // Check if the hash indicates we have visited this index. If so, stop the backtrack
            KmerMatchMap::iterator find_iter = prematchMap.find(out_match);
            if(find_iter != prematchMap.end())
            {
                // We have processed this index already
                if(find_iter->second)
                    break;
                else
                    find_iter->second = true;
            }

            if(b == '$')
            {
                // We've found the lexicographic index for this read. Turn it into a proper ID
                out_match.index = indices.pSSA->lookupLexoRank(out_match.index);
                matches.insert(out_match);
                break;
            }
        }
    }
    search_seeds_e = clock();
    std::cout<<"\tmatchset :\t"<<matches.size()<<"\n";

    // Refine the matches by computing proper overlaps between the sequences
    // Use the overlaps that meet the thresholds to build a multiple alignment
    clock_t extrac_s, extrac_e;
    double extrac_sum = 0.0;
    size_t acNumber = 0;
    for(KmerMatchSet::iterator iter = matches.begin(); iter != matches.end(); ++iter)
    {
        // Fetch the matched read from the read table; extracting it from
        // the BWT instead is currently disabled.
        extrac_s = clock();
        std::string match_sequence;
        if(indices.pReadTable != NULL)
            match_sequence = indices.pReadTable->getRead(iter->index).seq.toString();
        extrac_e = clock();
        extrac_sum += (double)extrac_e - extrac_s;

        if(iter->is_reverse)
            match_sequence = reverseComplement(match_sequence);

        // Ignore identical matches
        if(match_sequence == query)
            continue;

        // Compute the overlap with the banded extension strategy, seeded at the
        // position of the shared kmer. The check for secondary occurrences and the
        // slow O(M*N) overlapper fallback are currently disabled.
        SequenceOverlap overlap;
        std::string match_kmer = query.substr(iter->position, k);
        size_t pos_0 = iter->position;
        size_t pos_1 = match_sequence.find(match_kmer);
        assert(pos_0 != std::string::npos && pos_1 != std::string::npos);

        overlap = Overlapper::PacBioExtendMatch(query, match_sequence, pos_0, pos_1, bandwidth);

        n_candidates += 1;
        bool bPassedOverlap = overlap.getOverlapLength() >= min_overlap;
        bool bPassedIdentity = overlap.getPercentIdentity() >= min_identity;

        // Clamp the bin so an identity of exactly 100 (or any unexpected value)
        // can never index outside the histogram
        int identity_bin = (int)overlap.getPercentIdentity();
        if(identity_bin < 0)
            identity_bin = 0;
        if(identity_bin > 100)
            identity_bin = 100;
        identityVector[identity_bin] += 1;

        if(bPassedOverlap && bPassedIdentity)
        {
            SequenceOverlapPair op;
            op.sequence[0] = query;
            op.sequence[1] = match_sequence;
            op.overlap = overlap;
            op.is_reversed = iter->is_reverse;
            overlap_vector.push_back(op);
            n_output += 1;
            acNumber += 1;
        }
    }

    std::cout<<"\tacceptable number of seeds == "<<acNumber<<"\n";
    std::cout<<"\tsearch seeds time : "<<(double)(search_seeds_e - search_seeds_s)/CLOCKS_PER_SEC<<std::endl;
    std::cout<<"\textract time : "<<extrac_sum/CLOCKS_PER_SEC<<std::endl;

    t_time += timer.getElapsedCPUTime();

    if(Verbosity::Instance().getPrintLevel() > 6 && n_calls % 100 == 0)
    {
        // Report 0 rather than nan when no candidate has been evaluated yet
        double valid_fraction = n_candidates > 0 ? (double)n_output / n_candidates : 0.0;
        printf("[kmer overlaps] n: %zu candidates: %zu valid: %zu (%.2lf) time: %.2lfs\n",
               n_calls, n_candidates, n_output, valid_fraction, t_time);
    }

    return overlap_vector;
}