// Collect the reads in the FM-index that share at least one k-mer with `query`
// and whose computed overlap with `query` passes the given thresholds.
//
// Parameters:
//   query        - sequence to find overlapping reads for
//   k            - seed k-mer length; no seeds exist (and an empty result is
//                  returned) when k == 0 or query.size() < k
//   min_overlap  - minimum value of SequenceOverlap::getOverlapLength()
//   min_identity - minimum identity, compared against getPercentIdentity() / 100
//   bandwidth    - band width handed to Overlapper::extendMatch
//   indices      - index set; pBWT and pSSA must be non-NULL (asserted)
//
// Returns one SequenceOverlapPair per accepted read, with sequence[0] = query
// and sequence[1] = the matched read (reverse-complemented when the seed was
// found on the reverse strand).
SequenceOverlapPairVector KmerOverlaps::retrieveMatches(const std::string& query, size_t k,
                                                        int min_overlap, double min_identity,
                                                        int bandwidth, const BWTIndexSet& indices)
{
    PROFILE_FUNC("OverlapHaplotypeBuilder::retrieveMatches")
    assert(indices.pBWT != NULL);
    assert(indices.pSSA != NULL);

    // Intervals of this size or larger are not expanded into individual indices.
    int64_t max_interval_size = 200;
    SequenceOverlapPairVector overlap_vector;

    // query.size() - k + 1 is unsigned arithmetic: without this guard a query
    // shorter than k wraps around to a huge k-mer count and the loop below
    // eventually calls substr() past the end of the string.
    if(k == 0 || query.size() < k)
        return overlap_vector;

    // Use the FM-index to look up intervals for each kmer of the read. Each index
    // in the interval is stored individually in the KmerMatchMap. We then
    // backtrack to map these kmer indices to read IDs. As reads can share
    // multiple kmers, we use the map to avoid redundant lookups.
    // There is likely a faster algorithm which performs direct decompression
    // of the read sequences without having to expand the intervals to individual
    // indices. The current algorithm suffices for now.
    KmerMatchMap prematchMap;
    size_t num_kmers = query.size() - k + 1;
    for(size_t i = 0; i < num_kmers; ++i)
    {
        // Forward-strand occurrences of the k-mer starting at query position i
        std::string kmer = query.substr(i, k);
        BWTInterval interval = BWTAlgorithms::findInterval(indices, kmer);
        if(interval.isValid() && interval.size() < max_interval_size)
        {
            for(int64_t j = interval.lower; j <= interval.upper; ++j)
            {
                KmerMatch match = { i, static_cast<size_t>(j), false };
                prematchMap.insert(std::make_pair(match, false));
            }
        }

        // Reverse-strand occurrences of the same k-mer
        kmer = reverseComplement(kmer);
        interval = BWTAlgorithms::findInterval(indices, kmer);
        if(interval.isValid() && interval.size() < max_interval_size)
        {
            for(int64_t j = interval.lower; j <= interval.upper; ++j)
            {
                KmerMatch match = { i, static_cast<size_t>(j), true };
                prematchMap.insert(std::make_pair(match, false));
            }
        }
    }

    // Backtrack through the kmer indices to turn them into read indices.
    // This mirrors the calcSA function in SampledSuffixArray except we mark each entry
    // as visited once it is processed.
    KmerMatchSet matches;
    for(KmerMatchMap::iterator iter = prematchMap.begin(); iter != prematchMap.end(); ++iter)
    {
        // This index has been visited
        if(iter->second)
            continue;

        // Mark this as visited
        iter->second = true;

        // Backtrack the index until we hit the starting symbol
        KmerMatch out_match = iter->first;
        while(1)
        {
            char b = indices.pBWT->getChar(out_match.index);
            out_match.index = indices.pBWT->getPC(b) + indices.pBWT->getOcc(b, out_match.index - 1);

            // Check if the hash indicates we have visited this index. If so, stop the backtrack
            KmerMatchMap::iterator find_iter = prematchMap.find(out_match);
            if(find_iter != prematchMap.end())
            {
                // We have processed this index already
                if(find_iter->second)
                    break;
                else
                    find_iter->second = true;
            }

            if(b == '$')
            {
                // We've found the lexicographic index for this read. Turn it into a proper ID
                out_match.index = indices.pSSA->lookupLexoRank(out_match.index);
                matches.insert(out_match);
                break;
            }
        }
    }

    // Refine the matches by computing proper overlaps between the sequences
    // Use the overlaps that meet the thresholds to build a multiple alignment
    for(KmerMatchSet::iterator iter = matches.begin(); iter != matches.end(); ++iter)
    {
        std::string match_sequence = BWTAlgorithms::extractString(indices.pBWT, iter->index);
        if(iter->is_reverse)
            match_sequence = reverseComplement(match_sequence);

        // Ignore identical matches
        if(match_sequence == query)
            continue;

        // Compute the overlap. If the kmer match occurs a single time in each sequence we use
        // the banded extension overlap strategy. Otherwise we use the slow O(M*N) overlapper.
        SequenceOverlap overlap;
        std::string match_kmer = query.substr(iter->position, k);
        size_t pos_0 = query.find(match_kmer);
        size_t pos_1 = match_sequence.find(match_kmer);
        assert(pos_0 != std::string::npos && pos_1 != std::string::npos);

        // Check for secondary occurrences
        if(query.find(match_kmer, pos_0 + 1) != std::string::npos ||
           match_sequence.find(match_kmer, pos_1 + 1) != std::string::npos)
        {
            // One of the reads has a second occurrence of the kmer. Use
            // the slow overlapper.
            overlap = Overlapper::computeOverlap(query, match_sequence);
        }
        else
        {
            overlap = Overlapper::extendMatch(query, match_sequence, pos_0, pos_1, bandwidth);
        }

        bool bPassedOverlap = overlap.getOverlapLength() >= min_overlap;
        // getPercentIdentity() is divided by 100 here, so min_identity is a fraction.
        bool bPassedIdentity = overlap.getPercentIdentity() / 100 >= min_identity;

        if(bPassedOverlap && bPassedIdentity)
        {
            SequenceOverlapPair op;
            op.sequence[0] = query;
            op.sequence[1] = match_sequence;
            op.overlap = overlap;
            op.is_reversed = iter->is_reverse;
            overlap_vector.push_back(op);
        }
    }

    return overlap_vector;
}
// PacBio variant of the k-mer seeded overlap search: find reads in the FM-index
// that share a k-mer with `query`, extend each seed with
// Overlapper::PacBioExtendMatch and keep the overlaps that pass the thresholds.
// Progress and timing diagnostics are written to std::cout on every call.
//
// Parameters:
//   query        - sequence to find overlapping reads for
//   k            - seed k-mer length; when k == 0 or query.size() < k there are
//                  no seeds and the result is empty
//   min_overlap  - minimum value of SequenceOverlap::getOverlapLength()
//   min_identity - minimum identity, compared directly against
//                  getPercentIdentity() (no division by 100 here).
//                  NOTE(review): presumably callers pass a percentage - confirm.
//   bandwidth    - band width handed to Overlapper::PacBioExtendMatch
//   indices      - index set; pBWT and pSSA must be non-NULL (asserted).
//                  pReadTable is used to fetch read sequences when present,
//                  otherwise the read is extracted from the BWT.
//   kd           - k-mer distribution supplying the repeat/error cutoffs used
//                  for the printed statistics
//   round        - only referenced by the disabled identity-statistics output
//
// Returns one SequenceOverlapPair per accepted read, with sequence[0] = query
// and sequence[1] = the matched read (reverse-complemented for reverse seeds).
//
// The function-local static counters are updated without any synchronization
// in this function.
SequenceOverlapPairVector KmerOverlaps::PacBioRetrieveMatches(const std::string& query, size_t k,
                                                              int min_overlap, double min_identity,
                                                              int bandwidth, const BWTIndexSet& indices,
                                                              KmerDistribution& kd, int round)
{
    PROFILE_FUNC("OverlapHaplotypeBuilder::PacBioRetrieveMatches")
    assert(indices.pBWT != NULL);
    assert(indices.pSSA != NULL);

    // Only the disabled statistics block at the end of the function uses it.
    (void)round;

    // Counters accumulated across calls, reported every 100th call at high verbosity.
    static size_t n_calls = 0;
    static size_t n_candidates = 0;
    static size_t n_output = 0;
    static double t_time = 0;

    size_t numKmer = 0;
    // Never incremented: the repeat filter that counted these k-mers is disabled,
    // but the value is still part of the printed report.
    size_t numRepeatKmer = 0;
    size_t totalKmer = 0;
    size_t numNoSeedRead = 0;
    size_t repeatCutoff = kd.getRepeatKmerCutoff();
    // NOTE(review): if getSdv() can exceed getMedian() this conversion to size_t
    // does not yield a small cutoff - confirm the value range against KmerDistribution.
    size_t errorCutoff = kd.getMedian() - kd.getSdv();

    Timer timer("test", true);
    n_calls++;

    std::cout<<"\tk :\t"<<k<<"\n";

    SequenceOverlapPairVector overlap_vector;

    // Histogram of overlap identities, one bucket per integer percent.
    // 101 buckets so that an identity of exactly 100 has a slot of its own.
    const size_t kMaxIdentityBucket = 100;
    std::vector<long> identityVector(kMaxIdentityBucket + 1, 0);

    // Use the FM-index to look up intervals for each kmer of the read. Each index
    // in the interval is stored individually in the KmerMatchMap. We then
    // backtrack to map these kmer indices to read IDs. As reads can share
    // multiple kmers, we use the map to avoid redundant lookups.
    // There is likely a faster algorithm which performs direct decompression
    // of the read sequences without having to expand the intervals to individual
    // indices. The current algorithm suffices for now.
    KmerMatchMap prematchMap;

    // query.size() - k + 1 is unsigned arithmetic: a query shorter than k would
    // wrap around to a huge count and run substr() past the end of the string.
    const size_t num_kmers = (k > 0 && query.size() >= k) ? query.size() - k + 1 : 0;

    clock_t search_seeds_s = clock(), search_seeds_e;
    for(size_t i = 0; i < num_kmers; i++)
    {
        std::string kmer = query.substr(i, k);
        BWTInterval interval = BWTAlgorithms::findInterval(indices, kmer);

        // Statistics only; these counters do not influence which seeds are used.
        const int64_t forward_span = interval.upper - interval.lower;
        if(forward_span < errorCutoff)
            numNoSeedRead++;

        if(forward_span > 20 && forward_span < repeatCutoff)
        {
            numKmer++;
            totalKmer++;
        }

        // NOTE(review): the forward interval is expanded in full while the
        // reverse-complement interval below is capped at lower + 20; the forward
        // cap lived in a repeat filter that is disabled. Confirm this asymmetry
        // is intended.
        if(interval.isValid() && interval.size())
        {
            for(int64_t j = interval.lower; j <= interval.upper; ++j)
            {
                KmerMatch match = { i, static_cast<size_t>(j), false };
                prematchMap.insert(std::make_pair(match, false));
            }
        }

        kmer = reverseComplement(kmer);
        interval = BWTAlgorithms::findInterval(indices, kmer);
        // Expand at most the first 21 indices of the reverse-strand interval.
        interval.upper = ((interval.upper - interval.lower) > 20) ? interval.lower + 20 : interval.upper;
        if(interval.isValid() && interval.size())
        {
            for(int64_t j = interval.lower; j <= interval.upper; ++j)
            {
                KmerMatch match = { i, static_cast<size_t>(j), true };
                prematchMap.insert(std::make_pair(match, false));
            }
        }
    }

    if(numNoSeedRead == num_kmers)
        std::cout<<"\tnoSeedRead : 1"<<std::endl;
    std::cout<<"\tnumber of kmer : "<<numKmer<<std::endl;
    std::cout<<"\tnumber of RepeatKmer : "<<numRepeatKmer<<std::endl;
    std::cout<<"\tnumber of totalkmer : "<<totalKmer<<std::endl;
    std::cout<<"\tprematchMap :\t"<<prematchMap.size()<<std::endl;

    // Backtrack through the kmer indices to turn them into read indices.
    // This mirrors the calcSA function in SampledSuffixArray except we mark each entry
    // as visited once it is processed.
    KmerMatchSet matches;
    for(KmerMatchMap::iterator iter = prematchMap.begin(); iter != prematchMap.end(); ++iter)
    {
        // This index has been visited
        if(iter->second)
            continue;

        // Mark this as visited
        iter->second = true;

        // Backtrack the index until we hit the starting symbol
        KmerMatch out_match = iter->first;
        while(1)
        {
            char b = indices.pBWT->getChar(out_match.index);
            out_match.index = indices.pBWT->getPC(b) + indices.pBWT->getOcc(b, out_match.index - 1);

            // Check if the hash indicates we have visited this index. If so, stop the backtrack
            KmerMatchMap::iterator find_iter = prematchMap.find(out_match);
            if(find_iter != prematchMap.end())
            {
                // We have processed this index already
                if(find_iter->second)
                    break;
                else
                    find_iter->second = true;
            }

            if(b == '$')
            {
                // We've found the lexicographic index for this read. Turn it into a proper ID
                out_match.index = indices.pSSA->lookupLexoRank(out_match.index);
                matches.insert(out_match);
                break;
            }
        }
    }
    search_seeds_e = clock();

    std::cout<<"\tmatchset :\t"<<matches.size()<<"\n";

    // Refine the matches by computing proper overlaps between the sequences
    // Use the overlaps that meet the thresholds to build a multiple alignment
    clock_t extrac_s, extrac_e;
    double extrac_sum = 0.0;
    size_t acNumber = 0;

    for(KmerMatchSet::iterator iter = matches.begin(); iter != matches.end(); ++iter)
    {
        extrac_s = clock();
        std::string match_sequence;
        if(indices.pReadTable != NULL)
            match_sequence = indices.pReadTable->getRead(iter->index).seq.toString();
        else
            // Without a read table the sequence would stay empty and the seed
            // could never be located in it, so decode the read from the BWT.
            match_sequence = BWTAlgorithms::extractString(indices.pBWT, iter->index);
        extrac_e = clock();
        extrac_sum += static_cast<double>(extrac_e) - extrac_s;

        if(iter->is_reverse)
            match_sequence = reverseComplement(match_sequence);

        // Ignore identical matches
        if(match_sequence == query)
            continue;

        // Always use the banded extension seeded at the k-mer position; the check
        // for secondary k-mer occurrences and the O(M*N) overlapper are not used here.
        SequenceOverlap overlap;
        std::string match_kmer = query.substr(iter->position, k);
        size_t pos_0 = iter->position;
        size_t pos_1 = match_sequence.find(match_kmer);
        assert(pos_0 != std::string::npos && pos_1 != std::string::npos);

        // With asserts compiled out, never hand npos to the extension as a seed position.
        if(pos_1 == std::string::npos)
            continue;

        overlap = Overlapper::PacBioExtendMatch(query, match_sequence, pos_0, pos_1, bandwidth);
        n_candidates += 1;

        const double percent_identity = overlap.getPercentIdentity();
        bool bPassedOverlap = overlap.getOverlapLength() >= min_overlap;
        bool bPassedIdentity = percent_identity >= min_identity;

        // Clamp the bucket into [0, 100]. The comparisons are written so that a
        // NaN identity falls into bucket 0 instead of being cast to an integer.
        size_t identity_bucket = 0;
        if(percent_identity >= static_cast<double>(kMaxIdentityBucket))
            identity_bucket = kMaxIdentityBucket;
        else if(percent_identity > 0.0)
            identity_bucket = static_cast<size_t>(percent_identity);
        identityVector[identity_bucket] += 1;

        if(bPassedOverlap && bPassedIdentity)
        {
            SequenceOverlapPair op;
            op.sequence[0] = query;
            op.sequence[1] = match_sequence;
            op.overlap = overlap;
            op.is_reversed = iter->is_reverse;
            overlap_vector.push_back(op);
            n_output += 1;
            acNumber += 1;
        }
    }

    std::cout<<"\tacceptable number of seeds == "<<acNumber<<"\n";
    std::cout<<"\tsearch seeds time : "<<(double)(search_seeds_e - search_seeds_s)/CLOCKS_PER_SEC<<std::endl;
    std::cout<<"\textract time : "<<extrac_sum/CLOCKS_PER_SEC<<std::endl;

    // Disabled diagnostic: mean and standard deviation of the identity histogram.
    //
    //   double mean = 0.0, sq_sum = 0.0, total = 0.0;
    //   for(size_t i = 0; i <= kMaxIdentityBucket; i++)
    //   {
    //       mean   += identityVector[i] * i;
    //       sq_sum += identityVector[i] * i * i;
    //       total  += identityVector[i];
    //   }
    //   mean = mean / total;
    //   std::cout<<"-----------outputIdentity------------"<<std::endl;
    //   std::cout<<"\tround "<<round;
    //   std::cout<<"\tmean identity :\t"<<mean<<std::endl;
    //   std::cout<<"\tSD identity :\t"<<sqrt(sq_sum/total - mean*mean)<<std::endl;
    //   std::cout<<"-------------------------------------\n"<<std::endl;

    t_time += timer.getElapsedCPUTime();

    if(Verbosity::Instance().getPrintLevel() > 6 && n_calls % 100 == 0)
    {
        // Report 0 rather than NaN when no candidate has been evaluated yet.
        const double valid_fraction = (n_candidates > 0) ? static_cast<double>(n_output) / n_candidates : 0.0;
        printf("[kmer overlaps] n: %zu candidates: %zu valid: %zu (%.2lf) time: %.2lfs\n",
               n_calls, n_candidates, n_output, valid_fraction, t_time);
    }

    return overlap_vector;
}