// Update the bit vector with the kmers that were assembled into str void MetAssemble::markSequenceKmers(const std::string& str) { assert(str.size() >= m_parameters.kmer); size_t n = str.size() - m_parameters.kmer + 1; for(size_t i = 0; i < n; ++i) { std::string kseq = str.substr(i, m_parameters.kmer); BWTInterval interval = BWTAlgorithms::findIntervalWithCache(m_parameters.pBWT, m_parameters.pBWTCache, kseq); if(interval.isValid()) { for(int64_t j = interval.lower; j <= interval.upper; ++j) m_parameters.pBitVector->updateCAS(j, false, true); } // Mark the reverse complement k-mers too std::string rc_kseq = reverseComplement(kseq); interval = BWTAlgorithms::findIntervalWithCache(m_parameters.pBWT, m_parameters.pBWTCache, rc_kseq); if(interval.isValid()) { for(int64_t j = interval.lower; j <= interval.upper; ++j) m_parameters.pBitVector->updateCAS(j, false, true); } } }
// Find the interval in pBWT corresponding to w // using a cache of short k-mer intervals to avoid // some of the iterations BWTInterval BWTAlgorithms::findIntervalWithCache(const BWT* pBWT, const BWTIntervalCache* pIntervalCache, const std::string& w) { size_t cacheLen = pIntervalCache->getCachedLength(); if(w.size() < cacheLen) return findInterval(pBWT, w); // Compute the interval using the cache for the last k bases int len = w.size(); int j = len - cacheLen; // Check whether the input string has a '$' in it. // We don't cache these strings so if it does // we have to do a direct lookup if(index(w.c_str() + j, '$') != NULL) return findInterval(pBWT, w); BWTInterval interval = pIntervalCache->lookup(w.c_str() + j); j -= 1; for(;j >= 0; --j) { char curr = w[j]; updateInterval(interval, curr, pBWT); if(!interval.isValid()) return interval; } return interval; }
size_t BWTAlgorithms::countSequenceOccurrencesSingleStrand(const std::string& w, const BWTIndexSet& indices) { assert(indices.pBWT != NULL); assert(indices.pCache != NULL); BWTInterval interval = findIntervalWithCache(indices.pBWT, indices.pCache, w); return interval.isValid() ? interval.size() : 0; }
// Add a seed read to the cluster. Overlaps will be found for // each seed read to grow the cluster ClusterNode ReadCluster::addSeed(const std::string& sequence, bool bCheckInIndex) { // Check if this read is a substring SeqRecord tempRecord; tempRecord.id = "cluster-seed"; tempRecord.seq = sequence; OverlapBlockList tempBlockList; OverlapResult overlapResult = m_pOverlapper->alignReadDuplicate(tempRecord, &tempBlockList); if(overlapResult.isSubstring) { // If bCheckInIndex is true, then we are extending clusters from reads that are in the FM-index // In this case, seeding a substring read is an error and we abort. If the seed is NOT in the index // we just emit a warning and ignore the seed if(bCheckInIndex) { std::cerr << "Error: The seed used for sga-cluster-extend is a substring of some read.\n"; std::cerr << "Please run sga rmdup before cluster\n"; std::cerr << "Sequence: " << sequence << "\n"; exit(1); } else { std::cerr << "Warning: The cluster sequence to extend is a substring of some read. This seed is being skipped\n"; std::cerr << "Sequence: " << sequence << "\n"; ClusterNode emptyNode; // Set an invalid interval emptyNode.interval.lower = 0; emptyNode.interval.upper = -1; return emptyNode; } } // Find the interval in the fm-index containing the read const BWT* pBWT = m_pOverlapper->getBWT(); BWTInterval readInterval = BWTAlgorithms::findInterval(pBWT, sequence); BWTAlgorithms::updateInterval(readInterval, '$', pBWT); // When building primary clusters, we require each read to be in the index. if(bCheckInIndex) { if(!readInterval.isValid()) { std::cerr << "sga cluster error: The seed read is not part of the FM-index.\n"; exit(EXIT_FAILURE); } } ClusterNode node; node.sequence = sequence; node.interval = readInterval; node.isReverseInterval = false; m_usedIndex.insert(readInterval.lower); m_queue.push(node); return node; }
// Extract all strings found from a backwards search starting at the given interval RankedPrefixVector BWTAlgorithms::extractRankedPrefixes(const BWT* pBWT, BWTInterval interval) { std::string curr; RankedPrefixVector output; output.reserve(interval.size()); _extractRankedPrefixes(pBWT, interval, curr, &output); return output; }
// Find the interval in pBWT corresponding to w // If w does not exist in the BWT, the interval // coordinates [l, u] will be such that l > u BWTInterval BWTAlgorithms::findInterval(const BWT* pBWT, const std::string& w) { int len = w.size(); int j = len - 1; char curr = w[j]; BWTInterval interval; initInterval(interval, curr, pBWT); --j; for(;j >= 0; --j) { curr = w[j]; updateInterval(interval, curr, pBWT); if(!interval.isValid()) return interval; } return interval; }
SequenceOverlapPairVector KmerOverlaps::retrieveMatches(const std::string& query, size_t k, int min_overlap, double min_identity, int bandwidth, const BWTIndexSet& indices) { PROFILE_FUNC("OverlapHaplotypeBuilder::retrieveMatches") assert(indices.pBWT != NULL); assert(indices.pSSA != NULL); int64_t max_interval_size = 200; SequenceOverlapPairVector overlap_vector; // Use the FM-index to look up intervals for each kmer of the read. Each index // in the interval is stored individually in the KmerMatchMap. We then // backtrack to map these kmer indices to read IDs. As reads can share // multiple kmers, we use the map to avoid redundant lookups. // There is likely a faster algorithm which performs direct decompression // of the read sequences without having to expand the intervals to individual // indices. The current algorithm suffices for now. KmerMatchMap prematchMap; size_t num_kmers = query.size() - k + 1; for(size_t i = 0; i < num_kmers; ++i) { std::string kmer = query.substr(i, k); BWTInterval interval = BWTAlgorithms::findInterval(indices, kmer); if(interval.isValid() && interval.size() < max_interval_size) { for(int64_t j = interval.lower; j <= interval.upper; ++j) { KmerMatch match = { i, static_cast<size_t>(j), false }; prematchMap.insert(std::make_pair(match, false)); } } kmer = reverseComplement(kmer); interval = BWTAlgorithms::findInterval(indices, kmer); if(interval.isValid() && interval.size() < max_interval_size) { for(int64_t j = interval.lower; j <= interval.upper; ++j) { KmerMatch match = { i, static_cast<size_t>(j), true }; prematchMap.insert(std::make_pair(match, false)); } } } // Backtrack through the kmer indices to turn them into read indices. // This mirrors the calcSA function in SampledSuffixArray except we mark each entry // as visited once it is processed. KmerMatchSet matches; for(KmerMatchMap::iterator iter = prematchMap.begin(); iter != prematchMap.end(); ++iter) { // This index has been visited if(iter->second) continue; // Mark this as visited iter->second = true; // Backtrack the index until we hit the starting symbol KmerMatch out_match = iter->first; while(1) { char b = indices.pBWT->getChar(out_match.index); out_match.index = indices.pBWT->getPC(b) + indices.pBWT->getOcc(b, out_match.index - 1); // Check if the hash indicates we have visited this index. If so, stop the backtrack KmerMatchMap::iterator find_iter = prematchMap.find(out_match); if(find_iter != prematchMap.end()) { // We have processed this index already if(find_iter->second) break; else find_iter->second = true; } if(b == '$') { // We've found the lexicographic index for this read. Turn it into a proper ID out_match.index = indices.pSSA->lookupLexoRank(out_match.index); matches.insert(out_match); break; } } } // Refine the matches by computing proper overlaps between the sequences // Use the overlaps that meet the thresholds to build a multiple alignment for(KmerMatchSet::iterator iter = matches.begin(); iter != matches.end(); ++iter) { std::string match_sequence = BWTAlgorithms::extractString(indices.pBWT, iter->index); if(iter->is_reverse) match_sequence = reverseComplement(match_sequence); // Ignore identical matches if(match_sequence == query) continue; // Compute the overlap. If the kmer match occurs a single time in each sequence we use // the banded extension overlap strategy. Otherwise we use the slow O(M*N) overlapper. SequenceOverlap overlap; std::string match_kmer = query.substr(iter->position, k); size_t pos_0 = query.find(match_kmer); size_t pos_1 = match_sequence.find(match_kmer); assert(pos_0 != std::string::npos && pos_1 != std::string::npos); // Check for secondary occurrences if(query.find(match_kmer, pos_0 + 1) != std::string::npos || match_sequence.find(match_kmer, pos_1 + 1) != std::string::npos) { // One of the reads has a second occurrence of the kmer. Use // the slow overlapper. overlap = Overlapper::computeOverlap(query, match_sequence); } else { overlap = Overlapper::extendMatch(query, match_sequence, pos_0, pos_1, bandwidth); } bool bPassedOverlap = overlap.getOverlapLength() >= min_overlap; bool bPassedIdentity = overlap.getPercentIdentity() / 100 >= min_identity; if(bPassedOverlap && bPassedIdentity) { SequenceOverlapPair op; op.sequence[0] = query; op.sequence[1] = match_sequence; op.overlap = overlap; op.is_reversed = iter->is_reverse; overlap_vector.push_back(op); } } return overlap_vector; }
// Extract reads from an FM-index that have a k-mer match to any given haplotypes // Returns true if the reads were successfully extracted, false if there are // more reads than maxReads bool HapgenUtil::extractHaplotypeReads(const StringVector& haplotypes, const BWTIndexSet& indices, int k, bool doReverse, size_t maxReads, int64_t maxIntervalSize, SeqRecordVector* pOutReads, SeqRecordVector* pOutMates) { PROFILE_FUNC("HapgenUtil::extractHaplotypeReads") // Extract the set of reads that have at least one kmer shared with these haplotypes // This is a bit of a lengthy procedure with a few steps: // 1) extract all the kmers in the haplotypes // 2) find the intervals for the kmers in the fm-index // 3) compute the set of read indices of the reads from the intervals (using the sampled suffix array) // 4) finally, extract the read sequences from the index // Make a set of kmers from the haplotypes std::set<std::string> kmerSet; for(size_t i = 0; i < haplotypes.size(); ++i) { const std::string& h = haplotypes[i]; if((int)h.size() < k) continue; for(size_t j = 0; j < h.size() - k + 1; ++j) { std::string ks = h.substr(j, k); if(doReverse) ks = reverseComplement(ks); kmerSet.insert(ks); } } // Compute suffix array intervals for the kmers std::vector<BWTInterval> intervals; for(std::set<std::string>::const_iterator iter = kmerSet.begin(); iter != kmerSet.end(); ++iter) { BWTInterval interval = BWTAlgorithms::findInterval(indices, *iter); if(interval.size() < maxIntervalSize) intervals.push_back(interval); } // Compute the set of reads ids std::set<int64_t> readIndices; for(size_t i = 0; i < intervals.size(); ++i) { BWTInterval interval = intervals[i]; for(int64_t j = interval.lower; j <= interval.upper; ++j) { // Get index from sampled suffix array SAElem elem = indices.pSSA->calcSA(j, indices.pBWT); readIndices.insert(elem.getID()); } } // Check if we have hit the limit of extracting too many reads if(readIndices.size() > maxReads) return false; for(std::set<int64_t>::const_iterator iter = readIndices.begin(); iter != readIndices.end(); ++iter) { int64_t idx = *iter; // Extract the read std::stringstream namer; namer << "idx-" << idx; SeqRecord record; record.id = namer.str(); record.seq = BWTAlgorithms::extractString(indices.pBWT, idx); assert(indices.pQualityTable != NULL); record.qual = indices.pQualityTable->getQualityString(idx, record.seq.length()); if(!record.seq.empty()) pOutReads->push_back(record); // Optionally extract its mate // If the index is constructed properly, // paired reads are in adjacent indices with the // first read at even indices if(pOutMates != NULL) { int64_t mateIdx = idx; if(idx % 2 == 0) mateIdx += 1; else mateIdx -= 1; std::stringstream mateName; mateName << "idx-" << mateIdx; SeqRecord mateRecord; mateRecord.id = mateName.str(); mateRecord.seq = BWTAlgorithms::extractString(indices.pBWT, mateIdx); mateRecord.qual = indices.pQualityTable->getQualityString(mateIdx, mateRecord.seq.length()); if(!record.seq.empty() && !mateRecord.seq.empty()) pOutMates->push_back(mateRecord); } } return true; }
// Align the haplotype to the reference genome represented by the BWT/SSA pair void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k, const std::string& haplotype, const BWTIndexSet& referenceIndex, const ReadTable* pReferenceTable, HapgenAlignmentVector& outAlignments) { PROFILE_FUNC("HapgenUtil::alignHaplotypesToReferenceKmer") int64_t max_interval_size = 4; if(haplotype.size() < k) return; std::vector<int> event_count_vector; std::vector<HapgenAlignment> tmp_alignments; int min_events = std::numeric_limits<int>::max(); // Align forward and reverse haplotype to reference for(size_t i = 0; i <= 1; ++i) { bool is_reverse = i == 1; std::string query = is_reverse ? reverseComplement(haplotype) : haplotype; // Find shared kmers between the haplotype and the reference CandidateVector candidates; size_t nqk = query.size() - k + 1; for(size_t j = 0; j < nqk; ++j) { std::string kmer = query.substr(j, k); // Find the interval of this kmer in the reference BWTInterval interval = BWTAlgorithms::findInterval(referenceIndex, kmer); if(!interval.isValid() || interval.size() >= max_interval_size) continue; // not found or too repetitive // Extract the reference location of these hits for(int64_t k = interval.lower; k <= interval.upper; ++k) { SAElem elem = referenceIndex.pSSA->calcSA(k, referenceIndex.pBWT); // Make a candidate alignment CandidateKmerAlignment candidate; candidate.query_index = j; candidate.target_index = elem.getPos(); candidate.target_extrapolated_start = candidate.target_index - candidate.query_index; candidate.target_extrapolated_end = candidate.target_extrapolated_start + query.size(); candidate.target_sequence_id = elem.getID(); candidates.push_back(candidate); } } // Remove duplicate candidates std::sort(candidates.begin(), candidates.end(), CandidateKmerAlignment::sortByStart); CandidateVector::iterator new_end = std::unique(candidates.begin(), candidates.end(), CandidateKmerAlignment::equalByStart); candidates.resize(new_end - candidates.begin()); for(size_t j = 0; j < candidates.size(); ++j) { // Extract window around reference size_t window_size = 200; int ref_start = candidates[j].target_extrapolated_start - window_size; int ref_end = candidates[j].target_extrapolated_end + window_size; const SeqItem& ref_record = pReferenceTable->getRead(candidates[j].target_sequence_id); const DNAString& ref_sequence = ref_record.seq; if(ref_start < 0) ref_start = 0; if(ref_end > (int)ref_sequence.length()) ref_end = ref_sequence.length(); std::string ref_substring = ref_sequence.substr(ref_start, ref_end - ref_start); // Align haplotype to the reference SequenceOverlap overlap = alignHaplotypeToReference(ref_substring, query); if(overlap.score < 0 || !overlap.isValid()) continue; int alignment_start = ref_start + overlap.match[0].start; int alignment_end = ref_start + overlap.match[0].end; // inclusive int alignment_length = alignment_end - alignment_start + 1; // Crude count of the number of distinct variation events bool has_indel = false; int num_events = overlap.edit_distance; std::stringstream c_parser(overlap.cigar); int len; char t; while(c_parser >> len >> t) { assert(len > 0); // Only count one event per insertion/deletion if(t == 'D' || t == 'I') { num_events -= (len - 1); has_indel = true; } } // Skip poor alignments double mismatch_rate = 1.0f - (overlap.getPercentIdentity() / 100.f); if(mismatch_rate > 0.05f || overlap.total_columns < 50) { if(Verbosity::Instance().getPrintLevel() > 4) { printf("Haplotype Alignment - Ignoring low quality alignment (%.3lf, %dbp, %d events) to %s:%d\n", 1.0f - mismatch_rate, overlap.total_columns, num_events, ref_record.id.c_str(), ref_start); } continue; } bool is_snp = !has_indel && overlap.edit_distance == 1; HapgenAlignment aln(candidates[j].target_sequence_id, alignment_start, alignment_length, overlap.score, num_events, is_reverse, is_snp); tmp_alignments.push_back(aln); event_count_vector.push_back(num_events); if(Verbosity::Instance().getPrintLevel() > 4) { printf("Haplotype Alignment - Accepting alignment (%.3lf, %dbp, %d events) to %s:%d\n", 1.0f - mismatch_rate, overlap.total_columns, num_events, ref_record.id.c_str(), ref_start); } // Record the best edit distance if(num_events < min_events) min_events = num_events; } } // Copy the best alignments into the output int MAX_DIFF_TO_BEST = 10; int MAX_EVENTS = 8; assert(event_count_vector.size() == tmp_alignments.size()); for(size_t i = 0; i < event_count_vector.size(); ++i) { if(event_count_vector[i] <= MAX_EVENTS && event_count_vector[i] - min_events <= MAX_DIFF_TO_BEST) outAlignments.push_back(tmp_alignments[i]); else if(Verbosity::Instance().getPrintLevel() > 3) printf("Haplotype Alignment - Ignoring alignment with too many events (%d)\n", event_count_vector[i]); } }
// Align the haplotype to the reference genome represented by the BWT/SSA pair void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k, const std::string& haplotype, const BWTIndexSet& referenceIndex, const ReadTable* pReferenceTable, HapgenAlignmentVector& outAlignments) { PROFILE_FUNC("HapgenUtil::alignHaplotypesToReferenceKmer") int64_t max_interval_size = 4; if(haplotype.size() < k) return; std::vector<int> event_count_vector; std::vector<HapgenAlignment> tmp_alignments; int min_events = std::numeric_limits<int>::max(); // Align forward and reverse haplotype to reference for(size_t i = 0; i <= 1; ++i) { bool is_reverse = i == 1; std::string query = is_reverse ? reverseComplement(haplotype) : haplotype; // Find shared kmers between the haplotype and the reference CandidateVector candidates; size_t nqk = query.size() - k + 1; for(size_t j = 0; j < nqk; ++j) { std::string kmer = query.substr(j, k); // Find the interval of this kmer in the reference BWTInterval interval = BWTAlgorithms::findInterval(referenceIndex, kmer); if(!interval.isValid() || interval.size() >= max_interval_size) continue; // not found or too repetitive // Extract the reference location of these hits for(int64_t k = interval.lower; k <= interval.upper; ++k) { SAElem elem = referenceIndex.pSSA->calcSA(k, referenceIndex.pBWT); // Make a candidate alignment CandidateKmerAlignment candidate; candidate.query_index = j; candidate.target_index = elem.getPos(); candidate.target_extrapolated_start = candidate.target_index - candidate.query_index; candidate.target_extrapolated_end = candidate.target_extrapolated_start + query.size(); candidate.target_sequence_id = elem.getID(); candidates.push_back(candidate); } } // Remove duplicate candidates std::sort(candidates.begin(), candidates.end(), CandidateKmerAlignment::sortByStart); CandidateVector::iterator new_end = std::unique(candidates.begin(), candidates.end(), CandidateKmerAlignment::equalByStart); candidates.resize(new_end - candidates.begin()); for(size_t j = 0; j < candidates.size(); ++j) { // Extract window around reference size_t window_size = 200; int ref_start = candidates[j].target_extrapolated_start - window_size; int ref_end = candidates[j].target_extrapolated_end + window_size; const DNAString& ref_sequence = pReferenceTable->getRead(candidates[j].target_sequence_id).seq; if(ref_start < 0) ref_start = 0; if(ref_end > (int)ref_sequence.length()) ref_end = ref_sequence.length(); std::string ref_substring = ref_sequence.substr(ref_start, ref_end - ref_start); // Align haplotype to the reference SequenceOverlap overlap = Overlapper::computeOverlap(query, ref_substring); // Skip terrible alignments double percent_aligned = (double)overlap.getOverlapLength() / query.size(); if(percent_aligned < 0.95f) continue; /* // Skip alignments that are not full-length matches of the haplotype if(overlap.match[0].start != 0 || overlap.match[0].end != (int)haplotype.size() - 1) continue; */ int alignment_start = ref_start + overlap.match[1].start; int alignment_end = ref_start + overlap.match[1].end; // inclusive int alignment_length = alignment_end - alignment_start + 1; // Crude count of the number of distinct variation events int num_events = overlap.edit_distance; std::stringstream c_parser(overlap.cigar); int len; char t; while(c_parser >> len >> t) { assert(len > 0); // Only count one event per insertion/deletion if(t == 'D' || t == 'I') num_events -= (len - 1); } HapgenAlignment aln(candidates[j].target_sequence_id, alignment_start, alignment_length, overlap.score, is_reverse); tmp_alignments.push_back(aln); event_count_vector.push_back(num_events); // Record the best edit distance if(num_events < min_events) min_events = num_events; } } // Copy the best alignments into the output int MAX_DIFF_TO_BEST = 10; int MAX_EVENTS = 8; assert(event_count_vector.size() == tmp_alignments.size()); for(size_t i = 0; i < event_count_vector.size(); ++i) { if(event_count_vector[i] <= MAX_EVENTS && event_count_vector[i] - min_events <= MAX_DIFF_TO_BEST) outAlignments.push_back(tmp_alignments[i]); } }
AlphaCount64 BWTAlgorithms::calculateDeBruijnExtensionsSingleIndex(const std::string str, const BWT* pBWT, EdgeDir direction, const BWTIntervalCache* pFwdCache) { size_t k = str.size(); size_t p = k - 1; std::string pmer; // In the sense direction, we extend from the 3' end if(direction == ED_SENSE) pmer = str.substr(1, p); else pmer = str.substr(0, p); assert(pmer.length() == p); std::string rc_pmer = reverseComplement(pmer); // As we only have a single index, we can only directly look up // the extensions for either the pmer or its reverse complement // In the SENSE extension direction, we directly look up for // the reverse complement. In ANTISENSE we directly look up for // the pmer. // Get the extension bases AlphaCount64 extensions; AlphaCount64 rc_extensions; // Set up pointers to the data to fill in/query // depending on the direction of the extension AlphaCount64* pDirectEC; AlphaCount64* pIndirectEC; std::string* pDirectStr; std::string* pIndirectStr; if(direction == ED_SENSE) { pDirectEC = &rc_extensions; pDirectStr = &rc_pmer; pIndirectEC = &extensions; pIndirectStr = &pmer; } else { pDirectEC = &extensions; pDirectStr = &pmer; pIndirectEC = &rc_extensions; pIndirectStr = &rc_pmer; } // Get the interval for the direct query string BWTInterval interval; // Use interval cache if available if(pFwdCache) interval = BWTAlgorithms::findIntervalWithCache(pBWT, pFwdCache, *pDirectStr); else interval = BWTAlgorithms::findInterval(pBWT, *pDirectStr); // Fill in the direct count if(interval.isValid()) *pDirectEC = BWTAlgorithms::getExtCount(interval, pBWT); // Now, for the non-direct index, query the 4 possible k-mers that are adjacent to the pmer // Setup the query sequence std::string query(k, 'A'); int varIdx = query.size() - 1; query.replace(0, p, *pIndirectStr); for(int i = 0; i < BWT_ALPHABET::size; ++i) { // Transform the query char b = BWT_ALPHABET::getChar(i); query[varIdx] = b; // Perform lookup if(pFwdCache) interval = BWTAlgorithms::findIntervalWithCache(pBWT, pFwdCache, query); else interval = BWTAlgorithms::findInterval(pBWT, query); // Update the extension count if(interval.isValid()) pIndirectEC->add(b, interval.size()); } // Switch the reverse-complement extensions to the same strand as the str rc_extensions.complement(); extensions += rc_extensions; return extensions; }
size_t BWTAlgorithms::countSequenceOccurrencesSingleStrand(const std::string& w, const BWT* pBWT) { BWTInterval interval = findInterval(pBWT, w); return interval.isValid() ? interval.size() : 0; }
ClusterResult ClusterProcess::process(const SequenceWorkItem& item) { // Calculate the intervals in the forward FM-index for this read const BWT* pBWT = m_pOverlapper->getBWT(); // Check if this read is a substring OverlapBlockList tempBlockList; OverlapResult overlapResult = m_pOverlapper->alignReadDuplicate(item.read, &tempBlockList); if(overlapResult.isSubstring) { std::cerr << "Error: substring reads found in sga-cluster. Please run rmdup before cluster\n"; exit(1); } // Find the interval in the fm-index containing the read std::string readString = item.read.seq.toString(); BWTInterval readInterval = BWTAlgorithms::findInterval(pBWT, readString); BWTAlgorithms::updateInterval(readInterval, '$', pBWT); // The read must be present in the index assert(readInterval.isValid()); // Check if this read has been used yet bool used = false; for(int64_t i = readInterval.lower; i <= readInterval.upper; ++i) { if(m_pMarkedReads->test(i)) { used = true; break; } } ClusterResult result; if(used) return result; // already part of a cluster, return nothing // Compute a new cluster around this read std::set<int64_t> usedIndex; ClusterNodeQueue queue; ClusterNode node; node.sequence = item.read.seq.toString(); node.interval = readInterval; node.isReverseInterval = false; usedIndex.insert(readInterval.lower); queue.push(node); while(!queue.empty()) { ClusterNode node = queue.front(); queue.pop(); // Update the used index and the result structure with this node's data result.clusterNodes.push_back(node); SeqRecord tempRecord; tempRecord.id = "cluster"; tempRecord.seq = node.sequence; OverlapBlockList blockList; OverlapResult result = m_pOverlapper->overlapRead(tempRecord, m_minOverlap, &blockList); //m_pOverlapper->buildForwardHistory(&blockList); // Parse each member of the block list and potentially expand the cluster for(OverlapBlockList::const_iterator iter = blockList.begin(); iter != blockList.end(); ++iter) { // Check if the reads in this block are part of the cluster already BWTInterval canonicalInterval = iter->getCanonicalInterval(); int64_t canonicalIndex = canonicalInterval.lower; if(usedIndex.count(canonicalIndex) == 0) { usedIndex.insert(canonicalIndex); ClusterNode newNode; newNode.sequence = iter->getFullString(node.sequence); newNode.interval = canonicalInterval; newNode.isReverseInterval = iter->flags.isTargetRev(); queue.push(newNode); } } } // If some work was performed, update the bitvector so other threads do not try to merge the same set of reads. // This uses compare-and-swap instructions to ensure the uppdate is atomic. // If some other thread has merged this set (and updated // the bitvector), we discard all the merged data. // As a given set of reads should all be merged together, we only need to make sure we atomically update // the bit for the read with the lowest index in the set. // Sort the intervals into ascending order and remove any duplicate intervals (which can occur // if the subgraph has a simple cycle) std::sort(result.clusterNodes.begin(), result.clusterNodes.end(), ClusterNode::compare); std::vector<ClusterNode>::iterator newEnd = std::unique(result.clusterNodes.begin(), result.clusterNodes.end(), ClusterNode::equal); size_t oldSize = result.clusterNodes.size(); result.clusterNodes.erase(newEnd, result.clusterNodes.end()); size_t newSize = result.clusterNodes.size(); if(oldSize != newSize) std::cout << "Warning: duplicate cluster nodes were found\n"; // Check if the bit in the vector has already been set for the lowest read index // If it has some other thread has already output this set so we do nothing int64_t lowestIndex = result.clusterNodes.front().interval.lower; bool currentValue = m_pMarkedReads->test(lowestIndex); bool updateSuccess = false; if(currentValue == false) { // Attempt to update the bit vector with an atomic CAS. If this returns false // the bit was set by some other thread updateSuccess = m_pMarkedReads->updateCAS(lowestIndex, currentValue, true); } if(updateSuccess) { // We successfully atomically set the bit for the first read in this set // to true. We can safely update the rest of the bits and keep the merged sequences // for output. std::vector<ClusterNode>::const_iterator iter = result.clusterNodes.begin(); for(; iter != result.clusterNodes.end(); ++iter) { for(int64_t i = iter->interval.lower; i <= iter->interval.upper; ++i) { if(i == lowestIndex) //already set continue; currentValue = m_pMarkedReads->test(i); if(currentValue) { // This value should not be true, emit a warning std::cout << "Warning: Bit " << i << " was set outside of critical section\n"; std::cout << "Read: " << readString << "\n"; } else { m_pMarkedReads->updateCAS(i, currentValue, true); } } } } else { // Some other thread merged these reads already, discard the intermediate // data and set the result to false result.clusterNodes.clear(); } return result; }
GraphCompareResult GraphCompare::process(const SequenceWorkItem& item) const { PROFILE_FUNC("GraphCompare::process") GraphCompareResult result; SeqRecord currRead = item.read; std::string w = item.read.seq.toString(); int len = w.size(); int num_kmers = len - m_parameters.kmer + 1; // Check if the read is long enough to be used if(w.size() < m_parameters.kmer) return result; // Process the kmers that have not been previously visited // We only try to find variants from one k-mer per read. // This heurestic handles the situation where we process a kmer, // spend a lot of time trying to assemble it into a string and fail, // then try again with the next k-mer in the read. Since all the k-mers // in a read are connected in the de Bruijn graph, if the first attempt // to assemble a k-mer into a variant fails, it is very unlikely that // subsequent attempts will succeed. bool variantAttempted = false; for(int j = 0; j < num_kmers; ++j) { std::string kmer = w.substr(j, m_parameters.kmer); std::string rc_kmer = reverseComplement(kmer); // Use the lexicographically lower of the kmer and its pair as the key in the bloom filter std::string& key_kmer = kmer < rc_kmer ? kmer : rc_kmer; // Check if this k-mer is marked as used by the bloom filter if(m_parameters.pBloomFilter->test(key_kmer.c_str(), key_kmer.size())) continue; // Get the interval for this kmer BWTInterval interval = BWTAlgorithms::findInterval(m_parameters.variantIndex, kmer); BWTInterval rc_interval = BWTAlgorithms::findInterval(m_parameters.variantIndex, rc_kmer); size_t count = interval.size(); if(rc_interval.isValid()) count += rc_interval.size(); bool both_strands = interval.size() > 0 && rc_interval.size() > 0; size_t min_base_coverage = 1; if(count >= m_parameters.minDiscoveryCount && count < m_parameters.maxDiscoveryCount && !variantAttempted && both_strands) { // Update the bloom filter to contain this kmer m_parameters.pBloomFilter->add(key_kmer.c_str(), key_kmer.size()); // Check if this k-mer is present in the other base index size_t base_count = BWTAlgorithms::countSequenceOccurrences(kmer, m_parameters.baseIndex); if(Verbosity::Instance().getPrintLevel() > 6) std::cout << "Read: " << currRead.id << " k: " << j << " CV: " << interval.size() << "/" << rc_interval.size() << " " << base_count << "\n"; // k-mer present in the base read set, skip it if(base_count >= min_base_coverage) continue; if(Verbosity::Instance().getPrintLevel() > 0) std::cout << "Variant read: " << w << "\n"; // variant k-mer, attempt to assemble it into haplotypes GraphBuildResult build_result = processVariantKmer(kmer, count); variantAttempted = true; // Mark the kmers of the variant haplotypes as being visited for(size_t vhi = 0; vhi < build_result.variant_haplotypes.size(); ++vhi) markVariantSequenceKmers(build_result.variant_haplotypes[vhi]); // If we assembled anything, run Dindel on the haplotypes if(build_result.variant_haplotypes.size() > 0) { if(Verbosity::Instance().getPrintLevel() > 0) std::cout << "Running dindel\n"; std::stringstream baseVCFSS; std::stringstream variantVCFSS; std::stringstream callsVCFSS; DindelReadReferenceAlignmentVector alignments; DindelReturnCode drc = DindelUtil::runDindelPairMatePair(kmer, build_result.base_haplotypes, build_result.variant_haplotypes, m_parameters, baseVCFSS, variantVCFSS, callsVCFSS, &alignments); // if(Verbosity::Instance().getPrintLevel() > 0) { std::cout << "Dindel returned " << drc << "\n"; std::cout << "base vcf records:\n" << baseVCFSS.str() << "\n"; std::cout << "variant vcf records:\n" << variantVCFSS.str() << "\n"; } // DINDEL ran without error, push its results to the output if(drc == DRC_OK) { result.baseVCFStrings.push_back(baseVCFSS.str()); result.variantVCFStrings.push_back(variantVCFSS.str()); result.calledVCFStrings.push_back(callsVCFSS.str()); result.varStrings.insert(result.varStrings.end(), build_result.variant_haplotypes.begin(), build_result.variant_haplotypes.end()); result.projectedReadAlignments = alignments; } } } } return result; }
MetAssembleResult MetAssemble::process(const SequenceWorkItem& item) { MetAssembleResult result; SeqRecord currRead = item.read; std::string w = item.read.seq.toString(); if(w.size() <= m_parameters.kmer) { return result; } // Perform a backwards search using the read sequence // Check which k-mers have already been visited using the // shared bitvector. If any bit in the range [l,u] is set // for a suffix of the read, then we do not visit those kmers // later int len = w.size(); int num_kmers = len - m_parameters.kmer + 1; std::vector<bool> visitedKmers(false, num_kmers); int j = len - 1; char curr = w[j]; BWTInterval interval; BWTAlgorithms::initInterval(interval, curr, m_parameters.pBWT); --j; for(;j >= 0; --j) { curr = w[j]; BWTAlgorithms::updateInterval(interval, curr, m_parameters.pBWT); assert(interval.isValid()); // At this point interval represents the suffix [j,len) // Check if the starting point of this interval is set if(j < num_kmers) visitedKmers[j] = m_parameters.pBitVector->test(interval.lower); } // Process the kmers that have not been previously visited for(j = 0; j < num_kmers; ++j) { if(visitedKmers[j]) continue; // skip std::string kmer = w.substr(j, m_parameters.kmer); // Get the interval for this kmer BWTInterval interval = BWTAlgorithms::findIntervalWithCache(m_parameters.pBWT, m_parameters.pBWTCache, kmer); // Check if this interval has been marked by a previous iteration of the loop assert(interval.isValid()); if(m_parameters.pBitVector->test(interval.lower)) continue; BWTInterval rc_interval = BWTAlgorithms::findIntervalWithCache(m_parameters.pBWT, m_parameters.pBWTCache, reverseComplement(kmer)); size_t count = interval.size(); if(rc_interval.isValid()) count += rc_interval.size(); if(count >= m_parameters.kmerThreshold) { // Process the kmer std::string contig = processKmer(kmer, count); // We must determine if this contig has been assembled by another thread. // Break the contig into lexicographically ordered set of kmers. The lowest kmer is chosen // to represent the contig. If this kmer has been marked as visited, we discard the contig // otherwise we mark all kmers in the contig and output the contig. StringVector kmers = getLexicographicKmers(contig); // Get the lowest kmer in the set assert(!kmers.empty()); std::string lowest = kmers.front(); BWTInterval lowInterval = BWTAlgorithms::findIntervalWithCache(m_parameters.pBWT, m_parameters.pBWTCache, lowest); BWTInterval lowRCInterval = BWTAlgorithms::findIntervalWithCache(m_parameters.pBWT, m_parameters.pBWTCache, reverseComplement(lowest)); bool marked = false; // If the kmer exists in the read set (the interval is valid), we attempt to mark it // Otherwise, we attempt to mark its reverse complement. If either call succeeds // we output the contig. This block of code gives the synchronization between threads. // If multiple threads attempt to assemble the same contig, marked will be true for only // one of the threads. if(lowInterval.isValid()) marked = m_parameters.pBitVector->updateCAS(lowInterval.lower, false, true); else marked = m_parameters.pBitVector->updateCAS(lowRCInterval.lower, false, true); // Mark all the kmers in the contig so they will not be visited again markSequenceKmers(contig); // If the collision check passed, output the contig if(marked) { if(contig.size() >= m_parameters.minLength) result.contigs.push_back(contig); } } // Update the bit vector for the source kmer for(int64_t i = interval.lower; i <= interval.upper; ++i) m_parameters.pBitVector->updateCAS(i, false, true); for(int64_t i = rc_interval.lower; i <= rc_interval.upper; ++i) m_parameters.pBitVector->updateCAS(i, false, true); } return result; }
SequenceOverlapPairVector KmerOverlaps::PacBioRetrieveMatches(const std::string& query, size_t k, int min_overlap, double min_identity, int bandwidth, const BWTIndexSet& indices, KmerDistribution& kd, int round) { PROFILE_FUNC("OverlapHaplotypeBuilder::PacBioRetrieveMatches") assert(indices.pBWT != NULL); assert(indices.pSSA != NULL); //size_t numStringCount[query.size()+1] = 0; int64_t intervalSum = 0; static size_t n_calls = 0; static size_t n_candidates = 0; static size_t n_output = 0; static double t_time = 0; size_t count = 0; size_t numKmer = 0; size_t numRepeatKmer = 0; size_t totalKmer = 0; size_t numNoSeedRead = 0; size_t repeatCutoff = kd.getRepeatKmerCutoff(); size_t errorCutoff = kd.getMedian() - kd.getSdv(); Timer timer("test", true); n_calls++; //std::cout<<"PacBioRetrieveMatches\n"; std::cout<<"\tk :\t"<<k<<"\n"; SequenceOverlapPairVector overlap_vector; std::vector<long> identityVector(100); for(int j = 0;j < identityVector.size(); j++) identityVector[j] = 0; // Use the FM-index to look up intervals for each kmer of the read. Each index // in the interval is stored individually in the KmerMatchMap. We then // backtrack to map these kmer indices to read IDs. As reads can share // multiple kmers, we use the map to avoid redundant lookups. // There is likely a faster algorithm which performs direct decompression // of the read sequences without having to expand the intervals to individual // indices. The current algorithm suffices for now. KmerMatchMap prematchMap; size_t num_kmers = query.size() - k + 1; clock_t search_seeds_s = clock(), search_seeds_e; for(size_t i = 0; i < num_kmers; i++) { std::string kmer = query.substr(i, k); BWTInterval interval = BWTAlgorithms::findInterval(indices, kmer); if(interval.upper - interval.lower < errorCutoff) numNoSeedRead++; if((interval.upper - interval.lower) > 20 && (interval.upper - interval.lower) < repeatCutoff) { numKmer++; totalKmer++; } //To avoid the repeat region /*if((interval.upper - interval.lower) > repeatCutoff) { numRepeatKmer++; totalKmer++; continue; } else interval.upper = ((interval.upper - interval.lower)>20)?interval.lower + 20 : interval.upper;*/ if(interval.isValid() && interval.size()) { //std::cout<<"\tinterval size : "<<interval.upper - interval.lower<<std::endl; for(int64_t j = interval.lower; j <= interval.upper; ++j) { KmerMatch match = { i, static_cast<size_t>(j), false }; prematchMap.insert(std::make_pair(match, false)); } intervalSum += interval.upper - interval.lower; count++; } kmer = reverseComplement(kmer); interval = BWTAlgorithms::findInterval(indices, kmer); interval.upper = ((interval.upper - interval.lower)>20)?interval.lower + 20 : interval.upper; if(interval.isValid() && interval.size()) { for(int64_t j = interval.lower; j <= interval.upper; ++j) { KmerMatch match = { i, static_cast<size_t>(j), true }; prematchMap.insert(std::make_pair(match, false)); } intervalSum += interval.upper - interval.lower; count++; } } if(numNoSeedRead == num_kmers) std::cout<<"\tnoSeedRead : 1"<<std::endl; std::cout<<"\tnumber of kmer : "<<numKmer<<std::endl; std::cout<<"\tnumber of RepeatKmer : "<<numRepeatKmer<<std::endl; std::cout<<"\tnumber of totalkmer : "<<totalKmer<<std::endl; // Backtrack through the kmer indices to turn them into read indices. // This mirrors the calcSA function in SampledSuffixArray except we mark each entry // as visited once it is processed. //std::cout<<"\tintervalSum : "<<intervalSum<<std::endl; //std::cout<<"\tintervalCount : "<<count<<std::endl; std::cout<<"\tprematchMap :\t"<<prematchMap.size()<<std::endl; KmerMatchSet matches; for(KmerMatchMap::iterator iter = prematchMap.begin(); iter != prematchMap.end(); ++iter) { //std::cout<<"iter->first.position : "<<iter->first.position<<std::endl; // This index has been visited if(iter->second) continue; // Mark this as visited iter->second = true; // Backtrack the index until we hit the starting symbol KmerMatch out_match = iter->first; while(1) { char b = indices.pBWT->getChar(out_match.index); out_match.index = indices.pBWT->getPC(b) + indices.pBWT->getOcc(b, out_match.index - 1); // Check if the hash indicates we have visited this index. If so, stop the backtrack KmerMatchMap::iterator find_iter = prematchMap.find(out_match); if(find_iter != prematchMap.end()) { // We have processed this index already if(find_iter->second) break; else find_iter->second = true; } if(b == '$') { // We've found the lexicographic index for this read. Turn it into a proper ID out_match.index = indices.pSSA->lookupLexoRank(out_match.index); //std::cout<<"out_match.position"<<out_match.position<<std::endl; matches.insert(out_match); break; } } } search_seeds_e = clock(); std::cout<<"\tmatchset :\t"<<matches.size()<<"\n"; // Refine the matches by computing proper overlaps between the sequences // Use the overlaps that meet the thresholds to build a multiple alignment clock_t extrac_s, extrac_e; clock_t overlapE_s, overlapE_e; clock_t overlapC_s, overlapC_e; double extrac_sum = 0.0; double overlapE_sum = 0.0, overlapC_sum = 0.0; int compute_count = 0,extend_count = 0; size_t acNumber = 0; for(KmerMatchSet::iterator iter = matches.begin(); iter != matches.end(); ++iter) { extrac_s = clock(); std::string match_sequence;// = BWTAlgorithms::extractString(indices.pBWT, iter->index); if(indices.pReadTable != NULL) match_sequence = indices.pReadTable->getRead(iter->index).seq.toString(); /*else match_sequence = BWTAlgorithms::extractString(indices.pBWT, iter->index);*/ extrac_e = clock(); extrac_sum += (double)extrac_e - extrac_s; if(iter->is_reverse) match_sequence = reverseComplement(match_sequence); // Ignore identical matches if(match_sequence == query) continue; // Compute the overlap. If the kmer match occurs a single time in each sequence we use // the banded extension overlap strategy. Otherwise we use the slow O(M*N) overlapper. SequenceOverlap overlap; std::string match_kmer = query.substr(iter->position, k); size_t pos_0 = iter->position;//query.find(match_kmer); size_t pos_1 = match_sequence.find(match_kmer); assert(pos_0 != std::string::npos && pos_1 != std::string::npos); //Timer* sTimer = new Timer("seeds overlap"); // Check for secondary occurrences /*if(query.find(match_kmer, pos_0 + 1) != std::string::npos || match_sequence.find(match_kmer, pos_1 + 1) != std::string::npos) { // One of the reads has a second occurrence of the kmer. Use // the slow overlapper. overlapC_s = clock(); compute_count++; overlap = Overlapper::computeOverlap(query, match_sequence); overlapC_e = clock(); overlapC_sum += (double)overlapC_e - overlapC_s; } else {*/ overlapE_s = clock(); extend_count++; overlap = Overlapper::PacBioExtendMatch(query, match_sequence, pos_0, pos_1, bandwidth); overlapE_e = clock(); overlapE_sum += (double)overlapE_e - overlapE_s; //} //delete sTimer; n_candidates += 1; bool bPassedOverlap = overlap.getOverlapLength() >= min_overlap; bool bPassedIdentity = overlap.getPercentIdentity() >= min_identity; identityVector[(int)overlap.getPercentIdentity()] += 1; //overlap.printTotal_columns(); //overlap.printEdit_distance(); //std::cout<<"min_overlap == "<<overlap.getOverlapLength()<<"\n"; //std::cout<<"overlap.getOverlapLength() / 100 == "<<overlap.getOverlapLength() / 100<<"\n"; //std::cout<<"min_identity == "<<min_identity<<"\n"; //std::cout<<"bPassedOverlap == "<<bPassedOverlap<<"\n"; //std::cout<<"bPassedIdentity == "<<bPassedIdentity<<"\n"; //std::cout<<match_sequence<<"\n"; if(bPassedOverlap && bPassedIdentity) { SequenceOverlapPair op; op.sequence[0] = query; op.sequence[1] = match_sequence; op.overlap = overlap; op.is_reversed = iter->is_reverse; overlap_vector.push_back(op); n_output += 1; acNumber += 1; //numStringCount } } std::cout<<"\tacceptable number of seeds == "<<acNumber<<"\n"; std::cout<<"\tsearch seeds time : "<<(double)(search_seeds_e - search_seeds_s)/CLOCKS_PER_SEC<<std::endl; std::cout<<"\textract time : "<<extrac_sum/CLOCKS_PER_SEC<<std::endl; //std::cout<<"\tcompute_count : "<<compute_count<<std::endl; //std::cout<<"\tbanded_count : "<<extend_count<<std::endl; //std::cout<<"\tcompute overlap time : "<<overlapC_sum/CLOCKS_PER_SEC<<std::endl; //std::cout<<"\tbanded overlap time : "<<overlapE_sum/CLOCKS_PER_SEC<<std::endl; /*------------------output-identity------------------------------------ double mean = 0.0, temp_mean = 0.0,temp = 0.0; for(int i = 0; i < 100; i++) { //count*identity mean+=identityVector[i]*i; temp+=identityVector[i]; } mean=mean/temp; for(int i = 0; i < 100; i++) //count*identity^2 temp_mean+=identityVector[i]*pow(i,2); std::cout<<"-----------outputIdentity------------"<<std::endl; std::cout<<"\tround "<<round; std::cout<<"\tmean identity :\t"<<mean<<std::endl; std::cout<<"\tSD identity :\t"<<sqrt(temp_mean/temp - pow(mean,2))<<std::endl; std::cout<<"-------------------------------------\n"<<std::endl; /*---------------------------------------------------------------------*/ t_time += timer.getElapsedCPUTime(); if(Verbosity::Instance().getPrintLevel() > 6 && n_calls % 100 == 0) printf("[kmer overlaps] n: %zu candidates: %zu valid: %zu (%.2lf) time: %.2lfs\n", n_calls, n_candidates, n_output, (double)n_output / n_candidates, t_time); return overlap_vector; }