// Recursive traversal to extract all the strings needed for the above function void _extractRankedPrefixes(const BWT* pBWT, BWTInterval interval, const std::string& curr, RankedPrefixVector* pOutput) { AlphaCount64 extensions = BWTAlgorithms::getExtCount(interval, pBWT); for(size_t i = 0; i < 4; ++i) { char b = "ACGT"[i]; if(extensions.get(b) > 0) { BWTInterval ni = interval; BWTAlgorithms::updateInterval(ni, b, pBWT); _extractRankedPrefixes(pBWT, ni, curr + b, pOutput); } } // If we have extended the prefix as far as possible, stop BWTAlgorithms::updateInterval(interval, '$', pBWT); for(int64_t i = interval.lower; i <= interval.upper; ++i) { // backwards search gives a reversed prefix, fix it RankedPrefix rp = { (size_t)i, reverse(curr) }; pOutput->push_back(rp); } }
std::string get_valid_dbg_neighbors_coverage_and_ratio(const std::string& kmer, const BWTIndexSet& index_set, size_t min_coverage, double min_ratio, EdgeDir dir) { std::string out; AlphaCount64 counts = BWTAlgorithms::calculateDeBruijnExtensionsSingleIndex(kmer, index_set.pBWT, dir, index_set.pCache); if(!counts.hasDNAChar()) return out; // no extensions char max_b = counts.getMaxDNABase(); size_t max_c = counts.get(max_b); for(size_t j = 0; j < 4; ++j) { char b = "ACGT"[j]; size_t c = counts.get(b); if(c >= min_coverage && (double)c / max_c >= min_ratio) out.push_back(b); } return out; }
// Write out the next BWT for the next cycle. This updates BCRVector // and suffixSymbolCounts. Returns the number of symbols written to writeBWT size_t BWTCA::outputPartialCycle(int cycle, const DNAEncodedStringVector* pReadSequences, BCRVector& bcrVector, const DNAEncodedString& readBWT, size_t total_read_symbols, DNAEncodedString& writeBWT, AlphaCount64& suffixStartCounts) { // We track the rank of each symbol as it is copied/inserted // into the new bwt AlphaCount64 rank; // Counters size_t num_copied = 0; size_t num_inserted = 0; size_t num_wrote = 0; for(size_t i = 0; i < bcrVector.size(); ++i) { BCRElem& ne = bcrVector[i]; // Copy elements from the read bwt until we reach the target position while(num_copied + num_inserted < ne.position) { char c = readBWT.get(num_copied++); writeBWT.set(num_wrote++, c); rank.increment(c); } // Now insert the incoming symbol int rl = pReadSequences->at(ne.index).length(); char c = '$'; // If the cycle number is greater than the read length, we are // on the final iteration and we just add in the '$' characters if(cycle <= rl) c = pReadSequences->at(ne.index).get(rl - cycle); //std::cout << "Inserting " << c << " at position " << num_copied + num_inserted << "\n"; writeBWT.set(num_wrote++, c); num_inserted += 1; // Update the nvector element ne.sym = c; // Record the rank of the inserted symbol ne.position = rank.get(c); // Update the rank and the number of suffixes that start with c rank.increment(c); suffixStartCounts.increment(c); } // Copy any remaining symbols in the bwt while(num_copied < total_read_symbols) writeBWT.set(num_wrote++, readBWT.get(num_copied++)); return num_wrote; }
// Validate that the sampled occurrence array is correct void Occurrence::validate(const BWTString& bwStr) const { size_t l = bwStr.length(); AlphaCount64 sum; for(size_t i = 0; i < l; ++i) { char currB = bwStr.get(i); sum.increment(currB); AlphaCount64 calculated = get(bwStr, i); for(int i = 0; i < ALPHABET_SIZE; ++i) assert(calculated.get(ALPHABET[i]) == sum.get(ALPHABET[i])); } }
// Convert the relative positions stored in bcrVector into absolute ones.
//
// For every symbol of RANK_ALPHABET the value of
// suffixSymbolCounts.getLessThan(symbol) is looked up, and that value is then
// added to the position of each element whose sym is that symbol.
void BWTCA::calculateAbsolutePositions(BCRVector& bcrVector, const AlphaCount64& suffixSymbolCounts)
{
    // Per-symbol offset table built from the suffix symbol counts
    AlphaCount64 offsets;
    for(int rankIdx = 0; rankIdx < BWT_ALPHABET::size; ++rankIdx)
    {
        const char sym = RANK_ALPHABET[rankIdx];
        int64_t numSmaller = suffixSymbolCounts.getLessThan(sym);
        offsets.set(sym, numSmaller);
    }

    // Shift every element by the offset of its symbol
    const size_t numElems = bcrVector.size();
    for(size_t idx = 0; idx != numElems; ++idx)
    {
        BCRElem& elem = bcrVector[idx];
        elem.position += offsets.get(elem.sym);
    }
}
// Perform duplicate check // Look up the interval of the read in the BWT. If the index of the read DuplicateCheckResult QCProcess::performDuplicateCheck(const SequenceWorkItem& workItem) { assert(m_params.pSharedBV != NULL); std::string w = workItem.read.seq.toString(); std::string rc_w = reverseComplement(w); // Look up the interval of the sequence and its reverse complement BWTIntervalPair fwdIntervals = BWTAlgorithms::findIntervalPair(m_params.pBWT, m_params.pRevBWT, w); BWTIntervalPair rcIntervals = BWTAlgorithms::findIntervalPair(m_params.pBWT, m_params.pRevBWT, rc_w); // Check if this read is a substring of any other // This is indicated by the presence of a non-$ extension in the left or right direction AlphaCount64 fwdECL = BWTAlgorithms::getExtCount(fwdIntervals.interval[0], m_params.pBWT); AlphaCount64 fwdECR = BWTAlgorithms::getExtCount(fwdIntervals.interval[1], m_params.pRevBWT); AlphaCount64 rcECL = BWTAlgorithms::getExtCount(rcIntervals.interval[0], m_params.pBWT); AlphaCount64 rcECR = BWTAlgorithms::getExtCount(rcIntervals.interval[1], m_params.pRevBWT); if(fwdECL.hasDNAChar() || fwdECR.hasDNAChar() || rcECL.hasDNAChar() || rcECR.hasDNAChar()) { // Substring reads are always removed so no need to update the bit vector return DCR_SUBSTRING; } // Calculate the lexicographic intervals for the fwd and reverse intervals BWTAlgorithms::updateBothL(fwdIntervals, '$', m_params.pBWT); BWTAlgorithms::updateBothL(rcIntervals, '$', m_params.pBWT); // Calculate the canonical index for this string - the lowest // value in the two lexicographic index int64_t fi = fwdIntervals.interval[0].isValid() ? fwdIntervals.interval[0].lower : std::numeric_limits<int64_t>::max(); int64_t ri = rcIntervals.interval[0].isValid() ? 
rcIntervals.interval[0].lower : std::numeric_limits<int64_t>::max(); int64_t canonicalIdx = std::min(fi, ri); // Check if the bit reprsenting the canonical index is set in the shared bit vector if(!m_params.pSharedBV->test(canonicalIdx)) { // This read is not a duplicate // Attempt to atomically set the bit from false to true if(m_params.pSharedBV->updateCAS(canonicalIdx, false, true)) { // Call succeed, return that this read is not a duplicate return DCR_UNIQUE; } else { // Call failed, some other thread set the bit before // this thread. Return that the reead is a duplicate return DCR_FULL_LENGTH_DUPLICATE; } } else { // this read is duplicate return DCR_FULL_LENGTH_DUPLICATE; } }
// Calculate the successors of this node in the implicit deBruijn graph StringVector StringThreader::getDeBruijnExtensions(StringThreaderNode* pNode) { WARN_ONCE("TODO: Refactor StringThreader to use new deBruijn code"); // Get the last k-1 bases of the node std::string pmer = pNode->getSuffix(m_kmer - 1); std::string rc_pmer = reverseComplement(pmer); // Get an interval for the p-mer and its reverse complement BWTIntervalPair ip = BWTAlgorithms::findIntervalPair(m_pBWT, m_pRevBWT, pmer); BWTIntervalPair rc_ip = BWTAlgorithms::findIntervalPair(m_pBWT, m_pRevBWT, rc_pmer); // Get the extension bases AlphaCount64 extensions; AlphaCount64 rc_extensions; if(ip.interval[1].isValid()) extensions += BWTAlgorithms::getExtCount(ip.interval[1], m_pRevBWT); if(rc_ip.interval[1].isValid()) rc_extensions = BWTAlgorithms::getExtCount(rc_ip.interval[0], m_pBWT); rc_extensions.complement(); extensions += rc_extensions; // Loop over the DNA symbols, if there is are more than two characters create a branch // otherwise just perform an extension. bool hasExtension = extensions.hasDNAChar(); StringVector out; if(hasExtension) { for(int i = 0; i < DNA_ALPHABET::size; ++i) { char b = DNA_ALPHABET::getBase(i); if(extensions.get(b) > 0) { // extend to b std::string tmp; tmp.append(1,b); out.push_back(tmp); } } } // If the node branched, return true so the outer function can remove it from the leaf list return out; }
// Initialize the counts from the bwt string b void Occurrence::initialize(const BWTString& bwStr, int sampleRate) { m_sampleRate = sampleRate; m_shift = calculateShiftValue(m_sampleRate); size_t l = bwStr.length(); int num_samples = (l % m_sampleRate == 0) ? (l / m_sampleRate) : (l / m_sampleRate + 1); m_values.resize(num_samples); AlphaCount64 sum; for(size_t i = 0; i < l; ++i) { char currB = bwStr.get(i); sum.increment(currB); if(i % m_sampleRate == 0) m_values[i / m_sampleRate] = sum; } }
// Check if sequence is composed of predominantely a single base // Returns true if the sequence is not degenrate bool QCProcess::performDegenerateCheck(const SequenceWorkItem& item) { std::string w = item.read.seq.toString(); AlphaCount64 bc; for(size_t i = 0; i < w.size(); ++i) { bc.increment(w[i]); } size_t maxCount = bc.getMaxCount(); double prop = (double)maxCount / w.size(); if(prop > m_params.degenProportion) { if(m_params.verbose > 0) std::cout << "Read " << w << " failed degenerate filter\n"; return false; } return true; }
// Fill in the FM-index data structures void SBWT::initializeFMIndex(int sampleRate) { // initialize the occurance table m_occurrence.initialize(m_bwStr, sampleRate); // Calculate the C(a) array // Calculate the total number of occurances of each character in the BW str AlphaCount64 tmp; for(size_t i = 0; i < m_bwStr.length(); ++i) { tmp.increment(m_bwStr.get(i)); } m_predCount.set('$', 0); m_predCount.set('A', tmp.get('$')); m_predCount.set('C', m_predCount.get('A') + tmp.get('A')); m_predCount.set('G', m_predCount.get('C') + tmp.get('C')); m_predCount.set('T', m_predCount.get('G') + tmp.get('G')); }
// Update N and the output BWT for the initial cycle, corresponding to the sentinel suffixes // the symbolCounts vector is updated to hold the number of times each symbol has been inserted // into the bwt void BWTCA::outputInitialCycle(const DNAEncodedStringVector* pReadSequences, BCRVector& bcrVector, DNAEncodedString& bwt, AlphaCount64& suffixSymbolCounts) { AlphaCount64 incomingSymbolCounts; size_t n = pReadSequences->size(); size_t first_read_len = pReadSequences->at(0).length(); for(size_t i = 0; i < n; ++i) { size_t rl = pReadSequences->at(i).length(); // Check that all reads are the same length if(rl != first_read_len) { std::cout << "Error: This implementation of BCR requires all reads to be the same length\n"; exit(EXIT_FAILURE); } char c = pReadSequences->at(i).get(rl - 1); bwt.set(i, c); assert(rl > 1); // Load the elements of the N vector with the next symbol bcrVector[i].sym = c; bcrVector[i].index = i; // Set the relative position of the symbol that is being inserted bcrVector[i].position = incomingSymbolCounts.get(c); // Increment the count of the first base of the suffix of the // incoming strings. This is $ for the initial cycle suffixSymbolCounts.increment('$'); // Update the inserted symbols incomingSymbolCounts.increment(c); } suffixSymbolCounts += incomingSymbolCounts; }
void FMIndex::loadSGABWT(const std::string& filename) { FMIndexBuilder builder(filename, m_smallSampleRate, m_largeSampleRate); size_t n = 0; // Load the compressed string from the file std::ifstream str_reader(builder.getStringFilename().c_str()); n = builder.getNumStringBytes(); m_string.resize(n); str_reader.read(reinterpret_cast<char*>(&m_string[0]), n); // Load the small markers from the file std::ifstream sm_reader(builder.getSmallMarkerFilename().c_str()); n = builder.getNumSmallMarkers(); m_smallMarkers.resize(n); sm_reader.read(reinterpret_cast<char*>(&m_smallMarkers[0]), sizeof(SmallMarker) * n); // Load the large markers from the file std::ifstream lm_reader(builder.getLargeMarkerFilename().c_str()); n = builder.getNumLargeMarkers(); m_largeMarkers.resize(n); lm_reader.read(reinterpret_cast<char*>(&m_largeMarkers[0]), sizeof(LargeMarker) * n); m_numStrings = builder.getNumStrings(); m_numSymbols = builder.getNumSymbols(); AlphaCount64 totals = builder.getSymbolCounts(); assert(totals.get('$') + totals.get('A') + totals.get('C') + totals.get('G') + totals.get('T') == m_numSymbols); m_predCount.set('$', 0); m_predCount.set('A', totals.get('$')); m_predCount.set('C', m_predCount.get('A') + totals.get('A')); m_predCount.set('G', m_predCount.get('C') + totals.get('C')); m_predCount.set('T', m_predCount.get('G') + totals.get('G')); assert(m_predCount.get('T') + totals.get('T') == m_numSymbols); m_decoder = builder.getDecoder(); printInfo(); }
// Count the de Bruijn graph extensions of the k-mer str in the given direction
// using only a single (forward) BWT.
//
// Let pmer be the k-1 bases of str adjacent to the extension point (the last
// k-1 bases for ED_SENSE, the first k-1 otherwise) and rc_pmer its reverse
// complement. With a single index only one of the two can be queried directly
// with getExtCount; the other is handled by looking up each candidate k-mer
// formed by appending one symbol:
//   ED_SENSE:  direct lookup on rc_pmer, per-symbol lookups on pmer
//   otherwise: direct lookup on pmer,    per-symbol lookups on rc_pmer
// The counts gathered for the reverse complement are complemented and added to
// the counts for pmer, so the result is expressed on the strand of str.
//
// pFwdCache may be NULL; when it is non-NULL the cached interval lookup is used.
AlphaCount64 BWTAlgorithms::calculateDeBruijnExtensionsSingleIndex(const std::string str, const BWT* pBWT, EdgeDir direction, const BWTIntervalCache* pFwdCache)
{
    const size_t k = str.size();
    const size_t p = k - 1;
    const bool isSense = (direction == ED_SENSE);

    // In the sense direction, we extend from the 3' end
    std::string pmer = isSense ? str.substr(1, p) : str.substr(0, p);
    assert(pmer.length() == p);
    std::string rc_pmer = reverseComplement(pmer);

    // Counts on the strand of str and on the opposite strand
    AlphaCount64 extensions;
    AlphaCount64 rc_extensions;

    // Decide which string/count pair is queried directly and which indirectly
    AlphaCount64& directEC = isSense ? rc_extensions : extensions;
    AlphaCount64& indirectEC = isSense ? extensions : rc_extensions;
    const std::string& directStr = isSense ? rc_pmer : pmer;
    const std::string& indirectStr = isSense ? pmer : rc_pmer;

    // Direct query: one interval lookup followed by an extension count
    BWTInterval interval;
    if(pFwdCache)
        interval = BWTAlgorithms::findIntervalWithCache(pBWT, pFwdCache, directStr);
    else
        interval = BWTAlgorithms::findInterval(pBWT, directStr);

    if(interval.isValid())
        directEC = BWTAlgorithms::getExtCount(interval, pBWT);

    // Indirect query: build a k-length string whose first p characters are the
    // indirect string, then vary the last character over the BWT alphabet
    std::string query(k, 'A');
    query.replace(0, p, indirectStr);
    const int lastIdx = query.size() - 1;

    for(int symIdx = 0; symIdx < BWT_ALPHABET::size; ++symIdx)
    {
        const char sym = BWT_ALPHABET::getChar(symIdx);
        query[lastIdx] = sym;

        if(pFwdCache)
            interval = BWTAlgorithms::findIntervalWithCache(pBWT, pFwdCache, query);
        else
            interval = BWTAlgorithms::findInterval(pBWT, query);

        // The interval size is the number of occurrences of this k-mer
        if(interval.isValid())
            indirectEC.add(sym, interval.size());
    }

    // Switch the reverse-complement extensions to the same strand as str
    rc_extensions.complement();
    extensions += rc_extensions;
    return extensions;
}
// Run the bubble construction process HaplotypeBuilderReturnCode DeBruijnHaplotypeBuilder::run(StringVector& out_haplotypes) { PROFILE_FUNC("GraphCompare::buildVariantStringGraph") assert(!m_startingKmer.empty()); std::map<std::string, int> kmerCountMap; // We search until we find the first common vertex in each direction size_t MIN_TARGET_COUNT = m_parameters.bReferenceMode ? 1 : 2; size_t MAX_ITERATIONS = 2000; size_t MAX_SIMULTANEOUS_BRANCHES = 40; size_t MAX_TOTAL_BRANCHES = 50; // Tracking stats size_t max_simul_branches_used = 0; size_t total_branches = 0; size_t iterations = 0; // Initialize the graph StringGraph* pGraph = new StringGraph; BuilderExtensionQueue queue; Vertex* pVertex = new(pGraph->getVertexAllocator()) Vertex(m_startingKmer, m_startingKmer); pVertex->setColor(GC_BLACK); pGraph->addVertex(pVertex); // Add the vertex to the extension queue queue.push(BuilderExtensionNode(pVertex, ED_SENSE)); queue.push(BuilderExtensionNode(pVertex, ED_ANTISENSE)); std::vector<Vertex*> sense_join_vector; std::vector<Vertex*> antisense_join_vector; // Perform the extension. 
The while conditions are heuristics to avoid searching // the graph too much while(!queue.empty() && iterations++ < MAX_ITERATIONS && queue.size() < MAX_SIMULTANEOUS_BRANCHES && total_branches < MAX_TOTAL_BRANCHES) { if(queue.size() > max_simul_branches_used) max_simul_branches_used = queue.size(); BuilderExtensionNode curr = queue.front(); queue.pop(); // Calculate de Bruijn extensions for this node std::string vertStr = curr.pVertex->getSeq().toString(); AlphaCount64 extensionCounts = BWTAlgorithms::calculateDeBruijnExtensionsSingleIndex(vertStr, m_parameters.variantIndex.pBWT, curr.direction); std::string extensionsUsed; for(size_t i = 0; i < DNA_ALPHABET::size; ++i) { char b = DNA_ALPHABET::getBase(i); size_t count = extensionCounts.get(b); bool acceptExt = count >= m_parameters.minDBGCount; if(!acceptExt) continue; extensionsUsed.push_back(b); std::string newStr = VariationBuilderCommon::makeDeBruijnVertex(vertStr, b, curr.direction); kmerCountMap[newStr] = count; // Create the new vertex and edge in the graph // Skip if the vertex already exists if(pGraph->getVertex(newStr) != NULL) continue; // Allocate the new vertex and add it to the graph Vertex* pVertex = new(pGraph->getVertexAllocator()) Vertex(newStr, newStr); pVertex->setColor(GC_BLACK); pGraph->addVertex(pVertex); // Add edges VariationBuilderCommon::addSameStrandDeBruijnEdges(pGraph, curr.pVertex, pVertex, curr.direction); // Check if this sequence is present in the FM-index of the target // If so, it is the join point of the de Bruijn graph and we extend no further. 
size_t targetCount = BWTAlgorithms::countSequenceOccurrences(newStr, m_parameters.baseIndex); if(targetCount >= MIN_TARGET_COUNT) { if(curr.direction == ED_SENSE) sense_join_vector.push_back(pVertex); else antisense_join_vector.push_back(pVertex); } else { // Add the vertex to the extension queue queue.push(BuilderExtensionNode(pVertex, curr.direction)); } } // Update the total number of times we branches the search if(!extensionsUsed.empty()) total_branches += extensionsUsed.size() - 1; } // If the graph construction was successful, walk the graph // between the endpoints to make a string // Generate haplotypes between every pair of antisense/sense join vertices for(size_t i = 0; i < antisense_join_vector.size(); ++i) { for(size_t j = 0; j < sense_join_vector.size(); ++j) { SGWalkVector outWalks; SGSearch::findWalks(antisense_join_vector[i], sense_join_vector[j], ED_SENSE, 100000, // max distance to search 10000, // max nodes to search true, // exhaustive search outWalks); for(size_t k = 0; k < outWalks.size(); ++k) out_haplotypes.push_back(outWalks[k].getString(SGWT_START_TO_END)); } } delete pGraph; return HBRC_OK; }