// Write the final BWT to a file. size_t BWTCA::outputFinalBWT(BCRVector& bcrVector, const DNAEncodedString& readBWT, size_t partial_bwt_symbols, BWTWriterBinary* pBWTWriter, SAWriter* pSAWriter) { // Counters size_t num_copied = 0; size_t num_inserted = 0; for(size_t i = 0; i < bcrVector.size(); ++i) { BCRElem& ne = bcrVector[i]; // Copy elements from the read bwt until we reach the target position while(num_copied + num_inserted < ne.position) pBWTWriter->writeBWChar(readBWT.get(num_copied++)); // Write a single $, terminating this string pBWTWriter->writeBWChar('$'); pSAWriter->writeElem(SAElem(ne.index, 0)); num_inserted += 1; } // Copy any remaining symbols in the bwt while(num_copied < partial_bwt_symbols) pBWTWriter->writeBWChar(readBWT.get(num_copied++)); pBWTWriter->finalize(); return num_copied + num_inserted; }
// Initialize a suffix array for the strings in RT void SuffixArray::initialize(const ReadTable& rt) { size_t n = rt.countSumLengths() + rt.getCount(); initialize(n, rt.getCount()); // Fill the data table with the linear ordering of the suffixes size_t count = 0; for(size_t i = 0; i < rt.getCount(); ++i) { // + 1 below is for the empty suffix (is it actually needed?) for(size_t j = 0; j < rt.getRead(i).seq.length() + 1; ++j) { m_data[count++] = SAElem(i, j); } } }
// Implementation of induced copying algorithm by // Nong, Zhang, Chan // Follows implementation given as an appendix to their 2008 paper // '\0' is the sentinenl in this algorithm void saca_induced_copying(SuffixArray* pSA, const ReadTable* pRT, int numThreads) { // In the multiple strings case, we need a 2D bit array // to hold the L/S types for the suffixes size_t num_strings = pRT->getCount(); char** type_array = new char*[num_strings]; for(size_t i = 0; i < num_strings; ++i) { size_t s_len = pRT->getReadLength(i) + 1; size_t num_bytes = (s_len / 8) + 1; type_array[i] = new char[num_bytes]; assert(type_array[i] != 0); memset(type_array[i], 0, num_bytes); } // Classify each suffix as being L or S type for(size_t i = 0; i < num_strings; ++i) { size_t s_len = pRT->getReadLength(i) + 1; // The empty suffix ($) for each string is defined to be S type // and hence the next suffix must be L type setBit(type_array, i, s_len - 1, 1); setBit(type_array, i, s_len - 2, 0); for(int64_t j = s_len - 3; j >= 0; --j) { char curr_c = GET_CHAR(i, j); char next_c = GET_CHAR(i, j + 1); bool s_type = (curr_c < next_c || (curr_c == next_c && getBit(type_array, i, j + 1) == 1)); setBit(type_array, i, j, s_type); } } // setup buckets const int ALPHABET_SIZE = 5; int64_t bucket_counts[ALPHABET_SIZE]; int64_t buckets[ALPHABET_SIZE]; // find the ends of the buckets countBuckets(pRT, bucket_counts, ALPHABET_SIZE); getBuckets(bucket_counts, buckets, ALPHABET_SIZE, true); // Initialize the suffix array size_t num_suffixes = buckets[ALPHABET_SIZE - 1]; pSA->initialize(num_suffixes, pRT->getCount()); // Copy all the LMS substrings into the first n1 places in the SA size_t n1 = 0; for(size_t i = 0; i < num_strings; ++i) { size_t s_len = pRT->getReadLength(i) + 1; for(size_t j = 0; j < s_len; ++j) { if(isLMS(i,j)) pSA->set(n1++, SAElem(i, j)); } } /* //induceSAl(pRT, pSA, type_array, bucket_counts, buckets, num_suffixes, ALPHABET_SIZE, false); //induceSAs(pRT, pSA, type_array, bucket_counts, buckets, num_suffixes, ALPHABET_SIZE, true); // Compact all the sorted substrings into the first portion of the SA size_t n1 = 0; for(size_t i = 0; i < num_suffixes; ++i) { SAElem elem = pSA->get(i); if(!elem.isEmpty() && isLMS(elem.getID(), elem.getPos())) { pSA->set(n1++, elem); } } */ double ratio = (double)n1 / (double)num_suffixes; std::cout << "[saca] calling mkqs on " << n1 << " suffixes " << ratio << " using " << numThreads << " threads \n"; // Call MKQS, first on the sequence and then on the index in the read table SuffixCompareRadix radix_compare(pRT, 6); SuffixCompareIndex index_compare; //SuffixCompareID id_compare(pRT); if(numThreads <= 1) mkqs2(&pSA->m_data[0], n1, 0, radix_compare, index_compare); else parallel_mkqs(&pSA->m_data[0], n1, numThreads, radix_compare, index_compare); std::cout << "[saca] mkqs finished\n"; // Induction sort the remaining suffixes for(size_t i = n1; i < num_suffixes; ++i) pSA->set(i, SAElem()); // Find the ends of the buckets getBuckets(bucket_counts, buckets, ALPHABET_SIZE, true); for(int64_t i = n1 - 1; i >= 0; --i) { SAElem elem_i = pSA->get(i); pSA->set(i, SAElem()); // empty char c = GET_CHAR(elem_i.getID(), elem_i.getPos()); pSA->set(--buckets[GET_BKT(c)], elem_i); } induceSAl(pRT, pSA, type_array, bucket_counts, buckets, num_suffixes, ALPHABET_SIZE, false); induceSAs(pRT, pSA, type_array, bucket_counts, buckets, num_suffixes, ALPHABET_SIZE, true); // deallocate t array for(size_t i = 0; i < num_strings; ++i) { delete [] type_array[i]; } delete [] type_array; }