/**
 * Build a suffix array over every suffix of every read in `reads`.
 *
 * Steps, as performed below:
 *   1. classify each suffix of each read as L or S type in a packed bit table;
 *   2. collect the LMS suffixes at the front of the suffix array;
 *   3. sort those LMS suffixes with mkqs2;
 *   4. move them to the ends of their first-character buckets and let
 *      induceSAl / induceSAs place the remaining suffixes.
 *
 * @param reads   input reads; must not be empty (asserted).
 * @param threads only reported in the debug log; the sort always goes
 *                through the same mkqs2 call regardless of its value.
 * @return a SuffixArray allocated with `new`; this function does not free it.
 */
SuffixArray* build(const DNASeqList& reads, size_t threads = 1) {
    assert(!reads.empty());
    const size_t read_count = reads.size();

    // One bit row per read: bit j holds the L/S type of the suffix starting
    // at position j (1 = S type, 0 = L type). Rows start out zeroed.
    char** suffix_types = new char*[read_count];
    for (size_t r = 0; r < read_count; ++r) {
        const size_t row_bytes = (reads[r].seq.length() + 1) / 8 + 1;
        suffix_types[r] = new char[row_bytes];
        memset(suffix_types[r], 0, row_bytes);
    }

    // Fill in the L/S classification, scanning each read right to left.
    for (size_t r = 0; r < read_count; ++r) {
        const DNASeq& read = reads[r];
        const size_t seq_len = read.seq.length();

        // The empty suffix ($) is S type by definition.
        setBit(suffix_types, r, seq_len, 1);
        if (read.seq.empty()) {
            continue;
        }

        // The suffix right before $ is therefore L type.
        setBit(suffix_types, r, seq_len - 1, 0);

        size_t pos = seq_len - 1;
        while (pos > 0) {
            const char left = read.seq[pos - 1];
            const char right = read.seq[pos];
            bool is_s_type = (left < right);
            if (!is_s_type && left == right) {
                // Equal characters: inherit the type of the following suffix.
                is_s_type = (getBit(suffix_types, r, pos) == 1);
            }
            setBit(suffix_types, r, pos - 1, is_s_type);
            --pos;
        }
    }

    // Per-symbol suffix counts, and the bucket boundaries derived from them.
    size_t bucket_sizes[DNAAlphabet::ALL_SIZE];
    size_t bucket_ends[DNAAlphabet::ALL_SIZE];
    countBuckets(reads, bucket_sizes, DNAAlphabet::ALL_SIZE);

    // Total number of suffixes is the sum over all buckets.
    const size_t suffix_total = std::accumulate(bucket_sizes, bucket_sizes + DNAAlphabet::ALL_SIZE, static_cast<size_t>(0));
    LOG4CXX_DEBUG(logger, boost::format("initialize SA, strings: %d, suffixes: %d") % read_count % suffix_total);

    SuffixArray* sa = new SuffixArray(read_count, suffix_total);

    // Gather every LMS suffix into the leading slots of the suffix array.
    size_t lms_count = 0;
    for (size_t r = 0; r < read_count; ++r) {
        const size_t suffixes_in_read = reads[r].seq.length() + 1;
        for (size_t pos = 0; pos < suffixes_in_read; ++pos) {
            if (isLMS(suffix_types, r, pos)) {
                SuffixArray::Elem& slot = (*sa)[lms_count];
                slot.i = r;
                slot.j = pos;
                ++lms_count;
            }
        }
    }

    // Sort the LMS suffixes: by sequence first, then by index in the read table.
    LOG4CXX_DEBUG(logger, boost::format("calling mkqs on %d of %d suffixes(%f), using %d threads") % lms_count % suffix_total % (static_cast<double>(lms_count) / suffix_total) % threads);
    {
        SuffixRadixCmp radixcmp(reads);
        SuffixIndexCmp indexcmp;
        // Single code path: the requested thread count does not select a
        // different sorter here.
        mkqs2(&(*sa)[0], lms_count, 0, radixcmp, indexcmp);
    }
    LOG4CXX_DEBUG(logger, "mkqs finished");

    // Everything past the sorted LMS prefix starts out empty.
    for (size_t k = lms_count; k < suffix_total; ++k) {
        (*sa)[k] = SuffixArray::Elem();
    }

    // Compute bucket ends, then walk the sorted LMS suffixes from last to
    // first, dropping each at the current end of its first-character bucket.
    getBuckets(bucket_sizes, bucket_ends, DNAAlphabet::ALL_SIZE, true);
    size_t k = lms_count;
    while (k-- > 0) {
        const SuffixArray::Elem moved = (*sa)[k];
        (*sa)[k] = SuffixArray::Elem();  // vacate the source slot

        const char first_char = reads[moved.i].seq[moved.j];
        size_t& bucket_end = bucket_ends[DNAAlphabet::torank(first_char)];
        --bucket_end;
        (*sa)[bucket_end] = moved;
    }

    // Induce the L-type suffixes, then the S-type suffixes.
    induceSAl(reads, sa, suffix_types, bucket_sizes, bucket_ends, suffix_total, DNAAlphabet::ALL_SIZE, false);
    induceSAs(reads, sa, suffix_types, bucket_sizes, bucket_ends, suffix_total, DNAAlphabet::ALL_SIZE, true);

    // Release the type table.
    for (size_t r = 0; r < read_count; ++r) {
        SAFE_DELETE_ARRAY(suffix_types[r]);
    }
    SAFE_DELETE_ARRAY(suffix_types);

    return sa;
}
// Implementation of induced copying algorithm by // Nong, Zhang, Chan // Follows implementation given as an appendix to their 2008 paper // '\0' is the sentinenl in this algorithm void saca_induced_copying(SuffixArray* pSA, const ReadTable* pRT, int numThreads) { // In the multiple strings case, we need a 2D bit array // to hold the L/S types for the suffixes size_t num_strings = pRT->getCount(); char** type_array = new char*[num_strings]; for(size_t i = 0; i < num_strings; ++i) { size_t s_len = pRT->getReadLength(i) + 1; size_t num_bytes = (s_len / 8) + 1; type_array[i] = new char[num_bytes]; assert(type_array[i] != 0); memset(type_array[i], 0, num_bytes); } // Classify each suffix as being L or S type for(size_t i = 0; i < num_strings; ++i) { size_t s_len = pRT->getReadLength(i) + 1; // The empty suffix ($) for each string is defined to be S type // and hence the next suffix must be L type setBit(type_array, i, s_len - 1, 1); setBit(type_array, i, s_len - 2, 0); for(int64_t j = s_len - 3; j >= 0; --j) { char curr_c = GET_CHAR(i, j); char next_c = GET_CHAR(i, j + 1); bool s_type = (curr_c < next_c || (curr_c == next_c && getBit(type_array, i, j + 1) == 1)); setBit(type_array, i, j, s_type); } } // setup buckets const int ALPHABET_SIZE = 5; int64_t bucket_counts[ALPHABET_SIZE]; int64_t buckets[ALPHABET_SIZE]; // find the ends of the buckets countBuckets(pRT, bucket_counts, ALPHABET_SIZE); getBuckets(bucket_counts, buckets, ALPHABET_SIZE, true); // Initialize the suffix array size_t num_suffixes = buckets[ALPHABET_SIZE - 1]; pSA->initialize(num_suffixes, pRT->getCount()); // Copy all the LMS substrings into the first n1 places in the SA size_t n1 = 0; for(size_t i = 0; i < num_strings; ++i) { size_t s_len = pRT->getReadLength(i) + 1; for(size_t j = 0; j < s_len; ++j) { if(isLMS(i,j)) pSA->set(n1++, SAElem(i, j)); } } /* //induceSAl(pRT, pSA, type_array, bucket_counts, buckets, num_suffixes, ALPHABET_SIZE, false); //induceSAs(pRT, pSA, type_array, bucket_counts, 
buckets, num_suffixes, ALPHABET_SIZE, true); // Compact all the sorted substrings into the first portion of the SA size_t n1 = 0; for(size_t i = 0; i < num_suffixes; ++i) { SAElem elem = pSA->get(i); if(!elem.isEmpty() && isLMS(elem.getID(), elem.getPos())) { pSA->set(n1++, elem); } } */ double ratio = (double)n1 / (double)num_suffixes; std::cout << "[saca] calling mkqs on " << n1 << " suffixes " << ratio << " using " << numThreads << " threads \n"; // Call MKQS, first on the sequence and then on the index in the read table SuffixCompareRadix radix_compare(pRT, 6); SuffixCompareIndex index_compare; //SuffixCompareID id_compare(pRT); if(numThreads <= 1) mkqs2(&pSA->m_data[0], n1, 0, radix_compare, index_compare); else parallel_mkqs(&pSA->m_data[0], n1, numThreads, radix_compare, index_compare); std::cout << "[saca] mkqs finished\n"; // Induction sort the remaining suffixes for(size_t i = n1; i < num_suffixes; ++i) pSA->set(i, SAElem()); // Find the ends of the buckets getBuckets(bucket_counts, buckets, ALPHABET_SIZE, true); for(int64_t i = n1 - 1; i >= 0; --i) { SAElem elem_i = pSA->get(i); pSA->set(i, SAElem()); // empty char c = GET_CHAR(elem_i.getID(), elem_i.getPos()); pSA->set(--buckets[GET_BKT(c)], elem_i); } induceSAl(pRT, pSA, type_array, bucket_counts, buckets, num_suffixes, ALPHABET_SIZE, false); induceSAs(pRT, pSA, type_array, bucket_counts, buckets, num_suffixes, ALPHABET_SIZE, true); // deallocate t array for(size_t i = 0; i < num_strings; ++i) { delete [] type_array[i]; } delete [] type_array; }