/*
 * compute SA
 *
 * Runs two induction passes over SA:
 *   1. a left-to-right scan (SAl) that fills buckets from their starts;
 *   2. a right-to-left scan (SAs) that fills buckets from their ends.
 * Entries are stored either as j or bit-complemented (~j); the scans use the
 * sign of an entry to decide whether it still has to be processed.
 *
 * T      input text, accessed through the chr() macro
 * SA     suffix array being built in place
 * C, B   bucket count / bucket boundary arrays; when they are the same array
 *        the counts are recomputed before every getBuckets() call
 * n, k   passed to getCounts()/getBuckets(); n is also the number of SA slots
 * cs     forwarded to getCounts() (and presumably consumed by chr())
 */
static void induceSA(const unsigned char *T, int *SA, int *C, int *B, int n, int k, int cs) {
  int slot;         /* next write index inside the currently open bucket */
  int pos, suf;
  int ch, open_ch;  /* character just read / character of the open bucket */

  /* compute SAl */
  if (C == B) getCounts(T, C, n, k, cs);
  getBuckets(C, B, k, 0); /* find starts of buckets */

  /* seed the scan with the last suffix */
  suf = n - 1;
  open_ch = chr(suf);
  slot = B[open_ch];
  if ((0 < suf) && (chr(suf - 1) < open_ch)) {
    SA[slot] = ~suf;
  } else {
    SA[slot] = suf;
  }
  ++slot;

  for (pos = 0; pos < n; ++pos) {
    suf = SA[pos];
    SA[pos] = ~suf; /* flip the mark of every visited entry */
    if (suf <= 0) continue;

    --suf;
    ch = chr(suf);
    if (ch != open_ch) {
      /* save the cursor of the bucket we leave, load the one we enter */
      B[open_ch] = slot;
      open_ch = ch;
      slot = B[open_ch];
    }
    if ((0 < suf) && (chr(suf - 1) < open_ch)) {
      SA[slot] = ~suf;
    } else {
      SA[slot] = suf;
    }
    ++slot;
  }

  /* compute SAs */
  if (C == B) getCounts(T, C, n, k, cs);
  getBuckets(C, B, k, 1); /* find ends of buckets */

  open_ch = 0;
  slot = B[open_ch];
  for (pos = n - 1; 0 <= pos; --pos) {
    suf = SA[pos];
    if (0 < suf) {
      --suf;
      ch = chr(suf);
      if (ch != open_ch) {
        B[open_ch] = slot;
        open_ch = ch;
        slot = B[open_ch];
      }
      --slot; /* buckets are filled backwards in this pass */
      if ((suf == 0) || (chr(suf - 1) > open_ch)) {
        SA[slot] = ~suf;
      } else {
        SA[slot] = suf;
      }
    } else {
      SA[pos] = ~suf; /* restore a complemented entry to its plain value */
    }
  }
}
/*
 * sort all type LMS suffixes
 *
 * Two induction passes over SA: a forward pass that fills buckets from their
 * starts (SAl) and a backward pass that fills them from their ends (SAs).
 * Every processed source entry is cleared to 0; entries may be written
 * bit-complemented as an in-band mark that later steps test by sign.
 *
 * C == B means counts and boundaries share one array, so the counts are
 * recomputed before each getBuckets() call.
 */
static void LMSsort1(const void *T, sais_index_type *SA, sais_index_type *C, sais_index_type *B, sais_index_type n, sais_index_type k, int cs) {
  sais_index_type slot;        /* write index inside the open bucket */
  sais_index_type pos, suf;
  sais_index_type ch, open_ch; /* character just read / open bucket */

  /* compute SAl */
  if (C == B) {
    getCounts(T, C, n, k, cs);
  }
  getBuckets(C, B, k, 0); /* find starts of buckets */

  /* seed with the predecessor of the last suffix */
  suf = n - 1;
  open_ch = chr(suf);
  slot = B[open_ch];
  --suf;
  if (chr(suf) < open_ch) {
    SA[slot] = ~suf;
  } else {
    SA[slot] = suf;
  }
  ++slot;

  for (pos = 0; pos < n; ++pos) {
    suf = SA[pos];
    if (0 < suf) {
      assert(chr(suf) >= chr(suf + 1));
      ch = chr(suf);
      if (ch != open_ch) {
        /* park the cursor of the bucket we leave, pick up the new one */
        B[open_ch] = slot;
        open_ch = ch;
        slot = B[open_ch];
      }
      assert(pos < slot);
      --suf;
      if (chr(suf) < open_ch) {
        SA[slot] = ~suf;
      } else {
        SA[slot] = suf;
      }
      ++slot;
      SA[pos] = 0;
    } else if (suf < 0) {
      SA[pos] = ~suf; /* un-complement a marked entry */
    }
  }

  /* compute SAs */
  if (C == B) {
    getCounts(T, C, n, k, cs);
  }
  getBuckets(C, B, k, 1); /* find ends of buckets */

  open_ch = 0;
  slot = B[open_ch];
  for (pos = n - 1; 0 <= pos; --pos) {
    suf = SA[pos];
    if (0 < suf) {
      assert(chr(suf) <= chr(suf + 1));
      ch = chr(suf);
      if (ch != open_ch) {
        B[open_ch] = slot;
        open_ch = ch;
        slot = B[open_ch];
      }
      assert((slot) <= pos);
      --suf;
      --slot; /* buckets grow downwards in this pass */
      if (chr(suf) > open_ch) {
        SA[slot] = ~(suf + 1);
      } else {
        SA[slot] = suf;
      }
      SA[pos] = 0;
    }
  }
}
void LMSsort1(string_type T, sarray_type SA, bucketC_type C, bucketB_type B, index_type n, index_type k, bool recount) { typedef typename TextValueType<string_type>::value_type char_type; sarray_type b; index_type i, j; char_type c0, c1; /* compute SAl */ if(recount != false) { getCounts(T, C, n, k); } getBuckets(C, B, k, false); /* find starts of buckets */ j = n - 1; b = SA + B[c1 = T[j]]; --j; *b++ = (T[j] < c1) ? ~j : j; for(i = 0; i < n; ++i) { if(0 < (j = SA[i])) { assert(T[j] >= T[j + 1]); if((c0 = T[j]) != c1) { B[c1] = b - SA; b = SA + B[c1 = c0]; } assert(i < (b - SA)); --j; *b++ = (T[j] < c1) ? ~j : j; SA[i] = 0; } else if(j < 0) { SA[i] = ~j; } } /* compute SAs */ if(recount != false) { getCounts(T, C, n, k); } getBuckets(C, B, k, true); /* find ends of buckets */ for(i = n - 1, b = SA + B[c1 = 0]; 0 <= i; --i) { if(0 < (j = SA[i])) { assert(T[j] <= T[j + 1]); if((c0 = T[j]) != c1) { B[c1] = b - SA; b = SA + B[c1 = c0]; } assert((b - SA) <= i); --j; *--b = (T[j] > c1) ? ~(j + 1) : j; SA[i] = 0; } } }
/*
 * sort all type LMS suffixes
 *
 * Forward pass (SAl): fill buckets from their starts while scanning SA left
 * to right. Backward pass (SAs): fill buckets from their ends while scanning
 * right to left. Processed source entries are zeroed; values may be stored
 * bit-complemented as an in-band mark tested by sign.
 *
 * When C and B are the same array the counts are rebuilt before each
 * getBuckets() call.
 */
static void LMSsort1(const void *T, int *SA, int *C, int *B, int n, int k, int cs) {
  int cursor;       /* index of the next write inside the open bucket */
  int pos, suf;
  int ch, open_ch;  /* character just read / character of the open bucket */

  /* compute SAl */
  if (C == B) {
    getCounts(T, C, n, k, cs);
  }
  getBuckets(C, B, k, 0); /* find starts of buckets */

  /* seed with the predecessor of the last suffix */
  suf = n - 1;
  open_ch = chr(suf);
  cursor = B[open_ch];
  --suf;
  SA[cursor] = (chr(suf) < open_ch) ? ~suf : suf;
  ++cursor;

  for (pos = 0; pos < n; ++pos) {
    suf = SA[pos];
    if (suf < 0) {
      SA[pos] = ~suf; /* un-complement a marked entry */
      continue;
    }
    if (suf == 0) {
      continue;
    }
    assert(chr(suf) >= chr(suf + 1));
    ch = chr(suf);
    if (ch != open_ch) {
      /* store the cursor of the bucket being left, load the new one */
      B[open_ch] = cursor;
      open_ch = ch;
      cursor = B[open_ch];
    }
    assert(pos < cursor);
    --suf;
    SA[cursor] = (chr(suf) < open_ch) ? ~suf : suf;
    ++cursor;
    SA[pos] = 0;
  }

  /* compute SAs */
  if (C == B) {
    getCounts(T, C, n, k, cs);
  }
  getBuckets(C, B, k, 1); /* find ends of buckets */

  open_ch = 0;
  cursor = B[open_ch];
  for (pos = n - 1; 0 <= pos; --pos) {
    suf = SA[pos];
    if (suf <= 0) {
      continue;
    }
    assert(chr(suf) <= chr(suf + 1));
    ch = chr(suf);
    if (ch != open_ch) {
      B[open_ch] = cursor;
      open_ch = ch;
      cursor = B[open_ch];
    }
    assert(cursor <= pos);
    --suf;
    --cursor; /* buckets grow downwards in this pass */
    SA[cursor] = (chr(suf) > open_ch) ? ~(suf + 1) : suf;
    SA[pos] = 0;
  }
}
/*
 * Scatter the entries held in SA[1 .. n1-1] to the ends of their buckets.
 *
 * Walking from SA[n1-1] down to SA[1], each entry j is removed (its slot is
 * zeroed) and re-stored at the current end of the bucket selected by s[j];
 * that bucket end then moves one position to the left. SA[0] finally
 * receives n - 1, the single sentinel suffix.
 */
void putSuffix0(long *SA, unsigned char *s, long *bkt, long n, long K, long n1) {
  long src;     /* slot currently being emptied */
  long suffix;  /* value taken out of that slot */
  long dest;    /* bucket end the value is moved to */

  /* find the end of each bucket. */
  getBuckets(s, bkt, n, K, 1);

  /* put the suffixes into their buckets. */
  src = n1 - 1;
  while (src > 0) {
    suffix = SA[src];
    SA[src] = 0;
    dest = bkt[s[suffix]];
    bkt[s[suffix]] = dest - 1;
    SA[dest] = suffix;
    --src;
  }

  SA[0] = n - 1; /* set the single sentinel suffix. */
}
void putSuffix0(unsigned int *SA, const unsigned char *s, unsigned int *bkt, unsigned int n, unsigned int K, int n1) { unsigned int i, j; // find the end of each bucket. getBuckets(s, bkt, n, K, true); // put the suffixes into their buckets. for(i=n1-1; i>0; i--) { j=SA[i]; SA[i]=0; SA[bkt[s[j]]--]=j; } SA[0]=n-1; // set the single sentinel suffix. }
void induceSAs(const DNASeqList& reads, SuffixArray* sa, char** type_array, size_t* counts, size_t* buckets, size_t n, size_t K, bool end) { getBuckets(counts, buckets, K, end); for (size_t i = n; i > 0; --i) { const SuffixArray::Elem& ielem = (*sa)[i - 1]; if (!ielem.empty() && ielem.j > 0) { LOG4CXX_TRACE(logger, boost::format("Curr: %d %d") % ielem.i % ielem.j); SuffixArray::Elem jelem(ielem.i, ielem.j - 1); if (getBit(type_array, jelem.i, jelem.j)) { const DNASeq& read = reads[jelem.i]; char c = read.seq[jelem.j]; LOG4CXX_TRACE(logger, boost::format("<iSA1>Placing %d %d at position %d") % jelem.i % jelem.j % (buckets[DNAAlphabet::torank(c)] - 1)); (*sa)[--buckets[DNAAlphabet::torank(c)]] = jelem; } } } }
/*
 * Backward induction pass over SA[n-1 .. 1].
 *
 * For every positive entry SA[i], the preceding suffix j = SA[i] - 1 is
 * placed at the current end of bucket s[j] when s[j] <= s[j + 1] and that
 * bucket end lies strictly left of i; the bucket end then moves one slot to
 * the left. When `suffix` is zero the source slot SA[i] is cleared after a
 * successful placement.
 */
void induceSAs0(long *SA, unsigned char *s, long *bkt, long n, long K, char suffix) {
  long pos;          /* slot being scanned */
  long pred;         /* suffix preceding the one stored at pos */
  unsigned char ch;  /* first character of pred, i.e. its bucket */

  /* find the end of each bucket. */
  getBuckets(s, bkt, n, K, 1);

  for (pos = n - 1; pos > 0; pos--) {
    if (SA[pos] <= 0) continue;

    pred = SA[pos] - 1;
    ch = s[pred];
    if (ch > s[pred + 1]) continue;  /* predecessor not eligible in this pass */
    if (bkt[ch] >= pos) continue;    /* bucket end is not left of the scan position */

    SA[bkt[ch]] = pred;
    bkt[ch]--;
    if (!suffix) SA[pos] = 0;
  }
}
void induceSAs0(unsigned int *SA, const unsigned char *s, unsigned int *bkt, unsigned int n, unsigned int K, bool suffix) { unsigned int i, j; // find the end of each bucket. getBuckets(s, bkt, n, K, true); for(i=n-1; i>0; i--) if(SA[i]>0) { j=SA[i]-1; if(s[j]<=s[j+1] && bkt[s[j]]<i) { SA[bkt[s[j]]]=j; bkt[s[j]]--; if(!suffix) SA[i]=0; } } }
/*
 * Forward induction pass over SA[0 .. n-1].
 *
 * For every positive entry SA[i], the preceding suffix j = SA[i] - 1 is
 * appended at the current head of bucket s[j] when s[j] >= s[j + 1]; the
 * bucket head then advances by one. When `suffix` is zero the source slot is
 * cleared after a placement, except for slot 0 which is always kept.
 */
void induceSAl0(long *SA, unsigned char *s, long *bkt, long n, long K, char suffix) {
  long pos;          /* slot being scanned */
  long pred;         /* suffix preceding the one stored at pos */
  unsigned char ch;  /* first character of pred, i.e. its bucket */

  /* find the head of each bucket. */
  getBuckets(s, bkt, n, K, 0);

  bkt[0]++; /* skip the virtual sentinel. */

  for (pos = 0; pos < n; pos++) {
    if (SA[pos] <= 0) continue;

    pred = SA[pos] - 1;
    ch = s[pred];
    if (ch < s[pred + 1]) continue;  /* predecessor not eligible in this pass */

    SA[bkt[ch]] = pred;
    bkt[ch]++;
    if (!suffix && pos > 0) SA[pos] = 0;
  }
}
void induceSAs(const ReadTable* pRT, SuffixArray* pSA, char** p_array, int64_t* counts, int64_t* buckets, size_t n, int K, bool end) { getBuckets(counts, buckets, K, end); for(int64_t i = n - 1; i >= 0; --i) { const SAElem& elem_i = pSA->get(i); if(!elem_i.isEmpty() && elem_i.getPos() > 0) { //std::cout << "<isas>Curr: " << elem_i << "\n"; SAElem elem_j(elem_i.getID(), elem_i.getPos() - 1); if(getBit(p_array, elem_j.getID(), elem_j.getPos())) { char c = GET_CHAR(elem_j.getID(),elem_j.getPos()); //std::cout << "<iSAs>Placing " << elem_j << " at position " << buckets[GET_BKT(c)] - 1 << "\n"; pSA->set(--buckets[GET_BKT(c)], elem_j); } } } }
void induceSAl0(unsigned int *SA, const unsigned char *s, unsigned int *bkt, unsigned int n, unsigned int K, bool suffix) { unsigned int i, j; // find the head of each bucket. getBuckets(s, bkt, n, K, false); bkt[0]++; // skip the virtual sentinel. for(i=0; i<n; i++) if(SA[i]>0) { j=SA[i]-1; if(s[j]>=s[j+1]) { SA[bkt[s[j]]]=j; bkt[s[j]]++; if(!suffix && i>0) SA[i]=0; } } }
/* * find the suffix array SA of T[0..n-1] in {0..k-1}^n use a working * space (excluding T and SA) of at most 2n+O(1) for a constant alphabet */ static int sais_main(const unsigned char *T, seqint_t *SA, seqint_t fs, seqint_t n, seqint_t k, int cs) { seqint_t *C, *B, *RA; seqint_t j, c, m, p, q, plen, qlen, name; long int i; seqint_t c0, c1; seqint_t diff; /* stage 1: reduce the problem by at least 1/2 sort all the * S-substrings */ if (k <= fs) { C = SA + n; B = (k <= (fs - k)) ? C + k : C; } else if ((C = B = (seqint_t *) malloc(k * sizeof(seqint_t))) == NULL) return -2; getCounts(T, C, n, k, cs); getBuckets(C, B, k, 1); /* find ends of buckets */ for (i = 0; i < n; ++i) SA[i] = 0; for (i = n - 2, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) { if ((c0 = chr(i)) < (c1 + c)) c = 1; else if (c != 0) SA[--B[c1]] = i + 1, c = 0; } induceSA(T, SA, C, B, n, k, cs); if (fs < k) free(C); /* compact all the sorted substrings into the first m items of SA * 2*m must be not larger than n (proveable) */ for (i = 0, m = 0; i < n; ++i) { p = SA[i]; if ((0 < p) && (chr(p - 1) > (c0 = chr(p)))) { for (j = p + 1; (j < n) && (c0 == (c1 = chr(j))); ++j); if ((j < n) && (c0 < c1)) SA[m++] = p; } } for (i = m; i < n; ++i) SA[i] = 0; /* init the name array buffer */ /* store the length of all substrings */ for (i = n - 2, j = n, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) { if ((c0 = chr(i)) < (c1 + c)) c = 1; else if (c != 0) { SA[m + ((i + 1) >> 1)] = j - i - 1; j = i + 1; c = 0; } }
/*
 * Clear SA and drop the start position of every LMS substring at the end of
 * its bucket.
 *
 * The text is scanned right to left while tracking two type flags (1 means
 * "smaller than its successor, or equal to it with the successor flagged 1").
 * A position i is recorded when position i - 1 is flagged 0 and position i is
 * flagged 1. SA[0] finally receives n - 1, the sentinel LMS substring.
 */
void putSubstr0(long *SA, unsigned char *s, long *bkt, long n, long K) {
  long pos;
  long dest;
  int type_here;    /* flag of s[pos]     */
  int type_before;  /* flag of s[pos - 1] */

  /* find the end of each bucket. */
  getBuckets(s, bkt, n, K, 1);

  /* set each item in SA as empty. */
  for (pos = 0; pos < n; pos++) {
    SA[pos] = 0;
  }

  type_here = 0; /* s[n-2] must be L-type. */
  for (pos = n - 2; pos > 0; pos--) {
    if (s[pos - 1] < s[pos]) {
      type_before = 1;
    } else if (s[pos - 1] == s[pos]) {
      type_before = (type_here == 1) ? 1 : 0; /* ties inherit the successor's flag */
    } else {
      type_before = 0;
    }

    if (type_before == 0 && type_here == 1) {
      dest = bkt[s[pos]];
      bkt[s[pos]] = dest - 1;
      SA[dest] = pos;
    }
    type_here = type_before;
  }

  /* set the single sentinel LMS-substring. */
  SA[0] = n - 1;
}
// Build the suffix array for a non-empty collection of reads.
//
// Steps:
//   1. classify every suffix (including the empty "$" suffix of each read)
//      into a per-read bit array;
//   2. collect all positions reported by isLMS() at the front of the array
//      and sort them with mkqs2;
//   3. move the sorted entries to the ends of their buckets and run
//      induceSAl / induceSAs to place the remaining suffixes.
//
// The returned SuffixArray is allocated with new and handed to the caller.
// `threads` is currently only reported in the log: the single- and
// multi-threaded configurations run the same serial mkqs2 call.
SuffixArray* build(const DNASeqList& reads, size_t threads = 1) {
    assert(!reads.empty());
    const size_t num_strings = reads.size();

    // In the multiple strings case a 2D bit array holds the types of the
    // suffixes: one zero-initialised row per read, sized for length + 1 bits.
    char** type_array = new char*[num_strings];
    for (size_t r = 0; r < num_strings; ++r) {
        const size_t num_bytes = (reads[r].seq.length() + 1) / 8 + 1;
        type_array[r] = new char[num_bytes];
        memset(type_array[r], 0, num_bytes);
    }

    // Classify each suffix, scanning every read right to left.
    for (size_t r = 0; r < num_strings; ++r) {
        const DNASeq& read = reads[r];
        const size_t len = read.seq.length() + 1;

        // The empty suffix ($) gets bit 1; the suffix before it gets bit 0.
        setBit(type_array, r, len - 1, 1);
        if (read.seq.empty()) {
            continue;
        }
        setBit(type_array, r, len - 2, 0);

        for (size_t pos = len - 2; pos > 0; --pos) {
            const char curr = read.seq[pos - 1];
            const char next = read.seq[pos];
            bool type = curr < next;
            if (!type && curr == next) {
                // Equal characters inherit the type of the following suffix.
                type = (getBit(type_array, r, pos) == 1);
            }
            setBit(type_array, r, pos - 1, type);
        }
    }

    // Bucket bookkeeping: per-symbol counts and the derived boundaries.
    size_t bucket_counts[DNAAlphabet::ALL_SIZE];
    size_t buckets[DNAAlphabet::ALL_SIZE];
    countBuckets(reads, bucket_counts, DNAAlphabet::ALL_SIZE);

    // The total number of suffixes is the sum of all bucket counts.
    size_t num_suffixes = std::accumulate(bucket_counts, bucket_counts + DNAAlphabet::ALL_SIZE, (size_t)0);
    LOG4CXX_DEBUG(logger, boost::format("initialize SA, strings: %d, suffixes: %d") % num_strings % num_suffixes);
    SuffixArray* sa = new SuffixArray(num_strings, num_suffixes);

    // Copy all the LMS positions into the first n1 places of the SA.
    size_t n1 = 0;
    for (size_t r = 0; r < num_strings; ++r) {
        const DNASeq& read = reads[r];
        for (size_t pos = 0; pos < read.seq.length() + 1; ++pos) {
            if (!isLMS(type_array, r, pos)) {
                continue;
            }
            SuffixArray::Elem& slot = (*sa)[n1];
            ++n1;
            slot.i = r;
            slot.j = pos;
        }
    }

    // Call MKQS, first on the sequence and then on the index in the read table.
    LOG4CXX_DEBUG(logger, boost::format("calling mkqs on %d of %d suffixes(%f), using %d threads") % n1 % num_suffixes % ((double)n1 / num_suffixes) % threads);
    {
        SuffixRadixCmp radixcmp(reads);
        SuffixIndexCmp indexcmp;
        // Same serial call regardless of `threads`.
        mkqs2(&(*sa)[0], n1, 0, radixcmp, indexcmp);
    }
    LOG4CXX_DEBUG(logger, "mkqs finished");

    // Induction sort the remaining suffixes: first blank everything past the
    // sorted prefix ...
    for (size_t idx = n1; idx < num_suffixes; ++idx) {
        (*sa)[idx] = SuffixArray::Elem();
    }

    // ... then move the sorted prefix, last entry first, to the ends of the
    // buckets chosen by each entry's first character.
    getBuckets(bucket_counts, buckets, DNAAlphabet::ALL_SIZE, true);
    size_t remaining = n1;
    while (remaining > 0) {
        --remaining;
        SuffixArray::Elem elem = (*sa)[remaining];
        (*sa)[remaining] = SuffixArray::Elem();  // empty
        const DNASeq& read = reads[elem.i];
        char c = read.seq[elem.j];
        (*sa)[--buckets[DNAAlphabet::torank(c)]] = elem;
    }

    induceSAl(reads, sa, type_array, bucket_counts, buckets, num_suffixes, DNAAlphabet::ALL_SIZE, false);
    induceSAs(reads, sa, type_array, bucket_counts, buckets, num_suffixes, DNAAlphabet::ALL_SIZE, true);

    // Release the type bit array.
    for (size_t r = 0; r < num_strings; ++r) {
        SAFE_DELETE_ARRAY(type_array[r]);
    }
    SAFE_DELETE_ARRAY(type_array);

    return sa;
}
// Implementation of induced copying algorithm by // Nong, Zhang, Chan // Follows implementation given as an appendix to their 2008 paper // '\0' is the sentinenl in this algorithm void saca_induced_copying(SuffixArray* pSA, const ReadTable* pRT, int numThreads) { // In the multiple strings case, we need a 2D bit array // to hold the L/S types for the suffixes size_t num_strings = pRT->getCount(); char** type_array = new char*[num_strings]; for(size_t i = 0; i < num_strings; ++i) { size_t s_len = pRT->getReadLength(i) + 1; size_t num_bytes = (s_len / 8) + 1; type_array[i] = new char[num_bytes]; assert(type_array[i] != 0); memset(type_array[i], 0, num_bytes); } // Classify each suffix as being L or S type for(size_t i = 0; i < num_strings; ++i) { size_t s_len = pRT->getReadLength(i) + 1; // The empty suffix ($) for each string is defined to be S type // and hence the next suffix must be L type setBit(type_array, i, s_len - 1, 1); setBit(type_array, i, s_len - 2, 0); for(int64_t j = s_len - 3; j >= 0; --j) { char curr_c = GET_CHAR(i, j); char next_c = GET_CHAR(i, j + 1); bool s_type = (curr_c < next_c || (curr_c == next_c && getBit(type_array, i, j + 1) == 1)); setBit(type_array, i, j, s_type); } } // setup buckets const int ALPHABET_SIZE = 5; int64_t bucket_counts[ALPHABET_SIZE]; int64_t buckets[ALPHABET_SIZE]; // find the ends of the buckets countBuckets(pRT, bucket_counts, ALPHABET_SIZE); getBuckets(bucket_counts, buckets, ALPHABET_SIZE, true); // Initialize the suffix array size_t num_suffixes = buckets[ALPHABET_SIZE - 1]; pSA->initialize(num_suffixes, pRT->getCount()); // Copy all the LMS substrings into the first n1 places in the SA size_t n1 = 0; for(size_t i = 0; i < num_strings; ++i) { size_t s_len = pRT->getReadLength(i) + 1; for(size_t j = 0; j < s_len; ++j) { if(isLMS(i,j)) pSA->set(n1++, SAElem(i, j)); } } /* //induceSAl(pRT, pSA, type_array, bucket_counts, buckets, num_suffixes, ALPHABET_SIZE, false); //induceSAs(pRT, pSA, type_array, bucket_counts, 
buckets, num_suffixes, ALPHABET_SIZE, true); // Compact all the sorted substrings into the first portion of the SA size_t n1 = 0; for(size_t i = 0; i < num_suffixes; ++i) { SAElem elem = pSA->get(i); if(!elem.isEmpty() && isLMS(elem.getID(), elem.getPos())) { pSA->set(n1++, elem); } } */ double ratio = (double)n1 / (double)num_suffixes; std::cout << "[saca] calling mkqs on " << n1 << " suffixes " << ratio << " using " << numThreads << " threads \n"; // Call MKQS, first on the sequence and then on the index in the read table SuffixCompareRadix radix_compare(pRT, 6); SuffixCompareIndex index_compare; //SuffixCompareID id_compare(pRT); if(numThreads <= 1) mkqs2(&pSA->m_data[0], n1, 0, radix_compare, index_compare); else parallel_mkqs(&pSA->m_data[0], n1, numThreads, radix_compare, index_compare); std::cout << "[saca] mkqs finished\n"; // Induction sort the remaining suffixes for(size_t i = n1; i < num_suffixes; ++i) pSA->set(i, SAElem()); // Find the ends of the buckets getBuckets(bucket_counts, buckets, ALPHABET_SIZE, true); for(int64_t i = n1 - 1; i >= 0; --i) { SAElem elem_i = pSA->get(i); pSA->set(i, SAElem()); // empty char c = GET_CHAR(elem_i.getID(), elem_i.getPos()); pSA->set(--buckets[GET_BKT(c)], elem_i); } induceSAl(pRT, pSA, type_array, bucket_counts, buckets, num_suffixes, ALPHABET_SIZE, false); induceSAs(pRT, pSA, type_array, bucket_counts, buckets, num_suffixes, ALPHABET_SIZE, true); // deallocate t array for(size_t i = 0; i < num_strings; ++i) { delete [] type_array[i]; } delete [] type_array; }