Beispiel #1
0
Datei: is.c Projekt: sonwell/bwa
/*
 * Populate SA from the entries already stored in it, using two bucket-driven
 * sweeps: a forward sweep that appends at bucket starts, then a backward
 * sweep that prepends at bucket ends.
 *
 * T  - text of length n, accessed through chr()
 * SA - array of n entries; it is read as well as overwritten in place, so
 *      the caller is expected to have initialised it before the call
 * C  - character counts; recomputed with getCounts() before each sweep when
 *      C and B are the same array (presumably because getBuckets() then
 *      overwrites the counts -- confirm against getBuckets())
 * B  - bucket boundaries, filled by getBuckets()
 * n  - number of characters in T
 * k  - alphabet size handed to getCounts()/getBuckets()
 * cs - passed through to getCounts(); presumably the character size used
 *      by chr() -- confirm against the macro definition
 */
static void induceSA(const unsigned char *T, int *SA, int *C, int *B, int n, int k, int cs)
{
	int *b, i, j;
	int  c0, c1;
	/* compute SAl */
	if (C == B) getCounts(T, C, n, k, cs);
	getBuckets(C, B, k, 0);	/* find starts of buckets */
	/* seed the forward sweep with the last position of T */
	j = n - 1;
	b = SA + B[c1 = chr(j)];
	/* the value is stored bit-complemented when the preceding character is smaller than c1 */
	*b++ = ((0 < j) && (chr(j - 1) < c1)) ? ~j : j;
	for (i = 0; i < n; ++i) {
		/* read the entry and leave its bitwise complement behind */
		j = SA[i], SA[i] = ~j;
		if (0 < j) {
			--j;
			/* on a bucket change, save the old bucket's write cursor and load the new one */
			if ((c0 = chr(j)) != c1) {
				B[c1] = b - SA;
				b = SA + B[c1 = c0];
			}
			*b++ = ((0 < j) && (chr(j - 1) < c1)) ? ~j : j;
		}
	}
	/* compute SAs */
	if (C == B) getCounts(T, C, n, k, cs);
	getBuckets(C, B, k, 1);	/* find ends of buckets */
	/* backward sweep: b starts at the end of bucket 0 and is pre-decremented on every write */
	for (i = n - 1, b = SA + B[c1 = 0]; 0 <= i; --i) {
		if (0 < (j = SA[i])) {
			--j;
			/* on a bucket change, save the old bucket's write cursor and load the new one */
			if ((c0 = chr(j)) != c1) {
				B[c1] = b - SA;
				b = SA + B[c1 = c0];
			}
			/* complemented when j is 0 or the preceding character is larger than c1 */
			*--b = ((j == 0) || (chr(j - 1) > c1)) ? ~j : j;
		} else SA[i] = ~j;	/* non-positive entries are complemented back in place */
	}
}
Beispiel #2
0
/*
 * Induced sort driven by the entries already present in SA.
 *
 * Sweep 1 walks SA left to right and appends at bucket starts; sweep 2 walks
 * SA right to left and prepends at bucket ends.  A value is written
 * bit-complemented whenever the character in front of it breaks the order
 * expected by the current sweep (see the comparisons against cur below).
 *
 * T  - text, read through chr()
 * SA - array of n entries, read and rewritten in place
 * C  - character counts; recomputed before each sweep when C aliases B
 * B  - bucket boundaries produced by getBuckets()
 * n  - text length, k - alphabet size, cs - forwarded to getCounts()
 */
static
void
LMSsort1(const void *T, sais_index_type *SA,
         sais_index_type *C, sais_index_type *B,
         sais_index_type n, sais_index_type k, int cs) {
  sais_index_type slot, idx, pos;
  sais_index_type sym, cur;

  /* sweep 1: bucket starts */
  if(C == B) { getCounts(T, C, n, k, cs); }
  getBuckets(C, B, k, 0);
  pos = n - 1;
  cur = chr(pos);
  slot = B[cur];
  pos -= 1;
  if(chr(pos) < cur) { SA[slot] = ~pos; } else { SA[slot] = pos; }
  slot += 1;
  for(idx = 0; idx < n; idx += 1) {
    pos = SA[idx];
    if(pos < 0) {
      /* undo the complement and move on */
      SA[idx] = ~pos;
      continue;
    }
    if(pos == 0) { continue; }
    assert(chr(pos) >= chr(pos + 1));
    sym = chr(pos);
    if(sym != cur) {
      /* park the cursor of the bucket we leave, pick up the new one */
      B[cur] = slot;
      cur = sym;
      slot = B[cur];
    }
    assert(idx < slot);
    pos -= 1;
    if(chr(pos) < cur) { SA[slot] = ~pos; } else { SA[slot] = pos; }
    slot += 1;
    SA[idx] = 0;
  }

  /* sweep 2: bucket ends */
  if(C == B) { getCounts(T, C, n, k, cs); }
  getBuckets(C, B, k, 1);
  cur = 0;
  slot = B[cur];
  for(idx = n - 1; 0 <= idx; idx -= 1) {
    pos = SA[idx];
    if(pos <= 0) { continue; }
    assert(chr(pos) <= chr(pos + 1));
    sym = chr(pos);
    if(sym != cur) {
      B[cur] = slot;
      cur = sym;
      slot = B[cur];
    }
    assert(slot <= idx);
    pos -= 1;
    slot -= 1;
    /* the complemented form keeps the un-decremented position */
    if(chr(pos) > cur) { SA[slot] = ~(pos + 1); } else { SA[slot] = pos; }
    SA[idx] = 0;
  }
}
Beispiel #3
0
    /*
     * Induced sort driven by the entries already stored in SA.
     *
     * A forward sweep appends at bucket starts, then a backward sweep
     * prepends at bucket ends.  Values are written bit-complemented (~)
     * when the comparison against the current bucket character c1 succeeds.
     *
     * T       - text, indexed directly
     * SA      - array of n entries, read and rewritten in place
     * C       - character counts; recomputed before each sweep when
     *           recount is true
     * B       - bucket boundaries filled by getBuckets()
     * n       - text length
     * k       - alphabet size handed to getCounts()/getBuckets()
     * recount - whether getCounts() must be re-run before each getBuckets()
     */
    void
    LMSsort1(string_type T, sarray_type SA,
             bucketC_type C, bucketB_type B,
             index_type n, index_type k, bool recount) {
    typedef typename TextValueType<string_type>::value_type char_type;
      sarray_type b;
      index_type i, j;
      char_type c0, c1;

      /* compute SAl */
      if(recount != false) { getCounts(T, C, n, k); }
      getBuckets(C, B, k, false); /* find starts of buckets */
      /* seed the sweep with position n - 2, placed in the bucket of T[n - 1] */
      j = n - 1;
      b = SA + B[c1 = T[j]];
      --j;
      *b++ = (T[j] < c1) ? ~j : j;
      for(i = 0; i < n; ++i) {
        if(0 < (j = SA[i])) {
          assert(T[j] >= T[j + 1]);
          /* on a bucket change, save the old write cursor and load the new one */
          if((c0 = T[j]) != c1) { B[c1] = b - SA; b = SA + B[c1 = c0]; }
          assert(i < (b - SA));
          --j;
          *b++ = (T[j] < c1) ? ~j : j;
          /* the consumed entry is cleared */
          SA[i] = 0;
        } else if(j < 0) {
          /* negative entries are complemented back in place */
          SA[i] = ~j;
        }
      }
      /* compute SAs */
      if(recount != false) { getCounts(T, C, n, k); }
      getBuckets(C, B, k, true); /* find ends of buckets */
      /* backward sweep: b starts at the end of bucket 0 and is pre-decremented on every write */
      for(i = n - 1, b = SA + B[c1 = 0]; 0 <= i; --i) {
        if(0 < (j = SA[i])) {
          assert(T[j] <= T[j + 1]);
          if((c0 = T[j]) != c1) { B[c1] = b - SA; b = SA + B[c1 = c0]; }
          assert((b - SA) <= i);
          --j;
          /* the complemented form stores the un-decremented position (j + 1) */
          *--b = (T[j] > c1) ? ~(j + 1) : j;
          SA[i] = 0;
        }
      }
    }
Beispiel #4
0
/*
 * Induced sort driven by the entries already present in SA.
 *
 * The first sweep scans SA from the front and appends through a cursor that
 * starts at a bucket head; the second sweep scans from the back and prepends
 * through a cursor that starts at a bucket tail.  Values are written
 * bit-complemented when the character in front of them compares as shown
 * against the current bucket character.
 *
 * T  - text, read through chr()
 * SA - array of n entries, read and rewritten in place
 * C  - character counts; recomputed before each sweep when C aliases B
 * B  - bucket boundaries produced by getBuckets()
 * n  - text length, k - alphabet size, cs - forwarded to getCounts()
 */
static
void
LMSsort1(const void *T, int *SA,
    int *C, int *B,
    int n, int k, int cs) {
    int *out;
    int scan, suf;
    int sym, cur;

    /* forward sweep over bucket heads */
    if(C == B) { getCounts(T, C, n, k, cs); }
    getBuckets(C, B, k, 0);
    suf = n - 1;
    cur = chr(suf);
    out = SA + B[cur];
    suf -= 1;
    if(chr(suf) < cur) { *out = ~suf; } else { *out = suf; }
    out += 1;
    for(scan = 0; scan < n; ++scan) {
        suf = SA[scan];
        if(suf > 0) {
            assert(chr(suf) >= chr(suf + 1));
            sym = chr(suf);
            if(sym != cur) {
                /* remember where the old bucket stopped, resume the new one */
                B[cur] = out - SA;
                cur = sym;
                out = SA + B[cur];
            }
            assert(scan < (out - SA));
            suf -= 1;
            if(chr(suf) < cur) { *out = ~suf; } else { *out = suf; }
            out += 1;
            SA[scan] = 0;
        } else if(suf < 0) {
            /* restore a complemented entry */
            SA[scan] = ~suf;
        }
    }

    /* backward sweep over bucket tails */
    if(C == B) { getCounts(T, C, n, k, cs); }
    getBuckets(C, B, k, 1);
    cur = 0;
    out = SA + B[cur];
    for(scan = n - 1; scan >= 0; --scan) {
        suf = SA[scan];
        if(suf > 0) {
            assert(chr(suf) <= chr(suf + 1));
            sym = chr(suf);
            if(sym != cur) {
                B[cur] = out - SA;
                cur = sym;
                out = SA + B[cur];
            }
            assert((out - SA) <= scan);
            suf -= 1;
            out -= 1;
            /* the complemented form keeps the un-decremented position */
            if(chr(suf) > cur) { *out = ~(suf + 1); } else { *out = suf; }
            SA[scan] = 0;
        }
    }
}
Beispiel #5
0
/*
 * Move entries SA[n1-1] .. SA[1] to the current end of the bucket selected
 * by the first character of each entry, clearing the slot they came from.
 * SA[0] is then set to n - 1.
 *
 * SA  - array holding the entries to scatter in its first n1 slots
 * s   - text used to pick the bucket of each entry
 * bkt - bucket cursors, (re)initialised here by getBuckets()
 * n   - text length, K - alphabet size, n1 - number of leading entries
 */
void putSuffix0(long *SA, unsigned char *s, long *bkt, long n, long K,
                long n1) {
	long src;

	/* point every bucket cursor at the end of its bucket */
	getBuckets(s, bkt, n, K, 1);

	/* walk the leading entries from the back; slot 0 is handled below */
	src = n1 - 1;
	while (src > 0) {
		long suffix;
		long dest;

		suffix = SA[src];
		SA[src] = 0;
		dest = bkt[s[suffix]];
		bkt[s[suffix]] = dest - 1;
		SA[dest] = suffix;
		--src;
	}

	/* the sentinel suffix always sits in the first slot */
	SA[0] = n - 1;
}
Beispiel #6
0
/**
 * Move entries SA[n1-1] .. SA[1] to the current end of the bucket selected
 * by the first character of each entry, clearing the slot they came from,
 * then store n - 1 in SA[0].
 *
 * @param SA  array holding the entries to scatter in its first n1 slots
 * @param s   text used to pick the bucket of each entry
 * @param bkt bucket cursors, (re)initialised here by getBuckets()
 * @param n   text length
 * @param K   alphabet size handed to getBuckets()
 * @param n1  number of leading entries; values <= 0 move nothing
 */
void putSuffix0(unsigned int *SA, 
  const unsigned char *s, unsigned int *bkt, 
  unsigned int n, unsigned int K, int n1) {
  // find the end of each bucket.
  getBuckets(s, bkt, n, K, true);

  // put the suffixes into their buckets.
  // The counter is unsigned while n1 is signed: seeding it with n1-1 for
  // n1 <= 0 would wrap to a huge value and index far outside SA, so clamp.
  unsigned int i = (n1 > 0) ? static_cast<unsigned int>(n1 - 1) : 0u;
  for(; i>0; i--) {
    const unsigned int j=SA[i];
    SA[i]=0;
    SA[bkt[s[j]]--]=j;
  }
  SA[0]=n-1; // set the single sentinel suffix.
}
    void induceSAs(const DNASeqList& reads, SuffixArray* sa, char** type_array, size_t* counts, size_t* buckets, size_t n, size_t K, bool end) {
        getBuckets(counts, buckets, K, end);
        for (size_t i = n; i > 0; --i) {
            const SuffixArray::Elem& ielem = (*sa)[i - 1];
            if (!ielem.empty() && ielem.j > 0) {
                LOG4CXX_TRACE(logger, boost::format("Curr: %d %d") % ielem.i % ielem.j);

                SuffixArray::Elem jelem(ielem.i, ielem.j - 1);
                if (getBit(type_array, jelem.i, jelem.j)) {
                    const DNASeq& read = reads[jelem.i];
                    char c = read.seq[jelem.j];
                    LOG4CXX_TRACE(logger,  boost::format("<iSA1>Placing %d %d at position %d") % jelem.i % jelem.j % (buckets[DNAAlphabet::torank(c)] - 1));
                    (*sa)[--buckets[DNAAlphabet::torank(c)]] = jelem;
                }
            }
        }
    }
Beispiel #8
0
/*
 * Backward induction sweep.  For each positive SA[i] (i from n-1 down to 1)
 * the preceding position j = SA[i] - 1 is written at the cursor of bucket
 * s[j] when s[j] <= s[j+1] and that cursor lies strictly before i; the
 * cursor then moves one slot towards the front.
 *
 * SA     - array read and written in place
 * s      - text
 * bkt    - bucket cursors, (re)initialised here by getBuckets()
 * n      - text length, K - alphabet size
 * suffix - when zero, the source slot SA[i] is cleared after a write
 */
void induceSAs0(long *SA, unsigned char *s, long *bkt, long n, long K,
                char suffix) {
	long pos;

	/* point every bucket cursor at the end of its bucket */
	getBuckets(s, bkt, n, K, 1);

	for (pos = n - 1; pos > 0; --pos) {
		long prev;
		unsigned char ch;

		if (SA[pos] <= 0)
			continue;
		prev = SA[pos] - 1;
		ch = s[prev];
		if (ch > s[prev + 1] || bkt[ch] >= pos)
			continue;
		SA[bkt[ch]] = prev;
		bkt[ch] -= 1;
		if (suffix == 0)
			SA[pos] = 0;
	}
}
Beispiel #9
0
/**
 * Backward induction sweep.  For each non-zero SA[i] (i from n-1 down to 1)
 * the preceding position j = SA[i] - 1 is written at the cursor of bucket
 * s[j] when s[j] <= s[j+1] and that cursor lies strictly before i; the
 * cursor then moves one slot towards the front.
 *
 * @param SA     array read and written in place
 * @param s      text
 * @param bkt    bucket cursors, (re)initialised here by getBuckets()
 * @param n      text length
 * @param K      alphabet size handed to getBuckets()
 * @param suffix when false, the source slot SA[i] is cleared after a write
 */
void induceSAs0(unsigned int *SA,
  const unsigned char *s, unsigned int *bkt,
  unsigned int n, unsigned int K, bool suffix) {
  // point every bucket cursor at the end of its bucket
  getBuckets(s, bkt, n, K, true);

  for(unsigned int pos=n-1; pos>0; --pos) {
    const unsigned int cur=SA[pos];
    if(cur==0) continue;   // empty slot, nothing to induce from

    const unsigned int prev=cur-1;
    const unsigned char ch=s[prev];
    if(ch<=s[prev+1] && bkt[ch]<pos) {
      SA[bkt[ch]]=prev;
      --bkt[ch];
      if(!suffix) SA[pos]=0;
    }
  }
}
Beispiel #10
0
/*
 * Forward induction sweep.  For each positive SA[i] (i from 0 to n-1) the
 * preceding position j = SA[i] - 1 is written at the cursor of bucket s[j]
 * when s[j] >= s[j+1]; the cursor then moves one slot towards the back.
 *
 * SA     - array read and written in place
 * s      - text
 * bkt    - bucket cursors, (re)initialised here by getBuckets()
 * n      - text length, K - alphabet size
 * suffix - when zero, the source slot SA[i] is cleared after a write
 *          (slot 0 is never cleared)
 */
void induceSAl0(long *SA, unsigned char *s, long *bkt, long n, long K,
                char suffix) {
	long pos;

	/* point every bucket cursor at the head of its bucket */
	getBuckets(s, bkt, n, K, 0);

	/* step over the slot reserved for the virtual sentinel */
	bkt[0] += 1;
	for (pos = 0; pos < n; ++pos) {
		long prev;
		unsigned char ch;

		if (SA[pos] <= 0)
			continue;
		prev = SA[pos] - 1;
		ch = s[prev];
		if (ch < s[prev + 1])
			continue;
		SA[bkt[ch]] = prev;
		bkt[ch] += 1;
		if (suffix == 0 && pos > 0)
			SA[pos] = 0;
	}
}
Beispiel #11
0
void induceSAs(const ReadTable* pRT, SuffixArray* pSA, char** p_array, int64_t* counts, int64_t* buckets, size_t n, int K, bool end)
{
    getBuckets(counts, buckets, K, end);
    for(int64_t i = n - 1; i >= 0; --i)
    {
        const SAElem& elem_i = pSA->get(i);
        if(!elem_i.isEmpty() && elem_i.getPos() > 0)
        {
            //std::cout << "<isas>Curr: " << elem_i << "\n";
            SAElem elem_j(elem_i.getID(), elem_i.getPos() - 1);
            if(getBit(p_array, elem_j.getID(), elem_j.getPos()))
            {
                char c = GET_CHAR(elem_j.getID(),elem_j.getPos());
                //std::cout << "<iSAs>Placing " << elem_j << " at position " << buckets[GET_BKT(c)] - 1 << "\n";
                pSA->set(--buckets[GET_BKT(c)], elem_j);
            }
        }
    }
}
Beispiel #12
0
/**
 * Forward induction sweep.  For each non-zero SA[i] (i from 0 to n-1) the
 * preceding position j = SA[i] - 1 is written at the cursor of bucket s[j]
 * when s[j] >= s[j+1]; the cursor then moves one slot towards the back.
 *
 * @param SA     array read and written in place
 * @param s      text
 * @param bkt    bucket cursors, (re)initialised here by getBuckets()
 * @param n      text length
 * @param K      alphabet size handed to getBuckets()
 * @param suffix when false, the source slot SA[i] is cleared after a write
 *               (slot 0 is never cleared)
 */
void induceSAl0(unsigned int *SA,
  const unsigned char *s, unsigned int *bkt,
  unsigned int n, unsigned int K, bool suffix) {
  // point every bucket cursor at the head of its bucket
  getBuckets(s, bkt, n, K, false);

  ++bkt[0]; // step over the slot of the virtual sentinel
  for(unsigned int pos=0; pos<n; ++pos) {
    const unsigned int cur=SA[pos];
    if(cur==0) continue;   // empty slot, nothing to induce from

    const unsigned int prev=cur-1;
    const unsigned char ch=s[prev];
    if(ch>=s[prev+1]) {
      SA[bkt[ch]]=prev;
      ++bkt[ch];
      if(pos>0 && !suffix) SA[pos]=0;
    }
  }
}
Beispiel #13
0
/*
 * find the suffix array SA of T[0..n-1] in {0..k-1}^n use a working
 * space (excluding T and SA) of at most 2n+O(1) for a constant alphabet
 */
static int sais_main(const unsigned char *T, seqint_t *SA, seqint_t fs, seqint_t n, seqint_t k, int cs)
{
	seqint_t *C, *B, *RA;
	seqint_t  j, c, m, p, q, plen, qlen, name;
	long int i;
	seqint_t  c0, c1;
	seqint_t  diff;

	/* stage 1: reduce the problem by at least 1/2 sort all the
	 * S-substrings */
	if (k <= fs) {
		C = SA + n;
		B = (k <= (fs - k)) ? C + k : C;
	} else if ((C = B = (seqint_t *) malloc(k * sizeof(seqint_t))) == NULL) return -2;
	getCounts(T, C, n, k, cs);
	getBuckets(C, B, k, 1);	/* find ends of buckets */
	for (i = 0; i < n; ++i) SA[i] = 0;
	for (i = n - 2, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
		if ((c0 = chr(i)) < (c1 + c)) c = 1;
		else if (c != 0) SA[--B[c1]] = i + 1, c = 0;
	}
	induceSA(T, SA, C, B, n, k, cs);
	if (fs < k) free(C);
	/* compact all the sorted substrings into the first m items of SA
	 * 2*m must be not larger than n (proveable) */
	for (i = 0, m = 0; i < n; ++i) {
		p = SA[i];
		if ((0 < p) && (chr(p - 1) > (c0 = chr(p)))) {
			for (j = p + 1; (j < n) && (c0 == (c1 = chr(j))); ++j);
			if ((j < n) && (c0 < c1)) SA[m++] = p;
		}
	}
	for (i = m; i < n; ++i) SA[i] = 0;	/* init the name array buffer */
	/* store the length of all substrings */
	for (i = n - 2, j = n, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
		if ((c0 = chr(i)) < (c1 + c)) c = 1;
		else if (c != 0) {
			SA[m + ((i + 1) >> 1)] = j - i - 1;
			j = i + 1;
			c = 0;
		}
	}
Beispiel #14
0
/*
 * Clear SA, then scan the text from right to left and drop every position
 * i whose left neighbour is classified 0 while i itself is classified 1
 * at the end of bucket s[i].  SA[0] finally receives n - 1.
 *
 * Classification of a position: 1 when its character is smaller than the
 * next one, or equal to it while the next position is classified 1;
 * otherwise 0.
 *
 * SA  - output array of n entries
 * s   - text
 * bkt - bucket cursors, (re)initialised here by getBuckets()
 * n   - text length, K - alphabet size
 */
void putSubstr0(long *SA, unsigned char *s, long *bkt, long n, long K) {
	long pos;
	long here_is_s;

	/* point every bucket cursor at the end of its bucket */
	getBuckets(s, bkt, n, K, 1);

	/* mark every slot as empty */
	pos = 0;
	while (pos < n) {
		SA[pos] = 0;
		++pos;
	}

	/* the scan starts at n - 2, which is classified 0 */
	here_is_s = 0;
	for (pos = n - 2; pos > 0; --pos) {
		long left_is_s;

		if (s[pos - 1] < s[pos])
			left_is_s = 1;
		else if (s[pos - 1] == s[pos] && here_is_s == 1)
			left_is_s = 1;
		else
			left_is_s = 0;

		if (left_is_s == 0 && here_is_s == 1) {
			long dest = bkt[s[pos]];

			bkt[s[pos]] = dest - 1;
			SA[dest] = pos;
		}
		here_is_s = left_is_s;
	}

	/* the sentinel substring always sits in the first slot */
	SA[0] = n - 1;
}
    /**
     * Build a suffix array over all suffixes (including the empty suffix)
     * of every read in @p reads.
     *
     * Steps: classify each suffix with a per-read type bit array, collect
     * the positions reported by isLMS() at the front of the array, sort
     * them with mkqs2, move them to the ends of their buckets and run the
     * two induction passes (induceSAl, induceSAs).
     *
     * @param reads   non-empty list of reads (asserted)
     * @param threads only reported in the log message; sorting is always
     *                done by the single-threaded mkqs2 (no parallel sorter
     *                is called from this function)
     * @return a SuffixArray allocated with new; it is not deleted here, so
     *         the caller takes ownership
     */
    SuffixArray* build(const DNASeqList& reads, size_t threads = 1) {
        assert(!reads.empty());

        size_t num_strings = reads.size();

        // In the multiple strings case, we need a 2D bit array
        // to hold the L/S types for the suffixes
        char** type_array = new char*[num_strings];
        for (size_t i = 0; i < num_strings; ++i) {
            const DNASeq& read = reads[i];
            // one bit per suffix, including the empty one
            size_t num_bytes = (read.seq.length() + 1) / 8 + 1;
            type_array[i] = new char[num_bytes];
            memset(type_array[i], 0, num_bytes);
        }

        // Classify each suffix as being L or S type
        for (size_t i = 0; i < num_strings; ++i) {
            const DNASeq& read = reads[i];
            size_t len = read.seq.length() + 1;

            // The empty suffix ($) for each string is defined to be S type
            // and hence the next suffix must be L type
            setBit(type_array, i, len - 1, 1);
            if (!read.seq.empty()) {
                setBit(type_array, i, len - 2, 0);
                for (size_t j = len - 2; j > 0; --j) {
                    char curr = read.seq[j - 1], next = read.seq[j];
                    bool type = (curr < next || (curr == next && getBit(type_array, i, j) == 1));
                    setBit(type_array, i, j - 1, type);
                }
            }
        }

        // setup buckets
        size_t bucket_counts[DNAAlphabet::ALL_SIZE];
        size_t buckets[DNAAlphabet::ALL_SIZE];

        // count the characters; bucket boundaries are derived later
        countBuckets(reads, bucket_counts, DNAAlphabet::ALL_SIZE);

        // Initialize the suffix array
        size_t num_suffixes = std::accumulate(&bucket_counts[0], &bucket_counts[0] + DNAAlphabet::ALL_SIZE, (size_t)0);
        LOG4CXX_DEBUG(logger, boost::format("initialize SA, strings: %d, suffixes: %d") % num_strings % num_suffixes);

        SuffixArray* sa = new SuffixArray(num_strings, num_suffixes);

        // Copy all the LMS substrings into the first n1 places in the SA
        size_t n1 = 0;
        for (size_t i = 0; i < num_strings; ++i) {
            const DNASeq& read = reads[i];
            for (size_t j = 0; j < read.seq.length() + 1; ++j) {
                if (isLMS(type_array, i, j)) {
                    SuffixArray::Elem& ele = (*sa)[n1++];
                    ele.i = i;
                    ele.j = j;
                }
            }
        }

        // Call MKQS, first on the sequence and then on the index in the read table
        LOG4CXX_DEBUG(logger, boost::format("calling mkqs on %d of %d suffixes(%f), using %d threads") % n1 % num_suffixes % ((double)n1 / num_suffixes) % threads);
        {
            SuffixRadixCmp radixcmp(reads);
            SuffixIndexCmp indexcmp;
            // Both branches of the former thread-count check made this same
            // call, so it is issued unconditionally.
            mkqs2(&(*sa)[0], n1, 0, radixcmp, indexcmp);
        }
        LOG4CXX_DEBUG(logger, "mkqs finished");

        // Induction sort the remaining suffixes
        for (size_t i = n1; i < num_suffixes; ++i) {
            (*sa)[i] = SuffixArray::Elem();
        }

        // Find the ends of the buckets
        getBuckets(bucket_counts, buckets, DNAAlphabet::ALL_SIZE, true);

        // Move the sorted entries, last first, to the ends of their buckets
        for (size_t i = n1; i > 0; --i) {
            SuffixArray::Elem elem = (*sa)[i - 1];
            (*sa)[i - 1] = SuffixArray::Elem(); // empty
            const DNASeq& read = reads[elem.i];
            char c = read.seq[elem.j];
            (*sa)[--buckets[DNAAlphabet::torank(c)]] = elem;
        }

        induceSAl(reads, sa, type_array, bucket_counts, buckets, num_suffixes, DNAAlphabet::ALL_SIZE, false);
        induceSAs(reads, sa, type_array, bucket_counts, buckets, num_suffixes, DNAAlphabet::ALL_SIZE, true);

        // deallocate t array
        for (size_t i = 0; i < num_strings; ++i) {
            SAFE_DELETE_ARRAY(type_array[i]);
        }
        SAFE_DELETE_ARRAY(type_array);
        return sa;
    }
Beispiel #16
0
// Implementation of the induced copying algorithm by
// Nong, Zhang, Chan.
// Follows the implementation given as an appendix to their 2008 paper.
// '\0' is the sentinel in this algorithm.
//
// pSA        - suffix array to fill; sized here through pSA->initialize()
// pRT        - table of reads whose suffixes are sorted
// numThreads - values <= 1 sort with mkqs2, larger values with parallel_mkqs
void saca_induced_copying(SuffixArray* pSA, const ReadTable* pRT, int numThreads)
{

    // In the multiple strings case, we need a 2D bit array
    // to hold the L/S types for the suffixes
    size_t num_strings = pRT->getCount();
    char** type_array = new char*[num_strings];
    
    for(size_t i = 0; i < num_strings; ++i)
    {
        // one bit per suffix, including the empty one
        size_t s_len = pRT->getReadLength(i) + 1;
        size_t num_bytes = (s_len / 8) + 1;
        type_array[i] = new char[num_bytes];
        assert(type_array[i] != 0);
        memset(type_array[i], 0, num_bytes);
    }

    // Classify each suffix as being L or S type
    for(size_t i = 0; i < num_strings; ++i)
    {
        size_t s_len = pRT->getReadLength(i) + 1;

        // The empty suffix ($) for each string is defined to be S type
        // and hence the next suffix must be L type
        setBit(type_array, i, s_len - 1, 1);

        // A zero-length read has no character in front of '$'. Without this
        // guard s_len - 2 wraps around (size_t) and the bit is written far
        // outside the allocation made above.
        if(s_len >= 2)
        {
            setBit(type_array, i, s_len - 2, 0);
        }

        // Signed start value so that short reads simply skip the loop
        for(int64_t j = static_cast<int64_t>(s_len) - 3; j >= 0; --j)
        {
            char curr_c = GET_CHAR(i, j);
            char next_c = GET_CHAR(i, j + 1);

            bool s_type = (curr_c < next_c || (curr_c == next_c && getBit(type_array, i, j + 1) == 1));
            setBit(type_array, i, j, s_type);
        }
    }

    // setup buckets
    const int ALPHABET_SIZE = 5;
    int64_t bucket_counts[ALPHABET_SIZE];
    int64_t buckets[ALPHABET_SIZE];

    // find the ends of the buckets
    countBuckets(pRT, bucket_counts, ALPHABET_SIZE);
    getBuckets(bucket_counts, buckets, ALPHABET_SIZE, true); 

    // Initialize the suffix array; the end of the last bucket is used as the
    // total number of suffixes
    size_t num_suffixes = buckets[ALPHABET_SIZE - 1];
    pSA->initialize(num_suffixes, pRT->getCount());

    // Copy all the LMS substrings into the first n1 places in the SA
    size_t n1 = 0;
    for(size_t i = 0; i < num_strings; ++i)
    {
        size_t s_len = pRT->getReadLength(i) + 1;
        for(size_t j = 0; j < s_len; ++j)
        {
            if(isLMS(i,j))
                pSA->set(n1++, SAElem(i, j));
        }
    }

    /*
    //induceSAl(pRT, pSA, type_array, bucket_counts, buckets, num_suffixes, ALPHABET_SIZE, false);
    //induceSAs(pRT, pSA, type_array, bucket_counts, buckets, num_suffixes, ALPHABET_SIZE, true);
    
    // Compact all the sorted substrings into the first portion of the SA
    size_t n1 = 0;
    for(size_t i = 0; i < num_suffixes; ++i)
    {
        SAElem elem = pSA->get(i);
        if(!elem.isEmpty() && isLMS(elem.getID(), elem.getPos()))
        {
            pSA->set(n1++, elem);
        }
    }
    */

    double ratio = (double)n1 / (double)num_suffixes;
    std::cout << "[saca] calling mkqs on " << n1 << " suffixes " << ratio << " using " << numThreads << " threads \n";

    // Call MKQS, first on the sequence and then on the index in the read table
    SuffixCompareRadix radix_compare(pRT, 6);
    SuffixCompareIndex index_compare;
    //SuffixCompareID id_compare(pRT);
    
    if(numThreads <= 1)
        mkqs2(&pSA->m_data[0], n1, 0, radix_compare, index_compare);
    else
        parallel_mkqs(&pSA->m_data[0], n1, numThreads, radix_compare, index_compare);
    std::cout << "[saca] mkqs finished\n";

    // Induction sort the remaining suffixes
    for(size_t i = n1; i < num_suffixes; ++i)
        pSA->set(i, SAElem());
    
    // Find the ends of the buckets
    getBuckets(bucket_counts, buckets, ALPHABET_SIZE, true);

    // Move the sorted entries, last first, to the ends of their buckets
    for(int64_t i = static_cast<int64_t>(n1) - 1; i >= 0; --i)
    {
        SAElem elem_i = pSA->get(i);
        pSA->set(i, SAElem()); // empty
        char c = GET_CHAR(elem_i.getID(), elem_i.getPos());
        pSA->set(--buckets[GET_BKT(c)], elem_i);
    }

    induceSAl(pRT, pSA, type_array, bucket_counts, buckets, num_suffixes, ALPHABET_SIZE, false);
    induceSAs(pRT, pSA, type_array, bucket_counts, buckets, num_suffixes, ALPHABET_SIZE, true);

    // deallocate t array
    for(size_t i = 0; i < num_strings; ++i)
    {
        delete [] type_array[i];
    }
    delete [] type_array;
}