Пример #1
0
    // Constructs the suffix array for a collection of reads.
    //
    // Steps, in order:
    //   1. allocate one packed bit row per read and mark every suffix position
    //      (length() + 1 of them, the last being the empty suffix) as L or S type;
    //   2. collect the LMS suffixes at the front of the array and sort them with mkqs2;
    //   3. move the sorted LMS suffixes to the ends of their buckets, back to front,
    //      then call induceSAl followed by induceSAs for the remaining suffixes.
    //
    // reads   - input sequences; must not be empty (asserted).
    // threads - only reported in the debug log; the sort always runs through mkqs2.
    //
    // Returns a SuffixArray allocated with new and handed back as a raw pointer,
    // so the caller is presumably responsible for deleting it.
    SuffixArray* build(const DNASeqList& reads, size_t threads = 1) {
        assert(!reads.empty());

        const size_t read_count = reads.size();

        // With several strings the L/S flags live in a 2D bit array: one
        // zero-filled row per read, one bit per suffix position.
        char** suffix_types = new char*[read_count];
        for (size_t r = 0; r < read_count; ++r) {
            const size_t row_bytes = (reads[r].seq.length() + 1) / 8 + 1;
            suffix_types[r] = new char[row_bytes];
            memset(suffix_types[r], 0, row_bytes);
        }

        // Mark each suffix as L type (bit 0) or S type (bit 1)
        for (size_t r = 0; r < read_count; ++r) {
            const DNASeq& current = reads[r];
            const size_t sentinel_pos = current.seq.length();

            // The empty suffix ($) is S type by definition
            setBit(suffix_types, r, sentinel_pos, 1);
            if (current.seq.empty()) {
                continue; // nothing but the empty suffix to classify
            }

            // ... which forces the suffix right before it to be L type
            setBit(suffix_types, r, sentinel_pos - 1, 0);

            // Walk towards the front; each position is decided by comparing its
            // character with the following one and, on a tie, by inheriting the
            // follower's type.
            for (size_t pos = sentinel_pos - 1; pos-- > 0;) {
                char here = current.seq[pos], after = current.seq[pos + 1];
                bool is_s_type = (here < after || (here == after && getBit(suffix_types, r, pos + 1) == 1));
                setBit(suffix_types, r, pos, is_s_type);
            }
        }

        // Bucket bookkeeping: per-symbol counts and the working bucket positions
        size_t bucket_counts[DNAAlphabet::ALL_SIZE];
        size_t bucket_pos[DNAAlphabet::ALL_SIZE];
        countBuckets(reads, bucket_counts, DNAAlphabet::ALL_SIZE);

        // The total number of suffixes is the sum of all bucket counts
        const size_t num_suffixes = std::accumulate(bucket_counts, bucket_counts + DNAAlphabet::ALL_SIZE, static_cast<size_t>(0));
        LOG4CXX_DEBUG(logger, boost::format("initialize SA, strings: %d, suffixes: %d") % read_count % num_suffixes);

        SuffixArray* sa = new SuffixArray(read_count, num_suffixes);

        // Gather every LMS suffix into the leading lms_count slots of the array
        size_t lms_count = 0;
        for (size_t r = 0; r < read_count; ++r) {
            const size_t last_pos = reads[r].seq.length();
            for (size_t pos = 0; pos <= last_pos; ++pos) {
                if (isLMS(suffix_types, r, pos)) {
                    SuffixArray::Elem& slot = (*sa)[lms_count];
                    slot.i = r;
                    slot.j = pos;
                    ++lms_count;
                }
            }
        }

        // Sort the LMS suffixes with MKQS: by sequence first, then by index in the read table
        LOG4CXX_DEBUG(logger, boost::format("calling mkqs on %d of %d suffixes(%f), using %d threads") % lms_count % num_suffixes % (static_cast<double>(lms_count) / num_suffixes) % threads);
        {
            SuffixRadixCmp by_sequence(reads);
            SuffixIndexCmp by_index;
            mkqs2(&(*sa)[0], lms_count, 0, by_sequence, by_index);
        }
        LOG4CXX_DEBUG(logger, "mkqs finished");

        // Blank out everything behind the sorted LMS prefix before induction
        for (size_t k = lms_count; k < num_suffixes; ++k) {
            (*sa)[k] = SuffixArray::Elem();
        }

        // Position the cursors at the bucket ends
        getBuckets(bucket_counts, bucket_pos, DNAAlphabet::ALL_SIZE, true);

        // Relocate the sorted LMS suffixes, last one first, each to the current
        // end of the bucket of its leading character; its old slot is emptied.
        for (size_t k = lms_count; k-- > 0;) {
            SuffixArray::Elem moved = (*sa)[k];
            (*sa)[k] = SuffixArray::Elem();
            char lead = reads[moved.i].seq[moved.j];
            const size_t dest = --bucket_pos[DNAAlphabet::torank(lead)];
            (*sa)[dest] = moved;
        }

        induceSAl(reads, sa, suffix_types, bucket_counts, bucket_pos, num_suffixes, DNAAlphabet::ALL_SIZE, false);
        induceSAs(reads, sa, suffix_types, bucket_counts, bucket_pos, num_suffixes, DNAAlphabet::ALL_SIZE, true);

        // Release the type bit rows, then the row table itself
        for (size_t r = 0; r < read_count; ++r) {
            SAFE_DELETE_ARRAY(suffix_types[r]);
        }
        SAFE_DELETE_ARRAY(suffix_types);
        return sa;
    }
Пример #2
0
// Implementation of the induced copying algorithm by
// Nong, Zhang, Chan.
// Follows the implementation given as an appendix to their 2008 paper.
// '\0' is the sentinel in this algorithm.
//
// Steps, in order:
//   1. classify every suffix of every read as L or S type (one bit each),
//   2. gather the LMS suffixes at the front of pSA and sort them with mkqs,
//   3. move the sorted LMS suffixes to the ends of their buckets and call
//      induceSAl followed by induceSAs for the remaining suffixes.
//
// pSA        - suffix array to fill; it is initialized here.
// pRT        - read table supplying the sequences.
// numThreads - values <= 1 select mkqs2; larger values select parallel_mkqs
//              and are passed through to it.
void saca_induced_copying(SuffixArray* pSA, const ReadTable* pRT, int numThreads)
{

    // In the multiple strings case, we need a 2D bit array
    // to hold the L/S types for the suffixes.
    // NOTE(review): isLMS(i, j) below receives no array argument, so it
    // presumably picks up type_array by name -- keep this local's name.
    size_t num_strings = pRT->getCount();
    char** type_array = new char*[num_strings];
    
    for(size_t i = 0; i < num_strings; ++i)
    {
        // One bit per suffix, the extra one being the empty suffix
        size_t s_len = pRT->getReadLength(i) + 1;
        size_t num_bytes = (s_len / 8) + 1;
        type_array[i] = new char[num_bytes];
        assert(type_array[i] != 0);
        memset(type_array[i], 0, num_bytes);
    }

    // Classify each suffix as being L or S type
    for(size_t i = 0; i < num_strings; ++i)
    {
        size_t s_len = pRT->getReadLength(i) + 1;

        // The empty suffix ($) for each string is defined to be S type
        setBit(type_array, i, s_len - 1, 1);

        // A zero-length read has no other suffix. Without this guard
        // s_len - 2 wraps around (size_t) and the write below would land
        // far outside the one-byte row allocated for this read.
        if(s_len < 2)
            continue;

        // ... and hence the suffix right before $ must be L type
        setBit(type_array, i, s_len - 2, 0);

        // Cast before subtracting so a one-character read yields -1 directly
        // instead of relying on an unsigned wrap followed by a conversion.
        for(int64_t j = static_cast<int64_t>(s_len) - 3; j >= 0; --j)
        {
            char curr_c = GET_CHAR(i, j);
            char next_c = GET_CHAR(i, j + 1);

            // S type if smaller than the next character, or equal to it
            // while the next suffix is itself S type
            bool s_type = (curr_c < next_c || (curr_c == next_c && getBit(type_array, i, j + 1) == 1));
            setBit(type_array, i, j, s_type);
        }
    }

    // setup buckets
    const int ALPHABET_SIZE = 5;
    int64_t bucket_counts[ALPHABET_SIZE];
    int64_t buckets[ALPHABET_SIZE];

    // find the ends of the buckets
    countBuckets(pRT, bucket_counts, ALPHABET_SIZE);
    getBuckets(bucket_counts, buckets, ALPHABET_SIZE, true); 

    // Initialize the suffix array; the end of the last bucket is used as
    // the total number of suffixes
    size_t num_suffixes = buckets[ALPHABET_SIZE - 1];
    pSA->initialize(num_suffixes, pRT->getCount());

    // Copy all the LMS substrings into the first n1 places in the SA
    size_t n1 = 0;
    for(size_t i = 0; i < num_strings; ++i)
    {
        size_t s_len = pRT->getReadLength(i) + 1;
        for(size_t j = 0; j < s_len; ++j)
        {
            if(isLMS(i,j))
                pSA->set(n1++, SAElem(i, j));
        }
    }

    // Disabled alternative: induce-sort first, then compact the LMS entries.
    /*
    //induceSAl(pRT, pSA, type_array, bucket_counts, buckets, num_suffixes, ALPHABET_SIZE, false);
    //induceSAs(pRT, pSA, type_array, bucket_counts, buckets, num_suffixes, ALPHABET_SIZE, true);
    
    // Compact all the sorted substrings into the first portion of the SA
    size_t n1 = 0;
    for(size_t i = 0; i < num_suffixes; ++i)
    {
        SAElem elem = pSA->get(i);
        if(!elem.isEmpty() && isLMS(elem.getID(), elem.getPos()))
        {
            pSA->set(n1++, elem);
        }
    }
    */

    double ratio = (double)n1 / (double)num_suffixes;
    std::cout << "[saca] calling mkqs on " << n1 << " suffixes " << ratio << " using " << numThreads << " threads \n";

    // Call MKQS, first on the sequence and then on the index in the read table
    SuffixCompareRadix radix_compare(pRT, 6);
    SuffixCompareIndex index_compare;
    //SuffixCompareID id_compare(pRT);
    
    if(numThreads <= 1)
        mkqs2(&pSA->m_data[0], n1, 0, radix_compare, index_compare);
    else
        parallel_mkqs(&pSA->m_data[0], n1, numThreads, radix_compare, index_compare);
    std::cout << "[saca] mkqs finished\n";

    // Induction sort the remaining suffixes: start by emptying every slot
    // behind the sorted LMS prefix
    for(size_t i = n1; i < num_suffixes; ++i)
        pSA->set(i, SAElem());
    
    // Find the ends of the buckets
    getBuckets(bucket_counts, buckets, ALPHABET_SIZE, true);

    // Move the sorted LMS suffixes, last one first, to the current end of the
    // bucket of their leading character. Cast before subtracting so n1 == 0
    // gives -1 without an unsigned wrap.
    for(int64_t i = static_cast<int64_t>(n1) - 1; i >= 0; --i)
    {
        SAElem elem_i = pSA->get(i);
        pSA->set(i, SAElem()); // empty
        char c = GET_CHAR(elem_i.getID(), elem_i.getPos());
        pSA->set(--buckets[GET_BKT(c)], elem_i);
    }

    induceSAl(pRT, pSA, type_array, bucket_counts, buckets, num_suffixes, ALPHABET_SIZE, false);
    induceSAs(pRT, pSA, type_array, bucket_counts, buckets, num_suffixes, ALPHABET_SIZE, true);

    // deallocate t array
    for(size_t i = 0; i < num_strings; ++i)
    {
        delete [] type_array[i];
    }
    delete [] type_array;
}