// Emit the finished BWT through pBWTWriter.
//
// The symbols of readBWT are streamed out in order; for every entry of
// bcrVector a '$' is spliced in once the output has reached that entry's
// target position, and a matching SAElem(index, 0) is sent to pSAWriter.
// After the last entry, symbols of readBWT are copied until
// partial_bwt_symbols of them have been consumed, and the BWT writer is
// finalized (pSAWriter is not finalized here).
//
// Returns the total number of symbols written: copied plus inserted.
size_t BWTCA::outputFinalBWT(BCRVector& bcrVector, 
                             const DNAEncodedString& readBWT, 
                             size_t partial_bwt_symbols,
                             BWTWriterBinary* pBWTWriter,
                             SAWriter* pSAWriter)
{
    size_t copiedCount = 0;   // symbols taken from readBWT so far
    size_t sentinelCount = 0; // '$' symbols emitted so far

    const size_t numElems = bcrVector.size();
    for(size_t idx = 0; idx != numElems; ++idx)
    {
        BCRElem& elem = bcrVector[idx];

        // Stream symbols from the read BWT until the output position
        // (copied + inserted) catches up with this element's position
        while(copiedCount + sentinelCount < elem.position)
        {
            pBWTWriter->writeBWChar(readBWT.get(copiedCount));
            ++copiedCount;
        }

        // Terminate this string with a single $ and record its SA entry
        pBWTWriter->writeBWChar('$');
        pSAWriter->writeElem(SAElem(elem.index, 0));
        ++sentinelCount;
    }

    // Flush whatever is left of the partial BWT
    while(copiedCount < partial_bwt_symbols)
    {
        pBWTWriter->writeBWChar(readBWT.get(copiedCount));
        ++copiedCount;
    }

    pBWTWriter->finalize();
    return copiedCount + sentinelCount;
}
// Example no. 2
// 0
// Set up this suffix array for the strings in rt.
//
// The array is sized to hold one entry per character of every read plus one
// extra entry per read, then filled with SAElem(read, offset) pairs in read
// order and, within each read, by increasing offset. No sorting is done here.
void SuffixArray::initialize(const ReadTable& rt)
{
    const size_t numReads = rt.getCount();
    initialize(rt.countSumLengths() + numReads, numReads);

    // Lay the suffixes out in their linear (unsorted) order
    size_t slot = 0;
    for(size_t readIdx = 0; readIdx < numReads; ++readIdx)
    {
        // The + 1 accounts for the empty suffix (is it actually needed?)
        const size_t suffixCount = rt.getRead(readIdx).seq.length() + 1;
        for(size_t offset = 0; offset < suffixCount; ++offset)
        {
            m_data[slot] = SAElem(readIdx, offset);
            ++slot;
        }
    }
}
// Example no. 3
// 0
// Implementation of the induced copying algorithm by
// Nong, Zhang, Chan.
// Follows the implementation given as an appendix to their 2008 paper.
// '\0' is the sentinel in this algorithm.
//
// Fills pSA with the suffix array of all strings in pRT:
//   1. classify every suffix of every string as L or S type,
//   2. gather the LMS suffixes at the front of the SA and sort them with
//      multikey quicksort,
//   3. move the sorted LMS suffixes to the ends of their buckets and induce
//      the positions of the remaining suffixes (induceSAl, then induceSAs).
//
// pSA        - suffix array to initialize and fill
// pRT        - read table supplying the strings
// numThreads - values <= 1 use the serial mkqs2, larger values use parallel_mkqs
//
// NOTE(review): isLMS(i, j) and GET_CHAR(i, j) are not handed the type array
// or the read table, so they presumably pick up `type_array` and `pRT` from
// this scope -- check their definitions before renaming either local.
void saca_induced_copying(SuffixArray* pSA, const ReadTable* pRT, int numThreads)
{

    // In the multiple strings case, we need a 2D bit array
    // to hold the L/S types for the suffixes
    size_t num_strings = pRT->getCount();
    char** type_array = new char*[num_strings];
    
    for(size_t i = 0; i < num_strings; ++i)
    {
        // One bit per suffix; the + 1 is for the empty suffix
        size_t s_len = pRT->getReadLength(i) + 1;
        size_t num_bytes = (s_len / 8) + 1;
        type_array[i] = new char[num_bytes];
        assert(type_array[i] != 0);
        memset(type_array[i], 0, num_bytes);
    }

    // Classify each suffix as being L or S type
    for(size_t i = 0; i < num_strings; ++i)
    {
        size_t s_len = pRT->getReadLength(i) + 1;

        // The empty suffix ($) for each string is defined to be S type
        // and hence the next suffix must be L type
        setBit(type_array, i, s_len - 1, 1);

        // A zero-length read has only the empty suffix; without this guard
        // s_len - 2 would wrap around to a huge unsigned bit index.
        if(s_len >= 2)
            setBit(type_array, i, s_len - 2, 0);

        // Scan right to left. The start index is computed in signed arithmetic
        // so that short reads yield a negative value and skip the loop.
        for(int64_t j = static_cast<int64_t>(s_len) - 3; j >= 0; --j)
        {
            char curr_c = GET_CHAR(i, j);
            char next_c = GET_CHAR(i, j + 1);

            // S type: smaller than the next character, or equal to it while
            // the next suffix is itself S type
            bool s_type = (curr_c < next_c || (curr_c == next_c && getBit(type_array, i, j + 1) == 1));
            setBit(type_array, i, j, s_type);
        }
    }

    // setup buckets
    const int ALPHABET_SIZE = 5;
    int64_t bucket_counts[ALPHABET_SIZE];
    int64_t buckets[ALPHABET_SIZE];

    // find the ends of the buckets
    countBuckets(pRT, bucket_counts, ALPHABET_SIZE);
    getBuckets(bucket_counts, buckets, ALPHABET_SIZE, true); 

    // Initialize the suffix array; the end of the last bucket is used
    // as the total number of suffixes
    size_t num_suffixes = buckets[ALPHABET_SIZE - 1];
    pSA->initialize(num_suffixes, pRT->getCount());

    // Copy all the LMS substrings into the first n1 places in the SA
    size_t n1 = 0;
    for(size_t i = 0; i < num_strings; ++i)
    {
        size_t s_len = pRT->getReadLength(i) + 1;
        for(size_t j = 0; j < s_len; ++j)
        {
            if(isLMS(i,j))
                pSA->set(n1++, SAElem(i, j));
        }
    }

    // Disabled alternative: induce-sort first, then compact the LMS suffixes
    /*
    //induceSAl(pRT, pSA, type_array, bucket_counts, buckets, num_suffixes, ALPHABET_SIZE, false);
    //induceSAs(pRT, pSA, type_array, bucket_counts, buckets, num_suffixes, ALPHABET_SIZE, true);
    
    // Compact all the sorted substrings into the first portion of the SA
    size_t n1 = 0;
    for(size_t i = 0; i < num_suffixes; ++i)
    {
        SAElem elem = pSA->get(i);
        if(!elem.isEmpty() && isLMS(elem.getID(), elem.getPos()))
        {
            pSA->set(n1++, elem);
        }
    }
    */

    double ratio = (double)n1 / (double)num_suffixes;
    std::cout << "[saca] calling mkqs on " << n1 << " suffixes " << ratio << " using " << numThreads << " threads \n";

    // Call MKQS, first on the sequence and then on the index in the read table
    SuffixCompareRadix radix_compare(pRT, 6);
    SuffixCompareIndex index_compare;
    //SuffixCompareID id_compare(pRT);
    
    // Nothing to sort when there are no LMS suffixes; skipping the call also
    // avoids taking the address of m_data[0] when the array may be empty.
    if(n1 > 0)
    {
        if(numThreads <= 1)
            mkqs2(&pSA->m_data[0], n1, 0, radix_compare, index_compare);
        else
            parallel_mkqs(&pSA->m_data[0], n1, numThreads, radix_compare, index_compare);
    }
    std::cout << "[saca] mkqs finished\n";

    // Induction sort the remaining suffixes
    // Everything past the sorted LMS prefix starts out empty
    for(size_t i = n1; i < num_suffixes; ++i)
        pSA->set(i, SAElem());
    
    // Find the ends of the buckets
    getBuckets(bucket_counts, buckets, ALPHABET_SIZE, true);

    // Walk the sorted LMS suffixes from last to first, moving each one to the
    // current end of its bucket. The index is signed so n1 == 0 skips the loop.
    for(int64_t i = static_cast<int64_t>(n1) - 1; i >= 0; --i)
    {
        SAElem elem_i = pSA->get(i);
        pSA->set(i, SAElem()); // empty
        char c = GET_CHAR(elem_i.getID(), elem_i.getPos());
        pSA->set(--buckets[GET_BKT(c)], elem_i);
    }

    induceSAl(pRT, pSA, type_array, bucket_counts, buckets, num_suffixes, ALPHABET_SIZE, false);
    induceSAs(pRT, pSA, type_array, bucket_counts, buckets, num_suffixes, ALPHABET_SIZE, true);

    // deallocate t array
    for(size_t i = 0; i < num_strings; ++i)
    {
        delete [] type_array[i];
    }
    delete [] type_array;
}