예제 #1
0
// Compute the initial BWTs for the input file split into blocks of records using the BCR algorithm
MergeVector computeInitialBCR(const BWTDiskParameters& parameters)
{
    SeqReader* pReader = new SeqReader(parameters.inFile);
    SeqRecord record;

    int groupID = 0;
    size_t numReadTotal = 0;

    MergeVector mergeVector;
    MergeItem mergeItem;
    mergeItem.start_index = 0;

    // Phase 1: Compute the initial BWTs
    DNAEncodedStringVector readSequences;
    bool done = false;
    while(!done)
    {
        done = !pReader->get(record);

        if(!done)
        {
            // the read is valid
            SeqItem item = record.toSeqItem();
            if(parameters.bBuildReverse)
                item.seq.reverse();
            readSequences.push_back(item.seq.toString());
            ++numReadTotal;
        }

        if(readSequences.size() >= parameters.numReadsPerBatch || (done && readSequences.size() > 0))
        {
            std::string bwt_temp_filename = makeTempName(parameters.outPrefix, groupID, parameters.bwtExtension);
            std::string sai_temp_filename = makeTempName(parameters.outPrefix, groupID, parameters.saiExtension);
            BWTCA::runBauerCoxRosone(&readSequences, bwt_temp_filename, sai_temp_filename);

            // Push the merge info
            mergeItem.end_index = numReadTotal - 1; // inclusive
            mergeItem.reads_filename = parameters.inFile;
            mergeItem.bwt_filename = bwt_temp_filename;
            mergeItem.sai_filename = sai_temp_filename;
            mergeVector.push_back(mergeItem);

            // Start the new group
            mergeItem.start_index = numReadTotal;
            ++groupID;
            readSequences.clear();
        }
    }
    delete pReader;
    return mergeVector;
}
예제 #2
0
// The algorithm is as follows. We create M BWTs for subsets of 
// the input reads. These are created independently and written
// to disk. They are then merged either sequentially or pairwise
// to create the final BWT
void buildBWTDisk(const BWTDiskParameters& parameters)
{
    // Build the initial bwts for subsets of the data
    MergeVector mergeVector;
    if(parameters.bUseBCR)
        mergeVector = computeInitialBCR(parameters);
    else
        mergeVector = computeInitialSAIS(parameters);

    // Phase 2: Pairwise merge the BWTs
    int groupID = mergeVector.size(); // Initial the name of the next intermediate bwt
    int round = 1;
    MergeVector nextMergeRound;
    while(mergeVector.size() > 1)
    {
        std::cout << "Starting round " << round << "\n";
        SeqReader* pReader = new SeqReader(parameters.inFile);
        SeqRecord record;

        for(size_t i = 0; i < mergeVector.size(); i+=2)
        {
            if(i + 1 != mergeVector.size())
            {
                std::string bwt_merged_name = makeTempName(parameters.outPrefix, groupID, parameters.bwtExtension);
                std::string sai_merged_name = makeTempName(parameters.outPrefix, groupID, parameters.saiExtension);

                MergeItem item1 = mergeVector[i];
                MergeItem item2 = mergeVector[i+1];

                // Perform the actual merge
                int64_t curr_idx = merge(pReader, item1, item2, 
                                         bwt_merged_name, sai_merged_name, 
                                         parameters.bBuildReverse, parameters.numThreads, parameters.storageLevel);

                // pReader now points to the end of item1's block of 
                // reads. Skip item2's reads
                assert(curr_idx == item2.start_index);
                while(curr_idx <= item2.end_index)
                {
                    bool eof = !pReader->get(record);
                    assert(!eof);
                    (void)eof;
                    ++curr_idx;
                }

                // Create the merged mergeItem to use in the next round
                MergeItem merged;
                merged.start_index = item1.start_index;
                merged.end_index = item2.end_index;
                merged.bwt_filename = bwt_merged_name;
                merged.sai_filename = sai_merged_name;
                nextMergeRound.push_back(merged);

                // Done with the temp files, remove them
                unlink(item1.bwt_filename.c_str());
                unlink(item2.bwt_filename.c_str());
                unlink(item1.sai_filename.c_str());
                unlink(item2.sai_filename.c_str());

                ++groupID;
            }
            else
            {
                // Singleton, pass through to the next round
                nextMergeRound.push_back(mergeVector[i]);
            }
        }
        delete pReader;
        mergeVector.clear();
        mergeVector.swap(nextMergeRound);
        ++round;
    }
    assert(mergeVector.size() == 1);

    // Done, rename the files to their final name
    std::stringstream bwt_ss;
    bwt_ss << parameters.outPrefix << parameters.bwtExtension << (USE_GZ ? ".gz" : "");
    std::string bwt_final_filename = bwt_ss.str();
    rename(mergeVector.front().bwt_filename.c_str(), bwt_final_filename.c_str());

    std::stringstream sai_ss;
    sai_ss << parameters.outPrefix << parameters.saiExtension << (USE_GZ ? ".gz" : "");
    std::string sai_final_filename = sai_ss.str();
    rename(mergeVector.front().sai_filename.c_str(), sai_final_filename.c_str());
}
예제 #3
0
// Compute the initial BWTs for the input file split into blocks of records using the SAIS algorithm
MergeVector computeInitialSAIS(const BWTDiskParameters& parameters)
{
    SeqReader* pReader = new SeqReader(parameters.inFile);
    SeqRecord record;

    int groupID = 0;
    size_t numReadTotal = 0;

    MergeVector mergeVector;
    MergeItem mergeItem;
    mergeItem.start_index = 0;

    // Phase 1: Compute the initial BWTs
    ReadTable* pCurrRT = new ReadTable;
    bool done = false;
    while(!done)
    {
        done = !pReader->get(record);

        if(!done)
        {
            // the read is valid
            SeqItem item = record.toSeqItem();
            if(parameters.bBuildReverse)
                item.seq.reverse();
            pCurrRT->addRead(item);
            ++numReadTotal;
        }

        if(pCurrRT->getCount() >= parameters.numReadsPerBatch || (done && pCurrRT->getCount() > 0))
        {
            // Compute the SA and BWT for this group
            SuffixArray* pSA = new SuffixArray(pCurrRT, 1);

            // Write the BWT to disk                
            std::string bwt_temp_filename = makeTempName(parameters.outPrefix, groupID, parameters.bwtExtension);
            pSA->writeBWT(bwt_temp_filename, pCurrRT);

            std::string sai_temp_filename = makeTempName(parameters.outPrefix, groupID, parameters.saiExtension);
            pSA->writeIndex(sai_temp_filename);

            // Push the merge info
            mergeItem.end_index = numReadTotal - 1; // inclusive
            mergeItem.reads_filename = parameters.inFile;
            mergeItem.bwt_filename = bwt_temp_filename;
            mergeItem.sai_filename = sai_temp_filename;
            mergeVector.push_back(mergeItem);

            // Cleanup
            delete pSA;

            // Start the new group
            mergeItem.start_index = numReadTotal;
            ++groupID;
            pCurrRT->clear();
        }
    }
    delete pCurrRT;
    delete pReader;
    return mergeVector;
}
예제 #4
0
// The algorithm is as follows. We create M BWTs for subsets of 
// the input reads. These are created independently and written
// to disk. They are then merged either sequentially or pairwise
// to create the final BWT
void buildBWTDisk(const std::string& in_filename, const std::string& out_prefix, 
                  const std::string& bwt_extension, const std::string& sai_extension,
                  bool doReverse, int numThreads, int numReadsPerBatch, int storageLevel)
{
    size_t MAX_READS_PER_GROUP = numReadsPerBatch;

    SeqReader* pReader = new SeqReader(in_filename);
    SeqRecord record;

    int groupID = 0;
    size_t numReadTotal = 0;

    MergeVector mergeVector;
    MergeItem mergeItem;
    mergeItem.start_index = 0;

    // Phase 1: Compute the initial BWTs
    ReadTable* pCurrRT = new ReadTable;
    bool done = false;
    while(!done)
    {
        done = !pReader->get(record);

        if(!done)
        {
            // the read is valid
            SeqItem item = record.toSeqItem();
            if(doReverse)
                item.seq.reverse();
            pCurrRT->addRead(item);
            ++numReadTotal;
        }

        if(pCurrRT->getCount() >= MAX_READS_PER_GROUP || (done && pCurrRT->getCount() > 0))
        {
            // Compute the SA and BWT for this group
            SuffixArray* pSA = new SuffixArray(pCurrRT, numThreads);

            // Write the BWT to disk                
            std::string bwt_temp_filename = makeTempName(out_prefix, groupID, bwt_extension);
            pSA->writeBWT(bwt_temp_filename, pCurrRT);

            std::string sai_temp_filename = makeTempName(out_prefix, groupID, sai_extension);
            pSA->writeIndex(sai_temp_filename);

            // Push the merge info
            mergeItem.end_index = numReadTotal - 1; // inclusive
            mergeItem.reads_filename = in_filename;
            mergeItem.bwt_filename = bwt_temp_filename;
            mergeItem.sai_filename = sai_temp_filename;
            mergeVector.push_back(mergeItem);

            // Cleanup
            delete pSA;

            // Start the new group
            mergeItem.start_index = numReadTotal;
            ++groupID;
            pCurrRT->clear();
        }
    }
    delete pCurrRT;
    delete pReader;

    // Phase 2: Pairwise merge the BWTs
    int round = 1;
    MergeVector nextMergeRound;
    while(mergeVector.size() > 1)
    {
        std::cout << "Starting round " << round << "\n";
        pReader = new SeqReader(in_filename);
        for(size_t i = 0; i < mergeVector.size(); i+=2)
        {
            if(i + 1 != mergeVector.size())
            {
                std::string bwt_merged_name = makeTempName(out_prefix, groupID, bwt_extension);
                std::string sai_merged_name = makeTempName(out_prefix, groupID, sai_extension);

                MergeItem item1 = mergeVector[i];
                MergeItem item2 = mergeVector[i+1];

                // Perform the actual merge
                int64_t curr_idx = merge(pReader, item1, item2, 
                                         bwt_merged_name, sai_merged_name, 
                                         doReverse, numThreads, storageLevel);

                // pReader now points to the end of item1's block of 
                // reads. Skip item2's reads
                assert(curr_idx == item2.start_index);
                while(curr_idx <= item2.end_index)
                {
                    bool eof = !pReader->get(record);
                    assert(!eof);
                    (void)eof;
                    ++curr_idx;
                }

                // Create the merged mergeItem to use in the next round
                MergeItem merged;
                merged.start_index = item1.start_index;
                merged.end_index = item2.end_index;
                merged.bwt_filename = bwt_merged_name;
                merged.sai_filename = sai_merged_name;
                nextMergeRound.push_back(merged);

                // Done with the temp files, remove them
                unlink(item1.bwt_filename.c_str());
                unlink(item2.bwt_filename.c_str());
                unlink(item1.sai_filename.c_str());
                unlink(item2.sai_filename.c_str());

                ++groupID;
            }
            else
            {
                // Singleton, pass through to the next round
                nextMergeRound.push_back(mergeVector[i]);
            }
        }
        delete pReader;
        mergeVector.clear();
        mergeVector.swap(nextMergeRound);
        ++round;
    }
    assert(mergeVector.size() == 1);

    // Done, rename the files to their final name
    std::stringstream bwt_ss;
    bwt_ss << out_prefix << bwt_extension << (USE_GZ ? ".gz" : "");
    std::string bwt_final_filename = bwt_ss.str();
    rename(mergeVector.front().bwt_filename.c_str(), bwt_final_filename.c_str());

    std::stringstream sai_ss;
    sai_ss << out_prefix << sai_extension << (USE_GZ ? ".gz" : "");
    std::string sai_final_filename = sai_ss.str();
    rename(mergeVector.front().sai_filename.c_str(), sai_final_filename.c_str());
}