Exemplo n.º 1
0
// Merge two readsFiles together
void mergeReadFiles(const std::string& readsFile1, const std::string& readsFile2, const std::string& outPrefix)
{
    // If the outfile is the empty string, append the reads in readsFile2 into readsFile1
    // otherwise cat the files together
    std::ostream* pWriter;
    if(outPrefix.empty())
    {
        pWriter = createWriter(readsFile1, std::ios_base::out | std::ios_base::app);
    }
    else
    {
        pWriter = createWriter(makeFilename(outPrefix, ".fa"));

        // Copy reads1 to the outfile
        SeqReader reader(readsFile1);
        SeqRecord record;
        while(reader.get(record))
            record.write(*pWriter);
    }

    // Copy reads2 to writer
    SeqReader reader(readsFile2);
    SeqRecord record;
    while(reader.get(record))
        record.write(*pWriter);
    delete pWriter;
}
Exemplo n.º 2
0
// Write a read to the corrected or the discard stream depending on its QC
// result, and update the kept/discarded/failure counters accordingly.
void QCPostProcess::process(const SequenceWorkItem& item, const QCResult& result)
{
    SeqRecord record = item.read;

    const bool passedEveryCheck = result.kmerPassed && result.dupPassed &&
                                  result.hpPassed && result.degenPassed;
    if(passedEveryCheck)
    {
        record.write(*m_pCorrectedWriter);
        ++m_readsKept;
        return;
    }

    // The read is being discarded. Its rank (position in the original read file)
    // is appended to the read name so the index can be rebuilt afterwards.
    std::stringstream rankedID;
    rankedID << item.read.id << ",seqrank=" << item.idx;
    record.id = rankedID.str();

    record.write(*m_pDiscardWriter);
    ++m_readsDiscarded;

    // Only the first failing check, in this fixed order, is counted
    if(!result.kmerPassed)
        m_readsFailedKmer += 1;
    else if(!result.dupPassed)
        m_readsFailedDup += 1;
    else if(!result.hpPassed)
        m_readsFailedHP += 1;
    else if(!result.degenPassed)
        m_readsFailedDegen += 1;
}
Exemplo n.º 3
0
// Compute the initial BWTs for the input file split into blocks of records using the BCR algorithm.
//
// Reads parameters.inFile sequentially. Every time parameters.numReadsPerBatch
// sequences have accumulated (or the input ends with a non-empty batch), a BWT
// and a sampled index are written to temporary files and a MergeItem describing
// the block [start_index, end_index] is recorded.
//
// Returns the list of blocks, in input order.
MergeVector computeInitialBCR(const BWTDiskParameters& parameters)
{
    // Automatic storage: the reader is released on every exit path,
    // including an exception thrown while a batch is being processed.
    SeqReader reader(parameters.inFile);
    SeqRecord record;

    int groupID = 0;
    size_t numReadTotal = 0;

    MergeVector mergeVector;
    MergeItem mergeItem;
    mergeItem.start_index = 0;

    // Phase 1: Compute the initial BWTs
    DNAEncodedStringVector readSequences;
    bool done = false;
    while(!done)
    {
        done = !reader.get(record);

        if(!done)
        {
            // the read is valid
            SeqItem item = record.toSeqItem();
            if(parameters.bBuildReverse)
                item.seq.reverse();
            readSequences.push_back(item.seq.toString());
            ++numReadTotal;
        }

        // Flush when the batch is full, or when the input ended with reads pending
        if(readSequences.size() >= parameters.numReadsPerBatch || (done && readSequences.size() > 0))
        {
            std::string bwt_temp_filename = makeTempName(parameters.outPrefix, groupID, parameters.bwtExtension);
            std::string sai_temp_filename = makeTempName(parameters.outPrefix, groupID, parameters.saiExtension);
            BWTCA::runBauerCoxRosone(&readSequences, bwt_temp_filename, sai_temp_filename);

            // Push the merge info
            mergeItem.end_index = numReadTotal - 1; // inclusive
            mergeItem.reads_filename = parameters.inFile;
            mergeItem.bwt_filename = bwt_temp_filename;
            mergeItem.sai_filename = sai_temp_filename;
            mergeVector.push_back(mergeItem);

            // Start the new group
            mergeItem.start_index = numReadTotal;
            ++groupID;
            readSequences.clear();
        }
    }
    return mergeVector;
}
Exemplo n.º 4
0
// Generate a report of the quality of each base.
//
// Examines at most max_reads records of `filename` and samples roughly a
// sample_rate fraction of them (only records whose quality string has the same
// length as the sequence). Writes a "QualityScores" object to pJSONWriter with
// two arrays indexed by position in the read:
//   mean_quality  - mean Phred score of the sampled bases at that position
//   fraction_q30  - fraction of the sampled bases at that position with Phred >= 30
void generate_quality_stats(JSONWriter* pJSONWriter, const std::string& filename)
{
    size_t max_reads = 10000000;
    double sample_rate = 0.05;
    SeqReader reader(filename, SRF_KEEP_CASE | SRF_NO_VALIDATION);
    SeqRecord record;

    size_t n_reads = 0;
    std::vector<size_t> bases_checked;
    std::vector<size_t> sum_quality;
    std::vector<size_t> num_q30;

    while(reader.get(record) && n_reads++ < max_reads)
    {
        if((double)rand() / RAND_MAX < sample_rate && record.qual.length() == record.seq.length())
        {
            size_t l = record.seq.length();

            // Grow the per-position tallies to cover the longest sampled read so far.
            // New slots are zero-initialised and are incremented right below, so every
            // slot that exists has been counted at least once.
            if(l > bases_checked.size())
            {
                bases_checked.resize(l);
                sum_quality.resize(l);
                num_q30.resize(l);
            }

            for(size_t i = 0; i < l; ++i)
            {
                bases_checked[i]++;
                size_t q = record.getPhredScore(i);
                sum_quality[i] += q;
                num_q30[i] += (q >= 30);
            }
        }
    }

    pJSONWriter->String("QualityScores");
    pJSONWriter->StartObject();

    // The ratios are computed in double precision. Narrowing the size_t tallies
    // to float first would round sums above 2^24 and emit widened-float values.
    pJSONWriter->String("mean_quality");
    pJSONWriter->StartArray();
    for(size_t i = 0; i < bases_checked.size(); ++i)
        pJSONWriter->Double(static_cast<double>(sum_quality[i]) / static_cast<double>(bases_checked[i]));
    pJSONWriter->EndArray();

    pJSONWriter->String("fraction_q30");
    pJSONWriter->StartArray();
    for(size_t i = 0; i < bases_checked.size(); ++i)
        pJSONWriter->Double(static_cast<double>(num_q30[i]) / static_cast<double>(bases_checked[i]));
    pJSONWriter->EndArray();
    pJSONWriter->EndObject();
}
Exemplo n.º 5
0
void LRCorrectionPostProcess::process(const SequenceWorkItem& item, const LRCorrectionResult& result)
{
    SeqRecord record = item.read;
    record.seq = result.correctedSequence;
    record.qual = "";

    if(!record.seq.empty())
    {
        record.write(*m_pCorrectedWriter);
        m_readsKept += 1;
    }
    else
    {
        m_readsDiscarded += 1;
    }
}
Exemplo n.º 6
0
void ErrorCorrectPostProcess::process(const SequenceWorkItem& item, const ErrorCorrectResult& result)
{

    // Determine if the read should be discarded
    bool readQCPass = true;
    if(result.kmerQC)
    {
        m_kmerQCPassed += 1;
    }
    else if(result.overlapQC)
    {
        m_overlapQCPassed += 1;
    }
    else
    {
        readQCPass = false;
        m_qcFail += 1;
    }

    // Collect metrics for the reads that were actually corrected
    if(m_bCollectMetrics && readQCPass)
    {
        collectMetrics(item.read.seq.toString(),
                       result.correctSequence.toString(),
                       item.read.qual);
    }

    SeqRecord record = item.read;
    record.seq = result.correctSequence;

    if(readQCPass || m_pDiscardWriter == NULL)
    {
        record.write(*m_pCorrectedWriter);
        ++m_readsKept;
    }
    else
    {
        record.write(*m_pDiscardWriter);
        ++m_readsDiscarded;
    }
}
Exemplo n.º 7
0
// Visitor callback for paired-path resolution.
//
// The implementation is currently disabled: the live code asserts
// unconditionally and, when assertions are compiled out, returns false.
// Both parameters are unnamed because nothing outside the #if 0 region uses them.
// The original logic is kept below, compiled out, for reference.
bool SGPairedPathResolveVisitor::visit(StringGraph* /*pGraph*/, Vertex* /*pVertex*/)
{
    // This visitor must not be invoked while its implementation is disabled
    assert(false); 
#if 0
    if(pVertex->getColor() == GC_BLACK)
        return false; // has been resolved already

    // Get the vertex of the pair
    std::string pairID = getPairID(pVertex->getID());
    Vertex* pPair = pGraph->getVertex(pairID);
    if(pPair != NULL)
    {
        PathVector paths;
        // get the expected direction between the vertices based on the PE info
        EdgeDir dir = SGPairedAlgorithms::getDirectionToPair(pVertex->getID());
        SGPairedAlgorithms::searchPaths(pVertex, pPair, dir, 300, paths);   
        // Mark both ends so the pair is not processed again from the other vertex
        pVertex->setColor(GC_BLACK);
        pPair->setColor(GC_BLACK);

        std::cout << "Found " << paths.size() << " paths from " << pVertex->getID()
                  << " to " << pPair->getID() << "\n";

        
        // Exactly one path: write the sequence built from that path
        if(paths.size() == 1)
        {
            std::string fragment = SGPairedAlgorithms::pathToString(pVertex, paths[0]);
            SeqRecord record;
            record.id = pVertex->getID();
            record.seq = fragment;
            record.write(*m_pWriter);
        }
        else
        {
            // Zero or several paths: two records are written instead
            SeqRecord recordX;
            recordX.id = pVertex->getID();
            recordX.seq = pVertex->getSeq().toString();
            recordX.write(*m_pWriter);

            // NOTE(review): recordY is filled from pVertex, exactly like recordX;
            // pPair may have been intended here - confirm before re-enabling.
            SeqRecord recordY;
            recordY.id = pVertex->getID();
            recordY.seq = pVertex->getSeq().toString();
            recordY.write(*m_pWriter);
        }
    }
#endif
    return false;
}
Exemplo n.º 8
0
    // Parse FASTA records from input_stream and append them to data.
    //
    // input_stream: character stream positioned at the start of the FASTA text.
    // filename:     stored in data.filename; not opened here.
    // data:         receives one SeqRecord per header line, in input order.
    //
    // Within a sequence, linebreaks, spaces and tabs are skipped. A '>' that
    // follows a linebreak starts the next record.
    //
    // Throws a C string (const char*): "Not in FASTA format" when the first
    // character is not '>' (including an empty stream), "Problem reading file"
    // when the stream stops being readable inside a header line.
    void parseFasta(SeqStream input_stream, string filename, SeqSet &data) {

        data.filename = filename;

        // Initialised so that a failed first read (empty stream) is rejected
        // deterministically instead of testing an indeterminate value.
        char ch = '\0';
        string temp = "";
        string nm;
        unsigned size_guess = 10000; // Seems like it might speed things up

        // The very first character must open a header line
        input_stream.get(ch);
        if(ch != '>') {
            throw("Not in FASTA format");
        }


        bool inseq = false;
        bool linebreak = false;
        while(!input_stream.eof()) {
            SeqRecord rec;
            rec.reserve(size_guess);

            // Header line: read up to and including the first linebreak character.
            // NOTE(review): the linebreak character is kept at the end of the name,
            // as before - confirm whether setName() expects that.
            nm = "";
            while (!inseq) {
                if(!input_stream.good()) {
                    throw("Problem reading file");
                }
                input_stream.get(ch);
                if (ch == '\n' || ch == '\r') {
                    inseq = true;
                    // The sequence section begins at the start of a line, so a '>'
                    // read next (record with an empty sequence) opens a new record
                    // instead of being absorbed into this one.
                    linebreak = true;
                }
                nm += ch;
            }
            rec.setName(nm);

            // Sequence lines: accumulate until the next header or end of input
            temp = "";
            while(inseq){
                input_stream.get(ch);
                if(input_stream.eof())
                    break;

                // ">" after a linebreak means a new name
                if(ch == '>' && linebreak) {
                    inseq = false;
                    linebreak = false;
                    continue;
                }

                // Ignore, but note linebreaks
                linebreak = false;
                if(ch == '\n' || ch == '\r') {
                    linebreak = true;
                    continue;
                }

                // Ignore whitespace
                if(ch == ' ' || ch == '\t') {
                    continue;
                }

                temp += ch;
            }
            rec.append(temp);
            data.append(rec);

            // Use this record's length as the capacity hint for the next one
            size_guess = rec.getSeq().size();
        }
    }
Exemplo n.º 9
0
    // Parse FASTQ records from input_stream and append them to data.
    //
    // input_stream: character stream positioned at the start of the FASTQ text.
    // filename:     stored in data.filename; not opened here.
    // data:         receives one SeqRecord (name + sequence) per four-line record.
    //
    // Runs of linebreak characters count as a single linebreak. The name is the
    // text after the leading '@'; the '+' line and the quality line are skipped.
    // A final record whose quality line is not terminated by a linebreak is
    // still emitted.
    //
    // Throws a C string (const char*): "Not in fastq format" when a record does
    // not start with '@', "Problem reading file" when the stream fails before
    // end of input.
    void parseFastq(SeqStream input_stream, string filename, SeqSet &data) {

        data.filename = filename;
        char ch = '\0';
        string temp = "";
        string nm = "";
        unsigned size_guess = 150; // Seems like it might speed things up
        unsigned line_num = 0;     // 0 = name, 1 = sequence, 2 = plus line, 3 = quality
        bool linebreak = false;    // true while the previous character was a linebreak
        bool name = false;         // true once the leading '@' of the current record was seen
        while(!input_stream.eof()) {
            // Check if stream is okay and read a character
            if(!input_stream.good()) {
                throw("Problem reading file");
            }
            input_stream.get(ch);

            // A read that hit end of input produced no character: stop here rather
            // than processing the previous character a second time.
            if(input_stream.eof()) {
                break;
            }

            // Check for linebreaks. Treat multiple linebreak characters
            // as one linebreak. Also, count the number of lines, and when
            // four lines have been reached, construct a SeqRecord and reset.
            if(ch == '\n' || ch == '\r') {
                if(!linebreak) {
                    line_num += 1;
                    if(line_num == 4) {
                        line_num = 0;
                        SeqRecord rec;
                        rec.setName(nm);
                        rec.append(temp);
                        data.append(rec);
                        size_guess = rec.getSeq().size();
                        nm = "";
                        temp = "";
                        temp.reserve(size_guess);
                        name = false;
                    }
                }
                linebreak = true;
                continue;
            }

            // If this far, not a linebreak
            linebreak = false;

            // For each line of the fastq file
            if(line_num == 0) {
                // Name: the first character must be '@' and is not stored
                if(!name && ch != '@') {
                    throw("Not in fastq format");
                }
                if(name) {
                    nm += ch;
                }
                name = true;
            } else if(line_num == 1) {
                // Sequence
                temp += ch;
            } else if(line_num == 2) {
                // Plus line - Ignore
                continue;
            } else if(line_num == 3) {
                // Quality scores - ignore
                continue;
            }
        }

        // Input ended inside a quality line that already holds characters, i.e.
        // the file lacks a trailing linebreak: the last record is complete.
        if(line_num == 3 && !linebreak) {
            SeqRecord rec;
            rec.setName(nm);
            rec.append(temp);
            data.append(rec);
        }
    }
Exemplo n.º 10
0
//
// Main
//
// Entry point of the preprocess command.
//
// Parses the options, echoes the effective parameters to stderr, then streams
// every input file named on the command line through processRead(). Reads that
// pass (and survive sampling) are written to the output file or to stdout.
// opt::peMode selects how the inputs are paired:
//   0 - every file is single-end data
//   1 - files are consumed two at a time as the two ends of a pair
//   2 - each file is interleaved, both ends alternating in the same file
// A summary of the counters is printed to stderr at the end.
// Always returns 0; exits with EXIT_FAILURE on an odd file count in pe-mode 1.
int preprocessMain(int argc, char** argv)
{
    Timer* pTimer = new Timer("sga preprocess");
    parsePreprocessOptions(argc, argv);

    // Report the effective parameters
    std::cerr << "Parameters:\n";
    std::cerr << "QualTrim: " << opt::qualityTrim << "\n";

    if(opt::qualityFilter >= 0)
        std::cerr << "QualFilter: at most " << opt::qualityFilter << " low quality bases\n";
    else
        std::cerr << "QualFilter: no filtering\n";

    std::cerr << "HardClip: " << opt::hardClip << "\n";
    std::cerr << "Min length: " << opt::minLength << "\n";
    std::cerr << "Sample freq: " << opt::sampleFreq << "\n";
    std::cerr << "PE Mode: " << opt::peMode << "\n";
    std::cerr << "Quality scaling: " << opt::qualityScale << "\n";
    std::cerr << "MinGC: " << opt::minGC << "\n";
    std::cerr << "MaxGC: " << opt::maxGC << "\n";
    std::cerr << "Outfile: " << (opt::outFile.empty() ? "stdout" : opt::outFile) << "\n";
    std::cerr << "Orphan file: " << (opt::orphanFile.empty() ? "none" : opt::orphanFile) << "\n";
    if(opt::bDiscardAmbiguous)
        std::cerr << "Discarding sequences with ambiguous bases\n";
    if(opt::bDustFilter)
        std::cerr << "Dust threshold: " << opt::dustThreshold << "\n";
    if(!opt::suffix.empty())
        std::cerr << "Suffix: " << opt::suffix << "\n";

    // The adapters are only echoed when both of them were supplied
    if(opt::adapterF.length() && opt::adapterR.length())
    {
        std::cerr << "Adapter sequence fwd: " << opt::adapterF << "\n";
        std::cerr << "Adapter sequence rev: " << opt::adapterR << "\n";
    }

    // Seed the RNG
    srand(time(NULL));

    // Select the destination for kept reads. pWriter is owned by this function
    // only when it is not std::cout (see the cleanup below).
    std::ostream* pWriter;
    if(opt::outFile.empty())
    {
        pWriter = &std::cout;
    }
    else
    {
        std::ostream* pFile = createWriter(opt::outFile);
        pWriter = pFile;
    }

    // Create a filehandle to write orphaned reads to, if necessary
    std::ostream* pOrphanWriter = NULL;
    if(!opt::orphanFile.empty())
        pOrphanWriter = createWriter(opt::orphanFile);

    if(opt::peMode == 0)
    {
        // Treat files as SE data
        while(optind < argc)
        {
            std::string filename = argv[optind++];
            std::cerr << "Processing " << filename << "\n\n";
            SeqReader reader(filename, SRF_NO_VALIDATION);
            SeqRecord record;

            while(reader.get(record))
            {
                // samplePass() is only consulted for reads that passed processRead()
                bool passed = processRead(record);
                if(passed && samplePass())
                {
                    if(!opt::suffix.empty())
                        record.id.append(opt::suffix);

                    record.write(*pWriter);
                    ++s_numReadsKept;
                    s_numBasesKept += record.seq.length();
                }
            }
        }
    }
    else
    {
        assert(opt::peMode == 1 || opt::peMode == 2);
        int numFiles = argc - optind;
        if(opt::peMode == 1 && numFiles % 2 == 1)
        {
            std::cerr << "Error: An even number of files must be given for pe-mode 1\n";
            exit(EXIT_FAILURE);
        }

        while(optind < argc)
        {
            SeqReader* pReader1;
            SeqReader* pReader2;

            if(opt::peMode == 1)
            {
                // Read from separate files
                std::string filename1 = argv[optind++];
                std::string filename2 = argv[optind++];

                pReader1 = new SeqReader(filename1, SRF_NO_VALIDATION);
                pReader2 = new SeqReader(filename2, SRF_NO_VALIDATION);

                std::cerr << "Processing pe files " << filename1 << ", " << filename2 << "\n";

            }
            else
            {
                // Read from a single file
                // Both pointers alias the same reader, so the two get() calls
                // below pull consecutive records from the one file.
                std::string filename = argv[optind++];
                pReader1 = new SeqReader(filename, SRF_NO_VALIDATION);
                pReader2 = pReader1;
                std::cerr << "Processing interleaved pe file " << filename << "\n";
            }

            SeqRecord record1;
            SeqRecord record2;
            // The loop stops as soon as either read of a pair cannot be obtained
            while(pReader1->get(record1) && pReader2->get(record2))
            {
                // If the names of the records are the same, append a /1 and /2 to them
                // (the optional suffix is inserted before the /1 or /2; it is not
                // applied to pairs whose names already differ)
                if(record1.id == record2.id)
                {
                    if(!opt::suffix.empty())
                    {
                        record1.id.append(opt::suffix);
                        record2.id.append(opt::suffix);
                    }

                    record1.id.append("/1");
                    record2.id.append("/2");
                }

                // Ensure the read names are sensible
                std::string expectedID2 = getPairID(record1.id);
                std::string expectedID1 = getPairID(record2.id);

                // A mismatch is only reported and counted here; the pair is still processed below
                if(expectedID1 != record1.id || expectedID2 != record2.id)
                {
                    std::cerr << "Warning: Pair IDs do not match (expected format /1,/2 or /A,/B)\n";
                    std::cerr << "Read1 ID: " << record1.id << "\n";
                    std::cerr << "Read2 ID: " << record2.id << "\n";
                    s_numInvalidPE += 2;
                }

                // Both reads are always processed, even if the pair is then dropped by sampling
                bool passed1 = processRead(record1);
                bool passed2 = processRead(record2);

                // Sampling is decided once per pair
                if(!samplePass())
                    continue;

                if(passed1 && passed2)
                {
                    record1.write(*pWriter);
                    record2.write(*pWriter);
                    s_numReadsKept += 2;
                    s_numBasesKept += record1.seq.length();
                    s_numBasesKept += record2.seq.length();
                }
                // Only one read of the pair passed: write it to the orphan file, if
                // one was requested. Orphans are not added to the kept counters.
                else if(passed1 && pOrphanWriter != NULL)
                {
                    record1.write(*pOrphanWriter);
                }
                else if(passed2 && pOrphanWriter != NULL)
                {
                    record2.write(*pOrphanWriter);
                }
            }

            if(pReader2 != pReader1)
            {
                // only delete reader2 if it is a distinct pointer
                delete pReader2;
                pReader2 = NULL;
            }
            delete pReader1;
            pReader1 = NULL;

        }

    }

    // Release the writers; std::cout is not owned and must not be deleted
    if(pWriter != &std::cout)
        delete pWriter;
    if(pOrphanWriter != NULL)
        delete pOrphanWriter;

    // Summary. s_numReadsRead, s_numBasesRead, s_numReadsPrimer and s_numFailedDust
    // are not modified in this function - presumably processRead() maintains them.
    // NOTE(review): the ratios divide by s_numReadsRead / s_numBasesRead without a
    // zero check - confirm what should be printed when no reads were parsed.
    std::cerr << "\nPreprocess stats:\n";
    std::cerr << "Reads parsed:\t" << s_numReadsRead << "\n";
    std::cerr << "Reads kept:\t" << s_numReadsKept << " (" << (double)s_numReadsKept / (double)s_numReadsRead << ")\n";
    std::cerr << "Reads failed primer screen:\t" << s_numReadsPrimer << " (" << (double)s_numReadsPrimer / (double)s_numReadsRead << ")\n";
    std::cerr << "Bases parsed:\t" << s_numBasesRead << "\n";
    std::cerr << "Bases kept:\t" << s_numBasesKept << " (" << (double)s_numBasesKept / (double)s_numBasesRead << ")\n";
    std::cerr << "Number of incorrectly paired reads that were discarded: " << s_numInvalidPE << "\n";
    if(opt::bDustFilter)
        std::cerr << "Number of reads failed dust filter: " << s_numFailedDust << "\n";
    delete pTimer;
    return 0;
}
Exemplo n.º 11
0
    // Parse the FASTA file `filename` and append its records to data.
    //
    // filename: path of the file to read; also stored in data.filename.
    // data:     receives one SeqRecord per header line, in file order.
    //
    // Within a sequence, linebreaks, spaces and tabs are skipped. A '>' that
    // follows a linebreak starts the next record.
    //
    // Throws a C string (const char*): "Problem reading file" when the file
    // cannot be opened or ends inside a header line, "Not in FASTA format"
    // when the first character is not '>' (including an empty file).
    void parseFasta(string filename, SeqSet &data) {

        ifstream input(filename.c_str(), ifstream::in);
        if(!input.good()) {
            throw("Problem reading file");
        }

        data.filename = filename;

        // Initialised so that a failed first read (empty file) is rejected
        // deterministically instead of testing an indeterminate value.
        char ch = '\0';
        string temp = "";
        string nm;

        // The very first character must open a header line
        input.get(ch);
        if(ch != '>') {
            throw("Not in FASTA format");
        }


        bool inseq = false;
        bool linebreak = false;
        while(!input.eof()) {
            SeqRecord rec;

            // Header line: read up to and including the first linebreak character.
            // NOTE(review): the linebreak character is kept at the end of the name,
            // as before - confirm whether setName() expects that.
            nm = "";
            while (!inseq) {
                // Without this check a file that ends inside a header line would
                // loop forever, appending the last character to the name.
                if(!input.good()) {
                    throw("Problem reading file");
                }
                input.get(ch);
                if (ch == '\n' || ch == '\r') {
                    inseq = true;
                    // The sequence section begins at the start of a line, so a '>'
                    // read next (record with an empty sequence) opens a new record
                    // instead of being absorbed into this one.
                    linebreak = true;
                }
                nm += ch;
            }
            rec.setName(nm);

            // Sequence lines: accumulate until the next header or end of file
            temp = "";
            while(inseq){
                input.get(ch);
                if(input.eof())
                    break;

                // ">" after a linebreak means a new name
                if(ch == '>' && linebreak) {
                    inseq = false;
                    linebreak = false;
                    continue;
                }

                // Ignore, but note linebreaks
                linebreak = false;
                if(ch == '\n' || ch == '\r') {
                    linebreak = true;
                    continue;
                }

                // Ignore whitespace
                if(ch == ' ' || ch == '\t') {
                    continue;
                }

                temp += ch;
            }
            rec.append(temp);
            data.append(rec);
        }
    }
Exemplo n.º 12
0
//
// Main
//
int overlapLongMain(int argc, char** argv)
{
    parseOverlapLongOptions(argc, argv);

    // Open output file
    std::ostream* pASQGWriter = createWriter(opt::outFile);

    // Build and write the ASQG header
    ASQG::HeaderRecord headerRecord;
    headerRecord.setOverlapTag(opt::minOverlap);
    headerRecord.setErrorRateTag(opt::errorRate);
    headerRecord.setInputFileTag(opt::readsFile);
    headerRecord.setTransitiveTag(true);
    headerRecord.write(*pASQGWriter);

    // Determine which index files to use. If a target file was provided,
    // use the index of the target reads
    std::string indexPrefix;
    if(!opt::targetFile.empty())
        indexPrefix = stripFilename(opt::targetFile);
    else
        indexPrefix = stripFilename(opt::readsFile);

    BWT* pBWT = new BWT(indexPrefix + BWT_EXT, opt::sampleRate);
    SampledSuffixArray* pSSA = new SampledSuffixArray(indexPrefix + SAI_EXT, SSA_FT_SAI);
    
    Timer* pTimer = new Timer(PROGRAM_IDENT);
    pBWT->printInfo();

    // Read the sequence file and write vertex records for each
    // Also store the read names in a vector of strings
    ReadTable reads;
    
    SeqReader* pReader = new SeqReader(opt::readsFile, SRF_NO_VALIDATION);
    SeqRecord record;
    while(pReader->get(record))
    {
        reads.addRead(record.toSeqItem());
        ASQG::VertexRecord vr(record.id, record.seq.toString());
        vr.write(*pASQGWriter);

        if(reads.getCount() % 100000 == 0)
            printf("Read %zu sequences\n", reads.getCount());
    }

    delete pReader;
    pReader = NULL;

    BWTIndexSet index;
    index.pBWT = pBWT;
    index.pSSA = pSSA;
    index.pReadTable = &reads;

    // Make a prefix for the temporary hits files
    size_t n_reads = reads.getCount();

    omp_set_num_threads(opt::numThreads);

#pragma omp parallel for
    for(size_t read_idx = 0; read_idx < n_reads; ++read_idx)
    {
        const SeqItem& curr_read = reads.getRead(read_idx);

        printf("read %s %zubp\n", curr_read.id.c_str(), curr_read.seq.length());
        SequenceOverlapPairVector sopv = 
            KmerOverlaps::retrieveMatches(curr_read.seq.toString(),
                                          opt::seedLength,
                                          opt::minOverlap,
                                          1 - opt::errorRate,
                                          100,
                                          index);

        printf("Found %zu matches\n", sopv.size());
        for(size_t i = 0; i < sopv.size(); ++i)
        {
            std::string match_id = reads.getRead(sopv[i].match_idx).id;

            // We only want to output each edge once so skip this overlap
            // if the matched read has a lexicographically lower ID
            if(curr_read.id > match_id)
                continue;

            std::string ao = ascii_overlap(sopv[i].sequence[0], sopv[i].sequence[1], sopv[i].overlap, 50);
            printf("\t%s\t[%d %d] ID=%s OL=%d PI:%.2lf C=%s\n", ao.c_str(),
                                                                sopv[i].overlap.match[0].start,
                                                                sopv[i].overlap.match[0].end,
                                                                match_id.c_str(),
                                                                sopv[i].overlap.getOverlapLength(),
                                                                sopv[i].overlap.getPercentIdentity(),
                                                                sopv[i].overlap.cigar.c_str());

            // Convert to ASQG
            SeqCoord sc1(sopv[i].overlap.match[0].start, sopv[i].overlap.match[0].end, sopv[i].overlap.length[0]);
            SeqCoord sc2(sopv[i].overlap.match[1].start, sopv[i].overlap.match[1].end, sopv[i].overlap.length[1]);
            
            // KmerOverlaps returns the coordinates of the overlap after flipping the reads
            // to ensure the strand matches. The ASQG file wants the coordinate of the original
            // sequencing strand. Flip here if necessary
            if(sopv[i].is_reversed)
                sc2.flip();

            // Convert the SequenceOverlap the ASQG's overlap format
            Overlap ovr(curr_read.id, sc1, match_id,  sc2, sopv[i].is_reversed, -1);

            ASQG::EdgeRecord er(ovr);
            er.setCigarTag(sopv[i].overlap.cigar);
            er.setPercentIdentityTag(sopv[i].overlap.getPercentIdentity());

#pragma omp critical
            {
                er.write(*pASQGWriter);
            }
        }
    }

    // Cleanup
    delete pReader;
    delete pBWT; 
    delete pSSA;
    
    delete pASQGWriter;
    delete pTimer;
    if(opt::numThreads > 1)
        pthread_exit(NULL);

    return 0;
}
Exemplo n.º 13
0
// Compute the initial BWTs for the input file split into blocks of records using the SAIS algorithm.
//
// Reads parameters.inFile sequentially. Every time parameters.numReadsPerBatch
// reads have accumulated (or the input ends with a non-empty batch), a suffix
// array is built for the batch, its BWT and index are written to temporary
// files, and a MergeItem describing the block [start_index, end_index] is
// recorded.
//
// Returns the list of blocks, in input order.
MergeVector computeInitialSAIS(const BWTDiskParameters& parameters)
{
    // The reader, the batch table and the suffix array all use automatic
    // storage so they are released on every exit path, exceptions included.
    SeqReader reader(parameters.inFile);
    SeqRecord record;

    int groupID = 0;
    size_t numReadTotal = 0;

    MergeVector mergeVector;
    MergeItem mergeItem;
    mergeItem.start_index = 0;

    // Phase 1: Compute the initial BWTs
    ReadTable currRT;
    bool done = false;
    while(!done)
    {
        done = !reader.get(record);

        if(!done)
        {
            // the read is valid
            SeqItem item = record.toSeqItem();
            if(parameters.bBuildReverse)
                item.seq.reverse();
            currRT.addRead(item);
            ++numReadTotal;
        }

        // Flush when the batch is full, or when the input ended with reads pending
        if(currRT.getCount() >= parameters.numReadsPerBatch || (done && currRT.getCount() > 0))
        {
            std::string bwt_temp_filename = makeTempName(parameters.outPrefix, groupID, parameters.bwtExtension);
            std::string sai_temp_filename = makeTempName(parameters.outPrefix, groupID, parameters.saiExtension);

            {
                // Compute the SA and BWT for this group and write both to disk.
                // The inner scope destroys the suffix array before the batch
                // table is cleared below.
                SuffixArray sa(&currRT, 1);
                sa.writeBWT(bwt_temp_filename, &currRT);
                sa.writeIndex(sai_temp_filename);
            }

            // Push the merge info
            mergeItem.end_index = numReadTotal - 1; // inclusive
            mergeItem.reads_filename = parameters.inFile;
            mergeItem.bwt_filename = bwt_temp_filename;
            mergeItem.sai_filename = sai_temp_filename;
            mergeVector.push_back(mergeItem);

            // Start the new group
            mergeItem.start_index = numReadTotal;
            ++groupID;
            currRT.clear();
        }
    }
    return mergeVector;
}
Exemplo n.º 14
0
// The algorithm is as follows. We create M BWTs for subsets of 
// the input reads. These are created independently and written
// to disk. They are then merged either sequentially or pairwise
// to create the final BWT
void buildBWTDisk(const std::string& in_filename, const std::string& out_prefix, 
                  const std::string& bwt_extension, const std::string& sai_extension,
                  bool doReverse, int numThreads, int numReadsPerBatch, int storageLevel)
{
    size_t MAX_READS_PER_GROUP = numReadsPerBatch;

    SeqReader* pReader = new SeqReader(in_filename);
    SeqRecord record;

    int groupID = 0;
    size_t numReadTotal = 0;

    MergeVector mergeVector;
    MergeItem mergeItem;
    mergeItem.start_index = 0;

    // Phase 1: Compute the initial BWTs
    ReadTable* pCurrRT = new ReadTable;
    bool done = false;
    while(!done)
    {
        done = !pReader->get(record);

        if(!done)
        {
            // the read is valid
            SeqItem item = record.toSeqItem();
            if(doReverse)
                item.seq.reverse();
            pCurrRT->addRead(item);
            ++numReadTotal;
        }

        if(pCurrRT->getCount() >= MAX_READS_PER_GROUP || (done && pCurrRT->getCount() > 0))
        {
            // Compute the SA and BWT for this group
            SuffixArray* pSA = new SuffixArray(pCurrRT, numThreads);

            // Write the BWT to disk                
            std::string bwt_temp_filename = makeTempName(out_prefix, groupID, bwt_extension);
            pSA->writeBWT(bwt_temp_filename, pCurrRT);

            std::string sai_temp_filename = makeTempName(out_prefix, groupID, sai_extension);
            pSA->writeIndex(sai_temp_filename);

            // Push the merge info
            mergeItem.end_index = numReadTotal - 1; // inclusive
            mergeItem.reads_filename = in_filename;
            mergeItem.bwt_filename = bwt_temp_filename;
            mergeItem.sai_filename = sai_temp_filename;
            mergeVector.push_back(mergeItem);

            // Cleanup
            delete pSA;

            // Start the new group
            mergeItem.start_index = numReadTotal;
            ++groupID;
            pCurrRT->clear();
        }
    }
    delete pCurrRT;
    delete pReader;

    // Phase 2: Pairwise merge the BWTs
    int round = 1;
    MergeVector nextMergeRound;
    while(mergeVector.size() > 1)
    {
        std::cout << "Starting round " << round << "\n";
        pReader = new SeqReader(in_filename);
        for(size_t i = 0; i < mergeVector.size(); i+=2)
        {
            if(i + 1 != mergeVector.size())
            {
                std::string bwt_merged_name = makeTempName(out_prefix, groupID, bwt_extension);
                std::string sai_merged_name = makeTempName(out_prefix, groupID, sai_extension);

                MergeItem item1 = mergeVector[i];
                MergeItem item2 = mergeVector[i+1];

                // Perform the actual merge
                int64_t curr_idx = merge(pReader, item1, item2, 
                                         bwt_merged_name, sai_merged_name, 
                                         doReverse, numThreads, storageLevel);

                // pReader now points to the end of item1's block of 
                // reads. Skip item2's reads
                assert(curr_idx == item2.start_index);
                while(curr_idx <= item2.end_index)
                {
                    bool eof = !pReader->get(record);
                    assert(!eof);
                    (void)eof;
                    ++curr_idx;
                }

                // Create the merged mergeItem to use in the next round
                MergeItem merged;
                merged.start_index = item1.start_index;
                merged.end_index = item2.end_index;
                merged.bwt_filename = bwt_merged_name;
                merged.sai_filename = sai_merged_name;
                nextMergeRound.push_back(merged);

                // Done with the temp files, remove them
                unlink(item1.bwt_filename.c_str());
                unlink(item2.bwt_filename.c_str());
                unlink(item1.sai_filename.c_str());
                unlink(item2.sai_filename.c_str());

                ++groupID;
            }
            else
            {
                // Singleton, pass through to the next round
                nextMergeRound.push_back(mergeVector[i]);
            }
        }
        delete pReader;
        mergeVector.clear();
        mergeVector.swap(nextMergeRound);
        ++round;
    }
    assert(mergeVector.size() == 1);

    // Done, rename the files to their final name
    std::stringstream bwt_ss;
    bwt_ss << out_prefix << bwt_extension << (USE_GZ ? ".gz" : "");
    std::string bwt_final_filename = bwt_ss.str();
    rename(mergeVector.front().bwt_filename.c_str(), bwt_final_filename.c_str());

    std::stringstream sai_ss;
    sai_ss << out_prefix << sai_extension << (USE_GZ ? ".gz" : "");
    std::string sai_final_filename = sai_ss.str();
    rename(mergeVector.front().sai_filename.c_str(), sai_final_filename.c_str());
}