Esempio n. 1
0
//
// Main
//
int statsMain(int argc, char** argv)
{
    parseStatsOptions(argc, argv);
    Timer* pTimer = new Timer(PROGRAM_IDENT);

    BWT* pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate);
    BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT, opt::sampleRate);

    if(opt::bPrintRunLengths)
    {
        pBWT->printInfo();
        pBWT->printRunLengths();
    }

    SeqReader reader(opt::readsFile);
    
    StatsPostProcess postProcessor(opt::bPrintKmerDist);
    if(opt::numThreads <= 1)
    {
        // Serial mode
        StatsProcess processor(pBWT, pRBWT, opt::kmerLength, opt::minOverlap, opt::branchCutoff, opt::bNoOverlap);

        SequenceProcessFramework::processSequencesSerial<SequenceWorkItem,
                                                         StatsResult, 
                                                         StatsProcess, 
                                                         StatsPostProcess>(reader, &processor, &postProcessor, opt::numReads);
    }
    else
    {
        // Parallel mode
        std::vector<StatsProcess*> processorVector;
        for(int i = 0; i < opt::numThreads; ++i)
        {
            StatsProcess* pProcessor = new StatsProcess(pBWT, pRBWT, opt::kmerLength, opt::minOverlap, opt::branchCutoff, opt::bNoOverlap);
            processorVector.push_back(pProcessor);
        }
        
        SequenceProcessFramework::processSequencesParallel<SequenceWorkItem,
                                                           StatsResult, 
                                                           StatsProcess, 
                                                           StatsPostProcess>(reader, processorVector, &postProcessor, opt::numReads);

        for(int i = 0; i < opt::numThreads; ++i)
        {
            delete processorVector[i];
        }
    }

    delete pBWT;
    delete pRBWT;
    delete pTimer;

    if(opt::numThreads > 1)
        pthread_exit(NULL);

    return 0;
}
Esempio n. 2
0
int genSSAMain(int argc, char** argv)
{
    Timer t("sga gen-ssa");
    parseGenSSAOptions(argc, argv);
    
    BWT* pBWT = new BWT(opt::prefix + BWT_EXT);
    ReadInfoTable* pRIT = new ReadInfoTable(opt::readsFile, pBWT->getNumStrings(), RIO_NUMERICID);
    pBWT->printInfo();

    SampledSuffixArray* pSSA = new SampledSuffixArray();
    pSSA->build(pBWT, pRIT, opt::sampleRate);
    pSSA->printInfo();
    pSSA->writeSSA(opt::prefix + SSA_EXT);

    if(opt::validate)
        pSSA->validate(opt::readsFile, pBWT);

    delete pBWT;
    delete pRIT;
    delete pSSA;

    return 0;
}
Esempio n. 3
0
void CompTool::search_forward_matches(const string seq1_name, const string seq2_name, int* SA, BWT& bwt,
                                      const int kmer_size, const int slide_letters, const int max_num_matches){
    stringstream out_file;
    out_file << seq1_name << "__" << seq2_name << ".match." << kmer_size << ".forward";
    ofstream ofs(out_file.str().c_str());

    ofs << "#" << seq2_name << "\t" << seq1_name << endl;  // header

    for(int i = 0; i < seq2_size_ - kmer_size; i += slide_letters){
        int8_t* query = &seq2_[i];
        int lb, ub;  // lower- and upper-bound of matches in suffix array
        bwt.search(query, kmer_size, lb, ub);
        if(lb <= ub){
            for(int j = lb; j <= ub; j++){
                if(j == lb + max_num_matches) break;
                ofs << i << "\t" << SA[j] << endl;
            }
        }
    }
}
Esempio n. 4
0
void CompTool::search_reverse_matches(const string seq1_name, const string seq2_name, int* SA, BWT& bwt,
                                      const int kmer_size, const int slide_letters, const int max_num_matches){
    stringstream out_file;
    out_file << seq1_name << "__" << seq2_name << ".match." << kmer_size << ".reverse";
    ofstream ofs(out_file.str().c_str());

    ofs << "#" << seq2_name << "\t" << seq1_name << endl;  // header

    int8_t* query = new int8_t[kmer_size];
    for(int i = kmer_size-1; i < seq2_size_ - 1; i += slide_letters){
        // convert k-mers to reverse complements
        for(int j = 0; j < kmer_size; j++)
            query[j] = num_char_ - seq2_[i-j];
        int lb, ub;  // lower- and upper-bound of matches in suffix array
        bwt.search(query, kmer_size, lb, ub);
        if(lb <= ub){
            for(int j = lb; j <= ub; j++){
                if(j == lb + max_num_matches) break;
                ofs << i << "\t" << SA[j] << endl;
            }
        }
    }
    delete[] query;
}
Esempio n. 5
0
void cluster()
{
    BWT* pBWT = new BWT(opt::prefix + BWT_EXT);
    BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT);
    OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT,opt::errorRate, opt::seedLength, opt::seedStride, true);

    pOverlapper->setExactModeOverlap(opt::errorRate < 0.001f);
    pOverlapper->setExactModeIrreducible(opt::errorRate < 0.001f);

    BitVector markedReads(pBWT->getNumStrings());

    std::string preclustersFile = opt::outFile + ".preclusters";
    std::ostream* pPreWriter = createWriter(preclustersFile);
    ClusterPostProcess postProcessor(pPreWriter, opt::minSize, &markedReads);
    
    // Set the cluster parameters
    ClusterParameters parameters;
    parameters.pOverlapper = pOverlapper;
    parameters.minOverlap = opt::minOverlap;
    parameters.maxClusterSize = opt::maxSize;
    parameters.maxIterations = opt::maxIterations;
    parameters.pMarkedReads = &markedReads;

    // Read the limit kmer sequences, if provided
    std::set<std::string>* pLimitKmers = NULL;

    if(!opt::limitFile.empty())
    {
        // Read in the limit sequences
        pLimitKmers = new std::set<std::string>;
        readLimitKmers(pLimitKmers);
        parameters.pLimitKmers = pLimitKmers;
        parameters.limitK = opt::limitKmer;
    }
    else
    {
        parameters.pLimitKmers = NULL;
        parameters.limitK = 0;
    }

    // Make pre-clusters from the reads
    if(opt::numThreads <= 1)
    {
        printf("[%s] starting serial-mode read clustering\n", PROGRAM_IDENT);
        ClusterProcess processor(parameters);
        
        // If the extend file is empty, build new clusters
        if(opt::extendFile.empty())
        {
            PROCESS_CLUSTER_SERIAL(opt::readsFile, &processor, &postProcessor);
        }
        else
        {
            // Process a set of preexisting clusters
            ClusterReader clusterReader(opt::extendFile);
            PROCESS_EXTEND_SERIAL(clusterReader, &processor, &postProcessor);
        }
    }
    else
    {
        printf("[%s] starting parallel-mode read clustering computation with %d threads\n", PROGRAM_IDENT, opt::numThreads);
        
        std::vector<ClusterProcess*> processorVector;
        for(int i = 0; i < opt::numThreads; ++i)
        {
            ClusterProcess* pProcessor = new ClusterProcess(parameters);
            processorVector.push_back(pProcessor);
        }
        
        if(opt::extendFile.empty())
        {
            PROCESS_CLUSTER_PARALLEL(opt::readsFile, processorVector, &postProcessor);
        }
        else
        {
            ClusterReader clusterReader(opt::extendFile);
            PROCESS_EXTEND_PARALLEL(clusterReader, processorVector, &postProcessor);
        }
        
        for(size_t i = 0; i < processorVector.size(); ++i)
        {
            delete processorVector[i];
            processorVector[i] = NULL;
        }
    }
    delete pPreWriter;
    delete pBWT;
    delete pRBWT;
    delete pOverlapper;

    // Deallocate limit kmers
    if(pLimitKmers != NULL)
        delete pLimitKmers;

    // Open the preclusters file and convert them to read names
    SuffixArray* pFwdSAI = new SuffixArray(opt::prefix + SAI_EXT);
    ReadInfoTable* pRIT = new ReadInfoTable(opt::readsFile, pFwdSAI->getNumStrings());

    size_t seedIdx = 0;
    std::istream* pPreReader = createReader(preclustersFile);
    std::ostream* pClusterWriter = createWriter(opt::outFile);
    std::string line;
    while(getline(*pPreReader,line))
    {
        std::stringstream parser(line);
        std::string clusterName;
        std::string readSequence;
        size_t clusterSize;
        int64_t lowIdx;
        int64_t highIdx;
        parser >> clusterName >> clusterSize >> readSequence >> lowIdx >> highIdx;

        if(lowIdx > highIdx)
        {
            // This is an extra read that is not present in the FM-index
            // Output a record with a fake read ID
            *pClusterWriter << clusterName << "\t" << clusterSize << "\tseed-" << seedIdx++ << "\t" << readSequence << "\n";
        }
        else
        {
            for(int64_t i = lowIdx; i <= highIdx; ++i)
            {
                const ReadInfo& targetInfo = pRIT->getReadInfo(pFwdSAI->get(i).getID());
                std::string readName = targetInfo.id;
                *pClusterWriter << clusterName << "\t" << clusterSize << "\t" << readName << "\t" << readSequence << "\n";
            }
        }
    }
    unlink(preclustersFile.c_str());

    delete pFwdSAI;
    delete pRIT;
    delete pPreReader;
    delete pClusterWriter;
}
Esempio n. 6
0
//
// Main
//
int filterMain(int argc, char** argv)
{
    parseFilterOptions(argc, argv);
    Timer* pTimer = new Timer(PROGRAM_IDENT);


    BWT* pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate);
    BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT, opt::sampleRate);
    //pBWT->printInfo();

    std::ostream* pWriter = createWriter(opt::outFile);
    std::ostream* pDiscardWriter = createWriter(opt::discardFile);
    QCPostProcess* pPostProcessor = new QCPostProcess(pWriter, pDiscardWriter);

    // If performing duplicate check, create a bitvector to record
    // which reads are duplicates
    BitVector* pSharedBV = NULL;
    if(opt::dupCheck)
        pSharedBV = new BitVector(pBWT->getNumStrings());

    // Set up QC parameters
    QCParameters params;
    params.pBWT = pBWT;
    params.pRevBWT = pRBWT;
    params.pSharedBV = pSharedBV;

    params.checkDuplicates = opt::dupCheck;
    params.substringOnly = opt::substringOnly;
    params.checkKmer = opt::kmerCheck;
    params.checkHPRuns = opt::hpCheck;
    params.checkDegenerate = opt::lowComplexityCheck;

    params.verbose = opt::verbose;

    params.kmerLength = opt::kmerLength;
    params.kmerThreshold = opt::kmerThreshold;

    params.hpKmerLength = 51;
    params.hpHardAcceptCount = 10;
    params.hpMinProportion = 0.1f;
    params.hpMinLength = 6;

    if(opt::numThreads <= 1)
    {
        // Serial mode
        QCProcess processor(params);
        PROCESS_FILTER_SERIAL(opt::readsFile, &processor, pPostProcessor);
    }
    else
    {
        // Parallel mode
        std::vector<QCProcess*> processorVector;
        for(int i = 0; i < opt::numThreads; ++i)
        {
            QCProcess* pProcessor = new QCProcess(params);
            processorVector.push_back(pProcessor);
        }

        PROCESS_FILTER_PARALLEL(opt::readsFile, processorVector, pPostProcessor);

        for(int i = 0; i < opt::numThreads; ++i)
            delete processorVector[i];
    }

    delete pPostProcessor;
    delete pWriter;
    delete pDiscardWriter;

    delete pBWT;
    delete pRBWT;

    if(pSharedBV != NULL)
        delete pSharedBV;

    std::cout << "RE-building index for " << opt::outFile << " in memory using ropebwt2\n";
    std::string prefix=stripFilename(opt::outFile);
        //BWT *pBWT, *pRBWT;
		#pragma omp parallel
		{
			#pragma omp single nowait
			{	
			    std::string bwt_filename = prefix + BWT_EXT;
				BWTCA::runRopebwt2(opt::outFile, bwt_filename, opt::numThreads, false);
				std::cout << "\t done bwt construction, generating .sai file\n";
				pBWT = new BWT(bwt_filename);
			}
			#pragma omp single nowait
			{	
				std::string rbwt_filename = prefix + RBWT_EXT;
				BWTCA::runRopebwt2(opt::outFile, rbwt_filename, opt::numThreads, true);
				std::cout << "\t done rbwt construction, generating .rsai file\n";
				pRBWT = new BWT(rbwt_filename);
			}
		}
        std::string sai_filename = prefix + SAI_EXT;
		SampledSuffixArray ssa;
        ssa.buildLexicoIndex(pBWT, opt::numThreads);
        ssa.writeLexicoIndex(sai_filename);
        delete pBWT;

        std::string rsai_filename = prefix + RSAI_EXT;
        SampledSuffixArray rssa;
        rssa.buildLexicoIndex(pRBWT, opt::numThreads);
        rssa.writeLexicoIndex(rsai_filename);
        delete pRBWT;

    // Cleanup
    delete pTimer;

    return 0;
}
Esempio n. 7
0
//
// Main
//
int graphDiffMain(int argc, char** argv)
{
    parseGraphDiffOptions(argc, argv);

    // Create BWTS
    std::string basePrefix = stripFilename(opt::baseFile);
    BWT* pBaseBWT = new BWT(basePrefix + BWT_EXT, opt::sampleRate);
    BWT* pBaseRevBWT = new BWT(basePrefix + RBWT_EXT, opt::sampleRate);

    std::string variantPrefix = stripFilename(opt::variantFile);
    BWT* pVariantBWT = new BWT(variantPrefix + BWT_EXT, opt::sampleRate);
    BWT* pVariantRevBWT = new BWT(variantPrefix + RBWT_EXT, opt::sampleRate);
    
    // Create the shared bit vector and shared results aggregator
    BitVector* pSharedBitVector = new BitVector(pVariantBWT->getBWLen());
    GraphCompareAggregateResults* pSharedResults = new GraphCompareAggregateResults(opt::outFile);

    // Create interval caches to speed up k-mer lookups
    BWTIntervalCache varBWTCache(opt::cacheLength, pVariantBWT);
    BWTIntervalCache varRBWTCache(opt::cacheLength, pVariantRevBWT);

    BWTIntervalCache baseBWTCache(opt::cacheLength, pBaseBWT);
    BWTIntervalCache baseRBWTCache(opt::cacheLength, pBaseRevBWT);


    // Set the parameters shared between all threads
    GraphCompareParameters sharedParameters;
    sharedParameters.pVariantBWT = pVariantBWT;
    sharedParameters.pVariantRevBWT = pVariantRevBWT;
    sharedParameters.pBaseBWT = pBaseBWT;
    sharedParameters.pBaseRevBWT = pBaseRevBWT;
    sharedParameters.kmer = opt::kmer;
    sharedParameters.pBitVector = pSharedBitVector;
    sharedParameters.kmerThreshold = 3;
    sharedParameters.maxBranches = opt::maxBranches;

    sharedParameters.pVarBWTCache = &varBWTCache;
    sharedParameters.pVarRevBWTCache = &varRBWTCache;
    sharedParameters.pBaseBWTCache = &baseBWTCache;
    sharedParameters.pBaseRevBWTCache = &baseRBWTCache;

    if(opt::numThreads <= 1)
    {
        printf("[%s] starting serial-mode graph diff\n", PROGRAM_IDENT);
        GraphCompare graphCompare(sharedParameters); 
        PROCESS_GDIFF_SERIAL(opt::variantFile, &graphCompare, pSharedResults);
        graphCompare.updateSharedStats(pSharedResults);
    }
    else
    {
        printf("[%s] starting parallel-mode graph diff with %d threads\n", PROGRAM_IDENT, opt::numThreads);
        
        std::vector<GraphCompare*> processorVector;
        for(int i = 0; i < opt::numThreads; ++i)
        {
            GraphCompare* pProcessor = new GraphCompare(sharedParameters);
            processorVector.push_back(pProcessor);
        }
        
        PROCESS_GDIFF_PARALLEL(opt::variantFile, processorVector, pSharedResults);
        
        for(size_t i = 0; i < processorVector.size(); ++i)
        {
            // Update the shared stats
            processorVector[i]->updateSharedStats(pSharedResults);

            delete processorVector[i];
            processorVector[i] = NULL;
        }


    }
    pSharedResults->printStats();

    // Cleanup
    delete pBaseBWT;
    delete pBaseRevBWT;
    delete pVariantBWT;
    delete pVariantRevBWT;
    delete pSharedBitVector;
    delete pSharedResults;

    if(opt::numThreads > 1)
        pthread_exit(NULL);

    return 0;
}
Esempio n. 8
0
//
// Main
//
int overlapLongMain(int argc, char** argv)
{
    parseOverlapLongOptions(argc, argv);

    // Open output file
    std::ostream* pASQGWriter = createWriter(opt::outFile);

    // Build and write the ASQG header
    ASQG::HeaderRecord headerRecord;
    headerRecord.setOverlapTag(opt::minOverlap);
    headerRecord.setErrorRateTag(opt::errorRate);
    headerRecord.setInputFileTag(opt::readsFile);
    headerRecord.setTransitiveTag(true);
    headerRecord.write(*pASQGWriter);

    // Determine which index files to use. If a target file was provided,
    // use the index of the target reads
    std::string indexPrefix;
    if(!opt::targetFile.empty())
        indexPrefix = stripFilename(opt::targetFile);
    else
        indexPrefix = stripFilename(opt::readsFile);

    BWT* pBWT = new BWT(indexPrefix + BWT_EXT, opt::sampleRate);
    SampledSuffixArray* pSSA = new SampledSuffixArray(indexPrefix + SAI_EXT, SSA_FT_SAI);
    
    Timer* pTimer = new Timer(PROGRAM_IDENT);
    pBWT->printInfo();

    // Read the sequence file and write vertex records for each
    // Also store the read names in a vector of strings
    ReadTable reads;
    
    SeqReader* pReader = new SeqReader(opt::readsFile, SRF_NO_VALIDATION);
    SeqRecord record;
    while(pReader->get(record))
    {
        reads.addRead(record.toSeqItem());
        ASQG::VertexRecord vr(record.id, record.seq.toString());
        vr.write(*pASQGWriter);

        if(reads.getCount() % 100000 == 0)
            printf("Read %zu sequences\n", reads.getCount());
    }

    delete pReader;
    pReader = NULL;

    BWTIndexSet index;
    index.pBWT = pBWT;
    index.pSSA = pSSA;
    index.pReadTable = &reads;

    // Make a prefix for the temporary hits files
    size_t n_reads = reads.getCount();

    omp_set_num_threads(opt::numThreads);

#pragma omp parallel for
    for(size_t read_idx = 0; read_idx < n_reads; ++read_idx)
    {
        const SeqItem& curr_read = reads.getRead(read_idx);

        printf("read %s %zubp\n", curr_read.id.c_str(), curr_read.seq.length());
        SequenceOverlapPairVector sopv = 
            KmerOverlaps::retrieveMatches(curr_read.seq.toString(),
                                          opt::seedLength,
                                          opt::minOverlap,
                                          1 - opt::errorRate,
                                          100,
                                          index);

        printf("Found %zu matches\n", sopv.size());
        for(size_t i = 0; i < sopv.size(); ++i)
        {
            std::string match_id = reads.getRead(sopv[i].match_idx).id;

            // We only want to output each edge once so skip this overlap
            // if the matched read has a lexicographically lower ID
            if(curr_read.id > match_id)
                continue;

            std::string ao = ascii_overlap(sopv[i].sequence[0], sopv[i].sequence[1], sopv[i].overlap, 50);
            printf("\t%s\t[%d %d] ID=%s OL=%d PI:%.2lf C=%s\n", ao.c_str(),
                                                                sopv[i].overlap.match[0].start,
                                                                sopv[i].overlap.match[0].end,
                                                                match_id.c_str(),
                                                                sopv[i].overlap.getOverlapLength(),
                                                                sopv[i].overlap.getPercentIdentity(),
                                                                sopv[i].overlap.cigar.c_str());

            // Convert to ASQG
            SeqCoord sc1(sopv[i].overlap.match[0].start, sopv[i].overlap.match[0].end, sopv[i].overlap.length[0]);
            SeqCoord sc2(sopv[i].overlap.match[1].start, sopv[i].overlap.match[1].end, sopv[i].overlap.length[1]);
            
            // KmerOverlaps returns the coordinates of the overlap after flipping the reads
            // to ensure the strand matches. The ASQG file wants the coordinate of the original
            // sequencing strand. Flip here if necessary
            if(sopv[i].is_reversed)
                sc2.flip();

            // Convert the SequenceOverlap the ASQG's overlap format
            Overlap ovr(curr_read.id, sc1, match_id,  sc2, sopv[i].is_reversed, -1);

            ASQG::EdgeRecord er(ovr);
            er.setCigarTag(sopv[i].overlap.cigar);
            er.setPercentIdentityTag(sopv[i].overlap.getPercentIdentity());

#pragma omp critical
            {
                er.write(*pASQGWriter);
            }
        }
    }

    // Cleanup
    delete pReader;
    delete pBWT; 
    delete pSSA;
    
    delete pASQGWriter;
    delete pTimer;
    if(opt::numThreads > 1)
        pthread_exit(NULL);

    return 0;
}
Esempio n. 9
0
//
// Main
//
int filterMain(int argc, char** argv)
{
    parseFilterOptions(argc, argv);
    Timer* pTimer = new Timer(PROGRAM_IDENT);


    BWT* pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate);
    BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT, opt::sampleRate);
    pBWT->printInfo();

    std::ostream* pWriter = createWriter(opt::outFile);
    std::ostream* pDiscardWriter = createWriter(opt::discardFile);
    QCPostProcess* pPostProcessor = new QCPostProcess(pWriter, pDiscardWriter);

    // If performing duplicate check, create a bitvector to record
    // which reads are duplicates
    BitVector* pSharedBV = NULL;
    if(opt::dupCheck)
        pSharedBV = new BitVector(pBWT->getNumStrings());

    // Set up QC parameters
    QCParameters params;
    params.pBWT = pBWT;
    params.pRevBWT = pRBWT;
    params.pSharedBV = pSharedBV;

    params.checkDuplicates = opt::dupCheck;
    params.substringOnly = opt::substringOnly;
    params.checkKmer = opt::kmerCheck;
    params.kmerBothStrand = opt::kmerBothStrand;
    params.checkHPRuns = opt::hpCheck;
    params.checkDegenerate = opt::lowComplexityCheck;

    params.verbose = opt::verbose;

    params.kmerLength = opt::kmerLength;
    params.kmerThreshold = opt::kmerThreshold;

    params.hpKmerLength = 51;
    params.hpHardAcceptCount = 10;
    params.hpMinProportion = 0.1f;
    params.hpMinLength = 6;

    if(opt::numThreads <= 1)
    {
        // Serial mode
        QCProcess processor(params);
        PROCESS_FILTER_SERIAL(opt::readsFile, &processor, pPostProcessor);
    }
    else
    {
        // Parallel mode
        std::vector<QCProcess*> processorVector;
        for(int i = 0; i < opt::numThreads; ++i)
        {
            QCProcess* pProcessor = new QCProcess(params);
            processorVector.push_back(pProcessor);
        }

        PROCESS_FILTER_PARALLEL(opt::readsFile, processorVector, pPostProcessor);

        for(int i = 0; i < opt::numThreads; ++i)
            delete processorVector[i];
    }

    delete pPostProcessor;
    delete pWriter;
    delete pDiscardWriter;

    delete pBWT;
    delete pRBWT;

    if(pSharedBV != NULL)
        delete pSharedBV;

    // Rebuild the FM-index without the discarded reads
    std::string out_prefix = stripFilename(opt::outFile);
    removeReadsFromIndices(opt::prefix, opt::discardFile, out_prefix, BWT_EXT, SAI_EXT, false, opt::numThreads);
    removeReadsFromIndices(opt::prefix, opt::discardFile, out_prefix, RBWT_EXT, RSAI_EXT, true, opt::numThreads);

    // Cleanup
    delete pTimer;
    if(opt::numThreads > 1)
        pthread_exit(NULL);

    return 0;
}
Esempio n. 10
0
//
// Main
//
int overlapMain(int argc, char** argv)
{
    parseOverlapOptions(argc, argv);

    // Prepare the output ASQG file
    assert(opt::outputType == OT_ASQG);

    // Open output file
    std::ostream* pASQGWriter = createWriter(opt::outFile);

    // Build and write the ASQG header
    ASQG::HeaderRecord headerRecord;
    headerRecord.setOverlapTag(opt::minOverlap);
    headerRecord.setErrorRateTag(opt::errorRate);
    headerRecord.setInputFileTag(opt::readsFile);
    headerRecord.setContainmentTag(true); // containments are always present
    headerRecord.setTransitiveTag(!opt::bIrreducibleOnly);
    headerRecord.write(*pASQGWriter);

    // Compute the overlap hits
    StringVector hitsFilenames;

    // Determine which index files to use. If a target file was provided,
    // use the index of the target reads
    std::string indexPrefix;
    if(!opt::prefix.empty())
      indexPrefix = opt::prefix;
    else
    {
      if(!opt::targetFile.empty())
        indexPrefix = stripFilename(opt::targetFile);
      else
        indexPrefix = stripFilename(opt::readsFile);
    }
    BWT* pBWT = new BWT(indexPrefix + BWT_EXT, opt::sampleRate);
    BWT* pRBWT = new BWT(indexPrefix + RBWT_EXT, opt::sampleRate);
    OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT, 
                                                         opt::errorRate, opt::seedLength, 
                                                         opt::seedStride, opt::bIrreducibleOnly);

    pOverlapper->setExactModeOverlap(opt::errorRate <= 0.0001);
    pOverlapper->setExactModeIrreducible(opt::errorRate <= 0.0001);

    Timer* pTimer = new Timer(PROGRAM_IDENT);
    pBWT->printInfo();

    // Make a prefix for the temporary hits files
    std::string outPrefix;
    outPrefix = stripFilename(opt::readsFile);
    if(!opt::targetFile.empty())
    {
        outPrefix.append(1, '.');
        outPrefix.append(stripFilename(opt::targetFile));
    }

    if(opt::numThreads <= 1)
    {
        printf("[%s] starting serial-mode overlap computation\n", PROGRAM_IDENT);
        computeHitsSerial(outPrefix, opt::readsFile, pOverlapper, opt::minOverlap, hitsFilenames, pASQGWriter);
    }
    else
    {
        printf("[%s] starting parallel-mode overlap computation with %d threads\n", PROGRAM_IDENT, opt::numThreads);
        computeHitsParallel(opt::numThreads, outPrefix, opt::readsFile, pOverlapper, opt::minOverlap, hitsFilenames, pASQGWriter);
    }

    // Get the number of strings in the BWT, this is used to pre-allocated the read table
    delete pOverlapper;
    delete pBWT; 
    delete pRBWT;

    // Parse the hits files and write the overlaps to the ASQG file
    convertHitsToASQG(indexPrefix, hitsFilenames, pASQGWriter);

    // Cleanup
    delete pASQGWriter;
    delete pTimer;
    if(opt::numThreads > 1)
        pthread_exit(NULL);

    return 0;
}
Esempio n. 11
0
//
// Main
//
int correctMain(int argc, char** argv)
{
    parseCorrectOptions(argc, argv);

    BWT* pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate);
    BWT* pRBWT = NULL;

    // If the correction mode is k-mer only, then do not load the reverse
    // BWT as it is not needed
    if(opt::algorithm != ECA_KMER)
        pRBWT = new BWT(opt::prefix + RBWT_EXT, opt::sampleRate);
    
    BWTIntervalCache intervalCache(opt::intervalCacheLength, pBWT);

    OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, NULL, 
                                                         opt::errorRate, opt::seedLength, 
                                                         opt::seedStride, false, opt::branchCutoff);
    

    // Learn the parameters of the kmer corrector
    if(opt::bLearnKmerParams)
    {
        int threshold = learnKmerParameters(pBWT);
        if(threshold != -1)
            CorrectionThresholds::Instance().setBaseMinSupport(threshold);
    }


    // Open outfiles and start a timer
    std::ostream* pWriter = createWriter(opt::outFile);
    std::ostream* pDiscardWriter = (!opt::discardFile.empty() ? createWriter(opt::discardFile) : NULL);
    Timer* pTimer = new Timer(PROGRAM_IDENT);
    pBWT->printInfo();

    // Set the error correction parameters
    ErrorCorrectParameters ecParams;
    ecParams.pOverlapper = pOverlapper;
    ecParams.pIntervalCache = &intervalCache;
    ecParams.algorithm = opt::algorithm;

    ecParams.minOverlap = opt::minOverlap;
    ecParams.numOverlapRounds = opt::numOverlapRounds;
    ecParams.conflictCutoff = opt::conflictCutoff;

    ecParams.numKmerRounds = opt::numKmerRounds;
    ecParams.kmerLength = opt::kmerLength;
    ecParams.printOverlaps = opt::verbose > 1;

    // Setup post-processor
    bool bCollectMetrics = !opt::metricsFile.empty();
    ErrorCorrectPostProcess postProcessor(pWriter, pDiscardWriter, bCollectMetrics);

    if(opt::numThreads <= 1)
    {
        // Serial mode
        ErrorCorrectProcess processor(ecParams); 
        SequenceProcessFramework::processSequencesSerial<SequenceWorkItem,
                                                         ErrorCorrectResult, 
                                                         ErrorCorrectProcess, 
                                                         ErrorCorrectPostProcess>(opt::readsFile, &processor, &postProcessor);
    }
    else
    {
        // Parallel mode
        std::vector<ErrorCorrectProcess*> processorVector;
        for(int i = 0; i < opt::numThreads; ++i)
        {
            ErrorCorrectProcess* pProcessor = new ErrorCorrectProcess(ecParams);
            processorVector.push_back(pProcessor);
        }
        
        SequenceProcessFramework::processSequencesParallel<SequenceWorkItem,
                                                           ErrorCorrectResult, 
                                                           ErrorCorrectProcess, 
                                                           ErrorCorrectPostProcess>(opt::readsFile, processorVector, &postProcessor);

        for(int i = 0; i < opt::numThreads; ++i)
        {
            delete processorVector[i];
        }
    }

    if(bCollectMetrics)
    {
        std::ostream* pMetricsWriter = createWriter(opt::metricsFile);
        postProcessor.writeMetrics(pMetricsWriter);
        delete pMetricsWriter;
    }

    delete pBWT;
    if(pRBWT != NULL)
        delete pRBWT;

    delete pOverlapper;
    delete pTimer;
    
    delete pWriter;
    if(pDiscardWriter != NULL)
        delete pDiscardWriter;

    if(opt::numThreads > 1)
        pthread_exit(NULL);

    return 0;
}
Esempio n. 12
0
//
// Main
//
int correctMain(int argc, char** argv)
{
    parseCorrectOptions(argc, argv);

    std::cout << "Correcting sequencing errors for " << opt::readsFile << "\n";

    // Load indices
    BWT* pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate);
    BWT* pRBWT = NULL;
    SampledSuffixArray* pSSA = NULL;

    if(opt::algorithm == ECA_OVERLAP)
        pSSA = new SampledSuffixArray(opt::prefix + SAI_EXT, SSA_FT_SAI);

    BWTIntervalCache* pIntervalCache = new BWTIntervalCache(opt::intervalCacheLength, pBWT);

    BWTIndexSet indexSet;
    indexSet.pBWT = pBWT;
    indexSet.pRBWT = pRBWT;
    indexSet.pSSA = pSSA;
    indexSet.pCache = pIntervalCache;

    // Learn the parameters of the kmer corrector
    if(opt::bLearnKmerParams)
    {
        int threshold = learnKmerParameters(pBWT);
        if(threshold != -1)
            CorrectionThresholds::Instance().setBaseMinSupport(threshold);
    }

    // Open outfiles and start a timer
    std::ostream* pWriter = createWriter(opt::outFile);
    std::ostream* pDiscardWriter = (!opt::discardFile.empty() ? createWriter(opt::discardFile) : NULL);
    Timer* pTimer = new Timer(PROGRAM_IDENT);
    pBWT->printInfo();

    // Set the error correction parameters
    ErrorCorrectParameters ecParams;
    ecParams.pOverlapper = NULL;
    ecParams.indices = indexSet;
    ecParams.algorithm = opt::algorithm;

    ecParams.minOverlap = opt::minOverlap;
    ecParams.numOverlapRounds = opt::numOverlapRounds;
    ecParams.minIdentity = 1.0f - opt::errorRate;
    ecParams.conflictCutoff = opt::conflictCutoff;

    ecParams.numKmerRounds = opt::numKmerRounds;
    ecParams.kmerLength = opt::kmerLength;
    ecParams.printOverlaps = opt::verbose > 0;

    // Setup post-processor
    bool bCollectMetrics = !opt::metricsFile.empty();
    ErrorCorrectPostProcess postProcessor(pWriter, pDiscardWriter, bCollectMetrics);

    if(opt::numThreads <= 1)
    {
        // Serial mode
        ErrorCorrectProcess processor(ecParams);
        SequenceProcessFramework::processSequencesSerial<SequenceWorkItem,
                                 ErrorCorrectResult,
                                 ErrorCorrectProcess,
                                 ErrorCorrectPostProcess>(opt::readsFile, &processor, &postProcessor);
    }
    else
    {
        // Parallel mode
        std::vector<ErrorCorrectProcess*> processorVector;
        for(int i = 0; i < opt::numThreads; ++i)
        {
            ErrorCorrectProcess* pProcessor = new ErrorCorrectProcess(ecParams);
            processorVector.push_back(pProcessor);
        }

        SequenceProcessFramework::processSequencesParallel<SequenceWorkItem,
                                 ErrorCorrectResult,
                                 ErrorCorrectProcess,
                                 ErrorCorrectPostProcess>(opt::readsFile, processorVector, &postProcessor);

        for(int i = 0; i < opt::numThreads; ++i)
        {
            delete processorVector[i];
        }
    }

    if(bCollectMetrics)
    {
        std::ostream* pMetricsWriter = createWriter(opt::metricsFile);
        postProcessor.writeMetrics(pMetricsWriter);
        delete pMetricsWriter;
    }

    delete pBWT;
    delete pIntervalCache;
    if(pRBWT != NULL)
        delete pRBWT;

    if(pSSA != NULL)
        delete pSSA;

    delete pTimer;

    delete pWriter;
    if(pDiscardWriter != NULL)
        delete pDiscardWriter;

    if(opt::numThreads > 1)
        pthread_exit(NULL);

    return 0;
}
Esempio n. 13
0
//
// Main
//
int FMMergeMain(int argc, char** argv)
{
    parseFMMergeOptions(argc, argv);

    BWT* pBWT = new BWT(opt::prefix + BWT_EXT);
    BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT);
    OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT,0.0f, 0,0,true); 
    pOverlapper->setExactModeOverlap(true);
    pOverlapper->setExactModeIrreducible(true);
    Timer* pTimer = new Timer(PROGRAM_IDENT);
    pBWT->printInfo();

    // Construct a bitvector indicating what reads have been used
    // All the processes read from this vector and only the post processor
    // writes to it.
    BitVector markedReads(pBWT->getNumStrings());

    std::ostream* pWriter = createWriter(opt::outFile);
    FMMergePostProcess postProcessor(pWriter, &markedReads);

    if(opt::numThreads <= 1)
    {
        printf("[%s] starting serial-mode read merging\n", PROGRAM_IDENT);
        FMMergeProcess processor(pOverlapper, opt::minOverlap, &markedReads);
        SequenceProcessFramework::processSequencesSerial<SequenceWorkItem,
                                                         FMMergeResult, 
                                                         FMMergeProcess, 
                                                         FMMergePostProcess>(opt::readsFile, &processor, &postProcessor);
    }
    else
    {
        printf("[%s] starting parallel-mode read merging computation with %d threads\n", PROGRAM_IDENT, opt::numThreads);
        
        std::vector<FMMergeProcess*> processorVector;
        for(int i = 0; i < opt::numThreads; ++i)
        {
            FMMergeProcess* pProcessor = new FMMergeProcess(pOverlapper, opt::minOverlap, &markedReads);
            processorVector.push_back(pProcessor);
        }

        SequenceProcessFramework::processSequencesParallel<SequenceWorkItem,
                                                         FMMergeResult, 
                                                         FMMergeProcess, 
                                                         FMMergePostProcess>(opt::readsFile, processorVector, &postProcessor);
        
        for(size_t i = 0; i < processorVector.size(); ++i)
        {
            delete processorVector[i];
            processorVector[i] = NULL;
        }
    }

    // Check that every bit was set in the bit vector
    size_t numSet = 0;
    size_t numTotal = pBWT->getNumStrings();
    for(size_t i = 0; i < numTotal; ++i)
    {
        if(markedReads.test(i))
            ++numSet;
    }

    // Get the number of strings in the BWT, this is used to pre-allocated the read table
    delete pOverlapper;
    delete pBWT; 
    delete pRBWT;
    delete pWriter;

    // Cleanup
    delete pTimer;
    if(opt::numThreads > 1)
        pthread_exit(NULL);

    return 0;
}