Пример #1
0
// Compute the hits for each read in the SeqReader file with threading
// The way this works is we create a vector of numThreads OverlapProcess pointers and 
// pass this to the SequenceProcessFramework which wraps the processes
// in threads and distributes the reads to each thread.
// The number of reads processsed is returned
size_t computeHitsParallel(int numThreads, const std::string& prefix, const std::string& readsFile, 
                           const OverlapAlgorithm* pOverlapper, int minOverlap, 
                           StringVector& filenameVec, std::ostream* pASQGWriter)
{
    std::string filename = prefix + HITS_EXT + GZIP_EXT;

    std::vector<OverlapProcess*> processorVector;
    for(int i = 0; i < numThreads; ++i)
    {
        std::stringstream ss;
        ss << prefix << "-thread" << i << HITS_EXT << GZIP_EXT;
        std::string outfile = ss.str();
        filenameVec.push_back(outfile);
        OverlapProcess* pProcessor = new OverlapProcess(outfile, pOverlapper, minOverlap);
        processorVector.push_back(pProcessor);
    }

    // The post processing is performed serially so only one post processor is created
    OverlapPostProcess postProcessor(pASQGWriter, pOverlapper);
    
    size_t numProcessed = 
           SequenceProcessFramework::processSequencesParallel<SequenceWorkItem,
                                                              OverlapResult, 
                                                              OverlapProcess, 
                                                              OverlapPostProcess>(readsFile, processorVector, &postProcessor);
    for(int i = 0; i < numThreads; ++i)
        delete processorVector[i];
    return numProcessed;
}
Пример #2
0
//
// Main
//
int statsMain(int argc, char** argv)
{
    parseStatsOptions(argc, argv);
    Timer* pTimer = new Timer(PROGRAM_IDENT);

    BWT* pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate);
    BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT, opt::sampleRate);

    if(opt::bPrintRunLengths)
    {
        pBWT->printInfo();
        pBWT->printRunLengths();
    }

    SeqReader reader(opt::readsFile);
    
    StatsPostProcess postProcessor(opt::bPrintKmerDist);
    if(opt::numThreads <= 1)
    {
        // Serial mode
        StatsProcess processor(pBWT, pRBWT, opt::kmerLength, opt::minOverlap, opt::branchCutoff, opt::bNoOverlap);

        SequenceProcessFramework::processSequencesSerial<SequenceWorkItem,
                                                         StatsResult, 
                                                         StatsProcess, 
                                                         StatsPostProcess>(reader, &processor, &postProcessor, opt::numReads);
    }
    else
    {
        // Parallel mode
        std::vector<StatsProcess*> processorVector;
        for(int i = 0; i < opt::numThreads; ++i)
        {
            StatsProcess* pProcessor = new StatsProcess(pBWT, pRBWT, opt::kmerLength, opt::minOverlap, opt::branchCutoff, opt::bNoOverlap);
            processorVector.push_back(pProcessor);
        }
        
        SequenceProcessFramework::processSequencesParallel<SequenceWorkItem,
                                                           StatsResult, 
                                                           StatsProcess, 
                                                           StatsPostProcess>(reader, processorVector, &postProcessor, opt::numReads);

        for(int i = 0; i < opt::numThreads; ++i)
        {
            delete processorVector[i];
        }
    }

    delete pBWT;
    delete pRBWT;
    delete pTimer;

    if(opt::numThreads > 1)
        pthread_exit(NULL);

    return 0;
}
Пример #3
0
// Compute the hits for each read in the input file without threading
// Return the number of reads processed
size_t computeHitsSerial(const std::string& prefix, const std::string& readsFile, 
                         const OverlapAlgorithm* pOverlapper, int minOverlap, 
                         StringVector& filenameVec, std::ostream* pASQGWriter)
{
    std::string filename = prefix + HITS_EXT + GZIP_EXT;
    filenameVec.push_back(filename);

    OverlapProcess processor(filename, pOverlapper, minOverlap);
    OverlapPostProcess postProcessor(pASQGWriter, pOverlapper);

    size_t numProcessed = 
           SequenceProcessFramework::processSequencesSerial<SequenceWorkItem,
                                                            OverlapResult, 
                                                            OverlapProcess, 
                                                            OverlapPostProcess>(readsFile, &processor, &postProcessor);
    return numProcessed;
}
Пример #4
0
// Fill pGapArray using the first n sequences delivered by pReader.
// The gap array is resized to the BWT length plus one. A RankProcess
// calculates the rank of every suffix of a sequence and the RankPostProcess
// applies the resulting ranks to the gap array. On return, num_strings_read
// and num_symbols_read hold the totals reported by the post-processor.
void computeGapArray(SeqReader* pReader, size_t n, const BWT* pBWT, bool doReverse, int numThreads, GapArray* pGapArray, 
                     bool removeMode, size_t& num_strings_read, size_t& num_symbols_read)
{
    // Size the gap array before any ranks are accumulated
    size_t numSlots = pBWT->getBWLen() + 1;
    pGapArray->resize(numSlots);

    RankPostProcess postProcessor(pGapArray);
    size_t numProcessed = 0;

    if(numThreads > 1)
    {
        // Threaded path: one heap-allocated RankProcess per thread
        std::vector<RankProcess*> workers;
        for(int t = 0; t < numThreads; ++t)
            workers.push_back(new RankProcess(pBWT, pGapArray, doReverse, removeMode));

        numProcessed = 
           SequenceProcessFramework::processSequencesParallel<SequenceWorkItem,
                                                              RankResult,
                                                              RankProcess,
                                                              RankPostProcess>(*pReader, workers, &postProcessor, n);

        for(int t = 0; t < numThreads; ++t)
        {
            delete workers[t];
        }
    }
    else
    {
        // Serial path: a single stack-allocated RankProcess
        RankProcess worker(pBWT, pGapArray, doReverse, removeMode);

        numProcessed = 
           SequenceProcessFramework::processSequencesSerial<SequenceWorkItem,
                                                            RankResult,
                                                            RankProcess,
                                                            RankPostProcess>(*pReader, &worker, &postProcessor, n);
    }

    // Report the totals gathered by the post-processor
    num_strings_read = postProcessor.getNumStringsProcessed();
    num_symbols_read = postProcessor.getNumSymbolsProcessed();

    // Unless n is the (size_t)-1 sentinel, exactly n sequences must have been consumed
    assert((numProcessed == n) || n == (size_t)-1);
}
Пример #5
0
// Cluster the reads in two phases.
//
// Phase 1 runs ClusterProcess over either opt::readsFile (new clusters) or
// the clusters in opt::extendFile (extension), serially or with
// opt::numThreads workers, writing intermediate records to the temporary
// file "<opt::outFile>.preclusters".
//
// Phase 2 reads the temporary file back, translates each suffix array index
// range into read names through the suffix array and the read info table,
// writes the final records to opt::outFile and removes the temporary file.
void cluster()
{
    BWT* pBWT = new BWT(opt::prefix + BWT_EXT);
    BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT);
    OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT,opt::errorRate, opt::seedLength, opt::seedStride, true);

    // Exact-mode overlap computation is enabled only for a near-zero error rate
    pOverlapper->setExactModeOverlap(opt::errorRate < 0.001f);
    pOverlapper->setExactModeIrreducible(opt::errorRate < 0.001f);

    // One bit per string in the BWT; shared by the processes and the post processor
    BitVector markedReads(pBWT->getNumStrings());

    std::string preclustersFile = opt::outFile + ".preclusters";
    std::ostream* pPreWriter = createWriter(preclustersFile);
    ClusterPostProcess postProcessor(pPreWriter, opt::minSize, &markedReads);
    
    // Set the cluster parameters
    ClusterParameters parameters;
    parameters.pOverlapper = pOverlapper;
    parameters.minOverlap = opt::minOverlap;
    parameters.maxClusterSize = opt::maxSize;
    parameters.maxIterations = opt::maxIterations;
    parameters.pMarkedReads = &markedReads;

    // Read the limit kmer sequences, if provided
    std::set<std::string>* pLimitKmers = NULL;

    if(!opt::limitFile.empty())
    {
        // Read in the limit sequences
        pLimitKmers = new std::set<std::string>;
        readLimitKmers(pLimitKmers);
        parameters.pLimitKmers = pLimitKmers;
        parameters.limitK = opt::limitKmer;
    }
    else
    {
        parameters.pLimitKmers = NULL;
        parameters.limitK = 0;
    }

    // Make pre-clusters from the reads
    if(opt::numThreads <= 1)
    {
        printf("[%s] starting serial-mode read clustering\n", PROGRAM_IDENT);
        ClusterProcess processor(parameters);
        
        // If the extend file is empty, build new clusters
        if(opt::extendFile.empty())
        {
            PROCESS_CLUSTER_SERIAL(opt::readsFile, &processor, &postProcessor);
        }
        else
        {
            // Process a set of preexisting clusters
            ClusterReader clusterReader(opt::extendFile);
            PROCESS_EXTEND_SERIAL(clusterReader, &processor, &postProcessor);
        }
    }
    else
    {
        printf("[%s] starting parallel-mode read clustering computation with %d threads\n", PROGRAM_IDENT, opt::numThreads);
        
        std::vector<ClusterProcess*> processorVector;
        for(int i = 0; i < opt::numThreads; ++i)
        {
            ClusterProcess* pProcessor = new ClusterProcess(parameters);
            processorVector.push_back(pProcessor);
        }
        
        if(opt::extendFile.empty())
        {
            PROCESS_CLUSTER_PARALLEL(opt::readsFile, processorVector, &postProcessor);
        }
        else
        {
            ClusterReader clusterReader(opt::extendFile);
            PROCESS_EXTEND_PARALLEL(clusterReader, processorVector, &postProcessor);
        }
        
        for(size_t i = 0; i < processorVector.size(); ++i)
        {
            delete processorVector[i];
            processorVector[i] = NULL;
        }
    }

    // The precluster writer is destroyed before the file is read back below
    delete pPreWriter;
    delete pBWT;
    delete pRBWT;
    delete pOverlapper;

    // Deallocate limit kmers (deleting a null pointer is a no-op)
    delete pLimitKmers;

    // Open the preclusters file and convert them to read names
    SuffixArray* pFwdSAI = new SuffixArray(opt::prefix + SAI_EXT);
    ReadInfoTable* pRIT = new ReadInfoTable(opt::readsFile, pFwdSAI->getNumStrings());

    size_t seedIdx = 0;
    std::istream* pPreReader = createReader(preclustersFile);
    std::ostream* pClusterWriter = createWriter(opt::outFile);
    std::string line;
    while(getline(*pPreReader,line))
    {
        std::stringstream parser(line);
        std::string clusterName;
        std::string readSequence;
        size_t clusterSize = 0;
        int64_t lowIdx = 0;
        int64_t highIdx = 0;
        parser >> clusterName >> clusterSize >> readSequence >> lowIdx >> highIdx;

        // A short or malformed record would otherwise leave some fields
        // unset and drive the index loop below with garbage values.
        if(parser.fail())
        {
            fprintf(stderr, "[%s] warning: skipping malformed precluster record: %s\n", PROGRAM_IDENT, line.c_str());
            continue;
        }

        if(lowIdx > highIdx)
        {
            // This is an extra read that is not present in the FM-index
            // Output a record with a fake read ID
            *pClusterWriter << clusterName << "\t" << clusterSize << "\tseed-" << seedIdx++ << "\t" << readSequence << "\n";
        }
        else
        {
            // Emit one output record per suffix array entry in [lowIdx, highIdx]
            for(int64_t i = lowIdx; i <= highIdx; ++i)
            {
                const ReadInfo& targetInfo = pRIT->getReadInfo(pFwdSAI->get(i).getID());
                std::string readName = targetInfo.id;
                *pClusterWriter << clusterName << "\t" << clusterSize << "\t" << readName << "\t" << readSequence << "\n";
            }
        }
    }

    // Close the reader before removing the temporary file
    delete pPreReader;
    unlink(preclustersFile.c_str());

    delete pFwdSAI;
    delete pRIT;
    delete pClusterWriter;
}
Пример #6
0
//
// Main
//
// Entry point of the FM-index walk subprogram. Parses the command line,
// loads the forward BWT, reverse BWT and sampled suffix array, samples a
// k-mer count distribution from the reverse BWT, then runs
// FMIndexWalkProcess over opt::readsFile either serially or with
// opt::numThreads workers. Results go to opt::outFile and, when
// opt::discardFile is non-empty, to a second discard writer.
// Always returns 0.
//
int FMindexWalkMain(int argc, char** argv)
{
    parseFMWalkOptions(argc, argv);

    // Set the error correction parameters
    FMIndexWalkParameters ecParams;
	// These pointers are not initialized here; each one is assigned inside
	// the OpenMP region below.
	BWT *pBWT, *pRBWT;
	SampledSuffixArray* pSSA;

    // Load indices
    // Each "single nowait" block is executed by one thread of the team and the
    // other threads do not wait at its end, so when OpenMP is enabled the
    // three loads can proceed concurrently.
	#pragma omp parallel
	{
		#pragma omp single nowait
		{	//Initialization of large BWT takes some time, pass the disk to next job
			std::cout << std::endl << "Loading BWT: " << opt::prefix + BWT_EXT << "\n";
			pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate);
		}
		#pragma omp single nowait
		{
			std::cout << "Loading RBWT: " << opt::prefix + RBWT_EXT << "\n";
			pRBWT = new BWT(opt::prefix + RBWT_EXT, opt::sampleRate);
		}
		#pragma omp single nowait
		{
			std::cout << "Loading Sampled Suffix Array: " << opt::prefix + SAI_EXT << "\n";
			pSSA = new SampledSuffixArray(opt::prefix + SAI_EXT, SSA_FT_SAI);
		}
	}

    // Bundle the three indices and hand a copy of the bundle to the parameters
    BWTIndexSet indexSet;
    indexSet.pBWT = pBWT;
    indexSet.pRBWT = pRBWT;
    indexSet.pSSA = pSSA;
    ecParams.indices = indexSet;

	// Sample 100000 kmer counts into KmerDistribution from reverse BWT 
	// Don't sample from forward BWT as Illumina reads are bad at the 3' end
	// Note that the sampled k-mer size is opt::minOverlap, not opt::kmerLength.
	ecParams.kd = BWTAlgorithms::sampleKmerCounts(opt::minOverlap, 100000, pRBWT);
	ecParams.kd.computeKDAttributes();
	// const size_t RepeatKmerFreq = ecParams.kd.getCutoffForProportion(0.95); 
	std::cout << "Median kmer frequency: " <<ecParams.kd.getMedian() << "\t Std: " <<  ecParams.kd.getSdv() 
					<<"\t 95% kmer frequency: " << ecParams.kd.getCutoffForProportion(0.95)
					<< "\t Repeat frequency cutoff: " << ecParams.kd.getRepeatKmerCutoff() << "\n";
	
    // Open outfiles and start a timer
    // The discard writer stays NULL when no discard file was requested.
    std::ostream* pWriter = createWriter(opt::outFile);
    std::ostream* pDiscardWriter = (!opt::discardFile.empty() ? createWriter(opt::discardFile) : NULL);
    Timer* pTimer = new Timer(PROGRAM_IDENT);

    // Copy the remaining command-line options into the parameter set
    ecParams.algorithm = opt::algorithm;
    ecParams.kmerLength = opt::kmerLength;
    ecParams.printOverlaps = opt::verbose > 0;
	ecParams.maxLeaves = opt::maxLeaves;
	ecParams.maxInsertSize = opt::maxInsertSize;
    ecParams.minOverlap = opt::minOverlap;
    ecParams.maxOverlap = opt::maxOverlap;
	
    // Setup post-processor
    FMIndexWalkPostProcess postProcessor(pWriter, pDiscardWriter, ecParams);

    std::cout << "Merge paired end reads into long reads for " << opt::readsFile << " using \n" 
				<< "min overlap=" <<  ecParams.minOverlap << "\t"
				<< "max overlap=" <<  ecParams.maxOverlap << "\t"
				<< "max leaves=" << opt::maxLeaves << "\t"
				<< "max Insert size=" << opt::maxInsertSize << "\t"
				<< "kmer size=" << opt::kmerLength << "\n\n";

    if(opt::numThreads <= 1)
    {
        // Serial mode
        FMIndexWalkProcess processor(ecParams);

		// The hybrid and merge algorithms consume reads as SequenceWorkItemPair;
		// every other algorithm consumes single SequenceWorkItem records.
		// The if/else below is unbraced: each branch is exactly one statement.
		if (ecParams.algorithm == FMW_HYBRID || ecParams.algorithm == FMW_MERGE)
        SequenceProcessFramework::processSequencesSerial<SequenceWorkItemPair,
                                                         FMIndexWalkResult,
                                                         FMIndexWalkProcess,
                                                         FMIndexWalkPostProcess>(opt::readsFile, &processor, &postProcessor);

		else
        SequenceProcessFramework::processSequencesSerial<SequenceWorkItem,
                                                         FMIndexWalkResult,
                                                         FMIndexWalkProcess,
                                                         FMIndexWalkPostProcess>(opt::readsFile, &processor, &postProcessor);
    }
    else
    {
        // Parallel mode
        // One heap-allocated processor per requested thread
        std::vector<FMIndexWalkProcess*> processorVector;
        for(int i = 0; i < opt::numThreads; ++i)
        {
            FMIndexWalkProcess* pProcessor = new FMIndexWalkProcess(ecParams);
            processorVector.push_back(pProcessor);
        }

		// Same work-item selection as in serial mode; again an unbraced if/else
		if (ecParams.algorithm == FMW_HYBRID || ecParams.algorithm == FMW_MERGE)
        SequenceProcessFramework::processSequencesParallel<SequenceWorkItemPair,
                                                           FMIndexWalkResult,
                                                           FMIndexWalkProcess,
                                                           FMIndexWalkPostProcess>(opt::readsFile, processorVector, &postProcessor);

		else
        SequenceProcessFramework::processSequencesParallel<SequenceWorkItem,
                                                           FMIndexWalkResult,
                                                           FMIndexWalkProcess,
                                                           FMIndexWalkPostProcess>(opt::readsFile, processorVector, &postProcessor);

        for(int i = 0; i < opt::numThreads; ++i)
        {
            delete processorVector[i];
        }
    }

    // Release the indices, the timer and the writers
    delete pBWT;
    if(pRBWT != NULL)
        delete pRBWT;

    if(pSSA != NULL)
        delete pSSA;

    delete pTimer;

    delete pWriter;
    if(pDiscardWriter != NULL)
        delete pDiscardWriter;
	
    return 0;
}
Пример #7
0
//
// Main
//
int correctMain(int argc, char** argv)
{
    parseCorrectOptions(argc, argv);

    BWT* pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate);
    BWT* pRBWT = NULL;

    // If the correction mode is k-mer only, then do not load the reverse
    // BWT as it is not needed
    if(opt::algorithm != ECA_KMER)
        pRBWT = new BWT(opt::prefix + RBWT_EXT, opt::sampleRate);
    
    BWTIntervalCache intervalCache(opt::intervalCacheLength, pBWT);

    OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, NULL, 
                                                         opt::errorRate, opt::seedLength, 
                                                         opt::seedStride, false, opt::branchCutoff);
    

    // Learn the parameters of the kmer corrector
    if(opt::bLearnKmerParams)
    {
        int threshold = learnKmerParameters(pBWT);
        if(threshold != -1)
            CorrectionThresholds::Instance().setBaseMinSupport(threshold);
    }


    // Open outfiles and start a timer
    std::ostream* pWriter = createWriter(opt::outFile);
    std::ostream* pDiscardWriter = (!opt::discardFile.empty() ? createWriter(opt::discardFile) : NULL);
    Timer* pTimer = new Timer(PROGRAM_IDENT);
    pBWT->printInfo();

    // Set the error correction parameters
    ErrorCorrectParameters ecParams;
    ecParams.pOverlapper = pOverlapper;
    ecParams.pIntervalCache = &intervalCache;
    ecParams.algorithm = opt::algorithm;

    ecParams.minOverlap = opt::minOverlap;
    ecParams.numOverlapRounds = opt::numOverlapRounds;
    ecParams.conflictCutoff = opt::conflictCutoff;

    ecParams.numKmerRounds = opt::numKmerRounds;
    ecParams.kmerLength = opt::kmerLength;
    ecParams.printOverlaps = opt::verbose > 1;

    // Setup post-processor
    bool bCollectMetrics = !opt::metricsFile.empty();
    ErrorCorrectPostProcess postProcessor(pWriter, pDiscardWriter, bCollectMetrics);

    if(opt::numThreads <= 1)
    {
        // Serial mode
        ErrorCorrectProcess processor(ecParams); 
        SequenceProcessFramework::processSequencesSerial<SequenceWorkItem,
                                                         ErrorCorrectResult, 
                                                         ErrorCorrectProcess, 
                                                         ErrorCorrectPostProcess>(opt::readsFile, &processor, &postProcessor);
    }
    else
    {
        // Parallel mode
        std::vector<ErrorCorrectProcess*> processorVector;
        for(int i = 0; i < opt::numThreads; ++i)
        {
            ErrorCorrectProcess* pProcessor = new ErrorCorrectProcess(ecParams);
            processorVector.push_back(pProcessor);
        }
        
        SequenceProcessFramework::processSequencesParallel<SequenceWorkItem,
                                                           ErrorCorrectResult, 
                                                           ErrorCorrectProcess, 
                                                           ErrorCorrectPostProcess>(opt::readsFile, processorVector, &postProcessor);

        for(int i = 0; i < opt::numThreads; ++i)
        {
            delete processorVector[i];
        }
    }

    if(bCollectMetrics)
    {
        std::ostream* pMetricsWriter = createWriter(opt::metricsFile);
        postProcessor.writeMetrics(pMetricsWriter);
        delete pMetricsWriter;
    }

    delete pBWT;
    if(pRBWT != NULL)
        delete pRBWT;

    delete pOverlapper;
    delete pTimer;
    
    delete pWriter;
    if(pDiscardWriter != NULL)
        delete pDiscardWriter;

    if(opt::numThreads > 1)
        pthread_exit(NULL);

    return 0;
}
//
// Main
//
// Entry point of the PacBio correction subprogram. Parses the command line,
// loads the forward BWT, reverse BWT and sampled suffix array, copies the
// options into PacBioCorrectionParameters, prints a summary for the selected
// algorithm and runs PacBioCorrectionProcess over opt::readsFile either
// serially or with opt::numThreads workers.
// Always returns 0.
//
int PacBioCorrectionMain(int argc, char** argv)
{
	parsePacBioCorrectionOptions(argc, argv);

	// Set the error correction parameters
	PacBioCorrectionParameters ecParams;
	// These pointers are not initialized here; each one is assigned inside
	// the OpenMP region below.
	BWT *pBWT, *pRBWT;
	SampledSuffixArray* pSSA;

	// Load indices
	// Each "single nowait" block is executed by one thread of the team and the
	// other threads do not wait at its end, so when OpenMP is enabled the
	// three loads can proceed concurrently.
	#pragma omp parallel
	{
		#pragma omp single nowait
		{	//Initialization of large BWT takes some time, pass the disk to next job
			std::cout << std::endl << "Loading BWT: " << opt::prefix + BWT_EXT << "\n";
			pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate);
		}
		#pragma omp single nowait
		{
			std::cout << "Loading RBWT: " << opt::prefix + RBWT_EXT << "\n";
			pRBWT = new BWT(opt::prefix + RBWT_EXT, opt::sampleRate);
		}
		#pragma omp single nowait
		{
			std::cout << "Loading Sampled Suffix Array: " << opt::prefix + SAI_EXT << "\n";
			pSSA = new SampledSuffixArray(opt::prefix + SAI_EXT, SSA_FT_SAI);
		}
	}

	// The k-mer distribution sampling below is disabled (commented out), so
	// ecParams.kd is not populated by this function.
	// Sample 100000 kmer counts into KmerDistribution from reverse BWT 
	// Don't sample from forward BWT as Illumina reads are bad at the 3' end
	// ecParams.kd = BWTAlgorithms::sampleKmerCounts(opt::kmerLength, 100000, pBWT);
	// ecParams.kd.computeKDAttributes();
	// ecParams.kd.print(100);
	// const size_t RepeatKmerFreq = ecParams.kd.getCutoffForProportion(0.95); 
	// std::cout << "Median kmer frequency: " <<ecParams.kd.getMedian() << "\t Std: " <<  ecParams.kd.getSdv() 
				// <<"\t 95% kmer frequency: " << ecParams.kd.getCutoffForProportion(0.95)
				// << "\t Repeat frequency cutoff: " << ecParams.kd.getRepeatKmerCutoff() << "\n";
	
	// Bundle the three indices and hand a copy of the bundle to the parameters
	BWTIndexSet indexSet;
	indexSet.pBWT = pBWT;
	indexSet.pRBWT = pRBWT;
	indexSet.pSSA = pSSA;
	ecParams.indices = indexSet;

	
	// Open outfiles and start a timer
	// The discard writer stays NULL when no discard file was requested.
	std::ostream* pWriter = createWriter(opt::outFile);
	std::ostream* pDiscardWriter = (!opt::discardFile.empty() ? createWriter(opt::discardFile) : NULL);
	Timer* pTimer = new Timer(PROGRAM_IDENT);

	// Copy the command-line options into the parameter set
	ecParams.algorithm = opt::algorithm;
	ecParams.kmerLength = opt::kmerLength;
	ecParams.maxLeaves = opt::maxLeaves;
	ecParams.minOverlap = opt::minOverlap;
	ecParams.maxOverlap = opt::maxOverlap;
	ecParams.minKmerLength = opt::minKmerLength;
	ecParams.seedKmerThreshold = opt::seedKmerThreshold;
	ecParams.FMWKmerThreshold = opt::kmerThreshold;
	ecParams.numOfNextTarget = opt::numOfNextTarget;
	ecParams.collectedSeeds = opt::collect;
	ecParams.isSplit = opt::split;
	ecParams.isFirst = opt::isFirst;
	ecParams.maxSeedInterval = opt::maxSeedInterval;
	
	// Print a parameter summary for the selected algorithm. Any algorithm
	// other than PBC_SELF and PBC_HYBRID prints nothing here.
	if(ecParams.algorithm == PBC_SELF)
	{
		std::cout << std::endl << "Correcting PacBio reads for " << opt::readsFile << " using--" << std::endl
		<< "number of threads:\t" << opt::numThreads << std::endl
		<< "large kmer size:\t" << ecParams.kmerLength << std::endl 
		<< "large kmer freq. cutoff:\t" << ecParams.seedKmerThreshold << std::endl
		<< "small kmer size:\t" << ecParams.minKmerLength << std::endl
		<< "small kmer freq. cutoff:\t" << ecParams.FMWKmerThreshold << std::endl
		<< "max leaves:\t" << ecParams.maxLeaves  << std::endl
		<< "max depth:\t1.2~0.8* (length between two seeds +- 20)" << std::endl
		<< "num of next Targets:\t" << ecParams.numOfNextTarget << std::endl;
	}
	else if(ecParams.algorithm == PBC_HYBRID)
	{
		std::cout << std::endl << "Correcting PacBio reads for " << opt::readsFile << " using--" << std::endl
		<< "number of threads:\t" << opt::numThreads << std::endl
		<< "max kmer size:\t" << ecParams.kmerLength << std::endl 
		<< "min kmer size:\t" << ecParams.minKmerLength << std::endl
		<< "seed kmer threshold:\t" << ecParams.seedKmerThreshold << std::endl
		<< "max distance of searching seed:\t2* tendency distance" << std::endl							
		<< "max overlap:\t" <<  ecParams.maxOverlap << std::endl 
		<< "max leaves:\t" << ecParams.maxLeaves  << std::endl
		<< "search depth:\t1.2~0.8* (length between two seeds +- 10)" << std::endl
		<< "kmer threshold:\t" << ecParams.FMWKmerThreshold << std::endl << std::endl;
		
		// computing distance of various continuous matches length (dk)
		// seedWalkDistance gets one entry per length i in [0, kmerLength]:
		// 0 below minKmerLength, otherwise 2*3.8649*pow(2.7183, 0.1239*i).
		// The "i <= ecParams.kmerLength" test is always true inside this loop.
		// NOTE(review): 2.7183 is presumably a rounded value of e - confirm.
		for(int i = 0 ; i <= ecParams.kmerLength ; i++)
		{
			if(i >= ecParams.minKmerLength && i <= ecParams.kmerLength)
			ecParams.seedWalkDistance.push_back(2*3.8649*pow(2.7183,0.1239*i));
			else
			ecParams.seedWalkDistance.push_back(0);
		}
	}
	
	// Setup post-processor
	PacBioCorrectionPostProcess postProcessor(pWriter, pDiscardWriter, ecParams);

	if(opt::numThreads <= 1)
	{
		// Serial mode
		PacBioCorrectionProcess processor(ecParams);

		SequenceProcessFramework::processSequencesSerial<SequenceWorkItem,
		PacBioCorrectionResult,
		PacBioCorrectionProcess,
		PacBioCorrectionPostProcess>(opt::readsFile, &processor, &postProcessor);
	}
	else
	{
		// Parallel mode
		// One heap-allocated processor per requested thread
		std::vector<PacBioCorrectionProcess*> processorVector;
		for(int i = 0; i < opt::numThreads; ++i)
		{
			PacBioCorrectionProcess* pProcessor = new PacBioCorrectionProcess(ecParams);
			processorVector.push_back(pProcessor);
		}

		SequenceProcessFramework::processSequencesParallel<SequenceWorkItem,
		PacBioCorrectionResult,
		PacBioCorrectionProcess,
		PacBioCorrectionPostProcess>(opt::readsFile, processorVector, &postProcessor);

		// Disabled alternative that would dispatch through the OpenMP variant
		// SequenceProcessFramework::processSequencesParallelOpenMP<SequenceWorkItem,
		// PacBioCorrectionResult,
		// PacBioCorrectionProcess,
		// PacBioCorrectionPostProcess>(opt::readsFile, processorVector, &postProcessor);
		
		for(int i = 0; i < opt::numThreads; ++i)
		{
			delete processorVector[i];
		}
	}

	// Release the indices, the timer and the writers
	delete pBWT;
	if(pRBWT != NULL)
	delete pRBWT;

	if(pSSA != NULL)
	delete pSSA;

	delete pTimer;

	delete pWriter;
	if(pDiscardWriter != NULL)
	delete pDiscardWriter;
	
	return 0;
}
Пример #9
0
//
// Main
//
int correctMain(int argc, char** argv)
{
    parseCorrectOptions(argc, argv);

    std::cout << "Correcting sequencing errors for " << opt::readsFile << "\n";

    // Load indices
    BWT* pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate);
    BWT* pRBWT = NULL;
    SampledSuffixArray* pSSA = NULL;

    if(opt::algorithm == ECA_OVERLAP)
        pSSA = new SampledSuffixArray(opt::prefix + SAI_EXT, SSA_FT_SAI);

    BWTIntervalCache* pIntervalCache = new BWTIntervalCache(opt::intervalCacheLength, pBWT);

    BWTIndexSet indexSet;
    indexSet.pBWT = pBWT;
    indexSet.pRBWT = pRBWT;
    indexSet.pSSA = pSSA;
    indexSet.pCache = pIntervalCache;

    // Learn the parameters of the kmer corrector
    if(opt::bLearnKmerParams)
    {
        int threshold = learnKmerParameters(pBWT);
        if(threshold != -1)
            CorrectionThresholds::Instance().setBaseMinSupport(threshold);
    }

    // Open outfiles and start a timer
    std::ostream* pWriter = createWriter(opt::outFile);
    std::ostream* pDiscardWriter = (!opt::discardFile.empty() ? createWriter(opt::discardFile) : NULL);
    Timer* pTimer = new Timer(PROGRAM_IDENT);
    pBWT->printInfo();

    // Set the error correction parameters
    ErrorCorrectParameters ecParams;
    ecParams.pOverlapper = NULL;
    ecParams.indices = indexSet;
    ecParams.algorithm = opt::algorithm;

    ecParams.minOverlap = opt::minOverlap;
    ecParams.numOverlapRounds = opt::numOverlapRounds;
    ecParams.minIdentity = 1.0f - opt::errorRate;
    ecParams.conflictCutoff = opt::conflictCutoff;

    ecParams.numKmerRounds = opt::numKmerRounds;
    ecParams.kmerLength = opt::kmerLength;
    ecParams.printOverlaps = opt::verbose > 0;

    // Setup post-processor
    bool bCollectMetrics = !opt::metricsFile.empty();
    ErrorCorrectPostProcess postProcessor(pWriter, pDiscardWriter, bCollectMetrics);

    if(opt::numThreads <= 1)
    {
        // Serial mode
        ErrorCorrectProcess processor(ecParams);
        SequenceProcessFramework::processSequencesSerial<SequenceWorkItem,
                                 ErrorCorrectResult,
                                 ErrorCorrectProcess,
                                 ErrorCorrectPostProcess>(opt::readsFile, &processor, &postProcessor);
    }
    else
    {
        // Parallel mode
        std::vector<ErrorCorrectProcess*> processorVector;
        for(int i = 0; i < opt::numThreads; ++i)
        {
            ErrorCorrectProcess* pProcessor = new ErrorCorrectProcess(ecParams);
            processorVector.push_back(pProcessor);
        }

        SequenceProcessFramework::processSequencesParallel<SequenceWorkItem,
                                 ErrorCorrectResult,
                                 ErrorCorrectProcess,
                                 ErrorCorrectPostProcess>(opt::readsFile, processorVector, &postProcessor);

        for(int i = 0; i < opt::numThreads; ++i)
        {
            delete processorVector[i];
        }
    }

    if(bCollectMetrics)
    {
        std::ostream* pMetricsWriter = createWriter(opt::metricsFile);
        postProcessor.writeMetrics(pMetricsWriter);
        delete pMetricsWriter;
    }

    delete pBWT;
    delete pIntervalCache;
    if(pRBWT != NULL)
        delete pRBWT;

    if(pSSA != NULL)
        delete pSSA;

    delete pTimer;

    delete pWriter;
    if(pDiscardWriter != NULL)
        delete pDiscardWriter;

    if(opt::numThreads > 1)
        pthread_exit(NULL);

    return 0;
}
Пример #10
0
//
// Main
//
int FMMergeMain(int argc, char** argv)
{
    parseFMMergeOptions(argc, argv);

    BWT* pBWT = new BWT(opt::prefix + BWT_EXT);
    BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT);
    OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT,0.0f, 0,0,true); 
    pOverlapper->setExactModeOverlap(true);
    pOverlapper->setExactModeIrreducible(true);
    Timer* pTimer = new Timer(PROGRAM_IDENT);
    pBWT->printInfo();

    // Construct a bitvector indicating what reads have been used
    // All the processes read from this vector and only the post processor
    // writes to it.
    BitVector markedReads(pBWT->getNumStrings());

    std::ostream* pWriter = createWriter(opt::outFile);
    FMMergePostProcess postProcessor(pWriter, &markedReads);

    if(opt::numThreads <= 1)
    {
        printf("[%s] starting serial-mode read merging\n", PROGRAM_IDENT);
        FMMergeProcess processor(pOverlapper, opt::minOverlap, &markedReads);
        SequenceProcessFramework::processSequencesSerial<SequenceWorkItem,
                                                         FMMergeResult, 
                                                         FMMergeProcess, 
                                                         FMMergePostProcess>(opt::readsFile, &processor, &postProcessor);
    }
    else
    {
        printf("[%s] starting parallel-mode read merging computation with %d threads\n", PROGRAM_IDENT, opt::numThreads);
        
        std::vector<FMMergeProcess*> processorVector;
        for(int i = 0; i < opt::numThreads; ++i)
        {
            FMMergeProcess* pProcessor = new FMMergeProcess(pOverlapper, opt::minOverlap, &markedReads);
            processorVector.push_back(pProcessor);
        }

        SequenceProcessFramework::processSequencesParallel<SequenceWorkItem,
                                                         FMMergeResult, 
                                                         FMMergeProcess, 
                                                         FMMergePostProcess>(opt::readsFile, processorVector, &postProcessor);
        
        for(size_t i = 0; i < processorVector.size(); ++i)
        {
            delete processorVector[i];
            processorVector[i] = NULL;
        }
    }

    // Check that every bit was set in the bit vector
    size_t numSet = 0;
    size_t numTotal = pBWT->getNumStrings();
    for(size_t i = 0; i < numTotal; ++i)
    {
        if(markedReads.test(i))
            ++numSet;
    }

    // Get the number of strings in the BWT, this is used to pre-allocated the read table
    delete pOverlapper;
    delete pBWT; 
    delete pRBWT;
    delete pWriter;

    // Cleanup
    delete pTimer;
    if(opt::numThreads > 1)
        pthread_exit(NULL);

    return 0;
}