// Compute the hits for each read in the SeqReader file with threading // The way this works is we create a vector of numThreads OverlapProcess pointers and // pass this to the SequenceProcessFramework which wraps the processes // in threads and distributes the reads to each thread. // The number of reads processsed is returned size_t computeHitsParallel(int numThreads, const std::string& prefix, const std::string& readsFile, const OverlapAlgorithm* pOverlapper, int minOverlap, StringVector& filenameVec, std::ostream* pASQGWriter) { std::string filename = prefix + HITS_EXT + GZIP_EXT; std::vector<OverlapProcess*> processorVector; for(int i = 0; i < numThreads; ++i) { std::stringstream ss; ss << prefix << "-thread" << i << HITS_EXT << GZIP_EXT; std::string outfile = ss.str(); filenameVec.push_back(outfile); OverlapProcess* pProcessor = new OverlapProcess(outfile, pOverlapper, minOverlap); processorVector.push_back(pProcessor); } // The post processing is performed serially so only one post processor is created OverlapPostProcess postProcessor(pASQGWriter, pOverlapper); size_t numProcessed = SequenceProcessFramework::processSequencesParallel<SequenceWorkItem, OverlapResult, OverlapProcess, OverlapPostProcess>(readsFile, processorVector, &postProcessor); for(int i = 0; i < numThreads; ++i) delete processorVector[i]; return numProcessed; }
// // Main // int statsMain(int argc, char** argv) { parseStatsOptions(argc, argv); Timer* pTimer = new Timer(PROGRAM_IDENT); BWT* pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate); BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT, opt::sampleRate); if(opt::bPrintRunLengths) { pBWT->printInfo(); pBWT->printRunLengths(); } SeqReader reader(opt::readsFile); StatsPostProcess postProcessor(opt::bPrintKmerDist); if(opt::numThreads <= 1) { // Serial mode StatsProcess processor(pBWT, pRBWT, opt::kmerLength, opt::minOverlap, opt::branchCutoff, opt::bNoOverlap); SequenceProcessFramework::processSequencesSerial<SequenceWorkItem, StatsResult, StatsProcess, StatsPostProcess>(reader, &processor, &postProcessor, opt::numReads); } else { // Parallel mode std::vector<StatsProcess*> processorVector; for(int i = 0; i < opt::numThreads; ++i) { StatsProcess* pProcessor = new StatsProcess(pBWT, pRBWT, opt::kmerLength, opt::minOverlap, opt::branchCutoff, opt::bNoOverlap); processorVector.push_back(pProcessor); } SequenceProcessFramework::processSequencesParallel<SequenceWorkItem, StatsResult, StatsProcess, StatsPostProcess>(reader, processorVector, &postProcessor, opt::numReads); for(int i = 0; i < opt::numThreads; ++i) { delete processorVector[i]; } } delete pBWT; delete pRBWT; delete pTimer; if(opt::numThreads > 1) pthread_exit(NULL); return 0; }
// Compute the hits for each read in the input file without threading // Return the number of reads processed size_t computeHitsSerial(const std::string& prefix, const std::string& readsFile, const OverlapAlgorithm* pOverlapper, int minOverlap, StringVector& filenameVec, std::ostream* pASQGWriter) { std::string filename = prefix + HITS_EXT + GZIP_EXT; filenameVec.push_back(filename); OverlapProcess processor(filename, pOverlapper, minOverlap); OverlapPostProcess postProcessor(pASQGWriter, pOverlapper); size_t numProcessed = SequenceProcessFramework::processSequencesSerial<SequenceWorkItem, OverlapResult, OverlapProcess, OverlapPostProcess>(readsFile, &processor, &postProcessor); return numProcessed; }
// Compute the gap array for the first n items in pReader void computeGapArray(SeqReader* pReader, size_t n, const BWT* pBWT, bool doReverse, int numThreads, GapArray* pGapArray, bool removeMode, size_t& num_strings_read, size_t& num_symbols_read) { // Create the gap array size_t gap_array_size = pBWT->getBWLen() + 1; pGapArray->resize(gap_array_size); // The rank processor calculates the rank of every suffix of a given sequence // and returns a vector of ranks. The postprocessor takes in the vector // and updates the gap array RankPostProcess postProcessor(pGapArray); size_t numProcessed = 0; if(numThreads <= 1) { RankProcess processor(pBWT, pGapArray, doReverse, removeMode); numProcessed = SequenceProcessFramework::processSequencesSerial<SequenceWorkItem, RankResult, RankProcess, RankPostProcess>(*pReader, &processor, &postProcessor, n); } else { typedef std::vector<RankProcess*> RankProcessVector; RankProcessVector rankProcVec; for(int i = 0; i < numThreads; ++i) { RankProcess* pProcess = new RankProcess(pBWT, pGapArray, doReverse, removeMode); rankProcVec.push_back(pProcess); } numProcessed = SequenceProcessFramework::processSequencesParallel<SequenceWorkItem, RankResult, RankProcess, RankPostProcess>(*pReader, rankProcVec, &postProcessor, n); for(int i = 0; i < numThreads; ++i) delete rankProcVec[i]; } num_strings_read = postProcessor.getNumStringsProcessed(); num_symbols_read = postProcessor.getNumSymbolsProcessed(); assert(n == (size_t)-1 || (numProcessed == n)); }
void cluster() { BWT* pBWT = new BWT(opt::prefix + BWT_EXT); BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT); OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT,opt::errorRate, opt::seedLength, opt::seedStride, true); pOverlapper->setExactModeOverlap(opt::errorRate < 0.001f); pOverlapper->setExactModeIrreducible(opt::errorRate < 0.001f); BitVector markedReads(pBWT->getNumStrings()); std::string preclustersFile = opt::outFile + ".preclusters"; std::ostream* pPreWriter = createWriter(preclustersFile); ClusterPostProcess postProcessor(pPreWriter, opt::minSize, &markedReads); // Set the cluster parameters ClusterParameters parameters; parameters.pOverlapper = pOverlapper; parameters.minOverlap = opt::minOverlap; parameters.maxClusterSize = opt::maxSize; parameters.maxIterations = opt::maxIterations; parameters.pMarkedReads = &markedReads; // Read the limit kmer sequences, if provided std::set<std::string>* pLimitKmers = NULL; if(!opt::limitFile.empty()) { // Read in the limit sequences pLimitKmers = new std::set<std::string>; readLimitKmers(pLimitKmers); parameters.pLimitKmers = pLimitKmers; parameters.limitK = opt::limitKmer; } else { parameters.pLimitKmers = NULL; parameters.limitK = 0; } // Make pre-clusters from the reads if(opt::numThreads <= 1) { printf("[%s] starting serial-mode read clustering\n", PROGRAM_IDENT); ClusterProcess processor(parameters); // If the extend file is empty, build new clusters if(opt::extendFile.empty()) { PROCESS_CLUSTER_SERIAL(opt::readsFile, &processor, &postProcessor); } else { // Process a set of preexisting clusters ClusterReader clusterReader(opt::extendFile); PROCESS_EXTEND_SERIAL(clusterReader, &processor, &postProcessor); } } else { printf("[%s] starting parallel-mode read clustering computation with %d threads\n", PROGRAM_IDENT, opt::numThreads); std::vector<ClusterProcess*> processorVector; for(int i = 0; i < opt::numThreads; ++i) { ClusterProcess* pProcessor = new ClusterProcess(parameters); processorVector.push_back(pProcessor); } if(opt::extendFile.empty()) { PROCESS_CLUSTER_PARALLEL(opt::readsFile, processorVector, &postProcessor); } else { ClusterReader clusterReader(opt::extendFile); PROCESS_EXTEND_PARALLEL(clusterReader, processorVector, &postProcessor); } for(size_t i = 0; i < processorVector.size(); ++i) { delete processorVector[i]; processorVector[i] = NULL; } } delete pPreWriter; delete pBWT; delete pRBWT; delete pOverlapper; // Deallocate limit kmers if(pLimitKmers != NULL) delete pLimitKmers; // Open the preclusters file and convert them to read names SuffixArray* pFwdSAI = new SuffixArray(opt::prefix + SAI_EXT); ReadInfoTable* pRIT = new ReadInfoTable(opt::readsFile, pFwdSAI->getNumStrings()); size_t seedIdx = 0; std::istream* pPreReader = createReader(preclustersFile); std::ostream* pClusterWriter = createWriter(opt::outFile); std::string line; while(getline(*pPreReader,line)) { std::stringstream parser(line); std::string clusterName; std::string readSequence; size_t clusterSize; int64_t lowIdx; int64_t highIdx; parser >> clusterName >> clusterSize >> readSequence >> lowIdx >> highIdx; if(lowIdx > highIdx) { // This is an extra read that is not present in the FM-index // Output a record with a fake read ID *pClusterWriter << clusterName << "\t" << clusterSize << "\tseed-" << seedIdx++ << "\t" << readSequence << "\n"; } else { for(int64_t i = lowIdx; i <= highIdx; ++i) { const ReadInfo& targetInfo = pRIT->getReadInfo(pFwdSAI->get(i).getID()); std::string readName = targetInfo.id; *pClusterWriter << clusterName << "\t" << clusterSize << "\t" << readName << "\t" << readSequence << "\n"; } } } unlink(preclustersFile.c_str()); delete pFwdSAI; delete pRIT; delete pPreReader; delete pClusterWriter; }
// // Main // int FMindexWalkMain(int argc, char** argv) { parseFMWalkOptions(argc, argv); // Set the error correction parameters FMIndexWalkParameters ecParams; BWT *pBWT, *pRBWT; SampledSuffixArray* pSSA; // Load indices #pragma omp parallel { #pragma omp single nowait { //Initialization of large BWT takes some time, pass the disk to next job std::cout << std::endl << "Loading BWT: " << opt::prefix + BWT_EXT << "\n"; pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate); } #pragma omp single nowait { std::cout << "Loading RBWT: " << opt::prefix + RBWT_EXT << "\n"; pRBWT = new BWT(opt::prefix + RBWT_EXT, opt::sampleRate); } #pragma omp single nowait { std::cout << "Loading Sampled Suffix Array: " << opt::prefix + SAI_EXT << "\n"; pSSA = new SampledSuffixArray(opt::prefix + SAI_EXT, SSA_FT_SAI); } } BWTIndexSet indexSet; indexSet.pBWT = pBWT; indexSet.pRBWT = pRBWT; indexSet.pSSA = pSSA; ecParams.indices = indexSet; // Sample 100000 kmer counts into KmerDistribution from reverse BWT // Don't sample from forward BWT as Illumina reads are bad at the 3' end ecParams.kd = BWTAlgorithms::sampleKmerCounts(opt::minOverlap, 100000, pRBWT); ecParams.kd.computeKDAttributes(); // const size_t RepeatKmerFreq = ecParams.kd.getCutoffForProportion(0.95); std::cout << "Median kmer frequency: " <<ecParams.kd.getMedian() << "\t Std: " << ecParams.kd.getSdv() <<"\t 95% kmer frequency: " << ecParams.kd.getCutoffForProportion(0.95) << "\t Repeat frequency cutoff: " << ecParams.kd.getRepeatKmerCutoff() << "\n"; // Open outfiles and start a timer std::ostream* pWriter = createWriter(opt::outFile); std::ostream* pDiscardWriter = (!opt::discardFile.empty() ? createWriter(opt::discardFile) : NULL); Timer* pTimer = new Timer(PROGRAM_IDENT); ecParams.algorithm = opt::algorithm; ecParams.kmerLength = opt::kmerLength; ecParams.printOverlaps = opt::verbose > 0; ecParams.maxLeaves = opt::maxLeaves; ecParams.maxInsertSize = opt::maxInsertSize; ecParams.minOverlap = opt::minOverlap; ecParams.maxOverlap = opt::maxOverlap; // Setup post-processor FMIndexWalkPostProcess postProcessor(pWriter, pDiscardWriter, ecParams); std::cout << "Merge paired end reads into long reads for " << opt::readsFile << " using \n" << "min overlap=" << ecParams.minOverlap << "\t" << "max overlap=" << ecParams.maxOverlap << "\t" << "max leaves=" << opt::maxLeaves << "\t" << "max Insert size=" << opt::maxInsertSize << "\t" << "kmer size=" << opt::kmerLength << "\n\n"; if(opt::numThreads <= 1) { // Serial mode FMIndexWalkProcess processor(ecParams); if (ecParams.algorithm == FMW_HYBRID || ecParams.algorithm == FMW_MERGE) SequenceProcessFramework::processSequencesSerial<SequenceWorkItemPair, FMIndexWalkResult, FMIndexWalkProcess, FMIndexWalkPostProcess>(opt::readsFile, &processor, &postProcessor); else SequenceProcessFramework::processSequencesSerial<SequenceWorkItem, FMIndexWalkResult, FMIndexWalkProcess, FMIndexWalkPostProcess>(opt::readsFile, &processor, &postProcessor); } else { // Parallel mode std::vector<FMIndexWalkProcess*> processorVector; for(int i = 0; i < opt::numThreads; ++i) { FMIndexWalkProcess* pProcessor = new FMIndexWalkProcess(ecParams); processorVector.push_back(pProcessor); } if (ecParams.algorithm == FMW_HYBRID || ecParams.algorithm == FMW_MERGE) SequenceProcessFramework::processSequencesParallel<SequenceWorkItemPair, FMIndexWalkResult, FMIndexWalkProcess, FMIndexWalkPostProcess>(opt::readsFile, processorVector, &postProcessor); else SequenceProcessFramework::processSequencesParallel<SequenceWorkItem, FMIndexWalkResult, FMIndexWalkProcess, FMIndexWalkPostProcess>(opt::readsFile, processorVector, &postProcessor); for(int i = 0; i < opt::numThreads; ++i) { delete processorVector[i]; } } delete pBWT; if(pRBWT != NULL) delete pRBWT; if(pSSA != NULL) delete pSSA; delete pTimer; delete pWriter; if(pDiscardWriter != NULL) delete pDiscardWriter; return 0; }
// // Main // int correctMain(int argc, char** argv) { parseCorrectOptions(argc, argv); BWT* pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate); BWT* pRBWT = NULL; // If the correction mode is k-mer only, then do not load the reverse // BWT as it is not needed if(opt::algorithm != ECA_KMER) pRBWT = new BWT(opt::prefix + RBWT_EXT, opt::sampleRate); BWTIntervalCache intervalCache(opt::intervalCacheLength, pBWT); OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, NULL, opt::errorRate, opt::seedLength, opt::seedStride, false, opt::branchCutoff); // Learn the parameters of the kmer corrector if(opt::bLearnKmerParams) { int threshold = learnKmerParameters(pBWT); if(threshold != -1) CorrectionThresholds::Instance().setBaseMinSupport(threshold); } // Open outfiles and start a timer std::ostream* pWriter = createWriter(opt::outFile); std::ostream* pDiscardWriter = (!opt::discardFile.empty() ? createWriter(opt::discardFile) : NULL); Timer* pTimer = new Timer(PROGRAM_IDENT); pBWT->printInfo(); // Set the error correction parameters ErrorCorrectParameters ecParams; ecParams.pOverlapper = pOverlapper; ecParams.pIntervalCache = &intervalCache; ecParams.algorithm = opt::algorithm; ecParams.minOverlap = opt::minOverlap; ecParams.numOverlapRounds = opt::numOverlapRounds; ecParams.conflictCutoff = opt::conflictCutoff; ecParams.numKmerRounds = opt::numKmerRounds; ecParams.kmerLength = opt::kmerLength; ecParams.printOverlaps = opt::verbose > 1; // Setup post-processor bool bCollectMetrics = !opt::metricsFile.empty(); ErrorCorrectPostProcess postProcessor(pWriter, pDiscardWriter, bCollectMetrics); if(opt::numThreads <= 1) { // Serial mode ErrorCorrectProcess processor(ecParams); SequenceProcessFramework::processSequencesSerial<SequenceWorkItem, ErrorCorrectResult, ErrorCorrectProcess, ErrorCorrectPostProcess>(opt::readsFile, &processor, &postProcessor); } else { // Parallel mode std::vector<ErrorCorrectProcess*> processorVector; for(int i = 0; i < opt::numThreads; ++i) { ErrorCorrectProcess* pProcessor = new ErrorCorrectProcess(ecParams); processorVector.push_back(pProcessor); } SequenceProcessFramework::processSequencesParallel<SequenceWorkItem, ErrorCorrectResult, ErrorCorrectProcess, ErrorCorrectPostProcess>(opt::readsFile, processorVector, &postProcessor); for(int i = 0; i < opt::numThreads; ++i) { delete processorVector[i]; } } if(bCollectMetrics) { std::ostream* pMetricsWriter = createWriter(opt::metricsFile); postProcessor.writeMetrics(pMetricsWriter); delete pMetricsWriter; } delete pBWT; if(pRBWT != NULL) delete pRBWT; delete pOverlapper; delete pTimer; delete pWriter; if(pDiscardWriter != NULL) delete pDiscardWriter; if(opt::numThreads > 1) pthread_exit(NULL); return 0; }
// // Main // int PacBioCorrectionMain(int argc, char** argv) { parsePacBioCorrectionOptions(argc, argv); // Set the error correction parameters PacBioCorrectionParameters ecParams; BWT *pBWT, *pRBWT; SampledSuffixArray* pSSA; // Load indices #pragma omp parallel { #pragma omp single nowait { //Initialization of large BWT takes some time, pass the disk to next job std::cout << std::endl << "Loading BWT: " << opt::prefix + BWT_EXT << "\n"; pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate); } #pragma omp single nowait { std::cout << "Loading RBWT: " << opt::prefix + RBWT_EXT << "\n"; pRBWT = new BWT(opt::prefix + RBWT_EXT, opt::sampleRate); } #pragma omp single nowait { std::cout << "Loading Sampled Suffix Array: " << opt::prefix + SAI_EXT << "\n"; pSSA = new SampledSuffixArray(opt::prefix + SAI_EXT, SSA_FT_SAI); } } // Sample 100000 kmer counts into KmerDistribution from reverse BWT // Don't sample from forward BWT as Illumina reads are bad at the 3' end // ecParams.kd = BWTAlgorithms::sampleKmerCounts(opt::kmerLength, 100000, pBWT); // ecParams.kd.computeKDAttributes(); // ecParams.kd.print(100); // const size_t RepeatKmerFreq = ecParams.kd.getCutoffForProportion(0.95); // std::cout << "Median kmer frequency: " <<ecParams.kd.getMedian() << "\t Std: " << ecParams.kd.getSdv() // <<"\t 95% kmer frequency: " << ecParams.kd.getCutoffForProportion(0.95) // << "\t Repeat frequency cutoff: " << ecParams.kd.getRepeatKmerCutoff() << "\n"; BWTIndexSet indexSet; indexSet.pBWT = pBWT; indexSet.pRBWT = pRBWT; indexSet.pSSA = pSSA; ecParams.indices = indexSet; // Open outfiles and start a timer std::ostream* pWriter = createWriter(opt::outFile); std::ostream* pDiscardWriter = (!opt::discardFile.empty() ? createWriter(opt::discardFile) : NULL); Timer* pTimer = new Timer(PROGRAM_IDENT); ecParams.algorithm = opt::algorithm; ecParams.kmerLength = opt::kmerLength; ecParams.maxLeaves = opt::maxLeaves; ecParams.minOverlap = opt::minOverlap; ecParams.maxOverlap = opt::maxOverlap; ecParams.minKmerLength = opt::minKmerLength; ecParams.seedKmerThreshold = opt::seedKmerThreshold; ecParams.FMWKmerThreshold = opt::kmerThreshold; ecParams.numOfNextTarget = opt::numOfNextTarget; ecParams.collectedSeeds = opt::collect; ecParams.isSplit = opt::split; ecParams.isFirst = opt::isFirst; ecParams.maxSeedInterval = opt::maxSeedInterval; if(ecParams.algorithm == PBC_SELF) { std::cout << std::endl << "Correcting PacBio reads for " << opt::readsFile << " using--" << std::endl << "number of threads:\t" << opt::numThreads << std::endl << "large kmer size:\t" << ecParams.kmerLength << std::endl << "large kmer freq. cutoff:\t" << ecParams.seedKmerThreshold << std::endl << "small kmer size:\t" << ecParams.minKmerLength << std::endl << "small kmer freq. cutoff:\t" << ecParams.FMWKmerThreshold << std::endl << "max leaves:\t" << ecParams.maxLeaves << std::endl << "max depth:\t1.2~0.8* (length between two seeds +- 20)" << std::endl << "num of next Targets:\t" << ecParams.numOfNextTarget << std::endl; } else if(ecParams.algorithm == PBC_HYBRID) { std::cout << std::endl << "Correcting PacBio reads for " << opt::readsFile << " using--" << std::endl << "number of threads:\t" << opt::numThreads << std::endl << "max kmer size:\t" << ecParams.kmerLength << std::endl << "min kmer size:\t" << ecParams.minKmerLength << std::endl << "seed kmer threshold:\t" << ecParams.seedKmerThreshold << std::endl << "max distance of searching seed:\t2* tendency distance" << std::endl << "max overlap:\t" << ecParams.maxOverlap << std::endl << "max leaves:\t" << ecParams.maxLeaves << std::endl << "search depth:\t1.2~0.8* (length between two seeds +- 10)" << std::endl << "kmer threshold:\t" << ecParams.FMWKmerThreshold << std::endl << std::endl; // computing distance of various continuous matches length (dk) for(int i = 0 ; i <= ecParams.kmerLength ; i++) { if(i >= ecParams.minKmerLength && i <= ecParams.kmerLength) ecParams.seedWalkDistance.push_back(2*3.8649*pow(2.7183,0.1239*i)); else ecParams.seedWalkDistance.push_back(0); } } // Setup post-processor PacBioCorrectionPostProcess postProcessor(pWriter, pDiscardWriter, ecParams); if(opt::numThreads <= 1) { // Serial mode PacBioCorrectionProcess processor(ecParams); SequenceProcessFramework::processSequencesSerial<SequenceWorkItem, PacBioCorrectionResult, PacBioCorrectionProcess, PacBioCorrectionPostProcess>(opt::readsFile, &processor, &postProcessor); } else { // Parallel mode std::vector<PacBioCorrectionProcess*> processorVector; for(int i = 0; i < opt::numThreads; ++i) { PacBioCorrectionProcess* pProcessor = new PacBioCorrectionProcess(ecParams); processorVector.push_back(pProcessor); } SequenceProcessFramework::processSequencesParallel<SequenceWorkItem, PacBioCorrectionResult, PacBioCorrectionProcess, PacBioCorrectionPostProcess>(opt::readsFile, processorVector, &postProcessor); // SequenceProcessFramework::processSequencesParallelOpenMP<SequenceWorkItem, // PacBioCorrectionResult, // PacBioCorrectionProcess, // PacBioCorrectionPostProcess>(opt::readsFile, processorVector, &postProcessor); for(int i = 0; i < opt::numThreads; ++i) { delete processorVector[i]; } } delete pBWT; if(pRBWT != NULL) delete pRBWT; if(pSSA != NULL) delete pSSA; delete pTimer; delete pWriter; if(pDiscardWriter != NULL) delete pDiscardWriter; return 0; }
// // Main // int correctMain(int argc, char** argv) { parseCorrectOptions(argc, argv); std::cout << "Correcting sequencing errors for " << opt::readsFile << "\n"; // Load indices BWT* pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate); BWT* pRBWT = NULL; SampledSuffixArray* pSSA = NULL; if(opt::algorithm == ECA_OVERLAP) pSSA = new SampledSuffixArray(opt::prefix + SAI_EXT, SSA_FT_SAI); BWTIntervalCache* pIntervalCache = new BWTIntervalCache(opt::intervalCacheLength, pBWT); BWTIndexSet indexSet; indexSet.pBWT = pBWT; indexSet.pRBWT = pRBWT; indexSet.pSSA = pSSA; indexSet.pCache = pIntervalCache; // Learn the parameters of the kmer corrector if(opt::bLearnKmerParams) { int threshold = learnKmerParameters(pBWT); if(threshold != -1) CorrectionThresholds::Instance().setBaseMinSupport(threshold); } // Open outfiles and start a timer std::ostream* pWriter = createWriter(opt::outFile); std::ostream* pDiscardWriter = (!opt::discardFile.empty() ? createWriter(opt::discardFile) : NULL); Timer* pTimer = new Timer(PROGRAM_IDENT); pBWT->printInfo(); // Set the error correction parameters ErrorCorrectParameters ecParams; ecParams.pOverlapper = NULL; ecParams.indices = indexSet; ecParams.algorithm = opt::algorithm; ecParams.minOverlap = opt::minOverlap; ecParams.numOverlapRounds = opt::numOverlapRounds; ecParams.minIdentity = 1.0f - opt::errorRate; ecParams.conflictCutoff = opt::conflictCutoff; ecParams.numKmerRounds = opt::numKmerRounds; ecParams.kmerLength = opt::kmerLength; ecParams.printOverlaps = opt::verbose > 0; // Setup post-processor bool bCollectMetrics = !opt::metricsFile.empty(); ErrorCorrectPostProcess postProcessor(pWriter, pDiscardWriter, bCollectMetrics); if(opt::numThreads <= 1) { // Serial mode ErrorCorrectProcess processor(ecParams); SequenceProcessFramework::processSequencesSerial<SequenceWorkItem, ErrorCorrectResult, ErrorCorrectProcess, ErrorCorrectPostProcess>(opt::readsFile, &processor, &postProcessor); } else { // Parallel mode std::vector<ErrorCorrectProcess*> processorVector; for(int i = 0; i < opt::numThreads; ++i) { ErrorCorrectProcess* pProcessor = new ErrorCorrectProcess(ecParams); processorVector.push_back(pProcessor); } SequenceProcessFramework::processSequencesParallel<SequenceWorkItem, ErrorCorrectResult, ErrorCorrectProcess, ErrorCorrectPostProcess>(opt::readsFile, processorVector, &postProcessor); for(int i = 0; i < opt::numThreads; ++i) { delete processorVector[i]; } } if(bCollectMetrics) { std::ostream* pMetricsWriter = createWriter(opt::metricsFile); postProcessor.writeMetrics(pMetricsWriter); delete pMetricsWriter; } delete pBWT; delete pIntervalCache; if(pRBWT != NULL) delete pRBWT; if(pSSA != NULL) delete pSSA; delete pTimer; delete pWriter; if(pDiscardWriter != NULL) delete pDiscardWriter; if(opt::numThreads > 1) pthread_exit(NULL); return 0; }
// // Main // int FMMergeMain(int argc, char** argv) { parseFMMergeOptions(argc, argv); BWT* pBWT = new BWT(opt::prefix + BWT_EXT); BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT); OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT,0.0f, 0,0,true); pOverlapper->setExactModeOverlap(true); pOverlapper->setExactModeIrreducible(true); Timer* pTimer = new Timer(PROGRAM_IDENT); pBWT->printInfo(); // Construct a bitvector indicating what reads have been used // All the processes read from this vector and only the post processor // writes to it. BitVector markedReads(pBWT->getNumStrings()); std::ostream* pWriter = createWriter(opt::outFile); FMMergePostProcess postProcessor(pWriter, &markedReads); if(opt::numThreads <= 1) { printf("[%s] starting serial-mode read merging\n", PROGRAM_IDENT); FMMergeProcess processor(pOverlapper, opt::minOverlap, &markedReads); SequenceProcessFramework::processSequencesSerial<SequenceWorkItem, FMMergeResult, FMMergeProcess, FMMergePostProcess>(opt::readsFile, &processor, &postProcessor); } else { printf("[%s] starting parallel-mode read merging computation with %d threads\n", PROGRAM_IDENT, opt::numThreads); std::vector<FMMergeProcess*> processorVector; for(int i = 0; i < opt::numThreads; ++i) { FMMergeProcess* pProcessor = new FMMergeProcess(pOverlapper, opt::minOverlap, &markedReads); processorVector.push_back(pProcessor); } SequenceProcessFramework::processSequencesParallel<SequenceWorkItem, FMMergeResult, FMMergeProcess, FMMergePostProcess>(opt::readsFile, processorVector, &postProcessor); for(size_t i = 0; i < processorVector.size(); ++i) { delete processorVector[i]; processorVector[i] = NULL; } } // Check that every bit was set in the bit vector size_t numSet = 0; size_t numTotal = pBWT->getNumStrings(); for(size_t i = 0; i < numTotal; ++i) { if(markedReads.test(i)) ++numSet; } // Get the number of strings in the BWT, this is used to pre-allocated the read table delete pOverlapper; delete pBWT; delete pRBWT; delete pWriter; // Cleanup delete pTimer; if(opt::numThreads > 1) pthread_exit(NULL); return 0; }