// // Main // int statsMain(int argc, char** argv) { parseStatsOptions(argc, argv); Timer* pTimer = new Timer(PROGRAM_IDENT); BWT* pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate); BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT, opt::sampleRate); if(opt::bPrintRunLengths) { pBWT->printInfo(); pBWT->printRunLengths(); } SeqReader reader(opt::readsFile); StatsPostProcess postProcessor(opt::bPrintKmerDist); if(opt::numThreads <= 1) { // Serial mode StatsProcess processor(pBWT, pRBWT, opt::kmerLength, opt::minOverlap, opt::branchCutoff, opt::bNoOverlap); SequenceProcessFramework::processSequencesSerial<SequenceWorkItem, StatsResult, StatsProcess, StatsPostProcess>(reader, &processor, &postProcessor, opt::numReads); } else { // Parallel mode std::vector<StatsProcess*> processorVector; for(int i = 0; i < opt::numThreads; ++i) { StatsProcess* pProcessor = new StatsProcess(pBWT, pRBWT, opt::kmerLength, opt::minOverlap, opt::branchCutoff, opt::bNoOverlap); processorVector.push_back(pProcessor); } SequenceProcessFramework::processSequencesParallel<SequenceWorkItem, StatsResult, StatsProcess, StatsPostProcess>(reader, processorVector, &postProcessor, opt::numReads); for(int i = 0; i < opt::numThreads; ++i) { delete processorVector[i]; } } delete pBWT; delete pRBWT; delete pTimer; if(opt::numThreads > 1) pthread_exit(NULL); return 0; }
int genSSAMain(int argc, char** argv) { Timer t("sga gen-ssa"); parseGenSSAOptions(argc, argv); BWT* pBWT = new BWT(opt::prefix + BWT_EXT); ReadInfoTable* pRIT = new ReadInfoTable(opt::readsFile, pBWT->getNumStrings(), RIO_NUMERICID); pBWT->printInfo(); SampledSuffixArray* pSSA = new SampledSuffixArray(); pSSA->build(pBWT, pRIT, opt::sampleRate); pSSA->printInfo(); pSSA->writeSSA(opt::prefix + SSA_EXT); if(opt::validate) pSSA->validate(opt::readsFile, pBWT); delete pBWT; delete pRIT; delete pSSA; return 0; }
void CompTool::search_forward_matches(const string seq1_name, const string seq2_name, int* SA, BWT& bwt, const int kmer_size, const int slide_letters, const int max_num_matches){ stringstream out_file; out_file << seq1_name << "__" << seq2_name << ".match." << kmer_size << ".forward"; ofstream ofs(out_file.str().c_str()); ofs << "#" << seq2_name << "\t" << seq1_name << endl; // header for(int i = 0; i < seq2_size_ - kmer_size; i += slide_letters){ int8_t* query = &seq2_[i]; int lb, ub; // lower- and upper-bound of matches in suffix array bwt.search(query, kmer_size, lb, ub); if(lb <= ub){ for(int j = lb; j <= ub; j++){ if(j == lb + max_num_matches) break; ofs << i << "\t" << SA[j] << endl; } } } }
void CompTool::search_reverse_matches(const string seq1_name, const string seq2_name, int* SA, BWT& bwt, const int kmer_size, const int slide_letters, const int max_num_matches){ stringstream out_file; out_file << seq1_name << "__" << seq2_name << ".match." << kmer_size << ".reverse"; ofstream ofs(out_file.str().c_str()); ofs << "#" << seq2_name << "\t" << seq1_name << endl; // header int8_t* query = new int8_t[kmer_size]; for(int i = kmer_size-1; i < seq2_size_ - 1; i += slide_letters){ // convert k-mers to reverse complements for(int j = 0; j < kmer_size; j++) query[j] = num_char_ - seq2_[i-j]; int lb, ub; // lower- and upper-bound of matches in suffix array bwt.search(query, kmer_size, lb, ub); if(lb <= ub){ for(int j = lb; j <= ub; j++){ if(j == lb + max_num_matches) break; ofs << i << "\t" << SA[j] << endl; } } } delete[] query; }
void cluster() { BWT* pBWT = new BWT(opt::prefix + BWT_EXT); BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT); OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT,opt::errorRate, opt::seedLength, opt::seedStride, true); pOverlapper->setExactModeOverlap(opt::errorRate < 0.001f); pOverlapper->setExactModeIrreducible(opt::errorRate < 0.001f); BitVector markedReads(pBWT->getNumStrings()); std::string preclustersFile = opt::outFile + ".preclusters"; std::ostream* pPreWriter = createWriter(preclustersFile); ClusterPostProcess postProcessor(pPreWriter, opt::minSize, &markedReads); // Set the cluster parameters ClusterParameters parameters; parameters.pOverlapper = pOverlapper; parameters.minOverlap = opt::minOverlap; parameters.maxClusterSize = opt::maxSize; parameters.maxIterations = opt::maxIterations; parameters.pMarkedReads = &markedReads; // Read the limit kmer sequences, if provided std::set<std::string>* pLimitKmers = NULL; if(!opt::limitFile.empty()) { // Read in the limit sequences pLimitKmers = new std::set<std::string>; readLimitKmers(pLimitKmers); parameters.pLimitKmers = pLimitKmers; parameters.limitK = opt::limitKmer; } else { parameters.pLimitKmers = NULL; parameters.limitK = 0; } // Make pre-clusters from the reads if(opt::numThreads <= 1) { printf("[%s] starting serial-mode read clustering\n", PROGRAM_IDENT); ClusterProcess processor(parameters); // If the extend file is empty, build new clusters if(opt::extendFile.empty()) { PROCESS_CLUSTER_SERIAL(opt::readsFile, &processor, &postProcessor); } else { // Process a set of preexisting clusters ClusterReader clusterReader(opt::extendFile); PROCESS_EXTEND_SERIAL(clusterReader, &processor, &postProcessor); } } else { printf("[%s] starting parallel-mode read clustering computation with %d threads\n", PROGRAM_IDENT, opt::numThreads); std::vector<ClusterProcess*> processorVector; for(int i = 0; i < opt::numThreads; ++i) { ClusterProcess* pProcessor = new ClusterProcess(parameters); processorVector.push_back(pProcessor); } if(opt::extendFile.empty()) { PROCESS_CLUSTER_PARALLEL(opt::readsFile, processorVector, &postProcessor); } else { ClusterReader clusterReader(opt::extendFile); PROCESS_EXTEND_PARALLEL(clusterReader, processorVector, &postProcessor); } for(size_t i = 0; i < processorVector.size(); ++i) { delete processorVector[i]; processorVector[i] = NULL; } } delete pPreWriter; delete pBWT; delete pRBWT; delete pOverlapper; // Deallocate limit kmers if(pLimitKmers != NULL) delete pLimitKmers; // Open the preclusters file and convert them to read names SuffixArray* pFwdSAI = new SuffixArray(opt::prefix + SAI_EXT); ReadInfoTable* pRIT = new ReadInfoTable(opt::readsFile, pFwdSAI->getNumStrings()); size_t seedIdx = 0; std::istream* pPreReader = createReader(preclustersFile); std::ostream* pClusterWriter = createWriter(opt::outFile); std::string line; while(getline(*pPreReader,line)) { std::stringstream parser(line); std::string clusterName; std::string readSequence; size_t clusterSize; int64_t lowIdx; int64_t highIdx; parser >> clusterName >> clusterSize >> readSequence >> lowIdx >> highIdx; if(lowIdx > highIdx) { // This is an extra read that is not present in the FM-index // Output a record with a fake read ID *pClusterWriter << clusterName << "\t" << clusterSize << "\tseed-" << seedIdx++ << "\t" << readSequence << "\n"; } else { for(int64_t i = lowIdx; i <= highIdx; ++i) { const ReadInfo& targetInfo = pRIT->getReadInfo(pFwdSAI->get(i).getID()); std::string readName = targetInfo.id; *pClusterWriter << clusterName << "\t" << clusterSize << "\t" << readName << "\t" << readSequence << "\n"; } } } unlink(preclustersFile.c_str()); delete pFwdSAI; delete pRIT; delete pPreReader; delete pClusterWriter; }
// // Main // int filterMain(int argc, char** argv) { parseFilterOptions(argc, argv); Timer* pTimer = new Timer(PROGRAM_IDENT); BWT* pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate); BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT, opt::sampleRate); //pBWT->printInfo(); std::ostream* pWriter = createWriter(opt::outFile); std::ostream* pDiscardWriter = createWriter(opt::discardFile); QCPostProcess* pPostProcessor = new QCPostProcess(pWriter, pDiscardWriter); // If performing duplicate check, create a bitvector to record // which reads are duplicates BitVector* pSharedBV = NULL; if(opt::dupCheck) pSharedBV = new BitVector(pBWT->getNumStrings()); // Set up QC parameters QCParameters params; params.pBWT = pBWT; params.pRevBWT = pRBWT; params.pSharedBV = pSharedBV; params.checkDuplicates = opt::dupCheck; params.substringOnly = opt::substringOnly; params.checkKmer = opt::kmerCheck; params.checkHPRuns = opt::hpCheck; params.checkDegenerate = opt::lowComplexityCheck; params.verbose = opt::verbose; params.kmerLength = opt::kmerLength; params.kmerThreshold = opt::kmerThreshold; params.hpKmerLength = 51; params.hpHardAcceptCount = 10; params.hpMinProportion = 0.1f; params.hpMinLength = 6; if(opt::numThreads <= 1) { // Serial mode QCProcess processor(params); PROCESS_FILTER_SERIAL(opt::readsFile, &processor, pPostProcessor); } else { // Parallel mode std::vector<QCProcess*> processorVector; for(int i = 0; i < opt::numThreads; ++i) { QCProcess* pProcessor = new QCProcess(params); processorVector.push_back(pProcessor); } PROCESS_FILTER_PARALLEL(opt::readsFile, processorVector, pPostProcessor); for(int i = 0; i < opt::numThreads; ++i) delete processorVector[i]; } delete pPostProcessor; delete pWriter; delete pDiscardWriter; delete pBWT; delete pRBWT; if(pSharedBV != NULL) delete pSharedBV; std::cout << "RE-building index for " << opt::outFile << " in memory using ropebwt2\n"; std::string prefix=stripFilename(opt::outFile); //BWT *pBWT, *pRBWT; #pragma omp parallel { #pragma omp single nowait { std::string bwt_filename = prefix + BWT_EXT; BWTCA::runRopebwt2(opt::outFile, bwt_filename, opt::numThreads, false); std::cout << "\t done bwt construction, generating .sai file\n"; pBWT = new BWT(bwt_filename); } #pragma omp single nowait { std::string rbwt_filename = prefix + RBWT_EXT; BWTCA::runRopebwt2(opt::outFile, rbwt_filename, opt::numThreads, true); std::cout << "\t done rbwt construction, generating .rsai file\n"; pRBWT = new BWT(rbwt_filename); } } std::string sai_filename = prefix + SAI_EXT; SampledSuffixArray ssa; ssa.buildLexicoIndex(pBWT, opt::numThreads); ssa.writeLexicoIndex(sai_filename); delete pBWT; std::string rsai_filename = prefix + RSAI_EXT; SampledSuffixArray rssa; rssa.buildLexicoIndex(pRBWT, opt::numThreads); rssa.writeLexicoIndex(rsai_filename); delete pRBWT; // Cleanup delete pTimer; return 0; }
// // Main // int graphDiffMain(int argc, char** argv) { parseGraphDiffOptions(argc, argv); // Create BWTS std::string basePrefix = stripFilename(opt::baseFile); BWT* pBaseBWT = new BWT(basePrefix + BWT_EXT, opt::sampleRate); BWT* pBaseRevBWT = new BWT(basePrefix + RBWT_EXT, opt::sampleRate); std::string variantPrefix = stripFilename(opt::variantFile); BWT* pVariantBWT = new BWT(variantPrefix + BWT_EXT, opt::sampleRate); BWT* pVariantRevBWT = new BWT(variantPrefix + RBWT_EXT, opt::sampleRate); // Create the shared bit vector and shared results aggregator BitVector* pSharedBitVector = new BitVector(pVariantBWT->getBWLen()); GraphCompareAggregateResults* pSharedResults = new GraphCompareAggregateResults(opt::outFile); // Create interval caches to speed up k-mer lookups BWTIntervalCache varBWTCache(opt::cacheLength, pVariantBWT); BWTIntervalCache varRBWTCache(opt::cacheLength, pVariantRevBWT); BWTIntervalCache baseBWTCache(opt::cacheLength, pBaseBWT); BWTIntervalCache baseRBWTCache(opt::cacheLength, pBaseRevBWT); // Set the parameters shared between all threads GraphCompareParameters sharedParameters; sharedParameters.pVariantBWT = pVariantBWT; sharedParameters.pVariantRevBWT = pVariantRevBWT; sharedParameters.pBaseBWT = pBaseBWT; sharedParameters.pBaseRevBWT = pBaseRevBWT; sharedParameters.kmer = opt::kmer; sharedParameters.pBitVector = pSharedBitVector; sharedParameters.kmerThreshold = 3; sharedParameters.maxBranches = opt::maxBranches; sharedParameters.pVarBWTCache = &varBWTCache; sharedParameters.pVarRevBWTCache = &varRBWTCache; sharedParameters.pBaseBWTCache = &baseBWTCache; sharedParameters.pBaseRevBWTCache = &baseRBWTCache; if(opt::numThreads <= 1) { printf("[%s] starting serial-mode graph diff\n", PROGRAM_IDENT); GraphCompare graphCompare(sharedParameters); PROCESS_GDIFF_SERIAL(opt::variantFile, &graphCompare, pSharedResults); graphCompare.updateSharedStats(pSharedResults); } else { printf("[%s] starting parallel-mode graph diff with %d threads\n", PROGRAM_IDENT, opt::numThreads); std::vector<GraphCompare*> processorVector; for(int i = 0; i < opt::numThreads; ++i) { GraphCompare* pProcessor = new GraphCompare(sharedParameters); processorVector.push_back(pProcessor); } PROCESS_GDIFF_PARALLEL(opt::variantFile, processorVector, pSharedResults); for(size_t i = 0; i < processorVector.size(); ++i) { // Update the shared stats processorVector[i]->updateSharedStats(pSharedResults); delete processorVector[i]; processorVector[i] = NULL; } } pSharedResults->printStats(); // Cleanup delete pBaseBWT; delete pBaseRevBWT; delete pVariantBWT; delete pVariantRevBWT; delete pSharedBitVector; delete pSharedResults; if(opt::numThreads > 1) pthread_exit(NULL); return 0; }
// // Main // int overlapLongMain(int argc, char** argv) { parseOverlapLongOptions(argc, argv); // Open output file std::ostream* pASQGWriter = createWriter(opt::outFile); // Build and write the ASQG header ASQG::HeaderRecord headerRecord; headerRecord.setOverlapTag(opt::minOverlap); headerRecord.setErrorRateTag(opt::errorRate); headerRecord.setInputFileTag(opt::readsFile); headerRecord.setTransitiveTag(true); headerRecord.write(*pASQGWriter); // Determine which index files to use. If a target file was provided, // use the index of the target reads std::string indexPrefix; if(!opt::targetFile.empty()) indexPrefix = stripFilename(opt::targetFile); else indexPrefix = stripFilename(opt::readsFile); BWT* pBWT = new BWT(indexPrefix + BWT_EXT, opt::sampleRate); SampledSuffixArray* pSSA = new SampledSuffixArray(indexPrefix + SAI_EXT, SSA_FT_SAI); Timer* pTimer = new Timer(PROGRAM_IDENT); pBWT->printInfo(); // Read the sequence file and write vertex records for each // Also store the read names in a vector of strings ReadTable reads; SeqReader* pReader = new SeqReader(opt::readsFile, SRF_NO_VALIDATION); SeqRecord record; while(pReader->get(record)) { reads.addRead(record.toSeqItem()); ASQG::VertexRecord vr(record.id, record.seq.toString()); vr.write(*pASQGWriter); if(reads.getCount() % 100000 == 0) printf("Read %zu sequences\n", reads.getCount()); } delete pReader; pReader = NULL; BWTIndexSet index; index.pBWT = pBWT; index.pSSA = pSSA; index.pReadTable = &reads; // Make a prefix for the temporary hits files size_t n_reads = reads.getCount(); omp_set_num_threads(opt::numThreads); #pragma omp parallel for for(size_t read_idx = 0; read_idx < n_reads; ++read_idx) { const SeqItem& curr_read = reads.getRead(read_idx); printf("read %s %zubp\n", curr_read.id.c_str(), curr_read.seq.length()); SequenceOverlapPairVector sopv = KmerOverlaps::retrieveMatches(curr_read.seq.toString(), opt::seedLength, opt::minOverlap, 1 - opt::errorRate, 100, index); printf("Found %zu matches\n", sopv.size()); for(size_t i = 0; i < sopv.size(); ++i) { std::string match_id = reads.getRead(sopv[i].match_idx).id; // We only want to output each edge once so skip this overlap // if the matched read has a lexicographically lower ID if(curr_read.id > match_id) continue; std::string ao = ascii_overlap(sopv[i].sequence[0], sopv[i].sequence[1], sopv[i].overlap, 50); printf("\t%s\t[%d %d] ID=%s OL=%d PI:%.2lf C=%s\n", ao.c_str(), sopv[i].overlap.match[0].start, sopv[i].overlap.match[0].end, match_id.c_str(), sopv[i].overlap.getOverlapLength(), sopv[i].overlap.getPercentIdentity(), sopv[i].overlap.cigar.c_str()); // Convert to ASQG SeqCoord sc1(sopv[i].overlap.match[0].start, sopv[i].overlap.match[0].end, sopv[i].overlap.length[0]); SeqCoord sc2(sopv[i].overlap.match[1].start, sopv[i].overlap.match[1].end, sopv[i].overlap.length[1]); // KmerOverlaps returns the coordinates of the overlap after flipping the reads // to ensure the strand matches. The ASQG file wants the coordinate of the original // sequencing strand. Flip here if necessary if(sopv[i].is_reversed) sc2.flip(); // Convert the SequenceOverlap the ASQG's overlap format Overlap ovr(curr_read.id, sc1, match_id, sc2, sopv[i].is_reversed, -1); ASQG::EdgeRecord er(ovr); er.setCigarTag(sopv[i].overlap.cigar); er.setPercentIdentityTag(sopv[i].overlap.getPercentIdentity()); #pragma omp critical { er.write(*pASQGWriter); } } } // Cleanup delete pReader; delete pBWT; delete pSSA; delete pASQGWriter; delete pTimer; if(opt::numThreads > 1) pthread_exit(NULL); return 0; }
// // Main // int filterMain(int argc, char** argv) { parseFilterOptions(argc, argv); Timer* pTimer = new Timer(PROGRAM_IDENT); BWT* pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate); BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT, opt::sampleRate); pBWT->printInfo(); std::ostream* pWriter = createWriter(opt::outFile); std::ostream* pDiscardWriter = createWriter(opt::discardFile); QCPostProcess* pPostProcessor = new QCPostProcess(pWriter, pDiscardWriter); // If performing duplicate check, create a bitvector to record // which reads are duplicates BitVector* pSharedBV = NULL; if(opt::dupCheck) pSharedBV = new BitVector(pBWT->getNumStrings()); // Set up QC parameters QCParameters params; params.pBWT = pBWT; params.pRevBWT = pRBWT; params.pSharedBV = pSharedBV; params.checkDuplicates = opt::dupCheck; params.substringOnly = opt::substringOnly; params.checkKmer = opt::kmerCheck; params.kmerBothStrand = opt::kmerBothStrand; params.checkHPRuns = opt::hpCheck; params.checkDegenerate = opt::lowComplexityCheck; params.verbose = opt::verbose; params.kmerLength = opt::kmerLength; params.kmerThreshold = opt::kmerThreshold; params.hpKmerLength = 51; params.hpHardAcceptCount = 10; params.hpMinProportion = 0.1f; params.hpMinLength = 6; if(opt::numThreads <= 1) { // Serial mode QCProcess processor(params); PROCESS_FILTER_SERIAL(opt::readsFile, &processor, pPostProcessor); } else { // Parallel mode std::vector<QCProcess*> processorVector; for(int i = 0; i < opt::numThreads; ++i) { QCProcess* pProcessor = new QCProcess(params); processorVector.push_back(pProcessor); } PROCESS_FILTER_PARALLEL(opt::readsFile, processorVector, pPostProcessor); for(int i = 0; i < opt::numThreads; ++i) delete processorVector[i]; } delete pPostProcessor; delete pWriter; delete pDiscardWriter; delete pBWT; delete pRBWT; if(pSharedBV != NULL) delete pSharedBV; // Rebuild the FM-index without the discarded reads std::string out_prefix = stripFilename(opt::outFile); removeReadsFromIndices(opt::prefix, opt::discardFile, out_prefix, BWT_EXT, SAI_EXT, false, opt::numThreads); removeReadsFromIndices(opt::prefix, opt::discardFile, out_prefix, RBWT_EXT, RSAI_EXT, true, opt::numThreads); // Cleanup delete pTimer; if(opt::numThreads > 1) pthread_exit(NULL); return 0; }
// // Main // int overlapMain(int argc, char** argv) { parseOverlapOptions(argc, argv); // Prepare the output ASQG file assert(opt::outputType == OT_ASQG); // Open output file std::ostream* pASQGWriter = createWriter(opt::outFile); // Build and write the ASQG header ASQG::HeaderRecord headerRecord; headerRecord.setOverlapTag(opt::minOverlap); headerRecord.setErrorRateTag(opt::errorRate); headerRecord.setInputFileTag(opt::readsFile); headerRecord.setContainmentTag(true); // containments are always present headerRecord.setTransitiveTag(!opt::bIrreducibleOnly); headerRecord.write(*pASQGWriter); // Compute the overlap hits StringVector hitsFilenames; // Determine which index files to use. If a target file was provided, // use the index of the target reads std::string indexPrefix; if(!opt::prefix.empty()) indexPrefix = opt::prefix; else { if(!opt::targetFile.empty()) indexPrefix = stripFilename(opt::targetFile); else indexPrefix = stripFilename(opt::readsFile); } BWT* pBWT = new BWT(indexPrefix + BWT_EXT, opt::sampleRate); BWT* pRBWT = new BWT(indexPrefix + RBWT_EXT, opt::sampleRate); OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT, opt::errorRate, opt::seedLength, opt::seedStride, opt::bIrreducibleOnly); pOverlapper->setExactModeOverlap(opt::errorRate <= 0.0001); pOverlapper->setExactModeIrreducible(opt::errorRate <= 0.0001); Timer* pTimer = new Timer(PROGRAM_IDENT); pBWT->printInfo(); // Make a prefix for the temporary hits files std::string outPrefix; outPrefix = stripFilename(opt::readsFile); if(!opt::targetFile.empty()) { outPrefix.append(1, '.'); outPrefix.append(stripFilename(opt::targetFile)); } if(opt::numThreads <= 1) { printf("[%s] starting serial-mode overlap computation\n", PROGRAM_IDENT); computeHitsSerial(outPrefix, opt::readsFile, pOverlapper, opt::minOverlap, hitsFilenames, pASQGWriter); } else { printf("[%s] starting parallel-mode overlap computation with %d threads\n", PROGRAM_IDENT, opt::numThreads); computeHitsParallel(opt::numThreads, outPrefix, opt::readsFile, pOverlapper, opt::minOverlap, hitsFilenames, pASQGWriter); } // Get the number of strings in the BWT, this is used to pre-allocated the read table delete pOverlapper; delete pBWT; delete pRBWT; // Parse the hits files and write the overlaps to the ASQG file convertHitsToASQG(indexPrefix, hitsFilenames, pASQGWriter); // Cleanup delete pASQGWriter; delete pTimer; if(opt::numThreads > 1) pthread_exit(NULL); return 0; }
// // Main // int correctMain(int argc, char** argv) { parseCorrectOptions(argc, argv); BWT* pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate); BWT* pRBWT = NULL; // If the correction mode is k-mer only, then do not load the reverse // BWT as it is not needed if(opt::algorithm != ECA_KMER) pRBWT = new BWT(opt::prefix + RBWT_EXT, opt::sampleRate); BWTIntervalCache intervalCache(opt::intervalCacheLength, pBWT); OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, NULL, opt::errorRate, opt::seedLength, opt::seedStride, false, opt::branchCutoff); // Learn the parameters of the kmer corrector if(opt::bLearnKmerParams) { int threshold = learnKmerParameters(pBWT); if(threshold != -1) CorrectionThresholds::Instance().setBaseMinSupport(threshold); } // Open outfiles and start a timer std::ostream* pWriter = createWriter(opt::outFile); std::ostream* pDiscardWriter = (!opt::discardFile.empty() ? createWriter(opt::discardFile) : NULL); Timer* pTimer = new Timer(PROGRAM_IDENT); pBWT->printInfo(); // Set the error correction parameters ErrorCorrectParameters ecParams; ecParams.pOverlapper = pOverlapper; ecParams.pIntervalCache = &intervalCache; ecParams.algorithm = opt::algorithm; ecParams.minOverlap = opt::minOverlap; ecParams.numOverlapRounds = opt::numOverlapRounds; ecParams.conflictCutoff = opt::conflictCutoff; ecParams.numKmerRounds = opt::numKmerRounds; ecParams.kmerLength = opt::kmerLength; ecParams.printOverlaps = opt::verbose > 1; // Setup post-processor bool bCollectMetrics = !opt::metricsFile.empty(); ErrorCorrectPostProcess postProcessor(pWriter, pDiscardWriter, bCollectMetrics); if(opt::numThreads <= 1) { // Serial mode ErrorCorrectProcess processor(ecParams); SequenceProcessFramework::processSequencesSerial<SequenceWorkItem, ErrorCorrectResult, ErrorCorrectProcess, ErrorCorrectPostProcess>(opt::readsFile, &processor, &postProcessor); } else { // Parallel mode std::vector<ErrorCorrectProcess*> processorVector; for(int i = 0; i < opt::numThreads; ++i) { ErrorCorrectProcess* pProcessor = new ErrorCorrectProcess(ecParams); processorVector.push_back(pProcessor); } SequenceProcessFramework::processSequencesParallel<SequenceWorkItem, ErrorCorrectResult, ErrorCorrectProcess, ErrorCorrectPostProcess>(opt::readsFile, processorVector, &postProcessor); for(int i = 0; i < opt::numThreads; ++i) { delete processorVector[i]; } } if(bCollectMetrics) { std::ostream* pMetricsWriter = createWriter(opt::metricsFile); postProcessor.writeMetrics(pMetricsWriter); delete pMetricsWriter; } delete pBWT; if(pRBWT != NULL) delete pRBWT; delete pOverlapper; delete pTimer; delete pWriter; if(pDiscardWriter != NULL) delete pDiscardWriter; if(opt::numThreads > 1) pthread_exit(NULL); return 0; }
// // Main // int correctMain(int argc, char** argv) { parseCorrectOptions(argc, argv); std::cout << "Correcting sequencing errors for " << opt::readsFile << "\n"; // Load indices BWT* pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate); BWT* pRBWT = NULL; SampledSuffixArray* pSSA = NULL; if(opt::algorithm == ECA_OVERLAP) pSSA = new SampledSuffixArray(opt::prefix + SAI_EXT, SSA_FT_SAI); BWTIntervalCache* pIntervalCache = new BWTIntervalCache(opt::intervalCacheLength, pBWT); BWTIndexSet indexSet; indexSet.pBWT = pBWT; indexSet.pRBWT = pRBWT; indexSet.pSSA = pSSA; indexSet.pCache = pIntervalCache; // Learn the parameters of the kmer corrector if(opt::bLearnKmerParams) { int threshold = learnKmerParameters(pBWT); if(threshold != -1) CorrectionThresholds::Instance().setBaseMinSupport(threshold); } // Open outfiles and start a timer std::ostream* pWriter = createWriter(opt::outFile); std::ostream* pDiscardWriter = (!opt::discardFile.empty() ? createWriter(opt::discardFile) : NULL); Timer* pTimer = new Timer(PROGRAM_IDENT); pBWT->printInfo(); // Set the error correction parameters ErrorCorrectParameters ecParams; ecParams.pOverlapper = NULL; ecParams.indices = indexSet; ecParams.algorithm = opt::algorithm; ecParams.minOverlap = opt::minOverlap; ecParams.numOverlapRounds = opt::numOverlapRounds; ecParams.minIdentity = 1.0f - opt::errorRate; ecParams.conflictCutoff = opt::conflictCutoff; ecParams.numKmerRounds = opt::numKmerRounds; ecParams.kmerLength = opt::kmerLength; ecParams.printOverlaps = opt::verbose > 0; // Setup post-processor bool bCollectMetrics = !opt::metricsFile.empty(); ErrorCorrectPostProcess postProcessor(pWriter, pDiscardWriter, bCollectMetrics); if(opt::numThreads <= 1) { // Serial mode ErrorCorrectProcess processor(ecParams); SequenceProcessFramework::processSequencesSerial<SequenceWorkItem, ErrorCorrectResult, ErrorCorrectProcess, ErrorCorrectPostProcess>(opt::readsFile, &processor, &postProcessor); } else { // Parallel mode std::vector<ErrorCorrectProcess*> processorVector; for(int i = 0; i < opt::numThreads; ++i) { ErrorCorrectProcess* pProcessor = new ErrorCorrectProcess(ecParams); processorVector.push_back(pProcessor); } SequenceProcessFramework::processSequencesParallel<SequenceWorkItem, ErrorCorrectResult, ErrorCorrectProcess, ErrorCorrectPostProcess>(opt::readsFile, processorVector, &postProcessor); for(int i = 0; i < opt::numThreads; ++i) { delete processorVector[i]; } } if(bCollectMetrics) { std::ostream* pMetricsWriter = createWriter(opt::metricsFile); postProcessor.writeMetrics(pMetricsWriter); delete pMetricsWriter; } delete pBWT; delete pIntervalCache; if(pRBWT != NULL) delete pRBWT; if(pSSA != NULL) delete pSSA; delete pTimer; delete pWriter; if(pDiscardWriter != NULL) delete pDiscardWriter; if(opt::numThreads > 1) pthread_exit(NULL); return 0; }
// // Main // int FMMergeMain(int argc, char** argv) { parseFMMergeOptions(argc, argv); BWT* pBWT = new BWT(opt::prefix + BWT_EXT); BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT); OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT,0.0f, 0,0,true); pOverlapper->setExactModeOverlap(true); pOverlapper->setExactModeIrreducible(true); Timer* pTimer = new Timer(PROGRAM_IDENT); pBWT->printInfo(); // Construct a bitvector indicating what reads have been used // All the processes read from this vector and only the post processor // writes to it. BitVector markedReads(pBWT->getNumStrings()); std::ostream* pWriter = createWriter(opt::outFile); FMMergePostProcess postProcessor(pWriter, &markedReads); if(opt::numThreads <= 1) { printf("[%s] starting serial-mode read merging\n", PROGRAM_IDENT); FMMergeProcess processor(pOverlapper, opt::minOverlap, &markedReads); SequenceProcessFramework::processSequencesSerial<SequenceWorkItem, FMMergeResult, FMMergeProcess, FMMergePostProcess>(opt::readsFile, &processor, &postProcessor); } else { printf("[%s] starting parallel-mode read merging computation with %d threads\n", PROGRAM_IDENT, opt::numThreads); std::vector<FMMergeProcess*> processorVector; for(int i = 0; i < opt::numThreads; ++i) { FMMergeProcess* pProcessor = new FMMergeProcess(pOverlapper, opt::minOverlap, &markedReads); processorVector.push_back(pProcessor); } SequenceProcessFramework::processSequencesParallel<SequenceWorkItem, FMMergeResult, FMMergeProcess, FMMergePostProcess>(opt::readsFile, processorVector, &postProcessor); for(size_t i = 0; i < processorVector.size(); ++i) { delete processorVector[i]; processorVector[i] = NULL; } } // Check that every bit was set in the bit vector size_t numSet = 0; size_t numTotal = pBWT->getNumStrings(); for(size_t i = 0; i < numTotal; ++i) { if(markedReads.test(i)) ++numSet; } // Get the number of strings in the BWT, this is used to pre-allocated the read table delete pOverlapper; delete pBWT; delete pRBWT; delete pWriter; // Cleanup delete pTimer; if(opt::numThreads > 1) pthread_exit(NULL); return 0; }