// // Main // int statsMain(int argc, char** argv) { parseStatsOptions(argc, argv); Timer* pTimer = new Timer(PROGRAM_IDENT); BWT* pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate); BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT, opt::sampleRate); if(opt::bPrintRunLengths) { pBWT->printInfo(); pBWT->printRunLengths(); } SeqReader reader(opt::readsFile); StatsPostProcess postProcessor(opt::bPrintKmerDist); if(opt::numThreads <= 1) { // Serial mode StatsProcess processor(pBWT, pRBWT, opt::kmerLength, opt::minOverlap, opt::branchCutoff, opt::bNoOverlap); SequenceProcessFramework::processSequencesSerial<SequenceWorkItem, StatsResult, StatsProcess, StatsPostProcess>(reader, &processor, &postProcessor, opt::numReads); } else { // Parallel mode std::vector<StatsProcess*> processorVector; for(int i = 0; i < opt::numThreads; ++i) { StatsProcess* pProcessor = new StatsProcess(pBWT, pRBWT, opt::kmerLength, opt::minOverlap, opt::branchCutoff, opt::bNoOverlap); processorVector.push_back(pProcessor); } SequenceProcessFramework::processSequencesParallel<SequenceWorkItem, StatsResult, StatsProcess, StatsPostProcess>(reader, processorVector, &postProcessor, opt::numReads); for(int i = 0; i < opt::numThreads; ++i) { delete processorVector[i]; } } delete pBWT; delete pRBWT; delete pTimer; if(opt::numThreads > 1) pthread_exit(NULL); return 0; }
int genSSAMain(int argc, char** argv) { Timer t("sga gen-ssa"); parseGenSSAOptions(argc, argv); BWT* pBWT = new BWT(opt::prefix + BWT_EXT); ReadInfoTable* pRIT = new ReadInfoTable(opt::readsFile, pBWT->getNumStrings(), RIO_NUMERICID); pBWT->printInfo(); SampledSuffixArray* pSSA = new SampledSuffixArray(); pSSA->build(pBWT, pRIT, opt::sampleRate); pSSA->printInfo(); pSSA->writeSSA(opt::prefix + SSA_EXT); if(opt::validate) pSSA->validate(opt::readsFile, pBWT); delete pBWT; delete pRIT; delete pSSA; return 0; }
// // Main // int overlapLongMain(int argc, char** argv) { parseOverlapLongOptions(argc, argv); // Open output file std::ostream* pASQGWriter = createWriter(opt::outFile); // Build and write the ASQG header ASQG::HeaderRecord headerRecord; headerRecord.setOverlapTag(opt::minOverlap); headerRecord.setErrorRateTag(opt::errorRate); headerRecord.setInputFileTag(opt::readsFile); headerRecord.setTransitiveTag(true); headerRecord.write(*pASQGWriter); // Determine which index files to use. If a target file was provided, // use the index of the target reads std::string indexPrefix; if(!opt::targetFile.empty()) indexPrefix = stripFilename(opt::targetFile); else indexPrefix = stripFilename(opt::readsFile); BWT* pBWT = new BWT(indexPrefix + BWT_EXT, opt::sampleRate); SampledSuffixArray* pSSA = new SampledSuffixArray(indexPrefix + SAI_EXT, SSA_FT_SAI); Timer* pTimer = new Timer(PROGRAM_IDENT); pBWT->printInfo(); // Read the sequence file and write vertex records for each // Also store the read names in a vector of strings ReadTable reads; SeqReader* pReader = new SeqReader(opt::readsFile, SRF_NO_VALIDATION); SeqRecord record; while(pReader->get(record)) { reads.addRead(record.toSeqItem()); ASQG::VertexRecord vr(record.id, record.seq.toString()); vr.write(*pASQGWriter); if(reads.getCount() % 100000 == 0) printf("Read %zu sequences\n", reads.getCount()); } delete pReader; pReader = NULL; BWTIndexSet index; index.pBWT = pBWT; index.pSSA = pSSA; index.pReadTable = &reads; // Make a prefix for the temporary hits files size_t n_reads = reads.getCount(); omp_set_num_threads(opt::numThreads); #pragma omp parallel for for(size_t read_idx = 0; read_idx < n_reads; ++read_idx) { const SeqItem& curr_read = reads.getRead(read_idx); printf("read %s %zubp\n", curr_read.id.c_str(), curr_read.seq.length()); SequenceOverlapPairVector sopv = KmerOverlaps::retrieveMatches(curr_read.seq.toString(), opt::seedLength, opt::minOverlap, 1 - opt::errorRate, 100, index); printf("Found %zu matches\n", sopv.size()); for(size_t i = 0; i < sopv.size(); ++i) { std::string match_id = reads.getRead(sopv[i].match_idx).id; // We only want to output each edge once so skip this overlap // if the matched read has a lexicographically lower ID if(curr_read.id > match_id) continue; std::string ao = ascii_overlap(sopv[i].sequence[0], sopv[i].sequence[1], sopv[i].overlap, 50); printf("\t%s\t[%d %d] ID=%s OL=%d PI:%.2lf C=%s\n", ao.c_str(), sopv[i].overlap.match[0].start, sopv[i].overlap.match[0].end, match_id.c_str(), sopv[i].overlap.getOverlapLength(), sopv[i].overlap.getPercentIdentity(), sopv[i].overlap.cigar.c_str()); // Convert to ASQG SeqCoord sc1(sopv[i].overlap.match[0].start, sopv[i].overlap.match[0].end, sopv[i].overlap.length[0]); SeqCoord sc2(sopv[i].overlap.match[1].start, sopv[i].overlap.match[1].end, sopv[i].overlap.length[1]); // KmerOverlaps returns the coordinates of the overlap after flipping the reads // to ensure the strand matches. The ASQG file wants the coordinate of the original // sequencing strand. Flip here if necessary if(sopv[i].is_reversed) sc2.flip(); // Convert the SequenceOverlap the ASQG's overlap format Overlap ovr(curr_read.id, sc1, match_id, sc2, sopv[i].is_reversed, -1); ASQG::EdgeRecord er(ovr); er.setCigarTag(sopv[i].overlap.cigar); er.setPercentIdentityTag(sopv[i].overlap.getPercentIdentity()); #pragma omp critical { er.write(*pASQGWriter); } } } // Cleanup delete pReader; delete pBWT; delete pSSA; delete pASQGWriter; delete pTimer; if(opt::numThreads > 1) pthread_exit(NULL); return 0; }
// // Main // int filterMain(int argc, char** argv) { parseFilterOptions(argc, argv); Timer* pTimer = new Timer(PROGRAM_IDENT); BWT* pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate); BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT, opt::sampleRate); pBWT->printInfo(); std::ostream* pWriter = createWriter(opt::outFile); std::ostream* pDiscardWriter = createWriter(opt::discardFile); QCPostProcess* pPostProcessor = new QCPostProcess(pWriter, pDiscardWriter); // If performing duplicate check, create a bitvector to record // which reads are duplicates BitVector* pSharedBV = NULL; if(opt::dupCheck) pSharedBV = new BitVector(pBWT->getNumStrings()); // Set up QC parameters QCParameters params; params.pBWT = pBWT; params.pRevBWT = pRBWT; params.pSharedBV = pSharedBV; params.checkDuplicates = opt::dupCheck; params.substringOnly = opt::substringOnly; params.checkKmer = opt::kmerCheck; params.kmerBothStrand = opt::kmerBothStrand; params.checkHPRuns = opt::hpCheck; params.checkDegenerate = opt::lowComplexityCheck; params.verbose = opt::verbose; params.kmerLength = opt::kmerLength; params.kmerThreshold = opt::kmerThreshold; params.hpKmerLength = 51; params.hpHardAcceptCount = 10; params.hpMinProportion = 0.1f; params.hpMinLength = 6; if(opt::numThreads <= 1) { // Serial mode QCProcess processor(params); PROCESS_FILTER_SERIAL(opt::readsFile, &processor, pPostProcessor); } else { // Parallel mode std::vector<QCProcess*> processorVector; for(int i = 0; i < opt::numThreads; ++i) { QCProcess* pProcessor = new QCProcess(params); processorVector.push_back(pProcessor); } PROCESS_FILTER_PARALLEL(opt::readsFile, processorVector, pPostProcessor); for(int i = 0; i < opt::numThreads; ++i) delete processorVector[i]; } delete pPostProcessor; delete pWriter; delete pDiscardWriter; delete pBWT; delete pRBWT; if(pSharedBV != NULL) delete pSharedBV; // Rebuild the FM-index without the discarded reads std::string out_prefix = stripFilename(opt::outFile); removeReadsFromIndices(opt::prefix, opt::discardFile, out_prefix, BWT_EXT, SAI_EXT, false, opt::numThreads); removeReadsFromIndices(opt::prefix, opt::discardFile, out_prefix, RBWT_EXT, RSAI_EXT, true, opt::numThreads); // Cleanup delete pTimer; if(opt::numThreads > 1) pthread_exit(NULL); return 0; }
// // Main // int overlapMain(int argc, char** argv) { parseOverlapOptions(argc, argv); // Prepare the output ASQG file assert(opt::outputType == OT_ASQG); // Open output file std::ostream* pASQGWriter = createWriter(opt::outFile); // Build and write the ASQG header ASQG::HeaderRecord headerRecord; headerRecord.setOverlapTag(opt::minOverlap); headerRecord.setErrorRateTag(opt::errorRate); headerRecord.setInputFileTag(opt::readsFile); headerRecord.setContainmentTag(true); // containments are always present headerRecord.setTransitiveTag(!opt::bIrreducibleOnly); headerRecord.write(*pASQGWriter); // Compute the overlap hits StringVector hitsFilenames; // Determine which index files to use. If a target file was provided, // use the index of the target reads std::string indexPrefix; if(!opt::prefix.empty()) indexPrefix = opt::prefix; else { if(!opt::targetFile.empty()) indexPrefix = stripFilename(opt::targetFile); else indexPrefix = stripFilename(opt::readsFile); } BWT* pBWT = new BWT(indexPrefix + BWT_EXT, opt::sampleRate); BWT* pRBWT = new BWT(indexPrefix + RBWT_EXT, opt::sampleRate); OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT, opt::errorRate, opt::seedLength, opt::seedStride, opt::bIrreducibleOnly); pOverlapper->setExactModeOverlap(opt::errorRate <= 0.0001); pOverlapper->setExactModeIrreducible(opt::errorRate <= 0.0001); Timer* pTimer = new Timer(PROGRAM_IDENT); pBWT->printInfo(); // Make a prefix for the temporary hits files std::string outPrefix; outPrefix = stripFilename(opt::readsFile); if(!opt::targetFile.empty()) { outPrefix.append(1, '.'); outPrefix.append(stripFilename(opt::targetFile)); } if(opt::numThreads <= 1) { printf("[%s] starting serial-mode overlap computation\n", PROGRAM_IDENT); computeHitsSerial(outPrefix, opt::readsFile, pOverlapper, opt::minOverlap, hitsFilenames, pASQGWriter); } else { printf("[%s] starting parallel-mode overlap computation with %d threads\n", PROGRAM_IDENT, opt::numThreads); computeHitsParallel(opt::numThreads, outPrefix, opt::readsFile, pOverlapper, opt::minOverlap, hitsFilenames, pASQGWriter); } // Get the number of strings in the BWT, this is used to pre-allocated the read table delete pOverlapper; delete pBWT; delete pRBWT; // Parse the hits files and write the overlaps to the ASQG file convertHitsToASQG(indexPrefix, hitsFilenames, pASQGWriter); // Cleanup delete pASQGWriter; delete pTimer; if(opt::numThreads > 1) pthread_exit(NULL); return 0; }
// // Main // int correctMain(int argc, char** argv) { parseCorrectOptions(argc, argv); BWT* pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate); BWT* pRBWT = NULL; // If the correction mode is k-mer only, then do not load the reverse // BWT as it is not needed if(opt::algorithm != ECA_KMER) pRBWT = new BWT(opt::prefix + RBWT_EXT, opt::sampleRate); BWTIntervalCache intervalCache(opt::intervalCacheLength, pBWT); OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, NULL, opt::errorRate, opt::seedLength, opt::seedStride, false, opt::branchCutoff); // Learn the parameters of the kmer corrector if(opt::bLearnKmerParams) { int threshold = learnKmerParameters(pBWT); if(threshold != -1) CorrectionThresholds::Instance().setBaseMinSupport(threshold); } // Open outfiles and start a timer std::ostream* pWriter = createWriter(opt::outFile); std::ostream* pDiscardWriter = (!opt::discardFile.empty() ? createWriter(opt::discardFile) : NULL); Timer* pTimer = new Timer(PROGRAM_IDENT); pBWT->printInfo(); // Set the error correction parameters ErrorCorrectParameters ecParams; ecParams.pOverlapper = pOverlapper; ecParams.pIntervalCache = &intervalCache; ecParams.algorithm = opt::algorithm; ecParams.minOverlap = opt::minOverlap; ecParams.numOverlapRounds = opt::numOverlapRounds; ecParams.conflictCutoff = opt::conflictCutoff; ecParams.numKmerRounds = opt::numKmerRounds; ecParams.kmerLength = opt::kmerLength; ecParams.printOverlaps = opt::verbose > 1; // Setup post-processor bool bCollectMetrics = !opt::metricsFile.empty(); ErrorCorrectPostProcess postProcessor(pWriter, pDiscardWriter, bCollectMetrics); if(opt::numThreads <= 1) { // Serial mode ErrorCorrectProcess processor(ecParams); SequenceProcessFramework::processSequencesSerial<SequenceWorkItem, ErrorCorrectResult, ErrorCorrectProcess, ErrorCorrectPostProcess>(opt::readsFile, &processor, &postProcessor); } else { // Parallel mode std::vector<ErrorCorrectProcess*> processorVector; for(int i = 0; i < opt::numThreads; ++i) { ErrorCorrectProcess* pProcessor = new ErrorCorrectProcess(ecParams); processorVector.push_back(pProcessor); } SequenceProcessFramework::processSequencesParallel<SequenceWorkItem, ErrorCorrectResult, ErrorCorrectProcess, ErrorCorrectPostProcess>(opt::readsFile, processorVector, &postProcessor); for(int i = 0; i < opt::numThreads; ++i) { delete processorVector[i]; } } if(bCollectMetrics) { std::ostream* pMetricsWriter = createWriter(opt::metricsFile); postProcessor.writeMetrics(pMetricsWriter); delete pMetricsWriter; } delete pBWT; if(pRBWT != NULL) delete pRBWT; delete pOverlapper; delete pTimer; delete pWriter; if(pDiscardWriter != NULL) delete pDiscardWriter; if(opt::numThreads > 1) pthread_exit(NULL); return 0; }
// // Main // int correctMain(int argc, char** argv) { parseCorrectOptions(argc, argv); std::cout << "Correcting sequencing errors for " << opt::readsFile << "\n"; // Load indices BWT* pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate); BWT* pRBWT = NULL; SampledSuffixArray* pSSA = NULL; if(opt::algorithm == ECA_OVERLAP) pSSA = new SampledSuffixArray(opt::prefix + SAI_EXT, SSA_FT_SAI); BWTIntervalCache* pIntervalCache = new BWTIntervalCache(opt::intervalCacheLength, pBWT); BWTIndexSet indexSet; indexSet.pBWT = pBWT; indexSet.pRBWT = pRBWT; indexSet.pSSA = pSSA; indexSet.pCache = pIntervalCache; // Learn the parameters of the kmer corrector if(opt::bLearnKmerParams) { int threshold = learnKmerParameters(pBWT); if(threshold != -1) CorrectionThresholds::Instance().setBaseMinSupport(threshold); } // Open outfiles and start a timer std::ostream* pWriter = createWriter(opt::outFile); std::ostream* pDiscardWriter = (!opt::discardFile.empty() ? createWriter(opt::discardFile) : NULL); Timer* pTimer = new Timer(PROGRAM_IDENT); pBWT->printInfo(); // Set the error correction parameters ErrorCorrectParameters ecParams; ecParams.pOverlapper = NULL; ecParams.indices = indexSet; ecParams.algorithm = opt::algorithm; ecParams.minOverlap = opt::minOverlap; ecParams.numOverlapRounds = opt::numOverlapRounds; ecParams.minIdentity = 1.0f - opt::errorRate; ecParams.conflictCutoff = opt::conflictCutoff; ecParams.numKmerRounds = opt::numKmerRounds; ecParams.kmerLength = opt::kmerLength; ecParams.printOverlaps = opt::verbose > 0; // Setup post-processor bool bCollectMetrics = !opt::metricsFile.empty(); ErrorCorrectPostProcess postProcessor(pWriter, pDiscardWriter, bCollectMetrics); if(opt::numThreads <= 1) { // Serial mode ErrorCorrectProcess processor(ecParams); SequenceProcessFramework::processSequencesSerial<SequenceWorkItem, ErrorCorrectResult, ErrorCorrectProcess, ErrorCorrectPostProcess>(opt::readsFile, &processor, &postProcessor); } else { // Parallel mode std::vector<ErrorCorrectProcess*> processorVector; for(int i = 0; i < opt::numThreads; ++i) { ErrorCorrectProcess* pProcessor = new ErrorCorrectProcess(ecParams); processorVector.push_back(pProcessor); } SequenceProcessFramework::processSequencesParallel<SequenceWorkItem, ErrorCorrectResult, ErrorCorrectProcess, ErrorCorrectPostProcess>(opt::readsFile, processorVector, &postProcessor); for(int i = 0; i < opt::numThreads; ++i) { delete processorVector[i]; } } if(bCollectMetrics) { std::ostream* pMetricsWriter = createWriter(opt::metricsFile); postProcessor.writeMetrics(pMetricsWriter); delete pMetricsWriter; } delete pBWT; delete pIntervalCache; if(pRBWT != NULL) delete pRBWT; if(pSSA != NULL) delete pSSA; delete pTimer; delete pWriter; if(pDiscardWriter != NULL) delete pDiscardWriter; if(opt::numThreads > 1) pthread_exit(NULL); return 0; }
// // Main // int FMMergeMain(int argc, char** argv) { parseFMMergeOptions(argc, argv); BWT* pBWT = new BWT(opt::prefix + BWT_EXT); BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT); OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT,0.0f, 0,0,true); pOverlapper->setExactModeOverlap(true); pOverlapper->setExactModeIrreducible(true); Timer* pTimer = new Timer(PROGRAM_IDENT); pBWT->printInfo(); // Construct a bitvector indicating what reads have been used // All the processes read from this vector and only the post processor // writes to it. BitVector markedReads(pBWT->getNumStrings()); std::ostream* pWriter = createWriter(opt::outFile); FMMergePostProcess postProcessor(pWriter, &markedReads); if(opt::numThreads <= 1) { printf("[%s] starting serial-mode read merging\n", PROGRAM_IDENT); FMMergeProcess processor(pOverlapper, opt::minOverlap, &markedReads); SequenceProcessFramework::processSequencesSerial<SequenceWorkItem, FMMergeResult, FMMergeProcess, FMMergePostProcess>(opt::readsFile, &processor, &postProcessor); } else { printf("[%s] starting parallel-mode read merging computation with %d threads\n", PROGRAM_IDENT, opt::numThreads); std::vector<FMMergeProcess*> processorVector; for(int i = 0; i < opt::numThreads; ++i) { FMMergeProcess* pProcessor = new FMMergeProcess(pOverlapper, opt::minOverlap, &markedReads); processorVector.push_back(pProcessor); } SequenceProcessFramework::processSequencesParallel<SequenceWorkItem, FMMergeResult, FMMergeProcess, FMMergePostProcess>(opt::readsFile, processorVector, &postProcessor); for(size_t i = 0; i < processorVector.size(); ++i) { delete processorVector[i]; processorVector[i] = NULL; } } // Check that every bit was set in the bit vector size_t numSet = 0; size_t numTotal = pBWT->getNumStrings(); for(size_t i = 0; i < numTotal; ++i) { if(markedReads.test(i)) ++numSet; } // Get the number of strings in the BWT, this is used to pre-allocated the read table delete pOverlapper; delete pBWT; delete pRBWT; delete pWriter; // Cleanup delete pTimer; if(opt::numThreads > 1) pthread_exit(NULL); return 0; }