int convertBeetlMain(int argc, char** argv) { Timer t("sga beetl-convert"); parseConvertBeetlOptions(argc, argv); std::cout << "Converting " << opt::inBWTFile << "\n"; // Read the ASCII bwt beetl file and convert it to the SGA // run length encoded binary format // To write the header of the file, we need to count the number of strings // and symbols in the BWT. We do this in an initial pas std::istream* pReader = createReader(opt::inBWTFile); size_t numSymbols = 0; size_t numStrings = 0; char c; while(*pReader >> c) { numSymbols += 1; if(c == 'N') { std::cerr << "Error: ambiguous character found in BWT\n"; std::cerr << "sga preprocess must be run on the data\n"; exit(EXIT_FAILURE); } if(c == '$') numStrings += 1; } delete pReader; printf("Read %zu symbols and %zu strings from the beetl bwt\n", numSymbols, numStrings); // std::string outBWTName = opt::prefix + BWT_EXT; BWTWriterBinary* pWriter = new BWTWriterBinary(outBWTName); pWriter->writeHeader(numStrings, numSymbols, BWF_NOFMI); // Re-read the file, writing out the bw chars pReader = createReader(opt::inBWTFile); while(*pReader >> c) pWriter->writeBWChar(c); pWriter->finalize(); delete pWriter; delete pReader; // Create the suffix array index files using the sampled suffix array machinery std::cout << "Generating lexicographic index (.sai)\n"; BWT* pBWT = new BWT(outBWTName, 512); SampledSuffixArray* pSSA = new SampledSuffixArray(); pSSA->buildLexicoIndex(pBWT); pSSA->writeLexicoIndex(opt::prefix + SAI_EXT); delete pBWT; delete pSSA; return 0; }
void indexInMemoryRopebwt() { std::cout << "Building index for " << opt::readsFile << " in memory using ropebwt\n"; bool use_threads = opt::numThreads >= 4; if(opt::bBuildForward) { std::string bwt_filename = opt::prefix + BWT_EXT; std::string sai_filename = opt::prefix + SAI_EXT; BWTCA::runRopebwt(opt::readsFile, bwt_filename, use_threads, false); if(opt::bBuildSAI) { std::cout << "\t done bwt construction, generating .sai file\n"; BWT* pBWT = new BWT(bwt_filename); SampledSuffixArray ssa; ssa.buildLexicoIndex(pBWT, opt::numThreads); ssa.writeLexicoIndex(sai_filename); delete pBWT; } } if(opt::bBuildReverse) { std::string rbwt_filename = opt::prefix + RBWT_EXT; std::string rsai_filename = opt::prefix + RSAI_EXT; BWTCA::runRopebwt(opt::readsFile, rbwt_filename, use_threads, true); if(opt::bBuildSAI) { std::cout << "\t done rbwt construction, generating .rsai file\n"; BWT* pRBWT = new BWT(rbwt_filename); SampledSuffixArray ssa; ssa.buildLexicoIndex(pRBWT, opt::numThreads); ssa.writeLexicoIndex(rsai_filename); delete pRBWT; } } }
// // Main // int filterMain(int argc, char** argv) { parseFilterOptions(argc, argv); Timer* pTimer = new Timer(PROGRAM_IDENT); BWT* pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate); BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT, opt::sampleRate); //pBWT->printInfo(); std::ostream* pWriter = createWriter(opt::outFile); std::ostream* pDiscardWriter = createWriter(opt::discardFile); QCPostProcess* pPostProcessor = new QCPostProcess(pWriter, pDiscardWriter); // If performing duplicate check, create a bitvector to record // which reads are duplicates BitVector* pSharedBV = NULL; if(opt::dupCheck) pSharedBV = new BitVector(pBWT->getNumStrings()); // Set up QC parameters QCParameters params; params.pBWT = pBWT; params.pRevBWT = pRBWT; params.pSharedBV = pSharedBV; params.checkDuplicates = opt::dupCheck; params.substringOnly = opt::substringOnly; params.checkKmer = opt::kmerCheck; params.checkHPRuns = opt::hpCheck; params.checkDegenerate = opt::lowComplexityCheck; params.verbose = opt::verbose; params.kmerLength = opt::kmerLength; params.kmerThreshold = opt::kmerThreshold; params.hpKmerLength = 51; params.hpHardAcceptCount = 10; params.hpMinProportion = 0.1f; params.hpMinLength = 6; if(opt::numThreads <= 1) { // Serial mode QCProcess processor(params); PROCESS_FILTER_SERIAL(opt::readsFile, &processor, pPostProcessor); } else { // Parallel mode std::vector<QCProcess*> processorVector; for(int i = 0; i < opt::numThreads; ++i) { QCProcess* pProcessor = new QCProcess(params); processorVector.push_back(pProcessor); } PROCESS_FILTER_PARALLEL(opt::readsFile, processorVector, pPostProcessor); for(int i = 0; i < opt::numThreads; ++i) delete processorVector[i]; } delete pPostProcessor; delete pWriter; delete pDiscardWriter; delete pBWT; delete pRBWT; if(pSharedBV != NULL) delete pSharedBV; std::cout << "RE-building index for " << opt::outFile << " in memory using ropebwt2\n"; std::string prefix=stripFilename(opt::outFile); //BWT *pBWT, *pRBWT; #pragma omp parallel { #pragma omp single nowait { std::string bwt_filename = prefix + BWT_EXT; BWTCA::runRopebwt2(opt::outFile, bwt_filename, opt::numThreads, false); std::cout << "\t done bwt construction, generating .sai file\n"; pBWT = new BWT(bwt_filename); } #pragma omp single nowait { std::string rbwt_filename = prefix + RBWT_EXT; BWTCA::runRopebwt2(opt::outFile, rbwt_filename, opt::numThreads, true); std::cout << "\t done rbwt construction, generating .rsai file\n"; pRBWT = new BWT(rbwt_filename); } } std::string sai_filename = prefix + SAI_EXT; SampledSuffixArray ssa; ssa.buildLexicoIndex(pBWT, opt::numThreads); ssa.writeLexicoIndex(sai_filename); delete pBWT; std::string rsai_filename = prefix + RSAI_EXT; SampledSuffixArray rssa; rssa.buildLexicoIndex(pRBWT, opt::numThreads); rssa.writeLexicoIndex(rsai_filename); delete pRBWT; // Cleanup delete pTimer; return 0; }