// // Main // int overlapMain(int argc, char** argv) { parseOverlapOptions(argc, argv); // Prepare the output ASQG file assert(opt::outputType == OT_ASQG); // Open output file std::ostream* pASQGWriter = createWriter(opt::outFile); // Build and write the ASQG header ASQG::HeaderRecord headerRecord; headerRecord.setOverlapTag(opt::minOverlap); headerRecord.setErrorRateTag(opt::errorRate); headerRecord.setInputFileTag(opt::readsFile); headerRecord.setContainmentTag(true); // containments are always present headerRecord.setTransitiveTag(!opt::bIrreducibleOnly); headerRecord.write(*pASQGWriter); // Compute the overlap hits StringVector hitsFilenames; // Determine which index files to use. If a target file was provided, // use the index of the target reads std::string indexPrefix; if(!opt::prefix.empty()) indexPrefix = opt::prefix; else { if(!opt::targetFile.empty()) indexPrefix = stripFilename(opt::targetFile); else indexPrefix = stripFilename(opt::readsFile); } BWT* pBWT = new BWT(indexPrefix + BWT_EXT, opt::sampleRate); BWT* pRBWT = new BWT(indexPrefix + RBWT_EXT, opt::sampleRate); OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT, opt::errorRate, opt::seedLength, opt::seedStride, opt::bIrreducibleOnly); pOverlapper->setExactModeOverlap(opt::errorRate <= 0.0001); pOverlapper->setExactModeIrreducible(opt::errorRate <= 0.0001); Timer* pTimer = new Timer(PROGRAM_IDENT); pBWT->printInfo(); // Make a prefix for the temporary hits files std::string outPrefix; outPrefix = stripFilename(opt::readsFile); if(!opt::targetFile.empty()) { outPrefix.append(1, '.'); outPrefix.append(stripFilename(opt::targetFile)); } if(opt::numThreads <= 1) { printf("[%s] starting serial-mode overlap computation\n", PROGRAM_IDENT); computeHitsSerial(outPrefix, opt::readsFile, pOverlapper, opt::minOverlap, hitsFilenames, pASQGWriter); } else { printf("[%s] starting parallel-mode overlap computation with %d threads\n", PROGRAM_IDENT, opt::numThreads); computeHitsParallel(opt::numThreads, outPrefix, opt::readsFile, pOverlapper, opt::minOverlap, hitsFilenames, pASQGWriter); } // Get the number of strings in the BWT, this is used to pre-allocated the read table delete pOverlapper; delete pBWT; delete pRBWT; // Parse the hits files and write the overlaps to the ASQG file convertHitsToASQG(indexPrefix, hitsFilenames, pASQGWriter); // Cleanup delete pASQGWriter; delete pTimer; if(opt::numThreads > 1) pthread_exit(NULL); return 0; }
void cluster() { BWT* pBWT = new BWT(opt::prefix + BWT_EXT); BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT); OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT,opt::errorRate, opt::seedLength, opt::seedStride, true); pOverlapper->setExactModeOverlap(opt::errorRate < 0.001f); pOverlapper->setExactModeIrreducible(opt::errorRate < 0.001f); BitVector markedReads(pBWT->getNumStrings()); std::string preclustersFile = opt::outFile + ".preclusters"; std::ostream* pPreWriter = createWriter(preclustersFile); ClusterPostProcess postProcessor(pPreWriter, opt::minSize, &markedReads); // Set the cluster parameters ClusterParameters parameters; parameters.pOverlapper = pOverlapper; parameters.minOverlap = opt::minOverlap; parameters.maxClusterSize = opt::maxSize; parameters.maxIterations = opt::maxIterations; parameters.pMarkedReads = &markedReads; // Read the limit kmer sequences, if provided std::set<std::string>* pLimitKmers = NULL; if(!opt::limitFile.empty()) { // Read in the limit sequences pLimitKmers = new std::set<std::string>; readLimitKmers(pLimitKmers); parameters.pLimitKmers = pLimitKmers; parameters.limitK = opt::limitKmer; } else { parameters.pLimitKmers = NULL; parameters.limitK = 0; } // Make pre-clusters from the reads if(opt::numThreads <= 1) { printf("[%s] starting serial-mode read clustering\n", PROGRAM_IDENT); ClusterProcess processor(parameters); // If the extend file is empty, build new clusters if(opt::extendFile.empty()) { PROCESS_CLUSTER_SERIAL(opt::readsFile, &processor, &postProcessor); } else { // Process a set of preexisting clusters ClusterReader clusterReader(opt::extendFile); PROCESS_EXTEND_SERIAL(clusterReader, &processor, &postProcessor); } } else { printf("[%s] starting parallel-mode read clustering computation with %d threads\n", PROGRAM_IDENT, opt::numThreads); std::vector<ClusterProcess*> processorVector; for(int i = 0; i < opt::numThreads; ++i) { ClusterProcess* pProcessor = new ClusterProcess(parameters); processorVector.push_back(pProcessor); } if(opt::extendFile.empty()) { PROCESS_CLUSTER_PARALLEL(opt::readsFile, processorVector, &postProcessor); } else { ClusterReader clusterReader(opt::extendFile); PROCESS_EXTEND_PARALLEL(clusterReader, processorVector, &postProcessor); } for(size_t i = 0; i < processorVector.size(); ++i) { delete processorVector[i]; processorVector[i] = NULL; } } delete pPreWriter; delete pBWT; delete pRBWT; delete pOverlapper; // Deallocate limit kmers if(pLimitKmers != NULL) delete pLimitKmers; // Open the preclusters file and convert them to read names SuffixArray* pFwdSAI = new SuffixArray(opt::prefix + SAI_EXT); ReadInfoTable* pRIT = new ReadInfoTable(opt::readsFile, pFwdSAI->getNumStrings()); size_t seedIdx = 0; std::istream* pPreReader = createReader(preclustersFile); std::ostream* pClusterWriter = createWriter(opt::outFile); std::string line; while(getline(*pPreReader,line)) { std::stringstream parser(line); std::string clusterName; std::string readSequence; size_t clusterSize; int64_t lowIdx; int64_t highIdx; parser >> clusterName >> clusterSize >> readSequence >> lowIdx >> highIdx; if(lowIdx > highIdx) { // This is an extra read that is not present in the FM-index // Output a record with a fake read ID *pClusterWriter << clusterName << "\t" << clusterSize << "\tseed-" << seedIdx++ << "\t" << readSequence << "\n"; } else { for(int64_t i = lowIdx; i <= highIdx; ++i) { const ReadInfo& targetInfo = pRIT->getReadInfo(pFwdSAI->get(i).getID()); std::string readName = targetInfo.id; *pClusterWriter << clusterName << "\t" << clusterSize << "\t" << readName << "\t" << readSequence << "\n"; } } } unlink(preclustersFile.c_str()); delete pFwdSAI; delete pRIT; delete pPreReader; delete pClusterWriter; }
// // Main // int FMMergeMain(int argc, char** argv) { parseFMMergeOptions(argc, argv); BWT* pBWT = new BWT(opt::prefix + BWT_EXT); BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT); OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT,0.0f, 0,0,true); pOverlapper->setExactModeOverlap(true); pOverlapper->setExactModeIrreducible(true); Timer* pTimer = new Timer(PROGRAM_IDENT); pBWT->printInfo(); // Construct a bitvector indicating what reads have been used // All the processes read from this vector and only the post processor // writes to it. BitVector markedReads(pBWT->getNumStrings()); std::ostream* pWriter = createWriter(opt::outFile); FMMergePostProcess postProcessor(pWriter, &markedReads); if(opt::numThreads <= 1) { printf("[%s] starting serial-mode read merging\n", PROGRAM_IDENT); FMMergeProcess processor(pOverlapper, opt::minOverlap, &markedReads); SequenceProcessFramework::processSequencesSerial<SequenceWorkItem, FMMergeResult, FMMergeProcess, FMMergePostProcess>(opt::readsFile, &processor, &postProcessor); } else { printf("[%s] starting parallel-mode read merging computation with %d threads\n", PROGRAM_IDENT, opt::numThreads); std::vector<FMMergeProcess*> processorVector; for(int i = 0; i < opt::numThreads; ++i) { FMMergeProcess* pProcessor = new FMMergeProcess(pOverlapper, opt::minOverlap, &markedReads); processorVector.push_back(pProcessor); } SequenceProcessFramework::processSequencesParallel<SequenceWorkItem, FMMergeResult, FMMergeProcess, FMMergePostProcess>(opt::readsFile, processorVector, &postProcessor); for(size_t i = 0; i < processorVector.size(); ++i) { delete processorVector[i]; processorVector[i] = NULL; } } // Check that every bit was set in the bit vector size_t numSet = 0; size_t numTotal = pBWT->getNumStrings(); for(size_t i = 0; i < numTotal; ++i) { if(markedReads.test(i)) ++numSet; } // Get the number of strings in the BWT, this is used to pre-allocated the read table delete pOverlapper; delete pBWT; delete pRBWT; delete pWriter; // Cleanup delete pTimer; if(opt::numThreads > 1) pthread_exit(NULL); return 0; }