std::string runPartialBatch(mpi::communicator world, boost::shared_ptr< MatcherInterface > &matcher, ReadSet &_contigs, std::string _contigFile, ReadSet & changedContigs, ReadSet & finalContigs, int batchIdx, int maxContigsPerBatch, SequenceLengthType minKmerSize, double minimumCoverage, SequenceLengthType maxKmerSize, SequenceLengthType maxExtend, SequenceLengthType kmerStep) { LOG_DEBUG(1, "Starting runPartialBatch(" << batchIdx << " of " << _contigs.getSize() << "): " << MemoryUtils::getMemoryUsage()); ReadSet contigs; // new global contigs file a subset of original std::string extendLog; for(int i = batchIdx; i < (int) _contigs.getSize() && i < batchIdx + maxContigsPerBatch; i++) contigs.append(_contigs.getRead(i)); setGlobalReadSetConstants(world, contigs); if (contigs.getGlobalSize() == 0) return extendLog; std::string contigFile = DistributedOfstreamMap::writeGlobalReadSet(world, contigs, UniqueName::generateUniqueGlobalName(".tmp-batch" + UniqueName::getOurUniqueHandle() + "-", batchIdx), ".fasta", FormatOutput::Fasta()); MatcherInterface::MatchReadResults contigReadSet = matcher->match(contigs, contigFile); assert(contigs.getSize() == contigReadSet.size()); LOG_VERBOSE_OPTIONAL(1, world.rank() == 0, " batch " << contigs.getSize() << ". Matches made"); int numThreads = omp_get_max_threads(); std::string extendLogs[numThreads]; if (!Cap3Options::getOptions().getCap3Path().empty()) { Cap3 cap3Instances[numThreads]; #pragma omp parallel for for(int i = 0; i < numThreads; i++) { extendLogs[i] = cap3Instances[i].extendContigs(contigs, contigReadSet, changedContigs, finalContigs, minimumCoverage, i, numThreads); } } else if (!NewblerOptions::getOptions().getNewblerPath().empty()) { Newbler newblerInstances[numThreads]; #pragma omp parallel for for(int i = 0; i < numThreads; i++) { extendLogs[i] = newblerInstances[i].extendContigs(contigs, contigReadSet, changedContigs, finalContigs, minimumCoverage, i, numThreads); } } else { extendLog = extendContigsWithContigExtender(contigs, contigReadSet, changedContigs, finalContigs, minKmerSize, minimumCoverage, maxKmerSize, maxExtend, kmerStep); } for(int i = 0; i < numThreads; i++) extendLog += extendLogs[i]; unlink(contigFile.c_str()); return extendLog; }
int main(int argc, char *argv[]) { ForkDaemon::initialize(); ScopedMPIComm< DistributedNucleatingAssemblerOptions > world(argc, argv); Cleanup::prepare(); try { double timing1, timing2; timing1 = MPI_Wtime(); OptionsBaseInterface::FileListType &inputFiles = Options::getOptions().getInputFiles(); std::string contigFile = ContigExtenderBaseOptions::getOptions().getContigFile(); std::string finalContigFile; double minimumCoverage = ContigExtenderBaseOptions::getOptions().getMinimumCoverage(); long maxIterations = DistributedNucleatingAssemblerOptions::getOptions().getMaxIterations(); ReadSet reads; LOG_VERBOSE_OPTIONAL(1, world.rank() == 0, "Reading Input Files" ); reads.appendAllFiles(inputFiles, world.rank(), world.size()); reads.identifyPairs(); setGlobalReadSetConstants(world, reads); timing2 = MPI_Wtime(); LOG_VERBOSE_OPTIONAL(1, world.rank() == 0, "loaded " << reads.getGlobalSize() << " Reads, (local:" << reads.getSize() << " pair:" << reads.getPairSize() << ") in " << (timing2-timing1) << " seconds" ); LOG_DEBUG_GATHER(1, MemoryUtils::getMemoryUsage()); if (FilterKnownOdditiesOptions::getOptions().getSkipArtifactFilter() == 0) { LOG_VERBOSE_OPTIONAL(1, world.rank() == 0, "Preparing artifact filter: "); FilterKnownOddities filter; LOG_VERBOSE_OPTIONAL(2, world.rank() == 0, "Applying sequence artifact filter to Input Files"); unsigned long filtered = filter.applyFilter(reads); LOG_VERBOSE_GATHER(2, "local filter affected (trimmed/removed) " << filtered << " Reads "); LOG_DEBUG_GATHER(1, MemoryUtils::getMemoryUsage()); unsigned long allFiltered; mpi::reduce(world, filtered, allFiltered, std::plus<unsigned long>(), 0); LOG_VERBOSE_OPTIONAL(1, world.rank() == 0, "distributed filter (trimmed/removed) " << allFiltered << " Reads."); } boost::shared_ptr< MatcherInterface > matcher; if (KmerBaseOptions::getOptions().getKmerSize() == 0) { matcher.reset( new Vmatch(world, UniqueName::generateHashName(inputFiles), reads) ); } else { matcher.reset( new KmerMatch(world, reads) ); } SequenceLengthType minKmerSize, maxKmerSize, kmerStep, maxExtend; ContigExtender<KS>::getMinMaxKmerSize(reads, minKmerSize, maxKmerSize, kmerStep); maxKmerSize = boost::mpi::all_reduce(world, maxKmerSize, mpi::minimum< SequenceLengthType>()); LOG_VERBOSE_OPTIONAL(1, world.rank() == 0, "Kmer size ranges: " << minKmerSize << "\t" << maxKmerSize << "\t" << kmerStep); maxExtend = maxKmerSize; timing1 = timing2; timing2 = MPI_Wtime(); LOG_VERBOSE_OPTIONAL(1, world.rank() == 0, "Prepared Matcher indexes in " << (timing2-timing1) << " seconds"); ReadSet finalContigs; ReadSet contigs; contigs.appendFastaFile(contigFile, world.rank(), world.size()); int maxContigsPerBatch = DistributedNucleatingAssemblerOptions::getOptions().getMaxContigsPerBatch(); short iteration = 0; while (++iteration <= maxIterations) { LOG_DEBUG_GATHER(1, "Iteration " << iteration << " " << MemoryUtils::getMemoryUsage()); int batchIdx = 0; matcher->resetTimes("Start Iteration", MPI_Wtime()); setGlobalReadSetConstants(world, contigs); LOG_VERBOSE_OPTIONAL(1, world.rank() == 0, "Iteration: " << iteration << ". Contig File: " << contigFile << ". contains " << contigs.getGlobalSize() << " Reads"); if (contigs.getGlobalSize() == 0) { LOG_VERBOSE_OPTIONAL(1, true, "There are no contigs to extend in " << contigFile); break; } std::string extendLog; ReadSet changedContigs; int lastBatch = contigs.getSize(); MPI_Allreduce(MPI_IN_PLACE, &lastBatch, 1, MPI_INT, MPI_MAX, world); LOG_DEBUG_OPTIONAL(1, world.rank() == 0, "Iteration: " << iteration << " Last batch is " << lastBatch); while (batchIdx < lastBatch) { extendLog += runPartialBatch(world, matcher, contigs, contigFile, changedContigs, finalContigs, batchIdx, maxContigsPerBatch, minKmerSize, minimumCoverage, maxKmerSize, maxExtend, kmerStep); batchIdx += maxContigsPerBatch; } matcher->recordTime("extendContigs", MPI_Wtime()); LOG_DEBUG_GATHER(1, (extendLog)); finishLongContigs(DistributedNucleatingAssemblerOptions::getOptions().getMaxContigLength(), changedContigs, finalContigs); LOG_DEBUG_GATHER(1, "Changed contigs: " << changedContigs.getSize() << " finalContigs: " << finalContigs.getSize()); setGlobalReadSetConstants(world, changedContigs); setGlobalReadSetConstants(world, finalContigs); LOG_VERBOSE_OPTIONAL(1, world.rank() == 0, "Changed contigs: " << changedContigs.getGlobalSize() << " finalContigs: " << finalContigs.getGlobalSize()); std::string oldFinalContigFile = finalContigFile; std::string oldContigFile = contigFile; { // write out the state of the contig files (so far) so we do not loose them DistributedOfstreamMap om(world, Options::getOptions().getOutputFile(), ""); om.setBuildInMemory(); if (finalContigs.getGlobalSize() > 0) { std::string fileKey = "final-" + boost::lexical_cast< std::string>(iteration); finalContigs.writeAll(om.getOfstream(fileKey), FormatOutput::Fasta()); finalContigFile = om.getRealFilePath(fileKey); } if (changedContigs.getGlobalSize() > 0) { std::string filekey = "-inputcontigs-" + boost::lexical_cast< std::string>(iteration) + ".fasta"; changedContigs.writeAll(om.getOfstream(filekey), FormatOutput::Fasta()); contigFile = om.getRealFilePath(filekey); } contigs = changedContigs; } if (world.rank() == 0) { // preserve the final contigs in case of crash unlink(Options::getOptions().getOutputFile().c_str()); link(finalContigFile.c_str(), Options::getOptions().getOutputFile().c_str()); } matcher->recordTime("writeFinalTime", MPI_Wtime()); if (!Log::isDebug(1) && world.rank() == 0) { // remove most recent contig files (if not debugging) if (!oldFinalContigFile.empty()) { LOG_VERBOSE_OPTIONAL(1, true, "Removing " << oldFinalContigFile); unlink(oldFinalContigFile.c_str()); } if (ContigExtenderBaseOptions::getOptions().getContigFile().compare( oldContigFile) != 0) { LOG_VERBOSE_OPTIONAL(1, true, "Removing " << oldContigFile); unlink(oldContigFile.c_str()); } } if (changedContigs.getGlobalSize() == 0) { LOG_VERBOSE_OPTIONAL(1, world.rank() == 1, "No more contigs to extend " << changedContigs.getSize()); break; } matcher->recordTime("finishIteration", MPI_Wtime()); LOG_DEBUG_GATHER(1, matcher->getTimes("") + ". " + MemoryUtils::getMemoryUsage()); } matcher.reset(); // release the matcher interface if (world.rank() == 0 && !Log::isDebug(1)) { if (ContigExtenderBaseOptions::getOptions().getContigFile().compare( contigFile) != 0) { LOG_DEBUG_OPTIONAL(1, true, "Removing " << contigFile); unlink(contigFile.c_str()); } } // write final contigs (and any unfinished contigs still remaining) finalContigs.append(contigs); std::string tmpFinalFile = DistributedOfstreamMap::writeGlobalReadSet(world, finalContigs, Options::getOptions().getOutputFile(), ".tmp", FormatOutput::Fasta()); if (world.rank() == 0 && !finalContigFile.empty()) { LOG_DEBUG_OPTIONAL(1, true, "Removing " << finalContigFile); unlink(finalContigFile.c_str()); } finalContigFile = tmpFinalFile; if (world.rank() == 0) { unlink(Options::getOptions().getOutputFile().c_str()); rename(finalContigFile.c_str(), Options::getOptions().getOutputFile().c_str()); } finalContigFile = Options::getOptions().getOutputFile(); LOG_VERBOSE_OPTIONAL(1, world.rank() == 0, "Final contigs are in: " << finalContigFile); LOG_VERBOSE_OPTIONAL(1, world.rank() == 0, "Finished"); ForkDaemon::finalize(); } catch (std::exception &e) { LOG_ERROR(1, "DistributedNucleatingAssembler threw an exception! Aborting..." << e.what()); world.abort(1); } catch (...) { LOG_ERROR(1, "DistributedNucleatingAssembler threw an error!" ); world.abort(1); } return 0; }