/// Merges the storage key of every element of \p other into the element of
/// \p ss that has the same index.
///
/// Elements of \p ss that already exist are merged in place through
/// mutable_reads(i); when \p other has more elements than \p ss, a new element
/// is appended with add_reads() for each extra index before merging into it.
/// Elements of \p ss beyond the size of \p other are left untouched.
///
/// \param ss    destination set; must provide reads_size(), mutable_reads(int)
///              and add_reads().
/// \param other source set; must provide reads_size() and reads(int).
template <typename StorageSet, typename ReadSet>
void mergeKeysFromStorageSet(StorageSet &ss, const ReadSet &other) {
    const int otherCount = other.reads_size();
    const int existingCount = ss.reads_size();

    // Only indices that exist in BOTH sets can be merged in place.  Bounding
    // this loop by ss.reads_size() alone would read other.reads(i) past the
    // end of `other` whenever `ss` already holds more elements.
    const int sharedCount = existingCount < otherCount ? existingCount : otherCount;

    int i = 0;
    for (; i < sharedCount; ++i) {
        mergeStorageKey(ss.mutable_reads(i), other.reads(i));
    }
    // Grow `ss` one element at a time for the remaining entries of `other`.
    for (; i < otherCount; ++i) {
        ss.add_reads();
        mergeStorageKey(ss.mutable_reads(i), other.reads(i));
    }
}
void CassandraStorage::executeRangeRead(const Bucket& bucket, SliceRange& range, CommitCallback cb, const String& timestamp) { ReadSet* rs = new ReadSet; bool success = true; try { *rs = mDB->db()->getColumnsValues(bucket.rawHexData(),CF_NAME, timestamp, range); } catch(...){ success = false; } if (rs->size()==0) success = false; mContext->mainStrand->post(std::tr1::bind(&CassandraStorage::completeRange, this, cb, success, rs)); }
// Verifies the outcome of a storage read against the expected outcome.
//
//  expected_result / result : expected and actual status codes; they must match.
//  expected                 : key/value pairs that a successful read must return.
//  rs                       : values actually returned (may be NULL).
//
// Values are only compared when both status codes are OH::Storage::SUCCESS.
// A NULL `rs` is accepted only if no values were expected.  Otherwise both
// sets must have the same size and every expected key must be present in
// `rs` with an equal value.
void checkReadValuesImpl(Result expected_result, ReadSet expected, Result result, ReadSet* rs) {
    TS_ASSERT_EQUALS(expected_result, result);
    if ((result != OH::Storage::SUCCESS) || (expected_result != OH::Storage::SUCCESS)) return;

    if (!rs) {
        TS_ASSERT_EQUALS(expected.size(), 0);
        return;
    }

    TS_ASSERT_EQUALS(expected.size(), rs->size());
    for (ReadSet::iterator it = expected.begin(); it != expected.end(); ++it) {
        // Look the key up once and compare through the iterator.  Using
        // (*rs)[key] here would insert an empty entry into the result set
        // under test whenever the key is missing.
        ReadSet::iterator found = rs->find(it->first);
        TS_ASSERT(found != rs->end());
        if (found == rs->end()) continue;
        TS_ASSERT_EQUALS(found->second, it->second);
    }
}
std::string extendContigsWithCap3(const ReadSet & contigs, ReadSet::ReadSetVector &contigReadSet, ReadSet & changedContigs, ReadSet & finalContigs, ReadSet::ReadSetSizeType minimumCoverage) { std::stringstream extendLog; int poolsWithoutMinimumCoverage = 0; // initialize per-thread Cap3 instances Cap3 cap3[omp_get_max_threads()]; #pragma omp parallel for for (long i = 0; i < (long) contigs.getSize(); i++) { const Read &oldRead = contigs.getRead(i); Read newRead = oldRead; SequenceLengthType oldLen = oldRead.getLength(), newLen = 0; ReadSet::ReadSetSizeType poolSize = contigReadSet[i].getSize(); double extTime = MPI_Wtime(); if (poolSize > minimumCoverage) { LOG_VERBOSE_OPTIONAL(2, true, "Extending " << oldRead.getName() << " with " << poolSize << " pool of reads"); newRead = cap3[omp_get_thread_num()].extendContig(oldRead, contigReadSet[i]); newLen = newRead.getLength(); } else { poolsWithoutMinimumCoverage++; } extTime = MPI_Wtime() - extTime; long deltaLen = (long)newLen - (long)oldLen; if (deltaLen > 0) { extendLog << std::endl << "Cap3 Extended " << oldRead.getName() << " " << deltaLen << " bases to " << newRead.getLength() << ": " << newRead.getName() << " with " << poolSize << " reads in the pool, in " << extTime << " sec"; //#pragma omp critical changedContigs.append(newRead); } else { extendLog << std::endl << "Did not extend " << oldRead.getName() << " with " << poolSize << " reads in the pool, in " << extTime << " sec"; //#pragma omp critical finalContigs.append(oldRead); } } LOG_VERBOSE_OPTIONAL(2, true, "Extended " << contigs.getSize() - poolsWithoutMinimumCoverage << " contigs out of " << contigs.getSize()); return extendLog.str(); }
/// Removes from \p Reads every instruction that \p ByInst may clobber.
///
/// A load is kept only if alias analysis reports that \p ByInst cannot write
/// to the load's operand.  Entries of \p Reads that are not loads are always
/// removed.  Nothing is removed when \p ByInst is a strong_retain,
/// retain_value, cond_fail or dealloc_stack.
static void removeWrittenTo(AliasAnalysis *AA, ReadSet &Reads,
                            SILInstruction *ByInst) {
  // Retains, cond_fails and dealloc_stacks never invalidate a recorded read.
  const bool IsIgnorable =
      isa<StrongRetainInst>(ByInst) || isa<RetainValueInst>(ByInst) ||
      isa<CondFailInst>(ByInst) || isa<DeallocStackInst>(ByInst);
  if (IsIgnorable)
    return;

  // Iterate over a copy because entries are erased from Reads while walking.
  SmallVector<SILInstruction *, 8> Snapshot(Reads.begin(), Reads.end());
  for (SILInstruction *Read : Snapshot) {
    auto *Load = dyn_cast<LoadInst>(Read);
    if (!Load || AA->mayWriteToMemory(ByInst, Load->getOperand())) {
      DEBUG(llvm::dbgs() << " mayWriteTo\n" << *ByInst << " to " << *Read << "\n");
      Reads.erase(Read);
    }
  }
}
// Loads `length` overlaps starting at `begin` into `dst` and links each of
// them to entries of `reads`.
//
// After loading, the value held in read_a_ / read_b_ of every overlap is
// interpreted as an index into `reads`; it is range-checked and then replaced
// by reads[index].  `reads` must not be empty.
void Depot::load_overlaps(OverlapSet& dst, uint32_t begin, uint32_t length,
    const ReadSet& reads) {

    ASSERT(reads.size() != 0, "Depot", "Empty read set!");

    load(dst, begin, length, overlap_data_, overlap_index_);

    for (auto& overlap: dst) {
        // first read of the overlap
        uint64_t id = (uint64_t) overlap->read_a_;
        ASSERT(id < reads.size(), "Depot", "Missing read %lu!", id);
        overlap->read_a_ = reads[id];

        // second read of the overlap
        id = (uint64_t) overlap->read_b_;
        ASSERT(id < reads.size(), "Depot", "Missing read %lu!", id);
        overlap->read_b_ = reads[id];
    }
}
void LoopTreeOptimization::analyzeCurrentLoop( std::unique_ptr<LoopNestSummary> &CurrSummary, ReadSet &SafeReads) { WriteSet &MayWrites = CurrSummary->MayWrites; SILLoop *Loop = CurrSummary->Loop; DEBUG(llvm::dbgs() << " Analyzing accesses.\n"); // Contains function calls in the loop, which only read from memory. SmallVector<ApplyInst *, 8> ReadOnlyApplies; for (auto *BB : Loop->getBlocks()) { for (auto &Inst : *BB) { // Ignore fix_lifetime instructions. if (isa<FixLifetimeInst>(&Inst)) continue; // Collect loads. auto LI = dyn_cast<LoadInst>(&Inst); if (LI) { if (!mayWriteTo(AA, MayWrites, LI)) SafeReads.insert(LI); continue; } if (auto *AI = dyn_cast<ApplyInst>(&Inst)) { // In contrast to load instructions, we first collect all read-only // function calls and add them later to SafeReads. SideEffectAnalysis::FunctionEffects E; SEA->getEffects(E, AI); auto MB = E.getMemBehavior(RetainObserveKind::ObserveRetains); if (MB <= SILInstruction::MemoryBehavior::MayRead) ReadOnlyApplies.push_back(AI); } if (Inst.mayHaveSideEffects()) { MayWrites.push_back(&Inst); // Remove clobbered loads we have seen before. removeWrittenTo(AA, SafeReads, &Inst); } } } for (auto *AI : ReadOnlyApplies) { if (!mayWriteTo(AA, SEA, MayWrites, AI)) SafeReads.insert(AI); } }
// Executes a commit. Runs in a separate thread, so the transaction is // passed in directly void CassandraStorage::executeCommit(const Bucket& bucket, Transaction* trans, CommitCallback cb, const String& timestamp) { ReadSet* rs = new ReadSet; Columns* columns = new Columns; Keys* eraseKeys = new Keys; Keys* readKeys = new Keys; for (Transaction::iterator it = trans->begin(); it != trans->end(); it++) { (*it).execute(bucket, columns, eraseKeys, readKeys, timestamp); } bool success = true; success = CassandraCommit(mDB, bucket, columns, eraseKeys, readKeys, rs, timestamp); if (rs->empty() || !success) { delete rs; rs = NULL; } mContext->mainStrand->post(std::tr1::bind(&CassandraStorage::completeCommit, this, trans, cb, success, rs)); }
std::string extendContigsWithContigExtender(ReadSet & contigs, ReadSet::ReadSetVector &contigReadSet, ReadSet & changedContigs, ReadSet & finalContigs, SequenceLengthType minKmerSize, double minimumCoverage, SequenceLengthType maxKmerSize, SequenceLengthType maxExtend, SequenceLengthType kmerStep) { std::stringstream extendLog; //#pragma omp parallel for for (ReadSet::ReadSetSizeType i = 0; i < contigs.getSize(); i++) { const Read &oldRead = contigs.getRead(i); Read newRead; SequenceLengthType oldLen = oldRead.getLength(), newLen = 0; ReadSet::ReadSetSizeType poolSize = contigReadSet[i].getSize(); SequenceLengthType myKmerSize = minKmerSize; if (poolSize > minimumCoverage) { LOG_VERBOSE_OPTIONAL(2, true, "kmer-Extending " << oldRead.getName() << " with " << poolSize << " pool of reads"); ReadSet myContig; myContig.append(oldRead); ReadSet newContig; while (newLen <= oldLen && myKmerSize <= maxKmerSize) { newContig = ContigExtender<KS>::extendContigs(myContig, contigReadSet[i], maxExtend, myKmerSize, myKmerSize); newLen = newContig.getRead(0).getLength(); myKmerSize += kmerStep; } newRead = newContig.getRead(0); } else { newRead = oldRead; } long deltaLen = (long) newLen - (long) oldLen; if (deltaLen > 0) { extendLog << std::endl << "Kmer Extended " << oldRead.getName() << " " << deltaLen << " bases to " << newRead.getLength() << ": " << newRead.getName() << " with " << poolSize << " reads in the pool K " << (myKmerSize - kmerStep); //#pragma omp critical changedContigs.append(newRead); } else { extendLog << std::endl << "Did not extend " << oldRead.getName() << " with " << poolSize << " reads in the pool"; //#pragma omp critical finalContigs.append(oldRead); } } return extendLog.str(); }
std::string runPartialBatch(mpi::communicator world, boost::shared_ptr< MatcherInterface > &matcher, ReadSet &_contigs, std::string _contigFile, ReadSet & changedContigs, ReadSet & finalContigs, int batchIdx, int maxContigsPerBatch, SequenceLengthType minKmerSize, double minimumCoverage, SequenceLengthType maxKmerSize, SequenceLengthType maxExtend, SequenceLengthType kmerStep) { LOG_DEBUG(1, "Starting runPartialBatch(" << batchIdx << " of " << _contigs.getSize() << "): " << MemoryUtils::getMemoryUsage()); ReadSet contigs; // new global contigs file a subset of original std::string extendLog; for(int i = batchIdx; i < (int) _contigs.getSize() && i < batchIdx + maxContigsPerBatch; i++) contigs.append(_contigs.getRead(i)); setGlobalReadSetConstants(world, contigs); if (contigs.getGlobalSize() == 0) return extendLog; std::string contigFile = DistributedOfstreamMap::writeGlobalReadSet(world, contigs, UniqueName::generateUniqueGlobalName(".tmp-batch" + UniqueName::getOurUniqueHandle() + "-", batchIdx), ".fasta", FormatOutput::Fasta()); MatcherInterface::MatchReadResults contigReadSet = matcher->match(contigs, contigFile); assert(contigs.getSize() == contigReadSet.size()); LOG_VERBOSE_OPTIONAL(1, world.rank() == 0, " batch " << contigs.getSize() << ". 
Matches made"); int numThreads = omp_get_max_threads(); std::string extendLogs[numThreads]; if (!Cap3Options::getOptions().getCap3Path().empty()) { Cap3 cap3Instances[numThreads]; #pragma omp parallel for for(int i = 0; i < numThreads; i++) { extendLogs[i] = cap3Instances[i].extendContigs(contigs, contigReadSet, changedContigs, finalContigs, minimumCoverage, i, numThreads); } } else if (!NewblerOptions::getOptions().getNewblerPath().empty()) { Newbler newblerInstances[numThreads]; #pragma omp parallel for for(int i = 0; i < numThreads; i++) { extendLogs[i] = newblerInstances[i].extendContigs(contigs, contigReadSet, changedContigs, finalContigs, minimumCoverage, i, numThreads); } } else { extendLog = extendContigsWithContigExtender(contigs, contigReadSet, changedContigs, finalContigs, minKmerSize, minimumCoverage, maxKmerSize, maxExtend, kmerStep); } for(int i = 0; i < numThreads; i++) extendLog += extendLogs[i]; unlink(contigFile.c_str()); return extendLog; }
void finishLongContigs(long maxContigLength, ReadSet &changedContigs, ReadSet &finalContigs) { ReadSet keepContigs; for(long i = 0; i < (long) changedContigs.getSize(); i++) { const Read &read = changedContigs.getRead(i); if ((long) read.getLength() >= maxContigLength) { LOG_VERBOSE_OPTIONAL(1, true, read.getName() << " (" << read.getLength() << ") has exceeded maxContiglength, terminating extension"); finalContigs.append(read); } else keepContigs.append(read); } changedContigs.swap(keepContigs); }
/// Checks if \p Inst has no side effects which prevent hoisting. /// The \a SafeReads set contain instructions which we already proved to have /// no such side effects. static bool hasNoSideEffect(SILInstruction *Inst, ReadSet &SafeReads) { // We can (and must) hoist cond_fail instructions if the operand is // invariant. We must hoist them so that we preserve memory safety. A // cond_fail that would have protected (executed before) a memory access // must - after hoisting - also be executed before said access. if (isa<CondFailInst>(Inst)) return true; // Can't hoist if the instruction could read from memory and is not marked // as safe. if (SafeReads.count(Inst)) return true; if (Inst->getMemoryBehavior() == SILInstruction::MemoryBehavior::None) return true; return false; }
// Stores all reads of `src` through the depot's read data/index members.
// `src` must not be empty; this is enforced with ASSERT.
void Depot::store_reads(const ReadSet& src) {

    ASSERT(src.size() != 0, "Depot", "Can not store an empty ReadSet!");

    store(src, read_data_, read_index_);
}
int main(int argc, char *argv[]) { if (!Fastq2FastaOptions::parseOpts(argc, argv)) exit(1); Cleanup::prepare(); OptionsBaseInterface::FileListType &inputs = Options::getOptions().getInputFiles(); long splitSizeBase = Fastq2FastaOptions::getOptions().getSplitSizeMegaBase() * 1000000; ReadSet reads; LOG_VERBOSE(1, "Reading Input Files" ); reads.appendAllFiles(inputs); LOG_VERBOSE(1, "loaded " << reads.getSize() << " Reads, " << reads.getBaseCount() << " Bases "); reads.identifyPairs(); long currentBase = 0; OfstreamMap ofmap; string outputFilename = Options::getOptions().getOutputFile(); bool hasOfMap = false; ostream *out = &cout; int partitionNum = 1; if (!outputFilename.empty()) { ofmap = OfstreamMap(outputFilename); hasOfMap = true; } else { splitSizeBase = 0; // do not support splitting when no output is specified } bool splitPairs = Fastq2FastaOptions::getOptions().getSplitPairs() != 0; string filekey; for(ReadSet::ReadSetSizeType pairIdx = 0 ; pairIdx < reads.getPairSize(); pairIdx++) { ReadSet::Pair pair = reads.getPair(pairIdx); ReadSet::ReadSetSizeType lesserIdx = std::min(pair.read1, pair.read2); if (hasOfMap) { filekey = reads.getReadFileNamePrefix(lesserIdx); } else { filekey.clear(); } if (splitSizeBase > 0) { SequenceLengthType len = reads.getRead(lesserIdx).getLength(); currentBase += len; if (currentBase > splitSizeBase) { // new output handle partitionNum++; currentBase = len; } filekey += "-" + boost::lexical_cast<string>( partitionNum ); } if (reads.isValidRead(pair.read1) && reads.isValidRead(pair.read2)) { const Read read = reads.getRead(pair.read1); if (hasOfMap) { if (splitPairs) { filekey += "-1"; } out = &( ofmap.getOfstream(filekey) ); } reads.getRead(pair.read1).write(*out); if (splitPairs) { filekey[filekey.length()-1] = '2'; out = &( ofmap.getOfstream(filekey) ); } reads.getRead(pair.read2).write(*out); } else { if (hasOfMap) { out = &( ofmap.getOfstream(filekey) ); } reads.getRead(lesserIdx).write(*out); } } }
template <class ReadSet> SQLiteObjectStorage::Error SQLiteObjectStorage::applyReadSet(const SQLiteDBPtr& db, const ReadSet& rs, Protocol::Response&retval) { int num_reads=rs.reads_size(); retval.clear_reads(); while (retval.reads_size()<num_reads) retval.add_reads(); SQLiteObjectStorage::Error databaseError=None; for (int rs_it=0;rs_it<num_reads;++rs_it) { String object_hex = getTableName(rs.reads(rs_it)); String key_name = getKeyName(rs.reads(rs_it)); String value_query = "SELECT value FROM "; value_query += "\"" TABLE_NAME "\""; value_query += " WHERE object == x\'" + object_hex + "\' AND key == ?"; int rc; char* remain; sqlite3_stmt* value_query_stmt; bool newStep=true; bool locked=false; rc = sqlite3_prepare_v2(db->db(), value_query.c_str(), -1, &value_query_stmt, (const char**)&remain); SQLite::check_sql_error(db->db(), rc, NULL, "Error preparing value query statement"); if (rc==SQLITE_OK) { rc = sqlite3_bind_text(value_query_stmt, 1, key_name.data(), (int)key_name.size(), SQLITE_TRANSIENT); SQLite::check_sql_error(db->db(), rc, NULL, "Error binding key name to value query statement"); if (rc==SQLITE_OK) { int step_rc = sqlite3_step(value_query_stmt); while(step_rc == SQLITE_ROW) { newStep=false; retval.reads(rs_it).set_data((const char*)sqlite3_column_text(value_query_stmt, 0),sqlite3_column_bytes(value_query_stmt, 0)); step_rc = sqlite3_step(value_query_stmt); } if (step_rc != SQLITE_DONE) { // reset the statement so it'll clean up properly rc = sqlite3_reset(value_query_stmt); SQLite::check_sql_error(db->db(), rc, NULL, "Error finalizing value query statement"); if (rc==SQLITE_LOCKED||rc==SQLITE_BUSY) locked=true; } } } rc = sqlite3_finalize(value_query_stmt); SQLite::check_sql_error(db->db(), rc, NULL, "Error finalizing value query statement"); if (locked||rc == SQLITE_LOCKED||rc==SQLITE_BUSY) { retval.clear_reads(); return DatabaseLocked; } if (newStep) { retval.reads(rs_it).clear_data(); 
retval.reads(rs_it).set_return_status(Protocol::StorageElement::KEY_MISSING); } if(rs.reads(rs_it).has_index()) { retval.reads(rs_it).set_index(rs.reads(rs_it).index()); } } if (rs.has_options()&&(rs.options()&Protocol::ReadWriteSet::RETURN_READ_NAMES)!=0) { // make sure read set is clear before each attempt mergeKeysFromStorageSet( retval, rs ); } return databaseError; }
// Loads the single read stored at position `index` and returns it.
//
// The read is fetched through load_reads() into a temporary ReadSet and the
// first element of that set is returned.
// NOTE(review): this assumes load_reads() always yields at least one element
// for a valid index, and that the caller takes over the returned read --
// confirm both against load_reads().
Read* Depot::load_read(uint32_t index) {

    ReadSet single;
    load_reads(single, index, 1);

    return single.front();
}
int main(int argc, char *argv[]) { ForkDaemon::initialize(); ScopedMPIComm< DistributedNucleatingAssemblerOptions > world(argc, argv); Cleanup::prepare(); try { double timing1, timing2; timing1 = MPI_Wtime(); OptionsBaseInterface::FileListType &inputFiles = Options::getOptions().getInputFiles(); std::string contigFile = ContigExtenderBaseOptions::getOptions().getContigFile(); std::string finalContigFile; double minimumCoverage = ContigExtenderBaseOptions::getOptions().getMinimumCoverage(); long maxIterations = DistributedNucleatingAssemblerOptions::getOptions().getMaxIterations(); ReadSet reads; LOG_VERBOSE_OPTIONAL(1, world.rank() == 0, "Reading Input Files" ); reads.appendAllFiles(inputFiles, world.rank(), world.size()); reads.identifyPairs(); setGlobalReadSetConstants(world, reads); timing2 = MPI_Wtime(); LOG_VERBOSE_OPTIONAL(1, world.rank() == 0, "loaded " << reads.getGlobalSize() << " Reads, (local:" << reads.getSize() << " pair:" << reads.getPairSize() << ") in " << (timing2-timing1) << " seconds" ); LOG_DEBUG_GATHER(1, MemoryUtils::getMemoryUsage()); if (FilterKnownOdditiesOptions::getOptions().getSkipArtifactFilter() == 0) { LOG_VERBOSE_OPTIONAL(1, world.rank() == 0, "Preparing artifact filter: "); FilterKnownOddities filter; LOG_VERBOSE_OPTIONAL(2, world.rank() == 0, "Applying sequence artifact filter to Input Files"); unsigned long filtered = filter.applyFilter(reads); LOG_VERBOSE_GATHER(2, "local filter affected (trimmed/removed) " << filtered << " Reads "); LOG_DEBUG_GATHER(1, MemoryUtils::getMemoryUsage()); unsigned long allFiltered; mpi::reduce(world, filtered, allFiltered, std::plus<unsigned long>(), 0); LOG_VERBOSE_OPTIONAL(1, world.rank() == 0, "distributed filter (trimmed/removed) " << allFiltered << " Reads."); } boost::shared_ptr< MatcherInterface > matcher; if (KmerBaseOptions::getOptions().getKmerSize() == 0) { matcher.reset( new Vmatch(world, UniqueName::generateHashName(inputFiles), reads) ); } else { matcher.reset( new KmerMatch(world, 
reads) ); } SequenceLengthType minKmerSize, maxKmerSize, kmerStep, maxExtend; ContigExtender<KS>::getMinMaxKmerSize(reads, minKmerSize, maxKmerSize, kmerStep); maxKmerSize = boost::mpi::all_reduce(world, maxKmerSize, mpi::minimum< SequenceLengthType>()); LOG_VERBOSE_OPTIONAL(1, world.rank() == 0, "Kmer size ranges: " << minKmerSize << "\t" << maxKmerSize << "\t" << kmerStep); maxExtend = maxKmerSize; timing1 = timing2; timing2 = MPI_Wtime(); LOG_VERBOSE_OPTIONAL(1, world.rank() == 0, "Prepared Matcher indexes in " << (timing2-timing1) << " seconds"); ReadSet finalContigs; ReadSet contigs; contigs.appendFastaFile(contigFile, world.rank(), world.size()); int maxContigsPerBatch = DistributedNucleatingAssemblerOptions::getOptions().getMaxContigsPerBatch(); short iteration = 0; while (++iteration <= maxIterations) { LOG_DEBUG_GATHER(1, "Iteration " << iteration << " " << MemoryUtils::getMemoryUsage()); int batchIdx = 0; matcher->resetTimes("Start Iteration", MPI_Wtime()); setGlobalReadSetConstants(world, contigs); LOG_VERBOSE_OPTIONAL(1, world.rank() == 0, "Iteration: " << iteration << ". Contig File: " << contigFile << ". 
contains " << contigs.getGlobalSize() << " Reads"); if (contigs.getGlobalSize() == 0) { LOG_VERBOSE_OPTIONAL(1, true, "There are no contigs to extend in " << contigFile); break; } std::string extendLog; ReadSet changedContigs; int lastBatch = contigs.getSize(); MPI_Allreduce(MPI_IN_PLACE, &lastBatch, 1, MPI_INT, MPI_MAX, world); LOG_DEBUG_OPTIONAL(1, world.rank() == 0, "Iteration: " << iteration << " Last batch is " << lastBatch); while (batchIdx < lastBatch) { extendLog += runPartialBatch(world, matcher, contigs, contigFile, changedContigs, finalContigs, batchIdx, maxContigsPerBatch, minKmerSize, minimumCoverage, maxKmerSize, maxExtend, kmerStep); batchIdx += maxContigsPerBatch; } matcher->recordTime("extendContigs", MPI_Wtime()); LOG_DEBUG_GATHER(1, (extendLog)); finishLongContigs(DistributedNucleatingAssemblerOptions::getOptions().getMaxContigLength(), changedContigs, finalContigs); LOG_DEBUG_GATHER(1, "Changed contigs: " << changedContigs.getSize() << " finalContigs: " << finalContigs.getSize()); setGlobalReadSetConstants(world, changedContigs); setGlobalReadSetConstants(world, finalContigs); LOG_VERBOSE_OPTIONAL(1, world.rank() == 0, "Changed contigs: " << changedContigs.getGlobalSize() << " finalContigs: " << finalContigs.getGlobalSize()); std::string oldFinalContigFile = finalContigFile; std::string oldContigFile = contigFile; { // write out the state of the contig files (so far) so we do not loose them DistributedOfstreamMap om(world, Options::getOptions().getOutputFile(), ""); om.setBuildInMemory(); if (finalContigs.getGlobalSize() > 0) { std::string fileKey = "final-" + boost::lexical_cast< std::string>(iteration); finalContigs.writeAll(om.getOfstream(fileKey), FormatOutput::Fasta()); finalContigFile = om.getRealFilePath(fileKey); } if (changedContigs.getGlobalSize() > 0) { std::string filekey = "-inputcontigs-" + boost::lexical_cast< std::string>(iteration) + ".fasta"; changedContigs.writeAll(om.getOfstream(filekey), FormatOutput::Fasta()); contigFile = 
om.getRealFilePath(filekey); } contigs = changedContigs; } if (world.rank() == 0) { // preserve the final contigs in case of crash unlink(Options::getOptions().getOutputFile().c_str()); link(finalContigFile.c_str(), Options::getOptions().getOutputFile().c_str()); } matcher->recordTime("writeFinalTime", MPI_Wtime()); if (!Log::isDebug(1) && world.rank() == 0) { // remove most recent contig files (if not debugging) if (!oldFinalContigFile.empty()) { LOG_VERBOSE_OPTIONAL(1, true, "Removing " << oldFinalContigFile); unlink(oldFinalContigFile.c_str()); } if (ContigExtenderBaseOptions::getOptions().getContigFile().compare( oldContigFile) != 0) { LOG_VERBOSE_OPTIONAL(1, true, "Removing " << oldContigFile); unlink(oldContigFile.c_str()); } } if (changedContigs.getGlobalSize() == 0) { LOG_VERBOSE_OPTIONAL(1, world.rank() == 1, "No more contigs to extend " << changedContigs.getSize()); break; } matcher->recordTime("finishIteration", MPI_Wtime()); LOG_DEBUG_GATHER(1, matcher->getTimes("") + ". " + MemoryUtils::getMemoryUsage()); } matcher.reset(); // release the matcher interface if (world.rank() == 0 && !Log::isDebug(1)) { if (ContigExtenderBaseOptions::getOptions().getContigFile().compare( contigFile) != 0) { LOG_DEBUG_OPTIONAL(1, true, "Removing " << contigFile); unlink(contigFile.c_str()); } } // write final contigs (and any unfinished contigs still remaining) finalContigs.append(contigs); std::string tmpFinalFile = DistributedOfstreamMap::writeGlobalReadSet(world, finalContigs, Options::getOptions().getOutputFile(), ".tmp", FormatOutput::Fasta()); if (world.rank() == 0 && !finalContigFile.empty()) { LOG_DEBUG_OPTIONAL(1, true, "Removing " << finalContigFile); unlink(finalContigFile.c_str()); } finalContigFile = tmpFinalFile; if (world.rank() == 0) { unlink(Options::getOptions().getOutputFile().c_str()); rename(finalContigFile.c_str(), Options::getOptions().getOutputFile().c_str()); } finalContigFile = Options::getOptions().getOutputFile(); LOG_VERBOSE_OPTIONAL(1, 
world.rank() == 0, "Final contigs are in: " << finalContigFile); LOG_VERBOSE_OPTIONAL(1, world.rank() == 0, "Finished"); ForkDaemon::finalize(); } catch (std::exception &e) { LOG_ERROR(1, "DistributedNucleatingAssembler threw an exception! Aborting..." << e.what()); world.abort(1); } catch (...) { LOG_ERROR(1, "DistributedNucleatingAssembler threw an error!" ); world.abort(1); } return 0; }
int main(int argc, char *argv[]) { ScopedMPIComm< MPIEstimateSizeOptions > world(argc, argv); Cleanup::prepare(); try { MemoryUtils::getMemoryUsage(); std::string outputFilename = Options::getOptions().getOutputFile(); OptionsBaseInterface::FileListType &inputs = Options::getOptions().getInputFiles(); LOG_VERBOSE_OPTIONAL(1, world.rank() == 0, "Reading Input Files"); long partitions = MPIEstimateSizeOptions::getOptions().getSamplePartitions(); double maxFraction = MPIEstimateSizeOptions::getOptions().getMaxSampleFraction(); assert(maxFraction < 1.0); double fraction = 0.0; long totalPartitions = (long) partitions / maxFraction; unsigned long totalReads = 0; unsigned long totalBases = 0; long rawKmers = 0; KS spectrum(world, 0); for (long iter = 0 ; iter < partitions && fraction < maxFraction; iter++) { fraction += (double) 1. / (double) totalPartitions; LOG_VERBOSE_OPTIONAL(1, world.rank() == 0, "Starting iteration " << iter << " at " << fraction*100 << "%"); ReadSet reads; reads.appendAllFiles(inputs, world.rank()*totalPartitions + iter, world.size()*totalPartitions); setGlobalReadSetConstants(world, reads); unsigned long counts[3], totalCounts[3]; mpi::all_reduce(world, (unsigned long*) counts, 3, (unsigned long*) totalCounts, std::plus<unsigned long>()); totalReads += totalCounts[0]; totalBases += totalCounts[2]; if (KmerBaseOptions::getOptions().getKmerSize() > 0) { // lazy allocate if (rawKmers == 0) { rawKmers = KS::estimateRawKmers(world, inputs); spectrum = KS(world, rawKmers); } spectrum.buildKmerSpectrum(reads); spectrum.trackSpectrum(true); LOG_DEBUG_OPTIONAL(1, true, "SizeTracker: " << spectrum.getSizeTracker().toString()); if (Log::isDebug(1)) { KS::MPIHistogram h = spectrum._getHistogram(false); std::string hist = h.toString(); LOG_DEBUG_OPTIONAL(1, world.rank() == 0, "Collective Kmer Histogram\n" << hist); } } } std::string hist = spectrum.getHistogram(false); LOG_VERBOSE_OPTIONAL(1, world.rank() == 0, "Collective Kmer Histogram\n" << hist); 
KS::SizeTracker reducedSizeTracker = spectrum.reduceSizeTracker(world); std::string reducedSizeTrackerFile = outputFilename; if (reducedSizeTrackerFile.empty()) { reducedSizeTrackerFile = UniqueName::generateUniqueName("tmp-estimateSize"); } float errorRate = TrackingData::getErrorRate(); LOG_DEBUG_OPTIONAL(1, true, "Kmer error rate: " << errorRate); float commonErrorRate = 0.0; MPI_Reduce(&errorRate, &commonErrorRate, 1, MPI_FLOAT, MPI_SUM, 0, world); commonErrorRate /= (float) world.size(); if (world.rank() == 0) { { LOG_VERBOSE_OPTIONAL(1, true, "Writing size tracking file to:" << reducedSizeTrackerFile); LOG_DEBUG_OPTIONAL(1, true, "SizeTracker:\n" << reducedSizeTracker.toString()); OfstreamMap ofm(reducedSizeTrackerFile, ""); ofm.getOfstream("") << reducedSizeTracker.toString(); } std::string basePath = FileUtils::getBasePath(argv[0]); std::stringstream cmdss; cmdss << "Rscript " << basePath << "/EstimateSize.R " << reducedSizeTrackerFile; // << " " << (commonErrorRate*1.25); std::string command = cmdss.str(); if (!FileUtils::getBasePath("Rscript").empty() || basePath.empty()) { LOG_DEBUG_OPTIONAL(1, true, "Executing: " << command); IPipestream ipipe(command); double errorRate = 0.0, genomeSize = 0.0; bool readValues = false; while (ipipe.good() && !ipipe.eof()) { std::string line; std::getline(ipipe, line); LOG_DEBUG_OPTIONAL(2, true, "Read: " << line); if (line.find("errorRate") != std::string::npos) { LOG_DEBUG_OPTIONAL(2, true, "Found headers in: " << line); readValues = true; continue; } if (readValues) { readValues = false; LOG_DEBUG_OPTIONAL(2, true, "Reading errorRate and GenomeSize from " << line); std::stringstream ss; ss << line; ss >> errorRate; ss >> genomeSize; } } LOG_VERBOSE_OPTIONAL(1, true, "Estimated Kmer-quality errorRate: " << commonErrorRate); LOG_VERBOSE_OPTIONAL(1, true, "Distributed readCount: " << totalReads); LOG_VERBOSE_OPTIONAL(1, true, "Estimated fractionRead: " << fraction); LOG_VERBOSE_OPTIONAL(1, true, "Estimated errorRate: " 
<< errorRate); LOG_VERBOSE_OPTIONAL(1, true, "Estimated genomeSize: " << genomeSize); ipipe.close(); double totalRawKmers = reducedSizeTracker.getLastElement().rawKmers / fraction; double estimatedUniqueKmers = totalRawKmers * errorRate + genomeSize; LOG_VERBOSE_OPTIONAL(1, true, "Estimated totalRawKmers: " << totalRawKmers); LOG_VERBOSE_OPTIONAL(1, true, "Estimated totalUniqueKmers: " << estimatedUniqueKmers); if (reducedSizeTrackerFile.compare(outputFilename) != 0) { LOG_DEBUG_OPTIONAL(1, true, "Removing temporary size tracking file: " << reducedSizeTrackerFile); unlink(reducedSizeTrackerFile.c_str()); } } else {