std::string extendContigsWithContigExtender(ReadSet & contigs, ReadSet::ReadSetVector &contigReadSet, ReadSet & changedContigs, ReadSet & finalContigs, SequenceLengthType minKmerSize, double minimumCoverage, SequenceLengthType maxKmerSize, SequenceLengthType maxExtend, SequenceLengthType kmerStep) { std::stringstream extendLog; //#pragma omp parallel for for (ReadSet::ReadSetSizeType i = 0; i < contigs.getSize(); i++) { const Read &oldRead = contigs.getRead(i); Read newRead; SequenceLengthType oldLen = oldRead.getLength(), newLen = 0; ReadSet::ReadSetSizeType poolSize = contigReadSet[i].getSize(); SequenceLengthType myKmerSize = minKmerSize; if (poolSize > minimumCoverage) { LOG_VERBOSE_OPTIONAL(2, true, "kmer-Extending " << oldRead.getName() << " with " << poolSize << " pool of reads"); ReadSet myContig; myContig.append(oldRead); ReadSet newContig; while (newLen <= oldLen && myKmerSize <= maxKmerSize) { newContig = ContigExtender<KS>::extendContigs(myContig, contigReadSet[i], maxExtend, myKmerSize, myKmerSize); newLen = newContig.getRead(0).getLength(); myKmerSize += kmerStep; } newRead = newContig.getRead(0); } else { newRead = oldRead; } long deltaLen = (long) newLen - (long) oldLen; if (deltaLen > 0) { extendLog << std::endl << "Kmer Extended " << oldRead.getName() << " " << deltaLen << " bases to " << newRead.getLength() << ": " << newRead.getName() << " with " << poolSize << " reads in the pool K " << (myKmerSize - kmerStep); //#pragma omp critical changedContigs.append(newRead); } else { extendLog << std::endl << "Did not extend " << oldRead.getName() << " with " << poolSize << " reads in the pool"; //#pragma omp critical finalContigs.append(oldRead); } } return extendLog.str(); }
void finishLongContigs(long maxContigLength, ReadSet &changedContigs, ReadSet &finalContigs) { ReadSet keepContigs; for(long i = 0; i < (long) changedContigs.getSize(); i++) { const Read &read = changedContigs.getRead(i); if ((long) read.getLength() >= maxContigLength) { LOG_VERBOSE_OPTIONAL(1, true, read.getName() << " (" << read.getLength() << ") has exceeded maxContiglength, terminating extension"); finalContigs.append(read); } else keepContigs.append(read); } changedContigs.swap(keepContigs); }
std::string runPartialBatch(mpi::communicator world, boost::shared_ptr< MatcherInterface > &matcher, ReadSet &_contigs, std::string _contigFile, ReadSet & changedContigs, ReadSet & finalContigs, int batchIdx, int maxContigsPerBatch, SequenceLengthType minKmerSize, double minimumCoverage, SequenceLengthType maxKmerSize, SequenceLengthType maxExtend, SequenceLengthType kmerStep) { LOG_DEBUG(1, "Starting runPartialBatch(" << batchIdx << " of " << _contigs.getSize() << "): " << MemoryUtils::getMemoryUsage()); ReadSet contigs; // new global contigs file a subset of original std::string extendLog; for(int i = batchIdx; i < (int) _contigs.getSize() && i < batchIdx + maxContigsPerBatch; i++) contigs.append(_contigs.getRead(i)); setGlobalReadSetConstants(world, contigs); if (contigs.getGlobalSize() == 0) return extendLog; std::string contigFile = DistributedOfstreamMap::writeGlobalReadSet(world, contigs, UniqueName::generateUniqueGlobalName(".tmp-batch" + UniqueName::getOurUniqueHandle() + "-", batchIdx), ".fasta", FormatOutput::Fasta()); MatcherInterface::MatchReadResults contigReadSet = matcher->match(contigs, contigFile); assert(contigs.getSize() == contigReadSet.size()); LOG_VERBOSE_OPTIONAL(1, world.rank() == 0, " batch " << contigs.getSize() << ". Matches made"); int numThreads = omp_get_max_threads(); std::string extendLogs[numThreads]; if (!Cap3Options::getOptions().getCap3Path().empty()) { Cap3 cap3Instances[numThreads]; #pragma omp parallel for for(int i = 0; i < numThreads; i++) { extendLogs[i] = cap3Instances[i].extendContigs(contigs, contigReadSet, changedContigs, finalContigs, minimumCoverage, i, numThreads); } } else if (!NewblerOptions::getOptions().getNewblerPath().empty()) { Newbler newblerInstances[numThreads]; #pragma omp parallel for for(int i = 0; i < numThreads; i++) { extendLogs[i] = newblerInstances[i].extendContigs(contigs, contigReadSet, changedContigs, finalContigs, minimumCoverage, i, numThreads); } } else { extendLog = extendContigsWithContigExtender(contigs, contigReadSet, changedContigs, finalContigs, minKmerSize, minimumCoverage, maxKmerSize, maxExtend, kmerStep); } for(int i = 0; i < numThreads; i++) extendLog += extendLogs[i]; unlink(contigFile.c_str()); return extendLog; }
std::string extendContigsWithCap3(const ReadSet & contigs, ReadSet::ReadSetVector &contigReadSet, ReadSet & changedContigs, ReadSet & finalContigs, ReadSet::ReadSetSizeType minimumCoverage) { std::stringstream extendLog; int poolsWithoutMinimumCoverage = 0; // initialize per-thread Cap3 instances Cap3 cap3[omp_get_max_threads()]; #pragma omp parallel for for (long i = 0; i < (long) contigs.getSize(); i++) { const Read &oldRead = contigs.getRead(i); Read newRead = oldRead; SequenceLengthType oldLen = oldRead.getLength(), newLen = 0; ReadSet::ReadSetSizeType poolSize = contigReadSet[i].getSize(); double extTime = MPI_Wtime(); if (poolSize > minimumCoverage) { LOG_VERBOSE_OPTIONAL(2, true, "Extending " << oldRead.getName() << " with " << poolSize << " pool of reads"); newRead = cap3[omp_get_thread_num()].extendContig(oldRead, contigReadSet[i]); newLen = newRead.getLength(); } else { poolsWithoutMinimumCoverage++; } extTime = MPI_Wtime() - extTime; long deltaLen = (long)newLen - (long)oldLen; if (deltaLen > 0) { extendLog << std::endl << "Cap3 Extended " << oldRead.getName() << " " << deltaLen << " bases to " << newRead.getLength() << ": " << newRead.getName() << " with " << poolSize << " reads in the pool, in " << extTime << " sec"; //#pragma omp critical changedContigs.append(newRead); } else { extendLog << std::endl << "Did not extend " << oldRead.getName() << " with " << poolSize << " reads in the pool, in " << extTime << " sec"; //#pragma omp critical finalContigs.append(oldRead); } } LOG_VERBOSE_OPTIONAL(2, true, "Extended " << contigs.getSize() - poolsWithoutMinimumCoverage << " contigs out of " << contigs.getSize()); return extendLog.str(); }
int main(int argc, char *argv[]) { if (!Fastq2FastaOptions::parseOpts(argc, argv)) exit(1); Cleanup::prepare(); OptionsBaseInterface::FileListType &inputs = Options::getOptions().getInputFiles(); long splitSizeBase = Fastq2FastaOptions::getOptions().getSplitSizeMegaBase() * 1000000; ReadSet reads; LOG_VERBOSE(1, "Reading Input Files" ); reads.appendAllFiles(inputs); LOG_VERBOSE(1, "loaded " << reads.getSize() << " Reads, " << reads.getBaseCount() << " Bases "); reads.identifyPairs(); long currentBase = 0; OfstreamMap ofmap; string outputFilename = Options::getOptions().getOutputFile(); bool hasOfMap = false; ostream *out = &cout; int partitionNum = 1; if (!outputFilename.empty()) { ofmap = OfstreamMap(outputFilename); hasOfMap = true; } else { splitSizeBase = 0; // do not support splitting when no output is specified } bool splitPairs = Fastq2FastaOptions::getOptions().getSplitPairs() != 0; string filekey; for(ReadSet::ReadSetSizeType pairIdx = 0 ; pairIdx < reads.getPairSize(); pairIdx++) { ReadSet::Pair pair = reads.getPair(pairIdx); ReadSet::ReadSetSizeType lesserIdx = std::min(pair.read1, pair.read2); if (hasOfMap) { filekey = reads.getReadFileNamePrefix(lesserIdx); } else { filekey.clear(); } if (splitSizeBase > 0) { SequenceLengthType len = reads.getRead(lesserIdx).getLength(); currentBase += len; if (currentBase > splitSizeBase) { // new output handle partitionNum++; currentBase = len; } filekey += "-" + boost::lexical_cast<string>( partitionNum ); } if (reads.isValidRead(pair.read1) && reads.isValidRead(pair.read2)) { const Read read = reads.getRead(pair.read1); if (hasOfMap) { if (splitPairs) { filekey += "-1"; } out = &( ofmap.getOfstream(filekey) ); } reads.getRead(pair.read1).write(*out); if (splitPairs) { filekey[filekey.length()-1] = '2'; out = &( ofmap.getOfstream(filekey) ); } reads.getRead(pair.read2).write(*out); } else { if (hasOfMap) { out = &( ofmap.getOfstream(filekey) ); } reads.getRead(lesserIdx).write(*out); } } }