bool VcfMaterializer::_materializeNext(seqan::Dna5String & seq, MethylationLevels * levels, std::vector<SmallVarInfo> & varInfos, std::vector<std::pair<int, int> > & breakpoints, int & rID, int & haplotype) { if (levels) SEQAN_CHECK(!empty(methFastaFileName), "Must initialize with methylation FASTA file for levels"); if (empty(vcfFileName)) { if (currRID >= (int)(numSeqs(faiIndex) - 1)) return false; currRID += 1; rID = currRID; readSequence(seq, faiIndex, currRID); if (levels && !empty(methFastaFileName)) { _loadLevels(currRID); swap(*levels, currentLevels); } // Build identity PositionMap. TJournalEntries journal; reinit(journal, length(seq)); posMap.reinit(journal); GenomicInterval gi(0, length(seq), 0, length(seq)); seqan::String<PositionMap::TInterval> intervals; appendValue(intervals, PositionMap::TInterval(gi.svBeginPos, gi.svEndPos, gi)); createIntervalTree(posMap.svIntervalTree, intervals); createIntervalTree(posMap.svIntervalTreeSTL, intervals); return true; } // Number of sequences. int numSeqs = length(contigNames(context(vcfFileIn))); // Stop if there are no more haplotypes to materialize. if (currRID >= (numSeqs - 1) && nextHaplotype == numHaplotypes) return false; // Load variants for next contig if necessary. if (currRID == -1 || nextHaplotype == numHaplotypes) { currRID += 1; nextHaplotype = 0; _loadVariantsForContig(contigVariants, currRID); readSequence(contigSeq, faiIndex, currRID); if (levels && !empty(methFastaFileName)) _loadLevels(currRID); } // Materialize variants for the current haplotype. VariantMaterializer varMat(rng, contigVariants, *methOptions); if (levels) varMat.run(seq, posMap, *levels, varInfos, breakpoints, contigSeq, currentLevels, nextHaplotype); else varMat.run(seq, posMap, varInfos, breakpoints, contigSeq, nextHaplotype); // Write out rID and haploty rID = currRID; haplotype = nextHaplotype++; return true; }
int main(int argc, char const * argv[]) { // Additional checks seqan::ArgumentParser parser = buildParser(); seqan::ArgumentParser::ParseResult res = seqan::parse(parser, argc, argv); // Check if input was successfully parsed. if (res != seqan::ArgumentParser::PARSE_OK) return res == seqan::ArgumentParser::PARSE_ERROR; // Check if one or two input files (single or paired-end) were given. int fileCount = getArgumentValueCount(parser, 0); if (fileCount < 1) { printShortHelp(parser); return 1; } unsigned int radius = 1; getOptionValue(radius, parser, "r"); seqan::CharString readsFileName; getOptionValue(readsFileName, parser, "i"); // Open input file, BamFileIn can read SAM and BAM files. seqan::BamFileIn bamFileIn(seqan::toCString(readsFileName)); seqan::CharString _filterChromosomes; seqan::getOptionValue(_filterChromosomes, parser, "fc"); std::string filterChromosomes = seqan::toCString(_filterChromosomes); OccurenceMap occurenceMap; Statistics stats; std::cout << "read bam file... "; auto t1 = std::chrono::steady_clock::now(); seqan::BamAlignmentRecord record; seqan::BamHeader header; readHeader(header, bamFileIn); const auto chromosomeFilterSet = calculateChromosomeFilter(filterChromosomes, contigNames(context(bamFileIn))); const auto chromosomes = contigNames(context(bamFileIn)); processBamFile(bamFileIn, chromosomeFilterSet, occurenceMap, stats); auto t2 = std::chrono::steady_clock::now(); std::cout << std::chrono::duration_cast<std::chrono::duration<float>>(t2 - t1).count() << "s" << std::endl; std::vector<std::pair<unsigned int, unsigned int>> hits(radius * 2 + 1); t1 = std::chrono::steady_clock::now(); std::cout << "calculating 5'-ends around peaks... "; for (unsigned int fileIndex = 0;fileIndex < static_cast<unsigned int>(fileCount); ++fileIndex) { seqan::CharString fileName_; getArgumentValue(fileName_, parser, fileIndex, 0); const std::string fileName = seqan::toCString(fileName_); std::ifstream infile(fileName); std::string chromosome, dummy; unsigned int start, end; while (infile >> chromosome >> start >> end >> dummy) { int rID = -1; for (unsigned int i = 0;i < length(chromosomes);++i) if (chromosomes[i] == chromosome) { rID = i; break; } if (rID == -1) { std::cout << "invalid chromosome name: " << chromosome << " in file " << fileName << std::endl; return -1; } seqan::BamAlignmentRecord record; record.beginPos = std::max<int>(start - radius, 0); record.rID = rID; record.flag = 0; unsigned int index = 0; if (start < radius) index += radius - start; while (record.beginPos <= static_cast<__int32>(start + radius)) { BamRecordKey<NoBarcode> pos(record); auto el = occurenceMap.find(pos); if(el != occurenceMap.end()) hits[index].first += el->second; pos.init(pos.getRID(), pos.get5EndPosition(), true); el = occurenceMap.find(pos); if (el != occurenceMap.end()) hits[index].second += el->second; ++record.beginPos; ++index; } } std::string outFilename = getFilePrefix(fileName) + std::string("_5PrimeEnds.tab"); if (seqan::isSet(parser, "o")) { seqan::CharString outFileName_; getOptionValue(outFileName_, parser, "o"); outFilename = seqan::toCString(outFileName_); } std::fstream fs; std::cout << "writing " << outFilename << std::endl; #ifdef _MSC_VER fs.open(outFilename, std::fstream::out, _SH_DENYNO); #else fs.open(outFilename, std::fstream::out); #endif int i = - static_cast<int>(radius); for (const auto& hit : hits) fs << i++ << "\t" << hit.first << "\t" << hit.second << std::endl; fs.close(); } t2 = std::chrono::steady_clock::now(); std::cout << std::chrono::duration_cast<std::chrono::duration<float>>(t2 - t1).count() << "s" << std::endl; return 0; }