int main(int argc, char const ** argv) { double startTime = 0; // ----------------------------------------------------------------------- // Parse command line. // ----------------------------------------------------------------------- FxSamCoverageOptions options; seqan::ArgumentParser::ParseResult res = parseArgs(options, argc, argv); if (res != seqan::ArgumentParser::PARSE_OK) return res == seqan::ArgumentParser::PARSE_ERROR; // 1 on errors, 0 otherwise // ----------------------------------------------------------------------- // Show options. // ----------------------------------------------------------------------- if (options.verbosity >= 1) { std::cerr << "____OPTIONS___________________________________________________________________\n" << "\n" << "VERBOSITY " << options.verbosity << "\n" << "GENOME " << options.inGenomePath << "\n" << "SAM " << options.inSamPath << "\n" << "OUT " << options.outPath << "\n" << "WINDOW SIZE " << options.windowSize << "\n"; } // ----------------------------------------------------------------------- // Load Genome FAI Index // ----------------------------------------------------------------------- std::cerr << "\n" << "___PREPRATION_____________________________________________________________________\n" << "\n" << "Indexing GENOME file " << options.inGenomePath << " ..."; seqan::FaiIndex faiIndex; if (build(faiIndex, toCString(options.inGenomePath)) != 0) { std::cerr << "Could not build FAI index.\n"; return 1; } std::cerr << " OK\n"; // Prepare bins. seqan::String<seqan::String<BinData> > bins; resize(bins, numSeqs(faiIndex)); // ----------------------------------------------------------------------- // Compute C+G content // ----------------------------------------------------------------------- std::cerr << "\n" << "___C+G CONTENT COMPUTATION________________________________________________________\n" << "\n"; for (unsigned i = 0; i < numSeqs(faiIndex); ++i) { std::cerr << "[" << sequenceName(faiIndex, i) << "] ..."; unsigned numBins = (sequenceLength(faiIndex, i) + options.windowSize - 1) / options.windowSize; resize(bins[i], numBins); seqan::Dna5String contigSeq; if (readSequence(contigSeq, faiIndex, i) != 0) { std::cerr << "\nERROR: Could not read sequence " << sequenceName(faiIndex, i) << " from file!\n"; return 1; } for (unsigned bin = 0; bin < numBins; ++bin) { unsigned cgCounter = 0; unsigned binSize = 0; bins[i][bin].length = options.windowSize; if ((bin + 1) * options.windowSize > length(contigSeq)) bins[i][bin].length = length(contigSeq) - bin * options.windowSize; for (unsigned pos = bin * options.windowSize; pos < length(contigSeq) && pos < (bin + 1) * options.windowSize; ++pos, ++binSize) cgCounter += (contigSeq[pos] == 'C' || contigSeq[pos] == 'G'); bins[i][bin].cgContent = 1.0 * cgCounter / binSize; } std::cerr << "DONE\n"; } // ----------------------------------------------------------------------- // Compute Coverage // ----------------------------------------------------------------------- std::cerr << "\n" << "___COVERAGE COMPUATATION________________________________________________________\n" << "\n" << "Computing Coverage..."; seqan::BamStream bamStream(toCString(options.inSamPath)); if (!isGood(bamStream)) { std::cerr << "Could not open " << options.inSamPath << "!\n"; return 1; } seqan::BamAlignmentRecord record; while (!atEnd(bamStream)) { if (readRecord(record, bamStream) != 0) { std::cerr << "ERROR: Could not read record from BAM file!\n"; return 1; } if (hasFlagUnmapped(record) || hasFlagSecondary(record) || record.rId == seqan::BamAlignmentRecord::INVALID_REFID) continue; // Skip these records. int contigId = 0; seqan::CharString const & contigName = nameStore(bamStream.bamIOContext)[record.rId]; if (!getIdByName(faiIndex, contigName, contigId)) { std::cerr << "ERROR: Alignment to unknown contig " << contigId << "!\n"; return 1; } unsigned binNo = record.pos / options.windowSize; bins[contigId][binNo].coverage += 1; } std::cerr << "DONE\n"; // ----------------------------------------------------------------------- // Write Output // ----------------------------------------------------------------------- std::ostream * out = &std::cout; std::ofstream outFile; if (options.outPath != "-") { outFile.open(toCString(options.outPath), std::ios::binary | std::ios::out); if (!outFile.good()) { std::cerr << "ERROR: Could not open output file " << options.outPath << "!\n"; return 1; } out = &outFile; } (*out) << "#BIN\tREF_NAME\tREF_BIN\tBIN_BEGIN\tBIN_LENGTH\tCOVERAGE\tCG_CONTENT\n"; for (unsigned i = 0, globalBin = 0; i < length(bins); ++i) { for (unsigned refBin = 0; refBin < length(bins[i]); ++refBin, ++globalBin) { (*out) << globalBin << '\t' << sequenceName(faiIndex, i) << '\t' << refBin << '\t' << refBin * options.windowSize << '\t' << bins[i][refBin].length << '\t' << bins[i][refBin].coverage << '\t' << bins[i][refBin].cgContent << '\n'; } } if (options.verbosity >= 2) std::cerr << "Took " << (sysTime() - startTime) << " s\n"; return 0; }
bool VcfMaterializer::_materializeNext(seqan::Dna5String & seq, MethylationLevels * levels, std::vector<SmallVarInfo> & varInfos, std::vector<std::pair<int, int> > & breakpoints, int & rID, int & haplotype) { if (levels) SEQAN_CHECK(!empty(methFastaFileName), "Must initialize with methylation FASTA file for levels"); if (empty(vcfFileName)) { if (currRID >= (int)(numSeqs(faiIndex) - 1)) return false; currRID += 1; rID = currRID; readSequence(seq, faiIndex, currRID); if (levels && !empty(methFastaFileName)) { _loadLevels(currRID); swap(*levels, currentLevels); } // Build identity PositionMap. TJournalEntries journal; reinit(journal, length(seq)); posMap.reinit(journal); GenomicInterval gi(0, length(seq), 0, length(seq)); seqan::String<PositionMap::TInterval> intervals; appendValue(intervals, PositionMap::TInterval(gi.svBeginPos, gi.svEndPos, gi)); createIntervalTree(posMap.svIntervalTree, intervals); createIntervalTree(posMap.svIntervalTreeSTL, intervals); return true; } // Number of sequences. int numSeqs = length(contigNames(context(vcfFileIn))); // Stop if there are no more haplotypes to materialize. if (currRID >= (numSeqs - 1) && nextHaplotype == numHaplotypes) return false; // Load variants for next contig if necessary. if (currRID == -1 || nextHaplotype == numHaplotypes) { currRID += 1; nextHaplotype = 0; _loadVariantsForContig(contigVariants, currRID); readSequence(contigSeq, faiIndex, currRID); if (levels && !empty(methFastaFileName)) _loadLevels(currRID); } // Materialize variants for the current haplotype. VariantMaterializer varMat(rng, contigVariants, *methOptions); if (levels) varMat.run(seq, posMap, *levels, varInfos, breakpoints, contigSeq, currentLevels, nextHaplotype); else varMat.run(seq, posMap, varInfos, breakpoints, contigSeq, nextHaplotype); // Write out rID and haploty rID = currRID; haplotype = nextHaplotype++; return true; }