Example #1
0
int main(int argc, char const ** argv)
{
    double startTime = 0;
    
    // -----------------------------------------------------------------------
    // Parse command line.
    // -----------------------------------------------------------------------
    FxSamCoverageOptions options;
    seqan::ArgumentParser::ParseResult res = parseArgs(options, argc, argv);
    if (res != seqan::ArgumentParser::PARSE_OK)
        return res == seqan::ArgumentParser::PARSE_ERROR;  // 1 on errors, 0 otherwise

    // -----------------------------------------------------------------------
    // Show options.
    // -----------------------------------------------------------------------
    if (options.verbosity >= 1)
    {
        std::cerr << "____OPTIONS___________________________________________________________________\n"
                  << "\n"
                  << "VERBOSITY    " << options.verbosity << "\n"
                  << "GENOME       " << options.inGenomePath << "\n"
                  << "SAM          " << options.inSamPath << "\n"
                  << "OUT          " << options.outPath << "\n"
                  << "WINDOW SIZE  " << options.windowSize << "\n";
    }

    // -----------------------------------------------------------------------
    // Load Genome FAI Index
    // -----------------------------------------------------------------------

    std::cerr << "\n"
              << "___PREPRATION_____________________________________________________________________\n"
              << "\n"
              << "Indexing GENOME file  " << options.inGenomePath << " ...";
    seqan::FaiIndex faiIndex;
    if (build(faiIndex, toCString(options.inGenomePath)) != 0)
    {
        std::cerr << "Could not build FAI index.\n";
        return 1;
    }
    std::cerr << " OK\n";

    // Prepare bins.
    seqan::String<seqan::String<BinData> > bins;
    resize(bins, numSeqs(faiIndex));

    // -----------------------------------------------------------------------
    // Compute C+G content 
    // -----------------------------------------------------------------------

    std::cerr << "\n"
              << "___C+G CONTENT COMPUTATION________________________________________________________\n"
              << "\n";

    for (unsigned i = 0; i < numSeqs(faiIndex); ++i)
    {
        std::cerr << "[" << sequenceName(faiIndex, i) << "] ...";
        unsigned numBins = (sequenceLength(faiIndex, i) + options.windowSize - 1) / options.windowSize;
        resize(bins[i], numBins);
        seqan::Dna5String contigSeq;
        if (readSequence(contigSeq, faiIndex, i) != 0)
        {
            std::cerr << "\nERROR: Could not read sequence " << sequenceName(faiIndex, i) << " from file!\n";
            return 1;
        }

        for (unsigned bin = 0; bin < numBins; ++bin)
        {
            unsigned cgCounter = 0;
            unsigned binSize = 0;
            bins[i][bin].length = options.windowSize;
            if ((bin + 1) * options.windowSize > length(contigSeq))
                bins[i][bin].length = length(contigSeq) - bin * options.windowSize;
            for (unsigned pos = bin * options.windowSize; pos < length(contigSeq) && pos < (bin + 1) * options.windowSize; ++pos, ++binSize)
                cgCounter += (contigSeq[pos] == 'C' || contigSeq[pos] == 'G');
            bins[i][bin].cgContent = 1.0 * cgCounter / binSize;
        }
        std::cerr << "DONE\n";
    }

    // -----------------------------------------------------------------------
    // Compute Coverage
    // -----------------------------------------------------------------------

    std::cerr << "\n"
              << "___COVERAGE COMPUATATION________________________________________________________\n"
              << "\n"
              << "Computing Coverage...";

    seqan::BamStream bamStream(toCString(options.inSamPath));
    if (!isGood(bamStream))
    {
        std::cerr << "Could not open " << options.inSamPath << "!\n";
        return 1;
    }

    seqan::BamAlignmentRecord record;
    while (!atEnd(bamStream))
    {
        if (readRecord(record, bamStream) != 0)
        {
            std::cerr << "ERROR: Could not read record from BAM file!\n";
            return 1;
        }

        if (hasFlagUnmapped(record) || hasFlagSecondary(record) || record.rId == seqan::BamAlignmentRecord::INVALID_REFID)
            continue;  // Skip these records.

        int contigId = 0;
        seqan::CharString const & contigName = nameStore(bamStream.bamIOContext)[record.rId];
        if (!getIdByName(faiIndex, contigName, contigId))
        {
            std::cerr << "ERROR: Alignment to unknown contig " << contigId << "!\n";
            return 1;
        }
        unsigned binNo = record.pos / options.windowSize;
        bins[contigId][binNo].coverage += 1;
    }

    std::cerr << "DONE\n";

    // -----------------------------------------------------------------------
    // Write Output
    // -----------------------------------------------------------------------

    std::ostream * out = &std::cout;
    std::ofstream outFile;
    if (options.outPath != "-")
    {
        outFile.open(toCString(options.outPath), std::ios::binary | std::ios::out);
        if (!outFile.good())
        {
            std::cerr << "ERROR: Could not open output file " << options.outPath << "!\n";
            return 1;
        }
        out = &outFile;
    }

    (*out) << "#BIN\tREF_NAME\tREF_BIN\tBIN_BEGIN\tBIN_LENGTH\tCOVERAGE\tCG_CONTENT\n";
    for (unsigned i = 0, globalBin = 0; i < length(bins); ++i)
    {
        for (unsigned refBin = 0; refBin < length(bins[i]); ++refBin, ++globalBin)
        {
            (*out) << globalBin << '\t'
                   << sequenceName(faiIndex, i) << '\t'
                   << refBin << '\t'
                   << refBin * options.windowSize << '\t'
                   << bins[i][refBin].length << '\t'
                   << bins[i][refBin].coverage << '\t'
                   << bins[i][refBin].cgContent << '\n';
        }
    }

    if (options.verbosity >= 2)
        std::cerr << "Took " << (sysTime() - startTime) << " s\n";

    return 0;
}
Example #2
0
// Produce the next sequence to be emitted and report which contig (and haplotype) it is.
//
// Without a VCF file, each call advances to the next contig of the FAI index, loads it
// unmodified into seq and sets up an identity position map; haplotype is not written.
// With a VCF file, each call materializes the next haplotype of the current contig,
// moving on to the next contig once all haplotypes have been produced.
//
// Parameters:
//   seq          Receives the resulting sequence.
//   levels       Optional; when non-null receives methylation levels (requires that a
//                methylation FASTA file name has been configured).
//   varInfos     Passed on to VariantMaterializer::run() in the VCF case.
//   breakpoints  Passed on to VariantMaterializer::run() in the VCF case.
//   rID          Receives the index of the contig that was produced.
//   haplotype    Receives the index of the haplotype that was produced (VCF case only).
//
// Returns true if a sequence was produced, false if nothing is left.
bool VcfMaterializer::_materializeNext(seqan::Dna5String & seq,
                                       MethylationLevels * levels,
                                       std::vector<SmallVarInfo> & varInfos,
                                       std::vector<std::pair<int, int> > & breakpoints,
                                       int & rID,
                                       int & haplotype)
{
    // Asking for levels only makes sense if a methylation FASTA file was configured.
    if (levels)
    {
        SEQAN_CHECK(!empty(methFastaFileName), "Must initialize with methylation FASTA file for levels");
    }

    // --- No VCF file: hand out the reference contigs one after another. ---
    if (empty(vcfFileName))
    {
        int const lastRID = static_cast<int>(numSeqs(faiIndex) - 1);
        if (currRID >= lastRID)
            return false;  // all contigs have been handed out

        ++currRID;
        rID = currRID;
        readSequence(seq, faiIndex, currRID);
        if (levels && !empty(methFastaFileName))
        {
            _loadLevels(currRID);
            swap(*levels, currentLevels);
        }

        // No variants were applied, so the position map is the identity: an untouched
        // journal over the whole sequence ...
        TJournalEntries identityJournal;
        reinit(identityJournal, length(seq));
        posMap.reinit(identityJournal);

        // ... and a single interval spanning the whole sequence in both interval trees.
        GenomicInterval wholeContig(0, length(seq), 0, length(seq));
        seqan::String<PositionMap::TInterval> wholeContigIntervals;
        appendValue(wholeContigIntervals,
                    PositionMap::TInterval(wholeContig.svBeginPos, wholeContig.svEndPos, wholeContig));
        createIntervalTree(posMap.svIntervalTree, wholeContigIntervals);
        createIntervalTree(posMap.svIntervalTreeSTL, wholeContigIntervals);

        return true;
    }

    // --- VCF file given: iterate over (contig, haplotype) pairs. ---

    // Number of contigs known to the VCF file.
    int const numContigs = length(contigNames(context(vcfFileIn)));
    // True once every haplotype of the current contig has been produced.
    bool const haplotypesExhausted = (nextHaplotype == numHaplotypes);

    // Nothing left once the last contig's haplotypes are all done.
    if (haplotypesExhausted && currRID >= (numContigs - 1))
        return false;

    // On the very first call, or after finishing a contig, advance to the next contig
    // and load its variants, reference sequence and (if requested) methylation levels.
    if (haplotypesExhausted || currRID == -1)
    {
        ++currRID;
        nextHaplotype = 0;

        _loadVariantsForContig(contigVariants, currRID);
        readSequence(contigSeq, faiIndex, currRID);
        if (levels && !empty(methFastaFileName))
            _loadLevels(currRID);
    }

    // Apply the variants of the current haplotype to the contig sequence.
    VariantMaterializer varMat(rng, contigVariants, *methOptions);
    if (!levels)
        varMat.run(seq, posMap, varInfos, breakpoints, contigSeq, nextHaplotype);
    else
        varMat.run(seq, posMap, *levels, varInfos, breakpoints, contigSeq, currentLevels, nextHaplotype);

    // Report the contig and haplotype just produced, then advance to the next haplotype.
    rID = currRID;
    haplotype = nextHaplotype++;
    return true;
}