예제 #1
0
// Produces the next sequence to hand out and reports which contig (rID) it belongs to.
//
// With a VCF file configured, every contig is materialized once per haplotype:
// variants and the reference sequence are (re)loaded whenever a new contig is
// started, VariantMaterializer::run() fills seq/varInfos/breakpoints (and *levels
// when given), and haplotype receives the index of the haplotype just produced.
//
// Without a VCF file, each call simply reads the next sequence from the FAI index
// into seq and rebuilds posMap so that it covers the whole sequence with a single
// interval.  haplotype is not written on this path.
//
// If levels is non-null, a methylation FASTA file name must have been configured
// (enforced via SEQAN_CHECK).
//
// Returns false when there is nothing left to materialize, true otherwise.
bool VcfMaterializer::_materializeNext(seqan::Dna5String & seq,
                                       MethylationLevels * levels,
                                       std::vector<SmallVarInfo> & varInfos,
                                       std::vector<std::pair<int, int> > & breakpoints,
                                       int & rID,
                                       int & haplotype)
{
    if (levels)
    {
        SEQAN_CHECK(!empty(methFastaFileName), "Must initialize with methylation FASTA file for levels");
    }

    // Methylation levels are only loaded when the caller asked for them and a
    // methylation FASTA file is configured.
    const bool loadMethylation = levels && !empty(methFastaFileName);

    if (!empty(vcfFileName))
    {
        // --- Variant path: one output per (contig, haplotype) pair. ---
        const int contigCount = length(contigNames(context(vcfFileIn)));
        const bool haplotypesExhausted = (nextHaplotype == numHaplotypes);

        // Last contig reached and all of its haplotypes already produced.
        if (haplotypesExhausted && currRID >= (contigCount - 1))
            return false;

        // Advance to the next contig on the very first call or once the current
        // contig has no haplotypes left.
        if (haplotypesExhausted || currRID == -1)
        {
            ++currRID;
            nextHaplotype = 0;

            _loadVariantsForContig(contigVariants, currRID);
            readSequence(contigSeq, faiIndex, currRID);
            if (loadMethylation)
                _loadLevels(currRID);
        }

        // Apply the contig's variants for the haplotype that is due next.
        VariantMaterializer varMat(rng, contigVariants, *methOptions);
        if (levels)
            varMat.run(seq, posMap, *levels, varInfos, breakpoints, contigSeq, currentLevels, nextHaplotype);
        else
            varMat.run(seq, posMap, varInfos, breakpoints, contigSeq, nextHaplotype);

        // Report contig and haplotype, then move on to the following haplotype.
        rID = currRID;
        haplotype = nextHaplotype;
        ++nextHaplotype;
        return true;
    }

    // --- Reference-only path: hand out the sequences of the FAI index one by one. ---
    if (currRID >= static_cast<int>(numSeqs(faiIndex) - 1))
        return false;

    rID = ++currRID;
    readSequence(seq, faiIndex, currRID);
    if (loadMethylation)
    {
        _loadLevels(currRID);
        swap(*levels, currentLevels);
    }

    // Build the identity PositionMap: a fresh journal of the sequence's length and
    // one interval spanning the whole sequence in both interval trees.
    TJournalEntries identityJournal;
    reinit(identityJournal, length(seq));
    posMap.reinit(identityJournal);

    GenomicInterval wholeContig(0, length(seq), 0, length(seq));
    seqan::String<PositionMap::TInterval> treeIntervals;
    appendValue(treeIntervals,
                PositionMap::TInterval(wholeContig.svBeginPos, wholeContig.svEndPos, wholeContig));
    createIntervalTree(posMap.svIntervalTree, treeIntervals);
    createIntervalTree(posMap.svIntervalTreeSTL, treeIntervals);

    return true;
}
예제 #2
0
// Program entry point.
//
// Reads a SAM/BAM file (option "i") into an occurrence map via processBamFile(),
// then, for every positional input file, reads whitespace-separated records of the
// form "<chromosome> <start> <end> <extra>" and sums the map entries found in the
// window [start - radius, start + radius] (option "r", default 1).  For each input
// file a tab-separated table with one row per window offset is written to
// "<input prefix>_5PrimeEnds.tab", or to the path given with option "o".
//
// The two value columns come from two lookups per position: one with the key built
// from a record with flag 0, one with the key re-initialised with `true` as last
// argument (presumably forward vs. reverse strand -- confirm against BamRecordKey).
//
// Exit codes: 0 on success (and for non-error parser results),
//             1 on argument errors or when an input/output file cannot be opened,
//            -1 when an input file names a chromosome unknown to the BAM header.
int main(int argc, char const * argv[])
{
    // Set up and run the command line parser.
    seqan::ArgumentParser parser = buildParser();
    seqan::ArgumentParser::ParseResult res = seqan::parse(parser, argc, argv);

    // Anything but PARSE_OK ends the program; only PARSE_ERROR gives a non-zero exit code.
    if (res != seqan::ArgumentParser::PARSE_OK)
        return res == seqan::ArgumentParser::PARSE_ERROR;

    // At least one positional input file is required.
    int fileCount = getArgumentValueCount(parser, 0);
    if (fileCount < 1) {
        printShortHelp(parser);
        return 1;
    }

    unsigned int radius = 1;
    getOptionValue(radius, parser, "r");

    seqan::CharString readsFileName;
    getOptionValue(readsFileName, parser, "i");

    // Open input file, BamFileIn can read SAM and BAM files.
    seqan::BamFileIn bamFileIn(seqan::toCString(readsFileName));

    seqan::CharString _filterChromosomes;
    seqan::getOptionValue(_filterChromosomes, parser, "fc");
    std::string filterChromosomes = seqan::toCString(_filterChromosomes);

    OccurenceMap occurenceMap;
    Statistics stats;

    std::cout << "read bam file... ";
    auto t1 = std::chrono::steady_clock::now();
    seqan::BamHeader header;
    readHeader(header, bamFileIn);
    const auto chromosomeFilterSet = calculateChromosomeFilter(filterChromosomes, contigNames(context(bamFileIn)));
    // Snapshot of the contig names taken before the records are processed.
    const auto chromosomes = contigNames(context(bamFileIn));
    processBamFile(bamFileIn, chromosomeFilterSet, occurenceMap, stats);
    auto t2 = std::chrono::steady_clock::now();
    std::cout << std::chrono::duration_cast<std::chrono::duration<float>>(t2 - t1).count() << "s" << std::endl;

    // One (first lookup, second lookup) counter pair per offset in [-radius, +radius].
    // NOTE(review): the counters are never reset between input files, so every output
    // table also contains the counts of all previously processed files -- confirm
    // that this accumulation is intended.
    std::vector<std::pair<unsigned int, unsigned int>> hits(radius * 2 + 1);

    t1 = std::chrono::steady_clock::now();
    std::cout << "calculating 5'-ends around peaks... ";

    for (unsigned int fileIndex = 0;fileIndex < static_cast<unsigned int>(fileCount); ++fileIndex)
    {
        seqan::CharString fileName_;
        getArgumentValue(fileName_, parser, fileIndex, 0);
        const std::string fileName = seqan::toCString(fileName_);

        std::ifstream infile(fileName);
        // Fail loudly instead of silently writing a table for a file that was never read.
        if (!infile)
        {
            std::cerr << "could not open input file " << fileName << std::endl;
            return 1;
        }

        std::string chromosome, dummy;
        unsigned int start, end;
        // `end` and `dummy` are parsed only to consume the columns; they are not used.
        while (infile >> chromosome >> start >> end >> dummy)
        {
            // Translate the chromosome name into its index in the BAM header.
            int rID = -1;
            for (unsigned int i = 0;i < length(chromosomes);++i)
                if (chromosomes[i] == chromosome)
                {
                    rID = i;
                    break;
                }
            if (rID == -1)
            {
                std::cout << "invalid chromosome name: " << chromosome << " in file " << fileName << std::endl;
                return -1;
            }

            // Probe record that is moved across the window position by position.
            seqan::BamAlignmentRecord record;
            // For start < radius the unsigned subtraction wraps around; the conversion
            // to int turns that into a negative value, which is clamped to 0 here.
            record.beginPos = std::max<int>(start - radius, 0);
            record.rID = rID;
            record.flag = 0;

            // Offset into `hits`; skip the slots left of position 0 when the window
            // is cut off at the start of the chromosome.
            unsigned int index = 0;
            if (start < radius)
                index += radius - start;
            while (record.beginPos <= static_cast<__int32>(start + radius))
            {
                BamRecordKey<NoBarcode> pos(record);
                auto el = occurenceMap.find(pos);
                if(el != occurenceMap.end())
                    hits[index].first += el->second;
                // Second lookup at the same position with the last init() argument set.
                pos.init(pos.getRID(), pos.get5EndPosition(), true);
                el = occurenceMap.find(pos);
                if (el != occurenceMap.end())
                    hits[index].second += el->second;
                ++record.beginPos;
                ++index;
            }
        }

        // Default output name is derived from the input file; option "o" overrides it.
        // NOTE(review): with "o" set, every input file writes to the same path, so
        // only the table of the last file survives.
        std::string outFilename = getFilePrefix(fileName) + std::string("_5PrimeEnds.tab");
        if (seqan::isSet(parser, "o"))
        {
            seqan::CharString outFileName_;
            getOptionValue(outFileName_, parser, "o");
            outFilename = seqan::toCString(outFileName_);
        }


        std::fstream fs;
        std::cout << "writing " << outFilename << std::endl;
#ifdef _MSC_VER
        fs.open(outFilename, std::fstream::out, _SH_DENYNO);
#else
        fs.open(outFilename, std::fstream::out);
#endif
        if (!fs)
        {
            std::cerr << "could not open output file " << outFilename << std::endl;
            return 1;
        }

        // One row per window offset: offset, first counter, second counter.
        int i = - static_cast<int>(radius);
        for (const auto& hit : hits)
            fs << i++ << "\t" << hit.first << "\t" << hit.second << '\n';
        fs.close();
    }
    t2 = std::chrono::steady_clock::now();
    std::cout << std::chrono::duration_cast<std::chrono::duration<float>>(t2 - t1).count() << "s" << std::endl;


    return 0;
}