Ejemplo n.º 1
0
// Open all input files and determine the number of haplotypes.
//
// If vcfFileName is set, the VCF stream is opened, its header and (if present) its first record are
// read, and numHaplotypes is derived from the first genotype column of that record.  If no VCF file
// name is set, numHaplotypes is set to 1.
//
// Afterwards, the FAI index for fastaFileName is opened; if that fails, the index is built and saved
// as "<fastaFileName>.fai".  The same is done for methFastaFileName if it is non-empty.
//
// Throws MasonIOException if the VCF stream cannot be opened, if the VCF file provides no genotype
// information to derive the haplotype count from, or if an index can neither be opened nor
// built and written.
void VcfMaterializer::init()
{
    if (!empty(vcfFileName))
    {
        // Open VCF stream.
        if (!open(vcfFileIn, toCString(vcfFileName)))
            throw MasonIOException("Could not open VCF stream.");

        // Read header.
        readHeader(vcfHeader, vcfFileIn);

        // Read first VCF record.
        if (!atEnd(vcfFileIn))
            readRecord(vcfRecord, vcfFileIn);

        // The haplotype count is derived from the first genotype column below.  Fail with a proper
        // error instead of indexing into an empty string set when the file has no records (or the
        // first record has no genotype columns); an assertion alone would not protect builds that
        // compile assertions out.
        if (empty(vcfRecord.genotypeInfos))
            throw MasonIOException("VCF file contains no record with genotype information.");

        // Get number of haplotypes in VCF file: one more than the number of allele separators
        // ('|' or '/') found in the first genotype column.
        seqan::DirectionIterator<seqan::CharString const, seqan::Input>::Type inputIter =
                directionIterator(vcfRecord.genotypeInfos[0], seqan::Input());
        numHaplotypes = 1;
        for (; !atEnd(inputIter); ++inputIter)
            numHaplotypes += (*inputIter == '|' || *inputIter == '/');
    }
    else
    {
        numHaplotypes = 1;
    }

    // Open input FASTA file and FAI.  If the index cannot be opened, build it and store it next to
    // the FASTA file.
    if (!open(faiIndex, toCString(fastaFileName)))
    {
        if (!build(faiIndex, toCString(fastaFileName)))
            throw MasonIOException("Could not build FAI index.");

        seqan::CharString faiPath = fastaFileName;
        append(faiPath, ".fai");
        if (!save(faiIndex, toCString(faiPath)))
            throw MasonIOException("Could not write FAI index.");
    }

    // Open methylation FASTA FAI file if given, using the same open-or-build strategy.
    if (!empty(methFastaFileName))
    {
        if (!open(methFaiIndex, toCString(methFastaFileName)))
        {
            if (!build(methFaiIndex, toCString(methFastaFileName)))
                throw MasonIOException("Could not build methylation levels FAI index.");

            seqan::CharString faiPath = methFastaFileName;
            append(faiPath, ".fai");
            if (!save(methFaiIndex, toCString(faiPath)))
                throw MasonIOException("Could not write methylation levels FAI index.");
        }
    }
}
Ejemplo n.º 2
0
// Load the methylation levels for the reference sequence with index rID into currentLevels.
//
// The sequence name is taken from faiIndex; the levels are looked up in methFaiIndex under the
// names "<name>/TOP" (stored in currentLevels.forward) and "<name>/BOT" (stored in
// currentLevels.reverse).
//
// Throws MasonIOException if either entry is missing from the methylation FASTA index.
void VcfMaterializer::_loadLevels(int rID)
{
    currentLevels.clear();

    unsigned levelsIdx = 0;  // index of the entry in methFaiIndex, reused for both lookups

    // Levels stored under the "/TOP" suffix go to currentLevels.forward.
    std::stringstream topName;
    topName << sequenceName(faiIndex, rID) << "/TOP";
    if (!getIdByName(levelsIdx, methFaiIndex, topName.str().c_str()))
        throw MasonIOException("Could not find top levels in methylation FASTA.");
    readSequence(currentLevels.forward, methFaiIndex, levelsIdx);

    // Levels stored under the "/BOT" suffix go to currentLevels.reverse.
    std::stringstream bottomName;
    bottomName << sequenceName(faiIndex, rID) << "/BOT";
    if (!getIdByName(levelsIdx, methFaiIndex, bottomName.str().c_str()))
        throw MasonIOException("Could not find bottom levels in methylation FASTA.");
    readSequence(currentLevels.reverse, methFaiIndex, levelsIdx);
}
Ejemplo n.º 3
0
    // Read records from gffFileIn into record until one passes the type filter.
    //
    // A record passes if options.gffType is empty or equal to the record's type.  For every record
    // read, the reference name is looked up in vcfMat.faiIndex and the resulting index is stored in
    // record.rID.  If the file ends before a record passes, record.rID is set to the sentinel
    // seqan::maxValue<int>().
    //
    // Throws MasonIOException if a record's reference name is not found in vcfMat.faiIndex; this
    // also applies to records that would not pass the type filter.
    void _readFirstRecord(MyGffRecord & record)
    {
        record.rID = record.INVALID_IDX;  // nothing read yet

        while (!atEnd(gffFileIn))
        {
            readRecord(record, gffFileIn);

            // Translate the reference name into its index.
            unsigned refIdx = 0;
            if (!getIdByName(refIdx, vcfMat.faiIndex, record.ref))
                throw MasonIOException("Reference name from GFF/GTF not in VCF!");
            record.rID = refIdx;

            // Stop at the first record that is accepted by the type filter.
            if (empty(options.gffType) || (options.gffType == record.type))
                return;
        }

        // Reached the end of the file without an accepted record: mark with the sentinel.
        record.rID = seqan::maxValue<int>();
    }
Ejemplo n.º 4
0
    int run()
    {
        // Intialization
        std::cerr << "__INITIALIZATION_____________________________________________________________\n"
                  << "\n";

        std::cerr << "Opening files...";
        try
        {
            vcfMat.init();

            if (!open(seqFileOut, toCString(options.outputFileName)))
                throw MasonIOException("Could not open output file.");

            if (!open(gffFileIn, toCString(options.inputGffFile)))
                throw MasonIOException("Could not open GFF/GTF file.");
        }
        catch (MasonIOException e)
        {
            std::cerr << "\nERROR: " << e.what() << "\n";
            return 1;
        }
        std::cerr << " OK\n";

        // Perform genome simulation.
        std::cerr << "\n__COMPUTING TRANSCRIPTS______________________________________________________\n"
                  << "\n";

        // Read first GFF record.
        MyGffRecord record;
        _readFirstRecord(record);
        if (record.rID == seqan::maxValue<int>())
            return 0;  // at end, could not read any, done

        // Transcript names.
        typedef seqan::StringSet<seqan::CharString> TNameStore;
        typedef seqan::NameStoreCache<TNameStore> TNameStoreCache;
        TNameStore transcriptNames;
        TNameStoreCache transcriptNamesCache(transcriptNames);

        // The splicing instructions for the current contig.
        std::vector<SplicingInstruction> splicingInstructions;

        // Materialized sequence.
        seqan::Dna5String seq;
        // Tanscript ids, used as a buffer below.
        seqan::String<unsigned> transcriptIDs;

        // Read GFF/GTF file contig by contig (must be sorted by reference name).  For each contig, we all recors,
        // create simulation instructions and then build the transcripts for each haplotype.
        while (record.rID != seqan::maxValue<int>())  // sentinel, at end
        {
            seqan::CharString refName = record.ref;
            std::cerr << "Splicing for " << refName << " ...";

            // Read GFF records for this contig.
            MyGffRecord firstGffRecord = record;
            while (record.rID == firstGffRecord.rID)
            {
                if (empty(options.gffType) || (record.type == options.gffType))
                {
                    // Make transcript names known to the record.
                    _appendTranscriptNames(transcriptIDs, transcriptNames, transcriptNamesCache, record);
                    // Add the splicing instructions for this record to the list for this contig.
                    for (unsigned i = 0; i < length(transcriptIDs); ++i)
                        splicingInstructions.push_back(SplicingInstruction(transcriptIDs[i], record.beginPos,
                                                                           record.endPos, record.strand));
                }

                if (atEnd(gffFileIn))
                {
                    record.rID = seqan::maxValue<int>();
                    break;
                }

                readRecord(record, gffFileIn);
                // Translate ref to idx from VCF.
                unsigned idx = 0;
                if (!getIdByName(idx, vcfMat.faiIndex, record.ref))
                    throw MasonIOException("Reference name from GFF/GTF not in VCF!");
                record.rID = idx;
            }

            // ---------------------------------------------------------------
            // Process the splicing instructions.
            // ---------------------------------------------------------------

            // First, sort them.
            std::sort(splicingInstructions.begin(), splicingInstructions.end());

            // Materialize all haplotypes of this contig
            int rID = 0, hID = 0;  // reference and haplotype id
            // Get index of the gff record's reference in the VCF file.
            unsigned idx = 0;
            if (!getIdByName(idx, vcfMat.faiIndex, refName))
            {
                std::stringstream ss;
                ss << "Reference from GFF file " << refName << " unknown in FASTA/FAI file.";
                throw MasonIOException(ss.str());
            }
            rID = idx;

            vcfMat.currRID = rID - 1;
            std::vector<SmallVarInfo> varInfos;  // small variants for counting in read alignments
            std::vector<std::pair<int, int> > breakpoints;  // unused/ignored
            while (vcfMat.materializeNext(seq, varInfos, breakpoints, rID, hID))
            {
                std::cerr << " (allele " << (hID + 1) << ")";
                if (rID != (int)idx)
                    break;  // no more haplotypes for this reference
                _performSplicing(splicingInstructions, seq, transcriptNames, hID, vcfMat);
            }

            std::cerr << " DONE.\n";

            // ---------------------------------------------------------------
            // Handle contig switching.
            // ---------------------------------------------------------------

            // Check that the input GFF file is clustered (weaker than sorted) by reference name.
            if (record.rID < firstGffRecord.rID)
                throw MasonIOException("GFF file not sorted or clustered by reference.");
            // Reset transcript names and cache.
            clear(transcriptNames);
            refresh(transcriptNamesCache);
            // Flush splicing instructions.
            splicingInstructions.clear();
        }

        std::cerr << "\nDone splicing FASTA.\n";

        return 0;
    }
Ejemplo n.º 5
0
    int run()
    {
        // Intialization
        std::cerr << "__INITIALIZATION_____________________________________________________________\n"
                  << "\n";

        std::cerr << "Opening files...";
        try
        {
            vcfMat.init();

            open(outStream, toCString(options.outputFileName), seqan::SequenceStream::WRITE);
            if (!isGood(outStream))
                throw MasonIOException("Could not open output file.");

            if (options.methOptions.simulateMethylationLevels)
            {
                open(outMethLevelStream, toCString(options.methFastaOutFile), seqan::SequenceStream::WRITE);
                if (!isGood(outMethLevelStream))
                    throw MasonIOException("Could not open methylation output file.");
            }
        }
        catch (MasonIOException e)
        {
            std::cerr << "\nERROR: " << e.what() << "\n";
            return 1;
        }
        std::cerr << " OK\n";

        // Perform genome simulation.
        std::cerr << "\n__MATERIALIZING______________________________________________________________\n"
                  << "\n";

        // The identifiers of the just materialized data.
        int rID = 0, hID = 0;
        seqan::Dna5String seq;
        std::cerr << "Materializing...";
        MethylationLevels levels;
        if (options.methOptions.simulateMethylationLevels)  // methylation level simulation
            while (vcfMat.materializeNext(seq, levels, rID, hID))
            {
                std::stringstream ssName;
                ssName << vcfMat.vcfStream.header.sequenceNames[rID] << options.haplotypeNameSep << (hID + 1);
                std::cerr << " " << ssName.str();

                if (writeRecord(outStream, ssName.str(), seq) != 0)
                {
                    std::cerr << "ERROR: Could not write materialized sequence to output.\n";
                    return 1;
                }

                std::stringstream ssTop;
                ssTop << ssName.str() << "/TOP";
                if (writeRecord(outMethLevelStream, ssTop.str(), levels.forward) != 0)
                    throw MasonIOException("Problem writing to methylation output file.");
                std::stringstream ssBottom;
                ssBottom << ssName.str() << "/BOT";
                if (writeRecord(outMethLevelStream, ssBottom.str(), levels.reverse) != 0)
                    throw MasonIOException("Problem writing to methylation output file.");
            }
        else  // NO methylation level simulation
            while (vcfMat.materializeNext(seq, rID, hID))
            {
                std::stringstream ssName;
                ssName << vcfMat.vcfStream.header.sequenceNames[rID] << options.haplotypeNameSep << (hID + 1);
                std::cerr << " " << ssName.str();

                if (writeRecord(outStream, ssName.str(), seq) != 0)
                {
                    std::cerr << "ERROR: Could not write materialized sequence to output.\n";
                    return 1;
                }
            }
        std::cerr << " DONE\n";

        std::cerr << "\nDone materializing VCF file.\n";

        return 0;
    }