예제 #1
0
    int run()
    {
        // Intialization
        std::cerr << "__INITIALIZATION_____________________________________________________________\n"
                  << "\n";

        std::cerr << "Opening files...";
        try
        {
            vcfMat.init();

            if (!open(seqFileOut, toCString(options.outputFileName)))
                throw MasonIOException("Could not open output file.");

            if (!open(gffFileIn, toCString(options.inputGffFile)))
                throw MasonIOException("Could not open GFF/GTF file.");
        }
        catch (MasonIOException e)
        {
            std::cerr << "\nERROR: " << e.what() << "\n";
            return 1;
        }
        std::cerr << " OK\n";

        // Perform genome simulation.
        std::cerr << "\n__COMPUTING TRANSCRIPTS______________________________________________________\n"
                  << "\n";

        // Read first GFF record.
        MyGffRecord record;
        _readFirstRecord(record);
        if (record.rID == seqan::maxValue<int>())
            return 0;  // at end, could not read any, done

        // Transcript names.
        typedef seqan::StringSet<seqan::CharString> TNameStore;
        typedef seqan::NameStoreCache<TNameStore> TNameStoreCache;
        TNameStore transcriptNames;
        TNameStoreCache transcriptNamesCache(transcriptNames);

        // The splicing instructions for the current contig.
        std::vector<SplicingInstruction> splicingInstructions;

        // Materialized sequence.
        seqan::Dna5String seq;
        // Tanscript ids, used as a buffer below.
        seqan::String<unsigned> transcriptIDs;

        // Read GFF/GTF file contig by contig (must be sorted by reference name).  For each contig, we all recors,
        // create simulation instructions and then build the transcripts for each haplotype.
        while (record.rID != seqan::maxValue<int>())  // sentinel, at end
        {
            seqan::CharString refName = record.ref;
            std::cerr << "Splicing for " << refName << " ...";

            // Read GFF records for this contig.
            MyGffRecord firstGffRecord = record;
            while (record.rID == firstGffRecord.rID)
            {
                if (empty(options.gffType) || (record.type == options.gffType))
                {
                    // Make transcript names known to the record.
                    _appendTranscriptNames(transcriptIDs, transcriptNames, transcriptNamesCache, record);
                    // Add the splicing instructions for this record to the list for this contig.
                    for (unsigned i = 0; i < length(transcriptIDs); ++i)
                        splicingInstructions.push_back(SplicingInstruction(transcriptIDs[i], record.beginPos,
                                                                           record.endPos, record.strand));
                }

                if (atEnd(gffFileIn))
                {
                    record.rID = seqan::maxValue<int>();
                    break;
                }

                readRecord(record, gffFileIn);
                // Translate ref to idx from VCF.
                unsigned idx = 0;
                if (!getIdByName(idx, vcfMat.faiIndex, record.ref))
                    throw MasonIOException("Reference name from GFF/GTF not in VCF!");
                record.rID = idx;
            }

            // ---------------------------------------------------------------
            // Process the splicing instructions.
            // ---------------------------------------------------------------

            // First, sort them.
            std::sort(splicingInstructions.begin(), splicingInstructions.end());

            // Materialize all haplotypes of this contig
            int rID = 0, hID = 0;  // reference and haplotype id
            // Get index of the gff record's reference in the VCF file.
            unsigned idx = 0;
            if (!getIdByName(idx, vcfMat.faiIndex, refName))
            {
                std::stringstream ss;
                ss << "Reference from GFF file " << refName << " unknown in FASTA/FAI file.";
                throw MasonIOException(ss.str());
            }
            rID = idx;

            vcfMat.currRID = rID - 1;
            std::vector<SmallVarInfo> varInfos;  // small variants for counting in read alignments
            std::vector<std::pair<int, int> > breakpoints;  // unused/ignored
            while (vcfMat.materializeNext(seq, varInfos, breakpoints, rID, hID))
            {
                std::cerr << " (allele " << (hID + 1) << ")";
                if (rID != (int)idx)
                    break;  // no more haplotypes for this reference
                _performSplicing(splicingInstructions, seq, transcriptNames, hID, vcfMat);
            }

            std::cerr << " DONE.\n";

            // ---------------------------------------------------------------
            // Handle contig switching.
            // ---------------------------------------------------------------

            // Check that the input GFF file is clustered (weaker than sorted) by reference name.
            if (record.rID < firstGffRecord.rID)
                throw MasonIOException("GFF file not sorted or clustered by reference.");
            // Reset transcript names and cache.
            clear(transcriptNames);
            refresh(transcriptNamesCache);
            // Flush splicing instructions.
            splicingInstructions.clear();
        }

        std::cerr << "\nDone splicing FASTA.\n";

        return 0;
    }
예제 #2
0
    int run()
    {
        // Intialization
        std::cerr << "__INITIALIZATION_____________________________________________________________\n"
                  << "\n";

        std::cerr << "Opening files...";
        try
        {
            vcfMat.init();

            open(outStream, toCString(options.outputFileName), seqan::SequenceStream::WRITE);
            if (!isGood(outStream))
                throw MasonIOException("Could not open output file.");

            if (options.methOptions.simulateMethylationLevels)
            {
                open(outMethLevelStream, toCString(options.methFastaOutFile), seqan::SequenceStream::WRITE);
                if (!isGood(outMethLevelStream))
                    throw MasonIOException("Could not open methylation output file.");
            }
        }
        catch (MasonIOException e)
        {
            std::cerr << "\nERROR: " << e.what() << "\n";
            return 1;
        }
        std::cerr << " OK\n";

        // Perform genome simulation.
        std::cerr << "\n__MATERIALIZING______________________________________________________________\n"
                  << "\n";

        // The identifiers of the just materialized data.
        int rID = 0, hID = 0;
        seqan::Dna5String seq;
        std::cerr << "Materializing...";
        MethylationLevels levels;
        if (options.methOptions.simulateMethylationLevels)  // methylation level simulation
            while (vcfMat.materializeNext(seq, levels, rID, hID))
            {
                std::stringstream ssName;
                ssName << vcfMat.vcfStream.header.sequenceNames[rID] << options.haplotypeNameSep << (hID + 1);
                std::cerr << " " << ssName.str();

                if (writeRecord(outStream, ssName.str(), seq) != 0)
                {
                    std::cerr << "ERROR: Could not write materialized sequence to output.\n";
                    return 1;
                }

                std::stringstream ssTop;
                ssTop << ssName.str() << "/TOP";
                if (writeRecord(outMethLevelStream, ssTop.str(), levels.forward) != 0)
                    throw MasonIOException("Problem writing to methylation output file.");
                std::stringstream ssBottom;
                ssBottom << ssName.str() << "/BOT";
                if (writeRecord(outMethLevelStream, ssBottom.str(), levels.reverse) != 0)
                    throw MasonIOException("Problem writing to methylation output file.");
            }
        else  // NO methylation level simulation
            while (vcfMat.materializeNext(seq, rID, hID))
            {
                std::stringstream ssName;
                ssName << vcfMat.vcfStream.header.sequenceNames[rID] << options.haplotypeNameSep << (hID + 1);
                std::cerr << " " << ssName.str();

                if (writeRecord(outStream, ssName.str(), seq) != 0)
                {
                    std::cerr << "ERROR: Could not write materialized sequence to output.\n";
                    return 1;
                }
            }
        std::cerr << " DONE\n";

        std::cerr << "\nDone materializing VCF file.\n";

        return 0;
    }