int run() { // Intialization std::cerr << "__INITIALIZATION_____________________________________________________________\n" << "\n"; std::cerr << "Opening files..."; try { vcfMat.init(); if (!open(seqFileOut, toCString(options.outputFileName))) throw MasonIOException("Could not open output file."); if (!open(gffFileIn, toCString(options.inputGffFile))) throw MasonIOException("Could not open GFF/GTF file."); } catch (MasonIOException e) { std::cerr << "\nERROR: " << e.what() << "\n"; return 1; } std::cerr << " OK\n"; // Perform genome simulation. std::cerr << "\n__COMPUTING TRANSCRIPTS______________________________________________________\n" << "\n"; // Read first GFF record. MyGffRecord record; _readFirstRecord(record); if (record.rID == seqan::maxValue<int>()) return 0; // at end, could not read any, done // Transcript names. typedef seqan::StringSet<seqan::CharString> TNameStore; typedef seqan::NameStoreCache<TNameStore> TNameStoreCache; TNameStore transcriptNames; TNameStoreCache transcriptNamesCache(transcriptNames); // The splicing instructions for the current contig. std::vector<SplicingInstruction> splicingInstructions; // Materialized sequence. seqan::Dna5String seq; // Tanscript ids, used as a buffer below. seqan::String<unsigned> transcriptIDs; // Read GFF/GTF file contig by contig (must be sorted by reference name). For each contig, we all recors, // create simulation instructions and then build the transcripts for each haplotype. while (record.rID != seqan::maxValue<int>()) // sentinel, at end { seqan::CharString refName = record.ref; std::cerr << "Splicing for " << refName << " ..."; // Read GFF records for this contig. MyGffRecord firstGffRecord = record; while (record.rID == firstGffRecord.rID) { if (empty(options.gffType) || (record.type == options.gffType)) { // Make transcript names known to the record. _appendTranscriptNames(transcriptIDs, transcriptNames, transcriptNamesCache, record); // Add the splicing instructions for this record to the list for this contig. for (unsigned i = 0; i < length(transcriptIDs); ++i) splicingInstructions.push_back(SplicingInstruction(transcriptIDs[i], record.beginPos, record.endPos, record.strand)); } if (atEnd(gffFileIn)) { record.rID = seqan::maxValue<int>(); break; } readRecord(record, gffFileIn); // Translate ref to idx from VCF. unsigned idx = 0; if (!getIdByName(idx, vcfMat.faiIndex, record.ref)) throw MasonIOException("Reference name from GFF/GTF not in VCF!"); record.rID = idx; } // --------------------------------------------------------------- // Process the splicing instructions. // --------------------------------------------------------------- // First, sort them. std::sort(splicingInstructions.begin(), splicingInstructions.end()); // Materialize all haplotypes of this contig int rID = 0, hID = 0; // reference and haplotype id // Get index of the gff record's reference in the VCF file. unsigned idx = 0; if (!getIdByName(idx, vcfMat.faiIndex, refName)) { std::stringstream ss; ss << "Reference from GFF file " << refName << " unknown in FASTA/FAI file."; throw MasonIOException(ss.str()); } rID = idx; vcfMat.currRID = rID - 1; std::vector<SmallVarInfo> varInfos; // small variants for counting in read alignments std::vector<std::pair<int, int> > breakpoints; // unused/ignored while (vcfMat.materializeNext(seq, varInfos, breakpoints, rID, hID)) { std::cerr << " (allele " << (hID + 1) << ")"; if (rID != (int)idx) break; // no more haplotypes for this reference _performSplicing(splicingInstructions, seq, transcriptNames, hID, vcfMat); } std::cerr << " DONE.\n"; // --------------------------------------------------------------- // Handle contig switching. // --------------------------------------------------------------- // Check that the input GFF file is clustered (weaker than sorted) by reference name. if (record.rID < firstGffRecord.rID) throw MasonIOException("GFF file not sorted or clustered by reference."); // Reset transcript names and cache. clear(transcriptNames); refresh(transcriptNamesCache); // Flush splicing instructions. splicingInstructions.clear(); } std::cerr << "\nDone splicing FASTA.\n"; return 0; }
int run() { // Intialization std::cerr << "__INITIALIZATION_____________________________________________________________\n" << "\n"; std::cerr << "Opening files..."; try { vcfMat.init(); open(outStream, toCString(options.outputFileName), seqan::SequenceStream::WRITE); if (!isGood(outStream)) throw MasonIOException("Could not open output file."); if (options.methOptions.simulateMethylationLevels) { open(outMethLevelStream, toCString(options.methFastaOutFile), seqan::SequenceStream::WRITE); if (!isGood(outMethLevelStream)) throw MasonIOException("Could not open methylation output file."); } } catch (MasonIOException e) { std::cerr << "\nERROR: " << e.what() << "\n"; return 1; } std::cerr << " OK\n"; // Perform genome simulation. std::cerr << "\n__MATERIALIZING______________________________________________________________\n" << "\n"; // The identifiers of the just materialized data. int rID = 0, hID = 0; seqan::Dna5String seq; std::cerr << "Materializing..."; MethylationLevels levels; if (options.methOptions.simulateMethylationLevels) // methylation level simulation while (vcfMat.materializeNext(seq, levels, rID, hID)) { std::stringstream ssName; ssName << vcfMat.vcfStream.header.sequenceNames[rID] << options.haplotypeNameSep << (hID + 1); std::cerr << " " << ssName.str(); if (writeRecord(outStream, ssName.str(), seq) != 0) { std::cerr << "ERROR: Could not write materialized sequence to output.\n"; return 1; } std::stringstream ssTop; ssTop << ssName.str() << "/TOP"; if (writeRecord(outMethLevelStream, ssTop.str(), levels.forward) != 0) throw MasonIOException("Problem writing to methylation output file."); std::stringstream ssBottom; ssBottom << ssName.str() << "/BOT"; if (writeRecord(outMethLevelStream, ssBottom.str(), levels.reverse) != 0) throw MasonIOException("Problem writing to methylation output file."); } else // NO methylation level simulation while (vcfMat.materializeNext(seq, rID, hID)) { std::stringstream ssName; ssName << vcfMat.vcfStream.header.sequenceNames[rID] << options.haplotypeNameSep << (hID + 1); std::cerr << " " << ssName.str(); if (writeRecord(outStream, ssName.str(), seq) != 0) { std::cerr << "ERROR: Could not write materialized sequence to output.\n"; return 1; } } std::cerr << " DONE\n"; std::cerr << "\nDone materializing VCF file.\n"; return 0; }