int main(int argc, char* argv[]) { string plsFileName; int advance; if (argc <= 2) { cout << "usage: testAdvance file.pls.h5 advance " << endl; cout << "move 'advance' reads forward in a file." << endl; exit(1); } plsFileName = argv[1]; advance = atoi(argv[2]); ReaderAgglomerate reader; reader.Initialize(plsFileName); SMRTSequence seq; int seqIndex = 0; int i; for (i = 0; i < 4; i++ ){ seq.Free(); reader.Advance(advance); reader.GetNext(seq); } seq.PrintSeq(cout); }
void BaseFile::CopyReadAt(uint32_t readIndex, SMRTSequence &read) { assert(holeNumbers.size() > readIndex); read.HoleNumber(holeNumbers[readIndex]); if (holeXY.size() > 0) { assert(holeXY.size() > readIndex); read.HoleXY(holeXY[readIndex].xy[0], holeXY[readIndex].xy[1]); } DSLength startPos = readStartPositions[readIndex]; DNALength readLength = readLengths[readIndex]; read.length = readLength; read.Allocate(readLength); if (baseCalls.size() > 0) { assert(baseCalls.size() >= readLength + startPos); CopyArray(baseCalls, startPos, readLength, read.seq); } if (qualityValues.size() > 0) { assert(qualityValues.size() >= readLength + startPos); CopyArray(qualityValues, startPos, readLength, read.qual.data); } if (basWidthInFrames.size() > 0) { assert(basWidthInFrames.size() >= readLength + startPos); CopyArray(basWidthInFrames, startPos, readLength, read.widthInFrames); } if (deletionQV.size() > 0) { assert(deletionQV.size() >= readLength + startPos); CopyArray(deletionQV, startPos, readLength, read.deletionQV.data); } if (deletionTag.size() > 0) { assert(deletionTag.size() >= readLength + startPos); CopyArray(deletionTag, startPos, readLength, read.deletionTag); } if (insertionQV.size() > 0) { assert(insertionQV.size() >= readLength + startPos); CopyArray(insertionQV, startPos, readLength, read.insertionQV.data); } if (substitutionQV.size() > 0) { assert(substitutionQV.size() >= readLength + startPos); CopyArray(substitutionQV, startPos, readLength, read.substitutionQV.data); } if (mergeQV.size() > 0) { assert(mergeQV.size() >= readLength + startPos); CopyArray(mergeQV, startPos, readLength, read.mergeQV.data); } if (substitutionTag.size() > 0) { assert(substitutionTag.size() >= readLength + startPos); CopyArray(substitutionTag, startPos, readLength, read.substitutionTag); } if (preBaseFrames.size() > 0) { assert(preBaseFrames.size() >= readLength + startPos); CopyArray(preBaseFrames, startPos, readLength, read.preBaseFrames); } }
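// Editor's note (added): the following usage sketch is not part of the original sources. It shows how CopyReadAt is typically driven to materialize per-ZMW reads from the flat, concatenated arrays held by a populated BaseFile. The header paths and the public visibility of holeNumbers are assumptions.
#include <cstdint>
#include "files/BaseFile.hpp"   // assumed pbdata header
#include "SMRTSequence.hpp"     // assumed pbdata header
void ExampleCopyAllReads(BaseFile &baseFile) {
    // One entry in holeNumbers/readStartPositions/readLengths per ZMW read.
    for (uint32_t readIndex = 0; readIndex < baseFile.holeNumbers.size(); readIndex++) {
        SMRTSequence read;
        baseFile.CopyReadAt(readIndex, read);  // copies bases, QVs, and frame data for this read
        // ... use 'read' ...
        read.Free();
    }
}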
// Given a SMRT sequence and one of its subreads, make the // reverse complement of the subread in the coordinate of the // reverse complement sequence of the SMRT sequence. // Input: // smrtRead - a SMRT read // subreadSequence - a subread of smrtRead // Output: // subreadSequenceRC - the reverse complement of the subread // in the coordinate of the reverse // complement of the SMRT read. void MakeSubreadRC(SMRTSequence & subreadSequenceRC, SMRTSequence & subreadSequence, SMRTSequence & smrtRead) { assert(smrtRead.length >= subreadSequence.length); // Reverse complement sequence of the subread. subreadSequence.MakeRC(subreadSequenceRC); // Update start and end positions of subreadSequenceRC in the // coordinate of the reverse complement sequence of the SMRT read. subreadSequenceRC.SubreadStart(smrtRead.length - subreadSequence.SubreadEnd()); subreadSequenceRC.SubreadEnd(smrtRead.length - subreadSequence.SubreadStart()); subreadSequenceRC.zmwData = smrtRead.zmwData; }
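// Editor's note (added): a minimal usage sketch, not from the original sources, illustrating the coordinate flip MakeSubreadRC performs. It assumes a read of at least 250 bases and uses only calls that appear elsewhere in these excerpts (MakeSubreadAsReference, SubreadStart/SubreadEnd); the header name is an assumption.
#include <cassert>
#include "SMRTSequence.hpp"  // assumed pbdata header
void ExampleMakeSubreadRC(SMRTSequence &smrtRead) {
    SMRTSequence subread, subreadRC;
    // Reference the subread covering [100, 250) of the forward-strand read.
    smrtRead.MakeSubreadAsReference(subread, 100, 250);
    MakeSubreadRC(subreadRC, subread, smrtRead);
    // For a read of length L, [start, end) maps to [L - end, L - start)
    // on the reverse-complemented read; e.g. L == 1000 gives [750, 900).
    assert(subreadRC.SubreadStart() == smrtRead.length - subread.SubreadEnd());
    assert(subreadRC.SubreadEnd() == smrtRead.length - subread.SubreadStart());
    subreadRC.Free();
}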
// Build a standalone SMRTSequence covering [start, end) from a raw base buffer. void CreateSMRTSequence(SMRTSequence& smrt, Nucleotide* seq, int holeNumber, int start, int end) { int size = end - start; smrt.seq = new Nucleotide[size]; memcpy(smrt.seq, seq, size * sizeof(Nucleotide)); smrt.length = size; smrt.deleteOnExit = false; smrt.HoleNumber(holeNumber); smrt.SubreadStart(start); smrt.SubreadEnd(end); }
void SMRTSequence::Copy(const SMRTSequence &rhs, int rhsPos, int rhsLength) { // // Make sure not attempting to copy into self. // SMRTSequence subseq; subseq.ReferenceSubstring(rhs, rhsPos, rhsLength); subseq.title = rhs.title; subseq.titleLength = strlen(rhs.title); if (rhs.length == 0) { if (preBaseFrames != NULL) { delete[] preBaseFrames; preBaseFrames = NULL; } if (widthInFrames != NULL) { delete[] widthInFrames; widthInFrames = NULL; } if (pulseIndex != NULL) { delete[] pulseIndex; pulseIndex = NULL; } ((FASTQSequence*)this)->Copy(subseq); // // Make sure that no values of length 0 are allocated by returning here. // } else { assert(rhs.seq != seq); assert(rhsLength <= rhs.length); assert(rhsPos < rhs.length); ((FASTQSequence*)this)->Copy(subseq); if (rhs.preBaseFrames != NULL) { preBaseFrames = new HalfWord[length]; memcpy(preBaseFrames, rhs.preBaseFrames, length*sizeof(HalfWord)); } if (rhs.widthInFrames != NULL) { widthInFrames = new HalfWord[length]; memcpy(widthInFrames, rhs.widthInFrames, length*sizeof(HalfWord)); } if (rhs.pulseIndex != NULL) { pulseIndex = new int[length]; memcpy(pulseIndex, rhs.pulseIndex, sizeof(int) * length); } } zmwData = rhs.zmwData; }
void AfgBasWriter::WriteIdentifier(SMRTSequence &seq) { afgOut << "clr:0," << seq.length << std::endl; afgOut << "eid:"; std::string fastaTitle; seq.GetFASTATitle(fastaTitle); afgOut << fastaTitle << std::endl; }
void SMRTSequence::MakeSubreadAsReference(SMRTSequence &subread, DNALength subreadStart, int subreadEnd) { // // Just create a reference to a substring of this read. // SetSubreadBoundaries(subread, subreadStart, subreadEnd); subread.ReferenceSubstring(*this, subreadStart, subreadEnd - subreadStart); // The subread references this read, protect the memory. subread.deleteOnExit = false; }
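// Editor's note (added): a short sketch, not from the original sources, contrasting the two subread constructors in these excerpts: MakeSubreadAsReference aliases the parent read's buffer (deleteOnExit == false), while MakeSubreadAsMasked (shown further below) allocates its own masked copy. The header name and a read length of at least 60 bases are assumptions.
#include "SMRTSequence.hpp"  // assumed pbdata header
void ExampleSubreadConstruction(SMRTSequence &read) {
    SMRTSequence aliased, masked;
    // 'aliased' points into read.seq; it must not outlive 'read' and owns no memory.
    read.MakeSubreadAsReference(aliased, 10, 60);
    // 'masked' owns a full-length copy with bases outside [10, 60) overwritten by 'N'.
    read.MakeSubreadAsMasked(masked, 10, 60);
    masked.Free();  // safe: masked owns its buffer (deleteOnExit == true)
}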
// Given a SMRT sequence and a subread interval, make the subread. // Input: // smrtRead - a SMRT sequence // subreadInterval - a subread interval // params - mapping parameters // Output: // subreadSequence - the constructed subread void MakeSubreadOfInterval(SMRTSequence & subreadSequence, SMRTSequence & smrtRead, ReadInterval & subreadInterval, MappingParameters & params) { int start = subreadInterval.start; int end = subreadInterval.end; assert(smrtRead.length >= subreadSequence.length); smrtRead.MakeSubreadAsMasked(subreadSequence, start, end); if (!params.preserveReadTitle) { smrtRead.SetSubreadTitle(subreadSequence, subreadSequence.SubreadStart(), subreadSequence.SubreadEnd()); } else { subreadSequence.CopyTitle(smrtRead.title); } subreadSequence.zmwData = smrtRead.zmwData; }
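// Editor's note (added): illustrative only, not from the original sources. It shows MakeSubreadOfInterval being driven from a list of intervals such as the one produced by CollectSubreadIntervals elsewhere in these excerpts; header names are assumptions.
#include <vector>
#include "SMRTSequence.hpp"       // assumed pbdata header
#include "reads/ReadInterval.hpp" // assumed pbdata header
#include "MappingParameters.h"    // assumed blasr header
void ExampleMakeSubreads(SMRTSequence &smrtRead, std::vector<ReadInterval> &subreadIntervals, MappingParameters &params) {
    for (size_t i = 0; i < subreadIntervals.size(); i++) {
        SMRTSequence subread;
        MakeSubreadOfInterval(subread, smrtRead, subreadIntervals[i], params);
        // With params.preserveReadTitle == false the title becomes
        // "<parent title>/<start>_<end>"; otherwise the parent title is reused.
        subread.Free();
    }
}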
int main(int argc, char* argv[]) { string plsFileName, fastaOutName; if (argc < 3) { cout << "usage: pls2fasta file.pls.h5 file.fasta " << endl; cout << "Print reads stored in hdf as fasta." << endl; exit(1); } vector<string> plsFileNames; plsFileName = argv[1]; fastaOutName = argv[2]; if (FileOfFileNames::IsFOFN(plsFileName)) { FileOfFileNames::FOFNToList(plsFileName, plsFileNames); } else { plsFileNames.push_back(plsFileName); } // Open the output stream once, before iterating over input files, so that later files do not truncate it. ofstream fastaOut; CrucialOpen(fastaOutName, fastaOut); int plsFileIndex; for (plsFileIndex = 0; plsFileIndex < plsFileNames.size(); plsFileIndex++) { ReaderAgglomerate reader; reader.IgnoreCCS(); reader.Initialize(plsFileNames[plsFileIndex]); SMRTSequence seq; while (reader.GetNext(seq)) { seq.PrintQualSeq(fastaOut); } } }
void MakeVirtualRead(SMRTSequence & smrtRead, const vector<SMRTSequence> & subreads) { assert(subreads.size() > 0); // Initialize hqStart from the first subread so that min() below actually finds the smallest subread start. DNALength hqStart = DNALength(subreads[0].SubreadStart()), hqEnd = 0; for(auto subread: subreads) { hqStart = min(DNALength(subread.SubreadStart()), hqStart); hqEnd = max(DNALength(subread.SubreadEnd()), hqEnd); } smrtRead.Free(); smrtRead.Allocate(hqEnd); memset(smrtRead.seq, 'N', sizeof(char) * hqEnd); smrtRead.lowQualityPrefix = hqStart; smrtRead.lowQualitySuffix = smrtRead.length - hqEnd; smrtRead.highQualityRegionScore = subreads[0].highQualityRegionScore; smrtRead.HoleNumber(subreads[0].HoleNumber()); stringstream ss; ss << SMRTTitle(subreads[0].GetTitle()).MovieName() << "/" << subreads[0].HoleNumber(); smrtRead.CopyTitle(ss.str()); for (auto subread: subreads) { memcpy(&smrtRead.seq[subread.SubreadStart()], &subread.seq[0], sizeof(char) * subread.length); } }
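// Editor's note (added): usage sketch, not part of the original sources. MakeVirtualRead stitches subreads back onto a single ZMW-length read, padding positions not covered by any subread with 'N'; it assumes the subreads carry valid SubreadStart/SubreadEnd coordinates and a title whose movie name SMRTTitle can parse. The header name is an assumption.
#include <vector>
#include "SMRTSequence.hpp"  // assumed pbdata header
void ExampleMakeVirtualRead(const std::vector<SMRTSequence> &subreads) {
    SMRTSequence virtualRead;
    MakeVirtualRead(virtualRead, subreads);  // requires a non-empty subread vector
    // lowQualityPrefix / lowQualitySuffix record the flanks outside the
    // union of the subread intervals; the title is "<movie>/<holeNumber>".
    virtualRead.Free();
}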
void SMRTSequence::MakeSubreadAsMasked(SMRTSequence &subread, DNALength subreadStart, int subreadEnd) { // // This creates the entire subread, but masks out the portions // that do not correspond to this insert. // SetSubreadBoundaries(subread, subreadStart, subreadEnd); subread.Copy(*this); DNALength pos; for (pos = 0; pos < subreadStart; pos++) { subread.seq[pos] = 'N'; } for (pos = subreadEnd; pos < length; pos++) { subread.seq[pos] = 'N'; } // This is newly allocated memory, free it on exit. subread.deleteOnExit = true; }
bool HDFPulseWriter::WriteOneZmw(const SMRTSequence& seq, const std::vector<RegionAnnotation>& regions) { if (not this->WriteOneZmw(seq)) { return false; } if (HasRegions()) { if (regions.size() == 0) { std::vector<RegionAnnotation> fake = { RegionAnnotation(seq.HoleNumber(), HQRegion, 0, 0, 0)}; return regionsWriter_->Write(fake); } else { return regionsWriter_->Write(regions); } } return true; }
bool HDFBaseCallsWriter::_WriteSubstitutionTag(const SMRTSequence & read) { if (HasSubstitutionTag()) { if (read.substitutionTag == nullptr) { AddErrorMessage(std::string(PacBio::GroupNames::substitutiontag) + " absent in read " + read.GetTitle()); return false; } else { substitutionTagArray_.Write(read.substitutionTag, read.length); return true; } } return true; }
int main(int argc, char* argv[]) { string program = "pls2fasta"; string versionString = VERSION; AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString); string plsFileName, fastaOutName; vector<string> plsFileNames; bool trimByRegion, maskByRegion; trimByRegion = false; maskByRegion = false; int argi = 3; RegionTable regionTable; string regionsFOFNName = ""; vector<string> regionFileNames; bool splitSubreads = true; int minSubreadLength = 0; bool addSimulatedData = false; bool printSimulatedCoordinate = false; bool printSimulatedSequenceIndex = false; bool printFastq = false; bool printCcs = false; int lineLength = 50; int minReadScore = 0; vector<int> holeNumbers; CommandLineParser clp; bool printOnlyBest = false; clp.SetProgramName(program); clp.SetVersion(versionString); clp.RegisterStringOption("in.pls.h5", &plsFileName, "Input pls.h5/bax.h5/fofn file.", true); clp.RegisterStringOption("out.fasta", &fastaOutName, "Output fasta/fastq file.", true); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterFlagOption("trimByRegion", &trimByRegion, "Trim away low quality regions."); clp.RegisterFlagOption("maskByRegion", &maskByRegion, "Mask low quality regions with 'N'."); clp.RegisterStringOption("regionTable", &regionsFOFNName, "Optional HDF file with a /PulseData/Regions dataset."); clp.RegisterIntOption("minSubreadLength", &minSubreadLength, "Do not write subreads less than the specified length.", CommandLineParser::PositiveInteger); clp.RegisterFlagOption("noSplitSubreads", &splitSubreads, "Do not split reads on adapter sequences."); clp.RegisterIntListOption("holeNumber", &holeNumbers, "Only print this hole number (or list of numbers)."); clp.RegisterFlagOption("fastq", &printFastq, "Print in FASTQ format with quality."); clp.RegisterFlagOption("ccs", &printCcs, "Print de novo CCS sequences."); clp.RegisterIntOption("lineLength", &lineLength, "Specify fasta/fastq line length.", CommandLineParser::PositiveInteger); clp.RegisterIntOption("minReadScore", &minReadScore, "Minimum read score to print a read. The score is " "a number between 0 and 1000 and represents the expected accuracy percentage * 10. " "A typical value would be between 750 and 800. This does not apply to ccs reads.", CommandLineParser::NonNegativeInteger); clp.RegisterFlagOption("best", &printOnlyBest, "If a CCS sequence exists, print this. Otherwise, print the longest " "subread. This does not support fastq."); string description = ("Converts pls.h5/bax.h5/fofn files to fasta or fastq files. Although fasta files are provided" " with every run, they are neither trimmed nor split into subreads. This program takes " "additional annotation information, such as the subread coordinates and high quality regions, " "and uses them to create fasta sequences that are substrings of all bases called. Most of the time " "you will want to trim low quality reads, so you should specify -trimByRegion."); clp.SetProgramSummary(description); clp.ParseCommandLine(argc, argv); cerr << "[INFO] " << GetTimestamp() << " [" << program << "] started." << endl; if (trimByRegion and maskByRegion) { cout << "ERROR! You cannot both trim and mask regions. Use one or the other." << endl; exit(1); } if (printFastq) { // Setting lineLength to 0 flags printing the sequence on a single line.
lineLength = 0; } if (FileOfFileNames::IsFOFN(plsFileName)) { FileOfFileNames::FOFNToList(plsFileName, plsFileNames); } else { plsFileNames.push_back(plsFileName); } if (regionsFOFNName == "") { regionFileNames = plsFileNames; } else { if (FileOfFileNames::IsFOFN(regionsFOFNName)) { FileOfFileNames::FOFNToList(regionsFOFNName, regionFileNames); } else { regionFileNames.push_back(regionsFOFNName); } } ofstream fastaOut; CrucialOpen(fastaOutName, fastaOut); int plsFileIndex; HDFRegionTableReader hdfRegionReader; sort(holeNumbers.begin(), holeNumbers.end()); for (plsFileIndex = 0; plsFileIndex < plsFileNames.size(); plsFileIndex++) { if (trimByRegion or maskByRegion or splitSubreads) { hdfRegionReader.Initialize(regionFileNames[plsFileIndex]); hdfRegionReader.ReadTable(regionTable); regionTable.SortTableByHoleNumber(); } ReaderAgglomerate reader; HDFBasReader ccsReader; if (printOnlyBest) { ccsReader.SetReadBasesFromCCS(); ccsReader.Initialize(plsFileNames[plsFileIndex]); } if (printCcs == false) { reader.IgnoreCCS(); } else { reader.hdfBasReader.SetReadBasesFromCCS(); } if (addSimulatedData) { reader.hdfBasReader.IncludeField("SimulatedCoordinate"); reader.hdfBasReader.IncludeField("SimulatedSequenceIndex"); } if (reader.SetReadFileName(plsFileNames[plsFileIndex]) == 0) { cout << "ERROR, could not determine file type: " << plsFileNames[plsFileIndex] << endl; exit(1); } if (reader.Initialize() == 0) { cout << "ERROR, could not initialize file " << plsFileNames[plsFileIndex] << endl; exit(1); } DNALength simulatedCoordinate; DNALength simulatedSequenceIndex; reader.SkipReadQuality(); SMRTSequence seq; vector<ReadInterval> subreadIntervals; SMRTSequence ccsSeq; while (reader.GetNext(seq)) { if (printOnlyBest) { ccsReader.GetNext(ccsSeq); } if (holeNumbers.size() != 0 and binary_search(holeNumbers.begin(), holeNumbers.end(), seq.zmwData.holeNumber) == false) { continue; } if (seq.length == 0) { continue; } if (addSimulatedData) { reader.hdfBasReader.simulatedCoordinateArray.Read(reader.hdfBasReader.curRead-1, reader.hdfBasReader.curRead, &simulatedCoordinate); reader.hdfBasReader.simulatedSequenceIndexArray.Read(reader.hdfBasReader.curRead-1, reader.hdfBasReader.curRead, &simulatedSequenceIndex); } if (printCcs == true) { if (printFastq == false) { seq.PrintSeq(fastaOut); } else { seq.PrintFastq(fastaOut, lineLength); } continue; } // // Determine the high quality boundaries of the read. This is // the full read if no hq regions exist, or if it is stated to // ignore regions. // DNALength hqReadStart, hqReadEnd; int hqRegionScore; if (GetReadTrimCoordinates(seq, seq.zmwData, regionTable, hqReadStart, hqReadEnd, hqRegionScore) == false or (trimByRegion == false and maskByRegion == false)) { hqReadStart = 0; hqReadEnd = seq.length; } // // Mask off the low quality portions of the reads. // if (maskByRegion) { if (hqReadStart > 0) { fill(&seq.seq[0], &seq.seq[hqReadStart], 'N'); } if (hqReadEnd != seq.length) { fill(&seq.seq[hqReadEnd], &seq.seq[seq.length], 'N'); } } // // Now possibly print the full read with masking. // if (splitSubreads == false) { ReadInterval wholeRead(0, seq.length); // The set of subread intervals is just the entire read. subreadIntervals.clear(); subreadIntervals.push_back(wholeRead); } else { // // Print subread coordinates whether or not the reads have subreads. // subreadIntervals.clear(); // clear old, new intervals are appended.
CollectSubreadIntervals(seq, &regionTable, subreadIntervals); } // // Output all subreads as separate sequences. // int intvIndex; SMRTSequence bestSubreadSequence; int bestSubreadScore = -1; int bestSubreadIndex = 0; int bestSubreadStart = 0, bestSubreadEnd = 0; SMRTSequence bestSubread; for (intvIndex = 0; intvIndex < subreadIntervals.size(); intvIndex++) { SMRTSequence subreadSequence, subreadSequenceRC; subreadSequence.subreadStart = subreadIntervals[intvIndex].start; subreadSequence.subreadEnd = subreadIntervals[intvIndex].end; // // When trimming by region, only output the parts of the // subread that overlap the hq region. // if (trimByRegion == true) { subreadSequence.subreadStart = max((DNALength) subreadIntervals[intvIndex].start, hqReadStart); subreadSequence.subreadEnd = min((DNALength) subreadIntervals[intvIndex].end, hqReadEnd); } if (subreadSequence.subreadStart >= subreadSequence.subreadEnd or subreadSequence.subreadEnd - subreadSequence.subreadStart <= minSubreadLength) { // // There is no high quality portion of this subread. Skip it. // continue; } if (hqRegionScore < minReadScore) { continue; } // // Print the subread, adding the coordinates as part of the title. // subreadSequence.ReferenceSubstring(seq, subreadSequence.subreadStart, subreadSequence.subreadEnd - subreadSequence.subreadStart); stringstream titleStream; titleStream << seq.title; if (splitSubreads) { // // Add the subread coordinates if splitting on subread. // titleStream << "/" << subreadSequence.subreadStart << "_" << subreadSequence.subreadEnd; } // // If running on simulated data, add where the values were simulated from. // if (addSimulatedData) { titleStream << ((FASTASequence*)&seq)->title << "/chrIndex_" << simulatedSequenceIndex << "/position_" << simulatedCoordinate; ((FASTASequence*)&seq)->CopyTitle(titleStream.str()); } subreadSequence.CopyTitle(titleStream.str()); // // Eventually replace with WriterAgglomerate. // if (printOnlyBest == false) { if (subreadSequence.length > 0) { if (printFastq == false) { ((FASTASequence*)&subreadSequence)->PrintSeq(fastaOut); } else { subreadSequence.PrintFastq(fastaOut, lineLength); } } delete[] subreadSequence.title; } else { int subreadWeightedScore = subreadSequence.length * hqRegionScore; if (subreadWeightedScore > bestSubreadScore) { bestSubreadIndex = intvIndex; bestSubread = subreadSequence; bestSubreadScore = subreadWeightedScore; } } } if (printOnlyBest) { if (ccsSeq.length > 0) { if (printFastq == false) { ccsSeq.PrintSeq(fastaOut); } else { ccsSeq.PrintFastq(fastaOut, ccsSeq.length); } } else { if (bestSubreadScore >= 0) { if (printFastq == false) { bestSubread.PrintSeq(fastaOut); } else { bestSubread.PrintFastq(fastaOut, bestSubread.length); } bestSubread.Free(); } } ccsSeq.Free(); } seq.Free(); } reader.Close(); hdfRegionReader.Close(); } cerr << "[INFO] " << GetTimestamp() << " [" << program << "] ended." << endl; }
bool SubreadConverter::ConvertFile(HDFBasReader* reader, PacBio::BAM::BamWriter* writer, PacBio::BAM::BamWriter* scrapsWriter) { assert(reader); // initialize with default values (shared across all unmapped subreads) BamRecordImpl bamRecord; // read region table info std::unique_ptr<HDFRegionTableReader> const regionTableReader(new HDFRegionTableReader); RegionTable regionTable; string fn = filenameForReader_[reader]; assert(!fn.empty()); if (regionTableReader->Initialize(fn) == 0) { AddErrorMessage("could not read region table on "+fn); return false; } regionTable.Reset(); regionTableReader->ReadTable(regionTable); regionTableReader->Close(); // initialize read scores InitReadScores(reader); // fetch records from HDF5 file SMRTSequence smrtRecord; while (reader->GetNext(smrtRecord)) { // compute subread & adapter intervals SubreadInterval hqInterval; deque<SubreadInterval> subreadIntervals; deque<SubreadInterval> adapterIntervals; try { hqInterval = ComputeSubreadIntervals(&subreadIntervals, &adapterIntervals, regionTable, smrtRecord.zmwData.holeNumber, smrtRecord.length); } catch (runtime_error& e) { AddErrorMessage(string(e.what())); smrtRecord.Free(); return false; } // sequencing ZMW if (IsSequencingZmw(smrtRecord)) { // write subreads to main BAM file for (const SubreadInterval& interval : subreadIntervals) { // skip invalid or 0-sized intervals if (interval.End <= interval.Start) continue; if (!WriteSubreadRecord(smrtRecord, interval.Start, interval.End, ReadGroupId(), static_cast<uint8_t>(interval.LocalContextFlags), writer)) { smrtRecord.Free(); return false; } } // if scraps BAM file present if (scrapsWriter) { // write 5'-end LQ sequence if (hqInterval.Start > 0) { if (!WriteLowQualityRecord(smrtRecord, 0, hqInterval.Start, ScrapsReadGroupId(), scrapsWriter)) { smrtRecord.Free(); return false; } } // write adapters for (const SubreadInterval& interval : adapterIntervals) { // skip invalid or 0-sized adapters if (interval.End <= interval.Start) continue; if (!WriteAdapterRecord(smrtRecord, interval.Start, interval.End, ScrapsReadGroupId(), scrapsWriter)) { smrtRecord.Free(); return false; } } // write 3'-end LQ sequence if (hqInterval.End < smrtRecord.length) { if (!WriteLowQualityRecord(smrtRecord, hqInterval.End, smrtRecord.length, ScrapsReadGroupId(), scrapsWriter)) { smrtRecord.Free(); return false; } } } } // sequencing ZMW // non-sequencing ZMW else { assert(!IsSequencingZmw(smrtRecord)); // only write these if scraps BAM present & we are in 'internal mode' if (settings_.isInternal && scrapsWriter) { // write 5'-end LQ sequence to scraps BAM if (hqInterval.Start > 0) { if (!WriteLowQualityRecord(smrtRecord, 0, hqInterval.Start, ScrapsReadGroupId(), scrapsWriter)) { smrtRecord.Free(); return false; } } // write subreads & adapters to scraps BAM, sorted by query start while (!subreadIntervals.empty() && !adapterIntervals.empty()) { const SubreadInterval& subread = subreadIntervals.front(); const SubreadInterval& adapter = adapterIntervals.front(); assert(subread.Start != adapter.Start); if (subread.Start < adapter.Start) { if (!WriteFilteredRecord(smrtRecord, subread.Start, subread.End, ScrapsReadGroupId(), static_cast<uint8_t>(subread.LocalContextFlags), scrapsWriter)) { smrtRecord.Free(); return false; } subreadIntervals.pop_front(); } else { if (!WriteAdapterRecord(smrtRecord, adapter.Start, adapter.End, ScrapsReadGroupId(), scrapsWriter)) { smrtRecord.Free(); return false; } adapterIntervals.pop_front(); } } // flush any trailing subread intervals while
(!subreadIntervals.empty()) { assert(adapterIntervals.empty()); const SubreadInterval& subread = subreadIntervals.front(); if (!WriteFilteredRecord(smrtRecord, subread.Start, subread.End, ScrapsReadGroupId(), static_cast<uint8_t>(subread.LocalContextFlags), scrapsWriter)) { smrtRecord.Free(); return false; } subreadIntervals.pop_front(); } // flush any remaining adapter intervals while (!adapterIntervals.empty()) { assert(subreadIntervals.empty()); const SubreadInterval& adapter = adapterIntervals.front(); if (!WriteAdapterRecord(smrtRecord, adapter.Start, adapter.End, ScrapsReadGroupId(), scrapsWriter)) { smrtRecord.Free(); return false; } adapterIntervals.pop_front(); } // write 3'-end LQ sequence to scraps BAM if (hqInterval.End < smrtRecord.length) { if (!WriteLowQualityRecord(smrtRecord, hqInterval.End, smrtRecord.length, ScrapsReadGroupId(), scrapsWriter)) { smrtRecord.Free(); return false; } } } } // non-sequencing ZMW smrtRecord.Free(); } // if we get here, all OK return true; }
bool HDFBaseCallsWriter::_WritePulseIndex(const SMRTSequence & read) { if (HasPulseIndex()) { if (read.copiedFromBam) { const PacBio::BAM::BamRecord & record = read.bamRecord; if (record.HasPulseCall()) { const std::string & pulsecall = record.PulseCall(); std::vector<uint16_t> data(read.length, 0); size_t indx = 0; for(size_t i = 0; i < pulsecall.size(); i++) { const char base = pulsecall[i]; if (base == 'A' or base == 'C' or base == 'G' or base == 'T') { assert(indx < read.length); data[indx] = static_cast<uint16_t>(i); indx ++; } else { assert(base == 'a' or base == 'c' or base == 'g' or base == 't'); } } assert(indx == read.length); pulseIndexArray_.Write(&data[0], read.length); return true; } } AddErrorMessage(std::string(PacBio::GroupNames::pulseindex) + " absent in read " + read.GetTitle()); return false; } return true; }
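// Editor's note (added): a standalone sketch, not from the original sources, of the PulseIndex derivation used above. In a BAM pulse call string, upper-case characters are incorporated bases and lower-case characters are rejected pulses, so the i-th basecall maps to the index of the i-th upper-case character.
#include <cstdint>
#include <string>
#include <vector>
std::vector<uint16_t> PulseIndexFromPulseCall(const std::string &pulsecall) {
    std::vector<uint16_t> pulseIndex;
    for (size_t i = 0; i < pulsecall.size(); i++) {
        const char c = pulsecall[i];
        if (c == 'A' || c == 'C' || c == 'G' || c == 'T') {
            pulseIndex.push_back(static_cast<uint16_t>(i));
        }
    }
    return pulseIndex;
}
// Example: PulseIndexFromPulseCall("AaCGgT") yields {0, 2, 3, 5}.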
int main(int argc, char* argv[]) { std::string outFileName; unsigned contextLength = 5; int minSamples = 500; int maxSamples = 1000; if (argc < 3) { PrintUsage(); std::exit(EXIT_FAILURE); } int argi = 1; std::string cmpH5FileName; cmpH5FileName = argv[argi++]; outFileName = argv[argi++]; int minAverageQual = 0; bool onlyMaxLength = false; while (argi < argc) { if (strcmp(argv[argi], "-contextLength") == 0) { contextLength = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-minSamples") == 0) { minSamples = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-maxSamples") == 0) { maxSamples = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-onlyMaxLength") == 0) { onlyMaxLength = true; } else { PrintUsage(); std::cout << "ERROR, bad option: " << argv[argi] << std::endl; std::exit(EXIT_FAILURE); } ++argi; } std::map<std::string, ScoredLength> maxLengthMap; OutputSampleListSet samples(contextLength); SMRTSequence read; std::ofstream sampleOut; CrucialOpen(outFileName, sampleOut, std::ios::out | std::ios::binary); int fileNameIndex; int numContextsReached = 0; int numContexts = 1 << (contextLength * 2); ReaderAgglomerate reader; samples.keyLength = contextLength; HDFCmpFile<CmpAlignment> cmpReader; cmpReader.IncludeField("QualityValue"); cmpReader.IncludeField("DeletionQV"); cmpReader.IncludeField("InsertionQV"); cmpReader.IncludeField("SubstitutionQV"); cmpReader.IncludeField("SubstitutionTag"); cmpReader.IncludeField("DeletionTag"); cmpReader.IncludeField("PulseIndex"); cmpReader.IncludeField("WidthInFrames"); cmpReader.IncludeField("PreBaseFrames"); if (cmpReader.Initialize(cmpH5FileName, H5F_ACC_RDWR) == 0) { std::cout << "ERROR, could not open the cmp file." << std::endl; std::exit(EXIT_FAILURE); } std::cout << "Reading cmp file." << std::endl; CmpFile cmpFile; cmpReader.ReadAlignmentDescriptions(cmpFile); cmpReader.ReadStructure(cmpFile); std::cout << "done reading structure." << std::endl; int alignmentIndex; int nAlignments = cmpReader.alnInfoGroup.GetNAlignments(); std::vector<int> alignmentToBaseMap; for (alignmentIndex = 0; alignmentIndex < nAlignments and !samples.Sufficient(); alignmentIndex++) { // // For ease of use, store the length of the alignment to make another model. 
// ByteAlignment alignmentArray; cmpReader.ReadAlignmentArray(alignmentIndex, alignmentArray); Alignment alignment; ByteAlignmentToAlignment(alignmentArray, alignment); std::string readSequence, refSequence; readSequence.resize(alignmentArray.size()); refSequence.resize(alignmentArray.size()); DNASequence readDNA, refDNA; ByteAlignmentToQueryString(&alignmentArray[0], alignmentArray.size(), &readSequence[0]); ByteAlignmentToRefString(&alignmentArray[0], alignmentArray.size(), &refSequence[0]); RemoveGaps(readSequence, readSequence); RemoveGaps(refSequence, refSequence); readDNA.seq = (Nucleotide*)readSequence.c_str(); readDNA.length = readSequence.size(); refDNA.seq = (Nucleotide*)refSequence.c_str(); refDNA.length = refSequence.size(); CmpAlignment cmpAlignment; cmpReader.ImportReadFromCmpH5(alignmentIndex, cmpAlignment, read); CreateAlignmentToSequenceMap(alignmentArray, alignmentToBaseMap); if (read.length < contextLength) { continue; } int subreadLength = (cmpFile.alnInfo.alignments[alignmentIndex].GetQueryEnd() - cmpFile.alnInfo.alignments[alignmentIndex].GetQueryStart()); if (onlyMaxLength == false) { samples.lengths.push_back(subreadLength); } else { int score = (cmpAlignment.GetNMatch() - cmpAlignment.GetNMismatch() - cmpAlignment.GetNInsertions() - cmpAlignment.GetNDeletions()); std::stringstream nameStrm; nameStrm << cmpAlignment.GetMovieId() << "_" << cmpAlignment.GetHoleNumber(); std::string nameStr = nameStrm.str(); if (maxLengthMap.find(nameStr) == maxLengthMap.end()) { maxLengthMap[nameStr] = ScoredLength(score, subreadLength); } } int sampleEnd = alignmentArray.size() - contextLength / 2; int a; for (a = contextLength / 2; a < sampleEnd; a++) { // Make sure the context begins on a real nucleotide. while (a < sampleEnd and ((RefChar[alignmentArray[a]] == ' '))) { a++; } // // Move ab back to an index where there are contextLength/2 non-gap // characters, counted by nb // int ab; //num bases int ae; //alignment end ab = a - 1; int nb = 0, ne = 0; while (true) { if (RefChar[alignmentArray[ab]] != ' ') { nb++; } if (ab == 0 or nb == static_cast<int>(contextLength) / 2) break; ab--; } // // Advance ae to an index where there are contextLength/2 non-gap // characters, counted by ne. // ae = a + 1; while (ae < static_cast<int>(alignmentArray.size()) and ne < static_cast<int>(contextLength) / 2) { if (RefChar[alignmentArray[ae]] != ' ') { ne++; } ae++; } // // Make sure there are no edge effects that prevent a context of the correct length from being assigned. // if (nb + ne + 1 != static_cast<int>(contextLength)) { continue; } int ai; std::string context; for (ai = ab; ai < ae; ai++) { if (RefChar[alignmentArray[ai]] != ' ') { context.push_back(RefChar[alignmentArray[ai]]); } } assert(context.size() == contextLength); // // Now create the context. // OutputSample sample; // // This context is a deletion, create that. // sample.type = OutputSample::Deletion; // // This context is either an insertion or substitution // // Look to see if the previous aligned position was an // insertion, and move back as far as the insertion extends. 
int aq = a - 1; int sampleLength; if (QueryChar[alignmentArray[a]] == ' ') { sample.type = OutputSample::Deletion; sampleLength = 0; } else if (RefChar[alignmentArray[aq]] == ' ') { while (aq > 0 and RefChar[alignmentArray[aq]] == ' ' and QueryChar[alignmentArray[aq]] != ' ') { aq--; } sample.type = OutputSample::Insertion; sampleLength = a - aq; } else if (QueryChar[alignmentArray[a]] == RefChar[alignmentArray[aq]]) { sample.type = OutputSample::Match; sampleLength = 1; } else { sample.type = OutputSample::Substitution; sampleLength = 1; } sample.Resize(sampleLength); if (sampleLength > 0) { int seqPos = alignmentToBaseMap[aq]; if (seqPos < static_cast<int>(read.length)) { sample.CopyFromSeq(read, seqPos, sampleLength); std::string nucs; for (size_t n = 0; n < sample.nucleotides.size(); n++) { char c = sample.nucleotides[n]; assert(c == 'A' or c == 'T' or c == 'G' or c == 'C'); nucs.push_back(sample.nucleotides[n]); } } } samples.AppendOutputSample(context, sample); } read.Free(); } if (onlyMaxLength) { std::map<std::string, ScoredLength>::iterator maxScoreIt; for (maxScoreIt = maxLengthMap.begin(); maxScoreIt != maxLengthMap.end(); ++maxScoreIt) { std::cout << maxScoreIt->second.length << std::endl; samples.lengths.push_back(maxScoreIt->second.length); } } samples.Write(sampleOut); return 0; }
bool HDFBaseCallsWriter::_WriteIPD(const SMRTSequence & read) { if (HasIPD()) { if (read.preBaseFrames == nullptr) { AddErrorMessage(std::string(PacBio::GroupNames::prebaseframes) + " absent in read " + read.GetTitle()); return false; } else { ipdArray_.Write(read.preBaseFrames, read.length); return true; } } return true; }
void SMRTSequence::SetSubreadTitle(SMRTSequence &subread, DNALength subreadStart, DNALength subreadEnd) { stringstream titleStream; titleStream << title << "/"<< subreadStart << "_" << subreadEnd; subread.CopyTitle(titleStream.str()); }
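// Editor's note (added): illustrative sketch, not from the original sources, of the title produced by SetSubreadTitle. It assumes the parent read is titled "movie/1234" and that GetTitle returns a std::string-compatible value, as suggested by its use elsewhere in these excerpts; the header name is an assumption.
#include <cassert>
#include <string>
#include "SMRTSequence.hpp"  // assumed pbdata header
void ExampleSubreadTitle(SMRTSequence &read) {
    SMRTSequence subread;
    read.SetSubreadTitle(subread, 100, 250);
    // The subread title is the parent title with coordinates appended, e.g. "movie/1234/100_250".
    assert(std::string(subread.GetTitle()) == std::string(read.GetTitle()) + "/100_250");
    subread.Free();
}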
bool HqRegionConverter::ConvertFile(HDFBasReader* reader, PacBio::BAM::BamWriter* writer, PacBio::BAM::BamWriter* scrapsWriter) { assert(reader); // read region table info std::unique_ptr<HDFRegionTableReader> const regionTableReader(new HDFRegionTableReader); RegionTable regionTable; std::string fn = filenameForReader_[reader]; assert(!fn.empty()); if (regionTableReader->Initialize(fn) == 0) { AddErrorMessage("could not read region table on "+fn); return false; } regionTable.Reset(); regionTableReader->ReadTable(regionTable); regionTableReader->Close(); // initialize read scores InitReadScores(reader); // fetch records from HDF5 file SMRTSequence smrtRecord; int hqStart, hqEnd, score; while (reader->GetNext(smrtRecord)) { // attempt get high quality region if (!LookupHQRegion(smrtRecord.zmwData.holeNumber, regionTable, hqStart, hqEnd, score)) { stringstream s; s << "could not find HQ region for hole number: " << smrtRecord.zmwData.holeNumber; AddErrorMessage(s.str()); smrtRecord.Free(); return false; } // Catch and repair 1-off errors in the HQ region hqEnd = (hqEnd == static_cast<int>(smrtRecord.length)-1) ? smrtRecord.length : hqEnd; // sequencing ZMW if (IsSequencingZmw(smrtRecord)) { // write HQRegion to main BAM file if (hqStart < hqEnd) { if (!WriteRecord(smrtRecord, hqStart, hqEnd, ReadGroupId(), writer)) { smrtRecord.Free(); return false; } } // if scraps BAM file present if (scrapsWriter) { // write 5'-end LQ sequence if (hqStart > 0) { if (!WriteLowQualityRecord(smrtRecord, 0, hqStart, ScrapsReadGroupId(), scrapsWriter)) { smrtRecord.Free(); return false; } } // write 3'-end LQ sequence if (static_cast<size_t>(hqEnd) < smrtRecord.length) { if (!WriteLowQualityRecord(smrtRecord, hqEnd, smrtRecord.length, ScrapsReadGroupId(), scrapsWriter)) { smrtRecord.Free(); return false; } } } } // non-sequencing ZMW else { assert(!IsSequencingZmw(smrtRecord)); // only write these if scraps BAM present & we are in 'internal mode' if (settings_.isInternal && scrapsWriter) { // write 5'-end LQ sequence if (hqStart > 0) { if (!WriteLowQualityRecord(smrtRecord, 0, hqStart, ScrapsReadGroupId(), scrapsWriter)) { smrtRecord.Free(); return false; } } // write HQRegion to scraps BAM file if (hqStart < hqEnd) { if (!WriteFilteredRecord(smrtRecord, hqStart, hqEnd, ScrapsReadGroupId(), scrapsWriter)) { smrtRecord.Free(); return false; } } // write 3'-end LQ sequence if (static_cast<size_t>(hqEnd) < smrtRecord.length) { if (!WriteLowQualityRecord(smrtRecord, hqEnd, smrtRecord.length, ScrapsReadGroupId(), scrapsWriter)) { smrtRecord.Free(); return false; } } } } smrtRecord.Free(); } // if we get here, all OK return true; }
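// Editor's note (added): a minimal sketch, not from the original sources, of the HQ-region end repair applied above: some region tables report the HQ end as (read length - 1) instead of the read length, and the converter widens such an end back to the full read length before emitting records. The helper name is hypothetical.
#include <cstddef>
static inline int RepairHqEnd(int hqEnd, size_t readLength) {
    // Treat an end of (length - 1) as an off-by-one and clamp it up to length.
    return (hqEnd == static_cast<int>(readLength) - 1) ? static_cast<int>(readLength) : hqEnd;
}
// Example: RepairHqEnd(4095, 4096) == 4096, while RepairHqEnd(4000, 4096) == 4000.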
int main(int argc, char* argv[]) { string refGenomeFileName = ""; string lengthModelFileName = ""; string outputModelFileName = ""; DNALength numBasesPerFile = 0; string sourceReadsFileName = ""; string titleTableFileName = ""; int numBasH5Files = 1; string basH5BaseFileName = "simulated"; string movieName = "m101211_092754_00114_cSIM_s1_p0"; bool doRandGenInit = true; bool usePosMap = false; bool printPercentRepeat = false; string posMapFileName = ""; vector<string> movieNames; bool useLengthModel = false; bool useFixedLength = false; ofstream posMapFile; int scaledLength = 0; int fixedLength = 0; int nBasFiles = 1; bool useLengthsModel = true; bool printHelp = false; // Look to see if the refAsReads flag is specified anywhere before // parsing the command line. CommandLineParser clp; string commandLine; string helpString; SetHelp(helpString); vector<string> fns; clp.RegisterStringOption("genome", &refGenomeFileName, ""); clp.RegisterIntOption("numBasesPerFile", (int*)&numBasesPerFile, "", CommandLineParser::PositiveInteger); clp.RegisterStringOption("sourceReads", &sourceReadsFileName, ""); clp.RegisterStringOption("lengthModel", &lengthModelFileName, ""); clp.RegisterIntOption("fixedLength", &fixedLength, "", CommandLineParser::PositiveInteger); clp.RegisterFlagOption("lengthModel", &useLengthModel, ""); clp.RegisterStringOption("movieName", &movieName, ""); clp.RegisterStringOption("titleTable", &titleTableFileName, ""); clp.RegisterStringOption("baseFileName", &basH5BaseFileName, ""); clp.RegisterIntOption("nFiles", &nBasFiles, "", CommandLineParser::PositiveInteger); clp.RegisterIntOption("meanLength", &scaledLength, "", CommandLineParser::PositiveInteger); clp.RegisterStringOption("posMap", &posMapFileName, ""); clp.RegisterFlagOption("printPercentRepeat", &printPercentRepeat, ""); clp.RegisterFlagOption("h", &printHelp, ""); clp.SetHelp(helpString); clp.ParseCommandLine(argc, argv, fns); clp.CommandLineToString(argc, argv, commandLine); clp.SetProgramName("alchemy"); if (fns.size() > 0) { outputModelFileName = fns[0]; } if (argc <= 1 or printHelp or outputModelFileName == "") { cout << helpString << endl; exit(0); } if (usePosMap) { CrucialOpen(posMapFileName, posMapFile, std::ios::out); } if (sourceReadsFileName == "" and fixedLength == 0) { useLengthModel = true; } if (useLengthModel and fixedLength != 0) { cout << "ERROR! You cannot use both a length model and a fixed length." << endl; exit(1); } if (sourceReadsFileName == "" and numBasesPerFile == 0) { cout << "ERROR! You must specify either a set of reads to use as " << endl << "original reads for simulation or the total number of bases " << endl << "to simulate in each bas.h5 file." << endl; exit(1); } if (sourceReadsFileName == "" and refGenomeFileName == "") { cout << "ERROR! You must specify a genome to sample reads from or a set of reads " << endl << "to use as original reads for simulation." << endl; exit(1); } if (fixedLength != 0 and refGenomeFileName == "") { cout << "ERROR! You must specify a genome file if using a fixed length." << endl; exit(1); } if ((fixedLength != 0 or scaledLength != 0) and sourceReadsFileName != "") { cout << "ERROR! You cannot specify a fixed length or a mean length with a source " << endl << "reads file. The read lengths are taken from the source reads or the length model." << endl; exit(1); } LengthHistogram lengthHistogram; OutputSampleListSet outputModel(0); TitleTable titleTable; if (doRandGenInit) { InitializeRandomGeneratorWithTime(); } // // Read models.
// if (titleTableFileName != "") { titleTable.Read(titleTableFileName); } outputModel.Read(outputModelFileName); if (useLengthModel) { lengthHistogram.BuildFromAlignmentLengths(outputModel.lengths); } vector<int> alignmentLengths; int meanAlignmentLength; if (scaledLength != 0 and useLengthModel) { // // Scale the histogram so that the average length is 'scaledLength'. // // 1. Integrate histogram long totalLength = 0; long totalSamples = 0; int hi; for (hi = 0; hi < lengthHistogram.lengthHistogram.cdf.size()-1; hi++) { int ni; ni = lengthHistogram.lengthHistogram.cdf[hi+1] - lengthHistogram.lengthHistogram.cdf[hi]; totalLength += ni * lengthHistogram.lengthHistogram.data[hi]; } totalSamples = lengthHistogram.lengthHistogram.cdf[lengthHistogram.lengthHistogram.cdf.size()-1]; float meanSampleLength = totalLength / (1.0*totalSamples); float fractionIncrease = scaledLength / meanSampleLength; for (hi = 0; hi < lengthHistogram.lengthHistogram.cdf.size(); hi++) { lengthHistogram.lengthHistogram.data[hi] *= fractionIncrease; } } FASTAReader inReader, seqReader; vector<FASTASequence> reference; DNALength refLength = 0; int i; if (refGenomeFileName != "") { inReader.Init(refGenomeFileName); inReader.ReadAllSequences(reference); for (i = 0; i < reference.size(); i++) { refLength += reference[i].length; } } if (sourceReadsFileName != "") { seqReader.Init(sourceReadsFileName); } ofstream readsFile; // // Create and simulate bas.h5 files. // int baseFileIndex; bool readsRemain = true; for (baseFileIndex = 0; ((sourceReadsFileName == "" and baseFileIndex < nBasFiles) // case 1: reads are simulated from the genome, writing nBasFiles bas.h5 files. or (sourceReadsFileName != "" and readsRemain)); // case 2: reads come from an input file; continue while reads remain. baseFileIndex++) { // // Prep the base file for writing. // stringstream fileNameStrm, movieNameStrm; //string movieName = "m000000_000000_00000_cSIMULATED_s"; movieNameStrm << movieName << baseFileIndex << "_p0"; string fullMovieName = movieNameStrm.str(); fileNameStrm << fullMovieName << ".bas.h5"; HDFBasWriter basWriter; HDFRegionTableWriter regionWriter; // // This is mainly used to create the attributes. // RegionTable regionTable; regionTable.CreateDefaultAttributes(); basWriter.SetPlatform(Springfield); // // Use a fixed set of fields for now. // // These are all pulled from the outputModel. basWriter.IncludeField("Basecall"); basWriter.IncludeField("QualityValue"); basWriter.IncludeField("SubstitutionQV"); basWriter.IncludeField("SubstitutionTag"); basWriter.IncludeField("InsertionQV"); basWriter.IncludeField("DeletionQV"); basWriter.IncludeField("DeletionTag"); basWriter.IncludeField("WidthInFrames"); basWriter.IncludeField("PreBaseFrames"); basWriter.IncludeField("PulseIndex"); vector<unsigned char> qualityValue, substitutionQV, substitutionTag, insertionQV, deletionQV, deletionTag; vector<HalfWord> widthInFrames, preBaseFrames, pulseIndex; // Just go from 0 .. hole Number basWriter.IncludeField("HoleNumber"); // Fixed to 0.
basWriter.IncludeField("HoleXY"); if (usePosMap == false) { basWriter.IncludeField("SimulatedSequenceIndex"); basWriter.IncludeField("SimulatedCoordinate"); } basWriter.SetChangeListID("1.3.0.50.104380"); DNALength numSimulatedBases = 0; FASTASequence sampleSeq; //sampleSeq.length = readLength; int maxRetry = 10000000; int retryNumber = 0; int numReads = 0; int readLength = 0; while (numBasesPerFile == 0 or numSimulatedBases < numBasesPerFile) { DNALength seqIndex, seqPos; if (useLengthModel or fixedLength) { if (useLengthModel) { lengthHistogram.GetRandomLength(readLength); } else { readLength = fixedLength; } } if (refGenomeFileName != "") { FindRandomPos(reference, seqIndex, seqPos, readLength + (outputModel.keyLength - 1)); sampleSeq.seq = &reference[seqIndex].seq[seqPos]; sampleSeq.length = readLength + (outputModel.keyLength - 1); assert(reference[seqIndex].length >= sampleSeq.length); } else if (sourceReadsFileName != "") { if (seqReader.GetNext(sampleSeq) == false) { readsRemain = false; break; } if (sampleSeq.length < outputModel.keyLength) { continue; } // // Now attempt to parse the position from the fasta title. // if (useLengthModel) { int tryNumber = 0; readLength = 0; int maxNTries = 1000; int tryBuffer[5] = {-1,-1,-1,-1,-1}; while (tryNumber < maxNTries and readLength < outputModel.keyLength) { lengthHistogram.GetRandomLength(readLength); readLength = sampleSeq.length = min(sampleSeq.length, (unsigned int) readLength); tryBuffer[tryNumber%5] = readLength; tryNumber++; } if (tryNumber >= maxNTries) { cout << "ERROR. Could not generate a read length greater than the " << outputModel.keyLength << " requried " <<endl << "minimum number of bases using the length model specified in the alchemy." <<endl << "model. Something is either wrong with the model or the context length is too large." <<endl; cout << "The last few tries were: " << tryBuffer[0] << " " << tryBuffer[1] << " " << tryBuffer[2] << " " << tryBuffer[3] << " " << tryBuffer[4] << endl; exit(1); } } readLength = sampleSeq.length; vector<string> tokens; Tokenize(sampleSeq.title, "|", tokens); if (tokens.size() == 4) { seqPos = atoi(tokens[2].c_str()); if (titleTableFileName == "") { seqIndex = 0; } else { int index; titleTable.Lookup(tokens[1], index); seqIndex = index; } } else { seqPos = 0; } } // // If this is the first read printed to the base file, initialize it. // if (numSimulatedBases == 0) { basWriter.Initialize(fileNameStrm.str(), movieNameStrm.str(), Springfield); regionWriter.Initialize(basWriter.pulseDataGroup); } numSimulatedBases += readLength; int p; // create the sample sequence int contextLength = outputModel.keyLength; int contextMiddle = contextLength / 2; string outputString; int nDel = 0; int nIns = 0; // // Simulate to beyond the sample length. // qualityValue.clear(); substitutionQV.clear(); substitutionTag.clear(); insertionQV.clear(); deletionQV.clear(); deletionTag.clear(); pulseIndex.clear(); widthInFrames.clear(); preBaseFrames.clear(); assert(sampleSeq.length > contextMiddle + 1); for (p = contextMiddle; p < sampleSeq.length - contextMiddle - 1; p++) { string refContext; refContext.assign((const char*) &sampleSeq.seq[p-contextMiddle], contextLength); string outputContext; int contextWasFound; OutputSample sample; int i; for (i = 0; i < refContext.size(); i++) { refContext[i] = toupper(refContext[i]); } outputModel.SampleRandomSample(refContext, sample); if (sample.type == OutputSample::Deletion ) { // // There was a deletion. Advance in reference, then output // the base after the deletion. 
// p++; ++nDel; } int cp; // // Add the sampled context, possibly multiple characters because of an insertion. // for (i = 0; i < sample.nucleotides.size(); i++) { outputString.push_back(sample.nucleotides[i]); qualityValue.push_back(sample.qualities[i].qv[0]); deletionQV.push_back(sample.qualities[i].qv[1]); insertionQV.push_back(sample.qualities[i].qv[2]); substitutionQV.push_back(sample.qualities[i].qv[3]); deletionTag.push_back(sample.qualities[i].tags[0]); substitutionTag.push_back(sample.qualities[i].tags[1]); pulseIndex.push_back(sample.qualities[i].frameValues[0]); preBaseFrames.push_back(sample.qualities[i].frameValues[1]); widthInFrames.push_back(sample.qualities[i].frameValues[2]); } nIns += sample.qualities.size() - 1; } if (outputString.find('N') != outputString.npos or outputString.find('n') != outputString.npos) { cout << "WARNING! The sampled string " << endl << outputString << endl << "should not contain N's, but it seems to. This is being ignored "<<endl << "for now so that simulation may continue, but this shouldn't happen"<<endl << "and is really a bug." << endl; numSimulatedBases -= readLength; continue; } // // Ok, done creating the read, now time to create some quality values!!!!! // SMRTSequence read; read.length = outputString.size(); read.Allocate(read.length); memcpy(read.seq, outputString.c_str(), read.length * sizeof(unsigned char)); assert(qualityValue.size() == read.length * sizeof(unsigned char)); memcpy(read.qual.data, &qualityValue[0], read.length * sizeof(unsigned char)); memcpy(read.deletionQV.data, &deletionQV[0], read.length * sizeof(unsigned char)); memcpy(read.insertionQV.data, &insertionQV[0], read.length * sizeof(unsigned char)); memcpy(read.substitutionQV.data, &substitutionQV[0], read.length * sizeof(unsigned char)); memcpy(read.deletionTag, &deletionTag[0], read.length * sizeof(unsigned char)); memcpy(read.substitutionTag, &substitutionTag[0], read.length * sizeof(unsigned char)); memcpy(read.pulseIndex, &pulseIndex[0], read.length * sizeof(int)); memcpy(read.preBaseFrames, &preBaseFrames[0], read.length * sizeof(HalfWord)); memcpy(read.widthInFrames, &widthInFrames[0], read.length * sizeof(HalfWord)); // // The pulse index for now is just fake data. // int i; for (i = 0; i < read.length; i++) { read.pulseIndex[i] = 1; } read.xy[0] = seqIndex; read.xy[1] = seqPos; read.zmwData.holeNumber = numReads; basWriter.Write(read); // Record where this was simulated from. if (usePosMap == false) { basWriter.WriteSimulatedCoordinate(seqPos); basWriter.WriteSimulatedSequenceIndex(seqIndex); } else { posMapFile << fullMovieName << "/" << numReads << "/0_" << read.length << " " << seqIndex << " "<< seqPos; if (printPercentRepeat) { DNALength nRepeat = sampleSeq.GetRepeatContent(); posMapFile << " " << nRepeat*1.0/sampleSeq.length; } posMapFile << endl; } RegionAnnotation region; region.row[0] = read.zmwData.holeNumber; region.row[1] = 1; region.row[2] = 0; region.row[3] = read.length; region.row[4] = 1000; // Should be enough. regionWriter.Write(region); region.row[1] = 2; // Rewrite for hq region encompassing everything. regionWriter.Write(region); if (sourceReadsFileName != "") { sampleSeq.Free(); } read.Free(); ++numReads; } regionWriter.Finalize(regionTable.columnNames, regionTable.regionTypes, regionTable.regionDescriptions, regionTable.regionSources); basWriter.Close(); numReads = 0; // // The bas writer should automatically flush on closing. // } if (usePosMap) { posMapFile.close(); } for (i = 0; i < reference.size(); i++) { reference[i].Free(); } }
bool HDFBaseCallsWriter::_WritePulseWidth(const SMRTSequence & read) { if (HasPulseWidth()) { if (read.widthInFrames == nullptr) { AddErrorMessage(std::string(PacBio::GroupNames::widthinframes) + " absent in read " + read.GetTitle()); return false; } else { pulseWidthArray_.Write(read.widthInFrames, read.length); return true; } } return true; }
bool HDFBaseCallsWriter::_WriteMergeQV(const SMRTSequence & read) { if (HasMergeQV()) { if (read.mergeQV.Empty()) { AddErrorMessage(std::string(PacBio::GroupNames::mergeqv) + " absent in read " + read.GetTitle()); return false; } else { mergeQVArray_.Write(read.mergeQV.data, read.length); return true; } } return true; }
int main(int argc, char* argv[]) { string genomeFileName, readsFileName; TupleMetrics tm; float insRate = 0.10; tm.tupleSize = 8; CommandLineParser clp; int nProcessors = 1; clp.SetProgramName("exhalign"); clp.SetProgramSummary("Align reads to a genome using keyword-seeded alignment."); clp.RegisterStringOption("genome", &genomeFileName, "The file of the genome to align to."); clp.RegisterStringOption("reads", &readsFileName, "The reads to align."); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterIntOption("wordsize", &tm.tupleSize, "Size of seed words to match", CommandLineParser::NonNegativeInteger); clp.RegisterFloatOption("insrate", &insRate, "Roughly the insertion rate (10%)", CommandLineParser::NonNegativeFloat); clp.RegisterIntOption("nProc", &nProcessors, "Number of processors to use", CommandLineParser::NonNegativeInteger); clp.ParseCommandLine(argc, argv); insRate += 1.0; // // Process the reads into a vector of read keywords // vector<string> readsFileNames; vector<FASTQSequence> reads; vector<vector<ReadKeyword> > keywords; SMRTSequence seq, seqRC; ReadKeyword keyword; int readIndex = 0; if (FileOfFileNames::IsFOFN(readsFileName)) { FileOfFileNames::FOFNToList(readsFileName, readsFileNames); } else { readsFileNames.push_back(readsFileName); } ReaderAgglomerate genomeReader; HDFRegionTableReader regionTableReader; genomeReader.Initialize(genomeFileName); FASTQSequence genome; genomeReader.GetNext(genome); SubreadIterator subreadIterator; keywords.resize(nProcessors); RegionTable regionTable, *regionTablePtr; int readsFileIndex; for (readsFileIndex = 0; readsFileIndex < readsFileNames.size(); readsFileIndex++ ) { ReaderAgglomerate reader; reader.Initialize(readsFileNames[readsFileIndex]); regionTablePtr = NULL; if (reader.fileType == HDFPulse or reader.fileType == HDFBase) { regionTableReader.Initialize(readsFileNames[readsFileIndex]); regionTableReader.Read(regionTable); regionTablePtr = &regionTable; } else { regionTablePtr = NULL; } SMRTSequence fullSequence; while(reader.GetNext(fullSequence)) { subreadIterator.Initialize(&fullSequence, regionTablePtr); SMRTSequence seq; while (subreadIterator.GetNext(seq)) { DNALength pos; if (seq.length < tm.tupleSize) continue; reads.push_back(seq); for (pos = 0; pos < seq.length - tm.tupleSize + 1; pos++) { keyword.tuple.FromStringLR(&seq.seq[pos], tm); keyword.readPos = pos; keyword.readIndex = readIndex; keywords[(readIndex/2)%nProcessors].push_back(keyword); } readIndex++; seq.MakeRC(seqRC); reads.push_back(seqRC); for (pos = 0; pos < seqRC.length - tm.tupleSize + 1; pos++) { keyword.tuple.FromStringLR(&seqRC.seq[pos], tm); keyword.readPos = pos; keyword.readIndex = readIndex; keywords[(readIndex/2)%nProcessors].push_back(keyword); } readIndex++; // seq.Free(); seqRC.Free(); } fullSequence.Free(); } } int procIndex; for (procIndex = 0; procIndex < nProcessors; procIndex++) { std::sort(keywords[procIndex].begin(), keywords[procIndex].end()); } std::vector<int> prevAlignedGenomePos; std::vector<int> readOptScore; std::vector<FastqAlignment > optAlignment; std::vector<int> optGenomeAlignPos; std::vector<int> optGenomeAlignLength; prevAlignedGenomePos.resize(reads.size()); readOptScore.resize(reads.size()); optAlignment.resize(reads.size()); optGenomeAlignPos.resize(reads.size()); optGenomeAlignLength.resize(reads.size()); vector<Data> tdata; tdata.resize(nProcessors); std::fill(prevAlignedGenomePos.begin(), prevAlignedGenomePos.end(), -1); for (procIndex = 0; procIndex < nProcessors; procIndex++) { tdata[procIndex].prevAlignedGenomePos =
&prevAlignedGenomePos; tdata[procIndex].readOptScore = &readOptScore; tdata[procIndex].optAlignment = &optAlignment; tdata[procIndex].optGenomeAlignPos = &optGenomeAlignPos; tdata[procIndex].optGenomeAlignLength = &optGenomeAlignLength; tdata[procIndex].keywords = &keywords[procIndex]; tdata[procIndex].genome = &genome; tdata[procIndex].insRate = insRate; tdata[procIndex].reads = &reads; tdata[procIndex].tm = &tm; } if (nProcessors == 1) { KeywordSeededAlignment(&tdata[0]); } else { pthread_t *threads = new pthread_t[nProcessors]; pthread_attr_t *threadAttr = new pthread_attr_t[nProcessors]; for (procIndex = 0; procIndex < nProcessors; procIndex++) { pthread_attr_init(&threadAttr[procIndex]); pthread_create(&threads[procIndex], &threadAttr[procIndex], (void*(*)(void*))KeywordSeededAlignment, &tdata[procIndex]); } for (procIndex = 0; procIndex < nProcessors; procIndex++) { pthread_join(threads[procIndex], NULL); } } VectorIndex i; // cout << "printing alignments for " << reads.size() << " reads." << endl; for (readIndex = 0; readIndex < readOptScore.size(); readIndex +=2 ){ int optIndex = readIndex; if (readOptScore[readIndex] > readOptScore[readIndex+1]) { optIndex= readIndex + 1; } FASTQSequence genomeSubstring; genomeSubstring.seq = &genome.seq[optGenomeAlignPos[optIndex]]; genomeSubstring.length = optGenomeAlignLength[optIndex]; if (prevAlignedGenomePos[optIndex] >= 0) { optAlignment[optIndex].qName.assign(reads[optIndex].title, reads[optIndex].titleLength); optAlignment[optIndex].tName.assign(genome.GetName()); ComputeAlignmentStats(optAlignment[optIndex], reads[optIndex].seq, genomeSubstring.seq, SMRTDistanceMatrix, 6, 6); if (optAlignment[optIndex].blocks.size() > 0) { PrintCompareSequencesAlignment(optAlignment[optIndex], reads[optIndex], genomeSubstring,cout); } /* StickPrintAlignment(optAlignment[optIndex], reads[optIndex], genomeSubstring, cout, 0, optGenomeAlignPos[optIndex]); */ } } for (readIndex = 0; readIndex < readOptScore.size(); readIndex++ ) { reads[readIndex].Free(); } return 0; }
TEST(SubreadsTest, EndToEnd_Multiple) { // setup const string movieName = "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0"; vector<string> baxFilenames; baxFilenames.push_back(tests::Data_Dir + "/" + movieName + ".1.bax.h5"); const string generatedBam = movieName + ".subreads.bam"; const string scrapBam = movieName + ".scraps.bam"; // run conversion const int result = RunBax2Bam(baxFilenames, "--subread"); EXPECT_EQ(0, result); // open BAX reader on original data HDFBasReader baxReader; baxReader.IncludeField("Basecall"); baxReader.IncludeField("DeletionQV"); baxReader.IncludeField("DeletionTag"); baxReader.IncludeField("InsertionQV"); baxReader.IncludeField("PreBaseFrames"); baxReader.IncludeField("MergeQV"); baxReader.IncludeField("SubstitutionQV"); baxReader.IncludeField("HQRegionSNR"); // not using SubTag or PulseWidth here string baxBasecallerVersion; string baxBindingKit; string baxSequencingKit; const int initOk = baxReader.Initialize(baxFilenames.front()); EXPECT_EQ(1, initOk); if (initOk == 1) { if (baxReader.scanDataReader.fileHasScanData && baxReader.scanDataReader.initializedRunInfoGroup) { if (baxReader.scanDataReader.runInfoGroup.ContainsAttribute("BindingKit")) { HDFAtom<std::string> bkAtom; if (bkAtom.Initialize(baxReader.scanDataReader.runInfoGroup, "BindingKit")) { bkAtom.Read(baxBindingKit); bkAtom.dataspace.close(); } } if (baxReader.scanDataReader.runInfoGroup.ContainsAttribute("SequencingKit")) { HDFAtom<std::string> skAtom; if (skAtom.Initialize(baxReader.scanDataReader.runInfoGroup, "SequencingKit")) { skAtom.Read(baxSequencingKit); skAtom.dataspace.close(); } } } baxReader.GetChangeListID(baxBasecallerVersion); } // read region table info boost::scoped_ptr<HDFRegionTableReader> regionTableReader(new HDFRegionTableReader); RegionTable regionTable; std::string fn = baxFilenames.front(); EXPECT_TRUE(regionTableReader->Initialize(fn) != 0); regionTable.Reset(); regionTableReader->ReadTable(regionTable); regionTableReader->Close(); EXPECT_NO_THROW( { // open BAM file BamFile bamFile(generatedBam); // check BAM header information const BamHeader& header = bamFile.Header(); EXPECT_EQ(string("1.5"), header.Version()); EXPECT_EQ(string("unknown"), header.SortOrder()); EXPECT_EQ(string("3.0.1"), header.PacBioBamVersion()); EXPECT_TRUE(header.Sequences().empty()); EXPECT_TRUE(header.Comments().empty()); ASSERT_FALSE(header.Programs().empty()); const vector<string> readGroupIds = header.ReadGroupIds(); ASSERT_FALSE(readGroupIds.empty()); const ReadGroupInfo& rg = header.ReadGroup(readGroupIds.front()); string rawId = movieName + "//SUBREAD"; string md5Id; MakeMD5(rawId, md5Id, 8); EXPECT_EQ(md5Id, rg.Id()); EXPECT_EQ(string("PACBIO"), rg.Platform()); EXPECT_EQ(movieName, rg.MovieName()); EXPECT_TRUE(rg.SequencingCenter().empty()); EXPECT_TRUE(rg.Date().empty()); EXPECT_TRUE(rg.FlowOrder().empty()); EXPECT_TRUE(rg.KeySequence().empty()); EXPECT_TRUE(rg.Library().empty()); EXPECT_TRUE(rg.Programs().empty()); EXPECT_TRUE(rg.PredictedInsertSize().empty()); EXPECT_TRUE(rg.Sample().empty()); EXPECT_EQ("SUBREAD", rg.ReadType()); EXPECT_EQ(baxBasecallerVersion, rg.BasecallerVersion()); EXPECT_EQ(baxBindingKit, rg.BindingKit()); EXPECT_EQ(baxSequencingKit, rg.SequencingKit()); EXPECT_EQ(75, std::stod(rg.FrameRateHz())); EXPECT_EQ("dq", rg.BaseFeatureTag(BaseFeature::DELETION_QV)); EXPECT_EQ("dt", rg.BaseFeatureTag(BaseFeature::DELETION_TAG)); EXPECT_EQ("iq", rg.BaseFeatureTag(BaseFeature::INSERTION_QV)); EXPECT_EQ("ip", rg.BaseFeatureTag(BaseFeature::IPD)); 
EXPECT_EQ("mq", rg.BaseFeatureTag(BaseFeature::MERGE_QV)); EXPECT_EQ("sq", rg.BaseFeatureTag(BaseFeature::SUBSTITUTION_QV)); EXPECT_FALSE(rg.HasBaseFeature(BaseFeature::SUBSTITUTION_TAG)); EXPECT_EQ(FrameCodec::V1, rg.IpdCodec()); // compare 1st record from each file SMRTSequence baxRecord; UInt holeNumber = 0; vector<float> hqSnr; size_t intervalIdx = 0; vector<SubreadInterval> subreadIntervals; size_t numTested = 0; EntireFileQuery entireFile(bamFile); for (BamRecord& bamRecord : entireFile) { if (intervalIdx >= subreadIntervals.size()) { while (baxReader.GetNext(baxRecord)) { holeNumber = baxRecord.zmwData.holeNumber; ComputeSubreadIntervals(&subreadIntervals, regionTable, holeNumber); /* this is for debugging subread interval problems int hqStart = 0; int hqEnd = 0; int hqScore = 0; LookupHQRegion(holeNumber, regionTable, hqStart, hqEnd, hqScore); vector<ReadInterval> subreadIntervals_; CollectSubreadIntervals(baxRecord, ®ionTable, subreadIntervals_); for (int i = subreadIntervals_.size() - 1; i >= 0; --i) { auto& in = subreadIntervals_[i]; int inStart = max(hqStart, in.start); int inEnd = min(hqEnd, in.end); if (inEnd <= inStart) subreadIntervals_.erase(subreadIntervals_.begin() + i); } cerr << "hqRegion: " << hqStart << ", " << hqEnd << endl; cerr << "subreadRegions:" << endl; for (const auto& in : subreadIntervals_) cerr << " l, r: " << in.start << ", " << in.end << endl; cerr << "adapterDerived:" << endl; for (const auto& in : subreadIntervals) cerr << " l, r: " << in.Start << ", " << in.End << endl; cerr << endl; // */ if (subreadIntervals.empty()) continue; intervalIdx = 0; hqSnr.clear(); hqSnr.push_back(baxRecord.HQRegionSnr('A')); hqSnr.push_back(baxRecord.HQRegionSnr('C')); hqSnr.push_back(baxRecord.HQRegionSnr('G')); hqSnr.push_back(baxRecord.HQRegionSnr('T')); EXPECT_GT(hqSnr[0], 0); EXPECT_GT(hqSnr[1], 0); EXPECT_GT(hqSnr[2], 0); EXPECT_GT(hqSnr[3], 0); goto compare; } goto cleanup; } compare: const BamRecordImpl& bamRecordImpl = bamRecord.Impl(); EXPECT_EQ(4680,bamRecordImpl.Bin()); EXPECT_EQ(0, bamRecordImpl.InsertSize()); EXPECT_EQ(255, bamRecordImpl.MapQuality()); EXPECT_EQ(-1, bamRecordImpl.MatePosition()); EXPECT_EQ(-1, bamRecordImpl.MateReferenceId()); EXPECT_EQ(-1, bamRecordImpl.Position()); EXPECT_EQ(-1, bamRecordImpl.ReferenceId()); EXPECT_FALSE(bamRecordImpl.IsMapped()); const int subreadStart = subreadIntervals[intervalIdx].Start; const int subreadEnd = subreadIntervals[intervalIdx].End; const string expectedName = movieName + "/" + to_string(holeNumber) + "/" + to_string(subreadStart) + "_" + to_string(subreadEnd); EXPECT_EQ(expectedName, bamRecordImpl.Name()); using PacBio::BAM::QualityValue; using PacBio::BAM::QualityValues; const DNALength length = subreadEnd - subreadStart; string expectedSequence; expectedSequence.assign((const char*)baxRecord.seq + subreadStart, length); const string bamSequence = bamRecord.Sequence(); const QualityValues bamQualities = bamRecord.Qualities(); EXPECT_EQ(expectedSequence, bamSequence); EXPECT_TRUE(bamQualities.empty()); const QualityValues bamDeletionQVs = bamRecord.DeletionQV(); const QualityValues bamInsertionQVs = bamRecord.InsertionQV(); const QualityValues bamMergeQVs = bamRecord.MergeQV(); const QualityValues bamSubstitutionQVs = bamRecord.SubstitutionQV(); for (size_t i = 0; i < length; ++i) { const size_t pos = subreadStart + i; EXPECT_EQ((QualityValue)baxRecord.GetDeletionQV(pos), bamDeletionQVs.at(i)); EXPECT_EQ((QualityValue)baxRecord.GetInsertionQV(pos), bamInsertionQVs.at(i)); 
EXPECT_EQ((QualityValue)baxRecord.GetMergeQV(pos), bamMergeQVs.at(i)); EXPECT_EQ((QualityValue)baxRecord.GetSubstitutionQV(pos), bamSubstitutionQVs.at(i)); } if (baxRecord.deletionTag) { string expectedDeletionTags; expectedDeletionTags.assign((char*)baxRecord.deletionTag + subreadStart, (char*)baxRecord.deletionTag + subreadStart + length); const string& bamDeletionTags = bamRecord.DeletionTag(); EXPECT_EQ(expectedDeletionTags, bamDeletionTags); } if (baxRecord.substitutionTag) { string expectedSubstitutionTags; expectedSubstitutionTags.assign((char*)baxRecord.substitutionTag + subreadStart, (char*)baxRecord.substitutionTag + subreadStart + length); const string& bamSubstitutionTags = bamRecord.SubstitutionTag(); EXPECT_EQ(expectedSubstitutionTags, bamSubstitutionTags); } // TODO: IPDs const LocalContextFlags ctxFlags = subreadIntervals[intervalIdx].LocalContextFlags; EXPECT_EQ(md5Id, bamRecord.ReadGroupId()); EXPECT_EQ(movieName, bamRecord.MovieName()); EXPECT_EQ(1, bamRecord.NumPasses()); EXPECT_EQ(holeNumber, bamRecord.HoleNumber()); EXPECT_EQ(subreadStart, bamRecord.QueryStart()); EXPECT_EQ(subreadEnd, bamRecord.QueryEnd()); EXPECT_EQ(hqSnr, bamRecord.SignalToNoise()); EXPECT_EQ(ctxFlags, bamRecord.LocalContextFlags()); numTested++; intervalIdx++; } cleanup: EXPECT_GT(numTested, 1); // cleanup baxReader.Close(); RemoveFile(generatedBam); RemoveFile(scrapBam); }); // EXPECT_NO_THROW
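// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original sources): the test above
// derives the expected read group ID by hashing "<movieName>//<READTYPE>" with
// MakeMD5 and keeping 8 characters of the digest, then compares the result to
// the ID reported in the BAM header. The helper below restates that
// derivation; expectedReadGroupId is a hypothetical name, and the MakeMD5
// declaration is assumed to come from the same utilities the test itself uses.
// ---------------------------------------------------------------------------
#include <string>

// Assumed declaration matching the call made in the test above.
void MakeMD5(const std::string &data, std::string &md5Str, int nChars);

static std::string expectedReadGroupId(const std::string &movieName,
                                       const std::string &readType) {
    std::string rawId = movieName + "//" + readType;  // e.g. "<movie>//SUBREAD"
    std::string md5Id;
    MakeMD5(rawId, md5Id, 8);  // keep the first 8 characters of the digest
    return md5Id;
}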
void ImportReadFromCmpH5(int alignmentIndex, SMRTSequence &read) { CmpAlignment cmpAlignment; alnInfoGroup.ReadCmpAlignment(alignmentIndex, cmpAlignment); // // Cache some stats about the read, and where it was aligned to. // int queryStart = cmpAlignment.GetQueryStart(); int queryEnd = cmpAlignment.GetQueryEnd(); read.holeNumber = cmpAlignment.GetHoleNumber(); int refGroupId = cmpAlignment.GetRefGroupId(); int alnGroupId = cmpAlignment.GetAlnGroupId(); int refGroupIndex = refGroupIdToArrayIndex[refGroupId]; if (alnGroupIdToReadGroupName.find(alnGroupId) == alnGroupIdToReadGroupName.end()) { cout << "INTERNAL ERROR! Could not find read group name for alignment " << "group with Id " << alnGroupId << "." << endl; assert(0); } string readGroupName = alnGroupIdToReadGroupName[alnGroupId]; if (refAlignGroups[refGroupIndex]->experimentNameToIndex.find(readGroupName) == refAlignGroups[refGroupIndex]->experimentNameToIndex.end()) { cout << "Internal ERROR! The read group name " << readGroupName << " is specified as part of " << " the path in alignment " << alignmentIndex << " though it does not exist in the ref align group specified for this alignment." << endl; assert(0); } int readGroupIndex = refAlignGroups[refGroupIndex]->experimentNameToIndex[readGroupName]; HDFCmpExperimentGroup* expGroup = refAlignGroups[refGroupIndex]->readGroups[readGroupIndex]; int offsetBegin = cmpAlignment.GetOffsetBegin(); int offsetEnd = cmpAlignment.GetOffsetEnd(); int alignedSequenceLength = offsetEnd - offsetBegin; string alignedSequence; string readSequence; vector<unsigned char> byteAlignment; if (alignedSequenceLength >= 0) { alignedSequence.resize(alignedSequenceLength); byteAlignment.resize(alignedSequenceLength); } // // Read the alignment string. All alignments // refAlignGroups[refGroupIndex]->readGroups[readGroupIndex]->alignmentArray.Read(offsetBegin, offsetEnd, &byteAlignment[0]); // // Convert to something we can compare easily. // ByteAlignmentToQueryString(&byteAlignment[0], byteAlignment.size(), &alignedSequence[0]); // // Initialize the sequence of the read. // RemoveGaps(alignedSequence, readSequence); // // Make space for the sequence and all fields. 
// read.length = readSequence.size(); read.Allocate(read.length); memcpy(read.seq, readSequence.c_str(), readSequence.size() * sizeof(char)); vector<int> baseToAlignmentMap; CreateSequenceToAlignmentMap(byteAlignment, baseToAlignmentMap); // // Read in the quality values // vector<unsigned char> storedQVArray; vector<UChar> qvValues; vector<HalfWord> frameValues; int length = offsetEnd - offsetBegin; qvValues.resize(length); frameValues.resize(length); int i; if (expGroup->experimentGroup.ContainsObject("QualityValue")) { expGroup->qualityValue.Read(offsetBegin, offsetEnd, &qvValues[0]); StoreQualityValueFromAlignment(qvValues, baseToAlignmentMap, &read.qual.data[0]); int i; for (i= 0; i < read.length; i++) { assert(read.qual[i] < 100); } } if (expGroup->experimentGroup.ContainsObject("InsertionQV")) { expGroup->insertionQV.Read(offsetBegin, offsetEnd, &qvValues[0]); StoreQualityValueFromAlignment(qvValues, baseToAlignmentMap, &read.insertionQV.data[0]); } if (expGroup->experimentGroup.ContainsObject("SubstitutionQV")) { expGroup->substitutionQV.Read(offsetBegin, offsetEnd, &qvValues[0]); StoreQualityValueFromAlignment(qvValues, baseToAlignmentMap, &read.substitutionQV.data[0]); } if (expGroup->experimentGroup.ContainsObject("DeletionQV")) { expGroup->deletionQV.Read(offsetBegin, offsetEnd, &qvValues[0]); StoreQualityValueFromAlignment(qvValues, baseToAlignmentMap, &read.deletionQV.data[0]); } if (expGroup->experimentGroup.ContainsObject("DeletionTag")) { vector<char> deletionTagValues; deletionTagValues.resize(offsetEnd-offsetBegin); expGroup->deletionTag.Read(offsetBegin, offsetEnd, &deletionTagValues[0]); StoreQualityValueFromAlignment(deletionTagValues, baseToAlignmentMap, read.deletionTag); } if (expGroup->experimentGroup.ContainsObject("SubstitutionTag")) { vector<char> substitutionTagValues; substitutionTagValues.resize(offsetEnd-offsetBegin); expGroup->substitutionTag.Read(offsetBegin, offsetEnd, &substitutionTagValues[0]); StoreQualityValueFromAlignment(substitutionTagValues, baseToAlignmentMap, read.substitutionTag); } if (expGroup->experimentGroup.ContainsObject("PulseIndex")) { vector<uint32_t> pulseIndexValues; pulseIndexValues.resize(offsetEnd-offsetBegin); expGroup->pulseIndex.Read(offsetBegin, offsetEnd, &pulseIndexValues[0]); StoreQualityValueFromAlignment(pulseIndexValues, baseToAlignmentMap, read.pulseIndex); } if (expGroup->experimentGroup.ContainsObject("PreBaseFrames")) { expGroup->preBaseFrames.Read(offsetBegin, offsetEnd, &frameValues[0]); StoreQualityValueFromAlignment(frameValues, baseToAlignmentMap, read.preBaseFrames); } if (expGroup->experimentGroup.ContainsObject("WidthInFrames")) { expGroup->widthInFrames.Read(offsetBegin, offsetEnd, &frameValues[0]); StoreQualityValueFromAlignment(frameValues, baseToAlignmentMap, read.widthInFrames); } }
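// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original sources): ImportReadFromCmpH5
// relies on CreateSequenceToAlignmentMap so that per-base values read from the
// experiment group can be scattered into gapped alignment coordinates via
// baseToAlignmentMap[i]. The helper below shows the kind of mapping that
// implies for a gapped query string, assuming '-' marks a gap; the name
// buildQueryToAlignmentMap is hypothetical.
// ---------------------------------------------------------------------------
#include <string>
#include <vector>

static std::vector<int> buildQueryToAlignmentMap(const std::string &gappedQuery,
                                                 char gapChar = '-') {
    // For the i-th non-gap base, record the alignment column it occupies.
    std::vector<int> baseToAlignmentMap;
    for (int col = 0; col < static_cast<int>(gappedQuery.size()); ++col) {
        if (gappedQuery[col] != gapChar) {
            baseToAlignmentMap.push_back(col);
        }
    }
    return baseToAlignmentMap;
}
// Example: "AC-GT" maps base indices 0..3 to alignment columns {0, 1, 3, 4}.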
int main(int argc, char* argv[]) { string inputFileName, outputFileName; if (argc < 3) { PrintUsage(); exit(0); } vector<string> inputFileNames; inputFileName = argv[1]; outputFileName = argv[2]; int argi = 3; RegionTable regionTable; string regionsFOFNName = ""; vector<string> regionFileNames; bool splitSubreads = true; bool useCCS = false; int minSubreadLength = 1; while (argi < argc) { if (strcmp(argv[argi], "-regionTable") == 0) { regionsFOFNName = argv[++argi]; } else if (strcmp(argv[argi], "-noSplitSubreads") == 0) { splitSubreads = false; } else if (strcmp(argv[argi], "-minSubreadLength") == 0) { minSubreadLength = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-useccsdenovo") == 0) { useCCS = true; } else { PrintUsage(); cout << "ERROR! Option " << argv[argi] << " is not supported." << endl; } argi++; } if (FileOfFileNames::IsFOFN(inputFileName)) { FileOfFileNames::FOFNToList(inputFileName, inputFileNames); } else { inputFileNames.push_back(inputFileName); } if (regionsFOFNName == "") { regionFileNames = inputFileNames; } else { if (FileOfFileNames::IsFOFN(regionsFOFNName)) { FileOfFileNames::FOFNToList(regionsFOFNName, regionFileNames); } else { regionFileNames.push_back(regionsFOFNName); } } ofstream fastaOut; CrucialOpen(outputFileName, fastaOut); int plsFileIndex; HDFRegionTableReader hdfRegionReader; AfgBasWriter afgWriter; afgWriter.Initialize(outputFileName); for (plsFileIndex = 0; plsFileIndex < inputFileNames.size(); plsFileIndex++) { if (splitSubreads) { hdfRegionReader.Initialize(regionFileNames[plsFileIndex]); hdfRegionReader.ReadTable(regionTable); regionTable.SortTableByHoleNumber(); } ReaderAgglomerate reader; // reader.SkipReadQuality(); // should have been taken care of by *Filter modules if (useCCS){ reader.UseCCS(); } else { reader.IgnoreCCS(); } reader.Initialize(inputFileNames[plsFileIndex]); CCSSequence seq; int seqIndex = 0; int numRecords = 0; vector<ReadInterval> subreadIntervals; while (reader.GetNext(seq)){ ++seqIndex; if (splitSubreads == false) { if (seq.length >= minSubreadLength) { afgWriter.Write(seq); } seq.Free(); continue; } DNALength hqReadStart, hqReadEnd; int score; GetReadTrimCoordinates(seq, seq.zmwData, regionTable, hqReadStart, hqReadEnd, score); subreadIntervals.clear(); // clear old, new intervals are appended. CollectSubreadIntervals(seq, &regionTable, subreadIntervals); if (seq.length == 0 and subreadIntervals.size() > 0) { cout << "WARNING! A high quality interval region exists for a read of length 0." << endl; cout << " The offending ZMW number is " << seq.zmwData.holeNumber << endl; seq.Free(); continue; } for (int intvIndex = 0; intvIndex < subreadIntervals.size(); intvIndex++) { SMRTSequence subreadSequence; int subreadStart = subreadIntervals[intvIndex].start > hqReadStart ? subreadIntervals[intvIndex].start : hqReadStart; int subreadEnd = subreadIntervals[intvIndex].end < hqReadEnd ? subreadIntervals[intvIndex].end : hqReadEnd; int subreadLength = subreadEnd - subreadStart; if (subreadLength < minSubreadLength) continue; subreadSequence.subreadStart = subreadStart; subreadSequence.subreadEnd = subreadEnd; subreadSequence.ReferenceSubstring(seq, subreadStart, subreadLength); stringstream titleStream; titleStream << seq.title << "/" << subreadIntervals[intvIndex].start << "_" << subreadIntervals[intvIndex].end; subreadSequence.CopyTitle(titleStream.str()); afgWriter.Write(subreadSequence); delete[] subreadSequence.title; } seq.Free(); } reader.Close(); hdfRegionReader.Close(); } }
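// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original sources): the subread loop
// above clips each adapter-derived interval to the high-quality (HQ) region
// returned by GetReadTrimCoordinates and skips anything shorter than
// minSubreadLength. The helper below restates that clipping rule; the names
// ClippedInterval and clipToHQ are hypothetical.
// ---------------------------------------------------------------------------
#include <algorithm>

struct ClippedInterval { int start; int end; bool keep; };

static ClippedInterval clipToHQ(int intervalStart, int intervalEnd,
                                int hqReadStart, int hqReadEnd,
                                int minSubreadLength) {
    ClippedInterval clipped;
    clipped.start = std::max(intervalStart, hqReadStart);  // push the start up into the HQ region
    clipped.end   = std::min(intervalEnd, hqReadEnd);      // pull the end back into the HQ region
    clipped.keep  = (clipped.end - clipped.start) >= minSubreadLength;
    return clipped;
}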
int main(int argc, char* argv[]) { string cmpFileName, movieFileName; int argi = 3; int numMetrics = 8; map<string,bool> metricOptions; int maxElements = 0; // // By default, all options are true. // CreateMetricOptions(metricOptions); string metricList = ""; bool useCcs = false; bool byRead = false; bool failOnMissingData = false; CommandLineParser clp; bool printVersion = false; clp.RegisterStringOption("basFileName", &movieFileName, "The input {bas,pls}.h5 or input.fofn.", true); clp.RegisterStringOption("cmpFileName", &cmpFileName, "The cmp.h5 file to load pulse information into.", true); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterStringOption("metrics", &metricList, "A comma-separated list of metrics (with no spaces). The " "valid options are: QualityValue, ClassifierQV, MergeQV, StartFrame, " "PulseWidth, pkmid, IPD, and Light."); clp.RegisterFlagOption("useccs", &useCcs, "Load pulse information for CCS sequences and not raw bases."); clp.RegisterFlagOption("byread", &byRead, "Load pulse information by read rather than buffering an entire pls.h5 file. " "This option will soon be deprecated and its behavior made the default."); clp.RegisterIntOption("maxElements", &maxElements, "Set a limit on the size of pls/bas file to buffer in.", CommandLineParser::PositiveInteger); clp.RegisterFlagOption("failOnMissingData", &failOnMissingData, "Exit if any data fields are missing from the bas.h5 or pls.h5 input that are required to load a metric. Default is a warning."); clp.SetProgramSummary("Load pulse information such as inter-pulse distance, or quality information into the cmp.h5 file. " "This allows one to analyze kinetic and quality information by alignment column."); clp.ParseCommandLine(argc, argv); if (printVersion) { cout << VERSION << endl; exit(1); } if (metricList == "") { SetDefaultMetricOptions(metricOptions); } else { ParseMetricsList(metricList, metricOptions); } // // Always read in basecalls since they are used to check the sanity // of the alignment indices. // metricOptions["Basecall"] = true; // // Translate from the metrics to be loaded to the ones that are // required to compute them. // vector<string> datasetFields; RequirementMap fieldRequirements; BuildRequirementMap(fieldRequirements); StoreDatasetFieldsFromPulseFields(metricOptions, fieldRequirements, datasetFields); vector<string> movieFileNames; vector<string> fofnMovieNames; FileOfFileNames::StoreFileOrFileList(movieFileName, movieFileNames); HDFBasReader hdfBasReader; HDFPlsReader hdfPlsReader; HDFCCSReader<SMRTSequence> hdfCcsReader; vector<string> baseFileFields, pulseFileFields; int fieldIndex; bool useBaseFile = false, usePulseFile = false; for (fieldIndex = 0; fieldIndex < datasetFields.size(); fieldIndex++) { if (hdfBasReader.ContainsField(datasetFields[fieldIndex])) { useBaseFile = true; baseFileFields.push_back(datasetFields[fieldIndex]); } } if (maxElements != 0) { hdfBasReader.maxAllocNElements = maxElements; hdfPlsReader.maxAllocNElements = maxElements; } // // For now, all runs will attempt to use information from a .bas // file, since it's assumed that if one has alignments, one has a // .bas file. // useBaseFile = true; // // Add some default fields. 
// hdfBasReader.IncludeField("Basecall"); hdfBasReader.IncludeField("PulseIndex"); hdfBasReader.InitializeFields(baseFileFields); for (fieldIndex = 0; fieldIndex < datasetFields.size(); fieldIndex++) { if (hdfPlsReader.ContainsField(datasetFields[fieldIndex])) { usePulseFile = true; pulseFileFields.push_back(datasetFields[fieldIndex]); } } if (usePulseFile) { hdfPlsReader.InitializeFields(pulseFileFields); } hdfPlsReader.IncludeField("NumEvent"); int nMovies = movieFileNames.size(); int movieIndex; MovieNameToArrayIndex movieNameMap; // // Initialize movies. This accomplishes two tasks. First, all movie // files are opened and initialized, so that if there are data // fields missing the program will exit now rather than in the // middle of loading pulses. // Next, a list of movie names is created in fofnMovieNames. The // cmp file does not necessarily index movies in the order of the // fofn, and so when loading pulses from a movie indexed by a cmp // file, one needs to look up the file name of the movie. This is // done by scanning the fofnMovieNames list in order until the movie // is found. for (movieIndex = 0; movieIndex < nMovies; movieIndex++) { if (!hdfBasReader.Initialize(movieFileNames[movieIndex])) { cout << "ERROR, could not initialize HDF file " << movieFileNames[movieIndex] << " for reading bases." << endl; exit(1); } else { fofnMovieNames.push_back(hdfBasReader.GetMovieName()); movieNameMap[hdfBasReader.GetMovieName()] = movieIndex; hdfBasReader.Close(); } // // The pulse file is optional. // if (usePulseFile) { if (hdfPlsReader.Initialize(movieFileNames[movieIndex]) == 0) { usePulseFile = false; } } } CmpFile cmpFile; /* * These readers pull information from the same pls file. */ HDFCmpFile<CmpAlignment> cmpReader; if (cmpReader.Initialize(cmpFileName, H5F_ACC_RDWR) == 0) { cout << "ERROR, could not open the cmp file." << endl; exit(0); } cmpReader.Read(cmpFile); string commandLine; clp.CommandLineToString(argc, argv, commandLine); string versionStr(VERSION); AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionStr); cmpReader.fileLogGroup.AddEntry(commandLine, "Loading pulse metrics", "loadPulses", GetTimestamp(), versionStr); // // Group alignment indices by movie so that they may be processed one movie at a time // later on. The movie indices set keeps track of all indices // listed in alignment files. This keeps a reference to all // alignments in memory at once. At the time of writing this, most // projects will have at most a few million alignments, and so the // size of this structure is modest. // UInt alignmentIndex; map<int, vector<int> > movieIndexSets; for (alignmentIndex = 0; alignmentIndex < cmpFile.alnInfo.alignments.size(); alignmentIndex++) { movieIndexSets[cmpFile.alnInfo.alignments[alignmentIndex].GetMovieId()].push_back(alignmentIndex); } vector<float> computedPulseField; string alignedSequence; string readSequence; vector<unsigned char> byteAlignment; int m; vector<int> baseToAlignmentMap; // // Load pulses from movies in order they appear in the input fofn. // int fofnMovieIndex; for (fofnMovieIndex = 0; fofnMovieIndex < fofnMovieNames.size(); fofnMovieIndex++) { if (cmpFile.readType == ReadType::CCS or useCcs) { hdfBasReader.SetReadBasesFromCCS(); hdfCcsReader.Initialize(movieFileNames[fofnMovieIndex]); } hdfBasReader.Initialize(movieFileNames[fofnMovieIndex]); BaseFile baseFile; PulseFile pulseFile; if (byRead == false) { // // Read the entire bas file at once, and then extract values // from memory. 
This can be faster depending on the chunk // size and size of the movie. // hdfBasReader.ReadBaseFile(baseFile); hdfBasReader.Close(); } else { // // Reads are scanned one by one instead of caching all of them. It is // still necessary to read in some of the datasets entirely, // in particular the start positions and hole numbers. // // This is repeated below for a pulse file. Since the pulse // and base files are separate objects, the scan data is // read into each separately. Somehow later the information // should be merged into just one. if (hdfBasReader.scanDataReader.fileHasScanData) { hdfBasReader.scanDataReader.Read(baseFile.scanData); } baseFile.readStartPositions.resize(hdfBasReader.nReads+1); baseFile.readStartPositions[0] = 0; hdfBasReader.GetAllReadLengths(baseFile.readLengths); int i; assert(baseFile.readLengths.size() + 1 == baseFile.readStartPositions.size()); for (i = 1; i < hdfBasReader.nReads + 1; i++ ) { baseFile.readStartPositions[i] = baseFile.readLengths[i-1] + baseFile.readStartPositions[i-1]; } // // Although the whole bas file isn't being read in, it is // necessary to read in which hole numbers are contained in this // bas file since it is possible that the alignment for a // particular hole number may be in a different input bas.h5 // file even if it is the same movie. // hdfBasReader.GetAllHoleNumbers(baseFile.holeNumbers); } set<uint32_t> moviePartHoleNumbers; copy(baseFile.holeNumbers.begin(), baseFile.holeNumbers.end(), inserter(moviePartHoleNumbers, moviePartHoleNumbers.begin())); if (usePulseFile) { hdfPlsReader.Initialize(movieFileNames[fofnMovieIndex]); hdfPlsReader.IncludeField("NumEvent"); hdfPlsReader.IncludeField("StartFrame"); if (byRead == false) { hdfPlsReader.ReadPulseFile(pulseFile); hdfPlsReader.Close(); } else { if (usePulseFile) { pulseFile.pulseStartPositions.resize(hdfBasReader.nReads+1); pulseFile.pulseStartPositions[0] = 0; hdfPlsReader.GetAllNumEvent(pulseFile.numEvent); int i; for (i = 1; i < hdfBasReader.nReads + 1; i++ ) { pulseFile.pulseStartPositions[i] = pulseFile.numEvent[i-1] + pulseFile.pulseStartPositions[i-1]; } if (hdfPlsReader.scanDataReader.fileHasScanData) { hdfPlsReader.scanDataReader.Read(pulseFile.scanData); } } } } string cmpFileMovieName; for (m = 0; m < cmpFile.movieInfo.name.size(); m++) { // // First find the file name for the movie 'm' // cmpFileMovieName = cmpFile.movieInfo.name[m]; int fofnMovieIndex; if (baseFile.GetMovieName() == cmpFileMovieName) { break; } } // // If the movie specified in the input.fofn is not found in the // cmp file, that indicates something bad is happening. Either the // input.fofn was not used to generate the cmp.h5 file, or no // alignments were found between the input bas.h5 and the // reference. That shouldn't happen. // if (m == cmpFile.movieInfo.name.size()) { cout << "WARNING: The movie indexed in the compare file " << cmpFileMovieName << " is not listed in the file " << movieFileName << endl; continue; } // // Open the movie and load its pulses into memory. // movieIndex = cmpFile.movieInfo.id[m]; int movieAlignmentIndex; float NaN = 0.0/0.0; UChar missingQualityValue = 255; HalfWord missingFrameRateValue = USHRT_MAX; unsigned int missingPulseIndex = UINT_MAX; // // Since usePulseFile is set when the input file is a pulseFile, // and ReadType::CCS becomes the read type when the alignments are // ccs, when pulse files are specified for de novo ccs alignments, // they will be opened as pulse files. 
Since the de novo ccs // sequences do not have pulse file information, the auto-reading // of pulse files needs to be disabled. Do that here. // if (cmpFile.readType == ReadType::CCS or useCcs) { usePulseFile = false; } // // Now check the sanity of metric options. // map<string,bool>::iterator metricIt; for (metricIt = metricOptions.begin(); metricIt != metricOptions.end(); ++metricIt) { if (metricIt->second == false) { continue; } bool metricMayBeComputed = true; if (cmpFile.readType == ReadType::CCS and metricIt->first != "QualityValue" and metricIt->first != "DeletionQV" and metricIt->first != "SubstitutionQV" and metricIt->first != "InsertionQV" and metricIt->first != "DeletionTag" and metricIt->first != "SubstitutionTag" and metricIt->first != "Basecall") { cout << "ERROR! The metric " << metricIt->first << " cannot be loaded into de novo ccs alignments." << endl; // exit(0); metricMayBeComputed = false; } if (metricIt->first == "IPD") { // // The field requirements for IPD are special. // if ((useBaseFile and !hdfBasReader.FieldIsIncluded("PreBaseFrames")) or (usePulseFile and (!hdfPlsReader.FieldIsIncluded("StartFrame") and !hdfPlsReader.FieldIsIncluded("WidthInFrames")))) { metricMayBeComputed = false; } } else { if (fieldRequirements.find(metricIt->first) != fieldRequirements.end()) { // // There are requirements for this field. Make sure all are // present before trying to compute this field. // int requirementIndex; for (requirementIndex = 0; requirementIndex < fieldRequirements[metricIt->first].size(); ++requirementIndex) { string requirement; requirement = fieldRequirements[metricIt->first][requirementIndex]; if (((useBaseFile == false or ((hdfBasReader.includedFields.find(requirement) == hdfBasReader.includedFields.end() or hdfBasReader.includedFields[requirement] == false))) and ((usePulseFile == false or (hdfPlsReader.includedFields.find(requirement) == hdfPlsReader.includedFields.end() or hdfPlsReader.includedFields[requirement] == false))))) { metricMayBeComputed = false; } } } else { // // There are no requirements for this field, so it must exist as // a dataset in either the bas or pls file. // if ((useBaseFile == false or ((hdfBasReader.includedFields.find(metricIt->first) == hdfBasReader.includedFields.end() or hdfBasReader.includedFields[metricIt->first] == false))) and (usePulseFile == false or (((hdfPlsReader.includedFields.find(metricIt->first) == hdfPlsReader.includedFields.end() or hdfPlsReader.includedFields[metricIt->first] == false))))) { metricMayBeComputed = false; } } } if (metricMayBeComputed == false) { if (failOnMissingData) { cout << "ERROR"; } else { cout << "WARNING"; } cout << ": There is insufficient data to compute metric: " << metricIt->first << " in the file " << movieFileNames[fofnMovieIndex] << " "; cout << " It will be ignored." << endl; if (failOnMissingData) { exit(1); } metricOptions[metricIt->first] = false; } } UInt i; // // This is currently used as a sentinel for showing that an array // element does not have a value stored for it, as in deleted // bases. // vector<int> pulseIndexArray; vector<unsigned int> statTime; if (metricOptions["WhenStarted"]) { string whenStarted; if (hdfPlsReader.scanDataReader.useWhenStarted == false) { cout << "ERROR! Attempting to read WhenStarted from " << movieFileNames[fofnMovieIndex] << " but the attribute does not exist." 
<< endl; exit(1); } hdfPlsReader.scanDataReader.ReadWhenStarted(whenStarted); if (!cmpReader.movieInfoGroup.whenStartedArray.IsInitialized()) { cmpReader.movieInfoGroup.whenStartedArray.Initialize(cmpReader.movieInfoGroup.movieInfoGroup, "WhenStarted"); } cmpReader.movieInfoGroup.whenStartedArray.Write(&whenStarted, 1); } if (AnyFieldRequiresFrameRate(datasetFields)) { if (useBaseFile) { cmpReader.movieInfoGroup.StoreFrameRate(m, baseFile.GetFrameRate()); } else if (usePulseFile) { cmpReader.movieInfoGroup.StoreFrameRate(m, pulseFile.GetFrameRate()); } } // // An index set is a set of indices into the alignment array that // are of reads generated by this movie. Load pulses for all // alignments generated for this movie. // // // Movie index sets should be sorted by alignment index. Build a lookup table for this. // std::vector<std::pair<int,int> > toFrom; for (movieAlignmentIndex = 0; movieAlignmentIndex < movieIndexSets[movieIndex].size(); movieAlignmentIndex++) { alignmentIndex = movieIndexSets[movieIndex][movieAlignmentIndex]; toFrom.push_back(std::pair<int,int>(cmpFile.alnInfo.alignments[alignmentIndex].GetAlignmentId(), movieAlignmentIndex)); } // std::sort orders by the first element of each pair by default. std::sort(toFrom.begin(), toFrom.end()); // // Load metrics for alignments from movie 'movieIndex'. // cout << "loading " << movieIndexSets[movieIndex].size() << " alignments for movie " << movieIndex << endl; for (movieAlignmentIndex = 0; movieAlignmentIndex < movieIndexSets[movieIndex].size(); movieAlignmentIndex++) { alignmentIndex = movieIndexSets[movieIndex][toFrom[movieAlignmentIndex].second]; // // Alignments are grouped by ref group id then movie id. // int refGroupId = cmpFile.alnInfo.alignments[alignmentIndex].GetRefGroupId(); int movieId = cmpFile.alnInfo.alignments[alignmentIndex].GetMovieId(); UInt holeNumber = cmpFile.alnInfo.alignments[alignmentIndex].GetHoleNumber(); // // Since the movie may be split into multiple parts, look to see // if this hole number is one of the ones covered by this // set. If it is not, just continue. It will be loaded on // another pass through a different movie part. // if (moviePartHoleNumbers.find(holeNumber) == moviePartHoleNumbers.end()) { continue; } // // Now locate where this movie is stored. // if (cmpReader.refGroupIdToArrayIndex.find(refGroupId) == cmpReader.refGroupIdToArrayIndex.end()) { cout << "ERROR! An alignment " << alignmentIndex << " is specified with reference group " << endl << refGroupId << " that is not found as an alignment group." << endl; exit(1); } int refGroupIndex = cmpReader.refGroupIdToArrayIndex[refGroupId]; // // Now find the group containing the alignment for this movie. // if (cmpReader.refAlignGroups[refGroupIndex]->movieIdToIndex.find(movieId) == cmpReader.refAlignGroups[refGroupIndex]->movieIdToIndex.end()) { cout << "ERROR! An alignment " << alignmentIndex << " is specified with movie index " << endl << movieId << " that is not found in the alignment group " << refGroupIndex << endl; exit(1); } int readGroupIndex = cmpReader.refAlignGroups[refGroupIndex]->movieIdToIndex[movieId]; // // First do a sanity check on the read to make sure the pulses and the bases match. // // // Look to see if the output HDF arrays need to be created. 
// UInt offsetBegin, offsetEnd; offsetBegin = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetBegin(); offsetEnd = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetEnd(); int alignedSequenceLength = offsetEnd - offsetBegin; if (alignedSequenceLength >= 0) { alignedSequence.resize(alignedSequenceLength); byteAlignment.resize(alignedSequenceLength); } // // Read the byte-encoded alignment string. // cmpReader.refAlignGroups[refGroupIndex]->readGroups[readGroupIndex]->alignmentArray.Read(offsetBegin, offsetEnd, &byteAlignment[0]); // // Convert to something we can compare easily. // ByteAlignmentToQueryString(&byteAlignment[0], byteAlignment.size(), &alignedSequence[0]); // // Do a sanity check to make sure the pulses and the alignment // make sense. The main check is to see if the query sequence // in the alignment is the same as the query sequence in the // read. // // // First pull out the bases corresponding to this read. // int queryStart = cmpFile.alnInfo.alignments[alignmentIndex].GetQueryStart(); int queryEnd = cmpFile.alnInfo.alignments[alignmentIndex].GetQueryEnd(); // Build a map of where each query base lands in the gapped alignment. CreateSequenceToAlignmentMap(byteAlignment, baseToAlignmentMap); // // Condense gaps in the alignment for easy comparison. // // RemoveGaps(alignedSequence, alignedSequence); // // Query the cmp file for a way to look up a read based on // coordinate information. For Astro reads, the coords are // based on x and y. For Springfield, it is read index. The // base files should be able to look up reads by x,y or by // index. // int readIndex; if (cmpFile.platformId == Astro) { cout << "ASTRO pulse loading is deprecated." << endl; exit(0); } if (baseFile.LookupReadIndexByHoleNumber(holeNumber, readIndex) == false) { cout << "ERROR! Alignment has hole number " << holeNumber << " that is not in the movie. " << endl; assert(0); } int readStart, readLength, alignBaseStart, alignBaseEnd, alignBaseLength; readStart = baseFile.readStartPositions[readIndex]; readLength = baseFile.readStartPositions[readIndex+1] - baseFile.readStartPositions[readIndex]; alignBaseStart = readStart + queryStart; alignBaseEnd = readStart + queryEnd; alignBaseLength = alignBaseEnd - alignBaseStart; int pulseStart; if (usePulseFile) { pulseStart = pulseFile.pulseStartPositions[readIndex]; } // // This maps from pulse to a base, since there are more pulses // called than bases, and there is one pulse for every base. // pulseIndexArray.resize(readLength); SMRTSequence sourceRead; unsigned int numPasses; // // These are not allocated in the regular allocate function // since they are only used in loadPulses. (maybe I should // subclass SMRTSequence here). // if (byRead) { // Read in the data from the bas file if it exists. if (useBaseFile) { hdfBasReader.GetReadAt(readIndex, sourceRead); if (cmpFile.readType == ReadType::CCS or useCcs) { numPasses = hdfCcsReader.GetNumPasses(readIndex); } } // Read in the data from the pls file if it exists. if (usePulseFile) { hdfPlsReader.GetReadAt(readIndex, sourceRead.pulseIndex, sourceRead); } } else { // // The entire base/pulse file was read in, so copy data from that into a read. // For the data used in the read, it is possible to simply // reference the data, but for the pls file it is necessary // to copy since there is a packing of data. 
// if (useBaseFile) { baseFile.CopyReadAt(readIndex, sourceRead); if (cmpFile.readType == ReadType::CCS or useCcs) { numPasses = hdfCcsReader.GetNumPasses(readIndex); } } if (usePulseFile) { // // Copy the subset of pulses that correspond to the ones called as bases. // int i; for (i = 0; i < readLength; i++) { pulseIndexArray[i] = pulseStart + baseFile.pulseIndex[readStart + i]; } pulseFile.CopyReadAt(readIndex, &pulseIndexArray[0], sourceRead); } } readSequence.resize(queryEnd - queryStart); CapQualityValues(sourceRead); copy((char*) (sourceRead.seq + queryStart), (char*) (sourceRead.seq + queryEnd), readSequence.begin()); bool stringsMatch = true; if (alignedSequence.size() != readSequence.size() or alignedSequence != readSequence) { cout << "ERROR, the query sequence does not match the aligned query sequence." << endl; cout << "HoleNumber: "<< holeNumber << ", MovieName: " << cmpFileMovieName; cout << ", ReadIndex: " << (int) readIndex << ", qStart: "<< queryStart << ", qEnd: " << queryEnd << endl; cout << "Aligned sequence: "<< endl; cout << alignedSequence << endl; cout << "Original sequence: " << endl; cout << readSequence << endl; assert(0); } /* * Compute any necessary data fields. These usually involve * using differences of pulse indices, pulse widths, etc.. * Missing fields are stored as 0's. */ vector<float> readPulseMetric; vector<float> floatMetric; vector<UChar> qvMetric; vector<HalfWord> frameRateMetric; vector<uint32_t> timeMetric; int ungappedAlignedSequenceLength = alignedSequence.size(); floatMetric.resize(alignedSequenceLength+1); readPulseMetric.resize(alignedSequenceLength+1); qvMetric.resize(alignedSequenceLength+1); frameRateMetric.resize(alignedSequenceLength+1); timeMetric.resize(alignedSequenceLength+1); UInt i; UInt pi; HDFCmpExperimentGroup* expGroup = cmpReader.refAlignGroups[refGroupIndex]->readGroups[readGroupIndex]; if (cmpFile.readType == ReadType::CCS or useCcs) { if (!cmpReader.alnInfoGroup.numPasses.IsInitialized()) { cmpReader.alnInfoGroup.InitializeNumPasses(); } cmpReader.alnInfoGroup.numPasses.WriteToPos(&numPasses, 1, alignmentIndex); } if (metricOptions["StartTimeOffset"] == true) { if (!expGroup->startTimeOffset.IsInitialized()) { expGroup->startTimeOffset.Initialize(expGroup->experimentGroup, "StartTimeOffset"); } unsigned int readStartTimeOffset = sourceRead.startFrame[queryStart]; expGroup->startTimeOffset.WriteToPos(&readStartTimeOffset, 1, alignmentIndex); } if (metricOptions["QualityValue"] == true) { if (!expGroup->qualityValue.IsInitialized()) { expGroup->qualityValue.Initialize(expGroup->experimentGroup, "QualityValue"); } // Store the quality value at each aligned position. fill(qvMetric.begin(), qvMetric.end(), missingQualityValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { qvMetric[baseToAlignmentMap[i]] = sourceRead.qual[queryStart + i]; } qvMetric[qvMetric.size()-1] = 0; expGroup->qualityValue.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin); } if (metricOptions["InsertionQV"] == true) { if (!expGroup->insertionQV.IsInitialized()) { expGroup->insertionQV.Initialize(expGroup->experimentGroup, "InsertionQV"); } // Store the insertion QV at each aligned position. 
fill(qvMetric.begin(), qvMetric.end(), missingQualityValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { qvMetric[baseToAlignmentMap[i]] = sourceRead.insertionQV[queryStart+ i]; } qvMetric[qvMetric.size()-1] = 0; expGroup->insertionQV.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin); } if (metricOptions["MergeQV"] == true) { if (!expGroup->mergeQV.IsInitialized()) { expGroup->mergeQV.Initialize(expGroup->experimentGroup, "MergeQV"); } // Store start time normalized to frame rate. fill(qvMetric.begin(), qvMetric.end(), missingQualityValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { qvMetric[baseToAlignmentMap[i]] = sourceRead.mergeQV[queryStart+ i]; } qvMetric[qvMetric.size()-1] = 0; expGroup->mergeQV.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin); } if (metricOptions["DeletionQV"] == true) { if (!expGroup->deletionQV.IsInitialized()) { expGroup->deletionQV.Initialize(expGroup->experimentGroup, "DeletionQV"); } // Store start time normalized to frame rate. fill(qvMetric.begin(), qvMetric.end(), missingQualityValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { qvMetric[baseToAlignmentMap[i]] = sourceRead.deletionQV[queryStart+i]; } qvMetric[qvMetric.size()-1] = 0; expGroup->deletionQV.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin); } if (metricOptions["DeletionTag"] == true) { if (!expGroup->deletionTag.IsInitialized()) { expGroup->deletionTag.Initialize(expGroup->experimentGroup, "DeletionTag"); } vector<char> readDeletionTagMetric; readDeletionTagMetric.resize(readPulseMetric.size()); // Store start time normalized to frame rate. for (i = 0; i < readDeletionTagMetric.size()-1; i++ ) { readDeletionTagMetric[i] = '-'; } readDeletionTagMetric[i] = '\0'; for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { assert(baseToAlignmentMap[i] < readDeletionTagMetric.size()); readDeletionTagMetric[baseToAlignmentMap[i]] = sourceRead.deletionTag[queryStart+i]; } readDeletionTagMetric[readDeletionTagMetric.size()-1] = 0; expGroup->deletionTag.WriteToPos(&readDeletionTagMetric[0], readDeletionTagMetric.size(), offsetBegin); } if (metricOptions["PulseIndex"] == true) { if (!expGroup->pulseIndex.IsInitialized()) { expGroup->pulseIndex.Initialize(expGroup->experimentGroup, "PulseIndex"); } vector<uint32_t> readPulseIndexMetric; fill(readPulseIndexMetric.begin(), readPulseIndexMetric.end(), missingPulseIndex); readPulseIndexMetric.resize(readPulseMetric.size()); // Store start time normalized to frame rate. assert(readPulseIndexMetric.size() > 0); for (i = 0; i < readPulseIndexMetric.size(); i++ ) { readPulseIndexMetric[i] = 0; } for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { readPulseIndexMetric[baseToAlignmentMap[i]] = sourceRead.pulseIndex[queryStart+i]; } readPulseIndexMetric[readPulseIndexMetric.size()-1] = 0; expGroup->pulseIndex.WriteToPos(&readPulseIndexMetric[0], readPulseIndexMetric.size(), offsetBegin); } if (metricOptions["SubstitutionTag"] == true) { if (!expGroup->substitutionTag.IsInitialized()) { expGroup->substitutionTag.Initialize(expGroup->experimentGroup, "SubstitutionTag"); } vector<char> readSubstitutionTagMetric; readSubstitutionTagMetric.resize(readPulseMetric.size()); // Store start time normalized to frame rate. 
for (i = 0; i < readSubstitutionTagMetric.size()-1; i++ ) { readSubstitutionTagMetric[i] = '-'; } readSubstitutionTagMetric[i] = '\0'; for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { readSubstitutionTagMetric[baseToAlignmentMap[i]] = sourceRead.substitutionTag[queryStart+i]; } readSubstitutionTagMetric[readSubstitutionTagMetric.size()-1] = 0; expGroup->substitutionTag.WriteToPos(&readSubstitutionTagMetric[0], readSubstitutionTagMetric.size(), offsetBegin); } if (metricOptions["SubstitutionQV"] == true) { if (!expGroup->substitutionQV.IsInitialized()) { expGroup->substitutionQV.Initialize(expGroup->experimentGroup, "SubstitutionQV"); } // Store start time normalized to frame rate. fill(qvMetric.begin(), qvMetric.end(), missingQualityValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { qvMetric[baseToAlignmentMap[i]] = sourceRead.substitutionQV[queryStart+i]; } qvMetric[qvMetric.size()-1] = 0; expGroup->substitutionQV.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin); } if (metricOptions["ClassifierQV"] == true) { if (!expGroup->classifierQV.IsInitialized()) { expGroup->classifierQV.Initialize(expGroup->experimentGroup, "ClassifierQV"); } // Store start time normalized to frame rate. fill(floatMetric.begin(), floatMetric.end(), missingQualityValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { floatMetric[baseToAlignmentMap[i]] = sourceRead.classifierQV[i+queryStart]; } floatMetric[floatMetric.size()-1] = 0; expGroup->classifierQV.WriteToPos(&floatMetric[0], floatMetric.size(), offsetBegin); } if (metricOptions["StartFrame"] == true) { if (!expGroup->startTime.IsInitialized()) { expGroup->startTime.Initialize(expGroup->experimentGroup, "StartFrame"); } if (useBaseFile) { sourceRead.startFrame = new unsigned int[sourceRead.length]; copy(sourceRead.preBaseFrames, &sourceRead.preBaseFrames[sourceRead.length], sourceRead.startFrame); for (i = 0; i < sourceRead.length-1; i++) { sourceRead.startFrame[i+1] += sourceRead.widthInFrames[i]; } partial_sum(sourceRead.startFrame, &sourceRead.startFrame[sourceRead.length], sourceRead.startFrame); } // Store start time normalized to frame rate. fill(timeMetric.begin(), timeMetric.end(), missingPulseIndex); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { timeMetric[baseToAlignmentMap[i]] = sourceRead.startFrame[i+queryStart]; } timeMetric[timeMetric.size()-1] = 0; expGroup->startTime.WriteToPos(&timeMetric[0], timeMetric.size(), offsetBegin); } if (metricOptions["PulseWidth"] == true) { if (!expGroup->pulseWidth.IsInitialized()) { expGroup->pulseWidth.Initialize(expGroup->experimentGroup, "PulseWidth"); } // Store start time normalized to frame rate. fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue); // // For legacy reasons, it's possible the width in frames is // stored in the bas file. If this is the case, use the width // in frames there. Otherwise, use the width in frames stored // in the pls file. for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { frameRateMetric[baseToAlignmentMap[i]] = sourceRead.widthInFrames[queryStart + i]; } frameRateMetric[frameRateMetric.size()-1] = 0; expGroup->pulseWidth.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin); } if (metricOptions["PreBaseFrames"] == true) { if (!expGroup->preBaseFrames.IsInitialized()) { expGroup->preBaseFrames.Initialize(expGroup->experimentGroup, "PreBaseFrames"); } // Compute width in frames normalized to frame rate. 
fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { frameRateMetric[baseToAlignmentMap[i]] = sourceRead.preBaseFrames[i+queryStart]; } frameRateMetric[frameRateMetric.size()-1] = 0; expGroup->preBaseFrames.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin); } if (metricOptions["WidthInFrames"] == true) { if (!expGroup->widthInFrames.IsInitialized()) { expGroup->widthInFrames.Initialize(expGroup->experimentGroup, "WidthInFrames"); } // Compute width in frames normalized to frame rate. fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { if (usePulseFile) { frameRateMetric[baseToAlignmentMap[i]] = sourceRead.widthInFrames[i+queryStart]; } else { frameRateMetric[baseToAlignmentMap[i]] = sourceRead.widthInFrames[i+queryStart]; } } frameRateMetric[frameRateMetric.size()-1] = 0; expGroup->widthInFrames.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin); } if (metricOptions["pkmid"] == true) { if (!expGroup->pkmid.IsInitialized()) { expGroup->pkmid.Initialize(expGroup->experimentGroup, "pkmid"); } for (i = 0; i < readPulseMetric.size(); i++ ) { readPulseMetric[i] = NaN; } for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { readPulseMetric[baseToAlignmentMap[i]] = sourceRead.midSignal[i+queryStart]; } readPulseMetric[readPulseMetric.size()-1] = 0; expGroup->pkmid.WriteToPos(&readPulseMetric[0], readPulseMetric.size(), offsetBegin); } if (metricOptions["IPD"] == true) { if (!expGroup->ipd.IsInitialized()) { expGroup->ipd.Initialize(expGroup->experimentGroup, "IPD"); } fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { // // The IPD is undefined for the first base in a read. // if (usePulseFile ) { if (queryStart == 0 and i == 0) { frameRateMetric[baseToAlignmentMap[i]] = 0; } else { frameRateMetric[baseToAlignmentMap[i]] = (sourceRead.startFrame[i+queryStart] - sourceRead.startFrame[i+queryStart-1] - sourceRead.widthInFrames[i+queryStart-1]); } } else if (useBaseFile) { frameRateMetric[baseToAlignmentMap[i]] = sourceRead.preBaseFrames[i + queryStart]; } } frameRateMetric[frameRateMetric.size()-1] = 0; expGroup->ipd.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin); } if (metricOptions["Light"] == true) { if (!expGroup->light.IsInitialized()) { expGroup->light.Initialize(expGroup->experimentGroup, "Light"); } fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { frameRateMetric[baseToAlignmentMap[i]] = sourceRead.meanSignal[i+queryStart]; frameRateMetric[baseToAlignmentMap[i]] = (frameRateMetric[baseToAlignmentMap[i]] * sourceRead.widthInFrames[i+queryStart]); } frameRateMetric[frameRateMetric.size()-1] = 0; expGroup->light.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin); } sourceRead.Free(); Free(sourceRead.meanSignal); Free(sourceRead.maxSignal); Free(sourceRead.midSignal); Free(sourceRead.startFrame); Free(sourceRead.classifierQV); Free(sourceRead.widthInFrames); } if (byRead == true) { if (useBaseFile) { hdfBasReader.Close(); } if (cmpFile.readType == ReadType::CCS or useCcs) { hdfCcsReader.Close(); } if (usePulseFile) { hdfPlsReader.Close(); } } } // done loading movies cmpReader.Close(); }
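// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original sources): the IPD branch above
// stores, for each aligned base, the number of frames between the end of the
// previous pulse and the start of the current one when pulse data is present
// (startFrame[i] - startFrame[i-1] - widthInFrames[i-1]), falls back to
// PreBaseFrames when only a bas file is available, and stores 0 for the first
// base of a read, where the IPD is undefined. The helper below restates that
// per-base rule; ipdForBase is a hypothetical name and the argument types are
// simplified relative to the real field types.
// ---------------------------------------------------------------------------
static int ipdForBase(int i, bool havePulseData,
                      const unsigned int *startFrame,
                      const unsigned short *widthInFrames,
                      const unsigned short *preBaseFrames) {
    if (havePulseData) {
        if (i == 0) {
            return 0;  // IPD is undefined for the first base in a read
        }
        // Gap between the end of the previous pulse and the start of this one.
        return static_cast<int>(startFrame[i] - startFrame[i - 1] - widthInFrames[i - 1]);
    }
    return preBaseFrames[i];  // bas-file-only fallback uses PreBaseFrames directly
}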