int main(int argc, char* argv[]) { std::string outFileName; unsigned contextLength = 5; int minSamples = 500; int maxSamples = 1000; if (argc < 3) { PrintUsage(); std::exit(EXIT_FAILURE); } int argi = 1; std::string cmpH5FileName; cmpH5FileName = argv[argi++]; outFileName = argv[argi++]; int minAverageQual = 0; bool onlyMaxLength = false; while (argi < argc) { if (strcmp(argv[argi], "-contextLength") == 0) { contextLength = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-minSamples") == 0) { minSamples = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-maxSamples") == 0) { maxSamples = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-onlyMaxLength") == 0) { onlyMaxLength = true; } else { PrintUsage(); std::cout << "ERROR, bad option: " << argv[argi] << std::endl; std::exit(EXIT_FAILURE); } ++argi; } std::map<std::string, ScoredLength> maxLengthMap; OutputSampleListSet samples(contextLength); SMRTSequence read; std::ofstream sampleOut; CrucialOpen(outFileName, sampleOut, std::ios::out | std::ios::binary); int fileNameIndex; int numContextsReached = 0; int numContexts = 1 << (contextLength * 2); ReaderAgglomerate reader; samples.keyLength = contextLength; HDFCmpFile<CmpAlignment> cmpReader; cmpReader.IncludeField("QualityValue"); cmpReader.IncludeField("DeletionQV"); cmpReader.IncludeField("InsertionQV"); cmpReader.IncludeField("SubstitutionQV"); cmpReader.IncludeField("SubstitutionTag"); cmpReader.IncludeField("DeletionTag"); cmpReader.IncludeField("PulseIndex"); cmpReader.IncludeField("WidthInFrames"); cmpReader.IncludeField("PreBaseFrames"); if (cmpReader.Initialize(cmpH5FileName, H5F_ACC_RDWR) == 0) { std::cout << "ERROR, could not open the cmp file." << std::endl; std::exit(EXIT_FAILURE); } std::cout << "Reading cmp file." << std::endl; CmpFile cmpFile; cmpReader.ReadAlignmentDescriptions(cmpFile); cmpReader.ReadStructure(cmpFile); std::cout << "done reading structure." << std::endl; int alignmentIndex; int nAlignments = cmpReader.alnInfoGroup.GetNAlignments(); std::vector<int> alignmentToBaseMap; for (alignmentIndex = 0; alignmentIndex < nAlignments and !samples.Sufficient(); alignmentIndex++) { // // For ease of use, store the length of the alignment to make another model. // ByteAlignment alignmentArray; cmpReader.ReadAlignmentArray(alignmentIndex, alignmentArray); Alignment alignment; ByteAlignmentToAlignment(alignmentArray, alignment); std::string readSequence, refSequence; readSequence.resize(alignmentArray.size()); refSequence.resize(alignmentArray.size()); DNASequence readDNA, refDNA; ByteAlignmentToQueryString(&alignmentArray[0], alignmentArray.size(), &readSequence[0]); ByteAlignmentToRefString(&alignmentArray[0], alignmentArray.size(), &refSequence[0]); RemoveGaps(readSequence, readSequence); RemoveGaps(refSequence, refSequence); readDNA.seq = (Nucleotide*)readSequence.c_str(); readDNA.length = readSequence.size(); refDNA.seq = (Nucleotide*)refSequence.c_str(); refDNA.length = refSequence.size(); CmpAlignment cmpAlignment; cmpReader.ImportReadFromCmpH5(alignmentIndex, cmpAlignment, read); CreateAlignmentToSequenceMap(alignmentArray, alignmentToBaseMap); if (read.length < contextLength) { continue; } int subreadLength = (cmpFile.alnInfo.alignments[alignmentIndex].GetQueryEnd() - cmpFile.alnInfo.alignments[alignmentIndex].GetQueryStart()); if (onlyMaxLength == false) { samples.lengths.push_back(subreadLength); } else { int score = (cmpAlignment.GetNMatch() - cmpAlignment.GetNMismatch() - cmpAlignment.GetNInsertions() - cmpAlignment.GetNDeletions()); std::stringstream nameStrm; nameStrm << cmpAlignment.GetMovieId() << "_" << cmpAlignment.GetHoleNumber(); std::string nameStr = nameStrm.str(); if (maxLengthMap.find(nameStr) == maxLengthMap.end()) { maxLengthMap[nameStr] = ScoredLength(score, subreadLength); } } int sampleEnd = alignmentArray.size() - contextLength / 2; int a; for (a = contextLength / 2; a < sampleEnd; a++) { // Make sure the context begins on a real nucleotide. while (a < sampleEnd and ((RefChar[alignmentArray[a]] == ' '))) { a++; } // // Move ab back to an index where there are contextLength/2 non-gap // characters, counted by nb // int ab; //num bases int ae; //alignment end ab = a - 1; int nb = 0, ne = 0; while (true) { if (RefChar[alignmentArray[ab]] != ' ') { nb++; } if (ab == 0 or nb == static_cast<int>(contextLength) / 2) break; ab--; } // // Advance ae to an index where there are contextLength/2 non-gap // characters, counted by ne. // ae = a + 1; while (ae < static_cast<int>(alignmentArray.size()) and ne < static_cast<int>(contextLength) / 2) { if (RefChar[alignmentArray[ae]] != ' ') { ne++; } ae++; } // // Make sure there are no edge effects that prevent a context of the correct length from being assigned. // if (nb + ne + 1 != static_cast<int>(contextLength)) { continue; } int ai; std::string context; for (ai = ab; ai < ae; ai++) { if (RefChar[alignmentArray[ai]] != ' ') { context.push_back(RefChar[alignmentArray[ai]]); } } assert(context.size() == contextLength); // // Now create the context. // OutputSample sample; // // This context is a deletion, create that. // sample.type = OutputSample::Deletion; // // This context is either an insertion or substitution // // Look to see if the previous aligned position was an // insertion, and move back as far as the insertion extends. int aq = a - 1; int sampleLength; if (QueryChar[alignmentArray[a]] == ' ') { sample.type = OutputSample::Deletion; sampleLength = 0; } else if (RefChar[alignmentArray[aq]] == ' ') { while (aq > 0 and RefChar[alignmentArray[aq]] == ' ' and QueryChar[alignmentArray[aq]] != ' ') { aq--; } sample.type = OutputSample::Insertion; sampleLength = a - aq; } else if (QueryChar[alignmentArray[a]] == RefChar[alignmentArray[aq]]) { sample.type = OutputSample::Match; sampleLength = 1; } else { sample.type = OutputSample::Substitution; sampleLength = 1; } sample.Resize(sampleLength); if (sampleLength > 0) { int seqPos = alignmentToBaseMap[aq]; if (seqPos < static_cast<int>(read.length)) { sample.CopyFromSeq(read, seqPos, sampleLength); std::string nucs; for (size_t n = 0; n < sample.nucleotides.size(); n++) { char c = sample.nucleotides[n]; assert(c == 'A' or c == 'T' or c == 'G' or c == 'C'); nucs.push_back(sample.nucleotides[n]); } } } samples.AppendOutputSample(context, sample); } read.Free(); } if (onlyMaxLength) { std::map<std::string, ScoredLength>::iterator maxScoreIt; for (maxScoreIt = maxLengthMap.begin(); maxScoreIt != maxLengthMap.end(); ++maxScoreIt) { std::cout << maxScoreIt->second.length << std::endl; samples.lengths.push_back(maxScoreIt->second.length); } } samples.Write(sampleOut); return 0; }
int main(int argc, char* argv[]) { string program = "samtoh5"; string versionString = VERSION; AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString); string samFileName, cmpFileName, refFileName; bool parseSmrtTitle = false; bool useShortRefName = false; CommandLineParser clp; string readType = "standard"; int verbosity = 0; clp.SetProgramName(program); clp.SetProgramSummary("Converts in.sam file to out.cmp.h5 file."); clp.SetVersion(versionString); clp.RegisterStringOption("in.sam", &samFileName, "Input SAM file.", true); clp.RegisterStringOption("reference.fasta", &refFileName, "Reference used to generate reads.", true); clp.RegisterStringOption("out.cmp.h5", &cmpFileName, "Output cmp.h5 file.", true); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterFlagOption("smrtTitle", &parseSmrtTitle, "Use this option when converting alignments " "generated from reads produced by the " "pls2fasta from bas.h5 files by parsing read " "coordinates from the SMRT read title. The title " "is in the format /name/hole/coordinates, where " "coordinates are in the format \\d+_\\d+, and " "represent the interval of the read that was " "aligned."); clp.RegisterStringOption("readType", &readType, "Set the read type: 'standard', 'strobe', 'CCS', " "or 'cDNA'"); clp.RegisterIntOption("verbosity", &verbosity, "Set desired verbosity.", CommandLineParser::PositiveInteger); clp.RegisterFlagOption("useShortRefName", &useShortRefName, "Use abbreviated reference names obtained " "from file.sam instead of using full names " "from reference.fasta."); string description = ("Because SAM has optional tags that have different " "meanings in different programs, careful usage is required in order to " "have proper output. The \"xs\" tag in bwa-sw is used to show the " "suboptimal score, but in PacBio SAM (blasr) it is defined as the start " "in the query sequence of the alignment.\nWhen \"-smrtTitle\" is " "specified, the xs tag is ignored, but when it is not specified, the " "coordinates given by the xs and xe tags are used to define the interval " "of a read that is aligned. The CIGAR string is relative to this interval."); clp.SetExamples(description); clp.ParseCommandLine(argc, argv); if (readType != "standard" and readType != "strobe" and readType != "cDNA" and readType != "CCS") { cout << "ERROR. Read type '" << readType << "' must be one of either 'standard', 'strobe', 'cDNA' or 'CCS'." << endl; exit(1); } cerr << "[INFO] " << GetTimestamp() << " [" << program << "] started." << endl; SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMPosAlignment> samReader; FASTAReader fastaReader; HDFCmpFile<AlignmentCandidate<FASTASequence, FASTASequence> > cmpFile; // // Initialize input/output files. // samReader.Initialize(samFileName); fastaReader.Initialize(refFileName); cmpFile.Create(cmpFileName); // // Configure the file log. // string command; CommandLineParser::CommandLineToString(argc, argv, command); string log = "Convert sam to cmp.h5"; cmpFile.fileLogGroup.AddEntry(command, log, program, GetTimestamp(), versionString); // // Set the readType // cmpFile.SetReadType(readType); // // Read necessary input. // vector<FASTASequence> references; fastaReader.ReadAllSequences(references); // // This should probably be handled by the alignmentSetAdapter, but // time constraints... // AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMPosAlignment> alignmentSet; samReader.ReadHeader(alignmentSet); // // The order of references in vector<FASTASequence> references and // AlignmentSet<, , >alignmentSet.references can be different. // Rearrange alignmentSet.references such that it is ordered in // exactly the same way as vector<FASTASequence> references. // alignmentSet.RearrangeReferences(references); // // Always recompute the MD5 values even if they exist in the input // sam file. Because MD5 is defined differently in sam and cmp.h5 files. // The SAM convention uppercases and normalizes before computing the MD5. // For cmp.h5, we compute the MD5 on the sequence 'as is'. // for(int i = 0; i < alignmentSet.references.size(); i++) { MakeMD5((const char*)&references[i].seq[0], (unsigned int)references[i].length, alignmentSet.references[i].md5); } // // Map short names for references obtained from file.sam to full names obtained from reference.fasta // map<string, string> shortRefNameToFull; map<string, string>::iterator it; assert(references.size() == alignmentSet.references.size()); if (!useShortRefName) { for (int i = 0; i < references.size(); i++) { string shortRefName = alignmentSet.references[i].GetSequenceName(); string fullRefName(references[i].title); if (shortRefNameToFull.find(shortRefName) != shortRefNameToFull.end()) { cout << "ERROR, Found more than one reference " << shortRefName << "in sam header" << endl; exit(1); } shortRefNameToFull[shortRefName] = fullRefName; alignmentSet.references[i].sequenceName = fullRefName; } } // // Start setting up the cmp.h5 file. // AlignmentSetToCmpH5Adapter<HDFCmpFile<AlignmentCandidate<FASTASequence, FASTASequence> > > alignmentSetAdapter; alignmentSetAdapter.Initialize(); alignmentSetAdapter.StoreReferenceInfo(alignmentSet.references, cmpFile); // // Store the alignments. // SAMAlignment samAlignment; int alignIndex = 0; while (samReader.GetNextAlignment(samAlignment)) { if (samAlignment.rName == "*") { continue; } if (!useShortRefName) { //convert shortRefName to fullRefName it = shortRefNameToFull.find(samAlignment.rName); if (it == shortRefNameToFull.end()) { cout << "ERROR, Could not find " << samAlignment.rName << " in the reference repository." << endl; exit(1); } samAlignment.rName = (*it).second; } vector<AlignmentCandidate<> > convertedAlignments; if (verbosity > 0) { cout << "Storing alignment for " << samAlignment.qName << endl; } SAMAlignmentsToCandidates(samAlignment, references, alignmentSetAdapter.refNameToIndex, convertedAlignments, parseSmrtTitle, false); alignmentSetAdapter.StoreAlignmentCandidateList(convertedAlignments, cmpFile, alignIndex); int a; for (a = 0; a < convertedAlignments.size(); a++) { convertedAlignments[a].FreeSubsequences(); } ++alignIndex; /* if (alignIndex == 100) { return 0; }*/ } cerr << "[INFO] " << GetTimestamp() << " [" << program << "] ended." << endl; return 0; }
int main(int argc, char* argv[]) { CommandLineParser clp; string cmpFileName; vector<int> holeNumbers; vector<string> patterns, refGroups; bool printAll = false; clp.RegisterStringOption("cmph5filename", &cmpFileName, "input cmp h5", false); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterIntListOption("holeNumbers", &holeNumbers, "hole numbers to print alignments", false); clp.RegisterStringListOption("pattern", &patterns, "patterns to search read names to print alignments", false); clp.RegisterFlagOption("all", &printAll, "Just print all alignments.", false); clp.RegisterStringListOption("refgroups", &refGroups, "Reference groups to print.", false); clp.ParseCommandLine(argc, argv); CmpFile cmpFile; /* * These readers pull information from the same pls file. */ HDFCmpFile<CmpAlignment> hdfcmpFile; if (hdfcmpFile.Initialize(cmpFileName) == 0) { cout << "ERROR, could not open the cmp file." << endl; exit(1); } hdfcmpFile.Read(cmpFile); int alignmentIndex; for (alignmentIndex = 0; alignmentIndex < cmpFile.alnInfo.alignments.size(); alignmentIndex++) { int alnHoleNumber; alnHoleNumber = cmpFile.alnInfo.alignments[alignmentIndex].GetHoleNumber(); int hi; bool printThisAlignment = false; // // Read the alignment string. All alignments // int refGroupId = cmpFile.alnInfo.alignments[alignmentIndex].GetRefGroupId(); int alnGroupId = cmpFile.alnInfo.alignments[alignmentIndex].GetAlnGroupId(); int refGroupIndex = hdfcmpFile.refGroupIdToArrayIndex[refGroupId]; string readGroupName = hdfcmpFile.alnGroupIdToReadGroupName[alnGroupId]; int readGroupIndex = hdfcmpFile.refAlignGroups[refGroupIndex]->experimentNameToIndex[readGroupName]; string refGroupPath = cmpFile.refGroup.path[refGroupIndex]; for (hi = 0; hi < holeNumbers.size(); hi++) { if (alnHoleNumber == holeNumbers[hi]) { printThisAlignment = true; break; } } int ri; for (ri = 0; ri < refGroups.size(); ri++) { if (refGroups[ri] == refGroupPath) { printThisAlignment = true; break; } } if (printThisAlignment or printAll) { unsigned int alignStartIndex, alignEndIndex; UInt offsetBegin, offsetEnd; string refSequence; string readSequence; vector<unsigned char> byteAlignment; offsetBegin = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetBegin(); offsetEnd = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetEnd(); int alignedSequenceLength = offsetEnd - offsetBegin; if (alignedSequenceLength >= 0) { refSequence.resize(alignedSequenceLength); byteAlignment.resize(alignedSequenceLength); } hdfcmpFile.refAlignGroups[refGroupIndex]->readGroups[readGroupIndex]->alignmentArray.Read(offsetBegin, offsetEnd, &byteAlignment[0]); readSequence.resize(byteAlignment.size()); refSequence.resize(byteAlignment.size()); ByteAlignmentToQueryString(&byteAlignment[0], byteAlignment.size(), &readSequence[0]); ByteAlignmentToRefString(&byteAlignment[0], byteAlignment.size(), &refSequence[0]); string ungappedRead, ungappedRef; RemoveGaps(readSequence, ungappedRead); RemoveGaps(refSequence, ungappedRef); Alignment alignment; GappedStringsToAlignment(readSequence, refSequence, alignment); DNASequence qAlignedSeq, rAlignedSeq; qAlignedSeq.seq = (Nucleotide*) &ungappedRead[0]; qAlignedSeq.length = ungappedRead.size(); rAlignedSeq.seq = (Nucleotide*) &ungappedRef[0]; rAlignedSeq.length = ungappedRef.size(); int qStart = cmpFile.alnInfo.alignments[alignmentIndex].GetQueryStart(); int tStart = cmpFile.alnInfo.alignments[alignmentIndex].GetRefStart(); stringstream sstrm; sstrm << alnHoleNumber << "/" << qStart << "_" << cmpFile.alnInfo.alignments[alignmentIndex].GetQueryEnd(); alignment.qName = sstrm.str(); StickPrintAlignment(alignment, qAlignedSeq, rAlignedSeq, cout, qStart, tStart); } } }
int main(int argc, char* argv[]) { string cmpFileName; CommandLineParser clp; bool printTotalAlignedBases = false; bool printNReads = false; bool printGlobalAccuracy = false; bool printNMatches = false; bool printAverageAccuracy = false; bool printDistBetweenErrorsHist = false; float identityCutoff = 0.0; bool printBinnedErrorRate = false; int minAlignLength = 0; bool printBreakdown = false; int nBins = 20; float totalPercentIdentity = 0; string binnedErrorDistributionFileName = ""; bool countBasesByMovie = false; string matchGapFileName = ""; string readMatchFileName = ""; int matchGapK = 15; string matchRunFileName = ""; string lengthsFileName = ""; bool printAverageLength = false; bool printMovingAverage = false; string movingAverageFileName = ""; string matchCountFileName = ""; string errorMatrixName = ""; bool discardFirstMatch = false; bool discardLastMatch = false; clp.RegisterStringOption("cmpH5File", &cmpFileName, "Input cmp.h5 file.", true); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterFlagOption("nAlignments", &printNReads, "Print the total number of reads.", false); clp.RegisterFlagOption("nAlignedBases", &printTotalAlignedBases, "Print the total number of aligned bases", false); clp.RegisterFloatOption("identityCutoff", &identityCutoff, "Print the total number of aligned bases", CommandLineParser::PositiveFloat, false); clp.RegisterFlagOption("binnedErrorRate", &printBinnedErrorRate, "Divide each read into N bins, and count the error rate in each bin", false); clp.RegisterIntOption("nBins", &nBins, "The number of bins for binned error rate.", CommandLineParser::PositiveInteger, false); clp.RegisterIntOption("minAlignLength", &minAlignLength, "Disregard alignments less than n length.", CommandLineParser::PositiveInteger, false); clp.RegisterStringOption("matchCount", &matchCountFileName, "Print the count of matches/mismatches/ins/del per pos to file.", false); clp.RegisterFlagOption("globalAccuracy", &printGlobalAccuracy, "Print the accuracy across all sequences.", false); clp.RegisterFlagOption("averageAccuracy", &printAverageAccuracy, "Print average accuracy of reads.", false); clp.RegisterFlagOption("nMatches", &printNMatches, "Print the number of bases matched.", false); clp.RegisterFlagOption("breakdown", &printBreakdown, "Print insertion/deletion/mismatch breakdown.", false); clp.RegisterFlagOption("errdist", &printDistBetweenErrorsHist, "Print a histogram of distance between errors.", false); clp.RegisterFlagOption("bymovie", &countBasesByMovie, "Count the number of bases aligned in each movie.", false); clp.RegisterStringOption("matchRun", &matchRunFileName, "Print lengths of runs of matches to file.", false); clp.RegisterFlagOption("discardFirstMatch", &discardFirstMatch, "Do not print the first run of matches.", false); clp.RegisterFlagOption("discardLastMatch", &discardLastMatch, "Do not print the last run of matches.", false); clp.RegisterStringOption("lengths", &lengthsFileName, "Print all subread lengths to a file.", false); clp.RegisterFlagOption("averageLength", &printAverageLength, "Print the average subread length.", false); clp.RegisterStringOption("printMovingAverageErrorRate", &movingAverageFileName, "Print moving average of accuracy.", false); clp.RegisterStringOption("binnedErrorDistribution", &binnedErrorDistributionFileName, "Print all binned error rates to a file", false); clp.RegisterStringOption("matchGap", &matchGapFileName, "Print all matches and the gap between them to file.", false); clp.RegisterStringOption("matchesPerRead", &readMatchFileName, "Print statistics for the number of matches found in a read", false); clp.RegisterIntOption("matchGapK", &matchGapK, "(15) The minimum word size to match.", CommandLineParser::PositiveInteger, false); clp.RegisterStringOption("errmat", &errorMatrixName, "Store error rates by read length", false); clp.ParseCommandLine(argc, argv); vector<vector<int> > matchMatrix; vector<vector<int> > errorMatrix; matchMatrix.resize(20); errorMatrix.resize(20); map<int,int> distHist; map<string,int> basesByMovie, readsByMovie; CmpFile cmpFile; /* * These readers pull information from the same pls file. */ HDFCmpFile<CmpAlignment> cmpReader; if (cmpReader.Initialize(cmpFileName, H5F_ACC_RDONLY) == 0) { cout << "ERROR, could not open the cmp file." << endl; exit(0); } ofstream movingAverageFile; if (movingAverageFileName != "") { printMovingAverage= true; CrucialOpen(movingAverageFileName, movingAverageFile, std::ios::out); } ofstream matchCountFile; if (matchCountFileName != "") { CrucialOpen(matchCountFileName, matchCountFile, std::ios::out); } cmpReader.Read(cmpFile); int alignmentIndex; long nAlignments = 0; long totalAlignedBases = 0; long totalMatchedBases = 0; long totalAlignedLength = 0; long totalInsertion = 0, totalDeletion = 0, totalMismatch = 0; ofstream accuracyBinsFile, matchGapFile, readMatchFile, matchRunFile, lengthsFile, errMatFile; vector<vector<float> > accuracyDistributionBins; vector<int> mc, ic, dc, mmc; if (binnedErrorDistributionFileName != "") { CrucialOpen(binnedErrorDistributionFileName, accuracyBinsFile, std::ios::out); accuracyDistributionBins.resize(nBins); } if (lengthsFileName != "") { CrucialOpen(lengthsFileName, lengthsFile, std::ios::out); } if (errorMatrixName != "") { CrucialOpen(errorMatrixName, errMatFile, std::ios::out); } vector<float> accuracyBins; if (nBins > 0) { accuracyBins.resize(nBins); } if (matchRunFileName != "") { CrucialOpen(matchRunFileName, matchRunFile, std::ios::out); matchRunFile << "run_length gap_length" << endl; } if (matchGapFileName != "") { CrucialOpen(matchGapFileName, matchGapFile, std::ios::out); matchGapFile << "match_length gap_length" << endl; } if (readMatchFileName != "") { CrucialOpen(readMatchFileName, readMatchFile, std::ios::out); readMatchFile << "read_length nmatches" << endl; } for (alignmentIndex = 0; alignmentIndex < cmpFile.alnInfo.alignments.size(); alignmentIndex++) { vector<unsigned char> byteAlignment; UInt offsetBegin, offsetEnd; offsetBegin = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetBegin(); offsetEnd = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetEnd(); int subreadLength = (cmpFile.alnInfo.alignments[alignmentIndex].GetQueryEnd() - cmpFile.alnInfo.alignments[alignmentIndex].GetQueryStart()); if (lengthsFileName != "") { lengthsFile << subreadLength << endl; } int alignedSequenceLength = offsetEnd - offsetBegin; string alignedSequence; if (alignedSequenceLength >= 0) { alignedSequence.resize(alignedSequenceLength); byteAlignment.resize(alignedSequenceLength); } // // Read the alignment string. All alignments // // // Alignments are groupsd by ref group id then movie id. // int refGroupId = cmpFile.alnInfo.alignments[alignmentIndex].GetRefGroupId(); int movieId = cmpFile.alnInfo.alignments[alignmentIndex].GetMovieId(); // // Now locate where this movie is stored. // if (cmpReader.refGroupIdToArrayIndex.find(refGroupId) == cmpReader.refGroupIdToArrayIndex.end()) { cout << "ERROR! An alignment " << alignmentIndex << " is specified with reference group " << endl << refGroupId << " that is not found as an alignment group." << endl; exit(1); } int refGroupIndex = cmpReader.refGroupIdToArrayIndex[refGroupId]; int readGroupIndex = cmpReader.refAlignGroups[refGroupIndex]->movieIdToIndex[movieId]; cmpReader.refAlignGroups[refGroupIndex]->readGroups[readGroupIndex]->alignmentArray.Read(offsetBegin, offsetEnd, &byteAlignment[0]); /* if (matchRunFileName != "") { PrintAlignment(byteAlignment, matchRunFile); } */ int n1 = 0, ngt1 = 0; if (matchGapFileName != "" or readMatchFileName != "" or matchRunFileName != "") { // // Print the matches and the gaps in this alignment to // the file. // int i = 0; int alignEnd = byteAlignment.size() - matchGapK; bool matchFound = false; int prevMatchEnd = 0; int prevMatchLength = 0; int nMatches = 0; int prevMatchAnyEnd = 0; while (i < alignEnd) { // Find the first matching character. int nonMatchLength = 0; while (i < alignEnd and (QueryChar[byteAlignment[i]] == ' ' or RefChar[byteAlignment[i]] == ' ' or QueryChar[byteAlignment[i]] != RefChar[byteAlignment[i]])) { i++; nonMatchLength++; } if (i >= alignEnd) { break; } // find the end of this match int matchStart = i; while (i < alignEnd and QueryChar[byteAlignment[i]] != ' ' and RefChar[byteAlignment[i]] != ' ' and QueryChar[byteAlignment[i]] == RefChar[byteAlignment[i]]) { i++; } int matchEnd = i; if (matchRunFileName != "") { bool printAlignment = true; if (discardFirstMatch and matchStart == 0) { printAlignment = false; } if (discardLastMatch and matchEnd == alignEnd) { printAlignment = false; } if (printAlignment) { matchRunFile << matchEnd - matchStart << " " << nonMatchLength << endl; } if (matchEnd - matchStart == 1) { n1++; } else { ngt1++;} } prevMatchAnyEnd = i; // If this match counts as an anchor, process it. if (i - matchStart >= matchGapK) { // Processing starts by looking to see if a previous anchor was found // and if so, printing that and the gap to the current anchor. // if (matchFound == true) { if (matchGapFileName != "") { matchGapFile << prevMatchLength << " " << matchStart - prevMatchEnd << endl; } ++nMatches; } // Processing ends by storing the length of the match, and where // it ended so that the next iteration can use it. matchFound = true; prevMatchLength = i - matchStart; prevMatchEnd = i; } } if (readMatchFileName != "") { readMatchFile << byteAlignment.size() << " " << nMatches << endl; } } if (matchRunFileName != "") { // matchRunFile << "n1: " << n1 << " ngt1 " << ngt1 << endl; } if (countBasesByMovie) { string movieName = cmpFile.movieInfo.name[readGroupIndex]; if (basesByMovie.find(movieName) == basesByMovie.end()) { readsByMovie[movieName] = 0; basesByMovie[movieName] = 0; } basesByMovie[movieName]+= offsetEnd - offsetBegin; readsByMovie[movieName]++; } if (printDistBetweenErrorsHist) { AddDistancesBetweenErrors(byteAlignment, distHist); } float readPctIdentity; readPctIdentity = ComputePacBioAccuracy(byteAlignment); if (readPctIdentity < identityCutoff) { continue; } if (byteAlignment.size() < minAlignLength) { continue; } ++nAlignments; // // several stats use total aligned length int qStart = cmpFile.alnInfo.alignments[alignmentIndex].GetQueryStart(); int qEnd = cmpFile.alnInfo.alignments[alignmentIndex].GetQueryEnd(); totalAlignedBases += qEnd - qStart; if (errorMatrixName != "") { StoreErrorRateMatrix(byteAlignment, matchMatrix, errorMatrix); } if (printBinnedErrorRate) { StoreBinnedErrorRate(byteAlignment, accuracyBins); } if (binnedErrorDistributionFileName != "") { AppendBinnedErrorRate(byteAlignment, accuracyDistributionBins); } int nMatch, nMismatch, nIns, nDel; CountStats(byteAlignment, nMatch, nMismatch, nIns, nDel); float total = nMatch + nMismatch + nIns + nDel; totalMatchedBases += nMatch; totalMismatch += nMismatch; totalInsertion += nIns; totalDeletion += nDel; // totalMatchedBases += CountNMatches(byteAlignment); totalAlignedLength += byteAlignment.size(); totalPercentIdentity += ComputePercentIdentity(byteAlignment); if (printMovingAverage) { vector<float> movingAverage; StoreMovingAverage(byteAlignment, movingAverage); int i; for (i = 0; i < movingAverage.size(); i++) { movingAverageFile << i << " " << movingAverage[i] << endl; } } if (matchCountFileName != "") { StoreMatchCounts(byteAlignment, mc, ic, dc, mmc); } } if (countBasesByMovie) { map<string,int>::iterator mapIt; for (mapIt = basesByMovie.begin(); mapIt != basesByMovie.end(); ++mapIt) { cout << mapIt->first << " " << readsByMovie[mapIt->first] << " " << mapIt->second << endl;; } } if (matchCountFileName != "") { int i; for (i = 0; i < mc.size(); i++) { matchCountFile << mc[i] << " " << ic[i] << " " << dc[i] << " " << mmc[i] << endl; } matchCountFile.close(); } if (printNReads) { cout << "nAlignments\t"<< nAlignments << endl; } if (printTotalAlignedBases) { cout << "totalAlignedBases\t" << totalAlignedBases << endl; } if (printBreakdown) { float totalTemplate = totalMatchedBases + totalMismatch + totalDeletion + totalInsertion; cout << "M_MM_I_D " << totalMatchedBases << " " << totalMismatch << " " << totalInsertion << " " << totalDeletion << " " << totalMatchedBases / totalTemplate << " " << totalMismatch / totalTemplate << " " << totalInsertion / totalTemplate << " " << totalDeletion / totalTemplate << endl; } if (printNMatches) { cout << "totalMatches\t" << totalMatchedBases << endl; } if (printGlobalAccuracy) { cout << "globalAccuracy\t"<< ((float)totalMatchedBases) / totalAlignedLength << endl; } if (printAverageAccuracy) { cout << "averageAccuracy\t" << totalPercentIdentity / nAlignments << endl; } if (printAverageLength) { cout << "averageAlignmentLength\t" << totalAlignedBases / (float)nAlignments << endl; } if (printBinnedErrorRate) { int i; for (i = 0; i < nBins; i++) { accuracyBins[i] /= nAlignments; cout << accuracyBins[i] << " "; } cout << endl; } if (binnedErrorDistributionFileName != "") { int i, b; for (i = 0; i < accuracyDistributionBins[0].size(); i++) { for (b = 0; b < nBins; b++) { accuracyBinsFile << accuracyDistributionBins[b][i] << " "; } accuracyBinsFile << endl; } accuracyBinsFile.close(); } if (lengthsFileName != "") { lengthsFile.close(); } if (printDistBetweenErrorsHist) { map<int,int>::iterator histIt; for (histIt = distHist.begin(); histIt != distHist.end(); ++histIt) { cout << "hist " << histIt->first << " " << histIt->second << endl; } } if (printMovingAverage){ movingAverageFile.close(); } if (errorMatrixName != "") { int i, j; for (i = 0; i< matchMatrix.size(); i++) { for (j = 0; j < matchMatrix[i].size(); j++) { if ((matchMatrix[i][j] + errorMatrix[i][j]) > 0) { errMatFile << ((float)matchMatrix[i][j]) / ((matchMatrix[i][j] + errorMatrix[i][j])) << " "; } else { errMatFile << " 0 "; } } errMatFile << endl; } errMatFile.close(); } return 0; }
int main(int argc, char* argv[]) { string cmpFileName, movieFileName; int argi = 3; int numMetrics = 8; map<string,bool> metricOptions; int maxElements = 0; // // Default is all options are true // CreateMetricOptions(metricOptions); string metricList = ""; bool useCcs = false; bool byRead = false; bool failOnMissingData = false; CommandLineParser clp; bool printVersion = false; clp.RegisterStringOption("basFileName", &movieFileName, "The input {bas,pls}.h5 or input.fofn.", true); clp.RegisterStringOption("cmpFileName", &cmpFileName, "The cmp.h5 file to load pulse information into.", true); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterStringOption("metrics", &metricList, "The a string delimited list of metrics (with no spaces).The " "valid options are: QualityValue, ClassifierQV, MergeQV, StartFrame," "PulseWidth, pkmid, IPD, and Light."); clp.RegisterFlagOption("useccs", &useCcs, "Load pulse information for CCS sequences and not raw bases."); clp.RegisterFlagOption("byread", &byRead, "Load pulse information by read rather than buffering an entire pls.h5 file. " "This option will soon be deprecated and on by default."); clp.RegisterIntOption("maxElements", &maxElements, "Set a limit on the size of pls/bas file to buffer in.", CommandLineParser::PositiveInteger); clp.RegisterFlagOption("failOnMissingData", &failOnMissingData, "Exit if any data fields are missing from the bas.h5 or pls.h5 input that are required to load a metric. Defualt is a warning."); clp.SetProgramSummary("Load pulse information such as inter pulse distance, or quality information into the cmp.h5 file." "This allows one to analyze kinetic and quality information by alignment column."); clp.ParseCommandLine(argc, argv); if (printVersion) { cout << VERSION << endl; exit(1); } if (metricList == "") { SetDefaultMetricOptions(metricOptions); } else { ParseMetricsList(metricList, metricOptions); } // // Always read in basecalls since they are used to check the sanity // of the alignment indices. // metricOptions["Basecall"] = true; // // Translate from the metrics to be loaded to the ones that are // required to compute them. // vector<string> datasetFields; RequirementMap fieldRequirements; BuildRequirementMap(fieldRequirements); StoreDatasetFieldsFromPulseFields(metricOptions, fieldRequirements, datasetFields); vector<string> movieFileNames; vector<string> fofnMovieNames; FileOfFileNames::StoreFileOrFileList(movieFileName, movieFileNames); HDFBasReader hdfBasReader; HDFPlsReader hdfPlsReader; HDFCCSReader<SMRTSequence> hdfCcsReader; vector<string> baseFileFields, pulseFileFields; int fieldIndex; bool useBaseFile = false, usePulseFile = false; for (fieldIndex = 0; fieldIndex < datasetFields.size(); fieldIndex++) { if (hdfBasReader.ContainsField(datasetFields[fieldIndex])) { useBaseFile = true; baseFileFields.push_back(datasetFields[fieldIndex]); } } if (maxElements != 0) { hdfBasReader.maxAllocNElements = maxElements; hdfPlsReader.maxAllocNElements = maxElements; } // // For now, all runs will attempt to use information from a .bas // file, since it's assumed that if one has alignments, one has a // .bas file. // useBaseFile = true; // // Add some default fields. // hdfBasReader.IncludeField("Basecall"); hdfBasReader.IncludeField("PulseIndex"); hdfBasReader.InitializeFields(baseFileFields); for (fieldIndex = 0; fieldIndex < datasetFields.size(); fieldIndex++) { if (hdfPlsReader.ContainsField(datasetFields[fieldIndex])) { usePulseFile = true; pulseFileFields.push_back(datasetFields[fieldIndex]); } } if (usePulseFile) { hdfPlsReader.InitializeFields(pulseFileFields); } hdfPlsReader.IncludeField("NumEvent"); int nMovies = movieFileNames.size(); int movieIndex; MovieNameToArrayIndex movieNameMap; // // Initialize movies. This accomplishes two tasks. First, all movie // files are opened and initialized, so that if there are data // fields missing the program will exit now rather than in the // middle of loading pulses. // Next, a list of movie names is created in fofnMovieNames. The // cmp file does not necessarily index movies in the order of the // fofn, and so when loading pulses from a movie indexed by a cmp // file, one needs to look up the file name of the movie. This is // done by scanning the fofnMovieNames list in order until the movie // is found. for (movieIndex = 0; movieIndex < nMovies; movieIndex++) { if (!hdfBasReader.Initialize(movieFileNames[movieIndex])) { cout << "ERROR, could not initialize HDF file " << movieFileNames[movieIndex] << " for reading bases." << endl; exit(1); } else { fofnMovieNames.push_back(hdfBasReader.GetMovieName()); movieNameMap[hdfBasReader.GetMovieName()] = movieIndex; hdfBasReader.Close(); } // // The pulse file is optional. // if (usePulseFile) { if (hdfPlsReader.Initialize(movieFileNames[movieIndex]) == 0) { usePulseFile = false; } } } CmpFile cmpFile; /* * These readers pull information from the same pls file. */ HDFCmpFile<CmpAlignment> cmpReader; if (cmpReader.Initialize(cmpFileName, H5F_ACC_RDWR) == 0) { cout << "ERROR, could not open the cmp file." << endl; exit(0); } cmpReader.Read(cmpFile); string commandLine; clp.CommandLineToString(argc, argv, commandLine); string versionStr(VERSION); AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionStr); cmpReader.fileLogGroup.AddEntry(commandLine, "Loading pulse metrics", "loadPulses", GetTimestamp(), versionStr); // // Group alignment indices by movie so that they may be processed one movie at a time // later on. The movie indices set keeps track of all indices // listed in alignment files. This keeps a reference to all // alignments in memory at once. At the time of writing this, most // projects will have at most a few million alignments, and so the // size of this structure is modest. // UInt alignmentIndex; map<int, vector<int> > movieIndexSets; for (alignmentIndex = 0; alignmentIndex < cmpFile.alnInfo.alignments.size(); alignmentIndex++) { movieIndexSets[cmpFile.alnInfo.alignments[alignmentIndex].GetMovieId()].push_back(alignmentIndex); } vector<float> computedPulseField; string alignedSequence; string readSequence; vector<unsigned char> byteAlignment; int m; vector<int> baseToAlignmentMap; // // Load pulses from movies in order they appear in the input fofn. // int fofnMovieIndex; for (fofnMovieIndex = 0; fofnMovieIndex < fofnMovieNames.size(); fofnMovieIndex++) { if (cmpFile.readType == ReadType::CCS or useCcs) { hdfBasReader.SetReadBasesFromCCS(); hdfCcsReader.Initialize(movieFileNames[fofnMovieIndex]); } hdfBasReader.Initialize(movieFileNames[fofnMovieIndex]); BaseFile baseFile; PulseFile pulseFile; if (byRead == false) { // // Read the entire bas file at once, and then extract values // from memory. This can be faster depending on the chunk // size and size of the movie. // hdfBasReader.ReadBaseFile(baseFile); hdfBasReader.Close(); } else { // // Reads are scanned one by instead of caching all. It is // still necessary to read in some of the datasets entirely, // in particular the start positions and hole numbers. // // This is repeated below for a pulse file. Since the pulse // and base files are separate objects, the scan data is // read into each separately. Somehow later the information // should be merged into just one. if (hdfBasReader.scanDataReader.fileHasScanData) { hdfBasReader.scanDataReader.Read(baseFile.scanData); } baseFile.readStartPositions.resize(hdfBasReader.nReads+1); baseFile.readStartPositions[0] = 0; hdfBasReader.GetAllReadLengths(baseFile.readLengths); int i; assert(baseFile.readLengths.size() + 1 == baseFile.readStartPositions.size()); for (i = 1; i < hdfBasReader.nReads + 1; i++ ) { baseFile.readStartPositions[i] = baseFile.readLengths[i-1] + baseFile.readStartPositions[i-1]; } // // Although the whole bas file isn't being read in, it is // necessary to read in which hole numbers are contained in this // bas file since it is possible that the alignment for a // particular hole number may be in a different input bas.h5 // file even if it is the same movie. // hdfBasReader.GetAllHoleNumbers(baseFile.holeNumbers); } set<uint32_t> moviePartHoleNumbers; copy(baseFile.holeNumbers.begin(), baseFile.holeNumbers.end(), inserter(moviePartHoleNumbers, moviePartHoleNumbers.begin())); if (usePulseFile) { hdfPlsReader.Initialize(movieFileNames[fofnMovieIndex]); hdfPlsReader.IncludeField("NumEvent"); hdfPlsReader.IncludeField("StartFrame"); if (byRead == false) { hdfPlsReader.ReadPulseFile(pulseFile); hdfPlsReader.Close(); } else { if (usePulseFile) { pulseFile.pulseStartPositions.resize(hdfBasReader.nReads+1); pulseFile.pulseStartPositions[0] = 0; hdfPlsReader.GetAllNumEvent(pulseFile.numEvent); int i; for (i = 1; i < hdfBasReader.nReads + 1; i++ ) { pulseFile.pulseStartPositions[i] = pulseFile.numEvent[i-1] + pulseFile.pulseStartPositions[i-1]; } if (hdfPlsReader.scanDataReader.fileHasScanData) { hdfPlsReader.scanDataReader.Read(pulseFile.scanData); } } } } string cmpFileMovieName; for (m = 0; m < cmpFile.movieInfo.name.size(); m++) { // // First find the file name for the movie 'm' // cmpFileMovieName = cmpFile.movieInfo.name[m]; int fofnMovieIndex; if (baseFile.GetMovieName() == cmpFileMovieName) { break; } } // // If the movie specified in the input.fofn is not found in the // cmp file, that indicates something bad is happeing. Either the // input.fofn was not used to generate the cmp.h5 file, or no // alignments were found between the input bas.h5 and the // reference. That shouldn't happen. // if (m == cmpFile.movieInfo.name.size()) { cout << "WARNING: The movie indexed in the compare file " << cmpFileMovieName << " is not listed in the file " << movieFileName << endl; continue; } // // Open the movie and load its pulses into memory. // movieIndex = cmpFile.movieInfo.id[m]; int movieAlignmentIndex; float NaN = 0.0/0.0; UChar missingQualityValue = 255; HalfWord missingFrameRateValue = USHRT_MAX; unsigned int missingPulseIndex = UINT_MAX; // // Since usePulseFile is set when the input file is a pulseFile, // and ReadType::CCS becomes the read type when the alignments are // ccs, when pulse files are specified for de novo ccs alignments, // they will be opened as pulse files. Since the de novo ccs // sequences do not have pulse file information, the auto-reading // of pulse files needs to be disabled. Do that here. // if (cmpFile.readType == ReadType::CCS or useCcs) { usePulseFile = false; } // // Now check the sanity of metric options. // map<string,bool>::iterator metricIt; for (metricIt = metricOptions.begin(); metricIt != metricOptions.end(); ++metricIt) { if (metricIt->second == false) { continue; } bool metricMayBeComputed = true; if (cmpFile.readType == ReadType::CCS and metricIt->first != "QualityValue" and metricIt->first != "DeletionQV" and metricIt->first != "SubstitutionQV" and metricIt->first != "InsertionQV" and metricIt->first != "DeletionTag" and metricIt->first != "SubstitutionTag" and metricIt->first != "Basecall") { cout << "ERROR! The metric " << metricIt->first << " cannot be loaded into de novo ccs alignemnts." << endl; // exit(0); metricMayBeComputed = false; } if (metricIt->first == "IPD") { // // The field requirements for IPD are special. // if ((useBaseFile and !hdfBasReader.FieldIsIncluded("PreBaseFrames")) or (usePulseFile and (!hdfPlsReader.FieldIsIncluded("StartFrame") and !hdfPlsReader.FieldIsIncluded("WidthInFrames")))) { metricMayBeComputed = false; } } else { if (fieldRequirements.find(metricIt->first) != fieldRequirements.end()) { // // There are requirements for this field. Make sure all are // present before trying to compute this field. // int requirementIndex; for (requirementIndex = 0; requirementIndex < fieldRequirements[metricIt->first].size(); ++requirementIndex) { string requirement; requirement = fieldRequirements[metricIt->first][requirementIndex]; if (((useBaseFile == false or ((hdfBasReader.includedFields.find(requirement) == hdfBasReader.includedFields.end() or hdfBasReader.includedFields[requirement] == false))) and ((usePulseFile == false or (hdfPlsReader.includedFields.find(requirement) == hdfPlsReader.includedFields.end() or hdfPlsReader.includedFields[requirement] == false))))) { metricMayBeComputed = false; } } } else { // // There are no requirements for this field, so it must exist as // a datset in either the bas or pls file. // if ((useBaseFile == false or ((hdfBasReader.includedFields.find(metricIt->first) == hdfBasReader.includedFields.end() or hdfBasReader.includedFields[metricIt->first] == false))) and (usePulseFile == false or (((hdfPlsReader.includedFields.find(metricIt->first) == hdfPlsReader.includedFields.end() or hdfPlsReader.includedFields[metricIt->first] == false))))) { metricMayBeComputed = false; } } } if (metricMayBeComputed == false) { if (failOnMissingData) { cout << "ERROR"; } else { cout << "WARNING"; } cout << ": There is insufficient data to compute metric: " << metricIt->first << " in the file " << movieFileNames[fofnMovieIndex] << " "; cout << " It will be ignored." << endl; if (failOnMissingData) { exit(1); } metricOptions[metricIt->first] = false; } } UInt i; // // This is currently used as a sentinal for showing that an array // element does not have a value stored for it, as in deleted // bases. // vector<int> pulseIndexArray; vector<unsigned int> statTime; if (metricOptions["WhenStarted"]) { string whenStarted; if (hdfPlsReader.scanDataReader.useWhenStarted == false) { cout << "ERROR! Attempting to read WhenStarted from " << movieFileNames[fofnMovieIndex] << " but the attriubte does not exist." << endl; exit(1); } hdfPlsReader.scanDataReader.ReadWhenStarted(whenStarted); if (!cmpReader.movieInfoGroup.whenStartedArray.IsInitialized()) { cmpReader.movieInfoGroup.whenStartedArray.Initialize(cmpReader.movieInfoGroup.movieInfoGroup, "WhenStarted"); } cmpReader.movieInfoGroup.whenStartedArray.Write(&whenStarted, 1); } if (AnyFieldRequiresFrameRate(datasetFields)) { if (useBaseFile) { cmpReader.movieInfoGroup.StoreFrameRate(m, baseFile.GetFrameRate()); } else if (usePulseFile) { cmpReader.movieInfoGroup.StoreFrameRate(m, pulseFile.GetFrameRate()); } } // // An index set is a set of indices into the alignment array that // are of reads generated by this movie. Load pulses for all // alignments generated for this movie. // // // Movie index sets should be sorted by alignment index. Build a lookup table for this. // std::vector<std::pair<int,int> > toFrom; for (movieAlignmentIndex = 0; movieAlignmentIndex < movieIndexSets[movieIndex].size(); movieAlignmentIndex++) { alignmentIndex = movieIndexSets[movieIndex][movieAlignmentIndex]; toFrom.push_back(std::pair<int,int>(cmpFile.alnInfo.alignments[alignmentIndex].GetAlignmentId(), movieAlignmentIndex)); } // orders by first by default. std::sort(toFrom.begin(), toFrom.end()); // // Load metrics for alignments from movie 'movieIndex'. // cout << "loading " << movieIndexSets[movieIndex].size() << " alignments for movie " << movieIndex << endl; for (movieAlignmentIndex = 0; movieAlignmentIndex < movieIndexSets[movieIndex].size(); movieAlignmentIndex++) { alignmentIndex = movieIndexSets[movieIndex][toFrom[movieAlignmentIndex].second]; // // Alignments are groupsd by ref group id then movie id. // int refGroupId = cmpFile.alnInfo.alignments[alignmentIndex].GetRefGroupId(); int movieId = cmpFile.alnInfo.alignments[alignmentIndex].GetMovieId(); UInt holeNumber = cmpFile.alnInfo.alignments[alignmentIndex].GetHoleNumber(); // // Since the movie may be split into multiple parts, look to see // if this hole number is one of the ones covered by this // set. If it is not, just continue. It will be loaded on // another pass through a different movie part. // if (moviePartHoleNumbers.find(holeNumber) == moviePartHoleNumbers.end()) { continue; } // // Now locate where this movie is stored. // if (cmpReader.refGroupIdToArrayIndex.find(refGroupId) == cmpReader.refGroupIdToArrayIndex.end()) { cout << "ERROR! An alignment " << alignmentIndex << " is specified with reference group " << endl << refGroupId << " that is not found as an alignment group." << endl; exit(1); } int refGroupIndex = cmpReader.refGroupIdToArrayIndex[refGroupId]; // // Now find the group containing the alignment for this movie. // if (cmpReader.refAlignGroups[refGroupIndex]->movieIdToIndex.find(movieId) == cmpReader.refAlignGroups[refGroupIndex]->movieIdToIndex.end()) { cout << "ERROR! An alignment " << alignmentIndex << " is specified with movie index " << endl << movieId << " that is not found in the alignment group " << refGroupIndex << endl; exit(1); } int readGroupIndex = cmpReader.refAlignGroups[refGroupIndex]->movieIdToIndex[movieId]; // // First do sanity check on the read to make sure the pules and the bases match. // // // Look to see if the output HDF arrays need to be created. // UInt offsetBegin, offsetEnd; offsetBegin = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetBegin(); offsetEnd = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetEnd(); int alignedSequenceLength = offsetEnd - offsetBegin; if (alignedSequenceLength >= 0) { alignedSequence.resize(alignedSequenceLength); byteAlignment.resize(alignedSequenceLength); } // // Read the alignment string. All alignments // cmpReader.refAlignGroups[refGroupIndex]->readGroups[readGroupIndex]->alignmentArray.Read(offsetBegin, offsetEnd, &byteAlignment[0]); // // Convert to something we can compare easily. // ByteAlignmentToQueryString(&byteAlignment[0], byteAlignment.size(), &alignedSequence[0]); // // Do a sanity check to make sure the pulses and the alignment // make sense. The main check is to see if the query sequence // in the alignment is the same as the query sequence in the // read. // // // First pull out the bases corresponding to this read. // int queryStart = cmpFile.alnInfo.alignments[alignmentIndex].GetQueryStart(); int queryEnd = cmpFile.alnInfo.alignments[alignmentIndex].GetQueryEnd(); // Build a map of where CreateSequenceToAlignmentMap(byteAlignment, baseToAlignmentMap); // // Condense gaps in the alignment for easy comparison. // // RemoveGaps(alignedSequence, alignedSequence); // // Query the cmp file for a way to look up a read based on // coordinate information. For Astro reads, the coords are // based on x and y. For Springfield, it is read index. The // base files should be able to look up reads by x,y or by // index. // int readIndex; if (cmpFile.platformId == Astro) { cout << "ASTRO pulse loading is deprecated." << endl; exit(0); } if (baseFile.LookupReadIndexByHoleNumber(holeNumber, readIndex) == false) { cout << "ERROR! Alignment has hole number " << holeNumber << " that is not in the movie. " << endl; assert(0); } int readStart, readLength, alignBaseStart, alignBaseEnd, alignBaseLength; readStart = baseFile.readStartPositions[readIndex]; readLength = baseFile.readStartPositions[readIndex+1] - baseFile.readStartPositions[readIndex]; alignBaseStart = readStart + queryStart; alignBaseEnd = readStart + queryEnd; alignBaseLength = alignBaseEnd - alignBaseStart; int pulseStart; if (usePulseFile) { pulseStart = pulseFile.pulseStartPositions[readIndex]; } // // This maps from pulse to a base, since there are more pulses // called than bases, and the is one pulse for every base. // pulseIndexArray.resize(readLength); SMRTSequence sourceRead; unsigned int numPasses; // // These are not allocated in the regular allocate function // since they are only used in loadPulses. (maybe I should // subclass SMRTSequence here). // if (byRead) { // Read in the data from the bas file if it exsts. if (useBaseFile) { hdfBasReader.GetReadAt(readIndex, sourceRead); if (cmpFile.readType == ReadType::CCS or useCcs) { numPasses = hdfCcsReader.GetNumPasses(readIndex); } } // Read in the data from the pls file if it exists. if (usePulseFile) { hdfPlsReader.GetReadAt(readIndex, sourceRead.pulseIndex, sourceRead); } } else { // // The entire base/pulse file was read in, so copy data from that into a read // For the data used in the read, it is possible to simply // reference the data, but for the pls file it is necessary // to copy since there is a packing of data. // if (useBaseFile) { baseFile.CopyReadAt(readIndex, sourceRead); if (cmpFile.readType == ReadType::CCS or useCcs) { numPasses = hdfCcsReader.GetNumPasses(readIndex); } } if (usePulseFile) { // // Copy the subset of pulses that correspond to the ones called as bases. // int i; for (i = 0; i < readLength; i++) { pulseIndexArray[i] = pulseStart + baseFile.pulseIndex[readStart + i]; } pulseFile.CopyReadAt(readIndex, &pulseIndexArray[0], sourceRead); } } readSequence.resize(queryEnd - queryStart); CapQualityValues(sourceRead); copy((char*) (sourceRead.seq + queryStart), (char*) (sourceRead.seq + queryEnd), readSequence.begin()); bool stringsMatch = true; if (alignedSequence.size() != readSequence.size() or alignedSequence != readSequence) { cout << "ERROR, the query sequence does not match the aligned query sequence." << endl; cout << "HoleNumber: "<< holeNumber << ", MovieName: " << cmpFileMovieName; cout << " ,ReadIndex: " << (int) readIndex << cout << ", qStart: "<< queryStart << ", qEnd: " << queryEnd << endl; cout << "Aligned sequence: "<< endl; cout << alignedSequence << endl; cout << "Original sequence: " << endl; cout << readSequence << endl; assert(0); } /* * Compute any necessary data fields. These usually involve * using differences of pulse indices, pulse widths, etc.. * Missing fields are stored as 0's. */ vector<float> readPulseMetric; vector<float> floatMetric; vector<UChar> qvMetric; vector<HalfWord> frameRateMetric; vector<uint32_t> timeMetric; int ungappedAlignedSequenceLength = alignedSequence.size(); floatMetric.resize(alignedSequenceLength+1); readPulseMetric.resize(alignedSequenceLength+1); qvMetric.resize(alignedSequenceLength+1); frameRateMetric.resize(alignedSequenceLength+1); timeMetric.resize(alignedSequenceLength+1); UInt i; UInt pi; HDFCmpExperimentGroup* expGroup = cmpReader.refAlignGroups[refGroupIndex]->readGroups[readGroupIndex]; if (cmpFile.readType == ReadType::CCS or useCcs) { if (!cmpReader.alnInfoGroup.numPasses.IsInitialized()) { cmpReader.alnInfoGroup.InitializeNumPasses(); } cmpReader.alnInfoGroup.numPasses.WriteToPos(&numPasses, 1, alignmentIndex); } if (metricOptions["StartTimeOffset"] == true) { if (!expGroup->startTimeOffset.IsInitialized()) { expGroup->startTimeOffset.Initialize(expGroup->experimentGroup, "StartTimeOffset"); } unsigned int readStartTimeOffset = sourceRead.startFrame[queryStart]; expGroup->startTimeOffset.WriteToPos(&readStartTimeOffset, 1, alignmentIndex); } if (metricOptions["QualityValue"] == true) { if (!expGroup->qualityValue.IsInitialized()) { expGroup->qualityValue.Initialize(expGroup->experimentGroup, "QualityValue"); } // Store start time normalized to frame rate. fill(qvMetric.begin(), qvMetric.end(), missingQualityValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { qvMetric[baseToAlignmentMap[i]] = sourceRead.qual[queryStart + i]; } qvMetric[qvMetric.size()-1] = 0; expGroup->qualityValue.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin); } if (metricOptions["InsertionQV"] == true) { if (!expGroup->insertionQV.IsInitialized()) { expGroup->insertionQV.Initialize(expGroup->experimentGroup, "InsertionQV"); } // Store start time normalized to frame rate. fill(qvMetric.begin(), qvMetric.end(), missingQualityValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { qvMetric[baseToAlignmentMap[i]] = sourceRead.insertionQV[queryStart+ i]; } qvMetric[qvMetric.size()-1] = 0; expGroup->insertionQV.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin); } if (metricOptions["MergeQV"] == true) { if (!expGroup->mergeQV.IsInitialized()) { expGroup->mergeQV.Initialize(expGroup->experimentGroup, "MergeQV"); } // Store start time normalized to frame rate. fill(qvMetric.begin(), qvMetric.end(), missingQualityValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { qvMetric[baseToAlignmentMap[i]] = sourceRead.mergeQV[queryStart+ i]; } qvMetric[qvMetric.size()-1] = 0; expGroup->mergeQV.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin); } if (metricOptions["DeletionQV"] == true) { if (!expGroup->deletionQV.IsInitialized()) { expGroup->deletionQV.Initialize(expGroup->experimentGroup, "DeletionQV"); } // Store start time normalized to frame rate. fill(qvMetric.begin(), qvMetric.end(), missingQualityValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { qvMetric[baseToAlignmentMap[i]] = sourceRead.deletionQV[queryStart+i]; } qvMetric[qvMetric.size()-1] = 0; expGroup->deletionQV.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin); } if (metricOptions["DeletionTag"] == true) { if (!expGroup->deletionTag.IsInitialized()) { expGroup->deletionTag.Initialize(expGroup->experimentGroup, "DeletionTag"); } vector<char> readDeletionTagMetric; readDeletionTagMetric.resize(readPulseMetric.size()); // Store start time normalized to frame rate. for (i = 0; i < readDeletionTagMetric.size()-1; i++ ) { readDeletionTagMetric[i] = '-'; } readDeletionTagMetric[i] = '\0'; for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { assert(baseToAlignmentMap[i] < readDeletionTagMetric.size()); readDeletionTagMetric[baseToAlignmentMap[i]] = sourceRead.deletionTag[queryStart+i]; } readDeletionTagMetric[readDeletionTagMetric.size()-1] = 0; expGroup->deletionTag.WriteToPos(&readDeletionTagMetric[0], readDeletionTagMetric.size(), offsetBegin); } if (metricOptions["PulseIndex"] == true) { if (!expGroup->pulseIndex.IsInitialized()) { expGroup->pulseIndex.Initialize(expGroup->experimentGroup, "PulseIndex"); } vector<uint32_t> readPulseIndexMetric; fill(readPulseIndexMetric.begin(), readPulseIndexMetric.end(), missingPulseIndex); readPulseIndexMetric.resize(readPulseMetric.size()); // Store start time normalized to frame rate. assert(readPulseIndexMetric.size() > 0); for (i = 0; i < readPulseIndexMetric.size(); i++ ) { readPulseIndexMetric[i] = 0; } for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { readPulseIndexMetric[baseToAlignmentMap[i]] = sourceRead.pulseIndex[queryStart+i]; } readPulseIndexMetric[readPulseIndexMetric.size()-1] = 0; expGroup->pulseIndex.WriteToPos(&readPulseIndexMetric[0], readPulseIndexMetric.size(), offsetBegin); } if (metricOptions["SubstitutionTag"] == true) { if (!expGroup->substitutionTag.IsInitialized()) { expGroup->substitutionTag.Initialize(expGroup->experimentGroup, "SubstitutionTag"); } vector<char> readSubstitutionTagMetric; readSubstitutionTagMetric.resize(readPulseMetric.size()); // Store start time normalized to frame rate. for (i = 0; i < readSubstitutionTagMetric.size()-1; i++ ) { readSubstitutionTagMetric[i] = '-'; } readSubstitutionTagMetric[i] = '\0'; for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { readSubstitutionTagMetric[baseToAlignmentMap[i]] = sourceRead.substitutionTag[queryStart+i]; } readSubstitutionTagMetric[readSubstitutionTagMetric.size()-1] = 0; expGroup->substitutionTag.WriteToPos(&readSubstitutionTagMetric[0], readSubstitutionTagMetric.size(), offsetBegin); } if (metricOptions["SubstitutionQV"] == true) { if (!expGroup->substitutionQV.IsInitialized()) { expGroup->substitutionQV.Initialize(expGroup->experimentGroup, "SubstitutionQV"); } // Store start time normalized to frame rate. fill(qvMetric.begin(), qvMetric.end(), missingQualityValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { qvMetric[baseToAlignmentMap[i]] = sourceRead.substitutionQV[queryStart+i]; } qvMetric[qvMetric.size()-1] = 0; expGroup->substitutionQV.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin); } if (metricOptions["ClassifierQV"] == true) { if (!expGroup->classifierQV.IsInitialized()) { expGroup->classifierQV.Initialize(expGroup->experimentGroup, "ClassifierQV"); } // Store start time normalized to frame rate. fill(floatMetric.begin(), floatMetric.end(), missingQualityValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { floatMetric[baseToAlignmentMap[i]] = sourceRead.classifierQV[i+queryStart]; } floatMetric[floatMetric.size()-1] = 0; expGroup->classifierQV.WriteToPos(&floatMetric[0], floatMetric.size(), offsetBegin); } if (metricOptions["StartFrame"] == true) { if (!expGroup->startTime.IsInitialized()) { expGroup->startTime.Initialize(expGroup->experimentGroup, "StartFrame"); } if (useBaseFile) { sourceRead.startFrame = new unsigned int[sourceRead.length]; copy(sourceRead.preBaseFrames, &sourceRead.preBaseFrames[sourceRead.length], sourceRead.startFrame); for (i = 0; i < sourceRead.length-1; i++) { sourceRead.startFrame[i+1] += sourceRead.widthInFrames[i]; } partial_sum(sourceRead.startFrame, &sourceRead.startFrame[sourceRead.length], sourceRead.startFrame); } // Store start time normalized to frame rate. fill(timeMetric.begin(), timeMetric.end(), missingPulseIndex); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { timeMetric[baseToAlignmentMap[i]] = sourceRead.startFrame[i+queryStart]; } timeMetric[timeMetric.size()-1] = 0; expGroup->startTime.WriteToPos(&timeMetric[0], timeMetric.size(), offsetBegin); } if (metricOptions["PulseWidth"] == true) { if (!expGroup->pulseWidth.IsInitialized()) { expGroup->pulseWidth.Initialize(expGroup->experimentGroup, "PulseWidth"); } // Store start time normalized to frame rate. fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue); // // For legacy reasons, it's possible the width in frames is // stored in the bas file. If this is the case, use the width // in frames there. Otherwise, use the width in frames stored // in the pls file. for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { frameRateMetric[baseToAlignmentMap[i]] = sourceRead.widthInFrames[queryStart + i]; } frameRateMetric[frameRateMetric.size()-1] = 0; expGroup->pulseWidth.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin); } if (metricOptions["PreBaseFrames"] == true) { if (!expGroup->preBaseFrames.IsInitialized()) { expGroup->preBaseFrames.Initialize(expGroup->experimentGroup, "PreBaseFrames"); } // Compute width in frames normalized to frame rate. fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { frameRateMetric[baseToAlignmentMap[i]] = sourceRead.preBaseFrames[i+queryStart]; } frameRateMetric[frameRateMetric.size()-1] = 0; expGroup->preBaseFrames.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin); } if (metricOptions["WidthInFrames"] == true) { if (!expGroup->widthInFrames.IsInitialized()) { expGroup->widthInFrames.Initialize(expGroup->experimentGroup, "WidthInFrames"); } // Compute width in frames normalized to frame rate. fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { if (usePulseFile) { frameRateMetric[baseToAlignmentMap[i]] = sourceRead.widthInFrames[i+queryStart]; } else { frameRateMetric[baseToAlignmentMap[i]] = sourceRead.widthInFrames[i+queryStart]; } } frameRateMetric[frameRateMetric.size()-1] = 0; expGroup->widthInFrames.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin); } if (metricOptions["pkmid"] == true) { if (!expGroup->pkmid.IsInitialized()) { expGroup->pkmid.Initialize(expGroup->experimentGroup, "pkmid"); } for (i = 0; i < readPulseMetric.size(); i++ ) { readPulseMetric[i] = NaN; } for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { readPulseMetric[baseToAlignmentMap[i]] = sourceRead.midSignal[i+queryStart]; } readPulseMetric[readPulseMetric.size()-1] = 0; expGroup->pkmid.WriteToPos(&readPulseMetric[0], readPulseMetric.size(), offsetBegin); } if (metricOptions["IPD"] == true) { if (!expGroup->ipd.IsInitialized()) { expGroup->ipd.Initialize(expGroup->experimentGroup, "IPD"); } fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { // // The IPD is undefined for the first base in a read. // if (usePulseFile ) { if (queryStart == 0 and i == 0) { frameRateMetric[baseToAlignmentMap[i]] = 0; } else { frameRateMetric[baseToAlignmentMap[i]] = (sourceRead.startFrame[i+queryStart] - sourceRead.startFrame[i+queryStart-1] - sourceRead.widthInFrames[i+queryStart-1]); } } else if (useBaseFile) { frameRateMetric[baseToAlignmentMap[i]] = sourceRead.preBaseFrames[i + queryStart]; } } frameRateMetric[frameRateMetric.size()-1] = 0; expGroup->ipd.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin); } if (metricOptions["Light"] == true) { if (!expGroup->light.IsInitialized()) { expGroup->light.Initialize(expGroup->experimentGroup, "Light"); } fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { frameRateMetric[baseToAlignmentMap[i]] = sourceRead.meanSignal[i+queryStart]; frameRateMetric[baseToAlignmentMap[i]] = (frameRateMetric[baseToAlignmentMap[i]] * sourceRead.widthInFrames[i+queryStart]); } frameRateMetric[frameRateMetric.size()-1] = 0; expGroup->light.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin); } sourceRead.Free(); Free(sourceRead.meanSignal); Free(sourceRead.maxSignal); Free(sourceRead.midSignal); Free(sourceRead.startFrame); Free(sourceRead.classifierQV); Free(sourceRead.widthInFrames); } if (byRead == true) { if (useBaseFile) { hdfBasReader.Close(); } if (cmpFile.readType == ReadType::CCS or useCcs) { hdfCcsReader.Close(); } if (usePulseFile) { hdfPlsReader.Close(); } } } // done loading movies cmpReader.Close(); }