int main(int argc, char* argv[]) { string basFileName; if (argc < 2) { cout << "usage: bas2 file.bas.h5 " << endl; cout << "bas2 = bas ls" << endl; cout << " -t --titleindices Print the titles of reads and their indices in the file." << endl; } basFileName = argv[1]; int argi = 1; bool printSummary = true; bool printTitleIndices = false; while (argi < argc) { if (strcmp(argv[argi], "-t") == 0 or strcmp(argv[argi], "--titleindices") == 0) { printTitleIndices = true; printSummary = false; } ++argi; } HDFBasReader reader; reader.Initialize(basFileName); int numReads = reader.GetNumReads(); cout << "Num reads: " << numReads << endl; vector<int> readLengths; reader.GetAllReadLengths(readLengths); DNALength totalLength = 0; VectorIndex i; int numAbove100 = 0; for (i = 0; i< readLengths.size(); i++ ){ totalLength += readLengths[i]; if (readLengths[i] > 100) { ++numAbove100; } } if (printSummary) { if (numReads > 0 ){ cout << "Average read length: " << totalLength / (1.0*numReads) << endl; } cout << "A total of " << numAbove100 << " have length > 100" << endl; return 0; } if (printTitleIndices) { FASTQSequence seq; int seqIndex = 0; while (reader.GetNext(seq)) { cout << seqIndex << " " << seq.title << " " << seq.length << endl; ++seqIndex; } } }
int main(int argc, char* argv[]) { string inFileName, outFileName; if (argc < 3) { cout << "usage: filterHDFPls in out idx1 [idx2 idx3]..." << endl; exit(1); } inFileName = argv[1]; outFileName = argv[2]; vector<int> readIndices; int argi = 3; int minLength = 0; int minAvgQual = 0; while (argi < argc) { if (strcmp(argv[argi], "-minLength") == 0) { minLength = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-minAvgQual") == 0) { minAvgQual = atoi(argv[++argi]); } ++argi; } std::sort(readIndices.begin(), readIndices.end()); HDFBasReader reader; HDFBasWriter writer; reader.Initialize(inFileName); writer.Initialize(outFileName, reader.GetMovieName(), reader.GetRunCode()); int ri; int curReadIndex = 0; FASTQSequence seq; for (ri = 0; ri < readIndices.size(); ri++, curReadIndex++ ){ reader.GetNext(seq); bool skipRead = false; if (seq.length < minLength) { skipRead = true;} if (seq.GetAverageQuality() < minAvgQual) { skipRead = true; } if (skipRead) { continue; } // all ok, write read out. writer.Write(seq); } }
//
// End-to-end check of the "--subread" conversion.
//
// The test runs RunBax2Bam on a single .bax.h5 file, then reads the same
// BAX file (plus its region table) directly and verifies that
//   * the header / read group of the generated BAM carries the expected
//     values, and
//   * every BAM record matches the corresponding subread interval of the
//     BAX data (name, bases, QV tracks, tags, SNR, context flags).
//
// NOTE(review): despite the "_Multiple" suffix only the ".1.bax.h5" file is
// pushed onto baxFilenames -- confirm this is intended.
//
TEST(SubreadsTest, EndToEnd_Multiple) {
    // setup
    const string movieName = "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0";
    vector<string> baxFilenames;
    baxFilenames.push_back(tests::Data_Dir + "/" + movieName + ".1.bax.h5");
    const string generatedBam = movieName + ".subreads.bam";
    const string scrapBam = movieName + ".scraps.bam";

    // run conversion
    const int result = RunBax2Bam(baxFilenames, "--subread");
    EXPECT_EQ(0, result);

    // open BAX reader on original data
    HDFBasReader baxReader;
    baxReader.IncludeField("Basecall");
    baxReader.IncludeField("DeletionQV");
    baxReader.IncludeField("DeletionTag");
    baxReader.IncludeField("InsertionQV");
    baxReader.IncludeField("PreBaseFrames");
    baxReader.IncludeField("MergeQV");
    baxReader.IncludeField("SubstitutionQV");
    baxReader.IncludeField("HQRegionSNR");
    // not using SubTag or PulseWidth here

    // Reference values pulled from the BAX file; they are compared against
    // the BAM read group further down.  They stay empty if the reader fails
    // to initialize or the attributes are absent.
    string baxBasecallerVersion;
    string baxBindingKit;
    string baxSequencingKit;
    const int initOk = baxReader.Initialize(baxFilenames.front());
    EXPECT_EQ(1, initOk);
    if (initOk == 1) {
        if (baxReader.scanDataReader.fileHasScanData && baxReader.scanDataReader.initializedRunInfoGroup) {
            if (baxReader.scanDataReader.runInfoGroup.ContainsAttribute("BindingKit")) {
                HDFAtom<std::string> bkAtom;
                if (bkAtom.Initialize(baxReader.scanDataReader.runInfoGroup, "BindingKit")) {
                    bkAtom.Read(baxBindingKit);
                    bkAtom.dataspace.close();
                }
            }
            if (baxReader.scanDataReader.runInfoGroup.ContainsAttribute("SequencingKit")) {
                HDFAtom<std::string> skAtom;
                if (skAtom.Initialize(baxReader.scanDataReader.runInfoGroup, "SequencingKit")) {
                    skAtom.Read(baxSequencingKit);
                    skAtom.dataspace.close();
                }
            }
        }
        baxReader.GetChangeListID(baxBasecallerVersion);
    }

    // read region table info
    boost::scoped_ptr<HDFRegionTableReader> regionTableReader(new HDFRegionTableReader);
    RegionTable regionTable;
    std::string fn = baxFilenames.front();
    EXPECT_TRUE(regionTableReader->Initialize(fn) != 0);
    regionTable.Reset();
    regionTableReader->ReadTable(regionTable);
    regionTableReader->Close();

    // All BAM access happens inside EXPECT_NO_THROW so that an exception
    // thrown while reading the generated file is reported as a test failure.
    EXPECT_NO_THROW(
    {
        // open BAM file
        BamFile bamFile(generatedBam);

        // check BAM header information
        const BamHeader& header = bamFile.Header();
        EXPECT_EQ(string("1.5"), header.Version());
        EXPECT_EQ(string("unknown"), header.SortOrder());
        EXPECT_EQ(string("3.0.1"), header.PacBioBamVersion());
        EXPECT_TRUE(header.Sequences().empty());
        EXPECT_TRUE(header.Comments().empty());
        ASSERT_FALSE(header.Programs().empty());

        const vector<string> readGroupIds = header.ReadGroupIds();
        ASSERT_FALSE(readGroupIds.empty());
        const ReadGroupInfo& rg = header.ReadGroup(readGroupIds.front());

        // Expected read group ID is derived from "<movieName>//SUBREAD" via
        // MakeMD5 (third argument 8 -- presumably the ID length; confirm
        // against MakeMD5).
        string rawId = movieName + "//SUBREAD";
        string md5Id;
        MakeMD5(rawId, md5Id, 8);
        EXPECT_EQ(md5Id, rg.Id());

        EXPECT_EQ(string("PACBIO"), rg.Platform());
        EXPECT_EQ(movieName, rg.MovieName());

        EXPECT_TRUE(rg.SequencingCenter().empty());
        EXPECT_TRUE(rg.Date().empty());
        EXPECT_TRUE(rg.FlowOrder().empty());
        EXPECT_TRUE(rg.KeySequence().empty());
        EXPECT_TRUE(rg.Library().empty());
        EXPECT_TRUE(rg.Programs().empty());
        EXPECT_TRUE(rg.PredictedInsertSize().empty());
        EXPECT_TRUE(rg.Sample().empty());

        EXPECT_EQ("SUBREAD", rg.ReadType());
        EXPECT_EQ(baxBasecallerVersion, rg.BasecallerVersion());
        EXPECT_EQ(baxBindingKit, rg.BindingKit());
        EXPECT_EQ(baxSequencingKit, rg.SequencingKit());
        EXPECT_EQ(75, std::stod(rg.FrameRateHz()));
        EXPECT_EQ("dq", rg.BaseFeatureTag(BaseFeature::DELETION_QV));
        EXPECT_EQ("dt", rg.BaseFeatureTag(BaseFeature::DELETION_TAG));
        EXPECT_EQ("iq", rg.BaseFeatureTag(BaseFeature::INSERTION_QV));
        EXPECT_EQ("ip", rg.BaseFeatureTag(BaseFeature::IPD));
        EXPECT_EQ("mq", rg.BaseFeatureTag(BaseFeature::MERGE_QV));
        EXPECT_EQ("sq", rg.BaseFeatureTag(BaseFeature::SUBSTITUTION_QV));
        EXPECT_FALSE(rg.HasBaseFeature(BaseFeature::SUBSTITUTION_TAG));
        EXPECT_EQ(FrameCodec::V1, rg.IpdCodec());

        // Compare every BAM record against the matching subread of the BAX
        // data.  baxRecord / holeNumber / hqSnr / subreadIntervals describe
        // the BAX read currently being consumed; intervalIdx is the next
        // subread interval of that read to be matched with a BAM record.
        SMRTSequence baxRecord;
        UInt holeNumber = 0;
        vector<float> hqSnr;
        size_t intervalIdx = 0;
        vector<SubreadInterval> subreadIntervals;
        size_t numTested = 0;

        EntireFileQuery entireFile(bamFile);
        for (BamRecord& bamRecord : entireFile) {

            // All intervals of the current BAX read are used up (or none
            // has been loaded yet): advance to the next BAX read that has
            // at least one subread interval.
            if (intervalIdx >= subreadIntervals.size()) {
                while (baxReader.GetNext(baxRecord)) {
                    holeNumber = baxRecord.zmwData.holeNumber;
                    ComputeSubreadIntervals(&subreadIntervals, regionTable, holeNumber);

                    /* this is for debugging subread interval problems
                    int hqStart = 0;
                    int hqEnd = 0;
                    int hqScore = 0;
                    LookupHQRegion(holeNumber, regionTable, hqStart, hqEnd, hqScore);
                    vector<ReadInterval> subreadIntervals_;
                    CollectSubreadIntervals(baxRecord, &regionTable, subreadIntervals_);
                    for (int i = subreadIntervals_.size() - 1; i >= 0; --i) {
                        auto& in = subreadIntervals_[i];
                        int inStart = max(hqStart, in.start);
                        int inEnd = min(hqEnd, in.end);
                        if (inEnd <= inStart)
                            subreadIntervals_.erase(subreadIntervals_.begin() + i);
                    }
                    cerr << "hqRegion: " << hqStart << ", " << hqEnd << endl;
                    cerr << "subreadRegions:" << endl;
                    for (const auto& in : subreadIntervals_)
                        cerr << " l, r: " << in.start << ", " << in.end << endl;
                    cerr << "adapterDerived:" << endl;
                    for (const auto& in : subreadIntervals)
                        cerr << " l, r: " << in.Start << ", " << in.End << endl;
                    cerr << endl;
                    // */

                    // Reads without subread intervals have no BAM records to
                    // compare against; try the next BAX read.
                    if (subreadIntervals.empty())
                        continue;

                    intervalIdx = 0;

                    // Per-channel SNR of this read, in A, C, G, T order.
                    hqSnr.clear();
                    hqSnr.push_back(baxRecord.HQRegionSnr('A'));
                    hqSnr.push_back(baxRecord.HQRegionSnr('C'));
                    hqSnr.push_back(baxRecord.HQRegionSnr('G'));
                    hqSnr.push_back(baxRecord.HQRegionSnr('T'));

                    EXPECT_GT(hqSnr[0], 0);
                    EXPECT_GT(hqSnr[1], 0);
                    EXPECT_GT(hqSnr[2], 0);
                    EXPECT_GT(hqSnr[3], 0);

                    // A usable BAX read was found; leave the search loop and
                    // compare it with the current BAM record.
                    goto compare;
                }

                // The BAX reader has no more reads: stop iterating over the
                // BAM file and run the final checks.
                goto cleanup;
            }

            compare:
            // Fixed values asserted on every record of the generated BAM.
            const BamRecordImpl& bamRecordImpl = bamRecord.Impl();
            EXPECT_EQ(4680,bamRecordImpl.Bin());
            EXPECT_EQ(0, bamRecordImpl.InsertSize());
            EXPECT_EQ(255, bamRecordImpl.MapQuality());
            EXPECT_EQ(-1, bamRecordImpl.MatePosition());
            EXPECT_EQ(-1, bamRecordImpl.MateReferenceId());
            EXPECT_EQ(-1, bamRecordImpl.Position());
            EXPECT_EQ(-1, bamRecordImpl.ReferenceId());
            EXPECT_FALSE(bamRecordImpl.IsMapped());

            // Record name is "<movie>/<hole>/<start>_<end>" of the interval.
            const int subreadStart = subreadIntervals[intervalIdx].Start;
            const int subreadEnd = subreadIntervals[intervalIdx].End;
            const string expectedName = movieName + "/" + to_string(holeNumber) + "/" + to_string(subreadStart) + "_" + to_string(subreadEnd);
            EXPECT_EQ(expectedName, bamRecordImpl.Name());

            using PacBio::BAM::QualityValue;
            using PacBio::BAM::QualityValues;

            // Bases of the BAM record must equal the [subreadStart,
            // subreadEnd) slice of the BAX read; the BAM record is expected
            // to carry no qualities.
            const DNALength length = subreadEnd - subreadStart;
            string expectedSequence;
            expectedSequence.assign((const char*)baxRecord.seq + subreadStart, length);
            const string bamSequence = bamRecord.Sequence();
            const QualityValues bamQualities = bamRecord.Qualities();
            EXPECT_EQ(expectedSequence, bamSequence);
            EXPECT_TRUE(bamQualities.empty());

            // Per-base QV tracks: BAM index i corresponds to BAX position
            // subreadStart + i.
            const QualityValues bamDeletionQVs = bamRecord.DeletionQV();
            const QualityValues bamInsertionQVs = bamRecord.InsertionQV();
            const QualityValues bamMergeQVs = bamRecord.MergeQV();
            const QualityValues bamSubstitutionQVs = bamRecord.SubstitutionQV();
            for (size_t i = 0; i < length; ++i) {
                const size_t pos = subreadStart + i;
                EXPECT_EQ((QualityValue)baxRecord.GetDeletionQV(pos), bamDeletionQVs.at(i));
                EXPECT_EQ((QualityValue)baxRecord.GetInsertionQV(pos), bamInsertionQVs.at(i));
                EXPECT_EQ((QualityValue)baxRecord.GetMergeQV(pos), bamMergeQVs.at(i));
                EXPECT_EQ((QualityValue)baxRecord.GetSubstitutionQV(pos), bamSubstitutionQVs.at(i));
            }

            // Tag tracks are only compared when the BAX read has them.
            if (baxRecord.deletionTag) {
                string expectedDeletionTags;
                expectedDeletionTags.assign((char*)baxRecord.deletionTag + subreadStart, (char*)baxRecord.deletionTag + subreadStart + length);
                const string& bamDeletionTags = bamRecord.DeletionTag();
                EXPECT_EQ(expectedDeletionTags, bamDeletionTags);
            }
            if (baxRecord.substitutionTag) {
                string expectedSubstitutionTags;
                expectedSubstitutionTags.assign((char*)baxRecord.substitutionTag + subreadStart, (char*)baxRecord.substitutionTag + subreadStart + length);
                const string& bamSubstitutionTags = bamRecord.SubstitutionTag();
                EXPECT_EQ(expectedSubstitutionTags, bamSubstitutionTags);
            }

            // TODO: IPDs

            const LocalContextFlags ctxFlags = subreadIntervals[intervalIdx].LocalContextFlags;
            EXPECT_EQ(md5Id, bamRecord.ReadGroupId());
            EXPECT_EQ(movieName, bamRecord.MovieName());
            EXPECT_EQ(1, bamRecord.NumPasses());
            EXPECT_EQ(holeNumber, bamRecord.HoleNumber());
            EXPECT_EQ(subreadStart, bamRecord.QueryStart());
            EXPECT_EQ(subreadEnd, bamRecord.QueryEnd());
            EXPECT_EQ(hqSnr, bamRecord.SignalToNoise());
            EXPECT_EQ(ctxFlags, bamRecord.LocalContextFlags());

            // Move on to the next interval of the current BAX read.
            numTested++;
            intervalIdx++;
        }

        cleanup:
        // Reached when either the BAM records or the BAX reads run out.
        // More than one record must have been compared.
        EXPECT_GT(numTested, 1);

        // cleanup
        baxReader.Close();
        RemoveFile(generatedBam);
        RemoveFile(scrapBam);
    }); // EXPECT_NO_THROW
int main(int argc, char* argv[]) { string program = "pls2fasta"; string versionString = VERSION; AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString); string plsFileName, fastaOutName; vector<string> plsFileNames; bool trimByRegion, maskByRegion; trimByRegion = false; maskByRegion = false; int argi = 3; RegionTable regionTable; string regionsFOFNName = ""; vector<string> regionFileNames; bool splitSubreads = true; int minSubreadLength = 0; bool addSimulatedData = false; bool printSimulatedCoordinate = false; bool printSimulatedSequenceIndex = false; bool printFastq = false; bool printCcs = false; int lineLength = 50; int minReadScore = 0; vector<int> holeNumbers; CommandLineParser clp; bool printOnlyBest = false; clp.SetProgramName(program); clp.SetVersion(versionString); clp.RegisterStringOption("in.pls.h5", &plsFileName, "Input pls.h5/bax.h5/fofn file.", true); clp.RegisterStringOption("out.fasta", &fastaOutName, "Output fasta/fastq file.", true); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterFlagOption("trimByRegion", &trimByRegion, "Trim away low quality regions."); clp.RegisterFlagOption("maskByRegion", &maskByRegion, "Mask low quality regions with 'N'."); clp.RegisterStringOption("regionTable", ®ionsFOFNName, "Optional HDF file with a /PulseData/Regions dataset."); clp.RegisterIntOption("minSubreadLength", &minSubreadLength, "Do not write subreads less than the specified length.", CommandLineParser::PositiveInteger); clp.RegisterFlagOption("noSplitSubreads", &splitSubreads, "Do not split reads on adapter sequences."); clp.RegisterIntListOption("holeNumber", &holeNumbers, "Only print this hole number (or list of numbers)."); clp.RegisterFlagOption("fastq", &printFastq, "Print in FASTQ format with quality."); clp.RegisterFlagOption("ccs", &printCcs, "Print de novo CCS sequences"); clp.RegisterIntOption("lineLength", &lineLength, "Specify fasta/fastq line length", CommandLineParser::PositiveInteger); clp.RegisterIntOption("minReadScore", 
&minReadScore, "Minimum read score to print a read. The score is " "a number between 0 and 1000 and represents the expected accuracy percentage * 10. " "A typical value would be between 750 and 800. This does not apply to ccs reads.", CommandLineParser::NonNegativeInteger); clp.RegisterFlagOption("best", &printOnlyBest, "If a CCS sequence exists, print this. Otherwise, print the longest" "subread. This does not support fastq."); string description = ("Converts pls.h5/bax.h5/fofn files to fasta or fastq files. Although fasta files are provided" " with every run, they are not trimmed nor split into subreads. This program takes " "additional annotation information, such as the subread coordinates and high quality regions " "and uses them to create fasta sequences that are substrings of all bases called. Most of the time " "you will want to trim low quality reads, so you should specify -trimByRegion."); clp.SetProgramSummary(description); clp.ParseCommandLine(argc, argv); cerr << "[INFO] " << GetTimestamp() << " [" << program << "] started." << endl; if (trimByRegion and maskByRegion) { cout << "ERROR! You cannot both trim and mask regions. Use one or the other." << endl; exit(1); } if (printFastq) { // Setting lineLength to 0 flags to print on one line. 
lineLength = 0; } if (FileOfFileNames::IsFOFN(plsFileName)) { FileOfFileNames::FOFNToList(plsFileName, plsFileNames); } else { plsFileNames.push_back(plsFileName); } if (regionsFOFNName == "") { regionFileNames = plsFileNames; } else { if (FileOfFileNames::IsFOFN(regionsFOFNName)) { FileOfFileNames::FOFNToList(regionsFOFNName, regionFileNames); } else { regionFileNames.push_back(regionsFOFNName); } } ofstream fastaOut; CrucialOpen(fastaOutName, fastaOut); int plsFileIndex; HDFRegionTableReader hdfRegionReader; sort(holeNumbers.begin(), holeNumbers.end()); for (plsFileIndex = 0; plsFileIndex < plsFileNames.size(); plsFileIndex++) { if (trimByRegion or maskByRegion or splitSubreads) { hdfRegionReader.Initialize(regionFileNames[plsFileIndex]); hdfRegionReader.ReadTable(regionTable); regionTable.SortTableByHoleNumber(); } ReaderAgglomerate reader; HDFBasReader ccsReader; if (printOnlyBest) { ccsReader.SetReadBasesFromCCS(); ccsReader.Initialize(plsFileNames[plsFileIndex]); } if (printCcs == false) { reader.IgnoreCCS(); } else { reader.hdfBasReader.SetReadBasesFromCCS(); } if (addSimulatedData) { reader.hdfBasReader.IncludeField("SimulatedCoordinate"); reader.hdfBasReader.IncludeField("SimulatedSequenceIndex"); } if (reader.SetReadFileName(plsFileNames[plsFileIndex]) == 0) { cout << "ERROR, could not determine file type." 
<< plsFileNames[plsFileIndex] << endl; exit(1); } if (reader.Initialize() == 0) { cout << "ERROR, could not initialize file " << plsFileNames[plsFileIndex] << endl; exit(1); } DNALength simulatedCoordinate; DNALength simulatedSequenceIndex; reader.SkipReadQuality(); SMRTSequence seq; vector<ReadInterval> subreadIntervals;; SMRTSequence ccsSeq; while (reader.GetNext(seq)) { if (printOnlyBest) { ccsReader.GetNext(ccsSeq); } if (holeNumbers.size() != 0 and binary_search(holeNumbers.begin(), holeNumbers.end(), seq.zmwData.holeNumber) == false) { continue; } if (seq.length == 0) { continue; } if (addSimulatedData) { reader.hdfBasReader.simulatedCoordinateArray.Read(reader.hdfBasReader.curRead-1, reader.hdfBasReader.curRead, &simulatedCoordinate); reader.hdfBasReader.simulatedSequenceIndexArray.Read(reader.hdfBasReader.curRead-1, reader.hdfBasReader.curRead, &simulatedSequenceIndex); } if (printCcs == true) { if (printFastq == false) { seq.PrintSeq(fastaOut); } else { seq.PrintFastq(fastaOut, lineLength); } continue; } // // Determine the high quality boundaries of the read. This is // the full read is no hq regions exist, or it is stated to // ignore regions. // DNALength hqReadStart, hqReadEnd; int hqRegionScore; if (GetReadTrimCoordinates(seq, seq.zmwData, regionTable, hqReadStart, hqReadEnd, hqRegionScore) == false or (trimByRegion == false and maskByRegion == false)) { hqReadStart = 0; hqReadEnd = seq.length; } // // Mask off the low quality portions of the reads. // if (maskByRegion) { if (hqReadStart > 0) { fill(&seq.seq[0], &seq.seq[hqReadStart], 'N'); } if (hqReadEnd != seq.length) { fill(&seq.seq[hqReadEnd], &seq.seq[seq.length], 'N'); } } // // Now possibly print the full read with masking. This could be handled by making a // if (splitSubreads == false) { ReadInterval wholeRead(0, seq.length); // The set of subread intervals is just the entire read. 
subreadIntervals.clear(); subreadIntervals.push_back(wholeRead); } else { // // Print subread coordinates no matter whether or not reads have subreads. // subreadIntervals.clear(); // clear old, new intervals are appended. CollectSubreadIntervals(seq, ®ionTable, subreadIntervals); } // // Output all subreads as separate sequences. // int intvIndex; SMRTSequence bestSubreadSequence; int bestSubreadScore = -1; int bestSubreadIndex = 0; int bestSubreadStart = 0, bestSubreadEnd = 0; SMRTSequence bestSubread; for (intvIndex = 0; intvIndex < subreadIntervals.size(); intvIndex++) { SMRTSequence subreadSequence, subreadSequenceRC; subreadSequence.subreadStart = subreadIntervals[intvIndex].start; subreadSequence.subreadEnd = subreadIntervals[intvIndex].end; // // When trimming by region, only output the parts of the // subread that overlap the hq region. // if (trimByRegion == true) { subreadSequence.subreadStart = max((DNALength) subreadIntervals[intvIndex].start, hqReadStart); subreadSequence.subreadEnd = min((DNALength) subreadIntervals[intvIndex].end, hqReadEnd); } if (subreadSequence.subreadStart >= subreadSequence.subreadEnd or subreadSequence.subreadEnd - subreadSequence.subreadStart <= minSubreadLength) { // // There is no high qualty portion of this subread. Skip it. // continue; } if (hqRegionScore < minReadScore) { continue; } // // Print the subread, adding the coordinates as part of the title. // subreadSequence.ReferenceSubstring(seq, subreadSequence.subreadStart, subreadSequence.subreadEnd - subreadSequence.subreadStart); stringstream titleStream; titleStream << seq.title; if (splitSubreads) { // // Add the subread coordinates if splitting on subread. // titleStream << "/" << subreadSequence.subreadStart << "_" << subreadSequence.subreadEnd; } // // If running on simulated data, add where the values were simulated from. 
// if (addSimulatedData) { titleStream << ((FASTASequence*)&seq)->title << "/chrIndex_" << simulatedSequenceIndex << "/position_"<< simulatedCoordinate; ((FASTASequence*)&seq)->CopyTitle(titleStream.str()); } subreadSequence.CopyTitle(titleStream.str()); // // Eventually replace with WriterAgglomerate. // if (printOnlyBest == false) { if (subreadSequence.length > 0) { if (printFastq == false) { ((FASTASequence*)&subreadSequence)->PrintSeq(fastaOut); } else { subreadSequence.PrintFastq(fastaOut, lineLength); } } delete[] subreadSequence.title; } else { int subreadWeightedScore = subreadSequence.length * hqRegionScore; if (subreadWeightedScore > bestSubreadScore) { bestSubreadIndex = intvIndex; bestSubread = subreadSequence; bestSubreadScore = subreadWeightedScore; } } } if (printOnlyBest) { if (ccsSeq.length > 0) { if (printFastq == false) { ccsSeq.PrintSeq(fastaOut); } else { ccsSeq.PrintFastq(fastaOut, ccsSeq.length); } } else { if (bestSubreadScore >= 0) { if (printFastq == false) { bestSubread.PrintSeq(fastaOut); } else { bestSubread.PrintFastq(fastaOut, bestSubread.length); } bestSubread.Free(); } } ccsSeq.Free(); } seq.Free(); } reader.Close(); hdfRegionReader.Close(); } cerr << "[INFO] " << GetTimestamp() << " [" << program << "] ended." << endl; }