bool SubreadConverter::ConvertFile(HDFBasReader* reader, PacBio::BAM::BamWriter* writer, PacBio::BAM::BamWriter* scrapsWriter) { assert(reader); // initialize with default values (shared across all unmapped subreads) BamRecordImpl bamRecord; // read region table info std::unique_ptr<HDFRegionTableReader> const regionTableReader(new HDFRegionTableReader); RegionTable regionTable; string fn = filenameForReader_[reader]; assert(!fn.empty()); if (regionTableReader->Initialize(fn) == 0) { AddErrorMessage("could not read region table on "+fn); return false; } regionTable.Reset(); regionTableReader->ReadTable(regionTable); regionTableReader->Close(); // initialize read scores InitReadScores(reader); // fetch records from HDF5 file SMRTSequence smrtRecord; while (reader->GetNext(smrtRecord)) { // compute subread & adapter intervals SubreadInterval hqInterval; deque<SubreadInterval> subreadIntervals; deque<SubreadInterval> adapterIntervals; try { hqInterval = ComputeSubreadIntervals(&subreadIntervals, &adapterIntervals, regionTable, smrtRecord.zmwData.holeNumber, smrtRecord.length); } catch (runtime_error& e) { AddErrorMessage(string(e.what())); smrtRecord.Free(); return false; } // sequencing ZMW if (IsSequencingZmw(smrtRecord)) { // write subreads to main BAM file for (const SubreadInterval& interval : subreadIntervals) { // skip invalid or 0-sized intervals if (interval.End <= interval.Start) continue; if (!WriteSubreadRecord(smrtRecord, interval.Start, interval.End, ReadGroupId(), static_cast<uint8_t>(interval.LocalContextFlags), writer)) { smrtRecord.Free(); return false; } } // if scraps BAM file present if (scrapsWriter) { // write 5-end LQ sequence if (hqInterval.Start > 0) { if (!WriteLowQualityRecord(smrtRecord, 0, hqInterval.Start, ScrapsReadGroupId(), scrapsWriter)) { smrtRecord.Free(); return false; } } // write adapters for (const SubreadInterval& interval : adapterIntervals) { // skip invalid or 0-sized adapters if (interval.End <= interval.Start) continue; 
if (!WriteAdapterRecord(smrtRecord, interval.Start, interval.End, ScrapsReadGroupId(), scrapsWriter)) { smrtRecord.Free(); return false; } } // write 3'-end LQ sequence if (hqInterval.End < smrtRecord.length) { if (!WriteLowQualityRecord(smrtRecord, hqInterval.End, smrtRecord.length, ScrapsReadGroupId(), scrapsWriter)) { smrtRecord.Free(); return false; } } } } // sequencing ZMW // non-sequencing ZMW else { assert(!IsSequencingZmw(smrtRecord)); // only write these if scraps BAM present & we are in 'internal mode' if (settings_.isInternal && scrapsWriter) { // write 5-end LQ sequence to scraps BAM if (hqInterval.Start > 0) { if (!WriteLowQualityRecord(smrtRecord, 0, hqInterval.Start, ScrapsReadGroupId(), scrapsWriter)) { smrtRecord.Free(); return false; } } // write subreads & adapters to scraps BAM, sorted by query start while (!subreadIntervals.empty() && !adapterIntervals.empty()) { const SubreadInterval& subread = subreadIntervals.front(); const SubreadInterval& adapter = adapterIntervals.front(); assert(subread.Start != adapter.Start); if (subread.Start < adapter.Start) { if (!WriteFilteredRecord(smrtRecord, subread.Start, subread.End, ScrapsReadGroupId(), static_cast<uint8_t>(subread.LocalContextFlags), scrapsWriter)) { smrtRecord.Free(); return false; } subreadIntervals.pop_front(); } else { if (!WriteAdapterRecord(smrtRecord, adapter.Start, adapter.End, ScrapsReadGroupId(), scrapsWriter)) { smrtRecord.Free(); return false; } adapterIntervals.pop_front(); } } // flush any traling subread intervals while (!subreadIntervals.empty()) { assert(adapterIntervals.empty()); const SubreadInterval& subread = subreadIntervals.front(); if (!WriteFilteredRecord(smrtRecord, subread.Start, subread.End, ScrapsReadGroupId(), static_cast<uint8_t>(subread.LocalContextFlags), scrapsWriter)) { smrtRecord.Free(); return false; } subreadIntervals.pop_front(); } // flush any remaining adapter intervals while (!adapterIntervals.empty()) { assert(subreadIntervals.empty()); const 
SubreadInterval& adapter = adapterIntervals.front(); if (!WriteAdapterRecord(smrtRecord, adapter.Start, adapter.End, ScrapsReadGroupId(), scrapsWriter)) { smrtRecord.Free(); return false; } adapterIntervals.pop_front(); } // write 3'-end LQ sequence to scraps BAM if (hqInterval.End < smrtRecord.length) { if (!WriteLowQualityRecord(smrtRecord, hqInterval.End, smrtRecord.length, ScrapsReadGroupId(), scrapsWriter)) { smrtRecord.Free(); return false; } } } } // non-sequencing ZMW smrtRecord.Free(); } // if we get here, all OK return true; }
// End-to-end check of subread conversion: runs bax2bam with "--subread" on one
// BAX file, then re-reads the BAX data and the generated BAM side by side and
// verifies the BAM header, read group, and every subread record that can be
// matched to a BAX-derived subread interval.
TEST(SubreadsTest, EndToEnd_Multiple)
{
    // setup
    const string movieName = "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0";

    vector<string> baxFilenames;
    baxFilenames.push_back(tests::Data_Dir + "/" + movieName + ".1.bax.h5");

    // output names the conversion is expected to produce (removed again at the end)
    const string generatedBam = movieName + ".subreads.bam";
    const string scrapBam = movieName + ".scraps.bam";

    // run conversion
    const int result = RunBax2Bam(baxFilenames, "--subread");
    EXPECT_EQ(0, result);

    // open BAX reader on original data
    HDFBasReader baxReader;
    baxReader.IncludeField("Basecall");
    baxReader.IncludeField("DeletionQV");
    baxReader.IncludeField("DeletionTag");
    baxReader.IncludeField("InsertionQV");
    baxReader.IncludeField("PreBaseFrames");
    baxReader.IncludeField("MergeQV");
    baxReader.IncludeField("SubstitutionQV");
    baxReader.IncludeField("HQRegionSNR");
    // not using SubTag or PulseWidth here

    // expected read-group metadata, filled from the BAX file's run info below
    string baxBasecallerVersion;
    string baxBindingKit;
    string baxSequencingKit;

    const int initOk = baxReader.Initialize(baxFilenames.front());
    EXPECT_EQ(1, initOk);
    if (initOk == 1) {
        if (baxReader.scanDataReader.fileHasScanData && baxReader.scanDataReader.initializedRunInfoGroup) {

            // kit attributes are optional: only read the ones the file provides
            if (baxReader.scanDataReader.runInfoGroup.ContainsAttribute("BindingKit")) {
                HDFAtom<std::string> bkAtom;
                if (bkAtom.Initialize(baxReader.scanDataReader.runInfoGroup, "BindingKit")) {
                    bkAtom.Read(baxBindingKit);
                    bkAtom.dataspace.close();
                }
            }
            if (baxReader.scanDataReader.runInfoGroup.ContainsAttribute("SequencingKit")) {
                HDFAtom<std::string> skAtom;
                if (skAtom.Initialize(baxReader.scanDataReader.runInfoGroup, "SequencingKit")) {
                    skAtom.Read(baxSequencingKit);
                    skAtom.dataspace.close();
                }
            }
        }
        baxReader.GetChangeListID(baxBasecallerVersion);
    }

    // read region table info
    boost::scoped_ptr<HDFRegionTableReader> regionTableReader(new HDFRegionTableReader);
    RegionTable regionTable;
    std::string fn = baxFilenames.front();
    EXPECT_TRUE(regionTableReader->Initialize(fn) != 0);
    regionTable.Reset();
    regionTableReader->ReadTable(regionTable);
    regionTableReader->Close();

    // Everything that touches the generated BAM runs inside EXPECT_NO_THROW, so
    // an exception is reported as a test failure. Note that the file cleanup at
    // the end of this block is skipped if an exception is thrown or one of the
    // ASSERT_* checks fails.
    EXPECT_NO_THROW(
    {
        // open BAM file
        BamFile bamFile(generatedBam);

        // check BAM header information
        const BamHeader& header = bamFile.Header();
        EXPECT_EQ(string("1.5"), header.Version());
        EXPECT_EQ(string("unknown"), header.SortOrder());
        EXPECT_EQ(string("3.0.1"), header.PacBioBamVersion());
        EXPECT_TRUE(header.Sequences().empty());
        EXPECT_TRUE(header.Comments().empty());
        ASSERT_FALSE(header.Programs().empty());

        // only the first read group is inspected
        const vector<string> readGroupIds = header.ReadGroupIds();
        ASSERT_FALSE(readGroupIds.empty());
        const ReadGroupInfo& rg = header.ReadGroup(readGroupIds.front());

        // expected read group ID is derived by MakeMD5 from "<movieName>//SUBREAD"
        // (third argument 8 is presumably the ID length - confirm against MakeMD5)
        string rawId = movieName + "//SUBREAD";
        string md5Id;
        MakeMD5(rawId, md5Id, 8);
        EXPECT_EQ(md5Id, rg.Id());

        EXPECT_EQ(string("PACBIO"), rg.Platform());
        EXPECT_EQ(movieName, rg.MovieName());

        // fields the converter is expected to leave unset
        EXPECT_TRUE(rg.SequencingCenter().empty());
        EXPECT_TRUE(rg.Date().empty());
        EXPECT_TRUE(rg.FlowOrder().empty());
        EXPECT_TRUE(rg.KeySequence().empty());
        EXPECT_TRUE(rg.Library().empty());
        EXPECT_TRUE(rg.Programs().empty());
        EXPECT_TRUE(rg.PredictedInsertSize().empty());
        EXPECT_TRUE(rg.Sample().empty());

        // PacBio-specific read group fields; kit/basecaller values must match the BAX file
        EXPECT_EQ("SUBREAD", rg.ReadType());
        EXPECT_EQ(baxBasecallerVersion, rg.BasecallerVersion());
        EXPECT_EQ(baxBindingKit, rg.BindingKit());
        EXPECT_EQ(baxSequencingKit, rg.SequencingKit());
        EXPECT_EQ(75, std::stod(rg.FrameRateHz()));
        EXPECT_EQ("dq", rg.BaseFeatureTag(BaseFeature::DELETION_QV));
        EXPECT_EQ("dt", rg.BaseFeatureTag(BaseFeature::DELETION_TAG));
        EXPECT_EQ("iq", rg.BaseFeatureTag(BaseFeature::INSERTION_QV));
        EXPECT_EQ("ip", rg.BaseFeatureTag(BaseFeature::IPD));
        EXPECT_EQ("mq", rg.BaseFeatureTag(BaseFeature::MERGE_QV));
        EXPECT_EQ("sq", rg.BaseFeatureTag(BaseFeature::SUBSTITUTION_QV));
        EXPECT_FALSE(rg.HasBaseFeature(BaseFeature::SUBSTITUTION_TAG));
        EXPECT_EQ(FrameCodec::V1, rg.IpdCodec());

        // compare each BAM record against the matching subread interval of the
        // current BAX record (state below is carried across loop iterations)
        SMRTSequence baxRecord;                     // current BAX record
        UInt holeNumber = 0;                        // hole number of baxRecord
        vector<float> hqSnr;                        // expected per-channel SNR, order A, C, G, T
        size_t intervalIdx = 0;                     // next interval of baxRecord to compare
        vector<SubreadInterval> subreadIntervals;   // intervals computed for baxRecord
        size_t numTested = 0;                       // number of BAM records compared

        EntireFileQuery entireFile(bamFile);
        for (BamRecord& bamRecord : entireFile) {

            // All intervals of the current BAX record are consumed (or none was
            // loaded yet): advance to the next BAX record that yields at least
            // one subread interval.
            if (intervalIdx >= subreadIntervals.size()) {
                while (baxReader.GetNext(baxRecord)) {
                    holeNumber = baxRecord.zmwData.holeNumber;

                    ComputeSubreadIntervals(&subreadIntervals, regionTable, holeNumber);

                    /* this is for debugging subread interval problems
                    int hqStart = 0;
                    int hqEnd = 0;
                    int hqScore = 0;
                    LookupHQRegion(holeNumber, regionTable, hqStart, hqEnd, hqScore);

                    vector<ReadInterval> subreadIntervals_;
                    CollectSubreadIntervals(baxRecord, &regionTable, subreadIntervals_);

                    for (int i = subreadIntervals_.size() - 1; i >= 0; --i) {
                        auto& in = subreadIntervals_[i];
                        int inStart = max(hqStart, in.start);
                        int inEnd = min(hqEnd, in.end);
                        if (inEnd <= inStart)
                            subreadIntervals_.erase(subreadIntervals_.begin() + i);
                    }

                    cerr << "hqRegion: " << hqStart << ", " << hqEnd << endl;
                    cerr << "subreadRegions:" << endl;
                    for (const auto& in : subreadIntervals_)
                        cerr << "  l, r: " << in.start << ", " << in.end << endl;

                    cerr << "adapterDerived:" << endl;
                    for (const auto& in : subreadIntervals)
                        cerr << "  l, r: " << in.Start << ", " << in.End << endl;
                    cerr << endl;
                    // */

                    // BAX record without subread intervals: try the next one
                    if (subreadIntervals.empty())
                        continue;

                    intervalIdx = 0;

                    hqSnr.clear();
                    hqSnr.push_back(baxRecord.HQRegionSnr('A'));
                    hqSnr.push_back(baxRecord.HQRegionSnr('C'));
                    hqSnr.push_back(baxRecord.HQRegionSnr('G'));
                    hqSnr.push_back(baxRecord.HQRegionSnr('T'));

                    EXPECT_GT(hqSnr[0], 0);
                    EXPECT_GT(hqSnr[1], 0);
                    EXPECT_GT(hqSnr[2], 0);
                    EXPECT_GT(hqSnr[3], 0);

                    // found a BAX record with intervals: leave the search loop
                    goto compare;
                }

                // BAX reader is exhausted: nothing left to compare against,
                // so stop iterating over the BAM records
                goto cleanup;
            }

        compare:
            // core BAM fields expected for an unmapped record
            const BamRecordImpl& bamRecordImpl = bamRecord.Impl();
            EXPECT_EQ(4680,bamRecordImpl.Bin());
            EXPECT_EQ(0, bamRecordImpl.InsertSize());
            EXPECT_EQ(255, bamRecordImpl.MapQuality());
            EXPECT_EQ(-1, bamRecordImpl.MatePosition());
            EXPECT_EQ(-1, bamRecordImpl.MateReferenceId());
            EXPECT_EQ(-1, bamRecordImpl.Position());
            EXPECT_EQ(-1, bamRecordImpl.ReferenceId());
            EXPECT_FALSE(bamRecordImpl.IsMapped());

            const int subreadStart = subreadIntervals[intervalIdx].Start;
            const int subreadEnd = subreadIntervals[intervalIdx].End;

            // record name format: <movie>/<hole number>/<start>_<end>
            const string expectedName = movieName + "/" +
                                        to_string(holeNumber) + "/" +
                                        to_string(subreadStart) + "_" +
                                        to_string(subreadEnd);
            EXPECT_EQ(expectedName, bamRecordImpl.Name());

            using PacBio::BAM::QualityValue;
            using PacBio::BAM::QualityValues;

            // bases must equal the [subreadStart, subreadEnd) slice of the BAX read
            const DNALength length = subreadEnd - subreadStart;

            string expectedSequence;
            expectedSequence.assign((const char*)baxRecord.seq + subreadStart, length);

            const string bamSequence = bamRecord.Sequence();
            const QualityValues bamQualities = bamRecord.Qualities();
            EXPECT_EQ(expectedSequence, bamSequence);
            EXPECT_TRUE(bamQualities.empty());

            // per-base QV tracks: BAM index i corresponds to BAX position subreadStart + i
            const QualityValues bamDeletionQVs = bamRecord.DeletionQV();
            const QualityValues bamInsertionQVs = bamRecord.InsertionQV();
            const QualityValues bamMergeQVs = bamRecord.MergeQV();
            const QualityValues bamSubstitutionQVs = bamRecord.SubstitutionQV();

            for (size_t i = 0; i < length; ++i) {
                const size_t pos = subreadStart + i;

                EXPECT_EQ((QualityValue)baxRecord.GetDeletionQV(pos), bamDeletionQVs.at(i));
                EXPECT_EQ((QualityValue)baxRecord.GetInsertionQV(pos), bamInsertionQVs.at(i));
                EXPECT_EQ((QualityValue)baxRecord.GetMergeQV(pos), bamMergeQVs.at(i));
                EXPECT_EQ((QualityValue)baxRecord.GetSubstitutionQV(pos), bamSubstitutionQVs.at(i));
            }

            // tag tracks are compared only when the BAX record carries them
            if (baxRecord.deletionTag) {
                string expectedDeletionTags;
                expectedDeletionTags.assign((char*)baxRecord.deletionTag + subreadStart,
                                            (char*)baxRecord.deletionTag + subreadStart + length);
                const string& bamDeletionTags = bamRecord.DeletionTag();
                EXPECT_EQ(expectedDeletionTags, bamDeletionTags);
            }

            if (baxRecord.substitutionTag) {
                string expectedSubstitutionTags;
                expectedSubstitutionTags.assign((char*)baxRecord.substitutionTag + subreadStart,
                                                (char*)baxRecord.substitutionTag + subreadStart + length);
                const string& bamSubstitutionTags = bamRecord.SubstitutionTag();
                EXPECT_EQ(expectedSubstitutionTags, bamSubstitutionTags);
            }

            // TODO: IPDs

            const LocalContextFlags ctxFlags = subreadIntervals[intervalIdx].LocalContextFlags;

            // PacBio-specific record fields
            EXPECT_EQ(md5Id, bamRecord.ReadGroupId());
            EXPECT_EQ(movieName, bamRecord.MovieName());
            EXPECT_EQ(1, bamRecord.NumPasses());
            EXPECT_EQ(holeNumber, bamRecord.HoleNumber());
            EXPECT_EQ(subreadStart, bamRecord.QueryStart());
            EXPECT_EQ(subreadEnd, bamRecord.QueryEnd());
            EXPECT_EQ(hqSnr, bamRecord.SignalToNoise());
            EXPECT_EQ(ctxFlags, bamRecord.LocalContextFlags());

            numTested++;
            intervalIdx++;
        }

    cleanup:
        // more than one record must have been compared for the test to be meaningful
        EXPECT_GT(numTested, 1);

        // cleanup
        baxReader.Close();
        RemoveFile(generatedBam);
        RemoveFile(scrapBam);

    }); // EXPECT_NO_THROW
// NOTE(review): the brace closing the TEST body is not visible in this excerpt.
bool HqRegionConverter::ConvertFile(HDFBasReader* reader, PacBio::BAM::BamWriter* writer, PacBio::BAM::BamWriter* scrapsWriter) { assert(reader); // read region table info std::unique_ptr<HDFRegionTableReader> const regionTableReader(new HDFRegionTableReader); RegionTable regionTable; std::string fn = filenameForReader_[reader]; assert(!fn.empty()); if (regionTableReader->Initialize(fn) == 0) { AddErrorMessage("could not read region table on "+fn); return false; } regionTable.Reset(); regionTableReader->ReadTable(regionTable); regionTableReader->Close(); // initialize read scores InitReadScores(reader); // fetch records from HDF5 file SMRTSequence smrtRecord; int hqStart, hqEnd, score; while (reader->GetNext(smrtRecord)) { // attempt get high quality region if (!LookupHQRegion(smrtRecord.zmwData.holeNumber, regionTable, hqStart, hqEnd, score)) { stringstream s; s << "could not find HQ region for hole number: " << smrtRecord.zmwData.holeNumber; AddErrorMessage(s.str()); smrtRecord.Free(); return false; } // Catch and repair 1-off errors in the HQ region hqEnd = (hqEnd == static_cast<int>(smrtRecord.length)-1) ? 
smrtRecord.length : hqEnd; // sequencing ZMW if (IsSequencingZmw(smrtRecord)) { // write HQRegion to main BAM file if (hqStart < hqEnd) { if (!WriteRecord(smrtRecord, hqStart, hqEnd, ReadGroupId(), writer)) { smrtRecord.Free(); return false; } } // if scraps BAM file present if (scrapsWriter) { // write 5'-end LQ sequence if (hqStart > 0) { if (!WriteLowQualityRecord(smrtRecord, 0, hqStart, ScrapsReadGroupId(), scrapsWriter)) { smrtRecord.Free(); return false; } } // write 3'-end LQ sequence if (static_cast<size_t>(hqEnd) < smrtRecord.length) { if (!WriteLowQualityRecord(smrtRecord, hqEnd, smrtRecord.length, ScrapsReadGroupId(), scrapsWriter)) { smrtRecord.Free(); return false; } } } } // non-sequencing ZMW else { assert(!IsSequencingZmw(smrtRecord)); // only write these if scraps BAM present & we are in 'internal mode' if (settings_.isInternal && scrapsWriter) { // write 5'-end LQ sequence if (hqStart > 0) { if (!WriteLowQualityRecord(smrtRecord, 0, hqStart, ScrapsReadGroupId(), scrapsWriter)) { smrtRecord.Free(); return false; } } // write HQRegion to scraps BAM file if (hqStart < hqEnd) { if (!WriteFilteredRecord(smrtRecord, hqStart, hqEnd, ScrapsReadGroupId(), scrapsWriter)) { smrtRecord.Free(); return false; } } // write 3'-end LQ sequence if (static_cast<size_t>(hqEnd) < smrtRecord.length) { if (!WriteLowQualityRecord(smrtRecord, hqEnd, smrtRecord.length, ScrapsReadGroupId(), scrapsWriter)) { smrtRecord.Free(); return false; } } } } smrtRecord.Free(); } // if we get here, all OK return true; }