/// Converts every non-empty CCS record delivered by `reader` and hands it to
/// `writer` through WriteRecord().
///
/// @param reader  source of CCS records; must not be null (asserted).
/// @param writer  destination passed through unchanged to WriteRecord().
/// @return true when every non-empty record was written; false as soon as
///         WriteRecord() reports a failure.
bool CcsConverter::ConvertFile(HdfCcsReader* reader, BamWriter* writer)
{
    assert(reader);

    // Default-constructed record; it is not referenced again in this function.
    BamRecordImpl bamRecord;

    // initialize read scores
    InitReadScores(reader);

    // Pull records one at a time from the reader.
    CCSSequence smrtRecord;
    while (reader->GetNext(smrtRecord)) {

        // Only records with at least one base are converted; empty ones are
        // passed over without any further action.
        if (smrtRecord.length != 0) {
            const bool written =
                WriteRecord(smrtRecord, 0, smrtRecord.length, ReadGroupId(), writer);

            // The record is released whether or not the write succeeded.
            smrtRecord.Free();

            if (!written)
                return false;
        }
    }

    // Reaching this point means no write failed.
    return true;
}
int main(int argc, char* argv[]) { string inputFileName, outputFileName; if (argc < 2) { PrintUsage(); exit(0); } vector<string> inputFileNames; inputFileName = argv[1]; outputFileName = argv[2]; int argi = 3; RegionTable regionTable; string regionsFOFNName = ""; vector<string> regionFileNames; bool splitSubreads = true; bool useCCS = false; int minSubreadLength = 1; while (argi < argc) { if (strcmp(argv[argi], "-regionTable") == 0) { regionsFOFNName = argv[++argi]; } else if (strcmp(argv[argi], "-noSplitSubreads") == 0) { splitSubreads = false; } else if (strcmp(argv[argi], "-minSubreadLength") == 0) { minSubreadLength = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-useccsdenovo") == 0) { useCCS = true; } else { PrintUsage(); cout << "ERROR! Option " << argv[argi] << " is not supported." << endl; } argi++; } if (FileOfFileNames::IsFOFN(inputFileName)) { FileOfFileNames::FOFNToList(inputFileName, inputFileNames); } else { inputFileNames.push_back(inputFileName); } if (regionsFOFNName == "") { regionFileNames = inputFileNames; } else { if (FileOfFileNames::IsFOFN(regionsFOFNName)) { FileOfFileNames::FOFNToList(regionsFOFNName, regionFileNames); } else { regionFileNames.push_back(regionsFOFNName); } } ofstream fastaOut; CrucialOpen(outputFileName, fastaOut); int plsFileIndex; HDFRegionTableReader hdfRegionReader; AfgBasWriter afgWriter; afgWriter.Initialize(outputFileName); for (plsFileIndex = 0; plsFileIndex < inputFileNames.size(); plsFileIndex++) { if (splitSubreads) { hdfRegionReader.Initialize(regionFileNames[plsFileIndex]); hdfRegionReader.ReadTable(regionTable); regionTable.SortTableByHoleNumber(); } ReaderAgglomerate reader; // reader.SkipReadQuality(); // should have been taken care of by *Filter modules if (useCCS){ reader.UseCCS(); } else { reader.IgnoreCCS(); } reader.Initialize(inputFileNames[plsFileIndex]); CCSSequence seq; int seqIndex = 0; int numRecords = 0; vector<ReadInterval> subreadIntervals; while (reader.GetNext(seq)){ ++seqIndex; if 
(splitSubreads == false) { if (seq.length >= minSubreadLength) { afgWriter.Write(seq); } seq.Free(); continue; } DNALength hqReadStart, hqReadEnd; int score; GetReadTrimCoordinates(seq, seq.zmwData, regionTable, hqReadStart, hqReadEnd, score); subreadIntervals.clear(); // clear old, new intervals are appended. CollectSubreadIntervals(seq,®ionTable, subreadIntervals); if (seq.length == 0 and subreadIntervals.size() > 0) { cout << "WARNING! A high quality interval region exists for a read of length 0." <<endl; cout << " The offending ZMW number is " << seq.zmwData.holeNumber << endl; seq.Free(); continue; } for (int intvIndex = 0; intvIndex < subreadIntervals.size(); intvIndex++) { SMRTSequence subreadSequence; int subreadStart = subreadIntervals[intvIndex].start > hqReadStart ? subreadIntervals[intvIndex].start : hqReadStart; int subreadEnd = subreadIntervals[intvIndex].end < hqReadEnd ? subreadIntervals[intvIndex].end : hqReadEnd; int subreadLength = subreadEnd - subreadStart; if (subreadLength < minSubreadLength) continue; subreadSequence.subreadStart = subreadStart; subreadSequence.subreadEnd = subreadEnd; subreadSequence.ReferenceSubstring(seq, subreadStart, subreadLength); stringstream titleStream; titleStream << seq.title << "/" << subreadIntervals[intvIndex].start << "_" << subreadIntervals[intvIndex].end; subreadSequence.CopyTitle(titleStream.str()); afgWriter.Write(subreadSequence); delete[] subreadSequence.title; } seq.Free(); } reader.Close(); hdfRegionReader.Close(); } }
// End-to-end check of the CCS conversion: runs RunBax2Bam on one *.ccs.h5
// file, then compares the generated BAM (header, read group and each record)
// against the data read back from the original file with HDFCCSReader.
TEST(CcsTest, EndToEnd_Multiple)
{
    // setup
    const string movieName = "m131018_081703_42161_c100585152550000001823088404281404_s1_p0";

    vector<string> baxFilenames;
    baxFilenames.push_back(tests::Data_Dir + "/data/" + movieName + ".1.ccs.h5");

    const string generatedBam = movieName + ".ccs.bam";

    // run conversion
    const int result = RunBax2Bam(baxFilenames, "--ccs");
    EXPECT_EQ(0, result);

    {   // ensure PBI exists
        const BamFile generatedBamFile(generatedBam);
        EXPECT_TRUE(generatedBamFile.PacBioIndexExists());
    }

    // open BAX reader on original data
    HDFCCSReader<CCSSequence> baxReader;
    baxReader.IncludeField("Basecall");
    baxReader.IncludeField("QualityValue");
    baxReader.IncludeField("DeletionQV");
    baxReader.IncludeField("InsertionQV");
    baxReader.IncludeField("SubstitutionQV");

    string baxBasecallerVersion;
    string baxBindingKit;
    string baxSequencingKit;

    // set magic bits
    baxReader.SetReadBasesFromCCS();

    const int initOk = baxReader.Initialize(baxFilenames.front());
    EXPECT_EQ(1, initOk);

    // Collect the kit / basecaller attributes used for the read-group checks.
    // Attributes that are absent leave the corresponding string empty.
    if (initOk == 1) {
        if (baxReader.scanDataReader.fileHasScanData &&
            baxReader.scanDataReader.initializedRunInfoGroup)
        {
            if (baxReader.scanDataReader.runInfoGroup.ContainsAttribute("BindingKit")) {
                HDFAtom<std::string> bkAtom;
                if (bkAtom.Initialize(baxReader.scanDataReader.runInfoGroup, "BindingKit")) {
                    bkAtom.Read(baxBindingKit);
                    bkAtom.dataspace.close();
                }
            }

            if (baxReader.scanDataReader.runInfoGroup.ContainsAttribute("SequencingKit")) {
                HDFAtom<std::string> skAtom;
                if (skAtom.Initialize(baxReader.scanDataReader.runInfoGroup, "SequencingKit")) {
                    skAtom.Read(baxSequencingKit);
                    skAtom.dataspace.close();
                }
            }

            {
                HDFGroup bcGroup;
                if (baxReader.pulseDataGroup.ContainsObject("BaseCalls") &&
                    bcGroup.Initialize(baxReader.pulseDataGroup.group, "BaseCalls"))
                {
                    HDFAtom<std::string> clAtom;
                    if (bcGroup.ContainsAttribute("ChangeListID") &&
                        clAtom.Initialize(bcGroup.group, "ChangeListID"))
                    {
                        clAtom.Read(baxBasecallerVersion);
                        clAtom.dataspace.close();
                    }
                    bcGroup.Close();
                }
            }
        }
    }

    EXPECT_NO_THROW(
    {
        // open BAM file
        BamFile bamFile(generatedBam);

        // check BAM header information
        const BamHeader& header = bamFile.Header();
        EXPECT_EQ(string("1.5"),     header.Version());
        EXPECT_EQ(string("unknown"), header.SortOrder());
        EXPECT_EQ(string("3.0.2"),   header.PacBioBamVersion());
        EXPECT_TRUE(header.Sequences().empty());
        EXPECT_TRUE(header.Comments().empty());
        ASSERT_FALSE(header.Programs().empty());

        const vector<string> readGroupIds = header.ReadGroupIds();
        ASSERT_FALSE(readGroupIds.empty());
        const ReadGroupInfo& rg = header.ReadGroup(readGroupIds.front());

        // The expected read-group id is derived from the movie name and read type.
        string rawId = movieName + "//CCS";
        string md5Id;
        MakeMD5(rawId, md5Id, 8);
        EXPECT_EQ(md5Id, rg.Id());

        EXPECT_EQ(string("PACBIO"), rg.Platform());
        EXPECT_EQ(movieName, rg.MovieName());

        EXPECT_TRUE(rg.SequencingCenter().empty());
        EXPECT_TRUE(rg.Date().empty());
        EXPECT_TRUE(rg.FlowOrder().empty());
        EXPECT_TRUE(rg.KeySequence().empty());
        EXPECT_TRUE(rg.Library().empty());
        EXPECT_TRUE(rg.Programs().empty());
        EXPECT_TRUE(rg.PredictedInsertSize().empty());
        EXPECT_TRUE(rg.Sample().empty());

        EXPECT_EQ("CCS", rg.ReadType());
        EXPECT_EQ(baxBasecallerVersion, rg.BasecallerVersion());
        EXPECT_EQ(baxBindingKit, rg.BindingKit());
        EXPECT_EQ(baxSequencingKit, rg.SequencingKit());
        EXPECT_EQ(75, std::stod(rg.FrameRateHz()));
        EXPECT_EQ("dq", rg.BaseFeatureTag(BaseFeature::DELETION_QV));
        EXPECT_EQ("iq", rg.BaseFeatureTag(BaseFeature::INSERTION_QV));
        EXPECT_EQ("sq", rg.BaseFeatureTag(BaseFeature::SUBSTITUTION_QV));
        EXPECT_FALSE(rg.HasBaseFeature(BaseFeature::DELETION_TAG));
        EXPECT_FALSE(rg.HasBaseFeature(BaseFeature::IPD));
        EXPECT_FALSE(rg.HasBaseFeature(BaseFeature::MERGE_QV));
        EXPECT_FALSE(rg.HasBaseFeature(BaseFeature::SUBSTITUTION_TAG));

        // Compare each BAM record with the next non-empty record from the
        // original file.
        CCSSequence baxRecord;
        size_t numTested = 0;
        EntireFileQuery entireFile(bamFile);
        for (BamRecord& bamRecord : entireFile) {

            // Advance the original reader to its next non-empty record.
            bool haveBaxRecord = false;
            while (baxReader.GetNext(baxRecord)) {
                if (baxRecord.length > 0) {
                    haveBaxRecord = true;
                    break;
                }
            }

            // Original data exhausted: nothing left to compare against.
            if (!haveBaxRecord)
                break;

            EXPECT_GT(baxRecord.length, 0U);

            const BamRecordImpl& bamRecordImpl = bamRecord.Impl();
            EXPECT_EQ(4680U, bamRecordImpl.Bin());
            EXPECT_EQ(0,     bamRecordImpl.InsertSize());
            EXPECT_EQ(255,   bamRecordImpl.MapQuality());
            EXPECT_EQ(-1,    bamRecordImpl.MatePosition());
            EXPECT_EQ(-1,    bamRecordImpl.MateReferenceId());
            EXPECT_EQ(-1,    bamRecordImpl.Position());
            EXPECT_EQ(-1,    bamRecordImpl.ReferenceId());
            EXPECT_FALSE(bamRecordImpl.IsMapped());

            const int holeNumber = baxRecord.zmwData.holeNumber;
            const int numPasses  = baxRecord.numPasses;

            const string expectedName = baxRecord.GetName();
            EXPECT_EQ(expectedName, bamRecordImpl.Name());

            using PacBio::BAM::QualityValue;
            using PacBio::BAM::QualityValues;

            const DNALength length = baxRecord.length;

            string expectedSequence;
            expectedSequence.assign((const char*)baxRecord.seq, length);

            QualityValues expectedQualities;
            expectedQualities.assign((uint8_t*)baxRecord.qual.data, baxRecord.qual.data + length);

            const string bamSequence = bamRecord.Sequence();
            const QualityValues bamQualities = bamRecord.Qualities();
            EXPECT_EQ(expectedSequence,  bamSequence);
            EXPECT_EQ(expectedQualities, bamQualities);

            const QualityValues bamDeletionQVs     = bamRecord.DeletionQV();
            const QualityValues bamInsertionQVs    = bamRecord.InsertionQV();
            const QualityValues bamSubstitutionQVs = bamRecord.SubstitutionQV();

            for (size_t i = 0; i < length; ++i) {
                EXPECT_EQ((QualityValue)baxRecord.GetDeletionQV(i),     bamDeletionQVs.at(i));
                EXPECT_EQ((QualityValue)baxRecord.GetInsertionQV(i),    bamInsertionQVs.at(i));
                EXPECT_EQ((QualityValue)baxRecord.GetSubstitutionQV(i), bamSubstitutionQVs.at(i));
            }

            EXPECT_EQ(md5Id,      bamRecord.ReadGroupId());
            EXPECT_EQ(movieName,  bamRecord.MovieName());
            EXPECT_EQ(numPasses,  bamRecord.NumPasses());
            EXPECT_EQ(holeNumber, bamRecord.HoleNumber());
            EXPECT_FALSE(bamRecord.HasLocalContextFlags());
            EXPECT_FALSE(bamRecord.HasSignalToNoise());

            numTested++;
        }

        EXPECT_GT(numTested, 1UL);

    }); // EXPECT_NO_THROW

    // cleanup: done after the block above so it also runs when that block
    // threw, and after the BAM objects declared inside it have gone out of
    // scope.
    baxReader.Close();
    RemoveFile(generatedBam);
    RemoveFile(generatedBam + ".pbi");
}
int main(int argc, char* argv[]) { string plsFileName, fastaOutName; vector<string> plsFileNames; bool trimByRegion, maskByRegion; trimByRegion = false; maskByRegion = false; int argi = 3; RegionTable regionTable; string regionsFOFNName = ""; vector<string> regionFileNames; bool splitSubreads = true; int minSubreadLength = 0; bool addSimulatedData = false; bool printSimulatedCoordinate = false; bool printSimulatedSequenceIndex = false; bool printFastq = false; bool printCcs = false; int lineLength = 50; int minReadScore = 0; bool encodeCCSPassesInTitle = false; vector<int> holeNumbers; CommandLineParser clp; bool printOnlyBest = false; clp.SetProgramName("pls2fasta"); clp.RegisterStringOption("file.pls.h5", &plsFileName, "Input pls/bas.h5 file.", true); clp.RegisterStringOption("out.fasta", &fastaOutName, "Output fasta/fastq file.", true); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterFlagOption("trimByRegion", &trimByRegion, "Trim away low quality regions."); clp.RegisterFlagOption("maskByRegion", &maskByRegion, "Mask low quality regions with 'N'."); clp.RegisterStringOption("regionTable", ®ionsFOFNName, "Optional HDF file with a /PulseData/Regions dataset."); clp.RegisterIntOption("minSubreadLength", &minSubreadLength, "Do not write subreads less than the specified length.", CommandLineParser::PositiveInteger); clp.RegisterFlagOption("noSplitSubreads", &splitSubreads, "Do not split reads on adapter sequences."); clp.RegisterIntListOption("holeNumber", &holeNumbers, "Only print this hole number (or list of numbers)."); clp.RegisterFlagOption("fastq", &printFastq, "Print in FASTQ format with quality."); clp.RegisterFlagOption("ccs", &printCcs, "Print de novo CCS sequences"); clp.RegisterFlagOption("passesInTitle", &encodeCCSPassesInTitle, "Append /N_passes/ to ccs sequence title, where" " N is the number of passes."); clp.RegisterIntOption("lineLength", &lineLength, "Specify fasta/fastq line length", CommandLineParser::PositiveInteger); 
clp.RegisterIntOption("minReadScore", &minReadScore, "Minimum read score to print a read. The score is " "a number between 0 and 1000 and represents the expected accuracy percentage * 10. " "A typical value would be between 750 and 800. This does not apply to ccs reads.", CommandLineParser::NonNegativeInteger); clp.RegisterFlagOption("best", &printOnlyBest, "If a CCS sequence exists, print this. Otherwise, print the longest" "subread. This does not support fastq."); clp.SetProgramSummary("Converts bas.h5 files to fasta or fastq files. Although fasta files are provided" " with every run, they are not trimmed nor split into subreads. This program takes " "additional annotation information, such as the subread coordinates and high quality regions " "and uses them to create fasta sequences that are substrings of all bases called. Most of the time " "you will want to trim low quality reads, so you should specify -trimByRegion."); clp.ParseCommandLine(argc, argv); if (trimByRegion and maskByRegion) { cout << "ERROR! You cannot both trim and mask regions. Use one or the other." << endl; exit(1); } if (printFastq) { // Setting lineLength to 0 flags to print on one line. 
lineLength = 0; } if (FileOfFileNames::IsFOFN(plsFileName)) { FileOfFileNames::FOFNToList(plsFileName, plsFileNames); } else { plsFileNames.push_back(plsFileName); } if (regionsFOFNName == "") { regionFileNames = plsFileNames; } else { if (FileOfFileNames::IsFOFN(regionsFOFNName)) { FileOfFileNames::FOFNToList(regionsFOFNName, regionFileNames); } else { regionFileNames.push_back(regionsFOFNName); } } ofstream fastaOut; CrucialOpen(fastaOutName, fastaOut); int plsFileIndex; HDFRegionTableReader hdfRegionReader; sort(holeNumbers.begin(), holeNumbers.end()); for (plsFileIndex = 0; plsFileIndex < plsFileNames.size(); plsFileIndex++) { if (trimByRegion or maskByRegion or splitSubreads) { hdfRegionReader.Initialize(regionFileNames[plsFileIndex]); hdfRegionReader.ReadTable(regionTable); regionTable.SortTableByHoleNumber(); } HDFCCSReader<CCSSequence> ccsReader; HDFBasReader smrtReader; if (printCcs) { ccsReader.Initialize(plsFileNames[plsFileIndex]); } else { smrtReader.Initialize(plsFileNames[plsFileIndex]); } vector<ReadInterval> subreadIntervals;; SMRTSequence seq; CCSSequence ccsSeq; while (true) { if (printCcs == true or printOnlyBest == true) { if (ccsReader.GetNext(ccsSeq) == false) { break; } else { seq = ccsSeq.unrolledRead; } } else { if (smrtReader.GetNext(seq) == false) { break; } } if (holeNumbers.size() != 0 and binary_search(holeNumbers.begin(), holeNumbers.end(), seq.zmwData.holeNumber) == false) { continue; } if (encodeCCSPassesInTitle) { assert(printCcs); string title = ccsSeq.title; stringstream titleStrm; titleStrm << title << "/"<<ccsSeq.numPasses << "_passes/"; ccsSeq.CopyTitle(titleStrm.str()); } if (printCcs == true or (printOnlyBest and ccsSeq.length > 0)) { // // The reason for checking to see if the length is greater // than 0 again is in case -ccs flag is specified, but the ccs // sequence is empty, nothing should be printed. 
When // printing only the best this block should only be entered if // a ccs sequence exists, because the rest of the loop is // skipped after this. if (ccsSeq.length > 0) { if (printFastq == false) { ccsSeq.PrintSeq(fastaOut); } else { ccsSeq.PrintFastq(fastaOut, lineLength); } } ccsSeq.Free(); seq.Free(); continue; } if (seq.length == 0) { continue; } // // Determine the high quality boundaries of the read. This is // the full read is no hq regions exist, or it is stated to // ignore regions. // DNALength hqReadStart, hqReadEnd; int hqRegionScore; if (GetReadTrimCoordinates(seq, seq.zmwData, regionTable, hqReadStart, hqReadEnd, hqRegionScore) == false or (trimByRegion == false and maskByRegion == false)) { hqReadStart = 0; hqReadEnd = seq.length; } // // Mask off the low quality portions of the reads. // if (maskByRegion) { if (hqReadStart > 0) { fill(&seq.seq[0], &seq.seq[hqReadStart], 'N'); } if (hqReadEnd != seq.length) { fill(&seq.seq[hqReadEnd], &seq.seq[seq.length], 'N'); } } // // Now possibly print the full read with masking. This could be handled by making a // if (splitSubreads == false) { ReadInterval wholeRead(0, seq.length); // The set of subread intervals is just the entire read. subreadIntervals.clear(); subreadIntervals.push_back(wholeRead); } else { // // Print subread coordinates no matter whether or not reads have subreads. // subreadIntervals.clear(); // clear old, new intervals are appended. CollectSubreadIntervals(seq, ®ionTable, subreadIntervals); } // // Output all subreads as separate sequences. 
// int intvIndex; SMRTSequence bestSubreadSequence; int bestSubreadScore = -1; int bestSubreadIndex = 0; int bestSubreadStart = 0, bestSubreadEnd = 0; SMRTSequence bestSubread; for (intvIndex = 0; intvIndex < subreadIntervals.size(); intvIndex++) { SMRTSequence subreadSequence, subreadSequenceRC; subreadSequence.subreadStart = subreadIntervals[intvIndex].start; subreadSequence.subreadEnd = subreadIntervals[intvIndex].end; // // When trimming by region, only output the parts of the // subread that overlap the hq region. // if (trimByRegion == true) { subreadSequence.subreadStart = max((DNALength) subreadIntervals[intvIndex].start, hqReadStart); subreadSequence.subreadEnd = min((DNALength) subreadIntervals[intvIndex].end, hqReadEnd); } if (subreadSequence.subreadStart >= subreadSequence.subreadEnd or subreadSequence.subreadEnd - subreadSequence.subreadStart <= minSubreadLength) { // // There is no high qualty portion of this subread. Skip it. // continue; } if (hqRegionScore < minReadScore) { continue; } // // Print the subread, adding the coordinates as part of the title. // subreadSequence.ReferenceSubstring(seq, subreadSequence.subreadStart, subreadSequence.subreadEnd - subreadSequence.subreadStart); stringstream titleStream; titleStream << seq.title; if (splitSubreads) { // // Add the subread coordinates if splitting on subread. // titleStream << "/" << subreadSequence.subreadStart << "_" << subreadSequence.subreadEnd; } subreadSequence.CopyTitle(titleStream.str()); // // Eventually replace with WriterAgglomerate. 
// if (printOnlyBest == false) { if (subreadSequence.length > 0) { if (printFastq == false) { ((FASTASequence*)&subreadSequence)->PrintSeq(fastaOut); } else { subreadSequence.PrintFastq(fastaOut, lineLength); } } delete[] subreadSequence.title; } else { int subreadWeightedScore = subreadSequence.length * hqRegionScore; if (subreadWeightedScore > bestSubreadScore) { bestSubreadIndex = intvIndex; bestSubread = subreadSequence; bestSubreadScore = subreadWeightedScore; } } } if (printOnlyBest) { if (bestSubreadScore >= 0) { if (printFastq == false) { bestSubread.PrintSeq(fastaOut); } else { bestSubread.PrintFastq(fastaOut, bestSubread.length); } bestSubread.Free(); } } ccsSeq.Free(); seq.Free(); } if (printCcs or printOnlyBest) { ccsReader.Close(); } else { smrtReader.Close(); } hdfRegionReader.Close(); } }