예제 #1
0
TEST_F(HDFCCSReaderTEST, ReadCCSFromBasH5) {
    string fileName = baxFile2;
    HDFCCSReader<CCSSequence> reader;
    reader.InitializeDefaultIncludedFields();
    ASSERT_EQ(reader.Initialize(fileName), 1);
    ASSERT_EQ(reader.GetMovieName(),
            "m130220_114643_42129_c100471902550000001823071906131347_s1_p0");

    CCSSequence seq;
    for(int i=0; i < 1000; i++) {
        reader.GetNext(seq);
    }
    reader.Close();
}
예제 #2
0
TEST_F(HDFCCSReaderTEST, ReadCCSFromCCSH5) {
    string fileName = ccsFile1;
    HDFCCSReader<CCSSequence> reader;
    reader.SetReadBasesFromCCS();
    reader.InitializeDefaultIncludedFields();
    ASSERT_EQ(reader.Initialize(fileName), 1);
    ASSERT_EQ(reader.GetMovieName(),
            "m130328_211423_ethan_c100499512550000001823070408081371_s1_p0");

    CCSSequence seq;
    for(int i=0; i < 1000; i++) {
        reader.GetNext(seq);
    }

    reader.Close();
}
예제 #3
0
TEST(CcsTest, EndToEnd_Multiple)
{
    // setup
    const string movieName = "m131018_081703_42161_c100585152550000001823088404281404_s1_p0";

    vector<string> baxFilenames;
    baxFilenames.push_back(tests::Data_Dir + "/data/" + movieName + ".1.ccs.h5");

    const string generatedBam = movieName + ".ccs.bam";

    // run conversion
    const int result = RunBax2Bam(baxFilenames, "--ccs");
    EXPECT_EQ(0, result);

    {   // ensure PBI exists
        const BamFile generatedBamFile(generatedBam);
        EXPECT_TRUE(generatedBamFile.PacBioIndexExists());
    }

    // open BAX reader on original data
    HDFCCSReader<CCSSequence> baxReader;
    baxReader.IncludeField("Basecall");
    baxReader.IncludeField("QualityValue");
    baxReader.IncludeField("DeletionQV");
    baxReader.IncludeField("InsertionQV");
    baxReader.IncludeField("SubstitutionQV");

    string baxBasecallerVersion;
    string baxBindingKit;
    string baxSequencingKit;

    // set magic bits
    baxReader.SetReadBasesFromCCS();

    const int initOk = baxReader.Initialize(baxFilenames.front());
    EXPECT_EQ(1, initOk);
    if (initOk == 1) {

        if (baxReader.scanDataReader.fileHasScanData && baxReader.scanDataReader.initializedRunInfoGroup) {

            if (baxReader.scanDataReader.runInfoGroup.ContainsAttribute("BindingKit")) {
                HDFAtom<std::string> bkAtom;
                if (bkAtom.Initialize(baxReader.scanDataReader.runInfoGroup, "BindingKit")) {
                    bkAtom.Read(baxBindingKit);
                    bkAtom.dataspace.close();
                }
            }

            if (baxReader.scanDataReader.runInfoGroup.ContainsAttribute("SequencingKit")) {
                HDFAtom<std::string> skAtom;
                if (skAtom.Initialize(baxReader.scanDataReader.runInfoGroup, "SequencingKit")) {
                    skAtom.Read(baxSequencingKit);
                    skAtom.dataspace.close();
                }
            }

            {
                HDFGroup bcGroup;
                if (baxReader.pulseDataGroup.ContainsObject("BaseCalls") &&
                        bcGroup.Initialize(baxReader.pulseDataGroup.group, "BaseCalls"))
                {
                    HDFAtom<std::string> clAtom;
                    if (bcGroup.ContainsAttribute("ChangeListID") &&
                            clAtom.Initialize(bcGroup.group, "ChangeListID"))
                    {
                        clAtom.Read(baxBasecallerVersion);
                        clAtom.dataspace.close();
                    }
                    bcGroup.Close();
                }
            }
        }
    }

    EXPECT_NO_THROW(
    {
        // open BAM file
        BamFile bamFile(generatedBam);

        // check BAM header information
        const BamHeader& header = bamFile.Header();
        EXPECT_EQ(string("1.5"),     header.Version());
        EXPECT_EQ(string("unknown"), header.SortOrder());
        EXPECT_EQ(string("3.0.2"),   header.PacBioBamVersion());
        EXPECT_TRUE(header.Sequences().empty());
        EXPECT_TRUE(header.Comments().empty());
        ASSERT_FALSE(header.Programs().empty());

        const vector<string> readGroupIds = header.ReadGroupIds();
        ASSERT_FALSE(readGroupIds.empty());
        const ReadGroupInfo& rg = header.ReadGroup(readGroupIds.front());

        string rawId = movieName + "//CCS";
        string md5Id;
        MakeMD5(rawId, md5Id, 8);
        EXPECT_EQ(md5Id, rg.Id());

        EXPECT_EQ(string("PACBIO"), rg.Platform());
        EXPECT_EQ(movieName, rg.MovieName());

        EXPECT_TRUE(rg.SequencingCenter().empty());
        EXPECT_TRUE(rg.Date().empty());
        EXPECT_TRUE(rg.FlowOrder().empty());
        EXPECT_TRUE(rg.KeySequence().empty());
        EXPECT_TRUE(rg.Library().empty());
        EXPECT_TRUE(rg.Programs().empty());
        EXPECT_TRUE(rg.PredictedInsertSize().empty());
        EXPECT_TRUE(rg.Sample().empty());

        EXPECT_EQ("CCS", rg.ReadType());
        EXPECT_EQ(baxBasecallerVersion, rg.BasecallerVersion());
        EXPECT_EQ(baxBindingKit, rg.BindingKit());
        EXPECT_EQ(baxSequencingKit, rg.SequencingKit());
        EXPECT_EQ(75, std::stod(rg.FrameRateHz()));
        EXPECT_EQ("dq", rg.BaseFeatureTag(BaseFeature::DELETION_QV));
        EXPECT_EQ("iq", rg.BaseFeatureTag(BaseFeature::INSERTION_QV));
        EXPECT_EQ("sq", rg.BaseFeatureTag(BaseFeature::SUBSTITUTION_QV));
        EXPECT_FALSE(rg.HasBaseFeature(BaseFeature::DELETION_TAG));
        EXPECT_FALSE(rg.HasBaseFeature(BaseFeature::IPD));
        EXPECT_FALSE(rg.HasBaseFeature(BaseFeature::MERGE_QV));
        EXPECT_FALSE(rg.HasBaseFeature(BaseFeature::SUBSTITUTION_TAG));

        // compare 1st record from each file
        CCSSequence baxRecord;
        const UInt holeNumber = baxRecord.zmwData.holeNumber;

        size_t numTested = 0;
        EntireFileQuery entireFile(bamFile);
        for (BamRecord& bamRecord : entireFile) {
            while (baxReader.GetNext(baxRecord)) {
                if (baxRecord.length > 0)
                    goto compare;
            }
            goto cleanup;

compare:
            EXPECT_GT(baxRecord.length, 0U);
            const BamRecordImpl& bamRecordImpl = bamRecord.Impl();
            EXPECT_EQ(4680U,bamRecordImpl.Bin());
            EXPECT_EQ(0,   bamRecordImpl.InsertSize());
            EXPECT_EQ(255, bamRecordImpl.MapQuality());
            EXPECT_EQ(-1,  bamRecordImpl.MatePosition());
            EXPECT_EQ(-1,  bamRecordImpl.MateReferenceId());
            EXPECT_EQ(-1,  bamRecordImpl.Position());
            EXPECT_EQ(-1,  bamRecordImpl.ReferenceId());
            EXPECT_FALSE(bamRecordImpl.IsMapped());

            const int holeNumber      = baxRecord.zmwData.holeNumber;
            const int numPasses       = baxRecord.numPasses;
            const string expectedName = baxRecord.GetName();
            EXPECT_EQ(expectedName, bamRecordImpl.Name());

            using PacBio::BAM::QualityValue;
            using PacBio::BAM::QualityValues;

            const DNALength length = baxRecord.length;

            string expectedSequence;
            expectedSequence.assign((const char*)baxRecord.seq, length);

            QualityValues expectedQualities;
            expectedQualities.assign((uint8_t*)baxRecord.qual.data, baxRecord.qual.data + length);

            const string bamSequence = bamRecord.Sequence();
            const QualityValues bamQualities = bamRecord.Qualities();
            EXPECT_EQ(expectedSequence,  bamSequence);
            EXPECT_EQ(expectedQualities, bamQualities);

            const QualityValues bamDeletionQVs     = bamRecord.DeletionQV();
            const QualityValues bamInsertionQVs    = bamRecord.InsertionQV();
            const QualityValues bamSubstitutionQVs = bamRecord.SubstitutionQV();

            for (size_t i = 0; i < length; ++i) {
                EXPECT_EQ((QualityValue)baxRecord.GetDeletionQV(i),     bamDeletionQVs.at(i));
                EXPECT_EQ((QualityValue)baxRecord.GetInsertionQV(i),    bamInsertionQVs.at(i));
                EXPECT_EQ((QualityValue)baxRecord.GetSubstitutionQV(i), bamSubstitutionQVs.at(i));
            }

            EXPECT_EQ(md5Id,        bamRecord.ReadGroupId());
            EXPECT_EQ(movieName,    bamRecord.MovieName());
            EXPECT_EQ(numPasses,    bamRecord.NumPasses());
            EXPECT_EQ(holeNumber,   bamRecord.HoleNumber());
            EXPECT_FALSE(bamRecord.HasLocalContextFlags());
            EXPECT_FALSE(bamRecord.HasSignalToNoise());
            numTested++;
        }

cleanup:
        EXPECT_GT(numTested, 1UL);

        // cleanup
        baxReader.Close();
        RemoveFile(generatedBam);
        RemoveFile(generatedBam + ".pbi");

    }); // EXPECT_NO_THROW
예제 #4
0
int main(int argc, char* argv[]) {

	string plsFileName, fastaOutName;
	vector<string> plsFileNames;
	bool trimByRegion, maskByRegion;
	trimByRegion = false;
	maskByRegion = false;
	int argi = 3;
	RegionTable regionTable;
	string regionsFOFNName = "";
	vector<string> regionFileNames;
	bool splitSubreads = true;
	int minSubreadLength = 0;
	bool addSimulatedData = false;
	bool printSimulatedCoordinate = false;
	bool printSimulatedSequenceIndex = false;
  bool printFastq = false;
  bool printCcs   = false;
  int  lineLength = 50;
  int minReadScore = 0;
  bool encodeCCSPassesInTitle = false;
  vector<int> holeNumbers;
  CommandLineParser clp;
  bool printOnlyBest = false;
  clp.SetProgramName("pls2fasta");
  clp.RegisterStringOption("file.pls.h5", &plsFileName, "Input pls/bas.h5 file.", true);
  clp.RegisterStringOption("out.fasta", &fastaOutName, "Output fasta/fastq file.", true);
  clp.RegisterPreviousFlagsAsHidden();
  clp.RegisterFlagOption("trimByRegion", &trimByRegion, "Trim away low quality regions.");
  clp.RegisterFlagOption("maskByRegion", &maskByRegion, "Mask low quality regions with 'N'.");
  clp.RegisterStringOption("regionTable", &regionsFOFNName, "Optional HDF file with a /PulseData/Regions dataset.");
  clp.RegisterIntOption("minSubreadLength", &minSubreadLength, "Do not write subreads less than the specified length.", CommandLineParser::PositiveInteger);
  clp.RegisterFlagOption("noSplitSubreads", &splitSubreads, "Do not split reads on adapter sequences.");
  clp.RegisterIntListOption("holeNumber", &holeNumbers, "Only print this hole number (or list of numbers).");
  clp.RegisterFlagOption("fastq", &printFastq, "Print in FASTQ format with quality.");
  clp.RegisterFlagOption("ccs", &printCcs, "Print de novo CCS sequences");
  clp.RegisterFlagOption("passesInTitle", &encodeCCSPassesInTitle, "Append /N_passes/ to ccs sequence title, where"
                         " N is the number of passes.");
  clp.RegisterIntOption("lineLength", &lineLength, "Specify fasta/fastq line length", CommandLineParser::PositiveInteger);
  clp.RegisterIntOption("minReadScore", &minReadScore, "Minimum read score to print a read.  The score is "
                        "a number between 0 and 1000 and represents the expected accuracy percentage * 10. "
                        "A typical value would be between 750 and 800.  This does not apply to ccs reads.", CommandLineParser::NonNegativeInteger);
  clp.RegisterFlagOption("best", &printOnlyBest, "If a CCS sequence exists, print this.  Otherwise, print the longest"
                         "subread.  This does not support fastq.");
  clp.SetProgramSummary("Converts bas.h5 files to fasta or fastq files. Although fasta files are provided"
                        " with every run, they are not trimmed nor split into subreads. This program takes "
                        "additional annotation information, such as the subread coordinates and high quality regions "
                        "and uses them to create fasta sequences that are substrings of all bases called. Most of the time "
                        "you will want to trim low quality reads, so you should specify -trimByRegion.");
                        
  clp.ParseCommandLine(argc, argv);

	if (trimByRegion and maskByRegion) {
		cout << "ERROR! You cannot both trim and mask regions. Use one or the other." << endl;
		exit(1);
	}
		 
  if (printFastq) {
    // Setting lineLength to 0 flags to print on one line.
    lineLength = 0;
  }

	if (FileOfFileNames::IsFOFN(plsFileName)) {
		FileOfFileNames::FOFNToList(plsFileName, plsFileNames);
	}
	else {
		plsFileNames.push_back(plsFileName);
	}
	if (regionsFOFNName == "") {
		regionFileNames = plsFileNames;
	}
	else {
		if (FileOfFileNames::IsFOFN(regionsFOFNName)) {
			FileOfFileNames::FOFNToList(regionsFOFNName, regionFileNames);
		}
		else {
			regionFileNames.push_back(regionsFOFNName);
		}
	}



	ofstream fastaOut;
	CrucialOpen(fastaOutName, fastaOut);
	int plsFileIndex;
	HDFRegionTableReader hdfRegionReader;
  sort(holeNumbers.begin(), holeNumbers.end());
	for (plsFileIndex = 0; plsFileIndex < plsFileNames.size(); plsFileIndex++) {

		if (trimByRegion or maskByRegion or splitSubreads) {
			hdfRegionReader.Initialize(regionFileNames[plsFileIndex]);
			hdfRegionReader.ReadTable(regionTable);
			regionTable.SortTableByHoleNumber();
		}
    HDFCCSReader<CCSSequence> ccsReader;		
		HDFBasReader smrtReader;
		if (printCcs) {
			ccsReader.Initialize(plsFileNames[plsFileIndex]);
		}
		else {
			smrtReader.Initialize(plsFileNames[plsFileIndex]);
		}

		vector<ReadInterval> subreadIntervals;;
    SMRTSequence seq;
    CCSSequence  ccsSeq;

		while (true) {
			if (printCcs == true or printOnlyBest == true) {
				if (ccsReader.GetNext(ccsSeq) == false) {
					break;
				}
				else {
					seq = ccsSeq.unrolledRead;
				}
			}
			else {
				if (smrtReader.GetNext(seq) == false) {
					break;
				}
			}

      if (holeNumbers.size() != 0 and 
          binary_search(holeNumbers.begin(), holeNumbers.end(), seq.zmwData.holeNumber) == false) {
        continue;
      }

      if (encodeCCSPassesInTitle) {
				assert(printCcs);
        string title = ccsSeq.title;
        stringstream titleStrm;
        titleStrm << title << "/"<<ccsSeq.numPasses << "_passes/";
        ccsSeq.CopyTitle(titleStrm.str());
      }

		  if (printCcs == true or (printOnlyBest and ccsSeq.length > 0)) {
        //
        // The reason for checking to see if the length is greater
        // than 0 again is in case -ccs flag is specified, but the ccs
        // sequence is empty, nothing should be printed.  When
        // printing only the best this block should only be entered if
        // a ccs sequence exists, because the rest of the loop is
        // skipped after this.

        if (ccsSeq.length > 0) {
          if (printFastq == false) {
            ccsSeq.PrintSeq(fastaOut);
          }
          else {
            ccsSeq.PrintFastq(fastaOut, lineLength);
          }
        }
				ccsSeq.Free();
				seq.Free();
        continue;
      }	


      if (seq.length == 0) {
        continue;
      }


      //
      // Determine the high quality boundaries of the read.  This is
      // the full read is no hq regions exist, or it is stated to
      // ignore regions.
      //
      DNALength hqReadStart, hqReadEnd;
      int hqRegionScore;
      if (GetReadTrimCoordinates(seq, seq.zmwData, regionTable, hqReadStart, hqReadEnd, hqRegionScore) == false or 
          (trimByRegion == false and maskByRegion == false)) {
        hqReadStart = 0;
        hqReadEnd   = seq.length;
      }
      
      //
      // Mask off the low quality portions of the reads.
      //
			if (maskByRegion) {
        if (hqReadStart > 0) {
          fill(&seq.seq[0], &seq.seq[hqReadStart], 'N');
        }
        if (hqReadEnd != seq.length) {
          fill(&seq.seq[hqReadEnd], &seq.seq[seq.length], 'N');
        }
			}


      //
      // Now possibly print the full read with masking.  This could be handled by making a 
      // 
			if (splitSubreads == false) {
        ReadInterval wholeRead(0, seq.length);
        // The set of subread intervals is just the entire read.
        subreadIntervals.clear();
        subreadIntervals.push_back(wholeRead);
			}
			else {
				//
				// Print subread coordinates no matter whether or not reads have subreads.
				//
				subreadIntervals.clear(); // clear old, new intervals are appended.
				CollectSubreadIntervals(seq, &regionTable, subreadIntervals);
      }
      //
      // Output all subreads as separate sequences.
      //
      int intvIndex;
      SMRTSequence bestSubreadSequence;
      int bestSubreadScore = -1;
      int bestSubreadIndex = 0;
      int bestSubreadStart = 0, bestSubreadEnd = 0;
      SMRTSequence bestSubread;
      for (intvIndex = 0; intvIndex < subreadIntervals.size(); intvIndex++) {
        SMRTSequence subreadSequence, subreadSequenceRC;
					
        subreadSequence.subreadStart = subreadIntervals[intvIndex].start;
        subreadSequence.subreadEnd   = subreadIntervals[intvIndex].end;
          
        // 
        // When trimming by region, only output the parts of the
        // subread that overlap the hq region.
        //
        if (trimByRegion == true) {
          subreadSequence.subreadStart = max((DNALength) subreadIntervals[intvIndex].start, hqReadStart);
          subreadSequence.subreadEnd   = min((DNALength) subreadIntervals[intvIndex].end, hqReadEnd);
        }

        if (subreadSequence.subreadStart >= subreadSequence.subreadEnd or 
            subreadSequence.subreadEnd - subreadSequence.subreadStart <= minSubreadLength) {
          //
          // There is no high qualty portion of this subread. Skip it.
          //
          continue;
        }

        if (hqRegionScore < minReadScore) {
          continue;
        }

        //
        // Print the subread, adding the coordinates as part of the title.
        //
        subreadSequence.ReferenceSubstring(seq, subreadSequence.subreadStart, 
                                           subreadSequence.subreadEnd - subreadSequence.subreadStart);
        stringstream titleStream;
        titleStream << seq.title;
        if (splitSubreads) {
          //
          // Add the subread coordinates if splitting on subread.
          //
          titleStream << "/" 
                      << subreadSequence.subreadStart
                      << "_" << subreadSequence.subreadEnd;
        }
          
        subreadSequence.CopyTitle(titleStream.str());

        //
        // Eventually replace with WriterAgglomerate.
        //
        if (printOnlyBest == false) {
          if (subreadSequence.length > 0) {
            if (printFastq == false) {
              ((FASTASequence*)&subreadSequence)->PrintSeq(fastaOut);
            }
            else {
              subreadSequence.PrintFastq(fastaOut, lineLength);
            }
          }
          delete[] subreadSequence.title;
        }
        else {
          int subreadWeightedScore = subreadSequence.length * hqRegionScore;
          if (subreadWeightedScore > bestSubreadScore) {
            bestSubreadIndex = intvIndex;
            bestSubread = subreadSequence;
            bestSubreadScore = subreadWeightedScore;
          }
        }
      }

      if (printOnlyBest) {
        if (bestSubreadScore >= 0) {
          if (printFastq == false) {
            bestSubread.PrintSeq(fastaOut);
          }
          else {
            bestSubread.PrintFastq(fastaOut, bestSubread.length);
          }
          bestSubread.Free();
        }
      }
      ccsSeq.Free();
      seq.Free();
    }
		if (printCcs or printOnlyBest) {
			ccsReader.Close();
		}
		else {
			smrtReader.Close();
		}
    hdfRegionReader.Close();
  }
}