Example #1
0
// General functions.
bool LookupHQRegion(int holeNumber, RegionTable &regionTable, 
    int &start, int &end, int &score) {

	int regionLowIndex, regionHighIndex;
	regionLowIndex = regionHighIndex = 0;

	regionTable.LookupRegionsByHoleNumber(holeNumber,
        regionLowIndex, regionHighIndex);

	bool readHasGoodRegion = true;
	int  regionIndex = regionLowIndex;
	while (regionIndex < regionHighIndex and 
		   regionTable.GetType(regionIndex) != HQRegion) {
		regionIndex++;
	}
	
	if (regionIndex == regionHighIndex) {
    start = end = score = 0;
		return false;
	}
	else {
		start = regionTable.GetStart(regionIndex);
		end   = regionTable.GetEnd(regionIndex);
        score = regionTable.GetScore(regionIndex);
		return true;
	}
}
Example #2
0
//
// Collect region indices for either all region types, or just a few specific region types.
//
//
int CollectRegionIndices(SMRTSequence &read, RegionTable &regionTable, 
    std::vector<int> &regionIndices, RegionType *regionTypes,
    int numRegionTypes) {

    int regionLow, regionHigh;
    int prevNumRegionIndices = regionIndices.size();
    if (FindRegionIndices(read, &regionTable, regionLow, regionHigh)) {
        int i;
        for (i = regionLow; i < regionHigh; i++) {
            if (regionTypes == NULL) {
                regionIndices.push_back(i);
            }
            else {
                int t;
                for (t = 0; t < numRegionTypes; t++) {
                    if (regionTable.GetType(i) == regionTypes[t]) {
                        regionIndices.push_back(i);
                        break;
                    }
                }
            }
        }
    }
    return regionIndices.size() - prevNumRegionIndices;
}
// General functions.
bool LookupHQRegion(int holeNumber, RegionTable &regionTable, int &start, int &end, int &score)
{

    if (regionTable.HasHoleNumber(holeNumber)) {
        RegionAnnotations zmwRegions = regionTable[holeNumber];
        if (zmwRegions.HasHQRegion()) {
            start = zmwRegions.HQStart();
            end = zmwRegions.HQEnd();
            score = zmwRegions.HQScore();
            return true;
        }
    }

    start = end = score = 0;
    return false;
}
Example #4
0
TEST(SubreadsTest, EndToEnd_Multiple)
{
    // setup
    const string movieName = "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0";

    vector<string> baxFilenames;
    baxFilenames.push_back(tests::Data_Dir + "/" + movieName + ".1.bax.h5");

    const string generatedBam = movieName + ".subreads.bam";
    const string scrapBam = movieName + ".scraps.bam";

    // run conversion
    const int result = RunBax2Bam(baxFilenames, "--subread");
    EXPECT_EQ(0, result);

    // open BAX reader on original data
    HDFBasReader baxReader;
    baxReader.IncludeField("Basecall");
    baxReader.IncludeField("DeletionQV");
    baxReader.IncludeField("DeletionTag");
    baxReader.IncludeField("InsertionQV");
    baxReader.IncludeField("PreBaseFrames");
    baxReader.IncludeField("MergeQV");
    baxReader.IncludeField("SubstitutionQV");
    baxReader.IncludeField("HQRegionSNR");
    // not using SubTag or PulseWidth here

    string baxBasecallerVersion;
    string baxBindingKit;
    string baxSequencingKit;

    const int initOk = baxReader.Initialize(baxFilenames.front());
    EXPECT_EQ(1, initOk);
    if (initOk == 1) {

        if (baxReader.scanDataReader.fileHasScanData && baxReader.scanDataReader.initializedRunInfoGroup) {

            if (baxReader.scanDataReader.runInfoGroup.ContainsAttribute("BindingKit")) {
                HDFAtom<std::string> bkAtom;
                if (bkAtom.Initialize(baxReader.scanDataReader.runInfoGroup, "BindingKit")) {
                    bkAtom.Read(baxBindingKit);
                    bkAtom.dataspace.close();
                }
            }

            if (baxReader.scanDataReader.runInfoGroup.ContainsAttribute("SequencingKit")) {
                HDFAtom<std::string> skAtom;
                if (skAtom.Initialize(baxReader.scanDataReader.runInfoGroup, "SequencingKit")) {
                    skAtom.Read(baxSequencingKit);
                    skAtom.dataspace.close();
                }
            }
        }

        baxReader.GetChangeListID(baxBasecallerVersion);
    }

    // read region table info
    boost::scoped_ptr<HDFRegionTableReader> regionTableReader(new HDFRegionTableReader);
    RegionTable regionTable;
    std::string fn = baxFilenames.front();
    EXPECT_TRUE(regionTableReader->Initialize(fn) != 0);
    regionTable.Reset();
    regionTableReader->ReadTable(regionTable);
    regionTableReader->Close();

    EXPECT_NO_THROW(
    {
        // open BAM file
        BamFile bamFile(generatedBam);

        // check BAM header information
        const BamHeader& header = bamFile.Header();
        EXPECT_EQ(string("1.5"),     header.Version());
        EXPECT_EQ(string("unknown"), header.SortOrder());
        EXPECT_EQ(string("3.0.1"),   header.PacBioBamVersion());
        EXPECT_TRUE(header.Sequences().empty());
        EXPECT_TRUE(header.Comments().empty());
        ASSERT_FALSE(header.Programs().empty());

        const vector<string> readGroupIds = header.ReadGroupIds();
        ASSERT_FALSE(readGroupIds.empty());
        const ReadGroupInfo& rg = header.ReadGroup(readGroupIds.front());

        string rawId = movieName + "//SUBREAD";
        string md5Id;
        MakeMD5(rawId, md5Id, 8);
        EXPECT_EQ(md5Id, rg.Id());

        EXPECT_EQ(string("PACBIO"), rg.Platform());
        EXPECT_EQ(movieName, rg.MovieName());

        EXPECT_TRUE(rg.SequencingCenter().empty());
        EXPECT_TRUE(rg.Date().empty());
        EXPECT_TRUE(rg.FlowOrder().empty());
        EXPECT_TRUE(rg.KeySequence().empty());
        EXPECT_TRUE(rg.Library().empty());
        EXPECT_TRUE(rg.Programs().empty());
        EXPECT_TRUE(rg.PredictedInsertSize().empty());
        EXPECT_TRUE(rg.Sample().empty());

        EXPECT_EQ("SUBREAD", rg.ReadType());
        EXPECT_EQ(baxBasecallerVersion, rg.BasecallerVersion());
        EXPECT_EQ(baxBindingKit, rg.BindingKit());
        EXPECT_EQ(baxSequencingKit, rg.SequencingKit());
        EXPECT_EQ(75, std::stod(rg.FrameRateHz()));
        EXPECT_EQ("dq", rg.BaseFeatureTag(BaseFeature::DELETION_QV));
        EXPECT_EQ("dt", rg.BaseFeatureTag(BaseFeature::DELETION_TAG));
        EXPECT_EQ("iq", rg.BaseFeatureTag(BaseFeature::INSERTION_QV));
        EXPECT_EQ("ip", rg.BaseFeatureTag(BaseFeature::IPD));
        EXPECT_EQ("mq", rg.BaseFeatureTag(BaseFeature::MERGE_QV));
        EXPECT_EQ("sq", rg.BaseFeatureTag(BaseFeature::SUBSTITUTION_QV));
        EXPECT_FALSE(rg.HasBaseFeature(BaseFeature::SUBSTITUTION_TAG));
        EXPECT_EQ(FrameCodec::V1, rg.IpdCodec());

        // compare 1st record from each file
        SMRTSequence baxRecord;
        UInt holeNumber = 0;
        vector<float> hqSnr;

        size_t intervalIdx = 0;
        vector<SubreadInterval> subreadIntervals;

        size_t numTested = 0;
        EntireFileQuery entireFile(bamFile);
        for (BamRecord& bamRecord : entireFile) {
            if (intervalIdx >= subreadIntervals.size())
            {
                while (baxReader.GetNext(baxRecord))
                {
                    holeNumber  = baxRecord.zmwData.holeNumber;

                    ComputeSubreadIntervals(&subreadIntervals, regionTable, holeNumber);

                    /* this is for debugging subread interval problems
                    int hqStart = 0;
                    int hqEnd = 0;
                    int hqScore = 0;
                    LookupHQRegion(holeNumber,
                                   regionTable,
                                   hqStart,
                                   hqEnd,
                                   hqScore);

                    vector<ReadInterval> subreadIntervals_;
                    CollectSubreadIntervals(baxRecord, &regionTable, subreadIntervals_);

                    for (int i = subreadIntervals_.size() - 1; i >= 0; --i)
                    {
                        auto& in = subreadIntervals_[i];
                        int inStart = max(hqStart, in.start);
                        int inEnd   = min(hqEnd,   in.end);
                        if (inEnd <= inStart)
                            subreadIntervals_.erase(subreadIntervals_.begin() + i);
                    }

                    cerr << "hqRegion: " << hqStart << ", " << hqEnd << endl;
                    cerr << "subreadRegions:" << endl;
                    for (const auto& in : subreadIntervals_)
                        cerr << "  l, r: " << in.start << ", " << in.end << endl;

                    cerr << "adapterDerived:" << endl;
                    for (const auto& in : subreadIntervals)
                        cerr << "  l, r: " << in.Start << ", " << in.End << endl;

                    cerr << endl;
                    // */

                    if (subreadIntervals.empty())
                        continue;

                    intervalIdx = 0;

                    hqSnr.clear();
                    hqSnr.push_back(baxRecord.HQRegionSnr('A'));
                    hqSnr.push_back(baxRecord.HQRegionSnr('C'));
                    hqSnr.push_back(baxRecord.HQRegionSnr('G'));
                    hqSnr.push_back(baxRecord.HQRegionSnr('T'));

                    EXPECT_GT(hqSnr[0], 0);
                    EXPECT_GT(hqSnr[1], 0);
                    EXPECT_GT(hqSnr[2], 0);
                    EXPECT_GT(hqSnr[3], 0);

                    goto compare;
                }

                goto cleanup;
            }

compare:
            const BamRecordImpl& bamRecordImpl = bamRecord.Impl();
            EXPECT_EQ(4680,bamRecordImpl.Bin());
            EXPECT_EQ(0,   bamRecordImpl.InsertSize());
            EXPECT_EQ(255, bamRecordImpl.MapQuality());
            EXPECT_EQ(-1,  bamRecordImpl.MatePosition());
            EXPECT_EQ(-1,  bamRecordImpl.MateReferenceId());
            EXPECT_EQ(-1,  bamRecordImpl.Position());
            EXPECT_EQ(-1,  bamRecordImpl.ReferenceId());
            EXPECT_FALSE(bamRecordImpl.IsMapped());

            const int subreadStart = subreadIntervals[intervalIdx].Start;
            const int subreadEnd   = subreadIntervals[intervalIdx].End;

            const string expectedName = movieName + "/" +
                    to_string(holeNumber)   + "/" +
                    to_string(subreadStart) + "_" +
                    to_string(subreadEnd);
            EXPECT_EQ(expectedName, bamRecordImpl.Name());

            using PacBio::BAM::QualityValue;
            using PacBio::BAM::QualityValues;

            const DNALength length = subreadEnd - subreadStart;

            string expectedSequence;
            expectedSequence.assign((const char*)baxRecord.seq + subreadStart, length);

            const string bamSequence = bamRecord.Sequence();
            const QualityValues bamQualities = bamRecord.Qualities();
            EXPECT_EQ(expectedSequence, bamSequence);
            EXPECT_TRUE(bamQualities.empty());

            const QualityValues bamDeletionQVs = bamRecord.DeletionQV();
            const QualityValues bamInsertionQVs = bamRecord.InsertionQV();
            const QualityValues bamMergeQVs = bamRecord.MergeQV();
            const QualityValues bamSubstitutionQVs = bamRecord.SubstitutionQV();

            for (size_t i = 0; i < length; ++i) {
                const size_t pos = subreadStart + i;

                EXPECT_EQ((QualityValue)baxRecord.GetDeletionQV(pos),     bamDeletionQVs.at(i));
                EXPECT_EQ((QualityValue)baxRecord.GetInsertionQV(pos),    bamInsertionQVs.at(i));
                EXPECT_EQ((QualityValue)baxRecord.GetMergeQV(pos),        bamMergeQVs.at(i));
                EXPECT_EQ((QualityValue)baxRecord.GetSubstitutionQV(pos), bamSubstitutionQVs.at(i));
            }

            if (baxRecord.deletionTag)
            {
                string expectedDeletionTags;
                expectedDeletionTags.assign((char*)baxRecord.deletionTag + subreadStart,
                                            (char*)baxRecord.deletionTag + subreadStart + length);
                const string& bamDeletionTags = bamRecord.DeletionTag();
                EXPECT_EQ(expectedDeletionTags, bamDeletionTags);
            }

            if (baxRecord.substitutionTag)
            {
                string expectedSubstitutionTags;
                expectedSubstitutionTags.assign((char*)baxRecord.substitutionTag + subreadStart,
                                            (char*)baxRecord.substitutionTag + subreadStart + length);
                const string& bamSubstitutionTags = bamRecord.SubstitutionTag();
                EXPECT_EQ(expectedSubstitutionTags, bamSubstitutionTags);
            }

            // TODO: IPDs
            const LocalContextFlags ctxFlags = subreadIntervals[intervalIdx].LocalContextFlags;

            EXPECT_EQ(md5Id,        bamRecord.ReadGroupId());
            EXPECT_EQ(movieName,    bamRecord.MovieName());
            EXPECT_EQ(1,            bamRecord.NumPasses());
            EXPECT_EQ(holeNumber,   bamRecord.HoleNumber());
            EXPECT_EQ(subreadStart, bamRecord.QueryStart());
            EXPECT_EQ(subreadEnd,   bamRecord.QueryEnd());
            EXPECT_EQ(hqSnr,        bamRecord.SignalToNoise());
            EXPECT_EQ(ctxFlags,     bamRecord.LocalContextFlags());

            numTested++;
            intervalIdx++;
        }

cleanup:
        EXPECT_GT(numTested, 1);

        // cleanup
        baxReader.Close();
        RemoveFile(generatedBam);
        RemoveFile(scrapBam);

    }); // EXPECT_NO_THROW
Example #5
0
int main(int argc, char* argv[]) {
    string program = "pls2fasta";
    string versionString = VERSION;
    AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString);

	string plsFileName, fastaOutName;
	vector<string> plsFileNames;
	bool trimByRegion, maskByRegion;
	trimByRegion = false;
	maskByRegion = false;
	int argi = 3;
	RegionTable regionTable;
	string regionsFOFNName = "";
	vector<string> regionFileNames;
	bool splitSubreads = true;
	int minSubreadLength = 0;
	bool addSimulatedData = false;
	bool printSimulatedCoordinate = false;
	bool printSimulatedSequenceIndex = false;
  bool printFastq = false;
  bool printCcs   = false;
  int  lineLength = 50;
  int minReadScore = 0;
  vector<int> holeNumbers;
  CommandLineParser clp;
  bool printOnlyBest = false;

  clp.SetProgramName(program);
  clp.SetVersion(versionString);
  clp.RegisterStringOption("in.pls.h5", &plsFileName, "Input pls.h5/bax.h5/fofn file.", true);
  clp.RegisterStringOption("out.fasta", &fastaOutName, "Output fasta/fastq file.", true);
  clp.RegisterPreviousFlagsAsHidden();
  clp.RegisterFlagOption("trimByRegion", &trimByRegion, "Trim away low quality regions.");
  clp.RegisterFlagOption("maskByRegion", &maskByRegion, "Mask low quality regions with 'N'.");
  clp.RegisterStringOption("regionTable", &regionsFOFNName, "Optional HDF file with a /PulseData/Regions dataset.");
  clp.RegisterIntOption("minSubreadLength", &minSubreadLength, "Do not write subreads less than the specified length.", CommandLineParser::PositiveInteger);
  clp.RegisterFlagOption("noSplitSubreads", &splitSubreads, "Do not split reads on adapter sequences.");
  clp.RegisterIntListOption("holeNumber", &holeNumbers, "Only print this hole number (or list of numbers).");
  clp.RegisterFlagOption("fastq", &printFastq, "Print in FASTQ format with quality.");
  clp.RegisterFlagOption("ccs", &printCcs, "Print de novo CCS sequences");
  clp.RegisterIntOption("lineLength", &lineLength, "Specify fasta/fastq line length", CommandLineParser::PositiveInteger);
  clp.RegisterIntOption("minReadScore", &minReadScore, "Minimum read score to print a read.  The score is "
                        "a number between 0 and 1000 and represents the expected accuracy percentage * 10. "
                        "A typical value would be between 750 and 800.  This does not apply to ccs reads.", CommandLineParser::NonNegativeInteger);
  clp.RegisterFlagOption("best", &printOnlyBest, "If a CCS sequence exists, print this.  Otherwise, print the longest"
                         "subread.  This does not support fastq.");
  string description = ("Converts pls.h5/bax.h5/fofn files to fasta or fastq files. Although fasta files are provided"
  " with every run, they are not trimmed nor split into subreads. This program takes "
  "additional annotation information, such as the subread coordinates and high quality regions "
  "and uses them to create fasta sequences that are substrings of all bases called. Most of the time "
  "you will want to trim low quality reads, so you should specify -trimByRegion.");
  clp.SetProgramSummary(description);
                        
  clp.ParseCommandLine(argc, argv);

    cerr << "[INFO] " << GetTimestamp() << " [" << program << "] started."  << endl;
	if (trimByRegion and maskByRegion) {
		cout << "ERROR! You cannot both trim and mask regions. Use one or the other." << endl;
		exit(1);
	}
		 
  if (printFastq) {
    // Setting lineLength to 0 flags to print on one line.
    lineLength = 0;
  }

	if (FileOfFileNames::IsFOFN(plsFileName)) {
		FileOfFileNames::FOFNToList(plsFileName, plsFileNames);
	}
	else {
		plsFileNames.push_back(plsFileName);
	}
	if (regionsFOFNName == "") {
		regionFileNames = plsFileNames;
	}
	else {
		if (FileOfFileNames::IsFOFN(regionsFOFNName)) {
			FileOfFileNames::FOFNToList(regionsFOFNName, regionFileNames);
		}
		else {
			regionFileNames.push_back(regionsFOFNName);
		}
	}



	ofstream fastaOut;
	CrucialOpen(fastaOutName, fastaOut);
	int plsFileIndex;
	HDFRegionTableReader hdfRegionReader;
  sort(holeNumbers.begin(), holeNumbers.end());
	for (plsFileIndex = 0; plsFileIndex < plsFileNames.size(); plsFileIndex++) {
		if (trimByRegion or maskByRegion or splitSubreads) {
			hdfRegionReader.Initialize(regionFileNames[plsFileIndex]);
			hdfRegionReader.ReadTable(regionTable);
			regionTable.SortTableByHoleNumber();
		}
		
		ReaderAgglomerate reader;
    HDFBasReader ccsReader;

    if (printOnlyBest) {
      ccsReader.SetReadBasesFromCCS();
      ccsReader.Initialize(plsFileNames[plsFileIndex]);
    }
    if (printCcs == false) {
  		reader.IgnoreCCS();
    }
    else {
      reader.hdfBasReader.SetReadBasesFromCCS();
    }
		if (addSimulatedData) {
			reader.hdfBasReader.IncludeField("SimulatedCoordinate");
			reader.hdfBasReader.IncludeField("SimulatedSequenceIndex");
		}

        if (reader.SetReadFileName(plsFileNames[plsFileIndex]) == 0) {
          cout << "ERROR, could not determine file type."
               << plsFileNames[plsFileIndex] << endl;
          exit(1);
        }
        if (reader.Initialize() == 0) {
          cout << "ERROR, could not initialize file "
               << plsFileNames[plsFileIndex] << endl;
          exit(1);
        }

		DNALength simulatedCoordinate;
		DNALength simulatedSequenceIndex;
		reader.SkipReadQuality();
		SMRTSequence seq;
		vector<ReadInterval> subreadIntervals;;
    SMRTSequence ccsSeq;
		while (reader.GetNext(seq)) {
      if (printOnlyBest) {
        ccsReader.GetNext(ccsSeq);
      }

      if (holeNumbers.size() != 0 and 
          binary_search(holeNumbers.begin(), holeNumbers.end(), seq.zmwData.holeNumber) == false) {
        continue;
      }

      if (seq.length == 0) {
        continue;
      }

			if (addSimulatedData) {
				reader.hdfBasReader.simulatedCoordinateArray.Read(reader.hdfBasReader.curRead-1, reader.hdfBasReader.curRead, &simulatedCoordinate);
				reader.hdfBasReader.simulatedSequenceIndexArray.Read(reader.hdfBasReader.curRead-1, reader.hdfBasReader.curRead, &simulatedSequenceIndex);
			}

		  if (printCcs == true) {
        if (printFastq == false) {
          seq.PrintSeq(fastaOut);
        }
        else {
          seq.PrintFastq(fastaOut, lineLength);
        }
        continue;
      }	

      //
      // Determine the high quality boundaries of the read.  This is
      // the full read is no hq regions exist, or it is stated to
      // ignore regions.
      //
      DNALength hqReadStart, hqReadEnd;
      int hqRegionScore;
      if (GetReadTrimCoordinates(seq, seq.zmwData, regionTable, hqReadStart, hqReadEnd, hqRegionScore) == false or 
          (trimByRegion == false and maskByRegion == false)) {
        hqReadStart = 0;
        hqReadEnd   = seq.length;
      }
      
      //
      // Mask off the low quality portions of the reads.
      //
			if (maskByRegion) {
        if (hqReadStart > 0) {
          fill(&seq.seq[0], &seq.seq[hqReadStart], 'N');
        }
        if (hqReadEnd != seq.length) {
          fill(&seq.seq[hqReadEnd], &seq.seq[seq.length], 'N');
        }
			}
      


      //
      // Now possibly print the full read with masking.  This could be handled by making a 
      // 
			if (splitSubreads == false) {
        ReadInterval wholeRead(0, seq.length);
        // The set of subread intervals is just the entire read.
        subreadIntervals.clear();
        subreadIntervals.push_back(wholeRead);
			}
			else {
				//
				// Print subread coordinates no matter whether or not reads have subreads.
				//
				subreadIntervals.clear(); // clear old, new intervals are appended.
				CollectSubreadIntervals(seq, &regionTable, subreadIntervals);
      }
      //
      // Output all subreads as separate sequences.
      //
      int intvIndex;
      SMRTSequence bestSubreadSequence;
      int bestSubreadScore = -1;
      int bestSubreadIndex = 0;
      int bestSubreadStart = 0, bestSubreadEnd = 0;
      SMRTSequence bestSubread;
      for (intvIndex = 0; intvIndex < subreadIntervals.size(); intvIndex++) {
        SMRTSequence subreadSequence, subreadSequenceRC;
					
        subreadSequence.subreadStart = subreadIntervals[intvIndex].start;
        subreadSequence.subreadEnd   = subreadIntervals[intvIndex].end;
          
        // 
        // When trimming by region, only output the parts of the
        // subread that overlap the hq region.
        //
        if (trimByRegion == true) {
          subreadSequence.subreadStart = max((DNALength) subreadIntervals[intvIndex].start, hqReadStart);
          subreadSequence.subreadEnd   = min((DNALength) subreadIntervals[intvIndex].end, hqReadEnd);
        }

        if (subreadSequence.subreadStart >= subreadSequence.subreadEnd or 
            subreadSequence.subreadEnd - subreadSequence.subreadStart <= minSubreadLength) {
          //
          // There is no high qualty portion of this subread. Skip it.
          //
          continue;
        }

        if (hqRegionScore < minReadScore) {
          continue;
        }

        //
        // Print the subread, adding the coordinates as part of the title.
        //
        subreadSequence.ReferenceSubstring(seq, subreadSequence.subreadStart, 
                                           subreadSequence.subreadEnd - subreadSequence.subreadStart);
        stringstream titleStream;
        titleStream << seq.title;
        if (splitSubreads) {
          //
          // Add the subread coordinates if splitting on subread.
          //
          titleStream << "/" 
                      << subreadSequence.subreadStart
                      << "_" << subreadSequence.subreadEnd;
        }
          
        // 
        // If running on simulated data, add where the values were simulated from.
        //
        if (addSimulatedData) {
          titleStream << ((FASTASequence*)&seq)->title << "/chrIndex_" 
                      << simulatedSequenceIndex << "/position_"<< simulatedCoordinate;
          ((FASTASequence*)&seq)->CopyTitle(titleStream.str());
        }

        subreadSequence.CopyTitle(titleStream.str());

        //
        // Eventually replace with WriterAgglomerate.
        //
        if (printOnlyBest == false) {
          if (subreadSequence.length > 0) {
            if (printFastq == false) {
              ((FASTASequence*)&subreadSequence)->PrintSeq(fastaOut);
            }
            else {
              subreadSequence.PrintFastq(fastaOut, lineLength);
            }
          }
          delete[] subreadSequence.title;
        }
        else {
          int subreadWeightedScore = subreadSequence.length * hqRegionScore;
          if (subreadWeightedScore > bestSubreadScore) {
            bestSubreadIndex = intvIndex;
            bestSubread = subreadSequence;
            bestSubreadScore = subreadWeightedScore;
          }
        }
      }

      if (printOnlyBest) {
        if (ccsSeq.length > 0) {
          if (printFastq == false) {
            ccsSeq.PrintSeq(fastaOut);
          }
          else {
            ccsSeq.PrintFastq(fastaOut, ccsSeq.length);
          }
        }
        else {
          if (bestSubreadScore >= 0) {
            if (printFastq == false) {
              bestSubread.PrintSeq(fastaOut);
            }
            else {
              bestSubread.PrintFastq(fastaOut, bestSubread.length);
            }
            bestSubread.Free();
          }
        }
        ccsSeq.Free();
      }
      seq.Free();
    }
    reader.Close();
    hdfRegionReader.Close();
  }
  cerr << "[INFO] " << GetTimestamp() << " [" << program << "] ended."  << endl;
}
Example #6
0
bool SubreadConverter::ConvertFile(HDFBasReader* reader,
                                   PacBio::BAM::BamWriter* writer,
                                   PacBio::BAM::BamWriter* scrapsWriter) 
{
    assert(reader);

    // initialize with default values (shared across all unmapped subreads)
    BamRecordImpl bamRecord;

    // read region table info
    std::unique_ptr<HDFRegionTableReader> const regionTableReader(new HDFRegionTableReader);
    RegionTable regionTable;
    string fn = filenameForReader_[reader];
    assert(!fn.empty());
    if (regionTableReader->Initialize(fn) == 0) {
        AddErrorMessage("could not read region table on "+fn);
        return false;
    }
    regionTable.Reset();
    regionTableReader->ReadTable(regionTable);
    regionTableReader->Close();

    // initialize read scores
    InitReadScores(reader);

    // fetch records from HDF5 file
    SMRTSequence smrtRecord;
    while (reader->GetNext(smrtRecord)) {

        // compute subread & adapter intervals
        SubreadInterval hqInterval;
        deque<SubreadInterval> subreadIntervals;
        deque<SubreadInterval> adapterIntervals;
        try {
            hqInterval = ComputeSubreadIntervals(&subreadIntervals,
                                                 &adapterIntervals,
                                                 regionTable,
                                                 smrtRecord.zmwData.holeNumber,
                                                 smrtRecord.length);
        } catch (runtime_error& e) {
            AddErrorMessage(string(e.what()));
            smrtRecord.Free();
            return false;
        }

        // sequencing ZMW
        if (IsSequencingZmw(smrtRecord))
        {
            // write subreads to main BAM file
            for (const SubreadInterval& interval : subreadIntervals)
            {
                // skip invalid or 0-sized intervals
                if (interval.End <= interval.Start)
                    continue;

                if (!WriteSubreadRecord(smrtRecord,
                                        interval.Start,
                                        interval.End,
                                        ReadGroupId(),
                                        static_cast<uint8_t>(interval.LocalContextFlags),
                                        writer))
                {
                    smrtRecord.Free();
                    return false;
                }
            }

            // if scraps BAM file present
            if (scrapsWriter)
            {
                // write 5-end LQ sequence
                if (hqInterval.Start > 0)
                {
                    if (!WriteLowQualityRecord(smrtRecord,
                                               0,
                                               hqInterval.Start,
                                               ScrapsReadGroupId(),
                                               scrapsWriter))
                    {
                        smrtRecord.Free();
                        return false;
                    }
                }

                // write adapters
                for (const SubreadInterval& interval : adapterIntervals) {

                    // skip invalid or 0-sized adapters
                    if (interval.End <= interval.Start)
                        continue;

                    if (!WriteAdapterRecord(smrtRecord,
                                            interval.Start,
                                            interval.End,
                                            ScrapsReadGroupId(),
                                            scrapsWriter))
                    {
                        smrtRecord.Free();
                        return false;
                    }
                }

                // write 3'-end LQ sequence
                if (hqInterval.End < smrtRecord.length)
                {
                    if (!WriteLowQualityRecord(smrtRecord,
                                               hqInterval.End,
                                               smrtRecord.length,
                                               ScrapsReadGroupId(),
                                               scrapsWriter))
                    {
                        smrtRecord.Free();
                        return false;
                    }
                }
            }
        } // sequencing ZMW

        // non-sequencing ZMW
        else
        {
            assert(!IsSequencingZmw(smrtRecord));

            // only write these if scraps BAM present & we are in 'internal mode'
            if (settings_.isInternal && scrapsWriter)
            {
                // write 5-end LQ sequence to scraps BAM
                if (hqInterval.Start > 0)
                {
                    if (!WriteLowQualityRecord(smrtRecord,
                                               0,
                                               hqInterval.Start,
                                               ScrapsReadGroupId(),
                                               scrapsWriter))
                    {
                        smrtRecord.Free();
                        return false;
                    }
                }

                // write subreads & adapters to scraps BAM, sorted by query start
                while (!subreadIntervals.empty() && !adapterIntervals.empty()) {

                    const SubreadInterval& subread = subreadIntervals.front();
                    const SubreadInterval& adapter = adapterIntervals.front();
                    assert(subread.Start != adapter.Start);

                    if (subread.Start < adapter.Start)
                    {
                        if (!WriteFilteredRecord(smrtRecord,
                                                 subread.Start,
                                                 subread.End,
                                                 ScrapsReadGroupId(),
                                                 static_cast<uint8_t>(subread.LocalContextFlags),
                                                 scrapsWriter))
                        {
                            smrtRecord.Free();
                            return false;
                        }

                        subreadIntervals.pop_front();
                    }
                    else
                    {
                        if (!WriteAdapterRecord(smrtRecord,
                                                adapter.Start,
                                                adapter.End,
                                                ScrapsReadGroupId(),
                                                scrapsWriter))
                        {
                            smrtRecord.Free();
                            return false;
                        }
                        adapterIntervals.pop_front();
                    }
                }

                // flush any traling subread intervals
                while (!subreadIntervals.empty())
                {
                    assert(adapterIntervals.empty());
                    const SubreadInterval& subread = subreadIntervals.front();
                    if (!WriteFilteredRecord(smrtRecord,
                                             subread.Start,
                                             subread.End,
                                             ScrapsReadGroupId(),
                                             static_cast<uint8_t>(subread.LocalContextFlags),
                                             scrapsWriter))
                    {
                        smrtRecord.Free();
                        return false;
                    }

                    subreadIntervals.pop_front();
                }

                // flush any remaining adapter intervals
                while (!adapterIntervals.empty())
                {
                    assert(subreadIntervals.empty());
                    const SubreadInterval& adapter = adapterIntervals.front();
                    if (!WriteAdapterRecord(smrtRecord,
                                            adapter.Start,
                                            adapter.End,
                                            ScrapsReadGroupId(),
                                            scrapsWriter))
                    {
                        smrtRecord.Free();
                        return false;
                    }
                    adapterIntervals.pop_front();
                }

                // write 3'-end LQ sequence to scraps BAM
                if (hqInterval.End < smrtRecord.length)
                {
                    if (!WriteLowQualityRecord(smrtRecord,
                                               hqInterval.End,
                                               smrtRecord.length,
                                               ScrapsReadGroupId(),
                                               scrapsWriter))
                    {
                        smrtRecord.Free();
                        return false;
                    }
                }
            }
        } // non-sequencing ZMW

        smrtRecord.Free();
    }

    // if we get here, all OK
    return true; 
} 
Example #7
0
bool HqRegionConverter::ConvertFile(HDFBasReader* reader,
                                    PacBio::BAM::BamWriter* writer,
                                    PacBio::BAM::BamWriter* scrapsWriter) 
{
    assert(reader);

    // read region table info
    std::unique_ptr<HDFRegionTableReader> const regionTableReader(new HDFRegionTableReader);
    RegionTable regionTable;
    std::string fn = filenameForReader_[reader];
    assert(!fn.empty());
    if (regionTableReader->Initialize(fn) == 0) {
        AddErrorMessage("could not read region table on "+fn);
        return false;
    }
    regionTable.Reset();
    regionTableReader->ReadTable(regionTable);
    regionTableReader->Close();

    // initialize read scores
    InitReadScores(reader);

    // fetch records from HDF5 file
    SMRTSequence smrtRecord;
    int hqStart, hqEnd, score;
    while (reader->GetNext(smrtRecord)) {

        // attempt get high quality region
        if (!LookupHQRegion(smrtRecord.zmwData.holeNumber,
                            regionTable,
                            hqStart,
                            hqEnd,
                            score))
        {
            stringstream s;
            s << "could not find HQ region for hole number: " << smrtRecord.zmwData.holeNumber;
            AddErrorMessage(s.str());
            smrtRecord.Free();
            return false;
        }

        // Catch and repair 1-off errors in the HQ region
        hqEnd = (hqEnd == static_cast<int>(smrtRecord.length)-1) ? smrtRecord.length
                                                                 : hqEnd;

        // sequencing ZMW
        if (IsSequencingZmw(smrtRecord))
        {
            // write HQRegion to main BAM file
            if (hqStart < hqEnd)
            {
                if (!WriteRecord(smrtRecord,
                                 hqStart,
                                 hqEnd,
                                 ReadGroupId(),
                                 writer))
                {
                    smrtRecord.Free();
                    return false;
                }
            }

            // if scraps BAM file present
            if (scrapsWriter)
            {
                // write 5'-end LQ sequence
                if (hqStart > 0)
                {
                    if (!WriteLowQualityRecord(smrtRecord,
                                               0,
                                               hqStart,
                                               ScrapsReadGroupId(),
                                               scrapsWriter))
                    {
                        smrtRecord.Free();
                        return false;
                    }
                }

                // write 3'-end LQ sequence
                if (static_cast<size_t>(hqEnd) < smrtRecord.length)
                {
                    if (!WriteLowQualityRecord(smrtRecord,
                                               hqEnd,
                                               smrtRecord.length,
                                               ScrapsReadGroupId(),
                                               scrapsWriter))
                    {
                        smrtRecord.Free();
                        return false;
                    }
                }
            }
        }

        // non-sequencing ZMW
        else
        {
            assert(!IsSequencingZmw(smrtRecord));

            // only write these if scraps BAM present & we are in 'internal mode'
            if (settings_.isInternal && scrapsWriter)
            {
                // write 5'-end LQ sequence
                if (hqStart > 0)
                {
                    if (!WriteLowQualityRecord(smrtRecord,
                                               0,
                                               hqStart,
                                               ScrapsReadGroupId(),
                                               scrapsWriter))
                    {
                        smrtRecord.Free();
                        return false;
                    }
                }

                // write HQRegion to scraps BAM file
                if (hqStart < hqEnd)
                {
                    if (!WriteFilteredRecord(smrtRecord,
                                             hqStart,
                                             hqEnd,
                                             ScrapsReadGroupId(),
                                             scrapsWriter))
                    {
                        smrtRecord.Free();
                        return false;
                    }
                }

                // write 3'-end LQ sequence
                if (static_cast<size_t>(hqEnd) < smrtRecord.length)
                {
                    if (!WriteLowQualityRecord(smrtRecord,
                                               hqEnd,
                                               smrtRecord.length,
                                               ScrapsReadGroupId(),
                                               scrapsWriter))
                    {
                        smrtRecord.Free();
                        return false;
                    }
                }
            }
        }

        smrtRecord.Free();
    }

    // if we get here, all OK
    return true;
}
Example #8
0
int main(int argc, char* argv[]) {
    string refGenomeFileName = "";
    string lengthModelFileName = "";
    string outputModelFileName = "";
    DNALength numBasesPerFile = 0;
    string sourceReadsFileName = "";
    string titleTableFileName = "";
    int numBasH5Files = 1;
    string basH5BaseFileName = "simulated";
    string movieName = "m101211_092754_00114_cSIM_s1_p0";
    bool   doRandGenInit = true;
    bool   usePosMap     = false;
    bool   printPercentRepeat = false;
    string posMapFileName = "";
    vector<string> movieNames;
    bool useLengthModel = false;
    bool useFixedLength = false;
    ofstream posMapFile;
    int scaledLength = 0;
    int fixedLength = 0;
    int nBasFiles = 1;
    bool useLengthsModel = true;
    bool printHelp = false;


    //  Look to see if the refAsReads flag is specified anywhere before
    //  parsing the command line.

    CommandLineParser clp;
    string commandLine;
    string helpString;
    SetHelp(helpString);
    vector<string> fns;

    clp.RegisterStringOption("genome", &refGenomeFileName, "");
    clp.RegisterIntOption("numBasesPerFile", (int*)&numBasesPerFile, "",
                          CommandLineParser::PositiveInteger);
    clp.RegisterStringOption("sourceReads", &sourceReadsFileName, "");
    clp.RegisterStringOption("lengthModel", &lengthModelFileName, "");
    clp.RegisterIntOption("fixedLength", &fixedLength, "",
                          CommandLineParser::PositiveInteger);
    clp.RegisterFlagOption("lengthModel", &useLengthModel, "");
    clp.RegisterStringOption("movieName", &movieName, "");
    clp.RegisterStringOption("titleTable", &titleTableFileName, "");
    clp.RegisterStringOption("baseFileName", &basH5BaseFileName, "");
    clp.RegisterIntOption("nFiles", &nBasFiles, "",
                          CommandLineParser::PositiveInteger);
    clp.RegisterIntOption("meanLength", &scaledLength, "",
                          CommandLineParser::PositiveInteger);
    clp.RegisterStringOption("posMap", &posMapFileName, "");
    clp.RegisterFlagOption("printPercentRepeat", &printPercentRepeat, "");
    clp.RegisterFlagOption("h", &printHelp, "");

    clp.SetHelp(helpString);
    clp.ParseCommandLine(argc, argv, fns);
    clp.CommandLineToString(argc, argv, commandLine);

    clp.SetProgramName("alchemy");

    outputModelFileName = fns[0];
    if (argc <= 1 or printHelp or outputModelFileName == "") {
        cout << helpString << endl;
        exit(0);
    }

    if (usePosMap) {
        CrucialOpen(posMapFileName, posMapFile, std::ios::out);
    }

    if (sourceReadsFileName == "" and fixedLength == 0) {
        useLengthModel = true;
    }

    if (useLengthModel and fixedLength != 0) {
        cout << "ERROR! You must either use a length model or a fixed length." << endl;
        exit(1);
    }

    if (sourceReadsFileName == "" and numBasesPerFile == 0) {
        cout << "ERROR! You must specify either a set of read to use as " << endl
             << "original reads for simulation or the total number of bases " << endl
             << "to simulate in each bas.h5 file." << endl;
        exit(1);
    }

    if (sourceReadsFileName == "" and refGenomeFileName == "") {
        cout << "ERROR! You must specify a genome to sample reads from or a set of read "<<endl
             << "to use as original reads for simulation." << endl;
        exit(1);
    }

    if (fixedLength != 0 and refGenomeFileName == "") {
        cout << "ERROR! You must specify a genome file if using a fixed length." << endl;
        exit(1);
    }

    if ((fixedLength != 0 or scaledLength != 0) and sourceReadsFileName != "") {
        cout << "ERROR! You cannot specify a fixed length nor mean length with a source " << endl
             << "reads file.  The read lengths are taken from the source reads or the length model." << endl;
        exit(1);
    }

    LengthHistogram   lengthHistogram;
    OutputSampleListSet   outputModel(0);
    TitleTable titleTable;

    if (doRandGenInit) {
        InitializeRandomGeneratorWithTime();
    }

    //
    // Read models.
    //
    if (titleTableFileName != "") {
        titleTable.Read(titleTableFileName);
    }


    outputModel.Read(outputModelFileName);

    if (useLengthModel) {
        lengthHistogram.BuildFromAlignmentLengths(outputModel.lengths);
    }


    vector<int> alignmentLengths;
    int meanAlignmentLength;


    if (scaledLength != 0 and useLengthModel) {
        //
        // Scale the histogram so that the average length is 'scaledLength'.
        //

        // 1. Integrate histogram
        long totalLength = 0;
        long totalSamples = 0;
        int hi;
        for (hi = 0; hi < lengthHistogram.lengthHistogram.cdf.size()-1; hi++) {
            int ni;
            ni = lengthHistogram.lengthHistogram.cdf[hi+1] - lengthHistogram.lengthHistogram.cdf[hi];
            totalLength += ni * lengthHistogram.lengthHistogram.data[hi];
        }
        totalSamples = lengthHistogram.lengthHistogram.cdf[lengthHistogram.lengthHistogram.cdf.size()-1];

        float meanSampleLength = totalLength / (1.0*totalSamples);
        float fractionIncrease = scaledLength / meanSampleLength;

        for (hi = 0; hi < lengthHistogram.lengthHistogram.cdf.size(); hi++) {
            lengthHistogram.lengthHistogram.data[hi] *= fractionIncrease;
        }
    }

    FASTAReader inReader, seqReader;
    vector<FASTASequence> reference;
    DNALength refLength = 0;
    int i;
    if (refGenomeFileName != "") {
        inReader.Init(refGenomeFileName);
        inReader.ReadAllSequences(reference);

        for (i = 0; i < reference.size(); i++) {
            refLength += reference[i].length;
        }
    }

    if (sourceReadsFileName !=  "") {
        seqReader.Init(sourceReadsFileName);
    }

    ofstream readsFile;

    //
    // Create and simulate bas.h5 files.
    //
    int baseFileIndex;
    bool readsRemain = true;
    for (baseFileIndex = 0; ((sourceReadsFileName == "" and baseFileIndex < nBasFiles)  // case 1 is reads are generated by file
                             or (sourceReadsFileName != "" and readsRemain)); // case 2 is reads are generated by an input file.
            baseFileIndex++) {
        //
        // Prep the base file for writing.
        //
        stringstream fileNameStrm, movieNameStrm;
        //string movieName = "m000000_000000_00000_cSIMULATED_s";
        movieNameStrm << movieName << baseFileIndex << "_p0";
        string fullMovieName = movieNameStrm.str();
        fileNameStrm  << fullMovieName <<  ".bas.h5";


        HDFBasWriter basWriter;
        HDFRegionTableWriter regionWriter;
        //
        // This is mainly used to create the atributes.
        //
        RegionTable regionTable;
        regionTable.CreateDefaultAttributes();

        basWriter.SetPlatform(Springfield);
        //
        // Use a fixed set of fields for now.
        //

        // These are all pulled from the outputModel.
        basWriter.IncludeField("Basecall");
        basWriter.IncludeField("QualityValue");
        basWriter.IncludeField("SubstitutionQV");
        basWriter.IncludeField("SubstitutionTag");
        basWriter.IncludeField("InsertionQV");
        basWriter.IncludeField("DeletionQV");
        basWriter.IncludeField("DeletionTag");
        basWriter.IncludeField("WidthInFrames");
        basWriter.IncludeField("PreBaseFrames");
        basWriter.IncludeField("PulseIndex");

        vector<unsigned char> qualityValue, substitutionQV, substitutionTag, insertionQV, deletionQV, deletionTag;
        vector<HalfWord> widthInFrames, preBaseFrames, pulseIndex;

        // Just go from 0 .. hole Number
        basWriter.IncludeField("HoleNumber");
        // Fixed to 0.
        basWriter.IncludeField("HoleXY");
        if (usePosMap == false) {
            basWriter.IncludeField("SimulatedSequenceIndex");
            basWriter.IncludeField("SimulatedCoordinate");
        }
        basWriter.SetChangeListID("1.3.0.50.104380");


        DNALength numSimulatedBases  = 0;
        FASTASequence sampleSeq;
        //sampleSeq.length = readLength;
        int maxRetry = 10000000;
        int retryNumber = 0;
        int numReads = 0;
        int readLength = 0;

        while (numBasesPerFile == 0 or numSimulatedBases < numBasesPerFile) {
            DNALength seqIndex, seqPos;
            if (useLengthModel or fixedLength) {
                if (useLengthModel) {
                    lengthHistogram.GetRandomLength(readLength);
                }
                else {
                    readLength = fixedLength;
                }
            }
            if (refGenomeFileName != "") {
                FindRandomPos(reference, seqIndex, seqPos, readLength + (outputModel.keyLength - 1));
                sampleSeq.seq    = &reference[seqIndex].seq[seqPos];
                sampleSeq.length = readLength + (outputModel.keyLength - 1);
                assert(reference[seqIndex].length >= sampleSeq.length);
            }
            else if (sourceReadsFileName != "") {
                if (seqReader.GetNext(sampleSeq) == false) {
                    readsRemain = false;
                    break;
                }
                if (sampleSeq.length < outputModel.keyLength) {
                    continue;
                }
                //
                // Now attempt to parse the position from the fasta title.
                //

                if (useLengthModel) {
                    int tryNumber = 0;
                    readLength = 0;
                    int maxNTries = 1000;
                    int tryBuffer[5] = {-1,-1,-1,-1,-1};
                    while (tryNumber < maxNTries and readLength < outputModel.keyLength) {
                        lengthHistogram.GetRandomLength(readLength);
                        readLength = sampleSeq.length = min(sampleSeq.length, (unsigned int) readLength);
                        tryBuffer[tryNumber%5] = readLength;
                        tryNumber++;
                    }
                    if (tryNumber >= maxNTries) {
                        cout << "ERROR. Could not generate a read length greater than the " << outputModel.keyLength << " requried " <<endl
                             << "minimum number of bases using the length model specified in the alchemy." <<endl
                             << "model.  Something is either wrong with the model or the context length is too large." <<endl;
                        cout << "The last few tries were: " << tryBuffer[0] << " " << tryBuffer[1] << " " << tryBuffer[2] << " " << tryBuffer[3] << " " << tryBuffer[4] << endl;
                        exit(1);
                    }
                }

                readLength = sampleSeq.length;
                vector<string> tokens;
                Tokenize(sampleSeq.title, "|", tokens);
                if (tokens.size() == 4) {
                    seqPos = atoi(tokens[2].c_str());
                    if (titleTableFileName == "") {
                        seqIndex = 0;
                    }
                    else {
                        int index;
                        titleTable.Lookup(tokens[1], index);
                        seqIndex = index;
                    }
                }
                else {
                    seqPos   = 0;
                }
            }

            //
            // If this is the first read printed to the base file, initialize it.
            //
            if (numSimulatedBases == 0) {
                basWriter.Initialize(fileNameStrm.str(), movieNameStrm.str(), Springfield);
                regionWriter.Initialize(basWriter.pulseDataGroup);
            }

            numSimulatedBases += readLength;

            int p;
            // create the sample sequence
            int contextLength = outputModel.keyLength;
            int contextMiddle = contextLength / 2;
            string outputString;

            int nDel = 0;
            int nIns = 0;

            //
            // Simulate to beyond the sample length.
            //
            qualityValue.clear();
            substitutionQV.clear();
            substitutionTag.clear();
            insertionQV.clear();
            deletionQV.clear();
            deletionTag.clear();
            pulseIndex.clear();
            widthInFrames.clear();
            preBaseFrames.clear();
            assert(sampleSeq.length > contextMiddle + 1);
            for (p = contextMiddle;
                    p < sampleSeq.length - contextMiddle - 1; p++) {
                string refContext;
                refContext.assign((const char*) &sampleSeq.seq[p-contextMiddle], contextLength);

                string outputContext;
                int    contextWasFound;
                OutputSample sample;
                int i;
                for (i = 0; i < refContext.size(); i++) {
                    refContext[i] = toupper(refContext[i]);
                }
                outputModel.SampleRandomSample(refContext, sample);

                if (sample.type == OutputSample::Deletion ) {
                    //
                    // There was a deletion.  Advance in reference, then output
                    // the base after the deletion.
                    //
                    p++;
                    ++nDel;
                }

                int cp;
                //
                // Add the sampled context, possibly multiple characters because of an insertion.
                //
                for (i = 0; i < sample.nucleotides.size(); i++) {
                    outputString.push_back(sample.nucleotides[i]);
                    qualityValue.push_back(sample.qualities[i].qv[0]);
                    deletionQV.push_back(sample.qualities[i].qv[1]);
                    insertionQV.push_back(sample.qualities[i].qv[2]);
                    substitutionQV.push_back(sample.qualities[i].qv[3]);
                    deletionTag.push_back(sample.qualities[i].tags[0]);
                    substitutionTag.push_back(sample.qualities[i].tags[1]);
                    pulseIndex.push_back(sample.qualities[i].frameValues[0]);
                    preBaseFrames.push_back(sample.qualities[i].frameValues[1]);
                    widthInFrames.push_back(sample.qualities[i].frameValues[2]);
                }
                nIns += sample.qualities.size() - 1;
            }
            if (outputString.find('N') != outputString.npos or
                    outputString.find('n') != outputString.npos) {
                cout << "WARNING!  The sampled string " << endl << outputString << endl
                     << "should not contain N's, but it seems to.  This is being ignored "<<endl
                     << "for now so that simulation may continue, but this shouldn't happen"<<endl
                     << "and is really a bug." << endl;
                numSimulatedBases -= readLength;
                continue;
            }
            //
            // Ok, done creating the read, now time to create some quality values!!!!!
            //
            SMRTSequence read;
            read.length = outputString.size();
            read.Allocate(read.length);
            memcpy(read.seq, outputString.c_str(), read.length * sizeof(unsigned char));
            assert(qualityValue.size() == read.length * sizeof(unsigned char));
            memcpy(read.qual.data, &qualityValue[0], read.length * sizeof(unsigned char));
            memcpy(read.deletionQV.data, &deletionQV[0], read.length * sizeof(unsigned char));
            memcpy(read.insertionQV.data, &insertionQV[0], read.length * sizeof(unsigned char));
            memcpy(read.substitutionQV.data, &substitutionQV[0], read.length * sizeof(unsigned char));
            memcpy(read.deletionTag, &deletionTag[0], read.length * sizeof(unsigned char));
            memcpy(read.substitutionTag, &substitutionTag[0], read.length * sizeof(unsigned char));
            memcpy(read.pulseIndex, &pulseIndex[0], read.length * sizeof(int));
            memcpy(read.preBaseFrames, &preBaseFrames[0], read.length * sizeof(HalfWord));
            memcpy(read.widthInFrames, &widthInFrames[0], read.length * sizeof(HalfWord));

            //
            // The pulse index for now is just fake data.
            //
            int i;
            for (i = 0; i < read.length; i++) {
                read.pulseIndex[i] = 1;
            }
            read.xy[0] = seqIndex;
            read.xy[1] = seqPos;
            read.zmwData.holeNumber = numReads;

            basWriter.Write(read);
            // Record where this was simulated from.
            if (usePosMap == false) {
                basWriter.WriteSimulatedCoordinate(seqPos);
                basWriter.WriteSimulatedSequenceIndex(seqIndex);
            }
            else {
                posMapFile << fullMovieName << "/" << numReads << "/0_" << read.length << " " << seqIndex << " "<< seqPos;
                if (printPercentRepeat) {
                    DNALength nRepeat = sampleSeq.GetRepeatContent();
                    posMapFile << " " << nRepeat*1.0/sampleSeq.length;
                }
                posMapFile << endl;
            }
            RegionAnnotation region;
            region.row[0] = read.zmwData.holeNumber;
            region.row[1] = 1;
            region.row[2] = 0;
            region.row[3] = read.length;
            region.row[4] = 1000; // Should be enough.
            regionWriter.Write(region);
            region.row[1] = 2; // Rewrite for hq region encompassing everything.
            regionWriter.Write(region);
            if (sourceReadsFileName != "") {
                sampleSeq.Free();
            }
            read.Free();
            ++numReads;
        }
        regionWriter.Finalize(regionTable.columnNames,
                              regionTable.regionTypes,
                              regionTable.regionDescriptions,
                              regionTable.regionSources);
        basWriter.Close();
        numReads = 0;
        //
        // The bas writer should automatically flush on closing.
        //
    }
    if (usePosMap) {
        posMapFile.close();
    }

    for (i = 0; i < reference.size(); i++) {
        reference[i].Free();
    }
}
Example #9
0
int main(int argc, char* argv[]) {

    string inputFileName, outputFileName;

    if (argc < 2) {
        PrintUsage();
        exit(0);
    }
    vector<string> inputFileNames;
    inputFileName = argv[1];
    outputFileName = argv[2];
    int argi = 3;
    RegionTable regionTable;
    string regionsFOFNName = "";
    vector<string> regionFileNames;
    bool splitSubreads = true;
    bool useCCS = false;
    int minSubreadLength = 1;
    while (argi < argc) {
        if (strcmp(argv[argi], "-regionTable") == 0) {
            regionsFOFNName = argv[++argi];
        }
        else if (strcmp(argv[argi], "-noSplitSubreads") == 0) {
            splitSubreads = false;
        }
        else if (strcmp(argv[argi], "-minSubreadLength") == 0) {
            minSubreadLength = atoi(argv[++argi]);
        }
        else if (strcmp(argv[argi], "-useccsdenovo") == 0) {
            useCCS = true;
        }
        else {
            PrintUsage();
            cout << "ERROR! Option " << argv[argi] << " is not supported." << endl;
        }
        argi++;
    }
         
    if (FileOfFileNames::IsFOFN(inputFileName)) {
        FileOfFileNames::FOFNToList(inputFileName, inputFileNames);
    }
    else {
        inputFileNames.push_back(inputFileName);
    }
    if (regionsFOFNName == "") {
        regionFileNames = inputFileNames;
    }
    else {
        if (FileOfFileNames::IsFOFN(regionsFOFNName)) {
            FileOfFileNames::FOFNToList(regionsFOFNName, regionFileNames);
        }
        else {
            regionFileNames.push_back(regionsFOFNName);
        }
    }


    ofstream fastaOut;
    CrucialOpen(outputFileName, fastaOut);
    int plsFileIndex;
    HDFRegionTableReader hdfRegionReader;
    AfgBasWriter afgWriter;
    afgWriter.Initialize(outputFileName);

    for (plsFileIndex = 0; plsFileIndex < inputFileNames.size(); plsFileIndex++) {
        if (splitSubreads) {
            hdfRegionReader.Initialize(regionFileNames[plsFileIndex]);
            hdfRegionReader.ReadTable(regionTable);
            regionTable.SortTableByHoleNumber();
        }
        
        ReaderAgglomerate reader;
        // reader.SkipReadQuality(); // should have been taken care of by *Filter modules
        if (useCCS){
            reader.UseCCS();
        } else {
            reader.IgnoreCCS();
        }
        reader.Initialize(inputFileNames[plsFileIndex]);
        CCSSequence seq; 
        int seqIndex = 0;
        int numRecords = 0;
        vector<ReadInterval> subreadIntervals;
        while (reader.GetNext(seq)){ 
            ++seqIndex;
            if (splitSubreads == false) {
                if (seq.length >= minSubreadLength) {
                    afgWriter.Write(seq);
                }
                seq.Free();
                continue;
            }

            DNALength hqReadStart, hqReadEnd;
            int score;
            GetReadTrimCoordinates(seq, seq.zmwData, regionTable, hqReadStart, hqReadEnd, score);
            subreadIntervals.clear(); // clear old, new intervals are appended.
            CollectSubreadIntervals(seq,&regionTable, subreadIntervals);

            if (seq.length == 0 and subreadIntervals.size() > 0) {
                cout << "WARNING! A high quality interval region exists for a read of length 0." <<endl;
                cout << "  The offending ZMW number is " << seq.zmwData.holeNumber << endl;
                seq.Free();
                continue;
            }

            for (int intvIndex = 0; intvIndex < subreadIntervals.size(); intvIndex++) {
                SMRTSequence subreadSequence;
                
                int subreadStart = subreadIntervals[intvIndex].start > hqReadStart ? 
                                   subreadIntervals[intvIndex].start : hqReadStart;
                int subreadEnd   = subreadIntervals[intvIndex].end < hqReadEnd ?
                                   subreadIntervals[intvIndex].end : hqReadEnd;
                int subreadLength = subreadEnd - subreadStart;

                if (subreadLength < minSubreadLength) continue;

                subreadSequence.subreadStart = subreadStart;
                subreadSequence.subreadEnd   = subreadEnd;
                subreadSequence.ReferenceSubstring(seq, subreadStart, subreadLength);      
            
                stringstream titleStream;
                titleStream << seq.title << "/" << subreadIntervals[intvIndex].start 
                                         << "_" << subreadIntervals[intvIndex].end;
                subreadSequence.CopyTitle(titleStream.str());
                afgWriter.Write(subreadSequence);
                delete[] subreadSequence.title;
            }
            seq.Free();
        }
        reader.Close();
        hdfRegionReader.Close();
    }
}