int main(int argc, char* argv[])
{
    std::string outFileName;
    unsigned contextLength = 5;
    int minSamples = 500;
    int maxSamples = 1000;
    if (argc < 3) {
        PrintUsage();
        std::exit(EXIT_FAILURE);
    }

    int argi = 1;
    std::string cmpH5FileName;
    cmpH5FileName = argv[argi++];
    outFileName = argv[argi++];
    int minAverageQual = 0;
    bool onlyMaxLength = false;

    while (argi < argc) {
        if (strcmp(argv[argi], "-contextLength") == 0) {
            contextLength = atoi(argv[++argi]);
        } else if (strcmp(argv[argi], "-minSamples") == 0) {
            minSamples = atoi(argv[++argi]);
        } else if (strcmp(argv[argi], "-maxSamples") == 0) {
            maxSamples = atoi(argv[++argi]);
        } else if (strcmp(argv[argi], "-onlyMaxLength") == 0) {
            onlyMaxLength = true;
        } else {
            PrintUsage();
            std::cout << "ERROR, bad option: " << argv[argi] << std::endl;
            std::exit(EXIT_FAILURE);
        }
        ++argi;
    }
    std::map<std::string, ScoredLength> maxLengthMap;
    OutputSampleListSet samples(contextLength);
    SMRTSequence read;

    std::ofstream sampleOut;
    CrucialOpen(outFileName, sampleOut, std::ios::out | std::ios::binary);
    int fileNameIndex;

    int numContextsReached = 0;
    int numContexts = 1 << (contextLength * 2);
    ReaderAgglomerate reader;
    samples.keyLength = contextLength;
    HDFCmpFile<CmpAlignment> cmpReader;
    cmpReader.IncludeField("QualityValue");
    cmpReader.IncludeField("DeletionQV");
    cmpReader.IncludeField("InsertionQV");
    cmpReader.IncludeField("SubstitutionQV");
    cmpReader.IncludeField("SubstitutionTag");
    cmpReader.IncludeField("DeletionTag");
    cmpReader.IncludeField("PulseIndex");
    cmpReader.IncludeField("WidthInFrames");
    cmpReader.IncludeField("PreBaseFrames");

    if (cmpReader.Initialize(cmpH5FileName, H5F_ACC_RDWR) == 0) {
        std::cout << "ERROR, could not open the cmp file." << std::endl;
        std::exit(EXIT_FAILURE);
    }
    std::cout << "Reading cmp file." << std::endl;

    CmpFile cmpFile;

    cmpReader.ReadAlignmentDescriptions(cmpFile);
    cmpReader.ReadStructure(cmpFile);
    std::cout << "done reading structure." << std::endl;
    int alignmentIndex;
    int nAlignments = cmpReader.alnInfoGroup.GetNAlignments();
    std::vector<int> alignmentToBaseMap;

    for (alignmentIndex = 0; alignmentIndex < nAlignments and !samples.Sufficient();
         alignmentIndex++) {
        //
        // For ease of use, store the length of the alignment to make another model.
        //

        ByteAlignment alignmentArray;
        cmpReader.ReadAlignmentArray(alignmentIndex, alignmentArray);
        Alignment alignment;
        ByteAlignmentToAlignment(alignmentArray, alignment);
        std::string readSequence, refSequence;
        readSequence.resize(alignmentArray.size());
        refSequence.resize(alignmentArray.size());
        DNASequence readDNA, refDNA;

        ByteAlignmentToQueryString(&alignmentArray[0], alignmentArray.size(), &readSequence[0]);
        ByteAlignmentToRefString(&alignmentArray[0], alignmentArray.size(), &refSequence[0]);
        RemoveGaps(readSequence, readSequence);
        RemoveGaps(refSequence, refSequence);

        readDNA.seq = (Nucleotide*)readSequence.c_str();
        readDNA.length = readSequence.size();
        refDNA.seq = (Nucleotide*)refSequence.c_str();
        refDNA.length = refSequence.size();
        CmpAlignment cmpAlignment;

        cmpReader.ImportReadFromCmpH5(alignmentIndex, cmpAlignment, read);

        CreateAlignmentToSequenceMap(alignmentArray, alignmentToBaseMap);

        if (read.length < contextLength) {
            continue;
        }
        int subreadLength = (cmpFile.alnInfo.alignments[alignmentIndex].GetQueryEnd() -
                             cmpFile.alnInfo.alignments[alignmentIndex].GetQueryStart());
        if (onlyMaxLength == false) {
            samples.lengths.push_back(subreadLength);
        } else {
            int score = (cmpAlignment.GetNMatch() - cmpAlignment.GetNMismatch() -
                         cmpAlignment.GetNInsertions() - cmpAlignment.GetNDeletions());
            std::stringstream nameStrm;
            nameStrm << cmpAlignment.GetMovieId() << "_" << cmpAlignment.GetHoleNumber();
            std::string nameStr = nameStrm.str();
            if (maxLengthMap.find(nameStr) == maxLengthMap.end()) {
                maxLengthMap[nameStr] = ScoredLength(score, subreadLength);
            }
        }

        int sampleEnd = alignmentArray.size() - contextLength / 2;
        int a;
        for (a = contextLength / 2; a < sampleEnd; a++) {

            // Make sure the context begins on a real nucleotide.
            while (a < sampleEnd and ((RefChar[alignmentArray[a]] == ' '))) {
                a++;
            }

            //
            // Move ab back to an index where there are contextLength/2 non-gap
            // characters, counted by nb
            //
            int ab;  //num bases
            int ae;  //alignment end
            ab = a - 1;
            int nb = 0, ne = 0;
            while (true) {
                if (RefChar[alignmentArray[ab]] != ' ') {
                    nb++;
                }
                if (ab == 0 or nb == static_cast<int>(contextLength) / 2) break;
                ab--;
            }

            //
            // Advance ae to an index where there are contextLength/2 non-gap
            // characters, counted by ne.
            //
            ae = a + 1;
            while (ae < static_cast<int>(alignmentArray.size()) and
                   ne < static_cast<int>(contextLength) / 2) {
                if (RefChar[alignmentArray[ae]] != ' ') {
                    ne++;
                }
                ae++;
            }

            //
            // Make sure there are no edge effects that prevent a context of the correct length from being assigned.
            //
            if (nb + ne + 1 != static_cast<int>(contextLength)) {
                continue;
            }
            int ai;
            std::string context;
            for (ai = ab; ai < ae; ai++) {
                if (RefChar[alignmentArray[ai]] != ' ') {
                    context.push_back(RefChar[alignmentArray[ai]]);
                }
            }
            assert(context.size() == contextLength);
            //
            // Now create the context.
            //
            OutputSample sample;

            //
            // This context is a deletion, create that.
            //
            sample.type = OutputSample::Deletion;

            //
            // This context is either an insertion or substitution
            //
            // Look to see if the previous aligned position was an
            // insertion, and move back as far as the insertion extends.
            int aq = a - 1;
            int sampleLength;

            if (QueryChar[alignmentArray[a]] == ' ') {
                sample.type = OutputSample::Deletion;
                sampleLength = 0;
            } else if (RefChar[alignmentArray[aq]] == ' ') {

                while (aq > 0 and RefChar[alignmentArray[aq]] == ' ' and
                       QueryChar[alignmentArray[aq]] != ' ') {
                    aq--;
                }
                sample.type = OutputSample::Insertion;
                sampleLength = a - aq;
            } else if (QueryChar[alignmentArray[a]] == RefChar[alignmentArray[aq]]) {
                sample.type = OutputSample::Match;
                sampleLength = 1;
            } else {
                sample.type = OutputSample::Substitution;
                sampleLength = 1;
            }

            sample.Resize(sampleLength);
            if (sampleLength > 0) {
                int seqPos = alignmentToBaseMap[aq];
                if (seqPos < static_cast<int>(read.length)) {
                    sample.CopyFromSeq(read, seqPos, sampleLength);
                    std::string nucs;
                    for (size_t n = 0; n < sample.nucleotides.size(); n++) {
                        char c = sample.nucleotides[n];
                        assert(c == 'A' or c == 'T' or c == 'G' or c == 'C');
                        nucs.push_back(sample.nucleotides[n]);
                    }
                }
            }
            samples.AppendOutputSample(context, sample);
        }
        read.Free();
    }

    if (onlyMaxLength) {
        std::map<std::string, ScoredLength>::iterator maxScoreIt;
        for (maxScoreIt = maxLengthMap.begin(); maxScoreIt != maxLengthMap.end(); ++maxScoreIt) {
            std::cout << maxScoreIt->second.length << std::endl;
            samples.lengths.push_back(maxScoreIt->second.length);
        }
    }

    samples.Write(sampleOut);

    return 0;
}
Exemplo n.º 2
0
int main(int argc, char* argv[]) {
  string program = "samtoh5";
  string versionString = VERSION;
  AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString);
  string samFileName, cmpFileName, refFileName;
  bool parseSmrtTitle = false;
  bool useShortRefName = false;
  CommandLineParser clp;
  string readType = "standard";
  int verbosity = 0;

  clp.SetProgramName(program);
  clp.SetProgramSummary("Converts in.sam file to out.cmp.h5 file.");
  clp.SetVersion(versionString);

  clp.RegisterStringOption("in.sam", &samFileName, 
                           "Input SAM file.", true);
  clp.RegisterStringOption("reference.fasta", &refFileName, 
                           "Reference used to generate reads.", true);
  clp.RegisterStringOption("out.cmp.h5", &cmpFileName, 
                           "Output cmp.h5 file.", true);
  clp.RegisterPreviousFlagsAsHidden();
  clp.RegisterFlagOption("smrtTitle", &parseSmrtTitle, 
                         "Use this option when converting alignments "
                         "generated from reads produced by the "
                         "pls2fasta from bas.h5 files by parsing read "
                         "coordinates from the SMRT read title.  The title " 
                         "is in the format /name/hole/coordinates, where "
                         "coordinates are in the format \\d+_\\d+, and "
                         "represent the interval of the read that was "
                         "aligned.");
  clp.RegisterStringOption("readType", &readType, 
                         "Set the read type: 'standard', 'strobe', 'CCS', "
                         "or 'cDNA'");
  clp.RegisterIntOption("verbosity", &verbosity, 
                         "Set desired verbosity.", 
                         CommandLineParser::PositiveInteger);
  clp.RegisterFlagOption("useShortRefName", &useShortRefName, 
                         "Use abbreviated reference names obtained "
                         "from file.sam instead of using full names "
                         "from reference.fasta.");
  string description = ("Because SAM has optional tags that have different "
    "meanings in different programs, careful usage is required in order to "
    "have proper output. The \"xs\" tag in bwa-sw is used to show the "
    "suboptimal score, but in PacBio SAM (blasr) it is defined as the start "
    "in the query sequence of the alignment.\nWhen \"-smrtTitle\" is "
    "specified, the xs tag is ignored, but when it is not specified, the "
    "coordinates given by the xs and xe tags are used to define the interval "
    "of a read that is aligned. The CIGAR string is relative to this interval.");
  clp.SetExamples(description);

  clp.ParseCommandLine(argc, argv);

  if (readType != "standard" and readType != "strobe" and 
      readType != "cDNA" and readType != "CCS") {
    cout << "ERROR. Read type '" << readType 
         << "' must be one of either 'standard', 'strobe', 'cDNA' or 'CCS'." 
         << endl;
    exit(1);
  }
    
  cerr << "[INFO] " << GetTimestamp() << " [" << program << "] started." << endl;

  SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMPosAlignment> samReader;
  FASTAReader fastaReader;
  HDFCmpFile<AlignmentCandidate<FASTASequence, FASTASequence> > cmpFile;

  //
  // Initialize input/output files.
  //
  samReader.Initialize(samFileName);
  fastaReader.Initialize(refFileName);
  cmpFile.Create(cmpFileName);

  //
  // Configure the file log.
  //
  string command;
  CommandLineParser::CommandLineToString(argc, argv, command);
  string log = "Convert sam to cmp.h5";
  cmpFile.fileLogGroup.AddEntry(command, log, program, GetTimestamp(), versionString);

  //
  // Set the readType
  //
  cmpFile.SetReadType(readType);

  //
  // Read necessary input.
  //

  vector<FASTASequence> references;
  fastaReader.ReadAllSequences(references);
  
  //
  // This should probably be handled by the alignmentSetAdapter, but
  // time constraints...
  //
  AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMPosAlignment> alignmentSet;
  samReader.ReadHeader(alignmentSet);
 
  //
  // The order of references in vector<FASTASequence> references and
  // AlignmentSet<, , >alignmentSet.references can be different.
  // Rearrange alignmentSet.references such that it is ordered in
  // exactly the same way as vector<FASTASequence> references.
  //
  alignmentSet.RearrangeReferences(references);

  //
  // Always recompute the MD5 values even if they exist in the input
  // sam file. Because MD5 is defined differently in sam and cmp.h5 files.
  // The SAM convention uppercases and normalizes before computing the MD5. 
  // For cmp.h5, we compute the MD5 on the sequence 'as is'.
  // 
  for(int i = 0; i < alignmentSet.references.size(); i++) {
      MakeMD5((const char*)&references[i].seq[0], 
              (unsigned int)references[i].length, alignmentSet.references[i].md5);
  }
 
  //
  // Map short names for references obtained from file.sam to full names obtained from reference.fasta
  //
  map<string, string> shortRefNameToFull;
  map<string, string>::iterator it;
  assert(references.size() == alignmentSet.references.size());
  if (!useShortRefName) {
      for (int i = 0; i < references.size(); i++) {
          string shortRefName = alignmentSet.references[i].GetSequenceName();
          string fullRefName(references[i].title); 
          if (shortRefNameToFull.find(shortRefName) != shortRefNameToFull.end()) {
              cout << "ERROR, Found more than one reference " << shortRefName << "in sam header" << endl;
              exit(1);
          } 
          shortRefNameToFull[shortRefName] = fullRefName;
          alignmentSet.references[i].sequenceName = fullRefName;
      }
  }

  //
  // Start setting up the cmp.h5 file.
  //
  AlignmentSetToCmpH5Adapter<HDFCmpFile<AlignmentCandidate<FASTASequence, FASTASequence> > > alignmentSetAdapter;
  alignmentSetAdapter.Initialize();
  alignmentSetAdapter.StoreReferenceInfo(alignmentSet.references, cmpFile);
  
  //
  // Store the alignments.
  //
  SAMAlignment samAlignment;
  int alignIndex = 0;
  while (samReader.GetNextAlignment(samAlignment)) {
    if (samAlignment.rName == "*") {
      continue;
    }
    if (!useShortRefName) {
        //convert shortRefName to fullRefName
        it = shortRefNameToFull.find(samAlignment.rName);
        if (it == shortRefNameToFull.end()) {
            cout << "ERROR, Could not find " << samAlignment.rName << " in the reference repository." << endl;
            exit(1);
        }
        samAlignment.rName = (*it).second;
    }
    vector<AlignmentCandidate<> > convertedAlignments;
    if (verbosity > 0) {
      cout << "Storing alignment for " << samAlignment.qName << endl;
    }
    SAMAlignmentsToCandidates(samAlignment, 
                              references, alignmentSetAdapter.refNameToIndex,
                              convertedAlignments, parseSmrtTitle, false);

    alignmentSetAdapter.StoreAlignmentCandidateList(convertedAlignments, cmpFile, alignIndex);
    int a;
    for (a = 0; a < convertedAlignments.size(); a++) {
      convertedAlignments[a].FreeSubsequences();
    }
    ++alignIndex;
    /*    if (alignIndex == 100) {
      return 0;
      }*/
  }

  cerr << "[INFO] " << GetTimestamp() << " [" << program << "] ended." << endl;
  return 0;
}
Exemplo n.º 3
0
int main(int argc, char* argv[]) {


	CommandLineParser clp;
	string cmpFileName;
	vector<int> holeNumbers;
	vector<string> patterns, refGroups;
  bool printAll = false;
	clp.RegisterStringOption("cmph5filename", &cmpFileName, "input cmp h5", false);
	clp.RegisterPreviousFlagsAsHidden();
	clp.RegisterIntListOption("holeNumbers", &holeNumbers, "hole numbers to print alignments", false);
	clp.RegisterStringListOption("pattern", &patterns, "patterns to search read names to print alignments", false);	
  clp.RegisterFlagOption("all", &printAll, "Just print all alignments.", false);
  clp.RegisterStringListOption("refgroups", &refGroups, "Reference groups to print.", false);
	clp.ParseCommandLine(argc, argv);

	
	CmpFile cmpFile;
	
	/*
	 * These readers pull information from the same pls file.
	 */
	HDFCmpFile<CmpAlignment> hdfcmpFile;

	if (hdfcmpFile.Initialize(cmpFileName) == 0) {
		cout << "ERROR, could not open the cmp file." << endl;
		exit(1);
	}
	
	hdfcmpFile.Read(cmpFile);
	
	int alignmentIndex;
	for (alignmentIndex = 0; alignmentIndex < cmpFile.alnInfo.alignments.size(); alignmentIndex++) {
		int alnHoleNumber;
		alnHoleNumber = cmpFile.alnInfo.alignments[alignmentIndex].GetHoleNumber();
		int hi;
    bool printThisAlignment = false;

    //
    // Read the alignment string.  All alignments 
    //
    int refGroupId = cmpFile.alnInfo.alignments[alignmentIndex].GetRefGroupId();
    int alnGroupId  = cmpFile.alnInfo.alignments[alignmentIndex].GetAlnGroupId();

    int refGroupIndex = hdfcmpFile.refGroupIdToArrayIndex[refGroupId];
    string readGroupName = hdfcmpFile.alnGroupIdToReadGroupName[alnGroupId];
    int readGroupIndex = hdfcmpFile.refAlignGroups[refGroupIndex]->experimentNameToIndex[readGroupName];

    string refGroupPath = cmpFile.refGroup.path[refGroupIndex];

		for (hi = 0; hi < holeNumbers.size(); hi++) {
			if (alnHoleNumber == holeNumbers[hi]) {
        printThisAlignment = true;
        break;
      }
    }
    int ri;
    for (ri = 0; ri < refGroups.size(); ri++) {
      if (refGroups[ri] == refGroupPath) {
        printThisAlignment = true;
        break;
      }
    }


    if (printThisAlignment or printAll) {
      unsigned int alignStartIndex, alignEndIndex;
      UInt offsetBegin, offsetEnd;
		
      string   refSequence;
      string   readSequence;
      vector<unsigned char> byteAlignment;

      offsetBegin = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetBegin();
      offsetEnd   = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetEnd();
      int alignedSequenceLength = offsetEnd - offsetBegin;
      if (alignedSequenceLength >= 0) {
        refSequence.resize(alignedSequenceLength);
        byteAlignment.resize(alignedSequenceLength);
      }
	
      
      hdfcmpFile.refAlignGroups[refGroupIndex]->readGroups[readGroupIndex]->alignmentArray.Read(offsetBegin, 
                                                                                               offsetEnd, 
                                                                                               &byteAlignment[0]);

      readSequence.resize(byteAlignment.size());
      refSequence.resize(byteAlignment.size());

      ByteAlignmentToQueryString(&byteAlignment[0], byteAlignment.size(), &readSequence[0]);
      ByteAlignmentToRefString(&byteAlignment[0], byteAlignment.size(), &refSequence[0]);				
      string ungappedRead, ungappedRef;
      RemoveGaps(readSequence, ungappedRead);
      RemoveGaps(refSequence, ungappedRef);
      Alignment alignment;
      GappedStringsToAlignment(readSequence, refSequence, alignment);
      DNASequence qAlignedSeq, rAlignedSeq;
      qAlignedSeq.seq = (Nucleotide*) &ungappedRead[0];
      qAlignedSeq.length = ungappedRead.size();
      rAlignedSeq.seq = (Nucleotide*) &ungappedRef[0];
      rAlignedSeq.length = ungappedRef.size();
				
      int qStart = cmpFile.alnInfo.alignments[alignmentIndex].GetQueryStart();
      int tStart = cmpFile.alnInfo.alignments[alignmentIndex].GetRefStart();
      stringstream sstrm;
      sstrm << alnHoleNumber << "/" << qStart << "_" << cmpFile.alnInfo.alignments[alignmentIndex].GetQueryEnd();
      alignment.qName = sstrm.str();
      StickPrintAlignment(alignment, qAlignedSeq, rAlignedSeq, cout, qStart, tStart);
				
    }
  }
}
Exemplo n.º 4
0
int main(int argc, char* argv[]) {
	string cmpFileName;

	CommandLineParser clp;
	bool printTotalAlignedBases = false;
	bool printNReads = false;
	bool printGlobalAccuracy = false;
	bool printNMatches = false;
	bool printAverageAccuracy = false;
	bool printDistBetweenErrorsHist = false;
	float identityCutoff = 0.0;
	bool printBinnedErrorRate = false;
	int minAlignLength = 0;
	bool printBreakdown = false;
	int nBins = 20;
	float totalPercentIdentity = 0;
	string binnedErrorDistributionFileName = "";
  bool countBasesByMovie = false;
  string matchGapFileName = "";
  string readMatchFileName = "";
  int matchGapK = 15;
  string matchRunFileName = "";
  string lengthsFileName = "";
  bool printAverageLength = false;
  bool printMovingAverage = false;
  string movingAverageFileName = "";
  string matchCountFileName = "";
  string errorMatrixName = "";
  bool discardFirstMatch = false;
  bool discardLastMatch  = false;
	clp.RegisterStringOption("cmpH5File", &cmpFileName, "Input cmp.h5 file.", true);
	clp.RegisterPreviousFlagsAsHidden();
	clp.RegisterFlagOption("nAlignments",            &printNReads, "Print the total number of reads.", false);
	clp.RegisterFlagOption("nAlignedBases", &printTotalAlignedBases, "Print the total number of aligned bases", false);
	clp.RegisterFloatOption("identityCutoff",   &identityCutoff, "Print the total number of aligned bases", CommandLineParser::PositiveFloat, false);
	clp.RegisterFlagOption("binnedErrorRate", &printBinnedErrorRate, "Divide each read into N bins, and count the error rate in each bin", false);
	clp.RegisterIntOption("nBins", &nBins, "The number of bins for binned error rate.", CommandLineParser::PositiveInteger, false);
	clp.RegisterIntOption("minAlignLength", &minAlignLength, "Disregard alignments less than n length.", CommandLineParser::PositiveInteger, false);
  clp.RegisterStringOption("matchCount", &matchCountFileName, "Print the count of matches/mismatches/ins/del per pos to file.", false);
	clp.RegisterFlagOption("globalAccuracy", &printGlobalAccuracy, "Print the accuracy across all sequences.", false);
	clp.RegisterFlagOption("averageAccuracy", &printAverageAccuracy, "Print average accuracy of reads.", false);
	clp.RegisterFlagOption("nMatches", &printNMatches, "Print the number of bases matched.", false);
	clp.RegisterFlagOption("breakdown", &printBreakdown, "Print insertion/deletion/mismatch breakdown.", false);
	clp.RegisterFlagOption("errdist", &printDistBetweenErrorsHist, "Print a histogram of distance between errors.", false);
	clp.RegisterFlagOption("bymovie", &countBasesByMovie, "Count the number of bases aligned in each movie.", false);
  clp.RegisterStringOption("matchRun", &matchRunFileName, "Print lengths of runs of matches to file.", false);
  clp.RegisterFlagOption("discardFirstMatch", &discardFirstMatch, "Do not print the first run of matches.", false);
  clp.RegisterFlagOption("discardLastMatch", &discardLastMatch, "Do not print the last run of matches.", false);
  clp.RegisterStringOption("lengths", &lengthsFileName, "Print all subread lengths to a file.", false);
  clp.RegisterFlagOption("averageLength", &printAverageLength, "Print the average subread length.", false);
  clp.RegisterStringOption("printMovingAverageErrorRate", &movingAverageFileName, "Print moving average of accuracy.", false);
	clp.RegisterStringOption("binnedErrorDistribution", &binnedErrorDistributionFileName, 
													 "Print all binned error rates to a file", false);
  clp.RegisterStringOption("matchGap", &matchGapFileName,
                           "Print all matches and the gap between them to file.", false);
  clp.RegisterStringOption("matchesPerRead", &readMatchFileName,
                           "Print statistics for the number of matches found in a read", false);
  clp.RegisterIntOption("matchGapK", &matchGapK,
                        "(15) The minimum word size to match.",
                        CommandLineParser::PositiveInteger, false);
  clp.RegisterStringOption("errmat", &errorMatrixName, "Store error rates by read length", false);
	clp.ParseCommandLine(argc, argv);
  vector<vector<int>  > matchMatrix;
  vector<vector<int>  > errorMatrix;

  matchMatrix.resize(20);
  errorMatrix.resize(20);

	map<int,int> distHist;
  map<string,int> basesByMovie, readsByMovie;

	CmpFile cmpFile;
	
	/*
	 * These readers pull information from the same pls file.
	 */
	HDFCmpFile<CmpAlignment> cmpReader;

	if (cmpReader.Initialize(cmpFileName, H5F_ACC_RDONLY) == 0) {
		cout << "ERROR, could not open the cmp file." << endl;
		exit(0);
	}
  ofstream movingAverageFile;
  if (movingAverageFileName != "") {
    printMovingAverage= true;
    CrucialOpen(movingAverageFileName, movingAverageFile, std::ios::out);
  }
  ofstream matchCountFile;
  if (matchCountFileName != "") {
    CrucialOpen(matchCountFileName, matchCountFile, std::ios::out);
  }

	cmpReader.Read(cmpFile);
	int alignmentIndex;
	long nAlignments = 0;
	long totalAlignedBases = 0;
	long totalMatchedBases = 0;
	long totalAlignedLength = 0;
	long totalInsertion = 0, totalDeletion = 0, totalMismatch = 0;
	ofstream accuracyBinsFile, matchGapFile, readMatchFile, matchRunFile, lengthsFile, errMatFile;
  
	vector<vector<float> > accuracyDistributionBins;

  vector<int> mc, ic, dc, mmc;

	if (binnedErrorDistributionFileName != "") {
		CrucialOpen(binnedErrorDistributionFileName, accuracyBinsFile, std::ios::out);
		accuracyDistributionBins.resize(nBins);
	}
	
  if (lengthsFileName != "") {
    CrucialOpen(lengthsFileName, lengthsFile, std::ios::out);
  }

  if (errorMatrixName != "") {
    CrucialOpen(errorMatrixName, errMatFile, std::ios::out);
  }

	vector<float> accuracyBins;
	if (nBins > 0) {
		accuracyBins.resize(nBins);
	}
  
  if (matchRunFileName != "") {
    CrucialOpen(matchRunFileName, matchRunFile, std::ios::out);
    matchRunFile << "run_length gap_length" << endl;
  }

  if (matchGapFileName != "") {
    CrucialOpen(matchGapFileName, matchGapFile, std::ios::out);
    matchGapFile  << "match_length gap_length" << endl;
  }
	
  if (readMatchFileName != "") {
    CrucialOpen(readMatchFileName, readMatchFile, std::ios::out);
    readMatchFile << "read_length nmatches" << endl;
  }
  
	for (alignmentIndex = 0; alignmentIndex < cmpFile.alnInfo.alignments.size(); alignmentIndex++) {

		vector<unsigned char> byteAlignment;
		UInt offsetBegin, offsetEnd;
		
		offsetBegin = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetBegin();
		offsetEnd   = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetEnd();
		int subreadLength = (cmpFile.alnInfo.alignments[alignmentIndex].GetQueryEnd() - 
                         cmpFile.alnInfo.alignments[alignmentIndex].GetQueryStart());

    if (lengthsFileName != "") {
      lengthsFile << subreadLength << endl;
    }

		int alignedSequenceLength = offsetEnd - offsetBegin;
		string alignedSequence;
		if (alignedSequenceLength >= 0) {
			alignedSequence.resize(alignedSequenceLength);
			byteAlignment.resize(alignedSequenceLength);
		}
	
    		//
		// Read the alignment string.  All alignments 
		//
		//
		// Alignments are groupsd by ref group id then movie id.
		//
		int refGroupId = cmpFile.alnInfo.alignments[alignmentIndex].GetRefGroupId();
		int movieId    = cmpFile.alnInfo.alignments[alignmentIndex].GetMovieId();
			
		//
		// Now locate where this movie is stored.
		//

		if (cmpReader.refGroupIdToArrayIndex.find(refGroupId) == cmpReader.refGroupIdToArrayIndex.end()) {
			cout << "ERROR!  An alignment " << alignmentIndex << " is specified with reference group " << endl
					 << refGroupId << " that is not found as an alignment group." << endl;
			exit(1);
		}
		int refGroupIndex = cmpReader.refGroupIdToArrayIndex[refGroupId];
		int readGroupIndex = cmpReader.refAlignGroups[refGroupIndex]->movieIdToIndex[movieId];			
		cmpReader.refAlignGroups[refGroupIndex]->readGroups[readGroupIndex]->alignmentArray.Read(offsetBegin, 
																																														 offsetEnd, 
																																														 &byteAlignment[0]);
    /*
    if (matchRunFileName != "") {
      PrintAlignment(byteAlignment, matchRunFile);
    }
    */
    int n1 = 0, ngt1 = 0;
    if (matchGapFileName != "" or readMatchFileName != "" or matchRunFileName != "") {
      //
      // Print the matches and the gaps in this alignment to 
      // the file.
      //
      int i = 0;
      int alignEnd = byteAlignment.size() - matchGapK;
      bool matchFound = false;
      int prevMatchEnd = 0;
      int prevMatchLength = 0;
      int nMatches = 0;
      int prevMatchAnyEnd = 0;
      while (i < alignEnd) {
        // Find the first matching character.
        int nonMatchLength = 0;
        while (i < alignEnd and 
               (QueryChar[byteAlignment[i]] == ' ' or
                RefChar[byteAlignment[i]] == ' ' or 
                QueryChar[byteAlignment[i]] != RefChar[byteAlignment[i]])) {
          i++;
          nonMatchLength++;
        }
        if (i >= alignEnd) {
          break;
        }
        // find the end of this match
        int matchStart = i;
        while (i < alignEnd and QueryChar[byteAlignment[i]] != ' ' 
               and RefChar[byteAlignment[i]] != ' ' 
               and QueryChar[byteAlignment[i]] == RefChar[byteAlignment[i]]) {
          i++;
        }
        int matchEnd = i;
               
        if (matchRunFileName != "") {
          bool printAlignment = true;
          if (discardFirstMatch and matchStart == 0) {
            printAlignment = false;
          }
          if (discardLastMatch and matchEnd == alignEnd) {
            printAlignment = false;
          }
          if (printAlignment) {
            matchRunFile << matchEnd - matchStart << " " << nonMatchLength << endl;
          }
          if  (matchEnd - matchStart == 1) {
            n1++;
          }
          else { ngt1++;}
        }
        prevMatchAnyEnd = i;
        // If this match counts as an anchor, process it.
        if (i - matchStart >= matchGapK) {
          // Processing starts by looking to see if a previous anchor was found
          // and if so, printing that and the gap to the current anchor.
          //
          if (matchFound == true) {
            if (matchGapFileName != "") {
              matchGapFile << prevMatchLength << " " << matchStart - prevMatchEnd << endl;
            }
            ++nMatches;
          }
          // Processing ends by storing the length of the match, and where
          // it ended so that the next iteration can use it.
          matchFound = true;
          prevMatchLength = i - matchStart;
          prevMatchEnd    = i;
        }
      }
      if (readMatchFileName != "") {
        readMatchFile << byteAlignment.size() << " " << nMatches << endl;
      }
    }
    if (matchRunFileName != "") {
      //      matchRunFile << "n1: " << n1 << " ngt1 " << ngt1 << endl;
    }

    if (countBasesByMovie) {
      string movieName = cmpFile.movieInfo.name[readGroupIndex];
      if (basesByMovie.find(movieName) ==
          basesByMovie.end()) {
        readsByMovie[movieName] = 0;
        basesByMovie[movieName] = 0;
      }
      basesByMovie[movieName]+= offsetEnd - offsetBegin;
      readsByMovie[movieName]++;
    }
      
		if (printDistBetweenErrorsHist) {
			AddDistancesBetweenErrors(byteAlignment, distHist);
		}

		float readPctIdentity;
		readPctIdentity = ComputePacBioAccuracy(byteAlignment);
		
		if (readPctIdentity < identityCutoff) {
			continue;
		}

		if (byteAlignment.size() < minAlignLength) {
			continue;
		}

		++nAlignments;
    //
    // several stats use total aligned length
    int qStart = cmpFile.alnInfo.alignments[alignmentIndex].GetQueryStart();
    int qEnd   = cmpFile.alnInfo.alignments[alignmentIndex].GetQueryEnd();
    totalAlignedBases += qEnd - qStart;

    if (errorMatrixName != "") {
      StoreErrorRateMatrix(byteAlignment, matchMatrix, errorMatrix);
    }

		if (printBinnedErrorRate) {
			StoreBinnedErrorRate(byteAlignment, accuracyBins);
		}

		if (binnedErrorDistributionFileName != "") {
			AppendBinnedErrorRate(byteAlignment, accuracyDistributionBins);
		}
		
		int nMatch, nMismatch, nIns, nDel;
		CountStats(byteAlignment, nMatch, nMismatch, nIns, nDel);
		float total = nMatch + nMismatch + nIns + nDel;

		totalMatchedBases    += nMatch;
		totalMismatch        += nMismatch;
		totalInsertion       += nIns;
		totalDeletion        += nDel;
		//		totalMatchedBases    += CountNMatches(byteAlignment);
		totalAlignedLength   += byteAlignment.size();
		totalPercentIdentity += ComputePercentIdentity(byteAlignment);

    if (printMovingAverage) {
      vector<float> movingAverage;
      StoreMovingAverage(byteAlignment, movingAverage);
      int i;
      for (i = 0; i < movingAverage.size(); i++) {
        movingAverageFile << i << " " << movingAverage[i] << endl;
      }
    }

    if (matchCountFileName != "") {
      StoreMatchCounts(byteAlignment, mc, ic, dc, mmc);
    }
	}

  if (countBasesByMovie) {
    map<string,int>::iterator mapIt;
    for (mapIt = basesByMovie.begin(); mapIt != basesByMovie.end(); ++mapIt) {
      cout << mapIt->first << " " << readsByMovie[mapIt->first] << " " << mapIt->second << endl;;
    }
  }

  if (matchCountFileName != "") {
    int i;
    for (i = 0; i < mc.size(); i++) {
      matchCountFile << mc[i] << " " << ic[i] << " " << dc[i] << " " << mmc[i] << endl;
    }
    matchCountFile.close();
  }

	if (printNReads) {
		cout << "nAlignments\t"<< nAlignments << endl;
	}

	if (printTotalAlignedBases) {
		cout << "totalAlignedBases\t" << totalAlignedBases << endl;
	}

	if (printBreakdown) {
		float totalTemplate = totalMatchedBases + totalMismatch + totalDeletion + totalInsertion;
		cout << "M_MM_I_D " << totalMatchedBases << " " << totalMismatch << " " << totalInsertion << " " << totalDeletion << " "
				 << totalMatchedBases / totalTemplate << " " 
				 << totalMismatch     / totalTemplate << " " 
				 << totalInsertion    / totalTemplate << " " 
				 << totalDeletion     / totalTemplate << endl;
	}

	if (printNMatches) {
		cout << "totalMatches\t" << totalMatchedBases << endl;
	}

	if (printGlobalAccuracy) {
		cout << "globalAccuracy\t"<< ((float)totalMatchedBases) / totalAlignedLength << endl;
	}

	if (printAverageAccuracy) {
		cout << "averageAccuracy\t" << totalPercentIdentity / nAlignments << endl;
	}

  if (printAverageLength) {
    cout << "averageAlignmentLength\t" << totalAlignedBases / (float)nAlignments << endl;
  }

  
	if (printBinnedErrorRate) {
		int i;
		for (i = 0; i < nBins; i++) {
			accuracyBins[i] /= nAlignments;
			cout << accuracyBins[i] << " ";
		}
		cout << endl;
	}

	if (binnedErrorDistributionFileName != "") {
		int i, b;
		for (i = 0; i < accuracyDistributionBins[0].size(); i++) {
			for (b = 0; b < nBins; b++) {
				accuracyBinsFile << accuracyDistributionBins[b][i] << " ";
			}
			accuracyBinsFile << endl;
		}
		accuracyBinsFile.close();
	}

  if (lengthsFileName != "") {
    lengthsFile.close();
  }
    
	if (printDistBetweenErrorsHist) {
		map<int,int>::iterator histIt;
		for (histIt = distHist.begin(); histIt != distHist.end(); ++histIt) {
			cout << "hist " << histIt->first << " " << histIt->second << endl;
		}
	}
	if (printMovingAverage){ 
    movingAverageFile.close();
  }
  if (errorMatrixName != "") {
    int i, j;
    for (i = 0; i< matchMatrix.size(); i++) {
      for (j = 0; j < matchMatrix[i].size(); j++) {
        if ((matchMatrix[i][j] + errorMatrix[i][j]) > 0) {
          errMatFile << ((float)matchMatrix[i][j]) / ((matchMatrix[i][j] + errorMatrix[i][j])) << " ";
        }
        else {
          errMatFile << " 0 ";
        }
      }
      errMatFile << endl;
    }
    errMatFile.close();
  }

	return 0;
}
Exemplo n.º 5
0
int main(int argc, char* argv[]) {

	string cmpFileName, movieFileName;

	int argi = 3;
	int numMetrics = 8;
	map<string,bool> metricOptions;
	int maxElements = 0;
	//
	// Default is all options are true
	//
	CreateMetricOptions(metricOptions);
	string metricList = "";
  bool useCcs = false;
  bool byRead = false;
  bool failOnMissingData = false;
  CommandLineParser clp;
  bool printVersion = false;

  clp.RegisterStringOption("basFileName", &movieFileName, "The input {bas,pls}.h5 or input.fofn.", true);
  clp.RegisterStringOption("cmpFileName", &cmpFileName, "The cmp.h5 file to load pulse information into.", true);
  clp.RegisterPreviousFlagsAsHidden();
  clp.RegisterStringOption("metrics", &metricList, "The a string delimited list of metrics (with no spaces).The "
                           "valid options are:  QualityValue, ClassifierQV, MergeQV, StartFrame,"
                           "PulseWidth, pkmid, IPD, and Light.");
  clp.RegisterFlagOption("useccs", &useCcs, "Load pulse information for CCS sequences and not raw bases.");
  clp.RegisterFlagOption("byread", &byRead, "Load pulse information by read rather than buffering an entire pls.h5 file.  "
                         "This option will soon be deprecated and on by default.");
  clp.RegisterIntOption("maxElements", &maxElements, "Set a limit on the size of pls/bas file to buffer in.", CommandLineParser::PositiveInteger);
  clp.RegisterFlagOption("failOnMissingData", &failOnMissingData, "Exit if any data fields are missing from the bas.h5 or pls.h5 input that are required to load a metric. Defualt is a warning.");

  clp.SetProgramSummary("Load pulse information such as inter pulse distance, or quality information into the cmp.h5 file."
                        "This allows one to analyze kinetic and quality information by alignment column.");
  clp.ParseCommandLine(argc, argv);

  if (printVersion) {
    cout << VERSION << endl;
    exit(1);
  }

	if (metricList == "") {
		SetDefaultMetricOptions(metricOptions);
	}
  else {
    ParseMetricsList(metricList, metricOptions);
  }


	// 
	// Always read in basecalls since they are used to check the sanity
	// of the alignment indices.
	//
	metricOptions["Basecall"] = true;
	
	//
	// Translate from the metrics to be loaded to the ones that are
	// required to compute them.
	//
	
	vector<string> datasetFields;
	RequirementMap fieldRequirements;
	BuildRequirementMap(fieldRequirements);
	StoreDatasetFieldsFromPulseFields(metricOptions, fieldRequirements, datasetFields);

	
	vector<string> movieFileNames;
	vector<string> fofnMovieNames;

	FileOfFileNames::StoreFileOrFileList(movieFileName, movieFileNames);

	HDFBasReader hdfBasReader;
	HDFPlsReader hdfPlsReader;
  HDFCCSReader<SMRTSequence> hdfCcsReader;

	vector<string> baseFileFields, pulseFileFields;
	int fieldIndex;

	bool useBaseFile = false, usePulseFile = false;

	for (fieldIndex = 0; fieldIndex < datasetFields.size(); fieldIndex++) {
		if (hdfBasReader.ContainsField(datasetFields[fieldIndex])) {
			useBaseFile = true;
			baseFileFields.push_back(datasetFields[fieldIndex]);
		}
	}

	if (maxElements != 0) {
		hdfBasReader.maxAllocNElements = maxElements;
		hdfPlsReader.maxAllocNElements = maxElements;
	}

	//
	// For now, all runs will attempt to use information from a .bas
	// file, since it's assumed that if one has alignments, one has a
	// .bas file.
	//
	useBaseFile = true;
	//
	// Add some default fields.
	//
	hdfBasReader.IncludeField("Basecall");
	hdfBasReader.IncludeField("PulseIndex");

  hdfBasReader.InitializeFields(baseFileFields);

  
	for (fieldIndex = 0; fieldIndex < datasetFields.size(); fieldIndex++) {
		if (hdfPlsReader.ContainsField(datasetFields[fieldIndex])) {
			usePulseFile = true;
			pulseFileFields.push_back(datasetFields[fieldIndex]);
		}
	}
	if (usePulseFile) {
		hdfPlsReader.InitializeFields(pulseFileFields);
	}
	hdfPlsReader.IncludeField("NumEvent");

		
	
	int nMovies = movieFileNames.size();
	int movieIndex;
	MovieNameToArrayIndex movieNameMap;
	//
	// Initialize movies. This accomplishes two tasks.  First, all movie
	// files are opened and initialized, so that if there are data
	// fields missing the program will exit now rather than in the
	// middle of loading pulses.  
	// Next, a list of movie names is created in fofnMovieNames.  The
	// cmp file does not necessarily index movies in the order of the
	// fofn, and so when loading pulses from a movie indexed by a cmp
	// file, one needs to look up the file name of the movie.  This is
	// done by scanning the fofnMovieNames list in order until the movie
	// is found. 

	for (movieIndex = 0; movieIndex < nMovies; movieIndex++) {

    if (!hdfBasReader.Initialize(movieFileNames[movieIndex])) {
      cout << "ERROR, could not initialize HDF file "
           << movieFileNames[movieIndex] << " for reading bases." << endl;
      exit(1);
    }
    else {
      fofnMovieNames.push_back(hdfBasReader.GetMovieName());
      movieNameMap[hdfBasReader.GetMovieName()] = movieIndex;
      hdfBasReader.Close();
    }

		// 
		// The pulse file is optional.  
		//
		if (usePulseFile) {
			if (hdfPlsReader.Initialize(movieFileNames[movieIndex]) == 0) {
				usePulseFile = false;
			}
		}		
	}

	CmpFile cmpFile;
	
	/*
	 * These readers pull information from the same pls file.
	 */
	HDFCmpFile<CmpAlignment> cmpReader;

	if (cmpReader.Initialize(cmpFileName, H5F_ACC_RDWR) == 0) {
		cout << "ERROR, could not open the cmp file." << endl;
		exit(0);
	}
	
  cmpReader.Read(cmpFile);

  string commandLine;
  clp.CommandLineToString(argc, argv, commandLine);
  string versionStr(VERSION);
  AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionStr);
  cmpReader.fileLogGroup.AddEntry(commandLine, "Loading pulse metrics", "loadPulses", GetTimestamp(), versionStr);


	//
	// Group alignment indices by movie so that they may be processed one movie at a time
	// later on.  The movie indices set keeps track of all indices
	// listed in alignment files.  This keeps a reference to all
	// alignments in memory at once.   At the time of writing this, most
	// projects will have at most a few million alignments, and so the
	// size of this structure is modest.
	//

	UInt alignmentIndex;
	map<int, vector<int> > movieIndexSets;

	for (alignmentIndex = 0; alignmentIndex < cmpFile.alnInfo.alignments.size(); alignmentIndex++) {
		movieIndexSets[cmpFile.alnInfo.alignments[alignmentIndex].GetMovieId()].push_back(alignmentIndex);
	}

	vector<float>  computedPulseField;
	string   alignedSequence;
	string   readSequence;
	vector<unsigned char> byteAlignment;
	int m;
	vector<int> baseToAlignmentMap;

	//
	// Load pulses from movies in order they appear in the input fofn.
	//
  int fofnMovieIndex;
  for (fofnMovieIndex = 0; fofnMovieIndex < fofnMovieNames.size(); fofnMovieIndex++) {
    
    if (cmpFile.readType == ReadType::CCS or useCcs) {
      hdfBasReader.SetReadBasesFromCCS();
      hdfCcsReader.Initialize(movieFileNames[fofnMovieIndex]);
    }
    hdfBasReader.Initialize(movieFileNames[fofnMovieIndex]);
		BaseFile  baseFile;
		PulseFile pulseFile;

    if (byRead == false) {
      //
      // Read the entire bas file at once, and then extract values
      // from memory.  This can be faster depending on the chunk
      // size and size of the movie.
      //
      hdfBasReader.ReadBaseFile(baseFile);
      hdfBasReader.Close();
    }
    else {
      //
      // Reads are scanned one by instead of caching all.  It is
      // still necessary to read in some of the datasets entirely,
      // in particular the start positions and hole numbers.
      //


      // This is repeated below for a pulse file.  Since the pulse
      // and base files are separate objects, the scan data is
      // read into each separately.  Somehow later the information
      // should be merged into just one.
      if (hdfBasReader.scanDataReader.fileHasScanData) {
        hdfBasReader.scanDataReader.Read(baseFile.scanData);
      }
      baseFile.readStartPositions.resize(hdfBasReader.nReads+1);
      baseFile.readStartPositions[0] = 0;
      hdfBasReader.GetAllReadLengths(baseFile.readLengths);
      int i;
      assert(baseFile.readLengths.size() + 1 == baseFile.readStartPositions.size());
      for (i = 1; i < hdfBasReader.nReads + 1; i++ ) {
        baseFile.readStartPositions[i] = baseFile.readLengths[i-1] + baseFile.readStartPositions[i-1];
      }
      
      //
      // Although the whole bas file isn't being read in, it is
      // necessary to read in which hole numbers are contained in this
      // bas file since it is possible that the alignment for a
      // particular hole number may be in a different input bas.h5
      // file even if it is the same movie. 
      //
      hdfBasReader.GetAllHoleNumbers(baseFile.holeNumbers);
    }
    set<uint32_t> moviePartHoleNumbers;
    copy(baseFile.holeNumbers.begin(), baseFile.holeNumbers.end(), inserter(moviePartHoleNumbers, moviePartHoleNumbers.begin()));

		
		if (usePulseFile) {
			hdfPlsReader.Initialize(movieFileNames[fofnMovieIndex]);
			hdfPlsReader.IncludeField("NumEvent");
      hdfPlsReader.IncludeField("StartFrame");
      if (byRead == false) { 
        hdfPlsReader.ReadPulseFile(pulseFile);
        hdfPlsReader.Close();
      }
      else {
        if (usePulseFile) {
          pulseFile.pulseStartPositions.resize(hdfBasReader.nReads+1);
          pulseFile.pulseStartPositions[0] = 0;
          hdfPlsReader.GetAllNumEvent(pulseFile.numEvent);
          int i;
          for (i = 1; i < hdfBasReader.nReads + 1; i++ ) {
            pulseFile.pulseStartPositions[i] = pulseFile.numEvent[i-1] + pulseFile.pulseStartPositions[i-1];
          }
          if (hdfPlsReader.scanDataReader.fileHasScanData) {
            hdfPlsReader.scanDataReader.Read(pulseFile.scanData);
          }
        }
      }
		}

    string cmpFileMovieName;

    for (m = 0; m < cmpFile.movieInfo.name.size(); m++) {
      //
      // First find the file name for the movie 'm'
      //
      cmpFileMovieName = cmpFile.movieInfo.name[m];
      int fofnMovieIndex;
      
      if (baseFile.GetMovieName() == cmpFileMovieName) {
				break;
			}
		}

    //
    // If the movie specified in the input.fofn is not found in the
    // cmp file, that indicates something bad is happeing.  Either the
    // input.fofn was not used to generate the cmp.h5 file, or no
    // alignments were found between the input bas.h5 and the
    // reference.  That shouldn't happen.
    // 
		if (m == cmpFile.movieInfo.name.size()) {
			cout << "WARNING: The movie indexed in the compare file " << cmpFileMovieName << " is not listed in the file " << movieFileName << endl;
			continue;
		}
		
		//
		// Open the movie and load its pulses into memory.
		//
		movieIndex = cmpFile.movieInfo.id[m];
		int movieAlignmentIndex;
		float NaN = 0.0/0.0;
    
    UChar missingQualityValue = 255;
    HalfWord missingFrameRateValue    = USHRT_MAX;
    unsigned int missingPulseIndex = UINT_MAX;
    //
    // Since usePulseFile is set when the input file is a pulseFile,
    // and ReadType::CCS becomes the read type when the alignments are
    // ccs, when pulse files are specified for de novo ccs alignments,
    // they will be opened as pulse files.  Since the de novo ccs
    // sequences do not have pulse file information, the auto-reading
    // of pulse files needs to be disabled.  Do that here.
    //
    if (cmpFile.readType == ReadType::CCS or useCcs) {
      usePulseFile = false;
    }


		//
		// Now check the sanity of metric options.
		//

		map<string,bool>::iterator metricIt;
		for (metricIt = metricOptions.begin(); metricIt != metricOptions.end(); ++metricIt) {
			if (metricIt->second == false) {
				continue;
			}
			bool metricMayBeComputed = true;
      if (cmpFile.readType == ReadType::CCS and
          metricIt->first != "QualityValue"  and
          metricIt->first != "DeletionQV" and
          metricIt->first != "SubstitutionQV" and
          metricIt->first != "InsertionQV" and
          metricIt->first != "DeletionTag" and
          metricIt->first != "SubstitutionTag" and
          metricIt->first != "Basecall") {
        cout << "ERROR! The metric " << metricIt->first << " cannot be loaded into de novo ccs alignemnts." << endl;
        //        exit(0);
        metricMayBeComputed = false;
      }
      
			if (metricIt->first == "IPD") {
				//
				// The field requirements for IPD are special. 
				//
				if ((useBaseFile and !hdfBasReader.FieldIsIncluded("PreBaseFrames")) or
						(usePulseFile and (!hdfPlsReader.FieldIsIncluded("StartFrame") and
															 !hdfPlsReader.FieldIsIncluded("WidthInFrames")))) {
					metricMayBeComputed = false;
				}
			}
			else {
				if (fieldRequirements.find(metricIt->first) != fieldRequirements.end()) {
					//
					// There are requirements for this field. Make sure all are
					// present before trying to compute this field.
					//
					int requirementIndex;
					for (requirementIndex = 0; requirementIndex < fieldRequirements[metricIt->first].size(); ++requirementIndex) {
						string requirement;
						requirement = fieldRequirements[metricIt->first][requirementIndex];
				
						if (((useBaseFile == false or ((hdfBasReader.includedFields.find(requirement) == hdfBasReader.includedFields.end() or
 																						hdfBasReader.includedFields[requirement] == false))) and
								 ((usePulseFile == false or (hdfPlsReader.includedFields.find(requirement) == hdfPlsReader.includedFields.end() or
																						 hdfPlsReader.includedFields[requirement] == false))))) {
							metricMayBeComputed = false;
						}
					}
				}
				else {
					//
					// There are no requirements for this field, so it must exist as
					// a datset in either the bas or pls file.
					//
					if ((useBaseFile  == false or ((hdfBasReader.includedFields.find(metricIt->first) == hdfBasReader.includedFields.end() or
																					hdfBasReader.includedFields[metricIt->first] == false))) and
							(usePulseFile == false or (((hdfPlsReader.includedFields.find(metricIt->first) == hdfPlsReader.includedFields.end() or
																					 hdfPlsReader.includedFields[metricIt->first] == false))))) {
						metricMayBeComputed = false;
					}
				}
			}
			if (metricMayBeComputed == false) {
        if (failOnMissingData) {
          cout << "ERROR";
        }
        else {
          cout << "WARNING";
        }
        cout << ": There is insufficient data to compute metric: " << metricIt->first << " in the file " << movieFileNames[fofnMovieIndex] << " ";
        cout << " It will be ignored." << endl;
        if (failOnMissingData) {
          exit(1);
        }
				metricOptions[metricIt->first] = false;
			}
		}

		
		UInt i;
		//
		// This is currently used as a sentinal for showing that an array
		// element does not have a value stored for it, as in deleted
		// bases. 
		//

		
		vector<int> pulseIndexArray;
		vector<unsigned int> statTime;

		if (metricOptions["WhenStarted"]) {
			string whenStarted;
			if (hdfPlsReader.scanDataReader.useWhenStarted == false) {
				cout << "ERROR! Attempting to read WhenStarted from " 
						 << movieFileNames[fofnMovieIndex]
						 << " but the attriubte does not exist." << endl;
				exit(1);
			}
			hdfPlsReader.scanDataReader.ReadWhenStarted(whenStarted);
			
			if (!cmpReader.movieInfoGroup.whenStartedArray.IsInitialized()) {
				cmpReader.movieInfoGroup.whenStartedArray.Initialize(cmpReader.movieInfoGroup.movieInfoGroup, "WhenStarted");
			}

			cmpReader.movieInfoGroup.whenStartedArray.Write(&whenStarted, 1);
		}

    if (AnyFieldRequiresFrameRate(datasetFields)) {
      if (useBaseFile) {
        cmpReader.movieInfoGroup.StoreFrameRate(m, baseFile.GetFrameRate());
      }
      else if (usePulseFile) {
        cmpReader.movieInfoGroup.StoreFrameRate(m, pulseFile.GetFrameRate());
      }
    }
				
		//
		// An index set is a set of indices into the alignment array that
		// are of reads generated by this movie.  Load pulses for all
		// alignments generated for this movie.
		//

		//
		// Movie index sets should be sorted by alignment index. Build a lookup table for this.
		//
		
		std::vector<std::pair<int,int> > toFrom;
		for (movieAlignmentIndex = 0; movieAlignmentIndex < movieIndexSets[movieIndex].size(); movieAlignmentIndex++) {
			alignmentIndex = movieIndexSets[movieIndex][movieAlignmentIndex];
			toFrom.push_back(std::pair<int,int>(cmpFile.alnInfo.alignments[alignmentIndex].GetAlignmentId(), movieAlignmentIndex));
		}
		// orders by first by default.
		std::sort(toFrom.begin(), toFrom.end());

    //
    // Load metrics for alignments from movie 'movieIndex'.
    //
    cout << "loading " <<  movieIndexSets[movieIndex].size() << " alignments for movie " << movieIndex << endl;
		for (movieAlignmentIndex = 0; movieAlignmentIndex < movieIndexSets[movieIndex].size(); movieAlignmentIndex++) {
			alignmentIndex = movieIndexSets[movieIndex][toFrom[movieAlignmentIndex].second];


			//
			// Alignments are groupsd by ref group id then movie id.
			//
			int refGroupId  = cmpFile.alnInfo.alignments[alignmentIndex].GetRefGroupId();
			int movieId     = cmpFile.alnInfo.alignments[alignmentIndex].GetMovieId();
      UInt holeNumber = cmpFile.alnInfo.alignments[alignmentIndex].GetHoleNumber();

      //
      // Since the movie may be split into multiple parts, look to see
      // if this hole number is one of the ones covered by this
      // set. If it is not, just continue. It will be loaded on
      // another pass through a different movie part.
      //
      if (moviePartHoleNumbers.find(holeNumber) == moviePartHoleNumbers.end()) {
        continue;
      }

			//
			// Now locate where this movie is stored.
			//

			if (cmpReader.refGroupIdToArrayIndex.find(refGroupId) == cmpReader.refGroupIdToArrayIndex.end()) {
				cout << "ERROR!  An alignment " << alignmentIndex << " is specified with reference group " << endl
						 << refGroupId << " that is not found as an alignment group." << endl;
				exit(1);
			}
			int refGroupIndex = cmpReader.refGroupIdToArrayIndex[refGroupId];
			
			//
			// Now find the group containing the alignment for this movie.
			//
			if (cmpReader.refAlignGroups[refGroupIndex]->movieIdToIndex.find(movieId) ==
					cmpReader.refAlignGroups[refGroupIndex]->movieIdToIndex.end()) {
				cout << "ERROR!  An alignment " << alignmentIndex << " is specified with movie index " << endl
						 << movieId << " that is not found in the alignment group " << refGroupIndex << endl;
				exit(1);
			}

			int readGroupIndex = cmpReader.refAlignGroups[refGroupIndex]->movieIdToIndex[movieId];
      
			//
			// First do sanity check on the read to make sure the pules and the bases match.
			//

			//
			// Look to see if the output HDF arrays need to be created.
			//
			UInt offsetBegin, offsetEnd;
		
			offsetBegin = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetBegin();
			offsetEnd   = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetEnd();
		
			int alignedSequenceLength = offsetEnd - offsetBegin;
			if (alignedSequenceLength >= 0) {
				alignedSequence.resize(alignedSequenceLength);
				byteAlignment.resize(alignedSequenceLength);
			}
	
			//
			// Read the alignment string.  All alignments 
			//
			cmpReader.refAlignGroups[refGroupIndex]->readGroups[readGroupIndex]->alignmentArray.Read(offsetBegin, 
																																															 offsetEnd, 
																																															 &byteAlignment[0]);
		
			//
			// Convert to something we can compare easily.
			//
			ByteAlignmentToQueryString(&byteAlignment[0], byteAlignment.size(), &alignedSequence[0]);


			//
			// Do a sanity check to make sure the pulses and the alignment
			// make sense.  The main check is to see if the query sequence
			// in the alignment is the same as the query sequence in the
			// read. 
			//
		
			//
			// First pull out the bases corresponding to this read.
			//
			int queryStart = cmpFile.alnInfo.alignments[alignmentIndex].GetQueryStart();
			int queryEnd   = cmpFile.alnInfo.alignments[alignmentIndex].GetQueryEnd();

      // Build a map of where 
      CreateSequenceToAlignmentMap(byteAlignment, 
                                   baseToAlignmentMap);


			//
			// Condense gaps in the alignment for easy comparison.
			//
      //
      RemoveGaps(alignedSequence, alignedSequence);
      
			
			//
			// Query the cmp file for a way to look up a read based on
			// coordinate information.  For Astro reads, the coords are
			// based on x and y.  For Springfield, it is read index.  The
			// base files should be able to look up reads by x,y or by
			// index. 
			//
			int readIndex;

          
      if (cmpFile.platformId == Astro) {
        cout << "ASTRO pulse loading is deprecated." << endl;
        exit(0);
      }

      if (baseFile.LookupReadIndexByHoleNumber(holeNumber, readIndex) == false) {
          cout << "ERROR! Alignment has hole number " << holeNumber << " that is not in the movie. " << endl;
          assert(0);
      }

			int readStart, readLength, alignBaseStart, alignBaseEnd, alignBaseLength;
			readStart       = baseFile.readStartPositions[readIndex];
			readLength      = baseFile.readStartPositions[readIndex+1] - baseFile.readStartPositions[readIndex];
			alignBaseStart  = readStart + queryStart;
			alignBaseEnd    = readStart + queryEnd;
			alignBaseLength = alignBaseEnd - alignBaseStart;

			int pulseStart;
			if (usePulseFile) {
				pulseStart      = pulseFile.pulseStartPositions[readIndex];
			}

	    
      //
      // This maps from pulse to a base, since there are more pulses
      // called than bases, and the is one pulse for every base.
      //
			pulseIndexArray.resize(readLength);

      
      SMRTSequence sourceRead;
      unsigned int numPasses;
      //
      // These are not allocated in the regular allocate function
      // since they are only used in loadPulses. (maybe I should
      // subclass SMRTSequence here).
      //
      
      if (byRead) {
        // Read in the data from the bas file if it exsts.
        if (useBaseFile) {
          hdfBasReader.GetReadAt(readIndex, sourceRead);
          if (cmpFile.readType == ReadType::CCS or useCcs) {
            numPasses = hdfCcsReader.GetNumPasses(readIndex);
          }
        }
        // Read in the data from the pls file if it exists.
        if (usePulseFile) {
          hdfPlsReader.GetReadAt(readIndex, sourceRead.pulseIndex, sourceRead);
        }
      }
      else {
        //
        // The entire base/pulse file was read in, so copy data from that into a read
        // For the data used in the read, it is possible to simply
        // reference the data,  but for the pls file it is necessary
        // to copy since there is a packing of data.
        //
        if (useBaseFile) {
          baseFile.CopyReadAt(readIndex, sourceRead);
          if (cmpFile.readType == ReadType::CCS or useCcs) {
            numPasses = hdfCcsReader.GetNumPasses(readIndex);
          }
        }
        if (usePulseFile) {
          //
          // Copy the subset of pulses that correspond to the ones called as bases.
          //
          int i;
          for (i = 0; i < readLength; i++) {
            pulseIndexArray[i] = pulseStart + baseFile.pulseIndex[readStart + i];
          }
          pulseFile.CopyReadAt(readIndex, &pulseIndexArray[0], sourceRead);
        }
      }

      readSequence.resize(queryEnd - queryStart);
      CapQualityValues(sourceRead);
			copy((char*) (sourceRead.seq + queryStart),
					 (char*) (sourceRead.seq + queryEnd),
					 readSequence.begin());
      
			bool stringsMatch = true;
			if (alignedSequence.size() != readSequence.size() or alignedSequence != readSequence) {
				cout << "ERROR, the query sequence does not match the aligned query sequence." << endl;
				cout << "HoleNumber: "<< holeNumber << ", MovieName: " << cmpFileMovieName;
        cout << " ,ReadIndex: " << (int) readIndex << 
				cout << ", qStart: "<< queryStart << ", qEnd: " << queryEnd << endl;
				cout << "Aligned sequence: "<< endl;
				cout << alignedSequence << endl;
				cout << "Original sequence: " << endl;
				cout << readSequence << endl;
				assert(0);
      }

			/*
			 * Compute any necessary data fields.  These usually involve
			 * using differences of pulse indices, pulse widths, etc..
			 * Missing fields are stored as 0's. 
			 */

			vector<float> readPulseMetric;
            vector<float> floatMetric;
      vector<UChar> qvMetric;
      vector<HalfWord> frameRateMetric;
      vector<uint32_t> timeMetric;
			int ungappedAlignedSequenceLength = alignedSequence.size();
			
      floatMetric.resize(alignedSequenceLength+1);
      readPulseMetric.resize(alignedSequenceLength+1);
      qvMetric.resize(alignedSequenceLength+1);
      frameRateMetric.resize(alignedSequenceLength+1);
      timeMetric.resize(alignedSequenceLength+1);

			UInt i;
			UInt pi;

			HDFCmpExperimentGroup* expGroup = cmpReader.refAlignGroups[refGroupIndex]->readGroups[readGroupIndex];

      if (cmpFile.readType == ReadType::CCS or useCcs) {
        if (!cmpReader.alnInfoGroup.numPasses.IsInitialized()) {
          cmpReader.alnInfoGroup.InitializeNumPasses();
        }
        cmpReader.alnInfoGroup.numPasses.WriteToPos(&numPasses, 1, alignmentIndex);
      }
      
			if (metricOptions["StartTimeOffset"] == true) {
				if (!expGroup->startTimeOffset.IsInitialized()) {
					expGroup->startTimeOffset.Initialize(expGroup->experimentGroup, "StartTimeOffset");
				}
        unsigned int readStartTimeOffset = sourceRead.startFrame[queryStart];
				expGroup->startTimeOffset.WriteToPos(&readStartTimeOffset, 1, alignmentIndex);
			}

			if (metricOptions["QualityValue"] == true) {
				if (!expGroup->qualityValue.IsInitialized()) {
					expGroup->qualityValue.Initialize(expGroup->experimentGroup, "QualityValue");
				}
				
				// Store start time normalized to frame rate.
        fill(qvMetric.begin(), qvMetric.end(), missingQualityValue);

				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
          qvMetric[baseToAlignmentMap[i]] = sourceRead.qual[queryStart + i];
        }
				qvMetric[qvMetric.size()-1] = 0;
				expGroup->qualityValue.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin);
			}

			if (metricOptions["InsertionQV"] == true) {
				if (!expGroup->insertionQV.IsInitialized()) {
					expGroup->insertionQV.Initialize(expGroup->experimentGroup, "InsertionQV");
				}
				
				// Store start time normalized to frame rate.
        fill(qvMetric.begin(), qvMetric.end(), missingQualityValue);
				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
          qvMetric[baseToAlignmentMap[i]] = sourceRead.insertionQV[queryStart+ i];
				}
				qvMetric[qvMetric.size()-1] = 0;
				expGroup->insertionQV.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin);
			}

			if (metricOptions["MergeQV"] == true) {
				if (!expGroup->mergeQV.IsInitialized()) {
					expGroup->mergeQV.Initialize(expGroup->experimentGroup, "MergeQV");
				}
				
				// Store start time normalized to frame rate.
        fill(qvMetric.begin(), qvMetric.end(), missingQualityValue);
				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
          qvMetric[baseToAlignmentMap[i]] = sourceRead.mergeQV[queryStart+ i];
				}
				qvMetric[qvMetric.size()-1] = 0;
				expGroup->mergeQV.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin);
			}

			if (metricOptions["DeletionQV"] == true) {
				if (!expGroup->deletionQV.IsInitialized()) {
					expGroup->deletionQV.Initialize(expGroup->experimentGroup, "DeletionQV");
				}
				
				// Store start time normalized to frame rate.
        fill(qvMetric.begin(), qvMetric.end(), missingQualityValue);
				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
          qvMetric[baseToAlignmentMap[i]] = sourceRead.deletionQV[queryStart+i];
				}
				qvMetric[qvMetric.size()-1] = 0;
				expGroup->deletionQV.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin);
			}

			if (metricOptions["DeletionTag"] == true) {
				if (!expGroup->deletionTag.IsInitialized()) {
					expGroup->deletionTag.Initialize(expGroup->experimentGroup, "DeletionTag");
				}
        vector<char> readDeletionTagMetric;
        readDeletionTagMetric.resize(readPulseMetric.size());
				// Store start time normalized to frame rate.
				for (i = 0; i < readDeletionTagMetric.size()-1; i++ ) {
					readDeletionTagMetric[i] = '-';
				}
        readDeletionTagMetric[i] = '\0';
				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
          assert(baseToAlignmentMap[i] < readDeletionTagMetric.size());
					readDeletionTagMetric[baseToAlignmentMap[i]] = sourceRead.deletionTag[queryStart+i];
				}
				readDeletionTagMetric[readDeletionTagMetric.size()-1] = 0;
				expGroup->deletionTag.WriteToPos(&readDeletionTagMetric[0], readDeletionTagMetric.size(), offsetBegin);
			}

			if (metricOptions["PulseIndex"] == true) {
        
				if (!expGroup->pulseIndex.IsInitialized()) {
					expGroup->pulseIndex.Initialize(expGroup->experimentGroup, "PulseIndex");
				}
				vector<uint32_t> readPulseIndexMetric;
        fill(readPulseIndexMetric.begin(), readPulseIndexMetric.end(), missingPulseIndex);
        readPulseIndexMetric.resize(readPulseMetric.size());
				// Store start time normalized to frame rate.
        assert(readPulseIndexMetric.size() > 0);
				for (i = 0; i < readPulseIndexMetric.size(); i++ ) {
          readPulseIndexMetric[i] = 0;
				}
				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
					readPulseIndexMetric[baseToAlignmentMap[i]] = sourceRead.pulseIndex[queryStart+i];
				}
				readPulseIndexMetric[readPulseIndexMetric.size()-1] = 0;
				expGroup->pulseIndex.WriteToPos(&readPulseIndexMetric[0], readPulseIndexMetric.size(), offsetBegin);
			}

			if (metricOptions["SubstitutionTag"] == true) {
				if (!expGroup->substitutionTag.IsInitialized()) {
					expGroup->substitutionTag.Initialize(expGroup->experimentGroup, "SubstitutionTag");
				}
				vector<char> readSubstitutionTagMetric;
        readSubstitutionTagMetric.resize(readPulseMetric.size());
				// Store start time normalized to frame rate.
				for (i = 0; i < readSubstitutionTagMetric.size()-1; i++ ) {
          readSubstitutionTagMetric[i] = '-';
				}
        readSubstitutionTagMetric[i] = '\0';
				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
					readSubstitutionTagMetric[baseToAlignmentMap[i]] = sourceRead.substitutionTag[queryStart+i];
				}
				readSubstitutionTagMetric[readSubstitutionTagMetric.size()-1] = 0;
				expGroup->substitutionTag.WriteToPos(&readSubstitutionTagMetric[0], readSubstitutionTagMetric.size(), offsetBegin);
      }

			if (metricOptions["SubstitutionQV"] == true) {
				if (!expGroup->substitutionQV.IsInitialized()) {
					expGroup->substitutionQV.Initialize(expGroup->experimentGroup, "SubstitutionQV");
				}
				
				// Store start time normalized to frame rate.
        fill(qvMetric.begin(), qvMetric.end(), missingQualityValue);

				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
          qvMetric[baseToAlignmentMap[i]] = sourceRead.substitutionQV[queryStart+i];
				}
				qvMetric[qvMetric.size()-1] = 0;
				expGroup->substitutionQV.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin);
			}

			if (metricOptions["ClassifierQV"] == true) {
				
				if (!expGroup->classifierQV.IsInitialized()) {
					expGroup->classifierQV.Initialize(expGroup->experimentGroup, "ClassifierQV");			
				}
				// Store start time normalized to frame rate.
        fill(floatMetric.begin(), floatMetric.end(), missingQualityValue);

				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
					floatMetric[baseToAlignmentMap[i]] = sourceRead.classifierQV[i+queryStart];
				}
				floatMetric[floatMetric.size()-1] = 0;
				expGroup->classifierQV.WriteToPos(&floatMetric[0], floatMetric.size(), offsetBegin);
			}

			if (metricOptions["StartFrame"] == true) {
				if (!expGroup->startTime.IsInitialized()) {
					expGroup->startTime.Initialize(expGroup->experimentGroup, "StartFrame");			
				}

        if (useBaseFile) {
          sourceRead.startFrame = new unsigned int[sourceRead.length];
          copy(sourceRead.preBaseFrames, &sourceRead.preBaseFrames[sourceRead.length], sourceRead.startFrame);
          for (i = 0; i < sourceRead.length-1; i++) {
            sourceRead.startFrame[i+1] += sourceRead.widthInFrames[i];
          }
          partial_sum(sourceRead.startFrame, &sourceRead.startFrame[sourceRead.length],  sourceRead.startFrame);
        }
				
				// Store start time normalized to frame rate.
        fill(timeMetric.begin(), timeMetric.end(), missingPulseIndex);
				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
					timeMetric[baseToAlignmentMap[i]] = sourceRead.startFrame[i+queryStart];
				}
				timeMetric[timeMetric.size()-1] = 0;
				expGroup->startTime.WriteToPos(&timeMetric[0], timeMetric.size(), offsetBegin);
			}

			if (metricOptions["PulseWidth"] == true) {
				if (!expGroup->pulseWidth.IsInitialized()) {
					expGroup->pulseWidth.Initialize(expGroup->experimentGroup, "PulseWidth");			
				}
				// Store start time normalized to frame rate.
        fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue);

        //
        // For legacy reasons, it's possible the width in frames is
        // stored in the bas file. If this is the case, use the width
        // in frames there.  Otherwise, use the width in frames stored
        // in the pls file.
        for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
          frameRateMetric[baseToAlignmentMap[i]] = sourceRead.widthInFrames[queryStart + i];
        }
				frameRateMetric[frameRateMetric.size()-1] = 0;
				expGroup->pulseWidth.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin);
			}

			if (metricOptions["PreBaseFrames"] == true) {
				if (!expGroup->preBaseFrames.IsInitialized()) {
					expGroup->preBaseFrames.Initialize(expGroup->experimentGroup, "PreBaseFrames");
				}
				// Compute width in frames normalized to frame rate.
        fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue);
				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
					frameRateMetric[baseToAlignmentMap[i]] = sourceRead.preBaseFrames[i+queryStart];
				}
				frameRateMetric[frameRateMetric.size()-1] = 0;
				expGroup->preBaseFrames.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin);
			}

			if (metricOptions["WidthInFrames"] == true) {
				if (!expGroup->widthInFrames.IsInitialized()) {
					expGroup->widthInFrames.Initialize(expGroup->experimentGroup, "WidthInFrames");
				}
				// Compute width in frames normalized to frame rate.
        fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue);

				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
					if (usePulseFile) {
						frameRateMetric[baseToAlignmentMap[i]] = sourceRead.widthInFrames[i+queryStart];
					}
					else {
						frameRateMetric[baseToAlignmentMap[i]] = sourceRead.widthInFrames[i+queryStart];
          }
				}
				frameRateMetric[frameRateMetric.size()-1] = 0;
				expGroup->widthInFrames.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin);
			}

			if (metricOptions["pkmid"] == true) {

				if (!expGroup->pkmid.IsInitialized()) {
					expGroup->pkmid.Initialize(expGroup->experimentGroup, "pkmid");
				}

				for (i = 0; i < readPulseMetric.size(); i++ ) {
          readPulseMetric[i] = NaN;
				}
				
				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
          readPulseMetric[baseToAlignmentMap[i]] = sourceRead.midSignal[i+queryStart];
        }
				readPulseMetric[readPulseMetric.size()-1] = 0;
				expGroup->pkmid.WriteToPos(&readPulseMetric[0], readPulseMetric.size(), offsetBegin);
			}

			if (metricOptions["IPD"] == true) {
				if (!expGroup->ipd.IsInitialized()) {
					expGroup->ipd.Initialize(expGroup->experimentGroup, "IPD");
				}
        fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue);				

				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {

					//
					// The IPD is undefined for the first base in a read.
          //
					if (usePulseFile ) {
						if (queryStart == 0 and i == 0) {
              frameRateMetric[baseToAlignmentMap[i]] = 0;
						}
						else {
              frameRateMetric[baseToAlignmentMap[i]] = (sourceRead.startFrame[i+queryStart]  
                                                        - sourceRead.startFrame[i+queryStart-1]
                                                        - sourceRead.widthInFrames[i+queryStart-1]);
						}
					}
					else if (useBaseFile) {
            frameRateMetric[baseToAlignmentMap[i]] = sourceRead.preBaseFrames[i + queryStart];
					}
        }
				frameRateMetric[frameRateMetric.size()-1] = 0;
				expGroup->ipd.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin);			
			}

			
			if (metricOptions["Light"] == true) {
				if (!expGroup->light.IsInitialized()) {
					expGroup->light.Initialize(expGroup->experimentGroup, "Light");
				}
        fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue);
				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
          frameRateMetric[baseToAlignmentMap[i]] = sourceRead.meanSignal[i+queryStart];
          frameRateMetric[baseToAlignmentMap[i]] = (frameRateMetric[baseToAlignmentMap[i]] * 
                                                    sourceRead.widthInFrames[i+queryStart]);
				}
				frameRateMetric[frameRateMetric.size()-1] = 0;
				expGroup->light.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin);			
			}
    
      sourceRead.Free();
      Free(sourceRead.meanSignal);
      Free(sourceRead.maxSignal);
      Free(sourceRead.midSignal);
      Free(sourceRead.startFrame);
      Free(sourceRead.classifierQV);
      Free(sourceRead.widthInFrames);
		}

    if (byRead == true) {
      if (useBaseFile) {
        hdfBasReader.Close();
      }
      if (cmpFile.readType == ReadType::CCS or useCcs) {
        hdfCcsReader.Close();
      }
      if (usePulseFile) {
        hdfPlsReader.Close();
      }
    }
	} // done loading movies


	cmpReader.Close();
}