Beispiel #1
0
int main(int argc, char* argv[]) {

	string cmpFileName, movieFileName;

	int argi = 3;
	int numMetrics = 8;
	map<string,bool> metricOptions;
	int maxElements = 0;
	//
	// Default is all options are true
	//
	CreateMetricOptions(metricOptions);
	string metricList = "";
  bool useCcs = false;
  bool byRead = false;
  bool failOnMissingData = false;
  CommandLineParser clp;
  bool printVersion = false;

  clp.RegisterStringOption("basFileName", &movieFileName, "The input {bas,pls}.h5 or input.fofn.", true);
  clp.RegisterStringOption("cmpFileName", &cmpFileName, "The cmp.h5 file to load pulse information into.", true);
  clp.RegisterPreviousFlagsAsHidden();
  clp.RegisterStringOption("metrics", &metricList, "The a string delimited list of metrics (with no spaces).The "
                           "valid options are:  QualityValue, ClassifierQV, MergeQV, StartFrame,"
                           "PulseWidth, pkmid, IPD, and Light.");
  clp.RegisterFlagOption("useccs", &useCcs, "Load pulse information for CCS sequences and not raw bases.");
  clp.RegisterFlagOption("byread", &byRead, "Load pulse information by read rather than buffering an entire pls.h5 file.  "
                         "This option will soon be deprecated and on by default.");
  clp.RegisterIntOption("maxElements", &maxElements, "Set a limit on the size of pls/bas file to buffer in.", CommandLineParser::PositiveInteger);
  clp.RegisterFlagOption("failOnMissingData", &failOnMissingData, "Exit if any data fields are missing from the bas.h5 or pls.h5 input that are required to load a metric. Defualt is a warning.");

  clp.SetProgramSummary("Load pulse information such as inter pulse distance, or quality information into the cmp.h5 file."
                        "This allows one to analyze kinetic and quality information by alignment column.");
  clp.ParseCommandLine(argc, argv);

  if (printVersion) {
    cout << VERSION << endl;
    exit(1);
  }

	if (metricList == "") {
		SetDefaultMetricOptions(metricOptions);
	}
  else {
    ParseMetricsList(metricList, metricOptions);
  }


	// 
	// Always read in basecalls since they are used to check the sanity
	// of the alignment indices.
	//
	metricOptions["Basecall"] = true;
	
	//
	// Translate from the metrics to be loaded to the ones that are
	// required to compute them.
	//
	
	vector<string> datasetFields;
	RequirementMap fieldRequirements;
	BuildRequirementMap(fieldRequirements);
	StoreDatasetFieldsFromPulseFields(metricOptions, fieldRequirements, datasetFields);

	
	vector<string> movieFileNames;
	vector<string> fofnMovieNames;

	FileOfFileNames::StoreFileOrFileList(movieFileName, movieFileNames);

	HDFBasReader hdfBasReader;
	HDFPlsReader hdfPlsReader;
  HDFCCSReader<SMRTSequence> hdfCcsReader;

	vector<string> baseFileFields, pulseFileFields;
	int fieldIndex;

	bool useBaseFile = false, usePulseFile = false;

	for (fieldIndex = 0; fieldIndex < datasetFields.size(); fieldIndex++) {
		if (hdfBasReader.ContainsField(datasetFields[fieldIndex])) {
			useBaseFile = true;
			baseFileFields.push_back(datasetFields[fieldIndex]);
		}
	}

	if (maxElements != 0) {
		hdfBasReader.maxAllocNElements = maxElements;
		hdfPlsReader.maxAllocNElements = maxElements;
	}

	//
	// For now, all runs will attempt to use information from a .bas
	// file, since it's assumed that if one has alignments, one has a
	// .bas file.
	//
	useBaseFile = true;
	//
	// Add some default fields.
	//
	hdfBasReader.IncludeField("Basecall");
	hdfBasReader.IncludeField("PulseIndex");

  hdfBasReader.InitializeFields(baseFileFields);

  
	for (fieldIndex = 0; fieldIndex < datasetFields.size(); fieldIndex++) {
		if (hdfPlsReader.ContainsField(datasetFields[fieldIndex])) {
			usePulseFile = true;
			pulseFileFields.push_back(datasetFields[fieldIndex]);
		}
	}
	if (usePulseFile) {
		hdfPlsReader.InitializeFields(pulseFileFields);
	}
	hdfPlsReader.IncludeField("NumEvent");

		
	
	int nMovies = movieFileNames.size();
	int movieIndex;
	MovieNameToArrayIndex movieNameMap;
	//
	// Initialize movies. This accomplishes two tasks.  First, all movie
	// files are opened and initialized, so that if there are data
	// fields missing the program will exit now rather than in the
	// middle of loading pulses.  
	// Next, a list of movie names is created in fofnMovieNames.  The
	// cmp file does not necessarily index movies in the order of the
	// fofn, and so when loading pulses from a movie indexed by a cmp
	// file, one needs to look up the file name of the movie.  This is
	// done by scanning the fofnMovieNames list in order until the movie
	// is found. 

	for (movieIndex = 0; movieIndex < nMovies; movieIndex++) {

    if (!hdfBasReader.Initialize(movieFileNames[movieIndex])) {
      cout << "ERROR, could not initialize HDF file "
           << movieFileNames[movieIndex] << " for reading bases." << endl;
      exit(1);
    }
    else {
      fofnMovieNames.push_back(hdfBasReader.GetMovieName());
      movieNameMap[hdfBasReader.GetMovieName()] = movieIndex;
      hdfBasReader.Close();
    }

		// 
		// The pulse file is optional.  
		//
		if (usePulseFile) {
			if (hdfPlsReader.Initialize(movieFileNames[movieIndex]) == 0) {
				usePulseFile = false;
			}
		}		
	}

	CmpFile cmpFile;
	
	/*
	 * These readers pull information from the same pls file.
	 */
	HDFCmpFile<CmpAlignment> cmpReader;

	if (cmpReader.Initialize(cmpFileName, H5F_ACC_RDWR) == 0) {
		cout << "ERROR, could not open the cmp file." << endl;
		exit(0);
	}
	
  cmpReader.Read(cmpFile);

  string commandLine;
  clp.CommandLineToString(argc, argv, commandLine);
  string versionStr(VERSION);
  AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionStr);
  cmpReader.fileLogGroup.AddEntry(commandLine, "Loading pulse metrics", "loadPulses", GetTimestamp(), versionStr);


	//
	// Group alignment indices by movie so that they may be processed one movie at a time
	// later on.  The movie indices set keeps track of all indices
	// listed in alignment files.  This keeps a reference to all
	// alignments in memory at once.   At the time of writing this, most
	// projects will have at most a few million alignments, and so the
	// size of this structure is modest.
	//

	UInt alignmentIndex;
	map<int, vector<int> > movieIndexSets;

	for (alignmentIndex = 0; alignmentIndex < cmpFile.alnInfo.alignments.size(); alignmentIndex++) {
		movieIndexSets[cmpFile.alnInfo.alignments[alignmentIndex].GetMovieId()].push_back(alignmentIndex);
	}

	vector<float>  computedPulseField;
	string   alignedSequence;
	string   readSequence;
	vector<unsigned char> byteAlignment;
	int m;
	vector<int> baseToAlignmentMap;

	//
	// Load pulses from movies in order they appear in the input fofn.
	//
  int fofnMovieIndex;
  for (fofnMovieIndex = 0; fofnMovieIndex < fofnMovieNames.size(); fofnMovieIndex++) {
    
    if (cmpFile.readType == ReadType::CCS or useCcs) {
      hdfBasReader.SetReadBasesFromCCS();
      hdfCcsReader.Initialize(movieFileNames[fofnMovieIndex]);
    }
    hdfBasReader.Initialize(movieFileNames[fofnMovieIndex]);
		BaseFile  baseFile;
		PulseFile pulseFile;

    if (byRead == false) {
      //
      // Read the entire bas file at once, and then extract values
      // from memory.  This can be faster depending on the chunk
      // size and size of the movie.
      //
      hdfBasReader.ReadBaseFile(baseFile);
      hdfBasReader.Close();
    }
    else {
      //
      // Reads are scanned one by instead of caching all.  It is
      // still necessary to read in some of the datasets entirely,
      // in particular the start positions and hole numbers.
      //


      // This is repeated below for a pulse file.  Since the pulse
      // and base files are separate objects, the scan data is
      // read into each separately.  Somehow later the information
      // should be merged into just one.
      if (hdfBasReader.scanDataReader.fileHasScanData) {
        hdfBasReader.scanDataReader.Read(baseFile.scanData);
      }
      baseFile.readStartPositions.resize(hdfBasReader.nReads+1);
      baseFile.readStartPositions[0] = 0;
      hdfBasReader.GetAllReadLengths(baseFile.readLengths);
      int i;
      assert(baseFile.readLengths.size() + 1 == baseFile.readStartPositions.size());
      for (i = 1; i < hdfBasReader.nReads + 1; i++ ) {
        baseFile.readStartPositions[i] = baseFile.readLengths[i-1] + baseFile.readStartPositions[i-1];
      }
      
      //
      // Although the whole bas file isn't being read in, it is
      // necessary to read in which hole numbers are contained in this
      // bas file since it is possible that the alignment for a
      // particular hole number may be in a different input bas.h5
      // file even if it is the same movie. 
      //
      hdfBasReader.GetAllHoleNumbers(baseFile.holeNumbers);
    }
    set<uint32_t> moviePartHoleNumbers;
    copy(baseFile.holeNumbers.begin(), baseFile.holeNumbers.end(), inserter(moviePartHoleNumbers, moviePartHoleNumbers.begin()));

		
		if (usePulseFile) {
			hdfPlsReader.Initialize(movieFileNames[fofnMovieIndex]);
			hdfPlsReader.IncludeField("NumEvent");
      hdfPlsReader.IncludeField("StartFrame");
      if (byRead == false) { 
        hdfPlsReader.ReadPulseFile(pulseFile);
        hdfPlsReader.Close();
      }
      else {
        if (usePulseFile) {
          pulseFile.pulseStartPositions.resize(hdfBasReader.nReads+1);
          pulseFile.pulseStartPositions[0] = 0;
          hdfPlsReader.GetAllNumEvent(pulseFile.numEvent);
          int i;
          for (i = 1; i < hdfBasReader.nReads + 1; i++ ) {
            pulseFile.pulseStartPositions[i] = pulseFile.numEvent[i-1] + pulseFile.pulseStartPositions[i-1];
          }
          if (hdfPlsReader.scanDataReader.fileHasScanData) {
            hdfPlsReader.scanDataReader.Read(pulseFile.scanData);
          }
        }
      }
		}

    string cmpFileMovieName;

    for (m = 0; m < cmpFile.movieInfo.name.size(); m++) {
      //
      // First find the file name for the movie 'm'
      //
      cmpFileMovieName = cmpFile.movieInfo.name[m];
      int fofnMovieIndex;
      
      if (baseFile.GetMovieName() == cmpFileMovieName) {
				break;
			}
		}

    //
    // If the movie specified in the input.fofn is not found in the
    // cmp file, that indicates something bad is happeing.  Either the
    // input.fofn was not used to generate the cmp.h5 file, or no
    // alignments were found between the input bas.h5 and the
    // reference.  That shouldn't happen.
    // 
		if (m == cmpFile.movieInfo.name.size()) {
			cout << "WARNING: The movie indexed in the compare file " << cmpFileMovieName << " is not listed in the file " << movieFileName << endl;
			continue;
		}
		
		//
		// Open the movie and load its pulses into memory.
		//
		movieIndex = cmpFile.movieInfo.id[m];
		int movieAlignmentIndex;
		float NaN = 0.0/0.0;
    
    UChar missingQualityValue = 255;
    HalfWord missingFrameRateValue    = USHRT_MAX;
    unsigned int missingPulseIndex = UINT_MAX;
    //
    // Since usePulseFile is set when the input file is a pulseFile,
    // and ReadType::CCS becomes the read type when the alignments are
    // ccs, when pulse files are specified for de novo ccs alignments,
    // they will be opened as pulse files.  Since the de novo ccs
    // sequences do not have pulse file information, the auto-reading
    // of pulse files needs to be disabled.  Do that here.
    //
    if (cmpFile.readType == ReadType::CCS or useCcs) {
      usePulseFile = false;
    }


		//
		// Now check the sanity of metric options.
		//

		map<string,bool>::iterator metricIt;
		for (metricIt = metricOptions.begin(); metricIt != metricOptions.end(); ++metricIt) {
			if (metricIt->second == false) {
				continue;
			}
			bool metricMayBeComputed = true;
      if (cmpFile.readType == ReadType::CCS and
          metricIt->first != "QualityValue"  and
          metricIt->first != "DeletionQV" and
          metricIt->first != "SubstitutionQV" and
          metricIt->first != "InsertionQV" and
          metricIt->first != "DeletionTag" and
          metricIt->first != "SubstitutionTag" and
          metricIt->first != "Basecall") {
        cout << "ERROR! The metric " << metricIt->first << " cannot be loaded into de novo ccs alignemnts." << endl;
        //        exit(0);
        metricMayBeComputed = false;
      }
      
			if (metricIt->first == "IPD") {
				//
				// The field requirements for IPD are special. 
				//
				if ((useBaseFile and !hdfBasReader.FieldIsIncluded("PreBaseFrames")) or
						(usePulseFile and (!hdfPlsReader.FieldIsIncluded("StartFrame") and
															 !hdfPlsReader.FieldIsIncluded("WidthInFrames")))) {
					metricMayBeComputed = false;
				}
			}
			else {
				if (fieldRequirements.find(metricIt->first) != fieldRequirements.end()) {
					//
					// There are requirements for this field. Make sure all are
					// present before trying to compute this field.
					//
					int requirementIndex;
					for (requirementIndex = 0; requirementIndex < fieldRequirements[metricIt->first].size(); ++requirementIndex) {
						string requirement;
						requirement = fieldRequirements[metricIt->first][requirementIndex];
				
						if (((useBaseFile == false or ((hdfBasReader.includedFields.find(requirement) == hdfBasReader.includedFields.end() or
 																						hdfBasReader.includedFields[requirement] == false))) and
								 ((usePulseFile == false or (hdfPlsReader.includedFields.find(requirement) == hdfPlsReader.includedFields.end() or
																						 hdfPlsReader.includedFields[requirement] == false))))) {
							metricMayBeComputed = false;
						}
					}
				}
				else {
					//
					// There are no requirements for this field, so it must exist as
					// a datset in either the bas or pls file.
					//
					if ((useBaseFile  == false or ((hdfBasReader.includedFields.find(metricIt->first) == hdfBasReader.includedFields.end() or
																					hdfBasReader.includedFields[metricIt->first] == false))) and
							(usePulseFile == false or (((hdfPlsReader.includedFields.find(metricIt->first) == hdfPlsReader.includedFields.end() or
																					 hdfPlsReader.includedFields[metricIt->first] == false))))) {
						metricMayBeComputed = false;
					}
				}
			}
			if (metricMayBeComputed == false) {
        if (failOnMissingData) {
          cout << "ERROR";
        }
        else {
          cout << "WARNING";
        }
        cout << ": There is insufficient data to compute metric: " << metricIt->first << " in the file " << movieFileNames[fofnMovieIndex] << " ";
        cout << " It will be ignored." << endl;
        if (failOnMissingData) {
          exit(1);
        }
				metricOptions[metricIt->first] = false;
			}
		}

		
		UInt i;
		//
		// This is currently used as a sentinal for showing that an array
		// element does not have a value stored for it, as in deleted
		// bases. 
		//

		
		vector<int> pulseIndexArray;
		vector<unsigned int> statTime;

		if (metricOptions["WhenStarted"]) {
			string whenStarted;
			if (hdfPlsReader.scanDataReader.useWhenStarted == false) {
				cout << "ERROR! Attempting to read WhenStarted from " 
						 << movieFileNames[fofnMovieIndex]
						 << " but the attriubte does not exist." << endl;
				exit(1);
			}
			hdfPlsReader.scanDataReader.ReadWhenStarted(whenStarted);
			
			if (!cmpReader.movieInfoGroup.whenStartedArray.IsInitialized()) {
				cmpReader.movieInfoGroup.whenStartedArray.Initialize(cmpReader.movieInfoGroup.movieInfoGroup, "WhenStarted");
			}

			cmpReader.movieInfoGroup.whenStartedArray.Write(&whenStarted, 1);
		}

    if (AnyFieldRequiresFrameRate(datasetFields)) {
      if (useBaseFile) {
        cmpReader.movieInfoGroup.StoreFrameRate(m, baseFile.GetFrameRate());
      }
      else if (usePulseFile) {
        cmpReader.movieInfoGroup.StoreFrameRate(m, pulseFile.GetFrameRate());
      }
    }
				
		//
		// An index set is a set of indices into the alignment array that
		// are of reads generated by this movie.  Load pulses for all
		// alignments generated for this movie.
		//

		//
		// Movie index sets should be sorted by alignment index. Build a lookup table for this.
		//
		
		std::vector<std::pair<int,int> > toFrom;
		for (movieAlignmentIndex = 0; movieAlignmentIndex < movieIndexSets[movieIndex].size(); movieAlignmentIndex++) {
			alignmentIndex = movieIndexSets[movieIndex][movieAlignmentIndex];
			toFrom.push_back(std::pair<int,int>(cmpFile.alnInfo.alignments[alignmentIndex].GetAlignmentId(), movieAlignmentIndex));
		}
		// orders by first by default.
		std::sort(toFrom.begin(), toFrom.end());

    //
    // Load metrics for alignments from movie 'movieIndex'.
    //
    cout << "loading " <<  movieIndexSets[movieIndex].size() << " alignments for movie " << movieIndex << endl;
		for (movieAlignmentIndex = 0; movieAlignmentIndex < movieIndexSets[movieIndex].size(); movieAlignmentIndex++) {
			alignmentIndex = movieIndexSets[movieIndex][toFrom[movieAlignmentIndex].second];


			//
			// Alignments are groupsd by ref group id then movie id.
			//
			int refGroupId  = cmpFile.alnInfo.alignments[alignmentIndex].GetRefGroupId();
			int movieId     = cmpFile.alnInfo.alignments[alignmentIndex].GetMovieId();
      UInt holeNumber = cmpFile.alnInfo.alignments[alignmentIndex].GetHoleNumber();

      //
      // Since the movie may be split into multiple parts, look to see
      // if this hole number is one of the ones covered by this
      // set. If it is not, just continue. It will be loaded on
      // another pass through a different movie part.
      //
      if (moviePartHoleNumbers.find(holeNumber) == moviePartHoleNumbers.end()) {
        continue;
      }

			//
			// Now locate where this movie is stored.
			//

			if (cmpReader.refGroupIdToArrayIndex.find(refGroupId) == cmpReader.refGroupIdToArrayIndex.end()) {
				cout << "ERROR!  An alignment " << alignmentIndex << " is specified with reference group " << endl
						 << refGroupId << " that is not found as an alignment group." << endl;
				exit(1);
			}
			int refGroupIndex = cmpReader.refGroupIdToArrayIndex[refGroupId];
			
			//
			// Now find the group containing the alignment for this movie.
			//
			if (cmpReader.refAlignGroups[refGroupIndex]->movieIdToIndex.find(movieId) ==
					cmpReader.refAlignGroups[refGroupIndex]->movieIdToIndex.end()) {
				cout << "ERROR!  An alignment " << alignmentIndex << " is specified with movie index " << endl
						 << movieId << " that is not found in the alignment group " << refGroupIndex << endl;
				exit(1);
			}

			int readGroupIndex = cmpReader.refAlignGroups[refGroupIndex]->movieIdToIndex[movieId];
      
			//
			// First do sanity check on the read to make sure the pules and the bases match.
			//

			//
			// Look to see if the output HDF arrays need to be created.
			//
			UInt offsetBegin, offsetEnd;
		
			offsetBegin = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetBegin();
			offsetEnd   = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetEnd();
		
			int alignedSequenceLength = offsetEnd - offsetBegin;
			if (alignedSequenceLength >= 0) {
				alignedSequence.resize(alignedSequenceLength);
				byteAlignment.resize(alignedSequenceLength);
			}
	
			//
			// Read the alignment string.  All alignments 
			//
			cmpReader.refAlignGroups[refGroupIndex]->readGroups[readGroupIndex]->alignmentArray.Read(offsetBegin, 
																																															 offsetEnd, 
																																															 &byteAlignment[0]);
		
			//
			// Convert to something we can compare easily.
			//
			ByteAlignmentToQueryString(&byteAlignment[0], byteAlignment.size(), &alignedSequence[0]);


			//
			// Do a sanity check to make sure the pulses and the alignment
			// make sense.  The main check is to see if the query sequence
			// in the alignment is the same as the query sequence in the
			// read. 
			//
		
			//
			// First pull out the bases corresponding to this read.
			//
			int queryStart = cmpFile.alnInfo.alignments[alignmentIndex].GetQueryStart();
			int queryEnd   = cmpFile.alnInfo.alignments[alignmentIndex].GetQueryEnd();

      // Build a map of where 
      CreateSequenceToAlignmentMap(byteAlignment, 
                                   baseToAlignmentMap);


			//
			// Condense gaps in the alignment for easy comparison.
			//
      //
      RemoveGaps(alignedSequence, alignedSequence);
      
			
			//
			// Query the cmp file for a way to look up a read based on
			// coordinate information.  For Astro reads, the coords are
			// based on x and y.  For Springfield, it is read index.  The
			// base files should be able to look up reads by x,y or by
			// index. 
			//
			int readIndex;

          
      if (cmpFile.platformId == Astro) {
        cout << "ASTRO pulse loading is deprecated." << endl;
        exit(0);
      }

      if (baseFile.LookupReadIndexByHoleNumber(holeNumber, readIndex) == false) {
          cout << "ERROR! Alignment has hole number " << holeNumber << " that is not in the movie. " << endl;
          assert(0);
      }

			int readStart, readLength, alignBaseStart, alignBaseEnd, alignBaseLength;
			readStart       = baseFile.readStartPositions[readIndex];
			readLength      = baseFile.readStartPositions[readIndex+1] - baseFile.readStartPositions[readIndex];
			alignBaseStart  = readStart + queryStart;
			alignBaseEnd    = readStart + queryEnd;
			alignBaseLength = alignBaseEnd - alignBaseStart;

			int pulseStart;
			if (usePulseFile) {
				pulseStart      = pulseFile.pulseStartPositions[readIndex];
			}

	    
      //
      // This maps from pulse to a base, since there are more pulses
      // called than bases, and the is one pulse for every base.
      //
			pulseIndexArray.resize(readLength);

      
      SMRTSequence sourceRead;
      unsigned int numPasses;
      //
      // These are not allocated in the regular allocate function
      // since they are only used in loadPulses. (maybe I should
      // subclass SMRTSequence here).
      //
      
      if (byRead) {
        // Read in the data from the bas file if it exsts.
        if (useBaseFile) {
          hdfBasReader.GetReadAt(readIndex, sourceRead);
          if (cmpFile.readType == ReadType::CCS or useCcs) {
            numPasses = hdfCcsReader.GetNumPasses(readIndex);
          }
        }
        // Read in the data from the pls file if it exists.
        if (usePulseFile) {
          hdfPlsReader.GetReadAt(readIndex, sourceRead.pulseIndex, sourceRead);
        }
      }
      else {
        //
        // The entire base/pulse file was read in, so copy data from that into a read
        // For the data used in the read, it is possible to simply
        // reference the data,  but for the pls file it is necessary
        // to copy since there is a packing of data.
        //
        if (useBaseFile) {
          baseFile.CopyReadAt(readIndex, sourceRead);
          if (cmpFile.readType == ReadType::CCS or useCcs) {
            numPasses = hdfCcsReader.GetNumPasses(readIndex);
          }
        }
        if (usePulseFile) {
          //
          // Copy the subset of pulses that correspond to the ones called as bases.
          //
          int i;
          for (i = 0; i < readLength; i++) {
            pulseIndexArray[i] = pulseStart + baseFile.pulseIndex[readStart + i];
          }
          pulseFile.CopyReadAt(readIndex, &pulseIndexArray[0], sourceRead);
        }
      }

      readSequence.resize(queryEnd - queryStart);
      CapQualityValues(sourceRead);
			copy((char*) (sourceRead.seq + queryStart),
					 (char*) (sourceRead.seq + queryEnd),
					 readSequence.begin());
      
			bool stringsMatch = true;
			if (alignedSequence.size() != readSequence.size() or alignedSequence != readSequence) {
				cout << "ERROR, the query sequence does not match the aligned query sequence." << endl;
				cout << "HoleNumber: "<< holeNumber << ", MovieName: " << cmpFileMovieName;
        cout << " ,ReadIndex: " << (int) readIndex << 
				cout << ", qStart: "<< queryStart << ", qEnd: " << queryEnd << endl;
				cout << "Aligned sequence: "<< endl;
				cout << alignedSequence << endl;
				cout << "Original sequence: " << endl;
				cout << readSequence << endl;
				assert(0);
      }

			/*
			 * Compute any necessary data fields.  These usually involve
			 * using differences of pulse indices, pulse widths, etc..
			 * Missing fields are stored as 0's. 
			 */

			vector<float> readPulseMetric;
            vector<float> floatMetric;
      vector<UChar> qvMetric;
      vector<HalfWord> frameRateMetric;
      vector<uint32_t> timeMetric;
			int ungappedAlignedSequenceLength = alignedSequence.size();
			
      floatMetric.resize(alignedSequenceLength+1);
      readPulseMetric.resize(alignedSequenceLength+1);
      qvMetric.resize(alignedSequenceLength+1);
      frameRateMetric.resize(alignedSequenceLength+1);
      timeMetric.resize(alignedSequenceLength+1);

			UInt i;
			UInt pi;

			HDFCmpExperimentGroup* expGroup = cmpReader.refAlignGroups[refGroupIndex]->readGroups[readGroupIndex];

      if (cmpFile.readType == ReadType::CCS or useCcs) {
        if (!cmpReader.alnInfoGroup.numPasses.IsInitialized()) {
          cmpReader.alnInfoGroup.InitializeNumPasses();
        }
        cmpReader.alnInfoGroup.numPasses.WriteToPos(&numPasses, 1, alignmentIndex);
      }
      
			if (metricOptions["StartTimeOffset"] == true) {
				if (!expGroup->startTimeOffset.IsInitialized()) {
					expGroup->startTimeOffset.Initialize(expGroup->experimentGroup, "StartTimeOffset");
				}
        unsigned int readStartTimeOffset = sourceRead.startFrame[queryStart];
				expGroup->startTimeOffset.WriteToPos(&readStartTimeOffset, 1, alignmentIndex);
			}

			if (metricOptions["QualityValue"] == true) {
				if (!expGroup->qualityValue.IsInitialized()) {
					expGroup->qualityValue.Initialize(expGroup->experimentGroup, "QualityValue");
				}
				
				// Store start time normalized to frame rate.
        fill(qvMetric.begin(), qvMetric.end(), missingQualityValue);

				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
          qvMetric[baseToAlignmentMap[i]] = sourceRead.qual[queryStart + i];
        }
				qvMetric[qvMetric.size()-1] = 0;
				expGroup->qualityValue.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin);
			}

			if (metricOptions["InsertionQV"] == true) {
				if (!expGroup->insertionQV.IsInitialized()) {
					expGroup->insertionQV.Initialize(expGroup->experimentGroup, "InsertionQV");
				}
				
				// Store start time normalized to frame rate.
        fill(qvMetric.begin(), qvMetric.end(), missingQualityValue);
				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
          qvMetric[baseToAlignmentMap[i]] = sourceRead.insertionQV[queryStart+ i];
				}
				qvMetric[qvMetric.size()-1] = 0;
				expGroup->insertionQV.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin);
			}

			if (metricOptions["MergeQV"] == true) {
				if (!expGroup->mergeQV.IsInitialized()) {
					expGroup->mergeQV.Initialize(expGroup->experimentGroup, "MergeQV");
				}
				
				// Store start time normalized to frame rate.
        fill(qvMetric.begin(), qvMetric.end(), missingQualityValue);
				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
          qvMetric[baseToAlignmentMap[i]] = sourceRead.mergeQV[queryStart+ i];
				}
				qvMetric[qvMetric.size()-1] = 0;
				expGroup->mergeQV.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin);
			}

			if (metricOptions["DeletionQV"] == true) {
				if (!expGroup->deletionQV.IsInitialized()) {
					expGroup->deletionQV.Initialize(expGroup->experimentGroup, "DeletionQV");
				}
				
				// Store start time normalized to frame rate.
        fill(qvMetric.begin(), qvMetric.end(), missingQualityValue);
				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
          qvMetric[baseToAlignmentMap[i]] = sourceRead.deletionQV[queryStart+i];
				}
				qvMetric[qvMetric.size()-1] = 0;
				expGroup->deletionQV.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin);
			}

			if (metricOptions["DeletionTag"] == true) {
				if (!expGroup->deletionTag.IsInitialized()) {
					expGroup->deletionTag.Initialize(expGroup->experimentGroup, "DeletionTag");
				}
        vector<char> readDeletionTagMetric;
        readDeletionTagMetric.resize(readPulseMetric.size());
				// Store start time normalized to frame rate.
				for (i = 0; i < readDeletionTagMetric.size()-1; i++ ) {
					readDeletionTagMetric[i] = '-';
				}
        readDeletionTagMetric[i] = '\0';
				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
          assert(baseToAlignmentMap[i] < readDeletionTagMetric.size());
					readDeletionTagMetric[baseToAlignmentMap[i]] = sourceRead.deletionTag[queryStart+i];
				}
				readDeletionTagMetric[readDeletionTagMetric.size()-1] = 0;
				expGroup->deletionTag.WriteToPos(&readDeletionTagMetric[0], readDeletionTagMetric.size(), offsetBegin);
			}

			if (metricOptions["PulseIndex"] == true) {
        
				if (!expGroup->pulseIndex.IsInitialized()) {
					expGroup->pulseIndex.Initialize(expGroup->experimentGroup, "PulseIndex");
				}
				vector<uint32_t> readPulseIndexMetric;
        fill(readPulseIndexMetric.begin(), readPulseIndexMetric.end(), missingPulseIndex);
        readPulseIndexMetric.resize(readPulseMetric.size());
				// Store start time normalized to frame rate.
        assert(readPulseIndexMetric.size() > 0);
				for (i = 0; i < readPulseIndexMetric.size(); i++ ) {
          readPulseIndexMetric[i] = 0;
				}
				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
					readPulseIndexMetric[baseToAlignmentMap[i]] = sourceRead.pulseIndex[queryStart+i];
				}
				readPulseIndexMetric[readPulseIndexMetric.size()-1] = 0;
				expGroup->pulseIndex.WriteToPos(&readPulseIndexMetric[0], readPulseIndexMetric.size(), offsetBegin);
			}

			if (metricOptions["SubstitutionTag"] == true) {
				if (!expGroup->substitutionTag.IsInitialized()) {
					expGroup->substitutionTag.Initialize(expGroup->experimentGroup, "SubstitutionTag");
				}
				vector<char> readSubstitutionTagMetric;
        readSubstitutionTagMetric.resize(readPulseMetric.size());
				// Store start time normalized to frame rate.
				for (i = 0; i < readSubstitutionTagMetric.size()-1; i++ ) {
          readSubstitutionTagMetric[i] = '-';
				}
        readSubstitutionTagMetric[i] = '\0';
				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
					readSubstitutionTagMetric[baseToAlignmentMap[i]] = sourceRead.substitutionTag[queryStart+i];
				}
				readSubstitutionTagMetric[readSubstitutionTagMetric.size()-1] = 0;
				expGroup->substitutionTag.WriteToPos(&readSubstitutionTagMetric[0], readSubstitutionTagMetric.size(), offsetBegin);
      }

			if (metricOptions["SubstitutionQV"] == true) {
				if (!expGroup->substitutionQV.IsInitialized()) {
					expGroup->substitutionQV.Initialize(expGroup->experimentGroup, "SubstitutionQV");
				}
				
				// Store start time normalized to frame rate.
        fill(qvMetric.begin(), qvMetric.end(), missingQualityValue);

				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
          qvMetric[baseToAlignmentMap[i]] = sourceRead.substitutionQV[queryStart+i];
				}
				qvMetric[qvMetric.size()-1] = 0;
				expGroup->substitutionQV.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin);
			}

			if (metricOptions["ClassifierQV"] == true) {
				
				if (!expGroup->classifierQV.IsInitialized()) {
					expGroup->classifierQV.Initialize(expGroup->experimentGroup, "ClassifierQV");			
				}
				// Store start time normalized to frame rate.
        fill(floatMetric.begin(), floatMetric.end(), missingQualityValue);

				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
					floatMetric[baseToAlignmentMap[i]] = sourceRead.classifierQV[i+queryStart];
				}
				floatMetric[floatMetric.size()-1] = 0;
				expGroup->classifierQV.WriteToPos(&floatMetric[0], floatMetric.size(), offsetBegin);
			}

			if (metricOptions["StartFrame"] == true) {
				if (!expGroup->startTime.IsInitialized()) {
					expGroup->startTime.Initialize(expGroup->experimentGroup, "StartFrame");			
				}

        if (useBaseFile) {
          sourceRead.startFrame = new unsigned int[sourceRead.length];
          copy(sourceRead.preBaseFrames, &sourceRead.preBaseFrames[sourceRead.length], sourceRead.startFrame);
          for (i = 0; i < sourceRead.length-1; i++) {
            sourceRead.startFrame[i+1] += sourceRead.widthInFrames[i];
          }
          partial_sum(sourceRead.startFrame, &sourceRead.startFrame[sourceRead.length],  sourceRead.startFrame);
        }
				
				// Store start time normalized to frame rate.
        fill(timeMetric.begin(), timeMetric.end(), missingPulseIndex);
				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
					timeMetric[baseToAlignmentMap[i]] = sourceRead.startFrame[i+queryStart];
				}
				timeMetric[timeMetric.size()-1] = 0;
				expGroup->startTime.WriteToPos(&timeMetric[0], timeMetric.size(), offsetBegin);
			}

			if (metricOptions["PulseWidth"] == true) {
				if (!expGroup->pulseWidth.IsInitialized()) {
					expGroup->pulseWidth.Initialize(expGroup->experimentGroup, "PulseWidth");			
				}
				// Store start time normalized to frame rate.
        fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue);

        //
        // For legacy reasons, it's possible the width in frames is
        // stored in the bas file. If this is the case, use the width
        // in frames there.  Otherwise, use the width in frames stored
        // in the pls file.
        for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
          frameRateMetric[baseToAlignmentMap[i]] = sourceRead.widthInFrames[queryStart + i];
        }
				frameRateMetric[frameRateMetric.size()-1] = 0;
				expGroup->pulseWidth.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin);
			}

			if (metricOptions["PreBaseFrames"] == true) {
				if (!expGroup->preBaseFrames.IsInitialized()) {
					expGroup->preBaseFrames.Initialize(expGroup->experimentGroup, "PreBaseFrames");
				}
				// Compute width in frames normalized to frame rate.
        fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue);
				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
					frameRateMetric[baseToAlignmentMap[i]] = sourceRead.preBaseFrames[i+queryStart];
				}
				frameRateMetric[frameRateMetric.size()-1] = 0;
				expGroup->preBaseFrames.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin);
			}

			if (metricOptions["WidthInFrames"] == true) {
				if (!expGroup->widthInFrames.IsInitialized()) {
					expGroup->widthInFrames.Initialize(expGroup->experimentGroup, "WidthInFrames");
				}
				// Compute width in frames normalized to frame rate.
        fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue);

				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
					if (usePulseFile) {
						frameRateMetric[baseToAlignmentMap[i]] = sourceRead.widthInFrames[i+queryStart];
					}
					else {
						frameRateMetric[baseToAlignmentMap[i]] = sourceRead.widthInFrames[i+queryStart];
          }
				}
				frameRateMetric[frameRateMetric.size()-1] = 0;
				expGroup->widthInFrames.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin);
			}

			if (metricOptions["pkmid"] == true) {

				if (!expGroup->pkmid.IsInitialized()) {
					expGroup->pkmid.Initialize(expGroup->experimentGroup, "pkmid");
				}

				for (i = 0; i < readPulseMetric.size(); i++ ) {
          readPulseMetric[i] = NaN;
				}
				
				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
          readPulseMetric[baseToAlignmentMap[i]] = sourceRead.midSignal[i+queryStart];
        }
				readPulseMetric[readPulseMetric.size()-1] = 0;
				expGroup->pkmid.WriteToPos(&readPulseMetric[0], readPulseMetric.size(), offsetBegin);
			}

			if (metricOptions["IPD"] == true) {
				if (!expGroup->ipd.IsInitialized()) {
					expGroup->ipd.Initialize(expGroup->experimentGroup, "IPD");
				}
        fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue);				

				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {

					//
					// The IPD is undefined for the first base in a read.
          //
					if (usePulseFile ) {
						if (queryStart == 0 and i == 0) {
              frameRateMetric[baseToAlignmentMap[i]] = 0;
						}
						else {
              frameRateMetric[baseToAlignmentMap[i]] = (sourceRead.startFrame[i+queryStart]  
                                                        - sourceRead.startFrame[i+queryStart-1]
                                                        - sourceRead.widthInFrames[i+queryStart-1]);
						}
					}
					else if (useBaseFile) {
            frameRateMetric[baseToAlignmentMap[i]] = sourceRead.preBaseFrames[i + queryStart];
					}
        }
				frameRateMetric[frameRateMetric.size()-1] = 0;
				expGroup->ipd.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin);			
			}

			
			if (metricOptions["Light"] == true) {
				if (!expGroup->light.IsInitialized()) {
					expGroup->light.Initialize(expGroup->experimentGroup, "Light");
				}
        fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue);
				for (i = 0; i < ungappedAlignedSequenceLength; i++ ) {
          frameRateMetric[baseToAlignmentMap[i]] = sourceRead.meanSignal[i+queryStart];
          frameRateMetric[baseToAlignmentMap[i]] = (frameRateMetric[baseToAlignmentMap[i]] * 
                                                    sourceRead.widthInFrames[i+queryStart]);
				}
				frameRateMetric[frameRateMetric.size()-1] = 0;
				expGroup->light.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin);			
			}
    
      sourceRead.Free();
      Free(sourceRead.meanSignal);
      Free(sourceRead.maxSignal);
      Free(sourceRead.midSignal);
      Free(sourceRead.startFrame);
      Free(sourceRead.classifierQV);
      Free(sourceRead.widthInFrames);
		}

    if (byRead == true) {
      if (useBaseFile) {
        hdfBasReader.Close();
      }
      if (cmpFile.readType == ReadType::CCS or useCcs) {
        hdfCcsReader.Close();
      }
      if (usePulseFile) {
        hdfPlsReader.Close();
      }
    }
	} // done loading movies


	cmpReader.Close();
}
Beispiel #2
0
int main(int argc, char* argv[]) {
    string refGenomeFileName = "";
    string lengthModelFileName = "";
    string outputModelFileName = "";
    DNALength numBasesPerFile = 0;
    string sourceReadsFileName = "";
    string titleTableFileName = "";
    int numBasH5Files = 1;
    string basH5BaseFileName = "simulated";
    string movieName = "m101211_092754_00114_cSIM_s1_p0";
    bool   doRandGenInit = true;
    bool   usePosMap     = false;
    bool   printPercentRepeat = false;
    string posMapFileName = "";
    vector<string> movieNames;
    bool useLengthModel = false;
    bool useFixedLength = false;
    ofstream posMapFile;
    int scaledLength = 0;
    int fixedLength = 0;
    int nBasFiles = 1;
    bool useLengthsModel = true;
    bool printHelp = false;


    //  Look to see if the refAsReads flag is specified anywhere before
    //  parsing the command line.

    CommandLineParser clp;
    string commandLine;
    string helpString;
    SetHelp(helpString);
    vector<string> fns;

    clp.RegisterStringOption("genome", &refGenomeFileName, "");
    clp.RegisterIntOption("numBasesPerFile", (int*)&numBasesPerFile, "",
                          CommandLineParser::PositiveInteger);
    clp.RegisterStringOption("sourceReads", &sourceReadsFileName, "");
    clp.RegisterStringOption("lengthModel", &lengthModelFileName, "");
    clp.RegisterIntOption("fixedLength", &fixedLength, "",
                          CommandLineParser::PositiveInteger);
    clp.RegisterFlagOption("lengthModel", &useLengthModel, "");
    clp.RegisterStringOption("movieName", &movieName, "");
    clp.RegisterStringOption("titleTable", &titleTableFileName, "");
    clp.RegisterStringOption("baseFileName", &basH5BaseFileName, "");
    clp.RegisterIntOption("nFiles", &nBasFiles, "",
                          CommandLineParser::PositiveInteger);
    clp.RegisterIntOption("meanLength", &scaledLength, "",
                          CommandLineParser::PositiveInteger);
    clp.RegisterStringOption("posMap", &posMapFileName, "");
    clp.RegisterFlagOption("printPercentRepeat", &printPercentRepeat, "");
    clp.RegisterFlagOption("h", &printHelp, "");

    clp.SetHelp(helpString);
    clp.ParseCommandLine(argc, argv, fns);
    clp.CommandLineToString(argc, argv, commandLine);

    clp.SetProgramName("alchemy");

    outputModelFileName = fns[0];
    if (argc <= 1 or printHelp or outputModelFileName == "") {
        cout << helpString << endl;
        exit(0);
    }

    if (usePosMap) {
        CrucialOpen(posMapFileName, posMapFile, std::ios::out);
    }

    if (sourceReadsFileName == "" and fixedLength == 0) {
        useLengthModel = true;
    }

    if (useLengthModel and fixedLength != 0) {
        cout << "ERROR! You must either use a length model or a fixed length." << endl;
        exit(1);
    }

    if (sourceReadsFileName == "" and numBasesPerFile == 0) {
        cout << "ERROR! You must specify either a set of read to use as " << endl
             << "original reads for simulation or the total number of bases " << endl
             << "to simulate in each bas.h5 file." << endl;
        exit(1);
    }

    if (sourceReadsFileName == "" and refGenomeFileName == "") {
        cout << "ERROR! You must specify a genome to sample reads from or a set of read "<<endl
             << "to use as original reads for simulation." << endl;
        exit(1);
    }

    if (fixedLength != 0 and refGenomeFileName == "") {
        cout << "ERROR! You must specify a genome file if using a fixed length." << endl;
        exit(1);
    }

    if ((fixedLength != 0 or scaledLength != 0) and sourceReadsFileName != "") {
        cout << "ERROR! You cannot specify a fixed length nor mean length with a source " << endl
             << "reads file.  The read lengths are taken from the source reads or the length model." << endl;
        exit(1);
    }

    LengthHistogram   lengthHistogram;
    OutputSampleListSet   outputModel(0);
    TitleTable titleTable;

    if (doRandGenInit) {
        InitializeRandomGeneratorWithTime();
    }

    //
    // Read models.
    //
    if (titleTableFileName != "") {
        titleTable.Read(titleTableFileName);
    }


    outputModel.Read(outputModelFileName);

    if (useLengthModel) {
        lengthHistogram.BuildFromAlignmentLengths(outputModel.lengths);
    }


    vector<int> alignmentLengths;
    int meanAlignmentLength;


    if (scaledLength != 0 and useLengthModel) {
        //
        // Scale the histogram so that the average length is 'scaledLength'.
        //

        // 1. Integrate histogram
        long totalLength = 0;
        long totalSamples = 0;
        int hi;
        for (hi = 0; hi < lengthHistogram.lengthHistogram.cdf.size()-1; hi++) {
            int ni;
            ni = lengthHistogram.lengthHistogram.cdf[hi+1] - lengthHistogram.lengthHistogram.cdf[hi];
            totalLength += ni * lengthHistogram.lengthHistogram.data[hi];
        }
        totalSamples = lengthHistogram.lengthHistogram.cdf[lengthHistogram.lengthHistogram.cdf.size()-1];

        float meanSampleLength = totalLength / (1.0*totalSamples);
        float fractionIncrease = scaledLength / meanSampleLength;

        for (hi = 0; hi < lengthHistogram.lengthHistogram.cdf.size(); hi++) {
            lengthHistogram.lengthHistogram.data[hi] *= fractionIncrease;
        }
    }

    FASTAReader inReader, seqReader;
    vector<FASTASequence> reference;
    DNALength refLength = 0;
    int i;
    if (refGenomeFileName != "") {
        inReader.Init(refGenomeFileName);
        inReader.ReadAllSequences(reference);

        for (i = 0; i < reference.size(); i++) {
            refLength += reference[i].length;
        }
    }

    if (sourceReadsFileName !=  "") {
        seqReader.Init(sourceReadsFileName);
    }

    ofstream readsFile;

    //
    // Create and simulate bas.h5 files.
    //
    int baseFileIndex;
    bool readsRemain = true;
    for (baseFileIndex = 0; ((sourceReadsFileName == "" and baseFileIndex < nBasFiles)  // case 1 is reads are generated by file
                             or (sourceReadsFileName != "" and readsRemain)); // case 2 is reads are generated by an input file.
            baseFileIndex++) {
        //
        // Prep the base file for writing.
        //
        stringstream fileNameStrm, movieNameStrm;
        //string movieName = "m000000_000000_00000_cSIMULATED_s";
        movieNameStrm << movieName << baseFileIndex << "_p0";
        string fullMovieName = movieNameStrm.str();
        fileNameStrm  << fullMovieName <<  ".bas.h5";


        HDFBasWriter basWriter;
        HDFRegionTableWriter regionWriter;
        //
        // This is mainly used to create the atributes.
        //
        RegionTable regionTable;
        regionTable.CreateDefaultAttributes();

        basWriter.SetPlatform(Springfield);
        //
        // Use a fixed set of fields for now.
        //

        // These are all pulled from the outputModel.
        basWriter.IncludeField("Basecall");
        basWriter.IncludeField("QualityValue");
        basWriter.IncludeField("SubstitutionQV");
        basWriter.IncludeField("SubstitutionTag");
        basWriter.IncludeField("InsertionQV");
        basWriter.IncludeField("DeletionQV");
        basWriter.IncludeField("DeletionTag");
        basWriter.IncludeField("WidthInFrames");
        basWriter.IncludeField("PreBaseFrames");
        basWriter.IncludeField("PulseIndex");

        vector<unsigned char> qualityValue, substitutionQV, substitutionTag, insertionQV, deletionQV, deletionTag;
        vector<HalfWord> widthInFrames, preBaseFrames, pulseIndex;

        // Just go from 0 .. hole Number
        basWriter.IncludeField("HoleNumber");
        // Fixed to 0.
        basWriter.IncludeField("HoleXY");
        if (usePosMap == false) {
            basWriter.IncludeField("SimulatedSequenceIndex");
            basWriter.IncludeField("SimulatedCoordinate");
        }
        basWriter.SetChangeListID("1.3.0.50.104380");


        DNALength numSimulatedBases  = 0;
        FASTASequence sampleSeq;
        //sampleSeq.length = readLength;
        int maxRetry = 10000000;
        int retryNumber = 0;
        int numReads = 0;
        int readLength = 0;

        while (numBasesPerFile == 0 or numSimulatedBases < numBasesPerFile) {
            DNALength seqIndex, seqPos;
            if (useLengthModel or fixedLength) {
                if (useLengthModel) {
                    lengthHistogram.GetRandomLength(readLength);
                }
                else {
                    readLength = fixedLength;
                }
            }
            if (refGenomeFileName != "") {
                FindRandomPos(reference, seqIndex, seqPos, readLength + (outputModel.keyLength - 1));
                sampleSeq.seq    = &reference[seqIndex].seq[seqPos];
                sampleSeq.length = readLength + (outputModel.keyLength - 1);
                assert(reference[seqIndex].length >= sampleSeq.length);
            }
            else if (sourceReadsFileName != "") {
                if (seqReader.GetNext(sampleSeq) == false) {
                    readsRemain = false;
                    break;
                }
                if (sampleSeq.length < outputModel.keyLength) {
                    continue;
                }
                //
                // Now attempt to parse the position from the fasta title.
                //

                if (useLengthModel) {
                    int tryNumber = 0;
                    readLength = 0;
                    int maxNTries = 1000;
                    int tryBuffer[5] = {-1,-1,-1,-1,-1};
                    while (tryNumber < maxNTries and readLength < outputModel.keyLength) {
                        lengthHistogram.GetRandomLength(readLength);
                        readLength = sampleSeq.length = min(sampleSeq.length, (unsigned int) readLength);
                        tryBuffer[tryNumber%5] = readLength;
                        tryNumber++;
                    }
                    if (tryNumber >= maxNTries) {
                        cout << "ERROR. Could not generate a read length greater than the " << outputModel.keyLength << " requried " <<endl
                             << "minimum number of bases using the length model specified in the alchemy." <<endl
                             << "model.  Something is either wrong with the model or the context length is too large." <<endl;
                        cout << "The last few tries were: " << tryBuffer[0] << " " << tryBuffer[1] << " " << tryBuffer[2] << " " << tryBuffer[3] << " " << tryBuffer[4] << endl;
                        exit(1);
                    }
                }

                readLength = sampleSeq.length;
                vector<string> tokens;
                Tokenize(sampleSeq.title, "|", tokens);
                if (tokens.size() == 4) {
                    seqPos = atoi(tokens[2].c_str());
                    if (titleTableFileName == "") {
                        seqIndex = 0;
                    }
                    else {
                        int index;
                        titleTable.Lookup(tokens[1], index);
                        seqIndex = index;
                    }
                }
                else {
                    seqPos   = 0;
                }
            }

            //
            // If this is the first read printed to the base file, initialize it.
            //
            if (numSimulatedBases == 0) {
                basWriter.Initialize(fileNameStrm.str(), movieNameStrm.str(), Springfield);
                regionWriter.Initialize(basWriter.pulseDataGroup);
            }

            numSimulatedBases += readLength;

            int p;
            // create the sample sequence
            int contextLength = outputModel.keyLength;
            int contextMiddle = contextLength / 2;
            string outputString;

            int nDel = 0;
            int nIns = 0;

            //
            // Simulate to beyond the sample length.
            //
            qualityValue.clear();
            substitutionQV.clear();
            substitutionTag.clear();
            insertionQV.clear();
            deletionQV.clear();
            deletionTag.clear();
            pulseIndex.clear();
            widthInFrames.clear();
            preBaseFrames.clear();
            assert(sampleSeq.length > contextMiddle + 1);
            for (p = contextMiddle;
                    p < sampleSeq.length - contextMiddle - 1; p++) {
                string refContext;
                refContext.assign((const char*) &sampleSeq.seq[p-contextMiddle], contextLength);

                string outputContext;
                int    contextWasFound;
                OutputSample sample;
                int i;
                for (i = 0; i < refContext.size(); i++) {
                    refContext[i] = toupper(refContext[i]);
                }
                outputModel.SampleRandomSample(refContext, sample);

                if (sample.type == OutputSample::Deletion ) {
                    //
                    // There was a deletion.  Advance in reference, then output
                    // the base after the deletion.
                    //
                    p++;
                    ++nDel;
                }

                int cp;
                //
                // Add the sampled context, possibly multiple characters because of an insertion.
                //
                for (i = 0; i < sample.nucleotides.size(); i++) {
                    outputString.push_back(sample.nucleotides[i]);
                    qualityValue.push_back(sample.qualities[i].qv[0]);
                    deletionQV.push_back(sample.qualities[i].qv[1]);
                    insertionQV.push_back(sample.qualities[i].qv[2]);
                    substitutionQV.push_back(sample.qualities[i].qv[3]);
                    deletionTag.push_back(sample.qualities[i].tags[0]);
                    substitutionTag.push_back(sample.qualities[i].tags[1]);
                    pulseIndex.push_back(sample.qualities[i].frameValues[0]);
                    preBaseFrames.push_back(sample.qualities[i].frameValues[1]);
                    widthInFrames.push_back(sample.qualities[i].frameValues[2]);
                }
                nIns += sample.qualities.size() - 1;
            }
            if (outputString.find('N') != outputString.npos or
                    outputString.find('n') != outputString.npos) {
                cout << "WARNING!  The sampled string " << endl << outputString << endl
                     << "should not contain N's, but it seems to.  This is being ignored "<<endl
                     << "for now so that simulation may continue, but this shouldn't happen"<<endl
                     << "and is really a bug." << endl;
                numSimulatedBases -= readLength;
                continue;
            }
            //
            // Ok, done creating the read, now time to create some quality values!!!!!
            //
            SMRTSequence read;
            read.length = outputString.size();
            read.Allocate(read.length);
            memcpy(read.seq, outputString.c_str(), read.length * sizeof(unsigned char));
            assert(qualityValue.size() == read.length * sizeof(unsigned char));
            memcpy(read.qual.data, &qualityValue[0], read.length * sizeof(unsigned char));
            memcpy(read.deletionQV.data, &deletionQV[0], read.length * sizeof(unsigned char));
            memcpy(read.insertionQV.data, &insertionQV[0], read.length * sizeof(unsigned char));
            memcpy(read.substitutionQV.data, &substitutionQV[0], read.length * sizeof(unsigned char));
            memcpy(read.deletionTag, &deletionTag[0], read.length * sizeof(unsigned char));
            memcpy(read.substitutionTag, &substitutionTag[0], read.length * sizeof(unsigned char));
            memcpy(read.pulseIndex, &pulseIndex[0], read.length * sizeof(int));
            memcpy(read.preBaseFrames, &preBaseFrames[0], read.length * sizeof(HalfWord));
            memcpy(read.widthInFrames, &widthInFrames[0], read.length * sizeof(HalfWord));

            //
            // The pulse index for now is just fake data.
            //
            int i;
            for (i = 0; i < read.length; i++) {
                read.pulseIndex[i] = 1;
            }
            read.xy[0] = seqIndex;
            read.xy[1] = seqPos;
            read.zmwData.holeNumber = numReads;

            basWriter.Write(read);
            // Record where this was simulated from.
            if (usePosMap == false) {
                basWriter.WriteSimulatedCoordinate(seqPos);
                basWriter.WriteSimulatedSequenceIndex(seqIndex);
            }
            else {
                posMapFile << fullMovieName << "/" << numReads << "/0_" << read.length << " " << seqIndex << " "<< seqPos;
                if (printPercentRepeat) {
                    DNALength nRepeat = sampleSeq.GetRepeatContent();
                    posMapFile << " " << nRepeat*1.0/sampleSeq.length;
                }
                posMapFile << endl;
            }
            RegionAnnotation region;
            region.row[0] = read.zmwData.holeNumber;
            region.row[1] = 1;
            region.row[2] = 0;
            region.row[3] = read.length;
            region.row[4] = 1000; // Should be enough.
            regionWriter.Write(region);
            region.row[1] = 2; // Rewrite for hq region encompassing everything.
            regionWriter.Write(region);
            if (sourceReadsFileName != "") {
                sampleSeq.Free();
            }
            read.Free();
            ++numReads;
        }
        regionWriter.Finalize(regionTable.columnNames,
                              regionTable.regionTypes,
                              regionTable.regionDescriptions,
                              regionTable.regionSources);
        basWriter.Close();
        numReads = 0;
        //
        // The bas writer should automatically flush on closing.
        //
    }
    if (usePosMap) {
        posMapFile.close();
    }

    for (i = 0; i < reference.size(); i++) {
        reference[i].Free();
    }
}
Beispiel #3
0
int main(int argc, char* argv[]) {
#ifdef USE_GOOGLE_PROFILER
    char *profileFileName = getenv("CPUPROFILE");
    if (profileFileName != NULL) {
      ProfilerStart(profileFileName);
    }
    else {
      ProfilerStart("google_profile.txt");
    }
#endif

    // Register inputs and outputs.
    string samFileName, refFileName, outFileName;

    CommandLineParser clp;
    clp.RegisterStringOption("file.sam", &samFileName,
                             "Input SAM file.");
    clp.RegisterStringOption("reference.fasta", &refFileName,
                             "Reference used to generate reads.");
    clp.RegisterStringOption("out.sam", &outFileName,
                             "Output SAM file.");
    clp.RegisterPreviousFlagsAsHidden();

    // Register filter criteria options.
    int minAlnLength = 50;
    float minPctSimilarity = 70, minPctAccuracy = 70;
    string hitPolicyStr = "randombest";
    bool useScoreCutoff = false;
    int  scoreCutoff = INF_INT;
    int  scoreSignInt = -1;
    RegisterFilterOptions(clp, minAlnLength, minPctSimilarity, 
                          minPctAccuracy, hitPolicyStr, useScoreCutoff,
                          scoreSignInt, scoreCutoff);

    int seed = 1; 
    clp.RegisterIntOption("seed", &seed,
            "(1)  Seed for random number generator.\n"
            "If seed is 0, then use current time as seed.",
            CommandLineParser::Integer);

    string holeNumberStr;
    Ranges holeNumberRanges;
    clp.RegisterStringOption("holeNumbers", &holeNumberStr,
            "A string of comma-delimited hole number ranges to output hits, "
            "such as '1,2,10-12'. "
            "This requires hit titles to be in SMRT read title format.");

    bool parseSmrtTitle = false;
    clp.RegisterFlagOption("smrtTitle", &parseSmrtTitle,
            "Use this option when filtering alignments generated by "
            "programs other than blasr, e.g. bwa-sw or gmap. "
            "  Parse read coordinates from the SMRT read title. " 
            "The title is in the format /name/hole/coordinates, where"
            " coordinates are in the format \\d+_\\d+, and represent "
            "the interval of the read that was aligned.");
    /* This experimental option can be useful for metagenomics, in which case
     * there are hundreds of sequences in the target, of which many titles are
     * long and may contain white spaces (e.g., ' ', '\t'). 
     * In order to save disc space and avoid the (possibly) none unique mapping
     * between full and short reference names, one may call blasr with 
     * -titleTable option to represent all target sequences in the output
     * by their indices in the title table.*/

    string titleTableName = "";
    clp.RegisterStringOption("titleTable", &titleTableName,
            "Use this experimental option when filtering alignments generated by "
            "blasr with -titleTable titleTableName, in which case "
            "reference titles in SAM are represented by their "
            "indices (e.g., 0, 1, 2, ...) in the title table.");

    string adapterGffFileName = "";
    clp.RegisterStringOption("filterAdapterOnly", &adapterGffFileName,
            "Use this option to remove reads which can only map to adapters " 
            "specified in the GFF file.");

    bool verbose = false;
    clp.RegisterFlagOption("v", &verbose, "Be verbose.");

    clp.SetExamples(
            "Because SAM has optional tags that have different meanings"
            " in different programs, careful usage is required in order "
            "to have proper output.  The \"xs\" tag in bwa-sw is used to "
            "show the suboptimal score, but in PacBio SAM (blasr) it is "
            "defined as the start in the query sequence of the alignment.\n"
            "When \"-smrtTitle\" is specified, the xs tag is ignored, but "
            "when it is not specified, the coordinates given by the xs and "
            "xe tags are used to define the interval of a read that is "
            "aligned.  The CIGAR string is relative to this interval.");

    clp.ParseCommandLine(argc, argv);

    // Set random number seed. 
    if (seed == 0) {
        srand(time(NULL));
    } else {
        srand(seed);
    }
    
    scoreSign = (scoreSignInt == -1)?ScoreSign::NEGATIVE:ScoreSign::POSITIVE;
    Score s(static_cast<float>(scoreCutoff), scoreSign);
    FilterCriteria filterCriteria(minAlnLength, minPctSimilarity, 
                                  minPctAccuracy, true, s);
    filterCriteria.Verbose(verbose);
    HitPolicy hitPolicy(hitPolicyStr, scoreSign);
                                  
    string errMsg;
    if (not filterCriteria.MakeSane(errMsg)) {
        cout << errMsg << endl;
        exit(1);
    }

    // Parse hole number ranges. 
    if (holeNumberStr.size() != 0) {
        if (not holeNumberRanges.setRanges(holeNumberStr)) {
            cout << "Could not parse hole number ranges: "
                 << holeNumberStr << "." << endl;
            exit(1);
        } 
    }

    // Open output file.
    ostream * outFilePtr = &cout;
	ofstream outFileStrm;
	if (outFileName != "") {
		CrucialOpen(outFileName, outFileStrm, std::ios::out);
		outFilePtr = &outFileStrm;
	}
    
    GFFFile adapterGffFile;
    if (adapterGffFileName != "")
        adapterGffFile.ReadAll(adapterGffFileName);
    
    SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> samReader;
    FASTAReader fastaReader;

    //
    // Initialize samReader and fastaReader.
    //
    samReader.Initialize(samFileName);
    fastaReader.Initialize(refFileName);

    //
    // Configure the file log.
    //
    string command;
    CommandLineParser::CommandLineToString(argc, argv, command);
    string log = "Filter sam hits.";
    string program = "samFilter";
    string versionString = VERSION;
    AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString);

    //
    // Read necessary input.
    //
    vector<FASTASequence> references;
    fastaReader.ReadAllSequences(references);

    // If the SAM file is generated by blasr with -titleTable,
    // then references in the SAM are represented by 
    // their corresponding indices in the title table.
    // In that case, we need to convert reference titles in fasta file
    // to their corresponding indices in the title table, such that
    // references in both SAM and fasta files are represented
    // by title table indices and therefore can match.
    if (titleTableName != "") {
        ConvertTitlesToTitleTableIndices(references, titleTableName);
    }
 
    AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> alignmentSet;
    vector<string> allHeaders = samReader.ReadHeader(alignmentSet); 

    // Process SAM Header.
    string commandLineString;
    clp.CommandLineToString(argc, argv, commandLineString);
    allHeaders.push_back("@PG\tID:SAMFILTER\tVN:" + versionString + \
                         "\tCL:" + program + " " + commandLineString);
    for (int i = 0; i < allHeaders.size(); i++) {
        outFileStrm << allHeaders[i] << endl;
    }

    //
    // The order of references in vector<FASTASequence> references and
    // AlignmentSet<, , >alignmentSet.references can be different.
    // Rearrange alignmentSet.references such that they are ordered in
    // exactly the same way as vector<FASTASequence> references.
    //
    alignmentSet.RearrangeReferences(references);

    // Map reference name obtained from SAM file to indices
    map<string, int> refNameToIndex;
    for (int i = 0; i < references.size(); i++) {
        string refName = alignmentSet.references[i].GetSequenceName();
        refNameToIndex[refName] = i;
    }

    //
    // Store the alignments.
    //
    SAMAlignment samAlignment;
    int alignIndex = 0; 

    //
    // For 150K, each chip produces about 300M sequences 
    // (not including quality values and etc.).
    // Let's assume that the sam file and reference data can 
    // fit in the memory. 
    // Need to scale for larger sequal data in the future.
    //
    vector<SAMAlignment> allSAMAlignments;
    while (samReader.GetNextAlignment(samAlignment)) {
        if (samAlignment.rName == "*") {
            continue;
        }

        if (parseSmrtTitle and holeNumberStr.size() != 0) {
            string movieName;
            int thisHoleNumber;
            if (not ParsePBIReadName(samAlignment.qName, 
                                     movieName, 
                                     thisHoleNumber)) {
                cout << "ERROR, could not parse SMRT title: "
                     << samAlignment.qName << "." << endl;
                exit(1);
            }
            if (not holeNumberRanges.contains(UInt(thisHoleNumber))) {
                if (verbose) 
                    cout << thisHoleNumber << " is not in range." << endl; 
                continue;
            }
        }

        if (samAlignment.cigar.find('P') != string::npos) {
            cout << "WARNING. Could not process SAM record with 'P' in "
                 << "its cigar string." << endl;
            continue;
        }

        vector<AlignmentCandidate<> > convertedAlignments;
        SAMAlignmentsToCandidates(samAlignment, 
                references, refNameToIndex,
                convertedAlignments, parseSmrtTitle, false);
        
        if (convertedAlignments.size() > 1) {
            cout << "WARNING. Ignore multiple segments." << endl;
            continue;
        }

        for (int i = 0; i < 1; i++) {
            AlignmentCandidate<> & alignment = convertedAlignments[i];

            //score func does not matter
            DistanceMatrixScoreFunction<DNASequence, DNASequence> distFunc; 
            ComputeAlignmentStats(alignment, alignment.qAlignedSeq.seq, 
                                  alignment.tAlignedSeq.seq, distFunc);
                                  
            // Check whether this alignment can only map to adapters in 
            // the adapter GFF file.
            if (adapterGffFileName != "" and 
                CheckAdapterOnly(adapterGffFile, alignment, refNameToIndex)) {
                if (verbose)
                    cout << alignment.qName << " filter adapter only."
                         << endl;
                continue;
            }

            // Assign score to samAlignment.
            samAlignment.score = samAlignment.as;

            if (not filterCriteria.Satisfy(static_cast<AlignmentCandidate<> *>(&alignment))) {
                continue;
            }
            allSAMAlignments.push_back( samAlignment ); 

            alignment.FreeSubsequences();
        }
        ++alignIndex;
    }

    // Sort all SAM alignments by qName, score and target position.
    sort(allSAMAlignments.begin(), allSAMAlignments.end(), 
         byQNameScoreTStart);

    unsigned int groupBegin = 0;
    unsigned int groupEnd = -1;
    vector<SAMAlignment> filteredSAMAlignments;
    while(groupBegin < allSAMAlignments.size()) {
        // Get the next group of SAM alignments which have the same qName
        // from allSAMAlignments[groupBegin ... groupEnd)
        GetNextSAMAlignmentGroup(allSAMAlignments, groupBegin, groupEnd);
        vector<unsigned int> hitIndices = ApplyHitPolicy(
                hitPolicy, allSAMAlignments, groupBegin, groupEnd);
        for(unsigned int i = 0; i < hitIndices.size(); i++) {
            filteredSAMAlignments.push_back(allSAMAlignments[hitIndices[i]]);
        }
        groupBegin = groupEnd;
    }

    // Sort all SAM alignments by reference name and query name
    sort(filteredSAMAlignments.begin(), filteredSAMAlignments.end(), 
         byRNameQName);

    for(unsigned int i = 0; i < filteredSAMAlignments.size(); i++) {
        filteredSAMAlignments[i].PrintSAMAlignment(outFileStrm);
    }

	if (outFileName != "") {
		outFileStrm.close();
	}
#ifdef USE_GOOGLE_PROFILER
  ProfilerStop();
#endif
    return 0;
}