Ejemplo n.º 1
0
// Function to fill in predicted signal values.
//
// For every entry of 'Hypotheses' a copy of the read is built (solved read prefix
// followed by the hypothesis bases), simulated, optionally normalized, and then
// solved by the treephaser objects held in 'thread_objects'.
//
//   thread_objects         - treephaser objects; their model parameters and
//                            recalibration settings are overwritten by this call
//   my_read                - supplies measurementValue, phase_params, start_flow,
//                            runid and well_rowcol
//   global_context         - flow order, recalibration model and option flags
//   Hypotheses             - base-space hypotheses; Hypotheses[0] is used to
//                            measure the homopolymer at the start of the hypotheses
//   predictions            - out: one prediction vector per hypothesis, nFlows long
//   normalizedMeasurements - out: one normalized measurement vector per hypothesis,
//                            nFlows long
//
// Returns the largest flow index recorded as "last incorporating flow" over all
// hypotheses, or 0 if 'Hypotheses' is empty.
int CalculateHypPredictions(
		PersistingThreadObjects  &thread_objects,
        ExtendedReadInfo         &my_read,
        InputStructures          &global_context,
        const vector<string>     &Hypotheses,
        vector<vector<float> >   &predictions,
        vector<vector<float> >   &normalizedMeasurements) {

  // Create return data structures
  predictions.resize(Hypotheses.size());
  normalizedMeasurements.resize(Hypotheses.size());

  // Nothing to predict. This also protects the Hypotheses[0] access in Step 2.
  if (Hypotheses.empty())
    return 0;

  // --- Step 1: Loading data to a read

  // Number of flows that are covered by both the flow order and the measurements
  int nFlows = min(global_context.treePhaserFlowOrder.num_flows(), (int)my_read.measurementValue.size());

  BasecallerRead read;
  read.key_normalizer = 1;
  read.raw_measurements.reserve(global_context.treePhaserFlowOrder.num_flows());
  read.raw_measurements = my_read.measurementValue;

  // NaN measurements are replaced by zero so that they cannot propagate
  for (unsigned int iFlow = 0; iFlow < read.raw_measurements.size(); iFlow++)
    if (isnan(read.raw_measurements[iFlow])) {
      cerr << "Warning: Calculate Predictions: NAN in measurements!"<< endl;
      read.raw_measurements[iFlow] = 0;
    }

  // Zero-pad (or truncate) the measurements to the length of the flow order
  read.raw_measurements.resize(global_context.treePhaserFlowOrder.num_flows(), 0);
  read.normalized_measurements = read.raw_measurements;
  read.sequence.clear();
  read.sequence.reserve(2*global_context.treePhaserFlowOrder.num_flows());
  read.prediction.assign(global_context.treePhaserFlowOrder.num_flows(), 0);
  read.additive_correction.assign(global_context.treePhaserFlowOrder.num_flows(), 0);
  read.multiplicative_correction.assign(global_context.treePhaserFlowOrder.num_flows(), 1.0);

  // --- Step 1b: Initialize Treephaser and Recalibration

  int window_size = 50;   // Window length used for the adaptive normalization
  thread_objects.dpTreephaser.SetModelParameters(my_read.phase_params.at(0), my_read.phase_params.at(1), my_read.phase_params.at(2));
  if (global_context.use_SSE_basecaller)
    thread_objects.treephaser_sse.SetModelParameters(my_read.phase_params.at(0), my_read.phase_params.at(1));

  // Set up HP recalibration model: hide the recal object behind a mask so we can use the map to select
  thread_objects.dpTreephaser.DisableRecalibration();   // Disable use of previously loaded recalibration model
  thread_objects.treephaser_sse.DisableRecalibration();

  if (global_context.do_recal.recal_is_live()) {
    // query recalibration structure using row, column, entity
    // look up entity here: using row, col, runid
    // note: perhaps do this when we first get the read, exploit here
    string found_key = global_context.do_recal.FindKey(my_read.runid, my_read.well_rowcol.at(1), my_read.well_rowcol.at(0));
    MultiAB multi_ab;
    global_context.do_recal.getAB(multi_ab, found_key, my_read.well_rowcol.at(1), my_read.well_rowcol.at(0));
    if(multi_ab.Valid()) {
      thread_objects.dpTreephaser.SetAsBs(multi_ab.aPtr, multi_ab.bPtr, true);
      thread_objects.treephaser_sse.SetAsBs(multi_ab.aPtr, multi_ab.bPtr, true);
      // in either case, we will have to provide the predicted intensity by simulateRead using recalibration
      thread_objects.dpTreephaser.EnableRecalibration();    // Enable the use of the recalibration model
      thread_objects.treephaser_sse.EnableRecalibration();  // in the 'Solve' function
    }
  }

  // --- Step 2: Solve beginning of the read

  // Solve beginning of maybe clipped read
  int until_flow = min((my_read.start_flow+20), nFlows);
  if (my_read.start_flow>0) {
    if (global_context.use_SSE_basecaller)
      thread_objects.treephaser_sse.SolveRead(read, 0, until_flow);
    else
      thread_objects.dpTreephaser.Solve(read, until_flow, 0);
  }

  // StartFlow clipped? Get solved HP length at startFlow
  unsigned int base = 0;
  int flow = 0;
  int HPlength = 0;
  while (base<read.sequence.size()) {
    while (flow < global_context.treePhaserFlowOrder.num_flows()
            and global_context.treePhaserFlowOrder.nuc_at(flow) != read.sequence[base])
      flow++;
    if (flow > my_read.start_flow or flow == global_context.treePhaserFlowOrder.num_flows())
      break;
    if (flow == my_read.start_flow)
      HPlength++;
    base++;
  }
  if (global_context.DEBUG>2)
    printf("Solved %d bases until (not incl.) flow %d. HP of height %d at flow %d.\n", base, flow, HPlength, my_read.start_flow);

  // Get HP size at the start of the reference, i.e., Hypotheses[0].
  // The scan is bounded by the string length so that an empty or very short
  // first hypothesis is never indexed out of range.
  const string &first_hypothesis = Hypotheses[0];
  int count = 1;
  while (count < (int)first_hypothesis.length() and first_hypothesis[count] == first_hypothesis[0])
    count++;
  if (global_context.DEBUG>2)
    printf("Hypothesis starts with an HP of length %d\n", count);

  // Adjust the length of the prefix and erase extra solved bases
  if (HPlength>count)
    base -= count;
  else
    base -= HPlength;
  read.sequence.erase(read.sequence.begin()+base, read.sequence.end());
  unsigned int prefix_size = read.sequence.size();

  // --- Step 3: creating predictions for the individual hypotheses

  vector<BasecallerRead> hypothesesReads(Hypotheses.size());
  int max_last_flow  = 0;

  for (unsigned int r=0; r<hypothesesReads.size(); ++r) {

    hypothesesReads[r] = read;
    // add hypothesis sequence to prefix; the sequence is limited to 2*num_flows bases in total
    for (base=0; base<Hypotheses[r].length() and base<(2*(unsigned int)global_context.treePhaserFlowOrder.num_flows()-prefix_size); base++)
      hypothesesReads[r].sequence.push_back(Hypotheses[r][base]);

    // get last main incorporating flow
    int last_incorporating_flow = 0;
    base = 0;
    flow = 0;
    while (base<hypothesesReads[r].sequence.size() and flow<global_context.treePhaserFlowOrder.num_flows()) {
      while (flow<nFlows and global_context.treePhaserFlowOrder.nuc_at(flow) != hypothesesReads[r].sequence[base])
        flow++;
      last_incorporating_flow = flow;
      if (last_incorporating_flow > max_last_flow)
        max_last_flow = last_incorporating_flow;
      base++;
    }

    // Simulate sequence
    thread_objects.dpTreephaser.Simulate(hypothesesReads[r], global_context.treePhaserFlowOrder.num_flows());

    // Adaptively normalize each hypothesis if desired
    if (global_context.apply_normalization) {
      int steps = last_incorporating_flow / window_size;
      thread_objects.dpTreephaser.WindowedNormalize(hypothesesReads[r], steps, window_size);
    }

    // Solver simulates beginning of the read and then fills in the remaining clipped bases
    if (global_context.use_SSE_basecaller)
      thread_objects.treephaser_sse.SolveRead(hypothesesReads[r], last_incorporating_flow, nFlows);
    else
      thread_objects.dpTreephaser.Solve(hypothesesReads[r], nFlows, last_incorporating_flow);

    // Apply HP recalibration distortion to the predictions
    if (global_context.do_recal.recal_is_live())
      thread_objects.dpTreephaser.SimulateRecalibrated(hypothesesReads[r], nFlows);

    // Store predictions and adaptively normalized measurements
    predictions[r] = hypothesesReads[r].prediction;
    predictions[r].resize(nFlows);
    normalizedMeasurements[r] = hypothesesReads[r].normalized_measurements;
    normalizedMeasurements[r].resize(nFlows);
  }

  // --- verbose ---
  if (global_context.DEBUG>2) {
    printf("Calculating predictions for %d hypotheses starting at flow %d:\n", (int)Hypotheses.size(), my_read.start_flow);
    for (unsigned int i=0; i<Hypotheses.size(); ++i) {
      for (unsigned int j=0; j<Hypotheses[i].length(); ++j)
        printf("%c", Hypotheses[i][j]);
      printf("\n");
    }
    printf("Solved read prefix: ");
    for (unsigned int j=0; j<prefix_size; ++j)
      printf("%c", read.sequence[j]);
    printf("\n");
    printf("Extended Hypotheses reads to:\n");
    for (unsigned int i=0; i<hypothesesReads.size(); ++i) {
      for (unsigned int j=0; j<hypothesesReads[i].sequence.size(); ++j)
        printf("%c", hypothesesReads[i].sequence[j]);
      printf("\n");
    }
    printf("Phasing Parameters, cf: %f ie: %f dr: %f \n Predictions: \n",
    		my_read.phase_params.at(0), my_read.phase_params.at(1), my_read.phase_params.at(2));
    for (unsigned int i=0; i<hypothesesReads.size(); ++i) {
      for (unsigned int j=0; j<predictions[i].size(); ++j)
        printf("%f", predictions[i][j]);
      printf("\n");
    }
  }
  // --------------- */
  return(max_last_flow);
}
Ejemplo n.º 2
0
// Computes squared distances between measurements and hypothesis predictions.
//
// For every entry of 'Hypotheses' a read is built (solved read prefix followed by
// the hypothesis bases), simulated, optionally normalized and solved with a
// DPTreephaser that is configured with (cf, ie, droop).
//
//   NormalizedMeasurements - key normalized measurements of the read
//   cf, ie, droop          - phasing parameters handed to SetModelParameters
//   flow_order             - flow order used for simulation and solving
//   Hypotheses             - base-space hypotheses; Hypotheses[0] is the reference
//   startFlow              - first flow of the hypotheses; if > 0 the read
//                            beginning is solved first to obtain a prefix
//   DistanceObserved       - out: per hypothesis, sum over the included flows of
//                            (normalized measurement - prediction)^2
//   DistanceHypotheses     - out: entry i-1 holds the sum over the included flows
//                            of (prediction of hyp. 0 - prediction of hyp. i)^2
//   predictions            - out: prediction vector of every hypothesis
//   normalizedMeasurements - out: normalized measurements of every hypothesis
//   applyNormalization     - > 0 switches on windowed normalization
//   verbose                - > 0 prints diagnostics through Rprintf
//
// A flow is included in the distances if it is not beyond the last incorporating
// flow of any hypothesis and at least one hypothesis prediction differs from the
// prediction of hypothesis 0 by more than 0.05 at that flow.
void CalculateHypDistances(const vector<float>& NormalizedMeasurements,
				  const float& cf,
				  const float& ie,
				  const float& droop,
				  const ion::FlowOrder& flow_order,
				  const vector<string>& Hypotheses,
				  const int& startFlow,
				  vector<float>& DistanceObserved,
				  vector<float>& DistanceHypotheses,
				  vector<vector<float> >& predictions,
				  vector<vector<float> >& normalizedMeasurements,
				  int applyNormalization,
				  int verbose)
{
	// Create return data structures
	predictions.resize(Hypotheses.size());
	normalizedMeasurements.resize(Hypotheses.size());
	// Distance of normalized observations to different hypotheses: d(obs,h1), ... , d(obs,hN)
	DistanceObserved.assign(Hypotheses.size(), 0);
	// Without hypotheses there is nothing to compare. Returning here also keeps
	// 'Hypotheses.size()-1' below from wrapping around to a huge unsigned value.
	if (Hypotheses.empty()) {
		DistanceHypotheses.clear();
		return;
	}
	// Distance of hypotheses to first hypothesis: d(h1,h2), ... , d(h1, hN)
	DistanceHypotheses.assign(Hypotheses.size()-1, 0);

	// Loading key normalized values into a read and performing adaptive normalization
	BasecallerRead read;
	read.key_normalizer = 1;
	read.raw_measurements = NormalizedMeasurements;
	read.normalized_measurements = NormalizedMeasurements;
	read.sequence.clear();
	read.sequence.reserve(2*flow_order.num_flows());
	read.prediction.assign(flow_order.num_flows(), 0);
	read.additive_correction.assign(flow_order.num_flows(), 0);
	read.multiplicative_correction.assign(flow_order.num_flows(), 1.0);

	int window_size = 50;   // Window length used for the adaptive normalization
	DPTreephaser dpTreephaser(flow_order);
	dpTreephaser.SetModelParameters(cf, ie, droop);

	// Solve beginning of maybe clipped read
	if (startFlow>0) {
		// Never ask the solver for more flows than the flow order contains
		int solve_until_flow = startFlow + 20;
		if (solve_until_flow > flow_order.num_flows())
			solve_until_flow = flow_order.num_flows();
		dpTreephaser.Solve(read, solve_until_flow, 0);
	}

	// StartFlow clipped? Get solved HP length at startFlow
	unsigned int base = 0;
	int flow = 0;
	int HPlength = 0;
	while (base<read.sequence.size()){
		while (flow < flow_order.num_flows() and flow_order.nuc_at(flow) != read.sequence[base])
			flow++;
		if (flow > startFlow or flow == flow_order.num_flows())
			break;
		if (flow == startFlow)
			HPlength++;
		base++;
	}
	if (verbose>0)
		Rprintf("Solved %d bases until (not incl.) flow %d. HP of height %d at flow %d.\n", base, flow, HPlength, startFlow);

	// Get HP size at the start of the reference, i.e., Hypotheses[0].
	// The scan is bounded by the string length so that an empty or very short
	// first hypothesis is never indexed out of range.
	const string &first_hypothesis = Hypotheses[0];
	int count = 1;
	while (count < (int)first_hypothesis.length() and first_hypothesis[count] == first_hypothesis[0])
		count++;
	if (verbose>0)
		Rprintf("Hypothesis starts with an HP of length %d\n", count);

	// Adjust the length of the prefix and erase extra solved bases
	if (HPlength>count)
		base -= count;
	else
		base -= HPlength;
	read.sequence.erase(read.sequence.begin()+base, read.sequence.end());
	unsigned int prefix_size = read.sequence.size();

	// creating predictions for the individual hypotheses
	vector<BasecallerRead> hypothesesReads(Hypotheses.size());
	int max_last_flow  = 0;

	for (unsigned int r=0; r<hypothesesReads.size(); ++r) {

		hypothesesReads[r] = read;
		// add hypothesis sequence to prefix; the sequence is limited to 2*num_flows bases in total
		for (base=0; base<Hypotheses[r].length() and base<(2*(unsigned int)flow_order.num_flows()-prefix_size); base++)
			hypothesesReads[r].sequence.push_back(Hypotheses[r][base]);

		// get last main incorporating flow
		int last_incorporating_flow = 0;
		base = 0;
		flow = 0;
		while (base<hypothesesReads[r].sequence.size() and flow<flow_order.num_flows()){
			// Bounded scan: a base that does not fit into the flow order must not
			// push 'flow' past the last valid flow index.
			while (flow<flow_order.num_flows() and flow_order.nuc_at(flow) != hypothesesReads[r].sequence[base])
				flow++;
			if (flow >= flow_order.num_flows())
				break;
			last_incorporating_flow = flow;
			if (last_incorporating_flow > max_last_flow)
				max_last_flow = last_incorporating_flow;
			base++;
		}

		// Simulate sequence
		dpTreephaser.Simulate(hypothesesReads[r], flow_order.num_flows());

		// Adaptively normalize each hypothesis
		if (applyNormalization>0) {
			int steps = last_incorporating_flow / window_size;
			dpTreephaser.WindowedNormalize(hypothesesReads[r], steps, window_size);
		}

		// Solver simulates beginning of the read and then fills in the remaining clipped bases
		dpTreephaser.Solve(hypothesesReads[r], flow_order.num_flows(), last_incorporating_flow);

		// Store predictions and adaptively normalized measurements
		predictions[r] = hypothesesReads[r].prediction;
		normalizedMeasurements[r] = hypothesesReads[r].normalized_measurements;
	}


	// --- Calculating distances ---
	// Include only flow values in the distance where the predictions differ by more than "threshold"
	float threshold = 0.05;

	// Do not include flows after main inc. flow of latest hypothesis
	for (int iFlow=0; iFlow<(max_last_flow+1); ++iFlow) {
		bool includeFlow = false;
		for (unsigned int hyp=1; hyp<hypothesesReads.size(); ++hyp) {
			// Explicit two-sided comparison: does not depend on which abs() overload
			// (integer or floating point) happens to be visible.
			float difference = hypothesesReads[hyp].prediction[iFlow] - hypothesesReads[0].prediction[iFlow];
			if (difference > threshold or difference < -threshold)
				includeFlow = true;
		}

		if (includeFlow) {
			for (unsigned int hyp=0; hyp<hypothesesReads.size(); ++hyp) {
				float residual = hypothesesReads[hyp].normalized_measurements[iFlow] - hypothesesReads[hyp].prediction[iFlow];
				DistanceObserved[hyp] += residual * residual;
				if (hyp>0) {
					residual = hypothesesReads[0].prediction[iFlow] - hypothesesReads[hyp].prediction[iFlow];
					DistanceHypotheses[hyp-1] += residual * residual;
				}
			}
		}

	}

	// --- verbose ---
	if (verbose>0){
		// size() is cast because %d expects an int, not a size_t
		Rprintf("Calculating distances between %d hypotheses starting at flow %d:\n", (int)Hypotheses.size(), startFlow);
		for (unsigned int i=0; i<Hypotheses.size(); ++i){
			for (unsigned int j=0; j<Hypotheses[i].length(); ++j)
				Rprintf("%c", Hypotheses[i][j]);
			Rprintf("\n");
		}
		Rprintf("Solved read prefix: ");
		for (unsigned int j=0; j<prefix_size; ++j)
			Rprintf("%c", read.sequence[j]);
		Rprintf("\n");
		Rprintf("Extended Hypotheses reads to:\n");
		for (unsigned int i=0; i<hypothesesReads.size(); ++i){
			for (unsigned int j=0; j<hypothesesReads[i].sequence.size(); ++j)
				Rprintf("%c", hypothesesReads[i].sequence[j]);
			Rprintf("\n");
		}
		Rprintf("Calculated Distances d2(obs, H_i), d2(H_i, H_0):\n");
		Rprintf("%f, 0\n", DistanceObserved[0]);
		for (unsigned int i=1; i<Hypotheses.size(); ++i)
			Rprintf("%f, %f\n", DistanceObserved[i], DistanceHypotheses[i-1]);
	}
	// --------------- */

}
Ejemplo n.º 3
0
// Generates flow-space predictions and normalized measurements for a set of
// base-space hypotheses of one read.
//
//   thread_objects          - provides the solver (SolveRead) for the read's flow order
//   my_read                 - read with measurements, run id, read group,
//                             flow order index and prefix flow
//   global_context          - flow orders, flows per run, keys per read group, options
//   Hypotheses              - hypothesis sequences to evaluate
//   same_as_null_hypothesis - true for a hypothesis that equals the read as called;
//                             such a hypothesis is not solved but receives a copy
//                             of whatever predictions[0] / normalizedMeasurements[0]
//                             hold at that point
//   predictions             - out: one vector per hypothesis, flow_order.num_flows() long
//   normalizedMeasurements  - out: one vector per hypothesis, flow_order.num_flows() long
//   flow_upper_bound        - caller supplied limit on the flows to be solved; it is
//                             further limited by the measurement length and the
//                             number of flows of the run
//
// Returns the largest flow reached while adding hypothesis bases, taken over all
// hypotheses that were actually solved.
int CalculateHypPredictions(
    PersistingThreadObjects  &thread_objects,
    const Alignment          &my_read,
    const InputStructures    &global_context,
    const vector<string>     &Hypotheses,
    const vector<bool>       &same_as_null_hypothesis,
    vector<vector<float> >   &predictions,
    vector<vector<float> >   &normalizedMeasurements,
    int flow_upper_bound) {

    const bool verbose = (global_context.DEBUG > 2);

    // --- Step 1: Initialize Objects

    if (verbose)
      cout << "Prediction Generation for read " << my_read.alignment.Name << endl;

    const unsigned int num_hypotheses = Hypotheses.size();
    predictions.resize(num_hypotheses);
    normalizedMeasurements.resize(num_hypotheses);

    // Careful: num_flows may be smaller than flow_order.num_flows()
    const ion::FlowOrder & flow_order = global_context.flow_order_vector.at(my_read.flow_order_index);
    const int num_flows = global_context.num_flows_by_run_id.at(my_read.runid);

    BasecallerRead master_read;
    master_read.SetData(my_read.measurements, flow_order.num_flows());
    InitializeBasecallers(thread_objects, my_read, global_context);

    // --- Step 2: Processing read prefix or solve beginning of the read if desired

    int          prefix_flow = 0;
    unsigned int prefix_size = 0;

    if (global_context.resolve_clipped_bases or my_read.prefix_flow < 0) {
      // The start of the read is obtained by the helper, which fills master_read.sequence
      prefix_flow = GetStartOfMasterRead(thread_objects, my_read, global_context, Hypotheses, num_flows, master_read);
      prefix_size = master_read.sequence.size();
    }
    else {
      // The key sequence of the read group serves as prefix
      const string & read_prefix = global_context.key_by_read_group.at(my_read.read_group);
      prefix_size = read_prefix.length();
      for (string::const_iterator nuc = read_prefix.begin(); nuc != read_prefix.end(); ++nuc)
        master_read.sequence.push_back(*nuc);
      prefix_flow = my_read.prefix_flow;
    }

    // --- Step 3: creating predictions for the individual hypotheses

    // Compute an upper limit of flows to be simulated or solved
    if (verbose)
      cout << "Prediction Generation: determining flow upper bound (flow_order.num_flows()=" << flow_order.num_flows() << ") as the minimum of:"
           << " flow_upper_bound " << flow_upper_bound
           << " measurement_length " << my_read.measurements_length
           << " num_flows " << num_flows << endl;
    const int measured_flows = min(my_read.measurements_length, num_flows);
    flow_upper_bound = min(flow_upper_bound, measured_flows);

    vector<BasecallerRead> hypothesesReads(num_hypotheses);
    int max_last_flow = 0;

    for (unsigned int i_hyp = 0; i_hyp < num_hypotheses; ++i_hyp) {

      // No need to simulate if a hypothesis is equal to the read as called
      // We get that info from the splicing module
      if (same_as_null_hypothesis.at(i_hyp)) {
        predictions[i_hyp] = predictions[0];
        predictions[i_hyp].resize(flow_order.num_flows());
        normalizedMeasurements[i_hyp] = normalizedMeasurements[0];
        normalizedMeasurements[i_hyp].resize(flow_order.num_flows());
        continue;
      }

      BasecallerRead & hyp_read   = hypothesesReads[i_hyp];
      const string   & hypothesis = Hypotheses[i_hyp];
      hyp_read = master_read;

      // --- add hypothesis sequence to clipped prefix
      // Our maximum allocated memory for the sequence vector
      const unsigned int max_bases = 2*(unsigned int)flow_order.num_flows()-prefix_size;
      int i_flow = prefix_flow;

      // Add bases to read object sequence
      // We add one more base beyond 'flow_upper_bound' (if available) to signal Treephaser to not even start the solver
      for (unsigned int i_base = 0; i_base < hypothesis.length() and i_base < max_bases; ++i_base) {
        const char nuc = hypothesis[i_base];
        IncrementFlow(flow_order, nuc, i_flow);
        hyp_read.sequence.push_back(nuc);
        if (i_flow >= flow_upper_bound) {
          i_flow = flow_upper_bound;
          break;
        }
      }

      // Find last main incorporating flow of all hypotheses
      if (i_flow > max_last_flow)
        max_last_flow = i_flow;

      // Solver simulates beginning of the read and then fills in the remaining clipped bases
      // The restart flow handed to the solver never exceeds flow_upper_bound
      const int restart_flow = min(i_flow, flow_upper_bound);
      thread_objects.SolveRead(my_read.flow_order_index, hyp_read, restart_flow, flow_upper_bound);

      // Store predictions and adaptively normalized measurements
      predictions[i_hyp].swap(hyp_read.prediction);
      predictions[i_hyp].resize(flow_order.num_flows(), 0);
      normalizedMeasurements[i_hyp].swap(hyp_read.normalized_measurements);
      normalizedMeasurements[i_hyp].resize(flow_order.num_flows(), 0);
    }

    // --- verbose ---
    if (verbose)
      PredictionGenerationVerbose(Hypotheses, hypothesesReads, my_read, predictions, prefix_size, global_context);

    return max_last_flow;
}
// Scores an alternative base sequence against the bases as called for one BAM alignment.
//
//   alignment      - BAM record; QueryBases is the null hypothesis, and the tags
//                    read by GetBamTags supply measurements, phasing parameters
//                    and start flow
//   flow_order_str - flow order as a string
//   alt_base_hyp   - alternative base sequence to be compared with QueryBases
//   delta_score    - out: squared distance of the alternative hypothesis minus
//                    squared distance of the bases as called
//   fit_score      - out: the smaller of the two squared distances
//   heavy_verbose  - > 1 always prints diagnostics; > 0 prints them only when
//                    delta_score is negative
//
// Both scores are left at 1e5 if GetBamTags fails.
void BaseHypothesisEvaluator(BamTools::BamAlignment    &alignment,
                             const string              &flow_order_str,
                             const string              &alt_base_hyp,
                             float                     &delta_score,
                             float                     &fit_score,
                             int                       heavy_verbose) {

    // --- Step 1: Initialize Objects and retrieve relevant tags

    // Default scores that are reported if the tags cannot be retrieved
    delta_score = 1e5;
    fit_score   = 1e5;

    vector<string> Hypotheses(2);
    vector<float>  measurements;
    vector<float>  phase_params;
    int            start_flow = 0;

    if (not GetBamTags(alignment, flow_order_str.length(), measurements, phase_params, start_flow))
      return;

    const int num_flows = measurements.size();
    ion::FlowOrder flow_order(flow_order_str, num_flows);
    BasecallerRead master_read;
    master_read.SetData(measurements, flow_order.num_flows());
    TreephaserLite treephaser(flow_order);
    treephaser.SetModelParameters(phase_params[0], phase_params[1]);

    // --- Step 2: Solve beginning of the read
    // Look at mapped vs. unmapped reads in BAM
    Hypotheses[0] = alignment.QueryBases;
    Hypotheses[1] = alt_base_hyp;

    // Safety: reverse complement reverse strand reads in mapped bam
    const bool mapped_to_reverse_strand = alignment.IsMapped() and alignment.IsReverseStrand();
    if (mapped_to_reverse_strand) {
      RevComplementInPlace(Hypotheses[0]);
      RevComplementInPlace(Hypotheses[1]);
    }

    const int prefix_flow = GetMasterReadPrefix(treephaser, flow_order, start_flow, Hypotheses[0], master_read);
    const unsigned int prefix_size = master_read.sequence.size();

    // --- Step 3: creating predictions for the individual hypotheses

    const int total_flows = flow_order.num_flows();
    // Upper limit on the number of hypothesis bases appended to the prefix
    const unsigned int max_bases = 2*(unsigned int)total_flows - prefix_size;

    vector<BasecallerRead> hypothesesReads(Hypotheses.size());
    vector<float> squared_distances(Hypotheses.size(), 0.0);
    int max_last_flow = 0;

    for (unsigned int i_hyp = 0; i_hyp < hypothesesReads.size(); ++i_hyp) {

      BasecallerRead & hyp_read   = hypothesesReads[i_hyp];
      const string   & hypothesis = Hypotheses[i_hyp];
      hyp_read = master_read;

      // --- add hypothesis sequence to clipped prefix
      int i_flow = prefix_flow;
      for (unsigned int i_base = 0; i_base < hypothesis.length() and i_base < max_bases; ++i_base) {
        const char nuc = hypothesis[i_base];

        // Advance to the next flow that matches this base
        while (i_flow < total_flows and flow_order.nuc_at(i_flow) != nuc)
          ++i_flow;

        // Add base to sequence only if it fits into flow order
        if (i_flow >= total_flows)
          break;
        if (i_flow > max_last_flow)
          max_last_flow = i_flow;
        hyp_read.sequence.push_back(nuc);
      }
      if (i_flow > total_flows-1)
        i_flow = total_flows-1;

      // Solver simulates beginning of the read and then fills in the remaining clipped bases for which we have flow information
      treephaser.Solve(hyp_read, num_flows, i_flow);
    }

    // Compute L2-distance of measurements and predictions
    for (unsigned int i_hyp = 0; i_hyp < hypothesesReads.size(); ++i_hyp) {
      const vector<float> & prediction = hypothesesReads[i_hyp].prediction;
      for (int iFlow = 0; iFlow <= max_last_flow; ++iFlow) {
        const float residual = measurements.at(iFlow) - prediction.at(iFlow);
        squared_distances[i_hyp] += residual * residual;
      }
    }

    // Delta: L2-distance of alternative base Hypothesis - L2-distance of bases as called
    const float distance_called      = squared_distances.at(0);
    const float distance_alternative = squared_distances.at(1);
    delta_score = distance_alternative - distance_called;
    fit_score   = min(distance_alternative, distance_called);

    // --- verbose ---
    const bool print_details = heavy_verbose > 1 or (delta_score < 0 and heavy_verbose > 0);
    if (print_details) {
      cout << "Processed read " << alignment.Name << endl;
      cout << "Delta Fit: " << delta_score << " Overall Fit: " << fit_score << endl;
      PredictionGenerationVerbose(Hypotheses, hypothesesReads, phase_params, flow_order, start_flow, prefix_size);
    }

}