// Function to fill in prediceted signal values int CalculateHypPredictions( PersistingThreadObjects &thread_objects, ExtendedReadInfo &my_read, InputStructures &global_context, const vector<string> &Hypotheses, vector<vector<float> > &predictions, vector<vector<float> > &normalizedMeasurements) { // Create return data structures predictions.resize(Hypotheses.size()); normalizedMeasurements.resize(Hypotheses.size()); // --- Step 1: Loading data to a read int nFlows = min(global_context.treePhaserFlowOrder.num_flows(), (int)my_read.measurementValue.size()); BasecallerRead read; read.key_normalizer = 1; read.raw_measurements.reserve(global_context.treePhaserFlowOrder.num_flows()); read.raw_measurements = my_read.measurementValue; for (unsigned int iFlow = 0; iFlow < read.raw_measurements.size(); iFlow++) if (isnan(read.raw_measurements[iFlow])) { cerr << "Warning: Calculate Predictions: NAN in measurements!"<< endl; read.raw_measurements[iFlow] = 0; } read.raw_measurements.resize(global_context.treePhaserFlowOrder.num_flows(), 0); read.normalized_measurements = read.raw_measurements; read.sequence.clear(); read.sequence.reserve(2*global_context.treePhaserFlowOrder.num_flows()); read.prediction.assign(global_context.treePhaserFlowOrder.num_flows(), 0); read.additive_correction.assign(global_context.treePhaserFlowOrder.num_flows(), 0); read.multiplicative_correction.assign(global_context.treePhaserFlowOrder.num_flows(), 1.0); // --- Step 1b: Initialize Treephaser and Recalibration int steps, window_size = 50; thread_objects.dpTreephaser.SetModelParameters(my_read.phase_params.at(0), my_read.phase_params.at(1), my_read.phase_params.at(2)); if (global_context.use_SSE_basecaller) thread_objects.treephaser_sse.SetModelParameters(my_read.phase_params.at(0), my_read.phase_params.at(1)); // Set up HP recalibration model: hide the recal object behind a mask so we can use the map to select thread_objects.dpTreephaser.DisableRecalibration(); // Disable use of previously loaded recalibration model thread_objects.treephaser_sse.DisableRecalibration(); if (global_context.do_recal.recal_is_live()) { // query recalibration structure using row, column, entity // look up entity here: using row, col, runid // note: perhaps do this when we first get the read, exploit here string found_key = global_context.do_recal.FindKey(my_read.runid, my_read.well_rowcol.at(1), my_read.well_rowcol.at(0)); MultiAB multi_ab; global_context.do_recal.getAB(multi_ab, found_key, my_read.well_rowcol.at(1), my_read.well_rowcol.at(0)); if(multi_ab.Valid()) { thread_objects.dpTreephaser.SetAsBs(multi_ab.aPtr, multi_ab.bPtr, true); thread_objects.treephaser_sse.SetAsBs(multi_ab.aPtr, multi_ab.bPtr, true); // in either case, we will have to provide the predicted intensity by simulateRead using recalibration thread_objects.dpTreephaser.EnableRecalibration(); // Enable the use of the recalibration model thread_objects.treephaser_sse.EnableRecalibration(); // in the 'Solve' function } } // --- Step 2: Solve beginning of the read // Solve beginning of maybe clipped read int until_flow = min((my_read.start_flow+20), nFlows); if (my_read.start_flow>0) { if (global_context.use_SSE_basecaller) thread_objects.treephaser_sse.SolveRead(read, 0, until_flow); else thread_objects.dpTreephaser.Solve(read, until_flow, 0); } // StartFlow clipped? Get solved HP length at startFlow unsigned int base = 0; int flow = 0; int HPlength = 0; while (base<read.sequence.size()) { while (flow < global_context.treePhaserFlowOrder.num_flows() and global_context.treePhaserFlowOrder.nuc_at(flow) != read.sequence[base]) flow++; if (flow > my_read.start_flow or flow == global_context.treePhaserFlowOrder.num_flows()) break; if (flow == my_read.start_flow) HPlength++; base++; } if (global_context.DEBUG>2) printf("Solved %d bases until (not incl.) flow %d. HP of height %d at flow %d.\n", base, flow, HPlength, my_read.start_flow); // Get HP size at the start of the reference, i.e., Hypotheses[0] int count = 1; while (Hypotheses[0][count] == Hypotheses[0][0]) count++; if (global_context.DEBUG>2) printf("Hypothesis starts with an HP of length %d\n", count); // Adjust the length of the prefix and erase extra solved bases if (HPlength>count) base -= count; else base -= HPlength; read.sequence.erase(read.sequence.begin()+base, read.sequence.end()); unsigned int prefix_size = read.sequence.size(); // --- Step 3: creating predictions for the individual hypotheses vector<BasecallerRead> hypothesesReads(Hypotheses.size()); int max_last_flow = 0; for (unsigned int r=0; r<hypothesesReads.size(); ++r) { hypothesesReads[r] = read; // add hypothesis sequence to prefix for (base=0; base<Hypotheses[r].length() and base<(2*(unsigned int)global_context.treePhaserFlowOrder.num_flows()-prefix_size); base++) hypothesesReads[r].sequence.push_back(Hypotheses[r][base]); // get last main incorporating flow int last_incorporating_flow = 0; base = 0; flow = 0; while (base<hypothesesReads[r].sequence.size() and flow<global_context.treePhaserFlowOrder.num_flows()) { while (flow<nFlows and global_context.treePhaserFlowOrder.nuc_at(flow) != hypothesesReads[r].sequence[base]) flow++; last_incorporating_flow = flow; if (last_incorporating_flow > max_last_flow) max_last_flow = last_incorporating_flow; base++; } // Simulate sequence thread_objects.dpTreephaser.Simulate(hypothesesReads[r], global_context.treePhaserFlowOrder.num_flows()); // Adaptively normalize each hypothesis if desired if (global_context.apply_normalization) { steps = last_incorporating_flow / window_size; thread_objects.dpTreephaser.WindowedNormalize(hypothesesReads[r], steps, window_size); } // Solver simulates beginning of the read and then fills in the remaining clipped bases if (global_context.use_SSE_basecaller) thread_objects.treephaser_sse.SolveRead(hypothesesReads[r], last_incorporating_flow, nFlows); else thread_objects.dpTreephaser.Solve(hypothesesReads[r], nFlows, last_incorporating_flow); // Apply HP recalibration distortion to the predictions if (global_context.do_recal.recal_is_live()) thread_objects.dpTreephaser.SimulateRecalibrated(hypothesesReads[r], nFlows); // Store predictions and adaptively normalized measurements predictions[r] = hypothesesReads[r].prediction; predictions[r].resize(nFlows); normalizedMeasurements[r] = hypothesesReads[r].normalized_measurements; normalizedMeasurements[r].resize(nFlows); } // --- verbose --- if (global_context.DEBUG>2) { printf("Calculating predictions for %d hypotheses starting at flow %d:\n", (int)Hypotheses.size(), my_read.start_flow); for (unsigned int i=0; i<Hypotheses.size(); ++i) { for (unsigned int j=0; j<Hypotheses[i].length(); ++j) printf("%c", Hypotheses[i][j]); printf("\n"); } printf("Solved read prefix: "); for (unsigned int j=0; j<prefix_size; ++j) printf("%c", read.sequence[j]); printf("\n"); printf("Extended Hypotheses reads to:\n"); for (unsigned int i=0; i<hypothesesReads.size(); ++i) { for (unsigned int j=0; j<hypothesesReads[i].sequence.size(); ++j) printf("%c", hypothesesReads[i].sequence[j]); printf("\n"); } printf("Phasing Parameters, cf: %f ie: %f dr: %f \n Predictions: \n", my_read.phase_params.at(0), my_read.phase_params.at(1), my_read.phase_params.at(2)); for (unsigned int i=0; i<hypothesesReads.size(); ++i) { for (unsigned int j=0; j<predictions[i].size(); ++j) printf("%f", predictions[i][j]); printf("\n"); } } // --------------- */ return(max_last_flow); }
void CalculateHypDistances(const vector<float>& NormalizedMeasurements, const float& cf, const float& ie, const float& droop, const ion::FlowOrder& flow_order, const vector<string>& Hypotheses, const int& startFlow, vector<float>& DistanceObserved, vector<float>& DistanceHypotheses, vector<vector<float> >& predictions, vector<vector<float> >& normalizedMeasurements, int applyNormalization, int verbose) { // Create return data structures // Distance of normalized observations to different hypotheses: d(obs,h1), ... , d(obs,hN) DistanceObserved.assign(Hypotheses.size(), 0); // Distance of hypotheses to first hypothesis: d(h1,h2), ... , d(h1, hN) DistanceHypotheses.assign(Hypotheses.size()-1, 0); predictions.resize(Hypotheses.size()); normalizedMeasurements.resize(Hypotheses.size()); // Loading key normalized values into a read and performing adaptive normalization BasecallerRead read; read.key_normalizer = 1; read.raw_measurements = NormalizedMeasurements; read.normalized_measurements = NormalizedMeasurements; read.sequence.clear(); read.sequence.reserve(2*flow_order.num_flows()); read.prediction.assign(flow_order.num_flows(), 0); read.additive_correction.assign(flow_order.num_flows(), 0); read.multiplicative_correction.assign(flow_order.num_flows(), 1.0); int steps, window_size = 50; DPTreephaser dpTreephaser(flow_order); dpTreephaser.SetModelParameters(cf, ie, droop); // Solve beginning of maybe clipped read if (startFlow>0) dpTreephaser.Solve(read, (startFlow+20), 0); // StartFlow clipped? Get solved HP length at startFlow unsigned int base = 0; int flow = 0; int HPlength = 0; while (base<read.sequence.size()){ while (flow < flow_order.num_flows() and flow_order.nuc_at(flow) != read.sequence[base]) flow++; if (flow > startFlow or flow == flow_order.num_flows()) break; if (flow == startFlow) HPlength++; base++; } if (verbose>0) Rprintf("Solved %d bases until (not incl.) flow %d. HP of height %d at flow %d.\n", base, flow, HPlength, startFlow); // Get HP size at the start of the reference, i.e., Hypotheses[0] int count = 1; while (Hypotheses[0][count] == Hypotheses[0][0]) count++; if (verbose>0) Rprintf("Hypothesis starts with an HP of length %d\n", count); // Adjust the length of the prefix and erase extra solved bases if (HPlength>count) base -= count; else base -= HPlength; read.sequence.erase(read.sequence.begin()+base, read.sequence.end()); unsigned int prefix_size = read.sequence.size(); // creating predictions for the individual hypotheses vector<BasecallerRead> hypothesesReads(Hypotheses.size()); int max_last_flow = 0; for (unsigned int r=0; r<hypothesesReads.size(); ++r) { hypothesesReads[r] = read; // add hypothesis sequence to prefix for (base=0; base<Hypotheses[r].length() and base<(2*(unsigned int)flow_order.num_flows()-prefix_size); base++) hypothesesReads[r].sequence.push_back(Hypotheses[r][base]); // get last main incorporating flow int last_incorporating_flow = 0; base = 0; flow = 0; while (base<hypothesesReads[r].sequence.size() and flow<flow_order.num_flows()){ while (flow_order.nuc_at(flow) != hypothesesReads[r].sequence[base]) flow++; last_incorporating_flow = flow; if (last_incorporating_flow > max_last_flow) max_last_flow = last_incorporating_flow; base++; } // Simulate sequence dpTreephaser.Simulate(hypothesesReads[r], flow_order.num_flows()); // Adaptively normalize each hypothesis if (applyNormalization>0) { steps = last_incorporating_flow / window_size; dpTreephaser.WindowedNormalize(hypothesesReads[r], steps, window_size); } // Solver simulates beginning of the read and then fills in the remaining clipped bases dpTreephaser.Solve(hypothesesReads[r], flow_order.num_flows(), last_incorporating_flow); // Store predictions and adaptively normalized measurements predictions[r] = hypothesesReads[r].prediction; normalizedMeasurements[r] = hypothesesReads[r].normalized_measurements; } // --- Calculating distances --- // Include only flow values in the distance where the predictions differ by more than "threshold" float threshold = 0.05; // Do not include flows after main inc. flow of lastest hypothesis for (int flow=0; flow<(max_last_flow+1); ++flow) { bool includeFlow = false; for (unsigned int hyp=1; hyp<hypothesesReads.size(); ++hyp) if (abs(hypothesesReads[hyp].prediction[flow] - hypothesesReads[0].prediction[flow])>threshold) includeFlow = true; if (includeFlow) { for (unsigned int hyp=0; hyp<hypothesesReads.size(); ++hyp) { float residual = hypothesesReads[hyp].normalized_measurements[flow] - hypothesesReads[hyp].prediction[flow]; DistanceObserved[hyp] += residual * residual; if (hyp>0) { residual = hypothesesReads[0].prediction[flow] - hypothesesReads[hyp].prediction[flow]; DistanceHypotheses[hyp-1] += residual * residual; } } } } // --- verbose --- if (verbose>0){ Rprintf("Calculating distances between %d hypotheses starting at flow %d:\n", Hypotheses.size(), startFlow); for (unsigned int i=0; i<Hypotheses.size(); ++i){ for (unsigned int j=0; j<Hypotheses[i].length(); ++j) Rprintf("%c", Hypotheses[i][j]); Rprintf("\n"); } Rprintf("Solved read prefix: "); for (unsigned int j=0; j<prefix_size; ++j) Rprintf("%c", read.sequence[j]); Rprintf("\n"); Rprintf("Extended Hypotheses reads to:\n"); for (unsigned int i=0; i<hypothesesReads.size(); ++i){ for (unsigned int j=0; j<hypothesesReads[i].sequence.size(); ++j) Rprintf("%c", hypothesesReads[i].sequence[j]); Rprintf("\n"); } Rprintf("Calculated Distances d2(obs, H_i), d2(H_i, H_0):\n"); Rprintf("%f, 0\n", DistanceObserved[0]); for (unsigned int i=1; i<Hypotheses.size(); ++i) Rprintf("%f, %f\n", DistanceObserved[i], DistanceHypotheses[i-1]); } // --------------- */ }
// Function to fill in prediceted signal values int CalculateHypPredictions( PersistingThreadObjects &thread_objects, const Alignment &my_read, const InputStructures &global_context, const vector<string> &Hypotheses, const vector<bool> &same_as_null_hypothesis, vector<vector<float> > &predictions, vector<vector<float> > &normalizedMeasurements, int flow_upper_bound) { // --- Step 1: Initialize Objects if (global_context.DEBUG > 2) cout << "Prediction Generation for read " << my_read.alignment.Name << endl; predictions.resize(Hypotheses.size()); normalizedMeasurements.resize(Hypotheses.size()); // Careful: num_flows may be smaller than flow_order.num_flows() const ion::FlowOrder & flow_order = global_context.flow_order_vector.at(my_read.flow_order_index); const int & num_flows = global_context.num_flows_by_run_id.at(my_read.runid); int prefix_flow = 0; BasecallerRead master_read; master_read.SetData(my_read.measurements, flow_order.num_flows()); InitializeBasecallers(thread_objects, my_read, global_context); // --- Step 2: Processing read prefix or solve beginning of the read if desired unsigned int prefix_size = 0; if (global_context.resolve_clipped_bases or my_read.prefix_flow < 0) { prefix_flow = GetStartOfMasterRead(thread_objects, my_read, global_context, Hypotheses, num_flows, master_read); prefix_size = master_read.sequence.size(); } else { const string & read_prefix = global_context.key_by_read_group.at(my_read.read_group); prefix_size = read_prefix.length(); for (unsigned int i_base=0; i_base < read_prefix.length(); i_base++) master_read.sequence.push_back(read_prefix.at(i_base)); prefix_flow = my_read.prefix_flow; } // --- Step 3: creating predictions for the individual hypotheses // Compute an upper limit of flows to be simulated or solved if (global_context.DEBUG > 2) cout << "Prediction Generation: determining flow upper bound (flow_order.num_flows()=" << flow_order.num_flows() << ") as the minimum of:" << " flow_upper_bound " << flow_upper_bound << " measurement_length " << my_read.measurements_length << " num_flows " << num_flows << endl; flow_upper_bound = min(flow_upper_bound, min(my_read.measurements_length, num_flows)); vector<BasecallerRead> hypothesesReads(Hypotheses.size()); int max_last_flow = 0; for (unsigned int i_hyp=0; i_hyp<hypothesesReads.size(); ++i_hyp) { // No need to simulate if a hypothesis is equal to the read as called // We get that info from the splicing module if (same_as_null_hypothesis.at(i_hyp)) { predictions[i_hyp] = predictions[0]; predictions[i_hyp].resize(flow_order.num_flows()); normalizedMeasurements[i_hyp] = normalizedMeasurements[0]; normalizedMeasurements[i_hyp].resize(flow_order.num_flows()); } else { hypothesesReads[i_hyp] = master_read; // --- add hypothesis sequence to clipped prefix unsigned int i_base = 0; unsigned int max_bases = 2*(unsigned int)flow_order.num_flows()-prefix_size; // Our maximum allocated memory for the sequence vector int i_flow = prefix_flow; // Add bases to read object sequence // We add one more base beyond 'flow_upper_bound' (if available) to signal Treephaser to not even start the solver while (i_base<Hypotheses[i_hyp].length() and i_base<max_bases) { IncrementFlow(flow_order, Hypotheses[i_hyp][i_base], i_flow); hypothesesReads[i_hyp].sequence.push_back(Hypotheses[i_hyp][i_base]); if (i_flow >= flow_upper_bound) { i_flow = flow_upper_bound; break; } i_base++; } // Find last main incorporating flow of all hypotheses max_last_flow = max(max_last_flow, i_flow); // Solver simulates beginning of the read and then fills in the remaining clipped bases // Above checks on flow_upper_bound and i_flow guarantee that i_flow <= flow_upper_bound <= num_flows thread_objects.SolveRead(my_read.flow_order_index, hypothesesReads[i_hyp], min(i_flow,flow_upper_bound), flow_upper_bound); // Store predictions and adaptively normalized measurements predictions[i_hyp].swap(hypothesesReads[i_hyp].prediction); predictions[i_hyp].resize(flow_order.num_flows(), 0); normalizedMeasurements[i_hyp].swap(hypothesesReads[i_hyp].normalized_measurements); normalizedMeasurements[i_hyp].resize(flow_order.num_flows(), 0); } } // --- verbose --- if (global_context.DEBUG>2) PredictionGenerationVerbose(Hypotheses, hypothesesReads, my_read, predictions, prefix_size, global_context); //return max_last_flow; return (max_last_flow); }
// Function to fill in predicted signal values void BaseHypothesisEvaluator(BamTools::BamAlignment &alignment, const string &flow_order_str, const string &alt_base_hyp, float &delta_score, float &fit_score, int heavy_verbose) { // --- Step 1: Initialize Objects and retrieve relevant tags delta_score = 1e5; fit_score = 1e5; vector<string> Hypotheses(2); vector<float> measurements, phase_params; int start_flow, num_flows, prefix_flow=0; if (not GetBamTags(alignment, flow_order_str.length(), measurements, phase_params, start_flow)) return; num_flows = measurements.size(); ion::FlowOrder flow_order(flow_order_str, num_flows); BasecallerRead master_read; master_read.SetData(measurements, flow_order.num_flows()); TreephaserLite treephaser(flow_order); treephaser.SetModelParameters(phase_params[0], phase_params[1]); // --- Step 2: Solve beginning of the read // Look at mapped vs. unmapped reads in BAM Hypotheses[0] = alignment.QueryBases; Hypotheses[1] = alt_base_hyp; // Safety: reverse complement reverse strand reads in mapped bam if (alignment.IsMapped() and alignment.IsReverseStrand()) { RevComplementInPlace(Hypotheses[0]); RevComplementInPlace(Hypotheses[1]); } prefix_flow = GetMasterReadPrefix(treephaser, flow_order, start_flow, Hypotheses[0], master_read); unsigned int prefix_size = master_read.sequence.size(); // --- Step 3: creating predictions for the individual hypotheses vector<BasecallerRead> hypothesesReads(Hypotheses.size()); vector<float> squared_distances(Hypotheses.size(), 0.0); int max_last_flow = 0; for (unsigned int i_hyp=0; i_hyp<hypothesesReads.size(); ++i_hyp) { hypothesesReads[i_hyp] = master_read; // --- add hypothesis sequence to clipped prefix unsigned int i_base = 0; int i_flow = prefix_flow; while (i_base<Hypotheses[i_hyp].length() and i_base<(2*(unsigned int)flow_order.num_flows()-prefix_size)) { while (i_flow < flow_order.num_flows() and flow_order.nuc_at(i_flow) != Hypotheses[i_hyp][i_base]) i_flow++; if (i_flow < flow_order.num_flows() and i_flow > max_last_flow) max_last_flow = i_flow; if (i_flow >= flow_order.num_flows()) break; // Add base to sequence only if it fits into flow order hypothesesReads[i_hyp].sequence.push_back(Hypotheses[i_hyp][i_base]); i_base++; } i_flow = min(i_flow, flow_order.num_flows()-1); // Solver simulates beginning of the read and then fills in the remaining clipped bases for which we have flow information treephaser.Solve(hypothesesReads[i_hyp], num_flows, i_flow); } // Compute L2-distance of measurements and predictions for (unsigned int i_hyp=0; i_hyp<hypothesesReads.size(); ++i_hyp) { for (int iFlow=0; iFlow<=max_last_flow; iFlow++) squared_distances[i_hyp] += (measurements.at(iFlow) - hypothesesReads[i_hyp].prediction.at(iFlow)) * (measurements.at(iFlow) - hypothesesReads[i_hyp].prediction.at(iFlow)); } // Delta: L2-distance of alternative base Hypothesis - L2-distance of bases as called delta_score = squared_distances.at(1) - squared_distances.at(0); fit_score = min(squared_distances.at(1), squared_distances.at(0)); // --- verbose --- if (heavy_verbose > 1 or (delta_score < 0 and heavy_verbose > 0)) { cout << "Processed read " << alignment.Name << endl; cout << "Delta Fit: " << delta_score << " Overall Fit: " << fit_score << endl; PredictionGenerationVerbose(Hypotheses, hypothesesReads, phase_params, flow_order, start_flow, prefix_size); } }