int GetMasterReadPrefix(TreephaserLite &treephaser, const ion::FlowOrder &flow_order, const int &start_flow, const string &called_bases, BasecallerRead &master_read) { // Solve beginning of maybe clipped read int until_flow = min((start_flow+20), flow_order.num_flows()); treephaser.Solve(master_read, until_flow, 0); // StartFlow clipped? Get solved HP length at startFlow. unsigned int base = 0; int flow = 0; unsigned int HPlength = 0; while (base < master_read.sequence.size()) { while (flow < flow_order.num_flows() and flow_order.nuc_at(flow) != master_read.sequence[base]) { flow++; } if (flow > start_flow or flow == flow_order.num_flows()) break; if (flow == start_flow) HPlength++; base++; } //if (global_context.DEBUG>2) // printf("Solved %d bases until (not incl.) flow %d. HP of height %d at flow %d.\n", base, flow, HPlength, start_flow); // Get HP size at the start of the read as called in Hypotheses[0] unsigned int count = 1; while (count < called_bases.length() and called_bases.at(count) == called_bases.at(0)) count++; //if (global_context.DEBUG>2) // printf("Hypothesis starts with an HP of length %d\n", count); // Adjust the length of the prefix and erase extra solved bases if (HPlength>count) base -= count; else base -= HPlength; master_read.sequence.erase(master_read.sequence.begin()+base, master_read.sequence.end()); // Get flow of last prefix base int prefix_flow = 0; for (unsigned int i_base = 0; i_base < master_read.sequence.size(); i_base++) { while (prefix_flow < flow_order.num_flows() and flow_order.nuc_at(prefix_flow) != master_read.sequence[i_base]) prefix_flow++; } return prefix_flow; }
void PredictionGenerationVerbose(const vector<string> &Hypotheses, const vector<BasecallerRead> &hypothesesReads, const vector<float> &phase_params, const ion::FlowOrder &flow_order, const int &start_flow, const int &prefix_size) { printf("Calculating predictions for %d hypotheses starting at flow %d:\n", (int)Hypotheses.size(), start_flow); for (unsigned int iHyp=0; iHyp<Hypotheses.size(); ++iHyp) { for (unsigned int iBase=0; iBase<Hypotheses[iHyp].length(); ++iBase) printf("%c", Hypotheses[iHyp][iBase]); printf("\n"); } printf("Solved read prefix: "); for (int iBase=0; iBase<prefix_size; ++iBase) printf("%c", hypothesesReads[0].sequence[iBase]); printf("\n"); printf("Extended Hypotheses reads to:\n"); for (unsigned int iHyp=0; iHyp<hypothesesReads.size(); ++iHyp) { for (unsigned int iBase=0; iBase<hypothesesReads[iHyp].sequence.size(); ++iBase) printf("%c", hypothesesReads[iHyp].sequence[iBase]); printf("\n"); } printf("Phasing Parameters, cf: %f ie: %f dr: %f \n Predictions: \n", phase_params[0], phase_params[1], phase_params[2]); cout << "Flow Order : "; for (int i_flow=0; i_flow<flow_order.num_flows(); i_flow++) { cout << flow_order.nuc_at(i_flow) << " "; if (hypothesesReads[0].normalized_measurements[i_flow] < 0) cout << " "; } cout << endl << "Flow Index : "; for (int i_flow=0; i_flow<flow_order.num_flows(); i_flow++) { cout << i_flow << " "; if (i_flow<10) cout << " "; else if (i_flow<100) cout << " "; else if (i_flow<1000) cout << " "; if (hypothesesReads[0].normalized_measurements[i_flow] < 0) cout << " "; } cout << endl << "Measured : "; for (unsigned int i_flow=0; i_flow<hypothesesReads[0].normalized_measurements.size(); ++i_flow) { printf("%.2f", hypothesesReads[0].normalized_measurements[i_flow]); if (hypothesesReads[0].normalized_measurements[i_flow] < 10) cout << " "; } cout << endl; for (unsigned int i_Hyp=0; i_Hyp<hypothesesReads.size(); ++i_Hyp) { cout << "Prediction "<< i_Hyp << ": "; for (unsigned int i_flow=0; 
i_flow<hypothesesReads[i_Hyp].prediction.size(); ++i_flow) { printf("%.2f", hypothesesReads[i_Hyp].prediction[i_flow]); if (hypothesesReads[i_Hyp].prediction[i_flow] < 10) cout << " "; if (hypothesesReads[0].normalized_measurements[i_flow] < 0) cout << " "; } cout << endl; } cout << " ------------------- " << endl; }
// Walks prefix_bases through the flow order and stores in rai->prefix_flow the
// flow that incorporates the last prefix base (0 if the prefix is empty;
// capped at flow_order.num_flows()).
void GetPrefixFlow(Alignment *rai, const string & prefix_bases, const ion::FlowOrder & flow_order) {
  int flow = 0;
  for (unsigned int i = 0; i < prefix_bases.length() and flow < flow_order.num_flows(); ++i) {
    while (flow < flow_order.num_flows() and flow_order.nuc_at(flow) != prefix_bases.at(i))
      ++flow;
  }
  rai->prefix_flow = flow;
}
// Fills rai->flow_index so that entry i holds the flow incorporating read base i,
// starting the walk at rai->start_flow. Entries that receive no flow keep the
// sentinel value flow_order.num_flows(). Exits with an error if the read has
// more bases than the flow order can incorporate.
void CreateFlowIndex(Alignment *rai, const ion::FlowOrder & flow_order) {
  const unsigned int num_bases = rai->read_bases.length();
  rai->flow_index.assign(num_bases, flow_order.num_flows());  // sentinel-initialize

  int flow = rai->start_flow;
  unsigned int base_idx = 0;
  for ( ; base_idx < num_bases and flow < flow_order.num_flows(); ++base_idx) {
    while (flow < flow_order.num_flows() and flow_order.nuc_at(flow) != rai->read_bases[base_idx])
      ++flow;
    rai->flow_index[base_idx] = flow;
  }

  // Any leftover bases mean the flow order was exhausted.
  if (base_idx != num_bases) {
    cerr << "WARNING in ExtendedReadInfo::CreateFlowIndex: There are more bases in the read than fit into the flow order.";
    exit(1);
  }
}
// Advances each entry of 'flows' (except flows[0], which is deliberately left
// untouched here — presumably a reference/anchor position; confirm with callers)
// to the next flow at or after its current value whose nucleotide equals 'nuc'.
// Entries are capped at flow_order.num_flows().
void IncrementFlows(const ion::FlowOrder &flow_order, const char &nuc, vector<int> &flows) {
  for (unsigned int i = 1; i < flows.size(); ++i) {
    int &f = flows[i];
    while (f < flow_order.num_flows() and flow_order.nuc_at(f) != nuc)
      ++f;
  }
}
// Advances 'flow' to the first flow at or after its current value whose
// nucleotide equals 'nuc', or to flow_order.num_flows() if none remains.
void IncrementFlow(const ion::FlowOrder &flow_order, const char &nuc, int &flow) {
  const int num_flows = flow_order.num_flows();
  for ( ; flow < num_flows; ++flow)
    if (flow_order.nuc_at(flow) == nuc)
      break;
}
void CalculateHypDistances(const vector<float>& NormalizedMeasurements, const float& cf, const float& ie, const float& droop, const ion::FlowOrder& flow_order, const vector<string>& Hypotheses, const int& startFlow, vector<float>& DistanceObserved, vector<float>& DistanceHypotheses, vector<vector<float> >& predictions, vector<vector<float> >& normalizedMeasurements, int applyNormalization, int verbose) { // Create return data structures // Distance of normalized observations to different hypotheses: d(obs,h1), ... , d(obs,hN) DistanceObserved.assign(Hypotheses.size(), 0); // Distance of hypotheses to first hypothesis: d(h1,h2), ... , d(h1, hN) DistanceHypotheses.assign(Hypotheses.size()-1, 0); predictions.resize(Hypotheses.size()); normalizedMeasurements.resize(Hypotheses.size()); // Loading key normalized values into a read and performing adaptive normalization BasecallerRead read; read.key_normalizer = 1; read.raw_measurements = NormalizedMeasurements; read.normalized_measurements = NormalizedMeasurements; read.sequence.clear(); read.sequence.reserve(2*flow_order.num_flows()); read.prediction.assign(flow_order.num_flows(), 0); read.additive_correction.assign(flow_order.num_flows(), 0); read.multiplicative_correction.assign(flow_order.num_flows(), 1.0); int steps, window_size = 50; DPTreephaser dpTreephaser(flow_order); dpTreephaser.SetModelParameters(cf, ie, droop); // Solve beginning of maybe clipped read if (startFlow>0) dpTreephaser.Solve(read, (startFlow+20), 0); // StartFlow clipped? Get solved HP length at startFlow unsigned int base = 0; int flow = 0; int HPlength = 0; while (base<read.sequence.size()){ while (flow < flow_order.num_flows() and flow_order.nuc_at(flow) != read.sequence[base]) flow++; if (flow > startFlow or flow == flow_order.num_flows()) break; if (flow == startFlow) HPlength++; base++; } if (verbose>0) Rprintf("Solved %d bases until (not incl.) flow %d. 
HP of height %d at flow %d.\n", base, flow, HPlength, startFlow); // Get HP size at the start of the reference, i.e., Hypotheses[0] int count = 1; while (Hypotheses[0][count] == Hypotheses[0][0]) count++; if (verbose>0) Rprintf("Hypothesis starts with an HP of length %d\n", count); // Adjust the length of the prefix and erase extra solved bases if (HPlength>count) base -= count; else base -= HPlength; read.sequence.erase(read.sequence.begin()+base, read.sequence.end()); unsigned int prefix_size = read.sequence.size(); // creating predictions for the individual hypotheses vector<BasecallerRead> hypothesesReads(Hypotheses.size()); int max_last_flow = 0; for (unsigned int r=0; r<hypothesesReads.size(); ++r) { hypothesesReads[r] = read; // add hypothesis sequence to prefix for (base=0; base<Hypotheses[r].length() and base<(2*(unsigned int)flow_order.num_flows()-prefix_size); base++) hypothesesReads[r].sequence.push_back(Hypotheses[r][base]); // get last main incorporating flow int last_incorporating_flow = 0; base = 0; flow = 0; while (base<hypothesesReads[r].sequence.size() and flow<flow_order.num_flows()){ while (flow_order.nuc_at(flow) != hypothesesReads[r].sequence[base]) flow++; last_incorporating_flow = flow; if (last_incorporating_flow > max_last_flow) max_last_flow = last_incorporating_flow; base++; } // Simulate sequence dpTreephaser.Simulate(hypothesesReads[r], flow_order.num_flows()); // Adaptively normalize each hypothesis if (applyNormalization>0) { steps = last_incorporating_flow / window_size; dpTreephaser.WindowedNormalize(hypothesesReads[r], steps, window_size); } // Solver simulates beginning of the read and then fills in the remaining clipped bases dpTreephaser.Solve(hypothesesReads[r], flow_order.num_flows(), last_incorporating_flow); // Store predictions and adaptively normalized measurements predictions[r] = hypothesesReads[r].prediction; normalizedMeasurements[r] = hypothesesReads[r].normalized_measurements; } // --- Calculating distances --- // 
Include only flow values in the distance where the predictions differ by more than "threshold" float threshold = 0.05; // Do not include flows after main inc. flow of lastest hypothesis for (int flow=0; flow<(max_last_flow+1); ++flow) { bool includeFlow = false; for (unsigned int hyp=1; hyp<hypothesesReads.size(); ++hyp) if (abs(hypothesesReads[hyp].prediction[flow] - hypothesesReads[0].prediction[flow])>threshold) includeFlow = true; if (includeFlow) { for (unsigned int hyp=0; hyp<hypothesesReads.size(); ++hyp) { float residual = hypothesesReads[hyp].normalized_measurements[flow] - hypothesesReads[hyp].prediction[flow]; DistanceObserved[hyp] += residual * residual; if (hyp>0) { residual = hypothesesReads[0].prediction[flow] - hypothesesReads[hyp].prediction[flow]; DistanceHypotheses[hyp-1] += residual * residual; } } } } // --- verbose --- if (verbose>0){ Rprintf("Calculating distances between %d hypotheses starting at flow %d:\n", Hypotheses.size(), startFlow); for (unsigned int i=0; i<Hypotheses.size(); ++i){ for (unsigned int j=0; j<Hypotheses[i].length(); ++j) Rprintf("%c", Hypotheses[i][j]); Rprintf("\n"); } Rprintf("Solved read prefix: "); for (unsigned int j=0; j<prefix_size; ++j) Rprintf("%c", read.sequence[j]); Rprintf("\n"); Rprintf("Extended Hypotheses reads to:\n"); for (unsigned int i=0; i<hypothesesReads.size(); ++i){ for (unsigned int j=0; j<hypothesesReads[i].sequence.size(); ++j) Rprintf("%c", hypothesesReads[i].sequence[j]); Rprintf("\n"); } Rprintf("Calculated Distances d2(obs, H_i), d2(H_i, H_0):\n"); Rprintf("%f, 0\n", DistanceObserved[0]); for (unsigned int i=1; i<Hypotheses.size(); ++i) Rprintf("%f, %f\n", DistanceObserved[i], DistanceHypotheses[i-1]); } // --------------- */ }