void DPTreephaser::QueryAllStates(BasecallerRead& data, vector< vector<float> >& query_states, vector<int>& hp_lengths, int max_flows)
{
  max_flows = min(max_flows, flow_order_.num_flows());
  InitializeState(&path_[0]);
  query_states.reserve(data.sequence.size());
  query_states.resize(0);
  hp_lengths.assign(data.sequence.size(), 0);
  char last_nuc = 'N';
  int hp_count = 0;

  // Walk the base sequence; every time the nucleotide changes, the previous
  // homopolymer is complete and its length and state vector are recorded.
  for (vector<char>::iterator nuc = data.sequence.begin(); nuc != data.sequence.end() and path_[0].flow < max_flows; ++nuc) {
    if (last_nuc != *nuc and last_nuc != 'N') {
      hp_lengths[hp_count] = path_[0].last_hp;
      query_states.push_back(path_[0].state);
      hp_count++;
    }
    AdvanceStateInPlace(&path_[0], *nuc, max_flows);
    last_nuc = *nuc;
  }

  // Record the final homopolymer
  hp_lengths[hp_count] = path_[0].last_hp;
  query_states.push_back(path_[0].state);
  hp_lengths.resize(query_states.size());
  data.prediction.swap(path_[0].prediction);
}
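// Illustrative usage sketch, not part of the original source: retrieving the
// per-homopolymer state vectors of an already-basecalled read. It assumes a
// configured DPTreephaser `treephaser` and a BasecallerRead `read` with
// read.sequence filled; `num_flows` is a placeholder for the flow count to consider.
static void ExampleQueryAllStates(DPTreephaser& treephaser, BasecallerRead& read, int num_flows)
{
  vector< vector<float> > query_states;  // one state vector per homopolymer
  vector<int> hp_lengths;                // matching homopolymer lengths
  treephaser.QueryAllStates(read, query_states, hp_lengths, num_flows);
  // After the call query_states.size() == hp_lengths.size(), and read.prediction
  // holds the simulated signal of the queried sequence.
}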
void DPTreephaser::QueryState(BasecallerRead& data, vector<float>& query_state, int& current_hp, int max_flows, int query_flow)
{
  max_flows = min(max_flows, flow_order_.num_flows());
  assert(query_flow < max_flows);
  InitializeState(&path_[0]);
  query_state.assign(max_flows, 0);
  char myNuc = 'N';

  // Advance through the sequence until the homopolymer incorporating at query_flow
  // has been fully processed.
  for (vector<char>::iterator nuc = data.sequence.begin(); nuc != data.sequence.end() and path_[0].flow <= query_flow; ++nuc) {
    if (path_[0].flow == query_flow and myNuc != 'N' and myNuc != *nuc)
      break;
    AdvanceStateInPlace(&path_[0], *nuc, flow_order_.num_flows());
    if (path_[0].flow == query_flow and myNuc == 'N')
      myNuc = *nuc;
  }

  // Catch cases where query_flow has no incorporation or lies past the end of the sequence
  int until_flow = min(path_[0].window_end, max_flows);
  if (path_[0].flow == query_flow) {
    current_hp = path_[0].last_hp;
    for (int flow = path_[0].window_start; flow < until_flow; ++flow)
      query_state[flow] = path_[0].state[flow];
  }
  else
    current_hp = 0;
}
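// Illustrative usage sketch, not part of the original source: querying the phasing
// state at a single flow. It assumes a configured DPTreephaser `treephaser` and a
// BasecallerRead `read` with read.sequence filled; `num_flows` and `query_flow`
// are placeholders.
static void ExampleQueryState(DPTreephaser& treephaser, BasecallerRead& read, int num_flows, int query_flow)
{
  vector<float> state;
  int current_hp = 0;
  treephaser.QueryState(read, state, current_hp, num_flows, query_flow);
  // current_hp is the length of the homopolymer incorporating at query_flow, or 0 if
  // nothing incorporates there; state is non-zero only inside the active window.
}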
void DPTreephaser::Simulate(BasecallerRead& data, int max_flows)
{
  InitializeState(&path_[0]);

  for (vector<char>::iterator nuc = data.sequence.begin(); nuc != data.sequence.end() and path_[0].flow < max_flows; ++nuc)
    AdvanceStateInPlace(&path_[0], *nuc, flow_order_.num_flows());

  data.prediction.swap(path_[0].prediction);
}
#if 0
// Legacy flowgram-based variant of Simulate, driven by data.solution rather than
// data.sequence. Its signature duplicates the sequence-based overload above, so the
// two cannot coexist in one translation unit; it is kept here, disabled, for reference.
void DPTreephaser::Simulate(BasecallerRead& data, int max_flows)
{
  max_flows = min(max_flows, flow_order_.num_flows());
  InitializeState(&path_[0]);

  for (int solution_flow = 0; solution_flow < max_flows; ++solution_flow)
    for (int hp = 0; hp < data.solution[solution_flow]; ++hp)
      AdvanceStateInPlace(&path_[0], flow_order_.int_at(solution_flow), flow_order_.num_flows());

  data.prediction.swap(path_[0].prediction);
}
#endif
void DPTreephaser::Simulate(BasecallerRead& data, int max_flows, bool state_inphase)
{
  InitializeState(&path_[0]);

  for (vector<char>::iterator nuc = data.sequence.begin(); nuc != data.sequence.end() and path_[0].flow < max_flows; ++nuc) {
    AdvanceStateInPlace(&path_[0], *nuc, flow_order_.num_flows());
    path_[0].sequence.push_back(*nuc); // Needed to simulate diagonal states correctly
    if (state_inphase and path_[0].flow < max_flows)
      data.state_inphase.at(path_[0].flow) = path_[0].state.at(path_[0].flow);
  }

  data.prediction.swap(path_[0].prediction);
}
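// Illustrative usage sketch, not part of the original source: simulating a read while
// recording the in-phase state of each incorporating flow. The overload above writes
// to data.state_inphase with .at(), so this sketch assumes the caller is responsible
// for sizing that vector beforehand; `num_flows` and the fill value 0.0f are
// assumptions made for this example only.
static void ExampleSimulateInphase(DPTreephaser& treephaser, BasecallerRead& read, int num_flows)
{
  read.state_inphase.assign(num_flows, 0.0f);  // pre-size (assumed caller responsibility)
  treephaser.Simulate(read, num_flows, true);  // fills read.prediction and read.state_inphase
}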
void DPTreephaser::SimulateRecalibrated(BasecallerRead& data, int max_flows)
{
  InitializeState(&path_[0]);

  // Generate predicted signal
  for (vector<char>::iterator nuc = data.sequence.begin(); nuc != data.sequence.end() and path_[0].flow < max_flows; ++nuc) {
    int flow_s = path_[0].flow;
    AdvanceStateInPlace(&path_[0], *nuc, flow_order_.num_flows());
    if (path_[0].flow < flow_order_.num_flows()) {
      // update flow2base_pos
      path_[0].base_pos++;
      path_[0].flow2base_pos[path_[0].flow] = path_[0].base_pos;
      for (int flow_inter = flow_s + 1; flow_inter < path_[0].flow; flow_inter++)
        path_[0].flow2base_pos[flow_inter] = path_[0].flow2base_pos[flow_s];
    }
  }

  // Apply signal distortion according to HP recalibration model
  if (pm_model_available_) {
    for (int flow = 0; flow < flow_order_.num_flows(); ++flow) {
      int hp_length = 0;
      if (flow == 0)
        hp_length = path_[0].flow2base_pos[0];
      else
        hp_length = path_[0].flow2base_pos[flow] - path_[0].flow2base_pos[flow-1];
      if (hp_length < 0)
        hp_length = 0;
      if (hp_length > MAX_HPXLEN)
        hp_length = MAX_HPXLEN;

      path_[0].prediction[flow] = path_[0].prediction[flow] * (*As_)[flow][flow_order_.int_at(flow)][hp_length]
                                  + (*Bs_)[flow][flow_order_.int_at(flow)][hp_length];
    }
  }

  data.prediction.swap(path_[0].prediction);
}
void DPTreephaser::Solve(BasecallerRead& read, int max_flows, int restart_flows)
{
  static const char nuc_int_to_char[5] = "ACGT";
  assert(max_flows <= flow_order_.num_flows());

  // Initialize stack: just one root path
  for (int p = 1; p < kNumPaths; ++p)
    path_[p].in_use = false;

  InitializeState(&path_[0]);
  path_[0].path_metric = 0;
  path_[0].per_flow_metric = 0;
  path_[0].residual_left_of_window = 0;
  path_[0].dot_counter = 0;
  path_[0].in_use = true;
  //path_[0].sequence.reserve(2*flow_order_.num_flows()); //Done in InitializeState

  int space_on_stack = kNumPaths - 1;
  float sum_of_squares_upper_bound = 1e20; //max_flows; // Squared distance of solution to measurements

  if (restart_flows > 0) {
    // The solver will not attempt to solve initial restart_flows
    // - Simulate restart_flows instead of solving
    // - If it turns out that solving was finished before restart_flows, simply exit without any changes to the read.
    restart_flows = min(restart_flows, flow_order_.num_flows());

    for (vector<char>::iterator nuc = read.sequence.begin(); nuc != read.sequence.end() and path_[0].flow < restart_flows; ++nuc) {
      int flow_s = path_[0].flow;
      AdvanceStateInPlace(&path_[0], *nuc, flow_order_.num_flows());
      if (path_[0].flow < flow_order_.num_flows()) {
        path_[0].sequence.push_back(*nuc);
        // update flow2base_pos
        path_[0].base_pos++;
        path_[0].flow2base_pos[path_[0].flow] = path_[0].base_pos;
        for (int flow_inter = flow_s + 1; flow_inter < path_[0].flow; flow_inter++)
          path_[0].flow2base_pos[flow_inter] = path_[0].flow2base_pos[flow_s];
      }
    }

    if (path_[0].flow < restart_flows - 10) {
      // This read ended before restart_flows. No point re-solving it.
      read.prediction.swap(path_[0].prediction);
      return;
    }

    for (int flow = 0; flow < path_[0].window_start; ++flow) {
      float residual = 0;
      if (pm_model_enabled_ == false) {
        residual = read.normalized_measurements[flow] - path_[0].prediction[flow];
      }
      else {
        int hp_length = 0;
        if (flow == 0)
          hp_length = path_[0].flow2base_pos[0];
        else
          hp_length = path_[0].flow2base_pos[flow] - path_[0].flow2base_pos[flow-1];
        if (hp_length < 0)
          hp_length = 0;
        residual = read.normalized_measurements[flow]
                   - (path_[0].prediction[flow] * (*As_)[flow][flow_order_.int_at(flow)][hp_length]
                      + (*Bs_)[flow][flow_order_.int_at(flow)][hp_length]);
      }
      path_[0].residual_left_of_window += residual * residual;
    }
  }

  // Initializing variables
  //read.solution.assign(flow_order_.num_flows(), 0);
  read.sequence.clear();
  read.sequence.reserve(2*flow_order_.num_flows());
  read.prediction.assign(flow_order_.num_flows(), 0);

  // Main loop to select / expand / delete paths
  while (1) {

    // ------------------------------------------
    // Step 1: Prune the content of the stack and make sure there are at least 4 empty slots

    // Remove paths that are more than 'maxPathDelay' behind the longest one
    if (space_on_stack < kNumPaths-3) {
      int longest_path = 0;
      for (int p = 0; p < kNumPaths; ++p)
        if (path_[p].in_use)
          longest_path = max(longest_path, path_[p].flow);

      if (longest_path > kMaxPathDelay) {
        for (int p = 0; p < kNumPaths; ++p) {
          if (path_[p].in_use and path_[p].flow < longest_path-kMaxPathDelay) {
            path_[p].in_use = false;
            space_on_stack++;
          }
        }
      }
    }

    // If necessary, remove paths with worst perFlowMetric
    while (space_on_stack < 4) {
      // find maximum per flow metric
      float max_per_flow_metric = -0.1;
      int max_metric_path = kNumPaths;
      for (int p = 0; p < kNumPaths; ++p) {
        if (path_[p].in_use and path_[p].per_flow_metric > max_per_flow_metric) {
          max_per_flow_metric = path_[p].per_flow_metric;
          max_metric_path = p;
        }
      }

      // killing path with largest per flow metric
      if (!(max_metric_path < kNumPaths)) {
        printf("Failed assertion in Treephaser\n");
        for (int p = 0; p < kNumPaths; ++p) {
          if (path_[p].in_use)
            printf("Path %d, in_use = true, per_flow_metric = %f\n", p, path_[p].per_flow_metric);
          else
            printf("Path %d, in_use = false, per_flow_metric = %f\n", p, path_[p].per_flow_metric);
        }
        fflush(NULL);
      }
      assert(max_metric_path < kNumPaths);

      path_[max_metric_path].in_use = false;
      space_on_stack++;
    }

    // ------------------------------------------
    // Step 2: Select a path to expand or break if there is none

    TreephaserPath *parent = NULL;
    float min_path_metric = 1000;
    for (int p = 0; p < kNumPaths; ++p) {
      if (path_[p].in_use and path_[p].path_metric < min_path_metric) {
        min_path_metric = path_[p].path_metric;
        parent = &path_[p];
      }
    }
    if (!parent)
      break;

    // ------------------------------------------
    // Step 3: Construct four expanded paths and calculate feasibility metrics
    assert(space_on_stack >= 4);

    TreephaserPath *children[4];
    for (int nuc = 0, p = 0; nuc < 4; ++p)
      if (not path_[p].in_use)
        children[nuc++] = &path_[p];

    float penalty[4] = { 0, 0, 0, 0 };

    for (int nuc = 0; nuc < 4; ++nuc) {
      TreephaserPath *child = children[nuc];

      AdvanceState(child, parent, nuc_int_to_char[nuc], max_flows);

      // Apply easy termination rules
      if (child->flow >= max_flows) {
        penalty[nuc] = 25; // Mark for deletion
        continue;
      }
      if (child->last_hp > kMaxHP) {
        penalty[nuc] = 25; // Mark for deletion
        continue;
      }
      if ((int)parent->sequence.size() >= (2 * flow_order_.num_flows() - 10)) {
        penalty[nuc] = 25; // Mark for deletion
        continue;
      }

      child->path_metric = parent->residual_left_of_window;
      child->residual_left_of_window = parent->residual_left_of_window;

      float penaltyN = 0;
      float penalty1 = 0;

      for (int flow = parent->window_start; flow < child->window_end; ++flow) {
        float residual = 0;
        if (pm_model_enabled_ == false) {
          residual = read.normalized_measurements[flow] - child->prediction[flow];
        }
        else {
          int hp_length = 0;
          if (flow == 0)
            hp_length = parent->flow2base_pos[0];
          else
            hp_length = parent->flow2base_pos[flow] - parent->flow2base_pos[flow-1];
          if (hp_length < 0)
            hp_length = 0;
          residual = read.normalized_measurements[flow]
                     - (child->prediction[flow] * (*As_)[flow][flow_order_.int_at(flow)][hp_length]
                        + (*Bs_)[flow][flow_order_.int_at(flow)][hp_length]);
        }
        float residual_squared = residual * residual;

        // Metric calculation
        if (flow < child->window_start) {
          child->residual_left_of_window += residual_squared;
          child->path_metric += residual_squared;
        } else if (residual <= 0)
          child->path_metric += residual_squared;

        if (residual <= 0)
          penaltyN += residual_squared;
        else if (flow < child->flow)
          penalty1 += residual_squared;
      }

      penalty[nuc] = penalty1 + kNegativeMultiplier * penaltyN;
      penalty1 += penaltyN;

      if (child->flow > 0)
        child->per_flow_metric = (child->path_metric + 0.5 * penalty1) / child->flow;

    } // looping over nucs

    // Find out which nuc has the least penalty (the greedy choice nuc)
    int best_nuc = 0;
    if (penalty[best_nuc] > penalty[1])
      best_nuc = 1;
    if (penalty[best_nuc] > penalty[2])
      best_nuc = 2;
    if (penalty[best_nuc] > penalty[3])
      best_nuc = 3;

    // ------------------------------------------
    // Step 4: Use calculated metrics to decide which paths are worth keeping

    for (int nuc = 0; nuc < 4; ++nuc) {
      TreephaserPath *child = children[nuc];

      // Path termination rules
      if (penalty[nuc] >= 20)
        continue;

      if (child->path_metric > sum_of_squares_upper_bound)
        continue;

      // This is the only rule that depends on finding the "best nuc"
      if (penalty[nuc] - penalty[best_nuc] >= kExtendThreshold)
        continue;

      float dot_signal = 0;
      if (pm_model_enabled_ == false) {
        dot_signal = (read.normalized_measurements[child->flow] - parent->prediction[child->flow])
                     / child->state[child->flow];
      }
      else {
        int hp_length = 0;
        if (child->flow == 0)
          hp_length = parent->flow2base_pos[0];
        else
          hp_length = parent->flow2base_pos[child->flow] - parent->flow2base_pos[child->flow-1];
        if (hp_length < 0)
          hp_length = 0;
        dot_signal = (read.normalized_measurements[child->flow]
                      - (parent->prediction[child->flow] * (*As_)[child->flow][flow_order_.int_at(child->flow)][hp_length]
                         + (*Bs_)[child->flow][flow_order_.int_at(child->flow)][hp_length]))
                     / child->state[child->flow];
      }

      child->dot_counter = (dot_signal < kDotThreshold) ? (parent->dot_counter + 1) : 0;
      if (child->dot_counter > 1)
        continue;

      // Path survived termination rules and will be kept on stack
      child->in_use = true;
      space_on_stack--;

      // Fill out the remaining portion of the prediction
      memcpy(&child->prediction[0], &parent->prediction[0], parent->window_start*sizeof(float));

      for (int flow = child->window_end; flow < max_flows; ++flow)
        child->prediction[flow] = 0;

      // Fill out the solution
      child->sequence = parent->sequence;
      child->sequence.push_back(nuc_int_to_char[nuc]);

      // calculate starting base position for each flow
      child->base_pos = parent->base_pos;
      child->base_pos++;
      child->flow2base_pos = parent->flow2base_pos;
      for (int flow_inter = parent->flow + 1; flow_inter < child->flow; flow_inter++) {
        child->flow2base_pos[flow_inter] = child->flow2base_pos[parent->flow];
      }
      child->flow2base_pos[child->flow] = child->base_pos;
    }

    // ------------------------------------------
    // Step 5. Check if the selected path is in fact the best path so far

    // Computing sequence squared distance
    float sum_of_squares = parent->residual_left_of_window;
    for (int flow = parent->window_start; flow < max_flows; flow++) {
      float residual = 0;
      if (pm_model_enabled_ == false) {
        residual = read.normalized_measurements[flow] - parent->prediction[flow];
      }
      else {
        int hp_length = 0;
        if (flow == 0)
          hp_length = parent->flow2base_pos[0];
        else
          hp_length = parent->flow2base_pos[flow] - parent->flow2base_pos[flow-1];
        if (hp_length < 0)
          hp_length = 0;
        residual = read.normalized_measurements[flow]
                   - (parent->prediction[flow] * (*As_)[flow][flow_order_.int_at(flow)][hp_length]
                      + (*Bs_)[flow][flow_order_.int_at(flow)][hp_length]);
      }
      sum_of_squares += residual * residual;
    }

    // Updating best path
    if (sum_of_squares < sum_of_squares_upper_bound) {
      read.prediction.swap(parent->prediction);
      read.sequence.swap(parent->sequence);
      sum_of_squares_upper_bound = sum_of_squares;
    }

    parent->in_use = false;
    space_on_stack++;

  } // main decision loop
}
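// Illustrative usage sketch, not part of the original source: a minimal driver tying
// the entry points above together. `SolveAndResimulate` is a hypothetical helper; it
// assumes a configured DPTreephaser and a BasecallerRead whose normalized_measurements
// hold one value per flow. Passing restart_flows == 0 makes Solve start from flow 0
// (the restart shortcut above only triggers for restart_flows > 0).
static void SolveAndResimulate(DPTreephaser& treephaser, BasecallerRead& read, int num_flows)
{
  treephaser.Solve(read, num_flows, 0);  // fills read.sequence and read.prediction
  treephaser.Simulate(read, num_flows);  // re-predict the signal for the solved sequence
}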