Exemple #1
0
// Replays data.sequence through the in-place state model (path_[0]) and takes a
// snapshot of the model each time the incoming base differs from the previous
// one, plus one final snapshot after the last processed base.
//
//   data          - in: sequence to replay; out: data.prediction receives the
//                   prediction buffer of path_[0] (exchanged via swap).
//   query_states  - out: one copy of path_[0].state per snapshot.
//   hp_lengths    - out: path_[0].last_hp per snapshot; same length as query_states.
//   max_flows     - flow limit; clamped to flow_order_.num_flows(). Replay stops
//                   once path_[0].flow reaches this limit.
//
// An empty sequence yields exactly one snapshot (the freshly initialized state).
void DPTreephaser::QueryAllStates(BasecallerRead& data, vector< vector<float> >& query_states, vector<int>& hp_lengths, int max_flows)
{
  max_flows = min(max_flows, flow_order_.num_flows());
  InitializeState(&path_[0]);

  const vector<char>::size_type num_bases = data.sequence.size();
  query_states.reserve(num_bases);
  query_states.resize(0);
  // The final snapshot below is written unconditionally, so keep at least one
  // slot; with an empty sequence a zero-sized vector would be written past its end.
  hp_lengths.assign(num_bases > 0 ? num_bases : 1, 0);
  char last_nuc = 'N';
  int hp_count = 0;

  for (vector<char>::iterator nuc = data.sequence.begin(); nuc != data.sequence.end() and path_[0].flow < max_flows; ++nuc) {
    // A change of base closes the previous run: snapshot it before advancing.
    // 'N' marks "no base seen yet", so nothing is recorded ahead of the first base.
    if (last_nuc != *nuc and last_nuc != 'N') {
      hp_lengths[hp_count] = path_[0].last_hp;
      query_states.push_back(path_[0].state);
      hp_count++;
    }
    AdvanceStateInPlace(&path_[0], *nuc, max_flows);
    last_nuc = *nuc;
  }

  // Snapshot for the last run (or for the initial state if nothing was processed).
  hp_lengths[hp_count] = path_[0].last_hp;
  query_states.push_back(path_[0].state);
  // Drop the unused tail so both outputs have one entry per snapshot.
  hp_lengths.resize(query_states.size());
  data.prediction.swap(path_[0].prediction);
}
Exemple #2
0
// Replays data.sequence on path_[0] up to query_flow and reports the model
// state at that flow.
//
//   data         - sequence to replay (read only here).
//   query_state  - out: resized to max_flows and zero-filled; if path_[0] ends
//                  on query_flow, entries in [window_start, min(window_end, max_flows))
//                  are copied from path_[0].state.
//   current_hp   - out: path_[0].last_hp if path_[0] ends on query_flow, else 0.
//   max_flows    - clamped to flow_order_.num_flows().
//   query_flow   - flow of interest; asserted to be below the clamped max_flows.
void DPTreephaser::QueryState(BasecallerRead& data, vector<float>& query_state, int& current_hp, int max_flows, int query_flow)
{
  max_flows = min(max_flows,flow_order_.num_flows());
  assert(query_flow < max_flows);
  InitializeState(&path_[0]);
  query_state.assign(max_flows,0);

  // First base observed while sitting on query_flow; 'N' means none yet.
  char base_at_query_flow = 'N';

  vector<char>::iterator base = data.sequence.begin();
  while (base != data.sequence.end() and path_[0].flow <= query_flow) {
    // Already on the query flow and the next base differs from the one that
    // landed there: stop before advancing past the query flow.
    if (path_[0].flow == query_flow and base_at_query_flow != 'N' and base_at_query_flow != *base)
      break;

    AdvanceStateInPlace(&path_[0], *base, flow_order_.num_flows());

    if (base_at_query_flow == 'N' and path_[0].flow == query_flow)
      base_at_query_flow = *base;

    ++base;
  }

  // The path did not end on query_flow (e.g. no incorporation there, or the
  // sequence ended earlier): report an empty result.
  if (path_[0].flow != query_flow) {
    current_hp = 0;
    return;
  }

  current_hp = path_[0].last_hp;
  const int copy_end = min(path_[0].window_end, max_flows);
  for (int flow = path_[0].window_start; flow < copy_end; ++flow)
    query_state[flow] = path_[0].state[flow];
}
Exemple #3
0
// Replays data.sequence on path_[0] and hands the resulting prediction to the caller.
//
//   data       - in: sequence to replay; out: data.prediction receives the
//                prediction buffer of path_[0] (exchanged via swap).
//   max_flows  - replay stops once path_[0].flow reaches this value.
void DPTreephaser::Simulate(BasecallerRead& data, int max_flows)
{
  InitializeState(&path_[0]);

  // Feed bases in order until either the sequence or the flow budget is exhausted.
  vector<char>::iterator base = data.sequence.begin();
  while (base != data.sequence.end() and path_[0].flow < max_flows) {
    AdvanceStateInPlace(&path_[0], *base, flow_order_.num_flows());
    ++base;
  }

  data.prediction.swap(path_[0].prediction);
}
Exemple #4
0
// Replays a per-flow solution on path_[0] and hands the resulting prediction to the caller.
//
//   data       - in: data.solution[flow] gives how many times the state is
//                advanced for that flow; out: data.prediction receives the
//                prediction buffer of path_[0] (exchanged via swap).
//   max_flows  - number of solution flows to replay; clamped to flow_order_.num_flows().
void DPTreephaser::Simulate(BasecallerRead& data, int max_flows)
{
  max_flows = min(max_flows,flow_order_.num_flows());
  InitializeState(&path_[0]);

  int solution_flow = 0;
  while (solution_flow < max_flows) {
    // Advance once per unit of data.solution[solution_flow], using that flow's nucleotide.
    int hp = 0;
    while (hp < data.solution[solution_flow]) {
      AdvanceStateInPlace(&path_[0], flow_order_.int_at(solution_flow), flow_order_.num_flows());
      ++hp;
    }
    ++solution_flow;
  }

  data.prediction.swap(path_[0].prediction);
}
Exemple #5
0
// Replays data.sequence on path_[0], optionally recording the state value at
// the flow reached after each base.
//
//   data           - in: sequence to replay; out: data.prediction receives the
//                    prediction buffer of path_[0] (exchanged via swap); when
//                    state_inphase is set, data.state_inphase is filled per base.
//   max_flows      - replay stops once path_[0].flow reaches this value.
//   state_inphase  - when true, store path_[0].state at the current flow into
//                    data.state_inphase at that same flow (bounds-checked via at()).
void DPTreephaser::Simulate(BasecallerRead& data, int max_flows,bool state_inphase)
{
  InitializeState(&path_[0]);

  vector<char>::iterator base = data.sequence.begin();
  while (base != data.sequence.end() and path_[0].flow < max_flows) {
    const char nuc = *base;
    ++base;

    AdvanceStateInPlace(&path_[0], nuc, flow_order_.num_flows());
    // The path keeps its own copy of the sequence; needed to simulate diagonal states correctly.
    path_[0].sequence.push_back(nuc);

    // Nothing to record when the feature is off or the path has run past the flow limit.
    if (not state_inphase or path_[0].flow >= max_flows)
      continue;

    data.state_inphase.at(path_[0].flow) = path_[0].state.at(path_[0].flow);
  }

  data.prediction.swap(path_[0].prediction);
}
Exemple #6
0
// Replays data.sequence on path_[0] while tracking, per flow, how many bases
// have been consumed (flow2base_pos). If a recalibration model is available,
// each predicted value is then rescaled with the per-flow / per-nucleotide /
// per-length coefficients in As_ (multiplier) and Bs_ (offset).
//
//   data       - in: sequence to replay; out: data.prediction receives the
//                (possibly recalibrated) prediction buffer of path_[0] via swap.
//   max_flows  - replay stops once path_[0].flow reaches this value.
void DPTreephaser::SimulateRecalibrated(BasecallerRead& data, int max_flows)
{
  InitializeState(&path_[0]);

  // Pass 1: generate the predicted signal and the flow -> base position map.
  vector<char>::iterator base = data.sequence.begin();
  while (base != data.sequence.end() and path_[0].flow < max_flows) {
    const int flow_before = path_[0].flow;
    AdvanceStateInPlace(&path_[0], *base, flow_order_.num_flows());
    ++base;

    // Past the last flow there is no slot in flow2base_pos to update.
    if (path_[0].flow >= flow_order_.num_flows())
      continue;

    // Flows skipped by this base inherit the position of the flow we started from.
    for (int skipped = flow_before + 1; skipped < path_[0].flow; ++skipped)
      path_[0].flow2base_pos[skipped] = path_[0].flow2base_pos[flow_before];

    // The flow we landed on now accounts for one more base.
    path_[0].base_pos++;
    path_[0].flow2base_pos[path_[0].flow] = path_[0].base_pos;
  }

  // Pass 2: apply signal distortion according to the HP recalibration model.
  if (pm_model_available_) {
    for (int flow = 0; flow < flow_order_.num_flows(); ++flow) {
      // Length attributed to this flow = bases consumed up to here minus bases
      // consumed up to the previous flow.
      int hp_length = (flow == 0)
                        ? path_[0].flow2base_pos[0]
                        : path_[0].flow2base_pos[flow] - path_[0].flow2base_pos[flow-1];

      // Keep the table index inside [0, MAX_HPXLEN].
      if (hp_length < 0)
        hp_length = 0;
      else if (hp_length > MAX_HPXLEN)
        hp_length = MAX_HPXLEN;

      path_[0].prediction[flow] = path_[0].prediction[flow] * (*As_)[flow][flow_order_.int_at(flow)][hp_length]
                                          + (*Bs_)[flow][flow_order_.int_at(flow)][hp_length];
    }
  }

  data.prediction.swap(path_[0].prediction);
}
Exemple #7
0
// Searches for the base sequence whose predicted signal best matches
// read.normalized_measurements, using a fixed pool of kNumPaths candidate
// paths (path_[]). Each iteration of the main loop picks the in-use path with
// the smallest path_metric, builds its four one-base extensions (A, C, G, T),
// keeps the extensions that pass the termination rules, and then retires the
// picked path. Whenever a picked path has a smaller total squared residual
// than any seen so far, its sequence and prediction are swapped into `read`.
//
//   read           - in: normalized_measurements (and sequence, when
//                    restart_flows > 0); out: sequence and prediction.
//   max_flows      - number of flows to solve; asserted <= flow_order_.num_flows().
//   restart_flows  - when > 0, read.sequence is first replayed on path_[0] until
//                    this flow (clamped to num_flows) is reached, and the search
//                    continues from that state instead of from scratch.
void DPTreephaser::Solve(BasecallerRead& read, int max_flows, int restart_flows)
{
  // Maps the nucleotide index used in the expansion loops (0..3) to its letter.
  static const char nuc_int_to_char[5] = "ACGT";

  assert(max_flows <= flow_order_.num_flows());

  // Initialize stack: just one root path
  for (int p = 1; p < kNumPaths; ++p)
    path_[p].in_use = false;

  InitializeState(&path_[0]);
  path_[0].path_metric = 0;
  path_[0].per_flow_metric = 0;
  path_[0].residual_left_of_window = 0;
  path_[0].dot_counter = 0;
  path_[0].in_use = true;
  //path_[0].sequence.reserve(2*flow_order_.num_flows()); //Done in InitializeState

  // Number of path slots that are currently not in use.
  int space_on_stack = kNumPaths - 1;
  // Smallest total squared residual found so far; starts effectively unbounded.
  float sum_of_squares_upper_bound = 1e20;  //max_flows; // Squared distance of solution to measurements

  if (restart_flows > 0) {
    // The solver will not attempt to solve initial restart_flows
    // - Simulate restart_flows instead of solving
    // - If it turns out that solving was finished before restart_flows, simply exit without any changes to the read.

    restart_flows = min(restart_flows, flow_order_.num_flows());

    for (vector<char>::iterator nuc = read.sequence.begin(); nuc != read.sequence.end() and path_[0].flow < restart_flows; ++nuc) {
      int flow_s = path_[0].flow;
      AdvanceStateInPlace(&path_[0], *nuc, flow_order_.num_flows());

      // Only record the base if the path is still inside the flow order.
      if (path_[0].flow < flow_order_.num_flows()) {
        path_[0].sequence.push_back(*nuc);

        //update flow2base_pos
        path_[0].base_pos++;
        path_[0].flow2base_pos[path_[0].flow] = path_[0].base_pos;
        // Flows skipped by this base inherit the base position of the flow we started from.
        for(int flow_inter = flow_s+1; flow_inter < path_[0].flow; flow_inter++)
          path_[0].flow2base_pos[flow_inter] = path_[0].flow2base_pos[flow_s];
      }
    }

    // NOTE(review): the comment above says "without any changes to the read",
    // but read.prediction IS exchanged with the simulated prediction here;
    // only read.sequence is left untouched. Confirm which is intended.
    if (path_[0].flow < restart_flows-10) { // This read ended before restart_flows. No point resolving it.
      read.prediction.swap(path_[0].prediction);
      return;
    }

    // Accumulate the squared residual of every flow left of the current window;
    // this seeds residual_left_of_window for the root path.
    for (int flow = 0; flow < path_[0].window_start; ++flow) {
        float residual = 0;
        if(pm_model_enabled_==false){
            residual = read.normalized_measurements[flow] - path_[0].prediction[flow];
        }
        else{
            // Recalibrated residual: prediction is scaled by As_ and offset by Bs_,
            // indexed by flow, nucleotide of that flow, and hp_length.
            // NOTE(review): hp_length is only clamped from below here; confirm it
            // cannot exceed the last valid index of As_/Bs_.
            int hp_length = 0;
            if(flow==0) hp_length = path_[0].flow2base_pos[0];
            else hp_length =path_[0].flow2base_pos[flow] - path_[0].flow2base_pos[flow-1];
            if(hp_length<0) hp_length = 0;
            residual = read.normalized_measurements[flow]
                       - (path_[0].prediction[flow] * (*As_)[flow][flow_order_.int_at(flow)][hp_length]
                          + (*Bs_)[flow][flow_order_.int_at(flow)][hp_length]);
        }
      path_[0].residual_left_of_window += residual * residual;
    }
  }

  // Initializing variables
  //read.solution.assign(flow_order_.num_flows(), 0);
  read.sequence.clear();
  read.sequence.reserve(2*flow_order_.num_flows());
  read.prediction.assign(flow_order_.num_flows(), 0);

  // Main loop to select / expand / delete paths
  while (1) {

    // ------------------------------------------
    // Step 1: Prune the content of the stack and make sure there are at least 4 empty slots

    // Remove paths that are more than 'maxPathDelay' behind the longest one
    if (space_on_stack < kNumPaths-3) {
      int longest_path = 0;
      for (int p = 0; p < kNumPaths; ++p)
        if (path_[p].in_use)
          longest_path = max(longest_path, path_[p].flow);

      if (longest_path > kMaxPathDelay) {
        for (int p = 0; p < kNumPaths; ++p) {
          if (path_[p].in_use and path_[p].flow < longest_path-kMaxPathDelay) {
            path_[p].in_use = false;
            space_on_stack++;
          }
        }
      }
    }

    // If necessary, remove paths with worst perFlowMetric
    while (space_on_stack < 4) {
      // find maximum per flow metric
      float max_per_flow_metric = -0.1;
      // kNumPaths acts as the "nothing found" sentinel.
      int max_metric_path = kNumPaths;
      for (int p = 0; p < kNumPaths; ++p) {
        if (path_[p].in_use and path_[p].per_flow_metric > max_per_flow_metric) {
          max_per_flow_metric = path_[p].per_flow_metric;
          max_metric_path = p;
        }
      }

      // killing path with largest per flow metric
      // If no candidate was found, dump the whole stack before the assert below fires.
      if (!(max_metric_path < kNumPaths)) {
        printf("Failed assertion in Treephaser\n");
        for (int p = 0; p < kNumPaths; ++p) {
          if (path_[p].in_use)
            printf("Path %d, in_use = true, per_flow_metric = %f\n", p, path_[p].per_flow_metric);
          else
            printf("Path %d, in_use = false, per_flow_metric = %f\n", p, path_[p].per_flow_metric);
        }
        fflush(NULL);
      }
      assert (max_metric_path < kNumPaths);

      path_[max_metric_path].in_use = false;
      space_on_stack++;
    }

    // ------------------------------------------
    // Step 2: Select a path to expand or break if there is none

    // The in-use path with the smallest path_metric is expanded next. Because the
    // running minimum starts at 1000, paths whose metric is >= 1000 are never
    // selected, and the search ends when no in-use path is below that value.
    TreephaserPath *parent = NULL;
    float min_path_metric = 1000;
    for (int p = 0; p < kNumPaths; ++p) {
      if (path_[p].in_use and path_[p].path_metric < min_path_metric) {
        min_path_metric = path_[p].path_metric;
        parent = &path_[p];
      }
    }
    if (!parent)
      break;


    // ------------------------------------------
    // Step 3: Construct four expanded paths and calculate feasibility metrics
    assert (space_on_stack >= 4);

    TreephaserPath *children[4];

    // Claim the first four unused slots as scratch space for the children.
    // There is no bound check on p: this relies on the assert above (which is
    // compiled out under NDEBUG).
    for (int nuc = 0, p = 0; nuc < 4; ++p)
      if (not path_[p].in_use)
        children[nuc++] = &path_[p];

    float penalty[4] = { 0, 0, 0, 0 };

    for (int nuc = 0; nuc < 4; ++nuc) {

      TreephaserPath *child = children[nuc];

      AdvanceState(child, parent, nuc_int_to_char[nuc], max_flows);

      // Apply easy termination rules
      // A penalty of 25 is above the >= 20 cutoff checked in Step 4, so these
      // children are never kept.

      if (child->flow >= max_flows) {
        penalty[nuc] = 25; // Mark for deletion
        continue;
      }

      if (child->last_hp > kMaxHP) {
        penalty[nuc] = 25; // Mark for deletion
        continue;
      }

      if ((int)parent->sequence.size() >= (2 * flow_order_.num_flows() - 10)) {
        penalty[nuc] = 25; // Mark for deletion
        continue;
      }

      child->path_metric = parent->residual_left_of_window;
      child->residual_left_of_window = parent->residual_left_of_window;

      // penaltyN: squared residuals where the measurement is at or below the prediction.
      // penalty1: squared residuals where it is above, counted only for flows before child->flow.
      float penaltyN = 0;
      float penalty1 = 0;

      for (int flow = parent->window_start; flow < child->window_end; ++flow) {

        float residual = 0;

        if(pm_model_enabled_==false){
            residual = read.normalized_measurements[flow] - child->prediction[flow];
        }
        else{
            // hp_length is taken from the PARENT's flow2base_pos; the child's map
            // is only filled in Step 4.
            // NOTE(review): hp_length is only clamped from below; confirm it cannot
            // exceed the last valid index of As_/Bs_.
            int hp_length = 0;
            if(flow==0)
              hp_length = parent->flow2base_pos[0];
            else
              hp_length = parent->flow2base_pos[flow] - parent->flow2base_pos[flow-1];
            // Despite its indentation, the residual assignment below is NOT part of
            // the if on the next line; it always executes.
            if(hp_length<0) hp_length = 0;
              residual = read.normalized_measurements[flow]
                         - (child->prediction[flow] * (*As_)[flow][flow_order_.int_at(flow)][hp_length]
                            + (*Bs_)[flow][flow_order_.int_at(flow)][hp_length]);
        }

        float residual_squared = residual * residual;

        // Metric calculation
        // Flows that have dropped out of the child's window count in full; flows
        // still inside the window only count when the residual is non-positive.
        if (flow < child->window_start) {
          child->residual_left_of_window += residual_squared;
          child->path_metric += residual_squared;
        } else if (residual <= 0)
          child->path_metric += residual_squared;

        if (residual <= 0)
          penaltyN += residual_squared;
        else if (flow < child->flow)
          penalty1 += residual_squared;
      }


      penalty[nuc] = penalty1 + kNegativeMultiplier * penaltyN;
      penalty1 += penaltyN;

      // per_flow_metric is left as it was in the slot when child->flow is 0.
      if (child->flow>0)
        child->per_flow_metric = (child->path_metric + 0.5 * penalty1) / child->flow;

    } //looping over nucs


    // Find out which nuc has the least penalty (the greedy choice nuc)
    // Strict comparisons: on ties the lower nucleotide index wins.
    int best_nuc = 0;
    if (penalty[best_nuc] > penalty[1])
      best_nuc = 1;
    if (penalty[best_nuc] > penalty[2])
      best_nuc = 2;
    if (penalty[best_nuc] > penalty[3])
      best_nuc = 3;

    // ------------------------------------------
    // Step 4: Use calculated metrics to decide which paths are worth keeping

    for (int nuc = 0; nuc < 4; ++nuc) {

      TreephaserPath *child = children[nuc];

      // Path termination rules

      // Children marked for deletion in Step 3 (penalty 25) are dropped here.
      if (penalty[nuc] >= 20)
        continue;

      // Already worse than the best complete solution found so far.
      if (child->path_metric > sum_of_squares_upper_bound)
        continue;

      // This is the only rule that depends on finding the "best nuc"
      if (penalty[nuc] - penalty[best_nuc] >= kExtendThreshold)
        continue;

      // dot_signal: residual of the PARENT's prediction at the child's flow,
      // divided by the child's state at that flow.
      // NOTE(review): there is no guard against child->state[child->flow] being
      // zero - confirm AdvanceState guarantees a non-zero value there.
      float dot_signal = 0;
      if(pm_model_enabled_==false){
          dot_signal = (read.normalized_measurements[child->flow] - parent->prediction[child->flow]) / child->state[child->flow];
      }
      else{
          // NOTE(review): hp_length is only clamped from below; confirm it cannot
          // exceed the last valid index of As_/Bs_.
          int hp_length = 0;
          if(child->flow==0) hp_length = parent->flow2base_pos[0];
          else hp_length = parent->flow2base_pos[child->flow] - parent->flow2base_pos[child->flow-1];
            if(hp_length<0) hp_length = 0;
          dot_signal = (read.normalized_measurements[child->flow] - (parent->prediction[child->flow]
                        * (*As_)[child->flow][flow_order_.int_at(child->flow)][hp_length]
                          + (*Bs_)[child->flow][flow_order_.int_at(child->flow)][hp_length]))
                        / child->state[child->flow];
      }

      // Count consecutive extensions whose dot_signal is below kDotThreshold;
      // a second one in a row terminates the path.
      child->dot_counter = (dot_signal < kDotThreshold) ? (parent->dot_counter + 1) : 0;
      if (child->dot_counter > 1)
        continue;

      // Path survived termination rules and will be kept on stack
      child->in_use = true;
      space_on_stack--;

      // Fill out the remaining portion of the prediction
      // Flows before parent->window_start are copied from the parent; flows from
      // child->window_end onward are zeroed. The flows in between are not
      // written here (presumably already set by AdvanceState - confirm).
      memcpy(&child->prediction[0], &parent->prediction[0], parent->window_start*sizeof(float));

      for (int flow = child->window_end; flow < max_flows; ++flow)
        child->prediction[flow] = 0;

      // Fill out the solution
      child->sequence = parent->sequence;
      child->sequence.push_back(nuc_int_to_char[nuc]);

      //calculate starting base position for each flow
      child->base_pos = parent->base_pos;
      child->base_pos++;
      child->flow2base_pos = parent->flow2base_pos;
      // Flows skipped between the parent's flow and the child's flow inherit the
      // parent's base position.
      for(int flow_inter = parent->flow+1; flow_inter < child->flow; flow_inter++){
        child->flow2base_pos[flow_inter] = child->flow2base_pos[parent->flow];
      }
      child->flow2base_pos[child->flow] = child->base_pos;      

    }

    // ------------------------------------------
    // Step 5. Check if the selected path is in fact the best path so far

    // Computing sequence squared distance
    // Total = residual already accumulated left of the window + squared residual
    // of every remaining flow up to max_flows.
    float sum_of_squares = parent->residual_left_of_window;
    for (int flow = parent->window_start; flow < max_flows; flow++) {        

      float residual = 0;

      if(pm_model_enabled_==false){
          residual = read.normalized_measurements[flow] - parent->prediction[flow];
      }
      else{
          // NOTE(review): hp_length is only clamped from below; confirm it cannot
          // exceed the last valid index of As_/Bs_.
          int hp_length = 0;
          if(flow==0) hp_length = parent->flow2base_pos[0];
          else hp_length = parent->flow2base_pos[flow] - parent->flow2base_pos[flow-1];
          if(hp_length<0) hp_length = 0;
          residual = read.normalized_measurements[flow] - (parent->prediction[flow]
                     * (*As_)[flow][flow_order_.int_at(flow)][hp_length]
                        + (*Bs_)[flow][flow_order_.int_at(flow)][hp_length]);
      }
      sum_of_squares += residual * residual;
    }

    // Updating best path
    // The swap leaves the previous best in the parent's buffers; that is fine
    // because the parent is retired immediately below. This must stay AFTER
    // Step 4, which still reads parent->prediction and parent->sequence.
    if (sum_of_squares < sum_of_squares_upper_bound) {
      read.prediction.swap(parent->prediction);
      read.sequence.swap(parent->sequence);
      sum_of_squares_upper_bound = sum_of_squares;
    }

    // The expanded path is always retired, whether or not it became the best.
    parent->in_use = false;
    space_on_stack++;

  } // main decision loop
}