コード例 #1
0
ファイル: ExtendedReadInfo.cpp プロジェクト: Rashesh7/TS
// Maps every base of the read to the flow in which it is incorporated.
//
// The read bases (alignment.QueryBases, reverse-complemented for reads that are not on the
// forward strand) are walked in order, starting at flow start_flow. Each base is assigned the
// first flow at or after the current one whose nucleotide in flowOrder matches it; the flow is
// not advanced after a match, so consecutive identical bases share a flow.
//
// The result is stored in flowIndex, one entry per base. Bases that do not fit into the flow
// order keep the sentinel value flowOrder.length(), and a warning is written to cerr.
//
//   flowOrder  nucleotide of every flow, indexed by flow number (only read here)
void ExtendedReadInfo::CreateFlowIndex(string &flowOrder) {

  string read_bases = alignment.QueryBases;
  if (!is_forward_strand)
    RevComplementInPlace(read_bases);

  // Pre-fill with the "does not fit" sentinel so that unplaced bases need no extra handling.
  flowIndex.assign(read_bases.length(), flowOrder.length());

  unsigned int flow = start_flow;
  unsigned int base_idx = 0;
  while (base_idx < read_bases.length()) {
    while (flow < flowOrder.length() and flowOrder[flow] != read_bases[base_idx])
      flow++;
    // Ran off the end of the flow order: this base and all following ones do not fit.
    // Stop *before* counting the base as placed, otherwise a read whose very last base
    // does not fit would never trigger the warning below.
    if (flow >= flowOrder.length())
      break;
    flowIndex[base_idx] = flow;
    base_idx++;
  }
  if (base_idx != read_bases.length())
    cerr << "WARNING in ExtendedReadInfo::CreateFlowIndex: There are more bases in the read than fit into the flow order." << endl;
}
コード例 #2
0
// Builds the base-space hypotheses for one read that are later evaluated for a (multi-allele) variant.
//
// my_hypotheses is resized to (number of entries in my_ensemble.allele_identity_vector) + 2:
//   [0]    null hypothesis: the read as called
//   [1]    reference hypothesis
//   [2..]  one hypothesis per variant allele
// Hypotheses 1.. are assembled while walking the pretty alignment string of the read
// (either current_read.pretty_aln or the result of an optional local realignment):
//   - outside of the multi-allele window the bases of the read are copied,
//   - inside the window but outside of the reference allele the reference bases are copied,
//   - at local_context.position0 the alleles are spliced in by SpliceAddVariantAlleles().
// In the pretty alignment '+' marks an insertion; '-' is presumably a deletion (no read base
// is copied for it) -- TODO confirm against the code generating pretty_aln.
//
// Outputs written by this function:
//   splice_start_flow, splice_end_flow  reset to -1; filled via GetSpliceFlows() if splicing worked
//   my_hypotheses                       see above
//   same_as_null_hypothesis             reset to all false, then handed to GetSpliceFlows()
//   changed_alignment                   set by SpliceDoRealignement(); false if no realignment
//                                       was requested or if it returned an empty string
//
// Returns true if splicing succeeded. On any failure all hypotheses 1.. are overwritten with
// hypothesis 0 and false is returned.
bool SpliceVariantHypotheses(const Alignment &current_read, const EnsembleEval &my_ensemble,
                        const LocalReferenceContext &local_context, PersistingThreadObjects &thread_objects,
                        int &splice_start_flow, int &splice_end_flow, vector<string> &my_hypotheses,
                        vector<bool> & same_as_null_hypothesis, bool & changed_alignment, const InputStructures &global_context,
                        const ReferenceReader &ref_reader, int chr_idx)
{

  // Hypotheses: 1) Null; read as called 2) Reference Hypothesis 3-?) Variant Hypotheses
  my_hypotheses.resize(my_ensemble.allele_identity_vector.size()+2);
  same_as_null_hypothesis.assign(my_hypotheses.size(), false);

  // Set up variables to log the flows we splice into
  splice_start_flow = -1;
  splice_end_flow = -1;
  int splice_start_idx = -1;
  vector<int> splice_end_idx;
  splice_end_idx.assign(my_hypotheses.size(), -1);

  // 1) Null hypothesis is read as called
  // When clipped bases are resolved, only the part of read_bases between the soft clips is used,
  // starting at start_sc; otherwise the complete read_bases string is used.
  if (global_context.resolve_clipped_bases) {
    unsigned int null_hyp_length = current_read.read_bases.length() - current_read.left_sc - current_read.right_sc;
    my_hypotheses[0] = current_read.read_bases.substr(current_read.start_sc, null_hyp_length);
  }
  else
    my_hypotheses[0] = current_read.read_bases;

  // Initialize hypotheses variables for splicing
  for (unsigned int i_hyp = 1; i_hyp < my_hypotheses.size(); i_hyp++) {
    my_hypotheses[i_hyp].clear();
    // Capacity hint only: read length plus reference allele plus a margin of 20 bases
    my_hypotheses[i_hyp].reserve(current_read.alignment.QueryBases.length() + 20 + local_context.reference_allele.length());
    // Add soft clipped bases on the left side of alignment if desired
    if (!global_context.resolve_clipped_bases)
      my_hypotheses[i_hyp] += current_read.alignment.QueryBases.substr(0, current_read.left_sc);
  }

  // Walk state: read_idx indexes alignment.QueryBases, ref_idx is a position in the reference.
  // read_idx_max is the first index that belongs to the right soft clip.
  int read_idx = current_read.left_sc;
  int ref_idx  = current_read.alignment.Position;
  int read_idx_max = current_read.alignment.QueryBases.length() - current_read.right_sc;
  bool did_splicing = false;       // true once the alleles have been spliced in successfully
  bool just_did_splicing = false;  // true from the splice until the first read base right of the window is logged
  string pretty_alignment;
  changed_alignment = false;

  // do realignment of a small region around variant if desired
  if (my_ensemble.doRealignment) {
    pretty_alignment = SpliceDoRealignement(thread_objects, current_read, local_context.position0,
                                            changed_alignment, global_context.DEBUG, ref_reader, chr_idx);
    if (pretty_alignment.empty() and global_context.DEBUG > 0)
      cerr << "Realignment returned an empty string in read " << current_read.alignment.Name << endl;
  }

  // Fall back to the original alignment if there was no (usable) realignment
  if (pretty_alignment.empty()) {
    pretty_alignment = current_read.pretty_aln;
    changed_alignment = false;
  }

  // Now fill in 2) and 3)

  for (unsigned int pretty_idx = 0; pretty_idx < pretty_alignment.length(); pretty_idx++) {

    // Both flags are evaluated for the value ref_idx has at the start of this iteration
    bool outside_of_window = ref_idx < my_ensemble.multiallele_window_start or ref_idx >= my_ensemble.multiallele_window_end;
    bool outside_ref_allele = (long)ref_idx < local_context.position0 or ref_idx >= (int)(local_context.position0 + local_context.reference_allele.length());

    // Basic sanity checks
    // Abort if the aligned read bases are used up or the reference position runs past the end
    // of the chromosome; exactly at the chromosome end only an insertion ('+') is tolerated.
    if (read_idx >= read_idx_max
        or  ref_idx > ref_reader.chr_size(chr_idx)
        or (ref_idx == ref_reader.chr_size(chr_idx) and pretty_alignment[pretty_idx] != '+')) {
      did_splicing = false;
      break;
    }

    // --- Splice ---
    // Attempted when the reference position reaches the variant position inside the window,
    // as long as no previous attempt has succeeded.
    if (ref_idx == local_context.position0 and !did_splicing and !outside_of_window) {
      // Add insertions before variant window
      // Note: this advances the loop variable pretty_idx (and read_idx) itself.
      while (pretty_idx < pretty_alignment.length() and pretty_alignment[pretty_idx] == '+') {
    	for (unsigned int i_hyp = 1; i_hyp < my_hypotheses.size(); i_hyp++)
          my_hypotheses[i_hyp].push_back(current_read.alignment.QueryBases[read_idx]);
        read_idx++;
        pretty_idx++;
      }
      did_splicing = SpliceAddVariantAlleles(current_read, pretty_alignment, my_ensemble,
    		                    local_context, my_hypotheses, pretty_idx, global_context.DEBUG);
      just_did_splicing = did_splicing;
    } // --- ---

    // Have reference bases inside of window but outside of span of reference allele
    if (outside_ref_allele and !outside_of_window and pretty_alignment[pretty_idx] != '+') {
      for (unsigned int i_hyp = 1; i_hyp < my_hypotheses.size(); i_hyp++)
        my_hypotheses[i_hyp].push_back(ref_reader.base(chr_idx,ref_idx));
    }

    // Have read bases as called outside of variant window
    if (outside_of_window and pretty_alignment[pretty_idx] != '-') {
      for (unsigned int i_hyp = 1; i_hyp < my_hypotheses.size(); i_hyp++)
        my_hypotheses[i_hyp].push_back(current_read.alignment.QueryBases[read_idx]);

      // --- Information to log flows. Indices are w.r.t. aligned portion of the read
      if (!did_splicing) { // Log index of the last base left of window which is the same for all hypotheses.
        splice_start_idx = read_idx - current_read.left_sc;
      }
      else if (just_did_splicing) { // Log length of hypothesis after splicing
    	splice_end_idx[0] = read_idx  - current_read.left_sc;
    	int clipped_bases = 0;
    	if (!global_context.resolve_clipped_bases)
    	  clipped_bases = current_read.left_sc;
        for (unsigned int i_hyp=1; i_hyp<my_hypotheses.size(); i_hyp++)
          splice_end_idx[i_hyp] = my_hypotheses[i_hyp].length()-1 - clipped_bases; // Hyp length depends on whether there is resolving!
        just_did_splicing = false;
      }
      // --- ---
    }

    // Presumably advances ref_idx and/or read_idx depending on the alignment character -- defined elsewhere
    IncrementAlignmentIndices(pretty_alignment[pretty_idx], ref_idx, read_idx);

  } // end of for loop over extended pretty alignment

  // Check whether the whole reference allele fit
  // It seems that with primer trimming ion TVC, many a read throw this warning
  if (ref_idx < (int)(local_context.position0 + local_context.reference_allele.length())) {
    did_splicing = false;
    if (global_context.DEBUG>0)
      cout << "Warning in Splicing: Reference allele "<< local_context.reference_allele << " did not fit into read " << current_read.alignment.Name << endl;
  }

  if (did_splicing) {
    // --- Add soft clipped bases to the right of the alignment and reverse complement ---
    for (unsigned int i_hyp = 1; i_hyp<my_hypotheses.size(); i_hyp++) {
      if (!global_context.resolve_clipped_bases)
        my_hypotheses[i_hyp] += current_read.alignment.QueryBases.substr(current_read.alignment.QueryBases.length()-current_read.right_sc, current_read.right_sc);

      if (current_read.is_reverse_strand)
        RevComplementInPlace(my_hypotheses[i_hyp]);
    }

    // Get the main flows before and after splicing
    splice_end_flow = GetSpliceFlows(current_read, global_context, my_hypotheses, same_as_null_hypothesis,
                                     splice_start_idx, splice_end_idx, splice_start_flow);
    // Note: unlike the other warnings, this one is printed regardless of the DEBUG level
    if (splice_start_flow < 0 or splice_end_flow <= splice_start_flow) {
      did_splicing = false;
      cout << "Warning in Splicing: Splice flows are not valid in read " << current_read.alignment.Name
           << ". splice start flow: "<< splice_start_flow << " splice end flow " << splice_end_flow << endl;
    }
  }

  // Check for non-ACGT bases in hypotheses
  // The null hypothesis (index 0) is checked as well. Scanning stops at the first invalid base:
  // once valid_bases is false none of the inner loops executes any more.
  bool valid_bases = true;
  for (unsigned int i_hyp=0; i_hyp<my_hypotheses.size(); i_hyp++) {
	unsigned int iBase = 0;
	while (iBase<my_hypotheses[i_hyp].length() and valid_bases){
      if (my_hypotheses[i_hyp].at(iBase) == 'A' or my_hypotheses[i_hyp].at(iBase) == 'C' or
          my_hypotheses[i_hyp].at(iBase) == 'G' or my_hypotheses[i_hyp].at(iBase) == 'T')
      iBase++;
      else
        valid_bases = false;
	}
  }
  if (not valid_bases){
    cerr << "Non-Fatal ERROR in Splicing for " << local_context.contigName << ":" << local_context.position0+1
         << ": Read Hypotheses for " << current_read.alignment.Name << " contain non-ACGT characters." << endl;
    did_splicing = false;
  }

  // --- Fail safe for hypotheses and verbose
  // On failure every hypothesis becomes a copy of the null hypothesis.
  if (!did_splicing) {
	for (unsigned int i_hyp=1; i_hyp<my_hypotheses.size(); i_hyp++)
      my_hypotheses[i_hyp] = my_hypotheses[0];
    if (global_context.DEBUG > 1) {
      cout << "Failed to splice " << local_context.reference_allele << "->";
      for (unsigned int i_alt = 0; i_alt<my_ensemble.allele_identity_vector.size(); i_alt++) {
    	cout << my_ensemble.allele_identity_vector[i_alt].altAllele;
        if (i_alt < my_ensemble.allele_identity_vector.size()-1)
          cout << ",";
      }
      cout << " into read " << current_read.alignment.Name << endl;
    }
  }
  else if (global_context.DEBUG > 1) {
	cout << "Spliced " << local_context.reference_allele << "->";
    for (unsigned int i_alt = 0; i_alt<my_ensemble.allele_identity_vector.size(); i_alt++) {
      cout << my_ensemble.allele_identity_vector[i_alt].altAllele;
      if (i_alt < my_ensemble.allele_identity_vector.size()-1)
        cout << ",";
    }
    cout << " into ";
    if (current_read.is_reverse_strand) cout << "reverse ";
    else cout << "forward ";
    cout <<	"strand read read " << current_read.alignment.Name << endl;
    cout << "- Read as called: " << my_hypotheses[0] << endl;
    cout << "- Reference Hyp.: " << my_hypotheses[1] << endl;
    for (unsigned int i_hyp = 2; i_hyp<my_hypotheses.size(); i_hyp++)
      cout << "- Variant Hyp. " << (i_hyp-1) << ": " << my_hypotheses[i_hyp] << endl;
    cout << "- Splice start flow: " << splice_start_flow << " Splice end flow: " << splice_end_flow << endl;
  }

  return did_splicing;
};
// Scores an alternative base sequence against the bases as called for a single read.
//
// Both sequences are turned into predicted flow signals with TreephaserLite and compared to
// the measurements that GetBamTags() extracts from the alignment.
//
//   alignment       read to evaluate; its QueryBases are the "as called" hypothesis
//   flow_order_str  flow order of the run; its length is handed to GetBamTags()
//   alt_base_hyp    alternative hypothesis, given in the same orientation as alignment.QueryBases
//   delta_score     out: squared distance of the alternative minus squared distance of the called bases
//   fit_score       out: the smaller of the two squared distances
//   heavy_verbose   > 1 always prints diagnostics, > 0 prints them only if delta_score is negative
//
// If GetBamTags() fails, both scores are left at their default of 1e5.
void BaseHypothesisEvaluator(BamTools::BamAlignment    &alignment,
                             const string              &flow_order_str,
                             const string              &alt_base_hyp,
                             float                     &delta_score,
                             float                     &fit_score,
                             int                       heavy_verbose) {

  // --- Step 1: defaults, tag retrieval and solver setup

  delta_score = 1e5;
  fit_score   = 1e5;

  vector<float> measurements, phase_params;
  int           start_flow;
  if (not GetBamTags(alignment, flow_order_str.length(), measurements, phase_params, start_flow))
    return;

  int num_flows = measurements.size();
  ion::FlowOrder flow_order(flow_order_str, num_flows);

  BasecallerRead master_read;
  master_read.SetData(measurements, flow_order.num_flows());

  TreephaserLite treephaser(flow_order);
  treephaser.SetModelParameters(phase_params[0], phase_params[1]);

  // --- Step 2: bring both hypotheses into read orientation and solve the read prefix

  vector<string> hypotheses(2);
  hypotheses[0] = alignment.QueryBases;   // bases as called
  hypotheses[1] = alt_base_hyp;           // alternative
  // Mapped reverse-strand reads are reverse complemented
  if (alignment.IsMapped() and alignment.IsReverseStrand()) {
    for (unsigned int hyp = 0; hyp < 2; ++hyp)
      RevComplementInPlace(hypotheses[hyp]);
  }

  int prefix_flow = GetMasterReadPrefix(treephaser, flow_order, start_flow, hypotheses[0], master_read);
  unsigned int prefix_size = master_read.sequence.size();

  // --- Step 3: predict the signal of every hypothesis

  vector<BasecallerRead> hypothesis_reads(hypotheses.size());
  vector<float>          squared_distances(hypotheses.size(), 0.0);
  int                    max_last_flow = 0;   // highest flow any hypothesis base was placed in

  // Upper limit on the number of hypothesis bases that get appended to the prefix
  const unsigned int base_limit = 2*(unsigned int)flow_order.num_flows()-prefix_size;

  for (unsigned int hyp = 0; hyp < hypothesis_reads.size(); ++hyp) {

    BasecallerRead &hyp_read  = hypothesis_reads[hyp];
    const string   &hyp_bases = hypotheses[hyp];

    // Every hypothesis starts from the solved prefix
    hyp_read = master_read;
    int flow = prefix_flow;

    for (unsigned int base = 0; base < hyp_bases.length() and base < base_limit; ++base) {
      // Advance to the next flow that incorporates this base
      while (flow < flow_order.num_flows() and flow_order.nuc_at(flow) != hyp_bases[base])
        flow++;
      // Bases that no longer fit into the flow order are dropped
      if (flow >= flow_order.num_flows())
        break;
      if (flow > max_last_flow)
        max_last_flow = flow;
      hyp_read.sequence.push_back(hyp_bases[base]);
    }

    // Keep the flow handed to the solver inside the flow order
    const int last_flow = flow_order.num_flows()-1;
    if (flow > last_flow)
      flow = last_flow;

    // Solver simulates the beginning of the read and then fills in the remaining
    // clipped bases for which we have flow information
    treephaser.Solve(hyp_read, num_flows, flow);
  }

  // Squared L2 distance between measurements and predictions, up to and including max_last_flow
  for (unsigned int hyp = 0; hyp < hypothesis_reads.size(); ++hyp) {
    float &distance = squared_distances[hyp];
    for (int flow = 0; flow <= max_last_flow; flow++) {
      distance += (measurements.at(flow) - hypothesis_reads[hyp].prediction.at(flow)) *
                  (measurements.at(flow) - hypothesis_reads[hyp].prediction.at(flow));
    }
  }

  // Delta: distance of the alternative hypothesis minus distance of the bases as called
  delta_score = squared_distances.at(1) - squared_distances.at(0);
  fit_score   = min(squared_distances.at(1), squared_distances.at(0));

  // --- verbose ---
  const bool print_details = heavy_verbose > 1 or (delta_score < 0 and heavy_verbose > 0);
  if (print_details) {
    cout << "Processed read " << alignment.Name << endl;
    cout << "Delta Fit: " << delta_score << " Overall Fit: " << fit_score << endl;
    PredictionGenerationVerbose(hypotheses, hypothesis_reads, phase_params, flow_order, start_flow, prefix_size);
  }

}
コード例 #4
0
// Unpacks the information needed downstream from the BAM record stored in rai->alignment
// and stores it in the fields of *rai.
//
// Reads that are already marked as filtered are left untouched. Otherwise the function fills in
// strand, run id, well coordinates, flow order index, normalized measurements (ZM tag),
// phasing parameters (ZP tag), read bases in read orientation, the unpacked alignment,
// start flow (ZF tag), flow index, read group (RG tag) and the hard clipped read prefix.
//
// Failure handling:
//   - Recoverable problems (no run id, no flow order for the run id, zero-length read,
//     start flow of zero) print a WARNING to cerr, set rai->filtered and return.
//   - Missing or inconsistent ZM / ZP / ZF tags and an out-of-range start flow print an
//     ERROR to cerr and terminate the program with exit(1).
//
//   rai             read to unpack; modified in place
//   global_context  run level information (flow orders, number of flows and keys per run / read group)
void UnpackOnLoad(Alignment *rai, const InputStructures &global_context)
{
  // No need to waste time if the read is filtered
  if (rai->filtered)
    return;

  rai->is_reverse_strand = rai->alignment.IsReverseStrand();

  // --- Parse read name: well coordinates and run id

  rai->runid.clear();
  if (not rai->alignment.Name.empty()) {
    rai->well_rowcol.resize(2);
    ion_readname_to_rowcol(rai->alignment.Name.c_str(), &rai->well_rowcol[0], &rai->well_rowcol[1]);
    // The run id is everything before the first ':' (the whole name if there is no ':')
    rai->runid = rai->alignment.Name.substr(0, rai->alignment.Name.find(':'));
  }

  if (rai->runid.empty()) {
    cerr << "WARNING: Unable to determine run id of read " << rai->alignment.Name << endl;
    rai->filtered = true;
    return;
  }

  // --- Look up the flow order that belongs to this run

  std::map<string,int>::const_iterator fo_it = global_context.flow_order_index_by_run_id.find(rai->runid);
  if (fo_it == global_context.flow_order_index_by_run_id.end()) {
    cerr << "WARNING: No matching flow order found for read " << rai->alignment.Name << endl;
    rai->filtered = true;
    return;
  }
  rai->flow_order_index = fo_it->second;
  const ion::FlowOrder & flow_order = global_context.flow_order_vector.at(rai->flow_order_index);

  // --- Retrieve measurements from ZM tag

  vector<int16_t> quantized_measurements;
  if (not rai->alignment.GetTag("ZM", quantized_measurements)) {
    cerr << "ERROR: Normalized measurements ZM:tag is not present in read " << rai->alignment.Name << endl;
    exit(1);
  }

  // Number of flows of this run; looked up once and reused below
  const int num_flows = global_context.num_flows_by_run_id.at(rai->runid);

  if ((int)quantized_measurements.size() > num_flows) {
    cerr << "ERROR: Normalized measurements ZM:tag length " << quantized_measurements.size()
         << " exceeds flow order length " << num_flows
         <<" in read " << rai->alignment.Name << endl;
    exit(1);
  }
  // Measurements are stored as 16 bit integers scaled by 256; flows without a measurement stay at zero
  rai->measurements.assign(num_flows, 0.0);
  for (size_t counter = 0; counter < quantized_measurements.size(); ++counter)
    rai->measurements[counter] = static_cast<float>(quantized_measurements[counter])/256;
  rai->measurements_length = quantized_measurements.size();

  // --- Retrieve phasing parameters from ZP tag

  if (not rai->alignment.GetTag("ZP", rai->phase_params)) {
    cerr << "ERROR: Phasing Parameters ZP:tag is not present in read " << rai->alignment.Name << endl;
    exit(1);
  }
  if (rai->phase_params.size() != 3) {
    cerr << "ERROR: Phasing Parameters ZP:tag does not have 3 phase parameters in read " << rai->alignment.Name << endl;
    exit(1);
  }
  if (rai->phase_params[0] < 0 or rai->phase_params[0] > 1 or rai->phase_params[1] < 0 or rai->phase_params[1] > 1
      or rai->phase_params[2] < 0 or rai->phase_params[2] > 1) {
    cerr << "ERROR: Phasing Parameters ZP:tag outside of [0,1] range in read " << rai->alignment.Name << endl;
    exit(1);
  }
  rai->phase_params[2] = 0.0f;   // ad-hoc corrector: zero droop

  // --- Populate read_bases: QueryBases, reverse complemented for reverse strand reads

  rai->read_bases = rai->alignment.QueryBases;
  if (rai->is_reverse_strand)
    RevComplementInPlace(rai->read_bases);
  if (rai->read_bases.empty()) {
    cerr << "WARNING: Ignoring length zero read " << rai->alignment.Name << endl;
    rai->filtered = true;
    return;
  }

  // --- Unpack alignment

  rai->pretty_aln.reserve(num_flows);
  UnpackAlignmentInfo(rai);
  // start_sc is the soft clip at the start of read_bases: the right clip of the
  // alignment for reverse strand reads, the left clip otherwise
  if (rai->is_reverse_strand)
    rai->start_sc = rai->right_sc;
  else
    rai->start_sc = rai->left_sc;

  // --- Generate flow index

  // The ZF tag is tried with the type of rai->start_flow first and as a single byte second
  rai->start_flow = 0;
  if (not rai->alignment.GetTag("ZF", rai->start_flow)) {
    uint8_t start_flow_byte = 0;
    if (not rai->alignment.GetTag("ZF", start_flow_byte)) {
      cerr << "ERROR: Start Flow ZF:tag not found in read " << rai->alignment.Name << endl;
      exit(1);
    }
    rai->start_flow = static_cast<int>(start_flow_byte);
  }
  if (rai->start_flow == 0) {
    cerr << "WARNING: Start Flow ZF:tag has zero value in read " << rai->alignment.Name << endl;
    rai->filtered = true;
    return;
  }
  CreateFlowIndex(rai, flow_order);

  if (global_context.resolve_clipped_bases) {
    // Increment start flow to first aligned base
    // NOTE(review): assumes start_sc is a valid index into flow_index, i.e. the read is not
    // soft clipped over its whole length -- confirm that such reads are filtered upstream.
    rai->start_flow = rai->flow_index[rai->start_sc];
  }

  // Check validity of input arguments
  if (rai->start_flow < 0 or rai->start_flow >= num_flows) {
    cerr << "ERROR: Start flow outside of [0,num_flows) range in read " << rai->alignment.Name << endl;
    cerr << "Start flow: " << rai->start_flow << " Number of flows: " << flow_order.num_flows() << endl;
    exit(1);
  }

  // --- Retrieve read group name & generate prefix flow

  if (not rai->alignment.GetTag("RG",rai->read_group)) {
    cerr << "WARNING: No read group found in read " << rai->alignment.Name << endl;
    // No big problem, we'll just have to solve the prefix like it's 2013!
    rai->read_group.clear();
  }

  // Get read prefix - hard clipped start of the read: [KS][ZT][ZE]
  // prefix_flow stays at -1 if the read group has no key entry or the prefix is empty
  rai->prefix_flow = -1;
  std::map<string,string>::const_iterator key_it = global_context.key_by_read_group.find(rai->read_group);
  if (key_it != global_context.key_by_read_group.end()) {
    rai->prefix_bases = key_it->second;

    string temp_zt, temp_ze;
    if (rai->alignment.GetTag("ZT", temp_zt))
      rai->prefix_bases += temp_zt;
    if (rai->alignment.GetTag("ZE", temp_ze))
      rai->prefix_bases += temp_ze;

    if (not rai->prefix_bases.empty())
      GetPrefixFlow(rai, rai->prefix_bases, flow_order);
  }

  // Check consistency of prefix_flow and start_flow - maybe we don't have all info about hard clipped bases.
  // Starting at prefix_flow, the first flow matching the first read base has to be start_flow;
  // otherwise the prefix information is discarded.
  if (rai->prefix_flow >= 0) {
    int check_start_flow = rai->prefix_flow;
    while (check_start_flow < flow_order.num_flows() and flow_order.nuc_at(check_start_flow) != rai->read_bases.at(0))
      check_start_flow++;
    if (check_start_flow != rai->start_flow) {
      rai->prefix_flow = -1;
      rai->prefix_bases.clear();
    }
  }

}