Esempio n. 1
0
bool Realigner::CreateRefFromQueryBases(
    // Inputs:
    const string&                algn_query_bases,
    const vector<CigarOp>&       algn_cigar_data,
    const string&                md_tag,
    const bool                   clip_anchors)
{

  // Step 1. Generate reference sequence based on QueryBases and Cigar alone
  if (verbose_) {
    cout << "Original Cigar : ";
    for (vector<CigarOp>::const_iterator cigar = algn_cigar_data.begin(); cigar != algn_cigar_data.end(); ++cigar)
      cout << cigar->Length << cigar->Type;
    cout << endl << "Original MD tag: " << md_tag << endl;
  }

  // Initialize variables
  pretty_tseq_.clear();
  pretty_qseq_.clear();
  pretty_aln_.clear();
  q_seq_.clear();
  t_seq_.clear();

  clipped_anchors_.cigar_left.resize(1);
  clipped_anchors_.cigar_left[0].Type = 'S';
  clipped_anchors_.cigar_left[0].Length = 0;
  clipped_anchors_.cigar_right.resize(1);
  clipped_anchors_.cigar_right[0].Type = 'S';
  clipped_anchors_.cigar_right[0].Length = 0;
  clipped_anchors_.md_left.clear();
  clipped_anchors_.md_right.clear();

  const char *read_ptr = algn_query_bases.c_str();


  for (vector<CigarOp>::const_iterator cigar = algn_cigar_data.begin(); cigar != algn_cigar_data.end(); ++cigar) {
    switch (cigar->Type) {
      case ('M') :
      case ('=') :
      case ('X') :
        t_seq_.append(read_ptr, cigar->Length);
        q_seq_.append(read_ptr, cigar->Length);
        pretty_tseq_.append(read_ptr, cigar->Length);
        pretty_qseq_.append(read_ptr, cigar->Length);
        pretty_aln_.append(cigar->Length, '|');
        read_ptr += cigar->Length; break;

      case ('I') :
        q_seq_.append(read_ptr, cigar->Length);
        pretty_tseq_.append(cigar->Length,'-');
        pretty_qseq_.append(read_ptr, cigar->Length);
        pretty_aln_.append(cigar->Length, '+');
        read_ptr += cigar->Length; break;

      case ('S') :
        read_ptr += cigar->Length;
        if (cigar == algn_cigar_data.begin())
          clipped_anchors_.cigar_left[0].Length = cigar->Length;
        else if (cigar == algn_cigar_data.end()-1)
          clipped_anchors_.cigar_right[0].Length = cigar->Length;
        else {
          if (verbose_)
            cout << "Error: invalid cigar string with soft clipped bases in the middle in the input." << endl;
          return false;
        }
        break;

      case ('D') :
      case ('P') :
      case ('N') :
        t_seq_.append(cigar->Length, '-');
        pretty_tseq_.append(cigar->Length,'-');
        pretty_qseq_.append(cigar->Length,'-');
        pretty_aln_.append(cigar->Length, '-');
        break;
    }
  }

  // Step 2: Further patch the sequence based on MD tag

  unsigned int t_idx = 0;
  unsigned int md_idx = 0;
  unsigned int pretty_idx = 0;
  int item_length = 0;

  while (md_idx < md_tag.length()) {

    if (md_tag.at(md_idx) >= '0' and md_tag.at(md_idx) <='9') {  // it's a match
      item_length = 0;
      for (; md_idx < md_tag.length() and md_tag.at(md_idx) >= '0' and md_tag.at(md_idx) <= '9'; md_idx++)
        item_length = 10*item_length + md_tag.at(md_idx) - '0';
      t_idx += item_length;
      while (pretty_idx < pretty_aln_.length() and (item_length > 0 or pretty_aln_.at(pretty_idx) == '+')) {
        if (pretty_aln_[pretty_idx] != '+')
          item_length--;
        pretty_idx++;
      }
    }
    else {
      bool is_deletion = false;
      if (md_tag.at(md_idx) == '^') {                     // Its a deletion or substitution
        md_idx++;
        is_deletion = true;
      }
      while (t_idx < t_seq_.length() and md_idx < md_tag.length() and pretty_idx < pretty_tseq_.length() and
              md_tag.at(md_idx) >= 'A' and md_tag.at(md_idx) <= 'Z') {
        if (pretty_aln_[pretty_idx] == '|') {
          if (is_deletion) {
        	invalid_cigar_in_input = true;
            return false;
          }
          pretty_aln_[pretty_idx] = ' ';
        }
        pretty_tseq_[pretty_idx++] = md_tag.at(md_idx) + 'a' - 'A';
        t_seq_.at(t_idx) = md_tag.at(md_idx);
        t_idx++;
        md_idx++;
      }
    }

    // Checks if Cigar and MD tag are consistent with each other
    bool invalid_pair = item_length > 0;
    invalid_pair = invalid_pair or (t_idx > t_seq_.length());
    invalid_pair = invalid_pair or (pretty_idx == pretty_aln_.length() and md_idx < md_tag.length()-1);
    invalid_pair = invalid_pair or (md_idx == md_tag.length() and t_idx != t_seq_.length());
    invalid_pair = invalid_pair or (md_idx == md_tag.length() and pretty_idx != pretty_aln_.length());

    if (invalid_pair) {
      invalid_cigar_in_input = true;
      return false;
    }
  }

  // ------------------------------------------------ */
  
  if (verbose_) {
    cout << "Original Alignment from BAM: " << endl << pretty_qseq_ << endl;
    cout << pretty_aln_ << endl << pretty_tseq_ << endl << endl;
  }

  // Step 3: Decide whether to realign this read and whether to clip ends

  bool realign_read = ClipAnchors(clip_anchors);
  if (realign_read)
    SetSequences(q_seq_, t_seq_, pretty_aln_, isForwardStrandRead_);
  return realign_read;
}
Esempio n. 2
0
// prepare internal structures for clipping and alignment
// returns true if realignment was performed
bool RealignImp::compute_alignment (
    const char* q_seq,
    unsigned q_len,
    const char* r_seq, 
    unsigned r_len,
    int r_pos, 
    bool forward, 
    const uint32_t* cigar, 
    unsigned cigar_sz, 
    uint32_t*& cigar_dest, 
    unsigned& cigar_dest_sz, 
    int& new_pos,
    bool& already_perfect,
    bool& clip_failed,
    bool& alignment_failed,
    bool& unclip_failed)
{
    already_perfect = false;
    alignment_failed = false;
    unclip_failed = false;
    unsigned oplen;

    const char* q_seq_clipped = q_seq;
    const uint32_t* cigar_clipped = cigar;
    unsigned cigar_sz_clipped = cigar_sz;

    unsigned sclip_q_len, sclip_r_len, sclip_al_len;

    assert (cigar_sz);
    // reset realigner
    Reset ();

    // set clipping 
    SetClipping ((int) cliptype_, forward);

    // clip out the hard and soft clipping zones from 5" and 3"
    // The 'cut out' of the q_seq is done by switching to downstream pointer.
    if (bam_cigar_op (*cigar) == BAM_CSOFT_CLIP)
    {
        oplen = bam_cigar_oplen (*cigar);
        ClipStart (oplen);
        q_seq_clipped += oplen;
        ++cigar_clipped;
        --cigar_sz_clipped;
    }

    if (cigar_sz > 1 && bam_cigar_op (cigar [cigar_sz - 1]) == BAM_CSOFT_CLIP)
    {
        oplen = bam_cigar_oplen (cigar [cigar_sz - 1]);
        ClipEnd (oplen);
        --cigar_sz_clipped;
    }

    // cigar defines q_seq and t_seq lengths
    sclip_al_len = seq_lens_from_bin_cigar (cigar_clipped, cigar_sz_clipped, &sclip_q_len, &sclip_r_len);

    const std::string query (q_seq_clipped, sclip_q_len);
    const std::string target (r_seq, sclip_r_len);
    std::string pretty_al; pretty_al.reserve (sclip_al_len);

    pretty_al_from_bin_cigar (cigar_clipped, cigar_sz_clipped, q_seq_clipped, r_seq, pretty_al);

    // Realigner requires strings of proper size to be passed to SetSequences
    SetSequences (query, target, pretty_al, forward);

    if (!ClipAnchors (clip_failed))
    {
        already_perfect = true;
        return false; // alignment already good, no imperfect zone to realign found
    }

    // TODO avoid automatic vectors to prevent unneeded heap usage
    vector<MDelement> new_md_vec; 
    vector<CigarOp> new_cigar_vec;
    unsigned int start_pos_shift;

    if (!computeSWalignment(new_cigar_vec, new_md_vec, start_pos_shift))
    {
        alignment_failed = true;
        return false;
    }

    if (!addClippedBasesToTags(new_cigar_vec, new_md_vec, q_len))
    {
        unclip_failed = true;
        return false; // error adding back clipped out zones
    }

    if (!LeftAnchorClipped () && start_pos_shift != 0) 
    {
        // build cigar data only if it is needed
        // TODO avoid automatic vectors to prevent unneeded heap usage
        std::vector <CigarOp> cigar_vec;
        cigar_vector_from_bin (cigar, cigar_sz, cigar_vec);
        new_pos = updateReadPosition (cigar_vec, start_pos_shift, r_pos);
    }
    else
        new_pos = r_pos;

    // free (cigar_dest);
    // TODO: switch to better alignment memory management, avoid heap operations
    cigar_dest = (uint32_t*) tmap_malloc (sizeof (uint32_t) * new_cigar_vec.size (), "cigar_dest");
    cigar_dest_sz = new_cigar_vec.size ();
    cigar_vector_to_bin (new_cigar_vec, cigar_dest);

    return true;
}