/*
    Walks the CIGAR operations of the alignment and reports, through the
    provided references, the read with its clipped bases removed:
      bases           - substring of aln->nucleotides that skips the leading
                        soft-clipped bases and spans the M and I operations
      unclipped_start - set to aln->read_start
      unclipped_end   - aln->read_start - 1 advanced by the length of every
                        M and D operation
    Hard clips are ignored. Soft clips only shift the start of the substring
    while no M, I or D operation has been seen yet. Any other CIGAR character
    is reported through PrintMessageDieOnError.
 */
 void GetUnclippedInfo(AlignedRead* aln, string& bases, int& unclipped_start, int& unclipped_end){
   unclipped_start = aln->read_start;
   unclipped_end   = aln->read_start-1;
   bool in_leading_clip = true;
   int first_base = 0, base_count = 0;
   vector<BamTools::CigarOp>::iterator op = aln->cigar_ops.begin();
   while (op != aln->cigar_ops.end()){
     const char type = op->Type;
     if (type == 'M' || type == 'D' || type == 'I'){
       // M and D advance the end coordinate; M and I contribute read bases
       if (type != 'I') unclipped_end += op->Length;
       if (type != 'D') base_count    += op->Length;
       in_leading_clip = false;
     }
     else if (type == 'S'){
       // Only soft clips seen before the first M/I/D move the substring start
       if (in_leading_clip) first_base += op->Length;
     }
     else if (type != 'H'){
       string msg = "Invalid CIGAR char ";
       msg += type;
       PrintMessageDieOnError(msg, ERROR);
     }
     ++op;
   }
   bases = aln->nucleotides.substr(first_base, base_count);
 }
 // Returns the summed length of the 'M' operations that precede the first
 // 'I' or 'D' operation in [iter, end), after skipping at most one leading
 // 'H' followed by at most one leading 'S'. Returns -1 if a clip operation
 // or the end of the range is reached before any indel. Any other CIGAR
 // character is reported through PrintMessageDieOnError.
 template<typename CigarIterator> int GetDistToIndel(CigarIterator iter, CigarIterator end){
   // Step over the leading clipping ops
   if (iter != end && iter->Type == 'H')
     ++iter;
   if (iter != end && iter->Type == 'S')
     ++iter;

   int matched = 0;
   for (; iter != end; ++iter){
     const char op = iter->Type;
     switch(op){
     case 'M':
       matched += iter->Length;
       break;
     case 'I':
     case 'D':
       // First indel found: report the matched bases seen so far
       return matched;
     case 'S':
     case 'H':
       // Reached the trailing clip without meeting an indel
       return -1;
     default: {
       string msg = "Invalid CIGAR char";
       msg += op;
       PrintMessageDieOnError(msg, ERROR);
       break;
     }
     }
   }
   return -1;
 }
// Maps a filter type code to its textual name. An unrecognised code is
// reported through PrintMessageDieOnError; the "ERROR" return after that call
// is only a fallback should the call return.
std::string FilterCounter::GetFilterType(const int type){
  if (type == NOT_UNIT)        return "NOT_UNIT";
  if (type == DIFF_FROM_REF)   return "DIFF_FROM_REF";
  if (type == MAPPING_QUALITY) return "MAPPING_QUALITY";
  if (type == MATE_DIST)       return "MATE_DIST";
  if (type == ALLELE_SIZE)     return "ALLELE_SIZE";
  if (type == SPANNING_AMOUNT) return "SPANNING_AMOUNT";
  if (type == NUM_END_MATCHES) return "NUM_END_MATCHES";
  if (type == NOT_MAXIMAL_END) return "NOT_MAXIMAL_END";
  if (type == BP_BEFORE_INDEL) return "BP_BEFORE_INDEL";
  if (type == UNFILTERED)      return "UNFILTERED";

  // None of the known codes matched
  PrintMessageDieOnError("Invalid filter type", ERROR);
  return "ERROR"; // this should never be reached
}
 int base_to_int(char c){
   c = toupper(c);
   switch(c){
   case 'A':
     return 0;
   case 'C':
     return 1;
   case 'G':
     return 2;
   case 'T':
     return 3;
   case 'N':
     return 4;
   default:
     PrintMessageDieOnError("Invalid character in read " + c, ERROR);
   }
   return -1;
 }
  // Attempts to merge ("stitch") the two reads of a pair into one sequence.
  //
  // seq1 is the original sequence of the read at index aligned_read_num and
  // seq2 is the reverse complement of the other read (qualities reversed to
  // match). Every offset of seq2 against the tail of seq1, and of seq1 against
  // the tail of seq2, is scored as the fraction of identical bases in the
  // overlap. The stitch is accepted only if the best overlap is long enough
  // (MIN_STITCH_OVERLAP), scores at least STITCH_REQUIRED_SCORE, and no other
  // candidate scores within STITCH_DIFF of it.
  //
  // On success the stitched sequence and qualities are written to
  // reads.at(aligned_read_num).nucleotides / quality_scores, the position of
  // one of the alignments may be shifted, and true is returned. On failure
  // false is returned and read_pair is left unmodified.
  bool StitchReads(ReadPair* read_pair,
		   ALIGNMENT* left_alignment,
		   ALIGNMENT* right_alignment) {
    // Set up: work on copies of the original sequences and qualities
    const int& num_aligned_read = read_pair->aligned_read_num;
    string seq1 = read_pair->reads.at(num_aligned_read).orig_nucleotides;
    string seq2 = reverseComplement(read_pair->reads.
				    at(1-num_aligned_read).
				    orig_nucleotides);
    string seq1_qual = read_pair->reads.at(num_aligned_read).orig_qual;
    string seq2_qual = reverse(read_pair->reads.
			       at(1-num_aligned_read).orig_qual);
    // Both reads must be strictly longer than the minimum overlap; this also
    // keeps the unsigned "length() - MIN_STITCH_OVERLAP" loop bounds below
    // from wrapping around.
    if (seq1.length() <= MIN_STITCH_OVERLAP ||
	seq2.length() <= MIN_STITCH_OVERLAP) { 
      return false;
    }
    bool best_stitch_is_backwards = false;
    // scores collects the score of every candidate offset (plus an initial 0)
    // so that ambiguous stitches can be detected afterwards.
    vector<float> scores;
    scores.push_back(0);
    float score, max_score = 0;
    // max_score_index starts at SIZE_MAX (-1 converted to size_t); it is
    // always overwritten at i == 0 below because any score is >= the initial
    // max_score of 0.
    size_t overlap_len, max_score_index = -1;
    // Gradually bring ends together and try to stitch:
    // seq1[i..] is compared against the start of seq2.
    for (size_t i = 0; i <= seq1.length() - MIN_STITCH_OVERLAP; i++) {
      score = 0;
      overlap_len = seq1.length() - i;
      for (size_t j = 0; j < overlap_len; j++) {
	// Once the overlap runs past the end of seq2 the score is reset; since
	// j only grows, such an offset ends with a score of 0.
	if (j >= seq2.length()) {
	  score = 0;
	} else {
	  if (seq1.at(i+j) == seq2.at(j)) {
	    score += 1;
	  }
	}
      }
      // ">=" means ties are resolved in favour of the later offset
      if (score/overlap_len >= max_score) {
	max_score = score/overlap_len;
	max_score_index = i;
      }
      scores.push_back(score/overlap_len);
    }
    // Other direction: seq2[i..] is compared against the start of seq1.
    // A tie with the forward direction is resolved in favour of this one.
    for (size_t i = 0; i <= seq2.length() - MIN_STITCH_OVERLAP; i++) {
      score = 0;
      overlap_len = seq2.length() - i;
      for (size_t j = 0; j < overlap_len; j++) {
	if (j >=  seq1.length()) {
	  score = 0;
	} else {
	  if (seq2.at(i+j) == seq1.at(j)) {
	    score += 1;
	  }
	}
      }
      if (score/overlap_len >= max_score) {
	max_score = score/overlap_len;
	max_score_index = i;
	best_stitch_is_backwards = true;
      }
      scores.push_back(score/overlap_len);
    }
    // From here on overlap_len holds the overlap length minus one (index of
    // the last overlapping base); the loops below use "<=" accordingly.
    if (best_stitch_is_backwards) {
      overlap_len = seq2.length() - max_score_index - 1;
    } else {
      overlap_len = seq1.length() - max_score_index - 1;
    }
    // Check if stitch is good enough
    if ((overlap_len < MIN_STITCH_OVERLAP) ||
	(max_score < STITCH_REQUIRED_SCORE)) {
      return false;
    }
    // Check if too many good scores: the best candidate itself always counts
    // as one hit, so more than one hit means the stitch is ambiguous.
    int numhits = 0;
    for (size_t i = 0; i < scores.size(); i++) {
      if (scores.at(i) >= max_score - STITCH_DIFF) {
	numhits += 1;
      }
    }
    if (numhits > 1) return false;

    // For a backwards stitch swap the roles so that seq1 is always the
    // sequence whose tail overlaps the head of seq2.
    if (best_stitch_is_backwards) {
      string tmp = seq1;
      seq1 = seq2;
      seq2 = tmp;
      tmp = seq1_qual;
      seq1_qual = seq2_qual;
      seq2_qual = tmp;
    }
    
    // Start with the part of seq1 that precedes the overlap
    string stitched_string = seq1.
      substr(0, static_cast<int>(max_score_index));
    string stitched_qual = seq1_qual.
      substr(0, static_cast<int>(max_score_index));
    string na, nb, qa, qb;
    
    // Within the overlap keep, per position, the base whose quality character
    // compares higher; ties keep the base from seq1.
    for (size_t i = 0; i <= overlap_len; i++) {
      na = seq1.substr(max_score_index+i, 1);
      nb = seq2.substr(i, 1);
      qa = seq1_qual.substr(max_score_index+i, 1);
      qb = seq2_qual.substr(i, 1);
      if (qa > qb) {
	stitched_string.append(na);
	stitched_qual.append(qa);
      } else if (qa < qb) {
	stitched_string.append(nb);
	stitched_qual.append(qb);
      } else {
	stitched_string.append(na);
	stitched_qual.append(qa);
      }
    }
    // Append whatever part of seq2 extends beyond the overlap
    if (seq2.length() > overlap_len+1) {
      stitched_string.append(seq2.substr(overlap_len + 1));
      stitched_qual.append(seq2_qual.substr(overlap_len + 1));
    }
    // put stitched info in aligned read
    read_pair->reads.at(num_aligned_read).nucleotides = stitched_string;
    read_pair->reads.at(num_aligned_read).quality_scores = stitched_qual;
    // Shift the alignment position by the length of the newly attached
    // non-overlapping part. Note that seq1/seq2 here are the post-swap values
    // and that only left_alignment->left is consulted in both branches.
    if (best_stitch_is_backwards) {
      if (left_alignment->left) {
	left_alignment->pos -= (seq1.length() - overlap_len);
      }
    } else {
      if (!left_alignment->left) {
	right_alignment->pos -= (seq2.length() - overlap_len);
      }
    }
    if (align_debug) {
      PrintMessageDieOnError("[StitchReads]: Found stitching", DEBUG);
    }
    return true;
  }
  // Extracts the STR allele observed in an aligned read.
  //
  // Using the read's start position, the STR boundaries (msStart/msEnd) and
  // the alignment's CIGAR list, this trims the CIGAR down to the operations
  // covering the STR region, sums insertions and deletions there to obtain
  // the length difference from the reference, and cuts the corresponding
  // bases out of the read.
  //
  // On success sets aligned_read->diffFromRef and
  // aligned_read->detected_ms_nuc and returns true. Returns false when the
  // STR is too close to either end of the alignment, the CIGAR has more than
  // MAX_CIGAR_SIZE entries, the CIGAR does not cover the STR region, or the
  // extracted allele is too short.
  bool GetSTRAllele(MSReadRecord* aligned_read,
		    const CIGAR_LIST& cigar_list) {
    if (align_debug) {
      PrintMessageDieOnError("[GetSTRAllele]: starint GetSTRAllele", DEBUG);
    }
    // index where STR starts in the read (offset from read_start, plus one;
    // the substr calls below use str_index - 1)
    // NOTE(review): stored as size_t, so a negative difference would wrap to a
    // huge value and pass the distance check below — confirm callers guarantee
    // msStart >= read_start - 1.
    size_t str_index = aligned_read->msStart-aligned_read->read_start + 1;
    // Length of the total STR region
    size_t ms_length = aligned_read->msEnd - aligned_read->msStart;
    
    // check that not too close to ends
    // span = number of reference bases covered by the alignment (M and D ops)
    size_t span = 0;
    for (size_t i = 0; i < cigar_list.cigars.size(); i++) {
      const int& s = cigar_list.cigars.at(i).num;
      const char& t = cigar_list.cigars.at(i).cigar_type;
      if (t == 'M' || t == 'D') span += s;
    }
    // Reference bases covered by the alignment beyond the end of the STR
    // NOTE(review): unsigned as well; wraps if the alignment ends before msEnd.
    size_t str_index_end = aligned_read->read_start + span - aligned_read->msEnd;
    if ((str_index < MIN_DIST_FROM_END || str_index_end < MIN_DIST_FROM_END)) {
      if (align_debug) {
	PrintMessageDieOnError("[GetSTRAllele]: failed in dist from end check", DEBUG);
      }
      return false;
    }

    // If alignment is too messy, get rid of it
    if (cigar_list.cigars.size() > MAX_CIGAR_SIZE) {
      if (align_debug) {
	stringstream msg;
	msg << "[GetSTRAllele]: failed max cigar size test " << cigar_list.cigar_string;
	PrintMessageDieOnError(msg.str(), DEBUG);
      }
      return false;
    }
    
    // same as reference: a single CIGAR entry is treated as a gap-free
    // alignment, so the allele is cut straight out of the read
    if (cigar_list.cigars.size() == 1) {
      if (aligned_read->reverse) {
	aligned_read->detected_ms_nuc =
	  reverseComplement(aligned_read->nucleotides).
	  substr(str_index - 1, ms_length);
      } else {
	aligned_read->detected_ms_nuc =
	  aligned_read->nucleotides.substr(str_index - 1, ms_length);
      }
      aligned_read->diffFromRef = 0;
      // NOTE(review): this accepts length == MIN_STR_LENGTH, while the check at
      // the end of the function rejects it (<=) — confirm which is intended.
      return (aligned_read->detected_ms_nuc.length() >= MIN_STR_LENGTH);
    }
    
    // get only cigar score spanning the STR
    const int& str_start_in_cigar =
      aligned_read->msStart - aligned_read->read_start;
    // position into the segment
    int pos = 0;
    // base pairs spanned by this cigar item
    int bp = 0;
    // type of the cigar item
    char cigar_type;
    // index into the cigar score
    size_t cigar_index = 0;
    // diff to go until end of this segment
    int diff = 0;
    // temp cigar list to store when removing flanks
    CIGAR_LIST new_cigar_list;
    // list with only cigars for the STR region
    CIGAR_LIST str_cigar_list;
    // Diff in bp from ref STR
    int diff_from_ref = 0;
    
    // get rid of left flanking region
    while (pos <= str_start_in_cigar  &&
	   cigar_index < cigar_list.cigars.size()) {
      bp = cigar_list.cigars.at(cigar_index).num;
      cigar_type = cigar_list.cigars.at(cigar_index).cigar_type;
      // If match, deletion or soft clip, increment position
      if (cigar_type == 'M' || cigar_type == 'D' || cigar_type == 'S') pos += bp;
      // bp past the STR start after this item (negative: not reached yet)
      diff = pos - str_start_in_cigar;
      if (diff >= 0) {
	size_t cigar_index_to_include = cigar_index;
	// If this item is an M or S that ends exactly at the STR start, drop
	// it and keep from the next item on. Otherwise keep this item and turn
	// diff into the (non-positive) offset of the item's start relative to
	// the STR start.
	if (diff == 0 && (cigar_list.cigars.at(cigar_index).cigar_type == 'M' ||
			  cigar_list.cigars.at(cigar_index).cigar_type == 'S')) {
	  cigar_index_to_include += 1;
	} else {
	  diff -= cigar_list.cigars.at(cigar_index).num;
	}
	new_cigar_list.cigars.resize(cigar_list.cigars.size() -
				     cigar_index_to_include);
	copy(cigar_list.cigars.begin() + cigar_index_to_include,
	     cigar_list.cigars.end(),
	     new_cigar_list.cigars.begin());
	break;
      }
      cigar_index += 1;
    }
    // Update STR cigar taking away left flank
    // (stays empty if the loop above ended without reaching the STR start)
    str_cigar_list.cigars = new_cigar_list.cigars;
    str_cigar_list.ResetString();
    new_cigar_list.cigars.clear();
    
    // get rid of right flank cigars
    // start at beginning of STR list
    cigar_index = 0;
    // Position relative to the STR start; begins at the offset left in diff
    // by the left-flank pass
    pos = diff;
    int total_str_len = static_cast<int>(ms_length);
    while (pos < total_str_len) {
      // Ran out of cigar items before covering the whole STR
      if (cigar_index >= str_cigar_list.cigars.size()) {
	return false;
      }
      bp = str_cigar_list.cigars.at(cigar_index).num;
      cigar_type = str_cigar_list.cigars.at(cigar_index).cigar_type;
      if (cigar_type == 'M' || cigar_type == 'D' || cigar_type == 'S')
	pos += bp;
      // Difference between our position and the end of the STR
      diff = pos-total_str_len;
      if (diff >= 0) {
	size_t cigar_index_to_include = cigar_index;
	// If this item ends exactly at the STR end and the right adjacent
	// item is not M or S, include that item too
	if (cigar_index < str_cigar_list.cigars.size() - 1) {
	  const char& next_type = str_cigar_list.cigars.
	    at(cigar_index+1).cigar_type;
	  if (next_type != 'M' && next_type != 'S' && diff == 0) {
	    cigar_index_to_include += 1;
	  }
	}
	// Keep items [0, cigar_index_to_include]
	new_cigar_list.cigars.resize(cigar_index_to_include + 1);
	copy(str_cigar_list.cigars.begin(),
	     str_cigar_list.cigars.begin() + cigar_index_to_include + 1,
	     new_cigar_list.cigars.begin());
	break;
      }
      cigar_index += 1;
    }
    str_cigar_list.cigars.clear();
    str_cigar_list.cigars = new_cigar_list.cigars;
    str_cigar_list.ResetString();
    // set diff from ref: inserted bases minus deleted bases within the
    // STR-spanning cigar items
    diff_from_ref = 0;
    for (size_t i = 0; i < str_cigar_list.cigars.size(); i++) {
      if (str_cigar_list.cigars.at(i).cigar_type == 'I') {
	diff_from_ref += str_cigar_list.cigars.at(i).num;
      }
      if (str_cigar_list.cigars.at(i).cigar_type == 'D') {
	diff_from_ref -= str_cigar_list.cigars.at(i).num;
      }
    }
    
    // set STR region: reference STR length adjusted by the indel difference
    string ms_nuc;
    if (aligned_read->reverse) {
      string rev_read = reverseComplement(aligned_read->nucleotides);
      ms_nuc =  rev_read.substr(str_index - 1, ms_length+diff_from_ref);
    } else {
      ms_nuc =  aligned_read->nucleotides.
	substr(str_index - 1, ms_length+diff_from_ref);
    }
    // Alleles of length MIN_STR_LENGTH or shorter are rejected here
    if (ms_nuc.length() <= MIN_STR_LENGTH) {
      if (align_debug) {
	PrintMessageDieOnError("[GetSTRAllele]: failed min STR length check", DEBUG);
      }
      return false;
    }
    aligned_read->diffFromRef = diff_from_ref;
    aligned_read->detected_ms_nuc = ms_nuc;
    return true;
  }
 // Counts the run of bases at the head and at the tail of the alignment that
 // match the reference exactly. ref_seq holds the reference starting at
 // coordinate ref_seq_start. Mismatches, insertions and deletions all end the
 // current run; soft-clipped bases are skipped.
 //
 // Returns (head run, tail run); both equal the full run when the alignment
 // contains no mismatch or indel. Returns (-1,-1) when the read starts before
 // ref_seq_start or extends beyond the end of ref_seq. A CIGAR that is
 // inconsistent with the read's bases is reported through
 // PrintMessageDieOnError.
 pair<int,int> GetNumEndMatches(AlignedRead* aln, const string& ref_seq, int ref_seq_start){
   const pair<int,int> no_result(-1, -1);
   if (aln->read_start < ref_seq_start)
     return no_result;

   vector<BamTools::CigarOp>::iterator op            = aln->cigar_ops.begin();
   const vector<BamTools::CigarOp>::iterator ops_end = aln->cigar_ops.end();
   unsigned int read_pos = 0;
   unsigned int ref_pos  = aln->read_start-ref_seq_start;
   bool at_head = true;   // no mismatch/indel encountered yet
   int cur_run  = 0;      // length of the current run of matching bases
   int head_run = 0;      // run length recorded at the first mismatch/indel

   // Leading clips: hard clips carry no bases, soft clips consume read bases
   if (op != ops_end && op->Type == 'H')
     ++op;
   if (op != ops_end && op->Type == 'S'){
     read_pos += op->Length;
     ++op;
   }

   // Consume alignment ops while both coordinates remain inside their sequences
   for (; op != ops_end && ref_pos < ref_seq.size() && read_pos < aln->nucleotides.size(); ++op){
     const char type = op->Type;
     if (type == 'S' || type == 'H')
       break;   // trailing clip reached; handled after the loop

     switch(type){
     case 'M':
       if (ref_pos + op->Length > ref_seq.size())
         return no_result;
       if (read_pos + op->Length > aln->nucleotides.size())
         PrintMessageDieOnError("Nucleotides for aligned read don't correspond to the CIGAR string", ERROR);
       for (unsigned int k = 0; k < op->Length; ++k, ++read_pos, ++ref_pos){
         if (ref_seq[ref_pos] != aln->nucleotides[read_pos]){
           if (at_head) head_run = cur_run;
           at_head = false;
           cur_run = 0;
         }
         else
           cur_run++;
       }
       break;
     case 'I':
     case 'D':
       // An indel terminates the current run; I consumes read, D consumes ref
       if (at_head) head_run = cur_run;
       at_head = false;
       cur_run = 0;
       if (type == 'I')
         read_pos += op->Length;
       else
         ref_pos  += op->Length;
       break;
     default: {
       string msg = "Invalid CIGAR char";
       msg += type;
       PrintMessageDieOnError(msg, ERROR);
       break;
     }
     }
   }

   // Trailing clips
   if (op != ops_end && op->Type == 'S'){
     read_pos += op->Length;
     ++op;
   }
   if (op != ops_end && op->Type == 'H')
     ++op;

   // Leftover ops are only acceptable when the reference window ran out
   if (op != ops_end){
     if (ref_pos >= ref_seq.size())
       return no_result;
     PrintMessageDieOnError("Improperly formatted CIGAR string", ERROR);
   }

   // Likewise for read bases the CIGAR did not account for
   if (read_pos != aln->nucleotides.size()){
     if (ref_pos >= ref_seq.size())
       return no_result;
     PrintMessageDieOnError("CIGAR string does not correspond to alignment bases", ERROR);
   }

   return pair<int,int>(at_head ? cur_run : head_run, cur_run);
 } 
  // Core of the affine-gap alignment of readseq against refseq.
  //
  // M, Iref and Iread are the three score matrices and traceM, traceIref and
  // traceIread their traceback matrices, all stored flat with row index i
  // (read position, 0..L2) and column index j (reference position, 0..L1) at
  // offset i*(L1+1)+j. Only cells with i >= 1 and j >= 1 are written here, so
  // row 0 and column 0 are presumably initialised by the caller — TODO confirm.
  //
  // Outputs:
  //   refseq_al / readseq_al - the aligned sequences with '-' for gaps
  //   *score                 - the value reported by findOptimalStop
  //   cigar_list             - run-length encoded M/D/I ops for the read; a
  //                            final 'I' run is converted to 'S'
  void nw_helper(std::vector<float>& M, std::vector<float>& Iref, std::vector<float>& Iread, 
		 std::vector<int>& traceM, std::vector<int>& traceIref, std::vector<int>& traceIread,
		 const std::string& refseq, const std::string& readseq, 
		 std::string& refseq_al, std::string& readseq_al, 
		 float* score, std::vector<BamTools::CigarOp>& cigar_list){
    int L1 = refseq.length();
    int L2 = readseq.length();
    cigar_list.clear();

    // Various variables used in the matrix calculations
    // oindex/nindex: flat offsets of the predecessor cell and the current cell
    // c: receives the choice made by bestIndex, stored as the traceback entry
    //    (the traceback below interprets 0 = M, 1 = Iref, 2 = Iread)
    int ref_base, read_base, oindex, nindex;
    float s1, s2, s3;
    int c;

    // Fill in the 3 matrices using dynamic programming
    for (int i = 1; i <= L2; i++){
      for (int j = 1; j <= L1; j++){
	nindex    = i*(L1+1)+j;
	ref_base  = base_to_int(refseq[j-1]);
	read_base = base_to_int(readseq[i-1]);

	// Update M matrix (examine (i-1, j-1))
	// s is the substitution score table indexed by the two base codes
	oindex          = (i-1)*(L1+1)+(j-1);
	s1              = M[oindex];
	s2              = Iref[oindex];
	s3              = Iread[oindex];
	M[nindex]       = bestIndex(s1, s2, s3, &c) + s[ref_base][read_base];
	traceM[nindex]  = c;

	// Update Iref matrix (examine (i,j-1))
	// Continuing in Iref costs GAPEXTEND; entering from M or Iread costs GAPOPEN
	oindex             = i*(L1+1) + (j-1);
	s1                 = M[oindex]     - GAPOPEN;
	s2                 = Iref[oindex]  - GAPEXTEND;
	s3                 = Iread[oindex] - GAPOPEN;
	Iref[nindex]       = bestIndex(s1, s2, s3, &c);
	traceIref[nindex]  = c;

	// Update Iread matrix (examine (i-1,j))
	// Continuing in Iread costs GAPEXTEND; entering from M or Iref costs GAPOPEN
	oindex              = (i-1)*(L1+1) + j;
	s1                  = M[oindex]     - GAPOPEN;
	s2                  = Iref[oindex]  - GAPOPEN;
	s3                  = Iread[oindex] - GAPEXTEND;
	Iread[nindex]       = bestIndex(s1, s2, s3, &c);
	traceIread[nindex]  = c;
      }
    }
  
    //Find the best ending point for the alignment
    // (findOptimalStop fills in the score, the column and the matrix type)
    float best_val;
    int best_col, best_type;
    findOptimalStop(L1, L2, M, Iref, Iread, best_val, best_col, best_type);
  
    // Store the optimal alignment score
    *score = best_val;
  
    // The alignment strings are built back-to-front and reversed at the end
    std::stringstream refseq_ss, readseq_ss, cigar_ss;
  
    // Handle trailing gaps: reference bases after best_col face '-' in the
    // read and contribute nothing to the CIGAR
    for(int i = L1; i > best_col; i--){
      refseq_ss  << refseq.at(i-1);
      readseq_ss << "-";
    }

    // Traceback the optimal alignment, starting in the last read row and
    // continuing until every read base has been consumed
    int best_row = L2;
    std::string raw_cigar;
    int index;
    while (best_row > 0){
      index = best_row*(L1+1) + best_col;
      if (best_type == 0){
	// M: consume one base from both sequences
	refseq_ss  << refseq.at(best_col-1);
	readseq_ss << readseq.at(best_row-1);
	cigar_ss   << "M";
	best_type   = traceM[index];
	best_row--;
	best_col--;
      } 
      else if (best_type == 1){
	//Iref: reference base against a gap in the read (deletion)
	refseq_ss  << refseq.at(best_col-1);
	readseq_ss << "-";
	cigar_ss   << "D";
	best_type   = traceIref[index];
	best_col--;
      } 
      else if (best_type == 2){
	// Iread: read base against a gap in the reference (insertion)
	refseq_ss  << "-";
	readseq_ss << readseq.at(best_row-1);
	cigar_ss   << "I";
	best_type   = traceIread[index];
	best_row--;
      } 
      else
	PrintMessageDieOnError("Invalid matrix type in Needleman-Wunsch alignment", ERROR);
    }

    // Handle leading gaps: reference bases before the alignment start face
    // '-' in the read and contribute nothing to the CIGAR
    for (int i = best_col; i > 0; i--){
      refseq_ss  << refseq.at(i-1);
      readseq_ss << "-";
    }
  
    // Order alignment front to back
    refseq_al  = refseq_ss.str();
    readseq_al = readseq_ss.str();
    raw_cigar  = cigar_ss.str();
    reverse(refseq_al.begin(),  refseq_al.end());
    reverse(readseq_al.begin(), readseq_al.end());
    reverse(raw_cigar.begin(),  raw_cigar.end());

    // Simplify cigar string: collapse runs of the same character into
    // (type, length) operations
    // NOTE(review): when readseq is empty the traceback emits nothing, so
    // raw_cigar[0] is the terminating '\0' and a single op with that type and
    // length 1 is pushed — confirm callers never pass an empty read.
    char cigar_char = raw_cigar[0];
    int  num        = 1;
    char new_cigar_char;
    for(unsigned int i = 1; i < raw_cigar.length(); i++){
      new_cigar_char = raw_cigar[i];
      if (new_cigar_char != cigar_char){
	cigar_list.push_back(BamTools::CigarOp(cigar_char, num));
	num = 1;
	cigar_char = new_cigar_char;
      }
      else
	num += 1;
    }
    cigar_list.push_back(BamTools::CigarOp(cigar_char, num));
    // A trailing insertion is reported as a soft clip instead
    if (cigar_list.back().Type == 'I')
      cigar_list.back().Type = 'S';
  }
// Returns the counter stored for the given filter type.
// Valid types are 0 .. NUM_FILTERS-1; anything else is reported through
// PrintMessageDieOnError. The upper bound is exclusive because counts is
// presumably sized NUM_FILTERS (confirm against the class declaration), so
// type == NUM_FILTERS would index one past the end.
uint64_t FilterCounter::GetFilterCount(const int type){
  if (type >= NUM_FILTERS || type < 0)
    PrintMessageDieOnError("Invalid filter type", ERROR);
  return counts[type];
}
Exemple #10
0
// Adds one to the counter of the given filter type.
// Valid types are 0 .. NUM_FILTERS-1; anything else is reported through
// PrintMessageDieOnError. The upper bound is exclusive because counts is
// presumably sized NUM_FILTERS (confirm against the class declaration), so
// type == NUM_FILTERS would write one past the end.
void FilterCounter::increment(const int type){
  if (type >= NUM_FILTERS || type < 0)
    PrintMessageDieOnError("Invalid filter type", ERROR);
  counts[type]++;
}