/*
 * Reports the clipping-free view of an alignment through the output references.
 *
 *   bases           - substring of aln->nucleotides that starts after any soft
 *                     clip seen before the first M/I/D operation and spans the
 *                     total length of all M and I operations
 *   unclipped_start - aln->read_start
 *   unclipped_end   - aln->read_start - 1 plus the total length of all M and D
 *                     operations
 *
 * Any CIGAR type other than M, I, D, S or H is reported through
 * PrintMessageDieOnError with severity ERROR.
 */
void GetUnclippedInfo(AlignedRead* aln, string& bases, int& unclipped_start, int& unclipped_end){
  unclipped_start = aln->read_start;
  unclipped_end   = aln->read_start-1;

  bool in_leading_clip = true;  // stays true until the first M, I or D operation
  int  first_base      = 0;     // offset of the first retained base in aln->nucleotides
  int  kept_bases      = 0;     // number of read bases covered by M and I operations

  for (vector<BamTools::CigarOp>::iterator op = aln->cigar_ops.begin(); op != aln->cigar_ops.end(); ++op){
    const char op_type = op->Type;
    if (op_type == 'M'){
      unclipped_end += op->Length;
      kept_bases    += op->Length;
      in_leading_clip = false;
    }
    else if (op_type == 'I'){
      kept_bases += op->Length;
      in_leading_clip = false;
    }
    else if (op_type == 'D'){
      unclipped_end += op->Length;
      in_leading_clip = false;
    }
    else if (op_type == 'S'){
      // Only a soft clip that precedes every M/I/D operation shifts the start offset
      if (in_leading_clip)
        first_base += op->Length;
    }
    else if (op_type == 'H'){
      // Hard clips contribute to neither the offset nor the length
    }
    else {
      string msg = "Invalid CIGAR char ";
      msg += op->Type;
      PrintMessageDieOnError(msg, ERROR);
    }
  }

  bases = aln->nucleotides.substr(first_base, kept_bases);
}
template<typename CigarIterator> int GetDistToIndel(CigarIterator iter, CigarIterator end){ // Process leading clipping ops if (iter != end && iter->Type == 'H') iter++; if (iter != end && iter->Type == 'S') iter++; int dist = 0; while (iter != end){ char type = iter->Type; if (type == 'M') dist += iter->Length; else if (type == 'I' || type == 'D') return dist; else if (type == 'S' || type == 'H') return -1; else { string msg = "Invalid CIGAR char"; msg += type; PrintMessageDieOnError(msg, ERROR); } iter++; } return -1; }
std::string FilterCounter::GetFilterType(const int type){ switch(type) { case NOT_UNIT: return "NOT_UNIT"; case DIFF_FROM_REF: return "DIFF_FROM_REF"; case MAPPING_QUALITY: return "MAPPING_QUALITY"; case MATE_DIST: return "MATE_DIST"; case ALLELE_SIZE: return "ALLELE_SIZE"; case SPANNING_AMOUNT: return "SPANNING_AMOUNT"; case NUM_END_MATCHES: return "NUM_END_MATCHES"; case NOT_MAXIMAL_END: return "NOT_MAXIMAL_END"; case BP_BEFORE_INDEL: return "BP_BEFORE_INDEL"; case UNFILTERED: return "UNFILTERED"; default: PrintMessageDieOnError("Invalid filter type", ERROR); return "ERROR"; // this should never be reached } }
int base_to_int(char c){ c = toupper(c); switch(c){ case 'A': return 0; case 'C': return 1; case 'G': return 2; case 'T': return 3; case 'N': return 4; default: PrintMessageDieOnError("Invalid character in read " + c, ERROR); } return -1; }
bool StitchReads(ReadPair* read_pair, ALIGNMENT* left_alignment, ALIGNMENT* right_alignment) { // Set up const int& num_aligned_read = read_pair->aligned_read_num; string seq1 = read_pair->reads.at(num_aligned_read).orig_nucleotides; string seq2 = reverseComplement(read_pair->reads. at(1-num_aligned_read). orig_nucleotides); string seq1_qual = read_pair->reads.at(num_aligned_read).orig_qual; string seq2_qual = reverse(read_pair->reads. at(1-num_aligned_read).orig_qual); if (seq1.length() <= MIN_STITCH_OVERLAP || seq2.length() <= MIN_STITCH_OVERLAP) { return false; } bool best_stitch_is_backwards = false; vector<float> scores; scores.push_back(0); float score, max_score = 0; size_t overlap_len, max_score_index = -1; // Gradually bring ends together and try to stitch for (size_t i = 0; i <= seq1.length() - MIN_STITCH_OVERLAP; i++) { score = 0; overlap_len = seq1.length() - i; for (size_t j = 0; j < overlap_len; j++) { if (j >= seq2.length()) { score = 0; } else { if (seq1.at(i+j) == seq2.at(j)) { score += 1; } } } if (score/overlap_len >= max_score) { max_score = score/overlap_len; max_score_index = i; } scores.push_back(score/overlap_len); } // Other direction for (size_t i = 0; i <= seq2.length() - MIN_STITCH_OVERLAP; i++) { score = 0; overlap_len = seq2.length() - i; for (size_t j = 0; j < overlap_len; j++) { if (j >= seq1.length()) { score = 0; } else { if (seq2.at(i+j) == seq1.at(j)) { score += 1; } } } if (score/overlap_len >= max_score) { max_score = score/overlap_len; max_score_index = i; best_stitch_is_backwards = true; } scores.push_back(score/overlap_len); } if (best_stitch_is_backwards) { overlap_len = seq2.length() - max_score_index - 1; } else { overlap_len = seq1.length() - max_score_index - 1; } // Check if stitch is good enough if ((overlap_len < MIN_STITCH_OVERLAP) || (max_score < STITCH_REQUIRED_SCORE)) { return false; } // Check if too many good scores int numhits = 0; for (size_t i = 0; i < scores.size(); i++) { if (scores.at(i) >= max_score - 
STITCH_DIFF) { numhits += 1; } } if (numhits > 1) return false; if (best_stitch_is_backwards) { string tmp = seq1; seq1 = seq2; seq2 = tmp; tmp = seq1_qual; seq1_qual = seq2_qual; seq2_qual = tmp; } string stitched_string = seq1. substr(0, static_cast<int>(max_score_index)); string stitched_qual = seq1_qual. substr(0, static_cast<int>(max_score_index)); string na, nb, qa, qb; for (size_t i = 0; i <= overlap_len; i++) { na = seq1.substr(max_score_index+i, 1); nb = seq2.substr(i, 1); qa = seq1_qual.substr(max_score_index+i, 1); qb = seq2_qual.substr(i, 1); if (qa > qb) { stitched_string.append(na); stitched_qual.append(qa); } else if (qa < qb) { stitched_string.append(nb); stitched_qual.append(qb); } else { stitched_string.append(na); stitched_qual.append(qa); } } if (seq2.length() > overlap_len+1) { stitched_string.append(seq2.substr(overlap_len + 1)); stitched_qual.append(seq2_qual.substr(overlap_len + 1)); } // put stitched info in aligned read read_pair->reads.at(num_aligned_read).nucleotides = stitched_string; read_pair->reads.at(num_aligned_read).quality_scores = stitched_qual; if (best_stitch_is_backwards) { if (left_alignment->left) { left_alignment->pos -= (seq1.length() - overlap_len); } } else { if (!left_alignment->left) { right_alignment->pos -= (seq2.length() - overlap_len); } } if (align_debug) { PrintMessageDieOnError("[StitchReads]: Found stitching", DEBUG); } return true; }
/*
 * Extracts the STR allele carried by an aligned read.
 *
 * Using the read's CIGAR (cigar_list) the function keeps only the operations
 * overlapping the STR region [msStart, msEnd), sums insertions minus deletions
 * in that region into diffFromRef, and cuts the corresponding bases out of the
 * read (reverse-complemented first when aligned_read->reverse is set) into
 * detected_ms_nuc.
 *
 * Returns false without setting the outputs when the STR lies closer than
 * MIN_DIST_FROM_END to either end of the aligned span, when the CIGAR has more
 * than MAX_CIGAR_SIZE operations, when the CIGAR runs out before the STR end
 * is reached, or when the extracted allele is not longer than MIN_STR_LENGTH.
 * A single-operation CIGAR takes a shortcut: diffFromRef is 0 and the result
 * is whether the extracted allele has length >= MIN_STR_LENGTH.
 */
bool GetSTRAllele(MSReadRecord* aligned_read, const CIGAR_LIST& cigar_list) {
  if (align_debug) {
    PrintMessageDieOnError("[GetSTRAllele]: starint GetSTRAllele", DEBUG);
  }

  // 1-based offset of the STR start within the read (substr calls use str_index - 1)
  size_t str_index = aligned_read->msStart-aligned_read->read_start + 1;
  // Length of the total STR region
  size_t ms_length = aligned_read->msEnd - aligned_read->msStart;

  // Reference span of the alignment: M and D operations only
  size_t span = 0;
  for (size_t i = 0; i < cigar_list.cigars.size(); i++) {
    const int op_len = cigar_list.cigars.at(i).num;
    const char op_type = cigar_list.cigars.at(i).cigar_type;
    if (op_type == 'M' || op_type == 'D') span += op_len;
  }
  size_t str_index_end = aligned_read->read_start + span - aligned_read->msEnd;

  // check that not too close to ends
  // NOTE(review): str_index and str_index_end are size_t, so a negative
  // difference wraps to a very large value and passes this check - confirm
  // callers guarantee that the alignment spans the STR.
  if (str_index < MIN_DIST_FROM_END || str_index_end < MIN_DIST_FROM_END) {
    if (align_debug) {
      PrintMessageDieOnError("[GetSTRAllele]: failed in dist from end check", DEBUG);
    }
    return false;
  }

  // If alignment is too messy, get rid of it
  if (cigar_list.cigars.size() > MAX_CIGAR_SIZE) {
    if (align_debug) {
      stringstream msg;
      msg << "[GetSTRAllele]: failed max cigar size test " << cigar_list.cigar_string;
      PrintMessageDieOnError(msg.str(), DEBUG);
    }
    return false;
  }

  // same as reference
  if (cigar_list.cigars.size() == 1) {
    aligned_read->detected_ms_nuc = aligned_read->reverse
        ? reverseComplement(aligned_read->nucleotides).substr(str_index - 1, ms_length)
        : aligned_read->nucleotides.substr(str_index - 1, ms_length);
    aligned_read->diffFromRef = 0;
    // NOTE(review): this path accepts length == MIN_STR_LENGTH while the
    // general path below rejects it - confirm which bound is intended.
    return (aligned_read->detected_ms_nuc.length() >= MIN_STR_LENGTH);
  }

  // get only cigar score spanning the STR
  const int str_start_in_cigar = aligned_read->msStart - aligned_read->read_start;
  int walked = 0;     // bases covered so far by M, D and S operations
  int overshoot = 0;  // walked minus the current target; carried from the left pass into the right pass
  CIGAR_LIST scratch;         // temp cigar list to store when removing flanks
  CIGAR_LIST str_cigar_list;  // list with only cigars for the STR region

  // get rid of left flanking region
  for (size_t idx = 0; walked <= str_start_in_cigar && idx < cigar_list.cigars.size(); idx += 1) {
    const int op_len = cigar_list.cigars.at(idx).num;
    const char op_type = cigar_list.cigars.at(idx).cigar_type;
    if (op_type == 'M' || op_type == 'D' || op_type == 'S') walked += op_len;

    // bp past the STR start after this operation
    overshoot = walked - str_start_in_cigar;
    if (overshoot < 0) continue;

    size_t first_kept = idx;
    if (overshoot == 0 && (op_type == 'M' || op_type == 'S')) {
      // The operation ends exactly at the STR start: drop it
      first_kept += 1;
    } else {
      // The operation is kept, so rewind to its beginning
      overshoot -= op_len;
    }
    scratch.cigars.assign(cigar_list.cigars.begin() + first_kept, cigar_list.cigars.end());
    break;
  }

  // Update STR cigar taking away left flank
  str_cigar_list.cigars = scratch.cigars;
  str_cigar_list.ResetString();
  scratch.cigars.clear();

  // get rid of right flank cigars, starting at the beginning of the STR list
  walked = overshoot;
  const int total_str_len = static_cast<int>(ms_length);
  for (size_t idx = 0; walked < total_str_len; idx += 1) {
    if (idx >= str_cigar_list.cigars.size()) {
      return false;
    }
    const int op_len = str_cigar_list.cigars.at(idx).num;
    const char op_type = str_cigar_list.cigars.at(idx).cigar_type;
    if (op_type == 'M' || op_type == 'D' || op_type == 'S') walked += op_len;

    // Difference between our position and the end of the STR
    overshoot = walked - total_str_len;
    if (overshoot < 0) continue;

    size_t last_kept = idx;
    // If right adjacent is not M or S, include it
    if (idx + 1 < str_cigar_list.cigars.size()) {
      const char next_type = str_cigar_list.cigars.at(idx+1).cigar_type;
      if (overshoot == 0 && next_type != 'M' && next_type != 'S') {
        last_kept += 1;
      }
    }
    scratch.cigars.assign(str_cigar_list.cigars.begin(),
                          str_cigar_list.cigars.begin() + last_kept + 1);
    break;
  }
  str_cigar_list.cigars = scratch.cigars;
  str_cigar_list.ResetString();

  // set diff from ref: inserted bases minus deleted bases inside the STR
  int diff_from_ref = 0;
  for (size_t i = 0; i < str_cigar_list.cigars.size(); i++) {
    const char op_type = str_cigar_list.cigars.at(i).cigar_type;
    if (op_type == 'I') {
      diff_from_ref += str_cigar_list.cigars.at(i).num;
    } else if (op_type == 'D') {
      diff_from_ref -= str_cigar_list.cigars.at(i).num;
    }
  }

  // set STR region
  string ms_nuc;
  if (aligned_read->reverse) {
    const string rev_read = reverseComplement(aligned_read->nucleotides);
    ms_nuc = rev_read.substr(str_index - 1, ms_length+diff_from_ref);
  } else {
    ms_nuc = aligned_read->nucleotides.substr(str_index - 1, ms_length+diff_from_ref);
  }
  if (ms_nuc.length() <= MIN_STR_LENGTH) {
    if (align_debug) {
      PrintMessageDieOnError("[GetSTRAllele]: failed min STR length check", DEBUG);
    }
    return false;
  }

  aligned_read->diffFromRef = diff_from_ref;
  aligned_read->detected_ms_nuc = ms_nuc;
  return true;
}
/*
 * Counts the runs of bases that match the reference at the two ends of an
 * alignment, ignoring clipped bases.
 *
 * ref_seq holds reference bases starting at coordinate ref_seq_start. The
 * result is (head, tail): head is the number of matching bases before the
 * first mismatch, insertion or deletion, and tail is the number of matching
 * bases after the last one. When the alignment has none of these, both values
 * are the full match count.
 *
 * Returns (-1,-1) when the alignment starts before ref_seq_start or extends
 * beyond the end of ref_seq. A CIGAR that is malformed or inconsistent with
 * aln->nucleotides is reported through PrintMessageDieOnError with severity
 * ERROR.
 */
pair<int,int> GetNumEndMatches(AlignedRead* aln, const string& ref_seq, int ref_seq_start){
  const pair<int,int> out_of_bounds(-1,-1);
  if (aln->read_start < ref_seq_start)
    return out_of_bounds;

  unsigned int read_index = 0;
  unsigned int ref_index  = aln->read_start-ref_seq_start;
  vector<BamTools::CigarOp>::iterator op = aln->cigar_ops.begin();
  const vector<BamTools::CigarOp>::iterator ops_end = aln->cigar_ops.end();

  bool run_broken = false;  // set once the first mismatch/indel has been seen
  int  match_run  = 0;      // length of the current run of matching bases
  int  head_match = 0;      // run length recorded at the first mismatch/indel

  // Process leading clip CIGAR types
  if (op != ops_end && op->Type == 'H')
    ++op;
  if (op != ops_end && op->Type == 'S'){
    read_index += op->Length;
    ++op;
  }

  // Process CIGAR items as long as read region lies within reference sequence bounds
  while (op != ops_end && ref_index < ref_seq.size() && read_index < aln->nucleotides.size()){
    const char op_type = op->Type;

    // A clip here can only be a trailing one; it is handled after the loop
    if (op_type == 'S' || op_type == 'H')
      break;

    if (op_type == 'M'){
      if (ref_index + op->Length > ref_seq.size())
        return out_of_bounds;
      if (read_index + op->Length > aln->nucleotides.size())
        PrintMessageDieOnError("Nucleotides for aligned read don't correspond to the CIGAR string", ERROR);

      for (unsigned int k = 0; k < op->Length; ++k){
        if (ref_seq[ref_index] == aln->nucleotides[read_index])
          match_run++;
        else {
          if (!run_broken)
            head_match = match_run;
          run_broken = true;
          match_run  = 0;
        }
        read_index++;
        ref_index++;
      }
    }
    else if (op_type == 'I' || op_type == 'D'){
      // An indel ends the current run; I consumes read bases, D consumes reference bases
      if (!run_broken)
        head_match = match_run;
      run_broken = true;
      match_run  = 0;
      if (op_type == 'I')
        read_index += op->Length;
      else
        ref_index += op->Length;
    }
    else {
      string msg = "Invalid CIGAR char";
      msg += op->Type;
      PrintMessageDieOnError(msg, ERROR);
    }
    ++op;
  }

  // Process trailing clip CIGAR types
  if (op != ops_end && op->Type == 'S'){
    read_index += op->Length;
    ++op;
  }
  if (op != ops_end && op->Type == 'H')
    ++op;

  // Ensure that we processed all CIGAR options
  if (op != ops_end){
    if (ref_index >= ref_seq.size())
      return out_of_bounds;
    PrintMessageDieOnError("Improperly formatted CIGAR string", ERROR);
  }

  // Ensure that CIGAR string corresponded to aligned bases
  if (read_index != aln->nucleotides.size()){
    if (ref_index >= ref_seq.size())
      return out_of_bounds;
    PrintMessageDieOnError("CIGAR string does not correspond to alignment bases", ERROR);
  }

  return pair<int,int>(run_broken ? head_match : match_run, match_run);
}
void nw_helper(std::vector<float>& M, std::vector<float>& Iref, std::vector<float>& Iread, std::vector<int>& traceM, std::vector<int>& traceIref, std::vector<int>& traceIread, const std::string& refseq, const std::string& readseq, std::string& refseq_al, std::string& readseq_al, float* score, std::vector<BamTools::CigarOp>& cigar_list){ int L1 = refseq.length(); int L2 = readseq.length(); cigar_list.clear(); // Various variables used in the matrix calculations int ref_base, read_base, oindex, nindex; float s1, s2, s3; int c; // Fill in the 3 matrices using dynamic programming for (int i = 1; i <= L2; i++){ for (int j = 1; j <= L1; j++){ nindex = i*(L1+1)+j; ref_base = base_to_int(refseq[j-1]); read_base = base_to_int(readseq[i-1]); // Update M matrix (examine (i-1, j-1)) oindex = (i-1)*(L1+1)+(j-1); s1 = M[oindex]; s2 = Iref[oindex]; s3 = Iread[oindex]; M[nindex] = bestIndex(s1, s2, s3, &c) + s[ref_base][read_base]; traceM[nindex] = c; // Update Iref matrix (examine (i,j-1)) oindex = i*(L1+1) + (j-1); s1 = M[oindex] - GAPOPEN; s2 = Iref[oindex] - GAPEXTEND; s3 = Iread[oindex] - GAPOPEN; Iref[nindex] = bestIndex(s1, s2, s3, &c); traceIref[nindex] = c; // Update Iread matrix (examine (i-1,j)) oindex = (i-1)*(L1+1) + j; s1 = M[oindex] - GAPOPEN; s2 = Iref[oindex] - GAPOPEN; s3 = Iread[oindex] - GAPEXTEND; Iread[nindex] = bestIndex(s1, s2, s3, &c); traceIread[nindex] = c; } } //Find the best ending point for the alignment float best_val; int best_col, best_type; findOptimalStop(L1, L2, M, Iref, Iread, best_val, best_col, best_type); // Store the optimal alignment score *score = best_val; std::stringstream refseq_ss, readseq_ss, cigar_ss; // Handle trailing gaps for(int i = L1; i > best_col; i--){ refseq_ss << refseq.at(i-1); readseq_ss << "-"; } // Traceback the optimal alignment int best_row = L2; std::string raw_cigar; int index; while (best_row > 0){ index = best_row*(L1+1) + best_col; if (best_type == 0){ // M refseq_ss << refseq.at(best_col-1); readseq_ss << 
readseq.at(best_row-1); cigar_ss << "M"; best_type = traceM[index]; best_row--; best_col--; } else if (best_type == 1){ //Iref refseq_ss << refseq.at(best_col-1); readseq_ss << "-"; cigar_ss << "D"; best_type = traceIref[index]; best_col--; } else if (best_type == 2){ // Iread refseq_ss << "-"; readseq_ss << readseq.at(best_row-1); cigar_ss << "I"; best_type = traceIread[index]; best_row--; } else PrintMessageDieOnError("Invalid matrix type in Needleman-Wunsch alignment", ERROR); } // Handle leading gaps for (int i = best_col; i > 0; i--){ refseq_ss << refseq.at(i-1); readseq_ss << "-"; } // Order alignment front to back refseq_al = refseq_ss.str(); readseq_al = readseq_ss.str(); raw_cigar = cigar_ss.str(); reverse(refseq_al.begin(), refseq_al.end()); reverse(readseq_al.begin(), readseq_al.end()); reverse(raw_cigar.begin(), raw_cigar.end()); // Simplify cigar string char cigar_char = raw_cigar[0]; int num = 1; char new_cigar_char; for(unsigned int i = 1; i < raw_cigar.length(); i++){ new_cigar_char = raw_cigar[i]; if (new_cigar_char != cigar_char){ cigar_list.push_back(BamTools::CigarOp(cigar_char, num)); num = 1; cigar_char = new_cigar_char; } else num += 1; } cigar_list.push_back(BamTools::CigarOp(cigar_char, num)); if (cigar_list.back().Type == 'I') cigar_list.back().Type = 'S'; }
uint64_t FilterCounter::GetFilterCount(const int type){ if (type > NUM_FILTERS || type < 0) PrintMessageDieOnError("Invalid filter type", ERROR); return counts[type]; }
void FilterCounter::increment(const int type){ if (type > NUM_FILTERS || type < 0) PrintMessageDieOnError("Invalid filter type", ERROR); counts[type]++; }