bool Realigner::CreateRefFromQueryBases( // Inputs: const string& algn_query_bases, const vector<CigarOp>& algn_cigar_data, const string& md_tag, const bool clip_anchors) { // Step 1. Generate reference sequence based on QueryBases and Cigar alone if (verbose_) { cout << "Original Cigar : "; for (vector<CigarOp>::const_iterator cigar = algn_cigar_data.begin(); cigar != algn_cigar_data.end(); ++cigar) cout << cigar->Length << cigar->Type; cout << endl << "Original MD tag: " << md_tag << endl; } // Initialize variables pretty_tseq_.clear(); pretty_qseq_.clear(); pretty_aln_.clear(); q_seq_.clear(); t_seq_.clear(); clipped_anchors_.cigar_left.resize(1); clipped_anchors_.cigar_left[0].Type = 'S'; clipped_anchors_.cigar_left[0].Length = 0; clipped_anchors_.cigar_right.resize(1); clipped_anchors_.cigar_right[0].Type = 'S'; clipped_anchors_.cigar_right[0].Length = 0; clipped_anchors_.md_left.clear(); clipped_anchors_.md_right.clear(); const char *read_ptr = algn_query_bases.c_str(); for (vector<CigarOp>::const_iterator cigar = algn_cigar_data.begin(); cigar != algn_cigar_data.end(); ++cigar) { switch (cigar->Type) { case ('M') : case ('=') : case ('X') : t_seq_.append(read_ptr, cigar->Length); q_seq_.append(read_ptr, cigar->Length); pretty_tseq_.append(read_ptr, cigar->Length); pretty_qseq_.append(read_ptr, cigar->Length); pretty_aln_.append(cigar->Length, '|'); read_ptr += cigar->Length; break; case ('I') : q_seq_.append(read_ptr, cigar->Length); pretty_tseq_.append(cigar->Length,'-'); pretty_qseq_.append(read_ptr, cigar->Length); pretty_aln_.append(cigar->Length, '+'); read_ptr += cigar->Length; break; case ('S') : read_ptr += cigar->Length; if (cigar == algn_cigar_data.begin()) clipped_anchors_.cigar_left[0].Length = cigar->Length; else if (cigar == algn_cigar_data.end()-1) clipped_anchors_.cigar_right[0].Length = cigar->Length; else { if (verbose_) cout << "Error: invalid cigar string with soft clipped bases in the middle in the input." << endl; return false; } break; case ('D') : case ('P') : case ('N') : t_seq_.append(cigar->Length, '-'); pretty_tseq_.append(cigar->Length,'-'); pretty_qseq_.append(cigar->Length,'-'); pretty_aln_.append(cigar->Length, '-'); break; } } // Step 2: Further patch the sequence based on MD tag unsigned int t_idx = 0; unsigned int md_idx = 0; unsigned int pretty_idx = 0; int item_length = 0; while (md_idx < md_tag.length()) { if (md_tag.at(md_idx) >= '0' and md_tag.at(md_idx) <='9') { // it's a match item_length = 0; for (; md_idx < md_tag.length() and md_tag.at(md_idx) >= '0' and md_tag.at(md_idx) <= '9'; md_idx++) item_length = 10*item_length + md_tag.at(md_idx) - '0'; t_idx += item_length; while (pretty_idx < pretty_aln_.length() and (item_length > 0 or pretty_aln_.at(pretty_idx) == '+')) { if (pretty_aln_[pretty_idx] != '+') item_length--; pretty_idx++; } } else { bool is_deletion = false; if (md_tag.at(md_idx) == '^') { // Its a deletion or substitution md_idx++; is_deletion = true; } while (t_idx < t_seq_.length() and md_idx < md_tag.length() and pretty_idx < pretty_tseq_.length() and md_tag.at(md_idx) >= 'A' and md_tag.at(md_idx) <= 'Z') { if (pretty_aln_[pretty_idx] == '|') { if (is_deletion) { invalid_cigar_in_input = true; return false; } pretty_aln_[pretty_idx] = ' '; } pretty_tseq_[pretty_idx++] = md_tag.at(md_idx) + 'a' - 'A'; t_seq_.at(t_idx) = md_tag.at(md_idx); t_idx++; md_idx++; } } // Checks if Cigar and MD tag are consistent with each other bool invalid_pair = item_length > 0; invalid_pair = invalid_pair or (t_idx > t_seq_.length()); invalid_pair = invalid_pair or (pretty_idx == pretty_aln_.length() and md_idx < md_tag.length()-1); invalid_pair = invalid_pair or (md_idx == md_tag.length() and t_idx != t_seq_.length()); invalid_pair = invalid_pair or (md_idx == md_tag.length() and pretty_idx != pretty_aln_.length()); if (invalid_pair) { invalid_cigar_in_input = true; return false; } } // ------------------------------------------------ */ if (verbose_) { cout << "Original Alignment from BAM: " << endl << pretty_qseq_ << endl; cout << pretty_aln_ << endl << pretty_tseq_ << endl << endl; } // Step 3: Decide whether to realign this read and whether to clip ends bool realign_read = ClipAnchors(clip_anchors); if (realign_read) SetSequences(q_seq_, t_seq_, pretty_aln_, isForwardStrandRead_); return realign_read; }
// prepare internal structures for clipping and alignment // returns true if realignment was performed bool RealignImp::compute_alignment ( const char* q_seq, unsigned q_len, const char* r_seq, unsigned r_len, int r_pos, bool forward, const uint32_t* cigar, unsigned cigar_sz, uint32_t*& cigar_dest, unsigned& cigar_dest_sz, int& new_pos, bool& already_perfect, bool& clip_failed, bool& alignment_failed, bool& unclip_failed) { already_perfect = false; alignment_failed = false; unclip_failed = false; unsigned oplen; const char* q_seq_clipped = q_seq; const uint32_t* cigar_clipped = cigar; unsigned cigar_sz_clipped = cigar_sz; unsigned sclip_q_len, sclip_r_len, sclip_al_len; assert (cigar_sz); // reset realigner Reset (); // set clipping SetClipping ((int) cliptype_, forward); // clip out the hard and soft clipping zones from 5" and 3" // The 'cut out' of the q_seq is done by switching to downstream pointer. if (bam_cigar_op (*cigar) == BAM_CSOFT_CLIP) { oplen = bam_cigar_oplen (*cigar); ClipStart (oplen); q_seq_clipped += oplen; ++cigar_clipped; --cigar_sz_clipped; } if (cigar_sz > 1 && bam_cigar_op (cigar [cigar_sz - 1]) == BAM_CSOFT_CLIP) { oplen = bam_cigar_oplen (cigar [cigar_sz - 1]); ClipEnd (oplen); --cigar_sz_clipped; } // cigar defines q_seq and t_seq lengths sclip_al_len = seq_lens_from_bin_cigar (cigar_clipped, cigar_sz_clipped, &sclip_q_len, &sclip_r_len); const std::string query (q_seq_clipped, sclip_q_len); const std::string target (r_seq, sclip_r_len); std::string pretty_al; pretty_al.reserve (sclip_al_len); pretty_al_from_bin_cigar (cigar_clipped, cigar_sz_clipped, q_seq_clipped, r_seq, pretty_al); // Realigner requires strings of proper size to be passed to SetSequences SetSequences (query, target, pretty_al, forward); if (!ClipAnchors (clip_failed)) { already_perfect = true; return false; // alignment already good, no imperfect zone to realign found } // TODO avoid automatic vectors to prevent unneeded heap usage vector<MDelement> new_md_vec; vector<CigarOp> new_cigar_vec; unsigned int start_pos_shift; if (!computeSWalignment(new_cigar_vec, new_md_vec, start_pos_shift)) { alignment_failed = true; return false; } if (!addClippedBasesToTags(new_cigar_vec, new_md_vec, q_len)) { unclip_failed = true; return false; // error adding back clipped out zones } if (!LeftAnchorClipped () && start_pos_shift != 0) { // build cigar data only if it is needed // TODO avoid automatic vectors to prevent unneeded heap usage std::vector <CigarOp> cigar_vec; cigar_vector_from_bin (cigar, cigar_sz, cigar_vec); new_pos = updateReadPosition (cigar_vec, start_pos_shift, r_pos); } else new_pos = r_pos; // free (cigar_dest); // TODO: switch to better alignment memory management, avoid heap operations cigar_dest = (uint32_t*) tmap_malloc (sizeof (uint32_t) * new_cigar_vec.size (), "cigar_dest"); cigar_dest_sz = new_cigar_vec.size (); cigar_vector_to_bin (new_cigar_vec, cigar_dest); return true; }