/// Construct an aligner over two raw character sequences.
///
/// All scoring weights start from the GetDefault*() values, end-space flags
/// are cleared, and the nucleotide alphabet (g_nwaligner_nucleotides) is
/// selected.  Both sequences are copied into the internal m_Seq1Vec /
/// m_Seq2Vec buffers; the body then hands the score matrix to
/// SetScoreMatrix() and the sequences to SetSequences().
///
/// @param seq1      first sequence (len1 characters are copied)
/// @param len1      number of characters in seq1
/// @param seq2      second sequence (len2 characters are copied)
/// @param len2      number of characters in seq2
/// @param scoremat  score matrix forwarded to SetScoreMatrix()
CNWAligner::CNWAligner( const char* seq1, size_t len1,
                        const char* seq2, size_t len2,
                        const SNCBIPackedScoreMatrix* scoremat )
    : m_Wm(GetDefaultWm()),
      m_Wms(GetDefaultWms()),
      m_Wg(GetDefaultWg()),
      m_Ws(GetDefaultWs()),
      m_esf_L1(false),
      m_esf_R1(false),
      m_esf_L2(false),
      m_esf_R2(false),
      m_SmithWaterman(false),
      m_GapPreference(eLater),
      m_abc(g_nwaligner_nucleotides),
      m_ScoreMatrixInvalid(true),
      m_prg_callback(0),
      m_terminate(false),
      m_Seq1Vec(seq1, seq1 + len1),
      // operator[] on an empty vector is undefined behaviour, so a
      // zero-length sequence gets a null pointer instead of &vec[0].
      m_Seq1(m_Seq1Vec.empty() ? 0 : &m_Seq1Vec[0]),
      m_SeqLen1(len1),
      m_Seq2Vec(seq2, seq2 + len2),
      m_Seq2(m_Seq2Vec.empty() ? 0 : &m_Seq2Vec[0]),
      m_SeqLen2(len2),
      m_PositivesAsMatches(false),
      m_score(kInfMinus),
      m_mt(false),
      m_maxthreads(1),
      m_MaxMem(GetDefaultSpaceLimit())
{
    SetScoreMatrix(scoremat);
    SetSequences(seq1, len1, seq2, len2);
}
/// Align the sequences addressed by two seq-locs and return the result as a
/// dense-seg CSeq_align.
///
/// The IUPAC sequence data for each location is fetched through the scope,
/// loaded with SetSequences(), aligned with the parameterless Run(), and the
/// resulting dense-seg is anchored at each location's biological start,
/// strand and id.
///
/// @param scope          scope used to resolve both locations
/// @param loc1           first location; must be an interval or whole
/// @param loc2           second location; must be an interval or whole
/// @param trim_end_gaps  forwarded to GetDense_seg()
/// @return               new alignment of type eType_partial
/// @throws CException (eUnknown) if either location is neither an interval
///         nor a whole location
CRef<CSeq_align> CNWAligner::Run(CScope &scope, const CSeq_loc &loc1,
                                 const CSeq_loc &loc2, bool trim_end_gaps)
{
    // Both locations have to be validated; the second test must look at loc2.
    if ((!loc1.IsInt() && !loc1.IsWhole()) ||
        (!loc2.IsInt() && !loc2.IsWhole())) {
        NCBI_THROW(CException, eUnknown,
                   "Only whole and interval locations supported");
    }

    CSeqVector vec1(loc1, scope, CBioseq_Handle::eCoding_Iupac);
    string seq1;
    vec1.GetSeqData(0, vec1.size(), seq1);

    CSeqVector vec2(loc2, scope, CBioseq_Handle::eCoding_Iupac);
    string seq2;
    vec2.GetSeqData(0, vec2.size(), seq2);

    SetSequences(seq1, seq2);
    Run();

    CRef<CSeq_align> align(new CSeq_align);
    align->SetType(CSeq_align::eType_partial);
    align->SetSegs().SetDenseg(*GetDense_seg(
        loc1.GetStart(eExtreme_Biological), loc1.GetStrand(), *loc1.GetId(),
        loc2.GetStart(eExtreme_Biological), loc2.GetStrand(), *loc2.GetId(),
        trim_end_gaps));
    return align;
}
/// Construct an aligner over two sequences given as strings.
///
/// All scoring weights start from the GetDefault*() values, end-space flags
/// are cleared, and the nucleotide alphabet (g_nwaligner_nucleotides) is
/// selected.  Both strings are copied into the internal m_Seq1Vec /
/// m_Seq2Vec buffers; the body then hands the score matrix to
/// SetScoreMatrix() and the sequences to SetSequences().
///
/// @param seq1      first sequence
/// @param seq2      second sequence
/// @param scoremat  score matrix forwarded to SetScoreMatrix()
CNWAligner::CNWAligner(const string& seq1,
                       const string& seq2,
                       const SNCBIPackedScoreMatrix* scoremat)
    : m_Wm(GetDefaultWm()),
      m_Wms(GetDefaultWms()),
      m_Wg(GetDefaultWg()),
      m_Ws(GetDefaultWs()),
      m_esf_L1(false),
      m_esf_R1(false),
      m_esf_L2(false),
      m_esf_R2(false),
      m_SmithWaterman(false),
      m_GapPreference(eLater),
      m_abc(g_nwaligner_nucleotides),
      m_ScoreMatrixInvalid(true),
      m_prg_callback(0),
      m_terminate(false),
      m_Seq1Vec(seq1.begin(), seq1.end()),
      // operator[] on an empty vector is undefined behaviour, so an empty
      // string gets a null pointer instead of &vec[0].
      m_Seq1(m_Seq1Vec.empty() ? 0 : &m_Seq1Vec[0]),
      m_SeqLen1(seq1.size()),
      m_Seq2Vec(seq2.begin(), seq2.end()),
      m_Seq2(m_Seq2Vec.empty() ? 0 : &m_Seq2Vec[0]),
      m_SeqLen2(seq2.size()),
      // Initialised here exactly as in the (const char*, size_t, ...)
      // constructor; leaving it out left the flag indeterminate.
      m_PositivesAsMatches(false),
      m_score(kInfMinus),
      m_mt(false),
      m_maxthreads(1),
      m_MaxMem(GetDefaultSpaceLimit())
{
    SetScoreMatrix(scoremat);
    SetSequences(seq1, seq2);
}
/// Construct a PSSM aligner from a score profile (first input) and a plain
/// character sequence (second input).
///
/// The base CNWAligner is default-constructed.  Both frequency pointers are
/// cleared, the frequency scale is set to 1, and the start/end gap weights
/// take the GetDefaultWg() / GetDefaultWs() values.  The inputs are then
/// registered through SetSequences().
///
/// @param pssm1  score profile stored in m_Pssm1
/// @param len1   length passed to SetSequences() for pssm1
/// @param seq2   character sequence stored in m_Seq2
/// @param len2   length passed to SetSequences() for seq2
CPSSMAligner::CPSSMAligner(const TScore** pssm1, size_t len1,
                           const char* seq2, size_t len2)
    : CNWAligner(),
      m_Pssm1(pssm1),
      m_Freq1(0),
      m_Seq2(seq2),
      m_Freq2(0),
      m_FreqScale(1),
      m_StartWg(GetDefaultWg()),
      m_StartWs(GetDefaultWs()),
      m_EndWg(GetDefaultWg()),
      m_EndWs(GetDefaultWs())
{
    SetSequences(pssm1, len1, seq2, len2);
}
/// Construct a PSSM aligner from two frequency profiles.
///
/// The base CNWAligner is default-constructed.  The score-profile and
/// character-sequence pointers are cleared, the frequency scale is taken from
/// the caller, and the start/end gap weights take the GetDefaultWg() /
/// GetDefaultWs() values.  The score matrix is installed with
/// SetScoreMatrix() before the profiles are registered through
/// SetSequences().
///
/// @param freq1     first profile, stored in m_Freq1
/// @param len1      length passed to SetSequences() for freq1
/// @param freq2     second profile, stored in m_Freq2
/// @param len2      length passed to SetSequences() for freq2
/// @param scoremat  score matrix forwarded to SetScoreMatrix()
/// @param scale     value stored in m_FreqScale and forwarded to SetSequences()
CPSSMAligner::CPSSMAligner(const double** freq1, size_t len1,
                           const double** freq2, size_t len2,
                           const SNCBIPackedScoreMatrix *scoremat,
                           const int scale)
    : CNWAligner(),
      m_Pssm1(0),
      m_Freq1(freq1),
      m_Seq2(0),
      m_Freq2(freq2),
      m_FreqScale(scale),
      m_StartWg(GetDefaultWg()),
      m_StartWs(GetDefaultWs()),
      m_EndWg(GetDefaultWg()),
      m_EndWs(GetDefaultWs())
{
    SetScoreMatrix(scoremat);
    SetSequences(freq1, len1, freq2, len2, scale);
}
bool Realigner::CreateRefFromQueryBases( // Inputs: const string& algn_query_bases, const vector<CigarOp>& algn_cigar_data, const string& md_tag, const bool clip_anchors) { // Step 1. Generate reference sequence based on QueryBases and Cigar alone if (verbose_) { cout << "Original Cigar : "; for (vector<CigarOp>::const_iterator cigar = algn_cigar_data.begin(); cigar != algn_cigar_data.end(); ++cigar) cout << cigar->Length << cigar->Type; cout << endl << "Original MD tag: " << md_tag << endl; } // Initialize variables pretty_tseq_.clear(); pretty_qseq_.clear(); pretty_aln_.clear(); q_seq_.clear(); t_seq_.clear(); clipped_anchors_.cigar_left.resize(1); clipped_anchors_.cigar_left[0].Type = 'S'; clipped_anchors_.cigar_left[0].Length = 0; clipped_anchors_.cigar_right.resize(1); clipped_anchors_.cigar_right[0].Type = 'S'; clipped_anchors_.cigar_right[0].Length = 0; clipped_anchors_.md_left.clear(); clipped_anchors_.md_right.clear(); const char *read_ptr = algn_query_bases.c_str(); for (vector<CigarOp>::const_iterator cigar = algn_cigar_data.begin(); cigar != algn_cigar_data.end(); ++cigar) { switch (cigar->Type) { case ('M') : case ('=') : case ('X') : t_seq_.append(read_ptr, cigar->Length); q_seq_.append(read_ptr, cigar->Length); pretty_tseq_.append(read_ptr, cigar->Length); pretty_qseq_.append(read_ptr, cigar->Length); pretty_aln_.append(cigar->Length, '|'); read_ptr += cigar->Length; break; case ('I') : q_seq_.append(read_ptr, cigar->Length); pretty_tseq_.append(cigar->Length,'-'); pretty_qseq_.append(read_ptr, cigar->Length); pretty_aln_.append(cigar->Length, '+'); read_ptr += cigar->Length; break; case ('S') : read_ptr += cigar->Length; if (cigar == algn_cigar_data.begin()) clipped_anchors_.cigar_left[0].Length = cigar->Length; else if (cigar == algn_cigar_data.end()-1) clipped_anchors_.cigar_right[0].Length = cigar->Length; else { if (verbose_) cout << "Error: invalid cigar string with soft clipped bases in the middle in the input." 
<< endl; return false; } break; case ('D') : case ('P') : case ('N') : t_seq_.append(cigar->Length, '-'); pretty_tseq_.append(cigar->Length,'-'); pretty_qseq_.append(cigar->Length,'-'); pretty_aln_.append(cigar->Length, '-'); break; } } // Step 2: Further patch the sequence based on MD tag unsigned int t_idx = 0; unsigned int md_idx = 0; unsigned int pretty_idx = 0; int item_length = 0; while (md_idx < md_tag.length()) { if (md_tag.at(md_idx) >= '0' and md_tag.at(md_idx) <='9') { // it's a match item_length = 0; for (; md_idx < md_tag.length() and md_tag.at(md_idx) >= '0' and md_tag.at(md_idx) <= '9'; md_idx++) item_length = 10*item_length + md_tag.at(md_idx) - '0'; t_idx += item_length; while (pretty_idx < pretty_aln_.length() and (item_length > 0 or pretty_aln_.at(pretty_idx) == '+')) { if (pretty_aln_[pretty_idx] != '+') item_length--; pretty_idx++; } } else { bool is_deletion = false; if (md_tag.at(md_idx) == '^') { // Its a deletion or substitution md_idx++; is_deletion = true; } while (t_idx < t_seq_.length() and md_idx < md_tag.length() and pretty_idx < pretty_tseq_.length() and md_tag.at(md_idx) >= 'A' and md_tag.at(md_idx) <= 'Z') { if (pretty_aln_[pretty_idx] == '|') { if (is_deletion) { invalid_cigar_in_input = true; return false; } pretty_aln_[pretty_idx] = ' '; } pretty_tseq_[pretty_idx++] = md_tag.at(md_idx) + 'a' - 'A'; t_seq_.at(t_idx) = md_tag.at(md_idx); t_idx++; md_idx++; } } // Checks if Cigar and MD tag are consistent with each other bool invalid_pair = item_length > 0; invalid_pair = invalid_pair or (t_idx > t_seq_.length()); invalid_pair = invalid_pair or (pretty_idx == pretty_aln_.length() and md_idx < md_tag.length()-1); invalid_pair = invalid_pair or (md_idx == md_tag.length() and t_idx != t_seq_.length()); invalid_pair = invalid_pair or (md_idx == md_tag.length() and pretty_idx != pretty_aln_.length()); if (invalid_pair) { invalid_cigar_in_input = true; return false; } } // ------------------------------------------------ */ if (verbose_) 
{ cout << "Original Alignment from BAM: " << endl << pretty_qseq_ << endl; cout << pretty_aln_ << endl << pretty_tseq_ << endl << endl; } // Step 3: Decide whether to realign this read and whether to clip ends bool realign_read = ClipAnchors(clip_anchors); if (realign_read) SetSequences(q_seq_, t_seq_, pretty_aln_, isForwardStrandRead_); return realign_read; }
// prepare internal structures for clipping and alignment // returns true if realignment was performed bool RealignImp::compute_alignment ( const char* q_seq, unsigned q_len, const char* r_seq, unsigned r_len, int r_pos, bool forward, const uint32_t* cigar, unsigned cigar_sz, uint32_t*& cigar_dest, unsigned& cigar_dest_sz, int& new_pos, bool& already_perfect, bool& clip_failed, bool& alignment_failed, bool& unclip_failed) { already_perfect = false; alignment_failed = false; unclip_failed = false; unsigned oplen; const char* q_seq_clipped = q_seq; const uint32_t* cigar_clipped = cigar; unsigned cigar_sz_clipped = cigar_sz; unsigned sclip_q_len, sclip_r_len, sclip_al_len; assert (cigar_sz); // reset realigner Reset (); // set clipping SetClipping ((int) cliptype_, forward); // clip out the hard and soft clipping zones from 5" and 3" // The 'cut out' of the q_seq is done by switching to downstream pointer. if (bam_cigar_op (*cigar) == BAM_CSOFT_CLIP) { oplen = bam_cigar_oplen (*cigar); ClipStart (oplen); q_seq_clipped += oplen; ++cigar_clipped; --cigar_sz_clipped; } if (cigar_sz > 1 && bam_cigar_op (cigar [cigar_sz - 1]) == BAM_CSOFT_CLIP) { oplen = bam_cigar_oplen (cigar [cigar_sz - 1]); ClipEnd (oplen); --cigar_sz_clipped; } // cigar defines q_seq and t_seq lengths sclip_al_len = seq_lens_from_bin_cigar (cigar_clipped, cigar_sz_clipped, &sclip_q_len, &sclip_r_len); const std::string query (q_seq_clipped, sclip_q_len); const std::string target (r_seq, sclip_r_len); std::string pretty_al; pretty_al.reserve (sclip_al_len); pretty_al_from_bin_cigar (cigar_clipped, cigar_sz_clipped, q_seq_clipped, r_seq, pretty_al); // Realigner requires strings of proper size to be passed to SetSequences SetSequences (query, target, pretty_al, forward); if (!ClipAnchors (clip_failed)) { already_perfect = true; return false; // alignment already good, no imperfect zone to realign found } // TODO avoid automatic vectors to prevent unneeded heap usage vector<MDelement> new_md_vec; 
vector<CigarOp> new_cigar_vec; unsigned int start_pos_shift; if (!computeSWalignment(new_cigar_vec, new_md_vec, start_pos_shift)) { alignment_failed = true; return false; } if (!addClippedBasesToTags(new_cigar_vec, new_md_vec, q_len)) { unclip_failed = true; return false; // error adding back clipped out zones } if (!LeftAnchorClipped () && start_pos_shift != 0) { // build cigar data only if it is needed // TODO avoid automatic vectors to prevent unneeded heap usage std::vector <CigarOp> cigar_vec; cigar_vector_from_bin (cigar, cigar_sz, cigar_vec); new_pos = updateReadPosition (cigar_vec, start_pos_shift, r_pos); } else new_pos = r_pos; // free (cigar_dest); // TODO: switch to better alignment memory management, avoid heap operations cigar_dest = (uint32_t*) tmap_malloc (sizeof (uint32_t) * new_cigar_vec.size (), "cigar_dest"); cigar_dest_sz = new_cigar_vec.size (); cigar_vector_to_bin (new_cigar_vec, cigar_dest); return true; }
void CNWAligner::SetSequences(const string& seq1, const string& seq2, bool verify) { SetSequences(seq1.data(), seq1.size(), seq2.data(), seq2.size(), verify); }