// prepare internal structures for clipping and alignment // returns true if realignment was performed bool ContAlignImp::compute_alignment ( const char* q_seq, unsigned q_len, const char* r_seq, unsigned r_len, int r_pos, bool forward, const uint32_t* cigar, unsigned cigar_sz, uint32_t*& cigar_dest, unsigned& cigar_dest_sz, int& new_pos, bool& already_perfect, bool& clip_failed, bool& alignment_failed, bool& unclip_failed) { already_perfect = false; alignment_failed = false; unclip_failed = false; // unsigned oplen; //const char* q_seq_clipped = q_seq; //const uint32_t* cigar_clipped = cigar; //unsigned cigar_sz_clipped = cigar_sz; //unsigned sclip_q_len, sclip_r_len, sclip_al_len; assert (cigar_sz); // clip out the hard and soft clipping zones from 5" and 3" // The 'cut out' of the q_seq is done by switching to downstream pointer. uint32_t clean_len; EndClips clips; const char* clean_read = clip_seq (q_seq, cigar, cigar_sz, clean_len, clips); // clip reference accordingly //r_seq += clips.soft_beg_; //r_len -= clips.soft_beg_ + clips.soft_end_; if (clean_len > MAX_SEQ_LEN || r_len > MAX_SEQ_LEN) { // std::cerr << "Sequence is too long to fit into aligner (" << std::max (clean_len, r_len) << ", max is " << MAX_SEQ_LEN << ")" << std::endl; #if 0 std::cerr << "Cigar is "; cigar_out (std::cerr, cigar, cigar_sz); std::cerr << "\nClips: " << clips << std::endl; #endif return false; } unsigned qry_ins; // extra bases in query == width_left unsigned ref_ins; // extra bases in reference == width_right band_width (cigar, cigar_sz, qry_ins, ref_ins); if (!ContAlign::can_align (clean_len, r_len, qry_ins + extra_bandwidth_, ref_ins + extra_bandwidth_)) return false; ContAlign::align_band ( clean_read, // xseq clean_len, // xlen r_seq, // yseq r_len, // ylen 0, // xpos 0, // ypos std::max (clean_len, r_len), // segment length qry_ins + extra_bandwidth_, // width_left ref_ins + extra_bandwidth_, // width_right - forces to width_left true, // to_beg true // to_end ); unsigned bno = ContAlign::backtrace ( batches_, // BATCH buffer MAX_BATCH_NO, // size of BATCH buffer ref_ins + extra_bandwidth_ // width ); // convert alignment to cigar unsigned qry_off, ref_off; // int ref_shift = roll_cigar (new_cigar_, MAX_CIGAR_SZ, cigar_dest_sz, batches_, bno, clean_len, clips, qry_off, ref_off); new_pos = r_pos + ref_off; cigar_dest = new_cigar_; return true; }
bool BamProcessor::processRecord () { trclog << "\nProcessing record " << read_cnt_ << " - " << rec_.getReadName () << ", " << rec_.get0BasedUnclippedEnd () << "->" << rec_.getReadLength () << ", ref " << rec_.getReferenceName () << std::endl; const char* seq = rec_.getSequence (); unsigned position = rec_.get0BasedPosition (); unsigned new_position = position; bool reverse_match = (rec_.getFlag () & 0x10); Cigar* cigar_p = rec_.getCigarInfo (); if (!cigar_p->size ()) // can not recreate reference is cigar is missing. Keep record unaligned. { // TODO: allow to specify and load external reference ++ unaligned_cnt_; return true; } myassert (cigar_p); const String *mdval = rec_.getStringTag ("MD"); if (!mdval) // can not recreate reference is MD tag is missing. Keep record as is. { warn << "No MD Tag for record " << proc_cnt_ << ". Skipping record." << std::endl; ++nomd_cnt_; return true; // record will be kept as-is. } std::string md_tag = mdval->c_str (); // find the non-clipped region uint32_t clean_len; EndClips clips; const char* clean_read = clip_seq (seq, *cigar_p, clean_len, clips); // find length needed for the reference // this reserves space enough for entire refference, including softclipped ends. unsigned ref_len = cigar_p->getExpectedReferenceBaseCount (); if (ref_buffer_sz_ < ref_len) { ref_buffer_sz_ = (1 + ref_len / REF_BUF_INCR) * REF_BUF_INCR; ref_buffer_.reset (ref_buffer_sz_); } if (clean_len > MAX_SEQ_LEN || ref_len > MAX_SEQ_LEN) { ++ toolongs_; return true; } // recreate reference by Query, Cigar, and MD tag. Do not include softclipped ends in the recreated sequence (use default last parameter) recreate_ref (seq, rec_.getReadLength (), cigar_p, md_tag.c_str (), ref_buffer_, ref_buffer_sz_); unsigned qry_ins; // extra bases in query == width_left unsigned ref_ins; // extra bases in reference == width_right band_width (*cigar_p, qry_ins, ref_ins); if (log_matr_ || log_base_) { logfile_ << "Record " << read_cnt_ << ": " << rec_.getReadName () << "\n" << " sequence (" << rec_.getReadLength () << " bases)\n"; } CigarRoller roller; int ref_shift = 0; // shift of the new alignment position on refereance relative the original unsigned qry_off, ref_off; // offsets on the query and reference of the first non-clipped aligned bases double new_score = 0; switch (p_->algo ()) { case ContalignParams::TEMPL: { // call aligner new_score = taligner_.eval (clean_read, clean_len, ref_buffer_, ref_len, 0, band_width_); // read traceback // TODO: convert directly to cigar genstr::Alignment* al = taligner_.trace (); // convert alignment to cigar ref_shift = roll_cigar (roller, *al, clean_len, clips, qry_off, ref_off); } break; case ContalignParams::PLAIN: { new_score = aligner_.align_band ( clean_read, // xseq clean_len, // xlen ref_buffer_, // yseq ref_len, // ylen 0, // xpos 0, // ypos std::max (clean_len, ref_len), // segment length qry_ins + band_width_, // width_left false, // unpack ref_ins + band_width_, // width_right - forces to width_left true, // to_beg true // to_end ); unsigned bno = aligner_.backtrace ( batches_, // BATCH buffer max_batch_no_, // size of BATCH buffer false, // fill the BATCH array in reverse direction ref_ins + band_width_ // width ); // convert alignment to cigar ref_shift = roll_cigar (roller, batches_, bno, clean_len, clips, qry_off, ref_off); } break; case ContalignParams::POLY: { new_score = contalign_.align_band ( clean_read, // xseq clean_len, // xlen ref_buffer_, // yseq ref_len, // ylen 0, // xpos 0, // ypos std::max (clean_len, ref_len), // segment length qry_ins + band_width_, // width_left false, // unpack ref_ins + band_width_, // width_right - forces to width_left true, // to_beg true // to_end ); unsigned bno = contalign_.backtrace ( batches_, // BATCH buffer max_batch_no_, // size of BATCH buffer false, // fill the BATCH array in reverse direction ref_ins + band_width_ // width ); // convert alignment to cigar ref_shift = roll_cigar (roller, batches_, bno, clean_len, clips, qry_off, ref_off); } break; default: break; } ++realigned_cnt_; // compare original and new cigar (and location) if (ref_shift || !(*cigar_p == roller)) { // save original cigar and position for reporting std::string orig_cigar_str; rec_.getCigarInfo ()->getCigarString (orig_cigar_str); int32_t prior_pos = rec_.get0BasedPosition (); // replace cigar rec_.setCigar (roller); ++ modified_cnt_; // update pos_adjusted_cnt if position changed if (ref_shift != 0) { myassert (prior_pos + ref_shift >= 0); rec_.set0BasedPosition (prior_pos + ref_shift); ++ pos_adjusted_cnt_; } if (log_diff_) { const unsigned MAX_BATCH_PRINTED = 100; BATCH batches [MAX_BATCH_PRINTED]; std::string new_cigar_str; unsigned bno; int swscore; rec_.getCigarInfo ()->getCigarString (new_cigar_str); if (!log_base_ && !log_matr_) logfile_ << "Record " << read_cnt_ << ": " << rec_.getReadName () << " (" << rec_.getReadLength () << " bases)\n"; logfile_ << " ORIG ALIGNMENT:" << std::right << std::setw (9) << prior_pos+1 << "->" << orig_cigar_str << "\n"; bno = cigar_to_batches (orig_cigar_str, batches, MAX_BATCH_PRINTED); swscore = align_score (batches, bno, clean_read, ref_buffer_, p_->gip (), p_->gep (), p_->mat (), p_->mis ()); print_batches (clean_read, clean_len, false, ref_buffer_, ref_len, false, batches, bno, logfile_, false, prior_pos + clips.soft_beg_, clips.soft_beg_, 0, 160); logfile_ << "\n 'classic' SW score is " << swscore << "\n"; logfile_ << " NEW ALIGNMENT:" << std::right << std::setw (9) << rec_.get1BasedPosition () << "->" << new_cigar_str << std::endl; bno = cigar_to_batches (new_cigar_str, batches, MAX_BATCH_PRINTED); swscore = align_score (batches, bno, clean_read + qry_off, ref_buffer_ + ref_off, p_->gip (), p_->gep (), p_->mat (), p_->mis ()); print_batches (clean_read + qry_off, clean_len - qry_off, false, ref_buffer_ + ref_off, ref_len - ref_off, false, batches, bno, logfile_, false, prior_pos + clips.soft_beg_ + ref_off, clips.soft_beg_ + qry_off, 0, 160); logfile_ << "\n 'classic' SW score is " << swscore; logfile_ << "\n alternate (context-aware) score is " << new_score << ", used bandwidth left: " << qry_ins + band_width_ << ", right: " << ref_ins + band_width_ << "\n" << std::endl; } else if (log_base_) { logfile_ << "Recomputed alignment differs from original:\n"; logfile_ << " ORIG ALIGNMENT:" << std::right << std::setw (9) << prior_pos+1 << "->" << orig_cigar_str << "\n"; std::string new_cigar_str; rec_.getCigarInfo ()->getCigarString (new_cigar_str); logfile_ << " NEW ALIGNMENT:" << std::right << std::setw (9) << rec_.get1BasedPosition () << "->" << new_cigar_str << "\n" << std::endl; } } else { if (log_base_) { logfile_ << "Recomputed alignment matches the original:\n"; std::string orig_cigar_str; rec_.getCigarInfo ()->getCigarString (orig_cigar_str); int32_t prior_pos = rec_.get0BasedPosition (); logfile_ << " " << std::right << std::setw (9) << prior_pos+1 << "->" << orig_cigar_str << "\n" << std::endl; } } return true; }