size_t recreate_ref (const char* query, size_t query_len, const char* cigar_str, const char* mdtag, char* dest, size_t destlen, bool include_softclip) { CigarRoller cigar (cigar_str); return recreate_ref (query, query_len, &cigar, mdtag, dest, destlen, include_softclip); }
bool BamProcessor::processRecord () { trclog << "\nProcessing record " << read_cnt_ << " - " << rec_.getReadName () << ", " << rec_.get0BasedUnclippedEnd () << "->" << rec_.getReadLength () << ", ref " << rec_.getReferenceName () << std::endl; const char* seq = rec_.getSequence (); unsigned position = rec_.get0BasedPosition (); unsigned new_position = position; bool reverse_match = (rec_.getFlag () & 0x10); Cigar* cigar_p = rec_.getCigarInfo (); if (!cigar_p->size ()) // can not recreate reference is cigar is missing. Keep record unaligned. { // TODO: allow to specify and load external reference ++ unaligned_cnt_; return true; } myassert (cigar_p); const String *mdval = rec_.getStringTag ("MD"); if (!mdval) // can not recreate reference is MD tag is missing. Keep record as is. { warn << "No MD Tag for record " << proc_cnt_ << ". Skipping record." << std::endl; ++nomd_cnt_; return true; // record will be kept as-is. } std::string md_tag = mdval->c_str (); // find the non-clipped region uint32_t clean_len; EndClips clips; const char* clean_read = clip_seq (seq, *cigar_p, clean_len, clips); // find length needed for the reference // this reserves space enough for entire refference, including softclipped ends. unsigned ref_len = cigar_p->getExpectedReferenceBaseCount (); if (ref_buffer_sz_ < ref_len) { ref_buffer_sz_ = (1 + ref_len / REF_BUF_INCR) * REF_BUF_INCR; ref_buffer_.reset (ref_buffer_sz_); } if (clean_len > MAX_SEQ_LEN || ref_len > MAX_SEQ_LEN) { ++ toolongs_; return true; } // recreate reference by Query, Cigar, and MD tag. Do not include softclipped ends in the recreated sequence (use default last parameter) recreate_ref (seq, rec_.getReadLength (), cigar_p, md_tag.c_str (), ref_buffer_, ref_buffer_sz_); unsigned qry_ins; // extra bases in query == width_left unsigned ref_ins; // extra bases in reference == width_right band_width (*cigar_p, qry_ins, ref_ins); if (log_matr_ || log_base_) { logfile_ << "Record " << read_cnt_ << ": " << rec_.getReadName () << "\n" << " sequence (" << rec_.getReadLength () << " bases)\n"; } CigarRoller roller; int ref_shift = 0; // shift of the new alignment position on refereance relative the original unsigned qry_off, ref_off; // offsets on the query and reference of the first non-clipped aligned bases double new_score = 0; switch (p_->algo ()) { case ContalignParams::TEMPL: { // call aligner new_score = taligner_.eval (clean_read, clean_len, ref_buffer_, ref_len, 0, band_width_); // read traceback // TODO: convert directly to cigar genstr::Alignment* al = taligner_.trace (); // convert alignment to cigar ref_shift = roll_cigar (roller, *al, clean_len, clips, qry_off, ref_off); } break; case ContalignParams::PLAIN: { new_score = aligner_.align_band ( clean_read, // xseq clean_len, // xlen ref_buffer_, // yseq ref_len, // ylen 0, // xpos 0, // ypos std::max (clean_len, ref_len), // segment length qry_ins + band_width_, // width_left false, // unpack ref_ins + band_width_, // width_right - forces to width_left true, // to_beg true // to_end ); unsigned bno = aligner_.backtrace ( batches_, // BATCH buffer max_batch_no_, // size of BATCH buffer false, // fill the BATCH array in reverse direction ref_ins + band_width_ // width ); // convert alignment to cigar ref_shift = roll_cigar (roller, batches_, bno, clean_len, clips, qry_off, ref_off); } break; case ContalignParams::POLY: { new_score = contalign_.align_band ( clean_read, // xseq clean_len, // xlen ref_buffer_, // yseq ref_len, // ylen 0, // xpos 0, // ypos std::max (clean_len, ref_len), // segment length qry_ins + band_width_, // width_left false, // unpack ref_ins + band_width_, // width_right - forces to width_left true, // to_beg true // to_end ); unsigned bno = contalign_.backtrace ( batches_, // BATCH buffer max_batch_no_, // size of BATCH buffer false, // fill the BATCH array in reverse direction ref_ins + band_width_ // width ); // convert alignment to cigar ref_shift = roll_cigar (roller, batches_, bno, clean_len, clips, qry_off, ref_off); } break; default: break; } ++realigned_cnt_; // compare original and new cigar (and location) if (ref_shift || !(*cigar_p == roller)) { // save original cigar and position for reporting std::string orig_cigar_str; rec_.getCigarInfo ()->getCigarString (orig_cigar_str); int32_t prior_pos = rec_.get0BasedPosition (); // replace cigar rec_.setCigar (roller); ++ modified_cnt_; // update pos_adjusted_cnt if position changed if (ref_shift != 0) { myassert (prior_pos + ref_shift >= 0); rec_.set0BasedPosition (prior_pos + ref_shift); ++ pos_adjusted_cnt_; } if (log_diff_) { const unsigned MAX_BATCH_PRINTED = 100; BATCH batches [MAX_BATCH_PRINTED]; std::string new_cigar_str; unsigned bno; int swscore; rec_.getCigarInfo ()->getCigarString (new_cigar_str); if (!log_base_ && !log_matr_) logfile_ << "Record " << read_cnt_ << ": " << rec_.getReadName () << " (" << rec_.getReadLength () << " bases)\n"; logfile_ << " ORIG ALIGNMENT:" << std::right << std::setw (9) << prior_pos+1 << "->" << orig_cigar_str << "\n"; bno = cigar_to_batches (orig_cigar_str, batches, MAX_BATCH_PRINTED); swscore = align_score (batches, bno, clean_read, ref_buffer_, p_->gip (), p_->gep (), p_->mat (), p_->mis ()); print_batches (clean_read, clean_len, false, ref_buffer_, ref_len, false, batches, bno, logfile_, false, prior_pos + clips.soft_beg_, clips.soft_beg_, 0, 160); logfile_ << "\n 'classic' SW score is " << swscore << "\n"; logfile_ << " NEW ALIGNMENT:" << std::right << std::setw (9) << rec_.get1BasedPosition () << "->" << new_cigar_str << std::endl; bno = cigar_to_batches (new_cigar_str, batches, MAX_BATCH_PRINTED); swscore = align_score (batches, bno, clean_read + qry_off, ref_buffer_ + ref_off, p_->gip (), p_->gep (), p_->mat (), p_->mis ()); print_batches (clean_read + qry_off, clean_len - qry_off, false, ref_buffer_ + ref_off, ref_len - ref_off, false, batches, bno, logfile_, false, prior_pos + clips.soft_beg_ + ref_off, clips.soft_beg_ + qry_off, 0, 160); logfile_ << "\n 'classic' SW score is " << swscore; logfile_ << "\n alternate (context-aware) score is " << new_score << ", used bandwidth left: " << qry_ins + band_width_ << ", right: " << ref_ins + band_width_ << "\n" << std::endl; } else if (log_base_) { logfile_ << "Recomputed alignment differs from original:\n"; logfile_ << " ORIG ALIGNMENT:" << std::right << std::setw (9) << prior_pos+1 << "->" << orig_cigar_str << "\n"; std::string new_cigar_str; rec_.getCigarInfo ()->getCigarString (new_cigar_str); logfile_ << " NEW ALIGNMENT:" << std::right << std::setw (9) << rec_.get1BasedPosition () << "->" << new_cigar_str << "\n" << std::endl; } } else { if (log_base_) { logfile_ << "Recomputed alignment matches the original:\n"; std::string orig_cigar_str; rec_.getCigarInfo ()->getCigarString (orig_cigar_str); int32_t prior_pos = rec_.get0BasedPosition (); logfile_ << " " << std::right << std::setw (9) << prior_pos+1 << "->" << orig_cigar_str << "\n" << std::endl; } } return true; }