/** * @brief Debug dump, optional */ void KlibAlignment::update() { _impl->result = ksw_align( _impl->reflen, _impl->ref.get(), _impl->altlen, _impl->alt.get(), 5, _impl->mat, _impl->gapo, _impl->gape, KSW_XSTART, // add flags here &(_impl->qprofile)); if(_impl->cigar) { free(_impl->cigar); _impl->cigar = NULL; _impl->cigar_len = 0; } ksw_global( _impl->result.qe - _impl->result.qb + 1, _impl->ref.get() + _impl->result.qb, _impl->result.te - _impl->result.tb + 1, _impl->alt.get() + _impl->result.tb, 5, _impl->mat, _impl->gapo, _impl->gape, _impl->altlen, &_impl->cigar_len, &_impl->cigar); _impl->valid_result = true; }
/**
 * Try to place a mate sequence by Smith-Waterman inside the window where it is
 * expected, given an anchoring hit and insert-size statistics.
 *
 * @param opt     options; opt->q and opt->r are handed to ksw_align() as gap
 *                penalties, opt->t is the minimum score (also OR-ed into the
 *                ksw flag word, so it is presumably expected to fit in the low
 *                16 bits -- TODO confirm)
 * @param l_pac   upper bound for reference coordinates; the window end is clipped to it
 * @param pac     packed reference, four 2-bit bases per byte
 * @param st      insert-size statistics (st->avg, st->std)
 * @param h       anchoring hit (k, beg, end, is_rev are read)
 * @param l_mseq  length of mseq
 * @param mseq    mate sequence as characters; translated through nst_nt4_table
 * @param a       output hit; per the original note it has been zeroed by the
 *                caller apart from its flag bits
 * @param g_mat   scoring matrix passed to ksw_align() with alphabet size 5
 *
 * The function returns early, leaving only n_seeds, flag and is_rev of *a
 * modified, when the window is shorter than the mate or when the scratch
 * buffer cannot be allocated.
 */
void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const bsw2pestat_t *st, const bsw2hit_t *h, int l_mseq, const char *mseq, bsw2hit_t *a, int8_t g_mat[25])
{
	int64_t k, beg, end;
	uint8_t *seq, *ref;
	int i;
	// compute the region start and end
	a->n_seeds = 1; a->flag |= BSW2_FLAG_MATESW; // before calling this routine, *a has been cleared with memset(0); the flag is set with 1<<6/7
	if (h->is_rev == 0) {
		beg = (int64_t)(h->k + st->avg - EXT_STDDEV * st->std - l_mseq + .499);
		if (beg < h->k) beg = h->k;
		end = (int64_t)(h->k + st->avg + EXT_STDDEV * st->std + .499);
		a->is_rev = 1; a->flag |= 16;
	} else {
		beg = (int64_t)(h->k + h->end - h->beg - st->avg - EXT_STDDEV * st->std + .499);
		end = (int64_t)(h->k + h->end - h->beg - st->avg + EXT_STDDEV * st->std + l_mseq + .499);
		if (end > h->k + (h->end - h->beg)) end = h->k + (h->end - h->beg);
		a->is_rev = 0;
	}
	if (beg < 1) beg = 1;
	if (end > l_pac) end = l_pac;
	if (end - beg < l_mseq) return;
	// generate the sequence: one buffer holds the encoded mate followed by the reference window
	seq = malloc(l_mseq + (end - beg));
	if (seq == NULL) return; // out of memory: treated like "no window", nothing is aligned
	ref = seq + l_mseq;
	for (k = beg; k < end; ++k)
		ref[k - beg] = pac[k>>2] >> ((~k&3)<<1) & 0x3;
	// The table index is taken as an unsigned byte so that characters >= 0x80 cannot
	// produce a negative index (assumes nst_nt4_table has 256 entries -- TODO confirm).
	if (h->is_rev == 0) {
		for (i = 0; i < l_mseq; ++i) { // on the reverse strand
			int c = nst_nt4_table[(uint8_t)mseq[i]];
			seq[l_mseq - 1 - i] = c > 3? 4 : 3 - c;
		}
	} else {
		for (i = 0; i < l_mseq; ++i) // on the forward strand
			seq[i] = nst_nt4_table[(uint8_t)mseq[i]];
	}
	{
		int flag = KSW_XSUBO | KSW_XSTART | (l_mseq * g_mat[0] < 250? KSW_XBYTE : 0) | opt->t;
		kswr_t aln;
		aln = ksw_align(l_mseq, seq, end - beg, ref, 5, g_mat, opt->q, opt->r, flag, 0);
		a->G = aln.score;
		a->G2 = aln.score2;
		// scores below the threshold are reported as 0
		if (a->G < opt->t) a->G = 0;
		if (a->G2 < opt->t) a->G2 = 0;
		if (a->G2) a->flag |= BSW2_FLAG_TANDEM;
		a->k = beg + aln.tb;
		a->len = aln.te - aln.tb + 1;
		a->beg = aln.qb;
		a->end = aln.qe + 1;
		/*
		printf("[Q] "); for (i = 0; i < l_mseq; ++i) putchar("ACGTN"[(int)seq[i]]); putchar('\n');
		printf("[R] "); for (i = 0; i < end - beg; ++i) putchar("ACGTN"[(int)ref[i]]); putchar('\n');
		printf("G=%d,G2=%d,beg=%d,end=%d,k=%lld,len=%d\n", a->G, a->G2, a->beg, a->end, a->k, a->len);
		*/
	}
	// the mate was aligned reverse-complemented: flip the query interval back
	if (a->is_rev) i = a->beg, a->beg = l_mseq - a->end, a->end = l_mseq - i;
	free(seq);
}
/**
 * Mask adapter sequence in one read.
 *
 * The read is encoded through seq_nt4_table and every adapter in opt->adaps is
 * aligned against it with ksw_align(). An adapter hit that passes the
 * filters below increments the adapter's counter (atomically) and overwrites
 * part of `seq` with 'X':
 *   - adapter type 5: from the start of the read up to the projected adapter end;
 *   - adapter type 3: from the projected adapter start to the end of the read.
 *
 * @param opt  trimming options (adapters, scoring matrix, gap penalties,
 *             sa/sb, min_len, min_sc, max_diff)
 * @param seq  NUL-terminated read; modified in place
 *
 * If the scratch buffer for the encoded read cannot be allocated the read is
 * left untouched.
 */
void ta_trim1(ta_opt_t *opt, char *seq)
{
	int i, j, k;
	kstring_t _str = {0,0,0}, *str = &_str;
	str->l = strlen(seq);
	str->s = malloc(str->l);
	if (str->s == NULL) return; // allocation failed (or zero-length request): nothing can be aligned
	for (i = 0; i < str->l; ++i)
		str->s[i] = seq_nt4_table[(uint8_t)seq[i]];
	for (j = 0; j < opt->n_adaps; ++j) {
		kswr_t r;
		double diff;
		int type;
		ta_adap_t *p = &opt->adaps[j];
		// adapter is the query, read is the target
		r = ksw_align(p->len, p->seq, str->l, (uint8_t*)str->s, 5, opt->mat, opt->go, opt->ge, KSW_XBYTE|KSW_XSTART|(opt->min_len * opt->sa), 0);
		++r.te; ++r.qe; // change to 0-based (ends become exclusive, cf. the [b,e) debug print below)
		// k: the shorter of the two aligned spans; diff: score shortfall relative to a perfect match of length k
		k = r.qe - r.qb < r.te - r.tb? r.qe - r.qb : r.te - r.tb;
		diff = (double)(k * opt->sa - r.score) / opt->sb / k;
		//printf("%d:%.3f [%d,%d):%d <=> [%d,%d):%d\n", r.score, diff, r.qb, r.qe, p->len, r.tb, r.te, (int)str.l);
		// In each case an unaligned overhang worth more than one match plus one mismatch rejects the hit.
		if (r.qb <= r.tb && p->len - r.qe <= str->l - r.te) { // contained
			if (r.qb * opt->sa > opt->sa + opt->sb) continue;
			if ((p->len - r.qe) * opt->sa > opt->sa + opt->sb) continue;
			type = 1;
		} else if (r.qb <= r.tb) { // 3'-end overlap
			if (r.qb * opt->sa > opt->sa + opt->sb) continue;
			if ((str->l - r.te) * opt->sa > opt->sa + opt->sb) continue;
			type = 2;
		} else { // 5'-end overlap
			if ((p->len - r.qe) * opt->sa > opt->sa + opt->sb) continue;
			if (r.tb * opt->sa > opt->sa + opt->sb) continue;
			type = 3;
		}
		// A gap-free, mismatch-free hit that touches the relevant read end is promoted to "exact".
		if (p->type == 5) {
			if (r.tb == 0 && r.qe == p->len && (r.te - r.tb) * opt->sa == r.score)
				type = 4;
		} else if (p->type == 3) {
			if (r.qb == 0 && r.te == str->l && (r.te - r.tb) * opt->sa == r.score)
				type = 4;
		}
		if (type == 4) { // exact match
			if (r.te - r.tb < opt->min_len) continue;
		} else { // inexact match
			if (r.score < opt->min_sc || diff > opt->max_diff) continue;
		}
		__sync_fetch_and_add(&p->cnt, 1);
		if (p->type == 5) {
			// mask from the read start through the end of the adapter, clipped to the read length
			k = r.te + (p->len - r.qe);
			k = k < str->l? k : str->l;
			for (i = 0; i < k; ++i) seq[i] = 'X';
		} else if (p->type == 3) {
			// mask from where the adapter would start on the read to the read end
			k = r.tb > r.qb? r.tb - r.qb : 0;
			for (i = k; i < str->l; ++i) seq[i] = 'X';
		}
	}
	free(str->s);
}
static aln_v align_read(const kseq_t *read, const kseq_v targets, const align_config_t *conf) { kseq_t *r; const int32_t read_len = read->seq.l; aln_v result; kv_init(result); kv_resize(aln_t, result, kv_size(targets)); uint8_t *read_num = calloc(read_len, sizeof(uint8_t)); for(size_t k = 0; k < read_len; ++k) read_num[k] = conf->table[(int)read->seq.s[k]]; // Align to each target kswq_t *qry = NULL; for(size_t j = 0; j < kv_size(targets); j++) { // Encode target r = &kv_A(targets, j); uint8_t *ref_num = calloc(r->seq.l, sizeof(uint8_t)); for(size_t k = 0; k < r->seq.l; ++k) ref_num[k] = conf->table[(int)r->seq.s[k]]; aln_t aln; aln.target_idx = j; aln.loc = ksw_align(read_len, read_num, r->seq.l, ref_num, conf->m, conf->mat, conf->gap_o, conf->gap_e, KSW_XSTART, &qry); ksw_global(aln.loc.qe - aln.loc.qb + 1, &read_num[aln.loc.qb], aln.loc.te - aln.loc.tb + 1, &ref_num[aln.loc.tb], conf->m, conf->mat, conf->gap_o, conf->gap_e, 50, /* TODO: Magic number - band width */ &aln.n_cigar, &aln.cigar); aln.nm = 0; size_t qi = aln.loc.qb, ri = aln.loc.tb; for(size_t k = 0; k < aln.n_cigar; k++) { const int32_t oplen = bam_cigar_oplen(aln.cigar[k]), optype = bam_cigar_type(aln.cigar[k]); if(optype & 3) { // consumes both - check for mismatches for(size_t j = 0; j < oplen; j++) { if(UNLIKELY(read_num[qi + j] != ref_num[ri + j])) aln.nm++; } } else { aln.nm += oplen; } if(optype & 1) qi += oplen; if(optype & 2) ri += oplen; } kv_push(aln_t, result, aln); free(ref_num); } free(qry); free(read_num); ks_introsort(dec_score, kv_size(result), result.a); return result; }
int Aligner::getOffsetAgainstMaster(std::string& slaveDR, AlignerFlag_t& flags) { int slave_dr_length = static_cast<int>(slaveDR.length()); uint8_t* slave_dr_forward = new uint8_t[slave_dr_length+1]; uint8_t* slave_dr_reverse = new uint8_t[slave_dr_length+1]; prepareSlaveForAlignment(slaveDR, slave_dr_forward, slave_dr_reverse); // query profile kswq_t *slave_forward_query_profile = 0; kswq_t *slave_reverse_query_profile = 0; // alignment of slave against master kswr_t forward_return = ksw_align(slave_dr_length, slave_dr_forward, AL_masterDRLength, AL_masterDR, 5, AL_scoringMatrix, AL_gapOpening, AL_gapExtension, AL_xtra, &slave_forward_query_profile); kswr_t reverse_return = ksw_align(slave_dr_length, slave_dr_reverse, AL_masterDRLength, AL_masterDR, 5, AL_scoringMatrix, AL_gapOpening, AL_gapExtension, AL_xtra, &slave_reverse_query_profile); // free the query profile free(slave_forward_query_profile); free(slave_reverse_query_profile); delete slave_dr_reverse; delete slave_dr_forward; // figure out which alignment was better if (reverse_return.score == forward_return.score) { flags[score_equal] = true; return 0; } kswr_t best_alignment_info; if(reverse_return.score > forward_return.score) { best_alignment_info = reverse_return; flags[reversed] = true; } else { best_alignment_info = forward_return; } int min_query_seq_coverage = static_cast<int>(slave_dr_length / 2); // this is not the way you are supposed to use goto // actually you're never supposed to use goto // but you know what, I don't care! 
// http://xkcd.com/292/ if(min_query_seq_coverage > best_alignment_info.score) { logWarn("Alignment Warning: Slave Alignment Failure",4); logWarn("\tfailed query coverage test", 4); logWarn("\trequired: "<<min_query_seq_coverage, 4); goto FAILED; } if(best_alignment_info.score < AL_minAlignmentScore) { logWarn("Alignment Warning: Slave Alignment Failure",4); logWarn("\tfailed minimum score test", 4); goto FAILED; } //if (best_alignment_info.qb != 0 && best_alignment_info.qe != slave_dr_length - 1) { // logWarn("Alignment Warning: Slave Alignment Failure",4); // logWarn("\tfailed internal only test", 4); // goto FAILED; //} //std::cerr << best_alignment_info.tb - best_alignment_info.qb << " : "; //this->printAlignment(best_alignment_info, slaveDR, std::cerr); return best_alignment_info.tb - best_alignment_info.qb; FAILED: logWarn("\tmaster: "<<mStringCheck->getString(AL_masterDRToken) , 4); logWarn("\ttb: "<< best_alignment_info.tb, 4); logWarn("\tte: "<< best_alignment_info.te+1, 4); logWarn("\tslave: "<< slaveDR, 4); logWarn("\tqb: "<< best_alignment_info.qb, 4); logWarn("\tqe: "<< best_alignment_info.qe+1, 4); logWarn("\tscore: "<< best_alignment_info.score, 4); logWarn("\t2nd-score: "<< best_alignment_info.score2, 4); logWarn("\t2nd-te: "<< best_alignment_info.te2, 4); logWarn("\toffset: "<<best_alignment_info.tb - best_alignment_info.qb,4); logWarn("******", 4); flags[failed] = true; return 0; }
/**
 * Align an already-encoded read against a single target.
 *
 * @param target     target sequence; target->seq is encoded through conf->table
 * @param read_len   length of read_num
 * @param read_num   encoded read
 * @param qry        query profile handle passed through to ksw_align()
 * @param conf       alphabet size, scoring matrix, gap penalties, band width, encoding table
 * @param min_score  alignments scoring below this are returned without a CIGAR
 * @return alignment record. When the ksw_align() score is below min_score,
 *         cigar is NULL and n_cigar / nm are 0; otherwise cigar / n_cigar come
 *         from ksw_global() and nm counts mismatching positions inside
 *         operations that consume both sequences plus the full length of
 *         every other operation.
 *
 * NOTE (from the original authors; the corresponding check is disabled): the
 * CIGAR produced here is sometimes not the same length as the query sequence.
 * Fixing that would mean digging into ksw_align() and ksw_global(); the
 * discrepancy seemed to appear only in very poor matches, which are
 * subsequently ignored in partis/python/waterer.py, although it is not always
 * the lowest- or highest-scoring matches that are affected.
 */
static aln_t align_read_against_one(kseq_t *target, const int read_len, uint8_t *read_num, kswq_t **qry, const align_config_t *conf, const int min_score) {
  uint8_t *ref_num = calloc(target->seq.l, sizeof(uint8_t));
  for (size_t k = 0; k < target->seq.l; ++k)
    ref_num[k] = conf->table[(int)target->seq.s[k]];

  aln_t aln;
  // Give the fields a defined value on the early-return path as well.
  aln.cigar = NULL;
  aln.n_cigar = 0;
  aln.nm = 0;
  aln.loc = ksw_align(read_len, read_num, target->seq.l, ref_num,
                      conf->m, conf->mat, conf->gap_o, conf->gap_e,
                      KSW_XSTART, qry);
  aln.target_name = target->name.s;

  if (aln.loc.score < min_score) {
    free(ref_num);
    return aln;
  }

  ksw_global(aln.loc.qe - aln.loc.qb + 1, &read_num[aln.loc.qb],
             aln.loc.te - aln.loc.tb + 1, &ref_num[aln.loc.tb],
             conf->m, conf->mat, conf->gap_o, conf->gap_e,
             conf->bandwidth, &aln.n_cigar, &aln.cigar);

  aln.nm = 0;
  size_t qi = aln.loc.qb, ri = aln.loc.tb;
  for (int k = 0; k < aln.n_cigar; k++) {
    const int32_t oplen = bam_cigar_oplen(aln.cigar[k]),
                  optype = bam_cigar_type(aln.cigar[k]);
    // Bit 1 of optype: consumes query; bit 2: consumes reference (see the
    // qi/ri updates below). Both bits must be set for a base-to-base
    // comparison; `optype & 3` alone also matched operations consuming only
    // one sequence and compared unrelated bases for them.
    if ((optype & 3) == 3) { // consumes both - check for mismatches
      for (int j = 0; j < oplen; j++) {
        if (UNLIKELY(read_num[qi + j] != ref_num[ri + j]))
          aln.nm++;
      }
    } else {
      aln.nm += oplen;
    }
    if (optype & 1) qi += oplen;
    if (optype & 2) ri += oplen;
  }

  free(ref_num);
  return aln;
}
uint8_t* slave_dr_reverse = new uint8_t[slave_dr_length+1]; prepareSlaveForAlignment(slaveDR, slave_dr_forward, slave_dr_reverse); // query profile kswq_t *slave_forward_query_profile = 0; kswq_t *slave_reverse_query_profile = 0; // alignment of slave against master kswr_t forward_return = ksw_align(slave_dr_length, slave_dr_forward, AL_masterDRLength, AL_masterDR, 5, AL_scoringMatrix, AL_gapOpening, AL_gapExtension, AL_xtra, &slave_forward_query_profile); kswr_t reverse_return = ksw_align(slave_dr_length, slave_dr_reverse, AL_masterDRLength, AL_masterDR, 5, AL_scoringMatrix, AL_gapOpening, AL_gapExtension, AL_xtra,