/** * @brief Debug dump, optional */ void KlibAlignment::update() { _impl->result = ksw_align( _impl->reflen, _impl->ref.get(), _impl->altlen, _impl->alt.get(), 5, _impl->mat, _impl->gapo, _impl->gape, KSW_XSTART, // add flags here &(_impl->qprofile)); if(_impl->cigar) { free(_impl->cigar); _impl->cigar = NULL; _impl->cigar_len = 0; } ksw_global( _impl->result.qe - _impl->result.qb + 1, _impl->ref.get() + _impl->result.qb, _impl->result.te - _impl->result.tb + 1, _impl->alt.get() + _impl->result.tb, 5, _impl->mat, _impl->gapo, _impl->gape, _impl->altlen, &_impl->cigar_len, &_impl->cigar); _impl->valid_result = true; }
static aln_v align_read(const kseq_t *read, const kseq_v targets, const align_config_t *conf) { kseq_t *r; const int32_t read_len = read->seq.l; aln_v result; kv_init(result); kv_resize(aln_t, result, kv_size(targets)); uint8_t *read_num = calloc(read_len, sizeof(uint8_t)); for(size_t k = 0; k < read_len; ++k) read_num[k] = conf->table[(int)read->seq.s[k]]; // Align to each target kswq_t *qry = NULL; for(size_t j = 0; j < kv_size(targets); j++) { // Encode target r = &kv_A(targets, j); uint8_t *ref_num = calloc(r->seq.l, sizeof(uint8_t)); for(size_t k = 0; k < r->seq.l; ++k) ref_num[k] = conf->table[(int)r->seq.s[k]]; aln_t aln; aln.target_idx = j; aln.loc = ksw_align(read_len, read_num, r->seq.l, ref_num, conf->m, conf->mat, conf->gap_o, conf->gap_e, KSW_XSTART, &qry); ksw_global(aln.loc.qe - aln.loc.qb + 1, &read_num[aln.loc.qb], aln.loc.te - aln.loc.tb + 1, &ref_num[aln.loc.tb], conf->m, conf->mat, conf->gap_o, conf->gap_e, 50, /* TODO: Magic number - band width */ &aln.n_cigar, &aln.cigar); aln.nm = 0; size_t qi = aln.loc.qb, ri = aln.loc.tb; for(size_t k = 0; k < aln.n_cigar; k++) { const int32_t oplen = bam_cigar_oplen(aln.cigar[k]), optype = bam_cigar_type(aln.cigar[k]); if(optype & 3) { // consumes both - check for mismatches for(size_t j = 0; j < oplen; j++) { if(UNLIKELY(read_num[qi + j] != ref_num[ri + j])) aln.nm++; } } else { aln.nm += oplen; } if(optype & 1) qi += oplen; if(optype & 2) ri += oplen; } kv_push(aln_t, result, aln); free(ref_num); } free(qry); free(read_num); ks_introsort(dec_score, kv_size(result), result.a); return result; }
/**
 * Align an already-encoded read against a single target.
 *
 * The target is translated through conf->table, ksw_align() (with KSW_XSTART)
 * locates the hit, and - unless the hit scores below `min_score` -
 * ksw_global() produces a CIGAR for the [qb, qe] x [tb, te] window, which is
 * then walked to compute aln.nm:
 *   - operations that consume both read and target add one per differing base;
 *   - every other operation adds its full length.
 *
 * @param target     Target sequence; target->seq and target->name.s are used.
 * @param read_len   Number of entries in `read_num`.
 * @param read_num   Read, already translated to numeric codes.
 * @param qry        Forwarded unchanged to ksw_align().
 * @param conf       Supplies the encoding table and the m / mat / gap_o /
 *                   gap_e / bandwidth values.
 * @param min_score  Hits scoring below this get no CIGAR.
 * @return The alignment record. aln.target_name points at target->name.s (it
 *         is not copied). When the score is below `min_score`, aln.cigar is
 *         NULL and aln.n_cigar / aln.nm are 0.
 */
static aln_t align_read_against_one(kseq_t *target, const int read_len, uint8_t *read_num, kswq_t **qry, const align_config_t *conf, const int min_score)
{
  /* Encode the target. */
  uint8_t *ref_num = calloc(target->seq.l, sizeof(uint8_t));
  for (size_t k = 0; k < target->seq.l; ++k)
    ref_num[k] = conf->table[(int)target->seq.s[k]];

  aln_t aln;
  /* Give every field this function is responsible for a defined value up
   * front, so the early return below never hands back indeterminate data. */
  aln.cigar = NULL;
  aln.n_cigar = 0;
  aln.nm = 0;

  aln.loc = ksw_align(read_len, read_num, target->seq.l, ref_num, conf->m,
                      conf->mat, conf->gap_o, conf->gap_e, KSW_XSTART, qry);
  aln.target_name = target->name.s;

  /* Too weak a hit: skip the global alignment entirely. */
  if (aln.loc.score < min_score) {
    free(ref_num);
    return aln;
  }

  /* NOTE(review): qb / tb are used as array offsets without checking that
   * they are non-negative - confirm ksw_align() always sets them. */
  ksw_global(aln.loc.qe - aln.loc.qb + 1, &read_num[aln.loc.qb],
             aln.loc.te - aln.loc.tb + 1, &ref_num[aln.loc.tb], conf->m,
             conf->mat, conf->gap_o, conf->gap_e, conf->bandwidth,
             &aln.n_cigar, &aln.cigar);

  /* Walk the CIGAR, tracking the current read (qi) and target (ri) positions,
   * to count edits. */
  size_t qi = aln.loc.qb, ri = aln.loc.tb;
  for (int k = 0; k < aln.n_cigar; k++) {
    const int32_t oplen = bam_cigar_oplen(aln.cigar[k]),
                  optype = bam_cigar_type(aln.cigar[k]);

    /* Bit 1 = advances the read, bit 2 = advances the target (see the qi / ri
     * updates below). Only when BOTH are set is there a base-to-base pairing
     * to compare. Testing `optype & 3` alone would also send insertions and
     * deletions through the comparison loop, mis-counting them and indexing
     * past the aligned window of the sequence that is not consumed. */
    if ((optype & 3) == 3) {  // consumes both - check for mismatches
      for (int j = 0; j < oplen; j++) {
        if (UNLIKELY(read_num[qi + j] != ref_num[ri + j])) aln.nm++;
      }
    } else {
      /* Consumes only one side (or neither): the whole run counts. */
      aln.nm += oplen;
    }
    if (optype & 1) qi += oplen;
    if (optype & 2) ri += oplen;
  }

  free(ref_num);

  /*
   * Disabled sanity check, kept for reference. It added up aln.loc.qb, the
   * length of every CIGAR operation, and the unaligned read tail, and
   * compared the total with read_len:
   *
   *   size_t cigar_len = aln.loc.qb;
   *   for (int c = 0; c < aln.n_cigar; c++) {
   *     int32_t length = (0xfffffff0 & *(aln.cigar + c)) >> 4;
   *     cigar_len += length;
   *   }
   *   cigar_len += read_len - aln.loc.qe - 1;
   *   if (cigar_len != (size_t)read_len) {
   *     // printf("[ig_align] Error: cigar length (score %d) not equal to read length for XXX (target %s): %zu vs %d\n", aln.loc.score, target->name.s, cigar_len, read_len);
   *     // aln.loc.score = 0;
   *     aln.cigar = NULL;
   *   }
   *
   * NOTE:
   * It is genuinely alarming that this can produce CIGARs whose length does
   * not equal the length of the query sequence. Nonetheless, fixing it seems
   * to involve delving into the depths of ksw_align() and ksw_global(), which
   * would be very time consuming, and the length discrepancy seems to only
   * appear in very poor matches - i.e. poor enough that they are subsequently
   * ignored in partis/python/waterer.py. So it seems not to break anything
   * downstream to just set the length-discrepant matches' scores to zero,
   * such that ig-sw doesn't write them to its SAM output.
   * Note also that it is not always the lowest- or highest-scoring matches
   * that have discrepant lengths (i.e. setting their scores to zero promotes
   * matches with poorer scores, but which do not have discrepant lengths).
   */

  return aln;
}