/*
 * Sum the lengths of all CIGAR operations whose type equals 3, i.e. the
 * operations that consume both the query and the reference.
 *
 * n_cigar  number of packed CIGAR elements in `cigar`
 * cigar    packed CIGAR array (operation + length per element)
 *
 * Returns the accumulated length.
 */
int bam_cigar2matches(int n_cigar, const uint32_t *cigar)
{
    int total = 0;
    int i;

    for (i = 0; i < n_cigar; ++i) {
        /* Skip anything that does not consume both sequences. */
        if (bam_cigar_type(bam_cigar_op(cigar[i])) != 3)
            continue;
        total += bam_cigar_oplen(cigar[i]);
    }
    return total;
}
/*
 * Compute the unclipped read length implied by a CIGAR: the sum of the
 * lengths of every query-consuming operation plus every hard clip.
 *
 * n_cigar  number of packed CIGAR elements in `cigar`
 * cigar    packed CIGAR array
 *
 * Returns the accumulated length.
 */
int bam_cigar2ulen(int n_cigar, const uint32_t *cigar)
{
    int ulen = 0;
    int i;

    for (i = 0; i < n_cigar; ++i) {
        /* Query-consuming ops have bit 1 set; hard clips are counted too
         * even though they consume nothing. */
        const int counted = (bam_cigar_type(bam_cigar_op(cigar[i])) & 1)
                         || bam_cigar_op(cigar[i]) == BAM_CHARD_CLIP;
        if (counted)
            ulen += bam_cigar_oplen(cigar[i]);
    }
    return ulen;
}
void CigarParser::advance() { int type = bam_cigar_type(currentOp_); if (type & BAM_CONSUME_REFERENCE) { refPos_ += currentOpLen_; } if (type & BAM_CONSUME_QUERY) { readPos_ += currentOpLen_; } ++currentOpIdx_; assert(currentOpIdx_ < len_); currentOp_ = bam_cigar_op(cigar_[currentOpIdx_]); currentOpLen_ = bam_cigar_oplen(cigar_[currentOpIdx_]); }
/*
 * Compute the query and reference lengths described by a packed CIGAR.
 *
 * cigar_bin     packed CIGAR array (operation + length per element)
 * cigar_bin_sz  number of elements in cigar_bin
 * q_len         out: sum of lengths of operations that consume the query
 * r_len         out: sum of lengths of operations that consume the reference
 *
 * Returns the sum of the lengths of all operations, whatever their type.
 */
unsigned seq_lens_from_bin_cigar (uint32_t* cigar_bin, unsigned cigar_bin_sz, unsigned* q_len, unsigned* r_len)
{
    unsigned oplen, constype;
    uint32_t *sent;
    unsigned allen = 0;

    *q_len = *r_len = 0;
    for (sent = cigar_bin + cigar_bin_sz; cigar_bin != sent; ++cigar_bin)
    {
        oplen = bam_cigar_oplen (*cigar_bin);
        /* bam_cigar_type expects the bare opcode. Passing the whole packed
         * word makes the macro shift by a count that includes the length
         * bits, which is undefined once the count reaches the int width. */
        constype = bam_cigar_type (bam_cigar_op (*cigar_bin));
        if (constype & CONSUME_QRY) *q_len += oplen;
        if (constype & CONSUME_REF) *r_len += oplen;
        allen += oplen;
    }
    return allen;
}
// Returns 0 to indicate read should be output 1 otherwise static int process_aln(const bam_hdr_t *h, bam1_t *b, samview_settings_t* settings) { if (settings->remove_B) bam_remove_B(b); if (settings->min_qlen > 0) { int k, qlen = 0; uint32_t *cigar = bam_get_cigar(b); for (k = 0; k < b->core.n_cigar; ++k) if ((bam_cigar_type(bam_cigar_op(cigar[k]))&1) || bam_cigar_op(cigar[k]) == BAM_CHARD_CLIP) qlen += bam_cigar_oplen(cigar[k]); if (qlen < settings->min_qlen) return 1; } if (b->core.qual < settings->min_mapQ || ((b->core.flag & settings->flag_on) != settings->flag_on) || (b->core.flag & settings->flag_off)) return 1; if (settings->bed && (b->core.tid < 0 || !bed_overlap(settings->bed, h->target_name[b->core.tid], b->core.pos, bam_endpos(b)))) return 1; if (settings->subsam_frac > 0.) { uint32_t k = __ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(b)) ^ settings->subsam_seed); if ((double)(k&0xffffff) / 0x1000000 >= settings->subsam_frac) return 1; } if (settings->rghash) { uint8_t *s = bam_aux_get(b, "RG"); if (s) { khint_t k = kh_get(rg, settings->rghash, (char*)(s + 1)); if (k == kh_end(settings->rghash)) return 1; } } if (settings->library) { const char *p = bam_get_library((bam_hdr_t*)h, b); if (!p || strcmp(p, settings->library) != 0) return 1; } if (settings->remove_aux_len) { size_t i; for (i = 0; i < settings->remove_aux_len; ++i) { uint8_t *s = bam_aux_get(b, settings->remove_aux[i]); if (s) { bam_aux_del(b, s); } } } return 0; }
void add_bam_record_to_wiggle(const bam1_t *b, Wiggle& wiggle) { double w; if (no_fractional_weight) w = 1.0; else { uint8_t *p_tag = bam_aux_get(b, "ZW"); if (p_tag == NULL) return; w = bam_aux2f(p_tag); } int pos = b->core.pos; uint32_t *p = bam_get_cigar(b); for (int i = 0; i < (int)b->core.n_cigar; ++i, ++p) { char op = bam_cigar_op(*p); int op_len = bam_cigar_oplen(*p); if (op == BAM_CMATCH) for (int j = 0; j < op_len; ++j, ++pos) wiggle.read_depth[pos] += w; else pos += ((bam_cigar_type(op) & 2) ? op_len : 0); } }
unsigned alignment_bounds_from_bin_cigar (uint32_t* cigar_bin, unsigned cigar_bin_sz, uint8_t forward, unsigned qry_len, unsigned* q_beg, unsigned* q_end, unsigned* r_beg, unsigned* r_end) { unsigned oplen, op, constype; uint32_t *sent; *q_beg = *q_end = *r_beg = *r_end = 0; unsigned allen = 0; uint32_t tail = 0; for (sent = cigar_bin + cigar_bin_sz; cigar_bin != sent; ++cigar_bin) { oplen = bam_cigar_oplen (*cigar_bin); op = bam_cigar_op (*cigar_bin); constype = bam_cigar_type (*cigar_bin); if (tail && (op == BAM_CHARD_CLIP || op == BAM_CSOFT_CLIP)) // the aligned zone ended, clip started. Note that tail indels are not valid, so we do not assume they are possible.. break; if (op != BAM_CHARD_CLIP && op != BAM_CSOFT_CLIP) tail = 1; if (constype & CONSUME_QRY) { if (!tail) *q_beg += oplen; *q_end += oplen; } if (constype & CONSUME_REF) { if (!tail) *r_beg += oplen; *r_end += oplen; } allen += oplen; } if (!forward) { unsigned tmp = qry_len - *q_beg; *q_beg = qry_len - *q_end; *q_end = tmp; } return allen; }
/* Build a node describing the InDel that begins at a given CIGAR operation.
 *
 * b             The input read
 * cigar_op_num  The operation number of the Insertion/Deletion
 *
 * The start is the read position advanced by every reference-consuming
 * operation that precedes cigar_op_num. The end is extended over the run of
 * consecutive insertion/deletion operations beginning at cigar_op_num.
 *
 * returns a node, that must be either inserted into the linked list or
 * destroyed with destroyNode()
 *
 * NOTE: the calloc() result is used without a NULL check.
 */
InDel *makeNode(bam1_t *b, int cigar_op_num) {
    uint32_t *cigar = bam_get_cigar(b);
    int32_t start = b->core.pos-1;
    int32_t end;
    int i, op, oplen, type;
    InDel *node;

    /* Walk the operations before the InDel, tracking the reference offset */
    for(i=0; i<cigar_op_num; i++) {
        type = bam_cigar_type(bam_cigar_op(cigar[i]));
        if(!(type & 2)) continue;
        oplen = bam_cigar_oplen(cigar[i]);
        start += oplen;
    }
    ++start;
    end = start;

    /* Extend the end across the contiguous run of I (1) / D (2) operations */
    for(i=cigar_op_num; i<b->core.n_cigar; i++) {
        op = bam_cigar_op(cigar[i]);
        if(op != 1 && op != 2) break;
        oplen = bam_cigar_oplen(cigar[i]);
        if(end <= start+oplen) end = start+oplen;
    }

    /* Make the node */
    node = calloc(1, sizeof(InDel));
    node->tid = b->core.tid;
    node->start = start;
    node->end = end;
    node->count = 1;

    return node;
}
static aln_v align_read(const kseq_t *read, const kseq_v targets, const align_config_t *conf) { kseq_t *r; const int32_t read_len = read->seq.l; aln_v result; kv_init(result); kv_resize(aln_t, result, kv_size(targets)); uint8_t *read_num = calloc(read_len, sizeof(uint8_t)); for(size_t k = 0; k < read_len; ++k) read_num[k] = conf->table[(int)read->seq.s[k]]; // Align to each target kswq_t *qry = NULL; for(size_t j = 0; j < kv_size(targets); j++) { // Encode target r = &kv_A(targets, j); uint8_t *ref_num = calloc(r->seq.l, sizeof(uint8_t)); for(size_t k = 0; k < r->seq.l; ++k) ref_num[k] = conf->table[(int)r->seq.s[k]]; aln_t aln; aln.target_idx = j; aln.loc = ksw_align(read_len, read_num, r->seq.l, ref_num, conf->m, conf->mat, conf->gap_o, conf->gap_e, KSW_XSTART, &qry); ksw_global(aln.loc.qe - aln.loc.qb + 1, &read_num[aln.loc.qb], aln.loc.te - aln.loc.tb + 1, &ref_num[aln.loc.tb], conf->m, conf->mat, conf->gap_o, conf->gap_e, 50, /* TODO: Magic number - band width */ &aln.n_cigar, &aln.cigar); aln.nm = 0; size_t qi = aln.loc.qb, ri = aln.loc.tb; for(size_t k = 0; k < aln.n_cigar; k++) { const int32_t oplen = bam_cigar_oplen(aln.cigar[k]), optype = bam_cigar_type(aln.cigar[k]); if(optype & 3) { // consumes both - check for mismatches for(size_t j = 0; j < oplen; j++) { if(UNLIKELY(read_num[qi + j] != ref_num[ri + j])) aln.nm++; } } else { aln.nm += oplen; } if(optype & 1) qi += oplen; if(optype & 2) ri += oplen; } kv_push(aln_t, result, aln); free(ref_num); } free(qry); free(read_num); ks_introsort(dec_score, kv_size(result), result.a); return result; }
/**
 * Hard-clip bases off one end of a read and rewrite its CIGAR to match.
 *
 * The number of bases to clip (aclip) is the larger of the integer values in
 * the "as" and "a3" aux fields (a missing field counts as 0); presumably these
 * are adapter-match lengths -- confirm against the code that writes them.
 * Nothing is changed when aclip is 0.
 *
 * For a read that is not reverse the last aclip decoded bases are removed;
 * for a reverse read the first aclip decoded bases are removed. The removed
 * bases and qualities are stored in the "qs" and "qq" aux fields. For mapped
 * reads the CIGAR gets a hard clip covering the removed bases, and for
 * reverse reads the position is moved forward by the reference bases clipped.
 *
 * @param algn   alignment to modify in place
 * @param R      buffer receiving the decoded read sequence
 * @param Q      buffer receiving the decoded qualities
 * @param seqenc encode table handed to replaceSequence()
 * @param cigop  buffer receiving (and used to rewrite) the CIGAR operations
 * @param T      scratch array handed to replaceCigarString()/replaceSequence()
 * @return always true
 */
bool clipAdapters(
	libmaus2::bambam::BamAlignment & algn,
	libmaus2::autoarray::AutoArray<char> & R,
	libmaus2::autoarray::AutoArray<char> & Q,
	libmaus2::bambam::BamSeqEncodeTable const & seqenc,
	libmaus2::autoarray::AutoArray<libmaus2::bambam::cigar_operation> & cigop,
	libmaus2::bambam::BamAlignment::D_array_type & T
)
{
	// a3,as
	uint64_t const asclip = algn.hasAux("as") ? algn.getAuxAsNumber<int>("as") : 0;
	uint64_t const a3clip = algn.hasAux("a3") ? algn.getAuxAsNumber<int>("a3") : 0;
	uint64_t const aclip = std::max(asclip,a3clip);
	bool const reverse = algn.isReverse();

	if ( aclip )
	{
		uint64_t const len = algn.decodeRead(R);
		algn.decodeQual(Q);

		// NOTE(review): len and aclip are unsigned, so this also passes
		// (via wraparound) when aclip > len -- confirm that cannot happen.
		if ( (len - aclip) > 1 )
		{
			if ( algn.isMapped() )
			{
				uint32_t const numcigop = algn.getCigarOperations(cigop);

				// make sure there is one spare slot: the rewrite below may
				// store the new hard clip at position numcigop
				if ( numcigop == cigop.size() )
					cigop.resize(numcigop+1);

				// for reverse reads flip the operations so that the clipped
				// end is always at the back of the array
				if ( reverse )
				{
					std::reverse(cigop.begin(),cigop.begin()+numcigop);
				}

				// can't just add a HC to the cigar
				uint32_t index;
				uint32_t hardclip = 0; // length of type-0 operations absorbed into the new clip
				uint32_t cig_type;
				int32_t left = aclip;  // query bases still to be clipped
				int32_t repos = 0;     // reference bases covered by the clipped operations

				// walk backwards from the last operation; the loop never
				// examines index 0, which is only inspected after the loop
				// NOTE(review): numcigop == 0 would wrap numcigop - 1 -- confirm
				// mapped reads always have at least one operation.
				for ( index = numcigop - 1; index > 0; index-- )
				{
					cig_type = bam_cigar_type(cigop[index].first);

					if ( cig_type == 0 )
					{
						// neither bit 1 nor bit 2 set: fold into the hard clip
						hardclip += cigop[index].second;
					}
					else
					{
						// bit 1: presumably "consumes query" (htslib convention)
						if ( cig_type & 1 )
						{
							if ( cigop[index].second < left )
							{
								// whole operation is clipped away
								left -= cigop[index].second;
							}
							else
							{
								// this operation holds the clip boundary
								break;
							}
						}

						if ( cig_type & 2 )
						{
							// move pos if reversed
							repos += cigop[index].second;
						}
					}
				}

				// type of the operation the walk stopped on
				cig_type = bam_cigar_type(cigop[index].first);

				// shorten the boundary operation and step past it; when it is
				// consumed exactly, it is overwritten by the hard clip instead
				if ( cigop[index].second != left )
				{
					cigop[index++].second -= left;
				}

				cigop[index] = libmaus2::bambam::cigar_operation(libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CHARD_CLIP, aclip + hardclip);

				// drop the operations that were replaced by the hard clip
				if ( numcigop > index + 1 )
					cigop.resize(index + 1);

				if ( reverse )
				{
					// restore the original operation order
					std::reverse(cigop.begin(),cigop.begin() + index + 1);

					// account for the last possible pos move
					if ( cig_type & 2 )
						repos += left;

					if ( repos )
					{
						// clipping has moved the pos point
						algn.putPos(algn.getPos() + repos);
					}
				}

				algn.replaceCigarString(cigop.begin(),index + 1,T);
			}

			if ( !reverse )
			{
				// keep the first len-aclip bases, stash the removed tail
				algn.replaceSequence(seqenc,R.begin(),Q.begin(),len-aclip,T);
				algn.putAuxString("qs",std::string(R.begin()+(len-aclip),R.begin()+len));
				algn.putAuxString("qq",std::string(Q.begin()+(len-aclip),Q.begin()+len));
			}
			else
			{
				// keep the last len-aclip bases, stash the removed head
				algn.replaceSequence(seqenc, (R.begin() + aclip), (Q.begin() + aclip), len - aclip, T);
				algn.putAuxString("qs", std::string(R.begin(), R.begin() + aclip));
				algn.putAuxString("qq", std::string(Q.begin(), Q.begin() + aclip));
			}
		}
	}

	return true;
}
/*
 * Align an already-encoded read against a single target.
 *
 * ksw_align() finds the best local hit. If its score is below min_score the
 * alignment is returned immediately with cigar == NULL. Otherwise
 * ksw_global() aligns the hit's query and target intervals to produce the
 * CIGAR, from which the edit distance (nm) is computed: mismatching bases in
 * operations that consume both sequences, plus the full length of every
 * other operation.
 *
 * target     target sequence (its name is referenced, not copied)
 * read_len   length of read_num
 * read_num   read encoded through conf->table
 * qry        query profile cache passed through to ksw_align()
 * conf       scoring matrix, gap penalties, bandwidth and encoding table
 * min_score  minimum local score required to compute a CIGAR
 *
 * Returns the alignment; a non-NULL cigar was allocated by ksw_global() and
 * is handed to the caller.
 */
static aln_t align_read_against_one(kseq_t *target, const int read_len, uint8_t *read_num, kswq_t **qry, const align_config_t *conf, const int min_score) {
  uint8_t *ref_num = calloc(target->seq.l, sizeof(uint8_t));
  for (size_t k = 0; k < target->seq.l; ++k)
    ref_num[k] = conf->table[(int)target->seq.s[k]];

  aln_t aln;
  aln.cigar = NULL;
  /* Give the fields a defined value on the early-return path as well. */
  aln.n_cigar = 0;
  aln.nm = 0;
  aln.loc = ksw_align(read_len, read_num, target->seq.l, ref_num, conf->m,
                      conf->mat, conf->gap_o, conf->gap_e, KSW_XSTART, qry);
  aln.target_name = target->name.s;

  if (aln.loc.score < min_score) {
    free(ref_num);
    return aln;
  }

  ksw_global(aln.loc.qe - aln.loc.qb + 1, &read_num[aln.loc.qb],
             aln.loc.te - aln.loc.tb + 1, &ref_num[aln.loc.tb], conf->m,
             conf->mat, conf->gap_o, conf->gap_e, conf->bandwidth,
             &aln.n_cigar, &aln.cigar);

  aln.nm = 0;
  size_t qi = aln.loc.qb, ri = aln.loc.tb;
  for (int k = 0; k < aln.n_cigar; k++) {
    const int32_t oplen = bam_cigar_oplen(aln.cigar[k]),
                  /* bam_cigar_type takes the opcode, not the packed word */
                  optype = bam_cigar_type(bam_cigar_op(aln.cigar[k]));

    if (optype == 3) { // consumes both - check for mismatches
      for (int j = 0; j < oplen; j++) {
        if (UNLIKELY(read_num[qi + j] != ref_num[ri + j])) aln.nm++;
      }
    } else {
      /* Insertions/deletions consume only one sequence: every base in them
       * is an edit, and comparing read against reference here would index
       * past the aligned interval. */
      aln.nm += oplen;
    }
    if (optype & 1) qi += oplen;
    if (optype & 2) ri += oplen;
  }
  free(ref_num);

  // Disabled consistency check, kept for reference:
  //
  // size_t cigar_len = aln.loc.qb;
  // for (int c = 0; c < aln.n_cigar; c++) {
  //   int32_t length = (0xfffffff0 & *(aln.cigar + c)) >> 4;
  //   cigar_len += length;
  // }
  // cigar_len += read_len - aln.loc.qe - 1;
  // if(cigar_len != (size_t)read_len) {
  //   /* printf("[ig_align] Error: cigar length (score %d) not equal to read length for XXX (target %s): %zu vs %d\n", aln.loc.score, target->name.s, cigar_len, read_len); */
  //   // NOTE:
  //   // It is *really* scary that it's spitting out cigars that are not the
  //   // same length as the query sequence.
  //   // Nonetheless, fixing it seems to involve delving into the depths of
  //   // ksw_align() and ksw_global(), which would be very time consuming, and
  //   // the length discrepancy seems to only appear in very poor matches.
  //   // I.e., poor enough that we will subsequently ignore them in
  //   // partis/python/waterer.py, so it seems to not screw anything up
  //   // downstream to just set the length-discrepant matches' scores to zero,
  //   // such that ig-sw doesn't write them to its sam output.
  //   // Note also that it is not always the lowest- or highest-scoring matches
  //   // that have discrepant lengths (i.e. setting their scores to zero
  //   // promotes matches with poorer scores, but which do not have discrepant
  //   // lengths).
  //   /* aln.loc.score = 0; */
  //   aln.cigar = NULL;
  // }

  return aln;
}