/*
 * Convert a padded alignment stream into an unpadded one (legacy bamFile API).
 *
 * in   Input stream; the header is read from it first, then every record.
 * out  Output stream; receives the header followed by every rewritten record.
 *
 * A record with pos == 0, tid >= 0 and a query name equal to its target name
 * is treated as the embedded (padded) reference: it is unpadded into `r`, its
 * CIGAR becomes a single M of l_qseq bases, and `posmap` is rebuilt so that
 * posmap[i] is the number of non-pad reference bases before padded column i.
 * Every other record is unpadded into `q`, compared column by column against
 * `r` to derive M/I/D/P operators, and has its position remapped via posmap.
 *
 * Always returns 0.
 *
 * NOTE(review): a read that arrives before any embedded reference would index
 * `r.s` and `posmap` while they are still NULL; the realloc() result is also
 * used unchecked. Both are unchanged from the original.
 */
int bam_pad2unpad(bamFile in, bamFile out)
{
    bam_header_t *h;
    bam1_t *b;
    kstring_t r, q;
    uint32_t *cigar2 = 0;   /* scratch CIGAR; grown through write_cigar() */
    int n2 = 0, m2 = 0, *posmap = 0;

    h = bam_header_read(in);
    bam_header_write(out, h);
    b = bam_init1();
    r.l = r.m = q.l = q.m = 0; r.s = q.s = 0;
    while (bam_read1(in, b) >= 0) {
        uint32_t *cigar = bam1_cigar(b);
        n2 = 0;
        if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam1_qname(b), h->target_name[b->core.tid]) == 0) {
            /* Embedded reference record */
            int i, k;
            unpad_seq(b, &r);
            write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH));
            replace_cigar(b, n2, cigar2);
            posmap = realloc(posmap, r.m * sizeof(int));
            for (i = k = 0; i < r.l; ++i) {
                posmap[i] = k; // note that a read should NOT start at a padding
                if (r.s[i]) ++k;
            }
        } else {
            /* Ordinary read aligned against the current padded reference */
            int i, k, op;
            unpad_seq(b, &q);
            /* Keep a leading soft clip as-is */
            if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) {
                write_cigar(cigar2, n2, m2, cigar[0]);
            }
            /* Classify each padded column: base/gap in the read vs. base/gap in the reference */
            for (i = 0, k = b->core.pos; i < q.l; ++i, ++k)
                q.s[i] = q.s[i]? (r.s[k]? BAM_CMATCH : BAM_CINS) : (r.s[k]? BAM_CDEL : BAM_CPAD);
            /* Run-length encode the per-column operators */
            for (i = k = 1, op = q.s[0]; i < q.l; ++i) {
                if (op != q.s[i]) {
                    write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
                    op = q.s[i]; k = 1;
                } else ++k;
            }
            write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
            /* Keep a trailing soft clip as-is */
            if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CSOFT_CLIP) {
                write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]);
            }
            /* Collapse M-P-M into a single M; the emptied slots are zeroed */
            for (i = 2; i < n2; ++i)
                if (bam_cigar_op(cigar2[i]) == BAM_CMATCH && bam_cigar_op(cigar2[i-1]) == BAM_CPAD && bam_cigar_op(cigar2[i-2]) == BAM_CMATCH)
                    cigar2[i] += cigar2[i-2], cigar2[i-2] = cigar2[i-1] = 0;
            /* Compact away the zeroed slots */
            for (i = k = 0; i < n2; ++i)
                if (cigar2[i]) cigar2[k++] = cigar2[i];
            n2 = k;
            replace_cigar(b, n2, cigar2);
            b->core.pos = posmap[b->core.pos];
        }
        bam_write1(out, b);
    }
    free(r.s); free(q.s); free(posmap);
    free(cigar2);   /* previously leaked; free(NULL) is a no-op if nothing was written */
    bam_destroy1(b);
    bam_header_destroy(h);
    return 0;
}
/*
 * Sum the lengths of all CIGAR operations that either consume query bases
 * (bit 0 of bam_cigar_type) or are hard clips.
 *
 * n_cigar  Number of packed CIGAR elements in `cigar`
 * cigar    Packed CIGAR array
 *
 * Returns the accumulated length; 0 when n_cigar <= 0.
 */
int bam_cigar2ulen(int n_cigar, const uint32_t *cigar)
{
    int total = 0;
    int idx = 0;

    while (idx < n_cigar) {
        const uint32_t elem = cigar[idx++];
        const int op = bam_cigar_op(elem);

        if ((bam_cigar_type(op) & 1) || op == BAM_CHARD_CLIP)
            total += bam_cigar_oplen(elem);
    }
    return total;
}
/*
 * Sum the lengths of the CIGAR operations whose bam_cigar_type is 3, i.e.
 * those that consume both the query and the reference.
 *
 * n_cigar  Number of packed CIGAR elements in `cigar`
 * cigar    Packed CIGAR array
 *
 * Returns the accumulated length; 0 when n_cigar <= 0.
 */
int bam_cigar2matches(int n_cigar, const uint32_t *cigar)
{
    int matched = 0;
    int idx;

    for (idx = 0; idx < n_cigar; ++idx) {
        const uint32_t elem = cigar[idx];

        if (bam_cigar_type(bam_cigar_op(elem)) != 3)
            continue;
        matched += bam_cigar_oplen(elem);
    }
    return matched;
}
/*
 * Build a parser positioned on the first element of a packed CIGAR array.
 *
 * cigar  Packed CIGAR array; the pointer is stored, not copied
 * len    Number of elements in `cigar`
 *
 * Read and reference positions start at 0 and the current operation/length
 * are decoded from cigar[0].
 *
 * NOTE(review): `*cigar` is dereferenced unconditionally, so this presumably
 * expects a non-null pointer and len >= 1 - confirm against callers.
 */
CigarParser::CigarParser(uint32_t const* cigar, int len)
    : cigar_(cigar)
    , len_(len)
    , readPos_(0)
    , refPos_(0)
    , currentOpIdx_(0)
    , currentOp_(bam_cigar_op(*cigar))        // operation code of the first element
    , currentOpLen_(bam_cigar_oplen(*cigar))  // length of the first element
    , started_(false)
{
}
/* Finds InDels in a BAM or CRAM file, adding them to the linked list
 *
 * fp       Input BAM/CRAM file
 * hdr      The header for the BAM/CRAM file
 * minMAPQ  Reads with a mapping quality below this value are skipped
 * k        The K-mer size
 *
 * discussion The linked list will need to be destroyed with destroyNodes()
 *
 * After all reads are processed, neighbouring nodes for which
 * TargetNodeCmp() returns 0 are merged: the earlier node takes the later
 * node's end, their counts are added (clamped at 0xFFFFFFFF) and the later
 * node is removed. If makeNode() fails, reading stops and the merge pass is
 * skipped.
 */
void findInDels(htsFile *fp, bam_hdr_t *hdr, int minMAPQ, int k) {
    bam1_t *b = bam_init1();
    int i, op;
    InDel *node;
    uint32_t *cigar;

    if(b == NULL) return;

    //sam_read1() signals success with any value >= 0 (negative is EOF/error)
    while(sam_read1(fp, hdr, b) >= 0) {
        if(b->core.qual < minMAPQ) continue;
        cigar = bam_get_cigar(b);
        for(i=0; i<b->core.n_cigar; i++) {
            op = bam_cigar_op(cigar[i]);
            if(op == 1 || op == 2) { //I or D
                node = makeNode(b, i);
                if(node == NULL) goto quit;
                insertNode(node, k);
                while(++i < b->core.n_cigar) { //Skip adjacent D/I operations
                    op = bam_cigar_op(cigar[i]);
                    if(op != 1 && op != 2) break;
                }
            }
        }
    }

    //Ensure that all ROIs are at least k apart
    lastTargetNode = firstTargetNode->next;
    //The NULL test covers a list that holds no InDel nodes at all
    while(lastTargetNode && lastTargetNode->next) {
        i = TargetNodeCmp(lastTargetNode,lastTargetNode->next, k);
        assert(i<=0);
        if(i==0) {
            lastTargetNode->end = lastTargetNode->next->end;
            //Saturating add: the original added 0xFFFFFFFF on overflow, which
            //wraps around instead of clamping the count
            if(lastTargetNode->count+lastTargetNode->next->count > lastTargetNode->count) {
                lastTargetNode->count += lastTargetNode->next->count;
            } else {
                lastTargetNode->count = 0xFFFFFFFF;
            }
            removeNode(lastTargetNode->next);
        } else {
            lastTargetNode = lastTargetNode->next;
        }
    }

quit:
    bam_destroy1(b);
}
// Returns 0 to indicate read should be output 1 otherwise static int process_aln(const bam_hdr_t *h, bam1_t *b, samview_settings_t* settings) { if (settings->remove_B) bam_remove_B(b); if (settings->min_qlen > 0) { int k, qlen = 0; uint32_t *cigar = bam_get_cigar(b); for (k = 0; k < b->core.n_cigar; ++k) if ((bam_cigar_type(bam_cigar_op(cigar[k]))&1) || bam_cigar_op(cigar[k]) == BAM_CHARD_CLIP) qlen += bam_cigar_oplen(cigar[k]); if (qlen < settings->min_qlen) return 1; } if (b->core.qual < settings->min_mapQ || ((b->core.flag & settings->flag_on) != settings->flag_on) || (b->core.flag & settings->flag_off)) return 1; if (settings->bed && (b->core.tid < 0 || !bed_overlap(settings->bed, h->target_name[b->core.tid], b->core.pos, bam_endpos(b)))) return 1; if (settings->subsam_frac > 0.) { uint32_t k = __ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(b)) ^ settings->subsam_seed); if ((double)(k&0xffffff) / 0x1000000 >= settings->subsam_frac) return 1; } if (settings->rghash) { uint8_t *s = bam_aux_get(b, "RG"); if (s) { khint_t k = kh_get(rg, settings->rghash, (char*)(s + 1)); if (k == kh_end(settings->rghash)) return 1; } } if (settings->library) { const char *p = bam_get_library((bam_hdr_t*)h, b); if (!p || strcmp(p, settings->library) != 0) return 1; } if (settings->remove_aux_len) { size_t i; for (i = 0; i < settings->remove_aux_len; ++i) { uint8_t *s = bam_aux_get(b, settings->remove_aux[i]); if (s) { bam_aux_del(b, s); } } } return 0; }
// Adapted from samtools/bam.c int32_t b2g_bam_clippedlength(bam1_t *b) { const bam1_core_t c = b->core; const uint32_t *cigar = bam1_cigar(b); uint32_t k; int32_t l = 0; for (k = 0; k < c.n_cigar; ++k) { if ('S' == bam_cigar_opchr(bam_cigar_op(cigar[k]))) { l += bam_cigar_oplen(cigar[k]); } } return l; }
/* Make a node containing an InDel
 *
 * b            The input read
 * cigar_op_num The operation number of the Insertion/Deletion
 *
 * returns a node, that must be either inserted into the linked list or
 *         destroyed with destroyNode(). NULL is returned if memory for the
 *         node could not be allocated.
 *
 * start is the read position advanced by every reference-consuming operation
 * before cigar_op_num. end starts equal to start and grows to start+oplen for
 * the longest operation in the run of consecutive I/D operations beginning at
 * cigar_op_num.
 */
InDel *makeNode(bam1_t *b, int cigar_op_num) {
    int i, op, oplen, quit = 0, type;
    int32_t start = b->core.pos-1;
    int32_t end;
    uint32_t *cigar = bam_get_cigar(b);
    InDel *node;

    //Walk the reference forward over everything preceding the InDel
    for(i=0; i<cigar_op_num; i++) {
        oplen = bam_cigar_oplen(cigar[i]);
        type = bam_cigar_type(bam_cigar_op(cigar[i]));
        if(type & 2) start += oplen;
    }
    end = ++start;

    //Extend the end over the run of adjacent I/D operations
    for(i=cigar_op_num; i<b->core.n_cigar; i++) {
        op = bam_cigar_op(cigar[i]);
        oplen = bam_cigar_oplen(cigar[i]);
        switch(op) {
        case 1: //I
        case 2: //D
            end = (end>start+oplen) ? end : start+oplen;
            break;
        default :
            quit = 1;
            break;
        }
        if(quit) break;
    }

    //Make the node
    node = calloc(1, sizeof *node);
    if(node == NULL) return NULL; //Callers already test for a NULL node
    node->tid = b->core.tid;
    node->start = start;
    node->end = end;
    node->count = 1;
    return node;
}
void CigarParser::advance() { int type = bam_cigar_type(currentOp_); if (type & BAM_CONSUME_REFERENCE) { refPos_ += currentOpLen_; } if (type & BAM_CONSUME_QUERY) { readPos_ += currentOpLen_; } ++currentOpIdx_; assert(currentOpIdx_ < len_); currentOp_ = bam_cigar_op(cigar_[currentOpIdx_]); currentOpLen_ = bam_cigar_oplen(cigar_[currentOpIdx_]); }
/*
 * Expand a read into one byte per aligned (padded) column.
 *
 * b  Alignment whose CIGAR may contain only M, D and S (asserted)
 * s  Output buffer; s->s is (re)sized and s->l is set to the bytes written
 *
 * Each M base is written as its 4-bit code from bam1_seqi(), each D base as
 * 0, and soft-clipped bases are skipped in the query without being written.
 */
static void unpad_seq(bam1_t *b, kstring_t *s)
{
    int k, j, i;
    int out_len = 0;
    uint32_t *cigar = bam1_cigar(b);
    uint8_t *seq = bam1_seq(b);

    /* The output holds one byte per M and per D base, which can exceed
     * l_qseq once deletions are present, so size the buffer for both. */
    for (k = 0; k < b->core.n_cigar; ++k) {
        int op = bam_cigar_op(cigar[k]);
        if (op == BAM_CMATCH || op == BAM_CDEL)
            out_len += bam_cigar_oplen(cigar[k]);
    }
    ks_resize(s, out_len > b->core.l_qseq ? out_len : b->core.l_qseq);

    for (k = 0, s->l = 0, j = 0; k < b->core.n_cigar; ++k) {
        int op, ol;
        op = bam_cigar_op(cigar[k]);
        ol = bam_cigar_oplen(cigar[k]);
        assert(op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CSOFT_CLIP);
        if (op == BAM_CMATCH) {
            /* j must advance with every base; advancing it once per run
             * repeated the same query base across the whole M operation */
            for (i = 0; i < ol; ++i, ++j)
                s->s[s->l++] = bam1_seqi(seq, j);
        } else if (op == BAM_CSOFT_CLIP) {
            j += ol;
        } else {
            for (i = 0; i < ol; ++i)
                s->s[s->l++] = 0;
        }
    }
}
/*
 * Expand a read into one byte per aligned (padded) column.
 *
 * b  Alignment to expand
 * s  Output buffer; resized to the CIGAR's reference length, s->l is reset
 *    and then counts the bytes written
 *
 * Returns 0 on success, -1 if an unexpected CIGAR operator is met, and 1 if
 * the number of bytes written differs from the reference length computed by
 * bam_cigar2rlen().
 *
 * M, = and X copy the 4-bit base codes; S skips query bases; H is ignored;
 * D and N write zero bytes. N is reported once per call as a warning.
 */
static int unpad_seq(bam1_t *b, kstring_t *s)
{
    uint32_t *ops = bam_get_cigar(b);
    uint8_t *bases = bam_get_seq(b);
    int warned_about_n = 0; /* Make this a global and limit to one CIGAR N warning? */
    int qpos = 0;
    int expected;
    int idx;

    // b->core.l_qseq is the length of the SEQ entry (soft clips included).
    // What is needed here is the padded length after alignment, taken from
    // the CIGAR: soft clips excluded, pads from D operations included.
    expected = bam_cigar2rlen(b->core.n_cigar, ops);
    ks_resize(s, expected);
    s->l = 0;

    for (idx = 0; idx < b->core.n_cigar; ++idx) {
        const int op = bam_cigar_op(ops[idx]);
        int remaining = bam_cigar_oplen(ops[idx]);

        switch (op) {
        case BAM_CMATCH:
        case BAM_CEQUAL:
        case BAM_CDIFF:
            for (; remaining > 0; --remaining, ++qpos)
                s->s[s->l++] = bam_seqi(bases, qpos);
            break;

        case BAM_CSOFT_CLIP:
            qpos += remaining;
            break;

        case BAM_CHARD_CLIP:
            /* nothing to emit, nothing to skip */
            break;

        case BAM_CDEL:
            for (; remaining > 0; --remaining)
                s->s[s->l++] = 0;
            break;

        case BAM_CREF_SKIP:
            /* Treat CIGAR N as D (not ideal, but better than ignoring it) */
            for (; remaining > 0; --remaining)
                s->s[s->l++] = 0;
            if (warned_about_n == 0) {
                warned_about_n = -1;
                fprintf(stderr, "[depad] WARNING: CIGAR op N treated as op D in read %s\n", bam_get_qname(b));
            }
            break;

        default:
            fprintf(stderr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam_get_qname(b));
            return -1;
        }
    }

    return expected != s->l;
}
void add_bam_record_to_wiggle(const bam1_t *b, Wiggle& wiggle) { double w; if (no_fractional_weight) w = 1.0; else { uint8_t *p_tag = bam_aux_get(b, "ZW"); if (p_tag == NULL) return; w = bam_aux2f(p_tag); } int pos = b->core.pos; uint32_t *p = bam_get_cigar(b); for (int i = 0; i < (int)b->core.n_cigar; ++i, ++p) { char op = bam_cigar_op(*p); int op_len = bam_cigar_oplen(*p); if (op == BAM_CMATCH) for (int j = 0; j < op_len; ++j, ++pos) wiggle.read_depth[pos] += w; else pos += ((bam_cigar_type(op) & 2) ? op_len : 0); } }
unsigned alignment_bounds_from_bin_cigar (uint32_t* cigar_bin, unsigned cigar_bin_sz, uint8_t forward, unsigned qry_len, unsigned* q_beg, unsigned* q_end, unsigned* r_beg, unsigned* r_end) { unsigned oplen, op, constype; uint32_t *sent; *q_beg = *q_end = *r_beg = *r_end = 0; unsigned allen = 0; uint32_t tail = 0; for (sent = cigar_bin + cigar_bin_sz; cigar_bin != sent; ++cigar_bin) { oplen = bam_cigar_oplen (*cigar_bin); op = bam_cigar_op (*cigar_bin); constype = bam_cigar_type (*cigar_bin); if (tail && (op == BAM_CHARD_CLIP || op == BAM_CSOFT_CLIP)) // the aligned zone ended, clip started. Note that tail indels are not valid, so we do not assume they are possible.. break; if (op != BAM_CHARD_CLIP && op != BAM_CSOFT_CLIP) tail = 1; if (constype & CONSUME_QRY) { if (!tail) *q_beg += oplen; *q_end += oplen; } if (constype & CONSUME_REF) { if (!tail) *r_beg += oplen; *r_end += oplen; } allen += oplen; } if (!forward) { unsigned tmp = qry_len - *q_beg; *q_beg = qry_len - *q_end; *q_end = tmp; } return allen; }
void cigar_print (FILE* f, uint32_t* cigar, unsigned cigar_sz) { uint32_t* sent; for (sent = cigar+cigar_sz; cigar != sent; ++cigar) { uint32_t curop = bam_cigar_op (*cigar); uint32_t count = bam_cigar_oplen (*cigar); char schar; switch (curop) { case BAM_CHARD_CLIP: schar = 'H'; break; case BAM_CSOFT_CLIP: // skip schar = 'S'; break; case BAM_CMATCH: schar = 'M'; break; case BAM_CEQUAL: schar = '='; break; case BAM_CDIFF: schar = '#'; break; case BAM_CINS: schar = 'I'; break; case BAM_CDEL: schar = 'I'; break; default: schar = '?'; } fprintf (f, "%d%c", count, schar); } }
/*
 * Store the current first ZS character in a new "OS" tag and overwrite it
 * with `strand`.
 *
 * The in-place write is done BEFORE bam_aux_append(): appending can
 * reallocate the record's data buffer, which would leave `zs` dangling.
 * The old character is passed to the append from a local copy for the same
 * reason.
 */
static void bsstrand_reassign(bam1_t *b, unsigned char *zs, char strand)
{
  uint8_t previous = zs[0];
  zs[0] = (unsigned char) strand;
  bam_aux_append(b, "OS", 'A', 1, &previous);
}

/*
 * Check (and optionally infer) the bisulfite strand tag of one alignment.
 *
 * b     Alignment; its ZS tag may be rewritten and OS / ZS tags appended
 * in    Input file, used for target names
 * out   Output file, or NULL to write nothing
 * data  bsstrand_data_t* carrying the configuration, reference cache and
 *       counters
 *
 * Unmapped reads are written through unchanged and counted in n_unmapped.
 * For mapped reads, C>T and G>A mismatches against the reference are counted
 * over M operations. When a ZS tag exists it is marked '?' (n_fail) or
 * flipped (n_corr) according to those counts and similarity(), with the
 * previous value kept in an OS tag. When it does not exist and
 * conf->infer_bsstrand is set, a ZS tag is appended. The process aborts on
 * any CIGAR operator other than M, I, D and S.
 *
 * Always returns 0.
 */
int bsstrand_func(bam1_t *b, const samfile_t *in, samfile_t *out, void *data) {

  bsstrand_data_t *d = (bsstrand_data_t*)data;
  bsstrand_conf_t *conf = d->conf;
  const bam1_core_t *c = &b->core;

  if (c->flag & BAM_FUNMAP){
    if (out) samwrite(out, b);
    d->n_unmapped++;
    return 0;
  }

  fetch_refseq(d->rs, in->header->target_name[c->tid], c->pos, c->pos+1);
  uint32_t rpos=c->pos+1, qpos=0;
  int i, nC2T = 0, nG2A = 0;
  uint32_t j;
  char rbase, qbase;

  /* Count C>T and G>A mismatches over the aligned (M) bases */
  for (i=0; i<c->n_cigar; ++i) {
    uint32_t op = bam_cigar_op(bam1_cigar(b)[i]);
    uint32_t oplen = bam_cigar_oplen(bam1_cigar(b)[i]);
    switch(op) {
    case BAM_CMATCH:
      for(j=0; j<oplen; ++j) {
        rbase = toupper(getbase_refseq(d->rs, rpos+j));
        qbase = bscall(bam1_seq(b), qpos+j);
        if (rbase == 'C' && qbase == 'T') nC2T += 1;
        if (rbase == 'G' && qbase == 'A') nG2A += 1;
      }
      rpos += oplen;
      qpos += oplen;
      break;
    case BAM_CINS:
      qpos += oplen;
      break;
    case BAM_CDEL:
      rpos += oplen;
      break;
    case BAM_CSOFT_CLIP:
      qpos += oplen;
      break;
    default:
      fprintf(stderr, "Unknown cigar, %u\n", op);
      abort();
    }
  }

  char key[2] = {'Z','S'};
  unsigned char *bsstrand = bam_aux_get(b, key);
  if (bsstrand) {
    bsstrand++; /* step over the aux type byte to the tag value */
    double s = similarity(nG2A, nC2T);
    if (nG2A > 1 && nC2T > 1 && s > 0.5) {
      /* Both conversion types present at similar levels: strand is unreliable */
      if (conf->output_read || conf->output_all_read)
        printf("F\t%s\t%d\t%d\t%d\t%s\t%s\t%1.2f\n", in->header->target_name[c->tid], c->pos, nC2T, nG2A, bam1_qname(b), bsstrand, s);
      bsstrand_reassign(b, bsstrand, '?');
      d->n_fail++;
    } else if (*bsstrand == '+' && nG2A > nC2T + 2) {
      if (conf->output_read || conf->output_all_read)
        printf("W2C\t%s\t%d\t%d\t%d\t%s\t%s\t%1.2f\n", in->header->target_name[c->tid], c->pos, nC2T, nG2A, bam1_qname(b), bsstrand, s);
      bsstrand_reassign(b, bsstrand, '-');
      d->n_corr++;
    } else if (*bsstrand == '-' && nC2T > nG2A + 2) {
      if (conf->output_read || conf->output_all_read)
        printf("C2W\t%s\t%d\t%d\t%d\t%s\t%s\t%1.2f\n", in->header->target_name[c->tid], c->pos, nC2T, nG2A, bam1_qname(b), bsstrand, s);
      bsstrand_reassign(b, bsstrand, '+');
      d->n_corr++;
    } else if (conf->output_all_read) {
      printf("N\t%s\t%d\t%d\t%d\t%s\t%s\t%1.2f\n", in->header->target_name[c->tid], c->pos, nC2T, nG2A, bam1_qname(b), bsstrand, s);
    }
  } else if (!(c->flag & BAM_FUNMAP) && conf->infer_bsstrand) {
    /* No ZS tag yet: infer one from the conversion counts */
    char bss[3];
    if (similarity(nG2A, nC2T) < 0.5) {
      strcpy(bss, "??");
    } else if (nC2T > nG2A) {
      strcpy(bss, c->flag & BAM_FREVERSE ? "+-" : "++");
    } else {
      strcpy(bss, c->flag & BAM_FREVERSE ? "-+" : "--");
    }
    bam_aux_append(b, "ZS", 'Z', 3, (uint8_t*) bss);
  }

  if (out) samwrite(out, b);
  d->n_mapped++;
  return 0;
}
// Prepare internal structures for clipping and alignment, then realign.
//
// q_seq, q_len       query bases and their count (q_len is forwarded to addClippedBasesToTags)
// r_seq, r_len       reference bases (r_len is not read in this function)
// r_pos              current alignment position, returned in new_pos when no shift applies
// forward            strand flag forwarded to SetClipping and SetSequences
// cigar, cigar_sz    packed input CIGAR; cigar_sz must be non-zero (asserted)
// cigar_dest         out: newly allocated packed CIGAR (tmap_malloc), set only on success
// cigar_dest_sz      out: number of elements in cigar_dest
// new_pos            out: position after realignment, set only on success
// already_perfect    out: true when ClipAnchors reports nothing to realign
// clip_failed        out: written by ClipAnchors
// alignment_failed   out: true when computeSWalignment fails
// unclip_failed      out: true when addClippedBasesToTags fails
//
// Returns true if realignment was performed; false otherwise, in which case
// the out flags above tell why.
//
// NOTE(review): clip_failed is not reset here like the other three flags;
// it is presumably always assigned by ClipAnchors - confirm.
bool RealignImp::compute_alignment (
    const char* q_seq,
    unsigned q_len,
    const char* r_seq,
    unsigned r_len,
    int r_pos,
    bool forward,
    const uint32_t* cigar,
    unsigned cigar_sz,
    uint32_t*& cigar_dest,
    unsigned& cigar_dest_sz,
    int& new_pos,
    bool& already_perfect,
    bool& clip_failed,
    bool& alignment_failed,
    bool& unclip_failed)
{
    already_perfect = false;
    alignment_failed = false;
    unclip_failed = false;

    unsigned oplen;
    const char* q_seq_clipped = q_seq;
    const uint32_t* cigar_clipped = cigar;
    unsigned cigar_sz_clipped = cigar_sz;
    unsigned sclip_q_len, sclip_r_len, sclip_al_len;

    assert (cigar_sz);
    // reset realigner
    Reset ();

    // set clipping
    SetClipping ((int) cliptype_, forward);

    // Cut the soft-clipped zones out of both ends (only BAM_CSOFT_CLIP is tested here).
    // The 'cut out' of the q_seq is done by switching to downstream pointer.
    if (bam_cigar_op (*cigar) == BAM_CSOFT_CLIP)
    {
        oplen = bam_cigar_oplen (*cigar);
        ClipStart (oplen);
        q_seq_clipped += oplen;
        ++cigar_clipped;
        --cigar_sz_clipped;
    }
    // cigar_sz > 1 keeps a lone soft clip from being counted at both ends
    if (cigar_sz > 1 && bam_cigar_op (cigar [cigar_sz - 1]) == BAM_CSOFT_CLIP)
    {
        oplen = bam_cigar_oplen (cigar [cigar_sz - 1]);
        ClipEnd (oplen);
        --cigar_sz_clipped;
    }

    // cigar defines q_seq and t_seq lengths
    sclip_al_len = seq_lens_from_bin_cigar (cigar_clipped, cigar_sz_clipped, &sclip_q_len, &sclip_r_len);

    const std::string query (q_seq_clipped, sclip_q_len);
    const std::string target (r_seq, sclip_r_len);
    std::string pretty_al;
    pretty_al.reserve (sclip_al_len);

    pretty_al_from_bin_cigar (cigar_clipped, cigar_sz_clipped, q_seq_clipped, r_seq, pretty_al);

    // Realigner requires strings of proper size to be passed to SetSequences
    SetSequences (query, target, pretty_al, forward);

    if (!ClipAnchors (clip_failed))
    {
        already_perfect = true;
        return false; // alignment already good, no imperfect zone to realign found
    }

    // TODO avoid automatic vectors to prevent unneeded heap usage
    vector<MDelement> new_md_vec;
    vector<CigarOp> new_cigar_vec;
    unsigned int start_pos_shift;

    if (!computeSWalignment(new_cigar_vec, new_md_vec, start_pos_shift))
    {
        alignment_failed = true;
        return false;
    }

    if (!addClippedBasesToTags(new_cigar_vec, new_md_vec, q_len))
    {
        unclip_failed = true;
        return false; // error adding back clipped out zones
    }

    // The position only changes when the left anchor was not clipped and the
    // new alignment starts at a shifted offset
    if (!LeftAnchorClipped () && start_pos_shift != 0)
    {
        // build cigar data only if it is needed
        // TODO avoid automatic vectors to prevent unneeded heap usage
        std::vector <CigarOp> cigar_vec;
        cigar_vector_from_bin (cigar, cigar_sz, cigar_vec);
        new_pos = updateReadPosition (cigar_vec, start_pos_shift, r_pos);
    }
    else
        new_pos = r_pos;

    // Any buffer cigar_dest pointed at on entry is NOT freed here.
    // TODO: switch to better alignment memory management, avoid heap operations
    cigar_dest = (uint32_t*) tmap_malloc (sizeof (uint32_t) * new_cigar_vec.size (), "cigar_dest");
    cigar_dest_sz = new_cigar_vec.size ();
    cigar_vector_to_bin (new_cigar_vec, cigar_dest);
    return true;
}
/*
 * Convert a padded alignment stream into an unpadded one.
 *
 * in   Input file; records are read until sam_read1() returns a negative value
 * out  Output file; receives every record, rewritten where applicable
 * h    Header shared by the input and output
 * fai  Optional FASTA index used to load padded references that are not
 *      embedded in the input (may be NULL)
 *
 * Unmapped records are written through unchanged. A record with pos == 0,
 * tid >= 0 and a query name equal to its target name is treated as the
 * embedded padded reference. Other records with a CIGAR are compared column
 * by column against the current reference to derive a new M/I/D/P CIGAR.
 * Read and mate positions are translated through posmap, and the bin is
 * recomputed.
 *
 * Returns 0 on success, 1 if the input was truncated (sam_read1() < -1), and
 * -1 on any other error. All working buffers are released on every path.
 */
int bam_pad2unpad(samFile *in, samFile *out, bam_hdr_t *h, faidx_t *fai)
{
    bam1_t *b = 0;
    kstring_t r, q;
    int r_tid = -1;
    uint32_t *cigar2 = 0;   /* scratch CIGAR; grown through write_cigar() */
    int ret = 0, n2 = 0, m2 = 0, *posmap = 0;

    b = bam_init1();
    r.l = r.m = q.l = q.m = 0; r.s = q.s = 0;
    int read_ret;
    while ((read_ret = sam_read1(in, h, b)) >= 0) { // read one alignment from `in'
        // Cannot depad unmapped CRAM data
        if (b->core.flag & BAM_FUNMAP)
            goto next_seq;

        uint32_t *cigar = bam_get_cigar(b);
        n2 = 0;
        if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam_get_qname(b), h->target_name[b->core.tid]) == 0) {
            /* Embedded reference record */
            r_tid = b->core.tid;
            if (0!=unpad_seq(b, &r)) {
                fprintf(stderr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in reference %s\n", bam_get_qname(b));
                ret = -1;
                goto depad_end;
            }
            if (h->target_len[r_tid] != r.l) {
                fprintf(stderr, "[depad] ERROR: (Padded) length of '%s' is %u in BAM header, but %llu in embedded reference\n", bam_get_qname(b), h->target_len[r_tid], (unsigned long long)(r.l));
                ret = -1;
                goto depad_end;
            }
            if (fai) {
                // Check the embedded reference matches the FASTA file
                if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &q)) {
                    fprintf(stderr, "[depad] ERROR: Failed to load embedded reference '%s' from FASTA\n", h->target_name[b->core.tid]);
                    ret = -1;
                    goto depad_end;
                }
                assert(r.l == q.l);
                int i;
                for (i = 0; i < r.l; ++i) {
                    if (r.s[i] != q.s[i]) {
                        // Show gaps as ASCII 45
                        fprintf(stderr, "[depad] ERROR: Embedded sequence and reference FASTA don't match for %s base %i, '%c' vs '%c'\n",
                                h->target_name[b->core.tid], i+1,
                                r.s[i] ? seq_nt16_str[(int)r.s[i]] : 45,
                                q.s[i] ? seq_nt16_str[(int)q.s[i]] : 45);
                        ret = -1;
                        goto depad_end;
                    }
                }
            }
            write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH));
            replace_cigar(b, n2, cigar2);
            posmap = update_posmap(posmap, r);
        } else if (b->core.n_cigar > 0) {
            int i, k, op;
            if (b->core.tid < 0) {
                fprintf(stderr, "[depad] ERROR: Read '%s' has CIGAR but no RNAME\n", bam_get_qname(b));
                ret = -1;
                goto depad_end;
            } else if (b->core.tid == r_tid) {
                ; // good case, reference available
            } else if (fai) {
                if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) {
                    fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]);
                    ret = -1;
                    goto depad_end;
                }
                posmap = update_posmap(posmap, r);
                r_tid = b->core.tid;
            } else {
                fprintf(stderr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", h->target_name[b->core.tid]);
                ret = -1;
                goto depad_end;
            }
            if (0!=unpad_seq(b, &q)) {
                fprintf(stderr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in read %s\n", bam_get_qname(b));
                ret = -1;
                goto depad_end;
            }
            /* Carry leading clips over unchanged (H, optionally followed by S) */
            if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) {
                write_cigar(cigar2, n2, m2, cigar[0]);
            } else if (bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP) {
                write_cigar(cigar2, n2, m2, cigar[0]);
                if (b->core.n_cigar > 2 && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP) {
                    write_cigar(cigar2, n2, m2, cigar[1]);
                }
            }
            /* Determine CIGAR operator for each base in the aligned read */
            for (i = 0, k = b->core.pos; i < q.l; ++i, ++k)
                q.s[i] = q.s[i]? (r.s[k]? BAM_CMATCH : BAM_CINS) : (r.s[k]? BAM_CDEL : BAM_CPAD);
            /* Include any pads if starts with an insert */
            if (q.s[0] == BAM_CINS) {
                for (k = 0; k+1 < b->core.pos && !r.s[b->core.pos - k - 1]; ++k);
                if (k) write_cigar(cigar2, n2, m2, bam_cigar_gen(k, BAM_CPAD));
                k = 0;
            } else if (q.s[0] == BAM_CPAD) {
                // Join 'k' CPAD to our first cigar op CPAD too.
                for (k = 0; k+1 < b->core.pos && !r.s[b->core.pos - k - 1]; ++k);
            } else {
                k = 0;
            }
            /* Count consecutive CIGAR operators to turn into a CIGAR string */
            for (i = 1, k++, op = q.s[0]; i < q.l; ++i) {
                if (op != q.s[i]) {
                    write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
                    op = q.s[i]; k = 1;
                } else ++k;
            }
            write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
            /* Carry trailing clips over unchanged (optional S, then H) */
            if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CSOFT_CLIP) {
                write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]);
            } else if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CHARD_CLIP) {
                if (b->core.n_cigar > 2 && bam_cigar_op(cigar[b->core.n_cigar-2]) == BAM_CSOFT_CLIP) {
                    write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-2]);
                }
                write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]);
            }
            /* Remove redundant P operators between M/X/=/D operators, e.g. 5M2P10M -> 15M */
            int pre_op, post_op;
            for (i = 2; i < n2; ++i)
                if (bam_cigar_op(cigar2[i-1]) == BAM_CPAD) {
                    pre_op = bam_cigar_op(cigar2[i-2]);
                    post_op = bam_cigar_op(cigar2[i]);
                    /* Note don't need to check for X/= as code above will use M only */
                    if ((pre_op == BAM_CMATCH || pre_op == BAM_CDEL) && (post_op == BAM_CMATCH || post_op == BAM_CDEL)) {
                        /* This is a redundant P operator */
                        cigar2[i-1] = 0; // i.e. 0M
                        /* If had same operator either side, combine them in post_op */
                        if (pre_op == post_op) {
                            /* If CIGAR M, could treat as simple integers since BAM_CMATCH is zero*/
                            cigar2[i] = bam_cigar_gen(bam_cigar_oplen(cigar2[i-2]) + bam_cigar_oplen(cigar2[i]), post_op);
                            cigar2[i-2] = 0; // i.e. 0M
                        }
                    }
                }
            /* Remove the zero'd operators (0M) */
            for (i = k = 0; i < n2; ++i)
                if (cigar2[i]) cigar2[k++] = cigar2[i];
            n2 = k;
            replace_cigar(b, n2, cigar2);
        }
        /* Even unmapped reads can have a POS value, e.g. if their mate was mapped */
        if (b->core.pos != -1) b->core.pos = posmap[b->core.pos];
        if (b->core.mtid < 0 || b->core.mpos < 0) {
            /* Nice case, no mate to worry about*/
            /* TODO - Warning if FLAG says mate should be mapped? */
            /* Clean up funny input where mate position is given but mate reference is missing: */
            b->core.mtid = -1;
            b->core.mpos = -1;
        } else if (b->core.mtid == b->core.tid) {
            /* Nice case, same reference */
            b->core.mpos = posmap[b->core.mpos];
        } else {
            /* Nasty case, Must load alternative posmap */
            if (!fai) {
                fprintf(stderr, "[depad] ERROR: Needed reference %s sequence for mate (and no FASTA file)\n", h->target_name[b->core.mtid]);
                ret = -1;
                goto depad_end;
            }
            /* Temporarily load the other reference sequence */
            if (load_unpadded_ref(fai, h->target_name[b->core.mtid], h->target_len[b->core.mtid], &r)) {
                fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.mtid]);
                ret = -1;
                goto depad_end;
            }
            posmap = update_posmap(posmap, r);
            b->core.mpos = posmap[b->core.mpos];
            /* Restore the reference and posmap*/
            if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) {
                fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]);
                ret = -1;
                goto depad_end;
            }
            posmap = update_posmap(posmap, r);
        }
        /* Most reads will have been moved so safest to always recalculate the BIN value */
        b->core.bin = bam_reg2bin(b->core.pos, bam_endpos(b));

    next_seq:
        /* A failed write used to go unnoticed and produced silently short output */
        if (sam_write1(out, h, b) < 0) {
            fprintf(stderr, "[depad] ERROR: Failed to write alignment to output\n");
            ret = -1;
            goto depad_end;
        }
    }
    if (read_ret < -1) {
        fprintf(stderr, "[depad] truncated file.\n");
        ret = 1;
    }

depad_end:
    /* Single exit: error paths previously returned without releasing these */
    free(r.s); free(q.s); free(posmap);
    free(cigar2);
    bam_destroy1(b);
    return ret;
}