/*
 * Build the template CIGAR string for a read pair and attach it to the
 * leftmost read as a "CT" (type 'Z') auxiliary tag.
 *
 * Layout, with "left" being the read with the smaller core.pos:
 *   <seg><strand><left CIGAR><gap>T<seg><strand><right CIGAR>
 * <seg> is '1' when BAM_FREAD1 is set and '2' otherwise, <strand> is 'R' when
 * BAM_FREVERSE is set and 'F' otherwise, and <gap> is right->core.pos minus
 * the value bam_calend() returns for the left read (negative on overlap).
 *
 * b1, b2: the two reads, in either order; only the leftmost one gets the tag.
 * str:    caller-owned scratch buffer; its length is reset on every call.
 *
 * No tag is written when the reads are on different references, have no
 * reference or no position, or when either read is flagged unmapped: in those
 * cases there is no meaningful alignment end to measure the gap from.
 */
void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str)
{
    bam1_t *left = b1, *right = b2;
    uint32_t *cigar;
    int i, end;

    str->l = 0;
    if (b1->core.tid != b2->core.tid || b1->core.tid < 0)
        return; /* coordinateless or not on the same chromosome */
    if (b1->core.pos < 0 || b2->core.pos < 0)
        return; /* no usable position */
    if ((b1->core.flag | b2->core.flag) & BAM_FUNMAP)
        return; /* an unmapped read has no alignment to describe */

    if (left->core.pos > right->core.pos) { /* make sure "left" has the smaller coordinate */
        left = b2;
        right = b1;
    }

    /* left read: segment index, strand, CIGAR */
    kputc((left->core.flag & BAM_FREAD1) ? '1' : '2', str);
    kputc((left->core.flag & BAM_FREVERSE) ? 'R' : 'F', str);
    cigar = bam1_cigar(left);
    for (i = 0; i < left->core.n_cigar; ++i) {
        kputw(bam_cigar_oplen(cigar[i]), str);
        kputc(bam_cigar_opchr(cigar[i]), str);
    }

    /* gap between the end of the left read and the start of the right read */
    end = bam_calend(&left->core, cigar);
    kputw(right->core.pos - end, str);
    kputc('T', str);

    /* right read: segment index, strand, CIGAR */
    kputc((right->core.flag & BAM_FREAD1) ? '1' : '2', str);
    kputc((right->core.flag & BAM_FREVERSE) ? 'R' : 'F', str);
    cigar = bam1_cigar(right);
    for (i = 0; i < right->core.n_cigar; ++i) {
        kputw(bam_cigar_oplen(cigar[i]), str);
        kputc(bam_cigar_opchr(cigar[i]), str);
    }

    /* str->l + 1 so that the terminating NUL is stored with the tag */
    bam_aux_append(left, "CT", 'Z', str->l + 1, (uint8_t *)str->s);
}
static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos) { unsigned k; bam1_t *b = p->b; bam1_core_t *c = &b->core; uint32_t x = c->pos, y = 0; int ret = 1, is_restart = 1; if (c->flag&BAM_FUNMAP) return 0; // unmapped read assert(x <= pos); // otherwise a bug p->qpos = -1; p->indel = 0; p->is_del = p->is_head = p->is_tail = 0; for (k = 0; k < c->n_cigar; ++k) { int op = bam1_cigar(b)[k] & BAM_CIGAR_MASK; // operation int l = bam1_cigar(b)[k] >> BAM_CIGAR_SHIFT; // length if (op == BAM_CMATCH) { // NOTE: this assumes the first and the last operation MUST BE a match or a clip if (x + l > pos) { // overlap with pos p->indel = p->is_del = 0; p->qpos = y + (pos - x); if (x == pos && is_restart) p->is_head = 1; if (x + l - 1 == pos) { // come to the end of a match if (k < c->n_cigar - 1) { // there are additional operation(s) uint32_t cigar = bam1_cigar(b)[k+1]; // next CIGAR int op_next = cigar&BAM_CIGAR_MASK; // next CIGAR operation if (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del else if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins if (op_next == BAM_CSOFT_CLIP || op_next == BAM_CREF_SKIP || op_next == BAM_CHARD_CLIP) p->is_tail = 1; // tail } else p->is_tail = 1; // this is the last operation; set tail }
void GBamRecord::set_cigar(const char* cigar) { //requires b->core.pos and b->core.flag to have been set properly PRIOR to this call int doff=b->core.l_qname; uint8_t* after_cigar=NULL; int after_cigar_len=0; uint8_t* prev_bdata=NULL; if (b->data_len>doff) { //cigar string already allocated, replace it int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data after_cigar=b->data+d; after_cigar_len=b->data_len-d; } const char *s; char *t; int i, op; long x; b->core.n_cigar = 0; if (cigar != NULL && strcmp(cigar, "*") != 0) { for (s = cigar; *s; ++s) { if (isalpha(*s)) b->core.n_cigar++; else if (!isdigit(*s)) { GError("Error: invalid CIGAR character (%s)\n",cigar); } } if (after_cigar_len>0) { //replace/insert into existing full data prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len); memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len); free(prev_bdata); } else { realloc_bdata(b, doff + b->core.n_cigar * 4); } for (i = 0, s = cigar; i != b->core.n_cigar; ++i) { x = strtol(s, &t, 10); op = toupper(*t); if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH; else if (op == 'I') op = BAM_CINS; else if (op == 'D') op = BAM_CDEL; else if (op == 'N') op = BAM_CREF_SKIP; //has_Introns=true; else if (op == 'S') op = BAM_CSOFT_CLIP; //soft_Clipped=true; else if (op == 'H') op = BAM_CHARD_CLIP; //hard_Clipped=true; else if (op == 'P') op = BAM_CPAD; else GError("Error: invalid CIGAR operation (%s)\n",cigar); s = t + 1; bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op; } if (*s) GError("Error: unmatched CIGAR operation (%s)\n",cigar); b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b))); } else {//no CIGAR string given if (!(b->core.flag&BAM_FUNMAP)) { GMessage("Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data); b->core.flag |= BAM_FUNMAP; } b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1); } setupCoordinates(); } //set_cigar()
// currently, this function ONLY works if each read has one hit void bam_mating_core(bamFile in, bamFile out) { bam_header_t *header; bam1_t *b[2]; int curr, has_prev, pre_end = 0, cur_end; kstring_t str; str.l = str.m = 0; str.s = 0; header = bam_header_read(in); bam_header_write(out, header); b[0] = bam_init1(); b[1] = bam_init1(); curr = 0; has_prev = 0; while (bam_read1(in, b[curr]) >= 0) { bam1_t *cur = b[curr], *pre = b[1-curr]; if (cur->core.tid < 0) continue; cur_end = bam_calend(&cur->core, bam1_cigar(cur)); if (cur_end > (int)header->target_len[cur->core.tid]) cur->core.flag |= BAM_FUNMAP; if (cur->core.flag & BAM_FSECONDARY) continue; // skip secondary alignments if (has_prev) { if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos; pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos; if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // set TLEN/ISIZE { uint32_t cur5, pre5; cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos; pre5 = (pre->core.flag&BAM_FREVERSE)? 
pre_end : pre->core.pos; cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5; } else cur->core.isize = pre->core.isize = 0; if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FMREVERSE; else cur->core.flag &= ~BAM_FMREVERSE; if (cur->core.flag&BAM_FREVERSE) pre->core.flag |= BAM_FMREVERSE; else pre->core.flag &= ~BAM_FMREVERSE; if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; } if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; } bam_template_cigar(pre, cur, &str); bam_write1(out, pre); bam_write1(out, cur); has_prev = 0; } else { // unpaired or singleton pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; if (pre->core.flag & BAM_FPAIRED) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FMREVERSE & ~BAM_FPROPER_PAIR; } bam_write1(out, pre); } } else has_prev = 1; curr = 1 - curr; pre_end = cur_end; } if (has_prev) bam_write1(out, b[1-curr]); bam_header_destroy(header); bam_destroy1(b[0]); bam_destroy1(b[1]); free(str.s); }
int bamGetTargetLength(const bam1_t *bam)
/* Return the number of reference bases spanned by the alignment, summed from
 * bam's packed CIGAR: M, =, X, D and N elements count; I, S, H and P do not.
 * Aborts via errAbort on any other operation character. */
{
    const bam1_core_t *core = &bam->core;
    unsigned int *packed = bam1_cigar(bam);
    int refSpan = 0;
    int ix;

    for (ix = 0; ix < core->n_cigar; ix++)
        {
        char opChar;
        int opLen = bamUnpackCigarElement(packed[ix], &opChar);
        switch (opChar)
            {
            case 'M': // match or mismatch (gapless aligned block)
            case '=': // match
            case 'X': // mismatch
            case 'D': // deleted from query
            case 'N': // long deletion from query (intron as opposed to small del)
                refSpan += opLen;
                break;
            case 'I': // inserted in query
            case 'S': // soft clipping
            case 'H': // hard clipping
            case 'P': // silent deletion from padded reference sequence
                break;
            default:
                errAbort("bamGetTargetLength: unrecognized CIGAR op %c -- update me", opChar);
            }
        }
    return refSpan;
}
/* Return the SAM operation character ('M', 'I', 'D', 'N', 'S', 'H', 'P', '='
   or 'X') of the <i>-th CIGAR element of <sam_alignment>. Terminates the
   process with GT_EXIT_PROGRAMMING_ERROR on an unknown operation code. */
unsigned char gt_sam_alignment_cigar_i_operation(GtSamAlignment *sam_alignment,
                                                 uint16_t i)
{
  unsigned char symbol = 0;

  gt_assert(sam_alignment != NULL);
  switch ((unsigned char) bam1_cigar(sam_alignment->s_alignment)[i]
          & BAM_CIGAR_MASK)
  {
    case BAM_CMATCH:     symbol = 'M'; break;
    case BAM_CINS:       symbol = 'I'; break;
    case BAM_CDEL:       symbol = 'D'; break;
    case BAM_CREF_SKIP:  symbol = 'N'; break;
    case BAM_CSOFT_CLIP: symbol = 'S'; break;
    case BAM_CHARD_CLIP: symbol = 'H'; break;
    case BAM_CPAD:       symbol = 'P'; break;
    case BAM_CEQUAL:     symbol = '='; break;
    case BAM_CDIFF:      symbol = 'X'; break;
    default:
      exit(GT_EXIT_PROGRAMMING_ERROR);
  }
  return symbol;
}
/*
 * Feed one alignment to the pileup iterator.
 *
 * iter: pileup iterator.
 * b:    alignment to add, or NULL to signal the end of input.
 *
 * Returns 0 on success (including when the alignment is silently dropped)
 * and -1 when the iterator is in the error state or the input turns out to
 * be unsorted.
 */
int bam_plp_push(bam_plp_t iter, const bam1_t *b)
{
    if (iter->error)
        return -1;

    if (b == 0) { /* end of input */
        iter->is_eof = 1;
        return 0;
    }

    /* alignments that are dropped without being an error */
    if (b->core.tid < 0)
        return 0;
    if (b->core.flag & iter->flag_mask)
        return 0;
    if (iter->tid == b->core.tid && iter->pos == b->core.pos
        && iter->mp->cnt > iter->maxcnt)
        return 0;

    /* stage the alignment in the tail node */
    bam_copy1(&iter->tail->b, b);
    iter->tail->beg = b->core.pos;
    iter->tail->end = bam_calend(&b->core, bam1_cigar(b));
    iter->tail->s = g_cstate_null;
    iter->tail->s.end = iter->tail->end - 1; /* initialize cstate_t */

    /* input must be sorted by reference, then by position */
    if (b->core.tid < iter->max_tid) {
        fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n");
        iter->error = 1;
        return -1;
    }
    if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
        fprintf(stderr, "[bam_pileup_core] the input is not sorted (reads out of order)\n");
        iter->error = 1;
        return -1;
    }
    iter->max_tid = b->core.tid;
    iter->max_pos = iter->tail->beg;

    /* keep the staged node only if it can still contribute to the pileup;
     * otherwise the tail is reused by the next push */
    if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
        iter->tail->next = mp_alloc(iter->mp);
        iter->tail = iter->tail->next;
    }
    return 0;
}
/**
 * Dump a BAM record to stdout for debugging: name, sequence, qualities,
 * CIGAR, raw auxiliary bytes, the length fields, every core field and the
 * individual flag bits.
 *
 * @param bam_p         record to print
 * @param base_quality  offset added to each stored quality value before it is
 *                      printed as a character
 *
 * NOTE(review): the strings returned by convert_to_sequence_string() and
 * convert_to_cigar_string() are not released here; confirm who owns them.
 */
void bam_print(bam1_t* bam_p, int base_quality) {
    const unsigned int flag = bam_p->core.flag;

    printf("\n------------------------------------------------------------------->\n");
    printf("bam_p->data (qname): %s\n", bam1_qname(bam_p));
    printf("bam_p->data (seq):  %s\n", convert_to_sequence_string(bam1_seq(bam_p), bam_p->core.l_qseq));

    //quality
    printf("bam_p->data (qual): ");
    char* quality = (char*) bam1_qual(bam_p);
    for (int i = 0; i < bam_p->core.l_qseq; i++) {
        printf("%c", (quality[i] + base_quality));
    }
    printf("\n");

    printf("bam_p->data (cigar): %s\n", convert_to_cigar_string(bam1_cigar(bam_p), bam_p->core.n_cigar));

    //aux(optional) data, printed byte by byte as stored
    printf("bam_p->data (aux): ");
    char* optional_fields = (char*) bam1_aux(bam_p);
    for (int i = 0; i < bam_p->l_aux; i++) {
        printf("%c", optional_fields[i]);
    }
    printf("\n");

    //lengths
    printf("bam_p->l_aux: %i\n", bam_p->l_aux);
    printf("bam_p->data_len: %i\n", bam_p->data_len);
    printf("bam_p->m_data: %i\n", bam_p->m_data);

    //core
    printf("bam_p->core.tid: %i\n", bam_p->core.tid);
    printf("bam_p->core.pos: %i\n", bam_p->core.pos);
    printf("bam_p->core.bin: %u\n", bam_p->core.bin);
    printf("bam_p->core.qual: %u\n", bam_p->core.qual);
    printf("bam_p->core.l_qname: %u\n", bam_p->core.l_qname);
    printf("bam_p->core.flag (16 bits): %u\n", bam_p->core.flag);
    printf("bam_p->core.n_cigar: %u\n", bam_p->core.n_cigar);
    printf("bam_p->core.l_qseq: %i\n", bam_p->core.l_qseq);
    printf("bam_p->core.mtid: %i\n", bam_p->core.mtid);
    printf("bam_p->core.mpos: %i\n", bam_p->core.mpos);
    printf("bam_p->core.isize: %i\n", bam_p->core.isize);

    printf("\nbam1_t.core flags\n");
    printf("-----------------------\n");
    printf("flag (is_paired_end): %i\n", (flag & BAM_FPAIRED) ? 1 : 0);
    printf("flag (is_paired_end_mapped): %i\n", (flag & BAM_FPROPER_PAIR) ? 1 : 0);
    printf("flag (is_seq_unmapped): %i\n", (flag & BAM_FUNMAP) ? 1 : 0);
    printf("flag (is_mate_unmapped): %i\n", (flag & BAM_FMUNMAP) ? 1 : 0);
    printf("flag (seq_strand): %i\n", (flag & BAM_FREVERSE) ? 1 : 0);
    printf("flag (mate_strand): %i\n", (flag & BAM_FMREVERSE) ? 1 : 0);
    printf("flag (pair_num_1): %i\n", (flag & BAM_FREAD1) ? 1 : 0);
    printf("flag (pair_num_2): %i\n", (flag & BAM_FREAD2) ? 1 : 0);
    // BAM_FSECONDARY marks an alignment that is NOT primary, so the value
    // printed under the "primary_alignment" label is the negated bit.
    printf("flag (primary_alignment): %i\n", (flag & BAM_FSECONDARY) ? 0 : 1);
    printf("flag (fails_quality_check): %i\n", (flag & BAM_FQCFAIL) ? 1 : 0);
    printf("flag (pc_optical_duplicate): %i\n", (flag & BAM_FDUP) ? 1 : 0);
}
struct ffAli *bamToFfAli(const bam1_t *bam, struct dnaSeq *target, int targetOffset,
                         boolean useStrand, char **retQSeq)
/* Convert from bam to ffAli format: one ffAli block per M/=/X CIGAR element.
 * If retQSeq is non-null, set it to the query sequence into which the ffAli
 * needle pointers point. When useStrand is set and the read is reverse
 * complemented, target->dna is reverse-complemented in place.
 * (Adapted from psl.c's pslToFfAli.) */
{
    const bam1_core_t *core = &bam->core;
    struct ffAli *blockList = NULL, *block;
    boolean isRc = useStrand && bamIsRc(bam);
    DNA *needle = (DNA *)bamGetQuerySequence(bam, useStrand);
    DNA *haystack;
    unsigned int *cigarPacked;
    int tPos = targetOffset, qPos = 0;
    int step, ix;

    if (retQSeq)
        *retQSeq = needle;
    if (isRc)
        reverseComplement(target->dna, target->size);
    haystack = target->dna;
    cigarPacked = bam1_cigar(bam);

    // If isRc, walk the CIGAR ops backwards; sequence offsets still count up.
    for (step = 0; step < core->n_cigar; step++)
        {
        char op;
        int size;
        ix = isRc ? (core->n_cigar - 1 - step) : step;
        size = bamUnpackCigarElement(cigarPacked[ix], &op);
        switch (op)
            {
            case 'M': // match or mismatch (gapless aligned block)
            case '=': // match
            case 'X': // mismatch
                AllocVar(block);
                block->left = blockList;
                blockList = block;
                block->nStart = needle + qPos;
                block->nEnd = block->nStart + size;
                block->hStart = haystack + tPos - targetOffset;
                block->hEnd = block->hStart + size;
                tPos += size;
                qPos += size;
                break;
            case 'I': // inserted in query
            case 'S': // skipped query bases at beginning or end ("soft clipping")
                qPos += size;
                break;
            case 'D': // deleted from query
            case 'N': // long deletion from query (intron as opposed to small del)
                tPos += size;
                break;
            case 'H': // hard clipping: bases not stored in the record
            case 'P': // silent deletion from padded reference sequence -- ignored
                break;
            default:
                errAbort("bamToFfAli: unrecognized CIGAR op %c -- update me", op);
            }
        }
    blockList = ffMakeRightLinks(blockList);
    ffCountGoodEnds(blockList);
    return blockList;
}
/*
 * Return the soft-clip length at one end of the alignment.
 *
 * b:      alignment to inspect.
 * is_end: 0 to look at the first CIGAR element, non-zero for the last one.
 *
 * Returns the element's length if it is a soft clip, otherwise 0. An
 * alignment without any CIGAR element has no clipping, so it yields 0
 * instead of reading outside the CIGAR array.
 */
static int32_t sw_align_get_soft_clip(bam1_t *b, int32_t is_end)
{
    uint32_t elem;

    if (0 == b->core.n_cigar)
        return 0;

    /* kept unsigned so that extracting the length is a logical shift */
    elem = bam1_cigar(b)[(0 == is_end) ? 0 : b->core.n_cigar - 1];
    if (BAM_CSOFT_CLIP == (elem & BAM_CIGAR_MASK)) /* soft-clipping */
        return (int32_t)(elem >> BAM_CIGAR_SHIFT);
    return 0;
}
/**
 * Create a new alignment_t from a BAM record.
 *
 * @param bam_p         source record
 * @param base_quality  quality offset forwarded to
 *                      convert_to_quality_string_length()
 * @return newly allocated alignment; every inner buffer is allocated here too
 *
 * The CIGAR buffer is sized from the converted string itself (never smaller
 * than the historical minimum), because four bytes per operation are not
 * enough once operation lengths have more than three digits.
 *
 * NOTE(review): the strings returned by convert_to_sequence_string() and
 * convert_to_cigar_string() are copied and not released here; confirm who
 * owns them. Allocation results are not checked.
 */
alignment_t* alignment_new_by_bam(bam1_t* bam_p, int base_quality) {
    //memory allocation for the structure
    alignment_t* alignment_p = (alignment_t*) calloc(1, sizeof(alignment_t));

    //numeric data
    alignment_p->num_cigar_operations = (int) bam_p->core.n_cigar;
    alignment_p->chromosome = bam_p->core.tid;
    alignment_p->position = bam_p->core.pos;
    alignment_p->mate_chromosome = bam_p->core.mtid;
    alignment_p->mate_position = bam_p->core.mpos;
    alignment_p->map_quality = bam_p->core.qual;
    alignment_p->template_length = bam_p->core.isize;

    //memory allocation for inner fields according to indicated sizes
    alignment_p->query_name = (char*) calloc(bam_p->core.l_qname, sizeof(char));
    alignment_p->sequence = (char*) calloc(bam_p->core.l_qseq + 1, sizeof(char));
    alignment_p->quality = (char*) calloc(bam_p->core.l_qseq + 1, sizeof(char));   //same length as sequence
    alignment_p->optional_fields = (uint8_t*) calloc(bam_p->l_aux, sizeof(uint8_t));
    alignment_p->optional_fields_length = bam_p->l_aux;

    //copy the data between structures
    strcpy(alignment_p->query_name, bam1_qname(bam_p));
    strcpy(alignment_p->sequence, convert_to_sequence_string(bam1_seq(bam_p), bam_p->core.l_qseq));
    convert_to_quality_string_length(alignment_p->quality, bam1_qual(bam_p), bam_p->core.l_qseq, base_quality);

    //cigar: convert first, then allocate a buffer that is guaranteed to hold it
    {
        char* cigar_string = convert_to_cigar_string(bam1_cigar(bam_p), alignment_p->num_cigar_operations);
        size_t needed = strlen(cigar_string) + 1;
        size_t capacity = (size_t) max(MIN_ALLOCATED_SIZE_FOR_CIGAR_STRING, alignment_p->num_cigar_operations << 2);
        if (capacity < needed) {
            capacity = needed;
        }
        alignment_p->cigar = (char*) calloc(capacity, sizeof(char));
        strcpy(alignment_p->cigar, cigar_string);
    }

    memcpy(alignment_p->optional_fields, bam1_aux(bam_p), bam_p->l_aux);

    //flags
    uint32_t flag = (uint32_t) bam_p->core.flag;
    alignment_p->is_paired_end = (flag & BAM_FPAIRED) ? 1 : 0;
    alignment_p->is_paired_end_mapped = (flag & BAM_FPROPER_PAIR) ? 1 : 0;
    alignment_p->is_seq_mapped = (flag & BAM_FUNMAP) ? 0 : 1;   //in bam structure is negative flag!!!
    alignment_p->is_mate_mapped = (flag & BAM_FMUNMAP) ? 0 : 1; //in bam structure is negative flag!!!
    alignment_p->seq_strand = (flag & BAM_FREVERSE) ? 1 : 0;
    alignment_p->mate_strand = (flag & BAM_FMREVERSE) ? 1 : 0;

    if (flag & BAM_FREAD1) {
        alignment_p->pair_num = 1;
    } else if (flag & BAM_FREAD2) {
        alignment_p->pair_num = 2;
    } else {
        alignment_p->pair_num = 0;
    }

    // NOTE(review): this stores the raw BAM_FSECONDARY bit under the name
    // primary_alignment; left unchanged because other code may rely on it.
    alignment_p->primary_alignment = (flag & BAM_FSECONDARY) ? 1 : 0;
    alignment_p->fails_quality_check = (flag & BAM_FQCFAIL) ? 1 : 0;
    alignment_p->pc_optical_duplicate = (flag & BAM_FDUP) ? 1 : 0;

    return alignment_p;
}
/*
 * Refill the read buffer from `in`.
 *
 * Leading cells whose record pointer is NULL are squeezed out first. Records
 * are then appended in blocks of BLOCK_SIZE; reading continues past a block
 * boundary while the current mapped read starts at or before the smallest
 * end coordinate (rpos) of any buffered reverse-strand read.
 *
 * For each appended element: score is the sum of (quality + 1) divided by
 * sqrt(l_qseq + 1), or -1 for reads treated as unmapped; rpos is the
 * alignment end for mapped reverse-strand reads and -1 otherwise.
 * buf->x is set to the index of the first record on a different reference
 * than the first one, or to buf->n if the last read succeeded without one.
 *
 * Returns the number of elements in the buffer.
 */
static int fill_buf(samfile_t *in, buffer_t *buf)
{
    int i, ret, last_tid, min_rpos = 0x7fffffff, capacity;
    bam1_t *b = bam_init1();
    bam1_core_t *c = &b->core;

    // squeeze out the empty cells at the beginning
    for (i = 0; i < buf->n; ++i)
        if (buf->buf[i].b) break;
    if (i < buf->n) { // squeeze
        if (i > 0) {
            memmove(buf->buf, buf->buf + i, sizeof(elem_t) * (buf->n - i));
            buf->n = buf->n - i;
        }
    } else buf->n = 0;

    // calculate min_rpos
    for (i = 0; i < buf->n; ++i) {
        elem_t *e = buf->buf + i;
        if (e->b && e->rpos >= 0 && e->rpos < min_rpos)
            min_rpos = buf->buf[i].rpos;
    }

    // fill the buffer
    buf->x = -1;
    last_tid = buf->n ? buf->buf[0].b->core.tid : -1;
    capacity = buf->n + BLOCK_SIZE;
    while ((ret = samread(in, b)) >= 0) {
        elem_t *e;
        uint8_t *qual = bam1_qual(b);
        int is_mapped;

        if (last_tid < 0) last_tid = c->tid;
        if (c->tid != last_tid) {
            if (buf->x < 0) buf->x = buf->n;
        }
        if (buf->n >= buf->max) { // enlarge
            buf->max = buf->max ? buf->max << 1 : 8;
            buf->buf = (elem_t*)realloc(buf->buf, sizeof(elem_t) * buf->max);
        }
        e = &buf->buf[buf->n++];
        e->b = bam_dup1(b);
        e->rpos = -1;
        e->score = 0;
        for (i = 0; i < c->l_qseq; ++i) e->score += qual[i] + 1;
        e->score = (double)e->score / sqrt(c->l_qseq + 1);
        is_mapped = (c->tid < 0 || c->tid >= in->header->n_targets || (c->flag & BAM_FUNMAP)) ? 0 : 1;
        if (!is_mapped) e->score = -1;
        if (is_mapped && (c->flag & BAM_FREVERSE)) {
            // bam_calend() already returns the end coordinate (it starts from
            // core.pos), so the start must not be added a second time
            e->rpos = bam_calend(&b->core, bam1_cigar(b));
            if (min_rpos > e->rpos) min_rpos = e->rpos;
        }
        if (buf->n >= capacity) {
            if (is_mapped && c->pos <= min_rpos) capacity += BLOCK_SIZE;
            else break;
        }
    }
    if (ret >= 0 && buf->x < 0) buf->x = buf->n;
    bam_destroy1(b);
    return buf->n;
}
// This function reads a BAM alignment from one BAM file. static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup { aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure int ret = aux->iter? bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b); if (!(b->core.flag&BAM_FUNMAP)) { if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP; else if (aux->min_len && bam_cigar2qlen(&b->core, bam1_cigar(b)) < aux->min_len) b->core.flag |= BAM_FUNMAP; } return ret; }
/*
 * Convert alignments against a padded reference into alignments against the
 * unpadded reference, copying every record from `in` to `out`.
 *
 * A record at position 0 whose name equals the name of its reference is
 * treated as the padded reference itself: its unpadded sequence is kept in
 * `r`, its CIGAR becomes a single match over l_qseq, and `posmap` (padded to
 * unpadded coordinate) is rebuilt from it. Every other record gets a CIGAR
 * recomputed column by column against `r` and its position translated
 * through `posmap`.
 *
 * Always returns 0.
 *
 * NOTE(review): a read that appears before any reference record would use
 * `r` and `posmap` while they are still empty; the input is presumably
 * required to list each reference first -- confirm.
 */
int bam_pad2unpad(bamFile in, bamFile out)
{
    bam_header_t *h;
    bam1_t *b;
    kstring_t r, q;
    uint32_t *cigar2 = 0;
    int n2 = 0, m2 = 0, *posmap = 0;

    h = bam_header_read(in);
    bam_header_write(out, h);
    b = bam_init1();
    r.l = r.m = q.l = q.m = 0;
    r.s = q.s = 0;

    while (bam_read1(in, b) >= 0) {
        uint32_t *cigar = bam1_cigar(b);
        n2 = 0;
        if (b->core.pos == 0 && b->core.tid >= 0
            && strcmp(bam1_qname(b), h->target_name[b->core.tid]) == 0) {
            /* the padded reference itself */
            int i, k;
            unpad_seq(b, &r);
            write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH));
            replace_cigar(b, n2, cigar2);
            posmap = realloc(posmap, r.m * sizeof(int));
            for (i = k = 0; i < r.l; ++i) {
                posmap[i] = k; // note that a read should NOT start at a padding
                if (r.s[i]) ++k;
            }
        } else {
            int i, k, op;
            unpad_seq(b, &q);
            if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP)
                write_cigar(cigar2, n2, m2, cigar[0]);
            /* classify every column by what the read and the reference hold */
            for (i = 0, k = b->core.pos; i < q.l; ++i, ++k)
                q.s[i] = q.s[i] ? (r.s[k] ? BAM_CMATCH : BAM_CINS)
                                : (r.s[k] ? BAM_CDEL : BAM_CPAD);
            /* run-length encode the per-column operations */
            for (i = k = 1, op = q.s[0]; i < q.l; ++i) {
                if (op != q.s[i]) {
                    write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
                    op = q.s[i];
                    k = 1;
                } else ++k;
            }
            write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
            if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CSOFT_CLIP)
                write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]);
            /* merge match-pad-match runs: the pad is dropped and the two
             * matches are folded into one element */
            for (i = 2; i < n2; ++i)
                if (bam_cigar_op(cigar2[i]) == BAM_CMATCH
                    && bam_cigar_op(cigar2[i-1]) == BAM_CPAD
                    && bam_cigar_op(cigar2[i-2]) == BAM_CMATCH)
                    cigar2[i] += cigar2[i-2], cigar2[i-2] = cigar2[i-1] = 0;
            /* compact away the elements zeroed above */
            for (i = k = 0; i < n2; ++i)
                if (cigar2[i]) cigar2[k++] = cigar2[i];
            n2 = k;
            replace_cigar(b, n2, cigar2);
            b->core.pos = posmap[b->core.pos];
        }
        bam_write1(out, b);
    }

    free(r.s);
    free(q.s);
    free(posmap);
    /* scratch CIGAR array grown by write_cigar (presumably heap-allocated
     * there -- it starts out NULL here); it was previously never released */
    free(cigar2);
    bam_destroy1(b);
    bam_header_destroy(h);
    return 0;
}
char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of) { uint8_t *s = bam1_seq(b), *t = bam1_qual(b); int i; const bam1_core_t *c = &b->core; kstring_t str; str.l = str.m = 0; str.s = 0; ksprintf(&str, "%s\t", bam1_qname(b)); if (of == BAM_OFDEC) ksprintf(&str, "%d\t", c->flag); else if (of == BAM_OFHEX) ksprintf(&str, "0x%x\t", c->flag); else { // BAM_OFSTR for (i = 0; i < 16; ++i) if ((c->flag & 1<<i) && bam_flag2char_table[i]) kputc(bam_flag2char_table[i], &str); kputc('\t', &str); } if (c->tid < 0) kputs("*\t", &str); else ksprintf(&str, "%s\t", header->target_name[c->tid]); ksprintf(&str, "%d\t%d\t", c->pos + 1, c->qual); if (c->n_cigar == 0) kputc('*', &str); else { for (i = 0; i < c->n_cigar; ++i) ksprintf(&str, "%d%c", bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, "MIDNSHP"[bam1_cigar(b)[i]&BAM_CIGAR_MASK]); } kputc('\t', &str); if (c->mtid < 0) kputs("*\t", &str); else if (c->mtid == c->tid) kputs("=\t", &str); else ksprintf(&str, "%s\t", header->target_name[c->mtid]); ksprintf(&str, "%d\t%d\t", c->mpos + 1, c->isize); if (c->l_qseq) { for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str); kputc('\t', &str); if (t[0] == 0xff) kputc('*', &str); else for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str); } else ksprintf(&str, "*\t*"); s = bam1_aux(b); while (s < b->data + b->data_len) { uint8_t type, key[2]; key[0] = s[0]; key[1] = s[1]; s += 2; type = *s; ++s; ksprintf(&str, "\t%c%c:", key[0], key[1]); if (type == 'A') { ksprintf(&str, "A:%c", *s); ++s; } else if (type == 'C') { ksprintf(&str, "i:%u", *s); ++s; } else if (type == 'c') { ksprintf(&str, "i:%d", *(int8_t*)s); ++s; } else if (type == 'S') { ksprintf(&str, "i:%u", *(uint16_t*)s); s += 2; } else if (type == 's') { ksprintf(&str, "i:%d", *(int16_t*)s); s += 2; } else if (type == 'I') { ksprintf(&str, "i:%u", *(uint32_t*)s); s += 4; } else if (type == 'i') { ksprintf(&str, "i:%d", *(int32_t*)s); s += 4; } else if (type == 'f') { ksprintf(&str, "f:%g", 
*(float*)s); s += 4; } else if (type == 'd') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; } else if (type == 'Z' || type == 'H') { ksprintf(&str, "%c:", type); while (*s) kputc(*s++, &str); ++s; } } return str.s; }
/* Return the end coordinate of <sam_alignment> as computed by bam_calend().
   The value is computed on first use and cached in the alignment; the cache
   is considered empty while it holds GT_UNDEF_UWORD. */
GtUword gt_sam_alignment_rightmost_pos(GtSamAlignment *sam_alignment)
{
  gt_assert(sam_alignment != NULL);
  if (sam_alignment->rightmost != GT_UNDEF_UWORD)
    return sam_alignment->rightmost;

  sam_alignment->rightmost =
    (GtUword) bam_calend(&sam_alignment->s_alignment->core,
                         bam1_cigar(sam_alignment->s_alignment));
  return sam_alignment->rightmost;
}
/* Return the end coordinate of <sam_alignment> as computed by bam_calend().
   The value is computed on first use and cached in the alignment; the cache
   is considered empty while it holds GT_UNDEF_ULONG. */
unsigned long gt_sam_alignment_rightmost_pos(GtSamAlignment *sam_alignment)
{
  gt_assert(sam_alignment != NULL);
  if (sam_alignment->rightmost != GT_UNDEF_ULONG)
    return sam_alignment->rightmost;

  sam_alignment->rightmost =
    (unsigned long) bam_calend(&sam_alignment->s_alignment->core,
                               bam1_cigar(sam_alignment->s_alignment));
  return sam_alignment->rightmost;
}
// Adapted from samtools/bam.c int32_t b2g_bam_clippedlength(bam1_t *b) { const bam1_core_t c = b->core; const uint32_t *cigar = bam1_cigar(b); uint32_t k; int32_t l = 0; for (k = 0; k < c.n_cigar; ++k) { if ('S' == bam_cigar_opchr(bam_cigar_op(cigar[k]))) { l += bam_cigar_oplen(cigar[k]); } } return l; }
static double TGM_GetMismatchRate(const bam1_t* pAlignment) { uint32_t* cigar = bam1_cigar(pAlignment); unsigned int numMismatch = 0; for (unsigned i = 0; i != pAlignment->core.n_cigar; ++i) { int type = (cigar[i] & BAM_CIGAR_MASK); if (type == BAM_CINS || type == BAM_CDEL || type == BAM_CSOFT_CLIP || type == BAM_CMISMATCH) { numMismatch += (cigar[i] >> BAM_CIGAR_SHIFT); } }
/* from char *bam_format1_core(const bam_header_t *header, const * bam1_t *b, int of) */ char * cigar_str_from_bam(const bam1_t *b) { const bam1_core_t *c = &b->core; kstring_t str; int i; str.l = str.m = 0; str.s = 0; for (i = 0; i < c->n_cigar; ++i) { kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str); kputc("MIDNSHP=X"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str); } return str.s; }
void bamUnpackCigar(const bam1_t *bam, struct dyString *dyCigar)
/* Append the textual form of bam's packed CIGAR to dyCigar: for every
 * element, its length in decimal followed by its operation character. */
{
    const bam1_core_t *core = &bam->core;
    unsigned int *packed = bam1_cigar(bam);
    int ix = 0;

    while (ix < core->n_cigar)
        {
        char opChar;
        int opLen = bamUnpackCigarElement(packed[ix], &opChar);
        dyStringPrintf(dyCigar, "%d", opLen);
        dyStringAppendC(dyCigar, opChar);
        ix++;
        }
}
static int countBam(const bam1_t *bam, void *data) /* bam_fetch() calls this on each bam alignment retrieved. */ { struct bamWigTrackData *btd = (struct bamWigTrackData *)data; const bam1_core_t *core = &bam->core; int tLength=0, tPos = core->pos, qPos = 0; unsigned int *cigar = bam1_cigar(bam); int i; double scale = btd->scale; for (i = 0; i < core->n_cigar; i++) { char op; int n = bamUnpackCigarElement(cigar[i], &op); switch (op) { case 'X': // mismatch (gapless aligned block) case '=': // match (gapless aligned block) case 'M': // match or mismatch (gapless aligned block) { int start = (int)(scale * (tPos - winStart)); int end = (int)(scale * ((tPos + n) - winStart)); for(i=start; i < end; i++) btd->preDraw[i + btd->preDrawZero].count++; tPos = tPos + n; qPos = qPos + n; tLength += n; break; } case 'I': // inserted in query qPos += n; break; case 'D': // deleted from query case 'N': // long deletion from query (intron as opposed to small del) tPos += n; tLength += n; break; case 'S': // skipped query bases at beginning or end ("soft clipping") case 'H': // skipped query bases not stored in record's query sequence ("hard clipping") case 'P': // P="silent deletion from padded reference sequence" -- ignore these. break; default: errAbort("countBam: unrecognized CIGAR op %c -- update me", op); } } return 0; }
struct simpleFeature *sfFromNumericCigar(const bam1_t *bam, int *retLength) /* Translate BAM's numeric CIGAR encoding into a list of simpleFeatures, * and tally up length on reference sequence while we're at it. */ { const bam1_core_t *core = &bam->core; struct simpleFeature *sf, *sfList = NULL; int tLength=0, tPos = core->pos, qPos = 0; unsigned int *cigar = bam1_cigar(bam); int i; for (i = 0; i < core->n_cigar; i++) { char op; int n = bamUnpackCigarElement(cigar[i], &op); switch (op) { case 'X': // mismatch (gapless aligned block) case '=': // match (gapless aligned block) case 'M': // match or mismatch (gapless aligned block) AllocVar(sf); sf->start = tPos; sf->qStart = qPos; tPos = sf->end = tPos + n; qPos = sf->qEnd = qPos + n; slAddHead(&sfList, sf); tLength += n; break; case 'I': // inserted in query qPos += n; break; case 'D': // deleted from query case 'N': // long deletion from query (intron as opposed to small del) tPos += n; tLength += n; break; case 'S': // skipped query bases at beginning or end ("soft clipping") case 'H': // skipped query bases not stored in record's query sequence ("hard clipping") case 'P': // P="silent deletion from padded reference sequence" -- ignore these. break; default: errAbort("sfFromNumericCigar: unrecognized CIGAR op %c -- update me", op); } } if (retLength != NULL) *retLength = tLength; slReverse(&sfList); return sfList; }
void bamShowCigarEnglish(const bam1_t *bam)
/* Print out cigar in English e.g. "20 (mis)Match, 1 Deletion, 3 (mis)Match".
 * Elements are separated by ", "; an unknown operation aborts via errAbort. */
{
    const bam1_core_t *core = &bam->core;
    unsigned int *packed = bam1_cigar(bam);
    int ix;

    for (ix = 0; ix < core->n_cigar; ix++)
        {
        const char *label = NULL;
        char op;
        int len = bamUnpackCigarElement(packed[ix], &op);

        if (ix > 0)
            printf(", ");

        if (op == 'M')       // match or mismatch (gapless aligned block)
            label = "(mis)Match";
        else if (op == '=')  // match
            label = "Match";
        else if (op == 'X')  // mismatch
            label = "Mismatch";
        else if (op == 'I')  // inserted in query
            label = "Insertion";
        else if (op == 'S')  // skipped query bases at beginning or end ("soft clipping")
            label = "Skipped";
        else if (op == 'D')  // deleted from query
            label = "Deletion";
        else if (op == 'N')  // long deletion from query (intron as opposed to small del)
            label = "deletioN";
        else if (op == 'H')  // query bases not stored in the record ("hard clipping")
            label = "Hard clipped query";
        else if (op == 'P')  // silent deletion from padded reference sequence
            label = "Padded / silent deletion";

        if (label == NULL)
            errAbort("bamShowCigarEnglish: unrecognized CIGAR op %c -- update me", op);
        else
            printf("%d %s", len, label);
        }
}
void GBamRecord::add_sequence(const char* qseq, int slen) { //must be called AFTER set_cigar (cannot replace existing sequence for now) if (qseq==NULL) return; //should we ever care about this? if (slen<0) slen=strlen(qseq); int doff = b->core.l_qname + b->core.n_cigar * 4; if (strcmp(qseq, "*")!=0) { b->core.l_qseq=slen; if (b->core.n_cigar && b->core.l_qseq != (int32_t)bam_cigar2qlen(&b->core, bam1_cigar(b))) GError("Error: CIGAR and sequence length are inconsistent!(%s)\n", qseq); uint8_t* p = (uint8_t*)realloc_bdata(b, doff + (b->core.l_qseq+1)/2 + b->core.l_qseq) + doff; //also allocated quals memory memset(p, 0, (b->core.l_qseq+1)/2); for (int i = 0; i < b->core.l_qseq; ++i) p[i/2] |= bam_nt16_table[(int)qseq[i]] << 4*(1-i%2); } else b->core.l_qseq = 0; }
/**
 * @brief Updates a buffered_read with information from bamread
 *
 * @param bufread Buffered read that receives the data
 * @param bamread Read info stored in the samtools struct bam1_t
 * @param rm      Results from quality_check()
 * @return void
 * @details Copies the packed CIGAR and the core fields out of bamread, takes
 *          read length, strand and genomic end from rm, marks the buffered
 *          read as not written, and destroys bamread before returning. The
 *          CIGAR buffer is reallocated when the CIGAR's size in bytes exceeds
 *          MAX_NCIGAR.
 */
void store_read(buffered_read_t *bufread, bam1_t *bamread, read_metrics_t *rm)
{
    const size_t cigar_bytes = sizeof(uint32_t) * bamread->core.n_cigar;

    /* CIGAR: grow the destination first if needed, then copy */
    if (cigar_bytes > MAX_NCIGAR)
        bufread->cigar = (uint32_t *)Realloc(bufread->cigar, bamread->core.n_cigar, uint32_t);
    memcpy(bufread->cigar, bam1_cigar(bamread), cigar_bytes);
    bufread->n_cigar = bamread->core.n_cigar;

    /* fields taken straight from the BAM core */
    bufread->chrom_index = bamread->core.tid;
    bufread->pos = bamread->core.pos; /* stored exactly as found in the record */
    bufread->tlen = bamread->core.isize;
    bufread->mapq = bamread->core.qual;
    bufread->proper_pair = bam1_ppair(bamread);

    /* fields precomputed by the quality check */
    bufread->l_seq = rm->read_length;
    bufread->revcomp = rm->revcomp;
    bufread->genomic_end = rm->genomic_end;

    bufread->written = 0;

    bam_destroy1(bamread);
}
void bamGetSoftClipping(const bam1_t *bam, int *retLow, int *retHigh, int *retClippedQLen) /* If retLow is non-NULL, set it to the number of "soft-clipped" (skipped) bases at * the beginning of the query sequence and quality; likewise for retHigh at end. * For convenience, retClippedQLen is the original query length minus soft clipping * (and the length of the query sequence that will be returned). */ { unsigned int *cigarPacked = bam1_cigar(bam); const bam1_core_t *core = &bam->core; char op; int n = bamUnpackCigarElement(cigarPacked[0], &op); int low = (op == 'S') ? n : 0; n = bamUnpackCigarElement(cigarPacked[core->n_cigar-1], &op); int high = (op == 'S') ? n : 0; if (retLow != NULL) *retLow = low; if (retHigh != NULL) *retHigh = high; if (retClippedQLen != NULL) *retClippedQLen = (core->l_qseq - low - high); }
void GBamRecord::setupCoordinates() { const bam1_core_t *c = &b->core; if (c->flag & BAM_FUNMAP) return; /* skip unmapped reads */ uint32_t *p = bam1_cigar(b); //--- prevent alignment error here (reported by UB-sanitazer): uint32_t *cigar= new uint32_t[c->n_cigar]; memcpy(cigar, p, c->n_cigar * sizeof(uint32_t)); //--- UBsan protection end int l=0; mapped_len=0; clipL=0; clipR=0; start=c->pos+1; //genomic start coordinate, 1-based (BAM core.pos is 0-based) int exstart=c->pos; for (int i = 0; i < c->n_cigar; ++i) { int op = cigar[i]&0xf; if (op == BAM_CMATCH || op==BAM_CEQUAL || op == BAM_CDIFF || op == BAM_CDEL) { l += cigar[i]>>4; } else if (op == BAM_CREF_SKIP) { //N
/*
 * Expand the aligned part of read b into one cell per alignment column.
 *
 * For every base of a BAM_CMATCH element the 4-bit base code is written; for
 * every base of a BAM_CDEL element a 0 is written; soft-clipped bases are
 * skipped. On return s->l is the number of cells written.
 *
 * The buffer is sized for the number of cells actually produced (match plus
 * deletion lengths) and never smaller than l_qseq, since deletions add cells
 * that have no query base behind them.
 *
 * Only BAM_CMATCH, BAM_CDEL and BAM_CSOFT_CLIP are accepted (asserted).
 */
static void unpad_seq(bam1_t *b, kstring_t *s)
{
    int k, j, i, n_cells = 0;
    uint32_t *cigar = bam1_cigar(b);
    uint8_t *seq = bam1_seq(b);

    /* first pass: how many cells will be written */
    for (k = 0; k < b->core.n_cigar; ++k) {
        int op = bam_cigar_op(cigar[k]);
        if (op == BAM_CMATCH || op == BAM_CDEL)
            n_cells += bam_cigar_oplen(cigar[k]);
    }
    if (n_cells < b->core.l_qseq)
        n_cells = b->core.l_qseq; /* keep at least the capacity callers had before */
    ks_resize(s, n_cells);

    /* second pass: j walks the query, s->l walks the output */
    for (k = 0, s->l = 0, j = 0; k < b->core.n_cigar; ++k) {
        int op, ol;
        op = bam_cigar_op(cigar[k]);
        ol = bam_cigar_oplen(cigar[k]);
        assert(op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CSOFT_CLIP);
        if (op == BAM_CMATCH) {
            /* one query base per column: advance j with every base copied */
            for (i = 0; i < ol; ++i, ++j)
                s->s[s->l++] = bam1_seqi(seq, j);
        } else if (op == BAM_CSOFT_CLIP) {
            j += ol;
        } else { /* BAM_CDEL: columns without a query base */
            for (i = 0; i < ol; ++i)
                s->s[s->l++] = 0;
        }
    }
}
/*
 * Count the mismatching bases of an alignment.
 *
 * The count is the total length of all insertion and deletion CIGAR elements
 * plus, when an "MD" tag is present, the length of every non-numeric run in
 * the MD string that does not start with '^' (runs starting with '^' are
 * skipped here, deletions being counted from the CIGAR already).
 */
int TGM_GetNumMismatchFromBam(const bam1_t* pAlgn)
{
    const uint32_t* cigar = bam1_cigar(pAlgn);
    int total = 0;

    /* insertions and deletions, from the CIGAR */
    for (unsigned k = 0; k != pAlgn->core.n_cigar; ++k)
    {
        int op = (cigar[k] & BAM_CIGAR_MASK);
        if (op == BAM_CINS || op == BAM_CDEL)
            total += (cigar[k] >> BAM_CIGAR_SHIFT);
    }

    uint8_t* mdTag = bam_aux_get(pAlgn, "MD");
    if (mdTag == NULL)
        return total;

    /* substitutions, from the MD string */
    const char* cursor = bam_aux2Z(mdTag);
    while (cursor != NULL && *cursor != '\0')
    {
        if (isdigit(*cursor))
        {
            ++cursor;
            continue;
        }

        /* find the end of this non-numeric run */
        const char* runEnd = cursor + 1;
        while (!isdigit(*runEnd) && *runEnd != '\0')
            ++runEnd;

        if (*cursor != '^')
            total += runEnd - cursor;

        cursor = runEnd;
    }

    return total;
}