Beispiel #1
0
/*
 * Build a "template CIGAR" for a pair of alignments in `str` and append it
 * to the leftmost record as a "CT" aux tag of type 'Z'.
 *
 * Layout of the string: for the leftmost read, a segment index ('1' if
 * BAM_FREAD1 is set, else '2'), a strand letter ('R' or 'F') and its CIGAR;
 * then the distance from the end of the leftmost read to the start of the
 * other read followed by 'T'; then the same three parts for the other read.
 *
 * `str` is reset to length 0 on entry.  Nothing is appended to either
 * record when the reads are on different references or b1 has tid < 0.
 */
void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str)
{
	bam1_t *left, *right;
	uint32_t *ops;
	int k, left_end;

	str->l = 0;
	// coordinateless, or the two reads are not on the same reference: skip
	if (b1->core.tid < 0 || b1->core.tid != b2->core.tid) return;

	// order the two records by start coordinate
	if (b1->core.pos > b2->core.pos) {
		left = b2; right = b1;
	} else {
		left = b1; right = b2;
	}

	// leftmost read: segment index, strand, CIGAR
	kputc((left->core.flag & BAM_FREAD1) ? '1' : '2', str);
	kputc((left->core.flag & BAM_FREVERSE) ? 'R' : 'F', str);
	ops = bam1_cigar(left);
	for (k = 0; k < left->core.n_cigar; ++k) {
		kputw(bam_cigar_oplen(ops[k]), str);
		kputc(bam_cigar_opchr(ops[k]), str);
	}

	// gap between the end of the left read and the start of the right read
	left_end = bam_calend(&left->core, ops);
	kputw(right->core.pos - left_end, str);
	kputc('T', str);

	// rightmost read: segment index, strand, CIGAR
	kputc((right->core.flag & BAM_FREAD1) ? '1' : '2', str);
	kputc((right->core.flag & BAM_FREVERSE) ? 'R' : 'F', str);
	ops = bam1_cigar(right);
	for (k = 0; k < right->core.n_cigar; ++k) {
		kputw(bam_cigar_oplen(ops[k]), str);
		kputc(bam_cigar_opchr(ops[k]), str);
	}

	// length passed is str->l + 1, i.e. one byte beyond the characters written
	bam_aux_append(left, "CT", 'Z', str->l+1, (uint8_t*)str->s);
}
static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos)
{
	unsigned k;
	bam1_t *b = p->b;
	bam1_core_t *c = &b->core;
	uint32_t x = c->pos, y = 0;
	int ret = 1, is_restart = 1;

	if (c->flag&BAM_FUNMAP) return 0; // unmapped read
	assert(x <= pos); // otherwise a bug
	p->qpos = -1; p->indel = 0; p->is_del = p->is_head = p->is_tail = 0;
	for (k = 0; k < c->n_cigar; ++k) {
		int op = bam1_cigar(b)[k] & BAM_CIGAR_MASK; // operation
		int l = bam1_cigar(b)[k] >> BAM_CIGAR_SHIFT; // length
		if (op == BAM_CMATCH) { // NOTE: this assumes the first and the last operation MUST BE a match or a clip
			if (x + l > pos) { // overlap with pos
				p->indel = p->is_del = 0;
				p->qpos = y + (pos - x);
				if (x == pos && is_restart) p->is_head = 1;
				if (x + l - 1 == pos) { // come to the end of a match
					if (k < c->n_cigar - 1) { // there are additional operation(s)
						uint32_t cigar = bam1_cigar(b)[k+1]; // next CIGAR
						int op_next = cigar&BAM_CIGAR_MASK; // next CIGAR operation
						if (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del
						else if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins
						if (op_next == BAM_CSOFT_CLIP || op_next == BAM_CREF_SKIP || op_next == BAM_CHARD_CLIP)
							p->is_tail = 1; // tail
					} else p->is_tail = 1; // this is the last operation; set tail
				}
Beispiel #3
0
 /* Replace this record's CIGAR with the one parsed from the SAM-style text
  * `cigar`, keeping any data already stored after the old CIGAR.
  *
  * A NULL pointer or "*" means "no CIGAR": n_cigar becomes 0, a mapped record
  * is flagged BAM_FUNMAP (with a warning) and the bin is computed from
  * pos..pos+1.  Otherwise every operation letter is packed into the record;
  * 'M', '=' and 'X' are all stored as BAM_CMATCH.  Malformed input is
  * reported through GError().
  */
 void GBamRecord::set_cigar(const char* cigar) {
   //requires b->core.pos and b->core.flag to have been set properly PRIOR to this call
   int doff=b->core.l_qname;
   uint8_t* after_cigar=NULL;
   int after_cigar_len=0;
   uint8_t* prev_bdata=NULL;
   if (b->data_len>doff) {
      //cigar string already allocated, replace it
      int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data
      after_cigar=b->data+d;
      after_cigar_len=b->data_len-d;
      }
   const char *s;
   char *t;
   int i, op;
   long x;
   b->core.n_cigar = 0;
   if (cigar != NULL && strcmp(cigar, "*") != 0) {
        //first pass: count the operations and reject stray characters
        for (s = cigar; *s; ++s) {
            //ctype functions need an unsigned char value
            unsigned char ch = (unsigned char)*s;
            //'=' is a valid operation but not alphabetic, so test it explicitly;
            //otherwise the "op == '='" branch below could never be reached
            if (isalpha(ch) || ch == '=') b->core.n_cigar++;
            else if (!isdigit(ch)) {
                 GError("Error: invalid CIGAR character (%s)\n",cigar);
                 }
            }
        if (after_cigar_len>0) { //replace/insert into existing full data
             //NOTE(review): dupalloc_bdata presumably installs a new buffer and
             //returns the previous one, which after_cigar still points into
             prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len);
             memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len);
             free(prev_bdata);
             }
           else {
             realloc_bdata(b, doff + b->core.n_cigar * 4);
             }
        //second pass: convert each "<length><op>" pair to its packed form
        for (i = 0, s = cigar; i != b->core.n_cigar; ++i) {
            x = strtol(s, &t, 10);
            op = toupper((unsigned char)*t);
            if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH;
            else if (op == 'I') op = BAM_CINS;
            else if (op == 'D') op = BAM_CDEL;
            else if (op == 'N') op = BAM_CREF_SKIP; //has_Introns=true;
            else if (op == 'S') op = BAM_CSOFT_CLIP; //soft_Clipped=true;
            else if (op == 'H') op = BAM_CHARD_CLIP; //hard_Clipped=true;
            else if (op == 'P') op = BAM_CPAD;
            else GError("Error: invalid CIGAR operation (%s)\n",cigar);
            s = t + 1;
            bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op;
        }
        //anything left over is a length without an operation letter
        if (*s) GError("Error: unmatched CIGAR operation (%s)\n",cigar);
        b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b)));
    } else {//no CIGAR string given
        if (!(b->core.flag&BAM_FUNMAP)) {
            GMessage("Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data);
            b->core.flag |= BAM_FUNMAP;
        }
        b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1);
    }
   setupCoordinates();
   } //set_cigar()
Beispiel #4
0
// currently, this function ONLY works if each read has one hit
/*
 * Copy alignments from `in` to `out`, filling in the mate fields of
 * consecutive records that share a query name.
 *
 * The header is read from `in` and written to `out` first.  Two record
 * buffers are used alternately: `cur` is the record just read and `pre` the
 * one held back from the previous iteration.  When both carry the same name
 * they are treated as a pair: mtid/mpos, isize, BAM_FMREVERSE and
 * BAM_FMUNMAP are updated on both, a template CIGAR is attached via
 * bam_template_cigar(), and both are written.  Otherwise `pre` is written
 * with its mate fields cleared and `cur` becomes the held-back record.
 *
 * Records with tid < 0 and records flagged BAM_FSECONDARY are skipped and
 * never written.
 */
void bam_mating_core(bamFile in, bamFile out)
{
	bam_header_t *header;
	bam1_t *b[2];
	int curr, has_prev, pre_end = 0, cur_end;
	kstring_t str;

	str.l = str.m = 0; str.s = 0;
	header = bam_header_read(in);
	bam_header_write(out, header);

	b[0] = bam_init1();
	b[1] = bam_init1();
	curr = 0; has_prev = 0;
	while (bam_read1(in, b[curr]) >= 0) {
		bam1_t *cur = b[curr], *pre = b[1-curr];
		// `continue` leaves curr unchanged, so a skipped record is overwritten by the next read
		if (cur->core.tid < 0) continue;
		cur_end = bam_calend(&cur->core, bam1_cigar(cur));
		// an alignment running past the end of its reference is marked unmapped
		if (cur_end > (int)header->target_len[cur->core.tid]) cur->core.flag |= BAM_FUNMAP;
		if (cur->core.flag & BAM_FSECONDARY) continue; // skip secondary alignments
		if (has_prev) {
			if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name
				cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos;
				pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos;
				if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))
					&& !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // set TLEN/ISIZE
				{
					// 5' coordinate: the alignment end for a reverse-strand read, its start otherwise
					uint32_t cur5, pre5;
					cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos;
					pre5 = (pre->core.flag&BAM_FREVERSE)? pre_end : pre->core.pos;
					cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5;
				} else cur->core.isize = pre->core.isize = 0;
				// mirror each read's strand into the other's mate-strand flag
				if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FMREVERSE;
				else cur->core.flag &= ~BAM_FMREVERSE;
				if (cur->core.flag&BAM_FREVERSE) pre->core.flag |= BAM_FMREVERSE;
				else pre->core.flag &= ~BAM_FMREVERSE;
				// an unmapped read marks its mate as "mate unmapped" and not properly paired
				if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; }
				if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; }
				bam_template_cigar(pre, cur, &str);
				bam_write1(out, pre);
				bam_write1(out, cur);
				has_prev = 0;
			} else { // unpaired or singleton
				pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0;
				if (pre->core.flag & BAM_FPAIRED) {
					pre->core.flag |= BAM_FMUNMAP;
					pre->core.flag &= ~BAM_FMREVERSE & ~BAM_FPROPER_PAIR;
				}
				bam_write1(out, pre);
				// has_prev stays set: cur is the held-back record for the next iteration
			}
		} else has_prev = 1;
		// swap buffers so the record just read becomes `pre`
		curr = 1 - curr;
		pre_end = cur_end;
	}
	// flush a record still held back when the input ends
	if (has_prev) bam_write1(out, b[1-curr]);
	bam_header_destroy(header);
	bam_destroy1(b[0]);
	bam_destroy1(b[1]);
	free(str.s);
}
Beispiel #5
0
int bamGetTargetLength(const bam1_t *bam)
/* Return the number of reference bases spanned by the alignment, computed
 * from bam's packed CIGAR: M, =, X, D and N operations add their length;
 * I, S, H and P add nothing.  Any other op character aborts via errAbort. */
{
    const bam1_core_t *core = &bam->core;
    unsigned int *packedOps = bam1_cigar(bam);
    int refSpan = 0;
    int ix = 0;
    while (ix < core->n_cigar)
    {
        char opChar;
        int opLen = bamUnpackCigarElement(packedOps[ix], &opChar);
        if (opChar == 'M' || opChar == '=' || opChar == 'X'
            || opChar == 'D' || opChar == 'N')
        {
            // operations that consume reference bases
            refSpan += opLen;
        }
        else if (opChar != 'I' && opChar != 'S' && opChar != 'H' && opChar != 'P')
        {
            errAbort("bamGetTargetLength: unrecognized CIGAR op %c -- update me", opChar);
        }
        ix++;
    }
    return refSpan;
}
Beispiel #6
0
/* Return the SAM operation character ('M', 'I', 'D', 'N', 'S', 'H', 'P',
   '=' or 'X') of the <i>-th CIGAR element of <sam_alignment>.  An unknown
   operation code terminates the program with GT_EXIT_PROGRAMMING_ERROR.
   <i> is not range-checked here. */
unsigned char gt_sam_alignment_cigar_i_operation(GtSamAlignment *sam_alignment,
                                                 uint16_t i)
{
  unsigned char opcode, symbol;

  gt_assert(sam_alignment != NULL);
  opcode = (unsigned char)
           (bam1_cigar(sam_alignment->s_alignment)[i] & BAM_CIGAR_MASK);
  switch (opcode) {
    case BAM_CMATCH:     symbol = 'M'; break;
    case BAM_CINS:       symbol = 'I'; break;
    case BAM_CDEL:       symbol = 'D'; break;
    case BAM_CREF_SKIP:  symbol = 'N'; break;
    case BAM_CSOFT_CLIP: symbol = 'S'; break;
    case BAM_CHARD_CLIP: symbol = 'H'; break;
    case BAM_CPAD:       symbol = 'P'; break;
    case BAM_CEQUAL:     symbol = '='; break;
    case BAM_CDIFF:      symbol = 'X'; break;
    default:
      exit(GT_EXIT_PROGRAMMING_ERROR);
  }
  return symbol;
}
Beispiel #7
0
/*
 * Feed one alignment into the pileup iterator.
 *
 * b == NULL signals the end of the input (iter->is_eof is set).  Otherwise
 * the record is copied into the tail node of the iterator's list unless it
 * is filtered out (tid < 0, a flag in iter->flag_mask, or too many reads
 * already buffered at the current position).
 *
 * Returns 0 on success or when the record is skipped, and -1 if the
 * iterator is already in the error state or the input turns out not to be
 * sorted (iter->error is then set).
 */
int bam_plp_push(bam_plp_t iter, const bam1_t *b)
{
	if (iter->error) return -1;
	if (b) {
		if (b->core.tid < 0) return 0;
		if (b->core.flag & iter->flag_mask) return 0;
		// depth cap: drop reads starting at the current position once the pool count exceeds maxcnt
		if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt) return 0;
		// the record is copied into the tail node before the sort checks below
		bam_copy1(&iter->tail->b, b);
		iter->tail->beg = b->core.pos; iter->tail->end = bam_calend(&b->core, bam1_cigar(b));
		iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
		if (b->core.tid < iter->max_tid) {
			fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n");
			iter->error = 1;
			return -1;
		}
		if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
			fprintf(stderr, "[bam_pileup_core] the input is not sorted (reads out of order)\n");
			iter->error = 1;
			return -1;
		}
		iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
		// keep the node only if it ends past the current position or lies on a later reference;
		// otherwise the tail node is left in place and reused by the next push
		if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
			iter->tail->next = mp_alloc(iter->mp);
			iter->tail = iter->tail->next;
		}
	} else iter->is_eof = 1;
	return 0;
}
Beispiel #8
0
/**
 * Dump a BAM record to stdout for debugging: query name, sequence, qualities
 * (each raw value shifted by base_quality), CIGAR, raw aux bytes, buffer
 * lengths, every core field and the individual flag bits.
 *
 * @param bam_p        record to print
 * @param base_quality offset added to each quality value before printing
 */
void bam_print(bam1_t* bam_p, int base_quality) {
    const bam1_core_t* core = &bam_p->core;
    const unsigned int flag = bam_p->core.flag;
    const char* quality;
    const char* optional_fields;
    int idx;

    printf("\n------------------------------------------------------------------->\n");
    printf("bam_p->data (qname): %s\n", bam1_qname(bam_p));
    printf("bam_p->data (seq):  %s\n", convert_to_sequence_string(bam1_seq(bam_p), core->l_qseq));

    // quality values, one character per base
    printf("bam_p->data (qual): ");
    quality = (char*) bam1_qual(bam_p);
    for (idx = 0; idx < core->l_qseq; idx++) {
        putchar(quality[idx] + base_quality);
    }
    printf("\n");

    printf("bam_p->data (cigar): %s\n", convert_to_cigar_string(bam1_cigar(bam_p), core->n_cigar));

    // aux (optional) data, printed byte by byte
    printf("bam_p->data (aux): ");
    optional_fields = (char*) bam1_aux(bam_p);
    for (idx = 0; idx < bam_p->l_aux; idx++) {
        putchar(optional_fields[idx]);
    }
    printf("\n");

    // buffer lengths
    printf("bam_p->l_aux: %i\n", bam_p->l_aux);
    printf("bam_p->data_len: %i\n", bam_p->data_len);
    printf("bam_p->m_data: %i\n", bam_p->m_data);

    // core fields
    printf("bam_p->core.tid: %i\n", core->tid);
    printf("bam_p->core.pos: %i\n", core->pos);
    printf("bam_p->core.bin: %u\n", core->bin);
    printf("bam_p->core.qual: %u\n", core->qual);
    printf("bam_p->core.l_qname: %u\n", core->l_qname);
    printf("bam_p->core.flag (16 bits): %u\n", flag);
    printf("bam_p->core.n_cigar: %u\n", core->n_cigar);
    printf("bam_p->core.l_qseq: %i\n", core->l_qseq);
    printf("bam_p->core.mtid: %i\n", core->mtid);
    printf("bam_p->core.mpos: %i\n", core->mpos);
    printf("bam_p->core.isize: %i\n", core->isize);

    // individual flag bits, each printed as 0 or 1
    printf("\nbam1_t.core flags\n");
    printf("-----------------------\n");
    printf("flag (is_paired_end): %i\n", (flag & BAM_FPAIRED) != 0);
    printf("flag (is_paired_end_mapped): %i\n", (flag & BAM_FPROPER_PAIR) != 0);
    printf("flag (is_seq_unmapped): %i\n", (flag & BAM_FUNMAP) != 0);
    printf("flag (is_mate_unmapped): %i\n", (flag & BAM_FMUNMAP) != 0);
    printf("flag (seq_strand): %i\n", (flag & BAM_FREVERSE) != 0);
    printf("flag (mate_strand): %i\n", (flag & BAM_FMREVERSE) != 0);
    printf("flag (pair_num_1): %i\n", (flag & BAM_FREAD1) != 0);
    printf("flag (pair_num_2): %i\n", (flag & BAM_FREAD2) != 0);
    // the line labelled "primary_alignment" reports the BAM_FSECONDARY bit
    printf("flag (primary_alignment): %i\n", (flag & BAM_FSECONDARY) != 0);
    printf("flag (fails_quality_check): %i\n", (flag & BAM_FQCFAIL) != 0);
    printf("flag (pc_optical_duplicate): %i\n", (flag & BAM_FDUP) != 0);
}
Beispiel #9
0
struct ffAli *bamToFfAli(const bam1_t *bam, struct dnaSeq *target, int targetOffset,
			 boolean useStrand, char **retQSeq)
/* Build an ffAli list from a bam alignment: one ffAli per gapless aligned
 * block (M, = or X).  If retQSeq is non-null it receives the query sequence
 * that the needle pointers refer to.  When useStrand is set and the read is
 * reverse-complemented, target->dna is reverse-complemented in place and the
 * CIGAR is walked from its last element to its first.
 * (Adapted from psl.c's pslToFfAli.) */
{
const bam1_core_t *core = &bam->core;
boolean isRc = useStrand && bamIsRc(bam);
DNA *needle = (DNA *)bamGetQuerySequence(bam, useStrand);
if (retQSeq)
    *retQSeq = needle;
if (isRc)
    reverseComplement(target->dna, target->size);
DNA *haystack = target->dna;
unsigned int *cigarPacked = bam1_cigar(bam);
struct ffAli *blockList = NULL;
int opCount = core->n_cigar;
int tPos = targetOffset, qPos = 0;
int step;
for (step = 0;  step < opCount;  step++)
    {
    // Reverse-strand reads visit the ops backwards; sequence offsets still count up.
    int ix = isRc ? (opCount - 1 - step) : step;
    char op;
    int size = bamUnpackCigarElement(cigarPacked[ix], &op);
    if (op == 'M' || op == '=' || op == 'X')
	{
	// gapless aligned block: record it and advance both sequences
	struct ffAli *block;
	AllocVar(block);
	block->left = blockList;
	blockList = block;
	block->nStart = needle + qPos;
	block->nEnd = block->nStart + size;
	block->hStart = haystack + tPos - targetOffset;
	block->hEnd = block->hStart + size;
	tPos += size;
	qPos += size;
	}
    else if (op == 'I' || op == 'S')
	{
	// insertion or soft clip: advances the query only
	qPos += size;
	}
    else if (op == 'D' || op == 'N')
	{
	// deletion or skipped region: advances the target only
	tPos += size;
	}
    else if (op != 'H' && op != 'P')
	{
	// hard clips and padding are ignored; anything else is unexpected
	errAbort("bamToFfAli: unrecognized CIGAR op %c -- update me", op);
	}
    }
blockList = ffMakeRightLinks(blockList);
ffCountGoodEnds(blockList);
return blockList;
}
Beispiel #10
0
/*
 * Return the length of the soft clip at one end of the alignment.
 *
 * is_end == 0 inspects the first CIGAR element, any other value the last.
 * Returns 0 when that element is not a soft clip or the record has no CIGAR.
 */
static int32_t sw_align_get_soft_clip(bam1_t *b, int32_t is_end)
{
	uint32_t elem; // packed CIGAR element; kept unsigned so the shift below is well defined

	// no CIGAR at all: n_cigar-1 would otherwise index element -1
	if(0 == b->core.n_cigar) return 0;

	elem = bam1_cigar(b)[(0 == is_end) ? 0 : b->core.n_cigar-1];
	if(BAM_CSOFT_CLIP == (elem & BAM_CIGAR_MASK)) { // soft-clipping
		return (int32_t)(elem >> BAM_CIGAR_SHIFT);
	}
	return 0;
}
Beispiel #11
0
/**
 * Create a new alignment_t populated from a BAM record.
 *
 * Numeric core fields are copied, the query name, sequence, quality, CIGAR
 * and aux data are converted/copied into newly allocated buffers, and the
 * flag bits are unpacked into individual fields.
 *
 * @param bam_p        source BAM record
 * @param base_quality offset passed to convert_to_quality_string_length()
 * @return newly allocated alignment_t
 */
alignment_t* alignment_new_by_bam(bam1_t* bam_p, int base_quality) {
    //memory allocation for the structure
    alignment_t* alignment_p = (alignment_t*) calloc(1, sizeof(alignment_t));

    //numeric data
    alignment_p->num_cigar_operations = (int) bam_p->core.n_cigar;
    alignment_p->chromosome = bam_p->core.tid;
    alignment_p->position = bam_p->core.pos;
    alignment_p->mate_chromosome = bam_p->core.mtid;
    alignment_p->mate_position = bam_p->core.mpos;
    alignment_p->map_quality = bam_p->core.qual;
    alignment_p->template_length = bam_p->core.isize;

    //memory allocation for inner fields according to indicated sizes
    //(the CIGAR buffer is allocated further down, once its text length is known)
    alignment_p->query_name = (char*) calloc(bam_p->core.l_qname, sizeof(char));
    alignment_p->sequence = (char*) calloc(bam_p->core.l_qseq + 1, sizeof(char));
    alignment_p->quality = (char*) calloc(bam_p->core.l_qseq + 1, sizeof(char));   //same length as sequence
    alignment_p->optional_fields = (uint8_t*) calloc(bam_p->l_aux, sizeof(uint8_t));
    alignment_p->optional_fields_length = bam_p->l_aux;

    //copy the data between structures
    strcpy(alignment_p->query_name, bam1_qname(bam_p));
    strcpy(alignment_p->sequence, convert_to_sequence_string(bam1_seq(bam_p), bam_p->core.l_qseq));

    convert_to_quality_string_length(alignment_p->quality, bam1_qual(bam_p), bam_p->core.l_qseq, base_quality);

    //CIGAR text: four bytes per operation is not enough once an operation
    //length has four or more digits (e.g. "1000M"), so make the buffer at
    //least as large as the string that is about to be copied into it
    char* cigar_string = convert_to_cigar_string(bam1_cigar(bam_p), alignment_p->num_cigar_operations);
    size_t cigar_size = (size_t) max(MIN_ALLOCATED_SIZE_FOR_CIGAR_STRING, alignment_p->num_cigar_operations << 2);
    size_t cigar_needed = strlen(cigar_string) + 1;
    if (cigar_size < cigar_needed) {
        cigar_size = cigar_needed;
    }
    alignment_p->cigar = (char*) calloc(cigar_size, sizeof(char));
    strcpy(alignment_p->cigar, cigar_string);
    //NOTE(review): ownership of the strings returned by the convert_to_*
    //helpers is not visible here - confirm whether they must be freed

    memcpy(alignment_p->optional_fields, bam1_aux(bam_p), bam_p->l_aux);

    //flags
    uint32_t flag = (uint32_t) bam_p->core.flag;
    alignment_p->is_paired_end = (flag & BAM_FPAIRED) ? 1 : 0;
    alignment_p->is_paired_end_mapped = (flag & BAM_FPROPER_PAIR) ? 1 : 0;
    alignment_p->is_seq_mapped = (flag & BAM_FUNMAP) ? 0 : 1; //in bam structure is negative flag!!!
    alignment_p->is_mate_mapped = (flag & BAM_FMUNMAP) ? 0 : 1; //in bam structure is negative flag!!!
    alignment_p->seq_strand = (flag & BAM_FREVERSE) ? 1 : 0;
    alignment_p->mate_strand = (flag & BAM_FMREVERSE) ? 1 : 0;

    if (flag & BAM_FREAD1) {
        alignment_p->pair_num = 1;
    } else if (flag & BAM_FREAD2) {
        alignment_p->pair_num = 2;
    } else {
        alignment_p->pair_num = 0;
    }

    //NOTE(review): primary_alignment mirrors the BAM_FSECONDARY bit (1 when
    //the record is secondary); kept as-is because callers may rely on it
    alignment_p->primary_alignment = (flag & BAM_FSECONDARY) ? 1 : 0;
    alignment_p->fails_quality_check = (flag & BAM_FQCFAIL) ? 1 : 0;
    alignment_p->pc_optical_duplicate = (flag & BAM_FDUP) ? 1 : 0;

    return alignment_p;
}
Beispiel #12
0
/*
 * Refill `buf` with alignments read from `in`.
 *
 * Leading cells whose record pointer is NULL are squeezed out, then records
 * are read and appended (each duplicated with bam_dup1) until the input ends
 * or the buffer reaches its current capacity and the last record read is
 * either unmapped or starts beyond min_rpos.  buf->x is set to the index of
 * the first record on a different reference than the earlier ones, to buf->n
 * if reading stopped early without such a change, and is left at -1 when
 * the input was exhausted without one.
 *
 * Returns the number of elements now in the buffer.
 */
static int fill_buf(samfile_t *in, buffer_t *buf)
{
	int i, ret, last_tid, min_rpos = 0x7fffffff, capacity;
	bam1_t *b = bam_init1();
	bam1_core_t *c = &b->core;
	// squeeze out the empty cells at the beginning
	for (i = 0; i < buf->n; ++i)
		if (buf->buf[i].b) break;
	if (i < buf->n) { // squeeze
		if (i > 0) {
			memmove(buf->buf, buf->buf + i, sizeof(elem_t) * (buf->n - i));
			buf->n = buf->n - i;
		}
	} else buf->n = 0;
	// calculate min_rpos
	for (i = 0; i < buf->n; ++i) {
		elem_t *e = buf->buf + i;
		if (e->b && e->rpos >= 0 && e->rpos < min_rpos)
			min_rpos = buf->buf[i].rpos;
	}
	// fill the buffer
	buf->x = -1;
	last_tid = buf->n? buf->buf[0].b->core.tid : -1;
	capacity = buf->n + BLOCK_SIZE;
	while ((ret = samread(in, b)) >= 0) {
		elem_t *e;
		uint8_t *qual = bam1_qual(b);
		int is_mapped;
		if (last_tid < 0) last_tid = c->tid;
		// remember where the first record on a different reference is stored
		if (c->tid != last_tid) {
			if (buf->x < 0) buf->x = buf->n;
		}
		if (buf->n >= buf->max) { // enlarge
			buf->max = buf->max? buf->max<<1 : 8;
			// NOTE(review): the realloc result is not checked for NULL
			buf->buf = (elem_t*)realloc(buf->buf, sizeof(elem_t) * buf->max);
		}
		e = &buf->buf[buf->n++];
		e->b = bam_dup1(b);
		e->rpos = -1; e->score = 0;
		// score: sum of (quality + 1) over all bases, divided by sqrt(l_qseq + 1)
		for (i = 0; i < c->l_qseq; ++i) e->score += qual[i] + 1;
		e->score = (double)e->score / sqrt(c->l_qseq + 1);
		is_mapped = (c->tid < 0 || c->tid >= in->header->n_targets || (c->flag&BAM_FUNMAP))? 0 : 1;
		if (!is_mapped) e->score = -1;
		if (is_mapped && (c->flag & BAM_FREVERSE)) {
			// NOTE(review): the start position is added to bam_calend()'s result; if
			// bam_calend already returns an absolute end coordinate this is pos + end - confirm
			e->rpos = b->core.pos + bam_calend(&b->core, bam1_cigar(b));
			if (min_rpos > e->rpos) min_rpos = e->rpos;
		}
		if (buf->n >= capacity) {
			// keep reading while mapped records still start at or before min_rpos
			if (is_mapped && c->pos <= min_rpos) capacity += BLOCK_SIZE;
			else break;
		}
	}
	// stopped by the break above (not end of input) without a reference change
	if (ret >= 0 && buf->x < 0) buf->x = buf->n;
	bam_destroy1(b);
	return buf->n;
}
Beispiel #13
0
// Read the next BAM alignment from one file, through the region iterator when
// one is set and sequentially otherwise. Read-level filters live here so that
// filtered reads never reach the pileup: a mapped read is flagged BAM_FUNMAP
// when its mapping quality is below min_mapQ, or when min_len is non-zero and
// the query length implied by its CIGAR is shorter than min_len.
// Returns the status of the underlying read call.
static int read_bam(void *data, bam1_t *b)
{
	aux_t *ctx = (aux_t*)data; // data is in fact a pointer to the auxiliary structure
	int status;

	if (ctx->iter) status = bam_iter_read(ctx->fp, ctx->iter, b);
	else status = bam_read1(ctx->fp, b);

	if (b->core.flag & BAM_FUNMAP) return status; // already unmapped: nothing to filter

	if ((int)b->core.qual < ctx->min_mapQ) {
		b->core.flag |= BAM_FUNMAP;
	} else if (ctx->min_len && bam_cigar2qlen(&b->core, bam1_cigar(b)) < ctx->min_len) {
		b->core.flag |= BAM_FUNMAP;
	}
	return status;
}
Beispiel #14
0
/*
 * Convert alignments against a padded reference into alignments against the
 * unpadded reference, copying every record from `in` to `out`.
 *
 * A record that starts at position 0 and whose name equals the name of its
 * reference sequence is taken as the padded reference itself: its unpadded
 * sequence is kept in `r` and a padded-to-unpadded position map is built.
 * Every other record gets a new CIGAR derived from comparing its own
 * unpadded sequence with `r`, and its position is translated via the map.
 *
 * Always returns 0.
 */
int bam_pad2unpad(bamFile in, bamFile out)
{
	bam_header_t *h;
	bam1_t *b;
	kstring_t r, q;
	uint32_t *cigar2 = 0; // scratch CIGAR; n2 = elements used, m2 = capacity (both updated by write_cigar)
	int n2 = 0, m2 = 0, *posmap = 0;

	h = bam_header_read(in);
	bam_header_write(out, h);
	b = bam_init1();
	r.l = r.m = q.l = q.m = 0; r.s = q.s = 0;
	while (bam_read1(in, b) >= 0) {
		uint32_t *cigar = bam1_cigar(b);
		n2 = 0;
		if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam1_qname(b), h->target_name[b->core.tid]) == 0) {
			// the record is the padded reference sequence itself
			int i, k;
			unpad_seq(b, &r);
			write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH));
			replace_cigar(b, n2, cigar2);
			posmap = realloc(posmap, r.m * sizeof(int));
			for (i = k = 0; i < r.l; ++i) {
				posmap[i] = k; // note that a read should NOT start at a padding
				if (r.s[i]) ++k;
			}
		} else {
			int i, k, op;
			unpad_seq(b, &q);
			// keep a leading soft clip as it is
			if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) write_cigar(cigar2, n2, m2, cigar[0]);
			// classify each column by whether read and reference have a base there
			for (i = 0, k = b->core.pos; i < q.l; ++i, ++k)
				q.s[i] = q.s[i]? (r.s[k]? BAM_CMATCH : BAM_CINS) : (r.s[k]? BAM_CDEL : BAM_CPAD);
			// run-length encode the per-column operations
			for (i = k = 1, op = q.s[0]; i < q.l; ++i) {
				if (op != q.s[i]) {
					write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
					op = q.s[i]; k = 1;
				} else ++k;
			}
			write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
			// keep a trailing soft clip as it is
			if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CSOFT_CLIP) write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]);
			// merge M-P-M triples into one M by zeroing the first two elements
			for (i = 2; i < n2; ++i)
				if (bam_cigar_op(cigar2[i]) == BAM_CMATCH && bam_cigar_op(cigar2[i-1]) == BAM_CPAD && bam_cigar_op(cigar2[i-2]) == BAM_CMATCH)
					cigar2[i] += cigar2[i-2], cigar2[i-2] = cigar2[i-1] = 0;
			// compact away the zeroed elements
			for (i = k = 0; i < n2; ++i)
				if (cigar2[i]) cigar2[k++] = cigar2[i];
			n2 = k;
			replace_cigar(b, n2, cigar2);
			b->core.pos = posmap[b->core.pos];
		}
		bam_write1(out, b);
	}
	free(r.s); free(q.s); free(posmap);
	free(cigar2); // scratch CIGAR buffer was previously leaked
	bam_destroy1(b);
	bam_header_destroy(h);
	return 0;
}
Beispiel #15
0
char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of)
{
	uint8_t *s = bam1_seq(b), *t = bam1_qual(b);
	int i;
	const bam1_core_t *c = &b->core;
	kstring_t str;
	str.l = str.m = 0; str.s = 0;

	ksprintf(&str, "%s\t", bam1_qname(b));
	if (of == BAM_OFDEC) ksprintf(&str, "%d\t", c->flag);
	else if (of == BAM_OFHEX) ksprintf(&str, "0x%x\t", c->flag);
	else { // BAM_OFSTR
		for (i = 0; i < 16; ++i)
			if ((c->flag & 1<<i) && bam_flag2char_table[i])
				kputc(bam_flag2char_table[i], &str);
		kputc('\t', &str);
	}
	if (c->tid < 0) kputs("*\t", &str);
	else ksprintf(&str, "%s\t", header->target_name[c->tid]);
	ksprintf(&str, "%d\t%d\t", c->pos + 1, c->qual);
	if (c->n_cigar == 0) kputc('*', &str);
	else {
		for (i = 0; i < c->n_cigar; ++i)
			ksprintf(&str, "%d%c", bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, "MIDNSHP"[bam1_cigar(b)[i]&BAM_CIGAR_MASK]);
	}
	kputc('\t', &str);
	if (c->mtid < 0) kputs("*\t", &str);
	else if (c->mtid == c->tid) kputs("=\t", &str);
	else ksprintf(&str, "%s\t", header->target_name[c->mtid]);
	ksprintf(&str, "%d\t%d\t", c->mpos + 1, c->isize);
	if (c->l_qseq) {
		for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str);
		kputc('\t', &str);
		if (t[0] == 0xff) kputc('*', &str);
		else for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str);
	} else ksprintf(&str, "*\t*");
	s = bam1_aux(b);
	while (s < b->data + b->data_len) {
		uint8_t type, key[2];
		key[0] = s[0]; key[1] = s[1];
		s += 2; type = *s; ++s;
		ksprintf(&str, "\t%c%c:", key[0], key[1]);
		if (type == 'A') { ksprintf(&str, "A:%c", *s); ++s; }
		else if (type == 'C') { ksprintf(&str, "i:%u", *s); ++s; }
		else if (type == 'c') { ksprintf(&str, "i:%d", *(int8_t*)s); ++s; }
		else if (type == 'S') { ksprintf(&str, "i:%u", *(uint16_t*)s); s += 2; }
		else if (type == 's') { ksprintf(&str, "i:%d", *(int16_t*)s); s += 2; }
		else if (type == 'I') { ksprintf(&str, "i:%u", *(uint32_t*)s); s += 4; }
		else if (type == 'i') { ksprintf(&str, "i:%d", *(int32_t*)s); s += 4; }
		else if (type == 'f') { ksprintf(&str, "f:%g", *(float*)s); s += 4; }
		else if (type == 'd') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; }
		else if (type == 'Z' || type == 'H') { ksprintf(&str, "%c:", type); while (*s) kputc(*s++, &str); ++s; }
	}
	return str.s;
}
Beispiel #16
0
/* Return the end coordinate of <sam_alignment> as computed by bam_calend().
   The value is cached in <sam_alignment>->rightmost and only computed while
   that field still holds GT_UNDEF_UWORD. */
GtUword gt_sam_alignment_rightmost_pos(GtSamAlignment *sam_alignment)
{
  gt_assert(sam_alignment != NULL);
  if (sam_alignment->rightmost != GT_UNDEF_UWORD)
    return sam_alignment->rightmost;

  sam_alignment->rightmost =
    (GtUword)bam_calend(&sam_alignment->s_alignment->core,
                        bam1_cigar(sam_alignment->s_alignment));
  return sam_alignment->rightmost;
}
Beispiel #17
0
/* Return the end coordinate of <sam_alignment> as computed by bam_calend().
   The value is cached in <sam_alignment>->rightmost and only computed while
   that field still holds GT_UNDEF_ULONG. */
unsigned long gt_sam_alignment_rightmost_pos(GtSamAlignment *sam_alignment)
{
  gt_assert(sam_alignment != NULL);
  if (sam_alignment->rightmost != GT_UNDEF_ULONG)
    return sam_alignment->rightmost;

  sam_alignment->rightmost =
    (unsigned long)bam_calend(&sam_alignment->s_alignment->core,
                              bam1_cigar(sam_alignment->s_alignment));
  return sam_alignment->rightmost;
}
Beispiel #18
0
// Adapted from samtools/bam.c
/* Sum the lengths of all soft-clip ('S') operations in the CIGAR of b. */
int32_t b2g_bam_clippedlength(bam1_t *b) {
  const uint32_t *ops = bam1_cigar(b);
  const uint32_t n_ops = b->core.n_cigar;
  int32_t clipped = 0;
  uint32_t idx = 0;
  while (idx < n_ops) {
    if (bam_cigar_opchr(bam_cigar_op(ops[idx])) == 'S')
      clipped += bam_cigar_oplen(ops[idx]);
    ++idx;
  }
  return clipped;
}
Beispiel #19
0
static double TGM_GetMismatchRate(const bam1_t* pAlignment)
{
    uint32_t* cigar = bam1_cigar(pAlignment);
    unsigned int numMismatch = 0;
    for (unsigned i = 0; i != pAlignment->core.n_cigar; ++i)
    {
        int type = (cigar[i] & BAM_CIGAR_MASK);
        if (type == BAM_CINS || type == BAM_CDEL || type == BAM_CSOFT_CLIP || type == BAM_CMISMATCH)
        {
            numMismatch += (cigar[i] >> BAM_CIGAR_SHIFT);
        }
    }
Beispiel #20
0
/* from char *bam_format1_core(const bam_header_t *header, const
 * bam1_t *b, int of) 
 */
/* Render the CIGAR of b as text ("<len><op>..." using the letters
 * MIDNSHP=X).  Returns the kstring buffer that was built; when the record
 * has no CIGAR operations nothing is appended and the returned pointer is
 * the initial null pointer.
 */
char *
cigar_str_from_bam(const bam1_t *b)
{
     const uint32_t *ops = bam1_cigar(b);
     const int n_ops = b->core.n_cigar;
     kstring_t out;
     int k;

     out.s = 0;
     out.l = out.m = 0;
     for (k = 0; k < n_ops; k++) {
          const uint32_t elem = ops[k];
          kputw(elem>>BAM_CIGAR_SHIFT, &out);
          kputc("MIDNSHP=X"[elem&BAM_CIGAR_MASK], &out);
     }
     return out.s;
}
Beispiel #21
0
void bamUnpackCigar(const bam1_t *bam, struct dyString *dyCigar)
/* Append the textual form of bam's packed CIGAR to dyCigar: for each element
 * the decimal length followed by the operation character. */
{
    const bam1_core_t *core = &bam->core;
    unsigned int *elem = bam1_cigar(bam);
    unsigned int *stop = elem + core->n_cigar;
    for ( ;  elem < stop;  elem++)
    {
        char opChar;
        int count = bamUnpackCigarElement(*elem, &opChar);
        dyStringPrintf(dyCigar, "%d", count);
        dyStringAppendC(dyCigar, opChar);
    }
}
Beispiel #22
0
static int countBam(const bam1_t *bam, void *data)
/* bam_fetch() calls this on each bam alignment retrieved.  For every gapless
 * aligned block (M, = or X) the counts of the preDraw bins covered by the
 * block are incremented; bin positions are the reference positions relative
 * to winStart multiplied by btd->scale.  Always returns 0. */
{
struct bamWigTrackData *btd = (struct bamWigTrackData *)data;
const bam1_core_t *core = &bam->core;

int tPos = core->pos;
unsigned int *cigar = bam1_cigar(bam);
int i;
double scale = btd->scale;
for (i = 0;  i < core->n_cigar;  i++)
    {
    char op;
    int n = bamUnpackCigarElement(cigar[i], &op);
    switch (op)
	{
	case 'X': // mismatch (gapless aligned block)
	case '=': // match (gapless aligned block)
	case 'M': // match or mismatch (gapless aligned block)
	    {
	    int start = (int)(scale * (tPos - winStart));
	    int end = (int)(scale * ((tPos + n) - winStart));
	    // Use a dedicated index here: reusing the CIGAR index i clobbered the
	    // outer loop, so later CIGAR elements were skipped or misread.
	    int bin;
	    for (bin = start; bin < end; bin++)
		btd->preDraw[bin + btd->preDrawZero].count++;
	    tPos += n;
	    break;
	    }
	case 'I': // inserted in query
	    break;
	case 'D': // deleted from query
	case 'N': // long deletion from query (intron as opposed to small del)
	    tPos += n;
	    break;
	case 'S': // skipped query bases at beginning or end ("soft clipping")
	case 'H': // skipped query bases not stored in record's query sequence ("hard clipping")
	case 'P': // P="silent deletion from padded reference sequence" -- ignore these.
	    break;
	default:
	    errAbort("countBam: unrecognized CIGAR op %c -- update me", op);
	}

    }
return 0;
}
Beispiel #23
0
struct simpleFeature *sfFromNumericCigar(const bam1_t *bam, int *retLength)
/* Turn BAM's packed CIGAR into a list of simpleFeatures, one per gapless
 * aligned block (M, = or X), in CIGAR order.  If retLength is non-NULL it
 * receives the alignment's length on the reference sequence. */
{
const bam1_core_t *core = &bam->core;
unsigned int *cigar = bam1_cigar(bam);
struct simpleFeature *blocks = NULL;
int refPos = core->pos, queryPos = 0, refLength = 0;
int ix;
for (ix = 0;  ix < core->n_cigar;  ix++)
    {
    char op;
    int n = bamUnpackCigarElement(cigar[ix], &op);
    if (op == 'M' || op == '=' || op == 'X')
	{
	// gapless aligned block: emit a feature and advance both coordinates
	struct simpleFeature *sf;
	AllocVar(sf);
	sf->start = refPos;
	sf->qStart = queryPos;
	refPos = sf->end = refPos + n;
	queryPos = sf->qEnd = queryPos + n;
	slAddHead(&blocks, sf);
	refLength += n;
	}
    else if (op == 'I')
	{
	// inserted in query
	queryPos += n;
	}
    else if (op == 'D' || op == 'N')
	{
	// deleted from query / skipped region (e.g. intron)
	refPos += n;
	refLength += n;
	}
    else if (op != 'S' && op != 'H' && op != 'P')
	{
	// soft clip, hard clip and padding are ignored; anything else is unexpected
	errAbort("sfFromNumericCigar: unrecognized CIGAR op %c -- update me", op);
	}
    }
if (retLength != NULL)
    *retLength = refLength;
// features were pushed onto the head, so restore CIGAR order
slReverse(&blocks);
return blocks;
}
Beispiel #24
0
void bamShowCigarEnglish(const bam1_t *bam)
/* Print the CIGAR to stdout in words, elements separated by ", ",
 * e.g. "20 (mis)Match, 1 Deletion, 3 (mis)Match". */
{
    const bam1_core_t *core = &bam->core;
    unsigned int *cigarPacked = bam1_cigar(bam);
    int opIx;
    for (opIx = 0;  opIx < core->n_cigar;  opIx++)
    {
        char opChar;
        int len = bamUnpackCigarElement(cigarPacked[opIx], &opChar);
        if (opIx != 0)
            printf(", ");
        if (opChar == 'M')        // match or mismatch (gapless aligned block)
            printf("%d (mis)Match", len);
        else if (opChar == '=')   // match
            printf("%d Match", len);
        else if (opChar == 'X')   // mismatch
            printf("%d Mismatch", len);
        else if (opChar == 'I')   // inserted in query
            printf("%d Insertion", len);
        else if (opChar == 'S')   // soft clipping
            printf("%d Skipped", len);
        else if (opChar == 'D')   // deleted from query
            printf("%d Deletion", len);
        else if (opChar == 'N')   // long deletion from query (intron)
            printf("%d deletioN", len);
        else if (opChar == 'H')   // hard clipping
            printf("%d Hard clipped query", len);
        else if (opChar == 'P')   // silent deletion from padded reference
            printf("%d Padded / silent deletion", len);
        else
            errAbort("bamShowCigarEnglish: unrecognized CIGAR op %c -- update me", opChar);
    }
}
Beispiel #25
0
 /* Store the query sequence `qseq` (4-bit packed, two bases per byte) right
  * after the CIGAR and reserve room for the qualities behind it.
  *
  * slen < 0 means "use strlen(qseq)".  A NULL qseq is ignored; "*" sets
  * l_qseq to 0.  If a CIGAR is present its query length must match the
  * sequence length, otherwise GError() is called.
  */
 void GBamRecord::add_sequence(const char* qseq, int slen) {
   //must be called AFTER set_cigar (cannot replace existing sequence for now)
   if (qseq==NULL) return; //should we ever care about this?
   if (slen<0) slen=strlen(qseq);
   int doff = b->core.l_qname + b->core.n_cigar * 4;
   if (strcmp(qseq, "*")!=0) {
       b->core.l_qseq=slen;
       if (b->core.n_cigar && b->core.l_qseq != (int32_t)bam_cigar2qlen(&b->core, bam1_cigar(b)))
           GError("Error: CIGAR and sequence length are inconsistent!(%s)\n",
                  qseq);
       uint8_t* p = (uint8_t*)realloc_bdata(b, doff + (b->core.l_qseq+1)/2 + b->core.l_qseq) + doff;
       //also allocated quals memory
       memset(p, 0, (b->core.l_qseq+1)/2);
       //even bases go to the high nibble, odd bases to the low nibble;
       //index the table via unsigned char so bytes >= 0x80 cannot produce a
       //negative index on platforms where char is signed
       for (int i = 0; i < b->core.l_qseq; ++i)
           p[i/2] |= bam_nt16_table[(unsigned char)qseq[i]] << 4*(1-i%2);
       } else b->core.l_qseq = 0;
   }
Beispiel #26
0
/**
* @brief Copies the alignment fields of a BAM record into a buffered read
*
* @param bufread Destination buffered read
* @param bamread Source record (samtools bam1_t); released with bam_destroy1() before returning
* @param rm Metrics supplying read length, strand flag and genomic end
* @return void
* @details The CIGAR array is copied by value. The destination array is
* re-allocated first when the CIGAR's byte size exceeds MAX_NCIGAR.
*/
void store_read(buffered_read_t *bufread, bam1_t *bamread,read_metrics_t *rm){
	const size_t cigar_bytes=sizeof(uint32_t)*bamread->core.n_cigar;

	/* values computed earlier and carried in the metrics struct */
	bufread->l_seq=rm->read_length;
	bufread->revcomp=rm->revcomp;
	bufread->genomic_end=rm->genomic_end;

	/* values taken straight from the BAM core */
	bufread->chrom_index=bamread->core.tid;
	/* NOTE(review): the original comment here said "this program uses 1 based
	 * positions", but core.pos is stored without any adjustment -- confirm. */
	bufread->pos=bamread->core.pos;
	bufread->tlen=bamread->core.isize;
	bufread->mapq=bamread->core.qual;
	bufread->n_cigar=bamread->core.n_cigar;
	bufread->proper_pair=bam1_ppair(bamread);
	bufread->written=0;

	/* grow the destination only when the CIGAR is larger than MAX_NCIGAR, then copy it */
	if(cigar_bytes>MAX_NCIGAR){
		bufread->cigar=(uint32_t *)Realloc(bufread->cigar,bamread->core.n_cigar,uint32_t);
	}
	memcpy(bufread->cigar,bam1_cigar(bamread),cigar_bytes);

	/* everything needed has been copied; the source record is no longer used */
	bam_destroy1(bamread);
}
Beispiel #27
0
void bamGetSoftClipping(const bam1_t *bam, int *retLow, int *retHigh, int *retClippedQLen)
/* If retLow is non-NULL, set it to the number of "soft-clipped" (skipped) bases at
 * the beginning of the query sequence and quality; likewise for retHigh at end.
 * For convenience, retClippedQLen is the original query length minus soft clipping
 * (and the length of the query sequence that will be returned).
 * A record without any CIGAR operation reports no clipping.  A record whose only
 * operation is a soft clip reports it once, as clipping at the beginning. */
{
    const bam1_core_t *core = &bam->core;
    int low = 0;
    int high = 0;
    if (core->n_cigar > 0) {
        // Only touch the CIGAR array when it has at least one element.
        unsigned int *cigarPacked = bam1_cigar(bam);
        char op;
        int n = bamUnpackCigarElement(cigarPacked[0], &op);
        if (op == 'S')
            low = n;
        // With a single operation, first and last are the same element;
        // counting it again as trailing clipping would subtract it twice.
        if (core->n_cigar > 1) {
            n = bamUnpackCigarElement(cigarPacked[core->n_cigar-1], &op);
            if (op == 'S')
                high = n;
        }
    }
    if (retLow != NULL)
        *retLow = low;
    if (retHigh != NULL)
        *retHigh = high;
    if (retClippedQLen != NULL)
        *retClippedQLen = (core->l_qseq - low - high);
}
Beispiel #28
0
// Walks the record's CIGAR to set up coordinate members (start, mapped_len,
// clipL, clipR are initialised below). Unmapped records are left untouched.
// NOTE(review): this excerpt ends inside the CIGAR loop; the rest of the
// function, including the release of the temporary `cigar` copy, is not visible.
void GBamRecord::setupCoordinates() {
	const bam1_core_t *c = &b->core;
	if (c->flag & BAM_FUNMAP) return; /* skip unmapped reads */
	uint32_t *p = bam1_cigar(b);
	//--- prevent alignment error here (reported by UB-sanitizer):
	// work on a separately allocated copy of the packed CIGAR array
	uint32_t *cigar= new uint32_t[c->n_cigar];
	memcpy(cigar, p, c->n_cigar * sizeof(uint32_t));
	//--- UBsan protection end
	int l=0; // running sum of the lengths of M/=/X/D operations
	mapped_len=0;
	clipL=0;
	clipR=0;
	start=c->pos+1; //genomic start coordinate, 1-based (BAM core.pos is 0-based)
	int exstart=c->pos;
	for (int i = 0; i < c->n_cigar; ++i) {
		int op = cigar[i]&0xf; // operation code: low 4 bits of the packed element
		if (op == BAM_CMATCH || op==BAM_CEQUAL ||
				op == BAM_CDIFF || op == BAM_CDEL) {
			l += cigar[i]>>4; // remaining high bits, used as the operation length
		}
		else if (op == BAM_CREF_SKIP) { //N
Beispiel #29
0
/*
 * Expand the query bases of b into s according to its CIGAR.
 *
 * For every match operation the corresponding query bases are appended, one
 * byte per base, using the value bam1_seqi() yields for that base. Soft-clipped
 * bases are skipped. Any other operation (only deletions pass the assert)
 * appends `oplen` zero bytes. On return s->l is the number of bytes written;
 * the output is not NUL-terminated.
 */
static void unpad_seq(bam1_t *b, kstring_t *s)
{
	int k, j, i;
	size_t need = 0;
	uint32_t *cigar = bam1_cigar(b);
	uint8_t *seq = bam1_seq(b);

	/* Every operation except a soft clip emits `oplen` bytes below, so the
	 * output can be longer than l_qseq when deletions are present. */
	for (k = 0; k < b->core.n_cigar; ++k) {
		if (bam_cigar_op(cigar[k]) != BAM_CSOFT_CLIP)
			need += bam_cigar_oplen(cigar[k]);
	}
	/* never reserve less than the l_qseq bytes that were reserved before */
	if (b->core.l_qseq > 0 && need < (size_t)b->core.l_qseq)
		need = (size_t)b->core.l_qseq;
	ks_resize(s, need);

	for (k = 0, s->l = 0, j = 0; k < b->core.n_cigar; ++k) {
		int op, ol;
		op = bam_cigar_op(cigar[k]);
		ol = bam_cigar_oplen(cigar[k]);
		assert(op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CSOFT_CLIP);
		if (op == BAM_CMATCH) {
			/* advance the query index together with the output index so each
			 * matched position reads its own base */
			for (i = 0; i < ol; ++i, ++j) s->s[s->l++] = bam1_seqi(seq, j);
		} else if (op == BAM_CSOFT_CLIP) {
			j += ol;
		} else {
			for (i = 0; i < ol; ++i) s->s[s->l++] = 0;
		}
	}
}
Beispiel #30
0
/*
 * Count the differing bases of an alignment from its CIGAR and "MD" tag.
 *
 * The result is the sum of
 *   - the lengths of all insertion and deletion CIGAR operations, and
 *   - the lengths of all non-digit runs in the MD string that do not start
 *     with '^' (runs starting with '^' are skipped here; deletions are
 *     already counted through the CIGAR).
 * When the record has no MD tag, or the tag has no string value, only the
 * CIGAR contributes.
 */
int TGM_GetNumMismatchFromBam(const bam1_t* pAlgn)
{
    int numMM = 0;
    uint32_t* cigar = bam1_cigar(pAlgn);
    for (unsigned i = 0; i != pAlgn->core.n_cigar; ++i)
    {
        int type = (cigar[i] & BAM_CIGAR_MASK);
        if (type == BAM_CINS || type == BAM_CDEL)
            numMM += (cigar[i] >> BAM_CIGAR_SHIFT);
    }

    uint8_t* mdPos = bam_aux_get(pAlgn, "MD");
    if (mdPos != NULL)
    {
        const char* mdStr = bam_aux2Z(mdPos);
        const char* mdFieldPos = mdStr;
        while (mdFieldPos != NULL && *mdFieldPos != '\0')
        {
            // <ctype.h> functions require a value representable as unsigned
            // char; passing a negative plain char is undefined behaviour.
            if (isdigit((unsigned char) *mdFieldPos))
            {
                ++mdFieldPos;
                continue;
            }

            // find the end of this run of non-digit characters
            const char* mdFieldEnd = mdFieldPos + 1;
            while (*mdFieldEnd != '\0' && !isdigit((unsigned char) *mdFieldEnd))
                ++mdFieldEnd;

            if (*mdFieldPos != '^')
                numMM += (int) (mdFieldEnd - mdFieldPos);

            mdFieldPos = mdFieldEnd;
        }
    }

    return numMM;
}