/*
 * Hand one alignment to the pileup iterator, or signal the end of input.
 *
 * iter  pileup iterator; once iter->error is set every call returns -1
 * b     alignment to stage, or NULL to mark end of input (sets iter->is_eof)
 *
 * Returns 0 when the read was staged or deliberately ignored, and -1 when the
 * iterator is already in the error state or the read violates the sort order
 * seen so far (in which case iter->error is set and a message is printed).
 */
int bam_plp_push(bam_plp_t iter, const bam1_t *b)
{
    if (iter->error) return -1;

    if (b == NULL) { /* caller has no more reads */
        iter->is_eof = 1;
        return 0;
    }

    /* Reads that are silently dropped: no reference id, or a masked flag bit. */
    if (b->core.tid < 0 || (b->core.flag & iter->flag_mask))
        return 0;
    /* Also dropped: a read at the current position once mp->cnt exceeds maxcnt. */
    if (iter->tid == b->core.tid && iter->pos == b->core.pos
        && iter->mp->cnt > iter->maxcnt)
        return 0;

    /* Stage the read in the tail node. */
    bam_copy1(&iter->tail->b, b);
    iter->tail->beg = b->core.pos;
    iter->tail->end = bam_calend(&b->core, bam1_cigar(b));
    iter->tail->s = g_cstate_null;
    iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t

    /* Enforce sort order against the largest (tid, pos) pushed so far. */
    if (b->core.tid < iter->max_tid) {
        fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n");
        iter->error = 1;
        return -1;
    }
    if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
        fprintf(stderr, "[bam_pileup_core] the input is not sorted (reads out of order)\n");
        iter->error = 1;
        return -1;
    }
    iter->max_tid = b->core.tid;
    iter->max_pos = iter->tail->beg;

    /* Keep the staged node only if it ends beyond the current position or
     * lies on a later reference; otherwise the tail node is reused. */
    if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
        iter->tail->next = mp_alloc(iter->mp);
        iter->tail = iter->tail->next;
    }
    return 0;
}
/* Append one segment to str: segment index ('1'/'2'), strand ('R'/'F'),
 * then every CIGAR operation as <length><op-char>. */
static void bam_template_cigar_put_segment(bam1_t *seg, kstring_t *str)
{
    const uint32_t *ops = bam1_cigar(seg);
    int k;

    kputc((seg->core.flag & BAM_FREAD1)? '1' : '2', str); // segment index
    kputc((seg->core.flag & BAM_FREVERSE)? 'R' : 'F', str); // strand
    for (k = 0; k < seg->core.n_cigar; ++k) {
        kputw(bam_cigar_oplen(ops[k]), str);
        kputc(bam_cigar_opchr(ops[k]), str);
    }
}

/*
 * Build a template CIGAR for a pair of alignments and attach it as a "CT:Z"
 * aux tag to the alignment with the smaller coordinate.
 *
 * The string is <segment 1><gap>T<segment 2>, where <gap> is the start of the
 * second segment minus bam_calend() of the first. str is used as scratch space
 * and is reset (str->l = 0) on entry. Nothing is appended when the two records
 * are on different references or have no reference.
 */
void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str)
{
    int first_end;

    str->l = 0;
    if (b1->core.tid != b2->core.tid || b1->core.tid < 0)
        return; // coordinateless or not on the same chr; skip

    if (b1->core.pos > b2->core.pos) { // make sure b1 has a smaller coordinate
        bam1_t *tmp = b1;
        b1 = b2;
        b2 = tmp;
    }

    bam_template_cigar_put_segment(b1, str);
    first_end = bam_calend(&b1->core, bam1_cigar(b1));
    kputw(b2->core.pos - first_end, str);
    kputc('T', str);
    bam_template_cigar_put_segment(b2, str);

    bam_aux_append(b1, "CT", 'Z', str->l+1, (uint8_t*)str->s);
}
// currently, this function ONLY works if each read has one hit void bam_mating_core(bamFile in, bamFile out) { bam_header_t *header; bam1_t *b[2]; int curr, has_prev, pre_end = 0, cur_end; kstring_t str; str.l = str.m = 0; str.s = 0; header = bam_header_read(in); bam_header_write(out, header); b[0] = bam_init1(); b[1] = bam_init1(); curr = 0; has_prev = 0; while (bam_read1(in, b[curr]) >= 0) { bam1_t *cur = b[curr], *pre = b[1-curr]; if (cur->core.tid < 0) continue; cur_end = bam_calend(&cur->core, bam1_cigar(cur)); if (cur_end > (int)header->target_len[cur->core.tid]) cur->core.flag |= BAM_FUNMAP; if (cur->core.flag & BAM_FSECONDARY) continue; // skip secondary alignments if (has_prev) { if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos; pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos; if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // set TLEN/ISIZE { uint32_t cur5, pre5; cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos; pre5 = (pre->core.flag&BAM_FREVERSE)? 
pre_end : pre->core.pos; cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5; } else cur->core.isize = pre->core.isize = 0; if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FMREVERSE; else cur->core.flag &= ~BAM_FMREVERSE; if (cur->core.flag&BAM_FREVERSE) pre->core.flag |= BAM_FMREVERSE; else pre->core.flag &= ~BAM_FMREVERSE; if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; } if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; } bam_template_cigar(pre, cur, &str); bam_write1(out, pre); bam_write1(out, cur); has_prev = 0; } else { // unpaired or singleton pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; if (pre->core.flag & BAM_FPAIRED) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FMREVERSE & ~BAM_FPROPER_PAIR; } bam_write1(out, pre); } } else has_prev = 1; curr = 1 - curr; pre_end = cur_end; } if (has_prev) bam_write1(out, b[1-curr]); bam_header_destroy(header); bam_destroy1(b[0]); bam_destroy1(b[1]); free(str.s); }
void GBamRecord::set_cigar(const char* cigar) { //requires b->core.pos and b->core.flag to have been set properly PRIOR to this call int doff=b->core.l_qname; uint8_t* after_cigar=NULL; int after_cigar_len=0; uint8_t* prev_bdata=NULL; if (b->data_len>doff) { //cigar string already allocated, replace it int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data after_cigar=b->data+d; after_cigar_len=b->data_len-d; } const char *s; char *t; int i, op; long x; b->core.n_cigar = 0; if (cigar != NULL && strcmp(cigar, "*") != 0) { for (s = cigar; *s; ++s) { if (isalpha(*s)) b->core.n_cigar++; else if (!isdigit(*s)) { GError("Error: invalid CIGAR character (%s)\n",cigar); } } if (after_cigar_len>0) { //replace/insert into existing full data prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len); memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len); free(prev_bdata); } else { realloc_bdata(b, doff + b->core.n_cigar * 4); } for (i = 0, s = cigar; i != b->core.n_cigar; ++i) { x = strtol(s, &t, 10); op = toupper(*t); if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH; else if (op == 'I') op = BAM_CINS; else if (op == 'D') op = BAM_CDEL; else if (op == 'N') op = BAM_CREF_SKIP; //has_Introns=true; else if (op == 'S') op = BAM_CSOFT_CLIP; //soft_Clipped=true; else if (op == 'H') op = BAM_CHARD_CLIP; //hard_Clipped=true; else if (op == 'P') op = BAM_CPAD; else GError("Error: invalid CIGAR operation (%s)\n",cigar); s = t + 1; bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op; } if (*s) GError("Error: unmatched CIGAR operation (%s)\n",cigar); b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b))); } else {//no CIGAR string given if (!(b->core.flag&BAM_FUNMAP)) { GMessage("Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data); b->core.flag |= BAM_FUNMAP; } b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1); } setupCoordinates(); } //set_cigar()
/*
 * Compact the read buffer and refill it from `in`.
 *
 * First the leading cells whose record pointer is NULL are squeezed out, then
 * new records are read with samread() and appended. For every appended read a
 * score is derived from its base qualities; unmapped reads get score -1.
 * For mapped reverse-strand reads e->rpos holds the end coordinate of the
 * alignment, otherwise it is -1.
 *
 * buf->x is set to the index of the first record whose tid differs from the
 * tid at the front of the buffer; it is set to buf->n when reading stopped
 * before end of input without a tid change, and stays -1 otherwise.
 *
 * Reading stops at end of input, or once BLOCK_SIZE more records have been
 * buffered and the current read is unmapped or starts beyond the smallest
 * reverse-strand end seen so far.
 *
 * Returns the number of records in the buffer.
 */
static int fill_buf(samfile_t *in, buffer_t *buf)
{
    int i, ret, last_tid, min_rpos = 0x7fffffff, capacity;
    bam1_t *b = bam_init1();
    bam1_core_t *c = &b->core;

    // squeeze out the empty cells at the beginning
    for (i = 0; i < buf->n; ++i)
        if (buf->buf[i].b) break;
    if (i < buf->n) { // squeeze
        if (i > 0) {
            memmove(buf->buf, buf->buf + i, sizeof(elem_t) * (buf->n - i));
            buf->n = buf->n - i;
        }
    } else buf->n = 0;

    // calculate min_rpos
    for (i = 0; i < buf->n; ++i) {
        elem_t *e = buf->buf + i;
        if (e->b && e->rpos >= 0 && e->rpos < min_rpos)
            min_rpos = buf->buf[i].rpos;
    }

    // fill the buffer
    buf->x = -1;
    last_tid = buf->n? buf->buf[0].b->core.tid : -1;
    capacity = buf->n + BLOCK_SIZE;
    while ((ret = samread(in, b)) >= 0) {
        elem_t *e;
        uint8_t *qual = bam1_qual(b);
        int is_mapped;

        if (last_tid < 0) last_tid = c->tid;
        if (c->tid != last_tid) {
            if (buf->x < 0) buf->x = buf->n;
        }
        if (buf->n >= buf->max) { // enlarge
            buf->max = buf->max? buf->max<<1 : 8;
            buf->buf = (elem_t*)realloc(buf->buf, sizeof(elem_t) * buf->max);
        }
        e = &buf->buf[buf->n++];
        e->b = bam_dup1(b);
        e->rpos = -1;
        e->score = 0;
        for (i = 0; i < c->l_qseq; ++i) e->score += qual[i] + 1;
        e->score = (double)e->score / sqrt(c->l_qseq + 1);
        is_mapped = (c->tid < 0 || c->tid >= in->header->n_targets || (c->flag&BAM_FUNMAP))? 0 : 1;
        if (!is_mapped) e->score = -1;
        if (is_mapped && (c->flag & BAM_FREVERSE)) {
            /* bam_calend() already yields an absolute reference coordinate
             * (it starts counting from core.pos), so adding core.pos again
             * would roughly double the value and break the comparison with
             * c->pos below. */
            e->rpos = bam_calend(&b->core, bam1_cigar(b));
            if (min_rpos > e->rpos) min_rpos = e->rpos;
        }
        if (buf->n >= capacity) {
            /* keep reading while the current read may still overlap the
             * left-most reverse-strand end */
            if (is_mapped && c->pos <= min_rpos) capacity += BLOCK_SIZE;
            else break;
        }
    }
    if (ret >= 0 && buf->x < 0) buf->x = buf->n;
    bam_destroy1(b);
    return buf->n;
}
/* Return the value of bam_calend() for the wrapped alignment. The result is
   computed on first use and cached in sam_alignment->rightmost;
   GT_UNDEF_ULONG marks "not yet computed". */
unsigned long gt_sam_alignment_rightmost_pos(GtSamAlignment *sam_alignment)
{
  gt_assert(sam_alignment != NULL);

  if (sam_alignment->rightmost != GT_UNDEF_ULONG)
    return sam_alignment->rightmost; /* cached */

  sam_alignment->rightmost =
    (unsigned long) bam_calend(&sam_alignment->s_alignment->core,
                               bam1_cigar(sam_alignment->s_alignment));
  return sam_alignment->rightmost;
}
/* Return the value of bam_calend() for the wrapped alignment. The result is
   computed on first use and cached in sam_alignment->rightmost;
   GT_UNDEF_UWORD marks "not yet computed". */
GtUword gt_sam_alignment_rightmost_pos(GtSamAlignment *sam_alignment)
{
  gt_assert(sam_alignment != NULL);

  if (sam_alignment->rightmost != GT_UNDEF_UWORD)
    return sam_alignment->rightmost; /* cached */

  sam_alignment->rightmost =
    (GtUword) bam_calend(&sam_alignment->s_alignment->core,
                         bam1_cigar(sam_alignment->s_alignment));
  return sam_alignment->rightmost;
}
/*
 * Read callback for multi-sample pileup: fetch the next alignment that passes
 * all configured filters.
 *
 * data  an mplp_aux_t* describing the input file, optional iterator,
 *       reference sequence and configuration
 * b     record that receives the alignment
 *
 * Reads are skipped when they are unmapped, do not overlap the BED regions,
 * belong to an excluded read group, fall below the minimum mapping quality,
 * are anomalous pairs (with MPLP_NO_ORPHAN), or when bam_cap_mapQ() returns a
 * negative value. Base qualities are shifted down by 31 with
 * MPLP_ILLUMINA13, and the read may be realigned (MPLP_REALN) before the
 * mapping-quality filters run.
 *
 * Returns the value of the underlying read call; negative at end of input or
 * on error.
 */
static int mplp_func(void *data, bam1_t *b)
{
    extern int bam_realn(bam1_t *b, const char *ref);
    extern int bam_prob_realn_core(bam1_t *b, const char *ref, int);
    extern int bam_cap_mapQ(bam1_t *b, char *ref, int thres);
    mplp_aux_t *ma = (mplp_aux_t*)data;
    int ret, skip = 0;

    do {
        int has_ref;
        ret = ma->iter? bam_iter_read(ma->fp, ma->iter, b) : bam_read1(ma->fp, b);
        if (ret < 0) break;
        if (b->core.tid < 0 || (b->core.flag&BAM_FUNMAP)) { // exclude unmapped reads
            skip = 1;
            continue;
        }
        if (ma->conf->bed) { // test overlap
            skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid],
                                b->core.pos, bam_calend(&b->core, bam1_cigar(b)));
            if (skip) continue;
        }
        if (ma->conf->rghash) { // exclude read groups
            uint8_t *rg = bam_aux_get(b, "RG");
            skip = (rg && bcf_str2id(ma->conf->rghash, (const char*)(rg+1)) >= 0);
            if (skip) continue;
        }
        if (ma->conf->flag & MPLP_ILLUMINA13) {
            int i;
            uint8_t *qual = bam1_qual(b);
            for (i = 0; i < b->core.l_qseq; ++i)
                qual[i] = qual[i] > 31? qual[i] - 31 : 0;
        }
        has_ref = (ma->ref && ma->ref_id == b->core.tid)? 1 : 0;
        skip = 0;
        if (has_ref && (ma->conf->flag&MPLP_REALN))
            bam_prob_realn_core(b, ma->ref, (ma->conf->flag & MPLP_EXT_BAQ)? 3 : 1);
        if (has_ref && ma->conf->capQ_thres > 10) {
            int q = bam_cap_mapQ(b, ma->ref, ma->conf->capQ_thres);
            if (q < 0) skip = 1;
            else if (b->core.qual > q) b->core.qual = q;
        }
        /* These filters must run whether or not the mapping quality was
         * capped above; chaining them with `else if` silently disabled them
         * whenever capping was active. */
        if (b->core.qual < ma->conf->min_mq) skip = 1;
        else if ((ma->conf->flag&MPLP_NO_ORPHAN)
                 && (b->core.flag&1) && !(b->core.flag&2)) /* 1 = paired, 2 = proper pair */
            skip = 1;
    } while (skip);
    return ret;
}
/**
 * @brief Major quality check point for each read
 *
 * @param rm Empty read_metrics_t to be updated with the results
 * @param temp_read The read to be assessed
 * @param user_args User arguments to be considered during assessment
 * @param bresults Block wide results of the parsing
 * @param lpos Current position
 * @return void
 * @details Sets rm->skip to 0 when the read passes, to 1 when it is filtered
 *          out and to -4 when its length cannot be determined. For accepted
 *          reads rm->read_length and rm->revcomp are filled in and the
 *          counters in bresults are updated. Duplicate collapsing uses
 *          function-static counters, so the function keeps state between
 *          calls.
 * @todo nothing
 */
void quality_check(read_metrics_t *rm,bam1_t *temp_read,user_arguments_t *user_args,seq_block_t *bresults,int lpos){
	static int pos_dupcounter=0,neg_dupcounter=0;

	rm->skip=0;
	rm->read_length=0;
	rm->genomic_end= bam_calend(&temp_read->core,bam1_cigar(temp_read));

	/* Determine read length */
	if(bam1_pair(temp_read)){
		++bresults->paired;
		if (bam1_ppair(temp_read))++bresults->ppairs;
	}
	++bresults->total_reads;

	if(temp_read->core.qual < user_args->TMAPQ || bam1_unmapped(temp_read)){
		++bresults->lowqual;
		rm->skip=1;
		return;
	}
	if(user_args->UNIQUE && bam1_multimap(temp_read)){
		rm->skip=1;
		return;
	}

	if(!user_args->PAIRED){
		rm->revcomp=bam1_strand(temp_read);
		rm->read_length=bam_cigar2qlen(&temp_read->core,bam1_cigar(temp_read));
	} else if (bam1_ppair(temp_read) && !bam1_notprimary(temp_read)){
		rm->revcomp=bam1_revpair(temp_read);
		if(!user_args->READTHROUGH){
			rm->read_length=bam_cigar2qlen(&temp_read->core,bam1_cigar(temp_read));//sets the read length only!!
		}else if(temp_read->core.isize!=0 ){
			/* only one read of each pair contributes the fragment length */
			if((bam1_firstr(temp_read)&&!bam1_revpair(temp_read))||(bam1_secondr(temp_read)&&bam1_mrevpair(temp_read))){
				rm->read_length=temp_read->core.isize;
			} else {
				rm->skip=1;
				return;
			}
		} else{
			warning("ISIZE not set in SAM/BAM file. Re-run without using the readthrough_pairs option\n");
			rm->skip=-4;
			return;
		}
	} else{
		rm->skip=1;
		return;
	}

	if(!rm->read_length){
		/* fall back to the stored sequence length */
		rm->read_length=temp_read->core.l_qseq;
		if(!rm->read_length){
			/* Report the CIGAR-derived length; passing the CIGAR pointer
			 * itself to %d printed a meaningless value. */
			warning("Read length neither found in core.isize=%d, core.l_qseq=%d or cigar=%d!\n",temp_read->core.isize,temp_read->core.l_qseq,(int)bam_cigar2qlen(&temp_read->core,bam1_cigar(temp_read)));
			rm->skip=-4;
			return;
		}
	}
	/* END */

	if(user_args->STRANDED!=0){
		if((user_args->STRANDED==-1 && !rm->revcomp) || (user_args->STRANDED==1 && rm->revcomp)){
			rm->skip=1;return;
		}
	}

	if(user_args->COLLAPSE>0){
		if(lpos==temp_read->core.pos){
			/* another read starting at the same position */
			if(!rm->revcomp)++pos_dupcounter;
			else ++neg_dupcounter;
			if(pos_dupcounter>=user_args->COLLAPSE || neg_dupcounter>=user_args->COLLAPSE){
				++bresults->collapsed;
				rm->skip=1;
				return;
			}
		}else{
			pos_dupcounter=0;
			neg_dupcounter=0;
		}
	}

	if(!rm->skip){
		rm->revcomp ? ++bresults->neg_strand : ++bresults->pos_strand;
		++bresults->filtered_reads;
		bresults->mapmass+=rm->read_length;
	}
}
int32_t getEnd() const {assert(m_dataPtr); return bam_calend(&m_dataPtr->core, bam1_cigar(m_dataPtr)) + 1;}
/* Count matches (OP_MATCH), mismatches (OP_MISMATCH), insertions * (OP_INS) and deletions (OP_DEL) for an aligned read. Written to * (preallocated, size 4) counts at indices given above. Will ignore * all mis-/match bases if their bq is below min_bq. * * Returns the total number of operations counted (excl. clipped bases * or those with bq<min_bq) or -1 on error. Consecutive indels are * counted as one operation, using INDEL_QUAL_DEFAULT, which is * suboptimal. 0 is a valid return value, e.g. if all bases are below * the quality threshold. * * If quals is not NULL it will be used as a two dim array (has to be * preallocated) with OPs as first dim (len NUM_OP_CATS) and the * qualities of the bases as second dim. NOTE/FIXME: this uses bq for * mis/matches and INDEL_QUAL_DEFAULT for now in case of indels. The * number of elements corresponds to the count entry and can be at max * readlen. * * If target is non-NULL will ignore preloaded variant positions via * var_in_ign_list * * WARNING code duplication with calc_read_alnerrprof but merging the * two functions was too complicated (and the latter is unused anyway) */ int count_cigar_ops(int *counts, int **quals, const bam1_t *b, const char *ref, int min_bq, char *target) { #if 0 #define TRACE 1 #endif int num_ops = 0; /* modelled after bam.c:bam_calend(), bam_format1_core() and * pysam's aligned_pairs (./pysam/csamtools.pyx) */ uint32_t *cigar = bam1_cigar(b); const bam1_core_t *c = &b->core; uint32_t tpos = c->pos; /* pos on genome */ uint32_t qpos = 0; /* pos on read/query */ uint32_t k, i; #if 0 int32_t qlen = (int32_t) bam_cigar2qlen(c, cigar); /* read length */ #else int qlen = b->core.l_qseq; /* read length */ #endif if (! ref) { return -1; } if (! 
counts) { return -1; } memset(counts, 0, NUM_OP_CATS*sizeof(int)); /* loop over cigar to get aligned bases * * read: bam_format1_core(NULL, b, BAM_OFDEC); */ for (k=0; k < c->n_cigar; ++k) { /* n_cigar: number of cigar operations */ int op = cigar[k] & BAM_CIGAR_MASK; /* the cigar operation */ uint32_t l = cigar[k] >> BAM_CIGAR_SHIFT; /* following conditionals could be collapsed to much shorter * code, but we keep them roughly as they were in pysam's * aligned_pairs to make later comparison and handling of * indels easier */ if (op == BAM_CMATCH || op == BAM_CDIFF) { for (i=tpos; i<tpos+l; i++) { int actual_op; assert(qpos < qlen); char ref_nt = ref[i]; char read_nt = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), qpos)]; int bq = bam1_qual(b)[qpos]; if (ref_nt != read_nt || op == BAM_CDIFF) { actual_op = OP_MISMATCH; } else { actual_op = OP_MATCH; } /* ignoring base if below min_bq, independent of type */ if (bq<min_bq) { #ifdef TRACE fprintf(stderr, "TRACE(%s): [M]MATCH ignoring base because of bq=%d at %d (qpos %d)\n", bam1_qname(b), bq, i, qpos); #endif qpos += 1; continue; } /* for mismatches only */ if (target && actual_op == OP_MISMATCH) { var_t fake_var; memset(&fake_var, 0, sizeof(var_t)); fake_var.chrom = target; fake_var.pos = i; /* FIXME evil, evil hack. only works as long as var_in_ign_list only uses chrom and pos */ if (var_in_ign_list(&fake_var)) { #ifdef TRACE fprintf(stderr, "TRACE(%s): MM: ignoring because in ign list at %d (qpos %d)\n", bam1_qname(b), i, qpos); #endif qpos += 1; continue; } } #ifdef TRACE fprintf(stderr, "TRACE(%s): adding [M]MATCH qpos,tpos,ref,read,bq = %d,%d,%c,%c,%d\n", bam1_qname(b), qpos, tpos, ref_nt, read_nt, bq); #endif counts[actual_op] += 1; if (quals) { quals[actual_op][counts[actual_op]-1] = bq; } qpos += 1; } tpos += l; } else if (op == BAM_CINS || op == BAM_CDEL) { if (target) { /* vcf: * indel at tpos 1 means, that qpos 2 is an insertion (e.g. A to AT) * del at tpos 1 means, that qpos 2 is missing (e.g. 
AT to A) */ var_t fake_var; fake_var.chrom = target; fake_var.pos = tpos; if (op==BAM_CINS) { fake_var.pos -= 1; } /* FIXME see above: only works as long as var_in_ign_list only uses chrom and pos */ if (var_in_ign_list(&fake_var)) { if (op == BAM_CINS) { qpos += l; } #ifdef TRACE fprintf(stderr, "TRACE(%s): %c: ignoring because in ign list at tpos %d (qpos %d)\n", bam1_qname(b), op == BAM_CINS? 'I':'D', tpos, qpos); #endif continue; } } #ifdef TRACE fprintf(stderr, "TRACE(%s): adding %c qpos,tpos = %d,%d\n", bam1_qname(b), op==BAM_CINS?'I':'D', qpos, tpos); #endif if (op == BAM_CINS) { counts[OP_INS] += 1; /* counts indel as 1 operation only */ if (quals) { quals[OP_INS][counts[OP_INS]-1] = INDEL_QUAL_DEFAULT; /* FIXME use iq */ } qpos += l;/* forward query pos by length of operation */ } else if (op == BAM_CDEL) { counts[OP_DEL] += 1; /* counts indel as 1 operation only */ if (quals) { quals[OP_DEL][counts[OP_DEL]-1] = INDEL_QUAL_DEFAULT; /* FIXME use dq */ } tpos += l; /* forward genome pos by length of operation */ } else { LOG_FATAL("%s\n", "INTERNAL ERROR: should never get here"); exit(1); } } else if (op == BAM_CREF_SKIP) { tpos += l; } else if (op == BAM_CSOFT_CLIP) { #if 0 printf("SOFT CLIP qpos = %d\n", qpos); #endif qpos += l; } else if (op != BAM_CHARD_CLIP) { LOG_WARN("Untested op %d in cigar %s\n", op, cigar_str_from_bam(b)); /* don't think we need to do anything here */ } } /* for k */ assert(qpos == bam_calend(&b->core, bam1_cigar(b))); /* FIXME correct assert? what if hard clipped? */ if (qpos != qlen) { LOG_WARN("got qpos=%d and qlen=%d for cigar %s l_qseq %d in read %s\n", qpos, qlen, cigar_str_from_bam(b), b->core.l_qseq, bam1_qname(b)); } assert(qpos == qlen); num_ops = 0; for (i=0; i<NUM_OP_CATS; i++) { num_ops += counts[i]; #ifdef TRACE int j; for (j=0; j<counts[i]; j++) { fprintf(stderr, "TRACE(%s) op %s #%d: %d\n", bam1_qname(b), op_cat_str[i], j, quals[i][j]); } #endif } return num_ops; }
/* Counts probability of non-match count along the read after * subtracting error prob at that position (using the original * orientation). used_pos is an array of ints indicating whether * position was used or not (trimmed, clipped etc). alnerrprof and * used_pos must be of at least length b->core.l_qseq. Note: will add * to alnerrprof and used_pos, i.e. arrays should be initialized to 0 if * you don't want aggregate values. * * WARNING code duplication with count_cigar_ops but merging the two * functions is messy. */ void calc_read_alnerrprof(double *alnerrprof, unsigned long int *used_pos, const bam1_t *b, const char *ref) { /* modelled after bam.c:bam_calend(), bam_format1_core() and * pysam's aligned_pairs (./pysam/csamtools.pyx) */ uint32_t *cigar = bam1_cigar(b); uint32_t k, i; const bam1_core_t *c = &b->core; #if 0 int32_t qlen = (int32_t) bam_cigar2qlen(c, cigar); /* read length */ #else int qlen = b->core.l_qseq; /* read length */ #endif uint32_t pos = c->pos; /* pos on genome */ uint32_t qpos = 0; /* pos on read/query */ uint32_t qpos_org = bam1_strand(b) ? 
qlen-qpos-1 : qpos;/* original qpos before mapping as possible reverse */ /* loop over cigar to get aligned bases * * read: bam_format1_core(NULL, b, BAM_OFDEC); */ for (k=0; k < c->n_cigar; ++k) { /* n_cigar: number of cigar operations */ int op = cigar[k] & BAM_CIGAR_MASK; /* the cigar operation */ uint32_t l = cigar[k] >> BAM_CIGAR_SHIFT; /* following conditionals could be collapsed to much shorter * code, but we keep them as they were in pysam's * aligned_pairs to make later handling of indels easier */ if (op == BAM_CMATCH || op == BAM_CDIFF) { for (i=pos; i<pos+l; i++) { assert(qpos < qlen); /* case agnostic */ char ref_nt = ref[i]; char read_nt = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), qpos)]; int bq = bam1_qual(b)[qpos]; #if 0 printf("[M]MATCH qpos,i,ref,read = %d,%d,%c,%c\n", qpos, i, ref_nt, read_nt); #endif if (ref_nt != 'N') { if (ref_nt != read_nt || op == BAM_CDIFF) { alnerrprof[qpos_org] += (1.0 - PHREDQUAL_TO_PROB(bq)); } /* otherwise leave at 0.0 but count anyway */ used_pos[qpos_org] += 1; } qpos += 1; qpos_org = bam1_strand(b) ? qlen-qpos-1 : qpos; } pos += l; } else if (op == BAM_CINS) { for (i=pos; i<pos+l; i++) { assert(qpos < qlen); alnerrprof[qpos] += (1.0 - PHREDQUAL_TO_PROB(INDEL_QUAL_DEFAULT)); used_pos[qpos] += 1; #if 0 printf("INS qpos,i = %d,None\n", qpos); #endif qpos += 1; qpos_org = bam1_strand(b) ? qlen-qpos-1 : qpos; } } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { for (i=pos; i<pos+l; i++) { #if 0 printf("DEL qpos,i = None,%d\n", i); #endif if (op == BAM_CDEL) { alnerrprof[qpos] += (1.0 - PHREDQUAL_TO_PROB(INDEL_QUAL_DEFAULT)); used_pos[qpos] += 1; } } pos += l; /* deletion: don't increase qpos */ } else if (op == BAM_CSOFT_CLIP) { #if 0 printf("SOFT CLIP qpos = %d\n", qpos); #endif qpos += l; qpos_org = bam1_strand(b) ? 
qlen-qpos-1 : qpos; } else if (op != BAM_CHARD_CLIP) { LOG_WARN("Unknown op %d in cigar %s\n", op, cigar_str_from_bam(b)); } } /* for k */ assert(pos == bam_calend(&b->core, bam1_cigar(b))); /* FIXME correct assert? what if hard clipped? */ if (qpos != qlen) { LOG_FIXME("got qpos=%d and qlen=%d for cigar %s l_qseq %d\n", qpos, qlen, cigar_str_from_bam(b), b->core.l_qseq); } assert(qpos == qlen); /* FIXME correct assert? What if hard clipped? */ #if 0 fprintf(stderr, "%s:", __FUNCTION__); for (i=0; i< b->core.l_qseq; i++) { fprintf(stderr, " %g/%d", alnerrprof[i], used_pos[i]); } fprintf(stderr, "\n"); #endif }
/// Returns bam_calend() for the alignment held by `e`.
inline uint32_t last_pos(BamEntry const& e) {
    auto const* cigar = bam1_cigar(e);
    return bam_calend(&e->core, cigar);
}
/*
 * Build a newly allocated bam1_t from an alignment_t.
 *
 * The variable-length data block is laid out as: query name (with its
 * terminating NUL), packed CIGAR, packed sequence, qualities, optional
 * fields. The core fields and flag bits are then filled from the alignment.
 *
 * alignment_p   source alignment
 * base_quality  passed through to convert_to_quality_uint8_t()
 *
 * Returns the new record, or NULL if memory could not be allocated. The
 * caller owns the returned record.
 */
bam1_t* convert_to_bam(alignment_t* alignment_p, int base_quality) {
    bam1_t* bam_p;
    uint8_t* data;
    int index_to_data = 0;

    const int sequence_length = strlen(alignment_p->sequence);
    const int name_length = strlen(alignment_p->query_name) + 1; /* includes the NUL */
    const int cigar_length = 4 * alignment_p->num_cigar_operations;
    const int packed_sequence_length = (sequence_length + 1) / 2;

    //data length is the sum of lengths of the five codified fields (cigar, query name, sequence, quality and optional info)
    const int data_length = cigar_length + name_length + packed_sequence_length
                            + sequence_length + alignment_p->optional_fields_length;

    //memory allocation for data vector from data length
    data = (uint8_t*) calloc(data_length, sizeof(uint8_t));
    if (data == NULL) return NULL;

    bam_p = bam_init1(); // -------------------------> 0s.
    if (bam_p == NULL) {
        free(data);
        return NULL;
    }

    //copy query name (the terminating NUL is part of name_length)
    memcpy(&data[index_to_data], alignment_p->query_name, name_length);
    index_to_data += name_length;

    //convert cigar to uint32_t format
    convert_to_cigar_uint32_t(&data[index_to_data], alignment_p->cigar, alignment_p->num_cigar_operations);
    index_to_data += cigar_length;

    //convert sequence to uint8_t format
    convert_to_sequence_uint8_t(&data[index_to_data], alignment_p->sequence, sequence_length);
    index_to_data += packed_sequence_length;

    //convert quality to uint8_t format
    convert_to_quality_uint8_t(&data[index_to_data], alignment_p->quality, sequence_length, base_quality);
    index_to_data += sequence_length;

    //copy optional fields
    memcpy(&data[index_to_data], alignment_p->optional_fields, alignment_p->optional_fields_length);

    //finally data is assigned
    bam_p->data = data;

    //filling bam1_t (not core data)
    bam_p->l_aux = alignment_p->optional_fields_length;
    bam_p->data_len = data_length;
    bam_p->m_data = data_length;

    //filling bam1_core_t structure
    bam_p->core.tid = (int32_t) alignment_p->chromosome;
    bam_p->core.pos = (int32_t) alignment_p->position;
    bam_p->core.mtid = (int32_t) alignment_p->mate_chromosome;
    bam_p->core.mpos = (int32_t) alignment_p->mate_position;
    bam_p->core.qual = (uint32_t) alignment_p->map_quality;
    bam_p->core.isize = (int32_t) alignment_p->template_length;
    bam_p->core.l_qname = name_length;
    bam_p->core.n_cigar = (uint32_t) alignment_p->num_cigar_operations;
    /* needs data, l_qname and n_cigar to be in place already */
    bam_p->core.l_qseq = (int32_t) bam_cigar2qlen(&bam_p->core, bam1_cigar(bam_p)); //length from CIGAR

    //setting flags (each bit is OR-ed in; the flag word starts out as 0)
    if (alignment_p->is_paired_end) bam_p->core.flag |= BAM_FPAIRED;
    if (alignment_p->is_paired_end_mapped) bam_p->core.flag |= BAM_FPROPER_PAIR;
    if (!alignment_p->is_seq_mapped) bam_p->core.flag |= BAM_FUNMAP; //in bam structure is negative flag!!!
    if ((!alignment_p->is_mate_mapped) && (alignment_p->is_paired_end)) bam_p->core.flag |= BAM_FMUNMAP; //in bam structure is negative flag!!!
    if (alignment_p->seq_strand) bam_p->core.flag |= BAM_FREVERSE;
    if (alignment_p->mate_strand) bam_p->core.flag |= BAM_FMREVERSE;
    if (alignment_p->pair_num == 1) {
        bam_p->core.flag |= BAM_FREAD1;
    } else if (alignment_p->pair_num == 2) {
        bam_p->core.flag |= BAM_FREAD2;
    }
    /* NOTE(review): a set primary_alignment field maps to BAM_FSECONDARY;
     * presumably the field actually means "not primary" - verify against the
     * BAM-to-alignment conversion before changing. */
    if (alignment_p->primary_alignment) bam_p->core.flag |= BAM_FSECONDARY;
    if (alignment_p->fails_quality_check) bam_p->core.flag |= BAM_FQCFAIL;
    if (alignment_p->pc_optical_duplicate) bam_p->core.flag |= BAM_FDUP;

    //bin field requires core
    bam_p->core.bin = bam_reg2bin(alignment_p->position, bam_calend(&bam_p->core, bam1_cigar(bam_p)));

    return bam_p;
}
// TODO soft-clipping
/*
 * Re-align read `b` against graph `g` with a heap-driven search and return
 * the (possibly updated) record.
 *
 * Outline of what the code below does:
 *  1. Reads with soft clipping at either end are returned untouched.
 *  2. Color space is detected through sw_align_get_cs(); mate information is
 *     removed from the record.
 *  3. sw_align_bound() supplies an initial best node from the original
 *     alignment; when it finds none the heap is cleared.
 *  4. Start nodes within `offset` of the alignment end (reverse strand) or
 *     alignment start (forward strand) are pushed onto the heap.
 *  5. Nodes are polled from the heap and extended along prev (reverse) or
 *     next (forward) edges until the last read base is reached; the best
 *     scoring end node is remembered.
 *  6. sw_align_update_bam() produces the returned record.
 *
 * The unchanged input record is returned early when a filter reports a
 * negative value or the heap grows beyond max_heap_size.
 */
bam1_t *sw_align(graph_t *g, bam1_t *b, node_t *n, sw_heap_t *heap, char *rg_id, int32_t offset, cov_cutoffs_t *cutoffs, uint8_t correct_bases, uint8_t use_qualities, int32_t max_total_coverage, int32_t max_heap_size)
{
	char *colors = NULL;
	char *color_qualities = NULL;
	char base, qual;
	uint8_t space = SRMA_SPACE_NT;
	uint8_t strand;
	int32_t i, j, aln_start;
	int32_t num_start_nodes_added=0;
	int32_t sw_node_i=-1, sw_node_best_i=-1, sw_node_cur_i=-1, sw_node_next_i=-1;
	int32_t soft_clip_start_l = 0, soft_clip_end_l = 0;

	strand = bam1_strand(b);

	// soft-clipping
	if(1 == strand) { //reverse
		// going from 3'->5'
		soft_clip_start_l = sw_align_get_soft_clip(b, 1);
		soft_clip_end_l = sw_align_get_soft_clip(b, 0);
	}
	else {
		// going from 5'->3'
		soft_clip_start_l = sw_align_get_soft_clip(b, 0);
		soft_clip_end_l = sw_align_get_soft_clip(b, 1);
	}
	// FOR NOW
	// soft-clipped reads are not re-aligned at all
	if(0 < soft_clip_start_l || 0 < soft_clip_end_l) {
		return b;
	}

	// Check color space
	colors = sw_align_get_cs(b);
	if(NULL == colors) {
		space = SRMA_SPACE_NT;
	}
	else {
		space = SRMA_SPACE_CS;
		color_qualities = sw_align_get_cq(b);
		// Some aligners include a quality value for the adapter. A quality value
		// IMHO should not be given for an unobserved (assumed) piece of data. Trim
		// the first quality in this case
		// NOTE(review): color_qualities is passed to strlen() without a NULL
		// check - confirm sw_align_get_cq() cannot return NULL here.
		if(strlen(colors) == strlen(color_qualities)) { // ignore leading quality
			color_qualities++;
		}
		// NOTE(review): cannot trigger as long as the early return for
		// soft-clipped reads above is in place.
		if(0 < soft_clip_start_l || 0 < soft_clip_end_l) {
			srma_error(__func__, "Soft clipping not supported for color space", Exit, OutOfRange);
		}
	}

	// remove mate info
	b->core.flag &= ~(BAM_FPROPER_PAIR | BAM_FMREVERSE | BAM_FMUNMAP);
	b->core.mtid = -1;
	b->core.mpos = -1;
	b->core.isize = 0;

	// re-type heap
	heap->type = (1 == strand) ? SRMA_SW_HEAP_MAX : SRMA_SW_HEAP_MIN;

	// bound with original alignment
	sw_node_best_i = sw_align_bound(g, b, n, heap, strand, colors, color_qualities, space, cutoffs, use_qualities, max_total_coverage, max_heap_size);
	if(0 <= sw_node_best_i) {
		/*
		   sw_heap_reset(heap); // reset the heap, keep old nodes
		   fprintf(stderr, "BOUNDED score=%d coverage_sum=%hu\n",
		   heap->nodes[sw_node_best_i].score,
		   heap->nodes[sw_node_best_i].coverage_sum); // DEBUG
		   */
	}
	else {
		//fprintf(stderr, "NOT BOUNDED\n"); // DEBUG
		// nodes do not need to be preserved
		sw_heap_clear(heap);
	}
	//return b; // HERE DEBUG HERE BUG

	// add start nodes
	if(strand) {
		// reverse strand: the search starts from the last read base
		if(SRMA_SPACE_CS == space) {
			base = nt2int_table[(int)colors[1]];
			qual = color_qualities[0];
		}
		else {
			base = nt4bit_to_int[bam1_seqi(bam1_seq(b), b->core.l_qseq-1)];
			qual = bam1_qual(b)[b->core.l_qseq-1] + 33;
		}
		aln_start = bam_calend(&b->core, bam1_cigar(b));
		// scan downwards from aln_start+offset to aln_start-offset
		for(i=aln_start+offset;aln_start-offset<=i;i--) {
			int32_t pos = graph_get_node_list_index_at_or_before(g, i);
			node_list_t *list = graph_get_node_list(g, pos);
			// NOTE(review): this branch tests pos against 1 while the
			// forward-strand branch tests against 0 - confirm the intended
			// sentinel of graph_get_node_list_index_at_or_before().
			if(1 != pos && NULL != list) {
				for(j=0;j<list->length;j++) {
					node_t *node = list->nodes[j];
					int32_t pass = pass_filters1(g, node, cutoffs, max_total_coverage);
					if(0 == pass) {
						sw_node_i = sw_heap_get_node_i(heap);
						sw_node_init(&heap->nodes[sw_node_i], NULL, node, node->coverage, base, qual, use_qualities, space);
						sw_heap_add_i(heap, sw_node_i);
					}
					else if(pass < 0) {
						sw_heap_clear(heap); // clear heap
						return b;
					}
					// jump the scan position down to the node that was found
					if(node->position < i) {
						i = node->position;
					}
					// counted for every node visited, whatever the filter result
					num_start_nodes_added++;
				}
			}
		}
	}
	else {
		// forward strand: the search starts from the first read base
		if(SRMA_SPACE_CS == space) {
			base = nt2int_table[(int)colors[1]];
			qual = color_qualities[0];
		}
		else {
			base = nt4bit_to_int[bam1_seqi(bam1_seq(b), 0)];
			qual = bam1_qual(b)[0] + 33;
		}
		aln_start = b->core.pos;
		// scan upwards from aln_start-offset to aln_start+offset
		for(i=aln_start-offset;i<=aln_start+offset;i++) {
			int32_t pos = graph_get_node_list_index_at_or_after(g, i);
			node_list_t *list = graph_get_node_list(g, pos);
			if(0 != pos && NULL != list) {
				for(j=0;j<list->length;j++) {
					node_t *node = list->nodes[j];
					int32_t pass = pass_filters1(g, node, cutoffs, max_total_coverage);
					if(0 == pass) {
						sw_node_i = sw_heap_get_node_i(heap);
						sw_node_init(&heap->nodes[sw_node_i], NULL, node, node->coverage, base, qual, use_qualities, space);
						sw_heap_add_i(heap, sw_node_i);
					}
					else if(pass < 0) {
						sw_heap_clear(heap); // clear heap
						return b;
					}
					// NOTE(review): same comparison as in the reverse-strand
					// scan although this loop moves upwards - confirm whether
					// the intent was to skip ahead (i < node->position).
					if(node->position < i) {
						i = node->position;
					}
					num_start_nodes_added++;
				}
			}
		}
	}
	if(0 == num_start_nodes_added) {
		srma_error(__func__, "Did not add any start nodes", Exit, OutOfRange);
	}

	// main search loop: poll nodes until the heap reports a negative index
	sw_node_cur_i = sw_heap_poll_i(heap);
	while(0 <= sw_node_cur_i) {
		if(max_heap_size < heap->queue_end - heap->queue_start + 1) {
			// too many to consider
			sw_heap_clear(heap); // clear heap
			return b;
		}

		sw_node_next_i = sw_heap_peek_i(heap);
		assert(0 <= sw_node_cur_i); // DEBUG
		// Among queued nodes that compare equal to the current one keep only
		// the one with the higher score (ties: higher coverage_sum). Skipped
		// when the current node is an insertion node.
		while(NODE_INSERTION != __node_type(heap->nodes[sw_node_cur_i].node)
				&& 0 <= sw_node_next_i
				&& 0 == sw_node_compare(&heap->nodes[sw_node_cur_i], &heap->nodes[sw_node_next_i], heap->type)) {
			if(heap->nodes[sw_node_cur_i].score < heap->nodes[sw_node_next_i].score ||
					(heap->nodes[sw_node_cur_i].score == heap->nodes[sw_node_next_i].score &&
					 heap->nodes[sw_node_cur_i].coverage_sum < heap->nodes[sw_node_next_i].coverage_sum)) {
				sw_node_cur_i = sw_heap_poll_i(heap);
			}
			else {
				// ignore the next node
				sw_heap_poll_i(heap);
			}
			sw_node_next_i = sw_heap_peek_i(heap);
		}
		sw_node_next_i = -1;

		if(heap->nodes[sw_node_cur_i].read_offset == b->core.l_qseq-1) { // found, keep best
			if(sw_node_best_i < 0 ||
					heap->nodes[sw_node_best_i].score < heap->nodes[sw_node_cur_i].score ||
					(heap->nodes[sw_node_best_i].score == heap->nodes[sw_node_cur_i].score &&
					 heap->nodes[sw_node_best_i].coverage_sum < heap->nodes[sw_node_cur_i].coverage_sum)) {
				//fprintf(stderr, "FOUND BEST\n"); // DEBUG
				sw_node_best_i = sw_node_cur_i;
			}
		}
		else if(0 <= sw_node_best_i && heap->nodes[sw_node_cur_i].score < heap->nodes[sw_node_best_i].score) {
			// ignore, under the assumption that scores can only
			// become more negative.
		}
		else {
			// extend the current node along its outgoing edges
			edge_list_t *list = NULL;
			if(1 == strand) { // reverse
				list = heap->nodes[sw_node_cur_i].node->prev;
			}
			else {
				list = heap->nodes[sw_node_cur_i].node->next;
			}
			{ // get the base and quality
				// the base/quality of the read position after read_offset
				if(SRMA_SPACE_CS == space) {
					base = nt2int_table[(int)colors[1 + (heap->nodes[sw_node_cur_i].read_offset+1)]];
					qual = color_qualities[heap->nodes[sw_node_cur_i].read_offset+1];
				}
				else {
					if(strand) {
						base = nt4bit_to_int[bam1_seqi(bam1_seq(b), b->core.l_qseq-1-heap->nodes[sw_node_cur_i].read_offset-1)];
						qual = bam1_qual(b)[b->core.l_qseq-1-heap->nodes[sw_node_cur_i].read_offset-1] + 33;
					}
					else {
						base = nt4bit_to_int[bam1_seqi(bam1_seq(b), (heap->nodes[sw_node_cur_i].read_offset+1))];
						qual = bam1_qual(b)[(heap->nodes[sw_node_cur_i].read_offset+1)] + 33;
					}
				}
			}
			/*
			   node_t *node = heap->nodes[sw_node_cur_i].node;
			   fprintf(stderr, "NODE %d:%d offset=%d coverage=%d base=%d\n",
			   node->contig, node->position, node->offset, node->coverage, node->base);
			   fprintf(stderr, "SW_NODE read_offset=%d score=%d coverage_sum=%d start_position=%d space=%d\n",
			   heap->nodes[sw_node_cur_i].read_offset,
			   heap->nodes[sw_node_cur_i].score,
			   heap->nodes[sw_node_cur_i].coverage_sum,
			   heap->nodes[sw_node_cur_i].start_position,
			   space);
			   */
			for(i=0;i<list->length;i++) {
				node_t *node_cur = list->nodes[i];
				uint16_t coverage_cur = list->coverages[i];
				int32_t pass = pass_filters(g, node_cur, coverage_cur, cutoffs, max_total_coverage);
				if(0 == pass) {
					// add to the heap
					sw_node_i = sw_heap_get_node_i(heap);
					// DEBUG
					assert(0 <= sw_node_cur_i);
					assert(0 <= heap->nodes[sw_node_cur_i].read_offset);
					sw_node_init(&heap->nodes[sw_node_i], &heap->nodes[sw_node_cur_i], node_cur, coverage_cur, base, qual, use_qualities, space);
					sw_heap_add_i(heap, sw_node_i);
				}
				else if(pass < 0) {
					sw_heap_clear(heap); // clear heap
					return b;
				}
			}
		}
		// get the next node
		sw_node_cur_i = sw_heap_poll_i(heap);
	}

	/*
	   fprintf(stderr, "sw_node_best_i=%d\n", sw_node_best_i); // DEBUG
	   if(0 <= sw_node_best_i) {
	   fprintf(stderr, "END score=%d coverage_sum=%hu\n",
	   heap->nodes[sw_node_best_i].score,
	   heap->nodes[sw_node_best_i].coverage_sum); // DEBUG
	   }
	   */

	// update SAM/BAM
	b = sw_align_update_bam(b, rg_id, heap, sw_node_best_i, space, colors, color_qualities, strand, correct_bases);
	sw_heap_clear(heap); // clear heap
	return b;
}
int main(int argc,char* argv[]) { time_t timestamp, current; int i,j,k; int a,n; char *pc; FILE *input_file; FILE *output_file; FILE* log_file=stderr; bamFile bam_input; bam_header_t *header; bam1_t* b; bam1_core_t *c; char cps_file_name[MAXFILEBUFFLENGTH]=""; char bam_file_name[MAXFILEBUFFLENGTH]=""; char out_file_name[MAXFILEBUFFLENGTH]=""; char log_file_name[MAXFILEBUFFLENGTH]=""; char buff[MAXFILEBUFFLENGTH]; char chr[MAXFILEBUFFLENGTH]; int beg, beg_prev, end, pos, offset; int ref_id, ref_id_prev, label; int s, side; int read_type, mapped_strand; char ch; int limit_counts = 0; int* contig_count[2]; int* contig_index[2]; splice_site** contig_sites[2]; long int n_reads[N_READ_TYPES][2]; long int n_total_reads = 0; long int n_skipped_reads = 0; int max_intron_length=0; int min_intron_length=0; int ignore_gene_labels = 0; int stranded = 1; int rev_compl[2] = {1,0}; int other_end, the_end, donor_id, acceptor_id; int *cigar; int flagged = 0; int margin = 4; /** reading input from the command line **/ timestamp = time(NULL); if(argc==1) { fprintf(stderr, "BAM2SSJ is the utility for fast counting reads covering splice junctions\nCommand line use:\n"); fprintf(stderr, "%s -cps <cps_file> -bam <bam_file> [-out <out_file>] [-log <log_file>] [-maxlen <max_intron_length>] [-minlen <min_intron_length>] [-margin <length>] ",argv[0]); fprintf(stderr, "[-v suppress verbose output] [-read1 0/1] [-read2 0/1] [-g ignore gene labels] [-u unstranded] [-f count reads flagged 0x800 only]\ntype %s -h for more info\n",argv[0]); exit(1); } for(i=1;i<argc;i++) { pc = argv[i]; if(*pc == '-') { if(strcmp(pc+1,"cps") == 0) sscanf(argv[++i], "%s", &cps_file_name[0]); if(strcmp(pc+1,"bam") == 0) sscanf(argv[++i], "%s", &bam_file_name[0]); if(strcmp(pc+1,"out") == 0) sscanf(argv[++i], "%s", &out_file_name[0]); if(strcmp(pc+1,"log") == 0) sscanf(argv[++i], "%s", &log_file_name[0]); if(strcmp(pc+1,"read1") == 0) sscanf(argv[++i], "%i", &rev_compl[0]); if(strcmp(pc+1,"read2") == 0) 
sscanf(argv[++i], "%i", &rev_compl[1]); if(strcmp(pc+1,"lim") == 0) sscanf(argv[++i], "%i", &limit_counts); if(strcmp(pc+1,"minlen") == 0) sscanf(argv[++i], "%i", &min_intron_length); if(strcmp(pc+1,"maxlen") == 0) sscanf(argv[++i], "%i", &max_intron_length); if(strcmp(pc+1,"margin") == 0) sscanf(argv[++i], "%i", &margin); if(strcmp(pc+1,"v") == 0) verbose = 0; if(strcmp(pc+1,"g") == 0) ignore_gene_labels = 1; if(strcmp(pc+1,"u") == 0) stranded = 0; if(strcmp(pc+1,"f") == 0) flagged = 1; if(strcmp(pc+1,"h") ==0 ) { fprintf(stderr, "Input: (1) sorted BAM file\n"); fprintf(stderr, "\t(2) CPS (chromosome-position-strand) tab-delimited file sorted by position (chr1 100 + etc)\n\n"); fprintf(stderr, "\tIn order to get CPS file from gtf, use the utility gtf2cps.sh\n"); fprintf(stderr, "\tImportant: CPS must be sorted by position ONLY!\n\n"); fprintf(stderr, "\tIf the 4th column contains (a numeric) gene label then only splice junctions within the same gene will be considered (unless the '-g' option is active)\n"); fprintf(stderr, "\tThe utility to generate CPS with gene labels is gtf2cps_with_gene_id.sh (or update the script accordingly if you are using genome other than human)\n\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, "\t-maxlen <upper limit on intron length>; 0 = no limit (default=%i)\n",max_intron_length); fprintf(stderr, "\t-minlen <lower limit on intron length>; 0 = no limit (default=%i)\n",min_intron_length); fprintf(stderr, "\t-margin <length> minimum number of flanking nucleotides in the read in order to support SJ or cover EB, (default=%i)\n",margin); fprintf(stderr, "\t-read1 0/1, reverse complement read1 no/yes (default=%i)\n",rev_compl[0]); fprintf(stderr, "\t-read2 0/1, reverse complement read2 no/yes (default=%i)\n",rev_compl[1]); fprintf(stderr, "\t-g ignore gene labels (column 4 of cps), default=%s\n", ignore_gene_labels ? "ON" : "OFF"); fprintf(stderr, "\t-u ignore strand (all reads map to the correct strand), default=%s\n", stranded ? 
"OFF" : "ON"); fprintf(stderr, "\t-f count only reads that are flagged 0x800 (uniquely mapped reads), default=%s\n", flagged ? "ON" : "OFF"); fprintf(stderr, "Output: tab-delimited (default=stdout)\n"); fprintf(stderr, "\tColumn 1 is splice_junction_id\n"); fprintf(stderr, "\tColumns 2-6 are counts of 53, 5X, X3, 50, and 03 reads for the correct (annotated) strand\n"); fprintf(stderr, "\tColumns 7-11 are similar counts for the incorrect (opposite to annotated) strand\n"); fprintf(stderr, "Descriptive read statistics are reported to stderr\n"); exit(1); } } } if(log_file_name[0]==0) { log_file = stderr; } else { log_file = fopen(log_file_name,"w"); if(log_file == NULL) log_file = stderr; } if(bam_file_name[0]==0) { fprintf(log_file,"Bam not specified, exiting\n"); exit(1); } if(cps_file_name[0]==0) { fprintf(log_file,"Input not specified, exiting\n"); exit(1); } if(out_file_name[0]==0) { fprintf(log_file,"[Warning: output set to stdout]\n"); output_file = stdout; } else { output_file = fopen(out_file_name,"w"); if(output_file == NULL) { fprintf(log_file,"[Warning: output set to stdout]\n"); output_file = stdout; } } if(max_intron_length>0) { if(verbose) fprintf(log_file,"[Warning: set max intron length=%i]\n",max_intron_length); } if(ignore_gene_labels) { if(verbose) fprintf(log_file,"[Warning: ignoring gene labels (column 4)]\n"); } if(flagged) { if(verbose) fprintf(log_file,"[Warning: only look at reads flagged 0x800]\n"); } if(margin>0) { if(verbose) fprintf(log_file,"[Warning: read margin set to %i]\n", margin); } if(verbose) { for(s = 0; s < 2; s++) if(rev_compl[s]) fprintf(log_file,"[Warning: take reverse complement of read %i]\n", s+1); fprintf(log_file,"[Warning: stranded = %s]\n", stranded ? 
"TRUE" : "FALSE (always correct strand)"); if(ignore_gene_labels) fprintf(log_file,"[Warning: ignore gene labels (column 4)]\n"); } for(i = 0; i < N_READ_TYPES; i++) for(s = 0; s < 2; s++) n_reads[i][s] = 0; /** initatializing BAM and header **/ bam_input = bam_open(bam_file_name, "r"); header = bam_header_read(bam_input); if(bam_input == NULL || header == NULL) { fprintf(log_file,"BAM can't be opened or contains no header, exiting\n"); exit(1); } /** reading input from CPS **/ input_file = fopen(cps_file_name, "r"); if(input_file == NULL) { fprintf(log_file,"CPS can't be opened, exiting\n"); exit(1); } /** populating gene structure arrays **/ for(s = 0; s < 2; s++) { contig_count[s] = (int*) malloc(sizeof(int) * (header->n_targets + ARRAY_MARGIN)); contig_index[s] = (int*) malloc(sizeof(int) * (header->n_targets + ARRAY_MARGIN)); contig_sites[s] = (splice_site**) malloc(sizeof(splice_site*) * (header->n_targets + ARRAY_MARGIN)); if(contig_count[s] == NULL || contig_sites[s] == NULL || contig_index[s] == NULL) { fprintf(log_file, "Not enought memory, exiting\n"); exit(1); } } for(s = 0; s < 2; s++) for(i=0; i < header->n_targets; i++) contig_count[s][i] = contig_index[s][i] = 0; if(verbose) fprintf(log_file, "Reading %s pass1", cps_file_name); while(fgets(buff, MAXFILEBUFFLENGTH, input_file)) { sscanf(buff, "%s %*i %c", &chr[0], &ch); bam_parse_region(header, chr, &i, &beg, &end); s = (ch == '+' ? 
0 : 1); if(i < header->n_targets && i>=0) contig_count[s][i]++; } for(s = 0; s < 2; s++) { for(i = 0;i < header->n_targets; i++) { contig_sites[s][i] = (splice_site*) malloc(sizeof(splice_site) * (contig_count[s][i] + ARRAY_MARGIN)); if(contig_sites[s][i] == NULL) { fprintf(log_file, "Not enought memory, exiting\n"); exit(1); } } } if(verbose) fprintf(log_file, "\n"); if(verbose) fprintf(log_file, "Reading %s pass2",cps_file_name); fseek(input_file, 0, SEEK_SET); while(fgets(buff, MAXFILEBUFFLENGTH, input_file)) { sscanf(buff, "%s %i %c %i", &chr[0], &pos, &ch, &label); bam_parse_region(header, chr, &i, &beg, &end); s = (ch == '+' ? 0 : 1); if(i < header->n_targets && i>=0) { if(contig_index[s][i]>0) { if(pos < contig_sites[s][i][contig_index[s][i]-1].pos) { fprintf(log_file, "Splice sites weren't sorted, exiting\n"); exit(1); } } contig_sites[s][i][contig_index[s][i]].pos = pos; contig_sites[s][i][contig_index[s][i]].label = ignore_gene_labels ? 0 : label; for(side = 0; side < 2; side++) { contig_sites[s][i][contig_index[s][i]].count00[side] = 0; contig_sites[s][i][contig_index[s][i]].count5X[side] = 0; contig_sites[s][i][contig_index[s][i]].countX3[side] = 0; contig_sites[s][i][contig_index[s][i]].junctions = NULL; } contig_index[s][i]++; } } if(verbose) fprintf(log_file, "\n"); for(s = 0; s < 2; s++) for(i = 0;i < header->n_targets; i++) contig_index[s][i] = 0; /** analysis starts here **/ b = bam_init1(); k = 0; ref_id_prev = -1; beg_prev = -1; while(bam_read1(bam_input, b)>=0) { c = &b->core; ref_id = c->tid; if(ref_id<0) continue; if(flagged && ((c->flag & 0x800) == 0)) { n_skipped_reads++; continue; } if(stranded && ((c->flag & BAM_FREAD1) && (c->flag & BAM_FREAD2) || !(c->flag & BAM_FREAD1) && !(c->flag & BAM_FREAD2))) { n_skipped_reads++; continue; } cigar = bam1_cigar(b); if(ref_id != ref_id_prev && ref_id_prev >= 0) { if(contig_index[0][ref_id_prev] + contig_index[1][ref_id_prev] < contig_count[0][ref_id_prev] + contig_count[1][ref_id_prev]) { 
if(log_file==stderr) progressbar(1, 1, header->target_name[ref_id_prev], verbose); } beg_prev = -1; } /*if(ref_id < ref_id_prev) { fprintf(log_file,"BAM file wasn't sorted, exiting\n"); exit(1); }*/ ref_id_prev = ref_id; beg = c->pos + 1; if(beg < beg_prev) { fprintf(log_file,"BAM file wasn't sorted, exiting\n"); exit(1); } beg_prev = beg; s = ((c->flag & BAM_FREVERSE)>0); mapped_strand = (c->flag & BAM_FREAD1) ? (s + rev_compl[0]) & 1 : (s + rev_compl[1]) & 1; the_end = bam_calend(c, cigar); for(s = 0; s < 1 + stranded; s++) { end = beg; side = (s == mapped_strand) ? 0 : 1; side *= stranded; // keep reading until the currect site is on the same chromosome downstream of the read while(contig_sites[s][ref_id][contig_index[s][ref_id]].pos < beg && contig_index[s][ref_id] < contig_count[s][ref_id]) { contig_index[s][ref_id]++; if(log_file==stderr) progressbar(contig_index[0][ref_id]+contig_index[1][ref_id], contig_count[0][ref_id]+contig_count[1][ref_id], header->target_name[ref_id], verbose); } read_type = RT_OTHER; if(contig_index[s][ref_id]<contig_count[s][ref_id]) { // check if the read is a split read and find its other end read_type = RT_GENOME; for(i = 0; i < c->n_cigar; i++) { offset = cigar[i] >> 4; switch(cigar[i] & 0x0F) { case BAM_CMATCH: end += offset; // match to the reference break; case BAM_CINS: end += 0; // insertion to the reference, pointer stays unchanged break; case BAM_CDEL: end += offset; // deletion from the reference (technically the same as 'N') pointer moves break; case BAM_CREF_SKIP: other_end = end + offset; donor_id = acceptor_id = -INFTY; if(end - beg < margin) break; if(the_end - other_end < margin) break; for(j = contig_index[s][ref_id]; contig_sites[s][ref_id][j].pos <= other_end && j < contig_count[s][ref_id];j++) { if(contig_sites[s][ref_id][j].pos - end < min_intron_length && min_intron_length > 0) continue; if(contig_sites[s][ref_id][j].pos - end > max_intron_length && max_intron_length > 0) break; 
if(contig_sites[s][ref_id][j].label == contig_sites[s][ref_id][contig_index[s][ref_id]].label) { if(contig_sites[s][ref_id][j].pos == end - 1) donor_id = j; if(contig_sites[s][ref_id][j].pos == other_end) acceptor_id = j; } } if(donor_id>0 && acceptor_id>0) { update_count(&contig_sites[s][ref_id][donor_id].junctions, acceptor_id, side); contig_sites[s][ref_id][donor_id].count5X[side]++; contig_sites[s][ref_id][acceptor_id].countX3[side]++; read_type = RT_KJUNCT; } else { read_type = RT_UJUNCT; } end = other_end; break; case BAM_CSOFT_CLIP: case BAM_CHARD_CLIP: case BAM_CPAD: break; default: read_type = RT_OTHER; } } if(read_type == RT_GENOME) { for(j=contig_index[s][ref_id]; beg + margin <= contig_sites[s][ref_id][j].pos && contig_sites[s][ref_id][j].pos < end - margin && j<contig_count[s][ref_id]; j++) { contig_sites[s][ref_id][j].count00[side]++; read_type = RT_OVRLAP; k++; } } } n_reads[read_type][side]++; } n_total_reads++; if(k>limit_counts && limit_counts>0) break; }