示例#1
0
/*
 * Feed one alignment into the pileup iterator, or signal end of input.
 *
 * iter  pileup iterator; nothing is done if it is already in the error state
 * b     record to add, or NULL to mark end of input (sets iter->is_eof)
 *
 * A record is silently dropped (return 0) when it has a negative tid, when
 * any of its flag bits is set in iter->flag_mask, or when it starts at the
 * iterator's current tid/pos and iter->mp->cnt already exceeds iter->maxcnt.
 *
 * Returns 0 on success (including dropped records and EOF) and -1 when the
 * iterator is in the error state or the input turns out not to be sorted.
 */
int bam_plp_push(bam_plp_t iter, const bam1_t *b)
{
	if (iter->error) return -1;

	if (!b) { // no more input
		iter->is_eof = 1;
		return 0;
	}

	// records that never enter the pileup
	if (b->core.tid < 0) return 0;
	if (b->core.flag & iter->flag_mask) return 0;
	if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt) return 0;

	// fill the spare node at the tail of the list
	bam_copy1(&iter->tail->b, b);
	iter->tail->beg = b->core.pos;
	iter->tail->end = bam_calend(&b->core, bam1_cigar(b));
	iter->tail->s = g_cstate_null; // initialize cstate_t
	iter->tail->s.end = iter->tail->end - 1;

	// the input has to be sorted by (tid, pos)
	if (b->core.tid < iter->max_tid) {
		fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n");
		iter->error = 1;
		return -1;
	}
	if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
		fprintf(stderr, "[bam_pileup_core] the input is not sorted (reads out of order)\n");
		iter->error = 1;
		return -1;
	}
	iter->max_tid = b->core.tid;
	iter->max_pos = iter->tail->beg;

	// keep the node only if it ends past the current position or lies on a
	// later reference; otherwise the same tail node is reused by the next push
	if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
		iter->tail->next = mp_alloc(iter->mp);
		iter->tail = iter->tail->next;
	}
	return 0;
}
示例#2
0
/* Append "<segment index><strand><CIGAR ops>" describing record b to str. */
static void template_cigar_put_segment(bam1_t *b, kstring_t *str)
{
	uint32_t *ops = bam1_cigar(b);
	int k;
	kputc((b->core.flag & BAM_FREAD1)? '1' : '2', str); // segment index
	kputc((b->core.flag & BAM_FREVERSE)? 'R' : 'F', str); // strand
	for (k = 0; k < b->core.n_cigar; ++k) {
		kputw(bam_cigar_oplen(ops[k]), str);
		kputc(bam_cigar_opchr(ops[k]), str);
	}
}

/*
 * Build a "template CIGAR" for a pair of records and attach it as a "CT:Z"
 * aux tag to whichever of the two has the smaller start coordinate.
 *
 * The string is: segment/strand/CIGAR of the leftmost record, the distance
 * from its end (bam_calend) to the start of the other record followed by
 * 'T', then segment/strand/CIGAR of the other record.
 *
 * str is used as scratch space and is always reset. Nothing is appended to
 * either record when they lie on different references or have no coordinate.
 */
void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str)
{
	bam1_t *first = b1, *second = b2;
	int end;

	str->l = 0;
	if (b1->core.tid != b2->core.tid || b1->core.tid < 0) return; // coordinateless or not on the same chr; skip
	if (b1->core.pos > b2->core.pos) { // make sure "first" has the smaller coordinate
		first = b2;
		second = b1;
	}

	template_cigar_put_segment(first, str);
	end = bam_calend(&first->core, bam1_cigar(first));
	kputw(second->core.pos - end, str);
	kputc('T', str);
	template_cigar_put_segment(second, str);

	bam_aux_append(first, "CT", 'Z', str->l+1, (uint8_t*)str->s);
}
示例#3
0
// currently, this function ONLY works if each read has one hit
void bam_mating_core(bamFile in, bamFile out)
{
	bam_header_t *header;
	bam1_t *b[2];
	int curr, has_prev, pre_end = 0, cur_end;
	kstring_t str;

	str.l = str.m = 0; str.s = 0;
	header = bam_header_read(in);
	bam_header_write(out, header);

	b[0] = bam_init1();
	b[1] = bam_init1();
	curr = 0; has_prev = 0;
	while (bam_read1(in, b[curr]) >= 0) {
		bam1_t *cur = b[curr], *pre = b[1-curr];
		if (cur->core.tid < 0) continue;
		cur_end = bam_calend(&cur->core, bam1_cigar(cur));
		if (cur_end > (int)header->target_len[cur->core.tid]) cur->core.flag |= BAM_FUNMAP;
		if (cur->core.flag & BAM_FSECONDARY) continue; // skip secondary alignments
		if (has_prev) {
			if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name
				cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos;
				pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos;
				if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))
					&& !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // set TLEN/ISIZE
				{
					uint32_t cur5, pre5;
					cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos;
					pre5 = (pre->core.flag&BAM_FREVERSE)? pre_end : pre->core.pos;
					cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5;
				} else cur->core.isize = pre->core.isize = 0;
				if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FMREVERSE;
				else cur->core.flag &= ~BAM_FMREVERSE;
				if (cur->core.flag&BAM_FREVERSE) pre->core.flag |= BAM_FMREVERSE;
				else pre->core.flag &= ~BAM_FMREVERSE;
				if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; }
				if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; }
				bam_template_cigar(pre, cur, &str);
				bam_write1(out, pre);
				bam_write1(out, cur);
				has_prev = 0;
			} else { // unpaired or singleton
				pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0;
				if (pre->core.flag & BAM_FPAIRED) {
					pre->core.flag |= BAM_FMUNMAP;
					pre->core.flag &= ~BAM_FMREVERSE & ~BAM_FPROPER_PAIR;
				}
				bam_write1(out, pre);
			}
		} else has_prev = 1;
		curr = 1 - curr;
		pre_end = cur_end;
	}
	if (has_prev) bam_write1(out, b[1-curr]);
	bam_header_destroy(header);
	bam_destroy1(b[0]);
	bam_destroy1(b[1]);
	free(str.s);
}
示例#4
0
 // Replace the CIGAR of the wrapped BAM record with the one parsed from the
 // text `cigar` (SAM notation), keeping any data stored after the old CIGAR.
 //
 // cigar: CIGAR string; NULL or "*" means "no CIGAR", in which case a record
 //        not flagged as unmapped gets a warning and is flagged unmapped.
 //
 // Accepted operators are M, =, X (all three stored as BAM_CMATCH), I, D, N,
 // S, H and P, in either letter case. Anything else is reported through
 // GError(). Finally the bin is recomputed and setupCoordinates() is called.
 void GBamRecord::set_cigar(const char* cigar) {
   //requires b->core.pos and b->core.flag to have been set properly PRIOR to this call
   int doff=b->core.l_qname;
   uint8_t* after_cigar=NULL;
   int after_cigar_len=0;
   uint8_t* prev_bdata=NULL;
   if (b->data_len>doff) {
      //cigar string already allocated, replace it
      int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data
      after_cigar=b->data+d;
      after_cigar_len=b->data_len-d;
      }
   const char *s;
   char *t;
   int i, op;
   long x;
   b->core.n_cigar = 0;
   if (cigar != NULL && strcmp(cigar, "*") != 0) {
        //first pass: count the operations and reject foreign characters
        for (s = cigar; *s; ++s) {
            //'=' is a valid operator (handled by the parser below) but is
            //neither a letter nor a digit, so it has to be accepted explicitly;
            //the casts keep the <ctype> calls defined for negative char values
            if (isalpha((unsigned char)*s) || *s == '=') b->core.n_cigar++;
            else if (!isdigit((unsigned char)*s)) {
                 GError("Error: invalid CIGAR character (%s)\n",cigar);
                 }
            }
        if (after_cigar_len>0) { //replace/insert into existing full data
             //after_cigar points into the previous buffer, which is returned
             //as prev_bdata and released once the tail has been copied over
             prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len);
             memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len);
             free(prev_bdata);
             }
           else {
             realloc_bdata(b, doff + b->core.n_cigar * 4);
             }
        //second pass: encode every <length><operator> pair
        for (i = 0, s = cigar; i != b->core.n_cigar; ++i) {
            x = strtol(s, &t, 10);
            op = toupper((unsigned char)*t);
            if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH;
            else if (op == 'I') op = BAM_CINS;
            else if (op == 'D') op = BAM_CDEL;
            else if (op == 'N') op = BAM_CREF_SKIP; //has_Introns=true;
            else if (op == 'S') op = BAM_CSOFT_CLIP; //soft_Clipped=true;
            else if (op == 'H') op = BAM_CHARD_CLIP; //hard_Clipped=true;
            else if (op == 'P') op = BAM_CPAD;
            else GError("Error: invalid CIGAR operation (%s)\n",cigar);
            s = t + 1;
            bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op;
        }
        if (*s) GError("Error: unmatched CIGAR operation (%s)\n",cigar);
        b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b)));
    } else {//no CIGAR string given
        //NOTE(review): if the record already held CIGAR operations followed by
        //other data, n_cigar is reset here without moving that data - confirm
        //that callers never take this path on such a record
        if (!(b->core.flag&BAM_FUNMAP)) {
            GMessage("Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data);
            b->core.flag |= BAM_FUNMAP;
        }
        b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1);
    }
   setupCoordinates();
   } //set_cigar()
示例#5
0
/*
 * Compact `buf` and refill it with records read from `in`.
 *
 * Leading cells whose record pointer is NULL are squeezed out first. Records
 * are then appended until the read fails, or until the buffer has grown by a
 * block of BLOCK_SIZE entries and the latest record is either unmapped or
 * starts beyond the smallest `rpos` seen so far (otherwise another block is
 * granted).
 *
 * buf->x is set to the index of the first appended record whose tid differs
 * from the leading tid; if no such record was seen and the loop stopped
 * without a read failure it is set to buf->n; otherwise it stays -1.
 *
 * Returns the number of elements in the buffer.
 */
static int fill_buf(samfile_t *in, buffer_t *buf)
{
	int i, ret, last_tid, capacity, first;
	int min_rpos = 0x7fffffff;
	bam1_t *b = bam_init1();
	bam1_core_t *c = &b->core;

	// squeeze out the empty cells at the beginning
	first = 0;
	while (first < buf->n && !buf->buf[first].b) ++first;
	if (first >= buf->n) {
		buf->n = 0; // nothing left in the buffer
	} else if (first > 0) {
		memmove(buf->buf, buf->buf + first, sizeof(elem_t) * (buf->n - first));
		buf->n = buf->n - first;
	}

	// smallest non-negative rpos among the records that were kept
	for (i = 0; i < buf->n; ++i) {
		elem_t *kept = buf->buf + i;
		if (!kept->b || kept->rpos < 0) continue;
		if (kept->rpos < min_rpos) min_rpos = kept->rpos;
	}

	// fill the buffer
	buf->x = -1;
	last_tid = buf->n? buf->buf[0].b->core.tid : -1;
	capacity = buf->n + BLOCK_SIZE;
	while ((ret = samread(in, b)) >= 0) {
		elem_t *e;
		uint8_t *qual = bam1_qual(b);
		int is_mapped;

		if (last_tid < 0) last_tid = c->tid;
		if (c->tid != last_tid && buf->x < 0) buf->x = buf->n; // first record of another reference

		if (buf->n >= buf->max) { // enlarge
			buf->max = buf->max? buf->max<<1 : 8;
			// NOTE(review): the realloc result is not checked
			buf->buf = (elem_t*)realloc(buf->buf, sizeof(elem_t) * buf->max);
		}
		e = &buf->buf[buf->n++];
		e->b = bam_dup1(b);
		e->rpos = -1;

		// score: sum of (quality + 1) scaled by sqrt(length + 1); -1 if unmapped
		e->score = 0;
		for (i = 0; i < c->l_qseq; ++i) e->score += qual[i] + 1;
		e->score = (double)e->score / sqrt(c->l_qseq + 1);
		is_mapped = !(c->tid < 0 || c->tid >= in->header->n_targets || (c->flag&BAM_FUNMAP));
		if (!is_mapped) e->score = -1;

		if (is_mapped && (c->flag & BAM_FREVERSE)) {
			// NOTE(review): other users of bam_calend() in this collection treat its
			// result as an absolute coordinate; adding pos again looks suspicious - confirm
			e->rpos = b->core.pos + bam_calend(&b->core, bam1_cigar(b));
			if (min_rpos > e->rpos) min_rpos = e->rpos;
		}

		if (buf->n >= capacity) {
			if (!(is_mapped && c->pos <= min_rpos)) break;
			capacity += BLOCK_SIZE;
		}
	}
	if (ret >= 0 && buf->x < 0) buf->x = buf->n;
	bam_destroy1(b);
	return buf->n;
}
示例#6
0
/* Return the value of bam_calend() for the wrapped alignment. It is computed
   on first use and cached in sam_alignment->rightmost, with GT_UNDEF_ULONG
   meaning "not computed yet". sam_alignment must not be NULL. */
unsigned long gt_sam_alignment_rightmost_pos(GtSamAlignment *sam_alignment)
{
  gt_assert(sam_alignment != NULL);
  if (sam_alignment->rightmost != GT_UNDEF_ULONG)
    return sam_alignment->rightmost; /* cached */

  sam_alignment->rightmost = (unsigned long)bam_calend(
      &sam_alignment->s_alignment->core,
      bam1_cigar(sam_alignment->s_alignment));
  return sam_alignment->rightmost;
}
示例#7
0
/* Return the value of bam_calend() for the wrapped alignment. It is computed
   on first use and cached in sam_alignment->rightmost, with GT_UNDEF_UWORD
   meaning "not computed yet". sam_alignment must not be NULL. */
GtUword gt_sam_alignment_rightmost_pos(GtSamAlignment *sam_alignment)
{
  gt_assert(sam_alignment != NULL);
  if (sam_alignment->rightmost != GT_UNDEF_UWORD)
    return sam_alignment->rightmost; /* cached */

  sam_alignment->rightmost = (GtUword)bam_calend(
      &sam_alignment->s_alignment->core,
      bam1_cigar(sam_alignment->s_alignment));
  return sam_alignment->rightmost;
}
示例#8
0
文件: bam_plcmd.c 项目: 9beckert/TIR
/*
 * Read callback for the multi-sample pileup: fetch the next record that
 * passes all filters into `b`.
 *
 * data  the per-file mplp_aux_t (file handle, optional iterator, reference
 *       sequence and configuration)
 * b     record to fill
 *
 * Records are read (through the iterator when one is set) until one is
 * accepted. A record is rejected when it
 *   - is unmapped (tid < 0 or BAM_FUNMAP),
 *   - does not overlap the BED regions, if any were given,
 *   - carries an RG tag that is present in the read-group hash,
 *   - gets a negative value from bam_cap_mapQ(),
 *   - has a mapping quality below conf->min_mq, or
 *   - is a paired read that is not a proper pair while MPLP_NO_ORPHAN is set.
 * Base qualities are shifted down by 31 (floored at 0) when MPLP_ILLUMINA13
 * is set, and bam_prob_realn_core() is applied when the reference is loaded
 * and MPLP_REALN is set.
 *
 * Returns the result of the last read call; a negative value means no
 * further record could be read.
 */
static int mplp_func(void *data, bam1_t *b)
{
	extern int bam_realn(bam1_t *b, const char *ref);
	extern int bam_prob_realn_core(bam1_t *b, const char *ref, int);
	extern int bam_cap_mapQ(bam1_t *b, char *ref, int thres);
	mplp_aux_t *ma = (mplp_aux_t*)data;
	int ret, skip = 0;
	do {
		int has_ref;
		ret = ma->iter? bam_iter_read(ma->fp, ma->iter, b) : bam_read1(ma->fp, b);
		if (ret < 0) break;
		if (b->core.tid < 0 || (b->core.flag&BAM_FUNMAP)) { // exclude unmapped reads
			skip = 1;
			continue;
		}
		if (ma->conf->bed) { // test overlap
			skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b)));
			if (skip) continue;
		}
		if (ma->conf->rghash) { // exclude read groups
			uint8_t *rg = bam_aux_get(b, "RG");
			skip = (rg && bcf_str2id(ma->conf->rghash, (const char*)(rg+1)) >= 0);
			if (skip) continue;
		}
		if (ma->conf->flag & MPLP_ILLUMINA13) {
			int i;
			uint8_t *qual = bam1_qual(b);
			for (i = 0; i < b->core.l_qseq; ++i)
				qual[i] = qual[i] > 31? qual[i] - 31 : 0;
		}
		has_ref = (ma->ref && ma->ref_id == b->core.tid)? 1 : 0;
		skip = 0;
		if (has_ref && (ma->conf->flag&MPLP_REALN)) bam_prob_realn_core(b, ma->ref, (ma->conf->flag & MPLP_EXT_BAQ)? 3 : 1);
		if (has_ref && ma->conf->capQ_thres > 10) {
			int q = bam_cap_mapQ(b, ma->ref, ma->conf->capQ_thres);
			if (q < 0) skip = 1;
			else if (b->core.qual > q) b->core.qual = q;
		}
		// These filters must run whether or not the mapping quality was capped
		// above; chaining them with "else" disabled both of them whenever capping
		// was active. They are applied after capping so that min_mq sees the
		// capped value.
		if (b->core.qual < ma->conf->min_mq) skip = 1;
		else if ((ma->conf->flag&MPLP_NO_ORPHAN) && (b->core.flag&1) && !(b->core.flag&2)) skip = 1; // flag bit 1: paired, bit 2: proper pair
	} while (skip);
	return ret;
}
示例#9
0
/**
* @brief Major quality check point for each read
*
* @param rm Read metrics to be filled in: skip, read_length, genomic_end and revcomp
* @param temp_read The read to be assessed
* @param user_args User arguments to be considered during assessment (TMAPQ, UNIQUE, PAIRED, READTHROUGH, STRANDED, COLLAPSE)
* @param bresults Block wide counters, updated according to the outcome
* @param lpos Position compared with the start of the read when collapsing duplicates
* @return void
* @details On return rm->skip is 0 if the read is to be used, 1 if it was
*          filtered out and -4 if a warning had to be issued (insert size
*          missing in readthrough mode, or no usable read length). The
*          duplicate counters used for collapsing are function-local statics,
*          so their state is shared by all calls.
*/
void quality_check(read_metrics_t *rm,bam1_t *temp_read,user_arguments_t *user_args,seq_block_t *bresults,int lpos){
	static int pos_dupcounter=0,neg_dupcounter=0;
	rm->skip=0;
	rm->read_length=0;

	rm->genomic_end= bam_calend(&temp_read->core,bam1_cigar(temp_read));

	/* Pairing statistics are collected before any filtering */
	if(bam1_pair(temp_read)){
		++bresults->paired;
		if (bam1_ppair(temp_read))++bresults->ppairs;
	}

	++bresults->total_reads;
	if(temp_read->core.qual < user_args->TMAPQ || bam1_unmapped(temp_read)){
		++bresults->lowqual;
		rm->skip=1;
		return;
	}

	if(user_args->UNIQUE && bam1_multimap(temp_read)){
		rm->skip=1;
		return;
	}

	/* Determine strand and read length */
	if(!user_args->PAIRED){
		rm->revcomp=bam1_strand(temp_read);
		rm->read_length=bam_cigar2qlen(&temp_read->core,bam1_cigar(temp_read));
	} else if (bam1_ppair(temp_read) && !bam1_notprimary(temp_read)){
		rm->revcomp=bam1_revpair(temp_read);
		if(!user_args->READTHROUGH){rm->read_length=bam_cigar2qlen(&temp_read->core,bam1_cigar(temp_read));//sets the read length only!!
		}else if(temp_read->core.isize!=0 ){
				/* in readthrough mode only one read per pair passes and carries the insert size */
				if((bam1_firstr(temp_read)&&!bam1_revpair(temp_read))||(bam1_secondr(temp_read)&&bam1_mrevpair(temp_read))){
					rm->read_length=temp_read->core.isize;
				} else {
					rm->skip=1;
					return;
				}
		} else{
			warning("ISIZE not set in SAM/BAM file. Re-run without using the readthrough_pairs option\n");
			rm->skip=-4;
			return;
		}
	} else{
		rm->skip=1;
		return;
	}

	/* Fall back to the stored sequence length if nothing else gave a length */
	if(!rm->read_length){
		rm->read_length=temp_read->core.l_qseq;
		if(!rm->read_length){
			/* report the CIGAR-derived length; handing the CIGAR pointer itself to %d is undefined behaviour */
			warning("Read length neither found in core.isize=%d, core.l_qseq=%d or cigar=%d!\n",temp_read->core.isize,temp_read->core.l_qseq,(int)bam_cigar2qlen(&temp_read->core,bam1_cigar(temp_read)));
			rm->skip=-4;
			return;
		}
	}
	/* END */

	if(user_args->STRANDED!=0){
		if((user_args->STRANDED==-1 && !rm->revcomp) || (user_args->STRANDED==1 && rm->revcomp)){
			rm->skip=1;return;
		}
	}

	/* Collapse reads starting at lpos once a per-strand counter reaches COLLAPSE */
	if(user_args->COLLAPSE>0){
		if(lpos==temp_read->core.pos){
			if(!rm->revcomp)++pos_dupcounter;
			else ++neg_dupcounter;
			if(pos_dupcounter>=user_args->COLLAPSE || neg_dupcounter>=user_args->COLLAPSE){
				++bresults->collapsed;
				rm->skip=1;
				return;
			}
		}else{
			pos_dupcounter=0;
			neg_dupcounter=0;
		}
	}

	if(!rm->skip){
		rm->revcomp ? ++bresults->neg_strand : ++bresults->pos_strand;
		++bresults->filtered_reads;
		bresults->mapmass+=rm->read_length;
	}
}
示例#10
0
文件: types.hpp 项目: Brainiarc7/TS
 // Returns bam_calend() of the wrapped record plus one; the record pointer
 // must be set.
 int32_t getEnd() const {
     assert(m_dataPtr);
     return bam_calend(&m_dataPtr->core, bam1_cigar(m_dataPtr)) + 1;
 }
示例#11
0
文件: samutils.c 项目: CSB5/lofreq
/* Count matches (OP_MATCH), mismatches (OP_MISMATCH), insertions
 * (OP_INS) and deletions (OP_DEL) for an aligned read. Written to
 * (preallocated, size NUM_OP_CATS) counts at the indices given above.
 * Will ignore all mis-/match bases if their bq is below min_bq.
 *
 * Returns the total number of operations counted (excl. clipped bases
 * or those with bq<min_bq) or -1 if ref or counts is NULL. Each indel
 * is counted as one operation regardless of its length, using
 * INDEL_QUAL_DEFAULT, which is suboptimal. 0 is a valid return value,
 * e.g. if all bases are below the quality threshold.
 *
 * If quals is not NULL it will be used as a two dim array (has to be
 * preallocated) with OPs as first dim (len NUM_OP_CATS) and the
 * qualities of the bases as second dim. NOTE/FIXME: this uses bq for
 * mis/matches and INDEL_QUAL_DEFAULT for now in case of indels. The
 * number of elements corresponds to the count entry and can be at max
 * readlen.
 *
 * If target is non-NULL, mismatches and indels located at positions
 * found by var_in_ign_list() are not counted. An ignored event still
 * advances the read and reference coordinates like a counted one, so
 * the bases that follow stay aligned.
 *
 * WARNING code duplication with calc_read_alnerrprof but merging the
 * two functions was too complicated (and the latter is unused anyway)
 */
int
count_cigar_ops(int *counts, int **quals, const bam1_t *b,
                const char *ref, int min_bq, char *target)
{
#if 0
#define TRACE 1
#endif
     int num_ops = 0;
     /* modelled after bam.c:bam_calend(), bam_format1_core() and
      * pysam's aligned_pairs (./pysam/csamtools.pyx)
      */
     uint32_t *cigar = bam1_cigar(b);
     const bam1_core_t *c = &b->core;
     uint32_t tpos = c->pos; /* pos on genome */
     uint32_t qpos = 0; /* pos on read/query */
     uint32_t k, i;
#if 0
     int32_t qlen = (int32_t) bam_cigar2qlen(c, cigar); /* read length */
#else
     int qlen = b->core.l_qseq; /* read length */
#endif

     if (! ref) {
          return -1;
     }
     if (! counts) {
          return -1;
     }

     memset(counts, 0, NUM_OP_CATS*sizeof(int));

     /* loop over cigar to get aligned bases
      *
      * read: bam_format1_core(NULL, b, BAM_OFDEC);
      */
     for (k=0; k < c->n_cigar; ++k) { /* n_cigar: number of cigar operations */
          int op = cigar[k] & BAM_CIGAR_MASK; /* the cigar operation */
          uint32_t l = cigar[k] >> BAM_CIGAR_SHIFT;

          /* following conditionals could be collapsed to much shorter
           * code, but we keep them roughly as they were in pysam's
           * aligned_pairs to make later comparison and handling of
           * indels easier
           */
          if (op == BAM_CMATCH || op == BAM_CDIFF) {
               for (i=tpos; i<tpos+l; i++) {                             
                    int actual_op;
                    assert(qpos < qlen);
                    char ref_nt = ref[i];
                    char read_nt = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), qpos)];
                    int bq = bam1_qual(b)[qpos];

                    if (ref_nt != read_nt || op == BAM_CDIFF) {
                         actual_op = OP_MISMATCH;
                    } else {
                         actual_op = OP_MATCH;
                    }

                    /* ignoring base if below min_bq, independent of type */
                    if (bq<min_bq) {
#ifdef TRACE
                         fprintf(stderr, "TRACE(%s): [M]MATCH ignoring base because of bq=%d at %d (qpos %d)\n", bam1_qname(b), bq, i, qpos);
#endif
                         qpos += 1;
                         continue;
                    }

                    /* for mismatches only */
                    if (target && actual_op == OP_MISMATCH) {
                         var_t fake_var;
                         memset(&fake_var, 0, sizeof(var_t));
                         fake_var.chrom = target;
                         fake_var.pos = i;
                         /* FIXME evil, evil hack. only works as long as var_in_ign_list only uses chrom and pos */
                         if (var_in_ign_list(&fake_var)) {

#ifdef TRACE
                              fprintf(stderr, "TRACE(%s): MM: ignoring because in ign list at %d (qpos %d)\n", bam1_qname(b), i, qpos);
#endif
                              qpos += 1;
                              continue;
                         } 
                    }

#ifdef TRACE
                    fprintf(stderr, "TRACE(%s): adding [M]MATCH qpos,tpos,ref,read,bq = %d,%d,%c,%c,%d\n", bam1_qname(b), qpos, tpos, ref_nt, read_nt, bq);
#endif                    
                    counts[actual_op] += 1;
                    if (quals) {
                         quals[actual_op][counts[actual_op]-1] = bq;
                    }

                    qpos += 1;
               }
               tpos += l;

          } else if (op == BAM_CINS || op == BAM_CDEL) {

               if (target) {
                    /* vcf: 
                     * indel at tpos 1 means, that qpos 2 is an insertion  (e.g. A to AT)
                     * del at tpos 1 means, that qpos 2 is missing (e.g. AT to A)
                     */
                    var_t fake_var;
                    /* zero the remaining fields, as done for mismatches above */
                    memset(&fake_var, 0, sizeof(var_t));
                    fake_var.chrom = target;
                    fake_var.pos = tpos;
                    if (op==BAM_CINS) {
                         fake_var.pos -= 1;
                    }
                    /* FIXME see above: only works as long as var_in_ign_list only uses chrom and pos */
                    if (var_in_ign_list(&fake_var)) {
                         /* the event is not counted, but the coordinate it
                          * consumes still has to move on: read bases for an
                          * insertion, reference bases for a deletion.
                          * Skipping the latter shifts every following
                          * reference base of this read. */
                         if (op == BAM_CINS) {
                              qpos += l;
                         } else {
                              tpos += l;
                         }
#ifdef TRACE
                         fprintf(stderr, "TRACE(%s): %c: ignoring because in ign list at tpos %d (qpos %d)\n", bam1_qname(b), op == BAM_CINS? 'I':'D', tpos, qpos);
#endif
                         continue;
                    }
               }

#ifdef TRACE
               fprintf(stderr, "TRACE(%s): adding %c qpos,tpos = %d,%d\n", bam1_qname(b), op==BAM_CINS?'I':'D', qpos, tpos);
#endif                    

               if (op == BAM_CINS) {
                    counts[OP_INS] += 1; /* counts indel as 1 operation only */
                    if (quals) {
                         quals[OP_INS][counts[OP_INS]-1] = INDEL_QUAL_DEFAULT; /* FIXME use iq */
                    }
                    qpos += l;/* forward query pos by length of operation */

               } else if (op == BAM_CDEL) {
                    counts[OP_DEL] += 1; /* counts indel as 1 operation only */
                    if (quals) {
                         quals[OP_DEL][counts[OP_DEL]-1] = INDEL_QUAL_DEFAULT; /* FIXME use dq */
                    }
                    tpos += l; /* forward genome pos by length of operation */

               } else {
                    LOG_FATAL("%s\n", "INTERNAL ERROR: should never get here");
                    exit(1);
               }

          } else if (op == BAM_CREF_SKIP) {
               tpos += l;

          } else if (op == BAM_CSOFT_CLIP) {
#if 0
               printf("SOFT CLIP qpos = %d\n", qpos);
#endif
               qpos += l;

          } else if (op != BAM_CHARD_CLIP) {
               LOG_WARN("Untested op %d in cigar %s\n", op, cigar_str_from_bam(b));
               /* don't think we need to do anything here */
          }
     } /* for k */

     /* bam_calend() is a reference coordinate, so it is the genome position
      * (not the read offset) that has to match it, as in
      * calc_read_alnerrprof. NOTE(review): assumes bam_calend() counts the
      * same operations as the loop above - confirm for the bam.h in use */
     assert(tpos == bam_calend(&b->core, bam1_cigar(b))); /* FIXME correct assert? what if hard clipped? */
     if (qpos != qlen) {
          LOG_WARN("got qpos=%d and qlen=%d for cigar %s l_qseq %d in read %s\n", qpos, qlen, cigar_str_from_bam(b), b->core.l_qseq, bam1_qname(b));
     }
     assert(qpos == qlen);

     num_ops = 0;
     for (i=0; i<NUM_OP_CATS; i++) {
          num_ops += counts[i];
#ifdef TRACE
          int j;
          for (j=0; j<counts[i]; j++) {
               fprintf(stderr, "TRACE(%s) op %s #%d: %d\n", bam1_qname(b), op_cat_str[i], j, quals[i][j]);
          }
#endif
     }
     return num_ops;
}
示例#12
0
文件: samutils.c 项目: CSB5/lofreq
/* Counts probability of non-match count along the read after
 * subtracting error prob at that position (using the original
 * orientation). used_pos is an array of ints indicating whether
 * position was used or not (trimmed, clipped etc). alnerrprof and
 * used_pos must be of at least length b->core.l_qseq. Note: will add
 * to alnerrprof and used_pos, i.e. arrays should be initialized to 0 if
 * you don't want aggregate values.
 *
 * Every entry is indexed by the read position in its original
 * orientation (mirrored for reverse-strand reads), for mismatches as
 * well as for insertions and deletions. Reference positions holding
 * 'N' are skipped for mis-/matches. Indels use INDEL_QUAL_DEFAULT.
 * A deletion is booked once per deleted base at the read position
 * that follows it in the alignment.
 *
 * WARNING code duplication with count_cigar_ops but merging the two
 * functions is messy.
 */
void
calc_read_alnerrprof(double *alnerrprof, unsigned long int *used_pos, 
                   const bam1_t *b, const char *ref)
{
     /* modelled after bam.c:bam_calend(), bam_format1_core() and
      * pysam's aligned_pairs (./pysam/csamtools.pyx)
      */
     uint32_t *cigar = bam1_cigar(b);
     uint32_t k, i;
     const bam1_core_t *c = &b->core;
#if 0
     int32_t qlen = (int32_t) bam_cigar2qlen(c, cigar); /* read length */
#else
     int qlen = b->core.l_qseq; /* read length */
#endif
     uint32_t pos = c->pos; /* pos on genome */
     uint32_t qpos = 0; /* pos on read/query */
     uint32_t qpos_org = bam1_strand(b) ? qlen-qpos-1 : qpos;/* original qpos before mapping as possible reverse */


     /* loop over cigar to get aligned bases
      *
      * read: bam_format1_core(NULL, b, BAM_OFDEC);
      */
     for (k=0; k < c->n_cigar; ++k) { /* n_cigar: number of cigar operations */
          int op = cigar[k] & BAM_CIGAR_MASK; /* the cigar operation */
          uint32_t l = cigar[k] >> BAM_CIGAR_SHIFT;

          /* following conditionals could be collapsed to much shorter
           * code, but we keep them as they were in pysam's
           * aligned_pairs to make later handling of indels easier
           */
          if (op == BAM_CMATCH || op == BAM_CDIFF) {
               for (i=pos; i<pos+l; i++) {                             
                    assert(qpos < qlen);
                    /* case agnostic */
                    char ref_nt = ref[i];
                    char read_nt = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), qpos)];
                    int bq = bam1_qual(b)[qpos];
#if 0
                    printf("[M]MATCH qpos,i,ref,read = %d,%d,%c,%c\n", qpos, i, ref_nt, read_nt);
#endif                    

                    if (ref_nt != 'N') {
                         if (ref_nt != read_nt || op == BAM_CDIFF) {
                              alnerrprof[qpos_org] += (1.0 - PHREDQUAL_TO_PROB(bq));
                         } /* otherwise leave at 0.0 but count anyway */
                         used_pos[qpos_org] += 1;
                    }
                    qpos += 1;
                    qpos_org = bam1_strand(b) ? qlen-qpos-1 : qpos;
               }
               pos += l;

          } else if (op == BAM_CINS) {
               for (i=pos; i<pos+l; i++) {
                    assert(qpos < qlen);
                    
                    /* index by original orientation like the match branch;
                     * the aligned offset put the values of reverse-strand
                     * reads at the mirrored position */
                    alnerrprof[qpos_org] += (1.0 - PHREDQUAL_TO_PROB(INDEL_QUAL_DEFAULT));
                    used_pos[qpos_org] += 1;
#if 0
                    printf("INS qpos,i = %d,None\n", qpos);
#endif
                    qpos += 1;
                    qpos_org = bam1_strand(b) ? qlen-qpos-1 : qpos;
               }
               
          } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
               for (i=pos; i<pos+l; i++) {
#if 0
                    printf("DEL qpos,i = None,%d\n", i);
#endif

                    if (op == BAM_CDEL) {
                         /* same orientation rule as above. NOTE(review):
                          * like before, a deletion that is not followed by
                          * any read base would index past the arrays -
                          * confirm such CIGARs cannot reach this function */
                         alnerrprof[qpos_org] += (1.0 - PHREDQUAL_TO_PROB(INDEL_QUAL_DEFAULT));
                         used_pos[qpos_org] += 1;
                    }
               }
               pos += l;
               /* deletion: don't increase qpos */

          } else if (op == BAM_CSOFT_CLIP) {
#if 0
               printf("SOFT CLIP qpos = %d\n", qpos);
#endif
               qpos += l;
               qpos_org = bam1_strand(b) ? qlen-qpos-1 : qpos;

          } else if (op != BAM_CHARD_CLIP) {
               LOG_WARN("Unknown op %d in cigar %s\n", op, cigar_str_from_bam(b));

          }
     } /* for k */
     assert(pos == bam_calend(&b->core, bam1_cigar(b))); /* FIXME correct assert? what if hard clipped? */
     if (qpos != qlen) {
          LOG_FIXME("got qpos=%d and qlen=%d for cigar %s l_qseq %d\n", qpos, qlen, cigar_str_from_bam(b), b->core.l_qseq);
     }
     assert(qpos == qlen); /* FIXME correct assert? What if hard clipped? */

#if 0
     fprintf(stderr, "%s:", __FUNCTION__);
     for (i=0; i< b->core.l_qseq; i++) {
          fprintf(stderr, " %g/%d", alnerrprof[i], used_pos[i]);
     }
     fprintf(stderr, "\n");
#endif
}
示例#13
0
// Alignment end of the entry as reported by bam_calend() for its core and CIGAR.
inline uint32_t last_pos(BamEntry const& e) {
    const uint32_t *cigar = bam1_cigar(e);
    return bam_calend(&e->core, cigar);
}
示例#14
0
/*
 * Build a newly allocated bam1_t from an alignment_t.
 *
 * alignment_p   source alignment (query name, CIGAR, sequence, quality,
 *               optional fields, coordinates and flag booleans)
 * base_quality  passed through to convert_to_quality_uint8_t()
 *
 * The variable-length data is laid out as: query name (with terminator),
 * CIGAR (4 bytes per operation), packed sequence ((len + 1) / 2 bytes),
 * qualities (len bytes), optional fields.
 *
 * Returns the new record; the caller owns it.
 */
bam1_t* convert_to_bam(alignment_t* alignment_p, int base_quality) {
    bam1_t* bam_p = bam_init1();  // -------------------------> 0s.

    int data_length, sequence_length, qname_length, copy_length, index_to_data = 0;
    uint8_t* data;

    sequence_length = strlen(alignment_p->sequence);
    qname_length = strlen(alignment_p->query_name) + 1;   //terminator included

    //data length is the sum of the lengths of the five encoded fields (cigar, query name, sequence, quality and optional info)
    data_length = (4 * alignment_p->num_cigar_operations) + qname_length + ((sequence_length + 1) / 2) + sequence_length + alignment_p->optional_fields_length;

    //memory allocation for data vector from data length
    //NOTE(review): the allocation result is not checked
    data = (uint8_t*) calloc(data_length, sizeof(uint8_t));

    //copy query name, terminator included
    copy_length = qname_length;
    memcpy(&data[index_to_data], alignment_p->query_name, copy_length);
    index_to_data += copy_length;

    //convert cigar to uint32_t format
    convert_to_cigar_uint32_t(&data[index_to_data], alignment_p->cigar, alignment_p->num_cigar_operations);

    copy_length = (4 * alignment_p->num_cigar_operations);
    index_to_data += copy_length;

    //convert sequence to uint8_t format
    convert_to_sequence_uint8_t(&data[index_to_data], alignment_p->sequence, sequence_length);

    copy_length = ((sequence_length + 1) / 2);
    index_to_data += copy_length;

    //convert quality to uint8_t format
    convert_to_quality_uint8_t(&data[index_to_data], alignment_p->quality, sequence_length, base_quality);

    copy_length = sequence_length;
    index_to_data += copy_length;

    //copy optional fields (nothing to copy when there are none)
    if (alignment_p->optional_fields_length > 0) {
        memcpy(&data[index_to_data], alignment_p->optional_fields, alignment_p->optional_fields_length);
    }

    //finally data is assigned
    bam_p->data = data;

    //filling bam1_t (not core data)
    bam_p->l_aux = alignment_p->optional_fields_length;
    bam_p->data_len = data_length;
    bam_p->m_data = data_length;

    //filling bam1_core_t structure
    bam_p->core.tid = (int32_t) alignment_p->chromosome;
    bam_p->core.pos = (int32_t) alignment_p->position;
    bam_p->core.mtid = (int32_t) alignment_p->mate_chromosome;
    bam_p->core.mpos = (int32_t) alignment_p->mate_position;
    bam_p->core.qual = (uint32_t) alignment_p->map_quality;
    bam_p->core.isize = (int32_t) alignment_p->template_length;
    bam_p->core.l_qname = qname_length;
    bam_p->core.n_cigar = (uint32_t) alignment_p->num_cigar_operations;
    //the sequence and quality blocks above were written with sequence_length,
    //so that is the length the record has to advertise: a length derived
    //from the CIGAR differs whenever the CIGAR is absent or inconsistent and
    //would then misplace the quality and optional fields for every reader
    bam_p->core.l_qseq = (int32_t) sequence_length;

    //setting flags (the flag word starts out as 0, each bit is set at most once)
    if (alignment_p->is_paired_end)   bam_p->core.flag |= BAM_FPAIRED;
    if (alignment_p->is_paired_end_mapped) bam_p->core.flag |= BAM_FPROPER_PAIR;
    if (!alignment_p->is_seq_mapped)   bam_p->core.flag |= BAM_FUNMAP;   //in bam structure is negative flag!!!
    if ((!alignment_p->is_mate_mapped) && (alignment_p->is_paired_end))   bam_p->core.flag |= BAM_FMUNMAP; //in bam structure is negative flag!!!
    if (alignment_p->seq_strand)    bam_p->core.flag |= BAM_FREVERSE;
    if (alignment_p->mate_strand)   bam_p->core.flag |= BAM_FMREVERSE;

    if (alignment_p->pair_num == 1) {
        bam_p->core.flag |= BAM_FREAD1;
    } else if (alignment_p->pair_num == 2) {
        bam_p->core.flag |= BAM_FREAD2;
    }

    //NOTE(review): a set primary_alignment field raises the SECONDARY bit;
    //kept as is - confirm against the code that fills this field
    if (alignment_p->primary_alignment)    bam_p->core.flag |= BAM_FSECONDARY;
    if (alignment_p->fails_quality_check)  bam_p->core.flag |= BAM_FQCFAIL;
    if (alignment_p->pc_optical_duplicate) bam_p->core.flag |= BAM_FDUP;

    //bin field requires the core to be filled in
    bam_p->core.bin = bam_reg2bin(alignment_p->position, bam_calend(&bam_p->core, bam1_cigar(bam_p)));

    return bam_p;
}
示例#15
0
文件: sw_align.c 项目: nh13/SRMA
// TODO soft-clipping
bam1_t *sw_align(graph_t *g, bam1_t *b, node_t *n, sw_heap_t *heap, char *rg_id, int32_t offset, cov_cutoffs_t *cutoffs, uint8_t correct_bases, uint8_t use_qualities, int32_t max_total_coverage, int32_t max_heap_size)
{
	char *colors = NULL;
	char *color_qualities = NULL;
	char base, qual;
	uint8_t space = SRMA_SPACE_NT;
	uint8_t strand;
	int32_t i, j, aln_start;
	int32_t num_start_nodes_added=0;
	int32_t sw_node_i=-1, sw_node_best_i=-1, sw_node_cur_i=-1, sw_node_next_i=-1;
	int32_t soft_clip_start_l = 0, soft_clip_end_l = 0;


	strand = bam1_strand(b);

	// soft-clipping
	if(1 == strand) { //reverse
		// going from 3'->5'
		soft_clip_start_l = sw_align_get_soft_clip(b, 1); 
		soft_clip_end_l = sw_align_get_soft_clip(b, 0);
	}
	else {
		// going from 5'->3'
		soft_clip_start_l = sw_align_get_soft_clip(b, 0); 
		soft_clip_end_l = sw_align_get_soft_clip(b, 1);
	}
	// FOR NOW
	if(0 < soft_clip_start_l || 0 < soft_clip_end_l) {
		return b;
	}

	// Check color space
	colors = sw_align_get_cs(b);
	if(NULL == colors) {
		space = SRMA_SPACE_NT;
	}
	else {
		space = SRMA_SPACE_CS;
		color_qualities  = sw_align_get_cq(b);
		// Some aligners include a quality value for the adapter.  A quality value
		// IMHO should not be given for an unobserved (assumed) peice of data.  Trim
		// the first quality in this case
		if(strlen(colors) == strlen(color_qualities)) {  // ignore leading quality
			color_qualities++;
		}
		if(0 < soft_clip_start_l || 0 < soft_clip_end_l) {
			srma_error(__func__, "Soft clipping not supported for color space", Exit, OutOfRange);
		}
	}	

	// remove mate info 
	b->core.flag &= ~(BAM_FPROPER_PAIR | BAM_FMREVERSE | BAM_FMUNMAP);
	b->core.mtid = -1;
	b->core.mpos = -1;
	b->core.isize = 0;

	// re-type heap
	heap->type = (1 == strand) ? SRMA_SW_HEAP_MAX : SRMA_SW_HEAP_MIN;

	// bound with original alignment
	sw_node_best_i = sw_align_bound(g, b, n, heap, strand, colors, color_qualities, space, cutoffs, use_qualities, max_total_coverage, max_heap_size);
	if(0 <= sw_node_best_i) {
		/*
		sw_heap_reset(heap); // reset the heap, keep old nodes
		   fprintf(stderr, "BOUNDED score=%d coverage_sum=%hu\n", 
		   heap->nodes[sw_node_best_i].score,
		   heap->nodes[sw_node_best_i].coverage_sum); // DEBUG
		   */
	}
	else {
		//fprintf(stderr, "NOT BOUNDED\n"); // DEBUG
		// nodes do not need to be preserved
		sw_heap_clear(heap);
	}
	//return b; // HERE DEBUG HERE BUG
                                        
	// add start nodes
	if(strand) {
		if(SRMA_SPACE_CS == space) {
			base = nt2int_table[(int)colors[1]];
			qual = color_qualities[0]; 
		}
		else {
			base = nt4bit_to_int[bam1_seqi(bam1_seq(b), b->core.l_qseq-1)];
			qual = bam1_qual(b)[b->core.l_qseq-1] + 33;
		}
		aln_start = bam_calend(&b->core, bam1_cigar(b));
		for(i=aln_start+offset;aln_start-offset<=i;i--) {
			int32_t pos = graph_get_node_list_index_at_or_before(g, i);
			node_list_t *list = graph_get_node_list(g, pos);
			if(1 != pos && NULL != list) {
				for(j=0;j<list->length;j++) {
					node_t *node = list->nodes[j];
					int32_t pass = pass_filters1(g, node, cutoffs, max_total_coverage);
					if(0 == pass) {
						sw_node_i = sw_heap_get_node_i(heap);
						sw_node_init(&heap->nodes[sw_node_i], NULL, node, node->coverage, base, qual, use_qualities, space); 
						sw_heap_add_i(heap, sw_node_i);
					}
					else if(pass < 0) {
						sw_heap_clear(heap); // clear heap
						return b;
					}
					if(node->position < i) {
						i = node->position;
					}
					num_start_nodes_added++;
				}
			}
		}
	}
	else {
		if(SRMA_SPACE_CS == space) {
			base = nt2int_table[(int)colors[1]];
			qual = color_qualities[0]; 
		}
		else {
			base = nt4bit_to_int[bam1_seqi(bam1_seq(b), 0)];
			qual = bam1_qual(b)[0] + 33;
		}
		aln_start = b->core.pos;
		for(i=aln_start-offset;i<=aln_start+offset;i++) {
			int32_t pos = graph_get_node_list_index_at_or_after(g, i);
			node_list_t *list = graph_get_node_list(g, pos);
			if(0 != pos && NULL != list) {
				for(j=0;j<list->length;j++) {
					node_t *node = list->nodes[j];
					int32_t pass = pass_filters1(g, node, cutoffs, max_total_coverage);
					if(0 == pass) {
						sw_node_i = sw_heap_get_node_i(heap);
						sw_node_init(&heap->nodes[sw_node_i], NULL, node, node->coverage, base, qual, use_qualities, space); 
						sw_heap_add_i(heap, sw_node_i);
					}
					else if(pass < 0) {
						sw_heap_clear(heap); // clear heap
						return b;
					}
					if(node->position < i) {
						i = node->position;
					}
					num_start_nodes_added++;
				}
			}
		}
	}
	if(0 == num_start_nodes_added) {
		srma_error(__func__, "Did not add any start nodes", Exit, OutOfRange);
	}

	sw_node_cur_i = sw_heap_poll_i(heap);
	while(0 <= sw_node_cur_i) {
                    
		if(max_heap_size < heap->queue_end - heap->queue_start + 1) {
			// too many to consider
			sw_heap_clear(heap); // clear heap
			return b;
		}

		sw_node_next_i = sw_heap_peek_i(heap);
		assert(0 <= sw_node_cur_i); // DEBUG
		while(NODE_INSERTION != __node_type(heap->nodes[sw_node_cur_i].node)
				&& 0 <= sw_node_next_i
				&& 0 == sw_node_compare(&heap->nodes[sw_node_cur_i], &heap->nodes[sw_node_next_i], heap->type)) {
			if(heap->nodes[sw_node_cur_i].score < heap->nodes[sw_node_next_i].score ||
					(heap->nodes[sw_node_cur_i].score == heap->nodes[sw_node_next_i].score &&
					 heap->nodes[sw_node_cur_i].coverage_sum < heap->nodes[sw_node_next_i].coverage_sum)) { 
				sw_node_cur_i = sw_heap_poll_i(heap);
			}
			else {
				// ignore the next node
				sw_heap_poll_i(heap);
			}
			sw_node_next_i = sw_heap_peek_i(heap);
		}
		sw_node_next_i = -1;

		if(heap->nodes[sw_node_cur_i].read_offset == b->core.l_qseq-1) { // found, keep best
			if(sw_node_best_i < 0 ||
					heap->nodes[sw_node_best_i].score < heap->nodes[sw_node_cur_i].score ||
					(heap->nodes[sw_node_best_i].score == heap->nodes[sw_node_cur_i].score && 
					 heap->nodes[sw_node_best_i].coverage_sum < heap->nodes[sw_node_cur_i].coverage_sum)) {
				//fprintf(stderr, "FOUND BEST\n"); // DEBUG
				sw_node_best_i = sw_node_cur_i;
			}
		}
                else if(0 <= sw_node_best_i && 
                        heap->nodes[sw_node_cur_i].score < heap->nodes[sw_node_best_i].score) {
                        // ignore, under the assumption that scores can only
                        // become more negative.
                }
		else {
			edge_list_t *list = NULL;
			if(1 == strand) { // reverse
				list = heap->nodes[sw_node_cur_i].node->prev;
			}
			else {
				list = heap->nodes[sw_node_cur_i].node->next;
			}
			{ // get the base and quality
				if(SRMA_SPACE_CS == space) {
					base = nt2int_table[(int)colors[1 + (heap->nodes[sw_node_cur_i].read_offset+1)]];
					qual = color_qualities[heap->nodes[sw_node_cur_i].read_offset+1]; 
				}
				else {
					if(strand) {
						base = nt4bit_to_int[bam1_seqi(bam1_seq(b), b->core.l_qseq-1-heap->nodes[sw_node_cur_i].read_offset-1)];
						qual = bam1_qual(b)[b->core.l_qseq-1-heap->nodes[sw_node_cur_i].read_offset-1] + 33;
					}
					else {
						base = nt4bit_to_int[bam1_seqi(bam1_seq(b), (heap->nodes[sw_node_cur_i].read_offset+1))];
						qual = bam1_qual(b)[(heap->nodes[sw_node_cur_i].read_offset+1)] + 33;
					}
				}
			}
			/*
			   node_t *node = heap->nodes[sw_node_cur_i].node;
			   fprintf(stderr, "NODE %d:%d offset=%d coverage=%d base=%d\n",
			   node->contig, node->position, node->offset, node->coverage, node->base);
			   fprintf(stderr, "SW_NODE read_offset=%d score=%d coverage_sum=%d start_position=%d space=%d\n",
			   heap->nodes[sw_node_cur_i].read_offset, heap->nodes[sw_node_cur_i].score, heap->nodes[sw_node_cur_i].coverage_sum, heap->nodes[sw_node_cur_i].start_position, space);
			   */
			for(i=0;i<list->length;i++) {
				node_t *node_cur = list->nodes[i];
				uint16_t coverage_cur = list->coverages[i];
				int32_t pass = pass_filters(g, node_cur, coverage_cur, cutoffs, max_total_coverage);
				if(0 == pass) {
					// add to the heap
					sw_node_i = sw_heap_get_node_i(heap);
					// DEBUG
					assert(0 <= sw_node_cur_i);
					assert(0 <= heap->nodes[sw_node_cur_i].read_offset);
					sw_node_init(&heap->nodes[sw_node_i], &heap->nodes[sw_node_cur_i], node_cur, coverage_cur, base, qual, use_qualities, space); 
					sw_heap_add_i(heap, sw_node_i);
				}
				else if(pass < 0) {
					sw_heap_clear(heap); // clear heap
					return b;
				}
			}
		}
		// get the next node
		sw_node_cur_i = sw_heap_poll_i(heap);
	}

        /*
	fprintf(stderr, "sw_node_best_i=%d\n", sw_node_best_i); // DEBUG
	if(0 <= sw_node_best_i) {
	fprintf(stderr, "END score=%d coverage_sum=%hu\n", 
	heap->nodes[sw_node_best_i].score,
	heap->nodes[sw_node_best_i].coverage_sum); // DEBUG
	}
        */
	// update SAM/BAM
	b = sw_align_update_bam(b, rg_id, heap, sw_node_best_i, space, colors, color_qualities, strand, correct_bases);
	sw_heap_clear(heap); // clear heap
	return b;
}	
示例#16
0
int main(int argc,char* argv[]) {
    time_t timestamp, current;
    int i,j,k;
    int a,n;
    char *pc;

    FILE *input_file;
    FILE *output_file;
    FILE* log_file=stderr;

    bamFile bam_input;
    bam_header_t *header;
    bam1_t* b;
    bam1_core_t *c;


    char cps_file_name[MAXFILEBUFFLENGTH]="";
    char bam_file_name[MAXFILEBUFFLENGTH]="";
    char out_file_name[MAXFILEBUFFLENGTH]="";
    char log_file_name[MAXFILEBUFFLENGTH]="";

    char buff[MAXFILEBUFFLENGTH];
    char chr[MAXFILEBUFFLENGTH];
    int beg, beg_prev, end, pos, offset; 
    int ref_id, ref_id_prev, label;
    int s, side;
    int read_type, mapped_strand;
    char ch;

    int limit_counts = 0;

    int* contig_count[2];
    int* contig_index[2];
    splice_site** contig_sites[2];

    long int n_reads[N_READ_TYPES][2];

    long int n_total_reads = 0;
    long int n_skipped_reads = 0;

    int max_intron_length=0;
    int min_intron_length=0;
    int ignore_gene_labels = 0;
    int stranded = 1;
    int rev_compl[2] = {1,0};

    int other_end, the_end, donor_id, acceptor_id;

    int *cigar;
    int flagged = 0;
    int margin = 4;


    /** reading input from the command line **/

    timestamp = time(NULL);

    if(argc==1) {
	fprintf(stderr, "BAM2SSJ is the utility for fast counting reads covering splice junctions\nCommand line use:\n");
        fprintf(stderr, "%s -cps <cps_file> -bam <bam_file> [-out <out_file>] [-log <log_file>] [-maxlen <max_intron_length>] [-minlen <min_intron_length>] [-margin <length>] ",argv[0]);
	fprintf(stderr, "[-v suppress verbose output] [-read1 0/1] [-read2 0/1] [-g ignore gene labels] [-u unstranded] [-f count reads flagged 0x800 only]\ntype %s -h for more info\n",argv[0]);
        exit(1);
    }

    for(i=1;i<argc;i++) {
        pc = argv[i];
        if(*pc == '-') {
            if(strcmp(pc+1,"cps") == 0) sscanf(argv[++i], "%s", &cps_file_name[0]);
	    if(strcmp(pc+1,"bam") == 0) sscanf(argv[++i], "%s", &bam_file_name[0]);
	    if(strcmp(pc+1,"out") == 0) sscanf(argv[++i], "%s", &out_file_name[0]);
            if(strcmp(pc+1,"log") == 0) sscanf(argv[++i], "%s", &log_file_name[0]);

            if(strcmp(pc+1,"read1") == 0) sscanf(argv[++i], "%i", &rev_compl[0]);
            if(strcmp(pc+1,"read2") == 0) sscanf(argv[++i], "%i", &rev_compl[1]);

	    if(strcmp(pc+1,"lim") == 0) sscanf(argv[++i], "%i", &limit_counts);
	    if(strcmp(pc+1,"minlen") == 0) sscanf(argv[++i], "%i", &min_intron_length);
	    if(strcmp(pc+1,"maxlen") == 0) sscanf(argv[++i], "%i", &max_intron_length);
	    if(strcmp(pc+1,"margin") == 0) sscanf(argv[++i], "%i", &margin);

	    if(strcmp(pc+1,"v") == 0) verbose = 0;
	    if(strcmp(pc+1,"g") == 0) ignore_gene_labels = 1;
	    if(strcmp(pc+1,"u") == 0) stranded = 0;
	    if(strcmp(pc+1,"f") == 0) flagged = 1;

	    if(strcmp(pc+1,"h") ==0 ) {
		fprintf(stderr, "Input:  (1) sorted BAM file\n");
		fprintf(stderr, "\t(2) CPS (chromosome-position-strand) tab-delimited file sorted by position (chr1 100 + etc)\n\n");
        	fprintf(stderr, "\tIn order to get CPS file from gtf, use the utility gtf2cps.sh\n");
        	fprintf(stderr, "\tImportant: CPS must be sorted by position ONLY!\n\n");
        	fprintf(stderr, "\tIf the 4th column contains (a numeric) gene label then only splice junctions within the same gene will be considered (unless the '-g' option is active)\n");
		fprintf(stderr, "\tThe utility to generate CPS with gene labels is gtf2cps_with_gene_id.sh (or update the script accordingly if you are using genome other than human)\n\n");
		fprintf(stderr, "Options:\n");
        	fprintf(stderr, "\t-maxlen <upper limit on intron length>; 0 = no limit (default=%i)\n",max_intron_length);
		fprintf(stderr, "\t-minlen <lower limit on intron length>; 0 = no limit (default=%i)\n",min_intron_length);
		fprintf(stderr, "\t-margin <length> minimum number of flanking nucleotides in the read in order to support SJ or cover EB, (default=%i)\n",margin);
        	fprintf(stderr, "\t-read1 0/1, reverse complement read1 no/yes (default=%i)\n",rev_compl[0]);
        	fprintf(stderr, "\t-read2 0/1, reverse complement read2 no/yes (default=%i)\n",rev_compl[1]);
        	fprintf(stderr, "\t-g ignore gene labels (column 4 of cps), default=%s\n", ignore_gene_labels ? "ON" : "OFF");
        	fprintf(stderr, "\t-u ignore strand (all reads map to the correct strand), default=%s\n", stranded ? "OFF" : "ON");
		fprintf(stderr, "\t-f count only reads that are flagged 0x800 (uniquely mapped reads), default=%s\n", flagged ? "ON" : "OFF");
		fprintf(stderr, "Output: tab-delimited  (default=stdout)\n");
        	fprintf(stderr, "\tColumn 1 is splice_junction_id\n");
        	fprintf(stderr, "\tColumns 2-6 are counts of 53, 5X, X3, 50, and 03 reads for the correct (annotated) strand\n");
        	fprintf(stderr, "\tColumns 7-11 are similar counts for the incorrect (opposite to annotated) strand\n");
		fprintf(stderr, "Descriptive read statistics are reported to stderr\n");
		exit(1);
	    }
	}
    }

    if(log_file_name[0]==0) {
	log_file = stderr;
    }
    else {
	log_file = fopen(log_file_name,"w");
	if(log_file == NULL) log_file = stderr;
    }

    if(bam_file_name[0]==0) {
	fprintf(log_file,"Bam not specified, exiting\n");
	exit(1); 
    }

    if(cps_file_name[0]==0) {
        fprintf(log_file,"Input not specified, exiting\n");
        exit(1);
    }

    if(out_file_name[0]==0) {
	fprintf(log_file,"[Warning: output set to stdout]\n");
	output_file = stdout;
    }
    else {
	output_file = fopen(out_file_name,"w");
	if(output_file == NULL) {
	    fprintf(log_file,"[Warning: output set to stdout]\n");
            output_file = stdout;
	}
    }

    if(max_intron_length>0) {
	if(verbose) fprintf(log_file,"[Warning: set max intron length=%i]\n",max_intron_length);
    }

    if(ignore_gene_labels) {
	if(verbose) fprintf(log_file,"[Warning: ignoring gene labels (column 4)]\n");
    }

    if(flagged) {
	if(verbose) fprintf(log_file,"[Warning: only look at reads flagged 0x800]\n");
    }

    if(margin>0) {
	if(verbose) fprintf(log_file,"[Warning: read margin set to %i]\n", margin);
    }

    if(verbose) {
	for(s = 0; s < 2; s++) if(rev_compl[s]) fprintf(log_file,"[Warning: take reverse complement of read %i]\n", s+1);
	fprintf(log_file,"[Warning: stranded = %s]\n", stranded ? "TRUE" : "FALSE (always correct strand)");
	if(ignore_gene_labels) fprintf(log_file,"[Warning: ignore gene labels (column 4)]\n");
    }


    for(i = 0; i < N_READ_TYPES; i++) for(s = 0; s < 2; s++) n_reads[i][s] = 0;

    /** initatializing BAM and header **/
   
    bam_input = bam_open(bam_file_name, "r");
    header = bam_header_read(bam_input);

    if(bam_input == NULL || header == NULL) {
        fprintf(log_file,"BAM can't be opened or contains no header, exiting\n");
        exit(1);
    }

    /** reading input from CPS **/

    input_file = fopen(cps_file_name, "r");
    if(input_file == NULL) {
	fprintf(log_file,"CPS can't be opened, exiting\n");
        exit(1);
    }

    /** populating gene structure arrays **/

    for(s = 0; s < 2; s++) {
    	contig_count[s] = (int*) malloc(sizeof(int) * (header->n_targets + ARRAY_MARGIN));
    	contig_index[s] = (int*) malloc(sizeof(int) * (header->n_targets + ARRAY_MARGIN));
    	contig_sites[s] = (splice_site**) malloc(sizeof(splice_site*) * (header->n_targets + ARRAY_MARGIN));

    	if(contig_count[s] == NULL || contig_sites[s] == NULL || contig_index[s] == NULL) {
	    fprintf(log_file, "Not enought memory, exiting\n");
            exit(1);
    	}
    }

    for(s = 0; s < 2; s++)
        for(i=0; i < header->n_targets; i++) 
	    contig_count[s][i] = contig_index[s][i] = 0;

    if(verbose) fprintf(log_file, "Reading %s pass1", cps_file_name);
    while(fgets(buff, MAXFILEBUFFLENGTH, input_file)) {
	sscanf(buff, "%s %*i %c", &chr[0], &ch);
	bam_parse_region(header, chr, &i, &beg, &end);
	s = (ch == '+' ? 0 : 1);
	if(i < header->n_targets && i>=0) contig_count[s][i]++;
    }

    for(s = 0; s < 2; s++) {
    	for(i = 0;i < header->n_targets; i++) {
	    contig_sites[s][i] = (splice_site*) malloc(sizeof(splice_site) * (contig_count[s][i] + ARRAY_MARGIN));
	    if(contig_sites[s][i] == NULL) {
	    	fprintf(log_file, "Not enought memory, exiting\n");
            	exit(1);
	    }
	}
    }
    if(verbose) fprintf(log_file, "\n");

    if(verbose) fprintf(log_file, "Reading %s pass2",cps_file_name);
    fseek(input_file, 0, SEEK_SET);
    while(fgets(buff, MAXFILEBUFFLENGTH, input_file)) {
        sscanf(buff, "%s %i %c %i", &chr[0], &pos, &ch, &label);
	bam_parse_region(header, chr, &i, &beg, &end);
	s = (ch == '+' ? 0 : 1);
	if(i < header->n_targets && i>=0) {
	    if(contig_index[s][i]>0) {
		if(pos < contig_sites[s][i][contig_index[s][i]-1].pos) {
		    fprintf(log_file, "Splice sites weren't sorted, exiting\n");
		    exit(1);
		}
	    }
	    contig_sites[s][i][contig_index[s][i]].pos = pos;
	    contig_sites[s][i][contig_index[s][i]].label = ignore_gene_labels ? 0 : label;
	    for(side = 0; side < 2; side++) {
                contig_sites[s][i][contig_index[s][i]].count00[side] = 0;
                contig_sites[s][i][contig_index[s][i]].count5X[side] = 0;
                contig_sites[s][i][contig_index[s][i]].countX3[side] = 0;
		contig_sites[s][i][contig_index[s][i]].junctions = NULL;
	    }
	    contig_index[s][i]++;
	}
    }
    if(verbose) fprintf(log_file, "\n");

    for(s = 0; s < 2; s++)
    	for(i = 0;i < header->n_targets; i++) 
	    contig_index[s][i] = 0;

    /** analysis starts here **/

    b = bam_init1();
    k = 0;
    ref_id_prev = -1;
    beg_prev = -1;
    while(bam_read1(bam_input, b)>=0) {
        c   = &b->core;
	ref_id = c->tid;
	if(ref_id<0) continue;

	if(flagged && ((c->flag & 0x800) == 0)) {
	    n_skipped_reads++;
	    continue;
	}

        if(stranded && ((c->flag & BAM_FREAD1) && (c->flag & BAM_FREAD2) || !(c->flag & BAM_FREAD1) && !(c->flag & BAM_FREAD2))) {
            n_skipped_reads++;
            continue;
        }

        cigar = bam1_cigar(b);

	if(ref_id != ref_id_prev  && ref_id_prev >= 0) {
	    if(contig_index[0][ref_id_prev] + contig_index[1][ref_id_prev] < contig_count[0][ref_id_prev] + contig_count[1][ref_id_prev]) {
		if(log_file==stderr) progressbar(1, 1, header->target_name[ref_id_prev], verbose);
	    }
	    beg_prev = -1;
	}

	/*if(ref_id < ref_id_prev) {
	    fprintf(log_file,"BAM file wasn't sorted, exiting\n");
            exit(1);
	}*/

	ref_id_prev = ref_id;

	beg = c->pos + 1;
	if(beg < beg_prev) {
	    fprintf(log_file,"BAM file wasn't sorted, exiting\n");
	    exit(1);
	}
	beg_prev = beg;

	s = ((c->flag & BAM_FREVERSE)>0);
	mapped_strand = (c->flag & BAM_FREAD1) ? (s + rev_compl[0]) & 1 : (s + rev_compl[1]) & 1;

	the_end = bam_calend(c, cigar);

	for(s = 0; s < 1 + stranded; s++) {
            end = beg;
	    side = (s == mapped_strand) ? 0 : 1;
	    side *= stranded;

	    // keep reading until the currect site is on the same chromosome downstream of the read 

	    while(contig_sites[s][ref_id][contig_index[s][ref_id]].pos < beg && contig_index[s][ref_id] < contig_count[s][ref_id]) {
		contig_index[s][ref_id]++;
	    	if(log_file==stderr) progressbar(contig_index[0][ref_id]+contig_index[1][ref_id], contig_count[0][ref_id]+contig_count[1][ref_id], header->target_name[ref_id], verbose);
	    }

	    read_type = RT_OTHER;

            if(contig_index[s][ref_id]<contig_count[s][ref_id]) {
	    	// check if the read is a split read and find its other end
	    	read_type = RT_GENOME;
            	for(i = 0; i < c->n_cigar; i++) {
	    	    offset = cigar[i] >> 4;
	    	    switch(cigar[i] & 0x0F) {
		    	case BAM_CMATCH: 	end += offset;  // match to the reference
					 	break;
		    	case BAM_CINS:		end += 0;	// insertion to the reference, pointer stays unchanged
						break;
		    	case BAM_CDEL:		end += offset;	// deletion from the reference (technically the same as 'N') pointer moves
						break; 
		    	case BAM_CREF_SKIP:	other_end = end + offset;
						donor_id = acceptor_id = -INFTY;
						if(end - beg < margin) break;
						if(the_end - other_end < margin) break;
						for(j = contig_index[s][ref_id]; contig_sites[s][ref_id][j].pos <= other_end && j < contig_count[s][ref_id];j++) {
						    if(contig_sites[s][ref_id][j].pos - end < min_intron_length && min_intron_length > 0) continue;
						    if(contig_sites[s][ref_id][j].pos - end > max_intron_length && max_intron_length > 0) break;
					    	    if(contig_sites[s][ref_id][j].label == contig_sites[s][ref_id][contig_index[s][ref_id]].label) {
					    	    	if(contig_sites[s][ref_id][j].pos == end - 1)   donor_id = j;
					    	    	if(contig_sites[s][ref_id][j].pos == other_end) acceptor_id = j;
					    	    }
					    	}
						if(donor_id>0 && acceptor_id>0) {
					    	    update_count(&contig_sites[s][ref_id][donor_id].junctions, acceptor_id, side);
					    	    contig_sites[s][ref_id][donor_id].count5X[side]++;
                                            	    contig_sites[s][ref_id][acceptor_id].countX3[side]++;
					    	    read_type = RT_KJUNCT;
						}
						else {
					    	    read_type = RT_UJUNCT;
						}
						end = other_end;
				 		break;
		    	case BAM_CSOFT_CLIP:
		    	case BAM_CHARD_CLIP:
		    	case BAM_CPAD:		break;
		    	default:		read_type = RT_OTHER;
	    	    }
            	}

	    	if(read_type == RT_GENOME) {
	            for(j=contig_index[s][ref_id]; beg + margin <= contig_sites[s][ref_id][j].pos  && contig_sites[s][ref_id][j].pos < end - margin && j<contig_count[s][ref_id]; j++) {
		    	contig_sites[s][ref_id][j].count00[side]++;
		    	read_type = RT_OVRLAP;
		    	k++;
	    	    }
	    	}
	    }

	    n_reads[read_type][side]++;
	}
	n_total_reads++;

	if(k>limit_counts && limit_counts>0) break;

    }